
resolve conflict

tsteuer committed 4 years ago
commit 094f32f2fb

+ 1 - 19
Pipfile

@@ -7,33 +7,15 @@ verify_ssl = true
 
 [packages]
 cdplib = {editable = true,git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
-pycodestyle = "*"
-ipykernel = "*"
-spyder-kernels = "==0.*"
-cloudpickle = "*"
-openpyxl = "*"
-setuptools = "*"
-scipy = "*"
-matplotlib = "*"
-tsfresh = "*"
-hyperopt = "*"
-xgboost = "*"
-scikit-learn = "*"
 pandas = "!=0.24.0"
-pandas-compat = "*"
-xmltodict = "*"
 sqlalchemy = "*"
 sqlparse = "*"
 pymysql = "*"
-xlrd = "*"
 pymongo = "*"
 jsonref = "*"
-faker = "*"
-xeger = "*"
 simplejson = "*"
 mysql = "*"
-sqlalchemy-utils = "*"
-apyori==1.1.1
+hyperopt = "*"
 
 [requires]
 python_version = "3"

File diff suppressed because it is too large
+ 205 - 684
Pipfile.lock


+ 86 - 25
cdplib/db_handlers/MongodbHandler.py

@@ -13,11 +13,13 @@ Created on Mon Sep 16 13:27:44 2019
 import simplejson
 import sys
 import os
+import time
 
 import pymongo
 from pymongo import MongoClient
 import pandas as pd
 import numpy as np
+from pprint import pprint
 
 sys.path.append(os.getcwd())
 from cdplib.log import Log
@@ -171,7 +173,7 @@ class MongodbHandler:
         command = {
                     'collMod': collection_name,
                     'validator': {
-                        '$jsonSchema': parse_obj.schemas[0]
+                        '$jsonSchema': parse_obj.load_and_parse_schema_for_mongodb(schema_path)
                     },
                     'validationLevel': validation_level,
                     'validationAction': validation_action
@@ -239,7 +241,22 @@ class MongodbHandler:
                 self._database[collection_name].insert_many(data, ordered=ordered)
 
         except Exception as error:
-            self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
+            if len(data) > 1:
+
+                self._log.warning(('An error occured inserting {} documents into database: {} and collection: {}.').format(len(data), self._database_name, collection_name))
+                self._log.warning('This might be because one or more documents are invalid.')
+                self._log.warning('We will try to insert the documents one-by-one and report which are invalid.')
+                self._log.warning(('Error: {}').format(error))
+
+                for row in data:
+
+                    try:
+                        self._database[collection_name].insert_one(row)
+                    except Exception as error:
+                        pprint(row)
+                        self._log.warning(error)
+            else:
+                self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
 
         self._log.info(('Data has been inserted into the {} collection').format(collection_name))
 
@@ -267,16 +284,22 @@ class MongodbHandler:
 
         try:
             if attribute == None or attribute_value == None:
-                data = self._database[collection_name].find({},return_values)
+                query = {}
+                data = self._database[collection_name].find(query,return_values)
+
             else:
-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, return_values)
+                query = {attribute: {comparison_operator: attribute_value}}
+                data = self._database[collection_name].find(query, return_values)
 
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: {}:{}. \nError:{}').format(collection_name, attribute, comparison_operator, attribute_value, error))
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
-        else:
-            return data
+            return None
+
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
 
     def aggregate_data_and_generate_dataframe(self, collection_name: str, aggregation_pipeline: list, index: str = None):
 
@@ -284,11 +307,16 @@ class MongodbHandler:
             data = self._database[collection_name].aggregate(pipeline=aggregation_pipeline, allowDiskUse=True)
         except Exception as error:
             self._log.log_and_raise_error(('A problem occured when aggregating the collection {} with the pipeline {}. \nError: {}').format(collection_name, aggregation_pipeline, error))
+            return None
 
         return self.convert_mongo_data_into_dataframe(data, index, collection_name)
 
-    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None) -> pd.DataFrame():
+    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None, chunksize: int = 500) -> pd.DataFrame():
 
+        start_time = time.time()
+        '''
+        self._log.info('Converting returned mongo data into a DataFrame')
+
         data = list(data)
         try:
             if len(data)> 0:
@@ -299,11 +327,38 @@ class MongodbHandler:
                 df = pd.DataFrame(data)
                 if index is not None:
                     df.set_index(index, inplace=True)
+
+                self._log.info(('DataFrame conversion is done, took {} seconds').format(time.time()-start_time))
                 return df
             else:
                 self._log.warning(('No data for the query was found').format())
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to convert mongo data into pd.Dataframe. \nError: {} ').format(error))
+        '''
+
+        frames = []
+        records = []
+        for iteration, value in enumerate(data):
+
+            records.append(value)
+            if (iteration + 1) % chunksize == 0:
+                frames.append(pd.DataFrame(records))
+                records = []
+
+        if records:
+            frames.append(pd.DataFrame(records))
+
+        return_df = pd.concat(frames, axis=0, sort=False)
+
+        if index is not None:
+            return_df.set_index(index, inplace=True)
+
+        self._log.info(('{} Rows were fetched from {}. DataFrame conversion is done, took {} seconds').format(len(return_df.index), collection_name if collection_name is not None else 'the database', time.time()-start_time))
+
+        return return_df
+
+
+
 
     #def update_data_in_collection(self, query_label: str, query_value: str, update_label:str, update_value: str, collection_name:str):
     #    self._database[collection_name].update_one({query_label:query_value}, {"$set": {update_label: update_value}})
@@ -338,7 +393,7 @@ class MongodbHandler:
         '''
         return self._database[collection_name].find({query_label:query_value}).count() > 0
 
-    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_as_dataframe: bool = True):
+    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_id: bool = False, return_as_dataframe: bool = True):
         '''
             Queries data between two dates.
 
@@ -349,16 +404,20 @@ class MongodbHandler:
             :param str index:
             :param bool return_as_dataframe:
         '''
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
         try:
-            data = self._database[collection_name].find({date_label: {'$gt': from_date_value, '$lt': to_date_value}})
+            query = {date_label: {'$gt': from_date_value, '$lt': to_date_value}}
+            data = self._database[collection_name].find(query, {'_id': return_id})
 
         except Exception as error:
-            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: $gt:{}, $lt:{}. \nError:{}').format(collection_name, date_label, from_date_value, to_date_value, error))
+            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}. \nError:{}').format(collection_name, query, error))
 
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
-        else:
-            return data
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
 
     def query_oldest_or_newest_date_in_collection(self, collection_name: str, date_label: str, oldest: bool = False):
 
@@ -385,21 +444,23 @@ class MongodbHandler:
         try:
 
             if attribute == None or attribute_value == None:
-                data = self._database[collection_name].find({},{'_id': return_id}).sort(sort_label, direction).limit(limit)
+                query = {}
+                data = self._database[collection_name].find(query,{'_id': return_id}).sort(sort_label, direction).limit(limit)
             else:
-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, {'_id': return_id}).sort(sort_label, direction).limit(limit)
-
-            if len(list(data)) == 0:
-                self._log.warning('No data was found for the query')
-                return None
+                query = {attribute: {comparison_operator: attribute_value}}
+                data = self._database[collection_name].find(query, {'_id': return_id}).sort(sort_label, direction).limit(limit)
 
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to query data from {}, \nError:{}').format(collection_name, error))
 
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
         else:
-            return data
+            self._log.warning('No data was found for the query')
+            return None
 
     def update_data_in_collection(self, update_label:str, update_value: str, collection_name:str, query_label: str = None, query_value: str = None, create_if_not_exist: bool = True, find_query: dict = None, update_many: bool = False):
 

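For orientation (not part of the commit), a minimal sketch of how the reworked query helpers might be called from application code; the constructor arguments, collection name and field names below are placeholders:

from cdplib.db_handlers.MongodbHandler import MongodbHandler

# Hypothetical connection settings; the real constructor arguments depend on the deployment.
handler = MongodbHandler(database_name='test_database')

# The DataFrame is now assembled in chunks (chunksize=500 by default) and the helper
# returns None instead of an empty result when nothing matches the query.
df = handler.query_data_and_generate_dataframe('process_data',
                                               attribute='status',
                                               attribute_value='ok',
                                               comparison_operator='$eq')

# return_id controls whether the MongoDB _id field is kept in the projection.
window = handler.query_data_between_dates_and_generate_dataframe(
    'process_data', 'timestamp',
    '2020-01-01T00:00:00.000Z', '2020-02-01T00:00:00.000Z',
    index='machine_id', return_id=False)
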
+ 9 - 6
cdplib/db_handlers/SQLHandler.py

@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
 Created on Tue Sep 18 16:20:50 2018
 
@@ -211,13 +209,17 @@ class SQLHandler:
         transaction = connection.begin()
 
         errors = []
+        results = []
 
         # in the case of multi-query execute each query
         for sub_query in sqlparse.split(query):
             if len(sub_query) > 0:
                 try:
-                    connection.execute(sub_query, multi=True)
-
+                    result = connection.execute(sub_query)
+                    if result.returns_rows:
+                        data = pd.DataFrame(result.fetchall())
+                        data.columns = result.keys()
+                        results.append(data)
                 except Exception as e:
                     errors.append(str(e))
 
@@ -231,6 +233,7 @@ class SQLHandler:
 
         transaction.commit()
         connection.close()
+        return results
 
     def execute_query_from_file(self, filename: str):
         '''
@@ -441,7 +444,7 @@ class SQLHandler:
                                tablename)
 
             data = self.execute(query)
-            colnames = data.columns.tolist()
+            colnames = data[0].columns.tolist()
 
         return colnames
 
@@ -640,4 +643,4 @@ class SQLHandler:
             self._engine.dispose()
         except Exception as e:
             print(('An error occured when trying to dispose the SQL engine. Error: {}').format(e))
-            raise Exception(e)
+            raise Exception(e)

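A small usage sketch (not from the commit) of the changed execute() return value; the connection string and table name are invented:

from cdplib.db_handlers.SQLHandler import SQLHandler

# Hypothetical constructor arguments; the real signature is defined in SQLHandler.__init__.
handler = SQLHandler(db_uri='mysql+pymysql://user:password@localhost:3306/test_db')

# execute() now returns a list with one DataFrame per statement that produced rows,
# so a multi-statement script yields one frame per SELECT.
frames = handler.execute("SELECT * FROM some_table; SELECT COUNT(*) AS n FROM some_table;")

for frame in frames:
    print(frame.head())
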
+ 96 - 88
cdplib/db_migration/DataFrameToCollection.py

@@ -51,7 +51,8 @@ class DataFrameToCollection():
     def to_list_of_documents(self, data: pd.DataFrame,
                              grp_fields: list,
                              schema: dict = None,
-                             _final_step: bool = True) -> list:
+                             _final_step: bool = True,
+                             already_reshaped: list = []) -> list:
         '''
         Reshapes a pandas dataframe to a list of documents according
          to a complex (json) mongodb schema
@@ -84,128 +85,135 @@ class DataFrameToCollection():
             if field not in self._unroll_nested_names(data.columns):
                 continue
 
-            field_type = schema["properties"][field]["bsonType"]
+            if field in already_reshaped:
+                reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_flattened_list_of_distinct)
+                reshaped_fields.append(reshaped_field)
+            else:
+                field_type = schema["properties"][field]["bsonType"]
 
-            # if field has a simple type
-            if field_type not in ["array", "object"]:
+                # if field has a simple type
+                if field_type not in ["array", "object"]:
 
-                grp_fields = [c for c in grp_fields if c in data.columns]
+                    grp_fields = [c for c in grp_fields if c in data.columns]
 
-                # check that there is only one possible value of this field
-                n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
+                    # check that there is only one possible value of this field
+                    n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
 
-                # n_distinct_valus can be 0 if the column only contains NaN values
-                if n_distinct_values > 1:
-                    err = "Field {0} is not unique with respect to {1}"\
-                          .format(field, grp_fields)
+                    # n_distinct_valus can be 0 if the column only contains NaN values
+                    if n_distinct_values > 1:
+                        err = "Field {0} is not unique with respect to {1}"\
+                            .format(field, grp_fields)
 
-                    self._log.error(err)
-                    raise Exception(err)
+                        self._log.error(err)
+                        raise Exception(err)
 
-                if field not in grp_fields:
-                    reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
-                else:
-                    reshaped_field =\
-                        data[grp_fields].drop_duplicates()\
-                        .set_index(grp_fields, drop=False)[field]
+                    if field not in grp_fields:
+                        reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
+                    else:
+                        reshaped_field =\
+                            data[grp_fields].drop_duplicates()\
+                            .set_index(grp_fields, drop=False)[field]
 
-                reshaped_fields.append(reshaped_field)
+                    reshaped_fields.append(reshaped_field)
 
-            # if field is sub-document (dictionary)
-            elif field_type == "object":
+                # if field is sub-document (dictionary)
+                elif field_type == "object":
 
-                sub_schema = deepcopy(schema["properties"][field])
+                    sub_schema = deepcopy(schema["properties"][field])
 
-                # rename sub-schema properties to match with data column names
-                sub_schema["properties"] =\
-                    {".".join([field, k]): v for k, v
-                     in sub_schema["properties"].items()}
+                    # rename sub-schema properties to match with data column names
+                    sub_schema["properties"] =\
+                        {".".join([field, k]): v for k, v
+                        in sub_schema["properties"].items()}
 
-                sub_data = self.to_list_of_documents(
-                            data=data,
-                            schema=sub_schema,
-                            grp_fields=grp_fields,
-                            _final_step=False)
+                    sub_data = self.to_list_of_documents(
+                                data=data,
+                                schema=sub_schema,
+                                grp_fields=grp_fields,
+                                _final_step=False,
+                                already_reshaped=already_reshaped)
 
-                # Need to be checked since child elements can be empty
-                if sub_data is not None:
+                    # Need to be checked since child elements can be empty
+                    if sub_data is not None:
 
-                    reshaped_field = sub_data.apply(self._make_dict, axis=1)
-                    reshaped_field.name = field
+                        reshaped_field = sub_data.apply(self._make_dict, axis=1)
+                        reshaped_field.name = field
 
-                    reshaped_fields.append(reshaped_field)
+                        reshaped_fields.append(reshaped_field)
 
-            # if field is a list of dictionaries
-            elif field_type == "array":
+                # if field is a list of dictionaries
+                elif field_type == "array":
 
 
-                items_type = schema["properties"][field]["items"]["bsonType"]
+                    items_type = schema["properties"][field]["items"]["bsonType"]
 
-                if items_type == "object":
-                    array_object = time.time()
-                    sub_schema = deepcopy(schema["properties"][field]["items"])
+                    if items_type == "object":
+                        array_object = time.time()
+                        sub_schema = deepcopy(schema["properties"][field]["items"])
 
-                    # rename sub-schema properties to match data column names
-                    sub_schema["properties"] =\
-                        {".".join([field, k]): v for k, v in
-                         sub_schema["properties"].items()}
+                        # rename sub-schema properties to match data column names
+                        sub_schema["properties"] =\
+                            {".".join([field, k]): v for k, v in
+                            sub_schema["properties"].items()}
 
-                    # extend grp fields by sub-fields of field simple types
-                    sub_grp_fields = [f for f in sub_schema["properties"]
-                                      if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
-                                      and (f in data.columns)]
+                        # extend grp fields by sub-fields of field simple types
+                        sub_grp_fields = [f for f in sub_schema["properties"]
+                                        if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
+                                        and (f in data.columns)]
 
-                    if len(sub_grp_fields) == 0:
-                        err = ("One of the sub-keys in a list of documents"
-                               " must be of simple type for the field {}"
-                               .format(field))
+                        if len(sub_grp_fields) == 0:
+                            err = ("One of the sub-keys in a list of documents"
+                                " must be of simple type for the field {}"
+                                .format(field))
 
-                        self._log.error(err)
-                        raise Exception(err)
+                            self._log.error(err)
+                            raise Exception(err)
 
-                    # group and reshape sub-fields with complex types
-                    sub_data = self.to_list_of_documents(
-                                data=data,
-                                schema=sub_schema,
-                                grp_fields=grp_fields + sub_grp_fields,
-                                _final_step=False)
+                        # group and reshape sub-fields with complex types
+                        sub_data = self.to_list_of_documents(
+                                    data=data,
+                                    schema=sub_schema,
+                                    grp_fields=grp_fields + sub_grp_fields,
+                                    _final_step=False,
+                                    already_reshaped=already_reshaped)
 
-                    if sub_data is not None:
+                        if sub_data is not None:
 
-                        # gether the results into a list of dictionaries
-                        sub_data = sub_data.apply(self._make_dict, axis=1)
+                            # gether the results into a list of dictionaries
+                            sub_data = sub_data.apply(self._make_dict, axis=1)
 
-                        sub_data.name = field
-                        sub_data = sub_data.reset_index(grp_fields)
-                        ######################################################
-                        ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
-                        reshaped_field =\
-                            sub_data.groupby(grp_fields, sort=False)[field]\
-                                    .apply(self._make_list_of_distinct)
-                        ######################################################
-                        reshaped_fields.append(reshaped_field)
+                            sub_data.name = field
+                            sub_data = sub_data.reset_index(grp_fields)
+                            ######################################################
+                            ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
+                            reshaped_field =\
+                                sub_data.groupby(grp_fields, sort=False)[field]\
+                                        .apply(self._make_list_of_distinct)
+                            ######################################################
+                            reshaped_fields.append(reshaped_field)
 
 
-                # if field is a list of values with simple type
-                elif items_type == "array":
-                    grp_fields = [c for c in grp_fields if c in data.columns]
+                    # if field is a list of values with simple type
+                    elif items_type == "array":
+                        grp_fields = [c for c in grp_fields if c in data.columns]
 
-                    if field in data.columns:
+                        if field in data.columns:
 
-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
-                                             .apply(self._make_list_of_distinct)
+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_list_of_distinct)
 
-                        reshaped_fields.append(reshaped_field)
-                else:
+                            reshaped_fields.append(reshaped_field)
+                    else:
 
-                    grp_fields = [c for c in grp_fields if c in data.columns]
+                        grp_fields = [c for c in grp_fields if c in data.columns]
 
-                    if field in data.columns:
+                        if field in data.columns:
 
-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
-                                             .apply(self._make_flattened_list_of_distinct)
+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_flattened_list_of_distinct)
 
-                        reshaped_fields.append(reshaped_field)
+                            reshaped_fields.append(reshaped_field)
 
         if len(reshaped_fields) > 0:
 

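A rough sketch (not part of the commit) of the new already_reshaped argument, which skips the schema-driven regrouping for columns that already hold reshaped values; the data, schema and column names are invented, and the constructor is assumed to take no arguments:

import pandas as pd
from cdplib.db_migration.DataFrameToCollection import DataFrameToCollection

data = pd.DataFrame({'machine_id': [1, 1, 2],
                     'measurements': [[1.0, 2.0], [1.0, 2.0], [3.5]]})

schema = {'bsonType': 'object',
          'properties': {'machine_id': {'bsonType': 'int'},
                         'measurements': {'bsonType': 'array',
                                          'items': {'bsonType': 'double'}}}}

# Because 'measurements' is listed in already_reshaped, it is only flattened and
# deduplicated per group instead of being parsed against the schema again.
docs = DataFrameToCollection().to_list_of_documents(data=data,
                                                    grp_fields=['machine_id'],
                                                    schema=schema,
                                                    already_reshaped=['measurements'])
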
+ 2 - 2
cdplib/db_migration/MigrationCleaning.py

@@ -358,11 +358,11 @@ class MigrationCleaning:
                 elif python_type == bool:
 
                     data[column] = data[column].str.lower()
-                    accepted_bool = {'ja': True, 'j': True, '1': True,
+                    accepted_bool = {'ja': True, 'j': True, '1': True, 1:True,
                                      'yes': True, 'y': True, 'true':True,
                                      't': True, 'nein': False, 'n': False,
                                      'no': False, 'false': False, 'f': False,
-                                     '0': False}
+                                     '0': False, 0:False}
                     data[column] = data[column].map(accepted_bool)
                     data[column] = data[column].astype(bool)
 

+ 39 - 18
cdplib/db_migration/ParseJsonSchema.py

@@ -51,15 +51,17 @@ class ParseJsonSchema(ParseDbSchema):
                 with open(schema_path, "r") as f:
                     schema = json.load(f)
 
-                ref_flag = self._analyze_schema(schema)
-
-                if ref_flag:
-                    schema = self._format_schema_for_mongo(schema)
+                definitions_flag = self._analyze_schema(schema)
+
+                if definitions_flag:
+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
                     schema = self._dereference_schema(schema)
-                    schema = self._format_schema_for_mongo(schema)
+                    # Need to do it again since sub schema could also contain
+                    # single quotes
+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
                     self.schemas.append(schema)
+
                 else:
-                    schema = self._format_schema_for_mongo(schema)
                     self.schemas.append(schema)
 
             except Exception as e:
@@ -199,7 +201,7 @@ class ParseJsonSchema(ParseDbSchema):
                                  required_only=required_only)
 
         for schema in schemas[1:]:
-
+
             next_result = self._parse_one(schema=schema,
                                           field_info=field_info,
                                           required_only=required_only)
@@ -340,7 +342,7 @@ class ParseJsonSchema(ParseDbSchema):
 
         return already_parsed
 
-    def read_schema_and_parse_for_mongodb(self, schema_path: str) -> dict:
+    def load_and_parse_schema_for_mongodb(self, schema_path: str) -> dict:
         '''
         We need to deference json before import to Mongo DB pymongo can't deal with references
         :param str schema_path: path to the schema file.
@@ -353,9 +355,16 @@ class ParseJsonSchema(ParseDbSchema):
             schema = json.load(json_file)
 
         definitions_flag = self._analyze_schema(schema)
-
+
         if definitions_flag:
+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
             schema = self._dereference_schema(schema)
+             # Need to do it again since sub schema could also contain
+             # single quotes
+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
+            schema = self._format_schema_for_mongo(schema)
+        else:
+            schema = self._format_schema_for_mongo(schema)
 
         return schema
 
@@ -371,12 +380,11 @@ class ParseJsonSchema(ParseDbSchema):
                 definitions_flag = self._analyze_schema(schema[key], definitions_flag)
 
         return definitions_flag
-
-    def _format_schema_for_mongo(self, schema: dict) -> dict:
+
+
+    def _clean_desciptions_tags_from_single_quotes(self, schema: dict) -> dict:
         '''
-        We use in the schema tags whih are not supported by mongo an threfore
-        must be taken care of before setting the schema for mongo.
-        :param str schema_path: path to the schema file.
+        :param dict schema: dictionary containing the schema
         '''
 
         for key in list(schema):
@@ -386,14 +394,27 @@ class ParseJsonSchema(ParseDbSchema):
                 schema[key] = cleaned_description
 
             if type(schema[key]) == dict:
-                self._format_schema_for_mongo(schema[key])
+                self._clean_desciptions_tags_from_single_quotes(schema[key])
+
+        return schema
 
-            if key == 'examples':
-                self._remove_examples(schema)
+    def _format_schema_for_mongo(self, schema: dict) -> dict:
+        '''
+        We use tags in the schema which are not supported by mongo and therefore
+        must be taken care of before setting the schema for mongo.
+        :param dict schema: dictionary containing the schema
+        '''
 
+        for key in list(schema):
+
+            if type(schema[key]) == dict:
+                self._format_schema_for_mongo(schema[key])
 
             if key == 'default' or key == 'default_values':
                 self._remove_defaults(schema)
+
+            if key == 'examples':
+                self._remove_examples(schema)
 
         return schema
 
@@ -411,7 +432,7 @@ class ParseJsonSchema(ParseDbSchema):
         schema = str(schema).replace("'", "\"")
         schema = jsonref.loads(schema, base_uri=base_dir_url)
         schema = deepcopy(schema)
-        #schema.pop('definitions', None)
+
         return schema
 
     def _remove_defaults(self, schema: dict) -> dict:

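A brief sketch (not part of the commit) of how the renamed helper ties in with the collMod call shown in the MongodbHandler hunk above; the schema path is a placeholder and the constructor keyword is an assumption:

from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema

schema_path = 'mongo_schema/schema_process_data.json'  # invented path

parse_obj = ParseJsonSchema(schema_paths=[schema_path])  # assumed constructor usage

# Dereferences definitions, strips single quotes from description tags and drops
# the default/example tags that MongoDB validators do not accept.
mongo_schema = parse_obj.load_and_parse_schema_for_mongodb(schema_path)

command = {'collMod': 'process_data',
           'validator': {'$jsonSchema': mongo_schema}}
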
+ 40 - 4
cdplib/db_migration/ParseMapping.py

@@ -22,10 +22,10 @@ class ParseMapping:
         import json
         from cdplib.log import Log
 
-        self.log = Log('Parse Mapping')
+        self._log = Log('Parse Mapping')
 
         if not os.path.isfile(mapping_path):
-            err = "Mapping not found"
+            err = "Mapping not found "+mapping_path
             self._log.error(err)
             raise FileNotFoundError(err)
 
@@ -34,7 +34,7 @@ class ParseMapping:
                 self._mapping = json.load(f)
 
         except Exception as e:
-            err = ("Could not load mapping. "
+            err = ("Could not load mapping. " + mapping_path +
                    "Exit with error {}".format(e))
             self._log.error(err)
             raise Exception(err)
@@ -97,6 +97,42 @@ class ParseMapping:
         '''
         '''
         return self._get_info(key="date_format")
+
+    def get_internal_names(self) -> dict:
+        '''
+        '''
+
+        if all(["internal_name" in d for d in self._mapping]):
+            internal_names = [d["internal_name"] for d in self._mapping]
+
+        elif all(["internal_name" not in d for d in self._mapping]):
+            internal_names = list(range(len(self._mapping)))
+
+
+        else:
+            err = ("Incorrectly filled mapping. Internal names should "
+                   "either be in all or in neither of the fields")
+            self._log.error(err)
+            raise Exception(err)
+
+        return internal_names
+
+    def get_mongo_names(self) -> dict:
+        '''
+        '''
+        if all(["mongo_name" in d for d in self._mapping]):
+            mongo_names = [d["mongo_name"] for d in self._mapping]
+
+        elif all(["mongo_name" not in d for d in self._mapping]):
+            mongo_names = list(range(len(self._mapping)))
+
+        else:
+            err = ("Incorrectly filled mapping. Mongo names should "
+                   "either be in all or in neither of the fields")
+            self._log.error(err)
+            raise Exception(err)
+
+        return mongo_names
 
     def get_types(self) -> dict:
         '''
@@ -134,7 +170,7 @@ class ParseMapping:
         else:
             err = ("Incorrectly filled mapping. Column numbers should ",
                    "either in all or in neither of the fields")
-            self.log.err(err)
+            self._log.err(err)
             raise Exception(err)
 
         return column_numbers
+ 38 - 0
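A short sketch (not from the commit) of the two new accessors; the mapping path is invented, while the key names follow the pattern used in the class:

from cdplib.db_migration.ParseMapping import ParseMapping

# Hypothetical mapping file; each entry may carry "internal_name" and "mongo_name" keys.
mapping = ParseMapping('migration_mappings/process_mapping.json')

# Either all entries define the key, or none do; otherwise an exception is raised.
internal_names = mapping.get_internal_names()
mongo_names = mapping.get_mongo_names()
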
cdplib/log.py

@@ -60,6 +60,44 @@ class Log():
 
 
         # self._logger.setLevel(log_level)
         # self._logger.setLevel(log_level)
 
 
+
+    @property
+    def magenta(self):
+        return '\033[95m'
+
+    @property
+    def blue(self):
+        return '\033[94m'
+
+    @property
+    def cyan(self):
+        return '\u001b[36m'
+
+    @property
+    def green(self):
+        return '\033[92m'
+
+    @property
+    def yellow(self):
+        return '\033[93m'
+
+    @property
+    def fail(self):
+        return '\033[91m'
+
+    @property
+    def reset(self):
+        return '\033[0m'
+
+    @property
+    def bold(self):
+        return '\033[1m'
+
+    @property
+    def underline(self):
+        return '\033[4m'
+
+
     def info(self, message: str):
     def info(self, message: str):
         self._logger.info(message)
         self._logger.info(message)
 
 

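A quick sketch (not part of the commit) of combining the new color properties with reset; the logger name and messages are invented:

from cdplib.log import Log

log = Log('Color Demo')

# End each colored fragment with reset so later log lines keep the default color.
log.info(log.green + 'migration finished' + log.reset)
log.warning(log.yellow + '2 documents were skipped' + log.reset)
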
+ 1 - 1
cdplib/unit_tests/TestFlattenData.py

@@ -7,7 +7,7 @@ sys.path.append(os.getcwd())
 from cdplib.log import Log
 from cdplib.FlattenData import FlattenData
 
-class TestMongodbHandler(unittest.TestCase):
+class TestFlattenData(unittest.TestCase):
 
     def setUp(self):
         self.flattener = FlattenData()

+ 26 - 0
cdplib/unit_tests/TestLog.py

@@ -0,0 +1,26 @@
+import unittest
+import sys
+import os
+from pprint import pprint
+sys.path.append(os.getcwd())
+from cdplib.log import Log
+
+
+class TestLog(unittest.TestCase):
+
+    def setUp(self):
+       self._log = Log('Log Test')
+
+    def test_A_Log_Colors(self):
+        self._log.info('Test Starts Here')
+        self._log.info(self._log.magenta + "Header")
+        self._log.info(self._log.blue + "Blue")
+        self._log.info( self._log.green + "Green")
+        self._log.info(self._log.yellow + "yellow")
+        self._log.info(self._log.fail + "Fail")
+        self._log.info(self._log.reset + "reset")
+        self._log.info(self._log.bold + "bold" )
+        self._log.info(self._log.underline + "underline" )
+
+if __name__ == '__main__':
+    unittest.main()

+ 14 - 5
cdplib/unit_tests/TestMongodbHandler.py

@@ -1,6 +1,7 @@
 import unittest
 import sys
 import os
+import time
 from pymongo import MongoClient
 sys.path.append(os.getcwd())
 from cdplib.log import Log
@@ -25,12 +26,14 @@ class TestMongodbHandler(unittest.TestCase):
         self.valid_input = {
                         "test_value_string": "test_value",
                         "test_value_double": 2.4,
-                        "test_value_double_array": [1.4, 1.6, 3.5]
+                        "test_value_double_array": [1.4, 1.6, 3.5],
+                        "test_value_date": "2020-01-28T15:45:25.000Z"
                         }
         self.invalid_input = {
                         "test_value_string": 1,
                         "test_value_double": "Wrong value",
-                        "test_value_double_array": [1.4, 1.6, 3.5]
+                        "test_value_double_array": [1.4, 1.6, 3.5],
+                        "test_value_date": "2019-01-28T15:45:25.000Z"
                         }
 
 
@@ -81,9 +84,10 @@ class TestMongodbHandler(unittest.TestCase):
         Fetch data and confirms thats it is the same as was entered into the database
         Do the same with more specific query
         '''
+
         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name, 'test_value_string', 'test_value').to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
-
+
     def test_F_aggregate_data_and_generate_dataframe(self):
         '''
         Make an aggregation call
@@ -93,7 +97,7 @@ class TestMongodbHandler(unittest.TestCase):
                                 { '$match': {}}
                                 ]
         self.assertEqual(self.mongodb_handler.aggregate_data_and_generate_dataframe(self.first_collection_name, aggregation_pipeline).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
-
+
     def test_G_update_data_in_collection(self):
         '''
         Fetch data from database
@@ -104,7 +108,7 @@ class TestMongodbHandler(unittest.TestCase):
         '''
         original_value = self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
         self.assertEqual(original_value, self.valid_input['test_value_string'])
-        self.mongodb_handler.update_data_in_collection('test_value_string', 'test_value', 'test_value_string', 'new_test_value', self.first_collection_name)
+        self.mongodb_handler.update_data_in_collection('test_value_string', 'new_test_value', self.first_collection_name, 'test_value_string', 'test_value', create_if_not_exist=False)
         new_value =  self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
         self.assertNotEqual(original_value, new_value)
 
@@ -116,6 +120,11 @@ class TestMongodbHandler(unittest.TestCase):
         index = 'test_value_string'
         self.mongodb_handler.create_index(self.first_collection_name, index)
         self.assertTrue(index in list(self.database[self.first_collection_name].index_information().keys()))
+
+    def test_I_query_data_between_dates_and_generate_dataframe(self):
+
+            data = self.mongodb_handler.query_data_between_dates_and_generate_dataframe(self.first_collection_name, "test_value_date", "2020-01-27T15:45:25.000Z", "2020-01-29T15:45:25.000Z", index ='test_value_string')
+            self.assertEqual(data['test_value_double'][0], self.valid_input['test_value_double'])
 
     def test_Y_drop_collection(self):
         '''

+ 13 - 0
hooks/README.txt

@@ -0,0 +1,13 @@
+These files are GIT HOOKS.
+
+A git hook is a script which is executed when a git command is run.
+The hook in this folder is executed when committing (pre-commit).
+
+pre-commit runs the unit tests before the changes are committed.
+
+ATTENTION!
+Changes will still be committed and pushed even if there are errors in the tests (for now at least).
+So please pay attention to the tests and make sure that they ran without any problems. If a test
+failed, please fix the issue before pushing the changes!
+
+To use them, please copy the files into your .git/hooks folder.

+ 17 - 0
hooks/pre-commit

@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Runs the test pipeline before committing
+#
+# To enable this hook, copy this file into your .git/hooks folder.
+echo
+echo "Running unit tests"
+echo
+
+python cdplib/unit_tests/TestFlattenData.py
+python cdplib/unit_tests/TestLog.py
+python cdplib/unit_tests/TestMongodbHandler.py
+
+echo
+echo
+echo -e "Unit tests have been run and your changes committed; if any of the tests failed, please correct this before pushing."
+echo

+ 5 - 20
setup.py

@@ -1,32 +1,17 @@
 from setuptools import setup,find_packages
 
 INSTALL_REQUIRES = [
-        'pycodestyle',
-        'ipykernel',
-        'spyder-kernels==0.*',
-        'cloudpickle',
-        'openpyxl',
-        'setuptools',
-        'scipy',
-        'matplotlib',
-        'tsfresh',
-        'hyperopt',
-        'xgboost',
-        'scikit-learn',
-        'pandas',
-        'pandas-compat',
-        'xmltodict',
+        'pandas',
         'sqlalchemy',
         'sqlparse',
         'pymysql',
-        'xlrd',
         'pymongo',
-        'jsonref',
-        'faker',
-        'xeger',
+        'jsonref',
         'simplejson',
         'mysql',
-        'sqlalchemy-utils',
+        'sqlalchemy_utils',
+        'sklearn',
+        'hyperopt',
 ]