65 Commits 86a7442185 ... 094f32f2fb

Author SHA1 Message Date
  tsteuer 094f32f2fb resolve conflict 4 years ago
  ogert b3f46ff21a Solved bug 4 years ago
  ogert 855ef31527 Searching for bug 4 years ago
  ogert c9b3587a76 Searching for bug 4 years ago
  ogert 8869015ff5 Searching for bug 4 years ago
  ogert 125f105bb4 Change sorting statement back... 4 years ago
  ogert a4976df51e Change sorting statement 4 years ago
  ogert fbc8b38b19 catching errors 4 years ago
  ogert ca7cd306e3 Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib 4 years ago
  ogert f713f4e759 catching errors 4 years ago
  tsteuer 1c8a94c13a update parsejsonschmea 4 years ago
  tsteuer dad695f4f5 add derefernecing in consturctor parsejasonschmea 4 years ago
  tsteuer c8d229a492 update ParseJsonScema 4 years ago
  tsteuer 975f42c281 change consturctor to load json normal 4 years ago
  ogert 247079c241 bug fix 4 years ago
  ogert e8a93f8936 Bug fixing 4 years ago
  ogert 80b58fedbe forgot to add variable in recursive function 4 years ago
  ogert 3b3c10c928 fix missing comma 4 years ago
  ogert f298651141 update test 4 years ago
  ogert b43932fd4c Add array for exceptions in to_list_of_documents 4 years ago
  ogert 709b012624 Reimplemented some functionality which was lost in a push 4 years ago
  ogert b0b6e5902f Trying to speed up convert mongo into dataframe 4 years ago
  ogert fd00b2ea13 Trying to speed up convert mongo into dataframe 4 years ago
  ogert 0683e58d87 Trying to speed up convert mongo into dataframe 4 years ago
  ogert 6e99bef206 Trying to speed up convert mongo into dataframe 4 years ago
  ogert cd918c19bc Trying to speed up convert mongo into dataframe 4 years ago
  ogert abe4223c99 Trying to speed up convert mongo into dataframe 4 years ago
  ogert a10d5cda12 Trying to speed up convert mongo into dataframe 4 years ago
  ogert da239b06ba Trying to speed up convert mongo into dataframe 4 years ago
  ogert 33bc689a6e Trying to speed up convert mongo into dataframe 4 years ago
  ogert c565344023 Trying to speed up convert mongo into dataframe 4 years ago
  ogert 5d0e3396a3 merge 4 years ago
  ogert 957402da9c fix error in unit test for log class 4 years ago
  ogert 5786080616 fix error in unit test for log class 4 years ago
  ogert 36700509ff trying to speedup convert mongo to dataframe 4 years ago
  tsteuer 85bff36ba5 add function to ParseMapping 4 years ago
  tsteuer 0ca94e96a7 update execute sqlhandler 4 years ago
  tsteuer ab48cf029e fix bug rowcount in exceute check if statement is drop or delete 4 years ago
  tsteuer 5db7bf63c6 add function get mongo names and get internal names to parseMapping 4 years ago
  ogert 6060777446 Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib 4 years ago
  ogert 97b2113a29 Silenced exception for dispose_engine 4 years ago
  tsteuer 6f1d260aeb update error file not found Parse Mapping 4 years ago
  ogert 1b7e40c095 Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib 4 years ago
  ogert ee2def1fea Silenced exception for dispose_engine 4 years ago
  tsteuer b6fe55c761 update setup.py 4 years ago
  ogert b2bfdc1673 Add print statement informning about how long it takes to convert mongo result into dataframe 4 years ago
  ogert fff0eae0e0 Add print statement informning about how long it takes to convert mongo result into dataframe 4 years ago
  ogert 331a3f84d5 Updated MongoHandlers unit test 4 years ago
  ogert d0a60567bd Add cyan color for printout 4 years ago
  ogert da868ad2fe Add colors to log class 4 years ago
  ogert 1e01c0b98c Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib 4 years ago
  ogert 2bf292390c Add color statements to the log class 4 years ago
  tsteuer 430d190a87 update SqlHandler 4 years ago
  tsteuer b1737b4746 back to beginning 4 years ago
  tsteuer 360912c7ab update execute functoin sqlHandler 4 years ago
  tsteuer e070d03523 return list of results from execute SQLHandler 4 years ago
  tsteuer 4ab964141b undo changes 4 years ago
  tsteuer e64f49fd73 adjust execute functoin in sql Handler 4 years ago
  tsteuer 097e5b9a71 change function load and parse schemafor mongo 4 years ago
  tsteuer 416eb8b9e2 add lib 4 years ago
  tsteuer bc1ccc9a5e Update 'setup.py' 4 years ago
  tsteuer 03d0622e4d Update 'setup.py' 4 years ago
  ogert 0f6321d948 Solve merge conflict 4 years ago
  ogert 29f6894900 Delete unnecessary packages from pipfile 4 years ago
  tsteuer 41277696f2 discard libaries not needed 4 years ago

+ 1 - 19
Pipfile

@@ -7,33 +7,15 @@ verify_ssl = true
 
 [packages]
 cdplib = {editable = true,git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
-pycodestyle = "*"
-ipykernel = "*"
-spyder-kernels = "==0.*"
-cloudpickle = "*"
-openpyxl = "*"
-setuptools = "*"
-scipy = "*"
-matplotlib = "*"
-tsfresh = "*"
-hyperopt = "*"
-xgboost = "*"
-scikit-learn = "*"
 pandas = "!=0.24.0"
-pandas-compat = "*"
-xmltodict = "*"
 sqlalchemy = "*"
 sqlparse = "*"
 pymysql = "*"
-xlrd = "*"
 pymongo = "*"
 jsonref = "*"
-faker = "*"
-xeger = "*"
 simplejson = "*"
 mysql = "*"
-sqlalchemy-utils = "*"
-apyori==1.1.1
+hyperopt = "*"
 
 [requires]
 python_version = "3"

The file diff has been suppressed because it is too large
+ 205 - 684
Pipfile.lock


+ 86 - 25
cdplib/db_handlers/MongodbHandler.py

@@ -13,11 +13,13 @@ Created on Mon Sep 16 13:27:44 2019
 import simplejson
 import sys
 import os
+import time
 
 import pymongo
 from pymongo import MongoClient
 import pandas as pd
 import numpy as np
+from pprint import pprint
 
 sys.path.append(os.getcwd())
 from cdplib.log import Log
@@ -171,7 +173,7 @@ class MongodbHandler:
         command = {
                     'collMod': collection_name,
                     'validator': {
-                        '$jsonSchema': parse_obj.schemas[0]
+                        '$jsonSchema': parse_obj.load_and_parse_schema_for_mongodb(schema_path)
                     },
                     'validationLevel': validation_level,
                     'validationAction': validation_action
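
For context, the collMod command assembled here is how MongoDB attaches a $jsonSchema validator to an existing collection. Below is a minimal standalone sketch of the same idea; the connection string, database and collection names, and the example schema are placeholders, not values taken from this repository.

from pymongo import MongoClient

# Hedged sketch: attach a $jsonSchema validator to an existing collection.
# All names below are placeholders.
client = MongoClient("mongodb://localhost:27017")
database = client["test_db"]

json_schema = {
    "bsonType": "object",
    "required": ["test_value_string"],
    "properties": {
        "test_value_string": {"bsonType": "string"},
        "test_value_double": {"bsonType": "double"},
    },
}

database.command({
    "collMod": "test_collection",
    "validator": {"$jsonSchema": json_schema},
    "validationLevel": "strict",   # validate all inserts and updates
    "validationAction": "error",   # reject documents that fail validation
})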
@@ -239,7 +241,22 @@ class MongodbHandler:
                 self._database[collection_name].insert_many(data, ordered=ordered)
 
         except Exception as error:
-            self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
+            if len(data) > 1:
+
+                self._log.warning(('An error occured inserting {} documents into database: {} and collection: {}.').format(len(data), self._database_name, collection_name))
+                self._log.warning('This might be because one or more documents are invalid.') 
+                self._log.warning('We will try to insert the documents one-by-one and report which are invalid.')
+                self._log.warning(('Error: {}').format(error))
+                
+                for row in data:
+
+                    try:
+                        self._database[collection_name].insert_one(row)
+                    except Exception as error:
+                        pprint(row)
+                        self._log.warning(error)
+            else:
+                self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
 
         self._log.info(('Data has been inserted into the {} collection').format(collection_name))
 
@@ -267,16 +284,22 @@ class MongodbHandler:
 
         try:
             if attribute == None or attribute_value == None:
-                data = self._database[collection_name].find({},return_values)
+                query = {}
+                data = self._database[collection_name].find(query,return_values)
+                
             else:
-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, return_values)
+                query = {attribute: {comparison_operator: attribute_value}}
+                data = self._database[collection_name].find(query, return_values)
 
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: {}:{}. \nError:{}').format(collection_name, attribute, comparison_operator, attribute_value, error))
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
-        else:
-            return data
+            return None
+
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
 
     def aggregate_data_and_generate_dataframe(self, collection_name: str, aggregation_pipeline: list, index: str = None):
 
@@ -284,11 +307,16 @@ class MongodbHandler:
             data = self._database[collection_name].aggregate(pipeline=aggregation_pipeline, allowDiskUse=True)
         except Exception as error:
             self._log.log_and_raise_error(('A problem occured when aggregating the collection {} with the pipeline {}. \nError: {}').format(collection_name, aggregation_pipeline, error))
+            return None
 
         return self.convert_mongo_data_into_dataframe(data, index, collection_name)
 
-    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None) -> pd.DataFrame():
+    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None, chunksize: int = 500) -> pd.DataFrame():
 
+        start_time = time.time()
+        '''
+        self._log.info('Converting returned mongo data into a DataFrame')
+        
         data = list(data)
         try:
             if len(data)> 0:
@@ -299,11 +327,38 @@ class MongodbHandler:
                 df = pd.DataFrame(data)
                 if index is not None:
                     df.set_index(index, inplace=True)
+
+                self._log.info(('DataFrame conversion is done, took {} seconds').format(time.time()-start_time))
                 return df
             else:
                 self._log.warning(('No data for the query was found').format())
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to convert mongo data into pd.Dataframe. \nError: {} ').format(error))
+        '''
+    
+        frames = []
+        records = []
+        for iteration, value in enumerate(data):
+
+            records.append(value)
+            if (iteration + 1) % chunksize == 0:
+                frames.append(pd.DataFrame(records))
+                records = []
+
+        if records:
+            frames.append(pd.DataFrame(records))
+
+        return_df = pd.concat(frames, axis=0, sort=False)
+
+        if index is not None:
+            return_df.set_index(index, inplace=True)
+
+        self._log.info(('{} Rows were fetched from {}. DataFrame conversion is done, took {} seconds').format(len(return_df.index), collection_name if collection_name is not None else 'the database', time.time()-start_time))
+        
+        return return_df
+
+ 
+        
 
     #def update_data_in_collection(self, query_label: str, query_value: str, update_label:str, update_value: str, collection_name:str):
     #    self._database[collection_name].update_one({query_label:query_value}, {"$set": {update_label: update_value}})
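
The chunked conversion introduced above can be read in isolation as the sketch below (a hedged, standalone version; note the parenthesised (iteration + 1) % chunksize test, which is what makes the chunk boundary fire):

import pandas as pd

def cursor_to_dataframe(cursor, index=None, chunksize=500):
    """Hedged stand-in for convert_mongo_data_into_dataframe: build the
    DataFrame from fixed-size chunks instead of one huge list of records."""
    frames, records = [], []
    for iteration, document in enumerate(cursor):
        records.append(document)
        if (iteration + 1) % chunksize == 0:   # chunk is full, flush it
            frames.append(pd.DataFrame(records))
            records = []
    if records:                                # flush the final partial chunk
        frames.append(pd.DataFrame(records))
    if not frames:
        return pd.DataFrame()
    df = pd.concat(frames, axis=0, sort=False)
    if index is not None:
        df.set_index(index, inplace=True)
    return df

# e.g. cursor_to_dataframe(iter([{"a": 1}, {"a": 2}, {"a": 3}]), chunksize=2)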
@@ -338,7 +393,7 @@ class MongodbHandler:
         '''
         return self._database[collection_name].find({query_label:query_value}).count() > 0
 
-    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_as_dataframe: bool = True):
+    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_id: bool = False, return_as_dataframe: bool = True):
         '''
             Queries data between two dates.
 
@@ -349,16 +404,20 @@ class MongodbHandler:
             :param str index:
             :param bool return_as_dataframe:
         '''
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
         try:
-            data = self._database[collection_name].find({date_label: {'$gt': from_date_value, '$lt': to_date_value}})
+            query = {date_label: {'$gt': from_date_value, '$lt': to_date_value}}
+            data = self._database[collection_name].find(query, {'_id': return_id})
 
         except Exception as error:
-            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: $gt:{}, $lt:{}. \nError:{}').format(collection_name, date_label, from_date_value, to_date_value, error))
+            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}. \nError:{}').format(collection_name, query, error))
 
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
-        else:
-            return data
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
 
     def query_oldest_or_newest_date_in_collection(self, collection_name: str, date_label: str, oldest: bool = False):
 
@@ -385,21 +444,23 @@ class MongodbHandler:
         try:
 
             if attribute == None or attribute_value == None:
-                data = self._database[collection_name].find({},{'_id': return_id}).sort(sort_label, direction).limit(limit)
+                query = {}
+                data = self._database[collection_name].find(query,{'_id': return_id}).sort(sort_label, direction).limit(limit)
             else:
-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, {'_id': return_id}).sort(sort_label, direction).limit(limit)
-
-            if len(list(data)) == 0:
-                self._log.warning('No data was found for the query')
-                return None
+                query = {attribute: {comparison_operator: attribute_value}}
+                data = self._database[collection_name].find(query, {'_id': return_id}).sort(sort_label, direction).limit(limit)
 
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to query data from {}, \nError:{}').format(collection_name, error))
 
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
         else:
-            return data
+            self._log.warning('No data was found for the query')
+            return None
 
     def update_data_in_collection(self, update_label:str, update_value: str, collection_name:str, query_label: str = None, query_value: str = None, create_if_not_exist: bool = True, find_query: dict = None, update_many: bool = False):
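
The recurring pattern in these query methods is: build the filter once, reuse it for both find() and the emptiness check, and only convert the cursor when something matched. A self-contained sketch of that pattern, with placeholder connection, database, collection, and field names:

import pandas as pd
from pymongo import MongoClient

# Hedged sketch of the query/guard pattern above; all names are placeholders.
client = MongoClient("mongodb://localhost:27017")
collection = client["test_db"]["test_collection"]

query = {"test_value_string": {"$eq": "test_value"}}
cursor = collection.find(query, {"_id": False})

if collection.count_documents(query) != 0:
    result = pd.DataFrame(list(cursor))
else:
    result = None   # the handler methods fall through and return None here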
 

+ 9 - 6
cdplib/db_handlers/SQLHandler.py

@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
 Created on Tue Sep 18 16:20:50 2018
 
@@ -211,13 +209,17 @@ class SQLHandler:
         transaction = connection.begin()
 
         errors = []
+        results = []
 
         # in the case of multi-query execute each query
         for sub_query in sqlparse.split(query):
             if len(sub_query) > 0:
                 try:
-                    connection.execute(sub_query, multi=True)
-
+                    result = connection.execute(sub_query)
+                    if result.returns_rows:
+                        data = pd.DataFrame(result.fetchall())
+                        data.columns = result.keys()
+                        results.append(data)
                 except Exception as e:
                     errors.append(str(e))
 
@@ -231,6 +233,7 @@ class SQLHandler:
 
         transaction.commit()
         connection.close()
+        return results
 
     def execute_query_from_file(self, filename: str):
         '''
@@ -441,7 +444,7 @@ class SQLHandler:
                                tablename)
 
             data = self.execute(query)
-            colnames = data.columns.tolist()
+            colnames = data[0].columns.tolist()
 
         return colnames
 
@@ -640,4 +643,4 @@ class SQLHandler:
             self._engine.dispose()
         except Exception as e:
             print(('An error occured when trying to dispose the SQL engine. Error: {}').format(e))
-            raise Exception(e)
+            raise Exception(e)
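
With this change, execute() collects one DataFrame per statement that returns rows and hands the whole list back to the caller, which is why get_column_names now indexes data[0]. A standalone sketch of the core loop, assuming SQLAlchemy 1.x-style string execution as used in the handler; the in-memory SQLite URL and the table are placeholders:

import pandas as pd
import sqlalchemy
import sqlparse

# Hedged sketch: execute a multi-statement query and collect a DataFrame
# for every statement that returns rows (placeholder in-memory database).
engine = sqlalchemy.create_engine("sqlite://")
query = "CREATE TABLE t (x INTEGER); INSERT INTO t VALUES (1); SELECT x FROM t;"

results = []
connection = engine.connect()
for sub_query in sqlparse.split(query):
    if len(sub_query) > 0:
        result = connection.execute(sub_query)
        if result.returns_rows:
            data = pd.DataFrame(result.fetchall())
            data.columns = result.keys()
            results.append(data)
connection.close()

print(results[0].columns.tolist())   # ['x'], mirroring get_column_names()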

+ 96 - 88
cdplib/db_migration/DataFrameToCollection.py

@@ -51,7 +51,8 @@ class DataFrameToCollection():
     def to_list_of_documents(self, data: pd.DataFrame,
                              grp_fields: list,
                              schema: dict = None,
-                             _final_step: bool = True) -> list:
+                             _final_step: bool = True,
+                             already_reshaped: list = []) -> list:
         '''
         Reshapes a pandas dataframe to a list of documents according
          to a complex (json) mongodb schema
@@ -84,128 +85,135 @@ class DataFrameToCollection():
             if field not in self._unroll_nested_names(data.columns):
                 continue
 
-            field_type = schema["properties"][field]["bsonType"]
+            if field in already_reshaped:
+                reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_flattened_list_of_distinct)
+                reshaped_fields.append(reshaped_field)
+            else:
+                field_type = schema["properties"][field]["bsonType"]
 
-            # if field has a simple type
-            if field_type not in ["array", "object"]:
+                # if field has a simple type
+                if field_type not in ["array", "object"]:
 
-                grp_fields = [c for c in grp_fields if c in data.columns]
+                    grp_fields = [c for c in grp_fields if c in data.columns]
 
-                # check that there is only one possible value of this field
-                n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
+                    # check that there is only one possible value of this field
+                    n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
 
-                # n_distinct_valus can be 0 if the column only contains NaN values
-                if n_distinct_values > 1:
-                    err = "Field {0} is not unique with respect to {1}"\
-                          .format(field, grp_fields)
+                    # n_distinct_valus can be 0 if the column only contains NaN values
+                    if n_distinct_values > 1:
+                        err = "Field {0} is not unique with respect to {1}"\
+                            .format(field, grp_fields)
 
-                    self._log.error(err)
-                    raise Exception(err)
+                        self._log.error(err)
+                        raise Exception(err)
 
-                if field not in grp_fields:
-                    reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
-                else:
-                    reshaped_field =\
-                        data[grp_fields].drop_duplicates()\
-                        .set_index(grp_fields, drop=False)[field]
+                    if field not in grp_fields:
+                        reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
+                    else:
+                        reshaped_field =\
+                            data[grp_fields].drop_duplicates()\
+                            .set_index(grp_fields, drop=False)[field]
 
-                reshaped_fields.append(reshaped_field)
+                    reshaped_fields.append(reshaped_field)
 
-            # if field is sub-document (dictionary)
-            elif field_type == "object":
+                # if field is sub-document (dictionary)
+                elif field_type == "object":
 
-                sub_schema = deepcopy(schema["properties"][field])
+                    sub_schema = deepcopy(schema["properties"][field])
 
-                # rename sub-schema properties to match with data column names
-                sub_schema["properties"] =\
-                    {".".join([field, k]): v for k, v
-                     in sub_schema["properties"].items()}
+                    # rename sub-schema properties to match with data column names
+                    sub_schema["properties"] =\
+                        {".".join([field, k]): v for k, v
+                        in sub_schema["properties"].items()}
 
-                sub_data = self.to_list_of_documents(
-                            data=data,
-                            schema=sub_schema,
-                            grp_fields=grp_fields,
-                            _final_step=False)
+                    sub_data = self.to_list_of_documents(
+                                data=data,
+                                schema=sub_schema,
+                                grp_fields=grp_fields,
+                                _final_step=False,
+                                already_reshaped=already_reshaped)
 
-                # Need to be checked since child elements can be empty
-                if sub_data is not None:
+                    # Need to be checked since child elements can be empty
+                    if sub_data is not None:
 
-                    reshaped_field = sub_data.apply(self._make_dict, axis=1)
-                    reshaped_field.name = field
+                        reshaped_field = sub_data.apply(self._make_dict, axis=1)
+                        reshaped_field.name = field
 
-                    reshaped_fields.append(reshaped_field)
+                        reshaped_fields.append(reshaped_field)
 
-            # if field is a list of dictionaries
-            elif field_type == "array":
+                # if field is a list of dictionaries
+                elif field_type == "array":
 
 
-                items_type = schema["properties"][field]["items"]["bsonType"]
+                    items_type = schema["properties"][field]["items"]["bsonType"]
 
-                if items_type == "object":
-                    array_object = time.time()
-                    sub_schema = deepcopy(schema["properties"][field]["items"])
+                    if items_type == "object":
+                        array_object = time.time()
+                        sub_schema = deepcopy(schema["properties"][field]["items"])
 
-                    # rename sub-schema properties to match data column names
-                    sub_schema["properties"] =\
-                        {".".join([field, k]): v for k, v in
-                         sub_schema["properties"].items()}
+                        # rename sub-schema properties to match data column names
+                        sub_schema["properties"] =\
+                            {".".join([field, k]): v for k, v in
+                            sub_schema["properties"].items()}
 
-                    # extend grp fields by sub-fields of field simple types
-                    sub_grp_fields = [f for f in sub_schema["properties"]
-                                      if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
-                                      and (f in data.columns)]
+                        # extend grp fields by sub-fields of field simple types
+                        sub_grp_fields = [f for f in sub_schema["properties"]
+                                        if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
+                                        and (f in data.columns)]
 
-                    if len(sub_grp_fields) == 0:
-                        err = ("One of the sub-keys in a list of documents"
-                               " must be of simple type for the field {}"
-                               .format(field))
+                        if len(sub_grp_fields) == 0:
+                            err = ("One of the sub-keys in a list of documents"
+                                " must be of simple type for the field {}"
+                                .format(field))
 
-                        self._log.error(err)
-                        raise Exception(err)
+                            self._log.error(err)
+                            raise Exception(err)
 
-                    # group and reshape sub-fields with complex types
-                    sub_data = self.to_list_of_documents(
-                                data=data,
-                                schema=sub_schema,
-                                grp_fields=grp_fields + sub_grp_fields,
-                                _final_step=False)
+                        # group and reshape sub-fields with complex types
+                        sub_data = self.to_list_of_documents(
+                                    data=data,
+                                    schema=sub_schema,
+                                    grp_fields=grp_fields + sub_grp_fields,
+                                    _final_step=False,
+                                    already_reshaped=already_reshaped)
 
-                    if sub_data is not None:
+                        if sub_data is not None:
 
-                        # gether the results into a list of dictionaries
-                        sub_data = sub_data.apply(self._make_dict, axis=1)
+                            # gether the results into a list of dictionaries
+                            sub_data = sub_data.apply(self._make_dict, axis=1)
 
-                        sub_data.name = field
-                        sub_data = sub_data.reset_index(grp_fields)
-                        ######################################################
-                        ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
-                        reshaped_field =\
-                            sub_data.groupby(grp_fields, sort=False)[field]\
-                                    .apply(self._make_list_of_distinct)
-                        ######################################################
-                        reshaped_fields.append(reshaped_field)
+                            sub_data.name = field
+                            sub_data = sub_data.reset_index(grp_fields)
+                            ######################################################
+                            ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
+                            reshaped_field =\
+                                sub_data.groupby(grp_fields, sort=False)[field]\
+                                        .apply(self._make_list_of_distinct)
+                            ######################################################
+                            reshaped_fields.append(reshaped_field)
 
 
-                # if field is a list of values with simple type
-                elif items_type == "array":
-                    grp_fields = [c for c in grp_fields if c in data.columns]
+                    # if field is a list of values with simple type
+                    elif items_type == "array":
+                        grp_fields = [c for c in grp_fields if c in data.columns]
 
-                    if field in data.columns:
+                        if field in data.columns:
 
-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
-                                             .apply(self._make_list_of_distinct)
+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_list_of_distinct)
 
-                        reshaped_fields.append(reshaped_field)
-                else:
+                            reshaped_fields.append(reshaped_field)
+                    else:
 
-                    grp_fields = [c for c in grp_fields if c in data.columns]
+                        grp_fields = [c for c in grp_fields if c in data.columns]
 
-                    if field in data.columns:
+                        if field in data.columns:
 
-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
-                                             .apply(self._make_flattened_list_of_distinct)
+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_flattened_list_of_distinct)
 
-                        reshaped_fields.append(reshaped_field)
+                            reshaped_fields.append(reshaped_field)
 
         if len(reshaped_fields) > 0:
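
The new already_reshaped argument lets callers mark fields that were already brought into their final nested form in an earlier pass: for such a field the method skips the schema-driven branching entirely and only groups it, collecting the distinct values per group. A simplified, self-contained illustration of that shortcut; the helper below is a stand-in for the class's own _make_flattened_list_of_distinct, and the data is made up:

import pandas as pd

def make_flattened_list_of_distinct(values):
    # simplified stand-in for DataFrameToCollection._make_flattened_list_of_distinct
    distinct = []
    for value in values:
        items = value if isinstance(value, list) else [value]
        for item in items:
            if item not in distinct:
                distinct.append(item)
    return distinct

data = pd.DataFrame({"order": [1, 1, 2],
                     "tags": [["a"], ["a", "b"], ["c"]]})

# what the already_reshaped branch effectively does for such a field:
reshaped = data.groupby(["order"], sort=False)["tags"]\
               .apply(make_flattened_list_of_distinct)
print(reshaped.to_dict())   # {1: ['a', 'b'], 2: ['c']}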
 

+ 2 - 2
cdplib/db_migration/MigrationCleaning.py

@@ -358,11 +358,11 @@ class MigrationCleaning:
                 elif python_type == bool:
 
                     data[column] = data[column].str.lower()
-                    accepted_bool = {'ja': True, 'j': True, '1': True,
+                    accepted_bool = {'ja': True, 'j': True, '1': True, 1:True,
                                      'yes': True, 'y': True, 'true':True,
                                      't': True, 'nein': False, 'n': False,
                                      'no': False, 'false': False, 'f': False,
-                                     '0': False}
+                                     '0': False, 0:False}
                     data[column] = data[column].map(accepted_bool)
                     data[column] = data[column].astype(bool)
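
A small standalone illustration of the extended boolean table: it now also accepts the integers 0 and 1, so columns arriving with mixed string and integer values still map cleanly. The sample column below is made up, and the lower-casing is done per value here because .str.lower() would turn the integers into NaN:

import pandas as pd

accepted_bool = {'ja': True, 'j': True, '1': True, 1: True,
                 'yes': True, 'y': True, 'true': True,
                 't': True, 'nein': False, 'n': False,
                 'no': False, 'false': False, 'f': False,
                 '0': False, 0: False}

column = pd.Series(['Ja', 'no', 1, '0'])
column = column.map(lambda v: v.lower() if isinstance(v, str) else v)
column = column.map(accepted_bool).astype(bool)
print(column.tolist())   # [True, False, True, False]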
 

+ 39 - 18
cdplib/db_migration/ParseJsonSchema.py

@@ -51,15 +51,17 @@ class ParseJsonSchema(ParseDbSchema):
                 with open(schema_path, "r") as f:
                     schema = json.load(f)
 
-                ref_flag = self._analyze_schema(schema)
-
-                if ref_flag:
-                    schema = self._format_schema_for_mongo(schema)
+                definitions_flag = self._analyze_schema(schema)
+                
+                if definitions_flag:
+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
                     schema = self._dereference_schema(schema)
-                    schema = self._format_schema_for_mongo(schema)
+                    # Need to do it again since sub schema could also contain
+                    # single quotes
+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
                     self.schemas.append(schema)
+                    
                 else:
-                    schema = self._format_schema_for_mongo(schema)
                     self.schemas.append(schema)
 
             except Exception as e:
@@ -199,7 +201,7 @@ class ParseJsonSchema(ParseDbSchema):
                                  required_only=required_only)
 
         for schema in schemas[1:]:
-
+            
             next_result = self._parse_one(schema=schema,
                                           field_info=field_info,
                                           required_only=required_only)
@@ -340,7 +342,7 @@ class ParseJsonSchema(ParseDbSchema):
 
         return already_parsed
 
-    def read_schema_and_parse_for_mongodb(self, schema_path: str) -> dict:
+    def load_and_parse_schema_for_mongodb(self, schema_path: str) -> dict:
         '''
         We need to deference json before import to Mongo DB pymongo can't deal with references
         :param str schema_path: path to the schema file.
@@ -353,9 +355,16 @@ class ParseJsonSchema(ParseDbSchema):
             schema = json.load(json_file)
 
         definitions_flag = self._analyze_schema(schema)
-
+        
         if definitions_flag:
+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
             schema = self._dereference_schema(schema)
+             # Need to do it again since sub schema could also contain
+             # single quotes
+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
+            schema = self._format_schema_for_mongo(schema)
+        else:
+            schema = self._format_schema_for_mongo(schema)
 
         return schema
 
@@ -371,12 +380,11 @@ class ParseJsonSchema(ParseDbSchema):
                 definitions_flag = self._analyze_schema(schema[key], definitions_flag)
 
         return definitions_flag
-
-    def _format_schema_for_mongo(self, schema: dict) -> dict:
+    
+    
+    def _clean_desciptions_tags_from_single_quotes(self, schema: dict) -> dict:
         '''
-        We use in the schema tags whih are not supported by mongo an threfore
-        must be taken care of before setting the schema for mongo.
-        :param str schema_path: path to the schema file.
+        :param dict schema: dictonary containing schema
         '''
 
         for key in list(schema):
@@ -386,14 +394,27 @@ class ParseJsonSchema(ParseDbSchema):
                 schema[key] = cleaned_description
 
             if type(schema[key]) == dict:
-                self._format_schema_for_mongo(schema[key])
+                self._clean_desciptions_tags_from_single_quotes(schema[key])
+                
+        return schema
 
-            if key == 'examples':
-                self._remove_examples(schema)
+    def _format_schema_for_mongo(self, schema: dict) -> dict:
+        '''
+        We use in the schema tags whih are not supported by mongo an threfore
+        must be taken care of before setting the schema for mongo.
+        :param str schema_path: path to the schema file.
+        '''
 
+        for key in list(schema):
+
+            if type(schema[key]) == dict:
+                self._format_schema_for_mongo(schema[key])
 
             if key == 'default' or key == 'default_values':
                 self._remove_defaults(schema)
+                
+            if key == 'examples':
+                self._remove_examples(schema)
 
         return schema
 
@@ -411,7 +432,7 @@ class ParseJsonSchema(ParseDbSchema):
         schema = str(schema).replace("'", "\"")
         schema = jsonref.loads(schema, base_uri=base_dir_url)
         schema = deepcopy(schema)
-        #schema.pop('definitions', None)
+
         return schema
 
     def _remove_defaults(self, schema: dict) -> dict:
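
The dereferencing step relies on jsonref resolving "$ref" pointers (for example into a definitions block), because MongoDB's $jsonSchema cannot follow references. The class serialises the schema with str() and swaps single quotes for double quotes before calling jsonref.loads, which is why description texts containing single quotes have to be cleaned first. A standalone sketch of the dereferencing itself, using json.dumps and a made-up schema:

import json
import os
from copy import deepcopy

import jsonref

schema = {
    "definitions": {"quantity": {"bsonType": "double"}},
    "bsonType": "object",
    "properties": {"test_value_double": {"$ref": "#/definitions/quantity"}},
}

base_dir_url = "file://" + os.getcwd() + "/"
dereferenced = jsonref.loads(json.dumps(schema), base_uri=base_dir_url)
dereferenced = deepcopy(dereferenced)   # materialise the lazy proxies, as the class does

print(dereferenced["properties"]["test_value_double"])   # {'bsonType': 'double'}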

+ 40 - 4
cdplib/db_migration/ParseMapping.py

@@ -22,10 +22,10 @@ class ParseMapping:
         import json
         from cdplib.log import Log
 
-        self.log = Log('Parse Mapping')
+        self._log = Log('Parse Mapping')
 
         if not os.path.isfile(mapping_path):
-            err = "Mapping not found"
+            err = "Mapping not found "+mapping_path
             self._log.error(err)
             raise FileNotFoundError(err)
 
@@ -34,7 +34,7 @@ class ParseMapping:
                 self._mapping = json.load(f)
 
         except Exception as e:
-            err = ("Could not load mapping. "
+            err = ("Could not load mapping. " + mapping_path +
                    "Exit with error {}".format(e))
             self._log.error(err)
             raise Exception(err)
@@ -97,6 +97,42 @@ class ParseMapping:
         '''
         '''
         return self._get_info(key="date_format")
+    
+    def get_internal_names(self) -> dict:
+        '''
+        '''
+ 
+        if all(["internal_name" in d for d in self._mapping]):
+            internal_names = [d["internal_name"] for d in self._mapping]
+    
+        elif all(["internal_name" not in d for d in self._mapping]):
+            internal_names = list(range(len(self._mapping)))
+
+
+        else:
+            err = ("Incorrectly filled mapping. Internal names should "
+                   "either be in all or in neither of the fields")
+            self._log.error(err)
+            raise Exception(err)
+
+        return internal_names
+
+    def get_mongo_names(self) -> dict:
+        '''
+        '''
+        if all(["mongo_name" in d for d in self._mapping]):
+            mongo_names = [d["mongo_name"] for d in self._mapping]
+
+        elif all(["mongo_name" not in d for d in self._mapping]):
+            mongo_names = list(range(len(self._mapping)))
+
+        else:
+            err = ("Incorrectly filled mapping. Mongo names should "
+                   "either be in all or in neither of the fields")
+            self._log.error(err)
+            raise Exception(err)
+
+        return mongo_names
 
     def get_types(self) -> dict:
         '''
@@ -134,7 +170,7 @@ class ParseMapping:
         else:
             err = ("Incorrectly filled mapping. Column numbers should ",
                    "either in all or in neither of the fields")
-            self.log.err(err)
+            self._log.err(err)
             raise Exception(err)
 
         return column_numbers
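
A hedged usage sketch for the two new helpers: a mapping is a JSON list of field descriptions, and either every entry carries the name key or none of them does, otherwise an exception is raised. The constructor is assumed here to take just the mapping path, with any other parameters left at their defaults; the import path is inferred from the file layout, and the mapping content and temp-file handling are made up for the example.

import json
import tempfile

from cdplib.db_migration.ParseMapping import ParseMapping  # path assumed from the repo layout

mapping = [
    {"mongo_name": "test_value_string", "internal_name": "value_string"},
    {"mongo_name": "test_value_double", "internal_name": "value_double"},
]

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as handle:
    json.dump(mapping, handle)
    mapping_path = handle.name

parser = ParseMapping(mapping_path)
print(parser.get_mongo_names())      # ['test_value_string', 'test_value_double']
print(parser.get_internal_names())   # ['value_string', 'value_double']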

+ 38 - 0
cdplib/log.py

@@ -60,6 +60,44 @@ class Log():
 
         # self._logger.setLevel(log_level)
 
+
+    @property
+    def magenta(self):
+        return '\033[95m'
+
+    @property
+    def blue(self):
+        return '\033[94m'
+
+    @property
+    def cyan(self):
+        return '\u001b[36m'
+
+    @property
+    def green(self):
+        return '\033[92m'
+
+    @property
+    def yellow(self):
+        return '\033[93m'
+
+    @property
+    def fail(self):
+        return '\033[91m'
+
+    @property
+    def reset(self):
+        return '\033[0m'
+
+    @property
+    def bold(self):
+        return '\033[1m'
+
+    @property
+    def underline(self):
+        return '\033[4m'
+
+
     def info(self, message: str):
         self._logger.info(message)
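
The new properties are plain ANSI escape sequences, so they can be concatenated straight into a message and closed with reset, as the TestLog unit test further down also does. A minimal usage sketch:

import os
import sys

sys.path.append(os.getcwd())
from cdplib.log import Log

log = Log('Color Demo')
log.info(log.cyan + "cyan text" + log.reset)
log.warning(log.yellow + log.bold + "bold yellow warning" + log.reset)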
 

+ 1 - 1
cdplib/unit_tests/TestFlattenData.py

@@ -7,7 +7,7 @@ sys.path.append(os.getcwd())
 from cdplib.log import Log
 from cdplib.FlattenData import FlattenData
 
-class TestMongodbHandler(unittest.TestCase):
+class TestFlattenData(unittest.TestCase):
 
     def setUp(self):
         self.flattener = FlattenData()

+ 26 - 0
cdplib/unit_tests/TestLog.py

@@ -0,0 +1,26 @@
+import unittest
+import sys
+import os
+from pprint import pprint
+sys.path.append(os.getcwd())
+from cdplib.log import Log
+
+
+class TestLog(unittest.TestCase):
+
+    def setUp(self):
+       self._log = Log('Log Test')
+
+    def test_A_Log_Colors(self):
+        self._log.info('Test Starts Here')
+        self._log.info(self._log.magenta + "Header")
+        self._log.info(self._log.blue + "Blue")
+        self._log.info( self._log.green + "Green")
+        self._log.info(self._log.yellow + "yellow")
+        self._log.info(self._log.fail + "Fail")
+        self._log.info(self._log.reset + "reset")
+        self._log.info(self._log.bold + "bold" )
+        self._log.info(self._log.underline + "underline" )
+
+if __name__ == '__main__':
+    unittest.main()

+ 14 - 5
cdplib/unit_tests/TestMongodbHandler.py

@@ -1,6 +1,7 @@
 import unittest
 import sys
 import os
+import time
 from pymongo import MongoClient
 sys.path.append(os.getcwd())
 from cdplib.log import Log
@@ -25,12 +26,14 @@ class TestMongodbHandler(unittest.TestCase):
         self.valid_input = {
                         "test_value_string": "test_value",
                         "test_value_double": 2.4,
-                        "test_value_double_array": [1.4, 1.6, 3.5]
+                        "test_value_double_array": [1.4, 1.6, 3.5],
+                        "test_value_date": "2020-01-28T15:45:25.000Z"
                         }
         self.invalid_input = {
                         "test_value_string": 1,
                         "test_value_double": "Wrong value",
-                        "test_value_double_array": [1.4, 1.6, 3.5]
+                        "test_value_double_array": [1.4, 1.6, 3.5],
+                        "test_value_date": "2019-01-28T15:45:25.000Z"
                         }
 
 
@@ -81,9 +84,10 @@ class TestMongodbHandler(unittest.TestCase):
         Fetch data and confirms thats it is the same as was entered into the database
         Do the same with more specific query
         '''
+
         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name, 'test_value_string', 'test_value').to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
-    
+
     def test_F_aggregate_data_and_generate_dataframe(self):
         '''
         Make an aggregation call
@@ -93,7 +97,7 @@ class TestMongodbHandler(unittest.TestCase):
                                 { '$match': {}}
                                 ]
         self.assertEqual(self.mongodb_handler.aggregate_data_and_generate_dataframe(self.first_collection_name, aggregation_pipeline).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
-    
+
     def test_G_update_data_in_collection(self):
         '''
         Fetch data from database
@@ -104,7 +108,7 @@ class TestMongodbHandler(unittest.TestCase):
         '''
         original_value = self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
         self.assertEqual(original_value, self.valid_input['test_value_string'])
-        self.mongodb_handler.update_data_in_collection('test_value_string', 'test_value', 'test_value_string', 'new_test_value', self.first_collection_name)
+        self.mongodb_handler.update_data_in_collection('test_value_string', 'new_test_value', self.first_collection_name, 'test_value_string', 'test_value', create_if_not_exist=False)
         new_value =  self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
         self.assertNotEqual(original_value, new_value)
 
@@ -116,6 +120,11 @@ class TestMongodbHandler(unittest.TestCase):
         index = 'test_value_string'
         self.mongodb_handler.create_index(self.first_collection_name, index)
         self.assertTrue(index in list(self.database[self.first_collection_name].index_information().keys()))
+
+    def test_I_query_data_between_dates_and_generate_dataframe(self):
+
+            data = self.mongodb_handler.query_data_between_dates_and_generate_dataframe(self.first_collection_name, "test_value_date", "2020-01-27T15:45:25.000Z", "2020-01-29T15:45:25.000Z", index ='test_value_string')
+            self.assertEqual(data['test_value_double'][0], self.valid_input['test_value_double'])
     
     def test_Y_drop_collection(self):
         '''

+ 13 - 0
hooks/README.txt

@@ -0,0 +1,13 @@
+These files are GIT HOOKS.
+
+A git hook is a script that is executed when a git command is run.
+The hook in this folder is executed when committing (pre-commit).
+
+pre-commit executes the unit tests before committing the changes.
+
+ATTENTION!
+Changes will still be committed and pushed even if there are errors in the tests (for now at least).
+So please pay attention to the tests and make sure that they run without any problems. If a test
+fails, please fix the issue before pushing the changes!
+
+To use it, please copy the files into your .git/hooks folder.

+ 17 - 0
hooks/pre-commit

@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Runs test pipeline before commiting
+#
+# To enable this hook, rename this file to "pre-commit".
+echo
+echo "Running unit tests"
+echo
+
+python cdplib/unit_tests/TestFlattenData.py
+python cdplib/unit_tests/TestLog.py
+python cdplib/unit_tests/TestMongodbHandler.py
+
+echo 
+echo
+echo -e "\033 Unit tests have been run and your changes committed; in case any of the tests failed, please correct this before pushing changes"
+echo
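
As the README notes, the hook currently lets the commit through even when a test fails. Should it ever be made blocking, the exit status is what matters; a hedged Python alternative that discovers and runs the same test modules and exits non-zero on failure:

import unittest

# Hedged sketch: discover and run everything under cdplib/unit_tests/ and
# signal failure through the exit code (which a pre-commit hook can act on).
suite = unittest.defaultTestLoader.discover("cdplib/unit_tests", pattern="Test*.py")
result = unittest.TextTestRunner(verbosity=2).run(suite)
raise SystemExit(0 if result.wasSuccessful() else 1)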

+ 5 - 20
setup.py

@@ -1,32 +1,17 @@
 from setuptools import setup,find_packages
 
 INSTALL_REQUIRES = [
-        'pycodestyle',
-        'ipykernel',
-  		'spyder-kernels==0.*',
-        'cloudpickle',
-        'openpyxl',
-        'setuptools',
-        'scipy',
-        'matplotlib',
-        'tsfresh',
-        'hyperopt',
-        'xgboost',
-        'scikit-learn',
-  		'pandas',
-        'pandas-compat',
-        'xmltodict',
+        'pandas',
         'sqlalchemy',
         'sqlparse',
         'pymysql',
-        'xlrd',
         'pymongo',
-        'jsonref', 
-        'faker',
-        'xeger',
+        'jsonref',
         'simplejson',
         'mysql',
-        'sqlalchemy-utils',
+        'sqlalchemy_utils',
+        'sklearn',
+        'hyperopt',
 ]