65 Commits 86a7442185 ... 094f32f2fb

Author SHA1 Message Date
  tsteuer 094f32f2fb resolve conflict 4 years ago
  ogert b3f46ff21a Solved bug 4 years ago
  ogert 855ef31527 Searching for bug 4 years ago
  ogert c9b3587a76 Searching for bug 4 years ago
  ogert 8869015ff5 Searching for bug 4 years ago
  ogert 125f105bb4 Change sorting statement back... 4 years ago
  ogert a4976df51e Change sorting statement 4 years ago
  ogert fbc8b38b19 catching errors 4 years ago
  ogert ca7cd306e3 Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib 4 years ago
  ogert f713f4e759 catching errors 4 years ago
  tsteuer 1c8a94c13a update parsejsonschema 4 years ago
  tsteuer dad695f4f5 add dereferencing in constructor parsejsonschema 4 years ago
  tsteuer c8d229a492 update ParseJsonSchema 4 years ago
  tsteuer 975f42c281 change constructor to load json normally 4 years ago
  ogert 247079c241 bug fix 4 years ago
  ogert e8a93f8936 Bug fixing 4 years ago
  ogert 80b58fedbe forgot to add variable in recursive function 4 years ago
  ogert 3b3c10c928 fix missing comma 4 years ago
  ogert f298651141 update test 4 years ago
  ogert b43932fd4c Add array for exceptions in to_list_of_documents 4 years ago
  ogert 709b012624 Reimplemented some functionality which was lost in a push 4 years ago
  ogert b0b6e5902f Trying to speed up convert mongo into dataframe 4 years ago
  ogert fd00b2ea13 Trying to speed up convert mongo into dataframe 4 years ago
  ogert 0683e58d87 Trying to speed up convert mongo into dataframe 4 years ago
  ogert 6e99bef206 Trying to speed up convert mongo into dataframe 4 years ago
  ogert cd918c19bc Trying to speed up convert mongo into dataframe 4 years ago
  ogert abe4223c99 Trying to speed up convert mongo into dataframe 4 years ago
  ogert a10d5cda12 Trying to speed up convert mongo into dataframe 4 years ago
  ogert da239b06ba Trying to speed up convert mongo into dataframe 4 years ago
  ogert 33bc689a6e Trying to speed up convert mongo into dataframe 4 years ago
  ogert c565344023 Trying to speed up convert mongo into dataframe 4 years ago
  ogert 5d0e3396a3 merge 4 years ago
  ogert 957402da9c fix error in unit test for log class 4 years ago
  ogert 5786080616 fix error in unit test for log class 4 years ago
  ogert 36700509ff trying to speedup convert mongo to dataframe 4 years ago
  tsteuer 85bff36ba5 add function to ParseMapping 4 years ago
  tsteuer 0ca94e96a7 update execute sqlhandler 4 years ago
  tsteuer ab48cf029e fix rowcount bug in execute: check if statement is drop or delete 4 years ago
  tsteuer 5db7bf63c6 add function get mongo names and get internal names to parseMapping 4 years ago
  ogert 6060777446 Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib 4 years ago
  ogert 97b2113a29 Silenced exception for dispose_engine 4 years ago
  tsteuer 6f1d260aeb update error file not found Parse Mapping 4 years ago
  ogert 1b7e40c095 Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib 4 years ago
  ogert ee2def1fea Silenced exception for dispose_engine 4 years ago
  tsteuer b6fe55c761 update setup.py 4 years ago
  ogert b2bfdc1673 Add print statement informing about how long it takes to convert mongo result into dataframe 4 years ago
  ogert fff0eae0e0 Add print statement informing about how long it takes to convert mongo result into dataframe 4 years ago
  ogert 331a3f84d5 Updated MongoHandlers unit test 4 years ago
  ogert d0a60567bd Add cyan color for printout 4 years ago
  ogert da868ad2fe Add colors to log class 4 years ago
  ogert 1e01c0b98c Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib 4 years ago
  ogert 2bf292390c Add color statements to the log class 4 years ago
  tsteuer 430d190a87 update SqlHandler 4 years ago
  tsteuer b1737b4746 back to beginning 4 years ago
  tsteuer 360912c7ab update execute function sqlHandler 4 years ago
  tsteuer e070d03523 return list of results from execute SQLHandler 4 years ago
  tsteuer 4ab964141b undo changes 4 years ago
  tsteuer e64f49fd73 adjust execute function in sql Handler 4 years ago
  tsteuer 097e5b9a71 change function load and parse schema for mongo 4 years ago
  tsteuer 416eb8b9e2 add lib 4 years ago
  tsteuer bc1ccc9a5e Update 'setup.py' 4 years ago
  tsteuer 03d0622e4d Update 'setup.py' 4 years ago
  ogert 0f6321d948 Solve merge conflict 4 years ago
  ogert 29f6894900 Delete unnecessary packages from pipfile 4 years ago
  tsteuer 41277696f2 discard libraries not needed 4 years ago

+ 1 - 19
Pipfile

@@ -7,33 +7,15 @@ verify_ssl = true
 
 [packages]
 cdplib = {editable = true,git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
-pycodestyle = "*"
-ipykernel = "*"
-spyder-kernels = "==0.*"
-cloudpickle = "*"
-openpyxl = "*"
-setuptools = "*"
-scipy = "*"
-matplotlib = "*"
-tsfresh = "*"
-hyperopt = "*"
-xgboost = "*"
-scikit-learn = "*"
 pandas = "!=0.24.0"
-pandas-compat = "*"
-xmltodict = "*"
 sqlalchemy = "*"
 sqlparse = "*"
 pymysql = "*"
-xlrd = "*"
 pymongo = "*"
 jsonref = "*"
-faker = "*"
-xeger = "*"
 simplejson = "*"
 mysql = "*"
-sqlalchemy-utils = "*"
-apyori==1.1.1
+hyperopt = "*"
 
 [requires]
 python_version = "3"

File diff suppressed because it is too large
+ 205 - 684
Pipfile.lock


+ 86 - 25
cdplib/db_handlers/MongodbHandler.py

@@ -13,11 +13,13 @@ Created on Mon Sep 16 13:27:44 2019
 import simplejson
 import sys
 import os
+import time
 
 import pymongo
 from pymongo import MongoClient
 import pandas as pd
 import numpy as np
+from pprint import pprint
 
 sys.path.append(os.getcwd())
 from cdplib.log import Log
@@ -171,7 +173,7 @@ class MongodbHandler:
         command = {
                     'collMod': collection_name,
                     'validator': {
-                        '$jsonSchema': parse_obj.schemas[0]
+                        '$jsonSchema': parse_obj.load_and_parse_schema_for_mongodb(schema_path)
                     },
                     'validationLevel': validation_level,
                     'validationAction': validation_action
@@ -239,7 +241,22 @@ class MongodbHandler:
                 self._database[collection_name].insert_many(data, ordered=ordered)
 
         except Exception as error:
-            self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
+            if len(data) > 1:
+
+                self._log.warning(('An error occured inserting {} documents into database: {} and collection: {}.').format(len(data), self._database_name, collection_name))
+                self._log.warning('This might be because one or more documents are invalid.')
+                self._log.warning('We will try to insert the documents one-by-one and report which are invalid.')
+                self._log.warning(('Error: {}').format(error))
+
+                for row in data:
+
+                    try:
+                        self._database[collection_name].insert_one(row)
+                    except Exception as error:
+                        pprint(row)
+                        self._log.warning(error)
+            else:
+                self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
 
         self._log.info(('Data has been inserted into the {} collection').format(collection_name))
 
@@ -267,16 +284,22 @@
 
         try:
             if attribute == None or attribute_value == None:
-                data = self._database[collection_name].find({},return_values)
+                query = {}
+                data = self._database[collection_name].find(query,return_values)
+
             else:
-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, return_values)
+                query = {attribute: {comparison_operator: attribute_value}}
+                data = self._database[collection_name].find(query, return_values)
 
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: {}:{}. \nError:{}').format(collection_name, attribute, comparison_operator, attribute_value, error))
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
-        else:
-            return data
+            return None
+
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
 
     def aggregate_data_and_generate_dataframe(self, collection_name: str, aggregation_pipeline: list, index: str = None):
 
@@ -284,11 +307,16 @@
             data = self._database[collection_name].aggregate(pipeline=aggregation_pipeline, allowDiskUse=True)
         except Exception as error:
             self._log.log_and_raise_error(('A problem occured when aggregating the collection {} with the pipeline {}. \nError: {}').format(collection_name, aggregation_pipeline, error))
+            return None
 
         return self.convert_mongo_data_into_dataframe(data, index, collection_name)
 
-    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None) -> pd.DataFrame():
+    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None, chunksize: int = 500) -> pd.DataFrame():
 
+        start_time = time.time()
+        '''
+        self._log.info('Converting returned mongo data into a DataFrame')
+        
         data = list(data)
         try:
             if len(data)> 0:
@@ -299,11 +327,38 @@
                 df = pd.DataFrame(data)
                 if index is not None:
                     df.set_index(index, inplace=True)
+
+                self._log.info(('DataFrame conversion is done, took {} seconds').format(time.time()-start_time))
                 return df
             else:
                 self._log.warning(('No data for the query was found').format())
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to convert mongo data into pd.Dataframe. \nError: {} ').format(error))
+        '''
+
+        frames = []
+        records = []
+        for iteration, value in enumerate(data):
+
+            records.append(value)
+            if iteration + 1 % chunksize == 0:
+                frames.append(pd.DataFrame(records))
+                records = []
+
+        if records:
+            frames.append(pd.DataFrame(records))
+
+        return_df = pd.concat(frames, axis=0, sort=False)
+
+        if index is not None:
+            return_df.set_index(index, inplace=True)
+
+        self._log.info(('{} Rows were fetched from {}. DataFrame conversion is done, took {} seconds').format(len(return_df.index), collection_name if collection_name is not None else 'the database', time.time()-start_time))
+
+        return return_df
+
+
+
 
     #def update_data_in_collection(self, query_label: str, query_value: str, update_label:str, update_value: str, collection_name:str):
     #    self._database[collection_name].update_one({query_label:query_value}, {"$set": {update_label: update_value}})
@@ -338,7 +393,7 @@ class MongodbHandler:
         '''
         return self._database[collection_name].find({query_label:query_value}).count() > 0
 
-    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_as_dataframe: bool = True):
+    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_id: bool = False, return_as_dataframe: bool = True):
         '''
             Queries data between two dates.
 
@@ -349,16 +404,20 @@
             :param str index:
             :param bool return_as_dataframe:
         '''
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
         try:
-            data = self._database[collection_name].find({date_label: {'$gt': from_date_value, '$lt': to_date_value}})
+            query = {date_label: {'$gt': from_date_value, '$lt': to_date_value}}
+            data = self._database[collection_name].find(query, {'_id': return_id})
 
         except Exception as error:
-            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: $gt:{}, $lt:{}. \nError:{}').format(collection_name, date_label, from_date_value, to_date_value, error))
+            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}. \nError:{}').format(collection_name, query, error))
 
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
-        else:
-            return data
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
 
     def query_oldest_or_newest_date_in_collection(self, collection_name: str, date_label: str, oldest: bool = False):
 
@@ -385,21 +444,23 @@
         try:
 
             if attribute == None or attribute_value == None:
-                data = self._database[collection_name].find({},{'_id': return_id}).sort(sort_label, direction).limit(limit)
+                query = {}
+                data = self._database[collection_name].find(query,{'_id': return_id}).sort(sort_label, direction).limit(limit)
             else:
-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, {'_id': return_id}).sort(sort_label, direction).limit(limit)
-
-            if len(list(data)) == 0:
-                self._log.warning('No data was found for the query')
-                return None
+                query = {attribute: {comparison_operator: attribute_value}}
+                data = self._database[collection_name].find(query, {'_id': return_id}).sort(sort_label, direction).limit(limit)
 
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to query data from {}, \nError:{}').format(collection_name, error))
 
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
         else:
-            return data
+            self._log.warning('No data was found for the query')
+            return None
 
     def update_data_in_collection(self, update_label:str, update_value: str, collection_name:str, query_label: str = None, query_value: str = None, create_if_not_exist: bool = True, find_query: dict = None, update_many: bool = False):
 

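The reworked convert_mongo_data_into_dataframe above assembles the result in chunks instead of materialising the whole cursor into a single list first. A minimal standalone sketch of that pattern follows; the helper name is ours, and the explicit parentheses around (iteration + 1) are what make the modulo apply to the running document count:

import pandas as pd

def cursor_to_dataframe(cursor, index: str = None, chunksize: int = 500) -> pd.DataFrame:
    # Collect documents into DataFrame chunks, then concatenate them once at the end.
    frames = []
    records = []
    for iteration, document in enumerate(cursor):
        records.append(document)
        # flush a chunk every `chunksize` documents
        if (iteration + 1) % chunksize == 0:
            frames.append(pd.DataFrame(records))
            records = []
    if records:
        frames.append(pd.DataFrame(records))
    if not frames:
        return pd.DataFrame()
    result = pd.concat(frames, axis=0, sort=False)
    if index is not None:
        result.set_index(index, inplace=True)
    return result

Here cursor can be any iterable of dicts, for example the object returned by a pymongo find() call.
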
+ 9 - 6
cdplib/db_handlers/SQLHandler.py

@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
 Created on Tue Sep 18 16:20:50 2018
 
@@ -211,13 +209,17 @@ class SQLHandler:
         transaction = connection.begin()
 
         errors = []
+        results = []
 
         # in the case of multi-query execute each query
         for sub_query in sqlparse.split(query):
             if len(sub_query) > 0:
                 try:
-                    connection.execute(sub_query, multi=True)
-
+                    result = connection.execute(sub_query)
+                    if result.returns_rows:
+                        data = pd.DataFrame(result.fetchall())
+                        data.columns = result.keys()
+                        results.append(data)
                 except Exception as e:
                     errors.append(str(e))
 
@@ -231,6 +233,7 @@
 
         transaction.commit()
         connection.close()
+        return results
 
     def execute_query_from_file(self, filename: str):
         '''
@@ -441,7 +444,7 @@
                                tablename)
 
             data = self.execute(query)
-            colnames = data.columns.tolist()
+            colnames = data[0].columns.tolist()
 
         return colnames
 
@@ -640,4 +643,4 @@
             self._engine.dispose()
         except Exception as e:
             print(('An error occured when trying to dispose the SQL engine. Error: {}').format(e))
-            raise Exception(e)
+            raise Exception(e)

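With this change execute() gathers one DataFrame per sub-statement that returns rows and hands the whole list back to the caller, which is why get_column_names() now reads data[0].columns. A hedged sketch of how a caller might consume that return value; the handler instance and the query text are placeholders, not part of the library:

from typing import List

import pandas as pd

def first_result_frame(handler, query: str) -> pd.DataFrame:
    # `handler` is assumed to be an already configured SQLHandler instance.
    results: List[pd.DataFrame] = handler.execute(query)
    if not results:
        raise ValueError("no sub-statement returned rows for: {}".format(query))
    return results[0]

# e.g. mirroring what get_column_names() does internally:
# colnames = first_result_frame(handler, "SELECT * FROM some_table LIMIT 1").columns.tolist()
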
+ 96 - 88
cdplib/db_migration/DataFrameToCollection.py

@@ -51,7 +51,8 @@ class DataFrameToCollection():
     def to_list_of_documents(self, data: pd.DataFrame,
                              grp_fields: list,
                              schema: dict = None,
-                             _final_step: bool = True) -> list:
+                             _final_step: bool = True,
+                             already_reshaped: list = []) -> list:
         '''
         Reshapes a pandas dataframe to a list of documents according
          to a complex (json) mongodb schema
@@ -84,128 +85,135 @@
             if field not in self._unroll_nested_names(data.columns):
                 continue
 
-            field_type = schema["properties"][field]["bsonType"]
+            if field in already_reshaped:
+                reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_flattened_list_of_distinct)
+                reshaped_fields.append(reshaped_field)
+            else:
+                field_type = schema["properties"][field]["bsonType"]
 
 
-            # if field has a simple type
-            if field_type not in ["array", "object"]:
+                # if field has a simple type
+                if field_type not in ["array", "object"]:
 
 
-                grp_fields = [c for c in grp_fields if c in data.columns]
+                    grp_fields = [c for c in grp_fields if c in data.columns]
 
 
-                # check that there is only one possible value of this field
-                n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
+                    # check that there is only one possible value of this field
+                    n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
 
 
-                # n_distinct_valus can be 0 if the column only contains NaN values
-                if n_distinct_values > 1:
-                    err = "Field {0} is not unique with respect to {1}"\
-                          .format(field, grp_fields)
+                    # n_distinct_valus can be 0 if the column only contains NaN values
+                    if n_distinct_values > 1:
+                        err = "Field {0} is not unique with respect to {1}"\
+                            .format(field, grp_fields)
 
 
-                    self._log.error(err)
-                    raise Exception(err)
+                        self._log.error(err)
+                        raise Exception(err)
 
 
-                if field not in grp_fields:
-                    reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
-                else:
-                    reshaped_field =\
-                        data[grp_fields].drop_duplicates()\
-                        .set_index(grp_fields, drop=False)[field]
+                    if field not in grp_fields:
+                        reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
+                    else:
+                        reshaped_field =\
+                            data[grp_fields].drop_duplicates()\
+                            .set_index(grp_fields, drop=False)[field]
 
 
-                reshaped_fields.append(reshaped_field)
+                    reshaped_fields.append(reshaped_field)
 
 
-            # if field is sub-document (dictionary)
-            elif field_type == "object":
+                # if field is sub-document (dictionary)
+                elif field_type == "object":
 
 
-                sub_schema = deepcopy(schema["properties"][field])
+                    sub_schema = deepcopy(schema["properties"][field])
 
 
-                # rename sub-schema properties to match with data column names
-                sub_schema["properties"] =\
-                    {".".join([field, k]): v for k, v
-                     in sub_schema["properties"].items()}
+                    # rename sub-schema properties to match with data column names
+                    sub_schema["properties"] =\
+                        {".".join([field, k]): v for k, v
+                        in sub_schema["properties"].items()}
 
 
-                sub_data = self.to_list_of_documents(
-                            data=data,
-                            schema=sub_schema,
-                            grp_fields=grp_fields,
-                            _final_step=False)
+                    sub_data = self.to_list_of_documents(
+                                data=data,
+                                schema=sub_schema,
+                                grp_fields=grp_fields,
+                                _final_step=False,
+                                already_reshaped=already_reshaped)
 
 
-                # Need to be checked since child elements can be empty
-                if sub_data is not None:
+                    # Need to be checked since child elements can be empty
+                    if sub_data is not None:
 
 
-                    reshaped_field = sub_data.apply(self._make_dict, axis=1)
-                    reshaped_field.name = field
+                        reshaped_field = sub_data.apply(self._make_dict, axis=1)
+                        reshaped_field.name = field
 
 
-                    reshaped_fields.append(reshaped_field)
+                        reshaped_fields.append(reshaped_field)
 
 
-            # if field is a list of dictionaries
-            elif field_type == "array":
+                # if field is a list of dictionaries
+                elif field_type == "array":
 
 
 
 
-                items_type = schema["properties"][field]["items"]["bsonType"]
+                    items_type = schema["properties"][field]["items"]["bsonType"]
 
 
-                if items_type == "object":
-                    array_object = time.time()
-                    sub_schema = deepcopy(schema["properties"][field]["items"])
+                    if items_type == "object":
+                        array_object = time.time()
+                        sub_schema = deepcopy(schema["properties"][field]["items"])
 
 
-                    # rename sub-schema properties to match data column names
-                    sub_schema["properties"] =\
-                        {".".join([field, k]): v for k, v in
-                         sub_schema["properties"].items()}
+                        # rename sub-schema properties to match data column names
+                        sub_schema["properties"] =\
+                            {".".join([field, k]): v for k, v in
+                            sub_schema["properties"].items()}
 
 
-                    # extend grp fields by sub-fields of field simple types
-                    sub_grp_fields = [f for f in sub_schema["properties"]
-                                      if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
-                                      and (f in data.columns)]
+                        # extend grp fields by sub-fields of field simple types
+                        sub_grp_fields = [f for f in sub_schema["properties"]
+                                        if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
+                                        and (f in data.columns)]
 
 
-                    if len(sub_grp_fields) == 0:
-                        err = ("One of the sub-keys in a list of documents"
-                               " must be of simple type for the field {}"
-                               .format(field))
+                        if len(sub_grp_fields) == 0:
+                            err = ("One of the sub-keys in a list of documents"
+                                " must be of simple type for the field {}"
+                                .format(field))
 
 
-                        self._log.error(err)
-                        raise Exception(err)
+                            self._log.error(err)
+                            raise Exception(err)
 
 
-                    # group and reshape sub-fields with complex types
-                    sub_data = self.to_list_of_documents(
-                                data=data,
-                                schema=sub_schema,
-                                grp_fields=grp_fields + sub_grp_fields,
-                                _final_step=False)
+                        # group and reshape sub-fields with complex types
+                        sub_data = self.to_list_of_documents(
+                                    data=data,
+                                    schema=sub_schema,
+                                    grp_fields=grp_fields + sub_grp_fields,
+                                    _final_step=False,
+                                    already_reshaped=already_reshaped)
 
 
-                    if sub_data is not None:
+                        if sub_data is not None:
 
 
-                        # gether the results into a list of dictionaries
-                        sub_data = sub_data.apply(self._make_dict, axis=1)
+                            # gether the results into a list of dictionaries
+                            sub_data = sub_data.apply(self._make_dict, axis=1)
 
 
-                        sub_data.name = field
-                        sub_data = sub_data.reset_index(grp_fields)
-                        ######################################################
-                        ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
-                        reshaped_field =\
-                            sub_data.groupby(grp_fields, sort=False)[field]\
-                                    .apply(self._make_list_of_distinct)
-                        ######################################################
-                        reshaped_fields.append(reshaped_field)
+                            sub_data.name = field
+                            sub_data = sub_data.reset_index(grp_fields)
+                            ######################################################
+                            ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
+                            reshaped_field =\
+                                sub_data.groupby(grp_fields, sort=False)[field]\
+                                        .apply(self._make_list_of_distinct)
+                            ######################################################
+                            reshaped_fields.append(reshaped_field)
 
 
 
 
-                # if field is a list of values with simple type
-                elif items_type == "array":
-                    grp_fields = [c for c in grp_fields if c in data.columns]
+                    # if field is a list of values with simple type
+                    elif items_type == "array":
+                        grp_fields = [c for c in grp_fields if c in data.columns]
 
 
-                    if field in data.columns:
+                        if field in data.columns:
 
 
-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
-                                             .apply(self._make_list_of_distinct)
+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_list_of_distinct)
 
 
-                        reshaped_fields.append(reshaped_field)
-                else:
+                            reshaped_fields.append(reshaped_field)
+                    else:
 
 
-                    grp_fields = [c for c in grp_fields if c in data.columns]
+                        grp_fields = [c for c in grp_fields if c in data.columns]
 
 
-                    if field in data.columns:
+                        if field in data.columns:
 
 
-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
-                                             .apply(self._make_flattened_list_of_distinct)
+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_flattened_list_of_distinct)
 
 
-                        reshaped_fields.append(reshaped_field)
+                            reshaped_fields.append(reshaped_field)
 
 
         if len(reshaped_fields) > 0:
 

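The new already_reshaped argument lets a caller mark fields whose values are already in their final nested form; for those fields to_list_of_documents skips the schema-driven dispatch and only groups the column with _make_flattened_list_of_distinct. Roughly, on a toy frame (column names are illustrative, and plain list stands in for that helper):

import pandas as pd

data = pd.DataFrame({
    "order_id": [1, 1, 2],
    "positions": [{"pos": 1}, {"pos": 2}, {"pos": 1}],
})

# fields listed in already_reshaped are only grouped and collected,
# not re-derived from the schema
collected = data.groupby("order_id", sort=False)["positions"].apply(list)
print(collected.to_dict())
# {1: [{'pos': 1}, {'pos': 2}], 2: [{'pos': 1}]}
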
+ 2 - 2
cdplib/db_migration/MigrationCleaning.py

@@ -358,11 +358,11 @@ class MigrationCleaning:
                 elif python_type == bool:
 
                     data[column] = data[column].str.lower()
-                    accepted_bool = {'ja': True, 'j': True, '1': True,
+                    accepted_bool = {'ja': True, 'j': True, '1': True, 1:True,
                                      'yes': True, 'y': True, 'true':True,
                                      't': True, 'nein': False, 'n': False,
                                      'no': False, 'false': False, 'f': False,
-                                     '0': False}
+                                     '0': False, 0:False}
                     data[column] = data[column].map(accepted_bool)
                     data[column] = data[column].astype(bool)
 

+ 39 - 18
cdplib/db_migration/ParseJsonSchema.py

@@ -51,15 +51,17 @@ class ParseJsonSchema(ParseDbSchema):
                 with open(schema_path, "r") as f:
                     schema = json.load(f)
 
-                ref_flag = self._analyze_schema(schema)
-
-                if ref_flag:
-                    schema = self._format_schema_for_mongo(schema)
+                definitions_flag = self._analyze_schema(schema)
+                
+                if definitions_flag:
+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
                     schema = self._dereference_schema(schema)
-                    schema = self._format_schema_for_mongo(schema)
+                    # Need to do it again since sub schema could also contain
+                    # single quotes
+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
                     self.schemas.append(schema)
+                    
                 else:
-                    schema = self._format_schema_for_mongo(schema)
                     self.schemas.append(schema)
 
             except Exception as e:
@@ -199,7 +201,7 @@
                                  required_only=required_only)
 
         for schema in schemas[1:]:
-
+            
             next_result = self._parse_one(schema=schema,
                                           field_info=field_info,
                                           required_only=required_only)
@@ -340,7 +342,7 @@
 
         return already_parsed
 
-    def read_schema_and_parse_for_mongodb(self, schema_path: str) -> dict:
+    def load_and_parse_schema_for_mongodb(self, schema_path: str) -> dict:
         '''
         We need to deference json before import to Mongo DB pymongo can't deal with references
         :param str schema_path: path to the schema file.
@@ -353,9 +355,16 @@
             schema = json.load(json_file)
 
         definitions_flag = self._analyze_schema(schema)
-
+        
         if definitions_flag:
+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
             schema = self._dereference_schema(schema)
+             # Need to do it again since sub schema could also contain
+             # single quotes
+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
+            schema = self._format_schema_for_mongo(schema)
+        else:
+            schema = self._format_schema_for_mongo(schema)
 
         return schema
 
@@ -371,12 +380,11 @@
                 definitions_flag = self._analyze_schema(schema[key], definitions_flag)
 
         return definitions_flag
-
-    def _format_schema_for_mongo(self, schema: dict) -> dict:
+    
+    
+    def _clean_desciptions_tags_from_single_quotes(self, schema: dict) -> dict:
         '''
-        We use in the schema tags whih are not supported by mongo an threfore
-        must be taken care of before setting the schema for mongo.
-        :param str schema_path: path to the schema file.
+        :param dict schema: dictonary containing schema
         '''
 
         for key in list(schema):
@@ -386,14 +394,27 @@
                 schema[key] = cleaned_description
 
             if type(schema[key]) == dict:
-                self._format_schema_for_mongo(schema[key])
+                self._clean_desciptions_tags_from_single_quotes(schema[key])
+                
+        return schema
 
-            if key == 'examples':
-                self._remove_examples(schema)
+    def _format_schema_for_mongo(self, schema: dict) -> dict:
+        '''
+        We use in the schema tags whih are not supported by mongo an threfore
+        must be taken care of before setting the schema for mongo.
+        :param str schema_path: path to the schema file.
+        '''
 
+        for key in list(schema):
+
+            if type(schema[key]) == dict:
+                self._format_schema_for_mongo(schema[key])
 
             if key == 'default' or key == 'default_values':
                 self._remove_defaults(schema)
+                
+            if key == 'examples':
+                self._remove_examples(schema)
 
         return schema
 
@@ -411,7 +432,7 @@
         schema = str(schema).replace("'", "\"")
         schema = jsonref.loads(schema, base_uri=base_dir_url)
         schema = deepcopy(schema)
-        #schema.pop('definitions', None)
+
         return schema
 
     def _remove_defaults(self, schema: dict) -> dict:

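The single-quote cleaning exists because of how _dereference_schema works: it serialises the schema with str() and turns every single quote into a double quote before handing the text to jsonref, so an apostrophe inside a description corrupts that intermediate JSON (and dereferenced sub-schemas can bring new ones in, hence the second cleaning pass). A small illustration of the failure mode, using only the standard library and a toy schema:

import json

schema = {"bsonType": "string", "description": "the patient's id"}

# what _dereference_schema does to the schema text:
as_json = str(schema).replace("'", '"')

try:
    json.loads(as_json)
except json.JSONDecodeError as error:
    print("parsing fails:", error)

# once the apostrophe is stripped (the job of the new
# _clean_desciptions_tags_from_single_quotes helper), the round trip works:
schema["description"] = schema["description"].replace("'", "")
print(json.loads(str(schema).replace("'", '"')))
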
+ 40 - 4
cdplib/db_migration/ParseMapping.py

@@ -22,10 +22,10 @@ class ParseMapping:
         import json
         from cdplib.log import Log
 
-        self.log = Log('Parse Mapping')
+        self._log = Log('Parse Mapping')
 
         if not os.path.isfile(mapping_path):
-            err = "Mapping not found"
+            err = "Mapping not found "+mapping_path
             self._log.error(err)
             raise FileNotFoundError(err)
 
@@ -34,7 +34,7 @@
                 self._mapping = json.load(f)
 
         except Exception as e:
-            err = ("Could not load mapping. "
+            err = ("Could not load mapping. " + mapping_path +
                    "Exit with error {}".format(e))
             self._log.error(err)
             raise Exception(err)
@@ -97,6 +97,42 @@
         '''
         '''
         return self._get_info(key="date_format")
+    
+    def get_internal_names(self) -> dict:
+        '''
+        '''
+ 
+        if all(["internal_name" in d for d in self._mapping]):
+            internal_names = [d["internal_name"] for d in self._mapping]
+    
+        elif all(["internal_name" not in d for d in self._mapping]):
+            internal_names = list(range(len(self._mapping)))
+
+
+        else:
+            err = ("Incorrectly filled mapping. Internal names should "
+                   "either be in all or in neither of the fields")
+            self._log.error(err)
+            raise Exception(err)
+
+        return internal_names
+
+    def get_mongo_names(self) -> dict:
+        '''
+        '''
+        if all(["mongo_name" in d for d in self._mapping]):
+            mongo_names = [d["mongo_name"] for d in self._mapping]
+
+        elif all(["mongo_name" not in d for d in self._mapping]):
+            mongo_names = list(range(len(self._mapping)))
+
+        else:
+            err = ("Incorrectly filled mapping. Mongo names should "
+                   "either be in all or in neither of the fields")
+            self._log.error(err)
+            raise Exception(err)
+
+        return mongo_names
 
     def get_types(self) -> dict:
         '''
@@ -134,7 +170,7 @@
         else:
             err = ("Incorrectly filled mapping. Column numbers should ",
                    "either in all or in neither of the fields")
-            self.log.err(err)
+            self._log.err(err)
             raise Exception(err)
 
         return column_numbers

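Both new getters follow the same all-or-nothing rule as get_column_numbers: either every mapping entry carries the key, or none does (then positional indices are returned), and a mixed mapping is rejected. The rule on its own, applied to a plain list of dicts — the function name and field values below are illustrative:

from typing import List, Union

def names_from_mapping(mapping: List[dict], key: str) -> List[Union[str, int]]:
    # Return the `key` value of every field, or positional indices if no field has it.
    if all(key in field for field in mapping):
        return [field[key] for field in mapping]
    if all(key not in field for field in mapping):
        return list(range(len(mapping)))
    raise ValueError(
        "Incorrectly filled mapping: '{}' should be in all or in none of the fields".format(key))

mapping = [{"internal_name": "order_id", "mongo_name": "orderId"},
           {"internal_name": "status", "mongo_name": "state"}]
print(names_from_mapping(mapping, "mongo_name"))   # ['orderId', 'state']
print(names_from_mapping([{}, {}], "mongo_name"))  # [0, 1]
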
+ 38 - 0
cdplib/log.py

@@ -60,6 +60,44 @@ class Log():
 
         # self._logger.setLevel(log_level)
 
+
+    @property
+    def magenta(self):
+        return '\033[95m'
+
+    @property
+    def blue(self):
+        return '\033[94m'
+
+    @property
+    def cyan(self):
+        return '\u001b[36m'
+
+    @property
+    def green(self):
+        return '\033[92m'
+
+    @property
+    def yellow(self):
+        return '\033[93m'
+
+    @property
+    def fail(self):
+        return '\033[91m'
+
+    @property
+    def reset(self):
+        return '\033[0m'
+
+    @property
+    def bold(self):
+        return '\033[1m'
+
+    @property
+    def underline(self):
+        return '\033[4m'
+
+
     def info(self, message: str):
         self._logger.info(message)
 

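The colour helpers are plain ANSI escape strings, so they are concatenated straight into the message; appending reset keeps subsequent log lines uncoloured. A minimal usage sketch (the logger name is arbitrary):

import os
import sys
sys.path.append(os.getcwd())
from cdplib.log import Log

log = Log('Color Demo')
log.info(log.cyan + "Converting mongo result into a DataFrame" + log.reset)
log.warning(log.yellow + "This might take a while" + log.reset)
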
+ 1 - 1
cdplib/unit_tests/TestFlattenData.py

@@ -7,7 +7,7 @@ sys.path.append(os.getcwd())
 from cdplib.log import Log
 from cdplib.FlattenData import FlattenData
 
-class TestMongodbHandler(unittest.TestCase):
+class TestFlattenData(unittest.TestCase):
 
     def setUp(self):
         self.flattener = FlattenData()

+ 26 - 0
cdplib/unit_tests/TestLog.py

@@ -0,0 +1,26 @@
+import unittest
+import sys
+import os
+from pprint import pprint
+sys.path.append(os.getcwd())
+from cdplib.log import Log
+
+
+class TestLog(unittest.TestCase):
+
+    def setUp(self):
+       self._log = Log('Log Test')
+
+    def test_A_Log_Colors(self):
+        self._log.info('Test Starts Here')
+        self._log.info(self._log.magenta + "Header")
+        self._log.info(self._log.blue + "Blue")
+        self._log.info( self._log.green + "Green")
+        self._log.info(self._log.yellow + "yellow")
+        self._log.info(self._log.fail + "Fail")
+        self._log.info(self._log.reset + "reset")
+        self._log.info(self._log.bold + "bold" )
+        self._log.info(self._log.underline + "underline" )
+
+if __name__ == '__main__':
+    unittest.main()

+ 14 - 5
cdplib/unit_tests/TestMongodbHandler.py

@@ -1,6 +1,7 @@
 import unittest
 import sys
 import os
+import time
 from pymongo import MongoClient
 sys.path.append(os.getcwd())
 from cdplib.log import Log
@@ -25,12 +26,14 @@ class TestMongodbHandler(unittest.TestCase):
         self.valid_input = {
                         "test_value_string": "test_value",
                         "test_value_double": 2.4,
-                        "test_value_double_array": [1.4, 1.6, 3.5]
+                        "test_value_double_array": [1.4, 1.6, 3.5],
+                        "test_value_date": "2020-01-28T15:45:25.000Z"
                         }
         self.invalid_input = {
                         "test_value_string": 1,
                         "test_value_double": "Wrong value",
-                        "test_value_double_array": [1.4, 1.6, 3.5]
+                        "test_value_double_array": [1.4, 1.6, 3.5],
+                        "test_value_date": "2019-01-28T15:45:25.000Z"
                         }
 
 
@@ -81,9 +84,10 @@ class TestMongodbHandler(unittest.TestCase):
         Fetch data and confirms thats it is the same as was entered into the database
         Do the same with more specific query
         '''
+
         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name, 'test_value_string', 'test_value').to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
-    
+
     def test_F_aggregate_data_and_generate_dataframe(self):
         '''
         Make an aggregation call
@@ -93,7 +97,7 @@
                                 { '$match': {}}
                                 ]
         self.assertEqual(self.mongodb_handler.aggregate_data_and_generate_dataframe(self.first_collection_name, aggregation_pipeline).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
-    
+
     def test_G_update_data_in_collection(self):
         '''
         Fetch data from database
@@ -104,7 +108,7 @@
         '''
         original_value = self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
         self.assertEqual(original_value, self.valid_input['test_value_string'])
-        self.mongodb_handler.update_data_in_collection('test_value_string', 'test_value', 'test_value_string', 'new_test_value', self.first_collection_name)
+        self.mongodb_handler.update_data_in_collection('test_value_string', 'new_test_value', self.first_collection_name, 'test_value_string', 'test_value', create_if_not_exist=False)
         new_value =  self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
         self.assertNotEqual(original_value, new_value)
 
@@ -116,6 +120,11 @@
         index = 'test_value_string'
         self.mongodb_handler.create_index(self.first_collection_name, index)
         self.assertTrue(index in list(self.database[self.first_collection_name].index_information().keys()))
+
+    def test_I_query_data_between_dates_and_generate_dataframe(self):
+
+            data = self.mongodb_handler.query_data_between_dates_and_generate_dataframe(self.first_collection_name, "test_value_date", "2020-01-27T15:45:25.000Z", "2020-01-29T15:45:25.000Z", index ='test_value_string')
+            self.assertEqual(data['test_value_double'][0], self.valid_input['test_value_double'])
     
     def test_Y_drop_collection(self):
         '''

+ 13 - 0
hooks/README.txt

@@ -0,0 +1,13 @@
+These files are GIT HOOKS.
+
+A git hook is a script which is executed when a git command is run.
+The hook in this folder is executed when committing (pre-commit).
+
+pre-commit executes the unit tests before committing the changes.
+
+ACHTUNG!
+Changes will still be committed and pushed even if there are errors in the tests (for now at least).
+So please pay attention to the tests and make sure that they ran without any problems. If a test
+failed, please fix the issue before pushing the changes!
+
+To use it please copy the files into your .git/hooks folder.

+ 17 - 0
hooks/pre-commit

@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Runs test pipeline before commiting
+#
+# To enable this hook, rename this file to "pre-commit".
+echo
+echo "Running unit tests"
+echo
+
+python cdplib/unit_tests/TestFlattenData.py
+python cdplib/unit_tests/TestLog.py
+python cdplib/unit_tests/TestMongodbHandler.py
+
+echo 
+echo
+echo -e "\033 Unit tests have been run and your data commited, in case any of the tests failed, please correct this before pushing changes"
+echo

+ 5 - 20
setup.py

@@ -1,32 +1,17 @@
 from setuptools import setup,find_packages
 
 INSTALL_REQUIRES = [
-        'pycodestyle',
-        'ipykernel',
-  		'spyder-kernels==0.*',
-        'cloudpickle',
-        'openpyxl',
-        'setuptools',
-        'scipy',
-        'matplotlib',
-        'tsfresh',
-        'hyperopt',
-        'xgboost',
-        'scikit-learn',
-  		'pandas',
-        'pandas-compat',
-        'xmltodict',
+        'pandas',
         'sqlalchemy',
         'sqlparse',
         'pymysql',
-        'xlrd',
         'pymongo',
-        'jsonref', 
-        'faker',
-        'xeger',
+        'jsonref',
         'simplejson',
         'mysql',
-        'sqlalchemy-utils',
+        'sqlalchemy_utils',
+        'sklearn',
+        'hyperopt',
 ]