Browse files

renamed libraries to cdplib

tanja 4 years ago
parent
commit
3bb02ade48

+ 9 - 8
cdplib/db_handlers/MongodbHandler.py

@@ -21,9 +21,8 @@ import pandas as pd
 import numpy as np
 
 sys.path.append(os.getcwd())
-from libraries.log import Log
-from libraries.configuration import default as cfg
-from libraries.Singleton_Threadsafe import SingletonThreadsafe
+from cdplib.log import Log
+from cdplib.Singleton_Threadsafe import SingletonThreadsafe
 
 
 #class MongodbHandlerPool(metaclass=SingletonThreadsafe):
@@ -40,7 +39,7 @@ class MongodbHandlerPool():
             self._mongodb_handlers = [MongodbHandler() for _ in range(self._size)]
             log.warning("Ran out of Mongodb handlers, 10 more have been added. Are you sure you've returned yours?")
         return self._mongodb_handlers.pop()
-        
+
     def release(self, mongodb_handler):
         if len(self._mongodb_handlers) < self._size:
             self._mongodb_handlers.append(mongodb_handler)
@@ -60,6 +59,8 @@ class MongodbHandler:
         '''
         if database_url is None:
 
+            from libraries.configuration import default as cfg
+
             database_url = "mongodb://{0}:{1}@{2}:{3}"\
                            .format(cfg["MONGO"]["MONGO_USER"],
                                    cfg["MONGO"]["MONGO_PASSWORD"],
@@ -204,7 +205,7 @@ class MongodbHandler:
             try:
                 self._log.info(("Collection '{}' has been created").format(collection_name))
                 return self._database.create_collection(collection_name)
-            
+
             except Exception as error:
                 self._log.log_and_raise_error(('An error occurred while creating the new collection: {}. \nError: {}').format(collection_name, error))
         else:
@@ -247,7 +248,7 @@ class MongodbHandler:
                 self._database[collection_name].insert_one(data)
             else:
                 self._database[collection_name].insert_many(data, ordered=ordered)
-        
+
         except Exception as error:
             self._log.log_and_raise_error(('An error occurred when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
 
@@ -286,14 +287,14 @@ class MongodbHandler:
 
         try:
             data = self._database[collection_name].aggregate(pipeline=aggregation_pipeline, allowDiskUse=True)
-         
+
         except Exception as error:
             self._log.log_and_raise_error(('A problem occurred when aggregating the collection {} with the pipeline {}. \nError: {}').format(collection_name, aggregation_pipeline, error))
 
         return self.convert_mongo_data_into_dataframe(data)
 
     def convert_mongo_data_into_dataframe(self, data) -> pd.DataFrame():
-        
+
         data = list(data)
         try:
             if len(data)> 0:

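The hunk above defers the libraries.configuration import into __init__, so configuration is only read when no database_url is passed. A minimal usage sketch, assuming only the database_url keyword visible in this hunk (any other constructor parameters are not shown in the diff, and the credentials and host below are placeholders):

    from cdplib.db_handlers.MongodbHandler import MongodbHandler

    # An explicit connection string means the lazy
    # 'libraries.configuration' import inside __init__ is never triggered.
    handler = MongodbHandler(
        database_url="mongodb://user:password@localhost:27017")
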
+ 8 - 9
cdplib/db_handlers/SQLHandler.py

@@ -15,8 +15,8 @@ import pandas as pd
 import warnings
 
 sys.path.append(os.getcwd())
-from libraries.log import Log
-from libraries.Singleton_Threadsafe import SingletonThreadsafe
+from cdplib.log import Log
+from cdplib.Singleton_Threadsafe import SingletonThreadsafe
 
 class SQLHandlerPool(metaclass=SingletonThreadsafe):
 #class SQLHandlerPool():
@@ -33,7 +33,7 @@ class SQLHandlerPool(metaclass=SingletonThreadsafe):
             self._sql_handlers = [SQLHandler() for _ in range(self._size)]
             self._log.warning("Ran out of SQL handlers, 10 more have been added. Are you sure you've returned yours?")
         return self._sql_handlers.pop()
-        
+
     def release(self, mongodb_handler):
         if len(self._sql_handlers) < self._size:
             self._sql_handlers.append(mongodb_handler)
@@ -58,15 +58,14 @@ class SQLHandler:
              for mysql : mysql+pymysql
              for db2: ibm_db_sa
         '''
-
-        
-        from libraries.configuration import default as cfg
         from sqlalchemy_utils import database_exists, create_database
 
         self._log = Log(name='SQLHandler')
 
         if db_uri is None:
 
+            from libraries.configuration import default as cfg
+
             db_uri = "mysql+pymysql://{0}:{1}@{2}:{3}/{4}?charset=utf8&local_infile=1"\
                      .format(cfg["SQL"]["SQL_USER"],
                              cfg["SQL"]["SQL_PASSWORD"],
@@ -146,7 +145,7 @@ class SQLHandler:
         self.execute("DROP DATABASE IF EXISTS {}".format(database))
         self._engine.execute("CREATE DATABASE {}".format(database))
         self._engine.execute("USE {}".format(database))
-        
+
     @property
     def _db_metadata(self) -> dict:
         '''
@@ -206,7 +205,7 @@ class SQLHandler:
         '''
         connection = self._engine.connect()
         transaction = connection.begin()
-    
+
         errors = []
 
         # in the case of multi-query execute each query
@@ -507,7 +506,7 @@ class SQLHandler:
             data = pd.read_sql(sql=query,
                                con=connection,
                                **read_sql_kwargs)
-                               
+
             connection.close()
             return data
 

+ 396 - 0
cdplib/db_migration/DataFrameToCollection.py

@@ -0,0 +1,396 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 22 11:05:47 2019
+
+@author: tanya
+
+@description: a function to reshape a pandas dataframe to a list of
+(possibly nested) documents with respect to a (json) mongodb schema
+"""
+
+import pandas as pd
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+
+class DataFrameToCollection():
+    '''
+    '''
+    def __init__(self, schema_path: str):
+        '''
+        '''
+        from cdplib.log import Log
+        import json
+
+        self._log = Log("ParseJsonSchema")
+
+
+        if not os.path.isfile(schema_path):
+            err = "JsonSchema not found"
+            self._log.error(err)
+            raise FileNotFoundError(err)
+
+        # load schema to dictionary if it is a valid json file
+        try:
+            with open(schema_path, "r") as f:
+                self.schema = json.load(f)
+
+        except Exception as e:
+            err = ("Could not load json schema, "
+                   "Obtained error {}".format(e))
+
+            self._log.error(err)
+            raise Exception(err)
+
+
+    def to_list_of_documents(self, data: pd.DataFrame,
+                             grp_fields: list,
+                             schema: dict = None,
+                             _final_step: bool = True) -> list:
+        '''
+        Reshapes a pandas dataframe to a list of documents according
+         to a complex (json) mongodb schema
+
+         Remark1: column names of data need to reflect the "nestedness"
+         of the field in the mongodb schema with the help of a "." separator
+         Example: field.sub_field_1, field.sub_field_2
+
+         Remark2: if the schema is stored as a json file, first load it
+         to a dictionary with the help of the python json module
+
+         The function goes recursively through all the fields and reshapes
+         them correspondingly depending on whether the field is an array,
+         an object, or a simple field. For each field we group the data by the
+         grp_fields and reshape it accordingly; the result is a pandas Series.
+         In the end all the series are collected and concatenated.
+        '''
+        from copy import deepcopy
+
+        data = self._melt_duplicated_columns(data)
+
+        reshaped_fields = []
+
+        if schema is None:
+            schema = self.schema
+
+        for field in schema["properties"]:
+
+            if field not in self._unroll_nested_names(data.columns):
+                continue
+
+            field_type = schema["properties"][field]["bsonType"]
+
+            # if field has a simple type
+            if field_type not in ["array", "object"]:
+
+                grp_fields = [c for c in grp_fields if c in data.columns]
+
+                # check that there is only one possible value of this field
+                n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
+
+                if n_distinct_values != 1:
+                    err = "Field {0} is not unique with respect to {1}"\
+                          .format(field, grp_fields)
+
+                    self._log.error(err)
+                    raise Exception(err)
+
+                if field not in grp_fields:
+                    reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
+                else:
+                    reshaped_field =\
+                        data[grp_fields].drop_duplicates()\
+                        .set_index(grp_fields, drop=False)[field]
+
+                reshaped_fields.append(reshaped_field)
+
+            # if field is sub-document (dictionary)
+            elif field_type == "object":
+
+                sub_schema = deepcopy(schema["properties"][field])
+
+                # rename sub-schema properties to match with data column names
+                sub_schema["properties"] =\
+                    {".".join([field, k]): v for k, v
+                     in sub_schema["properties"].items()}
+
+                sub_data = self.to_list_of_documents(
+                            data=data,
+                            schema=sub_schema,
+                            grp_fields=grp_fields,
+                            _final_step=False)
+
+                reshaped_field = sub_data.apply(self._make_dict, axis=1)
+                reshaped_field.name = field
+
+                reshaped_fields.append(reshaped_field)
+
+            # if field is a list of dictionaries
+            elif field_type == "array":
+
+                items_type = schema["properties"][field]["items"]["bsonType"]
+
+                if items_type == "object":
+
+                    sub_schema = deepcopy(schema["properties"][field]["items"])
+
+                    # rename sub-schema properties to match data column names
+                    sub_schema["properties"] =\
+                        {".".join([field, k]): v for k, v in
+                         sub_schema["properties"].items()}
+
+                    # extend grp fields by sub-fields of field simple types
+                    sub_grp_fields = [f for f in sub_schema["properties"]
+                                      if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
+                                      and (f in data.columns)]
+
+                    if len(sub_grp_fields) == 0:
+                        err = ("One of the sub-keys in a list of documents"
+                               " must be of simple type for the field {}"
+                               .format(field))
+
+                        self._log.error(err)
+                        raise Exception(err)
+
+                    # group and reshape sub-fields with complex types
+                    sub_data = self.to_list_of_documents(
+                                data=data,
+                                schema=sub_schema,
+                                grp_fields=grp_fields + sub_grp_fields,
+                                _final_step=False)
+
+                    if sub_data is not None:
+
+                        # gather the results into a list of dictionaries
+                        sub_data = sub_data.apply(self._make_dict, axis=1)
+
+                        sub_data.name = field
+                        sub_data = sub_data.reset_index(grp_fields)
+
+                        reshaped_field =\
+                            sub_data.groupby(grp_fields, sort=False)[field]\
+                                    .apply(self._make_list_of_distinct)
+
+                        reshaped_fields.append(reshaped_field)
+
+                # if field is a list of values with simple type
+                elif items_type == "array":
+
+                    grp_fields = [c for c in grp_fields if c in data.columns]
+
+                    if field in data.columns:
+
+                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                             .apply(self._make_list_of_distinct)
+
+                        reshaped_fields.append(reshaped_field)
+
+                else:
+
+                    grp_fields = [c for c in grp_fields if c in data.columns]
+
+                    if field in data.columns:
+
+                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                             .apply(self._make_flattened_list_of_distinct)
+
+                        reshaped_fields.append(reshaped_field)
+
+        if len(reshaped_fields) > 0:
+
+            reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
+
+            if _final_step:
+                # dropping the index names if it is the final step,
+                # if not the index is needed for merging
+                reshaped_fields =\
+                    reshaped_fields.drop(list(reshaped_fields.index.names), axis=1, errors="ignore")\
+                                   .reset_index(drop=False)
+
+                self._log.info("Done reshaping the dataframe to a list of documents")
+
+            return reshaped_fields
+
+        else:
+            return
+
+    def _melt_duplicated_columns(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        data = data.copy(deep=True)
+
+        for c in set(data.columns):
+            if isinstance(data[c], pd.DataFrame):
+                """
+                data = pd.melt(data, id_vars=[cc for cc in data.columns
+                                              if cc != c], value_vars=c)\
+                         .drop("variable", axis=1)\
+                         .rename(columns={"value": c})
+                """
+                data["temp"] = data[c].apply(self._make_list, axis=1)
+                data.drop(c, axis=1, inplace=True)
+                data = data.rename(columns={"temp": c})
+
+        return data
+
+    def _make_dict(self, x: pd.Series) -> dict:
+        '''
+        Transforms a pandas Series to a dictionary.
+         Meant to be applied to a dataframe along axis=1,
+         in which case the index of the input Series consists of the
+         column names of the dataframe
+        '''
+        def custom_is_null(y):
+            if isinstance(pd.notnull(y), bool):
+                return pd.notnull(y)
+            else:
+                return True
+
+        return {f.split(".")[-1]: x[f] for f in x.index
+                if custom_is_null(x[f])}
+
+    def _make_list(self, x: pd.Series) -> list:
+        '''
+        return: list of values in a series
+        '''
+        return list(x)
+
+    def _make_list_of_distinct(self, x: pd.Series) -> list:
+        '''
+        return: list of unique values from a Series where
+         entries are arbitrary objects
+         (pandas unique() method does not work if entries are of complex types)
+        '''
+        uniques = pd.DataFrame({"temp": x.tolist()})\
+                    .assign(temp_str=lambda y: y["temp"].astype(str))\
+                    .drop_duplicates(subset=["temp_str"])\
+                    .drop("temp_str", axis=1).iloc[:, 0].tolist()
+
+        def is_empty(y):
+            is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
+            is_empty_list = (isinstance(y, list) and (len(y) == 0))
+            return is_empty_dict or is_empty_list
+
+        return [el for el in uniques if not is_empty(el)]
+
+    def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
+        '''
+        return: list of unique values from a Series where
+         entries are arbitrary objects
+         (pandas unique() method does not work if entries are of complex types)
+        '''
+        uniques = self._make_list_of_distinct(x)
+        return uniques[0]
+
+    def _unroll_nested_names(self, names: list) -> list:
+        '''
+        Example: transform a list ["name.firstname", "name.surname"]
+        into ["name", "name.firstname", "name.surname"]
+        '''
+        unrolled = []
+
+        for c in names:
+            splitted = c.split(".")
+            for i in range(len(splitted)):
+                unrolled.append(".".join(splitted[:i+1]))
+
+        return unrolled
+
+
+if __name__ == "__main__":
+
+    # Testing
+
+    df = pd.DataFrame({
+                       "a": [1]*8 + [2]*8,
+                       "b": [10]*8 + [20]*8,
+                       "c": [100, 200]*8,
+                       "d.da": [11]*8 + [22]*8,
+                       "d.db": [33]*8 + [34]*8,
+                       "e.ea.eaa": [5]*8 + [55]*8,
+                       "e.ea.eab": [6]*8 + [66]*8,
+                       "e.eb": [2, 2, 3, 3]*4,
+                       "e.ec.eca": [1, 2, 3, 4]*4,
+                       "e.ec.ecb": [5, 6, 7, 8]*4,
+                       "f.fa": [1]*4 + [3]*4 + [11]*4 + [33]*4,
+                       "f.fb": [2]*4 + [3]*2 + [4]*2 + [22]*4 + [44]*4})
+
+    duplicate = pd.DataFrame({"c": [300, 400]*8})
+
+    df = pd.concat([df, duplicate], axis=1)
+
+    schm = {
+              "bsonType": "object",
+              "required": ["a"],
+              "properties": {
+
+                  "a": {"bsonType": "integer"},
+
+                  "b": {"bsonType": "integer"},
+
+                  "c": {
+                      "bsonType": "array",
+                      "items": {"bsonType": "integer"}
+                  },
+                  "d": {
+                      "bsonType": "object",
+                      "properties": {
+                          "da": {"bsonType": "integer"},
+                          "db": {"bsonType": "integer"}
+                       }
+                  },
+                  "e": {
+                      "bsonType": "object",
+                      "properties": {
+                          "ea": {
+                              "bsonType": "object",
+                              "properties": {
+                                  "eaa": {"bsonType": "integer"},
+                                  "eab": {"bsonType": "integer"}
+                               }
+
+                          },
+
+                          "eb": {
+                              "bsonType": "array",
+                              "items": {"bsonType": "integer"}
+                          },
+
+                          "ec": {
+                                "bsonType": "array",
+                                "items": {
+                                  "bsonType": "object",
+                                  "properties": {
+                                      "eca": {"bsonType": "integer"},
+                                      "ecb": {"bsonType": "integer"}
+                                    }
+                                  }
+                          }
+                      }
+                  },
+                  "f": {
+                      "bsonType": "array",
+                      "items": {
+                          "bsonType": "object",
+                          "properties": {
+                              "fa": {"bsonType": "integer"},
+                              "fb": {
+                                  "bsonType": "array",
+                                  "items": {"bsonType": "integer"}
+                              }
+                          }
+                      }
+                  }
+              }
+              }
+
+    grp_fields = ["a"]
+
+    result = DataFrameToCollection().to_list_of_documents(
+                    data=df,
+                    schema=schm,
+                    grp_fields=grp_fields)

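A minimal sketch of the reshaping that to_list_of_documents performs, assuming cdplib is importable and that "schema.json" is a placeholder path to any existing json schema file (the constructor requires a valid file even when a schema dict is passed to the method):

    import pandas as pd
    from cdplib.db_migration.DataFrameToCollection import DataFrameToCollection

    # dotted column names d.da / d.db map onto the nested field d of the schema
    df = pd.DataFrame({"a": [1, 1],
                       "d.da": [11, 11],
                       "d.db": [33, 33]})

    schema = {"bsonType": "object",
              "properties": {
                  "a": {"bsonType": "integer"},
                  "d": {"bsonType": "object",
                        "properties": {"da": {"bsonType": "integer"},
                                       "db": {"bsonType": "integer"}}}}}

    # "schema.json" is a placeholder for an existing schema file
    result = DataFrameToCollection(schema_path="schema.json")\
        .to_list_of_documents(data=df, schema=schema, grp_fields=["a"])

    # result: a dataframe with one row per distinct "a" and a dict-valued
    # column "d", e.g. a=1, d={"da": 11, "db": 33}
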
+ 10 - 10
cdplib/db_migration/MigrationCleaning.py

@@ -14,11 +14,11 @@ import gc
 
 sys.path.append(os.getcwd())
 
-from libraries.db_migration.ParseMapping import ParseMapping
-from libraries.db_migration.ParseJsonSchema import ParseJsonSchema
-from libraries.utils.ExceptionsHandler import ExceptionsHandler
-from libraries.utils.CleaningUtils import CleaningUtils
-from libraries.log import Log
+from cdplib.db_migration.ParseMapping import ParseMapping
+from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema
+from cdplib.utils.ExceptionsHandler import ExceptionsHandler
+from cdplib.utils.CleaningUtils import CleaningUtils
+from cdplib.log import Log
 
 class MigrationCleaning:
     '''
@@ -38,7 +38,7 @@ class MigrationCleaning:
         '''
         self._log = Log('Migration Cleaning')
         self._exception_handler = ExceptionsHandler()
-        
+
         assert isinstance(inconsist_report_table, str),\
             "Inconsistent report table should be a tablename string"
 
@@ -58,7 +58,7 @@ class MigrationCleaning:
         self._mapping_path = mapping_path
         self._schema_paths = schema_paths
 
-        from libraries.db_handlers.SQLHandler import SQLHandlerPool
+        from cdplib.db_handlers.SQLHandler import SQLHandlerPool
         self._sql_db = SQLHandlerPool(20)
 
     def _assert_dataframe_input(self, data: pd.DataFrame):
@@ -222,7 +222,7 @@ class MigrationCleaning:
         data = data.copy(deep=True)
 
         #db = self._sql_db.aquire()
-        from libraries.db_handlers.SQLHandler import SQLHandler
+        from cdplib.db_handlers.SQLHandler import SQLHandler
         db = SQLHandler()
 
         if invalid_mask.sum() == 0:
@@ -242,7 +242,7 @@ class MigrationCleaning:
 
         db.append_to_table(data=data_inconsist,
                            tablename=self._inconsist_report_table)
-       
+
         n_rows_filtered = len(data_inconsist)
         n_instances_filtered = len(data_inconsist[self._filter_index_columns].drop_duplicates())
 
@@ -517,7 +517,7 @@ if __name__ == "__main__":
 
     # testing
 
-    from libraries.db_handlers.SQLHandler import SQLHandler
+    from cdplib.db_handlers.SQLHandler import SQLHandler
 
     mapping_path = os.path.join(".", "migration_mappings", "rs1_mapping.json")
 

+ 62 - 0
cdplib/db_migration/ParseDbSchema.py

@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 25 08:22:20 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import abc
+sys.path.append(os.getcwd())
+
+
+class ParseDbSchema(metaclass=abc.ABCMeta):
+    '''
+    '''
+    def __init__(self, schema_paths: [list, str], log_file: str = None):
+        '''
+        '''
+        from cdplib.log import Log
+
+        self._log = Log(name="ParseDbSchema:", log_file=log_file)
+
+        if isinstance(schema_paths, str):
+            schema_paths = [schema_paths]
+
+        for schema_path in schema_paths:
+            if not os.path.isfile(schema_path):
+                err = "Schema not found"
+                self._log.error(err)
+                raise FileNotFoundError(err)
+
+    @abc.abstractmethod
+    def get_fields(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_datetime_fields(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_python_types(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_default_values(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_allowed_values(self) -> list:
+        '''
+        '''
+        return

+ 354 - 0
cdplib/db_migration/ParseJsonSchema.py

@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jan 31 11:41:48 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+from copy import deepcopy
+import numpy as np
+
+sys.path.append(os.getcwd())
+
+from cdplib.db_migration.ParseDbSchema import ParseDbSchema
+
+
+class ParseJsonSchema(ParseDbSchema):
+    '''
+    Class for retrieving column properties from mongodb jsonSchema
+    '''
+
+    def __init__(self, schema_paths: [list, str], log_file: str = None):
+        '''
+        '''
+        import json
+        from cdplib.log import Log
+
+        super().__init__(schema_paths=schema_paths, log_file=log_file)
+
+        self._log = Log(name="ParseJsonSchema", log_file=log_file)
+
+        # load schemas to dictionaries if they are valid json files
+
+        assert(isinstance(schema_paths, (list, str))),\
+            "Schema paths must be either str or lists"
+
+        if isinstance(schema_paths, str):
+            schema_paths = [schema_paths]
+
+        self._schema_paths = schema_paths
+
+        self.schemas = []
+
+        for schema_path in schema_paths:
+            try:
+                with open(schema_path, "r") as f:
+                    self.schemas.append(json.load(f))
+
+            except Exception as e:
+                err = ("Could not load json schema, "
+                       "Obtained error {}".format(e))
+
+                self._log.error(err)
+                raise Exception(err)
+
+    @property
+    def _collection_names(self) -> list:
+        '''
+        '''
+        # Don't use strip() instead of replace, since schema_c.strip("schema_")
+        # would discard the c as well, which is not an appropriate output
+        return [os.path.basename(p).replace("schema_","").split(".")[0] for p in self._schema_paths]
+
+    def get_fields(self) -> list:
+        '''
+        '''
+        return self._parse()
+
+    def get_fields_restricted_to_collection(self, collection_name: str) -> list:
+        '''
+        '''
+        schemas = [self.schemas[self._collection_names.index(collection_name)]]
+        return self._parse(schemas=schemas)
+
+    def get_required_fields(self) -> list:
+        '''
+        '''
+        return self._parse(required_only=True)
+
+    def get_mongo_types(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="bsonType")
+
+    def get_datetime_fields(self):
+        '''
+        '''
+        mongo_types = self.get_mongo_types()
+
+        return [k for k, v in mongo_types.items()
+                if v in ["date", "timestamp", "Date", "Timestamp"]]
+
+    def get_python_types(self) -> dict:
+        '''
+        '''
+        mongo_types = self.get_mongo_types()
+        python_types = {}
+
+        bson_to_python_types = {"double": float,
+                                "decimal": float,
+                                "string": str,
+                                "object": object,
+                                "array": list,
+                                "bool": bool,
+                                "int": int,
+                                "long": int,
+                                "date": np.dtype('<M8[ns]'),
+                                "timestamp": np.dtype('<M8[ns]')
+                                }
+
+        for k, v in mongo_types.items():
+
+            if isinstance(v, list):
+                if ("date" in v) or ("timestamp" in v):
+                    v = "date"
+                elif "string" in v:
+                    v = "string"
+                elif ("double" in v) or ("decimal" in v):
+                    v = "double"
+                elif ("null" in v) and (len(v) == 2) and ("int" not in v):
+                    v = [t for t in v if t != "null"][0]
+                else:
+                    err = "Type {0}: {1} not convertibale".format(k, v)
+                    self._log.error(err)
+                    raise Exception(err)
+
+            if v in bson_to_python_types:
+                python_types[k] = bson_to_python_types[v]
+
+        return python_types
+
+    def get_patterns(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="pattern")
+
+    def get_default_values(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="default")
+
+    def get_allowed_values(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="enum")
+
+    def get_maximum_value(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="maximum")
+
+    def get_minimum_value(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="minimum")
+
+    def get_max_items(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="maxItems")
+
+    def get_min_items(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="minItems")
+
+    def get_field_descriptions(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="description")
+
+    def _parse(self,
+               field_info: str = None,
+               required_only: bool = False,
+               schemas: list = None):
+        '''
+        '''
+        if schemas is None:
+            schemas = self.schemas
+
+        result = self._parse_one(schema=schemas[0],
+                                 field_info=field_info,
+                                 required_only=required_only)
+
+        for schema in schemas[1:]:
+
+            next_result = self._parse_one(schema=schema,
+                                          field_info=field_info,
+                                          required_only=required_only)
+
+            if isinstance(result, list):
+                result.extend(next_result)
+            else:
+                result.update(next_result)
+
+        return result
+
+    def _parse_one(self,
+                   schema: dict,
+                   field_info: str = None,
+                   required_only: bool = False,
+                   super_field_name: str = None,
+                   already_parsed: (list, dict) = None) -> (list, dict):
+        '''
+        Recursive function that returns a list of (nested) field names or
+        a dictionary of (nested) field names with field characteristics.
+
+        :param schema: if None => entire self.schema, or a sub-schema
+            of self.schema
+
+        :param field_info: optional, if provided a dictionary of field
+            names with field characteristics is returned (for examples
+            bsonType of each field), else a list of fields is returned
+
+        :param required_only: when True, only returns fields marked as
+            required in the mongo schema
+
+        :param super_field_name: needed for recursion
+            Example: the field 'article' has
+            subfields 'id' and 'supplier'.
+            If we parse the sub-document corresponding to article, then
+            super_field_name is 'article' and we might get an output like
+            {'article.id': string, 'article.supplier': string}
+
+        :param already_parsed: needed for recursion
+
+        '''
+        schema = deepcopy(schema)
+
+        assert(isinstance(schema, dict)),\
+            "Parameter 'schema' must be a dict"
+
+        if field_info is None:
+            # parse a list of fields
+            if already_parsed is None:
+                already_parsed = []
+            else:
+                assert(isinstance(already_parsed, list)),\
+                    "Parameter 'already_parsed' must be of type list"
+        else:
+            # parse a dictionary of field names with field characteristics
+            if already_parsed is None:
+                already_parsed = {}
+            else:
+                assert(isinstance(already_parsed, dict)),\
+                    "Parameter 'already_parsed' must be of type dict"
+
+        # If schema is nested, then
+        # either it is of bsonType object
+        # and the field information is stored under the key 'properties'
+        # or it is of bsonType array
+        # and the field information is stored in sub-schemas
+        # under the key 'items'
+
+        # if schema is of bsonType object
+        if "properties" in schema.keys():
+            if "required" in schema.keys():
+                required_subfields = schema["required"]
+            else:
+                required_subfields = []
+
+            for sub_field_name in schema["properties"].keys():
+
+                sub_schema = schema["properties"][sub_field_name]
+
+                # only process fields that are required
+                if required_only and\
+                        (sub_field_name not in required_subfields):
+                    pass
+                else:
+                    if super_field_name is not None:
+                        field_name = '.'.join([super_field_name,
+                                               sub_field_name])
+                    else:
+                        field_name = sub_field_name
+
+                    # if the given sub-field is nested, parse the
+                    # sub-schema corresponding to this sub-field
+                    self._parse_one(
+                            schema=sub_schema,
+                            super_field_name=field_name,
+                            field_info=field_info,
+                            already_parsed=already_parsed,
+                            required_only=required_only)
+
+        # if schema is of bsonType array
+        elif "items" in schema.keys():
+            # one schema for all items
+            if isinstance(schema["items"], dict):
+
+                sub_schema = schema["items"]
+
+                self._parse_one(schema=sub_schema,
+                                super_field_name=super_field_name,
+                                field_info=field_info,
+                                already_parsed=already_parsed,
+                                required_only=required_only)
+
+            # list of separate schemas for each item
+            elif isinstance(schema["items"], list):
+
+                for sub_schema in schema["items"]:
+                    self._parse_one(schema=sub_schema,
+                                    super_field_name=super_field_name,
+                                    field_info=field_info,
+                                    already_parsed=already_parsed,
+                                    required_only=required_only)
+            else:
+                raise Exception(('Schema is not composed correctly: '
+                                 'items must be a dictionary or a list'))
+        else:
+            # If neither properties nor items is in schema keys
+            # we reached the last level of nestedness,
+            # field information is stored in the schema keys.
+            field_name = super_field_name
+
+            if field_info is None:
+                already_parsed.append(field_name)
+            else:
+                if field_info in schema.keys():
+                    already_parsed[field_name] = schema[field_info]
+                else:
+                    pass
+
+        return already_parsed
+
+
+if __name__ == "__main__":
+
+    # Only for testing
+
+    schema_path = os.path.join(".", "mongo_schema", "schema_wheelsets.json")
+
+    if os.path.isfile(schema_path):
+
+        parse_obj = ParseJsonSchema(schema_paths=schema_path)
+
+        fields = parse_obj.get_fields()
+
+        required_fileds = parse_obj.get_required_fields()
+
+        patterns = parse_obj.get_patterns()
+
+        mongo_types = parse_obj.get_mongo_types()
+
+        python_types_except_dates = parse_obj.get_python_types()
+
+        datetime_fields = parse_obj.get_datetime_fields()
+
+        allowed_values = parse_obj.get_allowed_values()
+
+        descriptions = parse_obj.get_field_descriptions()

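To illustrate the dotted field names produced by the recursive _parse_one, a small sketch that writes a hypothetical nested schema to a temporary file and parses it (assumes cdplib is importable):

    import json
    import tempfile
    from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema

    # hypothetical schema: an object field 'article' with two string sub-fields
    schema = {"bsonType": "object",
              "required": ["article"],
              "properties": {
                  "article": {"bsonType": "object",
                              "properties": {
                                  "id": {"bsonType": "string"},
                                  "supplier": {"bsonType": "string"}}}}}

    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(schema, f)

    parser = ParseJsonSchema(schema_paths=f.name)

    print(parser.get_fields())       # ['article.id', 'article.supplier']
    print(parser.get_mongo_types())  # {'article.id': 'string', 'article.supplier': 'string'}
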
+ 164 - 0
cdplib/db_migration/ParseMapping.py

@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 20 15:33:17 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import numpy as np
+sys.path.append(os.getcwd())
+
+class ParseMapping:
+    '''
+    '''
+    def __init__(self, mapping_path: str, log_name: str = "ParseMapping",
+                 source: str = "original_name", target: str = "mongo_name",
+                 target_collection: str = "mongo_collection"):
+        '''
+        '''
+        import json
+        from cdplib.log import Log
+
+        self._log = Log('Parse Mapping')
+
+        if not os.path.isfile(mapping_path):
+            err = "Mapping not found"
+            self._log.error(err)
+            raise FileNotFoundError(err)
+
+        try:
+            with open(mapping_path, "r") as f:
+                self._mapping = json.load(f)
+
+        except Exception as e:
+            err = ("Could not load mapping. "
+                   "Exit with error {}".format(e))
+            self._log.error(err)
+            raise Exception(err)
+
+        self._source = source
+        self._target = target
+        self._target_collection = target_collection
+
+    def get_field_mapping(self) -> dict:
+        '''
+        '''
+        assert(all([set([self._source, self._target]) <= set(d)
+                    for d in self._mapping]))
+
+        return {d[self._source]: d[self._target] for d in self._mapping}
+
+    def _get_fields_satistisfying_condition(self, key: str, value) -> list:
+        '''
+        '''
+        assert(all([self._source in d for d in self._mapping])),\
+            "Invalid from field"
+
+        return [d[self._source] for d in self._mapping
+                if (key in d) and (d[key] == value)]
+
+    def get_required_fields(self) -> list:
+        '''
+        '''
+        return self._get_fields_satistisfying_condition(key="required",
+                                                        value=1)
+
+    def get_date_fields(self) -> list:
+        '''
+        '''
+        return self._get_fields_satistisfying_condition(key="type",
+                                                        value="Date")
+
+    def get_fields_restricted_to_collecton(self, collection_name: str) -> list:
+        '''
+        '''
+        return self._get_fields_satistisfying_condition(key=self._target_collection,
+                                                        value=collection_name)
+
+    def _get_info(self, key: str, value=None) -> dict:
+        '''
+        '''
+        assert(all([self._source in d for d in self._mapping])),\
+            "Invalid from field"
+
+        return {d[self._source]: d[key] for d in self._mapping
+                if (key in d) and ((value is not None)
+                and (d[key] == value)) or (key in d)}
+
+    def get_default_values(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="default_values")
+
+    def get_date_formats(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="date_format")
+
+    def get_types(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="type")
+
+    def get_python_types(self) -> dict:
+        '''
+        '''
+        sql_to_python_dtypes = {
+                "Text": str,
+                "Date": np.dtype('<M8[ns]'),
+                "Double": float,
+                "Integer": int
+                }
+
+        sql_types = self.get_types()
+
+        return {k: sql_to_python_dtypes[v] for k, v in sql_types.items()}
+
+    def get_value_mappings(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="value_mapping")
+
+    def get_column_numbers(self) -> list:
+        '''
+        '''
+        if all(["column_number" in d for d in self._mapping]):
+            column_numbers = [d["column_number"] for d in self._mapping]
+
+        elif all(["column_number" not in d for d in self._mapping]):
+            column_numbers = list(range(len(self._mapping)))
+
+        else:
+            err = ("Incorrectly filled mapping. Column numbers should ",
+                   "either in all or in neither of the fields")
+            self.log.err(err)
+            raise Exception(err)
+
+        return column_numbers
+
+
+if __name__ == "__main__":
+
+    mapping_path = os.path.join(".", "migration_mappings", "rs0_mapping.json")
+
+    if os.path.isfile(mapping_path):
+
+        print("found mapping path")
+
+        parser = ParseMapping(mapping_path, source="internal_name",
+                              target="mongo_name")
+
+        internal_to_mongo_mapping = parser.get_field_mapping()
+
+        original_to_internal_mapping = parser.get_field_mapping()
+
+        default_values = parser.get_default_values()
+
+        types = parser.get_types()
+
+        column_numbers = parser.get_column_numbers()
+
+        print("Done testing!")