瀏覽代碼

update migrationcleaning

tsteuer 4 年之前
父節點
當前提交
195ff25fc2
共有 1 個文件被更改，包括 28 次插入和 0 次删除
  1. 28 0
      cdplib/db_migration/MigrationCleaning.py

+ 28 - 0
cdplib/db_migration/MigrationCleaning.py

@@ -19,6 +19,8 @@ from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema
 from cdplib.utils.ExceptionsHandler import ExceptionsHandler
 from cdplib.utils.CleaningUtils import CleaningUtils
 from cdplib.log import Log
+import json
+from boltons.iterutils import remap
 
 class MigrationCleaning:
     '''
@@ -376,6 +378,10 @@ class MigrationCleaning:
                     data[column] = data[column].astype(python_type)
                     python_type = object
                     data[column] = data[column].astype(python_type)
+                    
+                elif python_type == float:
+                    data = data.fillna(np.inf)
+                    data[column] = data[column].astype(python_type)
 
                 else:
 
@@ -523,6 +529,28 @@ class MigrationCleaning:
                 continue
 
         return data
+    
+    def drop_columns_with_no_content(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        Remove every column that carries no usable information.
+
+        A column is removed when each of its values is either missing
+        (None / NaN) or one of the placeholder strings '-' and 'n.a'.
+
+        :param data: table to clean; it is not modified in place
+        :return: a new DataFrame without the empty columns
+        '''
+        # Placeholder strings that stand for "no value".
+        no_content_signs = {'-', 'n.a'}
+
+        # Columns that consist of missing values only.
+        data = data.dropna(how='all', axis='columns')
+
+        empty_columns = []
+        for column in data.columns:
+            # Missing values are left out of the comparison, so a mix of
+            # NaN and placeholders counts as empty in the same way as a
+            # mix of None and placeholders does.
+            present_values = set(data[column].dropna().unique())
+            if present_values <= no_content_signs:
+                empty_columns.append(column)
+
+        # Drop once at the end instead of copying the frame per column.
+        return data.drop(columns=empty_columns)
+    
+    def clean_json_from_None_object(self, data: pd.DataFrame) -> dict:
+        '''
+        Convert the data to a JSON-like structure without empty entries.
+
+        The table is serialized with DataFrame.to_json (dates in ISO
+        format), parsed back into Python objects and then pruned: every
+        entry whose value is None, the string 'None', the string 'inf'
+        or otherwise falsy is removed. Boolean values are always kept.
+
+        :param data: table to convert
+        :return: the parsed JSON structure after pruning (not a DataFrame)
+        '''
+        def has_content(path, key, value) -> bool:
+            if isinstance(value, bool):
+                # False is a real value, not a missing one.
+                return True
+            if value is None or value == 'None' or value == 'inf':
+                return False
+            # NOTE(review): this also removes 0, 0.0 and '' - confirm that
+            # numeric zeros are really meant to be dropped.
+            return bool(value)
+
+        parsed = json.loads(data.to_json(date_format="iso"))
+
+        # remap visits a container after its children, so a container
+        # left empty by the pruning is removed in the same pass.
+        return remap(parsed, has_content)
 
     def restrict_to_collection(self, data: pd.DataFrame, collection_name: str) -> pd.DataFrame:
         '''