Browse Source

adapted regex

bscheibel 3 years ago
parent
commit
758e92c150

BIN
__pycache__/order_bounding_boxes_in_each_block.cpython-37.pyc


BIN
__pycache__/read_from_clustered_merged.cpython-37.pyc


+ 0 - 1
order_bounding_boxes_in_each_block.py

@@ -36,7 +36,6 @@ def pdf_to_html(uuid,filepath, path):
     print(filename)
     subprocess.call(['pdftotext', '-bbox-layout',
                      filepath, filename])
-    print("test2")
     return filename
 
 def extract_isos(result):

+ 6 - 3
read_from_clustered_merged.py

@@ -82,10 +82,14 @@ def read(file):
 
 def print_clean(dims): ##alles raus was nicht relevant ist! und zeichen ersetzen!
     dims_new = {}
-    reg_clean = r"ISO|^\S{1}$|[a-zA-Z]{4,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$|^[A-Z]{1}$|^mm$|^\d{2}\.\d{2}\.\d{4}|^-$|A\d|^\d{1}$|^[A-Za-z]{3,}\.?$|^\d{5}|^\d{1}\s\W\s\d"
+    reg_clean = r"ISO|[a-zA-Z]{4,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$|^[A-Z]{1}$|^mm$|^\d{2}\.\d{2}\.\d{4}|^-$|A\d|^\d{1}$|^[A-Za-z]{3,}\.?$|^\d{5}|^\d{1}\s\W\s\d"
+    reg_one_character_only = "^\s*\S{1}\s*x?\s*$" #skip entries that consist of a single non-whitespace character (letter, digit or symbol), optionally followed by a multiplier "x", e.g. 4x
     for dim in dims:
+        #dim = dim.strip()
         if re.search(reg_clean, dim):
             continue
+        elif re.search(reg_one_character_only,dim):
+            continue
         else:
             coords = dims[dim]
             if re.search(r"b\s\d*\W?\d*\s.",dim):
@@ -117,6 +121,7 @@ def print_clean(dims): ##alles raus was nicht relevant ist! und zeichen ersetzen
             reg12 = re.compile(r"(.*\d{1,4}\W?\d{0,4})\s?\+\s-\s?(\d{1,4}\W?\d{0,4})\s?(\d{1,4}\W?\d{0,3})") ## ???? what does this do?? -- search for tolerances written with +/-
             reg13 = re.compile(r"(.*)\+\s\+\s(\d*\W\d*)\s(\d*\W\d*)(.*)")
             reg14 = re.compile(r"(\+\s?\d*,?.?\d*)\s*(\d*,?.?\d*)\s*(\+?\s?\-?\s?\d*,?.?\d*)")
+            reg15 = re.compile(r"\d\s\d\.|\.\d\s\d")
             g = re.search(reg12, dim)
             f = re.search(reg13, dim)
             e = re.search(reg14, dim)
@@ -126,9 +131,7 @@ def print_clean(dims): ##alles raus was nicht relevant ist! und zeichen ersetzen
                 dim = f.group(1) + "+" + f.group(2) + " +" + f.group(3) + f.group(4)
             elif e:
                 dim = e.group(2) + " " + e.group(1) + " " + e.group(3)
-
             dim = dim.replace(" ,",".").replace(", ",".").replace(",",".")
             dims_new[dim] = coords
 
-            print(dims_new)
     return dims_new