filter out text noise by its content

soni-aditya · Jul 5, 2021 · 4eba4f2 · 4eba4f2
1 parent f17fd4f
commit 4eba4f2
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 2 deletions.
diff --git a/detect_text/__pycache__/text_detection.cpython-35.pyc b/detect_text/__pycache__/text_detection.cpython-35.pyc
diff --git a/detect_text/text_detection.py b/detect_text/text_detection.py
@@ -84,15 +84,25 @@ def text_cvt_orc_format(ocr_result):
     return texts
 
 
-def text_detection(input_file='../data/input/30800.jpg', output_file='../data/output', show=False):
+def text_filter_noise(texts):
+    """Drop texts of at most one character unless the lowercased content is 'a' or one of , . ! ?"""
+    keep_single = ['a', ',', '.', '!', '?']
+    return [
+        t for t in texts
+        if len(t.content) > 1 or t.content.lower() in keep_single
+    ]
+
+
+def text_detection(input_file='../data/input/30800.jpg', output_file='../data/output', word_inline_gap=10, show=False):
     start = time.clock()
     name = input_file.split('/')[-1][:-4]
     oct_root = pjoin(output_file, 'ocr')
     img = cv2.imread(input_file)
 
     ocr_result = ocr.ocr_detection_google(input_file)
     texts = text_cvt_orc_format(ocr_result)
-    texts = text_sentences_recognition(texts, bias_justify=5, bias_gap=50)
+    texts = text_filter_noise(texts)
+    texts = text_sentences_recognition(texts, bias_justify=3, bias_gap=word_inline_gap)
     visualize_texts(img, texts, (600, 900), show=show, write_path=pjoin(oct_root, name+'.png'))
     save_detection_json(pjoin(oct_root, name+'.json'), texts, img.shape)
     print("[Text Detection Completed in %.3f s] %s" % (time.clock() - start, input_file))