added configuration options for tesseract

RobinRojowiec · Jan 22, 2020 · 01775c3 · 01775c3
1 parent 1469cbf
commit 01775c3
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 4 deletions.
diff --git a/preprocessing.py b/preprocessing.py
@@ -0,0 +1,19 @@
+"""
+
+IDE: PyCharm
+Project: ocr-api
+Author: Robin
+Filename: preprocessing.py
+Date: 21.01.2020
+
+"""
+import cv2
+
+
+def preprocess_image(image_path):
+    """Binarize the image stored at ``image_path``, overwriting the file.
+
+    The image is loaded, converted to grayscale and thresholded to pure
+    black/white using Otsu's method; the result is written back to the
+    same path.
+
+    :param image_path: path of the image file to process in place
+    :raises ValueError: if the file cannot be read or decoded as an image
+    :raises OSError: if the processed image cannot be written back
+    """
+    image = cv2.imread(image_path)
+    # cv2.imread reports failure by returning None instead of raising; fail
+    # here with a clear message rather than deep inside cvtColor.
+    if image is None:
+        raise ValueError(f"could not read image: {image_path}")
+
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    # With THRESH_OTSU the threshold is derived from the image itself, so
+    # the 0 passed as the threshold argument is only a placeholder.
+    _, binary = cv2.threshold(gray, 0, 255,
+                              cv2.THRESH_BINARY | cv2.THRESH_OTSU)
+
+    # cv2.imwrite returns False on failure; do not let that pass silently.
+    if not cv2.imwrite(image_path, binary):
+        raise OSError(f"could not write image: {image_path}")
diff --git a/requirements.txt b/requirements.txt
@@ -2,4 +2,5 @@ fastapi==0.46.0
 uvicorn==0.11.1
 python-multipart==0.0.5
 pytesseract==0.3.1
-xmltodict==0.12.0
+xmltodict==0.12.0
+pillow==7.0.0
diff --git a/server.py b/server.py
@@ -113,7 +113,7 @@ def hocr_to_simple_json(hocr_dict: dict, lang: str):
 
 
 @app.post("/api/extract", response_model=ExtractedPage, description="Extract text with positions from image")
-def extract_text(file: UploadFile = File(...), lang: str = "eng", text_only: bool = False):
+def extract_text(file: UploadFile = File(...), lang: str = "eng", text_only: bool = False, custom_config: str = None):
     """
     :param file:
     :param lang: available: deu, eng
@@ -124,11 +124,15 @@ def extract_text(file: UploadFile = File(...), lang: str = "eng", text_only: boo
         with open(filepath, "wb") as temp_file:
             temp_file.write(file.file.read())
 
+    # preprocess_image(filepath)
+    if custom_config is None:
+        custom_config = '--oem 3'
+
     if text_only:
-        output = pytesseract.image_to_string(filepath, lang=lang)
+        output = bytes(pytesseract.image_to_string(filepath, lang=lang, config=custom_config), encoding="utf-8")
         response = PlainTextResponse(content=output)
     else:
-        output = pytesseract.image_to_pdf_or_hocr(filepath, lang=lang, extension='hocr')
+        output = pytesseract.image_to_pdf_or_hocr(filepath, lang=lang, extension='hocr', config=custom_config)
         extracted = xmltodict.parse(output)
         response = hocr_to_simple_json(extracted, lang)