Skip to content

Commit

Permalink
added configuration options for tesseract
Browse files: browse the repository at this point in the history
  • Loading branch information
RobinRojowiec committed Jan 22, 2020
1 parent 1469cbf commit 01775c3
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 4 deletions.
19 changes: 19 additions & 0 deletions preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""
IDE: PyCharm
Project: ocr-api
Author: Robin
Filename: preprocessing.py
Date: 21.01.2020
"""
import cv2


def preprocess_image(image_path):
    """Binarize the image stored at ``image_path``, overwriting the file.

    The image is loaded with OpenCV, converted to grayscale, thresholded
    with the ``THRESH_BINARY | THRESH_OTSU`` flags and written back to the
    same path it was read from.

    :param image_path: path of the image file to read and overwrite
    :raises cv2.error: if the image cannot be read, or if the processed
        image cannot be written back
    """
    # cv2.imread does not raise for a missing or undecodable file; it
    # returns None, which would otherwise only surface as an opaque
    # assertion failure inside cvtColor.
    image = cv2.imread(image_path)
    if image is None:
        raise cv2.error("could not read image: {!r}".format(image_path))

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # threshold() returns a (retval, image) pair; only the image is needed.
    binary = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )[1]

    # imwrite reports failure through its boolean return value.
    if not cv2.imwrite(image_path, binary):
        raise cv2.error("could not write image: {!r}".format(image_path))
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ fastapi==0.46.0
uvicorn==0.11.1
python-multipart==0.0.5
pytesseract==0.3.1
xmltodict==0.12.0
xmltodict==0.12.0
pillow==7.0.0
10 changes: 7 additions & 3 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def hocr_to_simple_json(hocr_dict: dict, lang: str):


@app.post("/api/extract", response_model=ExtractedPage, description="Extract text with positions from image")
def extract_text(file: UploadFile = File(...), lang: str = "eng", text_only: bool = False):
def extract_text(file: UploadFile = File(...), lang: str = "eng", text_only: bool = False, custom_config: str = None):
"""
:param file:
:param lang: available: deu, eng
Expand All @@ -124,11 +124,15 @@ def extract_text(file: UploadFile = File(...), lang: str = "eng", text_only: boo
with open(filepath, "wb") as temp_file:
temp_file.write(file.file.read())

# preprocess_image(filepath)
if custom_config is None:
custom_config = '--oem 3'

if text_only:
output = pytesseract.image_to_string(filepath, lang=lang)
output = bytes(pytesseract.image_to_string(filepath, lang=lang, config=custom_config), encoding="utf-8")
response = PlainTextResponse(content=output)
else:
output = pytesseract.image_to_pdf_or_hocr(filepath, lang=lang, extension='hocr')
output = pytesseract.image_to_pdf_or_hocr(filepath, lang=lang, extension='hocr', config=custom_config)
extracted = xmltodict.parse(output)
response = hocr_to_simple_json(extracted, lang)

Expand Down

0 comments on commit 01775c3

Please sign in to comment.