Skip to content

Commit

Permalink
refactor: Moved doctr.documents to doctr.io (mindee#390)
Browse files Browse the repository at this point in the history
* refactor: Moved doctr.documents to doctr.io

* refactor: Refactored doctr.datasets

* refactor: Updated imports

* docs: Updated README

* fix: Fixed syntax

* style: Fixed lint

* test: Updated unittests

* refactor: Updated import

* docs: Updated documentation

* refactor: Moved reader types to doctr.utils

* feat: Added binary decoding as tensor image

* test: Updated unittests

* refactor: Refactored API

* fix: Fixed import

* fix: Fixed API routes

* test: Fixed imports

* refactor: Removed unused imports

* test: Removed unused import

* refactor: Removed unused imports

* test: Updated unittests

* test: Fixed imports
  • Loading branch information
fg-mindee authored Jul 26, 2021
1 parent 8732665 commit 2bdcb3f
Show file tree
Hide file tree
Showing 35 changed files with 427 additions and 201 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrai
Documents can be interpreted from PDF or images:

```python
from doctr.documents import DocumentFile
from doctr.io import DocumentFile
# PDF
pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
# Image
Expand All @@ -45,7 +45,7 @@ multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jp
### Putting it together
Let's use the default pretrained model for an example:
```python
from doctr.documents import DocumentFile
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)
Expand Down
5 changes: 3 additions & 2 deletions api/app/routes/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from fastapi import APIRouter, UploadFile, File
from typing import List

from app.vision import decode_image, det_predictor
from doctr.io import decode_img_as_tensor
from app.vision import det_predictor
from app.schemas import DetectionOut


Expand All @@ -16,6 +17,6 @@
@router.post("/", response_model=List[DetectionOut], status_code=200, summary="Perform text detection")
async def text_detection(file: UploadFile = File(...)):
"""Runs DocTR text detection model to analyze the input"""
img = decode_image(file.file.read())
img = decode_img_as_tensor(file.file.read())
boxes, _ = det_predictor([img], training=False)[0]
return [DetectionOut(box=box.tolist()) for box in boxes[:, :-1]]
5 changes: 3 additions & 2 deletions api/app/routes/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from fastapi import APIRouter, UploadFile, File
from typing import List

from app.vision import decode_image, predictor
from doctr.io import decode_img_as_tensor
from app.vision import predictor
from app.schemas import OCROut


Expand All @@ -16,7 +17,7 @@
@router.post("/", response_model=List[OCROut], status_code=200, summary="Perform OCR")
async def perform_ocr(file: UploadFile = File(...)):
"""Runs DocTR OCR model to analyze the input"""
img = decode_image(file.file.read())
img = decode_img_as_tensor(file.file.read())
out = predictor([img], training=False)

return [OCROut(box=(*word.geometry[0], *word.geometry[1]), value=word.value)
Expand Down
5 changes: 3 additions & 2 deletions api/app/routes/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

from fastapi import APIRouter, UploadFile, File

from app.vision import decode_image, reco_predictor
from doctr.io import decode_img_as_tensor
from app.vision import reco_predictor
from app.schemas import RecognitionOut


Expand All @@ -15,6 +16,6 @@
@router.post("/", response_model=RecognitionOut, status_code=200, summary="Perform text recognition")
async def text_recognition(file: UploadFile = File(...)):
"""Runs DocTR text recognition model to analyze the input"""
img = decode_image(file.file.read())
img = decode_img_as_tensor(file.file.read())
out = reco_predictor([img], training=False)
return RecognitionOut(value=out[0][0])
5 changes: 0 additions & 5 deletions api/app/vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,3 @@
predictor = ocr_predictor(pretrained=True)
det_predictor = predictor.det_predictor
reco_predictor = predictor.reco_predictor


def decode_image(img_bytes: bytes) -> tf.Tensor:
"""Decodes an image from bytes"""
return tf.io.decode_image(img_bytes, channels=3)
2 changes: 1 addition & 1 deletion demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
if any(gpu_devices):
tf.config.experimental.set_memory_growth(gpu_devices[0], True)

from doctr.documents import DocumentFile
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from doctr.utils.visualization import synthetize_page, visualize_page

Expand Down
2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Supported datasets
:caption: Package Reference

datasets
documents
io
models
transforms
utils
14 changes: 9 additions & 5 deletions docs/source/documents.rst → docs/source/io.rst
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
doctr.documents
===============
doctr.io
========


.. currentmodule:: doctr.documents
.. currentmodule:: doctr.io

The documents module enables users to easily access content from documents and export analysis
The io module enables users to easily access content from documents and export analysis
results to structured formats.

.. _document_structure:
Expand Down Expand Up @@ -66,7 +66,11 @@ High-performance file reading and conversion to processable structured data.

.. autofunction:: read_pdf

.. autofunction:: read_img
.. autofunction:: read_img_as_numpy

.. autofunction:: read_img_as_tensor

.. autofunction:: decode_img_as_tensor

.. autofunction:: read_html

Expand Down
2 changes: 1 addition & 1 deletion doctr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .file_utils import is_tf_available, is_torch_available
from .version import __version__ # noqa: F401
from . import documents
from . import io
from . import transforms
from . import utils
from . import models
Expand Down
17 changes: 2 additions & 15 deletions doctr/datasets/datasets/pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@

import os
from typing import List, Any, Tuple
import numpy as np
from PIL import Image
import torch
from torchvision.transforms.functional import to_tensor

from doctr.io import read_img_as_tensor
from .base import _AbstractDataset, _VisionDataset


Expand All @@ -25,18 +23,7 @@ def _get_img_shape(img: Any) -> Tuple[int, int]:
def _read_sample(self, index: int) -> Tuple[torch.Tensor, Any]:
img_name, target = self.data[index]
# Read image
pil_img = Image.open(os.path.join(self.root, img_name), mode='r').convert('RGB')
if self.fp16:
img = torch.from_numpy(
np.array(pil_img, np.uint8, copy=True)
)
img = img.view(pil_img.size[1], pil_img.size[0], len(pil_img.getbands()))
# put it from HWC to CHW format
img = img.permute((2, 0, 1)).contiguous()
# Switch to FP16
img = img.to(dtype=torch.float16).div(255)
else:
img = to_tensor(pil_img)
img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=torch.float16 if self.fp16 else torch.float32)

return img, target

Expand Down
10 changes: 2 additions & 8 deletions doctr/datasets/datasets/tensorflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import List, Any, Tuple
import tensorflow as tf

from doctr.io import read_img_as_tensor
from .base import _AbstractDataset, _VisionDataset


Expand All @@ -22,14 +23,7 @@ def _get_img_shape(img: Any) -> Tuple[int, int]:
def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
img_name, target = self.data[index]
# Read image
img = tf.io.read_file(os.path.join(self.root, img_name))
img = tf.image.decode_jpeg(img, channels=3)
if self.fp16:
img = tf.image.convert_image_dtype(img, dtype=tf.float16)
else:
img = tf.image.convert_image_dtype(img, dtype=tf.float32)

img = tf.clip_by_value(img, 0, 1)
img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32)

return img, target

Expand Down
2 changes: 0 additions & 2 deletions doctr/documents/__init__.py

This file was deleted.

5 changes: 5 additions & 0 deletions doctr/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .elements import *
from .reader import *
from .image import *
from .pdf import *
from .html import *
File renamed without changes.
25 changes: 25 additions & 0 deletions doctr/io/html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (C) 2021, Mindee.

# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

from weasyprint import HTML
from typing import Any

__all__ = ['read_html']


def read_html(url: str, **kwargs: Any) -> bytes:
    """Read a web page and convert it into a PDF byte stream

    Example::
        >>> from doctr.io import read_html
        >>> doc = read_html("https://www.yoursite.com")

    Args:
        url: URL of the target web page
        **kwargs: extra keyword arguments forwarded to the `weasyprint.HTML` constructor

    Returns:
        the rendered PDF file as a bytes stream
    """

    return HTML(url, **kwargs).write_pdf()
8 changes: 8 additions & 0 deletions doctr/io/image/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from .base import *

from doctr.file_utils import is_tf_available, is_torch_available

if is_tf_available():
from .tensorflow import *
elif is_torch_available():
from .pytorch import * # type: ignore[misc]
53 changes: 53 additions & 0 deletions doctr/io/image/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (C) 2021, Mindee.

# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

from pathlib import Path
from typing import Optional, Tuple
import numpy as np
import cv2
from doctr.utils.common_types import AbstractFile

__all__ = ['read_img_as_numpy']


def read_img_as_numpy(
    file: AbstractFile,
    output_size: Optional[Tuple[int, int]] = None,
    rgb_output: bool = True,
) -> np.ndarray:
    """Read an image file into numpy format

    Example::
        >>> from doctr.io import read_img_as_numpy
        >>> page = read_img_as_numpy("path/to/your/doc.jpg")

    Args:
        file: the path to the image file, or the raw bytes of an encoded image
        output_size: the expected output size of each page in format H x W
        rgb_output: whether the output ndarray channel order should be RGB instead of BGR.

    Returns:
        the page decoded as numpy ndarray of shape H x W x 3

    Raises:
        FileNotFoundError: if `file` is a path that does not point to an existing file
        TypeError: if `file` is neither a path nor a bytes object
        ValueError: if the content cannot be decoded as an image
    """

    if isinstance(file, (str, Path)):
        if not Path(file).is_file():
            raise FileNotFoundError(f"unable to access {file}")
        img = cv2.imread(str(file), cv2.IMREAD_COLOR)
    elif isinstance(file, bytes):
        # An empty payload can never hold an image; depending on the OpenCV version, imdecode may
        # fail with a low-level cv2.error on it, so report it like any other unreadable content.
        if len(file) == 0:
            raise ValueError("unable to read file.")
        # Bind the buffer to its own name rather than rebinding the `file` argument to another type
        buffer = np.frombuffer(file, np.uint8)
        img = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    else:
        raise TypeError("unsupported object type for argument 'file'")

    # Validity check: OpenCV signals an undecodable input by returning None
    if img is None:
        raise ValueError("unable to read file.")
    # Resizing: cv2.resize expects (W, H) while output_size is given as (H, W)
    if isinstance(output_size, tuple):
        img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR)
    # Switch the channel order (OpenCV decodes images as BGR)
    if rgb_output:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img
70 changes: 70 additions & 0 deletions doctr/io/image/pytorch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright (C) 2021, Mindee.

# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

import numpy as np
from PIL import Image
from io import BytesIO
import torch
from torchvision.transforms.functional import to_tensor

from doctr.utils.common_types import AbstractPath

__all__ = ['read_img_as_tensor', 'decode_img_as_tensor']


def _from_pil_img(pil_img: Image.Image, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Convert a PIL image into a channel-first PyTorch tensor

    Args:
        pil_img: the PIL image to convert
        dtype: the desired data type of the output tensor. `torch.float32` goes through torchvision's
            `to_tensor`, `torch.float16` is divided by 255, and any other value leaves the raw uint8 pixels.

    Returns:
        the image as a tensor in C x H x W format
    """

    if dtype == torch.float32:
        return to_tensor(pil_img)

    img = torch.from_numpy(
        np.array(pil_img, np.uint8, copy=True)
    )
    # PIL reports its size as (width, height): lay the pixels out as H x W x C
    img = img.view(pil_img.size[1], pil_img.size[0], len(pil_img.getbands()))
    # put it from HWC to CHW format
    img = img.permute((2, 0, 1)).contiguous()
    if dtype == torch.float16:
        # Switch to FP16
        img = img.to(dtype=torch.float16).div(255)

    return img


def read_img_as_tensor(img_path: AbstractPath, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Read an image file as a PyTorch tensor

    Args:
        img_path: location of the image file
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
        decoded image as a tensor

    Raises:
        ValueError: if `dtype` is not one of torch.uint8, torch.float16 or torch.float32
    """

    if dtype not in (torch.uint8, torch.float16, torch.float32):
        raise ValueError("unsupported value for dtype")

    # Close the underlying file explicitly once `convert` has produced the RGB image
    with Image.open(img_path, mode='r') as raw_img:
        pil_img = raw_img.convert('RGB')

    return _from_pil_img(pil_img, dtype)


def decode_img_as_tensor(img_content: bytes, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Read a byte stream as a PyTorch tensor

    Args:
        img_content: bytes of an encoded image (e.g. the raw content of an image file)
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
        decoded image as a tensor

    Raises:
        ValueError: if `dtype` is not one of torch.uint8, torch.float16 or torch.float32
    """

    if dtype not in (torch.uint8, torch.float16, torch.float32):
        raise ValueError("unsupported value for dtype")

    # Close the image (and its in-memory stream) explicitly once `convert` has produced the RGB image
    with Image.open(BytesIO(img_content), mode='r') as raw_img:
        pil_img = raw_img.convert('RGB')

    return _from_pil_img(pil_img, dtype)
Loading

0 comments on commit 2bdcb3f

Please sign in to comment.