refactor: Moved doctr.documents to doctr.io (mindee#390)

* refactor: Moved doctr.documents to doctr.io
* refactor: Refactored doctr.datasets
* refactor: Updated imports
* docs: Updated README
* fix: Fixed syntax
* style: Fixed lint
* test: Updated unittests
* refactor: Updated import
* docs: Updated documentation
* refactor: Moved reader types to doctr.utils
* feat: Added binary decoding as tensor image
* test: Updated unittests
* refactor: Refactored API
* fix: Fixed import
* fix: Fixed API routes
* test: Fixed imports
* refactor: Removed unused imports
* test: Removed unused import
* refactor: Removed unused imports
* test: Updated unittests
* test: Fixed imports
ANASS812 · Jul 26, 2021 · 2bdcb3f · 2bdcb3f
1 parent 8732665
commit 2bdcb3f
Show file tree

Hide file tree

Showing 35 changed files with 427 additions and 201 deletions.
diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrai
 Documents can be interpreted from PDF or images:
 
 ```python
-from doctr.documents import DocumentFile
+from doctr.io import DocumentFile
 # PDF
 pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
 # Image
@@ -45,7 +45,7 @@ multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jp
 ### Putting it together
 Let's use the default pretrained model for an example:
 ```python
-from doctr.documents import DocumentFile
+from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 
 model = ocr_predictor(pretrained=True)

diff --git a/api/app/routes/detection.py b/api/app/routes/detection.py
@@ -6,7 +6,8 @@
 from fastapi import APIRouter, UploadFile, File
 from typing import List
 
-from app.vision import decode_image, det_predictor
+from doctr.io import decode_img_as_tensor
+from app.vision import det_predictor
 from app.schemas import DetectionOut
 
 
@@ -16,6 +17,6 @@
 @router.post("/", response_model=List[DetectionOut], status_code=200, summary="Perform text detection")
 async def text_detection(file: UploadFile = File(...)):
     """Runs DocTR text detection model to analyze the input"""
-    img = decode_image(file.file.read())
+    img = decode_img_as_tensor(file.file.read())
     boxes, _ = det_predictor([img], training=False)[0]
     return [DetectionOut(box=box.tolist()) for box in boxes[:, :-1]]
diff --git a/api/app/routes/ocr.py b/api/app/routes/ocr.py
@@ -6,7 +6,8 @@
 from fastapi import APIRouter, UploadFile, File
 from typing import List
 
-from app.vision import decode_image, predictor
+from doctr.io import decode_img_as_tensor
+from app.vision import predictor
 from app.schemas import OCROut
 
 
@@ -16,7 +17,7 @@
 @router.post("/", response_model=List[OCROut], status_code=200, summary="Perform OCR")
 async def perform_ocr(file: UploadFile = File(...)):
     """Runs DocTR OCR model to analyze the input"""
-    img = decode_image(file.file.read())
+    img = decode_img_as_tensor(file.file.read())
     out = predictor([img], training=False)
 
     return [OCROut(box=(*word.geometry[0], *word.geometry[1]), value=word.value)

diff --git a/api/app/routes/recognition.py b/api/app/routes/recognition.py
@@ -5,7 +5,8 @@
 
 from fastapi import APIRouter, UploadFile, File
 
-from app.vision import decode_image, reco_predictor
+from doctr.io import decode_img_as_tensor
+from app.vision import reco_predictor
 from app.schemas import RecognitionOut
 
 
@@ -15,6 +16,6 @@
 @router.post("/", response_model=RecognitionOut, status_code=200, summary="Perform text recognition")
 async def text_recognition(file: UploadFile = File(...)):
     """Runs DocTR text recognition model to analyze the input"""
-    img = decode_image(file.file.read())
+    img = decode_img_as_tensor(file.file.read())
     out = reco_predictor([img], training=False)
     return RecognitionOut(value=out[0][0])
diff --git a/api/app/vision.py b/api/app/vision.py
@@ -15,8 +15,3 @@
 predictor = ocr_predictor(pretrained=True)
 det_predictor = predictor.det_predictor
 reco_predictor = predictor.reco_predictor
-
-
-def decode_image(img_bytes: bytes) -> tf.Tensor:
-    """Decodes an image from bytes"""
-    return tf.io.decode_image(img_bytes, channels=3)
diff --git a/demo/app.py b/demo/app.py
@@ -16,7 +16,7 @@
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
-from doctr.documents import DocumentFile
+from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 from doctr.utils.visualization import synthetize_page, visualize_page
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -77,7 +77,7 @@ Supported datasets
    :caption: Package Reference
 
    datasets
-   documents
+   io
    models
    transforms
    utils
diff --git a/docs/source/documents.rst → docs/source/io.rst b/docs/source/documents.rst → docs/source/io.rst
@@ -1,10 +1,10 @@
-doctr.documents
-===============
+doctr.io
+========
 
 
-.. currentmodule:: doctr.documents
+.. currentmodule:: doctr.io
 
-The documents module enables users to easily access content from documents and export analysis
+The io module enables users to easily access content from documents and export analysis
 results to structured formats.
 
 .. _document_structure:
@@ -66,7 +66,11 @@ High-performance file reading and conversion to processable structured data.
 
 .. autofunction:: read_pdf
 
-.. autofunction:: read_img
+.. autofunction:: read_img_as_numpy
+
+.. autofunction:: read_img_as_tensor
+
+.. autofunction:: decode_img_as_tensor
 
 .. autofunction:: read_html
 

diff --git a/doctr/__init__.py b/doctr/__init__.py
@@ -1,6 +1,6 @@
 from .file_utils import is_tf_available, is_torch_available
 from .version import __version__  # noqa: F401
-from . import documents
+from . import io
 from . import transforms
 from . import utils
 from . import models

diff --git a/doctr/datasets/datasets/pytorch.py b/doctr/datasets/datasets/pytorch.py
@@ -5,11 +5,9 @@
 
 import os
 from typing import List, Any, Tuple
-import numpy as np
-from PIL import Image
 import torch
-from torchvision.transforms.functional import to_tensor
 
+from doctr.io import read_img_as_tensor
 from .base import _AbstractDataset, _VisionDataset
 
 
@@ -25,18 +23,7 @@ def _get_img_shape(img: Any) -> Tuple[int, int]:
     def _read_sample(self, index: int) -> Tuple[torch.Tensor, Any]:
         img_name, target = self.data[index]
         # Read image
-        pil_img = Image.open(os.path.join(self.root, img_name), mode='r').convert('RGB')
-        if self.fp16:
-            img = torch.from_numpy(
-                np.array(pil_img, np.uint8, copy=True)
-            )
-            img = img.view(pil_img.size[1], pil_img.size[0], len(pil_img.getbands()))
-            # put it from HWC to CHW format
-            img = img.permute((2, 0, 1)).contiguous()
-            # Switch to FP16
-            img = img.to(dtype=torch.float16).div(255)
-        else:
-            img = to_tensor(pil_img)
+        img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=torch.float16 if self.fp16 else torch.float32)
 
         return img, target
 

diff --git a/doctr/datasets/datasets/tensorflow.py b/doctr/datasets/datasets/tensorflow.py
@@ -7,6 +7,7 @@
 from typing import List, Any, Tuple
 import tensorflow as tf
 
+from doctr.io import read_img_as_tensor
 from .base import _AbstractDataset, _VisionDataset
 
 
@@ -22,14 +23,7 @@ def _get_img_shape(img: Any) -> Tuple[int, int]:
     def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
         img_name, target = self.data[index]
         # Read image
-        img = tf.io.read_file(os.path.join(self.root, img_name))
-        img = tf.image.decode_jpeg(img, channels=3)
-        if self.fp16:
-            img = tf.image.convert_image_dtype(img, dtype=tf.float16)
-        else:
-            img = tf.image.convert_image_dtype(img, dtype=tf.float32)
-
-        img = tf.clip_by_value(img, 0, 1)
+        img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32)
 
         return img, target
 

diff --git a/doctr/documents/__init__.py b/doctr/documents/__init__.py
diff --git a/doctr/io/__init__.py b/doctr/io/__init__.py
@@ -0,0 +1,5 @@
+from .elements import *
+from .reader import *
+from .image import *
+from .pdf import *
+from .html import *
diff --git a/doctr/documents/elements.py → doctr/io/elements.py b/doctr/documents/elements.py → doctr/io/elements.py
diff --git a/doctr/io/html.py b/doctr/io/html.py
@@ -0,0 +1,25 @@
+# Copyright (C) 2021, Mindee.
+
+# This program is licensed under the Apache License version 2.
+# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+
+from weasyprint import HTML
+from typing import Any
+
+__all__ = ['read_html']
+
+
+def read_html(url: str, **kwargs: Any) -> bytes:
+    """Read a web page and convert it into a PDF file
+
+    Example::
+        >>> from doctr.io import read_html
+        >>> doc = read_html("https://www.yoursite.com")
+
+    Args:
+        url: URL of the target web page
+    Returns:
+        decoded PDF file as a bytes stream
+    """
+
+    return HTML(url, **kwargs).write_pdf()
diff --git a/doctr/io/image/__init__.py b/doctr/io/image/__init__.py
@@ -0,0 +1,8 @@
+from .base import *
+
+from doctr.file_utils import is_tf_available, is_torch_available
+
+if is_tf_available():
+    from .tensorflow import *
+elif is_torch_available():
+    from .pytorch import *  # type: ignore[misc]
diff --git a/doctr/io/image/base.py b/doctr/io/image/base.py
@@ -0,0 +1,53 @@
+# Copyright (C) 2021, Mindee.
+
+# This program is licensed under the Apache License version 2.
+# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+
+from pathlib import Path
+from typing import Optional, Tuple
+import numpy as np
+import cv2
+from doctr.utils.common_types import AbstractFile
+
+__all__ = ['read_img_as_numpy']
+
+
+def read_img_as_numpy(
+    file: AbstractFile,
+    output_size: Optional[Tuple[int, int]] = None,
+    rgb_output: bool = True,
+) -> np.ndarray:
+    """Read an image file into numpy format
+
+    Example::
+        >>> from doctr.io import read_img_as_numpy
+        >>> page = read_img_as_numpy("path/to/your/doc.jpg")
+
+    Args:
+        file: the path to the image file, or the raw bytes of the encoded image
+        output_size: the expected output size of each page in format H x W
+        rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
+    Returns:
+        the page decoded as numpy ndarray of shape H x W x 3
+    """
+
+    if isinstance(file, (str, Path)):
+        if not Path(file).is_file():
+            raise FileNotFoundError(f"unable to access {file}")
+        img = cv2.imread(str(file), cv2.IMREAD_COLOR)
+    elif isinstance(file, bytes):
+        file = np.frombuffer(file, np.uint8)
+        img = cv2.imdecode(file, cv2.IMREAD_COLOR)
+    else:
+        raise TypeError("unsupported object type for argument 'file'")
+
+    # Validity check
+    if img is None:
+        raise ValueError("unable to read file.")
+    # Resizing
+    if isinstance(output_size, tuple):
+        img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR)
+    # Switch the channel order
+    if rgb_output:
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+    return img
diff --git a/doctr/io/image/pytorch.py b/doctr/io/image/pytorch.py
@@ -0,0 +1,70 @@
+# Copyright (C) 2021, Mindee.
+
+# This program is licensed under the Apache License version 2.
+# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+
+import numpy as np
+from PIL import Image
+from io import BytesIO
+import torch
+from torchvision.transforms.functional import to_tensor
+
+from doctr.utils.common_types import AbstractPath
+
+__all__ = ['read_img_as_tensor', 'decode_img_as_tensor']
+
+
+def _from_pil_img(pil_img: Image, dtype: torch.dtype = torch.float32) -> torch.Tensor:
+
+    if dtype == torch.float32:
+        img = to_tensor(pil_img)
+    else:
+        img = torch.from_numpy(
+            np.array(pil_img, np.uint8, copy=True)
+        )
+        img = img.view(pil_img.size[1], pil_img.size[0], len(pil_img.getbands()))
+        # put it from HWC to CHW format
+        img = img.permute((2, 0, 1)).contiguous()
+        if dtype == torch.float16:
+            # Switch to FP16
+            img = img.to(dtype=torch.float16).div(255)
+
+    return img
+
+
+def read_img_as_tensor(img_path: AbstractPath, dtype: torch.dtype = torch.float32) -> torch.Tensor:
+    """Read an image file as a PyTorch tensor
+
+    Args:
+        img_path: location of the image file
+        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
+
+    Returns:
+        decoded image as a tensor
+    """
+
+    if dtype not in (torch.uint8, torch.float16, torch.float32):
+        raise ValueError("insupported value for dtype")
+
+    pil_img = Image.open(img_path, mode='r').convert('RGB')
+
+    return _from_pil_img(pil_img, dtype)
+
+
+def decode_img_as_tensor(img_content: bytes, dtype: torch.dtype = torch.float32) -> torch.Tensor:
+    """Read a byte stream as a PyTorch tensor
+
+    Args:
+        img_content: bytes of an encoded image
+        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
+
+    Returns:
+        decoded image as a tensor
+    """
+
+    if dtype not in (torch.uint8, torch.float16, torch.float32):
+        raise ValueError("insupported value for dtype")
+
+    pil_img = Image.open(BytesIO(img_content), mode='r').convert('RGB')
+
+    return _from_pil_img(pil_img, dtype)