Skip to content

Commit

Permalink
refactor: Switched from PyMuPDF to pypdfium2 (mindee#829)
Browse files Browse the repository at this point in the history
* chore: Updated PDF lib

* refactor: Refactored PDF parsing

* test: Updated unittests

* docs: Updated instructions

* refactor: Switched to another PDF backend

* docs: Updated documentation

* fix: Fixed demo

* refactor: Removed legacy imports

* style: Updated mypy config

* fix: Fixed read_pdf

* chore: Updated deps

* test: Fixed unittests

* fix: Fixed analysis script

* chore: Fixed requirements

* test: Removed PyMuPDF from unittests

* chore: Removed PyMuPDF

* test: Fixed unittest

* fix: Fixed Dockerfile
  • Loading branch information
fg-mindee authored Feb 24, 2022
1 parent 0f79736 commit 2581daa
Show file tree
Hide file tree
Showing 17 changed files with 49 additions and 251 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Build docker image
run: docker build . -t doctr-py3.8.1-tf2.4-slim
run: docker build . -t doctr-tf-py3.8-slim
- name: Run docker container
run: docker run doctr-py3.8.1-tf2.4-slim python -c 'import doctr'
run: docker run doctr-tf-py3.8-slim python -c 'import doctr'

pytest-api:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.8.1-slim
FROM python:3.8-slim

ENV PYTHONUNBUFFERED 1
ENV PYTHONDONTWRITEBYTECODE 1
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ Documents can be interpreted from PDF or images:
```python
from doctr.io import DocumentFile
# PDF
pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
# Image
single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
# Webpage
webpage_doc = DocumentFile.from_url("https://www.yoursite.com").as_images()
webpage_doc = DocumentFile.from_url("https://www.yoursite.com")
# Multiple page images
multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])
```
Expand All @@ -51,7 +51,7 @@ from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)
# PDF
doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
# Analyze
result = model(doc)
```
Expand Down
2 changes: 1 addition & 1 deletion demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def main():
uploaded_file = st.sidebar.file_uploader("Upload files", type=['pdf', 'png', 'jpeg', 'jpg'])
if uploaded_file is not None:
if uploaded_file.name.endswith('.pdf'):
doc = DocumentFile.from_pdf(uploaded_file.read()).as_images()
doc = DocumentFile.from_pdf(uploaded_file.read())
else:
doc = DocumentFile.from_images(uploaded_file.read())
page_idx = st.sidebar.selectbox("Page selection", [idx + 1 for idx in range(len(doc))]) - 1
Expand Down
10 changes: 0 additions & 10 deletions docs/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,3 @@ High-performance file reading and conversion to processable structured data.
.. automethod:: from_url

.. automethod:: from_images

.. autoclass:: PDF

.. automethod:: as_images

.. automethod:: get_words

.. automethod:: get_lines

.. automethod:: get_artefacts
166 changes: 10 additions & 156 deletions doctr/io/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,17 @@
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, List

import cv2
import fitz
import numpy as np
import pypdfium2 as pdfium

from doctr.utils.common_types import AbstractFile, Bbox
from doctr.utils.common_types import AbstractFile

__all__ = ['read_pdf', 'PDF']
__all__ = ['read_pdf']


def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
def read_pdf(file: AbstractFile, scale: float = 2, **kwargs: Any) -> List[np.ndarray]:
"""Read a PDF file and convert it into an image in numpy format
Example::
Expand All @@ -24,161 +23,16 @@ def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document:
Args:
file: the path to the PDF file
scale: rendering scale (1 corresponds to 72dpi)
Returns:
the list of pages decoded as numpy ndarray of shape H x W x 3
"""

if not isinstance(file, (str, Path, bytes)):
raise TypeError("unsupported object type for argument 'file'")

if isinstance(file, (str, Path)) and not Path(file).is_file():
raise FileNotFoundError(f"unable to access {file}")

fitz_args: Dict[str, AbstractFile] = {}

if isinstance(file, (str, Path)):
fitz_args['filename'] = file
elif isinstance(file, bytes):
fitz_args['stream'] = file
else:
raise TypeError("unsupported object type for argument 'file'")

# Read pages with fitz and convert them to numpy ndarrays
return fitz.open(**fitz_args, filetype="pdf", **kwargs)


def convert_page_to_numpy(
page: fitz.fitz.Page,
output_size: Optional[Tuple[int, int]] = None,
bgr_output: bool = False,
default_scales: Tuple[float, float] = (2, 2),
) -> np.ndarray:
"""Convert a fitz page to a numpy-formatted image
Args:
page: the page of a file read with PyMuPDF
output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf,
if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726)
bgr_output: whether the output ndarray channel order should be BGR instead of RGB.
default_scales: spatial scaling to be applied when output_size is not specified where (1, 1)
corresponds to 72 dpi rendering.
Returns:
the rendered image in numpy format
"""

# If no output size is specified, keep the origin one
if output_size is not None:
scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3])
else:
# Default 72 DPI (scales of (1, 1)) is unnecessarily low
scales = default_scales

transform_matrix = fitz.Matrix(*scales)

# Generate the pixel map using the transformation matrix
pixmap = page.get_pixmap(matrix=transform_matrix)
# Decode it into a numpy
img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3)

# Switch the channel order
if bgr_output:
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

return img


class PDF:
"""PDF document template
Args:
doc: input PDF document
"""
def __init__(self, doc: fitz.Document) -> None:
self.doc = doc

def as_images(self, **kwargs) -> List[np.ndarray]:
"""Convert all document pages to images
Example::
>>> from doctr.documents import DocumentFile
>>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
Args:
kwargs: keyword arguments of `convert_page_to_numpy`
Returns:
the list of pages decoded as numpy ndarray of shape H x W x 3
"""
return [convert_page_to_numpy(page, **kwargs) for page in self.doc]

def get_page_lines(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
"""Get the annotations for all lines of a given page"""
lines: List[Tuple[Bbox, str]] = []
prev_block, prev_line = -1, -1
current_line = []
xmin, ymin, xmax, ymax = 0, 0, 0, 0
# xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
for info in self.doc[idx].get_text_words(**kwargs):
if prev_block == info[-3] and prev_line == info[-2]:
current_line.append(info[4])
xmin, ymin = min(xmin, info[0]), min(ymin, info[1])
xmax, ymax = max(xmax, info[2]), max(ymax, info[3])
else:
if len(current_line) > 0:
lines.append(((xmin, ymin, xmax, ymax), " ".join(current_line)))
current_line = [info[4]]
prev_block, prev_line = info[-3], info[-2]
xmin, ymin, xmax, ymax = info[:4]

if len(current_line) > 0:
lines.append(((xmin, ymin, xmax, ymax), " ".join(current_line)))

return lines

def get_lines(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
"""Get the annotations for all lines in the document
Example::
>>> from doctr.documents import DocumentFile
>>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()
Args:
kwargs: keyword arguments of `fitz.Page.get_text_words`
Returns:
the list of pages annotations, represented as a list of tuple (bounding box, value)
"""
return [self.get_page_lines(idx, **kwargs) for idx in range(len(self.doc))]

def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]:
"""Get the annotations for all words of a given page"""

# xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx
return [(info[:4], info[4]) for info in self.doc[idx].get_text_words(**kwargs)]

def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]:
"""Get the annotations for all words in the document
Example::
>>> from doctr.documents import DocumentFile
>>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
Args:
kwargs: keyword arguments of `fitz.Page.get_text_words`
Returns:
the list of pages annotations, represented as a list of tuple (bounding box, value)
"""
return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]

def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]:
return [tuple(self.doc[idx].get_image_bbox(artefact)) # type: ignore[misc]
for artefact in self.doc[idx].get_images(full=True)]

def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]:
"""Get the artefacts for the entire document
Example::
>>> from doctr.documents import DocumentFile
>>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
Returns:
the list of pages artefacts, represented as a list of bounding boxes
"""

return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
return [np.asarray(img) for img, _ in pdfium.render_pdf(file, scale=scale)]
14 changes: 6 additions & 8 deletions doctr/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .html import read_html
from .image import read_img_as_numpy
from .pdf import PDF, read_pdf
from .pdf import read_pdf

__all__ = ['DocumentFile']

Expand All @@ -21,7 +21,7 @@ class DocumentFile:
"""Read a document from multiple extensions"""

@classmethod
def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
"""Read a PDF file
Example::
Expand All @@ -31,15 +31,13 @@ def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF:
Args:
file: the path to the PDF file or a binary stream
Returns:
a PDF document
the list of pages decoded as numpy ndarray of shape H x W x 3
"""

doc = read_pdf(file, **kwargs)

return PDF(doc)
return read_pdf(file, **kwargs)

@classmethod
def from_url(cls, url: str, **kwargs) -> PDF:
def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
"""Interpret a web page as a PDF document
Example::
Expand All @@ -49,7 +47,7 @@ def from_url(cls, url: str, **kwargs) -> PDF:
Args:
url: the URL of the target web page
Returns:
a PDF document
the list of pages decoded as numpy ndarray of shape H x W x 3
"""
pdf_stream = read_html(url)
return cls.from_pdf(pdf_stream, **kwargs)
Expand Down
2 changes: 1 addition & 1 deletion mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ ignore_missing_imports = True

ignore_missing_imports = True

[mypy-fitz.*]
[mypy-pypdfium2.*]

ignore_missing_imports = True

Expand Down
2 changes: 1 addition & 1 deletion requirements-pt.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ numpy>=1.16.0
scipy>=1.4.0
h5py>=3.1.0
opencv-python>=3.4.5.20
PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12
pypdfium2>=0.14.0
pyclipper>=1.2.0
shapely>=1.6.0
matplotlib>=3.1.0,<3.4.3
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ numpy>=1.16.0
scipy>=1.4.0
h5py>=3.1.0
opencv-python>=3.4.5.20
PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12
pypdfium2>=0.14.0
pyclipper>=1.2.0
shapely>=1.6.0
matplotlib>=3.1.0,<3.4.3
Expand Down
2 changes: 1 addition & 1 deletion scripts/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def main(args):
model = ocr_predictor(args.detection, args.recognition, pretrained=True)

if args.path.endswith(".pdf"):
doc = DocumentFile.from_pdf(args.path).as_images()
doc = DocumentFile.from_pdf(args.path)
else:
doc = DocumentFile.from_images(args.path)

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"h5py>=3.1.0",
"opencv-python>=3.4.5.20",
"tensorflow>=2.4.0",
"PyMuPDF>=1.16.0,!=1.18.11,!=1.18.12", # 18.11 and 18.12 fail (issue #222)
"pypdfium2>=0.14.0",
"pyclipper>=1.2.0",
"shapely>=1.6.0",
"matplotlib>=3.1.0,<3.4.3",
Expand Down Expand Up @@ -94,7 +94,7 @@ def deps_list(*pkgs):
deps["scipy"],
deps["h5py"],
deps["opencv-python"],
deps["PyMuPDF"],
deps["pypdfium2"],
deps["pyclipper"],
deps["shapely"],
deps["matplotlib"],
Expand Down
Loading

0 comments on commit 2581daa

Please sign in to comment.