Skip to content

Commit

Permalink
Support for marked content section IDs (jsvine#961)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines authored Aug 19, 2023
1 parent d8b9c15 commit 142fc90
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 5 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ Each object is represented as a simple Python `dict`, with the following propert
|`bottom`| Distance of bottom of the character from top of page.|
|`doctop`| Distance of top of character from top of document.|
|`matrix`| The "current transformation matrix" for this character. (See below for details.)|
|`mcid`| The marked content section ID for this character, if any (otherwise `None`).|
|`tag`| The marked content section tag for this character, if any (otherwise `None`).|
|`ncs`|TKTK|
|`stroking_pattern`|TKTK|
|`non_stroking_pattern`|TKTK|
Expand Down Expand Up @@ -191,6 +193,8 @@ my_char_rotation = my_char_ctm.skew_x
|`linewidth`| Thickness of line.|
|`stroking_color`|The color of the line. See [docs/colors.md](docs/colors.md) for details.|
|`non_stroking_color`|The non-stroking color specified for the line’s path. See [docs/colors.md](docs/colors.md) for details.|
|`mcid`| The marked content section ID for this line, if any (otherwise `None`).|
|`tag`| The marked content section tag for this line, if any (otherwise `None`).|
|`object_type`| "line"|

#### `rect` properties
Expand All @@ -210,6 +214,8 @@ my_char_rotation = my_char_ctm.skew_x
|`linewidth`| Thickness of line.|
|`stroking_color`|The color of the rectangle's outline. See [docs/colors.md](docs/colors.md) for details.|
|`non_stroking_color`|The rectangle’s fill color. See [docs/colors.md](docs/colors.md) for details.|
|`mcid`| The marked content section ID for this rect, if any (otherwise `None`).|
|`tag`| The marked content section tag for this rect, if any (otherwise `None`).|
|`object_type`| "rect"|

#### `curve` properties
Expand All @@ -231,6 +237,8 @@ my_char_rotation = my_char_ctm.skew_x
|`fill`| Whether the shape defined by the curve's path is filled.|
|`stroking_color`|The color of the curve's outline. See [docs/colors.md](docs/colors.md) for details.|
|`non_stroking_color`|The curve’s fill color. See [docs/colors.md](docs/colors.md) for details.|
|`mcid`| The marked content section ID for this curve, if any (otherwise `None`).|
|`tag`| The marked content section tag for this curve, if any (otherwise `None`).|
|`object_type`| "curve"|

#### Derived properties
Expand Down Expand Up @@ -531,6 +539,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
- [Shannon Shen](https://github.com/lolipopshock)
- [Matsumoto Toshi](https://github.com/toshi1127)
- [John West](https://github.com/jwestwsj)
- [David Huggins-Daines](https://github.com/dhdaines)
- [Jeremy B. Merrill](https://github.com/jeremybmerrill)

## Contributing
Expand Down
56 changes: 54 additions & 2 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
LTPage,
LTTextContainer,
)
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
from pdfminer.pdfpage import PDFPage
from pdfminer.psparser import PSLiteral

Expand Down Expand Up @@ -62,6 +62,8 @@
"stream",
"stroke",
"stroking_color",
"mcid",
"tag",
]
)

Expand Down Expand Up @@ -115,6 +117,56 @@ def normalize_color(
return separate_pattern(tuplefied)


class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
    """Extract layout from a specific page, adding marked-content IDs to
    objects where found.

    The class remembers the tag and MCID passed to the most recent
    `begin_tag` call and copies them onto the objects produced by the
    `render_char`, `render_image` and `paint_path` hooks as `tag` and
    `mcid` attributes.

    NOTE: only one (tag, mcid) pair is remembered at a time; `end_tag`
    resets both to None, so an enclosing section is not restored when a
    nested one ends.
    """

    # State of the marked-content section currently open (None when no
    # section is open, or when the section has no MCID property).
    cur_mcid: Optional[int] = None
    cur_tag: Optional[str] = None

    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
        """Handle beginning of tag, setting current MCID if any.

        `props` is only consulted when it is a dict containing an "MCID"
        key; in every other case the current MCID is reset to None.
        """
        self.cur_tag = decode_text(tag.name)
        if isinstance(props, dict) and "MCID" in props:
            self.cur_mcid = props["MCID"]
        else:
            self.cur_mcid = None

    def end_tag(self) -> None:
        """Handle end of tag, clearing current tag and MCID."""
        self.cur_tag = None
        self.cur_mcid = None

    def tag_cur_item(self) -> None:
        """Add current MCID to what we hope to be the most recent object created
        by pdfminer.six.

        Does nothing when the current layout container holds no objects.
        """
        # This is somewhat hacky and would not be necessary if
        # pdfminer.six supported MCIDs.  In reading the code it's
        # clear that the `render_*` methods will only ever
        # create one object, but that is far from being guaranteed.
        # Even if pdfminer.six's API would just return the objects it
        # creates, we wouldn't have to do this.
        objs = self.cur_item._objs
        if not objs:
            # The wrapped call added nothing and the container is still
            # empty: there is no object to tag, and indexing [-1] would
            # raise IndexError.
            return
        cur_obj = objs[-1]
        cur_obj.mcid = self.cur_mcid  # type: ignore
        cur_obj.tag = self.cur_tag  # type: ignore

    def render_char(self, *args, **kwargs) -> float:  # type: ignore
        """Hook for rendering characters, adding the `mcid` attribute.

        Returns whatever the parent `render_char` returns, unchanged.
        """
        adv = super().render_char(*args, **kwargs)
        self.tag_cur_item()
        return adv

    def render_image(self, *args, **kwargs) -> None:  # type: ignore
        """Hook for rendering images, adding the `mcid` attribute."""
        super().render_image(*args, **kwargs)
        self.tag_cur_item()

    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
        """Hook for rendering lines and curves, adding the `mcid` attribute."""
        super().paint_path(*args, **kwargs)
        self.tag_cur_item()


class Page(Container):
cached_properties: List[str] = Container.cached_properties + ["_layout"]
is_original: bool = True
Expand Down Expand Up @@ -174,7 +226,7 @@ def height(self) -> T_num:
def layout(self) -> LTPage:
if hasattr(self, "_layout"):
return self._layout
device = PDFPageAggregator(
device = PDFPageAggregatorWithMarkedContent(
self.pdf.rsrcmgr,
pageno=self.page_number,
laparams=self.pdf.laparams,
Expand Down
Binary file added tests/pdfs/mcid_example.pdf
Binary file not shown.
7 changes: 4 additions & 3 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_csv(self):
assert c.split("\r\n")[9] == (
"char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
'18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,'
)

io = StringIO()
Expand Down Expand Up @@ -125,7 +125,7 @@ def test_cli_csv(self):
assert res.decode("utf-8").split("\r\n")[9] == (
"char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
'18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,'
)

def test_cli_csv_exclude(self):
Expand All @@ -141,6 +141,7 @@ def test_cli_csv_exclude(self):
"3",
"--exclude-attrs",
"matrix",
"mcid",
"ncs",
"non_stroking_pattern",
"stroking_pattern",
Expand All @@ -150,7 +151,7 @@ def test_cli_csv_exclude(self):
assert res.decode("utf-8").split("\r\n")[9] == (
"char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
"18.0,12.996,,,,,,TimesNewRomanPSMT,"
',,"(0, 0, 0)",,18.0,,,,,Y,,1,'
',,"(0, 0, 0)",,18.0,,,,,,Y,,1,'
)

def test_cli_csv_include(self):
Expand Down
55 changes: 55 additions & 0 deletions tests/test_mcids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python3

import os
import unittest

import pdfplumber

HERE = os.path.abspath(os.path.dirname(__file__))


class TestMCIDs(unittest.TestCase):
    """Test MCID extraction."""

    def test_mcids(self):
        """Check the tag/text collected per MCID, and the MCIDs and tags
        of the lines and curves, on the first page of the example PDF."""
        path = os.path.join(HERE, "pdfs/mcid_example.pdf")

        # Open via context manager so the file is closed even when an
        # assertion below fails; pull out everything we need first.
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[0]
            chars = page.chars
            lines = page.lines
            curves = page.curves

        # Check text of MCIDs: slot i holds "<tag>: <text>" for MCID i,
        # or "" when no character carries that MCID.
        mcids = []
        for c in chars:
            mcid = c.get("mcid")
            if mcid is None:
                # Character without an MCID (key absent or None): it
                # cannot be used as a list index, so skip it.
                continue
            while len(mcids) <= mcid:
                mcids.append("")
            if not mcids[mcid]:
                mcids[mcid] = c["tag"] + ": "
            mcids[mcid] += c["text"]
        assert mcids == [
            "Standard: Test of figures",
            "",
            "P: 1 ligne",
            "P: 2 ligne",
            "P: 3 ligne",
            "P: 4 ligne",
            "P: 0",
            "P: 2",
            "P: 4",
            "P: 6",
            "P: 8",
            "P: 10",
            "P: 12",
            "P: Figure 1: Chart",
            "",
            "P: 1 colonne",
            "P: 2 colonne",
            "P: 3 colonne",
        ]

        # Check line and curve MCIDs
        line_mcids = {x["mcid"] for x in lines}
        curve_mcids = {x["mcid"] for x in curves}
        assert all(x["tag"] == "Figure" for x in lines)
        assert all(x["tag"] == "Figure" for x in curves)
        # At least one of the two expected MCIDs must appear in each set.
        assert line_mcids & {1, 14}
        assert curve_mcids & {1, 14}
        # No rects to test unfortunately!

0 comments on commit 142fc90

Please sign in to comment.