Support for marked content section IDs (jsvine#961)

billbo-br · Aug 19, 2023 · 142fc90 · 142fc90
1 parent d8b9c15
commit 142fc90
Show file tree

Hide file tree

Showing 5 changed files with 122 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -158,6 +158,8 @@ Each object is represented as a simple Python `dict`, with the following propert
 |`bottom`| Distance of bottom of the character from top of page.|
 |`doctop`| Distance of top of character from top of document.|
 |`matrix`| The "current transformation matrix" for this character. (See below for details.)|
+|`mcid`| The marked content section ID for this character, if any (otherwise `None`).|
+|`tag`| The marked content section tag for this character, if any (otherwise `None`).|
 |`ncs`|TKTK|
 |`stroking_pattern`|TKTK|
 |`non_stroking_pattern`|TKTK|
@@ -191,6 +193,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`linewidth`| Thickness of line.|
 |`stroking_color`|The color of the line. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The non-stroking color specified for the line’s path. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The marked content section ID for this line, if any (otherwise `None`).|
+|`tag`| The marked content section tag for this line, if any (otherwise `None`).|
 |`object_type`| "line"|
 
 #### `rect` properties
@@ -210,6 +214,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`linewidth`| Thickness of line.|
 |`stroking_color`|The color of the rectangle's outline. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The rectangle’s fill color. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The marked content section ID for this rect, if any (otherwise `None`).|
+|`tag`| The marked content section tag for this rect, if any (otherwise `None`).|
 |`object_type`| "rect"|
 
 #### `curve` properties
@@ -231,6 +237,8 @@ my_char_rotation = my_char_ctm.skew_x
 |`fill`| Whether the shape defined by the curve's path is filled.|
 |`stroking_color`|The color of the curve's outline. See [docs/colors.md](docs/colors.md) for details.|
 |`non_stroking_color`|The curve’s fill color. See [docs/colors.md](docs/colors.md) for details.|
+|`mcid`| The marked content section ID for this curve, if any (otherwise `None`).|
+|`tag`| The marked content section tag for this curve, if any (otherwise `None`).|
 |`object_type`| "curve"|
 
 #### Derived properties
@@ -531,6 +539,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
 - [Shannon Shen](https://github.com/lolipopshock)
 - [Matsumoto Toshi](https://github.com/toshi1127)
 - [John West](https://github.com/jwestwsj)
+- [David Huggins-Daines](https://github.com/dhdaines)
 - [Jeremy B. Merrill](https://github.com/jeremybmerrill)
 
 ## Contributing

diff --git a/pdfplumber/page.py b/pdfplumber/page.py
@@ -22,7 +22,7 @@
     LTPage,
     LTTextContainer,
 )
-from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
 from pdfminer.pdfpage import PDFPage
 from pdfminer.psparser import PSLiteral
 
@@ -62,6 +62,8 @@
         "stream",
         "stroke",
         "stroking_color",
+        "mcid",
+        "tag",
     ]
 )
 
@@ -115,6 +117,56 @@ def normalize_color(
     return separate_pattern(tuplefied)
 
 
+class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
+    """Extract layout from a specific page, adding marked-content IDs to
+    objects where found."""
+
+    cur_mcid: Optional[int] = None
+    cur_tag: Optional[str] = None
+
+    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
+        """Handle beginning of tag, setting current tag and MCID (if any)."""
+        self.cur_tag = decode_text(tag.name)
+        if isinstance(props, dict) and "MCID" in props:
+            self.cur_mcid = props["MCID"]
+        else:
+            self.cur_mcid = None
+
+    def end_tag(self) -> None:
+        """Handle end of tag, clearing current tag and MCID."""
+        self.cur_tag = None
+        self.cur_mcid = None
+
+    def tag_cur_item(self) -> None:
+        """Add current MCID and tag to what we hope to be the most recent
+        object created by pdfminer.six."""
+        # This is somewhat hacky and would not be necessary if
+        # pdfminer.six supported MCIDs.  In reading the code it's
+        # clear that the `render_*` methods will only ever
+        # create one object, but that is far from being guaranteed.
+        # Even if pdfminer.six's API just returned the objects it
+        # creates, we wouldn't have to do this.
+        cur_obj = self.cur_item._objs[-1]
+        cur_obj.mcid = self.cur_mcid  # type: ignore
+        cur_obj.tag = self.cur_tag  # type: ignore
+
+    def render_char(self, *args, **kwargs) -> float:  # type: ignore
+        """Hook for rendering characters, adding the `mcid` and `tag` attributes."""
+        adv = super().render_char(*args, **kwargs)
+        self.tag_cur_item()
+        return adv
+
+    def render_image(self, *args, **kwargs) -> None:  # type: ignore
+        """Hook for rendering images, adding the `mcid` and `tag` attributes."""
+        super().render_image(*args, **kwargs)
+        self.tag_cur_item()
+
+    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
+        """Hook for painting paths (lines, rects and curves), adding the
+        `mcid` and `tag` attributes."""
+        super().paint_path(*args, **kwargs)
+        self.tag_cur_item()
+
+
 class Page(Container):
     cached_properties: List[str] = Container.cached_properties + ["_layout"]
     is_original: bool = True
@@ -174,7 +226,7 @@ def height(self) -> T_num:
     def layout(self) -> LTPage:
         if hasattr(self, "_layout"):
             return self._layout
-        device = PDFPageAggregator(
+        device = PDFPageAggregatorWithMarkedContent(
             self.pdf.rsrcmgr,
             pageno=self.page_number,
             laparams=self.pdf.laparams,

diff --git a/tests/pdfs/mcid_example.pdf b/tests/pdfs/mcid_example.pdf
diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -70,7 +70,7 @@ def test_csv(self):
         assert c.split("\r\n")[9] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
-            ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
+            ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,'
         )
 
         io = StringIO()
@@ -125,7 +125,7 @@ def test_cli_csv(self):
         assert res.decode("utf-8").split("\r\n")[9] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             '18.0,12.996,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"'
-            ',DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,Y,,1,'
+            ',,DeviceRGB,"(0, 0, 0)",,,18.0,,,,,,,Y,,1,'
         )
 
     def test_cli_csv_exclude(self):
@@ -141,6 +141,7 @@ def test_cli_csv_exclude(self):
                 "3",
                 "--exclude-attrs",
                 "matrix",
+                "mcid",
                 "ncs",
                 "non_stroking_pattern",
                 "stroking_pattern",
@@ -150,7 +151,7 @@ def test_cli_csv_exclude(self):
         assert res.decode("utf-8").split("\r\n")[9] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             "18.0,12.996,,,,,,TimesNewRomanPSMT,"
-            ',,"(0, 0, 0)",,18.0,,,,,Y,,1,'
+            ',,"(0, 0, 0)",,18.0,,,,,,Y,,1,'
         )
 
     def test_cli_csv_include(self):

diff --git a/tests/test_mcids.py b/tests/test_mcids.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+import os
+import unittest
+
+import pdfplumber
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+
+class TestMCIDs(unittest.TestCase):
+    """Test MCID extraction."""
+
+    def test_mcids(self):
+        path = os.path.join(HERE, "pdfs/mcid_example.pdf")
+
+        pdf = pdfplumber.open(path)
+        page = pdf.pages[0]
+        # Check text of MCIDs
+        mcids = []
+        for c in page.chars:
+            if "mcid" in c:
+                while len(mcids) <= c["mcid"]:
+                    mcids.append("")
+                if not mcids[c["mcid"]]:
+                    mcids[c["mcid"]] = c["tag"] + ": "
+                mcids[c["mcid"]] += c["text"]
+        assert mcids == [
+            "Standard: Test of figures",
+            "",
+            "P: 1 ligne",
+            "P: 2 ligne",
+            "P: 3 ligne",
+            "P: 4 ligne",
+            "P: 0",
+            "P: 2",
+            "P: 4",
+            "P: 6",
+            "P: 8",
+            "P: 10",
+            "P: 12",
+            "P: Figure 1: Chart",
+            "",
+            "P: 1 colonne",
+            "P: 2 colonne",
+            "P: 3 colonne",
+        ]
+        # Check line and curve MCIDs
+        line_mcids = set(x["mcid"] for x in page.lines)
+        curve_mcids = set(x["mcid"] for x in page.curves)
+        assert all(x["tag"] == "Figure" for x in page.lines)
+        assert all(x["tag"] == "Figure" for x in page.curves)
+        assert line_mcids & {1, 14}
+        assert curve_mcids & {1, 14}
+        # No rects to test unfortunately!