forked from pdfminer/pdfminer.six
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfpage.py
140 lines (124 loc) · 5.18 KB
/
pdfpage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import logging
from . import settings
from .psparser import LIT
from .pdftypes import PDFObjectNotFound
from .pdftypes import resolve1
from .pdftypes import int_value
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdfparser import PDFParser
from .pdfdocument import PDFDocument
from .pdfdocument import PDFTextExtractionNotAllowed
import six # Python 2+3 compatibility
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Predefined literals for the two 'Type' values this module looks for
# ('Page' leaves and 'Pages' interior nodes).  The code below compares
# against them with `is`, i.e. by identity.
# NOTE(review): presumably LIT() returns the same object for the same
# name, which is what makes the identity comparison valid -- confirm in
# psparser.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
## PDFPage
##
class PDFPage(object):
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        contents: a list of the page's content objects (always a list,
            possibly empty).
        lastmod: the last modified time of the page ('LastModified').
        resources: the resources used by the page (an empty dict when
            the page declares none).
        mediabox: the physical size of the page ('MediaBox').
        cropbox: the crop rectangle of the page; falls back to mediabox.
        rotate: the page rotation (in degree), reduced modulo 360.
        annots: the page annotations ('Annots'), left unresolved.
        beads: a chain that represents natural reading order ('B'),
            left unresolved.
    """

    # Attributes a page takes over from its parent node when it does not
    # define them itself (see create_pages).
    INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        'MediaBox' is looked up unconditionally, so attrs must contain it
        (a plain dict raises KeyError otherwise).
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs.get('Resources', dict()))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # No explicit crop rectangle: use the media box.
            self.cropbox = self.mediabox
        # Fold the rotation into a single turn (e.g. -90 becomes 270).
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # 'Contents' may be a single object; always expose a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (
            self.resources, self.mediabox)

    @classmethod
    def create_pages(cls, document):
        """Generate a page object for every page found in document.

        The page tree rooted at document.catalog['Pages'] is walked depth
        first; attributes listed in INHERITABLE_ATTRS are copied from a
        parent node into children that lack them.  If the catalog has no
        'Pages' entry, or the walk produces no page at all, every object
        listed in document.xrefs is scanned for dictionaries whose 'Type'
        is the 'Page' literal instead.

        document: a PDFDocument object.

        Yields instances of this class, built as cls(document, objid, attrs).
        """

        def search(obj, parent, ancestors):
            # Walk one node of the page tree, yielding (objid, attrs) for
            # each leaf page below it.  `ancestors` holds the object ids
            # of the 'Pages' nodes on the path from the root to this node.
            if isinstance(obj, int):
                objid = obj
            else:
                objid = obj.objid
            if objid in ancestors:
                # A malformed file can list one of a node's own ancestors
                # among its Kids; following it would recurse forever.
                log.warning(
                    'Ignoring cyclic page tree reference to object %r',
                    objid)
                return
            if isinstance(obj, int):
                tree = dict_value(document.getobj(objid)).copy()
            else:
                tree = dict_value(obj).copy()
            # Work on a copy so inherited values never leak back into the
            # document's own objects.
            for (k, v) in six.iteritems(parent):
                if k in cls.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v

            tree_type = tree.get('Type')
            if tree_type is None and not settings.STRICT:  # See #64
                # Outside strict mode, accept a lower-case 'type' key too.
                tree_type = tree.get('type')

            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                log.info('Pages: Kids=%r', tree['Kids'])
                path = ancestors | frozenset([objid])
                for c in list_value(tree['Kids']):
                    for x in search(c, tree, path):
                        yield x
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)

        # Becomes True only once the tree walk has produced a page.
        pages = False
        if 'Pages' in document.catalog:
            root = document.catalog['Pages']
            for (objid, tree) in search(root, document.catalog, frozenset()):
                yield cls(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    # The page is built and yielded inside the try, so a
                    # PDFObjectNotFound raised at either step skips the
                    # object instead of aborting the scan.
                    try:
                        obj = document.getobj(objid)
                        if (isinstance(obj, dict)
                                and obj.get('Type') is LITERAL_PAGE):
                            yield cls(document, objid, obj)
                    except PDFObjectNotFound:
                        pass

    @classmethod
    def get_pages(cls, fp,
                  pagenos=None, maxpages=0, password='',
                  caching=True, check_extractable=True):
        """Parse the PDF in fp and generate its pages.

        fp: the file object handed to PDFParser.
        pagenos: container of zero-based page indices to yield; a falsy
            value (None, empty) selects every page.
        maxpages: when non-zero, stop right after yielding a page whose
            one-based position is at least maxpages.  The limit is only
            checked after a page has been yielded, so pages skipped via
            pagenos never trigger it.
        password: passed through to PDFDocument.
        caching: passed through to PDFDocument.
        check_extractable: when true, refuse documents whose
            is_extractable flag is false.

        Raises PDFTextExtractionNotAllowed if check_extractable is set and
        the document does not allow extraction.  Because this is a
        generator, parsing and that check run on the first iteration, not
        at call time.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction. If not, abort.
        if check_extractable and not doc.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %r' % fp)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno+1:
                break