forked from pymupdf/PyMuPDF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_insertpdf.py
188 lines (160 loc) · 5.95 KB
/
test_insertpdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""
* Join multiple PDFs into a new one.
* Compare with stored earlier result:
- must have identical object definitions
- must have different trailers
* Try inserting files in a loop.
"""
import io
import os
import re
import pymupdf
scriptdir = os.path.abspath(os.path.dirname(__file__))
resources = os.path.join(scriptdir, "resources")
def approx_parse( text):
'''
Splits <text> into sequence of (text, number) pairs. Where sequence of
[0-9.] is not convertible to a number (e.g. '4.5.6'), <number> will be
None.
'''
ret = []
for m in re.finditer('([^0-9]+)([0-9.]*)', text):
text = m.group(1)
try:
number = float( m.group(2))
except Exception:
text += m.group(2)
number = None
ret.append( (text, number))
return ret
def approx_compare( a, b, max_delta):
'''
Compares <a> and <b>, allowing numbers to differ by up to <delta>.
'''
aa = approx_parse( a)
bb = approx_parse( b)
if len(aa) != len(bb):
return 1
ret = 1
for (at, an), (bt, bn) in zip( aa, bb):
if at != bt:
break
if an is not None and bn is not None:
if abs( an - bn) >= max_delta:
print( f'diff={an-bn}: an={an} bn={bn}')
break
elif (an is None) != (bn is None):
break
else:
ret = 0
if ret:
print( f'Differ:\n a={a!r}\n b={b!r}')
return ret
def test_insert():
all_text_original = [] # text on input pages
all_text_combined = [] # text on resulting output pages
# prepare input PDFs
doc1 = pymupdf.open()
for i in range(5): # just arbitrary number of pages
text = f"doc 1, page {i}" # the 'globally' unique text
page = doc1.new_page()
page.insert_text((100, 72), text)
all_text_original.append(text)
doc2 = pymupdf.open()
for i in range(4):
text = f"doc 2, page {i}"
page = doc2.new_page()
page.insert_text((100, 72), text)
all_text_original.append(text)
doc3 = pymupdf.open()
for i in range(3):
text = f"doc 3, page {i}"
page = doc3.new_page()
page.insert_text((100, 72), text)
all_text_original.append(text)
doc4 = pymupdf.open()
for i in range(6):
text = f"doc 4, page {i}"
page = doc4.new_page()
page.insert_text((100, 72), text)
all_text_original.append(text)
new_doc = pymupdf.open() # make combined PDF of input files
new_doc.insert_pdf(doc1)
new_doc.insert_pdf(doc2)
new_doc.insert_pdf(doc3)
new_doc.insert_pdf(doc4)
# read text from all pages and store in list
for page in new_doc:
all_text_combined.append(page.get_text().replace("\n", ""))
# the lists must be equal
assert all_text_combined == all_text_original
def test_issue1417_insertpdf_in_loop():
"""Using a context manager instead of explicitly closing files"""
f = os.path.join(resources, "1.pdf")
big_doc = pymupdf.open()
fd1 = os.open( f, os.O_RDONLY)
os.close( fd1)
for n in range(0, 1025):
with pymupdf.open(f) as pdf:
big_doc.insert_pdf(pdf)
# Create a raw file descriptor. If the above pymupdf.open() context leaks
# a file descriptor, fd will be seen to increment.
fd2 = os.open( f, os.O_RDONLY)
assert fd2 == fd1
os.close( fd2)
big_doc.close()
def _test_insert_adobe():
path = os.path.abspath( f'{__file__}/../../../PyMuPDF-performance/adobe.pdf')
if not os.path.exists(path):
print(f'Not running test_insert_adobe() because does not exist: {os.path.relpath(path)}')
return
a = pymupdf.Document()
b = pymupdf.Document(path)
a.insert_pdf(b)
def _2861_2871_merge_pdf(content: bytes, coverpage: bytes):
with pymupdf.Document(stream=coverpage, filetype="pdf") as coverpage_pdf:
with pymupdf.Document(stream=content, filetype="pdf") as content_pdf:
coverpage_pdf.insert_pdf(content_pdf)
doc = coverpage_pdf.write()
return doc
def test_2861():
path = os.path.abspath(f'{__file__}/../../tests/resources/test_2861.pdf')
with open(path, "rb") as content_pdf:
with open(path, "rb") as coverpage_pdf:
content = content_pdf.read()
coverpage = coverpage_pdf.read()
_2861_2871_merge_pdf(content, coverpage)
def test_2871():
path = os.path.abspath(f'{__file__}/../../tests/resources/test_2871.pdf')
with open(path, "rb") as content_pdf:
with open(path, "rb") as coverpage_pdf:
content = content_pdf.read()
coverpage = coverpage_pdf.read()
_2861_2871_merge_pdf(content, coverpage)
def test_3789():
file_path = os.path.abspath(f'{__file__}/../../tests/resources/test_3789.pdf')
result_path = os.path.abspath(f'{__file__}/../../tests/test_3789_out')
pages_per_split = 5
# Clean pdf
doc = pymupdf.open(file_path)
tmp = io.BytesIO()
tmp.write(doc.write(garbage=4, deflate=True))
source_doc = pymupdf.Document('pdf', tmp.getvalue())
tmp.close()
# Calculate the number of pages per split file and the number of split files
page_range = pages_per_split - 1
split_range = range(0, source_doc.page_count, pages_per_split)
num_splits = len(split_range)
# Loop through each split range and create a new PDF file
for i, start in enumerate(split_range):
output_doc = pymupdf.open()
# Determine the ending page for this split file
to_page = start + page_range if i < num_splits - 1 else -1
output_doc.insert_pdf(source_doc, from_page=start, to_page=to_page)
# Save the output document to a file and add the path to the list of split files
path = f'{result_path}_{i}.pdf'
output_doc.save(path, garbage=2)
print(f'Have saved to {path=}.')
# If this is the last split file, exit the loop
if to_page == -1:
break