Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix range() page numbers for Python3 & prevent long cache file names #69

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Fix range pagenumbers in Python 3, prevent long cache filenames
  • Loading branch information
chk1 committed Oct 28, 2018
commit d4efac58636bc1f96733ebf4da8d9a2051961cf6
12 changes: 10 additions & 2 deletions pdfquery/pdfquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import numbers
import re
import chardet
import sys
import hashlib
try:
from collections import OrderedDict
except ImportError:
Expand Down Expand Up @@ -84,7 +86,11 @@ def _comp_bbox(el, el2):


# assorted helpers
def _flatten(l, ltypes=(list, tuple)):
LTYPES = (list, tuple)
if sys.version_info.major > 2:
LTYPES = (list, tuple, range)

def _flatten(l, ltypes=LTYPES):
# via http://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html
ltype = type(l)
l = list(l)
Expand Down Expand Up @@ -458,7 +464,9 @@ def get_tree(self, *page_numbers):
Return lxml.etree.ElementTree for entire document, or page numbers
given if any.
"""
cache_key = "_".join(map(str, _flatten(page_numbers)))
hasher = hashlib.md5()
hasher.update(str(page_numbers).encode('UTF-8'))
cache_key = "_{}".format(hasher.hexdigest())
tree = self._parse_tree_cacher.get(cache_key)
if tree is None:
# set up root
Expand Down
Binary file added tests/samples/bug67.pdf
Binary file not shown.
32 changes: 32 additions & 0 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import pdfquery
from pdfquery.cache import FileCache
import tempfile

from .utils import BaseTestCase

Expand Down Expand Up @@ -157,3 +158,34 @@ def test_annot_dereferencing(self):
pdf.load()
pdf = pdfquery.PDFQuery("tests/samples/bug42.pdf")
pdf.load()

class TestPageRange(BaseTestCase):
    """
    Test that ``PDFQuery.load`` accepts page numbers given as plain ints,
    as lists, as a mix of both, and as a ``range`` object.

    All tests share one ``PDFQuery`` instance backed by a ``FileCache``,
    so the cache-key generation is exercised for every argument shape.
    """

    @classmethod
    def setUpClass(cls):
        # Use a private, throwaway cache directory instead of the shared
        # system temp directory, so that cache files do not leak between
        # test runs (a stale cache could otherwise mask parsing
        # regressions) and nothing is left behind afterwards.
        cls.cache_dir = tempfile.mkdtemp()
        # The trailing slash is kept from the original setup; presumably
        # FileCache builds paths by plain string concatenation -- confirm
        # against pdfquery.cache before removing it.
        cls.pdf = pdfquery.PDFQuery(
            "tests/samples/bug67.pdf",
            parse_tree_cacher=FileCache("{}/".format(cls.cache_dir))
        )

    @classmethod
    def tearDownClass(cls):
        # Local import: the module-level import block is not touched here.
        import shutil

        # Best-effort cleanup; a failure to delete must not fail the suite.
        shutil.rmtree(cls.cache_dir, ignore_errors=True)
        super(TestPageRange, cls).tearDownClass()

    def test_page_int(self):
        """Page numbers passed as individual integer arguments."""
        self.pdf.load(3)
        self.assertEqual(len(self.pdf.pq('LTPage')), 1)
        self.pdf.load(0, 10, 25, 49)
        self.assertEqual(len(self.pdf.pq('LTPage')), 4)

    def test_page_array(self):
        """Page numbers passed as one list or as several lists."""
        self.pdf.load([0, 7, 11])
        self.assertEqual(len(self.pdf.pq('LTPage')), 3)
        self.pdf.load([10], [0, 12], [30, 40])
        self.assertEqual(len(self.pdf.pq('LTPage')), 5)

    def test_page_mixed(self):
        """Lists and a bare integer combined in a single call."""
        # Page 0 is listed twice; the expected count of 6 includes both
        # occurrences.
        self.pdf.load([0, 7, 11], [0, 44], 1)
        self.assertEqual(len(self.pdf.pq('LTPage')), 6)

    def test_page_range(self):
        """Page numbers passed as a ``range`` object."""
        self.pdf.load(range(0, 150))
        self.assertEqual(len(self.pdf.pq('LTPage')), 150)