jcushman · jcushman · Jun 7, 2013 · Mar 15, 2013 · Apr 30, 2013
diff --git a/pdfquery.egg-info/PKG-INFO b/pdfquery.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: pdfquery
-Version: 0.1.3
+Version: 0.1.4
 Summary: Concise and friendly PDF scraper using JQuery or XPath selectors.
 Home-page: https://github.com/jcushman/pdfquery
 Author: Jack Cushman
@@ -16,8 +16,6 @@ Description: ========
         PDFQuery is a light wrapper around pdfminer, lxml and pyquery. It's designed to reliably extract data from sets of
         PDFs with as little code as possible.
 
-        *Note: This is an initial release. It works for me, but if let me know if it doesn't work as expected for you.*
-
         .. contents:: **Table of Contents**
 
         Installation
@@ -44,6 +42,14 @@ Description: ========
         Note that we don't have to know where the name is on the page, or what page it's on,
         or how the PDF has it stored internally.
 
+        *Performance Note:* The initial call to pdf.load() runs very slowly, because the underlying
+        pdfminer library has to compare every element on the page to every other element.
+        It will run approximately 5 times faster (in my tests) if you apply
+        these_two_ patches_ to pdfminer.
+
+        .. _these_two: https://github.com/euske/pdfminer/pull/15
+        .. _patches: https://github.com/euske/pdfminer/pull/16
+
         Now let's extract and format a bunch of data all at once::
 
             >>> pdf = pdfquery.PDFQuery("examples/sample.pdf")
@@ -103,6 +109,14 @@ Description: ========
 
             >>> pdf.load(0, 2, 3, range(4,8))
 
+        *Performance Note:* The initial call to pdf.load() runs very slowly, because the underlying
+        pdfminer library has to compare every element on the page to every other element.
+        It will run approximately 5 times faster (in my tests) if you apply
+        these_two_ patches_ to pdfminer.
+
+        .. _these_two: https://github.com/euske/pdfminer/pull/15
+        .. _patches: https://github.com/euske/pdfminer/pull/16
+
         Under the hood, pdf.tree is basically an XML representation of the layout tree generated by pdfminer.pdfinterp. By
         default the tree is processed to combine individual character nodes, remove extra spaces,
         and sort the tree spatially. You can always get back to the original pdfminer Layout object from an element fetched
@@ -292,6 +306,14 @@ Description: ========
         but it's more efficient to call it explicitly with just the page numbers you need. Page numbers can be any
         combination of integers and lists, e.g. ``pdf.load(0,2,3,[4,5,6],range(10,15))``.
 
+        *Performance Note:* The initial call to pdf.load() runs very slowly, because the underlying
+        pdfminer library has to compare every element on the page to every other element.
+        It will run approximately 5 times faster (in my tests) if you apply
+        these_two_ patches_ to pdfminer.
+
+        .. _these_two: https://github.com/euske/pdfminer/pull/15
+        .. _patches: https://github.com/euske/pdfminer/pull/16
+
         Public But Less Useful Methods
         ================================
 

diff --git a/pdfquery.egg-info/SOURCES.txt b/pdfquery.egg-info/SOURCES.txt
@@ -3,7 +3,6 @@ LICENSE.txt
 MANIFEST.in
 README.rst
 setup.py
-examples/.DS_Store
 examples/sample.pdf
 pdfquery/__init__.py
 pdfquery/pdfquery.py

diff --git a/pdfquery/pdfquery.py b/pdfquery/pdfquery.py
@@ -20,7 +20,6 @@ def _xpath_in_bbox(self, xpath, expr):
     xpath.add_post_condition("@x1 <= %s" % x1)
     xpath.add_post_condition("@y1 <= %s" % y1)
     return xpath
-cssselect.Function._xpath_in_bbox = _xpath_in_bbox
 
 def _xpath_overlaps_bbox(self, xpath, expr):
     x0,y0,x1,y1 = map(float, expr.split(","))
@@ -30,7 +29,12 @@ def _xpath_overlaps_bbox(self, xpath, expr):
     xpath.add_post_condition("@x1 >= %s" % x0)
     xpath.add_post_condition("@y1 >= %s" % y0)
     return xpath
-cssselect.Function._xpath_in_bbox = _xpath_in_bbox
+
+try:  # hook both bbox translators onto cssselect.Function
+    cssselect.Function._xpath_in_bbox = _xpath_in_bbox
+    cssselect.Function._xpath_overlaps_bbox = _xpath_overlaps_bbox  # was a duplicate in_bbox assignment
+except AttributeError:  # NOTE(review): presumably raised when cssselect exposes no `Function`; confirm
+    pass
 
 
 # Re-sort the PDFMiner Layout tree so elements that fit inside other elements will be children of them
@@ -425,4 +429,4 @@ def _cached_pages(self, target_page=-1):
 if __name__ == "__main__":
     import doctest
     pdf = PDFQuery("../examples/sample.pdf")
-    doctest.testmod(extraglobs={'pdf': pdf}, optionflags=doctest.ELLIPSIS)
+    doctest.testmod(extraglobs={'pdf': pdf}, optionflags=doctest.ELLIPSIS)
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='pdfquery',
-    version='0.1.3',
+    version='0.1.4',
     author=u'Jack Cushman',
     author_email='[email protected]',
     packages=find_packages(),
@@ -22,4 +22,4 @@
         "Operating System :: OS Independent",
         "Programming Language :: Python",
         ],
-)
+)