Skip to content

Commit

Permalink
several bugfixes. 20090201 release.
Browse files Browse the repository at this point in the history
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@64 1aa58f4a-7d42-0410-adbc-911cccaed67c
  • Loading branch information
yusuke.shinyama.dummy committed Feb 1, 2009
1 parent f8564fa commit af55d46
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 35 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Makefile for pdfminer

PACKAGE=pdfminer
VERSION=20090117
VERSION=20090201
GNUTAR=tar
SVN=svn
PYTHON=python
Expand Down
7 changes: 4 additions & 3 deletions README.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ <h1>PDFMiner</h1>

<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Sun Jan 18 01:31:16 JST 2009
Last Modified: Mon Feb 2 00:01:01 JST 2009
<!-- hhmts end -->
</div>

Expand Down Expand Up @@ -53,8 +53,8 @@ <h2>What's It?</h2>
<a name="source"></a>
<p>
<strong>Download (source):</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz
</a>
(1.8Mbytes)

Expand Down Expand Up @@ -250,6 +250,7 @@ <h3>dumppdf.py</h3>
<hr noshade>
<h2>Changes</h2>
<ul>
<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
<li> 2009/01/17: Correctly handling a trailer that contains both /XrefStm and /Prev entries.
<li> 2009/01/10: Handling Type3 font metrics correctly.
<li> 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich.
Expand Down
41 changes: 18 additions & 23 deletions pdflib/pdfdevice.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,19 +98,18 @@ def __init__(self, matrix, font, fontsize, charspace, scaling, text):
w = 0
dx = 0
prev = ' '
for t in text:
if isinstance(t, tuple):
for (char,cid,t) in text:
if char:
if prev != ' ' and spwidth < dx:
self.text += ' '
(_,char) = t
self.text += char
prev = char
self.text += char
dx = 0
w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
w += (font.char_width(cid) * fontsize + charspace) * scaling
else:
t *= .001
dx -= t
w += t * fontsize * scaling
w -= t * fontsize * scaling
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent
(w,h) = apply_matrix_norm(self.matrix, (w,size))
Expand All @@ -121,18 +120,16 @@ def __init__(self, matrix, font, fontsize, charspace, scaling, text):
self.direction = 2
disp = 0
h = 0
for t in text:
if isinstance(t, tuple):
(disp,char) = t
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += char
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
break
for t in text:
if isinstance(t, tuple):
(_,char) = t
self.text += char
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
for (char,cid,disp) in text:
if not char: continue
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += font.to_unicode(cid)
h += (font.char_width(cid) * fontsize + charspace) * scaling
break
for (char,cid,_) in text:
if not char: continue
self.text += font.to_unicode(cid)
h += (font.char_width(cid) * fontsize + charspace) * scaling
(w,h) = apply_matrix_norm(self.matrix, (size,h))
tx -= w/2
ty += disp
Expand Down Expand Up @@ -189,18 +186,16 @@ def render_string(self, textstate, textmatrix, seq):
textmatrix = mult_matrix(textmatrix, self.ctm)
for x in seq:
if isinstance(x, int) or isinstance(x, float):
text.append(x)
text.append((None, None, x))
else:
chars = font.decode(x)
for cid in chars:
try:
char = font.to_unicode(cid)
text.append((font.char_disp(cid), char))
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
unc = self.handle_undefined_char(cidcoding, cid)
if unc:
text.append(unc)
char = self.handle_undefined_char(cidcoding, cid)
text.append((char, cid, font.char_disp(cid)))
if cid == 32 and not font.is_multibyte():
if text:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
Expand Down
15 changes: 9 additions & 6 deletions pdflib/pdfinterp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
import sys
import sys, re
stderr = sys.stderr
from struct import pack, unpack
try:
Expand Down Expand Up @@ -143,31 +143,34 @@ def fillbuf(self):
self.charpos = 0
return

def get_inline_data(self, pos, target='EI '):
def get_inline_data(self, pos, target='EI'):
self.seek(pos)
i = 0
data = ''
while i < len(target):
while i <= len(target):
self.fillbuf()
if i:
c = self.buf[self.charpos]
data += c
self.charpos += 1
if c == target[i]:
if i >= len(target) and c.isspace():
i += 1
elif c == target[i]:
i += 1
else:
i = 0
else:
try:
j = self.buf.index(target[0], self.charpos)
#print 'found', (0, self.buf[j:j+10])
data += self.buf[self.charpos:j]
data += self.buf[self.charpos:j+1]
self.charpos = j+1
i = 1
except ValueError:
data += self.buf[self.charpos:]
self.charpos = len(self.buf)
data = data[:-len(target)] # strip the last part
data = data[:-(len(target)+1)] # strip the last part
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])', '', data)
return (pos, data)

def flush(self):
Expand Down
4 changes: 2 additions & 2 deletions pdflib/pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys, re
import md5, struct
stderr = sys.stderr
from pdflib.utils import choplist, nunpack
from pdflib.utils import choplist, nunpack, decode_text
from pdflib.arcfour import Arcfour
from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
Expand Down Expand Up @@ -430,7 +430,7 @@ def search(entry, level):
entry = dict_value(entry)
if 'Title' in entry:
if 'A' in entry or 'Dest' in entry:
title = unicode(str_value(entry['Title']), 'utf-8', 'ignore')
title = decode_text(str_value(entry['Title']))
dest = entry.get('Dest')
action = entry.get('A')
se = entry.get('SE')
Expand Down
40 changes: 40 additions & 0 deletions pdflib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,43 @@ def nunpack(s, default=0):
return unpack('>L', s)[0]
else:
return TypeError('invalid length: %d' % l)

# Byte-to-unicode lookup table used by decode_text(): PDFDocEncoding[b] is
# the unicode character substituted for the byte value b (0-255).  The table
# holds 32 rows of 8 code points, i.e. exactly 256 entries.
# Indexes 0x7f, 0x9f and 0xad map to U+0000 in this table.
# NOTE(review): index 0x16 holds 0x0017 (the same value as index 0x17), not
# 0x0016 -- confirm against the PDF reference whether this is intentional.
PDFDocEncoding = ''.join( unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
    """Decode a PDF text string into a unicode string.

    A string that begins with the two bytes FE FF is decoded as UTF-16BE
    (with the marker removed and undecodable bytes ignored).  Any other
    string is translated byte by byte through the PDFDocEncoding table.
    """
    bom = '\xfe\xff'
    if not s.startswith(bom):
        # No byte-order mark: look each byte up in the PDFDocEncoding table.
        return ''.join([PDFDocEncoding[ord(ch)] for ch in s])
    payload = s[len(bom):]
    return unicode(payload, 'utf-16be', 'ignore')
Binary file modified samples/simple1.pdf
Binary file not shown.

0 comments on commit af55d46

Please sign in to comment.