Skip to content

Commit

Permalink
layout analysis improved
Browse files Browse the repository at this point in the history
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@245 1aa58f4a-7d42-0410-adbc-911cccaed67c
  • Loading branch information
yusuke.shinyama.dummy committed Oct 17, 2010
1 parent bc1303e commit 3305c07
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 80 deletions.
2 changes: 1 addition & 1 deletion pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def __repr__(self):
return '<CMap: %s>' % self.attrs.get('CMapName')

def is_vertical(self):
return self.attrs.get('WMode', 0)
return self.attrs.get('WMode', 0) != 0

def set_attr(self, k, v):
self.attrs[k] = v
Expand Down
13 changes: 7 additions & 6 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,11 @@ def write(self, text):

def receive_layout(self, ltpage):
def render(item):
if isinstance(item, LTText):
self.write(item.text)
elif isinstance(item, LTContainer):
if isinstance(item, LTContainer):
for child in item:
render(child)
elif isinstance(item, LTText):
self.write(item.get_text())
if isinstance(item, LTTextBox):
self.write('\n')
if self.showpageno:
Expand Down Expand Up @@ -231,20 +231,21 @@ def render(item):
elif isinstance(item, LTChar):
self.write_text(item.text, item.x0, item.y1, item.get_size())
if self.debug:
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTPolygon):
self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
elif isinstance(item, LTTextLine):
self.write_rect('magenta', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
elif isinstance(item, LTTextBox):
self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height)
self.write_rect('cyan', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
if self.debug:
self.write_text(str(item.index+1), item.x0, item.y1, 20)
elif isinstance(item, LTFigure):
self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
self.write_rect('yellow', 1, item.x0, item.y1, item.width, item.height)
for child in item:
render(child)
elif isinstance(item, LTImage):
Expand Down
147 changes: 74 additions & 73 deletions pdfminer/layout.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,10 @@
#!/usr/bin/env python
import sys
from sys import maxint as INF
from utils import apply_matrix_pt
from utils import apply_matrix_pt, get_bound, INF
from utils import bsearch, bbox2str, matrix2str
from pdffont import PDFUnicodeNotDefined



## get_bounds
##
def get_bounds(pts):
"""Compute a minimal rectangle that covers all the points."""
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
for (x,y) in pts:
x0 = min(x0, x)
y0 = min(y0, y)
x1 = max(x1, x)
y1 = max(y1, y)
return (x0,y0,x1,y1)

def uniq(objs):
done = set()
for obj in objs:
Expand All @@ -39,7 +25,7 @@ class LAParams(object):
def __init__(self,
writing_mode='lr-tb',
line_overlap=0.5,
char_margin=3.0,
char_margin=2.0,
line_margin=0.5,
word_margin=0.1,
all_texts=False):
Expand All @@ -52,8 +38,8 @@ def __init__(self,
return

def __repr__(self):
return ('<LAParams: writing_mode=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>' %
(self.writing_mode, self.char_margin, self.line_margin, self.word_margin, self.all_texts))
return ('<LAParams: char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>' %
(self.char_margin, self.line_margin, self.word_margin, self.all_texts))


## LTItem
Expand All @@ -65,7 +51,8 @@ def __init__(self, bbox):
return

def __repr__(self):
return ('<item bbox=%s>' % bbox2str(self.bbox))
return ('<%s %s>' %
(self.__class__.__name__, bbox2str(self.bbox)))

def set_bbox(self, (x0,y0,x1,y1)):
if x1 < x0: (x0,x1) = (x1,x0)
Expand Down Expand Up @@ -123,7 +110,7 @@ class LTPolygon(LTItem):
def __init__(self, linewidth, pts):
self.pts = pts
self.linewidth = linewidth
LTItem.__init__(self, get_bounds(pts))
LTItem.__init__(self, get_bound(pts))
return

def get_pts(self):
Expand Down Expand Up @@ -167,7 +154,9 @@ def __init__(self, name, stream, bbox):

def __repr__(self):
(w,h) = self.srcsize
return '<image %s %dx%d>' % (self.name, w, h)
return ('<%s(%s) %s %dx%d>' %
(self.__class__.__name__, self.name,
bbox2str(self.bbox), w, h))


## LTText
Expand All @@ -179,10 +168,11 @@ def __init__(self, text):
return

def __repr__(self):
return '<text %r>' % self.text
return ('<%s %r>' %
(self.__class__.__name__, self.get_text()))

def is_upright(self):
return True
def get_text(self):
return self.text


## LTAnon
Expand Down Expand Up @@ -239,20 +229,18 @@ def __init__(self, matrix, font, fontsize, scaling, rise, cid):

def __repr__(self):
if self.debug:
return ('<char matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
(matrix2str(self.matrix), self.font, self.fontsize,
bbox2str(self.bbox), self.adv, self.text))
return ('<%s %s matrix=%s font=%r fontsize=%.1f adv=%s text=%r>' %
(self.__class__.__name__, bbox2str(self.bbox),
matrix2str(self.matrix), self.font, self.fontsize,
self.adv, self.get_text()))
else:
return '<char %r>' % self.text

def get_size(self):
return max(self.width, self.height)

def is_vertical(self):
return self.font.is_vertical()

def is_upright(self):
return self.upright
def is_compatible(self, obj):
return True


## LTContainer
Expand All @@ -267,9 +255,6 @@ def __init__(self, objs=None, bbox=(0,0,0,0)):
self._objs = []
return

def __repr__(self):
return ('<container %s>' % bbox2str(self.bbox))

def __iter__(self):
return iter(self.get_objs())

Expand Down Expand Up @@ -302,15 +287,17 @@ def fixate(self):

## LTTextLine
##
class LTTextLine(LTContainer):
class LTTextLine(LTContainer, LTText):

def __init__(self, word_margin=0):
self.word_margin = word_margin
def __init__(self, laparams=None):
self.laparams = laparams
LTContainer.__init__(self)
return

def __repr__(self):
return ('<textline %s>' % bbox2str(self.bbox))
return ('<%s %s %r>' %
(self.__class__.__name__, bbox2str(self.bbox),
self.get_text()))

def get_text(self):
return ''.join( obj.text for obj in self.get_objs() if isinstance(obj, LTText) )
Expand All @@ -320,14 +307,16 @@ def find_neighbors(self, plane, ratio):

class LTTextLineHorizontal(LTTextLine):

def __repr__(self):
return ('<textline-h %s>' % bbox2str(self.bbox))

def get_objs(self):
if self.laparams is None:
for obj in self._objs:
yield obj
return
word_margin = self.laparams.word_margin
x1 = INF
for obj in csort(self._objs, key=lambda obj: obj.x0):
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.width
if isinstance(obj, LTChar) and word_margin:
margin = word_margin * obj.width
if x1 < obj.x0-margin:
yield LTAnon(' ')
yield obj
Expand All @@ -342,14 +331,16 @@ def find_neighbors(self, plane, ratio):

class LTTextLineVertical(LTTextLine):

def __repr__(self):
return ('<textline-v %s>' % bbox2str(self.bbox))

def get_objs(self):
if self.laparams is None:
for obj in self._objs:
yield obj
return
word_margin = self.laparams.word_margin
y0 = -INF
for obj in csort(self._objs, key=lambda obj: -obj.y1):
if isinstance(obj, LTChar) and self.word_margin:
margin = self.word_margin * obj.height
if isinstance(obj, LTChar) and word_margin:
margin = word_margin * obj.height
if obj.y1+margin < y0:
yield LTAnon(' ')
yield obj
Expand All @@ -376,7 +367,9 @@ def __init__(self, objs):
return

def __repr__(self):
return ('<textbox(%s) %s %r...>' % (self.index, bbox2str(self.bbox), self.get_text()[:20]))
return ('<%s(%s) %s %r...>' %
(self.__class__.__name__, self.index,
bbox2str(self.bbox), self.get_text()[:20]))

def get_text(self):
return ''.join( obj.get_text() for obj in self.get_objs() if isinstance(obj, LTTextLine) )
Expand Down Expand Up @@ -489,7 +482,7 @@ def get_textobjs(self, objs, laparams):
textobjs = []
otherobjs = []
for obj in objs:
if isinstance(obj, LTText) and obj.is_upright():
if isinstance(obj, LTChar):
textobjs.append(obj)
else:
otherobjs.append(obj)
Expand All @@ -499,11 +492,9 @@ def get_textlines(self, objs, laparams):
obj0 = None
line = None
for obj1 in objs:
if obj0 is None:
obj0 = obj1
else:
if obj0 is not None:
k = 0
if (obj0.is_voverlap(obj1) and
if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
obj0.hdistance(obj1) < min(obj0.width, obj1.width) * laparams.char_margin):
# obj0 and obj1 is horizontally aligned:
Expand All @@ -517,7 +508,7 @@ def get_textlines(self, objs, laparams):
# |<--->|
# (char_margin)
k |= 1
if (obj0.is_hoverlap(obj1) and
if (obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
obj0.vdistance(obj1) < min(obj0.height, obj1.height) * laparams.char_margin):
# obj0 and obj1 is vertically aligned:
Expand All @@ -538,22 +529,29 @@ def get_textlines(self, objs, laparams):
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
(k & 2 and isinstance(line, LTTextLineVertical)) ):
line.add(obj1)
elif line is None:
if k == 2:
line = LTTextLineVertical(laparams.word_margin)
else:
line = LTTextLineHorizontal(laparams.word_margin)
line.add(obj0)
line.add(obj1)
else:
elif line is not None:
line.fixate()
yield line
line = None
obj0 = obj1
else:
if k == 2:
line = LTTextLineVertical(laparams)
line.add(obj0)
line.add(obj1)
elif k == 1:
line = LTTextLineHorizontal(laparams)
line.add(obj0)
line.add(obj1)
else:
line = LTTextLineHorizontal(laparams)
line.add(obj0)
line.fixate()
yield line
line = None
obj0 = obj1
if line is None:
line = LTTextLineHorizontal(laparams.word_margin)
if obj0 is not None:
line.add(obj0)
line = LTTextLineHorizontal(laparams)
line.add(obj0)
line.fixate()
yield line
return
Expand Down Expand Up @@ -633,14 +631,15 @@ def __init__(self, name, bbox, matrix):
self.name = name
self.matrix = matrix
(x,y,w,h) = bbox
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
bbox = get_bound( apply_matrix_pt(matrix, (p,q))
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
LTAnalyzer.__init__(self, bbox=bbox)
return

def __repr__(self):
return ('<figure %r bbox=%s matrix=%s>' %
(self.name, bbox2str(self.bbox), matrix2str(self.matrix)))
return ('<%s(%s) %s matrix=%s>' %
(self.__class__.__name__, self.name,
bbox2str(self.bbox), matrix2str(self.matrix)))

def analyze(self, laparams=None):
if laparams is not None and laparams.all_texts:
Expand All @@ -660,4 +659,6 @@ def __init__(self, pageid, bbox, rotate=0):
return

def __repr__(self):
return ('<page(%r) bbox=%s rotate=%r>' % (self.pageid, bbox2str(self.bbox), self.rotate))
return ('<%s(%r) %s rotate=%r>' %
(self.__class__.__name__, self.pageid,
bbox2str(self.bbox), self.rotate))
12 changes: 12 additions & 0 deletions pdfminer/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
from sys import maxint as INF
from struct import pack, unpack


Expand Down Expand Up @@ -28,6 +29,17 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
## Utility functions
##

# get_bound
def get_bound(pts):
'''Compute a minimal rectangle that covers all the points.'''
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
for (x,y) in pts:
x0 = min(x0, x)
y0 = min(y0, y)
x1 = max(x1, x)
y1 = max(y1, y)
return (x0,y0,x1,y1)

# pick
def pick(seq, func, maxobj=None):
'''Picks the object obj where func(obj) has the highest value.'''
Expand Down
3 changes: 3 additions & 0 deletions samples/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ XMLS_NONFREE= \
nonfree/naacl06-shinyama.xml \
nonfree/nlp2004slides.xml

all:
$(MAKE) test CMP=cmp

test: htmls texts xmls

clean:
Expand Down

0 comments on commit 3305c07

Please sign in to comment.