diff --git a/pdfminer/layout.py b/pdfminer/layout.py index d9a108cd..b970f25d 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -472,7 +472,8 @@ def __init__(self, bbox): self.groups = None return - def get_textlines(self, laparams, objs): + # group_objects: group text object to textlines. + def group_objects(self, laparams, objs): obj0 = None line = None for obj1 in objs: @@ -538,13 +539,14 @@ def get_textlines(self, laparams, objs): yield line return - def get_textboxes(self, laparams, lines): + # group_textlines: group neighboring lines to textboxes. + def group_textlines(self, laparams, lines): plane = Plane(self.bbox) plane.extend(lines) boxes = {} for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) - assert line in neighbors, line + if line not in neighbors: continue members = [] for obj1 in neighbors: members.append(obj1) @@ -559,6 +561,7 @@ def get_textboxes(self, laparams, lines): boxes[obj] = box done = set() for line in lines: + if line not in boxes: continue box = boxes[line] if box in done: continue @@ -567,6 +570,7 @@ def get_textboxes(self, laparams, lines): yield box return + # group_textboxes: group textboxes hierarchically. def group_textboxes(self, laparams, boxes): assert boxes @@ -633,18 +637,16 @@ def isany(obj1, obj2): def analyze(self, laparams): # textobjs is a list of LTChar objects, i.e. # it has all the individual characters in the page. - (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs) + (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self) for obj in otherobjs: obj.analyze(laparams) if not textobjs: return - textlines = list(self.get_textlines(laparams, textobjs)) - assert len(textobjs) <= sum(len(line._objs) for line in textlines) + textlines = list(self.group_objects(laparams, textobjs)) (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines) for obj in empties: obj.analyze(laparams) - textboxes = list(self.get_textboxes(laparams, textlines)) - assert len(textlines) == sum(len(box._objs) for box in textboxes) + textboxes = list(self.group_textlines(laparams, textlines)) if textboxes: self.groups = self.group_textboxes(laparams, textboxes) assigner = IndexAssigner() diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 8a46cd01..e835a075 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -259,6 +259,8 @@ def __contains__(self, obj): return obj in self._objs def _getrange(self, (x0, y0, x1, y1)): + if (x1 <= self.x0 or self.x1 <= x0 or + y1 <= self.y0 or self.y1 <= y0): return x0 = max(self.x0, x0) y0 = max(self.y0, y0) x1 = min(self.x1, x1) diff --git a/samples/nonfree/naacl06-shinyama.xml.ref b/samples/nonfree/naacl06-shinyama.xml.ref index d3b13a10..228b90db 100644 --- a/samples/nonfree/naacl06-shinyama.xml.ref +++ b/samples/nonfree/naacl06-shinyama.xml.ref @@ -3750,14 +3750,14 @@ - + - +