Skip to content

Commit

Permalink
Overall better handling of formatted text
Browse files Browse the repository at this point in the history
  • Loading branch information
fay59 committed Dec 2, 2014
1 parent c9f5112 commit 64345ef
Showing 2 changed files with 180 additions and 47 deletions.
94 changes: 94 additions & 0 deletions htmltext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/bin/env python
# -*- coding: UTF-8 -*-

# OpenTag and CloseTag are NOT HARDENED against HTML injection!
# Do not use them for input that you cannot perfectly predict.

import re

class OpenTag(object):
    """Token representing an opening HTML tag such as ``<td colspan="2">``.

    Only double quotes inside attribute values are escaped; tag names,
    attribute names and the rest of each value are emitted verbatim, so this
    class is NOT hardened against HTML injection.

    Parameters:
        tag: element name, e.g. ``"em"``.
        coalesce: when true, HtmlText may merge this tag with an immediately
            following identical tag instead of closing and reopening it.
        attributes: optional mapping of attribute name to string value.
    """

    def __init__(self, tag, coalesce=False, attributes=None):
        self.tag = tag
        self.coalesce = coalesce
        # Build a fresh dict per instance: a literal ``{}`` default would be
        # one object shared (and mutated) by every tag created without
        # explicit attributes.
        self.attributes = {} if attributes is None else attributes
        # Filled in by HtmlText.append once a matching CloseTag is seen.
        self.closed_by = None

    def __attribute_string(self):
        """Return the attributes rendered as `` key="value"`` pairs."""
        # A raw double quote would terminate the attribute value early, so it
        # is replaced by its character reference.
        return "".join(
            ' %s="%s"' % (key, self.attributes[key].replace('"', '&quot;'))
            for key in self.attributes
        )

    def open(self):
        """Return the opening markup, including any attributes."""
        return "<%s%s>" % (self.tag, self.__attribute_string())

    def close(self):
        """Return the closing markup matching this tag."""
        return "</%s>" % self.tag

    def __str__(self):
        return self.open()

class CloseTag(object):
    """Token representing a closing HTML tag such as ``</em>``.

    Like OpenTag, the tag name is emitted verbatim and is not protected
    against HTML injection.
    """

    def __init__(self, tag):
        self.tag = tag
        # The OpenTag this token terminates; HtmlText.append assigns it when
        # the token is added to a text.
        self.closes = None

    def __str__(self):
        template = "</%s>"
        return template % self.tag

class HtmlText(object):
    """Ordered list of text fragments and tag tokens rendered as HTML.

    Tokens are classified by duck typing: an object with both ``tag`` and
    ``coalesce`` attributes is an opening tag, an object with ``tag`` but no
    ``coalesce`` is a closing tag, and anything else is text.
    """

    def __init__(self):
        self.__tokens = []

    def append(self, token):
        """Add a text fragment, an opening tag or a closing tag.

        A closing tag is paired with the nearest preceding opening tag of the
        same name that is still unclosed. An opening tag that directly follows
        (separated only by other tags) the close of an identical, coalescing
        opening tag is merged with it: the earlier close is dropped and the
        new opening tag is not stored.

        Raises:
            Exception: if a closing tag has no unclosed opening tag to match.
        """
        if self.__is_close(token):
            for prev in reversed(self.__tokens):
                if self.__is_open(prev) and prev.closed_by is None and prev.tag == token.tag:
                    token.closes = prev
                    prev.closed_by = token
                    break
            else:
                raise Exception("No matching OpenTag found!")
        elif self.__is_open(token):
            # Walk backwards over the trailing run of tag tokens only; the
            # first text fragment ends the search.
            for index in range(len(self.__tokens) - 1, -1, -1):
                last = self.__tokens[index]
                if self.__is_open(last):
                    continue
                if not self.__is_close(last):
                    break
                # The coalesce flag and the attributes live on the opening
                # tag that `last` terminates, not on the closing tag itself.
                opener = last.closes
                if (opener is not None
                        and last.tag == token.tag
                        and opener.coalesce
                        and opener.attributes == token.attributes):
                    del self.__tokens[index]
                    # The opener is open again, so a later closing tag must
                    # be allowed to pair with it.
                    opener.closed_by = None
                    last.closes = None
                    return

        self.__tokens.append(token)

    def to_html(self):
        """Render the tokens as a unicode string of well-nested HTML.

        When a closing tag does not match the innermost open tag, the tags
        opened after its match are closed first and reopened afterwards.
        Tags still open at the end are closed automatically.
        """
        tag_stack = []
        pieces = []
        for token in self.__tokens:
            if self.__is_open(token):
                tag_stack.append(token)
                pieces.append(token.open())
            elif self.__is_close(token):
                close_it = tag_stack.pop()
                reopen_it = []
                while close_it.tag != token.tag:
                    pieces.append(close_it.close())
                    reopen_it.append(close_it)
                    close_it = tag_stack.pop()
                pieces.append(close_it.close())
                for tag in reversed(reopen_it):
                    tag_stack.append(tag)
                    pieces.append(tag.open())
            else:
                # Formatting through a unicode literal converts the token to
                # text on both Python 2 and Python 3.
                pieces.append(u"%s" % (token,))

        while tag_stack:
            pieces.append(tag_stack.pop().close())
        return u"".join(pieces)

    def __is_open(self, token):
        """Return True if `token` looks like an opening tag."""
        return hasattr(token, "tag") and hasattr(token, "coalesce")

    def __is_close(self, token):
        """Return True if `token` looks like a closing tag."""
        return hasattr(token, "tag") and not hasattr(token, "coalesce")
133 changes: 86 additions & 47 deletions x86manual.py
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@

from pdfminer.layout import *
import pdftable
import htmltext
import sys
import math
import bisect
@@ -91,7 +92,9 @@ def left_aligned_table(source):
if pdftable.pretty_much_equal(item_bounds.x1(), columns[i]):
col_index = i
break
else: raise Exception("No matching column!")
else:
print columns
raise Exception("No matching column!")

row[col_index] = [item]

@@ -110,16 +113,32 @@ def get_text(self):
return self.text

class CharCollection(object):
def __init__(self, rect, iterable):
self.rect = rect
def __init__(self, iterable):
self.chars = [c for c in iterable]
while len(self.chars) > 0 and len(self.chars[-1].get_text().strip()) == 0:
self.chars.pop()

# bounds excluding abnormally-placed characters (exponents, symbols)
self.approx_rect = self.__approximative_bounds()
# actual, complete bounds (modified by caller)
self.rect = self.approx_rect

def bounds(self): return self.approx_rect

def bounds(self): return self.rect
def __approximative_bounds(self):
    """Return bounds built only from characters at the collection's font size.

    Characters without a ``matrix`` attribute, or whose ``matrix[0]`` differs
    from ``self.font_size()``, are ignored. The first qualifying character
    seeds the rectangle; later ones are merged in only when their ``y1()``
    equals the current rectangle's ``y1()``.

    Returns ``self.rect`` when there are no characters, and None when no
    character qualifies.
    """
    # NOTE(review): this appears to be called from __init__ before self.rect
    # is assigned, in which case the empty case would raise AttributeError -
    # confirm against the constructor.
    if len(self.chars) == 0:
        return self.rect

    size = self.font_size()
    approx = None
    for c in self.chars:
        if not (hasattr(c, "matrix") and c.matrix[0] == size):
            continue
        rect = pdftable.Rect(c.x0, c.y1, c.x1, c.y0)
        # Identity test: never routes the "nothing yet" check through a
        # rectangle equality method.
        if approx is None:
            approx = rect
        elif approx.y1() == rect.y1():
            approx = approx.union(rect)
    return approx

def append(self, line):
self.rect = self.rect.union(line.rect)
self.approx_rect = self.approx_rect.union(line.approx_rect)
self.chars += line.chars
while len(self.chars[-1].get_text().strip()) == 0:
self.chars.pop()
@@ -156,6 +175,7 @@ def __init__(self, outputDir, laParams):
self.thisPageLtRects = []
self.thisPageTextLines = []
self.__title_stack = []
self.__is_code = False

def flush(self):
tables = []
@@ -164,6 +184,7 @@ def flush(self):
if len(cluster) >= 4:
tables.append(pdftable.Table(cluster))

assert len(tables) > 0
# fill tables
lines = self.textLines
for table in tables:
@@ -213,8 +234,11 @@ def end_page(self, page):
def process_text_line(self, line):
# ignore header and footer
if line.bbox[1] < 740 and line.bbox[1] > 50:
rect = self.__fix_bbox(line.bbox)
self.thisPageTextLines.append(CharCollection(rect, line))
coll = CharCollection(line)
coll.rect = self.__fix_bbox(line.bbox)
coll.approx_rect = self.__fix_rect(coll.approx_rect)
if unicode(coll).find("*") != -1: print coll.rect, coll.approx_rect
self.thisPageTextLines.append(coll)

def process_rect(self, rect):
self.thisPageLtRects.append(self.__fix_bbox(rect.bbox))
@@ -259,10 +283,9 @@ def sort_text(a, b):
for line in lines[1:]:
last = merged[-1]
same_x = pdftable.pretty_much_equal(line.rect.x1(), last.rect.x1())
same_font = last.font_name() == line.font_name()
same_size = last.font_size() == line.font_size()
decent_descent = line.rect.y1() - last.rect.y2() < 2.5
if same_x and same_font and same_size and decent_descent:
decent_descent = line.approx_rect.y1() - last.approx_rect.y2() < 2.5
if same_x and same_size and decent_descent:
lastChar = last.chars[-1].get_text()[-1]
if not (lastChar == "-" or lastChar == "/"):
last.append_char(" ")
@@ -302,14 +325,21 @@ def __output_html(self, element):
if isinstance(element, list):
return "".join([unicode(e) for e in element])
if isinstance(element, CharCollection):
result = self.__output_text(element)
kind, result = self.__output_text(element)
if kind[0] == "h":
level = int(kind[1]) - 1
self.__title_stack = self.__title_stack[0:level]
self.__title_stack.append(result)
result = "<%s>%s</%s>\n" % (kind, result, kind)

elif isinstance(element, pdftable.Table):
print_index = -1
if element.rows() == 1 and element.columns() == 1:
if len(self.__title_stack) == 1:
# instruction table
element = left_aligned_table(element)
elif self.__title_stack[-1].strip().lower() == "instruction operand encoding":
# operands encoding
element = center_aligned_table(element)

result += "<table>\n"
@@ -320,56 +350,65 @@ def __output_html(self, element):
if index <= print_index: continue
index = print_index

size = element.cell_size(col, row)
colspan = (' colspan="%i"' % size[0]) if size[0] > 1 else ""
rowspan = (' rowspan="%i"' % size[1]) if size[1] > 1 else ""
result += "<td%s%s>" % (colspan, rowspan)
cell_tag = "td"
contents = ""
children = self.__merge_text(element.get_at(col, row))
if children != None:
if len(children) == 1:
result += self.__output_html(children[0])
kind, text = self.__output_text(children[0])
text = text.strip()
if kind != "p":
contents = text
cell_tag = "th"
elif text[0:8] == "<strong>":
contents = text[8:-9]
cell_tag = "th"
else:
contents = text
else:
contents = "\n"
for child in children:
result += "<p>%s</p>\n" % self.__output_html(child)
result += "</td>\n"
contents += self.__output_html(child)

size = element.cell_size(col, row)
colspan = (' colspan="%i"' % size[0]) if size[0] > 1 else ""
rowspan = (' rowspan="%i"' % size[1]) if size[1] > 1 else ""
result += "<%s%s%s>%s</%s>\n" % (cell_tag, colspan, rowspan, contents, cell_tag)
result += "</tr>\n"
result += "</table>\n"
return result

def __output_text(self, element):
bold = False
italic = False
superscript = False
if len(element.chars) == 0: return ""

text = htmltext.HtmlText()

stack_index = None
tag = u"p"
# what kind of text block is this?
kind = "p"
if element.font_name() == "NeoSansIntelMedium":
if element.font_size() >= 12:
tag = "h1"
stack_index = 0
if element.font_size() >= 12: kind = "h1"
elif element.font_size() >= 9.9:
if element.bounds().x1() < 50:
tag = "h2"
stack_index = 1
else:
tag = "h3"
stack_index = 2
if element.bounds().x1() < 50: kind = "h2"
else: kind = "h3"
else:
bold = True

self.__title_stack = self.__title_stack[:stack_index]
self.__title_stack.append(unicode(element))
text.append(htmltext.OpenTag("strong"))

result = "<%s>" % tag
if bold: result += "<strong>"
if italic: result += "<em>"
if superscript: result += "<sup>"

# TODO style transitions
result += unicode(element).strip()
style = [element.chars[0].fontname, element.chars[0].matrix[0:4]]
for char in element.chars:
if hasattr(char, "fontname") and hasattr(char, "matrix"):
this_style = [char.fontname, char.matrix[0:4]]
if this_style != style and this_style[0].find("Symbol") == -1:
this_italic = this_style[0].find("Italic") != -1
if this_italic != (style[0].find("Italic") != -1):
if this_italic: text.append(htmltext.OpenTag("em"))
else: text.append(htmltext.CloseTag("em"))

if this_style[1][0] < style[1][0]:
text.append(htmltext.OpenTag("sup"))
elif style[1][0] < this_style[1][0]:
text.append(htmltext.CloseTag("sup"))
style = this_style

text.append(char.get_text())

if superscript: result += "</sup>"
if italic: result += "</em>"
if bold: result += "</strong>"
result += "</%s>" % tag
return result
return (kind, text.to_html())

0 comments on commit 64345ef

Please sign in to comment.