Skip to content

Commit

Permalink
Fixed wanky handling of pseudocode
Browse files Browse the repository at this point in the history
  • Loading branch information
fay59 committed Dec 3, 2014
1 parent 64345ef commit 6ac75cf
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 25 deletions.
5 changes: 4 additions & 1 deletion htmltext.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,10 @@ def to_html(self):
tag_stack.append(tag)
result += tag.open()
else:
result += unicode(token)
uni = unicode(token)
for pair in [("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")]:
uni = uni.replace(pair[0], pair[1])
result += uni

while len(tag_stack) > 0:
result += tag_stack.pop().close()
Expand Down
14 changes: 11 additions & 3 deletions pdftable.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,20 @@ def count_segments(list, expected_clusters):
def pretty_much_equal(a, b, threshold = 2):
return abs(a - b) < threshold

class ImplicitTable(object):
class TableBase(object):
    """Common interface for table-like objects.

    Every method here is a stub that raises; concrete table classes
    (ImplicitTable and Table in this module derive from this class) are
    expected to override the accessors they support.

    The stubs raise NotImplementedError, the standard way to signal a
    missing override. It is a subclass of Exception, so callers that
    catch Exception keep working, and the message text is unchanged.
    """

    def get_at(self, x, y):
        """Return the content of the cell addressed by (x, y)."""
        raise NotImplementedError("Not implemented")

    def rows(self):
        """Return the number of rows in the table."""
        raise NotImplementedError("Not implemented")

    def columns(self):
        """Return the number of columns in the table."""
        raise NotImplementedError("Not implemented")

    def bounds(self):
        """Return the bounds of the table."""
        raise NotImplementedError("Not implemented")

    def cell_size(self, x, y):
        # NOTE(review): presumably the span of the cell at (x, y);
        # confirm against the Table implementation.
        raise NotImplementedError("Not implemented")

    def data_index(self, x, y):
        # NOTE(review): presumably maps (x, y) to an index into the
        # table's backing data; confirm against the Table implementation.
        raise NotImplementedError("Not implemented")

class ImplicitTable(TableBase):
def __init__(self, bounds, table_data):
self.__bounds = bounds
self.__data = table_data

def get_at_pixels(self, x, y):
def get_at_pixel(self, x, y):
raise Exception("Not supported on implicit tables")

def get_at(self, x, y):
Expand Down Expand Up @@ -124,7 +132,7 @@ def debug_html(self):
result += '</table>'
return result

class Table(object):
class Table(TableBase):
def __init__(self, group):
ver = []
hor = []
Expand Down
91 changes: 70 additions & 21 deletions x86manual.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ def sort_topdown_ltr(a, b):
if aa.x1() > bb.x1(): return 1
return 0

class TableDataSet(pdftable.TableBase):
    """A one-row, one-column table wrapping a sequence of data.

    The whole data sequence lives in the single cell (0, 0); bounds are
    whatever the creator passed in.
    """

    def __init__(self, bounds, data):
        self.__bounds = bounds
        self.__data = data

    def rows(self):
        """Always 1: the data set is a single cell."""
        return 1

    def columns(self):
        """Always 1: the data set is a single cell."""
        return 1

    def bounds(self):
        """Return the bounds given at construction."""
        return self.__bounds

    def get_at(self, x, y):
        """Return a copy (via slicing) of the data held in cell (0, 0)."""
        # (0, 0) is the only valid cell
        assert x == 0 and y == 0
        return self.__data[:]

def center_aligned_table(source):
assert source.rows() == 1 and source.columns() == 1
bounds = source.bounds()
Expand Down Expand Up @@ -113,15 +125,15 @@ def get_text(self):
return self.text

class CharCollection(object):
def __init__(self, iterable):
def __init__(self, iterable, rect):
self.chars = [c for c in iterable]
while len(self.chars) > 0 and len(self.chars[-1].get_text().strip()) == 0:
self.chars.pop()

# actual, complete bounds (modified by caller)
self.rect = rect
# bounds excluding abnormally-placed characters (exponents, symbols)
self.approx_rect = self.__approximative_bounds()
# actual, complete bounds (modified by caller)
self.rect = self.approx_rect

def bounds(self): return self.approx_rect

Expand Down Expand Up @@ -197,7 +209,34 @@ def flush(self):
orphans.append(line)
lines = orphans

displayable = self.__merge_text(orphans) + tables
# exception tables
i = 0
remaining = []
while i < len(orphans):
line = orphans[i]
if line.font_name() == "NeoSansIntelMedium" and unicode(line).strip().lower()[-10:] == "exceptions":
remaining.append(line)
i += 1

table_data = []
rect = None
while i < len(orphans):
line = orphans[i]
if line.font_name() == "NeoSansIntelMedium": break
table_data.append(line)
if rect == None: rect = line.rect
else: rect = rect.union(line.rect)
i += 1

if len(table_data) > 1:
tables.append(TableDataSet(rect, table_data))
else:
remaining += table_data
else:
remaining.append(line)
i += 1

displayable = self.__merge_text(remaining) + tables
displayable.sort(cmp=sort_topdown_ltr)

self.__output_file(displayable)
Expand Down Expand Up @@ -234,10 +273,8 @@ def end_page(self, page):
def process_text_line(self, line):
# ignore header and footer
if line.bbox[1] < 740 and line.bbox[1] > 50:
coll = CharCollection(line)
coll.rect = self.__fix_bbox(line.bbox)
coll = CharCollection(line, self.__fix_bbox(line.bbox))
coll.approx_rect = self.__fix_rect(coll.approx_rect)
if unicode(coll).find("*") != -1: print coll.rect, coll.approx_rect
self.thisPageTextLines.append(coll)

def process_rect(self, rect):
Expand Down Expand Up @@ -278,13 +315,14 @@ def sort_text(a, b):

if len(lines) == 0: return

lines.sort(cmp=sort_text)
lines.sort(cmp=sort_topdown_ltr)
merged = [lines[0]]
for line in lines[1:]:
last = merged[-1]
print last.approx_rect.y2(), line.approx_rect.y1(), unicode(line)
same_x = pdftable.pretty_much_equal(line.rect.x1(), last.rect.x1())
same_size = last.font_size() == line.font_size()
decent_descent = line.approx_rect.y1() - last.approx_rect.y2() < 2.5
decent_descent = line.approx_rect.y1() - last.approx_rect.y2() < 1.2
if same_x and same_size and decent_descent:
lastChar = last.chars[-1].get_text()[-1]
if not (lastChar == "-" or lastChar == "/"):
Expand Down Expand Up @@ -323,26 +361,34 @@ def write_line(line): result[0] += line + "\n"
def __output_html(self, element):
result = ""
if isinstance(element, list):
return "".join([unicode(e) for e in element])
if isinstance(element, CharCollection):
result = "".join([unicode(e) for e in element])

elif isinstance(element, CharCollection):
kind, result = self.__output_text(element)
if kind[0] == "h":
level = int(kind[1]) - 1
self.__title_stack = self.__title_stack[0:level]
self.__title_stack.append(result)
result = "<%s>%s</%s>\n" % (kind, result, kind)
elif isinstance(element, pdftable.Table):

elif isinstance(element, pdftable.TableBase):
print_index = -1
attributes = ""
if element.rows() == 1 and element.columns() == 1:
if len(self.__title_stack) == 1:
# instruction table
element = left_aligned_table(element)
elif self.__title_stack[-1].strip().lower() == "instruction operand encoding":
# operands encoding
element = center_aligned_table(element)
else:
heading = self.__title_stack[-1].strip().lower()
if heading == "instruction operand encoding":
# operands encoding
element = center_aligned_table(element)
elif heading[-10:] == "exceptions":
# exception table
element = left_aligned_table(element)
attributes += ' class="exception-table"'

result += "<table>\n"
result += "<table%s>\n" % attributes
for row in xrange(0, element.rows()):
result += "<tr>\n"
for col in xrange(0, element.columns()):
Expand Down Expand Up @@ -403,10 +449,13 @@ def __output_text(self, element):
if this_italic: text.append(htmltext.OpenTag("em"))
else: text.append(htmltext.CloseTag("em"))

if this_style[1][0] < style[1][0]:
text.append(htmltext.OpenTag("sup"))
elif style[1][0] < this_style[1][0]:
text.append(htmltext.CloseTag("sup"))
# Intel inconsistently switches between Arial and Verdana
# and uses different font sizes
if this_style[0].find("Arial") == -1 and style[0].find("Arial") == -1:
if this_style[1][0] < style[1][0]:
text.append(htmltext.OpenTag("sup"))
elif style[1][0] < this_style[1][0]:
text.append(htmltext.CloseTag("sup"))
style = this_style

text.append(char.get_text())
Expand Down

0 comments on commit 6ac75cf

Please sign in to comment.