forked from ocropus-archive/DUP-ocropy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocropus-visualize-results
executable file
·113 lines (101 loc) · 4.15 KB
/
ocropus-visualize-results
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
from __future__ import print_function
import glob
import sys
import os
import signal
import argparse
import matplotlib
matplotlib.use("AGG")
import matplotlib.pyplot as plt
import numpy as np
from scipy.ndimage import interpolation
import ocrolib
from ocrolib import morph
# Turn Ctrl-C (SIGINT) into a plain exit with status 1.
signal.signal(signal.SIGINT,lambda *args:sys.exit(1))

parser = argparse.ArgumentParser(description = """
Generate HTML for debugging a book directory.
Input: a directory in standard OCRopus book format
Output: index.html files and thumbnails showing recognition results
""")
# A positional argument only honours `default` when it is optional, i.e.
# declared with nargs="?"; without it argparse still requires the argument
# and default="book" is never used.
parser.add_argument("book",nargs="?",default="book",help="book directory, default: %(default)s")
parser.add_argument("-N","--npages",type=int,default=100000,help="max number of pages, default: %(default)s")
args = parser.parse_args()
def write_cseg(stream,cseg_file):
    """Write a one-row HTML table of segment thumbnails for `cseg_file`.

    The segmentation is read with `ocrolib.read_line_segmentation` and split
    with `linerec.extract_csegs`. For each resulting item, its `img` array is
    saved intensity-inverted as a grayscale PNG named
    ".__<cseg_file>_NNN.png" (relative to the current directory) and an
    <img> cell referencing that file is written to `stream`.

    stream    -- open, writable text stream receiving the HTML
    cseg_file -- path of the segmentation file to visualize
    """
    # NOTE(review): `linerec` is not imported in the visible part of this
    # file; unless it is provided elsewhere the call below raises NameError.
    # The segmentation is read once (the original read the same file twice).
    cseg = ocrolib.read_line_segmentation(cseg_file)
    csegs = linerec.extract_csegs(cseg)
    stream.write("<table><tr>")
    for i,c in enumerate(csegs):
        out = ".__"+cseg_file+"_%03d.png"%i
        # Subtracting from the maximum inverts the intensities before saving.
        plt.imsave(out,np.amax(c.img)-c.img,cmap=plt.cm.gray)
        # Display at half the array height, but never below 2 pixels; integer
        # division gives the same "%d" output on Python 2 and 3.
        height = max(2,c.img.shape[0]//2)
        stream.write("<td><img src=%s height=%d style='border: 1px #ccccff solid;'></td>"%(out,height))
    stream.write("</tr></table>")
    stream.write("\n")
def genpage(d):
    """Generate `index.html` inside page directory `d`.

    For every file matching "??????.bin.png" in `d` (in sorted order) the
    page shows: the contents of its "txt" and raw "txt" variant files when
    they exist, the image itself at half its array width (at least 10),
    navigation links, the cseg thumbnails when a "cseg" variant exists, and
    a rendering of the "rseg" variant when that exists. Rendered helper
    images are written into `d` with a ".__" prefix.

    The working directory is changed to `d` while the page is generated and
    is restored afterwards, even if an error occurs.

    d -- page directory, relative to the current working directory
    """
    print("===", d)
    here = os.getcwd()
    try:
        os.chdir(d)
        with open("index.html","w") as stream:
            stream.write("<h1>%s</h1>\n"%d)
            images = sorted(glob.glob("??????.bin.png"))
            for img in images:
                # NOTE(review): file contents are written into the HTML
                # unescaped; text containing "<" or "&" will garble the page.
                txt = ocrolib.fvariant(img,"txt","")
                if os.path.exists(txt):
                    with open(txt) as tf:
                        text = tf.read()
                    stream.write("<font color='#000066'><b>%s</b></font><br>\n"%text)
                rtxt = ocrolib.fvariant(img,"txt","raw")
                if os.path.exists(rtxt):
                    with open(rtxt) as tf:
                        rtext = tf.read()
                    stream.write("<font color='gray'><b>%s</b></font><br>\n"%rtext)
                stream.write("<p />\n")
                image = ocrolib.read_image_gray(img)
                # Integer division gives the same "%d" output on Python 2 and 3.
                stream.write("<img width='%d' src='%s'>\n"%(max(10,image.shape[1]//2),img))
                stream.write("<br />\n")
                # Breadcrumb: book / page directory / line image.
                stream.write("<font size=-2>")
                stream.write("<a href=%s>%s</a> / "%("..",args.book))
                stream.write("<a href=%s>%s</a> / "%("../"+d,d))
                stream.write("<a href=%s>%s</a>"%(img,img))
                stream.write("</font>")
                stream.write("<p />\n")
                cseg = ocrolib.fvariant(img,"cseg")
                if os.path.exists(cseg):
                    write_cseg(stream,cseg)
                rseg_file = ocrolib.fvariant(img,"rseg")
                if os.path.exists(rseg_file):
                    rseg = ocrolib.read_line_segmentation(rseg_file)
                    fig = plt.figure(figsize=(20,1),dpi=150)
                    try:
                        morph.showlabels(rseg)
                        figfile = ".__"+rseg_file+"_.png"
                        plt.savefig(figfile)
                    finally:
                        # pyplot keeps every figure alive until it is closed;
                        # without this one figure per line accumulates for
                        # the whole run.
                        plt.close(fig)
                    stream.write("<img height='50' src='%s'><br>\n"%figfile)
                stream.write("<hr>\n")
    finally:
        os.chdir(here)
# Build the top-level index.html of the book: one table per page directory
# with a thumbnail linking to the page's own index and a few text excerpts.
os.chdir(args.book)
with open("index.html","w") as index:
    page_dirs = sorted(glob.glob("????"))
    for page in page_dirs[:args.npages]:
        # Per-page index first; it writes into the page directory.
        genpage(page)

        # Thumbnail source: the page's binarized image, or an all-zero
        # 300x300 array when that image does not exist.
        page_png = page+".bin.png"
        if not os.path.exists(page_png):
            thumb = np.zeros((300,300))
        else:
            thumb = ocrolib.read_image_gray(page_png)
        thumb_file = ".__"+page+".png"
        # Scale both axes to 1/8 with linear interpolation.
        thumb = interpolation.zoom(thumb,(0.125,0.125),order=1)
        plt.imsave(thumb_file,thumb,cmap=plt.cm.gray)

        # Left cell: linked thumbnail and the directory name.
        index.writelines([
            "<table border=1><tr>\n",
            "<td>",
            "<a href='%s/index.html'><img src='%s'></a>"%(page,thumb_file),
            "<br>%s<br>"%page,
            "</td>\n",
            "<td>",
        ])

        # Right cell: the first 100 characters of up to ten text files that
        # hold at least 20 characters each.
        shown = 0
        for txt_path in sorted(glob.glob(page+"/??????.txt")):
            with open(txt_path) as handle:
                content = handle.read()
            if len(content)>=20:
                index.write("%s<br>\n"%content[:100])
                shown += 1
                if shown>=10:
                    break

        index.writelines([
            "</td>\n",
            "</tr></table>\n",
        ])