forked from ocropus-archive/DUP-ocropy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocrotest-pages
executable file
·267 lines (219 loc) · 10.3 KB
/
ocrotest-pages
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/usr/bin/python
# Module setup for the page-recognition test driver: imports, plotting backend
# selection, a memory limit, and the initial pylab plotting state.
# The order of the statements below matters; do not regroup the imports.
import sys,os,re,glob
import matplotlib
# The backend is chosen before pylab is imported: "AGG" when the DISPLAY
# environment variable is absent, "GTK" otherwise.
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
from pylab import *
import traceback
import resource
# Set both the soft and the hard RLIMIT_DATA limit of this process to 2e9.
resource.setrlimit(resource.RLIMIT_DATA,(2e9,2e9))
import ocrolib
from ocrolib import plotutils
from ocrolib import hocr
from ocrolib import RecognitionError
import ocropreproc
import ocrorast
import ocrofst
# Initial pylab state: interactive mode on, hold off.
# NOTE(review): hold(False) presumably makes each plot call replace the
# previous axes contents -- confirm against the installed matplotlib version.
ion()
hold(False)
def alert(*args):
    """Write all arguments to stderr as one space-separated line.

    Each argument is converted with str(); the line is terminated by a
    single newline.  With no arguments an empty line is written.
    """
    message = " ".join(map(str, args))
    sys.stderr.write(message + "\n")
from optparse import OptionParser

# Command-line interface definition.  The usage text is shown by --help and
# when the script is started without any image arguments.
parser = OptionParser(usage="""
%prog [options] image1 image2 ...
IMPORTANT: This is not the preferred way of running OCRopus. Instead,
use the ocropus-{binarize,pseg,lattice,align,hocr} commands.
===
Recognize pages using built-in OCRopus components. This first
uses the page cleaner, then the page segmenter, then the line
recognizers, and finally the language model.
The following components take files as arguments, and those files
are loaded in various ways.
--linerec -- line recognizer (.pymodel, .cmodel, or .model)
--langmod -- language model (OpenFST language model dump)
If you want to see what's going on, run ocropus-pages with the "-d" option
("-D" for continuous output, but this slows down recognition significantly).
With the "-L" option, you also see each text line as it's being recognized.
(For even more insight into what is going on during recognition, use the
ocropus-showpsegs and ocropus-showlrecs commands.)
Advanced Usage:
You can choose from a number of components for the different
processing stages. See the output of "ocropus components" for your
choices.
Possible choices are:
--clean (ICleanupGray) -- binarization, denoising, deskewing
--pseg (ISegmentPage) -- page segmentation
--ticlass (ITextImageSegmentation) -- text/image segmentation (off by default)
For each component, you can pass additional parameters. For example,
--clean StandardPreprocessing:rmbig_minaspect=0.1 uses an instance of
StandardPreprocessing for cleanup and sets its rmbig_minaspect
parameter to 0.1. You can see a list of all the parameters with
"ocropus params StandardPreprocessing".
Instead of component names, you can also pass the names of
constructors of Python classes for each of those components, as in
"--clean my.CleanupPage:threshold=0.3" or "--clean
my.CleanupPage(0.3)". This will import the "my" package and then call
the constructor.
""")

# Pipeline components (component names or Python constructor expressions).
parser.add_option("-C", "--clean", help="page cleaner",
                  default="ocropreproc.CommonPreprocessing")
parser.add_option("-P", "--pseg", help="line segmenter",
                  default="ocrorast.SegmentPageByRAST")
parser.add_option("-T", "--ticlass", help="text image segmenter",
                  default=None)

# Models loaded from files.
parser.add_option("-m", "--linerec", help="linerec model",
                  default="default.cmodel")
parser.add_option("-l", "--langmod", help="langmod",
                  default="default.fst")
parser.add_option("-w", "--lweight", help="weight for the language model",
                  type="float", default=1.0)

# Output format and verbosity.
parser.add_option("-v", "--verbose", help="verbose", action="store_true")
parser.add_option("-x", "--hocr", help="output XHTML+hOCR", action="store_true")
parser.add_option("-p", "--plain", help="output plain text", action="store_true")
# The type is given by its optparse name ("int"), like the other typed options.
parser.add_option("-r", "--dpi", help="resolution in dpi",
                  default=200, type="int")
parser.add_option("-S", "--silent", action="store_true", help="disable warnings")

# Interactive display for debugging.
parser.add_option("-d", "--display", help="display result", action="store_true")
parser.add_option("-D", "--Display", help="display continuously", action="store_true")
parser.add_option("-L", "--displaylines", help="display lines as well",
                  action="store_true")

# Search parameters.
parser.add_option("-B", "--beam", help="size of beam in beam search",
                  type="int", default=1000)
# Parse the command line; the positional arguments are the page images.
(options, args) = parser.parse_args()

# Without any input there is nothing to do: show the usage text and exit.
if not args:
    parser.print_help()
    sys.exit(0)

# FIXME add language model weights
# Reported through parser.error() instead of assert, because assert
# statements are removed when Python runs with -O and would let an
# unsupported weight through silently.
if options.lweight != 1.0:
    parser.error("other language model weights not implemented yet")

# -D (continuous display) implies -d.
if options.Display:
    options.display = 1
if options.display:
    ion()
if options.displaylines:
    sys.stderr.write("[note] recognizer runs significantly slower with -L flag (line display)\n")
# Build or load every stage of the recognition pipeline before any page is
# read.  Component strings may carry parameters, using the syntax documented
# under ocrolib.make_component.

# Stage 1: preprocessor (noise removal, page deskewing, other cleanup).
preproc = ocrolib.make_IBinarize(options.clean)

# Stage 2: page segmenter.
segmenter = ocrolib.make_ISegmentPage(options.pseg)

# Stage 3: line recognizer.  Unlike the stages above it is loaded from a
# file rather than instantiated: x.model is a C++ line recognizer, x.cmodel
# a C++ character recognizer, and x.pymodel a pickled Python line recognizer.
_linerec_path = ocrolib.ocropus_find_file(options.linerec)
linerec = ocrolib.load_linerec(_linerec_path)
linerec.norejects = 1
alert("[note]","line recognizer:",linerec)

# Stage 4: language model, read from disk unless "--langmod none" was given.
lmodel = None
if options.langmod != "none":
    lmodel = ocrofst.OcroFST()
    lmodel.load(ocrolib.ocropus_find_file(options.langmod))

# Optional text/image segmenter; stays None when --ticlass is not given.
ticlass = (ocrolib.make_ITextImageClassification(options.ticlass)
           if options.ticlass is not None else None)
# Page recognition driver.
#
# Every print below is written as print(<single expression>), which produces
# the same output under the Python 2 print statement this script targets.


def _pause_or_flush():
    """Wait for ENTER in step mode (-d), or just flush the plot in -D mode."""
    if not options.Display:
        raw_input("hit ENTER to continue")
    else:
        # here to flush the display; times out quickly
        ginput(1, timeout=0.1)


def _decode_line(fst):
    """Apply the language model to a recognition lattice.

    Returns a (text, cost) pair.  Without a language model the best path
    of the lattice is used with cost 0.0.  With one, a beam search through
    lattice and language model is performed; if its cost exceeds 999999.0
    the best path of the lattice alone is used with cost 999.0.
    text is whatever bestpath()/beam_search_simple return and may be None.
    """
    if lmodel is None:
        return fst.bestpath(), 0.0
    # The lattice is written to the current directory before the search.
    fst.save("_pages.fst")
    text, cost = ocrofst.beam_search_simple(fst, lmodel, options.beam)
    if cost > 999999.0:
        alert("[warn]", "beam search didn't find a solution (line not in language model)")
        return fst.bestpath(), 999.0
    return text, cost


def _emit_line(regions, i, text, cost):
    """Print one recognized line in the format selected on the command line."""
    if options.hocr:
        # For hOCR output, the line bounding box comes from the region
        # extractor.  Word or character bounding boxes would have to be
        # added here.
        x0, y0, x1, y1 = regions.bboxMath(i)
        print("<span class='ocr_line' bbox='%d %d %d %d'>%s</span>" % (x0, y0, x1, y1, text))
    elif options.plain:
        # For plain output, just output the string itself.
        print(text)
    else:
        # The default output gives the cost, length, and string.
        print("%6.2f\t%3d\t%s" % (cost, len(text), text))


def _recognize_line(regions, page_bin, i):
    """Recognize and print text line number i of the current page.

    Returns without output when the line fails the sanity check, when the
    line recognizer raises, or when no text could be decoded.
    """
    show_lines = options.display and options.displaylines

    # Extract the line image and optionally display it.
    line = regions.extract(page_bin, i, 1)
    if show_lines:
        clf()
        subplot(111)
        imshow(line, cmap=cm.gray)
        ginput(1, timeout=0.1)

    # Simple sanity check on the text line image (disabled by --silent);
    # skip anything that doesn't look like a real text line.
    if not options.silent and \
            ocrolib.quick_check_line_components(line, dpi=options.dpi) < 0.5:
        return

    # The actual text line recognition yields the recognition lattice and
    # the raw segmentation; only the lattice is used below.
    try:
        fst, _rseg = linerec.recognizeLineSeg(line)
    except RecognitionError as e:
        print("FAILED: " + re.sub('\n', ' ', str(e)[:80]))
        # Skip the line: there is no lattice for it, and continuing would
        # reuse the lattice of a previous line (or hit an undefined name).
        return
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C and
        # sys.exit() still terminate the program.
        traceback.print_exc()
        alert("[error]", "line recognizer failed")
        return

    text, cost = _decode_line(fst)
    if text is None:
        # No text for this line; this also covers the fallback after a
        # failed beam search, where len(None) would otherwise raise.
        return

    _emit_line(regions, i, text, cost)

    # If display is on, pause on (or flush) the line display.  (Note that
    # this slows down recognition considerably because Matplotlib isn't
    # very fast.)  NOTE: drawing of the recognition result is disabled.
    if show_lines:
        subplot(111)
        _pause_or_flush()


def _recognize_page(page_gray, pagefile):
    """Binarize, segment and recognize a single page image.

    page_gray is the gray image delivered by the page iterator and
    pagefile its file name (used in log messages only).  Returns early,
    without recognizing any line, when the page fails the sanity check or
    has more than 150 text lines.
    """
    # Perform cleanup and binarization of the page.
    page_bin, _cleaned_gray = preproc.binarize(page_gray)
    if not options.silent and \
            ocrolib.quick_check_page_components(page_bin, dpi=options.dpi) < 0.5:
        return

    # Black out images in the binary page image.  This will cause images to
    # be treated as non-text blocks by the page segmenter.
    if ticlass is not None:
        ocrolib.blackout_images(page_bin, ticlass)

    # Perform page segmentation into text columns and text lines.
    page_seg = segmenter.segment(page_bin)

    # For debugging purposes, display the binarized page image.
    # NOTE: drawing of the segmentation overlay is disabled.
    if options.display:
        clf()
        axis = subplot(111)
        axis.imshow(page_bin, cmap=cm.gray)
        draw()
        _pause_or_flush()

    # Iterate through the text lines of the page, in reading order, using
    # the RegionExtractor utility class.  The page_seg image is just an RGB
    # image, see http://docs.google.com/Doc?id=dfxcv4vc_92c8xxp7
    regions = ocrolib.RegionExtractor()
    regions.setPageLines(page_seg)
    nlines = regions.length()

    # If there are too many text lines, probably something went wrong with
    # the page segmentation. (FIXME: make this more flexible)
    if nlines > 150:
        alert("[error] too many lines (%d), probably bad input; skipping" % nlines)
        return

    alert("[note]", pagefile, "lines:", nlines)
    # Line indices start at 1, as in the original loop.
    for i in range(1, nlines):
        _recognize_line(regions, page_bin, i)


# Start the output with the hOCR header if hOCR output has been requested.
if options.hocr:
    print(hocr.header())

# Iterate through the pages specified by the arguments.  Since this can be
# somewhat tricky with TIFF files, the page_iterator abstraction takes care
# of the special cases; it yields one gray image after another, plus the
# file name.
for pageno, (page_gray, pagefile) in enumerate(ocrolib.page_iterator(args), 1):
    sys.stderr.write("[note] *** %d %s ***\n" % (pageno, pagefile))

    # Output geometric page information.
    # FIXME add: bbox, ppageno
    if options.hocr:
        print("<div class='ocr_page' id='page_%d' ppageno='%d' image='%s'>" %
              (pageno, pageno, pagefile))

    _recognize_page(page_gray, pagefile)

    # Close off the DIV for the page.  Because skipped pages return from
    # _recognize_page instead of jumping to the next iteration, the element
    # opened above is always closed, keeping the hOCR output well-formed.
    if options.hocr:
        print("</div>")

if options.hocr:
    print(hocr.footer())