Skip to content

Commit

Permalink
Added basic script to generate simple HTML from entire image processing chain. Needs lots of work, but a good start.
Browse files Browse the repository at this point in the history
  • Loading branch information
johnoneil committed Aug 25, 2013
1 parent 2ecba82 commit 852263d
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 61 deletions.
112 changes: 112 additions & 0 deletions MangaDetectText
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/python
# vim: set ts=2 expandtab:
"""
Module: ocr
Desc:
Author: John O'Neil
Email: [email protected]
DATE: Saturday, August 25th 2013
Front to back end Manga text detection.
Input is image file of raw manga image
Output is HTML page annotating image with detected text.
"""
#import clean_page as clean
import connected_components as cc
import run_length_smoothing as rls
import clean_page as clean
import ocr

import numpy as np
import cv2
import sys
import argparse
import os
import scipy.ndimage
#from pylab import zeros,amax,median


if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='Generate HTML annotation for raw manga scan with detected OCR\'d text.')
    parser.add_argument('infile', help='Input (color) raw Manga scan image to annotate.')
    parser.add_argument('-o', '--output', dest='outfile', help='Output html file.')
    #parser.add_argument('-m','--mask', dest='mask', default=None, help='Output (binary) mask for non-graphical regions.')
    #parser.add_argument('-b','--binary', dest='binary', default=None, help='Binarized version of input file.')
    parser.add_argument('--verbose', help='Verbose operation. Print status messages during processing', action="store_true")
    parser.add_argument('--display', help='Display output using OPENCV api and block program exit.', action="store_true")

    args = parser.parse_args()
    infile = args.infile
    # Default output name is derived from the input path; -o/--output overrides it.
    outfile = args.outfile if args.outfile is not None else infile + '.html'

    # Read the parsed input path, NOT sys.argv[1]: the raw argv entry is wrong
    # whenever option flags precede the positional argument.
    img = cv2.imread(infile)
    (h, w) = img.shape[:2]

    gray = clean.grayscale(img)

    # Estimate the typical connected-component (glyph) size from a lightly
    # blurred binarization; it parameterizes the smoothing and size filters below.
    gaussian_filtered = scipy.ndimage.gaussian_filter(gray, sigma=1.5)
    gaussian_binary = clean.binarize(gaussian_filtered)
    average_size = cc.average_size(gaussian_binary)

    (binary, mask, cleaned) = clean.clean_image_file(infile)

    # Use a multiple of average component size as the run-length-smoothing
    # thresholds in both directions.
    vertical_smoothing_threshold = 0.75 * average_size
    horizontal_smoothing_threshold = 0.75 * average_size

    inv_cleaned = cv2.bitwise_not(cleaned)
    inv_binary = cv2.bitwise_not(binary)
    run_length_smoothed_or = rls.RLSO(
        inv_cleaned, horizontal_smoothing_threshold, vertical_smoothing_threshold)

    components = cc.get_connected_components(run_length_smoothed_or)

    # Perhaps do more strict filtering of connected components because sections
    # of characters will not be dropped from run length smoothed areas?
    # Yes. Results quite good.
    filtered = cc.filter_by_size(img, components, average_size * 100, average_size * 1)

    # Build an HTML page annotating the input image with the OCR'd text.
    # django is imported lazily so the detection pipeline above has no
    # dependency on it; settings.configure() is required before template use.
    from django.template import Template, Context
    from django.conf import settings
    settings.configure()
    blurbs = ocr.ocr_on_bounding_boxes(inv_binary, filtered)
    template = u'''
    <html>
    <head>
    <title>{{ image }}</title>
    </head>
    <body>
    <h1>{{ image }}.</h1>
    <img src="{{ image }}" alt="{{ image }}"/>
    {% for blurb in blurbs %}
    {{ blurb.confidence }} - {{ blurb.text }}<br>
    <hr>
    {% endfor %}
    </body>
    </html>
    '''

    t = Template(template)
    c = Context({"image": infile,
                 "blurbs": blurbs})
    # Use a context manager so the output file is flushed and closed even if
    # rendering raises; encode explicitly since blurb text may be non-ASCII.
    with open(outfile, "w") as f:
        f.write(t.render(c).encode("utf-8"))

    if args.display:
        cv2.imshow('img', img)
        #cv2.imwrite('segmented.png',img)
        cv2.imshow('run_length_smoothed_or', run_length_smoothed_or)
        #cv2.imwrite('run_length_smoothed.png',run_length_smoothed_or)
        #cv2.imwrite('cleaned.png',cleaned)

        # Block until a key is pressed, then tear the windows down exactly once
        # (the original called destroyAllWindows twice).
        cv2.waitKey(0)
        cv2.destroyAllWindows()
File renamed without changes.
85 changes: 24 additions & 61 deletions ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@

import tesseract

class Blurb(object):
    """A rectangular region of OCR'd text on the page.

    Holds the bounding box (x, y, w, h), the recognized text, and the
    OCR engine's mean confidence in percent (defaults to 100.0).
    """

    def __init__(self, x, y, w, h, text, confidence=100.0):
        # Bounding-box origin and extent of the detected text region.
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        # Recognized text and the engine-reported mean confidence.
        self.text = text
        self.confidence = confidence

def draw_2d_slices(img,slices,color=(0,0,255),line_size=2):
for entry in slices:
vert=entry[0]
Expand Down Expand Up @@ -103,6 +112,7 @@ def ocr_on_bounding_boxes(img, components):
#horizontal_lines = []
#vertical_lines = []
#unk_lines = []
blurbs = []
for cc in components:
#horizontal and vertical histogram of nonzero pixels through each section
#just look for completely white sections first.
Expand All @@ -113,8 +123,8 @@ def ocr_on_bounding_boxes(img, components):
x = xs.start
y = ys.start
aspect = float(w)/float(h)
print "..............."
print " w:" + str(w) +" h:" +str(h)+ "at: " +str(x)+","+str(y)
#print "..............."
#print " w:" + str(w) +" h:" +str(h)+ "at: " +str(x)+","+str(y)

#detect vertical columns of non-zero pixels
vertical = []
Expand Down Expand Up @@ -150,54 +160,6 @@ def ocr_on_bounding_boxes(img, components):
start_row=row

if len(vertical)<2 and len(horizontal)<2:continue

#as an experiment, run OCR on all vertical columns independently, allowing us to ignore
#furigana columns when found (columns to the right of columsn that are at least 2x wider)
'''
for i,col in enumerate(vertical):
#is this furigana?
if i < len(vertical)-1:
w_current = col[1].stop-col[1].start
w_next = vertical[i+1][1].stop-vertical[i+1][1].start
if w_current < 0.5*w_next:
#this is probably furigana, continue
continue
col_w= col[1].stop-col[1].start
col_h= col[0].stop-col[0].start
col_x = col[1].start
col_y = col[0].start
#do OCR on this column only
api = tesseract.TessBaseAPI()
api.Init(".","jpn",tesseract.OEM_DEFAULT)
#handle single column lines as "vertical align" and Auto segmentation otherwise
#if len(vertical)<2:
api.SetPageSegMode(5)#tesseract.PSM_VERTICAL_ALIGN)#PSM_AUTO)#PSM_SINGLECHAR)#
#else:
# api.SetPageSegMode(tesseract.PSM_AUTO)#PSM_SINGLECHAR)#
api.SetVariable('chop_enable','T')
api.SetVariable('use_new_state_cost','F')
api.SetVariable('segment_segcost_rating','F')
api.SetVariable('enable_new_segsearch','0')
api.SetVariable('language_model_ngram_on','0')
api.SetVariable('textord_force_make_prop_words','F')
api.SetVariable('tessedit_char_blacklist', '}><L')
gray = cv2.cv.CreateImage((col_w,col_h), 8, 1)
#cv2.cv.SetImageROI(binary,((x,y),(width,height))
sub = cv2.cv.GetSubRect(cv2.cv.fromarray(img), (col_x, col_y, col_w, col_h))
#cv2.cv.copy(sub,gray)
cv2.cv.Copy(sub,gray)
#cv2.cv.CvtColor(cv2.cv.fromarray(img), gray, cv2.cv.CV_BGR2GRAY)
tesseract.SetCvImage(gray, api)
#api.SetImage("image",binary)#,w,h,0)#channel1)#,channel1)
txt=api.GetUTF8Text()
#txt=api.GetHOCRText(0)
conf=api.MeanTextConf()
#cv2.putText(img, str(conf), (x,y), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,0,0))
#image=None
#print "> %s"%txt
#print "***%d %%***"%conf
'''

'''
from http://code.google.com/p/tesseract-ocr/wiki/ControlParams
Expand Down Expand Up @@ -230,20 +192,19 @@ def ocr_on_bounding_boxes(img, components):
api.SetVariable('tessedit_char_blacklist', '}><L')

gray = cv2.cv.CreateImage((w,h), 8, 1)
#cv2.cv.SetImageROI(binary,((x,y),(width,height))
sub = cv2.cv.GetSubRect(cv2.cv.fromarray(img), (x, y, w, h))
#cv2.cv.copy(sub,gray)
cv2.cv.Copy(sub,gray)
#cv2.cv.CvtColor(cv2.cv.fromarray(img), gray, cv2.cv.CV_BGR2GRAY)
tesseract.SetCvImage(gray, api)
#api.SetImage("image",binary)#,w,h,0)#channel1)#,channel1)
txt=api.GetUTF8Text()
#txt=api.GetHOCRText(0)
conf=api.MeanTextConf()
#cv2.putText(img, str(conf), (x,y), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,0,0))
#image=None
print ": %s"%txt
print "*** %d %%***"%conf
if conf>0:
blurb = Blurb(x, y, w, h, txt, confidence=conf)
blurbs.append(blurb)

#print ": %s"%txt
#print "*** %d %%***"%conf

return blurbs


if __name__ == '__main__':
Expand Down Expand Up @@ -275,7 +236,7 @@ def ocr_on_bounding_boxes(img, components):
#areas[component]=area_nz(component,binary)
average_size = median(areas[(areas>3)&(areas<100)])
#average_size = median(areas[areas>3])
print 'Average area of component is: ' + str(average_size)
#print 'Average area of component is: ' + str(average_size)

#use multiple of average size as vertical threshold for run length smoothing
vertical_smoothing_threshold = 0.75*average_size
Expand All @@ -297,7 +258,9 @@ def ocr_on_bounding_boxes(img, components):
#draw_2d_slices(img,vertical_lines,color=(0,255,0))
#draw_bounding_boxes(img,unk_lines,color=(255,0,0),line_size=2)
#draw_2d_slices(img,unk_lines,color=(255,0,0))
ocr_on_bounding_boxes(binary, filtered)
blurbs = ocr_on_bounding_boxes(binary, filtered)
for blurb in blurbs:
print str(blurb.confidence)+'% :'+ blurb.text


#draw_bounding_boxes(img,filtered)
Expand Down

0 comments on commit 852263d

Please sign in to comment.