Skip to content


merge two graphs and add optimized graphs
Browse files Browse the repository at this point in the history
  • Loading branch information
deron committed Mar 25, 2019
1 parent d827bef commit 28b9f51
Show file tree
Hide file tree
Showing 9 changed files with 398 additions and 16 deletions.
2 changes: 1 addition & 1 deletion
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Accurate and Efficient Elevator Button Localization

OCR-RCNN-v2 is designed for autonomous elevator manipulation, the goal of which is to enable the robot to autonomously operate elevators that are previously unvisited. This repository contains the perception part of this project. We published the initial version in paper [A Novel OCR-RCNN for Elevator Button Recognition]( and this version improves the accuracy by 20% and achieves a real-time running speed (640*480 in gtx1070ti). Current version can also run in laptops with at least 2GB GPU memory. The Nvidia TX-2 compatible version will be soon released with the dataset, as well as the post-processing code.
OCR-RCNN-v2 is designed for autonomous elevator manipulation, the goal of which is to enable a robot to autonomously operate elevators it has not visited before. This repository contains the perception part of this project. We published the initial version in the paper *A Novel OCR-RCNN for Elevator Button Recognition*; this version improves the accuracy by 20% and achieves real-time running speed (at 640*480). The current version can also run on laptops with at least 2GB of GPU memory. The Nvidia TX-2 compatible version will be released soon, together with the dataset and the post-processing code.

### Requirements

Expand Down
205 changes: 205 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
#!/usr/bin/env python
import os
import imageio
import numpy as np
import tensorflow as tf
from PIL import Image, ImageDraw, ImageFont
from utils.ops import native_crop_and_resize
from utils import visualization_utils as vis_util
import tensorflow.contrib.tensorrt as trt

# Symbol -> integer code table. A symbol's code is its position in the string
# below (note there is no 'Q', 'W' or 'Y'); the final '+' stands for <nul>.
charset = {
    symbol: code
    for code, symbol in enumerate(
        '0123456789ABCDEFGHIJKLMNOPRSTUVXZ<>()$#^s-*%?!+')
}

class ButtonRecognizer:
def __init__(self, rcnn_path=None, ocr_path=None, use_trt=False, precision='FP16', use_optimized=False):
    """Record the configuration and reset the recognizer's runtime state.

    Args:
      rcnn_path: path of the button-detection graph file, or None.
      ocr_path: path of the character-recognition graph file, or None.
      use_trt: stored as self.use_trt.
      precision: precision mode string, one of 'INT8', 'FP16', 'FP32'.
      use_optimized: stored as self.use_optimized.
    """
    # user configuration
    self.rcnn_graph_path = rcnn_path
    self.ocr_graph_path = ocr_path
    self.use_trt = use_trt
    self.precision = precision  # 'INT8, FP16, FP32'
    self.use_optimized = use_optimized

    # runtime handles, all unset at construction time
    self.session = None
    self.rcnn_input = None
    self.rcnn_output = None
    self.ocr_input = None
    self.ocr_output = None

    # fixed model parameters
    self.class_num = 1
    self.image_size = [480, 640]
    self.recognition_size = [180, 180]
    self.category_index = {1: {'id': 1, 'name': u'button'}}

    # reverse lookup of charset: integer code -> symbol
    self.idx_lbl = {code: symbol for symbol, code in charset.items()}
    print('Button recognizer initialized!')

def __del__(self):

def optimize_rcnn(self, input_graph_def):
    """Run the detection graph definition through trt.create_inference_graph.

    Args:
      input_graph_def: graph definition of the button-detection model.

    Returns:
      The graph definition returned by trt.create_inference_graph, built
      with batch size 1 and self.precision as the precision mode.
    """
    trt_graph = trt.create_inference_graph(
        # The graph to convert must be handed over explicitly; without it
        # the converter has nothing to work on (optimize_ocr passes it too).
        input_graph_def=input_graph_def,
        outputs=['detection_boxes', 'detection_scores', 'detection_classes', 'num_detections'],
        max_batch_size=1,
        # max_workspace_size_bytes=(2 << 10) << 20,
        precision_mode=self.precision)
    return trt_graph

def optimize_ocr(self, input_graph_def):
    """Run the recognition graph definition through trt.create_inference_graph.

    Args:
      input_graph_def: graph definition of the character-recognition model.

    Returns:
      The graph definition returned by trt.create_inference_graph, built
      with batch size 1 and self.precision as the precision mode.
    """
    # max_workspace_size_bytes=(2 << 10) << 20 is deliberately left unset
    return trt.create_inference_graph(
        input_graph_def=input_graph_def,
        outputs=['predicted_chars', 'predicted_scores'],
        max_batch_size=1,
        precision_mode=self.precision)

def load_and_merge_graphs(self):
    """Import both frozen graphs into one tf.Graph and open a session on it.

    The detection graph is imported under the name 'detection' and the
    recognition graph under 'recognition'. The valid detection boxes are
    handed, together with the detector input, to native_crop_and_resize
    with self.recognition_size. On return self.rcnn_input,
    self.rcnn_output, self.ocr_input, self.ocr_output and self.session
    are set.

    Raises:
      IOError: if one of the two graph files does not exist.
    """
    def read_graph_def(path):
        # Parse a serialized graph definition from `path`.
        graph_def = tf.GraphDef()
        with tf.gfile.GFile(path, 'rb') as fid:
            # NOTE(review): the read/parse calls were truncated in the
            # reviewed source ('serialized_graph =' with nothing after it);
            # restored with the usual frozen-graph idiom -- please confirm.
            serialized_graph = fid.read()
            graph_def.ParseFromString(serialized_graph)
        return graph_def

    # fall back to the default graph locations
    if self.ocr_graph_path is None:
        self.ocr_graph_path = './frozen_model/ocr_graph.pb'
    if self.rcnn_graph_path is None:
        self.rcnn_graph_path = './frozen_model/detection_graph_640x480.pb'

    if self.use_optimized:
        # str.replace returns a new string, so the result must be stored;
        # previously it was discarded and use_optimized had no effect.
        # The endswith guard keeps a second call from appending the suffix twice.
        suffix = '_optimized.pb'
        if not self.ocr_graph_path.endswith(suffix):
            self.ocr_graph_path = self.ocr_graph_path.replace('.pb', suffix)
        if not self.rcnn_graph_path.endswith(suffix):
            self.rcnn_graph_path = self.rcnn_graph_path.replace('.pb', suffix)

    # explicit check instead of assert, which is stripped under `python -O`
    for graph_path in (self.ocr_graph_path, self.rcnn_graph_path):
        if not os.path.exists(graph_path):
            raise IOError('Invalid graph path! {}'.format(graph_path))

    # merge the frozen graphs
    ocr_rcnn_graph = tf.Graph()
    with ocr_rcnn_graph.as_default():

        # load button detection graph definition
        detection_graph_def = read_graph_def(self.rcnn_graph_path)
        if self.use_trt:
            detection_graph_def = self.optimize_rcnn(detection_graph_def)
        tf.import_graph_def(detection_graph_def, name='detection')

        # load character recognition graph definition
        recognition_graph_def = read_graph_def(self.ocr_graph_path)
        if self.use_trt:
            recognition_graph_def = self.optimize_ocr(recognition_graph_def)
        tf.import_graph_def(recognition_graph_def, name='recognition')

        # retrieve detection tensors
        rcnn_input = ocr_rcnn_graph.get_tensor_by_name('detection/image_tensor:0')
        rcnn_boxes = ocr_rcnn_graph.get_tensor_by_name('detection/detection_boxes:0')
        rcnn_scores = ocr_rcnn_graph.get_tensor_by_name('detection/detection_scores:0')
        rcnn_number = ocr_rcnn_graph.get_tensor_by_name('detection/num_detections:0')

        # crop and resize valid boxes (only valid when rcnn input has a known shape)
        rcnn_number = tf.to_int32(rcnn_number)
        valid_boxes = tf.slice(rcnn_boxes, [0, 0, 0], [1, rcnn_number[0], 4])
        ocr_boxes = native_crop_and_resize(rcnn_input, valid_boxes, self.recognition_size)

        # retrieve recognition tensors
        ocr_input = ocr_rcnn_graph.get_tensor_by_name('recognition/ocr_input:0')
        ocr_chars = ocr_rcnn_graph.get_tensor_by_name('recognition/predicted_chars:0')
        ocr_beliefs = ocr_rcnn_graph.get_tensor_by_name('recognition/predicted_scores:0')

        self.rcnn_input = rcnn_input
        self.rcnn_output = [rcnn_boxes, rcnn_scores, rcnn_number, ocr_boxes]
        self.ocr_input = ocr_input
        self.ocr_output = [ocr_chars, ocr_beliefs]

    self.session = tf.Session(graph=ocr_rcnn_graph)

def clear_session(self):
if self.session is not None:

def decode_text(self, codes, scores):
    """Translate character codes into text and an average score.

    Codes whose label in self.idx_lbl is '+' (the <nul> symbol) are
    dropped together with their scores.

    Args:
      codes: iterable of character codes; each must be a key of
        self.idx_lbl.
      scores: iterable of per-character scores, aligned with codes.

    Returns:
      A (text, score_ave) tuple: the decoded string and the mean score of
      the characters kept in it. When no character is kept the result is
      ('', 0.0).

    Raises:
      KeyError: if a code is not present in self.idx_lbl.
    """
    score_sum = 0
    kept = []
    for code, score in zip(codes, scores):
        label = self.idx_lbl[code]
        if label == '+':
            continue
        score_sum += score
        kept.append(label)
    text = ''.join(kept)
    if not text:
        # Every position was <nul> (or the input was empty): dividing by
        # len(text) would raise ZeroDivisionError.
        return text, 0.0
    return text, score_sum / len(text)

def predict(self, image_np, draw=False):
# input data
assert image_np.shape == (480, 640, 3)
img_in = np.expand_dims(image_np, axis=0)

# output data
recognition_list = []

# perform detection and recognition
boxes, scores, number, ocr_boxes =, feed_dict={self.rcnn_input:img_in})
boxes, scores, number = [np.squeeze(x) for x in [boxes, scores, number]]

for i in range(number):
if scores[i] < 0.5: continue
chars, beliefs =, feed_dict={self.ocr_input: ocr_boxes[:,i]})
chars, beliefs = [np.squeeze(x) for x in [chars, beliefs]]
text, belief = self.decode_text(chars, beliefs)
recognition_list.append([boxes[i], scores[i], text, belief])

if draw:
classes = [1]*len(boxes)
self.draw_detection_result(image_np, boxes, classes, scores, self.category_index)
self.draw_recognition_result(image_np, recognition_list)

return recognition_list

def draw_detection_result(image_np, boxes, classes, scores, category, predict_chars=None):

def draw_recognition_result(self, image_np, recognitions):
    """Write each recognized text onto its button region of image_np.

    The image is modified in place; nothing is returned.

    Args:
      image_np: image array to draw on.
        # NOTE(review): assumed to be an HxWx3 uint8 array matching
        # self.image_size -- confirm against the caller.
      recognitions: iterable of items where item[0] is a box
        (y_min, x_min, y_max, x_max) that is scaled by self.image_size
        and item[2] is the text to draw.
    """
    def load_font(size):
        # The Arial path below is a macOS font location; fall back to
        # Pillow's built-in font when the file cannot be opened.
        try:
            return ImageFont.truetype('/Library/Fonts/Arial.ttf', size)
        except (IOError, OSError):
            return ImageFont.load_default()

    img_height, img_width = self.image_size
    for item in recognitions:
        box, text = item[0], item[2]
        # crop button patches; clamp at 0 because a negative start index
        # would make the slice wrap around to the other side of the image
        y_min = max(0, int(box[0] * img_height))
        x_min = max(0, int(box[1] * img_width))
        y_max = int(box[2] * img_height)
        x_max = int(box[3] * img_width)
        button_patch = image_np[y_min: y_max, x_min: x_max]
        if button_patch.size == 0:
            # degenerate box: nothing to draw on
            continue
        patch_height, patch_width = button_patch.shape[:2]
        # generate image layer for drawing
        img_pil = Image.fromarray(button_patch)
        img_show = ImageDraw.Draw(img_pil)
        # draw at a proper location
        x_center = patch_width / 2.0
        y_center = patch_height / 2.0
        font_size = min(x_center, y_center) * 1.1
        text_center = int(x_center - 0.5 * font_size), int(y_center - 0.5 * font_size)
        # a font size of 0 is rejected by the font loader, so use at least 1
        font = load_font(max(1, int(font_size)))
        img_show.text(text_center, text=text, font=font, fill=(255, 0, 255))
        image_np[y_min: y_min + patch_height, x_min: x_min + patch_width] = np.array(img_pil)

if __name__ == '__main__':
    # Smoke run: recognize the buttons of one sample panel with drawing
    # enabled, then wrap the panel array in a PIL image.
    recognizer = ButtonRecognizer(use_optimized=True)
    panel_np = imageio.imread('./test_panels/1.jpg')
    recognition_list = recognizer.predict(panel_np, draw=True)
    image = Image.fromarray(panel_np)
24 changes: 12 additions & 12 deletions
Original file line number Diff line number Diff line change
Expand Up @@ -40,22 +40,22 @@ def init_recognizer(self):
raise IOError('Invalid ocr_graph path! {}'.format(self.graph_path))

# load frozen graph
detection_graph = tf.Graph()
with detection_graph.as_default():
recognition_graph = tf.Graph()
with recognition_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(self.graph_path, 'rb') as fid:
serialized_graph =
tf.import_graph_def(od_graph_def, name='')
self.session = tf.Session(graph=detection_graph)
self.session = tf.Session(graph=recognition_graph)

# prepare input and output request
self.input = detection_graph.get_tensor_by_name('ocr_input:0')
# self.output.append(detection_graph.get_tensor_by_name('chars_logit:0'))
# self.output.append(detection_graph.get_tensor_by_name('chars_log_prob:0'))
# self.output.append(detection_graph.get_tensor_by_name('predicted_text:0'))
self.input = recognition_graph.get_tensor_by_name('ocr_input:0')
# self.output.append(recognition_graph.get_tensor_by_name('chars_logit:0'))
# self.output.append(recognition_graph.get_tensor_by_name('chars_log_prob:0'))
# self.output.append(recognition_graph.get_tensor_by_name('predicted_text:0'))

def clear_session(self):
if self.session is not None:
Expand All @@ -76,15 +76,15 @@ def predict(self, image_np, draw=False):
score_ave /= len(text)

if self.verbose:
self.visualize_detection_result(image_np, text, score_ave)
self.visualize_recognition_result(image_np, text, score_ave)

img_show = self.draw_result(image_np, text, score_ave) if draw else image_np

return text, score_ave, np.array(img_show)

def visualize_detection_result(image_np, text, scores):
def visualize_recognition_result(image_np, text, scores):
img_pil = Image.fromarray(image_np)
img_show = ImageDraw.Draw(img_pil)
font = ImageFont.truetype('/Library/Fonts/Arial.ttf', 60)
Expand All @@ -102,7 +102,7 @@ def draw_result(image_np, text, scores):

if __name__ == '__main__':
recognizer = CharacterRecognizer(verbose=False)
image = imageio.imread('./test_buttons/7_14.png')
image = imageio.imread('./test_buttons/0_0.png')
_, _, img =recognizer.predict(image,True)
image = Image.fromarray(img)
Expand Down
3 changes: 2 additions & 1 deletion →
Original file line number Diff line number Diff line change
Expand Up @@ -74,5 +74,6 @@ def get_image_name_list(target_path):

average_time = overall_time / len(data_list)


12 changes: 10 additions & 2 deletions →
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import print_function
import os
import cv2
import imageio
import PIL.Image
import numpy as np
import tensorflow as tf
Expand Down Expand Up @@ -41,11 +42,18 @@ def get_image_name_list(target_path):
return image_name_list

def warm_up(detector, recognizer):
image = imageio.imread('./test_panels/1.jpg')
button = imageio.imread('./test_buttons/0_0.png')

if __name__ == '__main__':
data_dir = './test_panels'
data_list = get_image_name_list(data_dir)
detector = ButtonDetector()
recognizer = CharacterRecognizer(verbose=False)
warm_up(detector, recognizer)
overall_time = 0
for data in data_list:
img_path = os.path.join(data_dir, data+'.jpg')
Expand All @@ -59,9 +67,9 @@ def get_image_name_list(target_path):
time = (t1-t0)/cv2.getTickFrequency()
overall_time += time
print('Time elapsed: {}'.format(time))

average_time = overall_time / len(data_list)


56 changes: 56 additions & 0 deletions
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python
from __future__ import print_function
import os
import cv2
import imageio
import PIL.Image
import PIL.ImageOps as ImageOps
import numpy as np
import tensorflow as tf
from button_recognition import ButtonRecognizer

DRAW = False

def get_image_name_list(target_path):
assert os.path.exists(target_path)
image_name_list = []
file_set = os.walk(target_path)
for root, dirs, files in file_set:
for image_name in files:
return image_name_list

def warm_up(model):
assert isinstance(model, ButtonRecognizer)
image = imageio.imread('./test_panels/1.jpg')

if __name__ == '__main__':
data_dir = './test_panels'
data_list = get_image_name_list(data_dir)
recognizer = ButtonRecognizer(use_optimized=True)
overall_time = 0
for data in data_list:
img_path = os.path.join(data_dir, data+'.jpg')
image =
# resize to 640x480 with ratio kept
img_thumbnail = image.thumbnail((640, 480), PIL.Image.ANTIALIAS)
delta_w, delta_h= 640 - image.size[0], 480 - image.size[1]
padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
new_im = ImageOps.expand(image, padding)
img_np = np.copy(np.asarray(new_im))
# perform button recognition
t0 = cv2.getTickCount()
recognizer.predict(img_np, draw=DRAW)
t1 = cv2.getTickCount()
time = (t1-t0)/cv2.getTickFrequency()
overall_time += time
print('Time elapsed: {}'.format(time))
if DRAW:
image = PIL.Image.fromarray(img_np)

average_time = overall_time / len(data_list)
print('Average_used: {}'.format(average_time))

0 comments on commit 28b9f51

Please sign in to comment.