
Commit

finished evaluation, but IOU is only 41.0, vs 48.5 of Matconvnet
bertinetto committed May 25, 2017
1 parent be52a25 commit f6e6dc2
Showing 6 changed files with 95 additions and 64 deletions.
10 changes: 5 additions & 5 deletions parameters/hyperparams.json
@@ -1,11 +1,11 @@
 {
     "response_up": 8,
-    "window_influence": 0.25,
-    "z_lr": 0.01,
+    "window_influence": 0.175,
+    "z_lr": 0.0102,
     "scale_num": 3,
-    "scale_step": 1.04,
-    "scale_penalty": 0.97,
-    "scale_lr": 0.59,
+    "scale_step": 1.047,
+    "scale_penalty": 0.9825,
+    "scale_lr": 0.68,
     "scale_min": 0.2,
     "scale_max": 5
 }
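
For context, the three scale_* parameters above drive the search-scale pyramid built in src/tracker.py (the scale_factors line in the diff below). A minimal sketch of that computation with the new values, assuming the repo's Python 2 semantics (integer division makes np.ceil(scale_num/2) equal 1 for scale_num=3):

    import numpy as np

    scale_step, scale_num = 1.047, 3
    # one multiplicative factor per tested scale, centred on 1.0
    scale_factors = scale_step ** np.linspace(-1, 1, scale_num)
    print(scale_factors)   # ~[0.9551, 1.0, 1.047]: shrink, keep, grow the search area

scale_penalty then (presumably) discounts the responses of the two off-centre scales, and scale_lr damps how quickly the tracked size adapts.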
88 changes: 59 additions & 29 deletions run_tracker_evaluation.py
@@ -10,35 +10,63 @@
from src.pprint_params import pprint_params

def main():
    # avoid printing TF debugging information
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # TODO: this will be passed to the main function
-    hp = {"z_lr":0.006}
-    evaluation = {"video": "vot2016_helicopter"}
+    hp = {}
+    evaluation = {"video": "all"}
    run = {"visualization":0,"debug":0}

    # read all default parameters and overwrite ones defined by user
    hp,evaluation,run,env,design = parse_arguments(hp, evaluation, run)
+    final_score_sz = int(hp.response_up * design.score_sz)
+    # build TF graph once for all
+    filename, image, templates_z, scores = siam.build_tracking_graph(final_score_sz, design, env)

    # iterate through all videos of evaluation.dataset
    if evaluation.video=='all':
        dataset_folder = os.path.join(env.root_dataset, evaluation.dataset)
        videos_list = [v for v in os.listdir(dataset_folder)]
        videos_list.sort()
-        for i in range(np.size(videos_list)):
+        nv = np.size(videos_list)
+        speed = np.zeros(nv)
+        ious = np.zeros(nv)
+        lengths = np.zeros(nv)
+        for i in range(nv):
            gt, frame_name_list, frame_sz, pos_x, pos_y, target_w, target_h = _init_video(env, evaluation, videos_list[i])
-            bboxes, speed = tracker(hp, evaluation, run, env, design, frame_name_list, frame_sz, pos_x, pos_y, target_w, target_h)
+            bboxes, speed[i] = tracker(hp, evaluation, run, design, frame_name_list, pos_x, pos_y, target_w, target_h, final_score_sz, filename, image, templates_z, scores)
+            lengths[i], ious[i] = _compile_results(gt, bboxes, videos_list[i])
+            print str(i)+' -- '+videos_list[i]+' -- IOU: '+("%.2f" % ious[i])+' -- Speed: '+("%.2f" % speed[i])+' --'
+            print

+        tot_frames = np.sum(lengths)
+        mean_iou = np.sum(ious*lengths)/tot_frames
+        mean_speed = np.sum(speed*lengths)/tot_frames
+        print '-- Overall stats (averaged per frame) on '+str(nv)+' videos ('+str(tot_frames)+' frames) --'
+        print '-- IOU: '+("%.2f" % mean_iou)+' -- Speed: '+("%.2f" % mean_speed)+ ' --'
+        print

    else:
        gt, frame_name_list, frame_sz, pos_x, pos_y, target_w, target_h = _init_video(env, evaluation, evaluation.video)
-        bboxes, speed = tracker(hp, evaluation, run, env, design, frame_name_list, frame_sz, pos_x, pos_y, target_w, target_h)
+        bboxes, speed = tracker(hp, evaluation, run, design, frame_name_list, pos_x, pos_y, target_w, target_h, final_score_sz, filename, image, templates_z, scores)
+        _, iou = _compile_results(gt, bboxes, evaluation.video)
+        print evaluation.video+' -- IOU: '+("%.2f" % iou)+' -- Speed: '+("%.2f" % speed)+' --'

+def _compile_results(gt, bboxes, video):
+    l = np.size(bboxes,0)
+    gt4 = np.zeros((l, 4))
+    new_ious = np.zeros(l)
+    # np.savetxt('out/'+video+'.bboxes', bboxes, delimiter=',')
+    # np.savetxt('out/'+video+'.gt', gt, delimiter=',')
+    for j in range(l):
+        gt4[j, :] = region_to_bbox(gt[j, :], center=False)
+        new_ious[j] = _compute_iou(bboxes[j,:], gt4[j,:])

-    n_bboxes = np.shape(bboxes)[0]
-    ious = np.zeros(n_bboxes)
-    for i in range(n_bboxes):
-        ious[i] = _compute_iou(bboxes[i,:], gt[i])
+    iou = np.mean(new_ious)*100

+    return l, iou

-    print ious
-    print 'Average IOU: '+str(np.mean(ious))

def _init_video(env, evaluation, video):
    video_folder = os.path.join(env.root_dataset, evaluation.dataset, video)
    frame_name_list = [f for f in os.listdir(video_folder) if f.endswith(".jpg")]
@@ -54,31 +82,33 @@ def _init_video(env, evaluation, video):
    assert len(gt) == len(frame_name_list), ('Number of frames and number of GT lines should be equal.')
    ## tracker's state initializations, bbox is in format <cx,cy,w,h>
    pos_x, pos_y, target_w, target_h = region_to_bbox(gt[evaluation.start_frame])

    return gt, frame_name_list, frame_sz, pos_x, pos_y, target_w, target_h

-def _compute_iou(boxA, boxB):
-    boxA = region_to_bbox(boxA, center=False)
-    boxB = region_to_bbox(boxB, center=False)
+def _compute_iou(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
    yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])

-    # compute the area of intersection rectangle
-    interArea = (xB - xA) * (yB - yA)
-
-    # compute the area of both the prediction and ground-truth
-    # rectangles
-    boxAArea = boxA[2] * boxA[3]
-    boxBArea = boxB[2] * boxB[3]
-
-    # compute the intersection over union by taking the intersection
-    # area and dividing it by the sum of prediction + ground-truth
-    # areas - the interesection area
-    iou = max(0, interArea / float(boxAArea + boxBArea - interArea))
-
-    # return the intersection over union value
+    if xA<xB and yA<yB:
+        # compute the area of intersection rectangle
+        interArea = (xB - xA) * (yB - yA)
+        # compute the area of both the prediction and ground-truth
+        # rectangles
+        boxAArea = boxA[2] * boxA[3]
+        boxBArea = boxB[2] * boxB[3]
+        # compute the intersection over union by taking the intersection
+        # area and dividing it by the sum of prediction + ground-truth
+        # areas - the intersection area
+        iou = interArea / float(boxAArea + boxBArea - interArea)
+    else:
+        iou = 0

+    assert iou >=0
+    assert iou <= 1.01

    return iou

if __name__ == '__main__':
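
The overall stats printed above are frame-weighted averages, not per-video averages: each video's IOU and speed are weighted by its frame count. A tiny worked example with made-up numbers:

    import numpy as np

    ious = np.array([40.0, 50.0])       # hypothetical per-video IOU (%)
    lengths = np.array([100.0, 300.0])  # frames per video
    per_video = np.mean(ious)                             # 45.0
    per_frame = np.sum(ious * lengths) / np.sum(lengths)  # 47.5: long videos weigh more
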
6 changes: 4 additions & 2 deletions src/crops.py
@@ -2,15 +2,17 @@
import numpy as np
import tensorflow as tf

-def pad_frame(im, frame_sz, pos_x, pos_y, patch_sz):
+def pad_frame(im, frame_sz, pos_x, pos_y, patch_sz, avg_chan):
    xleft_pad = tf.maximum(0, tf.cast(-tf.round(pos_x-patch_sz/2), tf.int32))
    ytop_pad = tf.maximum(0, tf.cast(-tf.round(pos_y-patch_sz/2), tf.int32))
    xright_pad = tf.maximum(0, tf.cast(tf.round(pos_x+patch_sz/2)-frame_sz[1], tf.int32))
    ybottom_pad = tf.maximum(0, tf.cast(tf.round(pos_y+patch_sz/2)-frame_sz[0], tf.int32))
    npad = tf.reduce_max([xleft_pad,ytop_pad,xright_pad,ybottom_pad])
    paddings = [[npad,npad],[npad,npad],[0,0]]
    im_padded = im
+    # im_padded = im_padded - avg_chan
    im_padded = tf.pad(im_padded, paddings, mode='CONSTANT')
+    # im_padded = im_padded + avg_chan
    return im_padded, npad

def extract_crops_z(im, npad, pos_x, pos_y, sz_src, sz_dst):
@@ -35,7 +37,7 @@ def extract_crops_x(im, npad, pos_x, pos_y, sz_src0, sz_src1, sz_src2, sz_dst):
    search_area = tf.image.crop_to_bounding_box(im, tf.cast(tr_y,tf.int32), tf.cast(tr_x, tf.int32), tf.cast(sz_src2, tf.int32), tf.cast(sz_src2, tf.int32))
    offset_s0 = (sz_src2-sz_src0)/2
    offset_s1 = (sz_src2-sz_src1)/2

    crop_s0 = tf.image.crop_to_bounding_box(search_area, tf.cast(offset_s0,tf.int32), tf.cast(offset_s0,tf.int32), tf.cast(sz_src0,tf.int32), tf.cast(sz_src0,tf.int32))
    crop_s0 = tf.image.resize_images(crop_s0, [sz_dst,sz_dst], method=tf.image.ResizeMethod.BILINEAR)
    crop_s1 = tf.image.crop_to_bounding_box(search_area, tf.cast(offset_s1,tf.int32), tf.cast(offset_s1,tf.int32), tf.cast(sz_src1,tf.int32), tf.cast(sz_src1,tf.int32))
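
pad_frame now takes avg_chan, the per-channel frame mean computed in siamese.py, so out-of-frame regions can be padded with the average colour instead of black; the two commented-out lines sketch the trick (shift by the mean, zero-pad, shift back). The same idea in plain numpy, as an illustration rather than the repo's code:

    import numpy as np

    im = np.random.randint(0, 256, (50, 60, 3)).astype(np.float32)
    avg_chan = im.mean(axis=(0, 1))   # one mean per colour channel, shape (3,)
    npad = 8
    # zero-padding the mean-shifted image, then adding the mean back,
    # fills the border with avg_chan while leaving the interior untouched
    padded = np.pad(im - avg_chan, ((npad, npad), (npad, npad), (0, 0)), mode='constant') + avg_chan
    assert np.allclose(padded[0, 0], avg_chan)
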
29 changes: 15 additions & 14 deletions src/region_to_bbox.py
@@ -2,26 +2,27 @@

def region_to_bbox(region, center=True):

-    n = len(region)
-    assert n==4 or n==8, ('GT region format is invalid, should have 4 or 8 entries.')
+    n = len(region)
+    assert n==4 or n==8, ('GT region format is invalid, should have 4 or 8 entries.')

-    if n==4:
-        return _rect(region, center)
-    else:
-        return _poly(region, center)
+    if n==4:
+        return _rect(region, center)
+    else:
+        return _poly(region, center)

def _rect(region, center):
-    x = region[0]
-    y = region[1]
-    w = region[2]
-    h = region[3]
-    cx = x+w/2
-    cy = y+h/2
-
+
    if center:
+        x = region[0]
+        y = region[1]
+        w = region[2]
+        h = region[3]
+        cx = x+w/2
+        cy = y+h/2
        return cx, cy, w, h
    else:
-        return x, y, w, h
+        return region


def _poly(region, center):
    cx = np.mean(region[::2])
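
A quick usage sketch of the streamlined _rect path for 4-entry <x,y,w,h> regions, matching the new behaviour:

    from src.region_to_bbox import region_to_bbox

    region = [10.0, 20.0, 40.0, 30.0]            # <x,y,w,h>
    print(region_to_bbox(region))                # (30.0, 35.0, 40.0, 30.0), i.e. <cx,cy,w,h>
    print(region_to_bbox(region, center=False))  # the input region, now returned as-is
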
11 changes: 7 additions & 4 deletions src/siamese.py
@@ -38,12 +38,13 @@ def build_tracking_graph(final_score_sz, design, env):
    # Decode the image as a JPEG file, this will turn it into a Tensor
    image = tf.cast(tf.image.decode_jpeg(image_file), tf.int32)
    frame_sz = tf.cast(tf.shape(image), tf.float64)
-    # frame_sz[1], frame_sz[0] = frame_sz[0], frame_sz[1]
+    # used to pad the crops
+    avg_chan = tf.cast(tf.reduce_mean(image, axis=(0,1)), tf.int32)
    # pad with if necessary
-    frame_padded_z, npad_z = pad_frame(image, frame_sz, pos_x_ph, pos_y_ph, z_sz_ph);
-    # extract tensor of z_crops (all identical)
+    frame_padded_z, npad_z = pad_frame(image, frame_sz, pos_x_ph, pos_y_ph, z_sz_ph, avg_chan);
+    # extract tensor of z_crops
    z_crops = extract_crops_z(frame_padded_z, npad_z, pos_x_ph, pos_y_ph, z_sz_ph, design.exemplar_sz)
-    frame_padded_x, npad_x = pad_frame(image, frame_sz, pos_x_ph, pos_y_ph, x_sz2_ph);
+    frame_padded_x, npad_x = pad_frame(image, frame_sz, pos_x_ph, pos_y_ph, x_sz2_ph, avg_chan);
    # extract tensor of x_crops (3 scales)
    x_crops = extract_crops_x(frame_padded_x, npad_x, pos_x_ph, pos_y_ph, x_sz0_ph, x_sz1_ph, x_sz2_ph, design.search_sz)
    # use crops as input of (MatConvnet imported) pre-trained fully-convolutional Siamese net
@@ -103,6 +104,8 @@ def _create_siamese(net_path, net_x, net_z):
        net_x = tf.nn.max_pool(net_x, [1,_pool_sz,_pool_sz,1], strides=[1,_pool_stride[i],_pool_stride[i],1], padding='VALID', name='pool'+str(i+1))
        net_z = tf.nn.max_pool(net_z, [1,_pool_sz,_pool_sz,1], strides=[1,_pool_stride[i],_pool_stride[i],1], padding='VALID', name='pool'+str(i+1))

+    print
+
    return net_z, net_x, params_names_list, params_values_list

def _import_from_matconvnet(net_path):
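
The structural thread running through this commit: build_tracking_graph is now invoked once, up front, and its tensors (filename, image, templates_z, scores) are handed to tracker() for every video instead of rebuilding the graph per video. Condensed from the diffs above (not runnable on its own):

    # graph built once, then reused across videos (condensed from run_tracker_evaluation.py)
    final_score_sz = int(hp.response_up * design.score_sz)
    filename, image, templates_z, scores = siam.build_tracking_graph(final_score_sz, design, env)
    for i in range(nv):
        bboxes, speed[i] = tracker(hp, evaluation, run, design, frame_name_list,
                                   pos_x, pos_y, target_w, target_h,
                                   final_score_sz, filename, image, templates_z, scores)
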
15 changes: 5 additions & 10 deletions src/tracker.py
@@ -17,14 +17,13 @@
#os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(gpu_device)

# read default parameters and override with custom ones
-def tracker(hp, evaluation, run, env, design, frame_name_list, frame_sz, pos_x, pos_y, target_w, target_h):
+def tracker(hp, evaluation, run, design, frame_name_list, pos_x, pos_y, target_w, target_h, final_score_sz, filename, image, templates_z, scores):
    num_frames = np.size(frame_name_list)
    # stores tracker's output for evaluation
    bboxes = np.zeros((num_frames,4))

    scale_factors = hp.scale_step**np.linspace(-np.ceil(hp.scale_num/2), np.ceil(hp.scale_num/2), hp.scale_num)
-    # cosine window to penalize large displacements
-    final_score_sz = int(hp.response_up * design.score_sz)
+    # cosine window to penalize large displacements
    hann_1d = np.expand_dims(np.hanning(final_score_sz), axis=0)
    penalty = np.transpose(hann_1d) * hann_1d
    penalty = penalty / np.sum(penalty)
@@ -39,8 +38,6 @@ def tracker(hp, evaluation, run, env, design, frame_name_list, frame_sz, pos_x,
    min_x = hp.scale_min * x_sz
    max_x = hp.scale_max * x_sz

-    filename, image, templates_z, scores = siam.build_tracking_graph(final_score_sz, design, env)
-
    # run_metadata = tf.RunMetadata()
    # run_opts = {
    #     'options': tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
@@ -58,8 +55,7 @@ def tracker(hp, evaluation, run, env, design, frame_name_list, frame_sz, pos_x,
        threads = tf.train.start_queue_runners(coord=coord)

        # save first frame position (from ground-truth)
-        bboxes[0,:] = pos_x-target_w/2, pos_y-target_h/2, target_w, target_h
-
+        bboxes[0,:] = pos_x-target_w/2, pos_y-target_h/2, target_w, target_h

        image_, templates_z_ = sess.run([image, templates_z], feed_dict={
            siam.pos_x_ph: pos_x,
@@ -108,8 +104,7 @@ def tracker(hp, evaluation, run, env, design, frame_name_list, frame_sz, pos_x,
            pos_x, pos_y = _update_target_position(pos_x, pos_y, score_, final_score_sz, design.tot_stride, design.search_sz, hp.response_up, x_sz)
            # convert <cx,cy,w,h> to <x,y,w,h> and save output
            bboxes[i,:] = pos_x-target_w/2, pos_y-target_h/2, target_w, target_h
-            print 'Frame '+str(i)+': ('+str(bboxes[i,0])+', '+str(bboxes[i,1])+', '+str(bboxes[i,2])+', '+str(bboxes[i,3])+')'
-
+            # print 'Frame '+str(i)+': ('+str(bboxes[i,0])+', '+str(bboxes[i,1])+', '+str(bboxes[i,2])+', '+str(bboxes[i,3])+')'
            # update the target representation with a rolling average
            if hp.z_lr>0:
                new_templates_z_ = sess.run([templates_z], feed_dict={
@@ -129,7 +124,7 @@ def tracker(hp, evaluation, run, env, design, frame_name_list, frame_sz, pos_x,
    t_elapsed = time.time() - t_start
    speed = (num_frames-evaluation.start_frame+1)/t_elapsed
-    print 'Speed: '+str(speed)
+
    # Finish off the filename queue coordinator.
    coord.request_stop()
    coord.join(threads)
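
For reference, the cosine window kept by this hunk is the displacement prior that window_influence (0.175 in the new hyperparams.json) mixes into the upsampled score map. A sketch of the construction plus the usual SiamFC-style blend; the blending line itself falls outside this hunk, so its exact form here is an assumption:

    import numpy as np

    final_score_sz = int(8 * 33)  # hp.response_up * design.score_sz, with score_sz assumed 33
    hann_1d = np.expand_dims(np.hanning(final_score_sz), axis=0)
    penalty = np.transpose(hann_1d) * hann_1d
    penalty = penalty / np.sum(penalty)

    window_influence = 0.175
    score = np.random.rand(final_score_sz, final_score_sz)  # stand-in for the net's response map
    score = (1 - window_influence) * score + window_influence * penalty
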
