Skip to content

Commit

Permalink
cosine learning rate
Browse files Browse the repository at this point in the history
  • Loading branch information
StinkyTofu95 committed Feb 28, 2019
1 parent 18299e6 commit b8b1848
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 173 deletions.
14 changes: 6 additions & 8 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,18 @@
# NOTE(review): this listing is a rendered diff of config.py with the old and
# new lines interleaved. BATCH_SIZE, LEARN_RATE_INIT and MAX_PERIODS are each
# assigned twice below; read top to bottom, the later assignment is the one
# that takes effect.
IOU_LOSS_THRESH = 0.5

# train
BATCH_SIZE = 32
# Read by the plateau-based schedule in train.py (batch size applied through
# batch_size_change once all weights are being trained).
BATCH_SIZE_STEP2 = 6
LEARN_RATE_INIT = 1e-3
# Read by the plateau-based schedule in train.py: limit on how many times the
# learn rate is divided by 10.
MAX_LEARN_RATE_DECAY_TIME = 2
# Read by the plateau-based schedule in train.py: number of test-loss increases
# that triggers a learn-rate decay.
MAX_WAVE_TIME = 2
MAX_PERIODS = 25
BATCH_SIZE = 6
# Learn rate reached at the end of warmup; the cosine decay in train.py starts
# from this value.
LEARN_RATE_INIT = 1e-4
# Learn rate the cosine decay in train.py ends at.
LEARN_RATE_END = 1e-6
# Warmup length in periods; train.py multiplies it by the steps per period.
WARMUP_PERIODS = 2
# Period at which train.py switches to the train op that updates all variables
# (appears to apply only when FROZEN is False - TODO confirm nesting).
PERIODS_FOR_STEP0 = 20
# Number of periods train.py iterates over; also sets the length of the cosine
# schedule together with the steps per period.
MAX_PERIODS = 30
# Three anchor (width, height) pairs per detection scale.
# NOTE(review): units are not visible here - presumably relative to the
# stride of each scale; confirm against the data/loss code.
ANCHORS = [[(1.25, 1.625), (2.0, 3.75), (4.125, 2.875)], # Anchors for small obj
[(1.875, 3.8125), (3.875, 2.8125), (3.6875, 7.4375)], # Anchors for medium obj
[(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]] # Anchors for big obj
# Read by train.py to choose between the frozen-variables and all-variables
# training schedule.
FROZEN = True

ANCHOR_PER_SCALE = 3
# Decay passed to tf.train.ExponentialMovingAverage in train.py.
MOVING_AVE_DECAY = 0.9995
# train.py skips the rest of a period when (period + 1) % SAVE_ITER is non-zero.
SAVE_ITER = 1
MAX_BBOX_PER_SCALE = 150

# test
Expand Down
162 changes: 81 additions & 81 deletions mAP/results/results.txt

Large diffs are not rendered by default.

101 changes: 48 additions & 53 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,30 @@
import tensorflow as tf
import numpy as np
import os
import argparse
import logging
import time


class YoloTrain(object):
def __init__(self):
self.__frozen = cfg.FROZEN
self.__anchor_per_scale = cfg.ANCHOR_PER_SCALE
self.__classes = cfg.CLASSES
self.__num_classes = len(self.__classes)
self.__learn_rate_init = cfg.LEARN_RATE_INIT
self.__learn_rate_end = cfg.LEARN_RATE_END
self.__max_periods = cfg.MAX_PERIODS
self.__max_wave_time = cfg.MAX_WAVE_TIME
self.__max_learn_rate_decay_time = cfg.MAX_LEARN_RATE_DECAY_TIME
self.__periods_for_step0 = cfg.PERIODS_FOR_STEP0
self.__warmup_periods = cfg.WARMUP_PERIODS
self.__weights_dir = cfg.WEIGHTS_DIR
self.__weights_file = cfg.WEIGHTS_FILE
self.__time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
self.__log_dir = os.path.join(cfg.LOG_DIR, 'train', self.__time)
self.__moving_ave_decay = cfg.MOVING_AVE_DECAY
self.__save_iter = cfg.SAVE_ITER
self.__max_bbox_per_scale = cfg.MAX_BBOX_PER_SCALE
self.__batch_size_step2 = cfg.BATCH_SIZE_STEP2

self.__train_data = Data('train')
self.__test_data = Data('test')
self.__steps_per_period = len(self.__train_data)

with tf.name_scope('input'):
self.__input_data = tf.placeholder(dtype=tf.float32, name='input_data')
Expand All @@ -56,8 +54,22 @@ def __init__(self):
self.__label_sbbox, self.__label_mbbox, self.__label_lbbox,
self.__sbboxes, self.__mbboxes, self.__lbboxes)

with tf.name_scope('learn'):
self.__learn_rate = tf.Variable(self.__learn_rate_init, trainable=False, name='learn_rate_init')
with tf.name_scope('optimize'):
with tf.name_scope('learn_rate'):
self.__global_step = tf.Variable(1.0, dtype=tf.float64, trainable=False, name='global_step')
warmup_steps = tf.constant(self.__warmup_periods * self.__steps_per_period, dtype=tf.float64,
name='warmup_steps')
train_steps = tf.constant(self.__max_periods * self.__steps_per_period, dtype=tf.float64,
name='train_steps')
self.__learn_rate = tf.cond(
pred=self.__global_step < warmup_steps,
true_fn=lambda: self.__global_step / warmup_steps * self.__learn_rate_init,
false_fn=lambda: self.__learn_rate_end + 0.5 * (self.__learn_rate_init - self.__learn_rate_end) *
(1 + tf.cos(
(self.__global_step - warmup_steps) / (train_steps - warmup_steps) * np.pi))
)
global_step_update = tf.assign_add(self.__global_step, 1.0)

moving_ave = tf.train.ExponentialMovingAverage(self.__moving_ave_decay).apply(tf.trainable_variables())

self.__trainable_var_list = []
Expand All @@ -68,13 +80,13 @@ def __init__(self):
self.__trainable_var_list.append(var)
optimize0 = tf.train.AdamOptimizer(self.__learn_rate).\
minimize(self.__loss, var_list=self.__trainable_var_list)
with tf.control_dependencies([optimize0]):
with tf.control_dependencies([optimize0, global_step_update]):
with tf.control_dependencies([moving_ave]):
self.__train_op_with_frozen_variables = tf.no_op()

optimize1 = tf.train.AdamOptimizer(self.__learn_rate).\
minimize(self.__loss, var_list=tf.trainable_variables())
with tf.control_dependencies([optimize1]):
with tf.control_dependencies([optimize1, global_step_update]):
with tf.control_dependencies([moving_ave]):
self.__train_op_with_all_variables = tf.no_op()

Expand All @@ -88,7 +100,9 @@ def __init__(self):
self.__save = tf.train.Saver(tf.global_variables(), max_to_keep=50)

with tf.name_scope('summary'):
tf.summary.scalar('loss', self.__loss)
self.__loss_ave = tf.Variable(0, dtype=tf.float32, trainable=False)
tf.summary.scalar('loss_ave', self.__loss_ave)
tf.summary.scalar('learn_rate', self.__learn_rate)
self.__summary_op = tf.summary.merge_all()
self.__summary_writer = tf.summary.FileWriter(self.__log_dir)
self.__summary_writer.add_graph(tf.get_default_graph())
Expand All @@ -101,40 +115,18 @@ def train(self):
logging.info('Restoring weights from:\t %s' % ckpt_path)
self.__load.restore(self.__sess, ckpt_path)

learn_rate_decay_time = 0
test_loss_err_list = []
test_loss_last = np.inf
for period in range(self.__max_periods):
wave_time = (np.array(test_loss_err_list) > 0).astype(np.int32).sum()
if self.__frozen and wave_time == self.__max_wave_time:
test_loss_err_list = []
test_loss_last = np.inf
if learn_rate_decay_time < self.__max_learn_rate_decay_time:
learning_rate_value = self.__sess.run(
tf.assign(self.__learn_rate, self.__sess.run(self.__learn_rate) / 10.0)
)
logging.info('The value of learn rate is:\t%f' % learning_rate_value)

# Fine-tune with the original learn_rate_init * 0.01 until saturation, then fine-tune all weights with learn_rate_init * 0.01
learn_rate_decay_time += 1
if learn_rate_decay_time == (self.__max_learn_rate_decay_time + 1):
self.__train_op = self.__train_op_with_all_variables
logging.info('Train all of weights')
self.__train_data.batch_size_change(self.__batch_size_step2)
self.__test_data.batch_size_change(self.__batch_size_step2)

if not self.__frozen:
if period == self.__periods_for_step0:
self.__train_op = self.__train_op_with_all_variables
logging.info('Train all of weights')

print_loss_iter = len(self.__train_data) / 10
print_loss_iter = self.__steps_per_period / 10
total_train_loss = 0.0

for step, (batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox,
batch_sbboxes, batch_mbboxes, batch_lbboxes) \
in enumerate(self.__train_data):
_, summary_value, loss_value = self.__sess.run(
[self.__train_op, self.__summary_op, self.__loss],
for batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox,\
batch_sbboxes, batch_mbboxes, batch_lbboxes \
in self.__train_data:
_, loss_val, global_step_val = self.__sess.run(
[self.__train_op, self.__loss, self.__global_step],
feed_dict={
self.__input_data: batch_image,
self.__label_sbbox: batch_label_sbbox,
Expand All @@ -147,18 +139,20 @@ def train(self):
}
)
print "running"
if np.isnan(loss_value):
if np.isnan(loss_val):
raise ArithmeticError('The gradient is exploded')
total_train_loss += loss_value
if (step + 1) % print_loss_iter:
total_train_loss += loss_val

if int(global_step_val) % self.__steps_per_period % print_loss_iter:
continue

train_loss = total_train_loss / print_loss_iter
total_train_loss = 0.0
self.__summary_writer.add_summary(summary_value, period * len(self.__train_data) + step)
logging.info('Period:\t%d\tstep:\t%d\ttrain loss:\t%.4f' % (period, step, train_loss))

if (period + 1) % self.__save_iter:
continue
self.__sess.run(tf.assign(self.__loss_ave, train_loss))
summary_val = self.__sess.run(self.__summary_op)
self.__summary_writer.add_summary(summary_val, global_step_val)
logging.info('Period:\t%d\tstep:\t%d\ttrain_loss:\t%.4f' % (period, global_step_val, train_loss))

total_test_loss = 0.0
for batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox, \
Expand All @@ -180,24 +174,25 @@ def train(self):
print "running"
total_test_loss += loss_value
test_loss = total_test_loss / len(self.__test_data)
logging.info('Period:\t%d\ttest loss:\t%.4f' % (period, test_loss))

logging.info('Period:\t%d\ttest_loss:\t%.4f' % (period, test_loss))
saved_model_name = os.path.join(self.__weights_dir, 'yolo.ckpt-%d-%.4f' % (period, test_loss))
self.__save.save(self.__sess, saved_model_name)
logging.info('Saved model:\t%s' % saved_model_name)

test_loss_err_list.append(test_loss - test_loss_last)
test_loss_last = test_loss
self.__summary_writer.close()


if __name__ == '__main__':
    # Script entry point: make sure the log directories exist, configure file
    # logging, select the visible GPU(s), record the main hyper-parameters and
    # run the training loop.

    # os.makedirs also creates cfg.LOG_DIR itself when it is missing, which a
    # bare os.mkdir on the sub-directory would fail on.
    for log_subdir in ('train', 'test'):
        log_path = os.path.join(cfg.LOG_DIR, log_subdir)
        if not os.path.isdir(log_path):
            os.makedirs(log_path)

    # The log file goes into the directory created above instead of a
    # hard-coded 'log/train/' prefix, so it still works when cfg.LOG_DIR is
    # not 'log'.
    log_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    logging.basicConfig(filename=os.path.join(cfg.LOG_DIR, 'train', log_time + '.log'),
                        format='%(filename)s %(asctime)s\t%(message)s',
                        level=logging.DEBUG,
                        # %H (24-hour clock): %I without %p makes morning and
                        # afternoon timestamps indistinguishable.
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filemode='w')

    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU
    # NOTE(review): the 'step1'/'step2' lines look like the pre-commit side of
    # the diff and the 'step' line like their replacement - confirm which set
    # should remain; all three are kept here so no logged output is dropped.
    logging.info('Batch size for step1 is:\t%d' % cfg.BATCH_SIZE)
    logging.info('Batch size for step2 is:\t%d' % cfg.BATCH_SIZE_STEP2)
    logging.info('Batch size for step is:\t%d' % cfg.BATCH_SIZE)
    logging.info('Initial learn rate is:\t%f' % cfg.LEARN_RATE_INIT)
    YoloTrain().train()

62 changes: 31 additions & 31 deletions weights/checkpoint
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
model_checkpoint_path: "yolo.ckpt-29-18.0546"
all_model_checkpoint_paths: "yolo.ckpt-0-22.5748"
all_model_checkpoint_paths: "yolo.ckpt-1-20.6436"
all_model_checkpoint_paths: "yolo.ckpt-2-20.8553"
all_model_checkpoint_paths: "yolo.ckpt-3-19.6559"
all_model_checkpoint_paths: "yolo.ckpt-4-19.5505"
all_model_checkpoint_paths: "yolo.ckpt-5-19.5688"
all_model_checkpoint_paths: "yolo.ckpt-6-19.2354"
all_model_checkpoint_paths: "yolo.ckpt-7-19.3226"
all_model_checkpoint_paths: "yolo.ckpt-8-18.4437"
all_model_checkpoint_paths: "yolo.ckpt-9-19.4400"
all_model_checkpoint_paths: "yolo.ckpt-10-18.9024"
all_model_checkpoint_paths: "yolo.ckpt-11-19.2071"
all_model_checkpoint_paths: "yolo.ckpt-12-19.0111"
all_model_checkpoint_paths: "yolo.ckpt-13-19.2987"
all_model_checkpoint_paths: "yolo.ckpt-14-18.1818"
all_model_checkpoint_paths: "yolo.ckpt-15-17.8346"
all_model_checkpoint_paths: "yolo.ckpt-16-17.8885"
all_model_checkpoint_paths: "yolo.ckpt-17-17.8231"
all_model_checkpoint_paths: "yolo.ckpt-18-17.6614"
all_model_checkpoint_paths: "yolo.ckpt-19-17.5994"
all_model_checkpoint_paths: "yolo.ckpt-20-17.8712"
all_model_checkpoint_paths: "yolo.ckpt-21-17.5193"
all_model_checkpoint_paths: "yolo.ckpt-22-17.7131"
all_model_checkpoint_paths: "yolo.ckpt-23-17.9030"
all_model_checkpoint_paths: "yolo.ckpt-24-17.8028"
all_model_checkpoint_paths: "yolo.ckpt-25-18.1727"
all_model_checkpoint_paths: "yolo.ckpt-26-18.1823"
all_model_checkpoint_paths: "yolo.ckpt-27-18.1971"
all_model_checkpoint_paths: "yolo.ckpt-28-18.3417"
all_model_checkpoint_paths: "yolo.ckpt-29-18.0546"
model_checkpoint_path: "yolo.ckpt-29-17.5506"
all_model_checkpoint_paths: "yolo.ckpt-0-56.6992"
all_model_checkpoint_paths: "yolo.ckpt-1-25.2189"
all_model_checkpoint_paths: "yolo.ckpt-2-21.1013"
all_model_checkpoint_paths: "yolo.ckpt-3-19.8796"
all_model_checkpoint_paths: "yolo.ckpt-4-19.4366"
all_model_checkpoint_paths: "yolo.ckpt-5-19.0512"
all_model_checkpoint_paths: "yolo.ckpt-6-18.9765"
all_model_checkpoint_paths: "yolo.ckpt-7-18.8765"
all_model_checkpoint_paths: "yolo.ckpt-8-18.6622"
all_model_checkpoint_paths: "yolo.ckpt-9-19.1907"
all_model_checkpoint_paths: "yolo.ckpt-10-18.6766"
all_model_checkpoint_paths: "yolo.ckpt-11-18.8377"
all_model_checkpoint_paths: "yolo.ckpt-12-18.7018"
all_model_checkpoint_paths: "yolo.ckpt-13-18.7961"
all_model_checkpoint_paths: "yolo.ckpt-14-18.7330"
all_model_checkpoint_paths: "yolo.ckpt-15-18.7711"
all_model_checkpoint_paths: "yolo.ckpt-16-18.7465"
all_model_checkpoint_paths: "yolo.ckpt-17-18.6265"
all_model_checkpoint_paths: "yolo.ckpt-18-18.6452"
all_model_checkpoint_paths: "yolo.ckpt-19-18.5593"
all_model_checkpoint_paths: "yolo.ckpt-20-18.7589"
all_model_checkpoint_paths: "yolo.ckpt-21-18.1916"
all_model_checkpoint_paths: "yolo.ckpt-22-18.2692"
all_model_checkpoint_paths: "yolo.ckpt-23-17.8453"
all_model_checkpoint_paths: "yolo.ckpt-24-17.9026"
all_model_checkpoint_paths: "yolo.ckpt-25-17.7840"
all_model_checkpoint_paths: "yolo.ckpt-26-17.5093"
all_model_checkpoint_paths: "yolo.ckpt-27-17.5841"
all_model_checkpoint_paths: "yolo.ckpt-28-17.6451"
all_model_checkpoint_paths: "yolo.ckpt-29-17.5506"

0 comments on commit b8b1848

Please sign in to comment.