cosine learning rate

pecaso · Feb 28, 2019 · b8b1848 · b8b1848
1 parent 18299e6
commit b8b1848
Show file tree

Hide file tree

Showing 4 changed files with 166 additions and 173 deletions.
diff --git a/config.py b/config.py
@@ -7,20 +7,18 @@
 IOU_LOSS_THRESH = 0.5
 
 # train
-BATCH_SIZE = 32
-BATCH_SIZE_STEP2 = 6
-LEARN_RATE_INIT = 1e-3
-MAX_LEARN_RATE_DECAY_TIME = 2
-MAX_WAVE_TIME = 2
-MAX_PERIODS = 25
+BATCH_SIZE = 6
+LEARN_RATE_INIT = 1e-4
+LEARN_RATE_END = 1e-6
+WARMUP_PERIODS = 2
+PERIODS_FOR_STEP0 = 20
+MAX_PERIODS = 30
 ANCHORS = [[(1.25, 1.625), (2.0, 3.75), (4.125, 2.875)],            # Anchors for small obj
            [(1.875, 3.8125), (3.875, 2.8125), (3.6875, 7.4375)],    # Anchors for medium obj
            [(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]] # Anchors for big obj
-FROZEN = True
 
 ANCHOR_PER_SCALE = 3
 MOVING_AVE_DECAY = 0.9995
-SAVE_ITER = 1
 MAX_BBOX_PER_SCALE = 150
 
 # test

diff --git a/mAP/results/results.txt b/mAP/results/results.txt
diff --git a/train.py b/train.py
@@ -6,32 +6,30 @@
 import tensorflow as tf
 import numpy as np
 import os
-import argparse
 import logging
 import time
 
 
 class YoloTrain(object):
     def __init__(self):
-        self.__frozen = cfg.FROZEN
         self.__anchor_per_scale = cfg.ANCHOR_PER_SCALE
         self.__classes = cfg.CLASSES
         self.__num_classes = len(self.__classes)
         self.__learn_rate_init = cfg.LEARN_RATE_INIT
+        self.__learn_rate_end = cfg.LEARN_RATE_END
         self.__max_periods = cfg.MAX_PERIODS
-        self.__max_wave_time = cfg.MAX_WAVE_TIME
-        self.__max_learn_rate_decay_time = cfg.MAX_LEARN_RATE_DECAY_TIME
+        self.__periods_for_step0 = cfg.PERIODS_FOR_STEP0
+        self.__warmup_periods = cfg.WARMUP_PERIODS
         self.__weights_dir = cfg.WEIGHTS_DIR
         self.__weights_file = cfg.WEIGHTS_FILE
         self.__time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
         self.__log_dir = os.path.join(cfg.LOG_DIR, 'train', self.__time)
         self.__moving_ave_decay = cfg.MOVING_AVE_DECAY
-        self.__save_iter = cfg.SAVE_ITER
         self.__max_bbox_per_scale = cfg.MAX_BBOX_PER_SCALE
-        self.__batch_size_step2 = cfg.BATCH_SIZE_STEP2
 
         self.__train_data = Data('train')
         self.__test_data = Data('test')
+        self.__steps_per_period = len(self.__train_data)
 
         with tf.name_scope('input'):
             self.__input_data = tf.placeholder(dtype=tf.float32, name='input_data')
@@ -56,8 +54,22 @@ def __init__(self):
                                        self.__label_sbbox, self.__label_mbbox, self.__label_lbbox,
                                        self.__sbboxes, self.__mbboxes, self.__lbboxes)
 
-        with tf.name_scope('learn'):
-            self.__learn_rate = tf.Variable(self.__learn_rate_init, trainable=False, name='learn_rate_init')
+        with tf.name_scope('optimize'):
+            with tf.name_scope('learn_rate'):
+                self.__global_step = tf.Variable(1.0, dtype=tf.float64, trainable=False, name='global_step')
+                warmup_steps = tf.constant(self.__warmup_periods * self.__steps_per_period, dtype=tf.float64,
+                                           name='warmup_steps')
+                train_steps = tf.constant(self.__max_periods * self.__steps_per_period, dtype=tf.float64,
+                                          name='train_steps')
+                self.__learn_rate = tf.cond(
+                    pred=self.__global_step < warmup_steps,
+                    true_fn=lambda: self.__global_step / warmup_steps * self.__learn_rate_init,
+                    false_fn=lambda: self.__learn_rate_end + 0.5 * (self.__learn_rate_init - self.__learn_rate_end) *
+                                     (1 + tf.cos(
+                                         (self.__global_step - warmup_steps) / (train_steps - warmup_steps) * np.pi))
+                )
+                global_step_update = tf.assign_add(self.__global_step, 1.0)
+
             moving_ave = tf.train.ExponentialMovingAverage(self.__moving_ave_decay).apply(tf.trainable_variables())
 
             self.__trainable_var_list = []
@@ -68,13 +80,13 @@ def __init__(self):
                     self.__trainable_var_list.append(var)
             optimize0 = tf.train.AdamOptimizer(self.__learn_rate).\
                 minimize(self.__loss, var_list=self.__trainable_var_list)
-            with tf.control_dependencies([optimize0]):
+            with tf.control_dependencies([optimize0, global_step_update]):
                 with tf.control_dependencies([moving_ave]):
                     self.__train_op_with_frozen_variables = tf.no_op()
 
             optimize1 = tf.train.AdamOptimizer(self.__learn_rate).\
                 minimize(self.__loss, var_list=tf.trainable_variables())
-            with tf.control_dependencies([optimize1]):
+            with tf.control_dependencies([optimize1, global_step_update]):
                 with tf.control_dependencies([moving_ave]):
                     self.__train_op_with_all_variables = tf.no_op()
 
@@ -88,7 +100,9 @@ def __init__(self):
             self.__save = tf.train.Saver(tf.global_variables(), max_to_keep=50)
 
         with tf.name_scope('summary'):
-            tf.summary.scalar('loss', self.__loss)
+            self.__loss_ave = tf.Variable(0, dtype=tf.float32, trainable=False)
+            tf.summary.scalar('loss_ave', self.__loss_ave)
+            tf.summary.scalar('learn_rate', self.__learn_rate)
             self.__summary_op = tf.summary.merge_all()
             self.__summary_writer = tf.summary.FileWriter(self.__log_dir)
             self.__summary_writer.add_graph(tf.get_default_graph())
@@ -101,40 +115,18 @@ def train(self):
         logging.info('Restoring weights from:\t %s' % ckpt_path)
         self.__load.restore(self.__sess, ckpt_path)
 
-        learn_rate_decay_time = 0
-        test_loss_err_list = []
-        test_loss_last = np.inf
         for period in range(self.__max_periods):
-            wave_time = (np.array(test_loss_err_list) > 0).astype(np.int32).sum()
-            if self.__frozen and wave_time == self.__max_wave_time:
-                test_loss_err_list = []
-                test_loss_last = np.inf
-                if learn_rate_decay_time < self.__max_learn_rate_decay_time:
-                    learning_rate_value = self.__sess.run(
-                        tf.assign(self.__learn_rate, self.__sess.run(self.__learn_rate) / 10.0)
-                    )
-                    logging.info('The value of learn rate is:\t%f' % learning_rate_value)
-
-                # 使用原始learn rate_init * 0.01微调至饱和后再用learn_rate_init * 0.01全部微调
-                learn_rate_decay_time += 1
-                if learn_rate_decay_time == (self.__max_learn_rate_decay_time + 1):
-                    self.__train_op = self.__train_op_with_all_variables
-                    logging.info('Train all of weights')
-                    self.__train_data.batch_size_change(self.__batch_size_step2)
-                    self.__test_data.batch_size_change(self.__batch_size_step2)
-
-            if not self.__frozen:
+            if period == self.__periods_for_step0:
                 self.__train_op = self.__train_op_with_all_variables
                 logging.info('Train all of weights')
 
-            print_loss_iter = len(self.__train_data) / 10
+            print_loss_iter = self.__steps_per_period / 10
             total_train_loss = 0.0
-
-            for step, (batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox,
-                       batch_sbboxes, batch_mbboxes, batch_lbboxes) \
-                    in enumerate(self.__train_data):
-                _, summary_value, loss_value = self.__sess.run(
-                    [self.__train_op, self.__summary_op, self.__loss],
+            for batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox,\
+                batch_sbboxes, batch_mbboxes, batch_lbboxes \
+                    in self.__train_data:
+                _, loss_val, global_step_val = self.__sess.run(
+                    [self.__train_op, self.__loss, self.__global_step],
                     feed_dict={
                         self.__input_data: batch_image,
                         self.__label_sbbox: batch_label_sbbox,
@@ -147,18 +139,20 @@ def train(self):
                     }
                 )
                 print "running"
-                if np.isnan(loss_value):
+                if np.isnan(loss_val):
                     raise ArithmeticError('The gradient is exploded')
-                total_train_loss += loss_value
-                if (step + 1) % print_loss_iter:
+                total_train_loss += loss_val
+
+                if int(global_step_val) % self.__steps_per_period  % print_loss_iter:
                     continue
+
                 train_loss = total_train_loss / print_loss_iter
                 total_train_loss = 0.0
-                self.__summary_writer.add_summary(summary_value, period * len(self.__train_data) + step)
-                logging.info('Period:\t%d\tstep:\t%d\ttrain loss:\t%.4f' % (period, step, train_loss))
 
-            if (period + 1) % self.__save_iter:
-                continue
+                self.__sess.run(tf.assign(self.__loss_ave, train_loss))
+                summary_val = self.__sess.run(self.__summary_op)
+                self.__summary_writer.add_summary(summary_val, global_step_val)
+                logging.info('Period:\t%d\tstep:\t%d\ttrain_loss:\t%.4f' % (period, global_step_val, train_loss))
 
             total_test_loss = 0.0
             for batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox, \
@@ -180,24 +174,25 @@ def train(self):
                 print "running"
                 total_test_loss += loss_value
             test_loss = total_test_loss / len(self.__test_data)
-            logging.info('Period:\t%d\ttest loss:\t%.4f' % (period, test_loss))
+
+            logging.info('Period:\t%d\ttest_loss:\t%.4f' % (period, test_loss))
             saved_model_name = os.path.join(self.__weights_dir, 'yolo.ckpt-%d-%.4f' % (period, test_loss))
             self.__save.save(self.__sess, saved_model_name)
             logging.info('Saved model:\t%s' % saved_model_name)
-
-            test_loss_err_list.append(test_loss - test_loss_last)
-            test_loss_last = test_loss
         self.__summary_writer.close()
 
 
 if __name__ == '__main__':
+    if not os.path.exists(os.path.join(cfg.LOG_DIR, 'train')):
+        os.mkdir(os.path.join(cfg.LOG_DIR, 'train'))
+    if not os.path.exists(os.path.join(cfg.LOG_DIR, 'test')):
+        os.mkdir(os.path.join(cfg.LOG_DIR, 'test'))
     log_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
     logging.basicConfig(filename='log/train/' + log_time + '.log', format='%(filename)s %(asctime)s\t%(message)s',
                         level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S', filemode='w')
 
     os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU
-    logging.info('Batch size for step1 is:\t%d' % cfg.BATCH_SIZE)
-    logging.info('Batch size for step2 is:\t%d' % cfg.BATCH_SIZE_STEP2)
+    logging.info('Batch size for step is:\t%d' % cfg.BATCH_SIZE)
     logging.info('Initial learn rate is:\t%f' % cfg.LEARN_RATE_INIT)
     YoloTrain().train()
 
diff --git a/weights/checkpoint b/weights/checkpoint
@@ -1,31 +1,31 @@
-model_checkpoint_path: "yolo.ckpt-29-18.0546"
-all_model_checkpoint_paths: "yolo.ckpt-0-22.5748"
-all_model_checkpoint_paths: "yolo.ckpt-1-20.6436"
-all_model_checkpoint_paths: "yolo.ckpt-2-20.8553"
-all_model_checkpoint_paths: "yolo.ckpt-3-19.6559"
-all_model_checkpoint_paths: "yolo.ckpt-4-19.5505"
-all_model_checkpoint_paths: "yolo.ckpt-5-19.5688"
-all_model_checkpoint_paths: "yolo.ckpt-6-19.2354"
-all_model_checkpoint_paths: "yolo.ckpt-7-19.3226"
-all_model_checkpoint_paths: "yolo.ckpt-8-18.4437"
-all_model_checkpoint_paths: "yolo.ckpt-9-19.4400"
-all_model_checkpoint_paths: "yolo.ckpt-10-18.9024"
-all_model_checkpoint_paths: "yolo.ckpt-11-19.2071"
-all_model_checkpoint_paths: "yolo.ckpt-12-19.0111"
-all_model_checkpoint_paths: "yolo.ckpt-13-19.2987"
-all_model_checkpoint_paths: "yolo.ckpt-14-18.1818"
-all_model_checkpoint_paths: "yolo.ckpt-15-17.8346"
-all_model_checkpoint_paths: "yolo.ckpt-16-17.8885"
-all_model_checkpoint_paths: "yolo.ckpt-17-17.8231"
-all_model_checkpoint_paths: "yolo.ckpt-18-17.6614"
-all_model_checkpoint_paths: "yolo.ckpt-19-17.5994"
-all_model_checkpoint_paths: "yolo.ckpt-20-17.8712"
-all_model_checkpoint_paths: "yolo.ckpt-21-17.5193"
-all_model_checkpoint_paths: "yolo.ckpt-22-17.7131"
-all_model_checkpoint_paths: "yolo.ckpt-23-17.9030"
-all_model_checkpoint_paths: "yolo.ckpt-24-17.8028"
-all_model_checkpoint_paths: "yolo.ckpt-25-18.1727"
-all_model_checkpoint_paths: "yolo.ckpt-26-18.1823"
-all_model_checkpoint_paths: "yolo.ckpt-27-18.1971"
-all_model_checkpoint_paths: "yolo.ckpt-28-18.3417"
-all_model_checkpoint_paths: "yolo.ckpt-29-18.0546"
+model_checkpoint_path: "yolo.ckpt-29-17.5506"
+all_model_checkpoint_paths: "yolo.ckpt-0-56.6992"
+all_model_checkpoint_paths: "yolo.ckpt-1-25.2189"
+all_model_checkpoint_paths: "yolo.ckpt-2-21.1013"
+all_model_checkpoint_paths: "yolo.ckpt-3-19.8796"
+all_model_checkpoint_paths: "yolo.ckpt-4-19.4366"
+all_model_checkpoint_paths: "yolo.ckpt-5-19.0512"
+all_model_checkpoint_paths: "yolo.ckpt-6-18.9765"
+all_model_checkpoint_paths: "yolo.ckpt-7-18.8765"
+all_model_checkpoint_paths: "yolo.ckpt-8-18.6622"
+all_model_checkpoint_paths: "yolo.ckpt-9-19.1907"
+all_model_checkpoint_paths: "yolo.ckpt-10-18.6766"
+all_model_checkpoint_paths: "yolo.ckpt-11-18.8377"
+all_model_checkpoint_paths: "yolo.ckpt-12-18.7018"
+all_model_checkpoint_paths: "yolo.ckpt-13-18.7961"
+all_model_checkpoint_paths: "yolo.ckpt-14-18.7330"
+all_model_checkpoint_paths: "yolo.ckpt-15-18.7711"
+all_model_checkpoint_paths: "yolo.ckpt-16-18.7465"
+all_model_checkpoint_paths: "yolo.ckpt-17-18.6265"
+all_model_checkpoint_paths: "yolo.ckpt-18-18.6452"
+all_model_checkpoint_paths: "yolo.ckpt-19-18.5593"
+all_model_checkpoint_paths: "yolo.ckpt-20-18.7589"
+all_model_checkpoint_paths: "yolo.ckpt-21-18.1916"
+all_model_checkpoint_paths: "yolo.ckpt-22-18.2692"
+all_model_checkpoint_paths: "yolo.ckpt-23-17.8453"
+all_model_checkpoint_paths: "yolo.ckpt-24-17.9026"
+all_model_checkpoint_paths: "yolo.ckpt-25-17.7840"
+all_model_checkpoint_paths: "yolo.ckpt-26-17.5093"
+all_model_checkpoint_paths: "yolo.ckpt-27-17.5841"
+all_model_checkpoint_paths: "yolo.ckpt-28-17.6451"
+all_model_checkpoint_paths: "yolo.ckpt-29-17.5506"