forked from shuaihuaiyi/QA
Commit
Showing 14 changed files with 130 additions and 5,937 deletions.
@@ -1,52 +1,118 @@
-import readData
 import os
 import time
+import qaData
+import taevaluation
 import tensorflow as tf
+
+from qaLSTM import QaLstm
+
+
+def restore():
+    try:
+        saver.restore(sess, trainedModel)
+    except Exception as e:
+        print("Failed to load the model; training from scratch")
+        train()
+
+
+def train():
+    global learningRate  # learningRate is decayed below, so it must be declared global
+    # Prepare the training data
+    qTrain, aTrain, lTrain, qIdTrain = qaData.loadData(trainingFile, word2idx, unrollSteps, True)
+    qDevelop, aDevelop, lDevelop, qIdDevelop = qaData.loadData(developFile, word2idx, unrollSteps, True)
+    trainQuestionCounts = qIdTrain[-1] + 1
+    for i in range(len(qIdDevelop)):
+        qIdDevelop[i] += trainQuestionCounts
+    tqs, tta, tfa = [], [], []
+    for question, trueAnswer, falseAnswer in qaData.batchIter(qTrain + qDevelop, aTrain + aDevelop,
+                                                              lTrain + lDevelop, qIdTrain + qIdDevelop, batchSize):
+        tqs.append(question), tta.append(trueAnswer), tfa.append(falseAnswer)
+    # Start training
+    sess.run(tf.global_variables_initializer())
+    for i in range(lrDownCount):
+        optimizer = tf.train.GradientDescentOptimizer(learningRate)
+        trainOp = optimizer.apply_gradients(zip(grads, tvars), global_step=globalStep)
+        for epoch in range(epochs):
+            for question, trueAnswer, falseAnswer in zip(tqs, tta, tfa):
+                startTime = time.time()
+                feed_dict = {
+                    lstm.ori_input_quests: question,
+                    lstm.cand_input_quests: trueAnswer,
+                    lstm.neg_input_quests: falseAnswer,
+                    lstm.keep_prob: dropout
+                }
+                _, step, _, _, loss, acc = \
+                    sess.run([trainOp, globalStep, lstm.ori_cand, lstm.ori_neg, lstm.loss, lstm.acc], feed_dict)
+                timeUsed = time.time() - startTime
+                print("step:", step, "loss:", loss, "acc:", acc, "time:", timeUsed)
+            saver.save(sess, saveFile)
+        learningRate *= lrDownRate
+
+
 if __name__ == '__main__':
     # Define the parameters
     trainingFile = "data/training.data"
-    validFile = "data/develop.data"
+    developFile = "data/develop.data"
     testFile = "data/testing.data"
-    saveFile = "savedModel"
+    saveFile = "newModel/savedModel"
+    trainedModel = "trainedModel/savedModel"
     embeddingFile = "word2vec/zhwiki_2017_03.sg_50d.word2vec"
-    embeddingSize = 50 #dimensionality of the word vectors
+    embeddingSize = 50  # dimensionality of the word vectors
 
     dropout = 1.0
-    learningRate = 0.4
-    batchSize = 20  # number of questions per batch
-    epochs = 20
-    tf.flags.DEFINE_integer("rnn_size", 100, "rnn size")
-    tf.flags.DEFINE_integer("num_rnn_layers", 1, "embedding size")
-    tf.flags.DEFINE_integer("num_unroll_steps", 100, "maximum number of tokens per sentence")
-    tf.flags.DEFINE_integer("max_grad_norm", 5, "max grad norm")
-    # Misc Parameters
-    tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
-    tf.flags.DEFINE_float("gpu_options", 0.75, "use memory rate")
-
-    gpuMemUsage = 0.8
-    gpuDevice = "/gpu:0"
-    # Read the data
-    trainingList = readData.readFile(trainingFile)
-    testList = readData.readFile(testFile)
-    embeddingDict = readData.readEmbeddingFile(embeddingFile, embeddingSize)
-
-    # Preprocessing
-    trainingVec = readData.textToVec(trainingList, embeddingDict)
-    testVec = readData.textToVec(testList, embeddingDict)
-    del embeddingDict  # free memory
-
-    # Define the model  todo
+    learningRate = 0.4  # initial learning rate
+    lrDownRate = 0.5  # learning-rate decay factor
+    lrDownCount = 4  # number of times the learning rate is decayed
+    epochs = 20  # full epochs to run before each learning-rate decay
+    batchSize = 20  # number of questions per batch
 
-    # Start training
-    with tf.Graph().as_default():
-        with tf.device(gpuDevice):
-            gpuOptions = tf.GPUOptions(per_process_gpu_memory_fraction=gpuMemUsage)
-            session_conf = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
-                                          log_device_placement=FLAGS.log_device_placement,
-                                          gpu_options=gpuOptions)
-            with tf.Session(config=session_conf).as_default() as sess:
-                pass  # todo
-
-    # Evaluation  todo
-    pass
+    rnnSize = 100  # number of hidden units per LSTM cell
+
+    unrollSteps = 100  # maximum number of tokens per sentence
+    max_grad_norm = 5
+
+    allow_soft_placement = True  # allow soft device placement
+    gpuMemUsage = 0.8  # maximum fraction of GPU memory to use
+    gpuDevice = "/gpu:0"  # GPU device name
+
+    # Read the test data
+    embedding, word2idx, idx2word = qaData.loadEmbedding(embeddingFile, embeddingSize)
+    qTest, aTest, _, qIdTest = qaData.loadData(testFile, word2idx, unrollSteps)
+
+    # Configure TensorFlow
+    with tf.Graph().as_default(), tf.device(gpuDevice):
+        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpuMemUsage)
+        session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement, gpu_options=gpu_options)
+        with tf.Session(config=session_conf).as_default() as sess:
+            # Load the LSTM network
+            globalStep = tf.Variable(0, name="globalStep", trainable=False)
+            lstm = QaLstm(batchSize, unrollSteps, embedding, embeddingSize, rnnSize)
+            tvars = tf.trainable_variables()
+            grads, _ = tf.clip_by_global_norm(tf.gradients(lstm.loss, tvars), max_grad_norm)
+            saver = tf.train.Saver()
+
+            # Load an existing model or train a new one
+            if os.path.exists(trainedModel + '.index'):
+                while True:
+                    choice = input("Found a trained model. Load it? (y/n) ")
+                    if choice.strip().lower() == 'y':
+                        restore()
+                        break
+                    elif choice.strip().lower() == 'n':
+                        choice = input("Are you sure? Retraining will take a lot of time and hardware resources. (yes/no) ")
+                        if choice.strip().lower() == 'yes':
+                            train()
+                            break
+                        elif choice.strip().lower() == 'no':
+                            restore()
+                            break
+                        else:
+                            print("Invalid input!\n")
+                    else:
+                        print("Invalid input!\n")
+            else:
+                train()
+            # Run the test set and output the results
+            pass
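
The training loop above fetches lstm.loss and lstm.acc, but their definitions live in qaLSTM.py, which is not part of this diff. QA-LSTM models of this kind typically score question/answer pairs by cosine similarity and train on (question, true answer, false answer) triples with a margin-based hinge loss. A minimal sketch of that objective, assuming ori_cand and ori_neg are the two cosine similarities returned by the graph and using a hypothetical margin of 0.1 (an illustration, not the repository's actual qaLSTM.py):

import tensorflow as tf

def marginLoss(ori_cand, ori_neg, margin=0.1):
    # Hinge loss: push the true answer at least `margin` above the false one.
    losses = tf.maximum(0.0, margin - ori_cand + ori_neg)
    loss = tf.reduce_sum(losses)
    # "Accuracy" here is the fraction of triples already separated by the margin.
    acc = tf.reduce_mean(tf.cast(tf.equal(losses, 0.0), tf.float32))
    return loss, acc

Under such a loss, lstm.acc approaching 1.0 only means that most sampled negative answers are already ranked below the true answer by at least the margin.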
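
The step behind "# Run the test set and output the results" is still a pass. A minimal sketch of what that evaluation could look like, reusing only placeholders already referenced above (ori_input_quests, cand_input_quests, keep_prob, ori_cand); the helper name scoreTestSet, the numpy usage, and the assumption that the number of test pairs is a multiple of batchSize are illustrative, not taken from the repository:

import numpy as np

def scoreTestSet(sess, lstm, qTest, aTest, batchSize):
    # Score every (question, candidate answer) pair with the trained network.
    scores = []
    for start in range(0, len(qTest), batchSize):  # assumes len(qTest) % batchSize == 0
        feed_dict = {
            lstm.ori_input_quests: qTest[start:start + batchSize],
            lstm.cand_input_quests: aTest[start:start + batchSize],
            lstm.keep_prob: 1.0,  # no dropout at test time
        }
        scores.extend(sess.run(lstm.ori_cand, feed_dict))
    return np.asarray(scores)

Grouping the returned scores by qIdTest and taking the argmax within each question would then give one predicted answer per test question.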