add ensemble

Yongquan-He · Mar 27, 2019 · b234dd9 · b234dd9
1 parent 124bf37
commit b234dd9
Show file tree

Hide file tree

Showing 15 changed files with 874 additions and 829 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
 __pycache__
+models/__pycache__
diff --git a/README.md b/README.md
@@ -27,15 +27,13 @@
 
 ## 运行结果
 
-下面是四种不同的模型在该数据集上的运行结果（取最好）：
-
-|      | HMM    | CRF    | BiLSTM | BiLSTM+CRF |
-| ---- | ------ | ------ | ------ | ---------- |
-| 准确率  | 91.22% | 95.43% | 95.26% | 95.87%     |
-
-具体的输出可以查看`output.txt`文件。
+下面是四种不同的模型，以及对这四个模型的预测结果进行Ensemble之后的准确率（取最好）：
 
+|      | HMM    | CRF    | BiLSTM | BiLSTM+CRF | Ensemble |
+| ---- | ------ | ------ | ------ | ---------- | -------- |
+| 准确率  | 91.22% | 95.43% | 95.44% | 95.75%     | 95.89%   |
 
+最后一列Ensemble是将这四个模型的预测结果结合起来，使用“投票表决”的方法得出最后的预测结果。具体的输出可以查看`output.txt`文件。
 
 
 
@@ -87,7 +85,7 @@ HMM模型的训练过程对应隐马尔可夫模型的学习问题（李航 统
 
 为了建立一个条件随机场，我们首先要定义一个特征函数集，该函数集内的每个特征函数都以标注序列作为输入，提取的特征作为输出。假设该函数集为：
 
-![函数集](./imgs/函数集.png)
+![函数集](./imgs/func_set.png)
 
 其中$x=(x_1, ..., x_m)$表示观测序列，$s = (s_1, ...., s_m)$表示状态序列。然后，条件随机场使用对数线性模型来计算给定观测序列下状态序列的条件概率：
 
@@ -158,7 +156,6 @@ LSTM的优点是能够通过双向的设置学习到观测序列（输入的字
 
 * BI-LSTM+CRF 比起Bi-LSTM效果并没有好很多，一种可能的解释是：
   - 数据集太小，不足够让模型学习到转移矩阵（后续尝试在更大的数据集上测试一下结果）
-* 根据验证集的损失变化动态调整学习率（ReduceLROnPlateau）。
 * 尝试更加复杂的模型，参考论文[Chinese NER using Lattice LSTM](https://github.com/jiesutd/LatticeLSTM)
 * 更详细的评估结果：打印混淆矩阵，同时输出每种类别的召回率、准确率、F1指标，便于分析。
 
@@ -179,5 +176,3 @@ LSTM的优点是能够通过双向的设置学习到观测序列（输入的字
 
 
 
-
-
diff --git a/README.pdf b/README.pdf
diff --git a/ckpts/bilstm.pkl b/ckpts/bilstm.pkl
diff --git a/ckpts/bilstm_crf.pkl b/ckpts/bilstm_crf.pkl
diff --git a/ckpts/crf.pkl b/ckpts/crf.pkl
diff --git a/ckpts/hmm.pkl b/ckpts/hmm.pkl
diff --git a/evaluate.py b/evaluate.py
@@ -1,23 +1,25 @@
 import time
+from collections import Counter
 
 from models.hmm import HMM
 from models.crf import CRFModel
 from models.bilstm_crf import BILSTM_Model
-from utils import save_model
+from utils import save_model, flatten_lists
 
 
 def evaluate(tag_lists, target_tag_lists):
     # 评估准确率
-    tag_count = 0.
     correct_count = 0.
-    for pred_tags, target_tags in zip(tag_lists, target_tag_lists):
-        assert len(pred_tags) == len(target_tags)
-        tag_count += len(pred_tags)
-        for pred, tgt in zip(pred_tags, target_tags):
-            if pred == tgt:
-                correct_count += 1.
 
-    return correct_count/tag_count
+    # 展开嵌套列表
+    tag_lists = flatten_lists(tag_lists)
+    target_tag_lists = flatten_lists(target_tag_lists)
+    assert len(tag_lists) == len(target_tag_lists)
+
+    for pred, tgt in zip(tag_lists, target_tag_lists):
+        if pred == tgt:
+            correct_count += 1.
+    return correct_count/len(tag_lists)
 
 
 def hmm_train_eval(train_data, test_data, word2id, tag2id):
@@ -40,6 +42,8 @@ def hmm_train_eval(train_data, test_data, word2id, tag2id):
     accuracy = evaluate(pred_tag_lists, test_tag_lists)
     print("HMM 模型的准确率为：{:.2f}%".format(accuracy * 100))
 
+    return pred_tag_lists
+
 
 def crf_train_eval(train_data, test_data):
 
@@ -55,6 +59,8 @@ def crf_train_eval(train_data, test_data):
     accuracy = evaluate(pred_tag_lists, test_tag_lists)
     print("CRF 模型的准确率为：{:.2f}%".format(accuracy * 100))
 
+    return pred_tag_lists
+
 
 def bilstm_train_and_eval(train_data, dev_data, test_data,
                           word2id, tag2id, crf=True):
@@ -69,9 +75,38 @@ def bilstm_train_and_eval(train_data, dev_data, test_data,
     bilstm_model.train(train_word_lists, train_tag_lists,
                        dev_word_lists, dev_tag_lists, word2id, tag2id)
 
+    model_name = "bilstm_crf" if crf else "bilstm"
+    save_model(bilstm_model, "./ckpts/"+model_name+".pkl")
+
     print("训练完毕,共用时{}秒.".format(int(time.time()-start)))
-    print("评估BILSTM模型中...")
+    print("评估{}模型中...".format(model_name))
     pred_tag_lists, test_tag_lists = bilstm_model.test(
         test_word_lists, test_tag_lists, word2id, tag2id)
     accuracy = evaluate(pred_tag_lists, test_tag_lists)
-    print("BILSTM 模型的准确率为：{:.2f}%".format(accuracy * 100))
+    print("{} 模型的准确率为：{:.2f}%".format(
+        model_name, accuracy * 100
+    ))
+
+    return pred_tag_lists
+
+
+def ensemble_evaluate(results, targets):
+    """Evaluate a majority-vote ensemble of several models' predictions.
+
+    Args:
+        results: list with one entry per model; each entry is that model's
+            predicted tags in a nested layout that ``flatten_lists`` can
+            flatten. The caller's list is left unmodified.
+        targets: gold tags, in the same nested layout.
+
+    Returns:
+        The ensemble accuracy as a float (0.0 when ``targets`` is empty).
+        The accuracy is also printed as a percentage.
+    """
+    # Flatten into fresh lists instead of overwriting results[i], so the
+    # caller's per-model predictions are still intact after this call.
+    flat_results = [flatten_lists(result) for result in results]
+
+    # For every position pick the tag most models voted for. On a tie,
+    # Counter.most_common keeps first-encountered order, so the model
+    # listed earliest in ``results`` wins.
+    pred_tags = [
+        Counter(votes).most_common(1)[0][0]
+        for votes in zip(*flat_results)
+    ]
+
+    targets = flatten_lists(targets)
+    assert len(pred_tags) == len(targets)
+
+    total = len(targets)
+    correct = sum(pred == tgt for pred, tgt in zip(pred_tags, targets))
+    # Guard the division so an empty test set does not raise
+    # ZeroDivisionError.
+    accuracy = correct / total if total else 0.0
+
+    print("Ensemble四个模型的准确率为{:.2f}%".format(accuracy * 100))
+    return accuracy
diff --git a/imgs/函数集.png → imgs/func_set.png b/imgs/函数集.png → imgs/func_set.png
diff --git a/main.py b/main.py
@@ -1,7 +1,8 @@
 
 from data import build_corpus
 from utils import extend_maps, prepocess_data_for_lstmcrf
-from evaluate import hmm_train_eval, crf_train_eval, bilstm_train_and_eval
+from evaluate import hmm_train_eval, crf_train_eval, \
+    bilstm_train_and_eval, ensemble_evaluate
 
 
 def main():
@@ -16,7 +17,7 @@ def main():
 
     # 训练评估ｈｍｍ模型
     print("正在训练评估HMM模型...")
-    hmm_train_eval(
+    hmm_pred = hmm_train_eval(
         (train_word_lists, train_tag_lists),
         (test_word_lists, test_tag_lists),
         word2id,
@@ -25,7 +26,7 @@ def main():
 
     # 训练评估CRF模型
     print("正在训练评估CRF模型...")
-    crf_train_eval(
+    crf_pred = crf_train_eval(
         (train_word_lists, train_tag_lists),
         (test_word_lists, test_tag_lists)
     )
@@ -34,7 +35,7 @@ def main():
     print("正在训练评估双向LSTM模型...")
     # LSTM模型训练的时候需要在word2id和tag2id加入PAD和UNK
     bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
-    bilstm_train_and_eval(
+    lstm_pred = bilstm_train_and_eval(
         (train_word_lists, train_tag_lists),
         (dev_word_lists, dev_tag_lists),
         (test_word_lists, test_tag_lists),
@@ -55,13 +56,18 @@ def main():
     test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
         test_word_lists, test_tag_lists, test=True
     )
-    bilstm_train_and_eval(
+    lstmcrf_pred = bilstm_train_and_eval(
         (train_word_lists, train_tag_lists),
         (dev_word_lists, dev_tag_lists),
         (test_word_lists, test_tag_lists),
         crf_word2id, crf_tag2id
     )
 
+    ensemble_evaluate(
+        [hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
+        test_tag_lists
+    )
+
 
 if __name__ == "__main__":
     main()
diff --git a/models/bilstm_crf.py b/models/bilstm_crf.py
@@ -39,7 +39,6 @@ def __init__(self, vocab_size, out_size, crf=True):
         self.print_step = TrainingConfig.print_step
         self.lr = TrainingConfig.lr
         self.batch_size = TrainingConfig.batch_size
-        self.save_file = TrainingConfig.save_file
 
         # 初始化优化器
         self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
@@ -53,8 +52,8 @@ def train(self, word_lists, tag_lists,
               dev_word_lists, dev_tag_lists,
               word2id, tag2id):
         # 对数据集按照长度进行排序
-        word_lists, tag_lists = sort_by_lengths(word_lists, tag_lists)
-        dev_word_lists, dev_tag_lists = sort_by_lengths(
+        word_lists, tag_lists, _ = sort_by_lengths(word_lists, tag_lists)
+        dev_word_lists, dev_tag_lists, _ = sort_by_lengths(
             dev_word_lists, dev_tag_lists)
 
         B = self.batch_size
@@ -129,7 +128,6 @@ def validate(self, dev_word_lists, dev_tag_lists, word2id, tag2id):
 
             if val_loss < self._best_val_loss:
                 print("保存模型...")
-                torch.save(self.model, self.save_file)
                 self.best_model = self.model
                 self._best_val_loss = val_loss
 
@@ -138,7 +136,7 @@ def validate(self, dev_word_lists, dev_tag_lists, word2id, tag2id):
     def test(self, word_lists, tag_lists, word2id, tag2id):
         """返回最佳模型在测试集上的预测结果"""
         # 准备数据
-        word_lists, tag_lists = sort_by_lengths(word_lists, tag_lists)
+        word_lists, tag_lists, indices = sort_by_lengths(word_lists, tag_lists)
         tensorized_sents, lengths = tensorized(word_lists, word2id)
         tensorized_sents = tensorized_sents.to(self.device)
 
@@ -160,6 +158,15 @@ def test(self, word_lists, tag_lists, word2id, tag2id):
                     tag_list.append(id2tag[ids[j].item()])
             pred_tag_lists.append(tag_list)
 
+        # indices存有根据长度排序后的索引映射的信息
+        # 比如若indices = [1, 2, 0] 则说明原先索引为1的元素映射到的新的索引是0，
+        # 索引为2的元素映射到新的索引是1...
+        # 下面根据indices将pred_tag_lists和tag_lists转化为原来的顺序
+        ind_maps = sorted(list(enumerate(indices)), key=lambda e: e[1])
+        indices, _ = list(zip(*ind_maps))
+        pred_tag_lists = [pred_tag_lists[i] for i in indices]
+        tag_lists = [tag_lists[i] for i in indices]
+
         return pred_tag_lists, tag_lists
 
 
@@ -207,14 +214,15 @@ def test(self, test_sents_tensor, lengths, tag2id):
         viterbi = torch.zeros(B, L, T).to(device)
         # backpointer[i, j, k]表示第i个句子，第j个字对应第k个标记时前一个标记的id，用于回溯
         backpointer = (torch.zeros(B, L, T).long() * end_id).to(device)
+        lengths = torch.LongTensor(lengths).to(device)
         # 向前递推
         for step in range(L):
-            batch_size_t = sum(l > step for l in lengths)
+            batch_size_t = (lengths > step).sum().item()
             if step == 0:
                 # 第一个字它的前一个标记只能是start_id
                 viterbi[:batch_size_t, step,
-                        :] = crf_scores[:batch_size_t, step, start_id, :]
-                backpointer[:batch_size_t, step, :] = start_id
+                        :] = crf_scores[: batch_size_t, step, start_id, :]
+                backpointer[: batch_size_t, step, :] = start_id
             else:
                 max_scores, prev_tags = torch.max(
                     viterbi[:batch_size_t, step-1, :].unsqueeze(2) +
@@ -226,7 +234,6 @@ def test(self, test_sents_tensor, lengths, tag2id):
 
         # 在回溯的时候我们只需要用到backpointer矩阵
         backpointer = backpointer.view(B, -1)  # [B, L * T]
-        lengths = torch.Tensor(lengths)
         tagids = []  # 存放结果
         tags_t = None
         for step in range(L-1, 0, -1):
@@ -248,8 +255,6 @@ def test(self, test_sents_tensor, lengths, tag2id):
                 index = index.to(device)
                 index += offset.long()
 
-            # import pdb
-            # pdb.set_trace()
             try:
                 tags_t = backpointer[:batch_size_t].gather(
                     dim=1, index=index.unsqueeze(1).long())

diff --git a/models/config.py b/models/config.py
@@ -5,8 +5,6 @@ class TrainingConfig(object):
     lr = 0.001
     epoches = 30
     print_step = 5
-    # 保存模型的的文件名
-    save_file = "./ckpts/bilstm.pkl"
 
 
 class LSTMConfig(object):

diff --git a/models/util.py b/models/util.py
@@ -44,16 +44,21 @@ def tensorized(batch, maps):
             batch_tensor[i][j] = maps.get(e, UNK)
     # batch各个元素的长度
     lengths = [len(l) for l in batch]
+
     return batch_tensor, lengths
 
 
 def sort_by_lengths(word_lists, tag_lists):
     pairs = list(zip(word_lists, tag_lists))
-    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
+    indices = sorted(range(len(pairs)),
+                     key=lambda k: len(pairs[k][0]),
+                     reverse=True)
+    pairs = [pairs[i] for i in indices]
+    # pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
 
     word_lists, tag_lists = list(zip(*pairs))
 
-    return word_lists, tag_lists
+    return word_lists, tag_lists, indices
 
 
 def cal_loss(logits, targets, tag2id):
@@ -144,12 +149,7 @@ def cal_lstm_crf_loss(crf_scores, targets, tag2id):
     all_path_scores = scores_upto_t[:, end_id].sum()
 
     # 训练大约两个epoch loss变成负数，从数学的角度上来说，loss = -logP
-    # 其中 0 < P < 1,所以loss > 0，所以这里面还有bug.....
     loss = (all_path_scores - golden_scores) / batch_size
-    if loss.item() < 0:
-        import pdb
-        pdb.set_trace()
-
     return loss