Skip to content

Commit

Permalink
add ensemble
Browse files Browse the repository at this point in the history
  • Loading branch information
luopeixiang committed Mar 27, 2019
1 parent 124bf37 commit b234dd9
Show file tree
Hide file tree
Showing 15 changed files with 874 additions and 829 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
__pycache__
models/__pycache__
17 changes: 6 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,13 @@

## 运行结果

下面是四种不同的模型在该数据集上的运行结果(取最好):

| | HMM | CRF | BiLSTM | BiLSTM+CRF |
| ---- | ------ | ------ | ------ | ---------- |
| 准确率 | 91.22% | 95.43% | 95.26% | 95.87% |

具体的输出可以查看`output.txt`文件。
下面是四种不同的模型,以及对这四个模型的预测结果进行Ensemble之后的准确率(取最好):

| | HMM | CRF | BiLSTM | BiLSTM+CRF | Ensemble |
| ---- | ------ | ------ | ------ | ---------- | -------- |
| 准确率 | 91.22% | 95.43% | 95.44% | 95.75% | 95.89% |

最后一列Ensemble是将这四个模型的预测结果结合起来,使用“投票表决”的方法得出最后的预测结果。具体的输出可以查看`output.txt`文件。



Expand Down Expand Up @@ -87,7 +85,7 @@ HMM模型的训练过程对应隐马尔可夫模型的学习问题(李航 统

为了建立一个条件随机场,我们首先要定义一个特征函数集,该函数集内的每个特征函数都以标注序列作为输入,提取的特征作为输出。假设该函数集为:

![函数集](./imgs/函数集.png)
![函数集](./imgs/func_set.png)

其中$x=(x_1, \ldots, x_m)$表示观测序列,$s = (s_1, \ldots, s_m)$表示状态序列。然后,条件随机场使用对数线性模型来计算给定观测序列下状态序列的条件概率:

Expand Down Expand Up @@ -158,7 +156,6 @@ LSTM的优点是能够通过双向的设置学习到观测序列(输入的字

* BiLSTM+CRF 比起BiLSTM效果并没有好很多,一种可能的解释是:
- 数据集太小,不足够让模型学习到转移矩阵(后续尝试在更大的数据集上测试一下结果)
* 根据验证集的损失变化动态调整学习率(ReduceLROnPlateau)。
* 尝试更加复杂的模型,参考论文[Chinese NER using Lattice LSTM](https://github.com/jiesutd/LatticeLSTM)
* 更详细的评估结果:打印混淆矩阵,同时输出每种类别的召回率、准确率、F1指标,便于分析。

Expand All @@ -179,5 +176,3 @@ LSTM的优点是能够通过双向的设置学习到观测序列(输入的字





Binary file modified README.pdf
Binary file not shown.
Binary file modified ckpts/bilstm.pkl
Binary file not shown.
Binary file added ckpts/bilstm_crf.pkl
Binary file not shown.
Binary file modified ckpts/crf.pkl
Binary file not shown.
Binary file modified ckpts/hmm.pkl
Binary file not shown.
57 changes: 46 additions & 11 deletions evaluate.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
import time
from collections import Counter

from models.hmm import HMM
from models.crf import CRFModel
from models.bilstm_crf import BILSTM_Model
from utils import save_model
from utils import save_model, flatten_lists


def evaluate(tag_lists, target_tag_lists):
# 评估准确率
tag_count = 0.
correct_count = 0.
for pred_tags, target_tags in zip(tag_lists, target_tag_lists):
assert len(pred_tags) == len(target_tags)
tag_count += len(pred_tags)
for pred, tgt in zip(pred_tags, target_tags):
if pred == tgt:
correct_count += 1.

return correct_count/tag_count
# 展开嵌套列表
tag_lists = flatten_lists(tag_lists)
target_tag_lists = flatten_lists(target_tag_lists)
assert len(tag_lists) == len(target_tag_lists)

for pred, tgt in zip(tag_lists, target_tag_lists):
if pred == tgt:
correct_count += 1.
return correct_count/len(tag_lists)


def hmm_train_eval(train_data, test_data, word2id, tag2id):
Expand All @@ -40,6 +42,8 @@ def hmm_train_eval(train_data, test_data, word2id, tag2id):
accuracy = evaluate(pred_tag_lists, test_tag_lists)
print("HMM 模型的准确率为:{:.2f}%".format(accuracy * 100))

return pred_tag_lists


def crf_train_eval(train_data, test_data):

Expand All @@ -55,6 +59,8 @@ def crf_train_eval(train_data, test_data):
accuracy = evaluate(pred_tag_lists, test_tag_lists)
print("CRF 模型的准确率为:{:.2f}%".format(accuracy * 100))

return pred_tag_lists


def bilstm_train_and_eval(train_data, dev_data, test_data,
word2id, tag2id, crf=True):
Expand All @@ -69,9 +75,38 @@ def bilstm_train_and_eval(train_data, dev_data, test_data,
bilstm_model.train(train_word_lists, train_tag_lists,
dev_word_lists, dev_tag_lists, word2id, tag2id)

model_name = "bilstm_crf" if crf else "bilstm"
save_model(bilstm_model, "./ckpts/"+model_name+".pkl")

print("训练完毕,共用时{}秒.".format(int(time.time()-start)))
print("评估BILSTM模型中...")
print("评估{}模型中...".format(model_name))
pred_tag_lists, test_tag_lists = bilstm_model.test(
test_word_lists, test_tag_lists, word2id, tag2id)
accuracy = evaluate(pred_tag_lists, test_tag_lists)
print("BILSTM 模型的准确率为:{:.2f}%".format(accuracy * 100))
print("{} 模型的准确率为:{:.2f}%".format(
model_name, accuracy * 100
))

return pred_tag_lists


def ensemble_evaluate(results, targets):
    """Score a majority-vote ensemble of several models' predictions.

    Each model's predictions and the gold tags are flattened with
    ``flatten_lists``; the ensemble tag for each position is the tag
    predicted by the most models at that position.

    Args:
        results: One entry per model, each holding that model's predicted
            tags. The caller's list is left untouched.
        targets: Gold tags, in the same order as the predictions.

    Returns:
        The ensemble accuracy as a fraction (it is also printed as a
        percentage).

    Raises:
        AssertionError: If the number of ensemble predictions differs from
            the number of flattened targets.
        ZeroDivisionError: If ``targets`` flattens to an empty sequence.
    """
    # Build new flattened lists instead of writing back into ``results`` so
    # the caller's per-model predictions are not clobbered.
    flat_results = [flatten_lists(result) for result in results]

    # Counter.most_common orders equal counts by first occurrence, so ties
    # go to the model that appears earliest in ``results``.
    pred_tags = [
        Counter(votes).most_common(1)[0][0]
        for votes in zip(*flat_results)
    ]

    targets = flatten_lists(targets)
    assert len(pred_tags) == len(targets)

    correct = sum(pred == tgt for pred, tgt in zip(pred_tags, targets))
    accuracy = correct / len(targets)

    print("Ensemble四个模型的准确率为{:.2f}%".format(accuracy * 100))
    return accuracy
File renamed without changes
16 changes: 11 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@

from data import build_corpus
from utils import extend_maps, prepocess_data_for_lstmcrf
from evaluate import hmm_train_eval, crf_train_eval, bilstm_train_and_eval
from evaluate import hmm_train_eval, crf_train_eval, \
bilstm_train_and_eval, ensemble_evaluate


def main():
Expand All @@ -16,7 +17,7 @@ def main():

# 训练评估hmm模型
print("正在训练评估HMM模型...")
hmm_train_eval(
hmm_pred = hmm_train_eval(
(train_word_lists, train_tag_lists),
(test_word_lists, test_tag_lists),
word2id,
Expand All @@ -25,7 +26,7 @@ def main():

# 训练评估CRF模型
print("正在训练评估CRF模型...")
crf_train_eval(
crf_pred = crf_train_eval(
(train_word_lists, train_tag_lists),
(test_word_lists, test_tag_lists)
)
Expand All @@ -34,7 +35,7 @@ def main():
print("正在训练评估双向LSTM模型...")
# LSTM模型训练的时候需要在word2id和tag2id加入PAD和UNK
bilstm_word2id, bilstm_tag2id = extend_maps(word2id, tag2id, for_crf=False)
bilstm_train_and_eval(
lstm_pred = bilstm_train_and_eval(
(train_word_lists, train_tag_lists),
(dev_word_lists, dev_tag_lists),
(test_word_lists, test_tag_lists),
Expand All @@ -55,13 +56,18 @@ def main():
test_word_lists, test_tag_lists = prepocess_data_for_lstmcrf(
test_word_lists, test_tag_lists, test=True
)
bilstm_train_and_eval(
lstmcrf_pred = bilstm_train_and_eval(
(train_word_lists, train_tag_lists),
(dev_word_lists, dev_tag_lists),
(test_word_lists, test_tag_lists),
crf_word2id, crf_tag2id
)

ensemble_evaluate(
[hmm_pred, crf_pred, lstm_pred, lstmcrf_pred],
test_tag_lists
)


if __name__ == "__main__":
main()
27 changes: 16 additions & 11 deletions models/bilstm_crf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def __init__(self, vocab_size, out_size, crf=True):
self.print_step = TrainingConfig.print_step
self.lr = TrainingConfig.lr
self.batch_size = TrainingConfig.batch_size
self.save_file = TrainingConfig.save_file

# 初始化优化器
self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
Expand All @@ -53,8 +52,8 @@ def train(self, word_lists, tag_lists,
dev_word_lists, dev_tag_lists,
word2id, tag2id):
# 对数据集按照长度进行排序
word_lists, tag_lists = sort_by_lengths(word_lists, tag_lists)
dev_word_lists, dev_tag_lists = sort_by_lengths(
word_lists, tag_lists, _ = sort_by_lengths(word_lists, tag_lists)
dev_word_lists, dev_tag_lists, _ = sort_by_lengths(
dev_word_lists, dev_tag_lists)

B = self.batch_size
Expand Down Expand Up @@ -129,7 +128,6 @@ def validate(self, dev_word_lists, dev_tag_lists, word2id, tag2id):

if val_loss < self._best_val_loss:
print("保存模型...")
torch.save(self.model, self.save_file)
self.best_model = self.model
self._best_val_loss = val_loss

Expand All @@ -138,7 +136,7 @@ def validate(self, dev_word_lists, dev_tag_lists, word2id, tag2id):
def test(self, word_lists, tag_lists, word2id, tag2id):
"""返回最佳模型在测试集上的预测结果"""
# 准备数据
word_lists, tag_lists = sort_by_lengths(word_lists, tag_lists)
word_lists, tag_lists, indices = sort_by_lengths(word_lists, tag_lists)
tensorized_sents, lengths = tensorized(word_lists, word2id)
tensorized_sents = tensorized_sents.to(self.device)

Expand All @@ -160,6 +158,15 @@ def test(self, word_lists, tag_lists, word2id, tag2id):
tag_list.append(id2tag[ids[j].item()])
pred_tag_lists.append(tag_list)

# indices存有根据长度排序后的索引映射的信息
# 比如若indices = [1, 2, 0] 则说明原先索引为1的元素映射到的新的索引是0,
# 索引为2的元素映射到新的索引是1...
# 下面根据indices将pred_tag_lists和tag_lists转化为原来的顺序
ind_maps = sorted(list(enumerate(indices)), key=lambda e: e[1])
indices, _ = list(zip(*ind_maps))
pred_tag_lists = [pred_tag_lists[i] for i in indices]
tag_lists = [tag_lists[i] for i in indices]

return pred_tag_lists, tag_lists


Expand Down Expand Up @@ -207,14 +214,15 @@ def test(self, test_sents_tensor, lengths, tag2id):
viterbi = torch.zeros(B, L, T).to(device)
# backpointer[i, j, k]表示第i个句子,第j个字对应第k个标记时前一个标记的id,用于回溯
backpointer = (torch.zeros(B, L, T).long() * end_id).to(device)
lengths = torch.LongTensor(lengths).to(device)
# 向前递推
for step in range(L):
batch_size_t = sum(l > step for l in lengths)
batch_size_t = (lengths > step).sum().item()
if step == 0:
# 第一个字它的前一个标记只能是start_id
viterbi[:batch_size_t, step,
:] = crf_scores[:batch_size_t, step, start_id, :]
backpointer[:batch_size_t, step, :] = start_id
:] = crf_scores[: batch_size_t, step, start_id, :]
backpointer[: batch_size_t, step, :] = start_id
else:
max_scores, prev_tags = torch.max(
viterbi[:batch_size_t, step-1, :].unsqueeze(2) +
Expand All @@ -226,7 +234,6 @@ def test(self, test_sents_tensor, lengths, tag2id):

# 在回溯的时候我们只需要用到backpointer矩阵
backpointer = backpointer.view(B, -1) # [B, L * T]
lengths = torch.Tensor(lengths)
tagids = [] # 存放结果
tags_t = None
for step in range(L-1, 0, -1):
Expand All @@ -248,8 +255,6 @@ def test(self, test_sents_tensor, lengths, tag2id):
index = index.to(device)
index += offset.long()

# import pdb
# pdb.set_trace()
try:
tags_t = backpointer[:batch_size_t].gather(
dim=1, index=index.unsqueeze(1).long())
Expand Down
2 changes: 0 additions & 2 deletions models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ class TrainingConfig(object):
lr = 0.001
epoches = 30
print_step = 5
# 保存模型的的文件名
save_file = "./ckpts/bilstm.pkl"


class LSTMConfig(object):
Expand Down
14 changes: 7 additions & 7 deletions models/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,21 @@ def tensorized(batch, maps):
batch_tensor[i][j] = maps.get(e, UNK)
# batch各个元素的长度
lengths = [len(l) for l in batch]

return batch_tensor, lengths


def sort_by_lengths(word_lists, tag_lists):
pairs = list(zip(word_lists, tag_lists))
pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
indices = sorted(range(len(pairs)),
key=lambda k: len(pairs[k][0]),
reverse=True)
pairs = [pairs[i] for i in indices]
# pairs.sort(key=lambda pair: len(pair[0]), reverse=True)

word_lists, tag_lists = list(zip(*pairs))

return word_lists, tag_lists
return word_lists, tag_lists, indices


def cal_loss(logits, targets, tag2id):
Expand Down Expand Up @@ -144,12 +149,7 @@ def cal_lstm_crf_loss(crf_scores, targets, tag2id):
all_path_scores = scores_upto_t[:, end_id].sum()

# 训练大约两个epoch loss变成负数,从数学的角度上来说,loss = -logP
# 其中 0 < P < 1,所以loss > 0,所以这里面还有bug.....
loss = (all_path_scores - golden_scores) / batch_size
if loss.item() < 0:
import pdb
pdb.set_trace()

return loss


Expand Down
Loading

0 comments on commit b234dd9

Please sign in to comment.