
Commit

add an example
920232796 committed Jan 21, 2021
1 parent 4e0803a commit 4736940
Showing 4 changed files with 239 additions and 12 deletions.
Binary file modified .DS_Store
Binary file not shown.
14 changes: 7 additions & 7 deletions bert_seq2seq/bert_relation_extraction.py
@@ -75,22 +75,22 @@ def forward(self, text, subject_ids, position_enc=None, subject_labels=None, obj
         enc_layers, _ = self.bert(text,
                                   output_all_encoded_layers=True)
         squence_out = enc_layers[use_layer_num]
-        sub_out = enc_layers[-2]
+        sub_out = enc_layers[-1]
         # print(squence_out.shape)

         # transform_out = self.layer_norm(squence_out)
         subject_pred_out = self.subject_pred(squence_out)

         subject_pred_act = self.activation(subject_pred_out)

-        subject_pred_act = subject_pred_act**2
+        # subject_pred_act = subject_pred_act**2

         subject_vec = self.extrac_subject(sub_out, subject_ids)
         object_layer_norm = self.layer_norm_cond([sub_out, subject_vec])
         object_pred_out = self.object_pred(object_layer_norm)
         object_pred_act = self.activation(object_pred_out)

-        object_pred_act = object_pred_act**4
+        # object_pred_act = object_pred_act**4

         batch_size, seq_len, target_size = object_pred_act.shape

@@ -117,12 +117,12 @@ def predict_subject(self, text,use_layer_num=-1, device="cpu"):
         self.target_mask = (text > 0).float()
         enc_layers, _ = self.bert(text, output_all_encoded_layers=True)
         squence_out = enc_layers[use_layer_num]
-        sub_out = enc_layers[-2]
+        sub_out = enc_layers[-1]
         # transform_out = self.layer_norm(squence_out)
         subject_pred_out = self.subject_pred(squence_out)
         subject_pred_act = self.activation(subject_pred_out)

-        subject_pred_act = subject_pred_act**2
+        # subject_pred_act = subject_pred_act**2

         # subject_pred_act = (subject_pred_act > 0.5).long()
         return subject_pred_act
@@ -138,13 +138,13 @@ def predict_object_predicate(self, text, subject_ids, use_layer_num=-1, device="

         enc_layers, _ = self.bert(text, output_all_encoded_layers=True)
         squence_out = enc_layers[use_layer_num]
-        sub_out = enc_layers[-2]
+        sub_out = enc_layers[-1]
         subject_vec = self.extrac_subject(sub_out, subject_ids)
         object_layer_norm = self.layer_norm_cond([sub_out, subject_vec])
         object_pred_out = self.object_pred(object_layer_norm)
         object_pred_act = self.activation(object_pred_out)

-        object_pred_act = object_pred_act**4
+        # object_pred_act = object_pred_act**4

         batch_size, seq_len, target_size = object_pred_act.shape
         object_pred_act = object_pred_act.view((batch_size, seq_len, int(target_size/2), 2))
226 changes: 226 additions & 0 deletions examples/relationship_classify_train.py
@@ -0,0 +1,226 @@
## Relation classification example
import sys

sys.path.append("/Users/xingzhaohu/Downloads/code/python/ml/ml_code/bert/bert_seq2seq")
import torch
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import pandas as pd
import numpy as np
import os
import json
import time
import bert_seq2seq
from torch.utils.data import Dataset, DataLoader
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert, load_model_params, load_recent_model

data_path = "./person.xlsx"
vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the roberta vocab file
model_name = "roberta"  # model name
model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # path to the roberta weights
recent_model_path = ""  # set this to resume training from an already trained model
model_save_path = "./bert_relationship_classify_model.bin"
batch_size = 16
lr = 1e-5
# load the vocabulary
word2idx = load_chinese_base_vocab(vocab_path)

target = []
person_relation = pd.read_excel(data_path)
for index, row in person_relation.iterrows():
    p = row[2]
    if p not in target:
        target.append(p)

print(target)

def read_corpus(data_path):
    """
    Read the raw data.
    """
    sents_src = []
    sents_tgt = []

    person_relation = pd.read_excel(data_path)
    for index, row in person_relation.iterrows():
        p = row[2]
        s = row[0]
        o = row[1]
        text = row[3]
        text = s + "#" + o + "#" + text
        sents_src.append(text)
        sents_tgt.append(int(target.index(p)))

    return sents_src, sents_tgt


## custom dataset
class NLUDataset(Dataset):
    """
    Defines how samples are fetched from this particular dataset.
    """

    def __init__(self, sents_src, sents_tgt):
        ## __init__ usually loads all the data
        super(NLUDataset, self).__init__()
        # read the raw data
        # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
        self.sents_src = sents_src
        self.sents_tgt = sents_tgt

        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        ## fetch a single sample
        # print(i)
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
            "target_id": tgt
        }
        return output

    def __len__(self):
        return len(self.sents_src)


def collate_fn(batch):
    """
    Dynamic padding; batch is a list of samples.
    """

    def padding(indice, max_length, pad_idx=0):
        """
        Pad each sequence in indice to max_length.
        """
        pad_indice = [item + [pad_idx] * max(0, max_length - len(item)) for item in indice]
        return torch.tensor(pad_indice)

    token_ids = [data["token_ids"] for data in batch]
    max_length = max([len(t) for t in token_ids])
    token_type_ids = [data["token_type_ids"] for data in batch]
    target_ids = [data["target_id"] for data in batch]
    target_ids = torch.tensor(target_ids, dtype=torch.long)

    token_ids_padded = padding(token_ids, max_length)
    token_type_ids_padded = padding(token_type_ids, max_length)
    # target_ids_padded = token_ids_padded[:, 1:].contiguous()

    return token_ids_padded, token_type_ids_padded, target_ids


class Trainer:
    def __init__(self):
        # load the data
        self.sents_src, self.sents_tgt = read_corpus(data_path)
        self.tokenier = Tokenizer(word2idx)
        # check whether a GPU is available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # build the model
        self.bert_model = load_bert(word2idx, model_name=model_name, model_class="cls", target_size=len(target))
        ## load the pretrained model parameters
        load_model_params(self.bert_model, model_path)
        # move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # declare the parameters to be optimized
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
        # set up the custom data loader
        dataset = NLUDataset(self.sents_src, self.sents_tgt)
        self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    def train(self, epoch):
        # train for one epoch
        self.bert_model.train()
        self.iteration(epoch, dataloader=self.dataloader, train=True)

    def save(self, save_path):
        """
        Save the model.
        """
        torch.save(self.bert_model.state_dict(), save_path)
        print("{} saved!".format(save_path))

    def iteration(self, epoch, dataloader, train=True):
        total_loss = 0
        start_time = time.time()  ## record the start time
        step = 0
        for token_ids, token_type_ids, target_ids in tqdm(dataloader, position=0, leave=True):
            step += 1
            if step % 100 == 0:
                self.bert_model.eval()
                test_data = [
                    "杨惠之#吴道子#好在《历代名画补遗》中提到了杨惠之的线索,原来是他是吴道子的同门师兄弟,他们有一个共同的老师,叫做张僧繇。",
                    "杨惠之#张僧繇#好在《历代名画补遗》中提到了杨惠之的线索,原来是他是吴道子的同门师兄弟,他们有一个共同的老师,叫做张僧繇。",
                    "吴道子#张僧繇#好在《历代名画补遗》中提到了杨惠之的线索,原来是他是吴道子的同门师兄弟,他们有一个共同的老师,叫做张僧繇。"
                ]
                for text in test_data:
                    text, text_ids = self.tokenier.encode(text)
                    text = torch.tensor(text, device=self.device).view(1, -1)
                    print(target[torch.argmax(self.bert_model(text)).item()])
                self.bert_model.train()

            token_ids = token_ids.to(self.device)
            token_type_ids = token_type_ids.to(self.device)
            target_ids = target_ids.to(self.device)
            # because target labels are passed in, the model computes and returns the loss
            predictions, loss = self.bert_model(token_ids, labels=target_ids)
            # backward pass
            if train:
                # clear previous gradients
                self.optimizer.zero_grad()
                # backpropagate to compute new gradients
                loss.backward()
                # update model parameters with the gradients
                self.optimizer.step()

            # accumulate loss for the epoch total
            total_loss += loss.item()

        end_time = time.time()
        spend_time = end_time - start_time
        # print training info
        print("epoch is " + str(epoch) + ". loss is " + str(total_loss) + ". spend time is " + str(spend_time))
        # save the model
        self.save(model_save_path)


if __name__ == '__main__':

    trainer = Trainer()
    train_epoches = 10
    for epoch in range(train_epoches):
        # train for one epoch
        trainer.train(epoch)

# # quick test of the custom dataset
# vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the roberta vocab file
# sents_src, sents_tgt = read_corpus("./corpus/新闻标题文本分类/Train.txt")

# dataset = NLUDataset(sents_src, sents_tgt, vocab_path)
# word2idx = load_chinese_base_vocab(vocab_path)
# tokenier = Tokenizer(word2idx)
# dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
# for token_ids, token_type_ids, target_ids in dataloader:
# # print(token_ids.shape)
# print(tokenier.decode(token_ids[0].tolist()))
# print(tokenier.decode(token_ids[1].tolist()))
# print(token_type_ids)
# print(target_ids)

# bert_model = load_bert(vocab_path, model_class="encoder", target_size=14)
# bert_model(token_ids)
# # print(tokenier.decode(target_ids[0].tolist()))
# # print(tokenier.decode(target_ids[1].tolist()))
# break

11 changes: 6 additions & 5 deletions test/细粒度ner_test.py
@@ -89,9 +89,10 @@ def ner_print(model, test_data, device="cpu"):
 bert_model.eval()
 ## load the trained model parameters
 load_recent_model(bert_model, recent_model_path=model_path, device=device)
-test_data = ["在广州经营小古董珠宝店的潘凝已经收藏了200多款泰迪熊,其中不少更是老牌泰迪熊厂商史蒂夫、赫曼。",
-             "2009年1月,北京市长郭金龙在其政府工作报告中曾明确提出,限价房不停建",
-             "昨天,记者连线农业银行亳州市支行办公室主任沈伦,他表示,亳州市支行已经对此事进行了讨论和研究",
-             "他们又有会怎样的读书经历。曾经留学海外的香港《号外杂志》主编、著名城市文化学者和作家陈冠中先生"
-             ]
+# test_data = ["在广州经营小古董珠宝店的潘凝已经收藏了200多款泰迪熊,其中不少更是老牌泰迪熊厂商史蒂夫、赫曼。",
+#              "2009年1月,北京市长郭金龙在其政府工作报告中曾明确提出,限价房不停建",
+#              "昨天,记者连线农业银行亳州市支行办公室主任沈伦,他表示,亳州市支行已经对此事进行了讨论和研究",
+#              "他们又有会怎样的读书经历。曾经留学海外的香港《号外杂志》主编、著名城市文化学者和作家陈冠中先生"
+#              ]
+test_data = ["曹操南征荆州,刘表之子刘琮投降,刘备领军民十余万避难,于当阳遭遇曹军追兵,惨败。"]
 ner_print(bert_model, test_data, device=device)
