Support English GPT2 and add a story generation example.
920232796 committed Mar 13, 2021
1 parent 3d32359 commit 8a01543
Showing 4 changed files with 185 additions and 11 deletions.
16 changes: 10 additions & 6 deletions bert_seq2seq/gpt2_generate_model.py
@@ -41,11 +41,12 @@ def sample_generate(self, text, input_max_length=256, out_max_length=200, top_k=

        return self.tokenizer.decode(np.array(output_ids))

    def sample_generate_english(self, text, input_max_length=256, out_max_length=200, top_k=30, top_p=0.0):
    def sample_generate_english(self, text, input_max_length=256, out_max_length=200, top_k=30, top_p=0.0, add_eos=False):

        token_ids = self.tokenizer.encode(text, max_length=input_max_length, truncation=True)

        token_ids = torch.tensor(token_ids, device=self.device, dtype=torch.long)[:-1].view(1, -1)
        if add_eos:
            token_ids = token_ids + [self.word2ix["<EOS>"]]
        token_ids = torch.tensor(token_ids, device=self.device, dtype=torch.long).view(1, -1)
        output_ids = []
        sep_id = self.word2ix["<EOS>"]
        with torch.no_grad():
@@ -59,6 +60,7 @@ def sample_generate_english(self, text, input_max_length=256, out_max_length=200
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                if sep_id == next_token.item():
                    break
                    # pass
                output_ids.append(next_token.item())
                token_ids = torch.cat((token_ids, next_token.long().unsqueeze(0)), dim=1)

@@ -75,13 +77,15 @@ def _make_causal_mask(self, input_ids_shape: torch.Size):
        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)


    def forward(self, x, label=None):

    def forward(self, x, labels=None):
        if labels is not None:
            labels = labels.to(self.device)
        x = x.to(self.device)
        # input_ids = torch.tensor([[1, 2, 3, 5, -100], [4, 5, 6, -100, -100]])
        attention_mask = self._make_causal_mask(x.shape)
        pad_mask = (x != -100).float()
        attention_mask = attention_mask * pad_mask.unsqueeze(1).unsqueeze(1)

        loss, lm_logit = self.model(x, label, attention_mask=attention_mask)
        loss, lm_logit = self.model(x, labels=labels, attention_mask=attention_mask)

        return loss, lm_logit
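For context, a minimal inference sketch against the updated sample_generate_english signature. The tokenizer checkpoint, load_gpt helper, and model path are taken from the example script added in this commit; loading a saved checkpoint via load_pretrain_params is an assumption, not something this diff confirms.

import torch
from transformers import AutoTokenizer
from bert_seq2seq.utils import load_gpt

# Sketch only: tokenizer and paths mirror examples/gpt2_english_story_train.py.
tokenizer = AutoTokenizer.from_pretrained("pranavpsv/gpt2-genre-story-generator")
word2ix = tokenizer.get_vocab()

model = load_gpt(word2ix, tokenizer=tokenizer)
# Assumption: weights are loaded the same way the training script loads its
# starting checkpoint.
model.load_pretrain_params("./state_dict/english_gpt_model/english_gpt_story.bin")
model.set_device(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
model.eval()

# add_eos=True appends <EOS> to the prompt, matching how the training data
# joins each title and story with <EOS>.
print(model.sample_generate_english("David Drops the Weight",
                                    out_max_length=300, add_eos=True))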
8 changes: 4 additions & 4 deletions bert_seq2seq/model/gpt2_model.py
@@ -436,6 +436,7 @@ def forward(
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0][0].size(-2)

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
@@ -444,20 +445,20 @@
        # Attention mask.
        if attention_mask is not None:
            assert batch_size > 0, "batch_size has to be defined and > 0"
            attention_mask = attention_mask.view(batch_size, -1)
            # attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, None, None, :]
            # attention_mask = attention_mask[:, None, None, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
            # attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # If a 2D or 3D attention mask is provided for the cross-attention
@@ -531,7 +532,6 @@ def __init__(self, config):
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)


        # Model parallel
        self.model_parallel = False
        self.device_map = None
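A small standalone sketch of the additive-mask arithmetic that stays active above: with the 2D reshape and dtype-cast lines commented out, the caller now supplies a broadcast-ready 4D causal-plus-padding mask, and the only remaining transformation turns 1/0 keep/mask values into 0/-10000 biases. The shapes below are illustrative, not taken from the model config.

import torch

# Toy 4D mask: [batch=1, broadcast head dim=1, tgt_len=3, src_len=3].
# 1.0 marks positions that may be attended, 0.0 marks masked positions.
mask = torch.tril(torch.ones(1, 1, 3, 3))
additive_mask = (1.0 - mask) * -10000.0
# additive_mask is 0.0 where attention is allowed and -10000.0 elsewhere;
# added to the raw attention scores before softmax, it effectively removes
# the masked positions from consideration.
print(additive_mask)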
170 changes: 170 additions & 0 deletions examples/gpt2_english_story_train.py
@@ -0,0 +1,170 @@

import torch
from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import numpy as np
import os
import json
import time
import glob
import pandas as pd
import bert_seq2seq
from torch.utils.data import Dataset, DataLoader
from bert_seq2seq.utils import load_gpt
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("pranavpsv/gpt2-genre-story-generator")
word2ix = tokenizer.get_vocab()
# print(len(word2ix))
# print(word2ix["<EOS>"])
# print(word2ix["<PAD>"])
# print(tokenizer.eos_token_id)

data_path = "./corpus/英文讲故事数据集/train.csv"  # English storytelling dataset (directory name is in Chinese)
model_path = "./state_dict/english_gpt_model/english_gpt_story.bin"
model_save_path = "./state_dict/gpt_auto_story.bin"
batch_size = 8
lr = 1e-5
maxlen = 256

def load_data():
    sents_src = []
    sents_tgt = []
    df = pd.read_csv(data_path)
    for i, row in df.iterrows():
        sents_src.append(row[1])
        tgt = ""
        for j in range(2, 7):
            tgt += row[j]
        sents_tgt.append(tgt)

    return sents_src, sents_tgt

class GPTDataset(Dataset):
    """
    Defines how samples are read from this particular dataset.
    """

    def __init__(self):
        ## __init__ generally loads all of the data up front
        super(GPTDataset, self).__init__()
        ## load all source/target pairs
        self.sents_src, self.sents_tgt = load_data()
        self.tokenizer = tokenizer

    def __getitem__(self, i):
        ## fetch a single example

        src_d = self.sents_src[i]
        tgt_d = self.sents_tgt[i]
        src_ids = self.tokenizer.encode(src_d) + [self.tokenizer.eos_token_id]
        tgt_ids = self.tokenizer.encode(tgt_d) + [self.tokenizer.eos_token_id]
        output = {
            "token_ids": src_ids + tgt_ids,
        }
        return output

    def __len__(self):
        return len(self.sents_src)


def collate_fn(batch):
    """
    Dynamic padding; `batch` is a list of samples.
    """

    def padding(indice, max_length, pad_idx=0):
        """
        Pad each sequence in `indice` up to max_length with pad_idx.
        """
        pad_indice = [item + [pad_idx] * max(0, max_length - len(item)) for item in indice]
        return torch.tensor(pad_indice)

    token_ids = [data["token_ids"] for data in batch]
    max_length = max([len(t) for t in token_ids])

    token_ids_padded = padding(token_ids, max_length, pad_idx=word2ix["<PAD>"])
    token_target_padded = token_ids_padded.clone()
    token_target_padded[token_target_padded == word2ix["<PAD>"]] = -100
    return token_ids_padded, token_target_padded


class Trainer:
    def __init__(self):
        # pick the computation device (GPU if one is available)
        # self.device = torch.device("cpu")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # build the model
        self.model = load_gpt(word2ix, tokenizer=tokenizer)
        self.model.load_pretrain_params(model_path)
        # load the pretrained weights and continue training from them

        # move the model to the computation device (GPU or CPU)
        self.model.set_device(self.device)
        # declare the parameters to optimize
        self.optim_parameters = list(self.model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
        # declare the custom data loader
        dataset = GPTDataset()
        self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    def train(self, epoch):
        # train for one epoch
        self.model.train()
        self.iteration(epoch, dataloader=self.dataloader, train=True)

    def save(self, save_path):
        """
        Save the model.
        """
        self.model.save_all_params(save_path)
        print("{} saved!".format(save_path))

    def iteration(self, epoch, dataloader, train=True):
        total_loss = 0
        start_time = time.time()  ## record the start time
        step = 0
        report_loss = 0
        for token_ids, token_target in tqdm(dataloader, position=0, leave=True):
            step += 1
            if step % 1000 == 0:
                self.model.eval()
                print(self.model.sample_generate_english("David Drops the Weight", out_max_length=300, add_eos=True))
                print("loss is " + str(report_loss))
                report_loss = 0
                self.model.train()
            if step % 6000 == 0:
                self.save(model_save_path)

            # because target labels are passed in, the model computes and returns the loss
            loss, pred_logit = self.model(token_ids, labels=token_target)
            report_loss += loss.item()
            # backpropagation
            if train:
                # clear the previous gradients
                self.optimizer.zero_grad()
                # backpropagate to get new gradients
                loss.backward()
                # update the model parameters with the new gradients
                self.optimizer.step()

            # accumulate for this epoch's total loss
            total_loss += loss.item()

        end_time = time.time()
        spend_time = end_time - start_time
        # print training info
        print("epoch is " + str(epoch) + ". loss is " + str(total_loss) + ". spend time is " + str(spend_time))


if __name__ == '__main__':

    trainer = Trainer()
    train_epoches = 20

    for epoch in range(train_epoches):
        # train for one epoch
        trainer.train(epoch)
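For clarity, a self-contained sketch of the label masking that collate_fn performs: inputs are padded with <PAD>, while the label copy replaces every pad position with -100, which both the language-modelling loss and the pad mask built in GPT2.forward ignore. The token ids and pad id below are made up for illustration; the real script uses word2ix["<PAD>"].

import torch

pad_id = 0  # assumption for this illustration only
token_ids_padded = torch.tensor([[11, 12, 13, 50256],
                                 [21, 22, 50256, pad_id]])
token_target_padded = token_ids_padded.clone()
token_target_padded[token_target_padded == pad_id] = -100
# token_target_padded:
# tensor([[   11,    12,    13, 50256],
#         [   21,    22, 50256,  -100]])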
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@

setup(
    name='bert_seq2seq',
    version='1.2.3',
    version='1.2.4',
    description='use torch to do bert_seq2seq task',
    long_description='bert_seq2seq: https://github.com/920232796/bert_seq2seq',
    license='Apache License 2.0',
