# Pegasus_Interact.py (forked from huggingface/transfer-learning-conv-ai)
"""Script for fine-tuning Pegasus
Example usage:
# use XSum dataset as example, with first 1000 docs as training data
from datasets import load_dataset
dataset = load_dataset("xsum")
train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
# use Pegasus Large model as base for fine-tuning
model_name = 'google/pegasus-large'
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
trainer.train()
Reference:
https://huggingface.co/transformers/master/custom_datasets.html
"""
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
import json
import logging
from collections import defaultdict
import os
import random
from argparse import ArgumentParser
from transformers import cached_path
PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
logger = logging.getLogger(__file__)


def build_input_from_segments(persona, history, with_eos=True):
    """Build a Pegasus encoder/decoder text pair from a dialog.

    The chatbot's dialog turns (already filtered by the caller) are joined
    into the encoder input, and the persona sentences are joined into the
    decoder input. ``with_eos`` is kept for compatibility with the original
    transfer-learning-conv-ai code but is unused here.
    """
    instance = {}
    # Encoder side: the chatbot's turns, concatenated into one string.
    instance["input_ids"] = " ".join(history)
    # Decoder side: the persona sentences, concatenated into one string.
    instance["decoder_input_ids"] = " ".join(persona)
    return instance
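
# Illustrative sketch of what build_input_from_segments produces (toy values,
# not taken from the dataset):
#   persona = ["i like to ski.", "i have two dogs."]
#   history_chatbot = ["hi, how are you?", "great, i just got back from skiing."]
#   build_input_from_segments(persona, history_chatbot)
#   -> {"input_ids": "hi, how are you? great, i just got back from skiing.",
#       "decoder_input_ids": "i like to ski. i have two dogs."}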


def build_input_from_segments_faiss(persona, persona_faiss, with_eos=True):
    """Variant of ``build_input_from_segments`` that uses FAISS-retrieved
    persona sentences as the encoder input instead of the dialog history.
    ``with_eos`` is kept for compatibility but is unused here.
    """
    instance = {}
    # Encoder side: persona sentences retrieved with FAISS.
    instance["input_ids"] = " ".join(persona_faiss)
    # Decoder side: the gold persona sentences.
    instance["decoder_input_ids"] = " ".join(persona)
    return instance
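
# Same idea for the FAISS variant, assuming persona_faiss holds retrieved
# persona sentences (toy values, not from the dataset):
#   build_input_from_segments_faiss(["i like to ski."], ["i enjoy winter sports."])
#   -> {"input_ids": "i enjoy winter sports.", "decoder_input_ids": "i like to ski."}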


def get_dataset(dataset_path, dataset_cache=None):
    """Load the PersonaChat-style dataset, optionally caching it with torch."""
    dataset_path = dataset_path or PERSONACHAT_URL
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        # The pre-processed local file (with "persona_info" keys) is what is
        # actually loaded; the cached_path call above only fetches the original
        # PersonaChat file and is kept from the upstream code.
        with open('data_personachat.json', "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())
        logger.info("Dataset loaded (kept as raw text; tokenization happens at generation time)")
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    return dataset
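
# Expected layout of data_personachat.json, inferred from how get_data_loaders()
# indexes it below (a sketch of the assumed schema, not a guarantee):
# {
#   "train": [
#     {"persona_info": ["persona sentence", ...],
#      "utterances": [{"history": ["turn 1", "turn 2", ...],
#                      "candidates": ["candidate reply", ...]},   # present in PersonaChat-style data, unused here
#                     ...]},
#     ...
#   ],
#   "valid": [...]
# }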


def get_data_loaders():
    """Build (chatbot history, persona) text pairs for training and validation."""
    dataset_path = ""
    dataset_cache = None
    personachat = get_dataset(dataset_path, dataset_cache)
    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        for dialog in dataset:
            persona = dialog["persona_info"].copy()
            for utterance in dialog["utterances"]:
                history = utterance["history"]
                # Keep only the odd-indexed turns, i.e. the chatbot's replies.
                history_chatbot = history[1::2]
                # Only build an example once the chatbot has produced at least
                # as many turns as there are persona sentences.
                if len(history_chatbot) > (len(persona) - 1):
                    instance = build_input_from_segments(persona, history_chatbot[-len(persona):])
                    for input_name, input_array in instance.items():
                        datasets[dataset_name][input_name].append(input_array)
    return datasets
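
# Minimal usage sketch (assumes data_personachat.json sits next to the script):
#   datasets = get_data_loaders()
#   print(len(datasets["train"]["input_ids"]), "training pairs")
#   print(datasets["valid"]["decoder_input_ids"][0])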


def run():
    parser = ArgumentParser()
    parser.add_argument("--model_checkpoint", type=str,
                        default="results2_3epochs_2batch/checkpoint-143500",
                        help="Path or name of the fine-tuned Pegasus checkpoint")
    args = parser.parse_args()

    tokenizer = PegasusTokenizer.from_pretrained(args.model_checkpoint)
    model = PegasusForConditionalGeneration.from_pretrained(args.model_checkpoint)
    model.to("cpu")

    dataset = get_data_loaders()
    while True:
        # Show a random validation history and persona so the user has
        # something to copy as a prompt.
        print("History input:")
        print(random.choice(dataset['valid']['input_ids']))
        print("\n Persona Input:")
        print(random.choice(dataset['valid']['decoder_input_ids']))
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        # Tokenize the prompt and generate a persona-style output with Pegasus.
        batch = tokenizer(raw_text, truncation=True, padding="longest", return_tensors="pt").to('cpu')
        translated = model.generate(**batch)
        tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        print("Result of decoding")
        print(tgt_text)


if __name__ == "__main__":
    run()
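
# Example invocation (the default checkpoint path above is specific to the
# original experiment; substitute your own fine-tuned Pegasus checkpoint):
#   python Pegasus_Interact.py --model_checkpoint results2_3epochs_2batch/checkpoint-143500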