Commit 12433c8: updated

Signed-off-by: ftgreat <[email protected]>
ftgreat committed Mar 15, 2023
1 parent dbd5a4e
Showing 23 changed files with 46 additions and 80 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -260,6 +260,6 @@ The majority of FlagAI is licensed under the [Apache 2.0 license](LICENSE), howe
 ### ↳ Star History
 <div align="center">

-[![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)]
+![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)]

 </div>
3 changes: 1 addition & 2 deletions examples/AltCLIP/altclip_finetuning.py
@@ -32,7 +32,6 @@
 trainer = Trainer(env_type="pytorch",
                   pytorch_device=device,
                   experiment_name="clip_finetuning",
-                  eval_interval=10,
                   batch_size=4,
                   lr=1e-4,
                   epochs=10,
@@ -63,4 +62,4 @@ def cifar10_collate_fn(batch):
     }

 if __name__ == "__main__":
-    trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn, metric_methods=["accuracy"])
+    trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn)
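Taken together, the two hunks drop periodic evaluation from this example: without `eval_interval` and `metric_methods=["accuracy"]`, the Trainer simply fine-tunes. A minimal sketch of the resulting flow, wrapped as a function so it is self-contained; the `from flagai.trainer import Trainer` path is assumed from other FlagAI examples, and `model`, `dataset`, and `cifar10_collate_fn` are the objects the example file builds earlier:

import torch
from flagai.trainer import Trainer  # import path assumed from FlagAI examples

def finetune(model, dataset, collate_fn):
    """Run the example's fine-tuning loop, post-change: no periodic eval."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trainer = Trainer(env_type="pytorch",
                      pytorch_device=device,
                      experiment_name="clip_finetuning",
                      batch_size=4,
                      lr=1e-4,
                      epochs=10)
    # metric_methods is gone, so no accuracy metric is computed during training.
    trainer.train(model=model, train_dataset=dataset, collate_fn=collate_fn)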
7 changes: 2 additions & 5 deletions examples/AltDiffusion/generate.py
@@ -17,9 +17,6 @@
 model.eval()
 model.to(device)
 predictor = Predictor(model)
-# predictor.predict_generate_images(
-#     "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation"
-# )
 predictor.predict_generate_images(
-    "Anime portrait of a crouching smiling baby inside a glass bottle, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation"
-)
+    "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation"
+)
2 changes: 1 addition & 1 deletion examples/bert_title_generation_english/generate.py
@@ -7,7 +7,7 @@

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-model_dir = "../state_dict/"
+model_dir = "./checkpoints/"

 # Note "./checkpoints_seq2seq/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_seq2seq.
 model_save_path = "./checkpoints_seq2seq/7079/mp_rank_00_model_states.pt"
42 changes: 6 additions & 36 deletions examples/glm_blank_filling/glm_generate_samples.py
@@ -5,58 +5,29 @@
 from flagai.model.glm_model import GLMModel
 from flagai.data.tokenizer import Tokenizer
 from flagai.model.predictor.predictor import Predictor
-import bminf

 if __name__ == "__main__":
     """Main training program."""
     print('Generate Samples')
     # Random seeds for reproducability.
     # Model,
-    model_name = 'GLM-10b-ch'
+    model_name = 'GLM-large-ch'
     model = GLMModel.from_pretrain(model_name=model_name,
                                    download_path="./checkpoints")
     tokenizer = Tokenizer.from_pretrained(model_name)
-    with torch.cuda.device(0):
-        model = bminf.wrapper(model, quantization=False, memory_limit=30 << 39)
-        predictor = Predictor(model, tokenizer)
-<<<<<<< HEAD
-
-    # model.load_state_dict(torch.load("../glm_pretrain/checkpoints/1000/pytorch_model.bin")["module"])
-    model.cuda(torch.cuda.current_device())
-
-    text = ["今天天气不错[gMASK]"]
-=======
+    predictor = Predictor(model, tokenizer)
     # generate samples
     text = [
         '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]"
     ]
->>>>>>> master
     for t in text:
         output = predictor.predict_generate_randomsample(
             t, top_k=50, repetition_penalty=4.0, top_p=1.0)
         print(t, '\n', output)
-<<<<<<< HEAD
-
-    # text = [
-    #     '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]"
-    # ]
-    # for t in text:
-    #     output = predictor.predict_generate_randomsample(
-    #         t, top_k=50, repetition_penalty=4.0, top_p=1.0)
-    #     print(t, '\n', output)
-
-    # text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"]
-    # for t in text:
-    #     output = predictor.predict_generate_randomsample(
-    #         t, top_k=50, repetition_penalty=4.0, top_p=1.0)
-    #     print(t, '\n', output)
-
-    # text = [
-    #     "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。",
-    #     "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。"
-    # ]
-    # for t in text:
-    #     output = predictor.predict_generate_randomsample(
-    #         t, top_k=50, repetition_penalty=4.0, top_p=1.0)
-    #     print(t, '\n', output)
-=======

     text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"]
     for t in text:
@@ -71,5 +42,4 @@
     for t in text:
         output = predictor.predict_generate_randomsample(
             t, top_k=50, repetition_penalty=4.0, top_p=1.0)
-        print(t, '\n', output)
->>>>>>> master
+        print(t, '\n', output)
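Most of this file's diff removes merge-conflict markers (`<<<<<<< HEAD`, `=======`, `>>>>>>> master`) that had been committed into the example, keeping the master-branch side, and it also drops the bminf wrapper and swaps `GLM-10b-ch` for the smaller `GLM-large-ch`. The resolved example reduces to the following pattern; this sketch is assembled only from calls that appear in the diff itself:

# Sketch of the resolved blank-filling example, using only APIs shown above.
from flagai.model.glm_model import GLMModel
from flagai.data.tokenizer import Tokenizer
from flagai.model.predictor.predictor import Predictor

if __name__ == "__main__":
    model_name = 'GLM-large-ch'
    model = GLMModel.from_pretrain(model_name=model_name,
                                   download_path="./checkpoints")
    tokenizer = Tokenizer.from_pretrained(model_name)
    predictor = Predictor(model, tokenizer)

    # [gMASK] asks the model for open-ended generation; this prompt reads
    # "Question: Is beer bad for the stomach? Answer: [gMASK]".
    text = ['问题:啤酒伤胃吗?回答:[gMASK]']
    for t in text:
        output = predictor.predict_generate_randomsample(
            t, top_k=50, repetition_penalty=4.0, top_p=1.0)
        print(t, '\n', output)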
2 changes: 1 addition & 1 deletion examples/roberta_semantic_matching/train.py
@@ -27,7 +27,7 @@

 cur_dir = os.path.dirname(os.path.abspath(__file__))
 train_path = cur_dir + "/data/train.tsv"
-model_dir = "./state_dict/"
+model_dir = "./checkpoints/"
 maxlen = 256

 auto_loader = AutoLoader("semantic-matching",
8 changes: 4 additions & 4 deletions flagai/data/dataset/block/blocklm_utils.py
@@ -86,10 +86,10 @@ def __init__(self,
         self.encoder_decoder = encoder_decoder
         self.shuffle_blocks = shuffle_blocks
         self.sentinel_token = sentinel_token
-        self.generation_mask = 'gMASK' if task_mask else 'MASK'
+        self.generation_mask = 'gMASK' if task_mask else 'mask'
         self.generation_mask = self.tokenizer.get_command_id(
             self.generation_mask)
-        self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
+        self.gap_sentence_mask = 'sMASK' if task_mask else 'mask'
         self.gap_sentence_mask = self.tokenizer.get_command_id(
             self.gap_sentence_mask)
         self.random_position = random_position
@@ -205,7 +205,7 @@ def make_masked_data(self,
         #
         position_ids = np.arange(len(tokens), dtype=np.int64)
         targets = copy.deepcopy(tokens)
-        mask_id = self.tokenizer.get_command_id('MASK')
+        mask_id = self.tokenizer.get_command_id('mask')
         mlm_masks = np.zeros(len(tokens), dtype=np.int64)
         for start, end in block_spans:
             for idx in range(start, end):
@@ -273,7 +273,7 @@ def make_block_data(self,
             elif task == 'gap_sentence':
                 mask_id = self.gap_sentence_mask
             else:
-                mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
+                mask_token = 'mask' if idx == 0 else f'MASK{idx}'
                 mask_id = self.tokenizer.get_command_id(mask_token)
             local_spans.append((current_length, current_length + start - last))
             source_tokens.append(tokens[last:start])
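GLM distinguishes three mask types: `[MASK]` for short in-sentence blanks, `[sMASK]` for gap sentences, and `[gMASK]` for open-ended generation (the prompts in the generation example above use all three). The rename only lowercases the command name of the generic mask; its surface token stays `[MASK]`, and the task-specific names keep their casing. A small self-contained sketch mirroring the selection rule in the hunks above:

# Mirror of the mask-selection logic after the rename: masks are looked up
# by command name, so call sites must match the registered 'mask' name.
def select_mask_names(task_mask):
    generation_mask = 'gMASK' if task_mask else 'mask'
    gap_sentence_mask = 'sMASK' if task_mask else 'mask'
    return generation_mask, gap_sentence_mask

assert select_mask_names(True) == ('gMASK', 'sMASK')
assert select_mask_names(False) == ('mask', 'mask')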
14 changes: 7 additions & 7 deletions flagai/data/dataset/data_collator/collate_fn.py
@@ -126,7 +126,7 @@ def __init__(self, args, tokenizer, task_name):

     def encode(self, example):
         cls_id = self.tokenizer.get_command_id('cls')
-        mask_token = 'sMASK' if self.args.task_mask else 'MASK'
+        mask_token = 'sMASK' if self.args.task_mask else 'mask'
         mask_id = self.tokenizer.get_command_id(mask_token)
         pad_id = self.tokenizer.get_command_id('pad')
         sop_id = self.tokenizer.get_command_id('sop')
@@ -175,7 +175,7 @@ def sub_finder(mylist, pattern):
             source_tokens = [cls_id] + source_tokens + [mask_id
                                                         ] + answer_tokens
         elif self.task_name in ["cmrc"]:
-            mask_id = self.tokenizer.get_command_id('MASK')
+            mask_id = self.tokenizer.get_command_id('mask')
             source_text = example.text_a
             target_text = example.meta["answer"].strip()
             question = example.meta["question"].strip()
@@ -191,7 +191,7 @@ def sub_finder(mylist, pattern):
                 mask_id
             ] + source_tokens[:max_src_length]
         elif self.task_name in ["wsc"]:
-            mask_id = self.tokenizer.get_command_id('MASK')
+            mask_id = self.tokenizer.get_command_id('mask')
             source_text = example.text_a
             target_text = example.meta["answer"].strip()
             question = example.meta["question"].strip()
@@ -307,10 +307,10 @@ def __init__(self,
         self.encoder_decoder = encoder_decoder
         self.shuffle_blocks = shuffle_blocks
         self.sentinel_token = sentinel_token
-        self.generation_mask = 'gMASK' if task_mask else 'MASK'
+        self.generation_mask = 'gMASK' if task_mask else 'mask'
         self.generation_mask = self.tokenizer.get_command_id(
             self.generation_mask)
-        self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
+        self.gap_sentence_mask = 'sMASK' if task_mask else 'mask'
         self.gap_sentence_mask = self.tokenizer.get_command_id(
             self.gap_sentence_mask)
         self.random_position = random_position
@@ -426,7 +426,7 @@ def make_masked_data(self,

         position_ids = np.arange(len(tokens), dtype=np.int64)
         targets = copy.deepcopy(tokens)
-        mask_id = self.tokenizer.get_command_id('MASK')
+        mask_id = self.tokenizer.get_command_id('mask')
         mlm_masks = np.zeros(len(tokens), dtype=np.int64)
         for start, end in block_spans:
             for idx in range(start, end):
@@ -494,7 +494,7 @@ def make_block_data(self,
             elif task == 'gap_sentence':
                 mask_id = self.gap_sentence_mask
             else:
-                mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
+                mask_token = 'mask' if idx == 0 else f'MASK{idx}'
                 mask_id = self.tokenizer.get_command_id(mask_token)
             local_spans.append((current_length, current_length + start - last))
             source_tokens.append(tokens[last:start])
4 changes: 2 additions & 2 deletions flagai/data/dataset/data_utils.py
@@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids,

     # Prepare ids for special tokens
     if mask_id is None:
-        mask_id = tokenizer.get_command_id('MASK')
+        mask_id = tokenizer.get_command_id('mask')
     eos_id = tokenizer.get_command_id('eos')  # end of sentence token
     cls_id = tokenizer.get_command_id('cls')  # start of sentence token
     sep_id = tokenizer.get_command_id('sep')  # seperator of two texts token
@@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids,
 #
 def build_decoder_input(enc_ids, answer_ids, max_seq_length,
                         max_dec_seq_length, tokenizer):
-    mask_id = tokenizer.get_command_id('MASK')
+    mask_id = tokenizer.get_command_id('mask')
     eos_id = tokenizer.get_command_id('eos')
     sop_id = tokenizer.get_command_id('sop')
     masks = []
4 changes: 2 additions & 2 deletions flagai/data/dataset/language_model/dataset.py
@@ -38,7 +38,7 @@ def __init__(self, args, documents, tokenizer, num_original_tokens,
         self.left_weights = [0] + self.weights[:-1]
         self.unidirectional = args.unidirectional
         self.block_lm = args.block_lm
-        mask_token = "gMASK" if args.task_mask else 'MASK'
+        mask_token = "gMASK" if args.task_mask else 'mask'
         self.mask_id = self.tokenizer.get_command_id(mask_token)

     def __len__(self):
@@ -115,7 +115,7 @@ def __init__(self, args, tokenizer, strict=True):
         self.strict = strict
         self.block_lm = args.block_lm
         self.unidirectional = args.unidirectional
-        mask_token = "gMASK" if args.task_mask else 'MASK'
+        mask_token = "gMASK" if args.task_mask else 'mask'
        self.mask_id = self.tokenizer.get_command_id(mask_token)

         self.tokens = []
4 changes: 2 additions & 2 deletions flagai/data/dataset/seq2seq/dataset.py
@@ -477,7 +477,7 @@ def __len__(self):
     def __getitem__(self, idx):
         example = self.example_list[idx]
         source_text, target_text = example.text_a, example.text_b
-        mask_token = 'MASK'
+        mask_token = 'mask'
         mask_id = self.tokenizer.get_command_id(mask_token)
         sop_id = self.tokenizer.get_command_id('sop')
         eop_id = self.tokenizer.get_command_id('eop')
@@ -612,7 +612,7 @@ def __len__(self):
     def __getitem__(self, idx):
         example = self.example_list[idx]
         source_text = example.text_a
-        mask_token = 'gMASK' if self.args.task_mask else 'MASK'
+        mask_token = 'gMASK' if self.args.task_mask else 'mask'
         mask_id = self.tokenizer.get_command_id(mask_token)
         sop_id = self.tokenizer.get_command_id('sop')
         eop_id = self.tokenizer.get_command_id('eop')
8 changes: 4 additions & 4 deletions flagai/data/dataset/superglue/pvp.py
@@ -97,12 +97,12 @@ def spell_length(self):
     @property
     def mask(self) -> str:
         """Return the underlying LM's mask token"""
-        return self.tokenizer.get_command_id('MASK')
+        return self.tokenizer.get_command_id('mask')

     @property
     def mask_id(self) -> int:
         """Return the underlying LM's mask id"""
-        return self.tokenizer.get_command_id('MASK')
+        return self.tokenizer.get_command_id('mask')

     @property
     def max_num_verbalizers(self) -> int:
@@ -574,13 +574,13 @@ def spell_length(self):
     @property
     def mask(self) -> str:
         """Return the underlying LM's mask token"""
-        mask_token = 'MASK'
+        mask_token = 'mask'
         return self.tokenizer.get_command_id(mask_token)

     @property
     def mask_id(self) -> int:
         """Return the underlying LM's mask id"""
-        mask_token = 'MASK'
+        mask_token = 'mask'
         return self.tokenizer.get_command_id(mask_token)

     def get_answers(self, example: InputExample):
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/bert/bert_tokenizer.py
@@ -75,7 +75,7 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None):
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
             CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.get_specialid_from_text_tokenizer('mask')),
             CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
             CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/galactica/galactica_tokenizer.py
@@ -15,7 +15,7 @@ def __init__(self, download_dir) -> None:
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
             CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.get_specialid_from_text_tokenizer('mask')),
             CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
             CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
4 changes: 2 additions & 2 deletions flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py
@@ -60,7 +60,7 @@ def __init__(self,
                          self.text_tokenizer.encoder['</s>']),
             CommandToken('cls', '[CLS]',
                          self.text_tokenizer.encoder['<s>']),
-            CommandToken('MASK',
+            CommandToken('mask',
                          '[MASK]',
                          self.text_tokenizer.encoder['<mask>'],
                          lstrip=True),
@@ -88,7 +88,7 @@ def __init__(self,
             CommandToken('sop', '<|startofpiece|>', self.num_tokens),
             CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1),
             CommandToken('cls', '[CLS]', self.num_tokens + 2),
-            CommandToken('MASK',
+            CommandToken('mask',
                          '[MASK]',
                          self.num_tokens + 3,
                          lstrip=True),
@@ -55,7 +55,7 @@ def __init__(self,
             CommandToken('eos', '<|endoftext|>', self.num_text_tokens),
             CommandToken('sep', '[SEP]', self.num_text_tokens + 1),
             CommandToken('cls', '[CLS]', self.num_text_tokens + 2),
-            CommandToken('MASK',
+            CommandToken('mask',
                          '[MASK]',
                          self.num_text_tokens + 3,
                          lstrip=True),
@@ -59,7 +59,7 @@ def __init__(self,
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']),
             CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.text_tokenizer.vocab['[MASK]']),
             CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']),
             CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']),
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/opt/opt_en_tokenizer.py
@@ -35,7 +35,7 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None):
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
             CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.get_specialid_from_text_tokenizer('mask')),
             CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
             CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/roberta/roberta_tokenizer.py
@@ -38,7 +38,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None):
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
             CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.get_specialid_from_text_tokenizer('mask')),
             CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
             CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/t5/t5_tokenizer.py
@@ -45,7 +45,7 @@ def __init__(self, tokenizer_model_type="t5-base", cache_dir=None):

             CommandToken('pad', '[PAD]', self.num_tokens + 1),
             CommandToken('cls', '[CLS]', self.num_tokens + 2),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.num_tokens + 3),
         ]
         self._command_tokens.extend([
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/tokenizer.py
@@ -54,7 +54,7 @@ def __str__(self):
     ('sep', 4),
     ('L2R', 5),
     ('cls', 6),
-    ('MASK', 7),
+    ('mask', 7),
 ]
 DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS)
 """define some default type tokens for bert training"""
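The call-site changes in the earlier files only work because this is where the names are registered: a `CommandToken` binds a command name (`'mask'`), a surface token (`'[MASK]'`), and a token id, and `get_command_id` resolves by the command name. A toy illustration of that contract (not the real FlagAI classes, which live in `flagai.data.tokenizer`), showing why a stale `'MASK'` lookup fails after the rename:

# Toy model of the command-token contract implied by the diffs above.
class CommandToken:
    def __init__(self, name, token, token_id, lstrip=False):
        self.name = name      # lookup key, e.g. 'mask'
        self.token = token    # surface form, e.g. '[MASK]'
        self.Id = token_id
        self.lstrip = lstrip

class ToyTokenizer:
    def __init__(self, command_tokens):
        self._name_to_id = {ct.name: ct.Id for ct in command_tokens}

    def get_command_id(self, name):
        return self._name_to_id[name]

tok = ToyTokenizer([CommandToken('mask', '[MASK]', 7)])
assert tok.get_command_id('mask') == 7   # renamed call sites succeed
try:
    tok.get_command_id('MASK')           # stale pre-rename call site
except KeyError:
    print("old 'MASK' name is no longer registered")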