From 12433c8b1bde1b5b4c753cffa7114b0b0872b6ce Mon Sep 17 00:00:00 2001 From: ftgreat Date: Wed, 15 Mar 2023 14:56:49 +0800 Subject: [PATCH] updated Signed-off-by: ftgreat --- README.md | 2 +- examples/AltCLIP/altclip_finetuning.py | 3 +- examples/AltDiffusion/generate.py | 7 +--- .../bert_title_generation_english/generate.py | 2 +- .../glm_blank_filling/glm_generate_samples.py | 42 +++---------------- examples/roberta_semantic_matching/train.py | 2 +- flagai/data/dataset/block/blocklm_utils.py | 8 ++-- .../data/dataset/data_collator/collate_fn.py | 14 +++---- flagai/data/dataset/data_utils.py | 4 +- flagai/data/dataset/language_model/dataset.py | 4 +- flagai/data/dataset/seq2seq/dataset.py | 4 +- flagai/data/dataset/superglue/pvp.py | 8 ++-- flagai/data/tokenizer/bert/bert_tokenizer.py | 2 +- .../galactica/galactica_tokenizer.py | 2 +- .../glm_10b_en/glm_10b_en_bpe_tokenizer.py | 4 +- .../glm_large_ch/glm_large_ch_tokenizer.py | 2 +- .../glm_large_en/glm_large_en_tokenizer.py | 2 +- flagai/data/tokenizer/opt/opt_en_tokenizer.py | 2 +- .../tokenizer/roberta/roberta_tokenizer.py | 2 +- flagai/data/tokenizer/t5/t5_tokenizer.py | 2 +- flagai/data/tokenizer/tokenizer.py | 2 +- flagai/model/predictor/utils.py | 4 +- flagai/test_utils.py | 2 +- 23 files changed, 46 insertions(+), 80 deletions(-) diff --git a/README.md b/README.md index 416ccc80..d48f1be3 100644 --- a/README.md +++ b/README.md @@ -260,6 +260,6 @@ The majority of FlagAI is licensed under the [Apache 2.0 license](LICENSE), howe ### ↳ Star History
-[![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)] +![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)
diff --git a/examples/AltCLIP/altclip_finetuning.py b/examples/AltCLIP/altclip_finetuning.py index 7eeb9703..2b95fd4c 100644 --- a/examples/AltCLIP/altclip_finetuning.py +++ b/examples/AltCLIP/altclip_finetuning.py @@ -32,7 +32,6 @@ trainer = Trainer(env_type="pytorch", pytorch_device=device, experiment_name="clip_finetuning", - eval_interval=10, batch_size=4, lr=1e-4, epochs=10, @@ -63,4 +62,4 @@ def cifar10_collate_fn(batch): } if __name__ == "__main__": - trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn, metric_methods=["accuracy"]) \ No newline at end of file + trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn) \ No newline at end of file diff --git a/examples/AltDiffusion/generate.py b/examples/AltDiffusion/generate.py index d55a364c..79e59d17 100644 --- a/examples/AltDiffusion/generate.py +++ b/examples/AltDiffusion/generate.py @@ -17,9 +17,6 @@ model.eval() model.to(device) predictor = Predictor(model) -# predictor.predict_generate_images( -# "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" -# ) predictor.predict_generate_images( - "Anime portrait of a crouching smiling baby inside a glass bottle, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" -) \ No newline at end of file + "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation" +) diff --git a/examples/bert_title_generation_english/generate.py b/examples/bert_title_generation_english/generate.py index 1124d16d..fdfa2f41 100644 --- a/examples/bert_title_generation_english/generate.py +++ b/examples/bert_title_generation_english/generate.py @@ -7,7 +7,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model_dir 
= "../state_dict/" +model_dir = "./checkpoints/" # Note "./checkpoints_seq2seq/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_seq2seq. model_save_path = "./checkpoints_seq2seq/7079/mp_rank_00_model_states.pt" diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py index 4bcdf374..01b1bf00 100644 --- a/examples/glm_blank_filling/glm_generate_samples.py +++ b/examples/glm_blank_filling/glm_generate_samples.py @@ -5,58 +5,29 @@ from flagai.model.glm_model import GLMModel from flagai.data.tokenizer import Tokenizer from flagai.model.predictor.predictor import Predictor -import bminf if __name__ == "__main__": """Main training program.""" print('Generate Samples') # Random seeds for reproducability. # Model, - model_name = 'GLM-10b-ch' + model_name = 'GLM-large-ch' model = GLMModel.from_pretrain(model_name=model_name, download_path="./checkpoints") tokenizer = Tokenizer.from_pretrained(model_name) - with torch.cuda.device(0): - model = bminf.wrapper(model, quantization=False, memory_limit=30 << 39) - predictor = Predictor(model, tokenizer) -<<<<<<< HEAD + + # model.load_state_dict(torch.load("../glm_pretrain/checkpoints/1000/pytorch_model.bin")["module"]) + model.cuda(torch.cuda.current_device()) - text = ["今天天气不错[gMASK]"] -======= + predictor = Predictor(model, tokenizer) # generate samples text = [ '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" ] ->>>>>>> master for t in text: output = predictor.predict_generate_randomsample( t, top_k=50, repetition_penalty=4.0, top_p=1.0) print(t, '\n', output) -<<<<<<< HEAD - - # text = [ - # '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" - # ] - # for t in text: - # output = predictor.predict_generate_randomsample( - # t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # print(t, '\n', output) - - # text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] - # for t in text: - # output = 
predictor.predict_generate_randomsample( - # t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # print(t, '\n', output) - - # text = [ - # "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", - # "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" - # ] - # for t in text: - # output = predictor.predict_generate_randomsample( - # t, top_k=50, repetition_penalty=4.0, top_p=1.0) - # print(t, '\n', output) -======= text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] for t in text: @@ -71,5 +42,4 @@ for t in text: output = predictor.predict_generate_randomsample( t, top_k=50, repetition_penalty=4.0, top_p=1.0) - print(t, '\n', output) ->>>>>>> master + print(t, '\n', output) \ No newline at end of file diff --git a/examples/roberta_semantic_matching/train.py b/examples/roberta_semantic_matching/train.py index e0648063..30e9821f 100644 --- a/examples/roberta_semantic_matching/train.py +++ b/examples/roberta_semantic_matching/train.py @@ -27,7 +27,7 @@ cur_dir = os.path.dirname(os.path.abspath(__file__)) train_path = cur_dir + "/data/train.tsv" -model_dir = "./state_dict/" +model_dir = "./checkpoints/" maxlen = 256 auto_loader = AutoLoader("semantic-matching", diff --git a/flagai/data/dataset/block/blocklm_utils.py b/flagai/data/dataset/block/blocklm_utils.py index 4687305f..44fda3d2 100644 --- a/flagai/data/dataset/block/blocklm_utils.py +++ b/flagai/data/dataset/block/blocklm_utils.py @@ -86,10 +86,10 @@ def __init__(self, self.encoder_decoder = encoder_decoder self.shuffle_blocks = shuffle_blocks self.sentinel_token = sentinel_token - self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = 'gMASK' if task_mask else 'mask' self.generation_mask = self.tokenizer.get_command_id( self.generation_mask) - self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = 'sMASK' if task_mask else 'mask' self.gap_sentence_mask = self.tokenizer.get_command_id( self.gap_sentence_mask) 
self.random_position = random_position @@ -205,7 +205,7 @@ def make_masked_data(self, # position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): @@ -273,7 +273,7 @@ def make_block_data(self, elif task == 'gap_sentence': mask_id = self.gap_sentence_mask else: - mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_token = 'mask' if idx == 0 else f'MASK{idx}' mask_id = self.tokenizer.get_command_id(mask_token) local_spans.append((current_length, current_length + start - last)) source_tokens.append(tokens[last:start]) diff --git a/flagai/data/dataset/data_collator/collate_fn.py b/flagai/data/dataset/data_collator/collate_fn.py index 73b2f8e5..6eb629d5 100644 --- a/flagai/data/dataset/data_collator/collate_fn.py +++ b/flagai/data/dataset/data_collator/collate_fn.py @@ -126,7 +126,7 @@ def __init__(self, args, tokenizer, task_name): def encode(self, example): cls_id = self.tokenizer.get_command_id('cls') - mask_token = 'sMASK' if self.args.task_mask else 'MASK' + mask_token = 'sMASK' if self.args.task_mask else 'mask' mask_id = self.tokenizer.get_command_id(mask_token) pad_id = self.tokenizer.get_command_id('pad') sop_id = self.tokenizer.get_command_id('sop') @@ -175,7 +175,7 @@ def sub_finder(mylist, pattern): source_tokens = [cls_id] + source_tokens + [mask_id ] + answer_tokens elif self.task_name in ["cmrc"]: - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -191,7 +191,7 @@ def sub_finder(mylist, pattern): mask_id ] + source_tokens[:max_src_length] elif self.task_name in ["wsc"]: - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = 
self.tokenizer.get_command_id('mask') source_text = example.text_a target_text = example.meta["answer"].strip() question = example.meta["question"].strip() @@ -307,10 +307,10 @@ def __init__(self, self.encoder_decoder = encoder_decoder self.shuffle_blocks = shuffle_blocks self.sentinel_token = sentinel_token - self.generation_mask = 'gMASK' if task_mask else 'MASK' + self.generation_mask = 'gMASK' if task_mask else 'mask' self.generation_mask = self.tokenizer.get_command_id( self.generation_mask) - self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK' + self.gap_sentence_mask = 'sMASK' if task_mask else 'mask' self.gap_sentence_mask = self.tokenizer.get_command_id( self.gap_sentence_mask) self.random_position = random_position @@ -426,7 +426,7 @@ def make_masked_data(self, position_ids = np.arange(len(tokens), dtype=np.int64) targets = copy.deepcopy(tokens) - mask_id = self.tokenizer.get_command_id('MASK') + mask_id = self.tokenizer.get_command_id('mask') mlm_masks = np.zeros(len(tokens), dtype=np.int64) for start, end in block_spans: for idx in range(start, end): @@ -494,7 +494,7 @@ def make_block_data(self, elif task == 'gap_sentence': mask_id = self.gap_sentence_mask else: - mask_token = 'MASK' if idx == 0 else f'MASK{idx}' + mask_token = 'mask' if idx == 0 else f'MASK{idx}' mask_id = self.tokenizer.get_command_id(mask_token) local_spans.append((current_length, current_length + start - last)) source_tokens.append(tokens[last:start]) diff --git a/flagai/data/dataset/data_utils.py b/flagai/data/dataset/data_utils.py index 4f0ee38d..1efee372 100644 --- a/flagai/data/dataset/data_utils.py +++ b/flagai/data/dataset/data_utils.py @@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids, # Prepare ids for special tokens if mask_id is None: - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') # end of sentence token cls_id = tokenizer.get_command_id('cls') # start of sentence token sep_id = 
tokenizer.get_command_id('sep') # seperator of two texts token @@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids, # def build_decoder_input(enc_ids, answer_ids, max_seq_length, max_dec_seq_length, tokenizer): - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') sop_id = tokenizer.get_command_id('sop') masks = [] diff --git a/flagai/data/dataset/language_model/dataset.py b/flagai/data/dataset/language_model/dataset.py index b291251b..a911df81 100644 --- a/flagai/data/dataset/language_model/dataset.py +++ b/flagai/data/dataset/language_model/dataset.py @@ -38,7 +38,7 @@ def __init__(self, args, documents, tokenizer, num_original_tokens, self.left_weights = [0] + self.weights[:-1] self.unidirectional = args.unidirectional self.block_lm = args.block_lm - mask_token = "gMASK" if args.task_mask else 'MASK' + mask_token = "gMASK" if args.task_mask else 'mask' self.mask_id = self.tokenizer.get_command_id(mask_token) def __len__(self): @@ -115,7 +115,7 @@ def __init__(self, args, tokenizer, strict=True): self.strict = strict self.block_lm = args.block_lm self.unidirectional = args.unidirectional - mask_token = "gMASK" if args.task_mask else 'MASK' + mask_token = "gMASK" if args.task_mask else 'mask' self.mask_id = self.tokenizer.get_command_id(mask_token) self.tokens = [] diff --git a/flagai/data/dataset/seq2seq/dataset.py b/flagai/data/dataset/seq2seq/dataset.py index adc28149..b0bc4148 100644 --- a/flagai/data/dataset/seq2seq/dataset.py +++ b/flagai/data/dataset/seq2seq/dataset.py @@ -477,7 +477,7 @@ def __len__(self): def __getitem__(self, idx): example = self.example_list[idx] source_text, target_text = example.text_a, example.text_b - mask_token = 'MASK' + mask_token = 'mask' mask_id = self.tokenizer.get_command_id(mask_token) sop_id = self.tokenizer.get_command_id('sop') eop_id = self.tokenizer.get_command_id('eop') @@ -612,7 +612,7 @@ def __len__(self): def __getitem__(self, idx): 
example = self.example_list[idx] source_text = example.text_a - mask_token = 'gMASK' if self.args.task_mask else 'MASK' + mask_token = 'gMASK' if self.args.task_mask else 'mask' mask_id = self.tokenizer.get_command_id(mask_token) sop_id = self.tokenizer.get_command_id('sop') eop_id = self.tokenizer.get_command_id('eop') diff --git a/flagai/data/dataset/superglue/pvp.py b/flagai/data/dataset/superglue/pvp.py index d4d07b39..8a4d6ee3 100644 --- a/flagai/data/dataset/superglue/pvp.py +++ b/flagai/data/dataset/superglue/pvp.py @@ -97,12 +97,12 @@ def spell_length(self): @property def mask(self) -> str: """Return the underlying LM's mask token""" - return self.tokenizer.get_command_id('MASK') + return self.tokenizer.get_command_id('mask') @property def mask_id(self) -> int: """Return the underlying LM's mask id""" - return self.tokenizer.get_command_id('MASK') + return self.tokenizer.get_command_id('mask') @property def max_num_verbalizers(self) -> int: @@ -574,13 +574,13 @@ def spell_length(self): @property def mask(self) -> str: """Return the underlying LM's mask token""" - mask_token = 'MASK' + mask_token = 'mask' return self.tokenizer.get_command_id(mask_token) @property def mask_id(self) -> int: """Return the underlying LM's mask id""" - mask_token = 'MASK' + mask_token = 'mask' return self.tokenizer.get_command_id(mask_token) def get_answers(self, example: InputExample): diff --git a/flagai/data/tokenizer/bert/bert_tokenizer.py b/flagai/data/tokenizer/bert/bert_tokenizer.py index eec168ea..3c935713 100644 --- a/flagai/data/tokenizer/bert/bert_tokenizer.py +++ b/flagai/data/tokenizer/bert/bert_tokenizer.py @@ -75,7 +75,7 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', 
self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py index fdaf5be6..f028d0f0 100644 --- a/flagai/data/tokenizer/galactica/galactica_tokenizer.py +++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py @@ -15,7 +15,7 @@ def __init__(self, download_dir) -> None: self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py b/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py index b762b66b..e592d33d 100644 --- a/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py +++ b/flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py @@ -60,7 +60,7 @@ def __init__(self, self.text_tokenizer.encoder['']), CommandToken('cls', '[CLS]', self.text_tokenizer.encoder['']), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.text_tokenizer.encoder[''], lstrip=True), @@ -88,7 +88,7 @@ def __init__(self, CommandToken('sop', '<|startofpiece|>', self.num_tokens), CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1), CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.num_tokens + 3, lstrip=True), diff --git a/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py b/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py index 69048d3a..b91797f6 100644 --- 
a/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py +++ b/flagai/data/tokenizer/glm_large_ch/glm_large_ch_tokenizer.py @@ -55,7 +55,7 @@ def __init__(self, CommandToken('eos', '<|endoftext|>', self.num_text_tokens), CommandToken('sep', '[SEP]', self.num_text_tokens + 1), CommandToken('cls', '[CLS]', self.num_text_tokens + 2), - CommandToken('MASK', + CommandToken('mask', '[MASK]', self.num_text_tokens + 3, lstrip=True), diff --git a/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py b/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py index ff4e1e4a..db4c726f 100644 --- a/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py +++ b/flagai/data/tokenizer/glm_large_en/glm_large_en_tokenizer.py @@ -59,7 +59,7 @@ def __init__(self, self._command_tokens = [ CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']), CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.text_tokenizer.vocab['[MASK]']), CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']), CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']), diff --git a/flagai/data/tokenizer/opt/opt_en_tokenizer.py b/flagai/data/tokenizer/opt/opt_en_tokenizer.py index 5c1c0de8..9e8e528c 100644 --- a/flagai/data/tokenizer/opt/opt_en_tokenizer.py +++ b/flagai/data/tokenizer/opt/opt_en_tokenizer.py @@ -35,7 +35,7 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git 
a/flagai/data/tokenizer/roberta/roberta_tokenizer.py b/flagai/data/tokenizer/roberta/roberta_tokenizer.py index 553a8a83..f1b270e4 100644 --- a/flagai/data/tokenizer/roberta/roberta_tokenizer.py +++ b/flagai/data/tokenizer/roberta/roberta_tokenizer.py @@ -38,7 +38,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None): self._command_tokens = [ CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')), CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.get_specialid_from_text_tokenizer('mask')), CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')), CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')), diff --git a/flagai/data/tokenizer/t5/t5_tokenizer.py b/flagai/data/tokenizer/t5/t5_tokenizer.py index ef793b67..499aa83e 100644 --- a/flagai/data/tokenizer/t5/t5_tokenizer.py +++ b/flagai/data/tokenizer/t5/t5_tokenizer.py @@ -45,7 +45,7 @@ def __init__(self, tokenizer_model_type="t5-base", cache_dir=None): CommandToken('pad', '[PAD]', self.num_tokens + 1), CommandToken('cls', '[CLS]', self.num_tokens + 2), - CommandToken('MASK', '[MASK]', + CommandToken('mask', '[MASK]', self.num_tokens + 3), ] self._command_tokens.extend([ diff --git a/flagai/data/tokenizer/tokenizer.py b/flagai/data/tokenizer/tokenizer.py index c3ba085f..43585688 100644 --- a/flagai/data/tokenizer/tokenizer.py +++ b/flagai/data/tokenizer/tokenizer.py @@ -54,7 +54,7 @@ def __str__(self): ('sep', 4), ('L2R', 5), ('cls', 6), - ('MASK', 7), + ('mask', 7), ] DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS) """define some default type tokens for bert training""" diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py index 72077041..61d91ce6 100644 --- a/flagai/model/predictor/utils.py +++ b/flagai/model/predictor/utils.py @@ -1133,7 +1133,7 @@ def alm_beamsearch(model, tokenizer, 
text, out_max_length, beam_size, eod_token= dtype=torch.long) position_ids = torch.stack((position_ids, block_position_ids), dim=0) position_ids = position_ids.unsqueeze(0) - mask_tokens = ['MASK', 'sMASK', 'gMASK'] + mask_tokens = ['mask', 'sMASK', 'gMASK'] mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens] end_tokens = [tokenizer.get_command_id('eop'), eod_token] mask_positions = [] @@ -1434,7 +1434,7 @@ def glm_generate_sample( dtype=torch.long) position_ids = torch.stack((position_ids, block_position_ids), dim=0) position_ids = position_ids.unsqueeze(0) - mask_tokens = ['MASK', 'sMASK', 'gMASK'] + mask_tokens = ['mask', 'sMASK', 'gMASK'] mask_tokens = [tokenizer.get_command_id(token) for token in mask_tokens] end_tokens = [tokenizer.get_command_id('eop'), eod_token] mask_positions = [] diff --git a/flagai/test_utils.py b/flagai/test_utils.py index 83dacde3..5faa0aec 100644 --- a/flagai/test_utils.py +++ b/flagai/test_utils.py @@ -14,7 +14,7 @@ def build_input_from_ids(text_a_ids=None, mask_id=None, masked_lm=False): if mask_id is None: - mask_id = tokenizer.get_command_id('MASK') + mask_id = tokenizer.get_command_id('mask') eos_id = tokenizer.get_command_id('eos') cls_id = tokenizer.get_command_id('cls') sep_id = tokenizer.get_command_id('sep')