From 9dc6e10ffe45e4d97aca72a54834ebde61f072be Mon Sep 17 00:00:00 2001
From: Anhforth
Date: Mon, 20 Feb 2023 16:10:47 +0800
Subject: [PATCH] updated

Signed-off-by: Anhforth
---
 examples/glm_blank_filling/glm_generate_samples.py |  3 ++-
 .../glm_blank_filling/glm_generate_samples_en.py   |  3 ++-
 examples/t5_title_generation/generate.py           | 14 +++++++-------
 flagai/data/tokenizer/uni_tokenizer/tokenizer.py   | 11 ++++++++++-
 flagai/model/predictor/utils.py                    |  3 ++-
 tests/test_tokenizer.py                            |  4 ++--
 6 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/examples/glm_blank_filling/glm_generate_samples.py b/examples/glm_blank_filling/glm_generate_samples.py
index 40e385a9..f290f662 100644
--- a/examples/glm_blank_filling/glm_generate_samples.py
+++ b/examples/glm_blank_filling/glm_generate_samples.py
@@ -1,7 +1,8 @@
 # Copyright © 2022 BAAI. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License")
-
+import sys
+sys.path.append("/home/yanzhaodong/anhforth/FlagAI")
 import torch
 from flagai.model.glm_model import GLMModel
 from flagai.data.tokenizer import Tokenizer
diff --git a/examples/glm_blank_filling/glm_generate_samples_en.py b/examples/glm_blank_filling/glm_generate_samples_en.py
index 009b4ed1..96347ec8 100644
--- a/examples/glm_blank_filling/glm_generate_samples_en.py
+++ b/examples/glm_blank_filling/glm_generate_samples_en.py
@@ -1,7 +1,8 @@
 # Copyright © 2022 BAAI. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License")
-
+import sys
+sys.path.append("/home/yanzhaodong/anhforth/FlagAI")
 import torch
 from flagai.model.glm_model import GLMModel
 from flagai.data.tokenizer import Tokenizer
diff --git a/examples/t5_title_generation/generate.py b/examples/t5_title_generation/generate.py
index 8e1b630c..5a4910e8 100644
--- a/examples/t5_title_generation/generate.py
+++ b/examples/t5_title_generation/generate.py
@@ -19,12 +19,12 @@
                                               beam_size=3,
                                               input_max_length=512,
                                               out_max_length=100)
-    out_2 = predictor.predict_generate_randomsample(text,
-                                                    input_max_length=512,
-                                                    out_max_length=100,
-                                                    repetition_penalty=1.5,
-                                                    top_k=20,
-                                                    top_p=0.8)
+# out_2 = predictor.predict_generate_randomsample(text,
+#                                                 input_max_length=512,
+#                                                 out_max_length=100,
+#                                                 repetition_penalty=1.5,
+#                                                 top_k=20,
+#                                                 top_p=0.8)
 
     print(f"out_1 is {out_1}")
-    print(f"out_2 is {out_2}")
+# print(f"out_2 is {out_2}")
diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
index 876c0989..30db5574 100644
--- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
+++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
@@ -52,6 +52,7 @@ def __init__(self,
                  add_task_mask=True,
                  add_decoder_mask=False,
                  fix_command_token=True,
+                 pre_tokenizer=None,
                  **kwargs):
         super().__init__(**kwargs)
         if self.tokenizer_class == "wp":
@@ -75,6 +76,9 @@ def __init__(self,
         if self.tokenizer_model_name.lower().startswith('glm') or self.tokenizer_model_name.lower().startswith('alm'):
             add_block_symbols=True
         # self.is_clip = self.tokenizer_model_name.startswith('clip')
+        # if self.tokenizer_model_name.startswith('t5'):
+        #     import jieba
+        #     self.pre_tokenizer = lambda x: jieba.cut(x, HMM=False)
         self.num_tokens = self.text_tokenizer.vocab_size
         with open(self.special_tokens_map, encoding='utf8') as file: dct=json.load(file)
         sp_tokens = [(k.replace("_token",""),v['content']) for k,v in dct.items()]
@@ -590,7 +594,8 @@ def encode_plus_non_glm(
             truncation=True,
             max_length=None,
     ):
-
+        if self.tokenizer_model_name.startswith('t5'):
+            assert second_text is None, "t5 does not support multi-sentence encoding"
         def get_input_ids(text):
             tokens = self.text_tokenizer.tokenize(text)
             return self.text_tokenizer.convert_tokens_to_ids(tokens)
@@ -753,6 +758,10 @@ def tokenize_as_tensor(self, texts):
                                eot_token=eot_token)
 
     def tokenize(self, text, maxlen=None, add_spatial_tokens=False):
+        """
+        add_spatial_tokens: (bool) Add cls at the front and sep at the end
+        maxlen: Truncate the length to maxlen
+        """
         tokens = self.text_tokenizer.tokenize(text)
 
         if add_spatial_tokens:
diff --git a/flagai/model/predictor/utils.py b/flagai/model/predictor/utils.py
index 22eab1da..6d78c842 100644
--- a/flagai/model/predictor/utils.py
+++ b/flagai/model/predictor/utils.py
@@ -928,8 +928,9 @@ def t5_random_sample(model, tokenizer, text, input_max_length, out_max_length,
         TopPLogitsProcessor(top_p=top_p),
     ]
     list_processor = ListProcessor(lp)
+    from tqdm import trange
     with torch.no_grad():
-        for step in range(out_max_length):
+        for step in trange(out_max_length):
             scores = model(**{
                 "input_ids": token_ids,
                 "decoder_input_ids": input_decoder_ids
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 686d3fa0..666bda4f 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -49,8 +49,8 @@ def test_tokenizer_t5(self):
         self.assertEqual(tokenizer.DecodeIds([3, 7704, 3832, 656, 140, 1095]),
                          'fried chicken makes me happy', 'DecodeIds Error')
         self.assertEqual([(v.name, k,v.Id) for k,v in tokenizer.command_token_map.items()],
-                         [('eos', '<|endoftext|>', 32000), ('sep', '[SEP]', 32001), ('cls', '[CLS]', 32002),
-                          ('MASK', '[MASK]', 32003), ('unk', '[UNK]', 32004)])
+                         [('eos', '[PAD]', 0), ('cls', '[CLS]', 101), ('MASK', '[MASK]', 103),
+                          ('unk', '[UNK]', 100), ('sep', '[SEP]', 102)])
 
     # def test_tokenizer_roberta(self):
     #     tokenizer = Tokenizer.from_pretrained('RoBERTa-base-ch')