Commit 12433c8: updated

Signed-off-by: ftgreat <[email protected]>
ftgreat committed Mar 15, 2023
1 parent dbd5a4e
Showing 23 changed files with 46 additions and 80 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -260,6 +260,6 @@ The majority of FlagAI is licensed under the [Apache 2.0 license](LICENSE), howe
 ### ↳ Star History
 <div align="center">

-[![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)]
+![Star History Chart](https://api.star-history.com/svg?repos=FlagAI-Open/FlagAI&type=Date)]

 </div>
3 changes: 1 addition & 2 deletions examples/AltCLIP/altclip_finetuning.py
@@ -32,7 +32,6 @@
 trainer = Trainer(env_type="pytorch",
                   pytorch_device=device,
                   experiment_name="clip_finetuning",
-                  eval_interval=10,
                   batch_size=4,
                   lr=1e-4,
                   epochs=10,
@@ -63,4 +62,4 @@ def cifar10_collate_fn(batch):
     }

 if __name__ == "__main__":
-    trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn, metric_methods=["accuracy"])
+    trainer.train(model=model, train_dataset=dataset, collate_fn=cifar10_collate_fn)
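Taken together, the two hunks drop periodic evaluation from this example: without `eval_interval` and `metric_methods=["accuracy"]`, the Trainer simply fine-tunes. A minimal sketch of the resulting flow, wrapped as a function so it is self-contained; the `from flagai.trainer import Trainer` path is assumed from other FlagAI examples, and `model`, `dataset`, and `cifar10_collate_fn` are the objects the example file builds earlier:

import torch
from flagai.trainer import Trainer  # import path assumed from FlagAI examples

def finetune(model, dataset, collate_fn):
    """Run the example's fine-tuning loop, post-change: no periodic eval."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trainer = Trainer(env_type="pytorch",
                      pytorch_device=device,
                      experiment_name="clip_finetuning",
                      batch_size=4,
                      lr=1e-4,
                      epochs=10)
    # metric_methods is gone, so no accuracy metric is computed during training.
    trainer.train(model=model, train_dataset=dataset, collate_fn=collate_fn)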
7 changes: 2 additions & 5 deletions examples/AltDiffusion/generate.py
@@ -17,9 +17,6 @@
 model.eval()
 model.to(device)
 predictor = Predictor(model)
-# predictor.predict_generate_images(
-#     "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation"
-# )
 predictor.predict_generate_images(
-    "Anime portrait of a crouching smiling baby inside a glass bottle, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation"
-)
+    "Anime portrait of natalie portman as an anime girl by stanley artgerm lau, wlop, rossdraws, james jean, andrei riabovitchev, marc simonetti, and sakimichan, trending on artstation"
+)
2 changes: 1 addition & 1 deletion examples/bert_title_generation_english/generate.py
@@ -7,7 +7,7 @@

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-model_dir = "../state_dict/"
+model_dir = "./checkpoints/"

 # Note "./checkpoints_seq2seq/{}/mp_rank_00_model_states.pt", {} is a directory in the checkpoints_seq2seq.
 model_save_path = "./checkpoints_seq2seq/7079/mp_rank_00_model_states.pt"
42 changes: 6 additions & 36 deletions examples/glm_blank_filling/glm_generate_samples.py
@@ -5,58 +5,29 @@
 from flagai.model.glm_model import GLMModel
 from flagai.data.tokenizer import Tokenizer
 from flagai.model.predictor.predictor import Predictor
-import bminf

 if __name__ == "__main__":
     """Main training program."""
     print('Generate Samples')
     # Random seeds for reproducability.
     # Model,
-    model_name = 'GLM-10b-ch'
+    model_name = 'GLM-large-ch'
     model = GLMModel.from_pretrain(model_name=model_name,
                                    download_path="./checkpoints")
     tokenizer = Tokenizer.from_pretrained(model_name)
-    with torch.cuda.device(0):
-        model = bminf.wrapper(model, quantization=False, memory_limit=30 << 39)
-        predictor = Predictor(model, tokenizer)
-<<<<<<< HEAD
-
-    # model.load_state_dict(torch.load("../glm_pretrain/checkpoints/1000/pytorch_model.bin")["module"])
-    model.cuda(torch.cuda.current_device())
-
-    text = ["今天天气不错[gMASK]"]
-=======
+    predictor = Predictor(model, tokenizer)
     # generate samples
     text = [
         '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]"
     ]
->>>>>>> master
     for t in text:
         output = predictor.predict_generate_randomsample(
             t, top_k=50, repetition_penalty=4.0, top_p=1.0)
         print(t, '\n', output)
-<<<<<<< HEAD
-
-    # text = [
-    #     '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]"
-    # ]
-    # for t in text:
-    #     output = predictor.predict_generate_randomsample(
-    #         t, top_k=50, repetition_penalty=4.0, top_p=1.0)
-    #     print(t, '\n', output)
-
-    # text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"]
-    # for t in text:
-    #     output = predictor.predict_generate_randomsample(
-    #         t, top_k=50, repetition_penalty=4.0, top_p=1.0)
-    #     print(t, '\n', output)
-
-    # text = [
-    #     "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。",
-    #     "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。"
-    # ]
-    # for t in text:
-    #     output = predictor.predict_generate_randomsample(
-    #         t, top_k=50, repetition_penalty=4.0, top_p=1.0)
-    #     print(t, '\n', output)
-=======

     text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"]
     for t in text:
@@ -71,5 +42,4 @@
     for t in text:
         output = predictor.predict_generate_randomsample(
             t, top_k=50, repetition_penalty=4.0, top_p=1.0)
-        print(t, '\n', output)
->>>>>>> master
+        print(t, '\n', output)
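Most of this file's diff removes merge-conflict markers (`<<<<<<< HEAD`, `=======`, `>>>>>>> master`) that had been committed into the example, keeping the master-branch side, and it also drops the bminf wrapper and swaps `GLM-10b-ch` for the smaller `GLM-large-ch`. The resolved example reduces to the following pattern; this sketch is assembled only from calls that appear in the diff itself:

# Sketch of the resolved blank-filling example, using only APIs shown above.
from flagai.model.glm_model import GLMModel
from flagai.data.tokenizer import Tokenizer
from flagai.model.predictor.predictor import Predictor

if __name__ == "__main__":
    model_name = 'GLM-large-ch'
    model = GLMModel.from_pretrain(model_name=model_name,
                                   download_path="./checkpoints")
    tokenizer = Tokenizer.from_pretrained(model_name)
    predictor = Predictor(model, tokenizer)

    # [gMASK] asks the model for open-ended generation; this prompt reads
    # "Question: Is beer bad for the stomach? Answer: [gMASK]".
    text = ['问题:啤酒伤胃吗?回答:[gMASK]']
    for t in text:
        output = predictor.predict_generate_randomsample(
            t, top_k=50, repetition_penalty=4.0, top_p=1.0)
        print(t, '\n', output)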
2 changes: 1 addition & 1 deletion examples/roberta_semantic_matching/train.py
@@ -27,7 +27,7 @@

 cur_dir = os.path.dirname(os.path.abspath(__file__))
 train_path = cur_dir + "/data/train.tsv"
-model_dir = "./state_dict/"
+model_dir = "./checkpoints/"
 maxlen = 256

 auto_loader = AutoLoader("semantic-matching",
8 changes: 4 additions & 4 deletions flagai/data/dataset/block/blocklm_utils.py
@@ -86,10 +86,10 @@ def __init__(self,
         self.encoder_decoder = encoder_decoder
         self.shuffle_blocks = shuffle_blocks
         self.sentinel_token = sentinel_token
-        self.generation_mask = 'gMASK' if task_mask else 'MASK'
+        self.generation_mask = 'gMASK' if task_mask else 'mask'
         self.generation_mask = self.tokenizer.get_command_id(
             self.generation_mask)
-        self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
+        self.gap_sentence_mask = 'sMASK' if task_mask else 'mask'
         self.gap_sentence_mask = self.tokenizer.get_command_id(
             self.gap_sentence_mask)
         self.random_position = random_position
@@ -205,7 +205,7 @@ def make_masked_data(self,
         #
         position_ids = np.arange(len(tokens), dtype=np.int64)
         targets = copy.deepcopy(tokens)
-        mask_id = self.tokenizer.get_command_id('MASK')
+        mask_id = self.tokenizer.get_command_id('mask')
         mlm_masks = np.zeros(len(tokens), dtype=np.int64)
         for start, end in block_spans:
             for idx in range(start, end):
@@ -273,7 +273,7 @@ def make_block_data(self,
             elif task == 'gap_sentence':
                 mask_id = self.gap_sentence_mask
             else:
-                mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
+                mask_token = 'mask' if idx == 0 else f'MASK{idx}'
                 mask_id = self.tokenizer.get_command_id(mask_token)
             local_spans.append((current_length, current_length + start - last))
             source_tokens.append(tokens[last:start])
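GLM distinguishes three mask types: `[MASK]` for short in-sentence blanks, `[sMASK]` for gap sentences, and `[gMASK]` for open-ended generation (the prompts in the generation example above use all three). The rename only lowercases the command name of the generic mask; its surface token stays `[MASK]`, and the task-specific names keep their casing. A small self-contained sketch mirroring the selection rule in the hunks above:

# Mirror of the mask-selection logic after the rename: masks are looked up
# by command name, so call sites must match the registered 'mask' name.
def select_mask_names(task_mask):
    generation_mask = 'gMASK' if task_mask else 'mask'
    gap_sentence_mask = 'sMASK' if task_mask else 'mask'
    return generation_mask, gap_sentence_mask

assert select_mask_names(True) == ('gMASK', 'sMASK')
assert select_mask_names(False) == ('mask', 'mask')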
14 changes: 7 additions & 7 deletions flagai/data/dataset/data_collator/collate_fn.py
@@ -126,7 +126,7 @@ def __init__(self, args, tokenizer, task_name):

     def encode(self, example):
         cls_id = self.tokenizer.get_command_id('cls')
-        mask_token = 'sMASK' if self.args.task_mask else 'MASK'
+        mask_token = 'sMASK' if self.args.task_mask else 'mask'
         mask_id = self.tokenizer.get_command_id(mask_token)
         pad_id = self.tokenizer.get_command_id('pad')
         sop_id = self.tokenizer.get_command_id('sop')
@@ -175,7 +175,7 @@ def sub_finder(mylist, pattern):
             source_tokens = [cls_id] + source_tokens + [mask_id
                                                         ] + answer_tokens
         elif self.task_name in ["cmrc"]:
-            mask_id = self.tokenizer.get_command_id('MASK')
+            mask_id = self.tokenizer.get_command_id('mask')
             source_text = example.text_a
             target_text = example.meta["answer"].strip()
             question = example.meta["question"].strip()
@@ -191,7 +191,7 @@ def sub_finder(mylist, pattern):
                 mask_id
             ] + source_tokens[:max_src_length]
         elif self.task_name in ["wsc"]:
-            mask_id = self.tokenizer.get_command_id('MASK')
+            mask_id = self.tokenizer.get_command_id('mask')
             source_text = example.text_a
             target_text = example.meta["answer"].strip()
             question = example.meta["question"].strip()
@@ -307,10 +307,10 @@ def __init__(self,
         self.encoder_decoder = encoder_decoder
         self.shuffle_blocks = shuffle_blocks
         self.sentinel_token = sentinel_token
-        self.generation_mask = 'gMASK' if task_mask else 'MASK'
+        self.generation_mask = 'gMASK' if task_mask else 'mask'
         self.generation_mask = self.tokenizer.get_command_id(
             self.generation_mask)
-        self.gap_sentence_mask = 'sMASK' if task_mask else 'MASK'
+        self.gap_sentence_mask = 'sMASK' if task_mask else 'mask'
         self.gap_sentence_mask = self.tokenizer.get_command_id(
             self.gap_sentence_mask)
         self.random_position = random_position
@@ -426,7 +426,7 @@ def make_masked_data(self,

         position_ids = np.arange(len(tokens), dtype=np.int64)
         targets = copy.deepcopy(tokens)
-        mask_id = self.tokenizer.get_command_id('MASK')
+        mask_id = self.tokenizer.get_command_id('mask')
         mlm_masks = np.zeros(len(tokens), dtype=np.int64)
         for start, end in block_spans:
             for idx in range(start, end):
@@ -494,7 +494,7 @@ def make_block_data(self,
             elif task == 'gap_sentence':
                 mask_id = self.gap_sentence_mask
             else:
-                mask_token = 'MASK' if idx == 0 else f'MASK{idx}'
+                mask_token = 'mask' if idx == 0 else f'MASK{idx}'
                 mask_id = self.tokenizer.get_command_id(mask_token)
             local_spans.append((current_length, current_length + start - last))
             source_tokens.append(tokens[last:start])
4 changes: 2 additions & 2 deletions flagai/data/dataset/data_utils.py
@@ -134,7 +134,7 @@ def build_input_from_ids(text_a_ids,

     # Prepare ids for special tokens
     if mask_id is None:
-        mask_id = tokenizer.get_command_id('MASK')
+        mask_id = tokenizer.get_command_id('mask')
     eos_id = tokenizer.get_command_id('eos')  # end of sentence token
     cls_id = tokenizer.get_command_id('cls')  # start of sentence token
     sep_id = tokenizer.get_command_id('sep')  # seperator of two texts token
@@ -235,7 +235,7 @@ def build_input_from_ids(text_a_ids,
 #
 def build_decoder_input(enc_ids, answer_ids, max_seq_length,
                         max_dec_seq_length, tokenizer):
-    mask_id = tokenizer.get_command_id('MASK')
+    mask_id = tokenizer.get_command_id('mask')
     eos_id = tokenizer.get_command_id('eos')
     sop_id = tokenizer.get_command_id('sop')
     masks = []
4 changes: 2 additions & 2 deletions flagai/data/dataset/language_model/dataset.py
@@ -38,7 +38,7 @@ def __init__(self, args, documents, tokenizer, num_original_tokens,
         self.left_weights = [0] + self.weights[:-1]
         self.unidirectional = args.unidirectional
         self.block_lm = args.block_lm
-        mask_token = "gMASK" if args.task_mask else 'MASK'
+        mask_token = "gMASK" if args.task_mask else 'mask'
         self.mask_id = self.tokenizer.get_command_id(mask_token)

     def __len__(self):
@@ -115,7 +115,7 @@ def __init__(self, args, tokenizer, strict=True):
         self.strict = strict
         self.block_lm = args.block_lm
         self.unidirectional = args.unidirectional
-        mask_token = "gMASK" if args.task_mask else 'MASK'
+        mask_token = "gMASK" if args.task_mask else 'mask'
        self.mask_id = self.tokenizer.get_command_id(mask_token)

         self.tokens = []
4 changes: 2 additions & 2 deletions flagai/data/dataset/seq2seq/dataset.py
@@ -477,7 +477,7 @@ def __len__(self):
     def __getitem__(self, idx):
         example = self.example_list[idx]
         source_text, target_text = example.text_a, example.text_b
-        mask_token = 'MASK'
+        mask_token = 'mask'
         mask_id = self.tokenizer.get_command_id(mask_token)
         sop_id = self.tokenizer.get_command_id('sop')
         eop_id = self.tokenizer.get_command_id('eop')
@@ -612,7 +612,7 @@ def __len__(self):
     def __getitem__(self, idx):
         example = self.example_list[idx]
         source_text = example.text_a
-        mask_token = 'gMASK' if self.args.task_mask else 'MASK'
+        mask_token = 'gMASK' if self.args.task_mask else 'mask'
         mask_id = self.tokenizer.get_command_id(mask_token)
         sop_id = self.tokenizer.get_command_id('sop')
         eop_id = self.tokenizer.get_command_id('eop')
8 changes: 4 additions & 4 deletions flagai/data/dataset/superglue/pvp.py
@@ -97,12 +97,12 @@ def spell_length(self):
     @property
     def mask(self) -> str:
         """Return the underlying LM's mask token"""
-        return self.tokenizer.get_command_id('MASK')
+        return self.tokenizer.get_command_id('mask')

     @property
     def mask_id(self) -> int:
         """Return the underlying LM's mask id"""
-        return self.tokenizer.get_command_id('MASK')
+        return self.tokenizer.get_command_id('mask')

     @property
     def max_num_verbalizers(self) -> int:
@@ -574,13 +574,13 @@ def spell_length(self):
     @property
     def mask(self) -> str:
         """Return the underlying LM's mask token"""
-        mask_token = 'MASK'
+        mask_token = 'mask'
         return self.tokenizer.get_command_id(mask_token)

     @property
     def mask_id(self) -> int:
         """Return the underlying LM's mask id"""
-        mask_token = 'MASK'
+        mask_token = 'mask'
         return self.tokenizer.get_command_id(mask_token)

     def get_answers(self, example: InputExample):
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/bert/bert_tokenizer.py
@@ -75,7 +75,7 @@ def __init__(self, tokenizer_model_type=None, cache_dir=None):
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
             CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.get_specialid_from_text_tokenizer('mask')),
             CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
             CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/galactica/galactica_tokenizer.py
@@ -15,7 +15,7 @@ def __init__(self, download_dir) -> None:
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
             CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.get_specialid_from_text_tokenizer('mask')),
             CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
             CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
4 changes: 2 additions & 2 deletions flagai/data/tokenizer/glm_10b_en/glm_10b_en_bpe_tokenizer.py
@@ -60,7 +60,7 @@ def __init__(self,
                          self.text_tokenizer.encoder['</s>']),
             CommandToken('cls', '[CLS]',
                          self.text_tokenizer.encoder['<s>']),
-            CommandToken('MASK',
+            CommandToken('mask',
                          '[MASK]',
                          self.text_tokenizer.encoder['<mask>'],
                          lstrip=True),
@@ -88,7 +88,7 @@ def __init__(self,
             CommandToken('sop', '<|startofpiece|>', self.num_tokens),
             CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1),
             CommandToken('cls', '[CLS]', self.num_tokens + 2),
-            CommandToken('MASK',
+            CommandToken('mask',
                          '[MASK]',
                          self.num_tokens + 3,
                          lstrip=True),
@@ -55,7 +55,7 @@ def __init__(self,
             CommandToken('eos', '<|endoftext|>', self.num_text_tokens),
             CommandToken('sep', '[SEP]', self.num_text_tokens + 1),
             CommandToken('cls', '[CLS]', self.num_text_tokens + 2),
-            CommandToken('MASK',
+            CommandToken('mask',
                          '[MASK]',
                          self.num_text_tokens + 3,
                          lstrip=True),
@@ -59,7 +59,7 @@ def __init__(self,
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']),
             CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.text_tokenizer.vocab['[MASK]']),
             CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']),
             CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']),
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/opt/opt_en_tokenizer.py
@@ -35,7 +35,7 @@ def __init__(self, tokenizer_model_type="facebook/opt-125m", cache_dir=None):
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
             CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.get_specialid_from_text_tokenizer('mask')),
             CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
             CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/roberta/roberta_tokenizer.py
@@ -38,7 +38,7 @@ def __init__(self, tokenizer_model_type="roberta-base", cache_dir=None):
         self._command_tokens = [
             CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
             CommandToken('cls', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.get_specialid_from_text_tokenizer('mask')),
             CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
             CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/t5/t5_tokenizer.py
@@ -45,7 +45,7 @@ def __init__(self, tokenizer_model_type="t5-base", cache_dir=None):

             CommandToken('pad', '[PAD]', self.num_tokens + 1),
             CommandToken('cls', '[CLS]', self.num_tokens + 2),
-            CommandToken('MASK', '[MASK]',
+            CommandToken('mask', '[MASK]',
                          self.num_tokens + 3),
         ]
         self._command_tokens.extend([
2 changes: 1 addition & 1 deletion flagai/data/tokenizer/tokenizer.py
@@ -54,7 +54,7 @@ def __str__(self):
     ('sep', 4),
     ('L2R', 5),
     ('cls', 6),
-    ('MASK', 7),
+    ('mask', 7),
 ]
 DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS)
 """define some default type tokens for bert training"""
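The call-site changes in the earlier files only work because this is where the names are registered: a `CommandToken` binds a command name (`'mask'`), a surface token (`'[MASK]'`), and a token id, and `get_command_id` resolves by the command name. A toy illustration of that contract (not the real FlagAI classes, which live in `flagai.data.tokenizer`), showing why a stale `'MASK'` lookup fails after the rename:

# Toy model of the command-token contract implied by the diffs above.
class CommandToken:
    def __init__(self, name, token, token_id, lstrip=False):
        self.name = name      # lookup key, e.g. 'mask'
        self.token = token    # surface form, e.g. '[MASK]'
        self.Id = token_id
        self.lstrip = lstrip

class ToyTokenizer:
    def __init__(self, command_tokens):
        self._name_to_id = {ct.name: ct.Id for ct in command_tokens}

    def get_command_id(self, name):
        return self._name_to_id[name]

tok = ToyTokenizer([CommandToken('mask', '[MASK]', 7)])
assert tok.get_command_id('mask') == 7   # renamed call sites succeed
try:
    tok.get_command_id('MASK')           # stale pre-rename call site
except KeyError:
    print("old 'MASK' name is no longer registered")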