saved work on 7.6
Signed-off-by: Anhforth <[email protected]>
Anhforth committed Jul 6, 2022
1 parent 7c8c0b1 commit 265d35a
Showing 39 changed files with 1,924 additions and 302 deletions.
2 changes: 1 addition & 1 deletion doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -118,7 +118,7 @@ class GLMTitleGenerationCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
-my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMTitleGenerationDataset(train_src,
train_tgt)
```
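The same call-site rewrite appears in every tutorial and example touched by this commit: special-token ids are now read with `tokenizer.get_command_id(name)` instead of `tokenizer.get_command(name).Id`. As a hedged illustration of the two shapes (a self-contained mock, not FlagAI's actual tokenizer classes):

```python
# Self-contained mock of the call-site change; FlagAI's real tokenizer is assumed
# to expose both shapes around this commit, this toy class is only illustrative.
from collections import namedtuple

CommandToken = namedtuple('CommandToken', ['name', 'token', 'Id'])

class MockTokenizer:
    def __init__(self):
        self._command_tokens = {'pad': CommandToken('pad', '[PAD]', 0)}

    def get_command(self, name):
        # old style: return the CommandToken object, caller reads .Id
        return self._command_tokens[name]

    def get_command_id(self, name):
        # new style: return the integer id directly
        return self._command_tokens[name].Id

tok = MockTokenizer()
assert tok.get_command('pad').Id == tok.get_command_id('pad') == 0
```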
2 changes: 1 addition & 1 deletion doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -131,7 +131,7 @@ class GLMPoetryDynamicCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
-my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMPoetryDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion docs/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -119,7 +119,7 @@ class GLMTitleGenerationCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
-my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMTitleGenerationDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion docs/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -122,7 +122,7 @@ class GLMPoetryDynamicCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
-my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMPoetryDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion examples/glm_large_en_tokenizer.py
@@ -59,7 +59,7 @@ def __init__(self,

self._command_tokens = [
CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']),
-CommandToken('ENC', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
+CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
CommandToken('MASK', '[MASK]',
self.text_tokenizer.vocab['[MASK]']),
CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']),
2 changes: 1 addition & 1 deletion examples/glm_poetry_generation/train.py
@@ -133,7 +133,7 @@ def __call__(self, batch):
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMPoetryDynamicCollateFN(
-    pad_id=tokenizer.get_command('pad').Id)
+    pad_id=tokenizer.get_command_id('pad'))
train_dataset = BertSeq2seqDataset(train_src, train_tgt)

trainer.train(model, train_dataset=train_dataset, collate_fn=my_collate_fn)
2 changes: 1 addition & 1 deletion examples/glm_pretrain/train.py
@@ -59,7 +59,7 @@ def create_dataset(tokenizer, should_split):
collate_fn = None
if ds_args.block_lm:
collate_fn = ConstructBlockStrategy(
-    tokenizer, 512, eod_token=tokenizer.get_command('eos').Id)
+    tokenizer, 512, eod_token=tokenizer.get_command_id('eos'))
metric_methods = DEFAULT_METRICS['pretrain']
trainer.train(model,
collate_fn=collate_fn,
68 changes: 68 additions & 0 deletions examples/glm_superglue/tst_superglue.py
@@ -0,0 +1,68 @@
# Copyright © 2022 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
import torch
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze, GLMForMultiTokenCloze
from ..uni_tokenizer.glm_tokenizer import GLMTokenizer
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS
from flagai.data.dataset import ConstructSuperglueStrategy
from flagai.data.dataset.superglue.control import MULTI_TOKEN_TASKS

for task_name in [
        'boolq'
]:
    trainer = Trainer(env_type='pytorch',
                      epochs=1,
                      batch_size=1,
                      eval_interval=100,
                      log_interval=50,
                      experiment_name='glm_large',
                      pytorch_device='cuda',
                      load_dir=None,
                      fp16=True,
                      lr=1e-4,
                      save_interval=10)
    print("downloading...")

    cl_args = CollateArguments()
    cl_args.multi_token = task_name in MULTI_TOKEN_TASKS
    if task_name in CH_TASKS:
        model_name = 'GLM-large-ch'
        tokenizer = GLMTokenizer.from_pretrained(model_name)
    else:
        model_name = 'GLM-large-en'
        tokenizer = GLMTokenizer.from_pretrained(model_name)

    if task_name in MULTI_TOKEN_TASKS:
        model = GLMForMultiTokenCloze.from_pretrain(
            model_name=model_name, only_download_config=True)
    else:
        model = GLMForSingleTokenCloze.from_pretrain(
            model_name=model_name, only_download_config=True)

    train_dataset = SuperGlueDataset(task_name=task_name,
                                     data_dir='./datasets/',
                                     dataset_type='train',
                                     tokenizer=tokenizer)
    train_dataset.example_list = train_dataset.example_list[:1]
    collate_fn = ConstructSuperglueStrategy(cl_args,
                                            tokenizer,
                                            task_name=task_name)

    valid_dataset = SuperGlueDataset(task_name=task_name,
                                     data_dir='./datasets/',
                                     dataset_type='dev',
                                     tokenizer=tokenizer)
    valid_dataset.example_list = valid_dataset.example_list[:1]
    print(task_name)
    metric_methods = DEFAULT_METRICS[task_name]
    trainer.train(model,
                  collate_fn=collate_fn,
                  train_dataset=train_dataset,
                  valid_dataset=valid_dataset,
                  metric_methods=metric_methods)


2 changes: 1 addition & 1 deletion examples/glm_title_generation/train.py
@@ -130,7 +130,7 @@ def __call__(self, batch):

sents_src, sents_tgt = read_file()
my_collate_fn = GLMPoetryDynamicCollateFN(
-    pad_id=tokenizer.get_command('pad').Id)
+    pad_id=tokenizer.get_command_id('pad'))

data_len = len(sents_tgt)
train_size = int(data_len * 0.8)
66 changes: 39 additions & 27 deletions examples/tst_tokenizer.py
@@ -8,35 +8,47 @@
# from examples.uni_tokenizer.base_tokenizer import BaseTokenizer
# tokenizer = BaseTokenizer.from_pretrained('GLM-large-en')

from flagai.data.tokenizer.glm_large_en.wordpiece import GLMLargeEnTokenizer
from flagai.data.tokenizer.glm_10b_en.glm_10b_en_tokenizer import GLM10bENTokenizer

class Animal(object):
    @classmethod
    def move(cls):
        # return cls.jump(cls,8,12)
        return cls(8,12)
    def __init__(self, name=None, age=None):
        print(name, age)
        # super(Animal, self).__init__()
        self.name = name
        self.age = age
        print('parent')
tokenizer = GLMLargeEnTokenizer.from_pretrained("GLM-large-en")
# tokenizer = GLM10bENTokenizer.from_pretrained("gpt2")
print(tokenizer.vocab['[CLS]'])
print(tokenizer.vocab['[UNK]'])
print(tokenizer.vocab['[SEP]'])
print(tokenizer.vocab['<|startofpiece|>'])
print(tokenizer.vocab['<|endofpiece|>'])
print(tokenizer.vocab_size())
# print(list(dict(tokenizer.encoder).items())[-2:])

# def jump(self, name, age):
# print("jump")

class cat(Animal):
    def __init__(self, age, piece, **kwargs):
        super().__init__(**kwargs)
        self.age = age+age
        self.piece = 8
        print("is it?")


    def jump(self, name, age):
        self.age = 13
        print(self.age)

a = cat.move()
# class Animal(object):
# @classmethod
# def move(cls):
# # return cls.jump(cls,8,12)
# return cls(8,12)
# def __init__(self, name=None, age=None):
# print(name, age)
# # super(Animal, self).__init__()
# self.name = name
# self.age = age
# print('parent')
#
# # def jump(self, name, age):
# # print("jump")
#
# class cat(Animal):
# def __init__(self, age, piece, **kwargs):
# super().__init__(**kwargs)
# self.age = age+age
# self.piece = 8
# print("is it?")
#
#
# def jump(self, name, age):
# self.age = 13
# print(self.age)
#
# a = cat.move()
# a.jump(2,4)
# a = cat(age=9,piece=7).move()
# print(a.piece)
5 changes: 3 additions & 2 deletions examples/uni_tokenizer/base_tokenizer.py
@@ -24,8 +24,8 @@ def from_pretrained(cls,
merges_file = 'merges.txt'
sp_model_file = 'spiece.model'
if cache_dir is None:
-    cache_dir = os.path.join(os.path.dirname(__file__), 'vocabs')

+    # cache_dir = os.path.join(os.path.dirname(__file__), 'vocabs')
+    cache_dir = "/root/.cache/FlagAI/"+tokenizer_model_name
tokenizer_class = ""
# search the cache directory for certain files

@@ -70,6 +70,7 @@ def from_pretrained(cls,
else:
raise NotImplementedError("Cannot find a tokenizer class that matches the files settings in the directory or ModelHub")


def __init__(self,
vocab_file=None,
merges_file=None,
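The base tokenizer change swaps the default vocab cache location. Restated in isolation (the two paths are copied from the diff above; the model name is just an example value):

```python
# Sketch comparing the old and new vocab lookup locations.
import os

tokenizer_model_name = "GLM-large-en"  # example value, not the only supported name

# Removed: vocab files resolved relative to the tokenizer module itself.
old_cache_dir = os.path.join(os.path.dirname(__file__), 'vocabs')

# Added: vocab files are now expected under the FlagAI download cache.
new_cache_dir = "/root/.cache/FlagAI/" + tokenizer_model_name

print(old_cache_dir)
print(new_cache_dir)
```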
48 changes: 13 additions & 35 deletions examples/uni_tokenizer/bpe_tokenizer.py
@@ -35,7 +35,6 @@ def __init__(self,
vocab_file,
merges_file,
errors='replace',
-    special_tokens=None,
max_len=None,
**kwargs):
super().__init__(**kwargs)
@@ -57,35 +56,18 @@

self.special_tokens = {}
self.special_tokens_decoder = {}
-    self.set_special_tokens(special_tokens)
+    # self.set_special_tokens(special_tokens)

@property
def vocab_size(self):
return len(self.encoder)

# def get_vocab(self):
# return dict(self.encoder, **self.added_tokens_encoder)
def get_vocab(self):
return dict(self.encoder)

def __len__(self):
return len(self.encoder) + len(self.special_tokens)

def set_special_tokens(self, special_tokens):
""" Add a list of additional tokens to the encoder.
The additional tokens are indexed starting from the last index of the
current vocabulary in the order of the `special_tokens` list.
"""
if not special_tokens:
self.special_tokens = {}
self.special_tokens_decoder = {}
return
self.special_tokens = dict((tok, len(self.encoder) + i)
for i, tok in enumerate(special_tokens))
self.special_tokens_decoder = {
v: k
for k, v in self.special_tokens.items()
}
logger.info("Special tokens {}".format(self.special_tokens))

def bpe(self, token):
if token in self.cache:
return self.cache[token]
@@ -142,19 +124,15 @@ def tokenize(self, text):
for bpe_token in self.bpe(token).split(' '))
return bpe_tokens

def convert_token_to_id(self, token):
""" Converts a sequence of tokens into ids using the vocab. """
return self.encoder.get(token, 0)

def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
ids.append(self.convert_token_to_id(token))
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
@@ -163,15 +141,15 @@ def convert_tokens_to_ids(self, tokens):
format(len(ids), self.max_len))
return ids

def convert_id_to_token(self, id):
"""Converts a sequence of ids in BPE tokens using the vocab."""
return self.decoder[id]

def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
tokens.append(self.decoder[i])
return tokens

def encode(self, text):
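The cleanup above replaces the inline special-token branching with single-item helpers that the batch methods delegate to. A minimal standalone sketch of that pattern (a toy vocabulary, not the FlagAI class):

```python
# Toy vocabulary illustrating the delegation pattern from the refactored
# bpe_tokenizer: batch methods call the new single-item helpers.
class TinyBPEVocab:
    def __init__(self, encoder):
        self.encoder = encoder                              # token -> id
        self.decoder = {v: k for k, v in encoder.items()}   # id -> token

    def convert_token_to_id(self, token):
        return self.encoder.get(token, 0)   # 0 stands in for the unknown id

    def convert_id_to_token(self, id):
        return self.decoder[id]

    def convert_tokens_to_ids(self, tokens):
        return [self.convert_token_to_id(t) for t in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.convert_id_to_token(i) for i in ids]

vocab = TinyBPEVocab({'hello': 1, 'world': 2})
assert vocab.convert_tokens_to_ids(['hello', 'world', '???']) == [1, 2, 0]
assert vocab.convert_ids_to_tokens([1, 2]) == ['hello', 'world']
```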
4 changes: 2 additions & 2 deletions examples/uni_tokenizer/glm_bpe_tokenizer.py
@@ -56,7 +56,7 @@ def __init__(self,
self.encoder['</s>']),
CommandToken('sep', '[SEP]',
self.encoder['</s>']),
-    CommandToken('ENC', '[CLS]',
+    CommandToken('cls', '[CLS]',
self.encoder['<s>']),
CommandToken('MASK',
'[MASK]',
@@ -85,7 +85,7 @@ def __init__(self,
self._command_tokens.extend([
CommandToken('sop', '<|startofpiece|>', self.num_tokens),
CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1),
-    CommandToken('ENC', '[CLS]', self.num_tokens + 2),
+    CommandToken('cls', '[CLS]', self.num_tokens + 2),
CommandToken('MASK',
'[MASK]',
self.num_tokens + 3,
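Because command tokens are looked up by name elsewhere in this commit (e.g. `get_command_id('pad')`), renaming the `[CLS]` registration from `'ENC'` to `'cls'` changes the key callers must pass, while the surface token `'[CLS]'` and its position in the vocabulary stay the same. A small sketch (mock registry, placeholder ids):

```python
# Mock registry illustrating the rename; ids here are placeholders, not the
# real GLM vocabulary ids.
command_tokens = {
    'cls': ('[CLS]', 50259),   # registered under 'ENC' before this commit
    'pad': ('[PAD]', 50256),
}

def get_command_id(name):
    token, token_id = command_tokens[name]
    return token_id

print(get_command_id('cls'))    # works after the rename
# get_command_id('ENC')        # would now raise KeyError
```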