saved work on 7.6
Signed-off-by: Anhforth <[email protected]>
Anhforth committed Jul 6, 2022
1 parent 7c8c0b1 commit 265d35a
Showing 39 changed files with 1,924 additions and 302 deletions.
2 changes: 1 addition & 1 deletion doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -118,7 +118,7 @@ class GLMTitleGenerationCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
-my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMTitleGenerationDataset(train_src,
train_tgt)
```
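The same call-site rewrite appears in every tutorial and example touched by this commit: special-token ids are now read with `tokenizer.get_command_id(name)` instead of `tokenizer.get_command(name).Id`. As a hedged illustration of the two shapes (a self-contained mock, not FlagAI's actual tokenizer classes):

```python
# Self-contained mock of the call-site change; FlagAI's real tokenizer is assumed
# to expose both shapes around this commit, this toy class is only illustrative.
from collections import namedtuple

CommandToken = namedtuple('CommandToken', ['name', 'token', 'Id'])

class MockTokenizer:
    def __init__(self):
        self._command_tokens = {'pad': CommandToken('pad', '[PAD]', 0)}

    def get_command(self, name):
        # old style: return the CommandToken object, caller reads .Id
        return self._command_tokens[name]

    def get_command_id(self, name):
        # new style: return the integer id directly
        return self._command_tokens[name].Id

tok = MockTokenizer()
assert tok.get_command('pad').Id == tok.get_command_id('pad') == 0
```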
2 changes: 1 addition & 1 deletion doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -131,7 +131,7 @@ class GLMPoetryDynamicCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
-my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMPoetryDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion docs/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -119,7 +119,7 @@ class GLMTitleGenerationCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
-my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMTitleGenerationDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion docs/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -122,7 +122,7 @@ class GLMPoetryDynamicCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
-my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
+my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMPoetryDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion examples/glm_large_en_tokenizer.py
@@ -59,7 +59,7 @@ def __init__(self,

self._command_tokens = [
CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']),
-CommandToken('ENC', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
+CommandToken('cls', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
CommandToken('MASK', '[MASK]',
self.text_tokenizer.vocab['[MASK]']),
CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']),
2 changes: 1 addition & 1 deletion examples/glm_poetry_generation/train.py
@@ -133,7 +133,7 @@ def __call__(self, batch):
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMPoetryDynamicCollateFN(
-    pad_id=tokenizer.get_command('pad').Id)
+    pad_id=tokenizer.get_command_id('pad'))
train_dataset = BertSeq2seqDataset(train_src, train_tgt)

trainer.train(model, train_dataset=train_dataset, collate_fn=my_collate_fn)
2 changes: 1 addition & 1 deletion examples/glm_pretrain/train.py
@@ -59,7 +59,7 @@ def create_dataset(tokenizer, should_split):
collate_fn = None
if ds_args.block_lm:
collate_fn = ConstructBlockStrategy(
-    tokenizer, 512, eod_token=tokenizer.get_command('eos').Id)
+    tokenizer, 512, eod_token=tokenizer.get_command_id('eos'))
metric_methods = DEFAULT_METRICS['pretrain']
trainer.train(model,
collate_fn=collate_fn,
68 changes: 68 additions & 0 deletions examples/glm_superglue/tst_superglue.py
@@ -0,0 +1,68 @@
# Copyright © 2022 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
import torch
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze, GLMForMultiTokenCloze
from ..uni_tokenizer.glm_tokenizer import GLMTokenizer
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS
from flagai.data.dataset import ConstructSuperglueStrategy
from flagai.data.dataset.superglue.control import MULTI_TOKEN_TASKS

for task_name in [
        'boolq'
]:
    trainer = Trainer(env_type='pytorch',
                      epochs=1,
                      batch_size=1,
                      eval_interval=100,
                      log_interval=50,
                      experiment_name='glm_large',
                      pytorch_device='cuda',
                      load_dir=None,
                      fp16=True,
                      lr=1e-4,
                      save_interval=10)
    print("downloading...")

    cl_args = CollateArguments()
    cl_args.multi_token = task_name in MULTI_TOKEN_TASKS
    if task_name in CH_TASKS:
        model_name = 'GLM-large-ch'
        tokenizer = GLMTokenizer.from_pretrained(model_name)
    else:
        model_name = 'GLM-large-en'
        tokenizer = GLMTokenizer.from_pretrained(model_name)

    if task_name in MULTI_TOKEN_TASKS:
        model = GLMForMultiTokenCloze.from_pretrain(
            model_name=model_name, only_download_config=True)
    else:
        model = GLMForSingleTokenCloze.from_pretrain(
            model_name=model_name, only_download_config=True)

    train_dataset = SuperGlueDataset(task_name=task_name,
                                     data_dir='./datasets/',
                                     dataset_type='train',
                                     tokenizer=tokenizer)
    train_dataset.example_list = train_dataset.example_list[:1]
    collate_fn = ConstructSuperglueStrategy(cl_args,
                                            tokenizer,
                                            task_name=task_name)

    valid_dataset = SuperGlueDataset(task_name=task_name,
                                     data_dir='./datasets/',
                                     dataset_type='dev',
                                     tokenizer=tokenizer)
    valid_dataset.example_list = valid_dataset.example_list[:1]
    print(task_name)
    metric_methods = DEFAULT_METRICS[task_name]
    trainer.train(model,
                  collate_fn=collate_fn,
                  train_dataset=train_dataset,
                  valid_dataset=valid_dataset,
                  metric_methods=metric_methods)


2 changes: 1 addition & 1 deletion examples/glm_title_generation/train.py
@@ -130,7 +130,7 @@ def __call__(self, batch):

sents_src, sents_tgt = read_file()
my_collate_fn = GLMPoetryDynamicCollateFN(
-    pad_id=tokenizer.get_command('pad').Id)
+    pad_id=tokenizer.get_command_id('pad'))

data_len = len(sents_tgt)
train_size = int(data_len * 0.8)
66 changes: 39 additions & 27 deletions examples/tst_tokenizer.py
@@ -8,35 +8,47 @@
# from examples.uni_tokenizer.base_tokenizer import BaseTokenizer
# tokenizer = BaseTokenizer.from_pretrained('GLM-large-en')

from flagai.data.tokenizer.glm_large_en.wordpiece import GLMLargeEnTokenizer
from flagai.data.tokenizer.glm_10b_en.glm_10b_en_tokenizer import GLM10bENTokenizer

class Animal(object):
    @classmethod
    def move(cls):
        # return cls.jump(cls,8,12)
        return cls(8,12)
    def __init__(self, name=None, age=None):
        print(name, age)
        # super(Animal, self).__init__()
        self.name = name
        self.age = age
        print('parent')
tokenizer = GLMLargeEnTokenizer.from_pretrained("GLM-large-en")
# tokenizer = GLM10bENTokenizer.from_pretrained("gpt2")
print(tokenizer.vocab['[CLS]'])
print(tokenizer.vocab['[UNK]'])
print(tokenizer.vocab['[SEP]'])
print(tokenizer.vocab['<|startofpiece|>'])
print(tokenizer.vocab['<|endofpiece|>'])
print(tokenizer.vocab_size())
# print(list(dict(tokenizer.encoder).items())[-2:])

# def jump(self, name, age):
# print("jump")

class cat(Animal):
    def __init__(self, age, piece, **kwargs):
        super().__init__(**kwargs)
        self.age = age+age
        self.piece = 8
        print("is it?")


    def jump(self, name, age):
        self.age = 13
        print(self.age)

a = cat.move()
# class Animal(object):
# @classmethod
# def move(cls):
# # return cls.jump(cls,8,12)
# return cls(8,12)
# def __init__(self, name=None, age=None):
# print(name, age)
# # super(Animal, self).__init__()
# self.name = name
# self.age = age
# print('parent')
#
# # def jump(self, name, age):
# # print("jump")
#
# class cat(Animal):
# def __init__(self, age, piece, **kwargs):
# super().__init__(**kwargs)
# self.age = age+age
# self.piece = 8
# print("is it?")
#
#
# def jump(self, name, age):
# self.age = 13
# print(self.age)
#
# a = cat.move()
# a.jump(2,4)
# a = cat(age=9,piece=7).move()
# print(a.piece)
5 changes: 3 additions & 2 deletions examples/uni_tokenizer/base_tokenizer.py
@@ -24,8 +24,8 @@ def from_pretrained(cls,
merges_file = 'merges.txt'
sp_model_file = 'spiece.model'
if cache_dir is None:
-    cache_dir = os.path.join(os.path.dirname(__file__), 'vocabs')

+    # cache_dir = os.path.join(os.path.dirname(__file__), 'vocabs')
+    cache_dir = "/root/.cache/FlagAI/"+tokenizer_model_name
tokenizer_class = ""
# search the cache directory for certain files

@@ -70,6 +70,7 @@ def from_pretrained(cls,
else:
raise NotImplementedError("Cannot find a tokenizer class that matches the files settings in the directory or ModelHub")


def __init__(self,
vocab_file=None,
merges_file=None,
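The base tokenizer change swaps the default vocab cache location. Restated in isolation (the two paths are copied from the diff above; the model name is just an example value):

```python
# Sketch comparing the old and new vocab lookup locations.
import os

tokenizer_model_name = "GLM-large-en"  # example value, not the only supported name

# Removed: vocab files resolved relative to the tokenizer module itself.
old_cache_dir = os.path.join(os.path.dirname(__file__), 'vocabs')

# Added: vocab files are now expected under the FlagAI download cache.
new_cache_dir = "/root/.cache/FlagAI/" + tokenizer_model_name

print(old_cache_dir)
print(new_cache_dir)
```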
48 changes: 13 additions & 35 deletions examples/uni_tokenizer/bpe_tokenizer.py
@@ -35,7 +35,6 @@ def __init__(self,
vocab_file,
merges_file,
errors='replace',
-    special_tokens=None,
max_len=None,
**kwargs):
super().__init__(**kwargs)
@@ -57,35 +56,18 @@

self.special_tokens = {}
self.special_tokens_decoder = {}
-    self.set_special_tokens(special_tokens)
+    # self.set_special_tokens(special_tokens)

@property
def vocab_size(self):
return len(self.encoder)

# def get_vocab(self):
# return dict(self.encoder, **self.added_tokens_encoder)
def get_vocab(self):
return dict(self.encoder)

def __len__(self):
return len(self.encoder) + len(self.special_tokens)

def set_special_tokens(self, special_tokens):
""" Add a list of additional tokens to the encoder.
The additional tokens are indexed starting from the last index of the
current vocabulary in the order of the `special_tokens` list.
"""
if not special_tokens:
self.special_tokens = {}
self.special_tokens_decoder = {}
return
self.special_tokens = dict((tok, len(self.encoder) + i)
for i, tok in enumerate(special_tokens))
self.special_tokens_decoder = {
v: k
for k, v in self.special_tokens.items()
}
logger.info("Special tokens {}".format(self.special_tokens))

def bpe(self, token):
if token in self.cache:
return self.cache[token]
@@ -142,19 +124,15 @@ def tokenize(self, text):
for bpe_token in self.bpe(token).split(' '))
return bpe_tokens

def convert_token_to_id(self, token):
""" Converts a sequence of tokens into ids using the vocab. """
return self.encoder.get(token, 0)

def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
ids.append(self.convert_token_to_id(token))
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
@@ -163,15 +141,15 @@ def convert_tokens_to_ids(self, tokens):
format(len(ids), self.max_len))
return ids

def convert_id_to_token(self, id):
"""Converts a sequence of ids in BPE tokens using the vocab."""
return self.decoder[id]

def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
tokens.append(self.decoder[i])
return tokens

def encode(self, text):
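The cleanup above replaces the inline special-token branching with single-item helpers that the batch methods delegate to. A minimal standalone sketch of that pattern (a toy vocabulary, not the FlagAI class):

```python
# Toy vocabulary illustrating the delegation pattern from the refactored
# bpe_tokenizer: batch methods call the new single-item helpers.
class TinyBPEVocab:
    def __init__(self, encoder):
        self.encoder = encoder                              # token -> id
        self.decoder = {v: k for k, v in encoder.items()}   # id -> token

    def convert_token_to_id(self, token):
        return self.encoder.get(token, 0)   # 0 stands in for the unknown id

    def convert_id_to_token(self, id):
        return self.decoder[id]

    def convert_tokens_to_ids(self, tokens):
        return [self.convert_token_to_id(t) for t in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.convert_id_to_token(i) for i in ids]

vocab = TinyBPEVocab({'hello': 1, 'world': 2})
assert vocab.convert_tokens_to_ids(['hello', 'world', '???']) == [1, 2, 0]
assert vocab.convert_ids_to_tokens([1, 2]) == ['hello', 'world']
```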
4 changes: 2 additions & 2 deletions examples/uni_tokenizer/glm_bpe_tokenizer.py
@@ -56,7 +56,7 @@ def __init__(self,
self.encoder['</s>']),
CommandToken('sep', '[SEP]',
self.encoder['</s>']),
-    CommandToken('ENC', '[CLS]',
+    CommandToken('cls', '[CLS]',
self.encoder['<s>']),
CommandToken('MASK',
'[MASK]',
@@ -85,7 +85,7 @@ def __init__(self,
self._command_tokens.extend([
CommandToken('sop', '<|startofpiece|>', self.num_tokens),
CommandToken('eop', '<|endofpiece|>', self.num_tokens + 1),
-    CommandToken('ENC', '[CLS]', self.num_tokens + 2),
+    CommandToken('cls', '[CLS]', self.num_tokens + 2),
CommandToken('MASK',
'[MASK]',
self.num_tokens + 3,
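Because command tokens are looked up by name elsewhere in this commit (e.g. `get_command_id('pad')`), renaming the `[CLS]` registration from `'ENC'` to `'cls'` changes the key callers must pass, while the surface token `'[CLS]'` and its position in the vocabulary stay the same. A small sketch (mock registry, placeholder ids):

```python
# Mock registry illustrating the rename; ids here are placeholders, not the
# real GLM vocabulary ids.
command_tokens = {
    'cls': ('[CLS]', 50259),   # registered under 'ENC' before this commit
    'pad': ('[PAD]', 50256),
}

def get_command_id(name):
    token, token_id = command_tokens[name]
    return token_id

print(get_command_id('cls'))    # works after the rename
# get_command_id('ENC')        # would now raise KeyError
```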