Skip to content

Commit

Permalink
finished example testing
Browse files · Browse the repository at this point in the history
Signed-off-by: Anhforth <[email protected]>
  • Loading branch information
Anhforth committed Jul 18, 2022
1 parent 9649aa4 commit 67c1288
Show file tree
Hide file tree
Showing 7 changed files with 20 additions and 22 deletions.
2 changes: 1 addition & 1 deletion examples/bert_title_generation_english/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
maxlen = 512
auto_loader = AutoLoader(
"seq2seq",
model_name="bert-base-uncased",
model_name="BERT-base-en",
model_dir=model_dir,
)
model = auto_loader.get_model()
Expand Down
5 changes: 3 additions & 2 deletions examples/glm_superglue/train_10b_clue.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze
from flagai.data.tokenizer import GLMLargeChTokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.metrics import accuracy_metric
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
Expand All @@ -21,11 +21,12 @@
save_dir="./glm_superglue_en",
save_interval=1)

model_name = "GLM-large-ch"
model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
model_name="GLM-large-ch")


tokenizer = GLMLargeChTokenizer()
tokenizer = Tokenizer.from_pretrained("GLM-large-ch")
train_dataset = SuperGlueDataset(task_name=task_name,
data_dir='./datasets/',
dataset_type='train',
Expand Down
8 changes: 4 additions & 4 deletions examples/glm_superglue/train_10b_superglue.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the Apache License, Version 2.0 (the "License")
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze
from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.metrics import accuracy_metric
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
Expand All @@ -28,11 +28,11 @@
# deepspeed_config='./deepspeed.json',
# training_script=__file__)

model_name = "GLM-large-en"
model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
model_name="GLM-large-en")
model_name=model_name)

tokenizer = GLMLargeEnWordPieceTokenizer()

tokenizer = Tokenizer.from_pretrained(model_name)
train_dataset = SuperGlueDataset(task_name=task_name,
data_dir='./datasets/',
dataset_type='train',
Expand Down
12 changes: 4 additions & 8 deletions examples/glm_superglue/train_prefix.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
#
# Licensed under the Apache License, Version 2.0 (the "License")
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze, GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, GLMForSequenceClassification
from flagai.data.tokenizer import GLMLargeEnWordPieceTokenizer, GLMLargeChTokenizer
from flagai.model.glm_model import GLMForSequenceClassification
from flagai.data.tokenizer import Tokenizer

from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS
import unittest
from flagai.data.dataset import ConstructSuperglueStrategy


Expand All @@ -32,13 +31,10 @@

if task_name in CH_TASKS:
model_name = 'GLM-large-ch'
tokenizer = GLMLargeChTokenizer(add_block_symbols=True,
add_task_mask=False,
add_decoder_mask=False,
fix_command_token=True)
add_block_symbols=True,
else:
model_name = 'GLM-large-en'
tokenizer = GLMLargeEnWordPieceTokenizer()
tokenizer = Tokenizer.from_pretrained(model_name)

model = GLMForSequenceClassification.from_pretrain(model_name=model_name, spell_length=2,
class_num=3, tune_prefix_layers=1)
Expand Down
7 changes: 4 additions & 3 deletions examples/glm_superglue/train_qqp_pytorch_fp16.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the Apache License, Version 2.0 (the "License")
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze
from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.metrics import accuracy_metric
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
Expand All @@ -24,9 +24,10 @@
warm_up=0.1,
save_dir="./glm_large_qqp_pytorch_fp16")

model_name = "GLM-large-en"
model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
model_name="GLM-large-en")
tokenizer = GLMLargeEnWordPieceTokenizer()
model_name=model_name)
tokenizer = Tokenizer.from_pretrained(model_name)
train_dataset = SuperGlueDataset(task_name=task_name,
data_dir='./datasets/',
dataset_type='train',
Expand Down
3 changes: 2 additions & 1 deletion examples/t5_flagai_11b/train_title_with_flagai_t5_11b.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from flagai.trainer import Trainer
from flagai.model.t5_model import T5ForConditionalGeneration
from transformers import T5Tokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.model.predictor.predictor import Predictor
from torch.utils.data import Dataset
import os
Expand Down Expand Up @@ -53,7 +54,7 @@ def read_file():

return src, tgt

tokenizer = T5Tokenizer.from_pretrained('t5-11b')
tokenizer = Tokenizer.from_pretrained('T5-base-en')
# path to your downloaded model files is /mnt/t5-11b
model = T5ForConditionalGeneration.from_pretrain(download_path='/mnt',
model_name='t5-11b',checkpoint_activations=True)
Expand Down
5 changes: 2 additions & 3 deletions flagai/data/tokenizer/uni_tokenizer/wp_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,8 @@ def load_vocab(vocab_file):
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
print(len(token), token[0], token[-1],21222222222)
if token.startswith('{') and token.endswith('{'):
return json.loads(token)
# if token.startswith('{') and token.endswith('{'):
# return json.loads(token)
if not token:
break
token = token.strip()
Expand Down

0 comments on commit 67c1288

Please sign in to comment.