add galactica model
920232796 committed Feb 27, 2023
1 parent 000e153 commit 6a3049c
Showing 5 changed files with 334 additions and 5 deletions.
27 changes: 27 additions & 0 deletions examples/galactica/generate_galactica_1.3b.py
@@ -0,0 +1,27 @@
from flagai.model.predictor.predictor import Predictor
from flagai.auto_model.auto_loader import AutoLoader
import torch
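# "cuda:3" pins the script to one specific GPU on the author's machine; plain
# "cuda" (or "cpu") is the portable choice.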
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

loader = AutoLoader(task_name="lm",
model_name="galactica-1.3b-en",
model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/")
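# Note: model_dir above is the author's local checkpoint directory; point it at
# wherever the galactica-1.3b-en weights are stored on your machine.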

model = loader.get_model()
model.to(device)
model.eval()

tokenizer = loader.get_tokenizer()

predictor = Predictor(model, tokenizer)

text = "Please write a abstract about the computer vision. \n"
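# Random sampling: out_max_length caps the generated length, while top_k,
# repetition_penalty and temperature control how each next token is drawn.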
out = predictor.predict_generate_randomsample(text,
out_max_length=700,
top_k=50,
repetition_penalty=1.2,
temperature=0.7
)
print(out)


18 changes: 14 additions & 4 deletions flagai/auto_model/auto_loader.py
@@ -55,6 +55,7 @@ def __getattr__(self, name):
"glm_title-generation": ["flagai.model.glm_model", "GLMForSeq2Seq"],
"opt_seq2seq": ("flagai.model.opt_model", "OPTModel"),
"opt_lm": ("flagai.model.opt_model", "OPTModel"),
"galactica_lm": ("flagai.model.galactica_model", "GalacticaModel"),
"vit_classification": ("flagai.model.vision.vit", "VisionTransformer"),
"clip_txt_img_matching": ("flagai.model.mm.clip_model", "CLIP"),
"swinv1_classification": ("flagai.model.vision.swinv1", "SwinTransformer"),
@@ -90,6 +91,10 @@ def __getattr__(self, name):
"glm-10b-ch": ["flagai.model.glm_model", "GLMModel", "glm", "nlp"],
"cpm3": ["flagai.model.cpm3_model", "CPM3", "cpm3", "nlp"],
"cpm3-train": ["flagai.model.cpm3_train_model", "CPM3", "cpm3", "nlp"],
"galactica-1.3b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"],
"galactica-6.7b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"],
"galactica-30b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"],
"galactica-120b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"],
"vit-base-p16-224":
["flagai.model.vision.vit", "VisionTransformer", "vit", "vision"],
"vit-base-p16-384":
@@ -131,6 +136,7 @@ def __getattr__(self, name):
"altclip-bert-b": ["flagai.models.mm.AltCLIP", "AltCLIP", "altclip", "mm", "flagai.model.mm.AltCLIP",
"AltCLIPProcessBert"],
"eva-clip": ["flagai.model.mm.eva_clip_model", "EVA_CLIP", "evaclip", "mm"],

}


@@ -205,10 +211,14 @@ def __init__(self,
self.model.half()

if model_type == "nlp":
tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"),
"Tokenizer")
self.tokenizer = tokenizer_class.from_pretrained(
model_name, cache_dir=download_path)
if brief_model_name in ["galactica", ]:
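# Galactica ships a Hugging Face fast tokenizer of its own, so instantiate the
# wrapper recorded in MODEL_DICT rather than the shared flagai Tokenizer.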
self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]),
MODEL_DICT[model_name][5])(download_path)
else:
tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"),
"Tokenizer")
self.tokenizer = tokenizer_class.from_pretrained(
model_name, cache_dir=download_path)

elif model_type == "mm":
if model_name.startswith("altdiffusion"):
84 changes: 84 additions & 0 deletions flagai/data/tokenizer/galactica/galactica_tokenizer.py
@@ -0,0 +1,84 @@
from transformers import PreTrainedTokenizerFast


from ..tokenizer import CommandToken, Tokenizer

class GalacticaTokenizer(Tokenizer):
def __init__(self, download_dir) -> None:
self.text_tokenizer = PreTrainedTokenizerFast.from_pretrained(download_dir)
# parse tokens and vocabs from tokenizer
self._tokens = list(self.text_tokenizer.get_vocab().keys())
self._vocab = {k: v for k, v in self.text_tokenizer.get_vocab().items()}
self.num_tokens = len(self._tokens)

self._command_tokens = [
CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
CommandToken('MASK', '[MASK]',
self.get_specialid_from_text_tokenizer('mask')),
CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
]

self.command_name_map = {tok.name: tok for tok in self._command_tokens}
self.command_token_map = {
tok.token: tok
for tok in self._command_tokens
}
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}

def get_specialid_from_text_tokenizer(self, token):
if token in ["eos", "sep"]:
return self._vocab.get('</s>')
elif token == "cls":
return self._vocab.get('<s>')
elif token == "unk":
return self._vocab.get('<unk>')
elif token == "pad":
return self._vocab.get('<pad>')
elif token == "mask":
return self._vocab.get('<mask>')
else:
raise NameError(f"special token '{token}' does not exist")

def decode(self, ids):
return self.text_tokenizer.decode(ids)

def get_vocab(self):
return self.text_tokenizer.get_vocab()

def get_command_id(self, name):
"""get command token corresponding to `name`"""
return self.command_name_map[name].Id

def get_command(self, name):
"""get command token corresponding to `name`"""
return self.command_name_map[name]

def encode_plus(self,
text,
second_text=None,
truncation=False,
max_length=None,):

return self.text_tokenizer.encode_plus(text,
text_pair=second_text,
truncation=truncation,
max_length=max_length,
add_special_tokens=True)

def tokenize(self, text, **kwargs):
return self.text_tokenizer.tokenize(text, **kwargs)

def __len__(self):
return len(self.text_tokenizer)
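
# Minimal usage sketch (path and text are illustrative, not taken from the repo):
#   tok = GalacticaTokenizer("/path/to/galactica-1.3b-en")
#   enc = tok.encode_plus("The Transformer architecture")
#   print(enc["input_ids"], tok.decode(enc["input_ids"]), tok.get_command_id("eos"))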



207 changes: 207 additions & 0 deletions flagai/model/galactica_model.py
@@ -0,0 +1,207 @@
# coding=utf-8
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Galactica model."""
import random
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from flagai.model.layers.activations import ACT2FN
from flagai.model.gpt2_model import GPT2Model, GPT2Stack, GPT2Config


class OPTLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""

def __init__(self, num_embeddings: int, embedding_dim: int):
# OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)

def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
"""`attention_mask` is expected to be [bsz x seqlen]; position ids are built from it."""
attention_mask = attention_mask.long()

# create positions depending on attention_mask
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
# cut positions if `past_key_values_length` is > 0
positions = positions[:, past_key_values_length:]

return super().forward(positions + self.offset)
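
# Worked example (illustrative): attention_mask = [[1, 1, 1, 0]] gives
# cumsum * mask - 1 = [[0, 1, 2, -1]]; adding the offset of 2 indexes the
# embedding at [[2, 3, 4, 1]], so padded slots land on the reserved low indices
# instead of consuming real positions.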

class OPTStack(GPT2Stack):
def __init__(self, config: GPT2Config):
super(OPTStack, self).__init__(config)
self.wpe = OPTLearnedPositionalEmbedding(config.n_positions, config.hidden_size)
self.ln_f = None
if config.do_layer_norm_before:
self.ln_f = nn.LayerNorm(config.hidden_size)

if config.n_embd != config.hidden_size:
self.project_out = nn.Linear(config.hidden_size, config.n_embd, bias=False)
else:
self.project_out = None

if config.n_embd != config.hidden_size:
self.project_in = nn.Linear(config.n_embd, config.hidden_size, bias=False)
else:
self.project_in = None

def get_position_embeddings(self, **kwargs):
padding_mask = kwargs["padding_mask"]
past_length = kwargs["past_length"]
position_embeds = self.wpe(padding_mask, past_length)
return position_embeds

def trans_opt_to_gpt_config(opt_config_json):
trans_config_json = {}
trans_key = {
"ffn_dim": "n_inner",
"hidden_size": "hidden_size",
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
"vocab_size": "vocab_size",
"activation_function": "activation_function",
"checkpoint_activations": "checkpoint_activations",
"word_embed_proj_dim": "n_embd",
"do_layer_norm_before": "do_layer_norm_before",
}
for k, v in opt_config_json.items():
if k in trans_key:
trans_config_json[trans_key[k]] = v

return trans_config_json
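
# Illustrative mapping (values made up, not a real Galactica config):
#   trans_opt_to_gpt_config({"ffn_dim": 8192, "hidden_size": 2048,
#                            "num_attention_heads": 32, "num_hidden_layers": 24})
#   == {"n_inner": 8192, "hidden_size": 2048, "n_head": 32, "n_layer": 24}
# Keys that are absent from trans_key are silently dropped.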

class GalacticaModel(GPT2Model):

def __init__(self, config, **kwargs):
config = trans_opt_to_gpt_config(config)
super(GalacticaModel, self).__init__(config, **kwargs)
self.transformer = OPTStack(self.config_gpt)

def load_weights(self, checkpoint_path):
checkpoint = torch.load(checkpoint_path,
map_location=torch.device("cpu"))
if "module" in checkpoint:
# ddp
checkpoint = checkpoint["module"]

checkpoint_ = {}
for k, v in checkpoint.items():
if k[:6] == "model.":
checkpoint_[k[6:]] = v
else:
checkpoint_[k] = v

checkpoint = self.transpose_weight(checkpoint_)
self.load_state_dict(checkpoint, strict=False)
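# The OPT/Galactica checkpoint keeps no separate output projection, so tie the
# LM head to the token embedding matrix after loading.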
self.lm_head.weight.data = self.transformer.wte.weight.data

return checkpoint

def transpose_weight(self, checkpoints):

checkpoints_ = {
"transformer.wte.weight": checkpoints["decoder.embed_tokens.weight"],
"transformer.wpe.weight": checkpoints["decoder.embed_positions.weight"],
}

if "decoder.project_in.weight" in checkpoints:
checkpoints_["transformer.project_in.weight"] = checkpoints["decoder.project_in.weight"]
checkpoints_["transformer.project_out.weight"] = checkpoints["decoder.project_out.weight"]

if "decoder.final_layer_norm.weight" in checkpoints:
checkpoints_["transformer.ln_f.weight"] = checkpoints["decoder.final_layer_norm.weight"]
checkpoints_["transformer.ln_f.bias"] = checkpoints["decoder.final_layer_norm.bias"]

q_weight = None
k_weight = None
v_weight = None
q_bias = None
k_bias = None
v_bias = None
for k, v in checkpoints.items():
# first ln
if "decoder.layers" in k and "self_attn_layer_norm" in k:
layer_id = k.split(".")[2]
weight_or_bias = k.split(".")[-1]
checkpoints_[f"transformer.h.{layer_id}.ln_1.{weight_or_bias}"] = v
continue

# qkv
if "self_attn.k_proj.weight" in k:
k_weight = v
continue
if "self_attn.k_proj.bias" in k:
k_bias = v
continue

if "self_attn.v_proj.weight" in k:
v_weight = v
continue
if "self_attn.v_proj.bias" in k:
v_bias = v
continue

if "self_attn.q_proj.weight" in k:
q_weight = v
qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
layer_id = k.split(".")[2]
checkpoints_[f"transformer.h.{layer_id}.attn.c_attn.weight"] = qkv_weight
continue

if "self_attn.q_proj.bias" in k:
q_bias = v
qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0)
layer_id = k.split(".")[2]
checkpoints_[f"transformer.h.{layer_id}.attn.c_attn.bias"] = qkv_bias
continue
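
# NOTE: GPT2-style attention uses one fused c_attn projection, so q/k/v are
# concatenated along the output dimension; this relies on each layer's k_proj
# and v_proj appearing before its q_proj in the checkpoint's key order.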

# att out
if "decoder.layers" in k and "self_attn.out_proj" in k:
layer_id = k.split(".")[2]
weight_or_bias = k.split(".")[-1]
checkpoints_[f"transformer.h.{layer_id}.attn.c_proj.{weight_or_bias}"] = v
continue

# fc1
if "decoder.layers" in k and "fc1" in k:
layer_id = k.split(".")[2]
weight_or_bias = k.split(".")[-1]
checkpoints_[f"transformer.h.{layer_id}.mlp.c_fc.{weight_or_bias}"] = v
continue

# fc2
if "decoder.layers" in k and "fc2" in k:
layer_id = k.split(".")[2]
weight_or_bias = k.split(".")[-1]
checkpoints_[f"transformer.h.{layer_id}.mlp.c_proj.{weight_or_bias}"] = v
continue

# second ln
if "decoder.layers" in k and "final_layer_norm" in k:
layer_id = k.split(".")[2]
weight_or_bias = k.split(".")[-1]
checkpoints_[f"transformer.h.{layer_id}.ln_2.{weight_or_bias}"] = v
continue

return checkpoints_
3 changes: 2 additions & 1 deletion flagai/model/predictor/predictor.py
@@ -322,7 +322,8 @@ def predict_generate_randomsample(self,
device)

elif "gpt" in self.class_name.lower(
) or "opt" in self.class_name.lower():
) or "opt" in self.class_name.lower() \
or "galactica" in self.class_name.lower():
return gpt_random_sample_use_cache(self.model, self.tokenizer,
text, input_max_length,
out_max_length, top_k, top_p,
