diff --git a/examples/galactica/generate_galactica_1.3b.py b/examples/galactica/generate_galactica_1.3b.py
new file mode 100644
index 00000000..fe404336
--- /dev/null
+++ b/examples/galactica/generate_galactica_1.3b.py
@@ -0,0 +1,27 @@
+from flagai.model.predictor.predictor import Predictor
+from flagai.auto_model.auto_loader import AutoLoader
+import torch
+device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
+
+loader = AutoLoader(task_name="lm",
+                    model_name="galactica-1.3b-en",
+                    model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/")
+
+model = loader.get_model()
+model.to(device)
+model.eval()
+
+tokenizer = loader.get_tokenizer()
+
+predictor = Predictor(model, tokenizer)
+
+text = "Please write an abstract about computer vision. \n"
+out = predictor.predict_generate_randomsample(text,
+                                              out_max_length=700,
+                                              top_k=50,
+                                              repetition_penalty=1.2,
+                                              temperature=0.7
+                                              )
+print(out)
+
+
diff --git a/flagai/auto_model/auto_loader.py b/flagai/auto_model/auto_loader.py
index a7a480f5..7fea3564 100644
--- a/flagai/auto_model/auto_loader.py
+++ b/flagai/auto_model/auto_loader.py
@@ -55,6 +55,7 @@ def __getattr__(self, name):
     "glm_title-generation": ["flagai.model.glm_model", "GLMForSeq2Seq"],
     "opt_seq2seq": ("flagai.model.opt_model", "OPTModel"),
     "opt_lm": ("flagai.model.opt_model", "OPTModel"),
+    "galactica_lm": ("flagai.model.galactica_model", "GalacticaModel"),
     "vit_classification": ("flagai.model.vision.vit", "VisionTransformer"),
     "clip_txt_img_matching": ("flagai.model.mm.clip_model", "CLIP"),
     "swinv1_classification": ("flagai.model.vision.swinv1", "SwinTransformer"),
@@ -90,6 +91,10 @@ def __getattr__(self, name):
     "glm-10b-ch": ["flagai.model.glm_model", "GLMModel", "glm", "nlp"],
     "cpm3": ["flagai.model.cpm3_model", "CPM3", "cpm3", "nlp"],
     "cpm3-train": ["flagai.model.cpm3_train_model", "CPM3", "cpm3", "nlp"],
+    "galactica-1.3b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"],
+    "galactica-6.7b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"],
+    "galactica-30b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"],
+    "galactica-120b-en": ["flagai.model.galactica_model", "GalacticaModel", "galactica", "nlp", "flagai.data.tokenizer.galactica.galactica_tokenizer", "GalacticaTokenizer"],
     "vit-base-p16-224": ["flagai.model.vision.vit", "VisionTransformer", "vit", "vision"],
     "vit-base-p16-384":
@@ -131,6 +136,7 @@ def __getattr__(self, name):
     "altclip-bert-b": ["flagai.models.mm.AltCLIP", "AltCLIP", "altclip", "mm",
                        "flagai.model.mm.AltCLIP", "AltCLIPProcessBert"],
     "eva-clip": ["flagai.model.mm.eva_clip_model", "EVA_CLIP", "evaclip", "mm"],
+
 }
@@ -205,10 +211,14 @@ def __init__(self,
             self.model.half()
 
         if model_type == "nlp":
-            tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"),
-                                      "Tokenizer")
-            self.tokenizer = tokenizer_class.from_pretrained(
-                model_name, cache_dir=download_path)
+            if brief_model_name in ["galactica", ]:
+                self.tokenizer = getattr(LazyImport(MODEL_DICT[model_name][4]),
+                                         MODEL_DICT[model_name][5])(download_path)
+            else:
+                tokenizer_class = getattr(LazyImport("flagai.data.tokenizer"),
+                                          "Tokenizer")
+                self.tokenizer = tokenizer_class.from_pretrained(
+                    model_name, cache_dir=download_path)
         elif model_type == "mm":
             if model_name.startswith("altdiffusion"):
diff --git a/flagai/data/tokenizer/galactica/galactica_tokenizer.py b/flagai/data/tokenizer/galactica/galactica_tokenizer.py
new file mode 100644
index 00000000..87a28412
--- /dev/null
+++ b/flagai/data/tokenizer/galactica/galactica_tokenizer.py
@@ -0,0 +1,84 @@
+from transformers import PreTrainedTokenizerFast
+
+
+from ..tokenizer import CommandToken, Tokenizer
+
+class GalacticaTokenizer(Tokenizer):
+    def __init__(self, download_dir) -> None:
+        self.text_tokenizer = PreTrainedTokenizerFast.from_pretrained(download_dir)
+        # parse tokens and vocab from the underlying Hugging Face tokenizer
+        self._tokens = list(self.text_tokenizer.get_vocab().keys())
+        self._vocab = {k: v for k, v in self.text_tokenizer.get_vocab().items()}
+        self.num_tokens = len(self._tokens)
+
+        self._command_tokens = [
+            CommandToken('pad', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
+            CommandToken('ENC', '[CLS]', self.get_specialid_from_text_tokenizer('cls')),
+            CommandToken('MASK', '[MASK]',
+                         self.get_specialid_from_text_tokenizer('mask')),
+            CommandToken('unk', '[UNK]', self.get_specialid_from_text_tokenizer('unk')),
+            CommandToken('sep', '[SEP]', self.get_specialid_from_text_tokenizer('sep')),
+            CommandToken('eos', '[PAD]', self.get_specialid_from_text_tokenizer('pad')),
+        ]
+
+        self.command_name_map = {tok.name: tok for tok in self._command_tokens}
+        self.command_token_map = {
+            tok.token: tok
+            for tok in self._command_tokens
+        }
+        self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
+
+    def get_specialid_from_text_tokenizer(self, token):
+        if token in ["eos", "sep"]:
+            return self._vocab.get('</s>')
+        elif token == "cls":
+            return self._vocab.get('<s>')
+        elif token == "unk":
+            return self._vocab.get('<unk>')
+        elif token == "pad":
+            return self._vocab.get('<pad>')
+        elif token == "mask":
+            return self._vocab.get('<mask>')
+        else:
+            raise NameError("token does not exist")
+
+    def decode(self, ids):
+        return self.text_tokenizer.decode(ids)
+
+    def get_vocab(self):
+        return self.text_tokenizer.get_vocab()
+
+    def get_command_id(self, name):
+        """get the id of the command token corresponding to `name`"""
+        return self.command_name_map[name].Id
+
+    def get_command(self, name):
+        """get the command token corresponding to `name`"""
+        return self.command_name_map[name]
+
+    def encode_plus(self,
+                    text,
+                    second_text=None,
+                    truncation=False,
+                    max_length=None,):
+
+        return self.text_tokenizer.encode_plus(text,
+                                               text_pair=second_text,
+                                               truncation=truncation,
+                                               max_length=max_length,
+                                               add_special_tokens=True)
+
+    def tokenize(self, **kwargs):
+        return self.text_tokenizer.tokenize(**kwargs)
+
+    def __len__(self):
+        return len(self.text_tokenizer)
+
+if __name__ == "__main__":
+    pass
+
+
diff --git a/flagai/model/galactica_model.py b/flagai/model/galactica_model.py
new file mode 100644
index 00000000..98e1f9cb
--- /dev/null
+++ b/flagai/model/galactica_model.py
@@ -0,0 +1,207 @@
+# coding=utf-8
+# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Galactica model."""
+import random
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from flagai.model.layers.activations import ACT2FN
+from flagai.model.gpt2_model import GPT2Model, GPT2Stack, GPT2Config
+
+
+class OPTLearnedPositionalEmbedding(nn.Embedding):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int):
+        # OPT is set up so that if padding_idx is specified, the embedding ids are offset by 2
+        # and num_embeddings is adjusted accordingly. Other models don't have this hack.
+        self.offset = 2
+        super().__init__(num_embeddings + self.offset, embedding_dim)
+
+    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
+        """`attention_mask` is expected to be [bsz x seqlen]."""
+        attention_mask = attention_mask.long()
+
+        # create positions depending on attention_mask
+        positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
+        # cut positions if `past_key_values_length` is > 0
+        positions = positions[:, past_key_values_length:]
+
+        return super().forward(positions + self.offset)
+
+class OPTStack(GPT2Stack):
+    def __init__(self, config: GPT2Config):
+        super(OPTStack, self).__init__(config)
+        self.wpe = OPTLearnedPositionalEmbedding(config.n_positions, config.hidden_size)
+        self.ln_f = None
+        if config.do_layer_norm_before:
+            self.ln_f = nn.LayerNorm(config.hidden_size)
+
+        if config.n_embd != config.hidden_size:
+            self.project_out = nn.Linear(config.hidden_size, config.n_embd, bias=False)
+        else:
+            self.project_out = None
+
+        if config.n_embd != config.hidden_size:
+            self.project_in = nn.Linear(config.n_embd, config.hidden_size, bias=False)
+        else:
+            self.project_in = None
+
+    def get_position_embeddings(self, **kwargs):
+        padding_mask = kwargs["padding_mask"]
+        past_length = kwargs["past_length"]
+        position_embeds = self.wpe(padding_mask, past_length)
+        return position_embeds
+
+def trans_opt_to_gpt_config(opt_config_json):
+    # map OPT-style config keys onto the GPT2Config keys used by GPT2Model
+    trans_config_json = {}
+    trans_key = {
+        "ffn_dim": "n_inner",
+        "hidden_size": "hidden_size",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+        "vocab_size": "vocab_size",
+        "activation_function": "activation_function",
+        "checkpoint_activations": "checkpoint_activations",
+        "word_embed_proj_dim": "n_embd",
+        "do_layer_norm_before": "do_layer_norm_before",
+    }
+    for k, v in opt_config_json.items():
+        if k in trans_key:
+            trans_config_json[trans_key[k]] = v
+
+    return trans_config_json
+
+class GalacticaModel(GPT2Model):
+
+    def __init__(self, config, **kwargs):
+        config = trans_opt_to_gpt_config(config)
+        super(GalacticaModel, self).__init__(config, **kwargs)
+        self.transformer = OPTStack(self.config_gpt)
+
+    def load_weights(self, checkpoint_path):
+        checkpoint = torch.load(checkpoint_path,
+                                map_location=torch.device("cpu"))
+        if "module" in checkpoint:
+            # ddp
+            checkpoint = checkpoint["module"]
+
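+        # Normalize parameter names: drop the leading "model." prefix when present
+        # so the keys line up with the "decoder.*" names handled below.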
+        checkpoint_ = {}
+        for k, v in checkpoint.items():
+            if k[:6] == "model.":
+                checkpoint_[k[6:]] = v
+            else:
+                checkpoint_[k] = v
+
+        checkpoint = self.transpose_weight(checkpoint_)
+        self.load_state_dict(checkpoint, strict=False)
+        # tie the LM head to the (remapped) token embedding
+        self.lm_head.weight.data = self.transformer.wte.weight.data
+
+        return checkpoint
+
+    def transpose_weight(self, checkpoints):
+        # map OPT/Galactica ("decoder.*") parameter names onto the GPT2-style
+        # names expected by GPT2Model, fusing q/k/v projections into c_attn
+        checkpoints_ = {
+            "transformer.wte.weight": checkpoints["decoder.embed_tokens.weight"],
+            "transformer.wpe.weight": checkpoints["decoder.embed_positions.weight"],
+        }
+
+        if "decoder.project_in.weight" in checkpoints:
+            checkpoints_["transformer.project_in.weight"] = checkpoints["decoder.project_in.weight"]
+            checkpoints_["transformer.project_out.weight"] = checkpoints["decoder.project_out.weight"]
+
+        if "decoder.final_layer_norm.weight" in checkpoints:
+            checkpoints_["transformer.ln_f.weight"] = checkpoints["decoder.final_layer_norm.weight"]
+            checkpoints_["transformer.ln_f.bias"] = checkpoints["decoder.final_layer_norm.bias"]
+
+        q_weight = None
+        k_weight = None
+        v_weight = None
+        q_bias = None
+        k_bias = None
+        v_bias = None
+        for k, v in checkpoints.items():
+            # first ln
+            if "decoder.layers" in k and "self_attn_layer_norm" in k:
+                layer_id = k.split(".")[2]
+                weight_or_bias = k.split(".")[-1]
+                checkpoints_[f"transformer.h.{layer_id}.ln_1.{weight_or_bias}"] = v
+                continue
+
+            # qkv: k_proj and v_proj are cached until the matching q_proj entry
+            # arrives (OPT checkpoints list k_proj and v_proj before q_proj)
+            if "self_attn.k_proj.weight" in k:
+                k_weight = v
+                continue
+            if "self_attn.k_proj.bias" in k:
+                k_bias = v
+                continue
+
+            if "self_attn.v_proj.weight" in k:
+                v_weight = v
+                continue
+            if "self_attn.v_proj.bias" in k:
+                v_bias = v
+                continue
+
+            if "self_attn.q_proj.weight" in k:
+                q_weight = v
+                qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
+                layer_id = k.split(".")[2]
+                checkpoints_[f"transformer.h.{layer_id}.attn.c_attn.weight"] = qkv_weight
+                continue
+
+            if "self_attn.q_proj.bias" in k:
+                q_bias = v
+                qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0)
+                layer_id = k.split(".")[2]
+                checkpoints_[f"transformer.h.{layer_id}.attn.c_attn.bias"] = qkv_bias
+                continue
+
+            # att out
+            if "decoder.layers" in k and "self_attn.out_proj" in k:
+                layer_id = k.split(".")[2]
+                weight_or_bias = k.split(".")[-1]
+                checkpoints_[f"transformer.h.{layer_id}.attn.c_proj.{weight_or_bias}"] = v
+                continue
+
+            # fc1
+            if "decoder.layers" in k and "fc1" in k:
+                layer_id = k.split(".")[2]
+                weight_or_bias = k.split(".")[-1]
+                checkpoints_[f"transformer.h.{layer_id}.mlp.c_fc.{weight_or_bias}"] = v
+                continue
+
+            # fc2
+            if "decoder.layers" in k and "fc2" in k:
+                layer_id = k.split(".")[2]
+                weight_or_bias = k.split(".")[-1]
+                checkpoints_[f"transformer.h.{layer_id}.mlp.c_proj.{weight_or_bias}"] = v
+                continue
+
+            # second ln
+            if "decoder.layers" in k and "final_layer_norm" in k:
+                layer_id = k.split(".")[2]
+                weight_or_bias = k.split(".")[-1]
+                checkpoints_[f"transformer.h.{layer_id}.ln_2.{weight_or_bias}"] = v
+                continue
+
+        return checkpoints_
\ No newline at end of file
diff --git a/flagai/model/predictor/predictor.py b/flagai/model/predictor/predictor.py
index 937e3bc5..03cf7ff4 100644
--- a/flagai/model/predictor/predictor.py
+++ b/flagai/model/predictor/predictor.py
@@ -322,7 +322,8 @@ def predict_generate_randomsample(self,
                                          device)
         elif "gpt" in self.class_name.lower(
-        ) or "opt" in self.class_name.lower():
+        ) or "opt" in self.class_name.lower() \
+                or "galactica" in self.class_name.lower():
             return gpt_random_sample_use_cache(self.model, self.tokenizer,
                                                text, input_max_length,
                                                out_max_length, top_k, top_p,
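
Note on the position arithmetic in OPTLearnedPositionalEmbedding.forward above: position ids are derived from the attention mask with a cumulative sum and then shifted by the OPT-specific offset of 2. The following is a minimal stand-alone sketch of that computation in plain PyTorch; the tensor values are toy inputs chosen only for illustration.

    import torch

    # two sequences; the second one is left-padded by two positions
    attention_mask = torch.tensor([[1, 1, 1, 1],
                                   [0, 0, 1, 1]])

    # same arithmetic as OPTLearnedPositionalEmbedding.forward:
    # running count of real tokens minus one, padding positions pinned to -1
    positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask)
                 * attention_mask).long() - 1

    offset = 2  # OPT/Galactica reserve the first two rows of the position table
    print(positions + offset)
    # tensor([[2, 3, 4, 5],
    #         [1, 1, 2, 3]])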