
Commit

added bminf
Signed-off-by: ftgreat <[email protected]>
ftgreat committed Mar 2, 2023
1 parent bcdf9b1 commit 0811530
Showing 9 changed files with 147 additions and 44 deletions.
45 changes: 45 additions & 0 deletions examples/bminf_generate/README.md
@@ -0,0 +1,45 @@

# BMInf

## Overview

BMInf is a low-resource inference package for large-scale pretrained language models.

BMInf can run models with more than 10 billion parameters on a single NVIDIA GTX 1060 GPU, its minimum requirement; running on better GPUs yields better performance. Even when GPU memory is large enough for ordinary large-model inference (e.g., on a V100 or A100), BMInf still delivers a significant performance improvement over the existing PyTorch implementation.

BMInf GitHub repository: https://github.com/OpenBMB/BMInf


## Application

After the model's parameters have been loaded, use the following code to convert the model with BMInf:

```python
with torch.cuda.device(0):
    model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30)
```
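`bminf.wrapper` returns a drop-in replacement for the original model; as the example scripts in this commit show, the wrapped model is handed to FlagAI's `Predictor` and used for generation exactly as before.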
The `quantization` parameter controls whether model quantization is applied; for generative models it must be set to `False`.

The `memory_limit` parameter sets the maximum GPU memory BMInf may use. It is specified in bytes, so `20 << 30` in the snippet above corresponds to 20 GiB.

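As a quick illustration of the byte arithmetic (the 8 GiB figure below is an arbitrary example, not a recommendation):

```python
# memory budgets are plain byte counts; left-shifting by 30 multiplies by 2**30 (GiB)
memory_limit = 8 << 30          # 8 GiB
assert memory_limit == 8 * 1024 ** 3
```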

If `bminf.wrapper` does not fit your model well, you can adapt the model manually as follows.

* Replace `torch.nn.ModuleList` with `bminf.TransformerBlockList`:
```python
module_list = bminf.TransformerBlockList([
    # your model's transformer blocks go here
], [CUDA_DEVICE_INDEX])
```

* Replace `torch.nn.Linear` with `bminf.QuantizedLinear`:
```python
linear = bminf.QuantizedLinear(torch.nn.Linear(...))
```
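For reference, a minimal sketch that combines the two substitutions on a toy module; `TinyBlock`, `hidden`, and the block count are hypothetical stand-ins for your model's real layers, and a CUDA device 0 is assumed:

```python
import torch
import bminf

class TinyBlock(torch.nn.Module):
    # stand-in for a real transformer block (attention, MLP, norms, ...)
    def __init__(self, hidden):
        super().__init__()
        # each torch.nn.Linear is wrapped in bminf.QuantizedLinear
        self.ff = bminf.QuantizedLinear(torch.nn.Linear(hidden, hidden))

    def forward(self, x):
        return self.ff(x)

hidden = 1024
# the plain torch.nn.ModuleList is replaced by bminf.TransformerBlockList,
# scheduled on CUDA device 0
blocks = bminf.TransformerBlockList(
    [TinyBlock(hidden) for _ in range(4)],
    [0],
)
```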
37 changes: 37 additions & 0 deletions examples/bminf_generate/cpm1_generate.py
@@ -0,0 +1,37 @@
import sys
sys.path.append("/home/yanzhaodong/anhforth/FlagAI")
import torch
from flagai.auto_model.auto_loader import AutoLoader
from flagai.model.predictor.predictor import Predictor
import bminf
import time


if __name__ == '__main__':

    text = '''默写古诗:
白日依山尽,黄河入海流。
床前明月光,'''

    loader = AutoLoader(task_name="lm",
                        model_name="CPM-large-ch",
                        model_dir="./checkpoints",
                        device="cpu")

    model = loader.get_model()
    time_start=time.time()
    with torch.cuda.device(0):
        model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30)
    tokenizer = loader.get_tokenizer()

    predictor = Predictor(model=model,
                          tokenizer=tokenizer,
                          )

    out = predictor.predict_generate_randomsample(text,
                                                  top_p=0.9,
                                                  out_max_length=50)
    time_end=time.time()
    print('time cost',time_end-time_start,'s')

    print(out)
36 changes: 36 additions & 0 deletions examples/bminf_generate/galactica_6.7b_generate.py
@@ -0,0 +1,36 @@
from flagai.model.predictor.predictor import Predictor
from flagai.auto_model.auto_loader import AutoLoader
import torch
import bminf
import time
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


loader = AutoLoader(task_name="lm",
                    model_name="galactica-6.7b-en",
                    model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/")

model = loader.get_model()
with torch.cuda.device(0):
    model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30)
model.to(device)
model.eval()
tokenizer = loader.get_tokenizer()
predictor = Predictor(model, tokenizer)
print("model loaded")
time_start=time.time()

text = "Please write a abstract about the computer vision. \n"
out = predictor.predict_generate_randomsample(text,
                                              out_max_length=700,
                                              top_k=50,
                                              repetition_penalty=1.2,
                                              temperature=0.7
                                              )

time_end=time.time()
print('time cost',time_end-time_start,'s')
print(out)



File renamed without changes.
9 changes: 7 additions & 2 deletions examples/cpm_1/generate_bminf.py
@@ -4,6 +4,8 @@
from flagai.auto_model.auto_loader import AutoLoader
from flagai.model.predictor.predictor import Predictor
import bminf
import time


if __name__ == '__main__':

@@ -17,8 +19,9 @@
device="cpu")

model = loader.get_model()
# with torch.cuda.device(0):
# model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30)
time_start=time.time()
with torch.cuda.device(0):
model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30)
tokenizer = loader.get_tokenizer()

predictor = Predictor(model=model,
@@ -28,5 +31,7 @@
    out = predictor.predict_generate_randomsample(text,
                                                  top_p=0.9,
                                                  out_max_length=50)
    time_end=time.time()
    print('time cost',time_end-time_start,'s')

    print(out)
50 changes: 17 additions & 33 deletions examples/galactica/generate_galactica_1.3b.py
@@ -1,41 +1,25 @@
import sys
sys.path.append("/home/yanzhaodong/anhforth/FlagAI")
from flagai.model.predictor.predictor import Predictor
from flagai.auto_model.auto_loader import AutoLoader
import torch
import bminf
import time
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

loader = AutoLoader(task_name="lm",
                    model_name="galactica-1.3b-en",
                    model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/")

from flagai.data.tokenizer import Tokenizer
tokenizer = Tokenizer.from_pretrained("galactica-1.3b-en",cache_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/")


# loader = AutoLoader(task_name="lm",
#                     model_name="galactica-1.3b-en",
#                     model_dir="/share/projset/baaishare/baai-mrnd/xingzhaohu/")

# model = loader.get_model()
# model.to(device)
# model.eval()
# time_start=time.time()
# # with torch.cuda.device(0):
# # model = bminf.wrapper(model, quantization=False, memory_limit=20 << 30)
# tokenizer = loader.get_tokenizer()

# predictor = Predictor(model, tokenizer)

# text = "Please write a abstract about the computer vision. \n"
# out = predictor.predict_generate_randomsample(text,
#                                               out_max_length=700,
#                                               top_k=50,
#                                               repetition_penalty=1.2,
#                                               temperature=0.7
#                                               )
# time_end=time.time()
# print('time cost',time_end-time_start,'s')
# print(out)
model = loader.get_model()
model.to(device)
model.eval()

tokenizer = loader.get_tokenizer()

predictor = Predictor(model, tokenizer)

text = "Please write a abstract about the computer vision. \n"
out = predictor.predict_generate_randomsample(text,
                                              out_max_length=700,
                                              top_k=50,
                                              repetition_penalty=1.2,
                                              temperature=0.7
                                              )
print(out)
2 changes: 0 additions & 2 deletions examples/gpt2_text_writting/generate.py
@@ -1,8 +1,6 @@
# Copyright © 2022 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
import sys
sys.path.append("/home/yanzhaodong/anhforth/FlagAI")
from flagai.auto_model.auto_loader import AutoLoader
from flagai.model.predictor.predictor import Predictor

6 changes: 5 additions & 1 deletion flagai/data/tokenizer/tokenizer.py
@@ -457,8 +457,12 @@ def DecodeTokens(self, tokens):
"""A list of tokens => recovered text string"""
return self.text_tokenizer.convert_tokens_to_string(tokens)

def convert_tokens_to_ids(self, tokens):
return self.text_tokenizer.convert_tokens_to_ids(tokens)

def convert_ids_to_tokens(self, ids):
return self.text_tokenizer.convert_ids_to_tokens(ids)

# class BaseTokenizer(object):

class TextTokenizer(object):
    """
6 changes: 0 additions & 6 deletions flagai/data/tokenizer/uni_tokenizer/tokenizer.py
@@ -153,7 +153,6 @@ def __init__(self,
        }
        self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
        self._command_token_tokens = list(self.command_token_map.keys())
        # import pdb;pdb.set_trace()
        vocab = self.text_tokenizer.get_vocab()
        self.token_start_id = vocab.get('<s>', None)
        if not self.token_start_id:
@@ -164,10 +163,8 @@
            self.token_end_id = vocab.get('<|endoftext|>', None)
        if not self.token_end_id:
            self.token_end_id = vocab.get('[SEP]', None)
        # import pdb;pdb.set_trace()
        print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()]))
        # logger.info("All special tokens: %s", str([(k,v.Id) for k,v in self.command_name_map.items()]))
        import pdb;pbb.set_trace()


    def get_vocab(self):
@@ -229,7 +226,6 @@ def _encode(self, text):
        return ids

    def convert_tokens_to_ids(self, tokens):
        import pdb;pdb.set_trace()
        res = []
        for token in tokens:
            if token in self.command_token_map:
@@ -239,8 +235,6 @@ def convert_tokens_to_ids(self, tokens):
        return res

    def convert_ids_to_tokens(self, ids):
        # if torch.is_tensor(ids):
        #     ids = ids.tolist()
        res = []
        for id in ids:
            if id in self.command_id_map:
