feat(mixtral): correct support for mixtral (bentoml#772)
feat(mixtral): support inference with pt

Signed-off-by: Aaron <[email protected]>
aarnphm authored Dec 13, 2023
1 parent eb31034 commit 3ab78cd
Showing 9 changed files with 226 additions and 14 deletions.
69 changes: 69 additions & 0 deletions README.md
@@ -724,6 +724,7 @@ You can specify any of the following Mistral models via `openllm start`:
- [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha)
- [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
- [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
- [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
- [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
@@ -765,6 +766,74 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
<details>
<summary>Mixtral</summary>
### Quickstart
Run the following command to quickly spin up a Mixtral server:
```bash
openllm start mistralai/Mixtral-8x7B-Instruct-v0.1
```
In a different terminal, run the following command to interact with the server:
```bash
export OPENLLM_ENDPOINT=http://localhost:3000
openllm query 'What are large language models?'
```
> **Note:** Any Mixtral variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mixtral) to see more Mixtral-compatible models.
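The server can also be queried from Python rather than the CLI. A minimal sketch, assuming the `openllm-client` package and its `HTTPClient.generate` API (the exact response schema may vary by version; `openllm query` above is the documented path):
```python
import openllm

# Connect to the server started by `openllm start` above.
client = openllm.client.HTTPClient('http://localhost:3000')
res = client.generate('What are large language models?')
print(res)  # inspect the returned generation object
```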
### Supported models
You can specify any of the following Mixtral models via `openllm start`:
- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
### Supported backends
OpenLLM supports both vLLM and PyTorch backends. By default, it uses vLLM when vLLM is available and falls back to PyTorch otherwise.
> **Important:** We recommend explicitly specifying `--backend` to choose the desired backend for running the model. If you have access to a GPU, always use `--backend vllm` (a short sketch for picking a backend programmatically follows the list below).
- vLLM (Recommended):
To install vLLM, run `pip install "openllm[vllm]"`
```bash
openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend vllm
```
> **Important:** vLLM requires a GPU with compute architecture 8.0 or newer to get the best serving performance. For all production serving use cases, vLLM is the recommended backend.
> **Note:** Adapters are not yet supported with vLLM.
- PyTorch:
```bash
openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend pt
```
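Since vLLM wants a GPU of compute architecture 8.0 or newer, one way to choose the backend flag is to inspect the local device first. A minimal sketch, assuming `torch` is installed; `pick_backend` is a hypothetical helper, not part of OpenLLM:
```python
import torch

def pick_backend() -> str:
    # Prefer vLLM only on Ampere (8.x) or newer GPUs; otherwise use PyTorch.
    if torch.cuda.is_available():
        major, _minor = torch.cuda.get_device_capability()
        if major >= 8:
            return 'vllm'
    return 'pt'

print(f'openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend {pick_backend()}')
```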
</details>
<details>
<summary>MPT</summary>
7 changes: 7 additions & 0 deletions local.sh
@@ -82,11 +82,18 @@ done

validate_extensions

# If $GIT_ROOT/.python-version does not exist, symlink .python-version-default to .python-version
if [ ! -f "$GIT_ROOT/.python-version" ]; then
echo "Symlinking .python-version-default to .python-version"
ln -s "$GIT_ROOT/.python-version-default" "$GIT_ROOT/.python-version"
fi

# Check if the EXTENSIONS array is empty
if [ ${#EXTENSIONS[@]} -eq 0 ]; then
echo "No extensions specified"
EXTENSIONS_STR=""
else
echo "Installing extensions: ${EXTENSIONS[*]}"
EXTENSIONS_STR="[${EXTENSIONS[*]}]"
EXTENSIONS_STR=${EXTENSIONS_STR// /,} # Replace spaces with commas
fi
10 changes: 3 additions & 7 deletions openllm-core/src/openllm_core/config/__init__.py
@@ -1,10 +1,5 @@
from __future__ import annotations

from .configuration_auto import (
CONFIG_MAPPING as CONFIG_MAPPING,
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
AutoConfig as AutoConfig,
)
# AUTOGENERATED BY update-config-stubs.py. DO NOT EDIT
from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig
from .configuration_baichuan import BaichuanConfig as BaichuanConfig
from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
from .configuration_dolly_v2 import DollyV2Config as DollyV2Config
@@ -13,6 +8,7 @@
from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig
from .configuration_llama import LlamaConfig as LlamaConfig
from .configuration_mistral import MistralConfig as MistralConfig
from .configuration_mixtral import MixtralConfig as MixtralConfig
from .configuration_mpt import MPTConfig as MPTConfig
from .configuration_opt import OPTConfig as OPTConfig
from .configuration_phi import PhiConfig as PhiConfig
8 changes: 6 additions & 2 deletions openllm-core/src/openllm_core/config/configuration_auto.py
@@ -26,18 +26,19 @@
[
('flan_t5', 'FlanT5Config'),
('baichuan', 'BaichuanConfig'),
('chatglm', 'ChatGLMConfig'), #
('chatglm', 'ChatGLMConfig'),
('falcon', 'FalconConfig'),
('gpt_neox', 'GPTNeoXConfig'),
('dolly_v2', 'DollyV2Config'),
('stablelm', 'StableLMConfig'), #
('stablelm', 'StableLMConfig'),
('llama', 'LlamaConfig'),
('mpt', 'MPTConfig'),
('opt', 'OPTConfig'),
('phi', 'PhiConfig'),
('qwen', 'QwenConfig'),
('starcoder', 'StarCoderConfig'),
('mistral', 'MistralConfig'),
('mixtral', 'MixtralConfig'),
('yi', 'YiConfig'),
]
)
@@ -137,6 +138,9 @@ def for_model(cls, model_name: t.Literal['llama'], **attrs: t.Any) -> openllm_core.config.LlamaConfig: ...
def for_model(cls, model_name: t.Literal['mistral'], **attrs: t.Any) -> openllm_core.config.MistralConfig: ...
@t.overload
@classmethod
def for_model(cls, model_name: t.Literal['mixtral'], **attrs: t.Any) -> openllm_core.config.MixtralConfig: ...
@t.overload
@classmethod
def for_model(cls, model_name: t.Literal['mpt'], **attrs: t.Any) -> openllm_core.config.MPTConfig: ...
@t.overload
@classmethod
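With the new `('mixtral', 'MixtralConfig')` mapping entry and `for_model` overload, the Mixtral config now resolves by its short name. A minimal usage sketch, assuming `openllm_core` is importable:
```python
from openllm_core.config import AutoConfig

# 'mixtral' is the key registered in CONFIG_MAPPING_NAMES above.
config = AutoConfig.for_model('mixtral')
print(type(config).__name__)  # MixtralConfig
```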
1 change: 1 addition & 0 deletions openllm-core/src/openllm_core/config/configuration_mistral.py
@@ -26,6 +26,7 @@ class MistralConfig(openllm_core.LLMConfig):
'model_ids': [
'HuggingFaceH4/zephyr-7b-alpha',
'HuggingFaceH4/zephyr-7b-beta',
'mistralai/Mistral-7B-Instruct-v0.2',
'mistralai/Mistral-7B-Instruct-v0.1',
'mistralai/Mistral-7B-v0.1',
],
58 changes: 58 additions & 0 deletions openllm-core/src/openllm_core/config/configuration_mixtral.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import openllm_core, typing as t

if t.TYPE_CHECKING:
from openllm_core._schemas import MessageParam

SINST_KEY, EINST_KEY, BOS_TOKEN, EOS_TOKEN = '[INST]', '[/INST]', '<s>', '</s>'


class MixtralConfig(openllm_core.LLMConfig):
"""The Mixtral-8x7B Large Language Model (LLM) is a pretrained generative Sparse Mixture of Experts. The Mixtral-8x7B outperforms Llama 2 70B on most benchmarks we tested.
Refer to [Mixtral's HuggingFace page](https://huggingface.co/docs/transformers/main/model_doc/mixtral)
for more information.
"""

__config__ = {
'name_type': 'lowercase',
'url': 'https://mistral.ai',
'architecture': 'MixtralForCausalLM',
'default_id': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
'model_ids': ['mistralai/Mixtral-8x7B-Instruct-v0.1', 'mistralai/Mixtral-8x7B-v0.1'],
}

class GenerationConfig:
max_new_tokens: int = 20
temperature: float = 0.7

class SamplingParams:
best_of: int = 1

# NOTE: see https://docs.mistral.ai/usage/guardrailing/ and https://docs.mistral.ai/llm/mistral-instruct-v0.1
@property
def template(self) -> str:
return '''{start_key}{start_inst} {system_message} {instruction} {end_inst}\n'''.format(
start_inst=SINST_KEY,
end_inst=EINST_KEY,
start_key=BOS_TOKEN,
system_message='{system_message}',
instruction='{instruction}',
)

# NOTE: https://docs.mistral.ai/usage/guardrailing/
@property
def system_message(self) -> str:
return '''Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity.'''

@property
def chat_template(self) -> str:
return repr("{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}")

@property
def chat_messages(self) -> list[MessageParam]:
from openllm_core._schemas import MessageParam
return [MessageParam(role='user', content='What is your favourite condiment?'),
MessageParam(role='assistant', content="Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"),
MessageParam(role='user', content='Do you have mayonnaise recipes?')]
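To see the guardrailed prompt this config produces, `template` and `system_message` can be combined. A minimal sketch (not part of the diff), assuming `MixtralConfig` instantiates with defaults:
```python
from openllm_core.config import MixtralConfig

config = MixtralConfig()
# template is '<s>[INST] {system_message} {instruction} [/INST]\n' after the
# property substitutes the start/end markers.
prompt = config.template.format(
    system_message=config.system_message,
    instruction='What are large language models?',
)
print(prompt)
```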
69 changes: 69 additions & 0 deletions openllm-python/README.md

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion openllm-python/src/openllm/__init__.pyi
@@ -11,7 +11,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
'''

# update-config-stubs.py: import stubs start
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
# update-config-stubs.py: import stubs stop

from openllm_cli._sdk import (
16 changes: 12 additions & 4 deletions tools/update-config-stubs.py
@@ -18,11 +18,12 @@
END_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs stop\n'

ROOT = Path(__file__).parent.parent
_TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py'
_TARGET_AUTO_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / 'configuration_auto.py'
_TARGET_INIT_FILE = ROOT / 'openllm-python' / 'src' / 'openllm' / '__init__.pyi'
_TARGET_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'_configuration.py'
_TARGET_AUTO_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'config'/'configuration_auto.py'
_TARGET_CORE_INIT_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'config'/'__init__.py'
_TARGET_INIT_FILE = ROOT/'openllm-python'/'src'/'openllm'/'__init__.pyi'

sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__())
sys.path.insert(0, (ROOT/'openllm-core'/'src').__fspath__())
from openllm_core._configuration import GenerationConfig, ModelSettings, SamplingParams
from openllm_core.config.configuration_auto import CONFIG_MAPPING_NAMES
from openllm_core.utils import codegen
@@ -217,6 +218,13 @@ def main() -> int:
processed = processed[:start_import_stubs_idx] + [START_IMPORT_STUBS_COMMENT, lines, END_IMPORT_STUBS_COMMENT] + processed[end_import_stubs_idx + 1 :]
with _TARGET_INIT_FILE.open('w') as f: f.writelines(processed)

lines = [
f'# AUTOGENERATED BY {os.path.basename(__file__)}. DO NOT EDIT\n',
'from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig\n',
*[f'from .configuration_{k} import {a} as {a}\n' for k, a in CONFIG_MAPPING_NAMES.items()]
]
with _TARGET_CORE_INIT_FILE.open('w') as f: f.writelines(lines)

return 0


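The added block regenerates `config/__init__.py` from `CONFIG_MAPPING_NAMES`, which is what produced the autogenerated import line seen in the `config/__init__.py` diff above. A standalone sketch of the generated output, using a trimmed-down stand-in mapping:
```python
# Stand-in for openllm_core's CONFIG_MAPPING_NAMES (short name -> class name).
CONFIG_MAPPING_NAMES = {'mistral': 'MistralConfig', 'mixtral': 'MixtralConfig'}

lines = [f'from .configuration_{k} import {a} as {a}\n' for k, a in CONFIG_MAPPING_NAMES.items()]
print(''.join(lines))
# from .configuration_mistral import MistralConfig as MistralConfig
# from .configuration_mixtral import MixtralConfig as MixtralConfig
```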
