feat(mixtral): correct support for mixtral (bentoml#772)
feat(mixtral): support inference with pt

Signed-off-by: Aaron <[email protected]>
aarnphm authored Dec 13, 2023
1 parent eb31034 commit 3ab78cd
Showing 9 changed files with 226 additions and 14 deletions.
69 changes: 69 additions & 0 deletions README.md
@@ -724,6 +724,7 @@ You can specify any of the following Mistral models via `openllm start`:
- [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha)
- [HuggingFaceH4/zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta)
- [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
- [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
- [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
@@ -765,6 +766,74 @@ openllm start HuggingFaceH4/zephyr-7b-alpha --backend pt
<details>
<summary>Mixtral</summary>
### Quickstart
Run the following command to quickly spin up a Mixtral server:
```bash
openllm start mistralai/Mixtral-8x7B-Instruct-v0.1
```
In a different terminal, run the following command to interact with the server:
```bash
export OPENLLM_ENDPOINT=http://localhost:3000
openllm query 'What are large language models?'
```
> **Note:** Any Mixtral variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search=mixtral) to see more Mixtral-compatible models.
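The server can also be queried from Python rather than the CLI. A minimal sketch, assuming the `openllm-client` package and its `HTTPClient.generate` API (the exact response schema may vary by version; `openllm query` above is the documented path):
```python
import openllm

# Connect to the server started by `openllm start` above.
client = openllm.client.HTTPClient('http://localhost:3000')
res = client.generate('What are large language models?')
print(res)  # inspect the returned generation object
```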
### Supported models
You can specify any of the following Mixtral models via `openllm start`:
- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
### Supported backends
OpenLLM supports both vLLM and PyTorch backends. By default, it uses vLLM when vLLM is available and falls back to PyTorch otherwise.
> **Important:** We recommend explicitly specifying `--backend` to choose the desired backend for running the model. If you have access to a GPU, always use `--backend vllm` (a short sketch for picking a backend programmatically follows the list below).
- vLLM (Recommended):
To install vLLM, run `pip install "openllm[vllm]"`
```bash
openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend vllm
```
> **Important:** vLLM requires a GPU with compute architecture 8.0 or newer to get the best serving performance. For all production serving use cases, vLLM is the recommended backend.
> **Note:** Adapters are not yet supported with vLLM.
- PyTorch:
```bash
openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend pt
```
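Since vLLM wants a GPU of compute architecture 8.0 or newer, one way to choose the backend flag is to inspect the local device first. A minimal sketch, assuming `torch` is installed; `pick_backend` is a hypothetical helper, not part of OpenLLM:
```python
import torch

def pick_backend() -> str:
    # Prefer vLLM only on Ampere (8.x) or newer GPUs; otherwise use PyTorch.
    if torch.cuda.is_available():
        major, _minor = torch.cuda.get_device_capability()
        if major >= 8:
            return 'vllm'
    return 'pt'

print(f'openllm start mistralai/Mixtral-8x7B-Instruct-v0.1 --backend {pick_backend()}')
```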
</details>
<details>
<summary>MPT</summary>
7 changes: 7 additions & 0 deletions local.sh
@@ -82,11 +82,18 @@ done

validate_extensions

# If $GIT_ROOT/.python-version does not exist, symlink .python-version-default to .python-version
if [ ! -f "$GIT_ROOT/.python-version" ]; then
echo "Symlinking .python-version-default to .python-version"
ln -s "$GIT_ROOT/.python-version-default" "$GIT_ROOT/.python-version"
fi

# Check if the EXTENSIONS array is empty
if [ ${#EXTENSIONS[@]} -eq 0 ]; then
echo "No extensions specified"
EXTENSIONS_STR=""
else
echo "Installing extensions: ${EXTENSIONS[*]}"
EXTENSIONS_STR="[${EXTENSIONS[*]}]"
EXTENSIONS_STR=${EXTENSIONS_STR// /,} # Replace spaces with commas
fi
10 changes: 3 additions & 7 deletions openllm-core/src/openllm_core/config/__init__.py
@@ -1,10 +1,5 @@
from __future__ import annotations

from .configuration_auto import (
CONFIG_MAPPING as CONFIG_MAPPING,
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
AutoConfig as AutoConfig,
)
# AUTOGENERATED BY update-config-stubs.py. DO NOT EDIT
from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig
from .configuration_baichuan import BaichuanConfig as BaichuanConfig
from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig
from .configuration_dolly_v2 import DollyV2Config as DollyV2Config
@@ -13,6 +8,7 @@
from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig
from .configuration_llama import LlamaConfig as LlamaConfig
from .configuration_mistral import MistralConfig as MistralConfig
from .configuration_mixtral import MixtralConfig as MixtralConfig
from .configuration_mpt import MPTConfig as MPTConfig
from .configuration_opt import OPTConfig as OPTConfig
from .configuration_phi import PhiConfig as PhiConfig
8 changes: 6 additions & 2 deletions openllm-core/src/openllm_core/config/configuration_auto.py
@@ -26,18 +26,19 @@
[
('flan_t5', 'FlanT5Config'),
('baichuan', 'BaichuanConfig'),
('chatglm', 'ChatGLMConfig'), #
('chatglm', 'ChatGLMConfig'),
('falcon', 'FalconConfig'),
('gpt_neox', 'GPTNeoXConfig'),
('dolly_v2', 'DollyV2Config'),
('stablelm', 'StableLMConfig'), #
('stablelm', 'StableLMConfig'),
('llama', 'LlamaConfig'),
('mpt', 'MPTConfig'),
('opt', 'OPTConfig'),
('phi', 'PhiConfig'),
('qwen', 'QwenConfig'),
('starcoder', 'StarCoderConfig'),
('mistral', 'MistralConfig'),
('mixtral', 'MixtralConfig'),
('yi', 'YiConfig'),
]
)
@@ -137,6 +138,9 @@ def for_model(cls, model_name: t.Literal['llama'], **attrs: t.Any) -> openllm_core.config.LlamaConfig: ...
def for_model(cls, model_name: t.Literal['mistral'], **attrs: t.Any) -> openllm_core.config.MistralConfig: ...
@t.overload
@classmethod
def for_model(cls, model_name: t.Literal['mixtral'], **attrs: t.Any) -> openllm_core.config.MixtralConfig: ...
@t.overload
@classmethod
def for_model(cls, model_name: t.Literal['mpt'], **attrs: t.Any) -> openllm_core.config.MPTConfig: ...
@t.overload
@classmethod
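With the new `('mixtral', 'MixtralConfig')` mapping entry and `for_model` overload, the Mixtral config now resolves by its short name. A minimal usage sketch, assuming `openllm_core` is importable:
```python
from openllm_core.config import AutoConfig

# 'mixtral' is the key registered in CONFIG_MAPPING_NAMES above.
config = AutoConfig.for_model('mixtral')
print(type(config).__name__)  # MixtralConfig
```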
1 change: 1 addition & 0 deletions openllm-core/src/openllm_core/config/configuration_mistral.py
@@ -26,6 +26,7 @@ class MistralConfig(openllm_core.LLMConfig):
'model_ids': [
'HuggingFaceH4/zephyr-7b-alpha',
'HuggingFaceH4/zephyr-7b-beta',
'mistralai/Mistral-7B-Instruct-v0.2',
'mistralai/Mistral-7B-Instruct-v0.1',
'mistralai/Mistral-7B-v0.1',
],
58 changes: 58 additions & 0 deletions openllm-core/src/openllm_core/config/configuration_mixtral.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import openllm_core, typing as t

if t.TYPE_CHECKING:
from openllm_core._schemas import MessageParam

SINST_KEY, EINST_KEY, BOS_TOKEN, EOS_TOKEN = '[INST]', '[/INST]', '<s>', '</s>'


class MixtralConfig(openllm_core.LLMConfig):
"""The Mixtral-8x7B Large Language Model (LLM) is a pretrained generative Sparse Mixture of Experts. The Mixtral-8x7B outperforms Llama 2 70B on most benchmarks we tested.
Refer to [Mixtral's HuggingFace page](https://huggingface.co/docs/transformers/main/model_doc/mixtral)
for more information.
"""

__config__ = {
'name_type': 'lowercase',
'url': 'https://mistral.ai',
'architecture': 'MixtralForCausalLM',
'default_id': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
'model_ids': ['mistralai/Mixtral-8x7B-Instruct-v0.1', 'mistralai/Mixtral-8x7B-v0.1'],
}

class GenerationConfig:
max_new_tokens: int = 20
temperature: float = 0.7

class SamplingParams:
best_of: int = 1

# NOTE: see https://docs.mistral.ai/usage/guardrailing/ and https://docs.mistral.ai/llm/mistral-instruct-v0.1
@property
def template(self) -> str:
return '''{start_key}{start_inst} {system_message} {instruction} {end_inst}\n'''.format(
start_inst=SINST_KEY,
end_inst=EINST_KEY,
start_key=BOS_TOKEN,
system_message='{system_message}',
instruction='{instruction}',
)

# NOTE: https://docs.mistral.ai/usage/guardrailing/
@property
def system_message(self) -> str:
return '''Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity.'''

@property
def chat_template(self) -> str:
return repr("{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}")

@property
def chat_messages(self) -> list[MessageParam]:
from openllm_core._schemas import MessageParam
return [MessageParam(role='user', content='What is your favourite condiment?'),
MessageParam(role='assistant', content="Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"),
MessageParam(role='user', content='Do you have mayonnaise recipes?')]
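To see the guardrailed prompt this config produces, `template` and `system_message` can be combined. A minimal sketch (not part of the diff), assuming `MixtralConfig` instantiates with defaults:
```python
from openllm_core.config import MixtralConfig

config = MixtralConfig()
# template is '<s>[INST] {system_message} {instruction} [/INST]\n' after the
# property substitutes the start/end markers.
prompt = config.template.format(
    system_message=config.system_message,
    instruction='What are large language models?',
)
print(prompt)
```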
69 changes: 69 additions & 0 deletions openllm-python/README.md

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion openllm-python/src/openllm/__init__.pyi
@@ -11,7 +11,7 @@ Fine-tune, serve, deploy, and monitor any LLMs with ease.
'''

# update-config-stubs.py: import stubs start
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
from openllm_core.config import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MistralConfig as MistralConfig, MixtralConfig as MixtralConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, PhiConfig as PhiConfig, QwenConfig as QwenConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig, YiConfig as YiConfig
# update-config-stubs.py: import stubs stop

from openllm_cli._sdk import (
16 changes: 12 additions & 4 deletions tools/update-config-stubs.py
@@ -18,11 +18,12 @@
END_IMPORT_STUBS_COMMENT = f'# {os.path.basename(__file__)}: import stubs stop\n'

ROOT = Path(__file__).parent.parent
_TARGET_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / '_configuration.py'
_TARGET_AUTO_FILE = ROOT / 'openllm-core' / 'src' / 'openllm_core' / 'config' / 'configuration_auto.py'
_TARGET_INIT_FILE = ROOT / 'openllm-python' / 'src' / 'openllm' / '__init__.pyi'
_TARGET_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'_configuration.py'
_TARGET_AUTO_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'config'/'configuration_auto.py'
_TARGET_CORE_INIT_FILE = ROOT/'openllm-core'/'src'/'openllm_core'/'config'/'__init__.py'
_TARGET_INIT_FILE = ROOT/'openllm-python'/'src'/'openllm'/'__init__.pyi'

sys.path.insert(0, (ROOT / 'openllm-core' / 'src').__fspath__())
sys.path.insert(0, (ROOT/'openllm-core'/'src').__fspath__())
from openllm_core._configuration import GenerationConfig, ModelSettings, SamplingParams
from openllm_core.config.configuration_auto import CONFIG_MAPPING_NAMES
from openllm_core.utils import codegen
@@ -217,6 +218,13 @@ def main() -> int:
processed = processed[:start_import_stubs_idx] + [START_IMPORT_STUBS_COMMENT, lines, END_IMPORT_STUBS_COMMENT] + processed[end_import_stubs_idx + 1 :]
with _TARGET_INIT_FILE.open('w') as f: f.writelines(processed)

lines = [
f'# AUTOGENERATED BY {os.path.basename(__file__)}. DO NOT EDIT\n',
'from .configuration_auto import CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, AutoConfig as AutoConfig\n',
*[f'from .configuration_{k} import {a} as {a}\n' for k, a in CONFIG_MAPPING_NAMES.items()]
]
with _TARGET_CORE_INIT_FILE.open('w') as f: f.writelines(lines)

return 0


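The added block regenerates `config/__init__.py` from `CONFIG_MAPPING_NAMES`, which is what produced the autogenerated import line seen in the `config/__init__.py` diff above. A standalone sketch of the generated output, using a trimmed-down stand-in mapping:
```python
# Stand-in for openllm_core's CONFIG_MAPPING_NAMES (short name -> class name).
CONFIG_MAPPING_NAMES = {'mistral': 'MistralConfig', 'mixtral': 'MixtralConfig'}

lines = [f'from .configuration_{k} import {a} as {a}\n' for k, a in CONFIG_MAPPING_NAMES.items()]
print(''.join(lines))
# from .configuration_mistral import MistralConfig as MistralConfig
# from .configuration_mixtral import MixtralConfig as MixtralConfig
```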
