feat(engine): CTranslate2 (bentoml#698)
* chore: update instruction for dependencies

Signed-off-by: Aaron <[email protected]>

* feat(experimental): CTranslate2

Signed-off-by: Aaron <[email protected]>

---------

Signed-off-by: Aaron <[email protected]>
aarnphm authored Nov 19, 2023
1 parent 539f250 commit 816c1ee
Showing 31 changed files with 945 additions and 350 deletions.
160 changes: 152 additions & 8 deletions README.md
@@ -416,6 +416,25 @@ openllm start databricks/dolly-v2-3b --backend vllm
openllm start databricks/dolly-v2-3b --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start databricks/dolly-v2-3b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including `int8`, `int8_float16`, and `int8_bfloat16`.
> **Note:** We recommend converting models beforehand and passing the directory of the converted model to `openllm start`. See the [CTranslate2 documentation](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
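Conversion can be done ahead of time with CTranslate2's Python API. A minimal sketch, assuming an output directory named `dolly-v2-3b-ct2` (the name is illustrative, not prescribed by OpenLLM):

```python
# Convert a Hugging Face checkpoint to the CTranslate2 format ahead of time,
# then point `openllm start` at the resulting directory.
from ctranslate2.converters import TransformersConverter

converter = TransformersConverter('databricks/dolly-v2-3b')
converter.convert('dolly-v2-3b-ct2', quantization='int8')  # or int8_float16, int8_bfloat16
```

Afterwards, start the server against the converted directory with `openllm start ./dolly-v2-3b-ct2 --backend ctranslate`.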
</details>
<details>
@@ -494,6 +513,25 @@ openllm start tiiuae/falcon-7b --backend vllm
openllm start tiiuae/falcon-7b --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start tiiuae/falcon-7b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including `int8`, `int8_float16`, and `int8_bfloat16`.
> **Note:** We recommend converting models beforehand and passing the directory of the converted model to `openllm start`. See the [CTranslate2 documentation](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
</details>
<details>
@@ -615,22 +653,33 @@ openllm start eleutherai/gpt-neox-20b --backend vllm
openllm start eleutherai/gpt-neox-20b --backend pt
```
- CTranslate2 (*experimental*):

```bash
openllm start eleutherai/gpt-neox-20b --backend ctranslate
```

> **Note:** Currently, all quantization methods from CTranslate2 are supported, including `int8`, `int8_float16`, and `int8_bfloat16`.
> **Note:** We recommend converting models beforehand and passing the directory of the converted model to `openllm start`. See the [CTranslate2 documentation](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.

</details>

<details>

<summary>Llama</summary>

### Quickstart

> **Note:** Llama requires installing OpenLLM with the extra:
> ```bash
> pip install "openllm[llama]"
> ```
Run the following command to quickly spin up a Llama server:
@@ -701,6 +750,25 @@ openllm start meta-llama/Llama-2-70b-chat-hf --backend vllm
openllm start meta-llama/Llama-2-70b-chat-hf --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start meta-llama/Llama-2-70b-chat-hf --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including `int8`, `int8_float16`, and `int8_bfloat16`.
> **Note:** We recommend converting models beforehand and passing the directory of the converted model to `openllm start`. See the [CTranslate2 documentation](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
</details>
<details>
@@ -852,6 +920,25 @@ TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend vllm
TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend pt
```
- CTranslate2 (*experimental*):
```bash
TRUST_REMOTE_CODE=True openllm start mosaicml/mpt-7b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including `int8`, `int8_float16`, and `int8_bfloat16`.
> **Note:** We recommend converting models beforehand and passing the directory of the converted model to `openllm start`. See the [CTranslate2 documentation](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
</details>
<details>
@@ -924,6 +1011,25 @@ openllm start facebook/opt-125m --backend vllm
openllm start facebook/opt-125m --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start facebook/opt-125m --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including `int8`, `int8_float16`, and `int8_bfloat16`.
> **Note:** We recommend converting models beforehand and passing the directory of the converted model to `openllm start`. See the [CTranslate2 documentation](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
</details>
<details>
@@ -1061,6 +1167,25 @@ openllm start stabilityai/stablelm-tuned-alpha-3b --backend vllm
openllm start stabilityai/stablelm-tuned-alpha-3b --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start stabilityai/stablelm-tuned-alpha-3b --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including `int8`, `int8_float16`, and `int8_bfloat16`.
> **Note:** We recommend converting models beforehand and passing the directory of the converted model to `openllm start`. See the [CTranslate2 documentation](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
</details>
<details>
@@ -1137,6 +1262,25 @@ openllm start bigcode/starcoder --backend vllm
openllm start bigcode/starcoder --backend pt
```
- CTranslate2 (*experimental*):
```bash
openllm start bigcode/starcoder --backend ctranslate
```
> **Note:** Currently, all quantization methods from CTranslate2 are supported, including `int8`, `int8_float16`, and `int8_bfloat16`.
> **Note:** We recommend converting models beforehand and passing the directory of the converted model to `openllm start`. See the [CTranslate2 documentation](https://opennmt.net/CTranslate2/conversion.html) for more information.
> **Important:** CTranslate2 is an experimental backend and is not yet fully supported. We recommend using vLLM for all production use cases.
</details>
<details>
6 changes: 4 additions & 2 deletions all.sh
@@ -1,10 +1,12 @@
#!/usr/bin/env bash

printf "Running update-mypy.py\n"
python ./tools/update-mypy.py
printf "Running dependencies.py\n"
python ./tools/dependencies.py
printf "Running update-config-stubs.py\n"
python ./tools/update-config-stubs.py
printf "Running update-readme.py\n"
python ./tools/update-readme.py
printf "Running mirror.sh\n"
bash ./tools/mirror.sh
2 changes: 1 addition & 1 deletion mypy.ini


1 change: 1 addition & 0 deletions openllm-client/src/openllm_client/_utils.pyi
@@ -37,6 +37,7 @@ from openllm_core.utils.import_utils import (
is_autogptq_available as is_autogptq_available,
is_bentoml_available as is_bentoml_available,
is_bitsandbytes_available as is_bitsandbytes_available,
+ is_ctranslate_available as is_ctranslate_available,
is_grpc_available as is_grpc_available,
is_jupyter_available as is_jupyter_available,
is_jupytext_available as is_jupytext_available,
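With the re-export above, client code can feature-gate CTranslate2 support. A minimal sketch of the usual guard pattern, assuming the runtime module mirrors this stub:

```python
# Feature-gate the optional ctranslate2 import behind the availability check.
from openllm_client._utils import is_ctranslate_available

if is_ctranslate_available():
  import ctranslate2  # only imported when the package is actually installed
else:
  ctranslate2 = None  # callers must check for None before using the backend
```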
2 changes: 1 addition & 1 deletion openllm-core/src/openllm_core/_configuration.py
@@ -457,7 +457,7 @@ def __getitem__(self, key: str) -> t.Any:
model_ids=['__default__'],
architecture='PreTrainedModel',
serialisation='legacy',
- backend=('pt', 'vllm'),
+ backend=('pt', 'vllm', 'ctranslate'),
name_type='dasherize',
url='',
model_type='causal_lm',
21 changes: 7 additions & 14 deletions openllm-core/src/openllm_core/_schemas.py
@@ -7,23 +7,16 @@

from ._configuration import LLMConfig
from .config import AutoConfig
- from .utils import ReprMixin, converter, gen_random_uuid
+ from .utils import converter, gen_random_uuid

if t.TYPE_CHECKING:
import vllm

from ._typing_compat import Self


- @attr.define(repr=False)
- class _SchemaMixin(ReprMixin):
-   @property
-   def __repr_keys__(self):
-     return list(attr.fields_dict(self.__class__))
-
-   def __repr_args__(self):
-     yield from ((k, getattr(self, k)) for k in self.__repr_keys__)
+ @attr.define
+ class _SchemaMixin:
def model_dump(self) -> dict[str, t.Any]:
return converter.unstructure(self)

@@ -34,7 +27,7 @@ def with_options(self, **options: t.Any) -> Self:
return attr.evolve(self, **options)


- @attr.define(repr=False)
+ @attr.define
class MetadataOutput(_SchemaMixin):
model_id: str
timeout: int
@@ -56,7 +49,7 @@ def model_dump(self) -> dict[str, t.Any]:
}


- @attr.define(repr=False)
+ @attr.define
class GenerationInput(_SchemaMixin):
prompt: str
llm_config: LLMConfig
@@ -116,7 +109,7 @@ def examples(_: type[GenerationInput]) -> dict[str, t.Any]:
FinishReason = t.Literal['length', 'stop']


- @attr.define(repr=False)
+ @attr.define
class CompletionChunk(_SchemaMixin):
index: int
text: str
@@ -129,7 +122,7 @@ def model_dump_json(self) -> str:
return orjson.dumps(self.model_dump(), option=orjson.OPT_NON_STR_KEYS).decode('utf-8')


- @attr.define(repr=False)
+ @attr.define
class GenerationOutput(_SchemaMixin):
prompt: str
finished: bool
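For reference, the slimmed-down mixin keeps only the dump helpers, with `model_dump` delegating to the shared cattrs converter. A self-contained sketch of the same pattern (the class and field names are illustrative, not the openllm ones):

```python
import attr
import cattrs

converter = cattrs.Converter()

@attr.define
class Chunk:
  index: int
  text: str

# unstructure turns the attrs instance into plain Python containers,
# which is all model_dump does under the hood.
assert converter.unstructure(Chunk(index=0, text='hello')) == {'index': 0, 'text': 'hello'}
```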
9 changes: 6 additions & 3 deletions openllm-core/src/openllm_core/_typing_compat.py
@@ -5,17 +5,20 @@
import attr

if t.TYPE_CHECKING:
+ from ctranslate2 import Generator, Translator
from peft.peft_model import PeftModel
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast

from .utils.lazy import VersionInfo
else:
# NOTE: t.Any is also a type
- PeftModel = PreTrainedModel = PreTrainedTokenizer = PreTrainedTokenizerBase = PreTrainedTokenizerFast = t.Any
+ PeftModel = (
+   PreTrainedModel
+ ) = PreTrainedTokenizer = PreTrainedTokenizerBase = PreTrainedTokenizerFast = Generator = Translator = t.Any
# NOTE: that VersionInfo is from openllm.utils.lazy.VersionInfo
VersionInfo = t.Any

- M = t.TypeVar('M', bound=t.Union[PreTrainedModel, PeftModel])
+ M = t.TypeVar('M', bound=t.Union[PreTrainedModel, PeftModel, Generator, Translator])
T = t.TypeVar('T', bound=t.Union[PreTrainedTokenizerFast, PreTrainedTokenizer, PreTrainedTokenizerBase])


@@ -33,7 +36,7 @@ def get_literal_args(typ: t.Any) -> tuple[str, ...]:
LiteralDtype = t.Literal['float16', 'float32', 'bfloat16', 'int8', 'int16']
LiteralSerialisation = t.Literal['safetensors', 'legacy']
LiteralQuantise = t.Literal['int8', 'int4', 'gptq', 'awq', 'squeezellm']
- LiteralBackend = t.Literal['pt', 'vllm', 'ctranslate', 'ggml', 'mlc']
+ LiteralBackend = t.Literal['pt', 'vllm', 'ctranslate']  # TODO: ggml
AdapterType = t.Literal[
'lora', 'adalora', 'adaption_prompt', 'prefix_tuning', 'p_tuning', 'prompt_tuning', 'ia3', 'loha', 'lokr'
]
15 changes: 10 additions & 5 deletions openllm-core/src/openllm_core/config/configuration_auto.py
@@ -185,15 +185,20 @@ def infer_class_from_name(cls, name: str) -> type[openllm_core.LLMConfig]:
f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}."
)

+ _cached_mapping = None
+
+ @classmethod
+ def _CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(cls) -> dict[str, str]:
+   if cls._cached_mapping is None:
+     AutoConfig._cached_mapping = {v.__config__['architecture']: k for k, v in CONFIG_MAPPING.items()}
+   return AutoConfig._cached_mapping

@classmethod
def infer_class_from_llm(cls, llm: openllm.LLM[M, T]) -> type[openllm_core.LLMConfig]:
if not is_bentoml_available():
raise MissingDependencyError(
"'infer_class_from_llm' requires 'bentoml' to be available. Make sure to install it with 'pip install bentoml'"
)
- CONFIG_MAPPING_NAMES_TO_ARCHITECTURE: dict[str, str] = {
-   v.__config__['architecture']: k for k, v in CONFIG_MAPPING.items()
- }
if llm._local:
config_file = os.path.join(llm.model_id, CONFIG_FILE_NAME)
else:
@@ -218,8 +223,8 @@ def infer_class_from_llm(cls, llm: openllm.LLM[M, T]) -> type[openllm_core.LLMConfig]:
loaded_config = orjson.loads(f.read())
if 'architectures' in loaded_config:
for architecture in loaded_config['architectures']:
- if architecture in CONFIG_MAPPING_NAMES_TO_ARCHITECTURE:
-   return cls.infer_class_from_name(CONFIG_MAPPING_NAMES_TO_ARCHITECTURE[architecture])
+ if architecture in cls._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE():
+   return cls.infer_class_from_name(cls._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture])
raise ValueError(
f"Failed to determine config class for '{llm.model_id}'. Make sure {llm.model_id} is saved with openllm."
)
openllm-core/src/openllm_core/config/configuration_baichuan.py
@@ -25,6 +25,7 @@ class BaichuanConfig(openllm_core.LLMConfig):
'timeout': 3600000,
'url': 'https://github.com/baichuan-inc/Baichuan-7B',
'requirements': ['cpm-kernels'],
+ 'backend': ('pt', 'vllm'),
'architecture': 'BaiChuanForCausalLM',
# NOTE: See the following
# https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555
