From 66fd62df1b68d1182623d7a934469ecbfa61422c Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Wed, 18 Sep 2024 15:46:51 -0400 Subject: [PATCH] [Model] Update default prefill chunk size and max batch size (#2917) This PR updates the default prefill chunk size from 2048 to 8192, and the default max batch size from 80 to 128. --- python/mlc_llm/cli/gen_config.py | 2 +- python/mlc_llm/model/baichuan/baichuan_model.py | 8 ++++---- python/mlc_llm/model/chatglm3/chatglm3_model.py | 8 ++++---- python/mlc_llm/model/cohere/cohere_model.py | 8 ++++---- python/mlc_llm/model/gemma/gemma_model.py | 8 ++++---- python/mlc_llm/model/gpt2/gpt2_model.py | 8 ++++---- python/mlc_llm/model/gpt_bigcode/gpt_bigcode_model.py | 8 ++++---- python/mlc_llm/model/gpt_neox/gpt_neox_model.py | 8 ++++---- python/mlc_llm/model/internlm/internlm_model.py | 8 ++++---- python/mlc_llm/model/internlm2/internlm2_model.py | 8 ++++---- python/mlc_llm/model/llama/llama_model.py | 8 ++++---- python/mlc_llm/model/minicpm/minicpm_model.py | 8 ++++---- python/mlc_llm/model/mistral/mistral_model.py | 4 ++-- python/mlc_llm/model/orion/orion_model.py | 8 ++++---- python/mlc_llm/model/phi/phi_model.py | 8 ++++---- python/mlc_llm/model/phi3/phi3_model.py | 8 ++++---- python/mlc_llm/model/phi3v/phi3v_model.py | 8 ++++---- python/mlc_llm/model/qwen/qwen_model.py | 8 ++++---- python/mlc_llm/model/qwen2/qwen2_model.py | 8 ++++---- python/mlc_llm/model/stable_lm/stablelm_model.py | 8 ++++---- python/mlc_llm/model/starcoder2/starcoder2_model.py | 8 ++++---- 21 files changed, 79 insertions(+), 79 deletions(-) diff --git a/python/mlc_llm/cli/gen_config.py b/python/mlc_llm/cli/gen_config.py index 5d17e698e6..d898ab1549 100644 --- a/python/mlc_llm/cli/gen_config.py +++ b/python/mlc_llm/cli/gen_config.py @@ -86,7 +86,7 @@ def _parse_output(path: Union[str, Path]) -> Path: parser.add_argument( "--max-batch-size", type=int, - default=80, + default=128, help=HELP["max_batch_size"] + ' (default: "%(default)s")', ) 
parser.add_argument( diff --git a/python/mlc_llm/model/baichuan/baichuan_model.py b/python/mlc_llm/model/baichuan/baichuan_model.py index bce68b830a..ab32abfd8f 100644 --- a/python/mlc_llm/model/baichuan/baichuan_model.py +++ b/python/mlc_llm/model/baichuan/baichuan_model.py @@ -68,17 +68,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git a/python/mlc_llm/model/chatglm3/chatglm3_model.py b/python/mlc_llm/model/chatglm3/chatglm3_model.py index fa4b24e87a..37ad863829 100644 --- a/python/mlc_llm/model/chatglm3/chatglm3_model.py +++ b/python/mlc_llm/model/chatglm3/chatglm3_model.py @@ -74,17 +74,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git 
a/python/mlc_llm/model/cohere/cohere_model.py b/python/mlc_llm/model/cohere/cohere_model.py index 180c60ba13..540eff3315 100644 --- a/python/mlc_llm/model/cohere/cohere_model.py +++ b/python/mlc_llm/model/cohere/cohere_model.py @@ -63,17 +63,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) if self.num_key_value_heads == 0 or self.num_key_value_heads is None: self.num_key_value_heads = self.num_attention_heads diff --git a/python/mlc_llm/model/gemma/gemma_model.py b/python/mlc_llm/model/gemma/gemma_model.py index f8af129446..d74e84ab4a 100644 --- a/python/mlc_llm/model/gemma/gemma_model.py +++ b/python/mlc_llm/model/gemma/gemma_model.py @@ -72,17 +72,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git 
a/python/mlc_llm/model/gpt2/gpt2_model.py b/python/mlc_llm/model/gpt2/gpt2_model.py index d24b73955b..506f82decb 100644 --- a/python/mlc_llm/model/gpt2/gpt2_model.py +++ b/python/mlc_llm/model/gpt2/gpt2_model.py @@ -65,17 +65,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring,too-many-locals diff --git a/python/mlc_llm/model/gpt_bigcode/gpt_bigcode_model.py b/python/mlc_llm/model/gpt_bigcode/gpt_bigcode_model.py index fd84601112..5892956159 100644 --- a/python/mlc_llm/model/gpt_bigcode/gpt_bigcode_model.py +++ b/python/mlc_llm/model/gpt_bigcode/gpt_bigcode_model.py @@ -57,17 +57,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git 
a/python/mlc_llm/model/gpt_neox/gpt_neox_model.py b/python/mlc_llm/model/gpt_neox/gpt_neox_model.py index c7832ea68e..cf8ebdf9ef 100644 --- a/python/mlc_llm/model/gpt_neox/gpt_neox_model.py +++ b/python/mlc_llm/model/gpt_neox/gpt_neox_model.py @@ -72,17 +72,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git a/python/mlc_llm/model/internlm/internlm_model.py b/python/mlc_llm/model/internlm/internlm_model.py index 4c7793ca2a..05d87003c0 100644 --- a/python/mlc_llm/model/internlm/internlm_model.py +++ b/python/mlc_llm/model/internlm/internlm_model.py @@ -67,17 +67,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git 
a/python/mlc_llm/model/internlm2/internlm2_model.py b/python/mlc_llm/model/internlm2/internlm2_model.py index 75af3b86a8..c039cc2e8f 100644 --- a/python/mlc_llm/model/internlm2/internlm2_model.py +++ b/python/mlc_llm/model/internlm2/internlm2_model.py @@ -69,17 +69,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git a/python/mlc_llm/model/llama/llama_model.py b/python/mlc_llm/model/llama/llama_model.py index dfad1b13ce..a9177308de 100644 --- a/python/mlc_llm/model/llama/llama_model.py +++ b/python/mlc_llm/model/llama/llama_model.py @@ -89,17 +89,17 @@ def __post_init__(self): # pylint: disable=too-many-branches logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git 
a/python/mlc_llm/model/minicpm/minicpm_model.py b/python/mlc_llm/model/minicpm/minicpm_model.py index 7cbe261d2c..c991aa2339 100644 --- a/python/mlc_llm/model/minicpm/minicpm_model.py +++ b/python/mlc_llm/model/minicpm/minicpm_model.py @@ -77,17 +77,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git a/python/mlc_llm/model/mistral/mistral_model.py b/python/mlc_llm/model/mistral/mistral_model.py index 3786d2f049..854ce5be56 100644 --- a/python/mlc_llm/model/mistral/mistral_model.py +++ b/python/mlc_llm/model/mistral/mistral_model.py @@ -87,9 +87,9 @@ def __post_init__(self): # pylint: disable=too-many-branches logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(*prefill_chunk_size_candidates, 2048), + min(*prefill_chunk_size_candidates, 8192), ) - self.prefill_chunk_size = min(*prefill_chunk_size_candidates, 2048) + self.prefill_chunk_size = min(*prefill_chunk_size_candidates, 8192) # pylint: disable=invalid-name,missing-docstring diff --git a/python/mlc_llm/model/orion/orion_model.py b/python/mlc_llm/model/orion/orion_model.py index 8ab70b8ba8..97c4c2bb52 100644 --- a/python/mlc_llm/model/orion/orion_model.py +++ b/python/mlc_llm/model/orion/orion_model.py @@ -72,17 +72,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - 
min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git a/python/mlc_llm/model/phi/phi_model.py b/python/mlc_llm/model/phi/phi_model.py index c012736b61..2558f49a81 100644 --- a/python/mlc_llm/model/phi/phi_model.py +++ b/python/mlc_llm/model/phi/phi_model.py @@ -67,17 +67,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) if self.num_key_value_heads == 0 or self.num_key_value_heads is None: self.num_key_value_heads = self.num_attention_heads if self.intermediate_size == 0 or self.intermediate_size is None: diff --git a/python/mlc_llm/model/phi3/phi3_model.py b/python/mlc_llm/model/phi3/phi3_model.py index 06c20c8dca..95aad6a97b 100644 --- a/python/mlc_llm/model/phi3/phi3_model.py +++ b/python/mlc_llm/model/phi3/phi3_model.py @@ -72,17 +72,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", 
bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) if self.num_key_value_heads == 0 or self.num_key_value_heads is None: self.num_key_value_heads = self.num_attention_heads diff --git a/python/mlc_llm/model/phi3v/phi3v_model.py b/python/mlc_llm/model/phi3v/phi3v_model.py index 4bb9c3b5a6..d874bd9655 100644 --- a/python/mlc_llm/model/phi3v/phi3v_model.py +++ b/python/mlc_llm/model/phi3v/phi3v_model.py @@ -101,17 +101,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) if self.num_key_value_heads == 0 or self.num_key_value_heads is None: self.num_key_value_heads = self.num_attention_heads diff --git a/python/mlc_llm/model/qwen/qwen_model.py b/python/mlc_llm/model/qwen/qwen_model.py index 7fb7e0eb82..23d735a27b 100644 --- a/python/mlc_llm/model/qwen/qwen_model.py +++ b/python/mlc_llm/model/qwen/qwen_model.py @@ -65,17 +65,17 @@ def 
__post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git a/python/mlc_llm/model/qwen2/qwen2_model.py b/python/mlc_llm/model/qwen2/qwen2_model.py index 2dae3240cf..93d2936125 100644 --- a/python/mlc_llm/model/qwen2/qwen2_model.py +++ b/python/mlc_llm/model/qwen2/qwen2_model.py @@ -67,17 +67,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring,too-many-locals diff --git a/python/mlc_llm/model/stable_lm/stablelm_model.py b/python/mlc_llm/model/stable_lm/stablelm_model.py index 4f874af633..95e545f78b 100644 --- a/python/mlc_llm/model/stable_lm/stablelm_model.py +++ b/python/mlc_llm/model/stable_lm/stablelm_model.py @@ -66,17 +66,17 @@ def __post_init__(self): logger.info( "%s 
defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring diff --git a/python/mlc_llm/model/starcoder2/starcoder2_model.py b/python/mlc_llm/model/starcoder2/starcoder2_model.py index c94fb754f4..c7bab4c104 100644 --- a/python/mlc_llm/model/starcoder2/starcoder2_model.py +++ b/python/mlc_llm/model/starcoder2/starcoder2_model.py @@ -69,17 +69,17 @@ def __post_init__(self): logger.info( "%s defaults to %d", bold("prefill_chunk_size"), - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) elif self.prefill_chunk_size > self.context_window_size: logger.info( "Overriding %s from %d to %d", bold("prefill_chunk_size"), self.prefill_chunk_size, - min(self.context_window_size, 2048), + min(self.context_window_size, 8192), ) - self.prefill_chunk_size = min(self.context_window_size, 2048) + self.prefill_chunk_size = min(self.context_window_size, 8192) # pylint: disable=invalid-name,missing-docstring