[Bugfix] Fix max_num_batched_tokens for MLA (vllm-project#13620)
Signed-off-by: mgoin <[email protected]>
mgoin authored Feb 21, 2025
1 parent bfbc0b3 commit 71face8
Showing 1 changed file with 14 additions and 6 deletions.
vllm/config.py: 20 changes (14 additions & 6 deletions)
@@ -51,6 +51,9 @@
 
 logger = init_logger(__name__)
 
+# This value is chosen to have a balance between ITL and TTFT. Note it is
+# not optimized for throughput.
+_DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
 _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
 
@@ -1526,15 +1529,17 @@ def __post_init__(self) -> None:
                     # for now. Have max_num_batched_tokens set to max_model_len
                     # so we don't reject sequences on account of a short
                     # max_num_batched_tokens.
-                    self.max_num_batched_tokens = max(self.max_model_len, 2048)
+                    self.max_num_batched_tokens = max(
+                        self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)
                 else:
-                    # This value is chosen to have a balance between ITL
-                    # and TTFT. Note it is not optimized for throughput.
-                    self.max_num_batched_tokens = 2048
+                    self.max_num_batched_tokens = (
+                        _DEFAULT_MAX_NUM_BATCHED_TOKENS)
             else:
-                # If max_model_len is too short, use 2048 as the default value
+                # If max_model_len is too short, use
+                # _DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
                 # for higher throughput.
-                self.max_num_batched_tokens = max(self.max_model_len, 2048)
+                self.max_num_batched_tokens = max(
+                    self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)
 
         if self.runner_type == "pooling":
             # Choose specific value for higher throughput
@@ -3333,6 +3338,9 @@ def __post_init__(self):
                 "caching to be disabled.")
             self.scheduler_config.enable_chunked_prefill = False
             self.scheduler_config.chunked_prefill_enabled = False
+            self.scheduler_config.max_num_batched_tokens = max(
+                self.scheduler_config.max_model_len,
+                _DEFAULT_MAX_NUM_BATCHED_TOKENS)
 
             if self.cache_config is not None:
                 self.cache_config.enable_prefix_caching = False
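
For context, here is a minimal standalone sketch of the defaulting rule this change enforces; the function resolve_max_num_batched_tokens and its parameters are illustrative only, not vLLM's API. The point of the fix is that once MLA forces chunked prefill off, the per-step token budget must cover max_model_len, otherwise prompts near the context limit would be rejected.

# Minimal sketch (illustrative names, not vLLM's actual API).
_DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048  # balances ITL and TTFT


def resolve_max_num_batched_tokens(max_model_len: int,
                                   chunked_prefill_enabled: bool) -> int:
    """Pick a default per-step token budget for the scheduler."""
    if chunked_prefill_enabled:
        # Prompts can be split across steps, so a small budget suffices.
        return _DEFAULT_MAX_NUM_BATCHED_TOKENS
    # Without chunked prefill a prompt must fit in a single batch, so the
    # budget has to cover the full context length.
    return max(max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)


if __name__ == "__main__":
    # Chunked prefill forced off (e.g. MLA on an unsupported platform):
    # the budget is raised to the model's context length, not left at 2048.
    print(resolve_max_num_batched_tokens(16384, False))  # 16384
    print(resolve_max_num_batched_tokens(1024, True))    # 2048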
