[Misc] Clean Up EngineArgs.create_engine_config (vllm-project#13734)
robertgshaw2-redhat authored Feb 24, 2025
1 parent db986c1 commit 1f0ae3e
Showing 2 changed files with 29 additions and 40 deletions.
4 changes: 4 additions & 0 deletions vllm/config.py
@@ -1124,6 +1124,10 @@ def metrics_info(self):
         return {key: str(value) for key, value in self.__dict__.items()}
 
     def _verify_args(self) -> None:
+        if self.cpu_offload_gb < 0:
+            raise ValueError("CPU offload space must be non-negative"
+                             f", but got {self.cpu_offload_gb}")
+
         if self.gpu_memory_utilization > 1.0:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
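For context, a minimal standalone sketch of the validation this hunk adds; _ConfigSketch is a stand-in for the vLLM config class that owns these fields, not code from the diff:

    class _ConfigSketch:
        # Stand-in for the vLLM config class holding cpu_offload_gb and
        # gpu_memory_utilization (illustrative only).

        def __init__(self, cpu_offload_gb: float = 0,
                     gpu_memory_utilization: float = 0.9) -> None:
            self.cpu_offload_gb = cpu_offload_gb
            self.gpu_memory_utilization = gpu_memory_utilization
            self._verify_args()

        def _verify_args(self) -> None:
            # The new check: reject negative offload sizes up front with a
            # ValueError, replacing the assert removed from arg_utils.py below.
            if self.cpu_offload_gb < 0:
                raise ValueError("CPU offload space must be non-negative"
                                 f", but got {self.cpu_offload_gb}")
            if self.gpu_memory_utilization > 1.0:
                raise ValueError(
                    "GPU memory utilization must be less than 1.0. Got "
                    f"{self.gpu_memory_utilization}.")

    try:
        _ConfigSketch(cpu_offload_gb=-2)
    except ValueError as err:
        print(err)  # CPU offload space must be non-negative, but got -2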
65 changes: 25 additions & 40 deletions vllm/engine/arg_utils.py
@@ -1062,6 +1062,17 @@ def from_cli_args(cls, args: argparse.Namespace):
         return engine_args
 
     def create_model_config(self) -> ModelConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # NOTE: This is to allow model loading from S3 in CI
+        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+                and self.model in MODELS_ON_S3
+                and self.load_format == LoadFormat.AUTO):  # noqa: E501
+            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+            self.load_format = LoadFormat.RUNAI_STREAMER
+
         return ModelConfig(
             model=self.model,
             task=self.task,
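The gguf and S3 special cases above now run before the ModelConfig is built instead of inside create_engine_config. A hedged usage sketch of the gguf path (the local file path is illustrative and would need to exist for loading to succeed):

    from vllm.engine.arg_utils import EngineArgs

    # check_gguf_file() detects the local .gguf file, so create_model_config()
    # forces quantization == load_format == "gguf" before building the
    # ModelConfig; the S3 branch likewise rewrites self.model, but only in CI
    # when VLLM_CI_USE_S3 is set.
    args = EngineArgs(model="./llama-3-8b.Q4_K_M.gguf")
    model_config = args.create_model_config()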
@@ -1101,26 +1112,6 @@ def create_model_config(self) -> ModelConfig:
         )
 
     def create_load_config(self) -> LoadConfig:
-        return LoadConfig(
-            load_format=self.load_format,
-            download_dir=self.download_dir,
-            model_loader_extra_config=self.model_loader_extra_config,
-            ignore_patterns=self.ignore_patterns,
-        )
-
-    def create_engine_config(self,
-                             usage_context: Optional[UsageContext] = None
-                             ) -> VllmConfig:
-        from vllm.platforms import current_platform
-        current_platform.pre_register_and_update()
-
-        if envs.VLLM_USE_V1:
-            self._override_v1_engine_args(usage_context)
-
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
-            self.quantization = self.load_format = "gguf"
-
         # bitsandbytes quantization needs a specific model loader
         # so we make sure the quant method and the load format are consistent
         if (self.quantization == "bitsandbytes" or
@@ -1137,19 +1128,23 @@ def create_engine_config(self,
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
 
-        assert self.cpu_offload_gb >= 0, (
-            "CPU offload space must be non-negative"
-            f", but got {self.cpu_offload_gb}")
+        return LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
+            ignore_patterns=self.ignore_patterns,
+        )
 
-        device_config = DeviceConfig(device=self.device)
+    def create_engine_config(self,
+                             usage_context: Optional[UsageContext] = None
+                             ) -> VllmConfig:
+        from vllm.platforms import current_platform
+        current_platform.pre_register_and_update()
 
-        # NOTE: This is to allow model loading from S3 in CI
-        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
-                and self.model in MODELS_ON_S3
-                and self.load_format == LoadFormat.AUTO):  # noqa: E501
-            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
-            self.load_format = LoadFormat.RUNAI_STREAMER
+        if envs.VLLM_USE_V1:
+            self._override_v1_engine_args(usage_context)
 
+        device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()
 
         if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
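With this hunk, the bitsandbytes consistency check runs when the LoadConfig is created rather than in create_engine_config. A hedged sketch of the error path shown above (model name illustrative; quantization is left at its default, which is not "bitsandbytes"):

    from vllm.engine.arg_utils import EngineArgs

    # A bitsandbytes load format without bitsandbytes quantization trips the
    # ValueError in the hunk above, now raised from create_load_config().
    args = EngineArgs(model="facebook/opt-125m", load_format="bitsandbytes")
    try:
        args.create_load_config()
    except ValueError as err:
        print(err)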
@@ -1281,16 +1276,6 @@ def create_engine_config(self,
             if speculative_config is None \
             else speculative_config.num_lookahead_slots
 
-        if not self.use_v2_block_manager:
-            logger.warning(
-                "[DEPRECATED] Block manager v1 has been removed, "
-                "and setting --use-v2-block-manager to True or False has "
-                "no effect on vLLM behavior. Please remove "
-                "--use-v2-block-manager in your engine argument. "
-                "If your use case is not supported by "
-                "SelfAttnBlockSpaceManager (i.e. block manager v2),"
-                " please file an issue with detailed information.")
-
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
             max_num_batched_tokens=self.max_num_batched_tokens,