From cc983fc3e64c1ba215e981a43dd0119c03c74ff1 Mon Sep 17 00:00:00 2001
From: Chen Shen
Date: Wed, 21 Jun 2023 11:19:52 -0700
Subject: [PATCH] [serve, vllm] add vllm-example that we can reference to
 (#36617)

This adds a vLLM example for Serve that we can refer to.

---------

Signed-off-by: Chen Shen
Co-authored-by: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com>
---
 .buildkite/pipeline.gpu_large.yml         |   2 +-
 ci/env/install-dependencies.sh            |   5 +
 doc/BUILD                                 |  10 ++
 doc/source/serve/doc_code/vllm_example.py | 126 ++++++++++++++++++++++
 4 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 doc/source/serve/doc_code/vllm_example.py

diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml
index 45715a60d422b..76d8e1cd1c65c 100644
--- a/.buildkite/pipeline.gpu_large.yml
+++ b/.buildkite/pipeline.gpu_large.yml
@@ -50,7 +50,7 @@
     ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_DOC_AFFECTED"]
   commands:
     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
-    - DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+    - DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 INSTALL_VLLM=1 ./ci/env/install-dependencies.sh
     - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt
     - ./ci/env/env_info.sh
     # Test examples with newer version of `transformers`
diff --git a/ci/env/install-dependencies.sh b/ci/env/install-dependencies.sh
index 0d12d349a992e..ec96566c71453 100755
--- a/ci/env/install-dependencies.sh
+++ b/ci/env/install-dependencies.sh
@@ -394,6 +394,11 @@ install_pip_packages() {
     requirements_packages+=("holidays==0.24")  # holidays 0.25 causes `import prophet` to fail.
   fi
 
+  # Additional dependency for vllm.
+  if [ "${INSTALL_VLLM-}" = 1 ]; then
+    requirements_packages+=("vllm")
+  fi
+
   # Data processing test dependencies.
if [ "${DATA_PROCESSING_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then requirements_files+=("${WORKSPACE_DIR}/python/requirements/data_processing/requirements.txt") diff --git a/doc/BUILD b/doc/BUILD index e41975e1cf34d..2dd3b9bda2eff 100644 --- a/doc/BUILD +++ b/doc/BUILD @@ -200,6 +200,7 @@ py_test_run_all_subdirectory( "source/ray-air/doc_code/computer_vision.py", "source/ray-air/doc_code/hf_trainer.py", # Too large "source/ray-air/doc_code/predictors.py", + "source/serve/doc_code/vllm_example.py", # Requires GPU ], extra_srcs = [], tags = ["exclusive", "team:ml"], @@ -279,6 +280,15 @@ py_test_run_all_subdirectory( tags = ["exclusive", "team:ml", "ray_air", "gpu"], ) +py_test( + name = "vllm_example", + size = "large", + include = ["source/serve/doc_code/vllm_example.py"], + exclude = [], + extra_srcs = [], + tags = ["exclusive", "team:serve", "gpu"], +) + py_test( name = "pytorch_resnet_finetune", size = "large", diff --git a/doc/source/serve/doc_code/vllm_example.py b/doc/source/serve/doc_code/vllm_example.py new file mode 100644 index 0000000000000..bb88519acb026 --- /dev/null +++ b/doc/source/serve/doc_code/vllm_example.py @@ -0,0 +1,126 @@ +import json +from typing import AsyncGenerator + +from fastapi import BackgroundTasks +from starlette.requests import Request +from starlette.responses import StreamingResponse, Response +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid + +from ray import serve + + +@serve.deployment(ray_actor_options={"num_gpus": 1}) +class VLLMPredictDeployment: + def __init__(self, **kwargs): + """ + Construct a VLLM deployment. + + Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py + for the full list of arguments. + + Args: + model: name or path of the huggingface model to use + download_dir: directory to download and load the weights, + default to the default cache dir of huggingface. + use_np_weights: save a numpy copy of model weights for + faster loading. This can increase the disk usage by up to 2x. + use_dummy_weights: use dummy values for model weights. + dtype: data type for model weights and activations. + The "auto" option will use FP16 precision + for FP32 and FP16 models, and BF16 precision. + for BF16 models. + seed: random seed. + worker_use_ray: use Ray for distributed serving, will be + automatically set when using more than 1 GPU + pipeline_parallel_size: number of pipeline stages. + tensor_parallel_size: number of tensor parallel replicas. + block_size: token block size. + swap_space: CPU swap space size (GiB) per GPU. + gpu_memory_utilization: the percentage of GPU memory to be used for + the model executor + max_num_batched_tokens: maximum number of batched tokens per iteration + max_num_seqs: maximum number of sequences per iteration. + disable_log_stats: disable logging statistics. + engine_use_ray: use Ray to start the LLM engine in a separate + process as the server process. + disable_log_requests: disable logging requests. 
+ """ + args = AsyncEngineArgs(**kwargs) + self.engine = AsyncLLMEngine.from_engine_args(args) + + async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]: + num_returned = 0 + async for request_output in results_generator: + text_outputs = [output.text for output in request_output.outputs] + assert len(text_outputs) == 1 + text_output = text_outputs[0][num_returned:] + ret = {"text": text_output} + yield (json.dumps(ret) + "\n").encode("utf-8") + num_returned += len(text_output) + + async def may_abort_request(self, request_id) -> None: + await self.engine.abort(request_id) + + async def __call__(self, request: Request) -> Response: + """Generate completion for the request. + + The request should be a JSON object with the following fields: + - prompt: the prompt to use for the generation. + - stream: whether to stream the results or not. + - other fields: the sampling parameters (See `SamplingParams` for details). + """ + request_dict = await request.json() + prompt = request_dict.pop("prompt") + stream = request_dict.pop("stream", False) + sampling_params = SamplingParams(**request_dict) + request_id = random_uuid() + results_generator = self.engine.generate(prompt, sampling_params, request_id) + if stream: + background_tasks = BackgroundTasks() + # Using background_taks to abort the the request + # if the client disconnects. + background_tasks.add_task(self.may_abort_request, request_id) + return StreamingResponse( + self.stream_results(results_generator), background=background_tasks + ) + + # Non-streaming case + final_output = None + async for request_output in results_generator: + if await request.is_disconnected(): + # Abort the request if the client disconnects. + await self.engine.abort(request_id) + return Response(status_code=499) + final_output = request_output + + assert final_output is not None + prompt = final_output.prompt + text_outputs = [prompt + output.text for output in final_output.outputs] + ret = {"text": text_outputs} + return Response(content=json.dumps(ret)) + + +def send_sample_request(): + import requests + + prompt = "How do I cook fried rice?" + sample_input = {"prompt": prompt, "stream": True} + output = requests.post("http://localhost:8000/", json=sample_input) + for line in output.iter_lines(): + print(line.decode("utf-8")) + + +if __name__ == "__main__": + # To run this example, you need to install vllm which requires + # OS: Linux + # Python: 3.8 or higher + # CUDA: 11.0 – 11.8 + # GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, etc.) + # see https://vllm.readthedocs.io/en/latest/getting_started/installation.html + # for more details. + deployment = VLLMPredictDeployment.bind(model="facebook/opt-125m") + serve.run(deployment) + send_sample_request()