[Ray Serve] Add 2 examples of serving models on HPU (ray-project#42820)
This PR adds 2 examples of using Ray Serve with HPUs: one serves llama2-7b on a single HPU, and the other serves llama2-70b on 8 HPUs using DeepSpeed.

In addition, it enables the "hccl" backend in air/util/torch_dist.py, which can be used to initialize the torch distributed process group for workers, for example DeepSpeed workers.
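For reference, initializing a torch distributed process group with the "hccl" backend typically looks like the following minimal sketch. This is illustrative and not part of this diff; the `habana_frameworks` import that registers the backend is an assumption about the Habana PyTorch bridge installed on each worker:

```python
import os

import torch.distributed as dist


def init_hccl_process_group(rank: int, world_size: int) -> None:
    # Importing this module registers the "hccl" backend with torch.distributed
    # (assumes the Habana PyTorch bridge is installed on the worker).
    import habana_frameworks.torch.distributed.hccl  # noqa: F401

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="hccl", rank=rank, world_size=world_size)
```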

---------

Signed-off-by: Zhi Lin <[email protected]>
Co-authored-by: shrekris-anyscale <[email protected]>
Co-authored-by: angelinalg <[email protected]>
3 people authored Feb 20, 2024
1 parent 2b92f57 commit 136bffe
Showing 5 changed files with 323 additions and 1 deletion.
8 changes: 7 additions & 1 deletion doc/BUILD
@@ -122,6 +122,9 @@ py_test_run_all_subdirectory(
     include = ["source/serve/doc_code/**/*.py"],
     exclude = [
         "source/serve/doc_code/aws_neuron_core_inference_serve.py",
+        "source/serve/doc_code/hpu_inference_serve.py",
+        "source/serve/doc_code/hpu_inference_serve_deepspeed.py",
+        "source/serve/doc_code/hpu_inference_client.py",
         "source/serve/doc_code/distilbert.py",
         "source/serve/doc_code/stable_diffusion.py",
         "source/serve/doc_code/object_detection.py",
@@ -153,7 +156,10 @@ py_test_run_all_subdirectory(
         "source/serve/doc_code/object_detection.py",
     ],
     exclude = [
-        "source/serve/doc_code/aws_neuron_core_inference_serve.py"
+        "source/serve/doc_code/aws_neuron_core_inference_serve.py",
+        "source/serve/doc_code/hpu_inference_serve.py",
+        "source/serve/doc_code/hpu_inference_serve_deepspeed.py",
+        "source/serve/doc_code/hpu_inference_client.py",
     ],
     extra_srcs = [],
     tags = ["exclusive", "team:serve", "gpu"],
23 changes: 23 additions & 0 deletions doc/source/serve/doc_code/hpu_inference_client.py
@@ -0,0 +1,23 @@
# __main_code_start__
import requests

# Prompt for the model
prompt = "Once upon a time,"

# Add generation config here
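# For example (illustrative values; the deployment forwards this dict as
# keyword arguments to the model's `generate` method):
# config = {"max_new_tokens": 64, "do_sample": True, "temperature": 0.7}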
config = {}

# Non-streaming response
sample_input = {"text": prompt, "config": config, "stream": False}
outputs = requests.post("http://127.0.0.1:8000/", json=sample_input, stream=False)
print(outputs.text, flush=True)

# Streaming response
sample_input["stream"] = True
outputs = requests.post("http://127.0.0.1:8000/", json=sample_input, stream=True)
outputs.raise_for_status()
for output in outputs.iter_content(chunk_size=None, decode_unicode=True):
    print(output, end="", flush=True)
print()

# __main_code_end__
136 changes: 136 additions & 0 deletions doc/source/serve/doc_code/hpu_inference_serve.py
@@ -0,0 +1,136 @@
# __model_def_start__
import asyncio
from functools import partial
from queue import Empty
from typing import Dict, Any

from starlette.requests import Request
from starlette.responses import StreamingResponse
import torch

from ray import serve


# Define the Ray Serve deployment
@serve.deployment(ray_actor_options={"num_cpus": 10, "resources": {"HPU": 1}})
class LlamaModel:
    def __init__(self, model_id_or_path: str):
        from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
        from optimum.habana.transformers.modeling_utils import (
            adapt_transformers_to_gaudi,
        )

        # Tweak transformers to optimize performance
        adapt_transformers_to_gaudi()

        self.device = torch.device("hpu")

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id_or_path, use_fast=False, use_auth_token=""
        )
        hf_config = AutoConfig.from_pretrained(
            model_id_or_path,
            torchscript=True,
            use_auth_token="",
            trust_remote_code=False,
        )
        # Load the model in Gaudi
        model = AutoModelForCausalLM.from_pretrained(
            model_id_or_path,
            config=hf_config,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            use_auth_token="",
        )
        model = model.eval().to(self.device)

        from habana_frameworks.torch.hpu import wrap_in_hpu_graph

        # Enable hpu graph runtime
        self.model = wrap_in_hpu_graph(model)

        # Set pad token, etc.
        self.tokenizer.pad_token_id = self.model.generation_config.pad_token_id
        self.tokenizer.padding_side = "left"

        # Use async loop in streaming
        self.loop = asyncio.get_running_loop()

    def tokenize(self, prompt: str):
        """Tokenize the input and move to HPU."""

        input_tokens = self.tokenizer(prompt, return_tensors="pt", padding=True)
        return input_tokens.input_ids.to(device=self.device)

    def generate(self, prompt: str, **config: Dict[str, Any]):
        """Take a prompt and generate a response."""

        input_ids = self.tokenize(prompt)
        gen_tokens = self.model.generate(input_ids, **config)
        return self.tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]

    async def consume_streamer_async(self, streamer):
        """Consume the streamer asynchronously."""

        while True:
            try:
                for token in streamer:
                    yield token
                break
            except Empty:
                await asyncio.sleep(0.001)

    def streaming_generate(self, prompt: str, streamer, **config: Dict[str, Any]):
        """Generate a streamed response given an input."""

        input_ids = self.tokenize(prompt)
        self.model.generate(input_ids, streamer=streamer, **config)

    async def __call__(self, http_request: Request):
        """Handle HTTP requests."""

        # Load fields from the request
        json_request: str = await http_request.json()
        text = json_request["text"]
        # Config used in generation
        config = json_request.get("config", {})
        streaming_response = json_request["stream"]

        # Prepare prompts
        prompts = []
        if isinstance(text, list):
            prompts.extend(text)
        else:
            prompts.append(text)

        # Process config
        config.setdefault("max_new_tokens", 128)

        # Enable HPU graph runtime
        config["hpu_graphs"] = True
        # Lazy mode should be True when using HPU graphs
        config["lazy_mode"] = True

        # Non-streaming case
        if not streaming_response:
            return self.generate(prompts, **config)

        # Streaming case
        from transformers import TextIteratorStreamer

        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, timeout=0, skip_special_tokens=True
        )
        # Convert the streamer into a generator
        self.loop.run_in_executor(
            None, partial(self.streaming_generate, prompts, streamer, **config)
        )
        return StreamingResponse(
            self.consume_streamer_async(streamer),
            status_code=200,
            media_type="text/plain",
        )


# Replace the model ID with a local path if necessary
entrypoint = LlamaModel.bind("meta-llama/Llama-2-7b-chat-hf")
# __model_def_end__
1 change: 1 addition & 0 deletions doc/source/serve/tutorials/index.md
@@ -13,6 +13,7 @@ stable-diffusion
 text-classification
 object-detection
 aws-neuron-core-inference
+intel-habana-gaudi-inference
 gradio-integration
 batch
 streaming
156 changes: 156 additions & 0 deletions doc/source/serve/tutorials/intel-habana-gaudi-inference.md
@@ -0,0 +1,156 @@
# Serve a model on Intel Habana Gaudi

[Habana Gaudi AI Processors (HPUs)](https://habana.ai) are AI hardware accelerators designed by Habana Labs. For more information, see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/index.html) and [Gaudi Developer Docs](https://developer.habana.ai/).

This tutorial shows how to serve a large language model, [Llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), on a single HPU. It covers:

* How to load a model on an HPU

* How to perform generation on an HPU

* How to enable HPU graph optimizations

## Environment setup

We recommend using a prebuilt container to run these examples. To run a container, you need Docker. See [Install Docker Engine](https://docs.docker.com/engine/install/) for installation instructions.

Next, follow [Run Using Containers](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html?highlight=installer#run-using-containers) to install the Habana drivers and container runtime. To verify your installation, start a shell and run `hl-smi`. It should print status information about the HPUs on the machine:

```text
+-----------------------------------------------------------------------------+
| HL-SMI Version: hl-1.14.0-fw-48.0.1.0 |
| Driver Version: 1.15.0-c43dc7b |
|-------------------------------+----------------------+----------------------+
| AIP Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | AIP-Util Compute M. |
|===============================+======================+======================|
| 0 HL-225 N/A | 0000:09:00.0 N/A | 0 |
| N/A 26C N/A 87W / 600W | 768MiB / 98304MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 1 HL-225 N/A | 0000:08:00.0 N/A | 0 |
| N/A 28C N/A 99W / 600W | 768MiB / 98304MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 2 HL-225 N/A | 0000:0a:00.0 N/A | 0 |
| N/A 24C N/A 98W / 600W | 768MiB / 98304MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 3 HL-225 N/A | 0000:0c:00.0 N/A | 0 |
| N/A 27C N/A 87W / 600W | 768MiB / 98304MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 4 HL-225 N/A | 0000:0b:00.0 N/A | 0 |
| N/A 25C N/A 112W / 600W | 768MiB / 98304MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 5 HL-225 N/A | 0000:0d:00.0 N/A | 0 |
| N/A 26C N/A 111W / 600W | 26835MiB / 98304MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 6 HL-225 N/A | 0000:0f:00.0 N/A | 0 |
| N/A 24C N/A 93W / 600W | 768MiB / 98304MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 7 HL-225 N/A | 0000:0e:00.0 N/A | 0 |
| N/A 25C N/A 86W / 600W | 768MiB / 98304MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| Compute Processes: AIP Memory |
| AIP PID Type Process name Usage |
|=============================================================================|
| 0 N/A N/A N/A N/A |
| 1 N/A N/A N/A N/A |
| 2 N/A N/A N/A N/A |
| 3 N/A N/A N/A N/A |
| 4 N/A N/A N/A N/A |
| 5 N/A N/A N/A N/A |
| 6 N/A N/A N/A N/A |
| 7 N/A N/A N/A N/A |
+=============================================================================+
```

Next, start the Habana container:
```bash
docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
```

To follow the examples in this tutorial, mount the directory containing the examples and models into the container, for example by adding a `-v <host_path>:<container_path>` bind mount to the `docker run` command above. Then, inside the container, install Ray and Optimum Habana:
```bash
pip install ray[tune,serve]
pip install git+https://github.com/huggingface/optimum-habana.git
```

Start Ray in the container with `ray start --head`. You are now ready to run the examples.
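Optionally, verify that PyTorch can see the HPUs from inside the container. The following is a minimal sketch; it assumes the Habana PyTorch bridge in the container exposes the `habana_frameworks.torch.hpu` module:

```python
# Quick HPU sanity check (run inside the container).
import habana_frameworks.torch.hpu as hthpu

print("HPU available:", hthpu.is_available())
print("HPU device count:", hthpu.device_count())
```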

## Running a model on a single HPU

This example shows how to deploy a Llama2-7b model on an HPU for inference.

First, define a deployment that serves a Llama2-7b model using an HPU. Note that we enable [HPU graph optimizations](https://docs.habana.ai/en/latest/Gaudi_Overview/SynapseAI_Software_Suite.html?highlight=graph#graph-compiler-and-runtime) for better performance.

```{literalinclude} ../doc_code/hpu_inference_serve.py
:language: python
:start-after: __model_def_start__
:end-before: __model_def_end__
```

Copy the code above and save it as `hpu_inference_serve.py`. Start the deployment like this:

```bash
serve run hpu_inference_serve:entrypoint
```
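Alternatively, you can start the deployment from Python with `serve.run`. This is a minimal sketch; it assumes the file above is importable as the `hpu_inference_serve` module:

```python
from ray import serve

# The bound deployment defined in hpu_inference_serve.py
from hpu_inference_serve import entrypoint

# Deploys the application and returns once the replica is running.
serve.run(entrypoint)
```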

The terminal should print logs as the deployment starts up:

```text
2024-02-01 05:38:34,021 INFO scripts.py:438 -- Running import path: 'hpu_inference_serve:entrypoint'.
2024-02-01 05:38:36,112 INFO worker.py:1540 -- Connecting to existing Ray cluster at address: 10.111.128.177:6379...
2024-02-01 05:38:36,124 INFO worker.py:1715 -- Connected to Ray cluster. View the dashboard at 127.0.0.1:8265
(ProxyActor pid=17179) INFO 2024-02-01 05:38:39,573 proxy 10.111.128.177 proxy.py:1141 - Proxy actor b0c697edb66f42a46f802f4603000000 starting on node 7776cd4634f69216c8354355018195b290314ad24fd9565404a2ed12.
(ProxyActor pid=17179) INFO 2024-02-01 05:38:39,580 proxy 10.111.128.177 proxy.py:1346 - Starting HTTP server on node: 7776cd4634f69216c8354355018195b290314ad24fd9565404a2ed12 listening on port 8000
(ProxyActor pid=17179) INFO: Started server process [17179]
(ServeController pid=17084) INFO 2024-02-01 05:38:39,677 controller 17084 deployment_state.py:1545 - Deploying new version of deployment LlamaModel in application 'default'. Setting initial target number of replicas to 1.
(ServeController pid=17084) INFO 2024-02-01 05:38:39,780 controller 17084 deployment_state.py:1829 - Adding 1 replica to deployment LlamaModel in application 'default'.
(ServeReplica:default:LlamaModel pid=17272) [WARNING|utils.py:198] 2024-02-01 05:38:48,700 >> optimum-habana v1.11.0.dev0 has been validated for SynapseAI v1.14.0 but the driver version is v1.15.0, this could lead to undefined behavior!
(ServeReplica:default:LlamaModel pid=17272) /usr/local/lib/python3.10/dist-packages/transformers/models/auto/tokenization_auto.py:655: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.
(ServeReplica:default:LlamaModel pid=17272) warnings.warn(
(ServeReplica:default:LlamaModel pid=17272) /usr/local/lib/python3.10/dist-packages/transformers/models/auto/configuration_auto.py:1020: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.
(ServeReplica:default:LlamaModel pid=17272) warnings.warn(
(ServeReplica:default:LlamaModel pid=17272) /usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py:472: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.
(ServeReplica:default:LlamaModel pid=17272) warnings.warn(
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 50%|█████ | 1/2 [00:17<00:17, 17.90s/it]
(ServeController pid=17084) WARNING 2024-02-01 05:39:09,835 controller 17084 deployment_state.py:2171 - Deployment 'LlamaModel' in application 'default' has 1 replicas that have taken more than 30s to initialize. This may be caused by a slow __init__ or reconfigure method.
Loading checkpoint shards: 100%|██████████| 2/2 [00:24<00:00, 12.36s/it]
(ServeReplica:default:LlamaModel pid=17272) /usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
(ServeReplica:default:LlamaModel pid=17272) warnings.warn(
(ServeReplica:default:LlamaModel pid=17272) /usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:367: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
(ServeReplica:default:LlamaModel pid=17272) warnings.warn(
(ServeReplica:default:LlamaModel pid=17272) ============================= HABANA PT BRIDGE CONFIGURATION ===========================
(ServeReplica:default:LlamaModel pid=17272) PT_HPU_LAZY_MODE = 1
(ServeReplica:default:LlamaModel pid=17272) PT_RECIPE_CACHE_PATH =
(ServeReplica:default:LlamaModel pid=17272) PT_CACHE_FOLDER_DELETE = 0
(ServeReplica:default:LlamaModel pid=17272) PT_HPU_RECIPE_CACHE_CONFIG =
(ServeReplica:default:LlamaModel pid=17272) PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
(ServeReplica:default:LlamaModel pid=17272) PT_HPU_LAZY_ACC_PAR_MODE = 1
(ServeReplica:default:LlamaModel pid=17272) PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
(ServeReplica:default:LlamaModel pid=17272) ---------------------------: System Configuration :---------------------------
(ServeReplica:default:LlamaModel pid=17272) Num CPU Cores : 156
(ServeReplica:default:LlamaModel pid=17272) CPU RAM : 495094196 KB
(ServeReplica:default:LlamaModel pid=17272) ------------------------------------------------------------------------------
2024-02-01 05:39:25,873 SUCC scripts.py:483 -- Deployed Serve app successfully.
```

In another shell, run the following client script to send requests to the deployment and generate text.

```{literalinclude} ../doc_code/hpu_inference_client.py
:language: python
:start-after: __main_code_start__
:end-before: __main_code_end__
```

Here is example output. The first passage is the non-streaming response; the second is the streamed response, which omits the prompt because the streamer skips it:
```text
Once upon a time, in a far-off land, there was a magical kingdom called "Happily Ever Laughter." It was a place where laughter was the key to unlocking all the joys of life, and where everyone lived in perfect harmony.
In this kingdom, there was a beautiful princess named Lily. She was kind, gentle, and had a heart full of laughter. Every day, she would wake up with a smile on her face, ready to face whatever adventures the day might bring.
One day, a wicked sorcerer cast a spell on the kingdom, causing all
in a far-off land, there was a magical kingdom called "Happily Ever Laughter." It was a place where laughter was the key to unlocking all the joys of life, and where everyone lived in perfect harmony.
In this kingdom, there was a beautiful princess named Lily. She was kind, gentle, and had a heart full of laughter. Every day, she would wake up with a smile on her face, ready to face whatever adventures the day might bring.
One day, a wicked sorcerer cast a spell on the kingdom, causing all
```
