embeddings endpoint
l4b4r4b4b4 committed Aug 21, 2024
1 parent f4d7d80 commit 2739d4d
Showing 7 changed files with 100 additions and 16 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -1 +1,2 @@
Dockerfile
Dockerfile.cpu
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "vllm"]
path = vllm
url = https://github.com/vllm-project/vllm.git
63 changes: 63 additions & 0 deletions Dockerfile.cpu
@@ -0,0 +1,63 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.

FROM ubuntu:22.04 AS cpu-test-1

RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp

ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"

RUN echo 'ulimit -c 0' >> ~/.bashrc

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=vllm/requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=vllm/requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=vllm/requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt

COPY ./vllm ./

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

WORKDIR /app/
RUN pip show vllm | grep Version | awk '{print $2}'

COPY requirements.cpu.txt /app/requirements.txt

RUN python3.10 -m pip install -r requirements.txt
# RUN pip show torch && sleep 60
COPY . /app

# CMD python3.10 server_vllm.py --model "hivata/functionary-small-v2.1-AWQ" --host 0.0.0.0 --trust-remote-code
ENTRYPOINT [ "python3.10", "server_vllm.py" ]
1 change: 0 additions & 1 deletion functionary/prompt_template/prompt_utils.py
@@ -3,7 +3,6 @@
from typing import Dict, List, Optional, Union
from PIL import Image
from io import BytesIO
import os
import base64
import requests
import torch
10 changes: 10 additions & 0 deletions requirements.cpu.txt
@@ -0,0 +1,10 @@
fastapi
uvicorn
pydantic
scipy
jsonref
requests
PyYAML
typer
protobuf
triton
37 changes: 22 additions & 15 deletions server_vllm.py
@@ -33,7 +33,19 @@
from functionary.openai_types import ChatCompletionRequest
from functionary.vllm_inference import process_chat_completion
import requests
import json

import torch

DEVICE = "auto"
# Check if CUDA is available
if torch.cuda.is_available():
print(f"CUDA is available! Number of devices: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f"Device {i}: {torch.cuda.get_device_name(i)}")
else:
print("CUDA is not available. Setting device to cpu")
DEVICE = "cpu"


TIMEOUT_KEEP_ALIVE = 5 # seconds

@@ -100,20 +112,24 @@ class EmbeddingData(BaseModel):
index: int
embedding: List[float]


class UsageData(BaseModel):
prompt_tokens: int
total_tokens: int


class EmbeddingResponse(BaseModel):
object: str = "list"
data: List[EmbeddingData]
model: str
usage: UsageData


class EmbeddingRequest(BaseModel):
input: Union[str, List[str]]
model: str


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def get_embedding(request: EmbeddingRequest, authorization: str = Header(None)):
if not authorization or not authorization.startswith("Bearer "):
@@ -123,33 +139,23 @@ async def get_embedding(request: EmbeddingRequest, authorization: str = Header(N
inputs = [request.input] if isinstance(request.input, str) else request.input

url = "http://embeddings:8080/embed"
headers = {'Content-Type': 'application/json'}
data = {
"inputs": inputs
}
headers = {"Content-Type": "application/json"}
data = {"inputs": inputs}

response = requests.post(url, headers=headers, json=data)
embeddings = response.json()
print("Embeddings response: ", embeddings)

# Construct the response data
data = [
EmbeddingData(
object="embedding",
index=i,
embedding=embeddings[i]
)
EmbeddingData(object="embedding", index=i, embedding=embeddings[i])
for i in range(len(inputs))
]

response = EmbeddingResponse(
object="list",
data=data,
model=request.model,
usage=UsageData(
prompt_tokens=len(inputs) * 5,
total_tokens=len(inputs) * 5
)
usage=UsageData(prompt_tokens=len(inputs) * 5, total_tokens=len(inputs) * 5),
)

return response
@@ -198,6 +204,7 @@ async def get_embedding(request: EmbeddingRequest, authorization: str = Header(N
else:
from vllm.engine.async_llm_engine import AsyncLLMEngine

args.device = DEVICE
app.add_middleware(
CORSMiddleware,
allow_origins=args.allowed_origins,
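A hedged usage sketch for the new /v1/embeddings route. The host, port, token value, and model name are assumptions; what the diff does establish is that the handler requires an Authorization: Bearer header, forwards the inputs to the embedding service at http://embeddings:8080/embed, and wraps the result in an OpenAI-style list response with placeholder token counts:

curl -s http://localhost:8000/v1/embeddings \
    -H "Authorization: Bearer <token>" \
    -H "Content-Type: application/json" \
    -d '{"model": "text-embeddings-model", "input": ["hello world", "a second sentence"]}'
# Expected shape, per EmbeddingResponse above:
# {"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [...]}, ...],
#  "model": "text-embeddings-model", "usage": {"prompt_tokens": ..., "total_tokens": ...}}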
1 change: 1 addition & 0 deletions vllm
Submodule vllm added at 9587b0
