embeddings endpoint
l4b4r4b4b4 committed Aug 21, 2024
1 parent f4d7d80 commit 2739d4d
Showing 7 changed files with 100 additions and 16 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -1 +1,2 @@
Dockerfile
Dockerfile.cpu
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "vllm"]
path = vllm
url = https://github.com/vllm-project/vllm.git
63 changes: 63 additions & 0 deletions Dockerfile.cpu
@@ -0,0 +1,63 @@
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.

FROM ubuntu:22.04 AS cpu-test-1

RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp

ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"

RUN echo 'ulimit -c 0' >> ~/.bashrc

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=vllm/requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=vllm/requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=vllm/requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt

COPY ./vllm ./

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

WORKDIR /app/
RUN pip show vllm | grep Version | awk '{print $2}'

COPY requirements.cpu.txt /app/requirements.txt

RUN python3.10 -m pip install -r requirements.txt
# RUN pip show torch && sleep 60
COPY . /app

# CMD python3.10 server_vllm.py --model "hivata/functionary-small-v2.1-AWQ" --host 0.0.0.0 --trust-remote-code
ENTRYPOINT [ "python3.10", "server_vllm.py" ]
1 change: 0 additions & 1 deletion functionary/prompt_template/prompt_utils.py
@@ -3,7 +3,6 @@
from typing import Dict, List, Optional, Union
from PIL import Image
from io import BytesIO
import os
import base64
import requests
import torch
10 changes: 10 additions & 0 deletions requirements.cpu.txt
@@ -0,0 +1,10 @@
fastapi
uvicorn
pydantic
scipy
jsonref
requests
PyYAML
typer
protobuf
triton
37 changes: 22 additions & 15 deletions server_vllm.py
@@ -33,7 +33,19 @@
from functionary.openai_types import ChatCompletionRequest
from functionary.vllm_inference import process_chat_completion
import requests
import json

import torch

DEVICE = "auto"
# Check if CUDA is available
if torch.cuda.is_available():
print(f"CUDA is available! Number of devices: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f"Device {i}: {torch.cuda.get_device_name(i)}")
else:
print("CUDA is not available. Setting device to cpu")
DEVICE = "cpu"


TIMEOUT_KEEP_ALIVE = 5 # seconds

@@ -100,20 +112,24 @@ class EmbeddingData(BaseModel):
index: int
embedding: List[float]


class UsageData(BaseModel):
prompt_tokens: int
total_tokens: int


class EmbeddingResponse(BaseModel):
object: str = "list"
data: List[EmbeddingData]
model: str
usage: UsageData


class EmbeddingRequest(BaseModel):
input: Union[str, List[str]]
model: str


@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def get_embedding(request: EmbeddingRequest, authorization: str = Header(None)):
if not authorization or not authorization.startswith("Bearer "):
@@ -123,33 +139,23 @@ async def get_embedding(request: EmbeddingRequest, authorization: str = Header(N
inputs = [request.input] if isinstance(request.input, str) else request.input

url = "http://embeddings:8080/embed"
headers = {'Content-Type': 'application/json'}
data = {
"inputs": inputs
}
headers = {"Content-Type": "application/json"}
data = {"inputs": inputs}

response = requests.post(url, headers=headers, json=data)
embeddings = response.json()
print("Embeddings response: ", embeddings)

# Construct the response data
data = [
EmbeddingData(
object="embedding",
index=i,
embedding=embeddings[i]
)
EmbeddingData(object="embedding", index=i, embedding=embeddings[i])
for i in range(len(inputs))
]

response = EmbeddingResponse(
object="list",
data=data,
model=request.model,
usage=UsageData(
prompt_tokens=len(inputs) * 5,
total_tokens=len(inputs) * 5
)
usage=UsageData(prompt_tokens=len(inputs) * 5, total_tokens=len(inputs) * 5),
)

return response
@@ -198,6 +204,7 @@ async def get_embedding(request: EmbeddingRequest, authorization: str = Header(N
else:
from vllm.engine.async_llm_engine import AsyncLLMEngine

args.device = DEVICE
app.add_middleware(
CORSMiddleware,
allow_origins=args.allowed_origins,
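A hedged usage sketch for the new /v1/embeddings route. The host, port, token value, and model name are assumptions; what the diff does establish is that the handler requires an Authorization: Bearer header, forwards the inputs to the embedding service at http://embeddings:8080/embed, and wraps the result in an OpenAI-style list response with placeholder token counts:

curl -s http://localhost:8000/v1/embeddings \
    -H "Authorization: Bearer <token>" \
    -H "Content-Type: application/json" \
    -d '{"model": "text-embeddings-model", "input": ["hello world", "a second sentence"]}'
# Expected shape, per EmbeddingResponse above:
# {"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [...]}, ...],
#  "model": "text-embeddings-model", "usage": {"prompt_tokens": ..., "total_tokens": ...}}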
1 change: 1 addition & 0 deletions vllm
Submodule vllm added at 9587b0
