tet commit

YaoJiayi · May 28, 2024 · 2000655 · 2000655
1 parent 67883dd
commit 2000655
Show file tree

Hide file tree

Showing 384 changed files with 63,224 additions and 0 deletions.
diff --git a/et --soft HEAD~1d b/et --soft HEAD~1d
@@ -0,0 +1,23 @@
+[33mcommit e50b714436f4c1512416dea693d4dd85a209ecda[m[33m ([m[1;36mHEAD -> [m[1;32mmain[m[33m, [m[1;31morigin/main[m[33m, [m[1;31morigin/HEAD[m[33m)[m
+Author: YaoJiayi <[email protected]>
+Date:   Tue May 28 11:20:34 2024 -0500
+
+    test commit
+
+[33mcommit 67883dd49e43a7bbdd427aba40acf409cbfa1f22[m
+Author: Jiayi Yao <[email protected]>
+Date:   Sun May 26 00:12:54 2024 -0500
+
+    Update README.md
+
+[33mcommit 99d43ec3c98766f149201b8e74a2604db072c9d5[m
+Author: Jiayi Yao <[email protected]>
+Date:   Sun May 26 00:12:28 2024 -0500
+
+    Update README.md
+
+[33mcommit 4bda3004445b16bbd12cbfac9ab8c577d1ca90b8[m
+Author: Jiayi Yao <[email protected]>
+Date:   Sun May 26 00:08:53 2024 -0500
+
+    Initial commit
diff --git a/examples/fuse_gen.py b/examples/fuse_gen.py
diff --git a/examples/normal_gen.py b/examples/normal_gen.py
diff --git a/examples/preprocess.py b/examples/preprocess.py
diff --git a/inputs/1.json b/inputs/1.json
diff --git a/vllm_fuse/.buildkite/run-benchmarks.sh b/vllm_fuse/.buildkite/run-benchmarks.sh
@@ -0,0 +1,69 @@
+# This script is run by buildkite to run the benchmarks and upload the results to buildkite
+
+set -ex
+set -o pipefail
+
+# cd into parent directory of this file
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+
+# run python-based benchmarks and upload the result to buildkite
+python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+bench_latency_exit_code=$?
+
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+bench_throughput_exit_code=$?
+
+# run server-based benchmarks and upload the result to buildkite
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+server_pid=$!
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+# wait for server to start, timeout after 600 seconds
+timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+python3 benchmarks/benchmark_serving.py \
+    --backend openai \
+    --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer meta-llama/Llama-2-7b-chat-hf \
+    --save-result \
+    2>&1 | tee benchmark_serving.txt
+bench_serving_exit_code=$?
+kill $server_pid
+
+# write the results into a markdown file
+echo "### Latency Benchmarks" >> benchmark_results.md
+sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line
+echo "" >> benchmark_results.md
+sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line
+
+echo "### Throughput Benchmarks" >> benchmark_results.md
+sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line
+echo "" >> benchmark_results.md
+sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
+
+echo "### Serving Benchmarks" >> benchmark_results.md
+sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
+echo "" >> benchmark_results.md
+tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
+
+# upload the results to buildkite
+/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+
+# exit with the exit code of the benchmarks
+if [ $bench_latency_exit_code -ne 0 ]; then
+    exit $bench_latency_exit_code
+fi
+
+if [ $bench_throughput_exit_code -ne 0 ]; then
+    exit $bench_throughput_exit_code
+fi
+
+if [ $bench_serving_exit_code -ne 0 ]; then
+    exit $bench_serving_exit_code
+fi
+
+/workspace/buildkite-agent artifact upload openai-*.json
diff --git a/vllm_fuse/.buildkite/test-pipeline.yaml b/vllm_fuse/.buildkite/test-pipeline.yaml
@@ -0,0 +1,75 @@
+# In this file, you can add more tests to run, either by adding a new step or
+# by adding a new command to an existing step. See the steps below for examples of the options.
+# This file is fed into the Jinja template `test-template.j2` to generate
+# the final pipeline YAML file.
+
+steps:
+- label: Regression Test
+  command: pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: AsyncEngine Test
+  command: pytest -v -s async_engine
+
+- label: Basic Correctness Test
+  command: pytest -v -s --forked basic_correctness
+
+- label: Core Test
+  command: pytest -v -s core
+
+- label: Distributed Comm Ops Test
+  command: pytest -v -s --forked test_comm_ops.py
+  working_dir: "/vllm-workspace/tests/distributed"
+  num_gpus: 2 # only support 1 or 2 for now.
+
+- label: Distributed Correctness Test
+  command: pytest -v -s --forked test_basic_distributed_correctness.py
+  working_dir: "/vllm-workspace/tests/distributed"
+  num_gpus: 2 # only support 1 or 2 for now.
+
+- label: Engine Test
+  command: pytest -v -s engine test_sequence.py
+
+- label: Entrypoints Test
+  command: pytest -v -s entrypoints
+
+- label: Kernels Test
+  command: pytest -v -s kernels
+  soft_fail: true
+
+- label: Models Test
+  commands:
+    - pytest -v -s models --forked
+  soft_fail: true
+
+- label: Prefix Caching Test
+  commands:
+    - pytest -v -s prefix_caching
+
+- label: Samplers Test
+  command: pytest -v -s samplers --forked
+
+- label: Worker Test
+  command: pytest -v -s worker
+
+- label: Speculative decoding tests
+  command: pytest -v -s spec_decode
+
+- label: LoRA Test
+  command: pytest -v -s lora --forked
+
+- label: Metrics Test
+  command: pytest -v -s metrics
+
+- label: Benchmarks
+  working_dir: "/vllm-workspace/.buildkite"
+  commands:
+  - pip install aiohttp
+  - bash run-benchmarks.sh
+
+- label: Documentation Build
+  working_dir: "/vllm-workspace/docs"
+  no_gpu: True
+  commands:
+  - pip install -r requirements-docs.txt
+  - SPHINXOPTS=\"-W\" make html
diff --git a/vllm_fuse/.buildkite/test-template.j2 b/vllm_fuse/.buildkite/test-template.j2
@@ -0,0 +1,56 @@
+{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}{# Pipeline template: one image build/push step, then one Kubernetes step per entry in `steps`. The image tag embeds $BUILDKITE_COMMIT. #}
+{% set default_num_gpu = 1 %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+
+steps:
+  - label: ":docker: build image"
+    commands:
+      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker push {{ docker_image }}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+  - wait
+
+  {% for step in steps %}{# Each step may set: label, command or commands, working_dir, num_gpus, no_gpu, soft_fail. #}
+  - label: "{{ step.label }}"
+    agents:
+      queue: kubernetes
+    soft_fail: {{ step.soft_fail or false }}{# false unless the step sets soft_fail #}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+    plugins:
+      - kubernetes:
+          podSpec:
+            volumes:
+              - name: dshm
+                emptyDir:
+                  medium: Memory
+            containers:
+              - image: "{{ docker_image }}"
+                command: ["bash"]
+                args:
+                - '-c'
+                - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"{# cd into the step's working_dir (or the default), then run step.command; if it is unset, run step.commands chained with && #}
+                {% if not step.no_gpu %}{# steps that set no_gpu get no GPU request/limit #}
+                resources:
+                  requests:
+                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
+                  limits:
+                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
+                {% endif %}
+                env:
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                volumeMounts:
+                  - mountPath: /dev/shm
+                    name: dshm
+  {% endfor %}
diff --git a/vllm_fuse/.dockerignore b/vllm_fuse/.dockerignore
@@ -0,0 +1 @@
+vllm/*.so
diff --git a/vllm_fuse/.github/workflows/publish.yml b/vllm_fuse/.github/workflows/publish.yml
@@ -0,0 +1,102 @@
+# This workflow creates a release for a pushed version tag, then builds the Python wheels and uploads them as release assets.
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Create Release
+
+on:
+  push:
+    tags:
+      - v*
+
+# Needed to create release and upload assets
+permissions:
+  contents: write
+
+jobs:
+  release:
+    # Retrieve tag and create release
+    name: Create Release
+    runs-on: ubuntu-latest
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Extract branch info
+        shell: bash
+        run: |
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+
+      - name: Create Release
+        id: create_release
+        uses: "actions/github-script@v6"
+        env:
+          RELEASE_TAG: ${{ env.release_tag }}
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          script: |
+            const script = require('.github/workflows/scripts/create_release.js')
+            await script(github, context, core)
+
+  wheel:
+    name: Build Wheel
+    runs-on: ${{ matrix.os }}
+    needs: release
+
+    strategy:
+      fail-fast: false
+      matrix:
+          os: ['ubuntu-20.04']
+          python-version: ['3.8', '3.9', '3.10', '3.11']
+          pytorch-version: ['2.1.2']  # Must be the most recent version that meets requirements.txt.
+          cuda-version: ['11.8', '12.1']
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Linux Env
+        if: ${{ runner.os == 'Linux' }}
+        run: |
+          bash -x .github/workflows/scripts/env.sh
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+            python-version: ${{ matrix.python-version }}
+
+      - name: Install CUDA ${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
+
+      - name: Build wheel
+        shell: bash
+        run: |
+          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+          asset_name=${wheel_name//"linux"/"manylinux1"}
+          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          echo "asset_name=${asset_name}" >> $GITHUB_ENV
+
+      - name: Upload Release Asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.release.outputs.upload_url }}
+          asset_path: ./dist/${{ env.wheel_name }}
+          asset_name: ${{ env.asset_name }}
+          asset_content_type: application/*
+
+      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
+      # - name: Publish package
+      #   uses: pypa/gh-action-pypi-publish@release/v1.8
+      #   with:
+      #     repository-url: https://test.pypi.org/legacy/
+      #     password: ${{ secrets.PYPI_API_TOKEN }}
+      #     skip-existing: true
diff --git a/vllm_fuse/.github/workflows/ruff.yml b/vllm_fuse/.github/workflows/ruff.yml
@@ -0,0 +1,34 @@
+name: ruff
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
+    - name: Analysing the code with ruff
+      run: |
+        ruff vllm tests
+    - name: Spelling check with codespell
+      run: |
+         codespell --toml pyproject.toml
diff --git a/vllm_fuse/.github/workflows/scripts/build.sh b/vllm_fuse/.github/workflows/scripts/build.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+python_executable=python$1
+cuda_home=/usr/local/cuda-$2
+
+# Update paths
+PATH=${cuda_home}/bin:$PATH
+LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
+
+# Install requirements
+$python_executable -m pip install wheel packaging
+$python_executable -m pip install -r requirements.txt
+
+# Limit the number of parallel jobs to avoid OOM
+export MAX_JOBS=1
+# Make sure punica is built for the release (for LoRA)
+export VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Build
+$python_executable setup.py bdist_wheel --dist-dir=dist