[Vision] Support Phi-3.5-vision, the first VLM in WebLLM (mlc-ai#563)
This PR supports the first Vision Language Model in WebLLM, Phi-3.5-vision. For a
full example, see `examples/vision-model`. Overall usage follows the OpenAI
API. We add `Phi-3.5-vision-instruct-q4f16_1-MLC` and
`Phi-3.5-vision-instruct-q4f32_1-MLC` to the prebuilt model list.
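A minimal sketch of the new usage, condensed from `examples/vision-model` below (the model id and request shape come from that example; the image URL here is only a placeholder):

```typescript
import * as webllm from "@mlc-ai/web-llm";

async function demo() {
  // Load one of the vision model ids added to the prebuilt list in this PR.
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    "Phi-3.5-vision-instruct-q4f16_1-MLC",
  );

  // Image input follows the OpenAI API: an `image_url` content part whose
  // `url` may be an http(s) URL or a base64 data URL.
  const reply = await engine.chat.completions.create({
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: "List the items in the image concisely." },
          {
            type: "image_url",
            image_url: { url: "https://example.com/street.jpg" }, // placeholder URL
          },
        ],
      },
    ],
  });
  console.log(reply.usage);
  console.log(await engine.getMessage());
}

demo();
```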
CharlieFRuan authored Sep 23, 2024
1 parent cf59d7a commit 9c0aec4
Showing 18 changed files with 1,690 additions and 271 deletions.
1 change: 1 addition & 0 deletions examples/README.md
@@ -36,6 +36,7 @@ These examples demonstrate various capabilities via WebLLM's OpenAI-like API.
- [json-schema](json-schema): besides guaranteeing output to be in JSON, ensure the output adheres to a specific JSON schema specified by the user
- [seed-to-reproduce](seed-to-reproduce): use seeding to ensure reproducible output with the `seed` field.
- [function-calling](function-calling) (WIP): function calling with fields `tools` and `tool_choice` (with preliminary support).
- [vision-model](vision-model): process requests with image input using a Vision Language Model (e.g. Phi-3.5-vision)

#### Chrome Extension

14 changes: 14 additions & 0 deletions examples/vision-model/README.md
@@ -0,0 +1,14 @@
# WebLLM Vision Model Example

This folder provides a minimal demo showing the WebLLM vision-model API in a web app setting.
To try it out, run the following steps in this folder:

```bash
npm install
npm start
```

Note: if you would like to hack on the WebLLM core package, you can change the `web-llm`
dependency to `"file:../.."` (see the snippet below) and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only recommended
if you intend to modify the WebLLM core package.
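For reference, the relevant dependency entry (shown in the full `package.json` below) points at the local checkout:

```json
"dependencies": {
  "@mlc-ai/web-llm": "file:../.."
}
```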
20 changes: 20 additions & 0 deletions examples/vision-model/package.json
@@ -0,0 +1,20 @@
{
"name": "get-started",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/vision_model.html --port 8888",
"build": "parcel build src/vision_model.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "file:../.."
}
}
32 changes: 32 additions & 0 deletions examples/vision-model/src/utils.ts
@@ -0,0 +1,32 @@
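// Loads the image at `url` into an offscreen canvas and returns its raw pixel
// data. Setting `crossOrigin = "anonymous"` keeps the canvas untainted when
// the image is served from another origin.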
export function getImageDataFromURL(url: string): Promise<ImageData> {
return new Promise((resolve, reject) => {
// Type `img` as `any` and cast to `CanvasImageSource` later; otherwise the TypeScript build complains
const img: any = new Image();
img.crossOrigin = "anonymous"; // Important for CORS
img.onload = () => {
const canvas: HTMLCanvasElement = document.createElement("canvas");
const ctx: CanvasRenderingContext2D = canvas.getContext("2d")!;
canvas.width = img.width;
canvas.height = img.height;
ctx.drawImage(img as CanvasImageSource, 0, 0);

const imageData = ctx.getImageData(0, 0, img.width, img.height);
resolve(imageData);
};
img.onerror = () => reject(new Error("Failed to load image"));
img.src = url;
});
}

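// Round-trips the fetched pixel data through a canvas and returns it as a
// base64-encoded data URL (PNG by default from `toDataURL()`).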
export async function imageURLToBase64(url: string): Promise<string> {
const imageData: ImageData = await getImageDataFromURL(url);
const canvas = document.createElement("canvas");
const ctx = canvas.getContext("2d");

canvas.width = imageData.width;
canvas.height = imageData.height;

ctx!.putImageData(imageData, 0, 0);

return canvas.toDataURL();
}
23 changes: 23 additions & 0 deletions examples/vision-model/src/vision_model.html
@@ -0,0 +1,23 @@
<!doctype html>
<html>
<script>
webLLMGlobal = {};
</script>
<body>
<h2>WebLLM Test Page</h2>
Open console to see output
<br />
<br />
<label id="init-label"> </label>

<h3>Prompt</h3>
<label id="prompt-label"> </label>

<h3>Response</h3>
<label id="generate-label"> </label>
<br />
<label id="stats-label"> </label>

<script type="module" src="./vision_model.ts"></script>
</body>
</html>
104 changes: 104 additions & 0 deletions examples/vision-model/src/vision_model.ts
@@ -0,0 +1,104 @@
import * as webllm from "@mlc-ai/web-llm";
import { imageURLToBase64 } from "./utils";

function setLabel(id: string, text: string) {
const label = document.getElementById(id);
if (label == null) {
throw Error("Cannot find label " + id);
}
label.innerText = text;
}

const proxyUrl = "https://cors-anywhere.herokuapp.com/";
const url_https_street = "https://www.ilankelman.org/stopsigns/australia.jpg";
const url_https_tree = "https://www.ilankelman.org/sunset.jpg";
const url_https_sea =
"https://www.islandvulnerability.org/index/silhouette.jpg";

async function main() {
// Requests can include images as either base64 data URLs or http(s) URLs
const url_base64_street = await imageURLToBase64(proxyUrl + url_https_street);

const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
const selectedModel = "Phi-3.5-vision-instruct-q4f16_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{
initProgressCallback: initProgressCallback,
logLevel: "INFO", // specify the log level
},
{
context_window_size: 6144,
},
);

// 1. Single image input (with choices)
const messages: webllm.ChatCompletionMessageParam[] = [
{
role: "user",
content: [
{ type: "text", text: "List the items in each image concisely." },
{
type: "image_url",
image_url: {
url: url_base64_street,
},
},
{
type: "image_url",
image_url: {
url: proxyUrl + url_https_sea,
},
},
],
},
];
const request0: webllm.ChatCompletionRequest = {
stream: false, // can be streaming, same behavior
messages: messages,
};
const reply0 = await engine.chat.completions.create(request0);
const replyMessage0 = await engine.getMessage();
console.log(reply0);
console.log(replyMessage0);
console.log(reply0.usage);

// 2. A follow up text-only question
messages.push({ role: "assistant", content: replyMessage0 });
messages.push({ role: "user", content: "What is special about each image?" });
const request1: webllm.ChatCompletionRequest = {
stream: false, // can be streaming, same behavior
messages: messages,
};
const reply1 = await engine.chat.completions.create(request1);
const replyMessage1 = await engine.getMessage();
console.log(reply1);
console.log(replyMessage1);
console.log(reply1.usage);

// 3. A follow up multi-image question
messages.push({ role: "assistant", content: replyMessage1 });
messages.push({
role: "user",
content: [
{ type: "text", text: "What about this image? Answer concisely." },
{
type: "image_url",
image_url: { url: proxyUrl + url_https_tree },
},
],
});
const request2: webllm.ChatCompletionRequest = {
stream: false, // can be streaming, same behavior
messages: messages,
};
const reply2 = await engine.chat.completions.create(request2);
const replyMessage2 = await engine.getMessage();
console.log(reply2);
console.log(replyMessage2);
console.log(reply2.usage);
}

main();
32 changes: 32 additions & 0 deletions src/config.ts
@@ -229,6 +229,7 @@ export function postInitAndCheckGenerationConfigValues(
export enum ModelType {
"LLM",
"embedding",
"VLM", // vision-language model
}

/**
@@ -512,6 +513,37 @@ export const prebuiltAppConfig: AppConfig = {
context_window_size: 1024,
},
},
// Phi-3.5-vision-instruct
{
model:
"https://huggingface.co/mlc-ai/Phi-3.5-vision-instruct-q4f16_1-MLC",
model_id: "Phi-3.5-vision-instruct-q4f16_1-MLC",
model_lib:
modelLibURLPrefix +
modelVersion +
"/Phi-3.5-vision-instruct-q4f16_1-ctx4k_cs2k-webgpu.wasm",
vram_required_MB: 3952.18,
low_resource_required: true,
overrides: {
context_window_size: 4096,
},
model_type: ModelType.VLM,
},
{
model:
"https://huggingface.co/mlc-ai/Phi-3.5-vision-instruct-q4f32_1-MLC",
model_id: "Phi-3.5-vision-instruct-q4f32_1-MLC",
model_lib:
modelLibURLPrefix +
modelVersion +
"/Phi-3.5-vision-instruct-q4f32_1-ctx4k_cs2k-webgpu.wasm",
vram_required_MB: 5879.84,
low_resource_required: true,
overrides: {
context_window_size: 4096,
},
model_type: ModelType.VLM,
},
// Mistral variants
{
model: