[Vision] Support Phi-3.5-vision, the first VLM in WebLLM (mlc-ai#563)
This PR supports the first Vision Language Model in WebLLM, Phi-3.5-vision. For a
full example, see `examples/vision-model`. Overall usage follows the OpenAI
API. We add `Phi-3.5-vision-instruct-q4f16_1-MLC` and
`Phi-3.5-vision-instruct-q4f32_1-MLC` to the prebuilt model list.
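A minimal sketch of the new usage, condensed from `examples/vision-model` below (the model id and request shape come from that example; the image URL here is only a placeholder):

```typescript
import * as webllm from "@mlc-ai/web-llm";

async function demo() {
  // Load one of the vision model ids added to the prebuilt list in this PR.
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    "Phi-3.5-vision-instruct-q4f16_1-MLC",
  );

  // Image input follows the OpenAI API: an `image_url` content part whose
  // `url` may be an http(s) URL or a base64 data URL.
  const reply = await engine.chat.completions.create({
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: "List the items in the image concisely." },
          {
            type: "image_url",
            image_url: { url: "https://example.com/street.jpg" }, // placeholder URL
          },
        ],
      },
    ],
  });
  console.log(reply.usage);
  console.log(await engine.getMessage());
}

demo();
```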
CharlieFRuan authored Sep 23, 2024
1 parent cf59d7a commit 9c0aec4
Showing 18 changed files with 1,690 additions and 271 deletions.
1 change: 1 addition & 0 deletions examples/README.md
@@ -36,6 +36,7 @@ These examples demonstrate various capabilities via WebLLM's OpenAI-like API.
- [json-schema](json-schema): besides guaranteeing output to be in JSON, ensure the output adheres to a specific JSON schema specified by the user
- [seed-to-reproduce](seed-to-reproduce): use seeding to ensure reproducible output with the `seed` field.
- [function-calling](function-calling) (WIP): function calling with fields `tools` and `tool_choice` (with preliminary support).
- [vision-model](vision-model): process requests with image input using a Vision Language Model (e.g. Phi-3.5-vision)

#### Chrome Extension

14 changes: 14 additions & 0 deletions examples/vision-model/README.md
@@ -0,0 +1,14 @@
# WebLLM Vision Model Example

This folder provides a minimal demo showing the WebLLM vision-model API in a web app setting.
To try it out, run the following steps in this folder:

```bash
npm install
npm start
```

Note: if you would like to hack on the WebLLM core package, you can change the `web-llm`
dependency to `"file:../.."` (see the snippet below) and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only recommended
if you intend to modify the WebLLM core package.
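For reference, the relevant dependency entry (shown in the full `package.json` below) points at the local checkout:

```json
"dependencies": {
  "@mlc-ai/web-llm": "file:../.."
}
```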
20 changes: 20 additions & 0 deletions examples/vision-model/package.json
@@ -0,0 +1,20 @@
{
"name": "get-started",
"version": "0.1.0",
"private": true,
"scripts": {
"start": "parcel src/vision_model.html --port 8888",
"build": "parcel build src/vision_model.html --dist-dir lib"
},
"devDependencies": {
"buffer": "^5.7.1",
"parcel": "^2.8.3",
"process": "^0.11.10",
"tslib": "^2.3.1",
"typescript": "^4.9.5",
"url": "^0.11.3"
},
"dependencies": {
"@mlc-ai/web-llm": "file:../.."
}
}
32 changes: 32 additions & 0 deletions examples/vision-model/src/utils.ts
@@ -0,0 +1,32 @@
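// Loads the image at `url` into an offscreen canvas and returns its raw pixel
// data. Setting `crossOrigin = "anonymous"` keeps the canvas untainted when
// the image is served from another origin.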
export function getImageDataFromURL(url: string): Promise<ImageData> {
return new Promise((resolve, reject) => {
// Type `img` as `any` and cast to `CanvasImageSource` later; otherwise the TypeScript build complains
const img: any = new Image();
img.crossOrigin = "anonymous"; // Important for CORS
img.onload = () => {
const canvas: HTMLCanvasElement = document.createElement("canvas");
const ctx: CanvasRenderingContext2D = canvas.getContext("2d")!;
canvas.width = img.width;
canvas.height = img.height;
ctx.drawImage(img as CanvasImageSource, 0, 0);

const imageData = ctx.getImageData(0, 0, img.width, img.height);
resolve(imageData);
};
img.onerror = () => reject(new Error("Failed to load image"));
img.src = url;
});
}

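// Round-trips the fetched pixel data through a canvas and returns it as a
// base64-encoded data URL (PNG by default from `toDataURL()`).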
export async function imageURLToBase64(url: string): Promise<string> {
const imageData: ImageData = await getImageDataFromURL(url);
const canvas = document.createElement("canvas");
const ctx = canvas.getContext("2d");

canvas.width = imageData.width;
canvas.height = imageData.height;

ctx!.putImageData(imageData, 0, 0);

return canvas.toDataURL();
}
23 changes: 23 additions & 0 deletions examples/vision-model/src/vision_model.html
@@ -0,0 +1,23 @@
<!doctype html>
<html>
<script>
webLLMGlobal = {};
</script>
<body>
<h2>WebLLM Test Page</h2>
Open console to see output
<br />
<br />
<label id="init-label"> </label>

<h3>Prompt</h3>
<label id="prompt-label"> </label>

<h3>Response</h3>
<label id="generate-label"> </label>
<br />
<label id="stats-label"> </label>

<script type="module" src="./vision_model.ts"></script>
</body>
</html>
104 changes: 104 additions & 0 deletions examples/vision-model/src/vision_model.ts
@@ -0,0 +1,104 @@
import * as webllm from "@mlc-ai/web-llm";
import { imageURLToBase64 } from "./utils";

function setLabel(id: string, text: string) {
const label = document.getElementById(id);
if (label == null) {
throw Error("Cannot find label " + id);
}
label.innerText = text;
}

const proxyUrl = "https://cors-anywhere.herokuapp.com/";
const url_https_street = "https://www.ilankelman.org/stopsigns/australia.jpg";
const url_https_tree = "https://www.ilankelman.org/sunset.jpg";
const url_https_sea =
"https://www.islandvulnerability.org/index/silhouette.jpg";

async function main() {
// Requests can include images as either base64 data URLs or http(s) URLs
const url_base64_street = await imageURLToBase64(proxyUrl + url_https_street);

const initProgressCallback = (report: webllm.InitProgressReport) => {
setLabel("init-label", report.text);
};
const selectedModel = "Phi-3.5-vision-instruct-q4f16_1-MLC";
const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
selectedModel,
{
initProgressCallback: initProgressCallback,
logLevel: "INFO", // specify the log level
},
{
context_window_size: 6144,
},
);

// 1. Single image input (with choices)
const messages: webllm.ChatCompletionMessageParam[] = [
{
role: "user",
content: [
{ type: "text", text: "List the items in each image concisely." },
{
type: "image_url",
image_url: {
url: url_base64_street,
},
},
{
type: "image_url",
image_url: {
url: proxyUrl + url_https_sea,
},
},
],
},
];
const request0: webllm.ChatCompletionRequest = {
stream: false, // can be streaming, same behavior
messages: messages,
};
const reply0 = await engine.chat.completions.create(request0);
const replyMessage0 = await engine.getMessage();
console.log(reply0);
console.log(replyMessage0);
console.log(reply0.usage);

// 2. A follow up text-only question
messages.push({ role: "assistant", content: replyMessage0 });
messages.push({ role: "user", content: "What is special about each image?" });
const request1: webllm.ChatCompletionRequest = {
stream: false, // can be streaming, same behavior
messages: messages,
};
const reply1 = await engine.chat.completions.create(request1);
const replyMessage1 = await engine.getMessage();
console.log(reply1);
console.log(replyMessage1);
console.log(reply1.usage);

// 3. A follow up multi-image question
messages.push({ role: "assistant", content: replyMessage1 });
messages.push({
role: "user",
content: [
{ type: "text", text: "What about this image? Answer concisely." },
{
type: "image_url",
image_url: { url: proxyUrl + url_https_tree },
},
],
});
const request2: webllm.ChatCompletionRequest = {
stream: false, // can be streaming, same behavior
messages: messages,
};
const reply2 = await engine.chat.completions.create(request2);
const replyMessage2 = await engine.getMessage();
console.log(reply2);
console.log(replyMessage2);
console.log(reply2.usage);
}

main();
32 changes: 32 additions & 0 deletions src/config.ts
@@ -229,6 +229,7 @@ export function postInitAndCheckGenerationConfigValues(
export enum ModelType {
"LLM",
"embedding",
"VLM", // vision-language model
}

/**
@@ -512,6 +513,37 @@ export const prebuiltAppConfig: AppConfig = {
context_window_size: 1024,
},
},
// Phi-3.5-vision-instruct
{
model:
"https://huggingface.co/mlc-ai/Phi-3.5-vision-instruct-q4f16_1-MLC",
model_id: "Phi-3.5-vision-instruct-q4f16_1-MLC",
model_lib:
modelLibURLPrefix +
modelVersion +
"/Phi-3.5-vision-instruct-q4f16_1-ctx4k_cs2k-webgpu.wasm",
vram_required_MB: 3952.18,
low_resource_required: true,
overrides: {
context_window_size: 4096,
},
model_type: ModelType.VLM,
},
{
model:
"https://huggingface.co/mlc-ai/Phi-3.5-vision-instruct-q4f32_1-MLC",
model_id: "Phi-3.5-vision-instruct-q4f32_1-MLC",
model_lib:
modelLibURLPrefix +
modelVersion +
"/Phi-3.5-vision-instruct-q4f32_1-ctx4k_cs2k-webgpu.wasm",
vram_required_MB: 5879.84,
low_resource_required: true,
overrides: {
context_window_size: 4096,
},
model_type: ModelType.VLM,
},
// Mistral variants
{
model: