Implement the Llama 3.2 vision models (EricLBuehler#796)
* Add the MLlama vision bits

* Restructure

* Typos

* Add skeleton for text model, add text mlp

* Add the self and cross attn text model parts

* Add mllama model

* Add most of the preprocessor

* Add the rest of the processor and wire things up

* Clean up a bit

* Add an example

* Rename

* Loads now

* Another batch of fixes

* Vision model forward runs

* Add back in the cache for cross attn

* Inputs processor gives correct values

* Fix the nans

* Problem seems to be in vision encoder

* Upcasting seems to do something

* Problems confirmed to ONLY be in text model

* Maybe remove some nans

* Seems to work now!!

* Confirmed working, remove the debuggers

* Preapply the tanh

* Rework the interactive mode

* A bugfix

* Another bugfix!

* Add device mapping support

* Add ISQ support for mllama

* Add ISQ support

* Add support for no images and multi images

* Fix dim

* Fix slice assign dim

* Add examples and docs

* Add a demo video

* Update VLLAMA.md
EricLBuehler authored Sep 29, 2024
1 parent 776c116 commit f33ac29
Showing 40 changed files with 3,745 additions and 360 deletions.
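
Several of the commit messages above refer to the gated cross-attention that lets the Llama 3.2 text decoder attend to vision features ("Add the self and cross attn text model parts", "Add back in the cache for cross attn", "Preapply the tanh"). Below is a minimal sketch of that gating idea only, using plain `f32` slices in place of tensors; the type and names are illustrative and are not the code added in this commit.

```rust
// Illustrative sketch of a tanh-gated cross-attention residual.
// In the real model the gate is a learned per-layer parameter and the
// operands are tensors; here plain slices stand in for clarity.
struct CrossAttnGate {
    gate: f32, // stored with tanh already applied ("preapplied")
}

impl CrossAttnGate {
    fn new(raw_gate: f32) -> Self {
        // Applying tanh once up front keeps the forward pass to a scaled residual add.
        Self { gate: raw_gate.tanh() }
    }

    fn forward(&self, hidden: &[f32], cross_attn_out: &[f32]) -> Vec<f32> {
        hidden
            .iter()
            .zip(cross_attn_out)
            .map(|(h, x)| h + self.gate * x)
            .collect()
    }
}

fn main() {
    // A raw gate of 0.0 (tanh(0) = 0) makes the cross-attention output a no-op.
    let gate = CrossAttnGate::new(0.0_f32);
    println!("{:?}", gate.forward(&[0.5, -1.0, 2.0], &[0.1, 0.2, -0.3]));
}
```

With the usual Flamingo-style zero initialization, tanh(0) = 0, so each cross-attention block starts out as a no-op and only contributes once the gate moves away from zero.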
15 changes: 7 additions & 8 deletions README.md
@@ -29,18 +29,20 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis

*After following installation instructions*

- 🔥🧠 AnyMoE: Build a memory-efficient MoE model from anything, in seconds
- 🦙📷 Run the **Llama 3.2 Vision** Model: [documentation and guide here](docs/VLLAMA.md)

<img src="https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" alt="Mount Washington" width = "400" height = "267">
<h6><a href = "https://www.nhmagazine.com/mount-washington/">Credit</a></h6>

```
./mistralrs_server -i toml -f toml-selectors/anymoe_lora.toml
./mistralrs_server -i vision-plain -m lamm-mit/Cephalo-Llama-3.2-11B-Vision-Instruct-128k -a vllama
```
- 🦙 Run the Llama 3.1 model
- 🔥🧠 AnyMoE: Build a memory-efficient MoE model from anything, in seconds
```
./mistralrs_server -i plain -m meta-llama/Meta-Llama-3.1-8B-Instruct -a llama
./mistralrs_server -i toml -f toml-selectors/anymoe_lora.toml
```
- φ³ Run the new Phi 3.5/3.1/3 model with 128K context window
```
@@ -55,9 +57,6 @@
- φ³ 📷 Run the Phi 3 vision model: [documentation and guide here](docs/PHI3V.md)
<img src="https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" alt="Mount Washington" width = "400" height = "267">
<h6><a href = "https://www.nhmagazine.com/mount-washington/">Credit</a></h6>
```
./mistralrs_server --port 1234 vision-plain -m microsoft/Phi-3.5-vision-instruct -a phi3v
```
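
The new README entry launches the Llama 3.2 Vision model interactively from the CLI. A Rust-API request would presumably follow the same builder pattern as the updated Idefics 2 and LLaVA examples below; in this sketch, the `VisionLoaderType::VLlama` variant, the `add_vllama_image_message` helper, and the local image path are assumptions inferred from the `-a vllama` flag and docs/VLLAMA.md, not verified signatures.

```rust
use anyhow::Result;
use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder};

#[tokio::main]
async fn main() -> Result<()> {
    // Assumed loader variant for `-a vllama`; mirrors the Idefics2/LLaVANext examples.
    let model = VisionModelBuilder::new(
        "lamm-mit/Cephalo-Llama-3.2-11B-Vision-Instruct-128k",
        VisionLoaderType::VLlama,
    )
    .with_isq(IsqType::Q4K)
    .with_logging()
    .build()
    .await?;

    // Any local image; this path is purely illustrative.
    let image = image::open("picture.jpg")?;

    // `add_vllama_image_message` is assumed by analogy with `add_idefics_image_message`.
    let messages = VisionMessages::new().add_vllama_image_message(
        TextMessageRole::User,
        "What is depicted here? Please describe the scene in detail.",
        image,
    );

    let response = model.send_chat_request(messages).await?;
    println!("{}", response.choices[0].message.content.as_ref().unwrap());
    Ok(())
}
```

The `.with_isq(IsqType::Q4K)` call mirrors the "Add ISQ support for mllama" bullet above; omit it to run the model unquantized.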
122 changes: 35 additions & 87 deletions docs/IDEFICS2.md
@@ -84,97 +84,45 @@ You can find this example [here](../mistralrs/examples/idefics2/main.rs).
This is a minimal example of running the Idefics 2 model with a dummy image.

The previous example, built on the lower-level request API (removed by this commit):

```rust
use either::Either;
use image::{ColorType, DynamicImage};
use indexmap::IndexMap;
use std::sync::Arc;
use tokio::sync::mpsc::channel;

use mistralrs::{
    Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, MistralRs, MistralRsBuilder,
    ModelDType, NormalRequest, Request, RequestMessage, Response, Result, SamplingParams,
    SchedulerConfig, TokenSource, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig,
};

/// Gets the best device, cpu, cuda if compiled with CUDA
pub(crate) fn best_device() -> Result<Device> {
    #[cfg(not(feature = "metal"))]
    {
        Device::cuda_if_available(0)
    }
    #[cfg(feature = "metal")]
    {
        Device::new_metal(0)
    }
}

fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select a Mistral model
    let loader = VisionLoaderBuilder::new(
        VisionSpecificConfig {
            use_flash_attn: false,
        },
        None,
        None,
        Some("HuggingFaceM4/idefics2-8b-chatty".to_string()),
    )
    .build(VisionLoaderType::Idefics2);
    // Load, into a Pipeline
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &best_device()?,
        false,
        DeviceMapMetadata::dummy(),
        None,
        None, // No PagedAttention.
    )?;
    // Create the MistralRs, which is a runner
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}

fn main() -> anyhow::Result<()> {
    let mistralrs = setup()?;

    let (tx, mut rx) = channel(10_000);
    let request = Request::Normal(NormalRequest {
        messages: RequestMessage::VisionChat {
            images: vec![DynamicImage::new(1280, 720, ColorType::Rgb8)],
            messages: vec![IndexMap::from([
                ("role".to_string(), Either::Left("user".to_string())),
                (
                    "content".to_string(),
                    Either::Left("What is shown in this image?".to_string()),
                ),
            ])],
        },
        sampling_params: SamplingParams::default(),
        response: tx,
        return_logprobs: false,
        is_streaming: false,
        id: 0,
        constraint: Constraint::None,
        suffix: None,
        adapters: None,
        tools: None,
        tool_choice: None,
    });
    mistralrs.get_sender()?.blocking_send(request)?;

    let response = rx.blocking_recv().unwrap();
    match response {
        Response::Done(c) => println!("Text: {}", c.choices[0].message.content),
        _ => unreachable!(),
    }
    Ok(())
}
```

The replacement example, built on the new `VisionModelBuilder` API (added by this commit):

```rust
use anyhow::Result;
use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder};

#[tokio::main]
async fn main() -> Result<()> {
    let model = VisionModelBuilder::new(
        "HuggingFaceM4/idefics2-8b-chatty",
        VisionLoaderType::Idefics2,
    )
    .with_isq(IsqType::Q4K)
    .with_logging()
    .build()
    .await?;

    let bytes = match reqwest::blocking::get(
        "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg",
    ) {
        Ok(http_resp) => http_resp.bytes()?.to_vec(),
        Err(e) => anyhow::bail!(e),
    };
    let image = image::load_from_memory(&bytes)?;

    let messages = VisionMessages::new().add_idefics_image_message(
        TextMessageRole::User,
        "What is depicted here? Please describe the scene in detail.",
        image,
    );

    let response = model.send_chat_request(messages).await?;

    println!("{}", response.choices[0].message.content.as_ref().unwrap());
    dbg!(
        response.usage.avg_prompt_tok_per_sec,
        response.usage.avg_compl_tok_per_sec
    );

    Ok(())
}
```

## Python
110 changes: 34 additions & 76 deletions docs/LLaVA.md
@@ -93,84 +93,42 @@ You can find this example [here](../mistralrs/examples/llava_next/main.rs).
This is a minimal example of running the LLaVA and LLaVANext models with a dummy image.

The previous example, built on the lower-level request API (removed by this commit):

```rust
use either::Either;
use image::{ColorType, DynamicImage};
use indexmap::IndexMap;
use std::sync::Arc;
use tokio::sync::mpsc::channel;

use mistralrs::{
    Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, MistralRs, MistralRsBuilder,
    ModelDType, NormalRequest, Request, RequestMessage, Response, SamplingParams, SchedulerConfig,
    TokenSource, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig,
};

fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select a Mistral model
    let loader = VisionLoaderBuilder::new(
        VisionSpecificConfig {
            use_flash_attn: false,
        },
        None,
        None,
        Some("llava-hf/llava-v1.6-mistral-7b-hf".to_string()),
    )
    .build(VisionLoaderType::LLaVANext);
    // Load, into a Pipeline
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &Device::cuda_if_available(0)?,
        false,
        DeviceMapMetadata::dummy(),
        None,
        None, // No PagedAttention.
    )?;
    // Create the MistralRs, which is a runner
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}

fn main() -> anyhow::Result<()> {
    let mistralrs = setup()?;

    let (tx, mut rx) = channel(10_000);
    let request = Request::Normal(NormalRequest {
        messages: RequestMessage::VisionChat {
            images: vec![DynamicImage::new(1280, 720, ColorType::Rgb8)],
            messages: vec![IndexMap::from([
                ("role".to_string(), Either::Left("user".to_string())),
                (
                    "content".to_string(),
                    Either::Left("<image>What is shown in this image?".to_string()),
                ),
            ])],
        },
        sampling_params: SamplingParams::default(),
        response: tx,
        return_logprobs: false,
        is_streaming: false,
        id: 0,
        constraint: Constraint::None,
        suffix: None,
        adapters: None,
        tools: None,
        tool_choice: None,
    });
    mistralrs.get_sender()?.blocking_send(request)?;

    let response = rx.blocking_recv().unwrap();
    match response {
        Response::Done(c) => println!("Text: {}", c.choices[0].message.content),
        _ => unreachable!(),
    }
    Ok(())
}
```

The replacement example, built on the new `VisionModelBuilder` API (added by this commit):

```rust
use anyhow::Result;
use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder};

#[tokio::main]
async fn main() -> Result<()> {
    let model = VisionModelBuilder::new(
        "llava-hf/llava-v1.6-mistral-7b-hf",
        VisionLoaderType::LLaVANext,
    )
    .with_isq(IsqType::Q4K)
    .with_logging()
    .build()
    .await?;

    let bytes = match reqwest::blocking::get(
        "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg",
    ) {
        Ok(http_resp) => http_resp.bytes()?.to_vec(),
        Err(e) => anyhow::bail!(e),
    };
    let image = image::load_from_memory(&bytes)?;

    let messages = VisionMessages::new().add_llava_image_message(
        TextMessageRole::User,
        "What is depicted here? Please describe the scene in detail.",
        image,
    );

    let response = model.send_chat_request(messages).await?;

    println!("{}", response.choices[0].message.content.as_ref().unwrap());
    dbg!(
        response.usage.avg_prompt_tok_per_sec,
        response.usage.avg_compl_tok_per_sec
    );

    Ok(())
}
```
