Implement the Llama 3.2 vision models (EricLBuehler#796)
* Add the MLlama vision bits

* Restructure

* Typos

* Add skeleton for text model, add text mlp

* Add the self and cross attn text model parts

* Add mllama model

* Add most of the preprocessor

* Add the rest of the processor and wire things up

* Clean up a bit

* Add an example

* Rename

* Loads now

* Another batch of fixes

* Vision model forward runs

* Add back in the cache for cross attn

* Inputs processor gives correct values

* Fix the nans

* Problem seems to be in vision encoder

* Upcasting seems to do something

* Problems confirmed to ONLY be in text model

* Maybe remove some nans

* Seems to work now!!

* Confirmed working, remove the debuggers

* Preapply the tanh

* Rework the interactive mode

* A bugfix

* Another bugfix!

* Add device mapping support

* Add ISQ support for mllama

* Add ISQ support

* Add support for no images and multi images

* Fix dim

* Fix slice assign dim

* Add examples and docs

* Add a demo video

* Update VLLAMA.md
EricLBuehler authored Sep 29, 2024
1 parent 776c116 commit f33ac29
Showing 40 changed files with 3,745 additions and 360 deletions.
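
Several of the commit messages above refer to the gated cross-attention that lets the Llama 3.2 text decoder attend to vision features ("Add the self and cross attn text model parts", "Add back in the cache for cross attn", "Preapply the tanh"). Below is a minimal sketch of that gating idea only, using plain `f32` slices in place of tensors; the type and names are illustrative and are not the code added in this commit.

```rust
// Illustrative sketch of a tanh-gated cross-attention residual.
// In the real model the gate is a learned per-layer parameter and the
// operands are tensors; here plain slices stand in for clarity.
struct CrossAttnGate {
    gate: f32, // stored with tanh already applied ("preapplied")
}

impl CrossAttnGate {
    fn new(raw_gate: f32) -> Self {
        // Applying tanh once up front keeps the forward pass to a scaled residual add.
        Self { gate: raw_gate.tanh() }
    }

    fn forward(&self, hidden: &[f32], cross_attn_out: &[f32]) -> Vec<f32> {
        hidden
            .iter()
            .zip(cross_attn_out)
            .map(|(h, x)| h + self.gate * x)
            .collect()
    }
}

fn main() {
    // A raw gate of 0.0 (tanh(0) = 0) makes the cross-attention output a no-op.
    let gate = CrossAttnGate::new(0.0_f32);
    println!("{:?}", gate.forward(&[0.5, -1.0, 2.0], &[0.1, 0.2, -0.3]));
}
```

With the usual Flamingo-style zero initialization, tanh(0) = 0, so each cross-attention block starts out as a no-op and only contributes once the gate moves away from zero.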
15 changes: 7 additions & 8 deletions README.md
@@ -29,18 +29,20 @@ Please submit requests for new models [here](https://github.com/EricLBuehler/mis

*After following installation instructions*

- 🔥🧠 AnyMoE: Build a memory-efficient MoE model from anything, in seconds
- 🦙📷 Run the **Llama 3.2 Vision** Model: [documentation and guide here](docs/VLLAMA.md)

<img src="https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" alt="Mount Washington" width = "400" height = "267">
<h6><a href = "https://www.nhmagazine.com/mount-washington/">Credit</a></h6>

```
./mistralrs_server -i toml -f toml-selectors/anymoe_lora.toml
./mistralrs_server -i vision-plain -m lamm-mit/Cephalo-Llama-3.2-11B-Vision-Instruct-128k -a vllama
```
- 🦙 Run the Llama 3.1 model
- 🔥🧠 AnyMoE: Build a memory-efficient MoE model from anything, in seconds
```
./mistralrs_server -i plain -m meta-llama/Meta-Llama-3.1-8B-Instruct -a llama
./mistralrs_server -i toml -f toml-selectors/anymoe_lora.toml
```
- φ³ Run the new Phi 3.5/3.1/3 model with 128K context window
```
@@ -55,9 +57,6 @@
- φ³ 📷 Run the Phi 3 vision model: [documentation and guide here](docs/PHI3V.md)
<img src="https://www.nhmagazine.com/content/uploads/2019/05/mtwashingtonFranconia-2-19-18-108-Edit-Edit.jpg" alt="Mount Washington" width = "400" height = "267">
<h6><a href = "https://www.nhmagazine.com/mount-washington/">Credit</a></h6>
```
./mistralrs_server --port 1234 vision-plain -m microsoft/Phi-3.5-vision-instruct -a phi3v
```
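
The new README entry launches the Llama 3.2 Vision model interactively from the CLI. A Rust-API request would presumably follow the same builder pattern as the updated Idefics 2 and LLaVA examples below; in this sketch, the `VisionLoaderType::VLlama` variant, the `add_vllama_image_message` helper, and the local image path are assumptions inferred from the `-a vllama` flag and docs/VLLAMA.md, not verified signatures.

```rust
use anyhow::Result;
use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder};

#[tokio::main]
async fn main() -> Result<()> {
    // Assumed loader variant for `-a vllama`; mirrors the Idefics2/LLaVANext examples.
    let model = VisionModelBuilder::new(
        "lamm-mit/Cephalo-Llama-3.2-11B-Vision-Instruct-128k",
        VisionLoaderType::VLlama,
    )
    .with_isq(IsqType::Q4K)
    .with_logging()
    .build()
    .await?;

    // Any local image; this path is purely illustrative.
    let image = image::open("picture.jpg")?;

    // `add_vllama_image_message` is assumed by analogy with `add_idefics_image_message`.
    let messages = VisionMessages::new().add_vllama_image_message(
        TextMessageRole::User,
        "What is depicted here? Please describe the scene in detail.",
        image,
    );

    let response = model.send_chat_request(messages).await?;
    println!("{}", response.choices[0].message.content.as_ref().unwrap());
    Ok(())
}
```

The `.with_isq(IsqType::Q4K)` call mirrors the "Add ISQ support for mllama" bullet above; omit it to run the model unquantized.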
122 changes: 35 additions & 87 deletions docs/IDEFICS2.md
@@ -84,97 +84,45 @@ You can find this example [here](../mistralrs/examples/idefics2/main.rs).
This is a minimal example of running the Idefics 2 model with a dummy image.

The previous example, built on the lower-level request API (removed by this commit):

```rust
use either::Either;
use image::{ColorType, DynamicImage};
use indexmap::IndexMap;
use std::sync::Arc;
use tokio::sync::mpsc::channel;

use mistralrs::{
    Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, MistralRs, MistralRsBuilder,
    ModelDType, NormalRequest, Request, RequestMessage, Response, Result, SamplingParams,
    SchedulerConfig, TokenSource, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig,
};

/// Gets the best device, cpu, cuda if compiled with CUDA
pub(crate) fn best_device() -> Result<Device> {
    #[cfg(not(feature = "metal"))]
    {
        Device::cuda_if_available(0)
    }
    #[cfg(feature = "metal")]
    {
        Device::new_metal(0)
    }
}

fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select a Mistral model
    let loader = VisionLoaderBuilder::new(
        VisionSpecificConfig {
            use_flash_attn: false,
        },
        None,
        None,
        Some("HuggingFaceM4/idefics2-8b-chatty".to_string()),
    )
    .build(VisionLoaderType::Idefics2);
    // Load, into a Pipeline
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &best_device()?,
        false,
        DeviceMapMetadata::dummy(),
        None,
        None, // No PagedAttention.
    )?;
    // Create the MistralRs, which is a runner
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}

fn main() -> anyhow::Result<()> {
    let mistralrs = setup()?;

    let (tx, mut rx) = channel(10_000);
    let request = Request::Normal(NormalRequest {
        messages: RequestMessage::VisionChat {
            images: vec![DynamicImage::new(1280, 720, ColorType::Rgb8)],
            messages: vec![IndexMap::from([
                ("role".to_string(), Either::Left("user".to_string())),
                (
                    "content".to_string(),
                    Either::Left("What is shown in this image?".to_string()),
                ),
            ])],
        },
        sampling_params: SamplingParams::default(),
        response: tx,
        return_logprobs: false,
        is_streaming: false,
        id: 0,
        constraint: Constraint::None,
        suffix: None,
        adapters: None,
        tools: None,
        tool_choice: None,
    });
    mistralrs.get_sender()?.blocking_send(request)?;

    let response = rx.blocking_recv().unwrap();
    match response {
        Response::Done(c) => println!("Text: {}", c.choices[0].message.content),
        _ => unreachable!(),
    }
    Ok(())
}
```

The replacement example, built on the new `VisionModelBuilder` API (added by this commit):

```rust
use anyhow::Result;
use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder};

#[tokio::main]
async fn main() -> Result<()> {
    let model = VisionModelBuilder::new(
        "HuggingFaceM4/idefics2-8b-chatty",
        VisionLoaderType::Idefics2,
    )
    .with_isq(IsqType::Q4K)
    .with_logging()
    .build()
    .await?;

    let bytes = match reqwest::blocking::get(
        "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg",
    ) {
        Ok(http_resp) => http_resp.bytes()?.to_vec(),
        Err(e) => anyhow::bail!(e),
    };
    let image = image::load_from_memory(&bytes)?;

    let messages = VisionMessages::new().add_idefics_image_message(
        TextMessageRole::User,
        "What is depicted here? Please describe the scene in detail.",
        image,
    );

    let response = model.send_chat_request(messages).await?;

    println!("{}", response.choices[0].message.content.as_ref().unwrap());
    dbg!(
        response.usage.avg_prompt_tok_per_sec,
        response.usage.avg_compl_tok_per_sec
    );

    Ok(())
}
```

## Python
110 changes: 34 additions & 76 deletions docs/LLaVA.md
@@ -93,84 +93,42 @@ You can find this example [here](../mistralrs/examples/llava_next/main.rs).
This is a minimal example of running the LLaVA and LLaVANext models with a dummy image.

The previous example, built on the lower-level request API (removed by this commit):

```rust
use either::Either;
use image::{ColorType, DynamicImage};
use indexmap::IndexMap;
use std::sync::Arc;
use tokio::sync::mpsc::channel;

use mistralrs::{
    Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, MistralRs, MistralRsBuilder,
    ModelDType, NormalRequest, Request, RequestMessage, Response, SamplingParams, SchedulerConfig,
    TokenSource, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig,
};

fn setup() -> anyhow::Result<Arc<MistralRs>> {
    // Select a Mistral model
    let loader = VisionLoaderBuilder::new(
        VisionSpecificConfig {
            use_flash_attn: false,
        },
        None,
        None,
        Some("llava-hf/llava-v1.6-mistral-7b-hf".to_string()),
    )
    .build(VisionLoaderType::LLaVANext);
    // Load, into a Pipeline
    let pipeline = loader.load_model_from_hf(
        None,
        TokenSource::CacheToken,
        &ModelDType::Auto,
        &Device::cuda_if_available(0)?,
        false,
        DeviceMapMetadata::dummy(),
        None,
        None, // No PagedAttention.
    )?;
    // Create the MistralRs, which is a runner
    Ok(MistralRsBuilder::new(
        pipeline,
        SchedulerConfig::DefaultScheduler {
            method: DefaultSchedulerMethod::Fixed(5.try_into().unwrap()),
        },
    )
    .build())
}

fn main() -> anyhow::Result<()> {
    let mistralrs = setup()?;

    let (tx, mut rx) = channel(10_000);
    let request = Request::Normal(NormalRequest {
        messages: RequestMessage::VisionChat {
            images: vec![DynamicImage::new(1280, 720, ColorType::Rgb8)],
            messages: vec![IndexMap::from([
                ("role".to_string(), Either::Left("user".to_string())),
                (
                    "content".to_string(),
                    Either::Left("<image>What is shown in this image?".to_string()),
                ),
            ])],
        },
        sampling_params: SamplingParams::default(),
        response: tx,
        return_logprobs: false,
        is_streaming: false,
        id: 0,
        constraint: Constraint::None,
        suffix: None,
        adapters: None,
        tools: None,
        tool_choice: None,
    });
    mistralrs.get_sender()?.blocking_send(request)?;

    let response = rx.blocking_recv().unwrap();
    match response {
        Response::Done(c) => println!("Text: {}", c.choices[0].message.content),
        _ => unreachable!(),
    }
    Ok(())
}
```

The replacement example, built on the new `VisionModelBuilder` API (added by this commit):

```rust
use anyhow::Result;
use mistralrs::{IsqType, TextMessageRole, VisionLoaderType, VisionMessages, VisionModelBuilder};

#[tokio::main]
async fn main() -> Result<()> {
    let model = VisionModelBuilder::new(
        "llava-hf/llava-v1.6-mistral-7b-hf",
        VisionLoaderType::LLaVANext,
    )
    .with_isq(IsqType::Q4K)
    .with_logging()
    .build()
    .await?;

    let bytes = match reqwest::blocking::get(
        "https://d2r55xnwy6nx47.cloudfront.net/uploads/2018/02/Ants_Lede1300.jpg",
    ) {
        Ok(http_resp) => http_resp.bytes()?.to_vec(),
        Err(e) => anyhow::bail!(e),
    };
    let image = image::load_from_memory(&bytes)?;

    let messages = VisionMessages::new().add_llava_image_message(
        TextMessageRole::User,
        "What is depicted here? Please describe the scene in detail.",
        image,
    );

    let response = model.send_chat_request(messages).await?;

    println!("{}", response.choices[0].message.content.as_ref().unwrap());
    dbg!(
        response.usage.avg_prompt_tok_per_sec,
        response.usage.avg_compl_tok_per_sec
    );

    Ok(())
}
```
