Skip to content

Commit

Permalink
update to b3750, implementing new llama_perf api
Browse files Browse the repository at this point in the history
  • Loading branch information
AsbjornOlling committed Nov 26, 2024
1 parent c530eda commit b888e98
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 1 deletion.
12 changes: 12 additions & 0 deletions llama-cpp-2/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::slice;

use crate::llama_batch::LlamaBatch;
use crate::model::{LlamaLoraAdapter, LlamaModel};
use crate::timing::LlamaTimings;
use crate::token::data::LlamaTokenData;
use crate::token::LlamaToken;
use crate::{
Expand Down Expand Up @@ -263,6 +264,17 @@ impl<'model> LlamaContext<'model> {
unsafe { slice::from_raw_parts(data, len) }
}

/// Reset the timings for the context.
///
/// Clears llama.cpp's accumulated performance counters for this context via
/// `llama_perf_context_reset`, so a following [`Self::timings`] call reports
/// only work done after this point.
pub fn reset_timings(&mut self) {
// SAFETY: `self.context` is the context pointer held by this wrapper and is
// assumed valid for the lifetime of `self`; we hold a unique borrow while
// the counters are mutated.
unsafe { llama_cpp_sys_2::llama_perf_context_reset(self.context.as_ptr()) }
}

/// Returns the timings for the context.
///
/// Takes a snapshot of llama.cpp's per-context performance counters via
/// `llama_perf_context` and wraps them in [`LlamaTimings`]. Reading the
/// counters does not modify the context (the C API takes a `const` context
/// pointer), so only a shared borrow is required.
#[must_use]
pub fn timings(&self) -> LlamaTimings {
// SAFETY: `self.context` is the context pointer held by this wrapper and is
// assumed valid for the duration of this borrow; the FFI call only reads.
let timings = unsafe { llama_cpp_sys_2::llama_perf_context(self.context.as_ptr()) };
LlamaTimings { timings }
}

/// Sets a lora adapter.
///
/// # Errors
Expand Down
1 change: 1 addition & 0 deletions llama-cpp-2/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ pub mod context;
pub mod llama_backend;
pub mod llama_batch;
pub mod model;
pub mod timing;
pub mod token;
pub mod token_type;

Expand Down
130 changes: 130 additions & 0 deletions llama-cpp-2/src/timing.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
//! Safe wrapper around llama.cpp's performance-timing data (`llama_perf_context_data`).
use std::fmt::{Debug, Display, Formatter};

/// A safe wrapper around llama.cpp's `llama_perf_context_data` performance counters.
#[derive(Clone, Copy, Debug)]
pub struct LlamaTimings {
// Raw counter struct as returned by llama.cpp's perf API.
pub(crate) timings: llama_cpp_sys_2::llama_perf_context_data,
}

impl LlamaTimings {
    /// Build a `LlamaTimings` from raw counter values.
    ///
    /// Handy for tests and for synthesizing timing reports by hand.
    /// ```
    /// # use llama_cpp_2::timing::LlamaTimings;
    /// let timings = LlamaTimings::new(1.0, 2.0, 3.0, 4.0, 5, 6);
    /// let timings_str = "load time = 2.00 ms
    /// prompt eval time = 3.00 ms / 5 tokens (0.60 ms per token, 1666.67 tokens per second)
    /// eval time = 4.00 ms / 6 runs (0.67 ms per token, 1500.00 tokens per second)\n";
    /// assert_eq!(timings_str, format!("{}", timings));
    /// ```
    #[allow(clippy::too_many_arguments)]
    #[must_use]
    pub fn new(
        t_start_ms: f64,
        t_load_ms: f64,
        t_p_eval_ms: f64,
        t_eval_ms: f64,
        n_p_eval: i32,
        n_eval: i32,
    ) -> Self {
        let timings = llama_cpp_sys_2::llama_perf_context_data {
            t_start_ms,
            t_load_ms,
            t_p_eval_ms,
            t_eval_ms,
            n_p_eval,
            n_eval,
        };
        Self { timings }
    }

    /// Start time, in milliseconds.
    #[must_use]
    pub fn t_start_ms(&self) -> f64 {
        self.timings.t_start_ms
    }

    /// Model load time, in milliseconds.
    #[must_use]
    pub fn t_load_ms(&self) -> f64 {
        self.timings.t_load_ms
    }

    /// Prompt-evaluation time, in milliseconds.
    #[must_use]
    pub fn t_p_eval_ms(&self) -> f64 {
        self.timings.t_p_eval_ms
    }

    /// Evaluation time, in milliseconds.
    #[must_use]
    pub fn t_eval_ms(&self) -> f64 {
        self.timings.t_eval_ms
    }

    /// Number of prompt tokens evaluated.
    #[must_use]
    pub fn n_p_eval(&self) -> i32 {
        self.timings.n_p_eval
    }

    /// Number of evaluation runs.
    #[must_use]
    pub fn n_eval(&self) -> i32 {
        self.timings.n_eval
    }

    /// Overwrite the start time, in milliseconds.
    pub fn set_t_start_ms(&mut self, t_start_ms: f64) {
        self.timings.t_start_ms = t_start_ms;
    }

    /// Overwrite the load time, in milliseconds.
    pub fn set_t_load_ms(&mut self, t_load_ms: f64) {
        self.timings.t_load_ms = t_load_ms;
    }

    /// Overwrite the prompt-evaluation time, in milliseconds.
    pub fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) {
        self.timings.t_p_eval_ms = t_p_eval_ms;
    }

    /// Overwrite the evaluation time, in milliseconds.
    pub fn set_t_eval_ms(&mut self, t_eval_ms: f64) {
        self.timings.t_eval_ms = t_eval_ms;
    }

    /// Overwrite the number of prompt tokens evaluated.
    pub fn set_n_p_eval(&mut self, n_p_eval: i32) {
        self.timings.n_p_eval = n_p_eval;
    }

    /// Overwrite the number of evaluation runs.
    pub fn set_n_eval(&mut self, n_eval: i32) {
        self.timings.n_eval = n_eval;
    }
}

impl Display for LlamaTimings {
    /// Renders the timings in the same human-readable layout as llama.cpp's
    /// own perf printout: load time, then prompt-eval, then eval statistics.
    ///
    /// Note: per-token figures are plain divisions, so a zero token/run count
    /// prints inf/NaN — this mirrors the raw arithmetic of the original.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let p_ms = self.t_p_eval_ms();
        let p_n = self.n_p_eval();
        let e_ms = self.t_eval_ms();
        let e_n = self.n_eval();
        writeln!(f, "load time = {:.2} ms", self.t_load_ms())?;
        writeln!(
            f,
            "prompt eval time = {:.2} ms / {} tokens ({:.2} ms per token, {:.2} tokens per second)",
            p_ms,
            p_n,
            p_ms / f64::from(p_n),
            1e3 / p_ms * f64::from(p_n)
        )?;
        writeln!(
            f,
            "eval time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)",
            e_ms,
            e_n,
            e_ms / f64::from(e_n),
            1e3 / e_ms * f64::from(e_n)
        )
    }
}
2 changes: 1 addition & 1 deletion llama-cpp-sys-2/llama.cpp

0 comments on commit b888e98

Please sign in to comment.