forked from ollama/ollama
llm(llama): pass rope factors (ollama#5924)
Showing 1 changed file with 71 additions and 0 deletions.
@@ -0,0 +1,71 @@
From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
From: Michael Yang <[email protected]>
Date: Tue, 23 Jul 2024 14:33:29 -0700
Subject: [PATCH] llama 3.1 rope scaling

---
src/llama.cpp | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8fe51971..a9969df8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2472,6 +2472,7 @@ struct llama_layer {
// long rope factors
struct ggml_tensor * rope_long = nullptr;
struct ggml_tensor * rope_short = nullptr;
+ struct ggml_tensor * rope_freqs = nullptr;

// bitnet scale
struct ggml_tensor * wq_scale;
@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(

layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
if (n_expert == 0) {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -8620,6 +8623,10 @@ struct llm_build_context {
// choose long/short freq factors based on the context size
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;

+ if (model.layers[il].rope_freqs != nullptr) {
+     return model.layers[il].rope_freqs;
+ }
+
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
return model.layers[il].rope_long;
}
@@ -8814,6 +8821,9 @@ struct llm_build_context {

// self-attention
{
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
@@ -8837,14 +8847,14 @@
}

Qcur = ggml_rope_ext(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);

Kcur = ggml_rope_ext(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
--
2.45.2
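
For context on what is being passed: the freq-factors tensor handed to ggml_rope_ext acts as a per-dimension divisor on the RoPE rotation angle, so each rotary pair's theta is divided by the matching entry and its wavelength is stretched. The sketch below is a standalone illustration of that assumed effect, not the ggml kernel; rope_angles and its parameters are hypothetical names, and it uses the usual theta_d = pos * freq_base^(-2d/n_rot) schedule with an optional divisor per pair.

// Standalone sketch of how per-dimension frequency factors are assumed to
// alter RoPE angles; names here are illustrative, not llama.cpp internals.
#include <cmath>
#include <cstdio>
#include <vector>

// Compute the rotation angle of every rotary pair for one position.
// 'factors' plays the role of the rope_freqs tensor: one divisor per pair.
static std::vector<float> rope_angles(int pos, int n_rot, float freq_base,
                                      const std::vector<float> * factors) {
    std::vector<float> angles(n_rot / 2);
    for (int d = 0; d < n_rot / 2; ++d) {
        // base RoPE schedule: theta_d = pos * freq_base^(-2d/n_rot)
        float theta = (float) pos * std::pow(freq_base, -2.0f * d / n_rot);
        if (factors != nullptr) {
            theta /= (*factors)[d];   // assumed effect of the factors tensor
        }
        angles[d] = theta;
    }
    return angles;
}

int main() {
    const int   n_rot     = 128;        // head dimension of a Llama-style model
    const float freq_base = 500000.0f;

    std::vector<float> factors(n_rot / 2, 8.0f);   // toy: stretch every pair 8x
    auto plain  = rope_angles(42, n_rot, freq_base, nullptr);
    auto scaled = rope_angles(42, n_rot, freq_base, &factors);
    std::printf("pair 0 angle: %.3f -> %.3f\n", plain[0], scaled[0]);
}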
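The patch only loads and forwards the factors; producing them is typically done when the model is converted to GGUF. As a hedged sketch of where such a tensor could come from, the routine below mirrors the published Llama 3.1 rope_scaling recipe (factor 8, low/high frequency factors 1 and 4, original context 8192); the constants, the function name, and its parameters are illustrative assumptions, not values taken from this change.

// Hypothetical conversion-time sketch: deriving a Llama-3.1-style rope_freqs
// tensor with one divisor per rotary pair, i.e. n_embd/n_head/2 entries.
#include <cmath>
#include <vector>

static std::vector<float> llama31_rope_factors(int n_rot, float freq_base) {
    const float scale_factor     = 8.0f;     // rope_scaling.factor (Llama 3.1 default)
    const float low_freq_factor  = 1.0f;     // rope_scaling.low_freq_factor
    const float high_freq_factor = 4.0f;     // rope_scaling.high_freq_factor
    const float old_ctx          = 8192.0f;  // original_max_position_embeddings
    const float pi               = 3.14159265358979f;

    const float low_freq_wavelen  = old_ctx / low_freq_factor;
    const float high_freq_wavelen = old_ctx / high_freq_factor;

    std::vector<float> factors(n_rot / 2);
    for (int d = 0; d < n_rot / 2; ++d) {
        const float freq    = std::pow(freq_base, -2.0f * d / n_rot);  // inverse frequency
        const float wavelen = 2.0f * pi / freq;

        if (wavelen < high_freq_wavelen) {
            factors[d] = 1.0f;          // fast-rotating dims are left untouched
        } else if (wavelen > low_freq_wavelen) {
            factors[d] = scale_factor;  // slow-rotating dims are stretched by the full factor
        } else {
            // smooth ramp between the two regimes
            const float smooth = (old_ctx / wavelen - low_freq_factor) /
                                 (high_freq_factor - low_freq_factor);
            factors[d] = 1.0f / ((1.0f - smooth) / scale_factor + smooth);
        }
    }
    return factors;
}

Dividing each rotary pair's theta by these values reproduces the Llama 3.1 long-context frequency schedule, which is why passing the tensor through to ggml_rope_ext (as the patch above does) is sufficient at inference time.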