llm: update llama.cpp commit to 8962422 (ollama#6618)
jmorganca authored Sep 4, 2024
1 parent f29b167 commit 5e2653f
Showing 7 changed files with 32 additions and 422 deletions.
19 changes: 12 additions & 7 deletions llm/ext_server/server.cpp
@@ -425,7 +425,7 @@ struct llama_server_context

n_ctx = llama_n_ctx(ctx);

- add_bos_token = llama_should_add_bos_token(model);
+ add_bos_token = llama_add_bos_token(model);

return true;
}
@@ -1031,7 +1031,7 @@ struct llama_server_context
continue;
}

- if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+ if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG_TEE("Error processing the given image");
return false;
}
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
@@ -2287,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
- params.n_threads = std::stoi(argv[i]);
+ params.cpuparams.n_threads = std::stoi(argv[i]);
}
else if (arg == "--grp-attn-n" || arg == "-gan")
{
@@ -2315,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
- params.n_threads_batch = std::stoi(argv[i]);
+ params.cpuparams_batch.n_threads = std::stoi(argv[i]);
}
else if (arg == "--threads-http")
{
@@ -2626,6 +2626,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
params.kv_overrides.back().key[0] = 0;
}

+ postprocess_cpu_params(params.cpuparams, nullptr);
+ postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+ postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+ postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);

if (invalid_param)
{
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@@ -2775,8 +2780,8 @@ int main(int argc, char **argv) {
{"commit", LLAMA_COMMIT}});

LOG_INFO("system info", {
{"n_threads", params.n_threads},
{"n_threads_batch", params.n_threads_batch},
{"n_threads", params.cpuparams.n_threads},
{"n_threads_batch", params.cpuparams_batch.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});
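Note: the server.cpp changes above track an upstream llama.cpp refactor in which the per-task thread counts moved from flat gpt_params fields (n_threads, n_threads_batch) into nested CPU-parameter structs, and llama_should_add_bos_token was renamed to llama_add_bos_token. A minimal standalone sketch of the assumed parameter layout (simplified and hypothetical; not the real gpt_params definition):

    // Sketch only: simplified stand-in for the nested CPU-parameter structs
    // that the updated server.cpp reads; the real gpt_params has many more fields.
    #include <cstdio>
    #include <string>

    struct cpu_params {
        int n_threads = 4;
    };

    struct gpt_params {
        cpu_params cpuparams;        // generation threads (was params.n_threads)
        cpu_params cpuparams_batch;  // batch/prompt threads (was params.n_threads_batch)
    };

    int main(int argc, char **argv) {
        gpt_params params;
        // "-t N" now fills the nested struct rather than a flat field.
        if (argc > 2 && std::string(argv[1]) == "-t") {
            params.cpuparams.n_threads = std::stoi(argv[2]);
        }
        printf("n_threads=%d n_threads_batch=%d\n",
               params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
        return 0;
    }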
2 changes: 1 addition & 1 deletion llm/generate/gen_darwin.sh
@@ -19,7 +19,7 @@ sign() {
fi
}

- COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+ COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"

case "${GOARCH}" in
"amd64")
2 changes: 1 addition & 1 deletion llm/llama.cpp
10 changes: 5 additions & 5 deletions llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
diff --git a/src/llama.cpp b/src/llama.cpp
- index a207451f..2ddf431d 100644
+ index 88355971..dd7d41ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
- @@ -5347,16 +5347,7 @@ static void llm_load_vocab(
+ @@ -6083,16 +6083,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
- @@ -5443,7 +5434,8 @@ static void llm_load_vocab(
- tokenizer_pre == "codeshell") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+ @@ -6188,7 +6179,8 @@ static void llm_load_vocab(
+ tokenizer_pre == "exaone") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
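The refreshed pre-tokenizer patch keeps the same behavior as before the rebase: instead of aborting on an unrecognized tokenizer_pre value, llama.cpp logs a warning and falls back to the default pre-tokenizer. A small self-contained illustration of that fallback pattern (hypothetical names; not the llm_load_vocab implementation):

    // Sketch of the throw-vs-fallback change the patch applies.
    #include <cstdio>
    #include <string>

    enum pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3 };

    static pre_type resolve_pre_tokenizer(const std::string &tokenizer_pre) {
        if (tokenizer_pre == "llama3") {
            return PRE_TYPE_LLAMA3;
        }
        // Upstream behavior: throw std::runtime_error("unknown pre-tokenizer type: ...").
        // Patched behavior: warn and continue with the default pre-tokenizer.
        fprintf(stderr, "missing or unrecognized pre-tokenizer type, using: 'default'\n");
        return PRE_TYPE_DEFAULT;
    }

    int main() {
        return resolve_pre_tokenizer("some-new-tokenizer") == PRE_TYPE_DEFAULT ? 0 : 1;
    }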
28 changes: 13 additions & 15 deletions llm/patches/06-embeddings.diff
@@ -1,45 +1,43 @@
diff --git a/src/llama.cpp b/src/llama.cpp
- index 1fe2b9f7..a43312a7 100644
+ index 88355971..d7db689b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
- @@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+ @@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
- const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
- @@ -13959,17 +13959,25 @@ static int llama_decode_internal(
+ @@ -16175,20 +16175,23 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
- res = nullptr; // do not extract logits for embedding case
- embd = nullptr;
+ }
+
+ if (cparams.embeddings) {
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
for (int i = gf->n_nodes - 1; i >= 0; --i) {
- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
- embd = gf->nodes[i];
+ embd = gf->nodes[i];
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
+ break;
+ }
break;
}
}
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
- } else {
+ } else {
- GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

ggml_backend_sched_alloc_graph(lctx.sched, gf);
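The rebased embeddings patch keeps two behaviors on top of upstream: the pooled-embeddings tensor is located by walking the graph backwards for a node named "result_embd_pooled", and logits are only reserved and extracted when causal attention is enabled (has_logits = cparams.causal_attn). A simplified, self-contained sketch of that backward search (plain structs; not the actual ggml graph types):

    // Sketch only: mimics the backward node search in the patched llama_decode_internal.
    #include <cstdio>
    #include <cstring>

    struct node { const char *name; };

    int main() {
        node nodes[] = {{"inp_tokens"}, {"result_norm"}, {"result_embd_pooled"}, {"result_output"}};
        const int n_nodes = sizeof(nodes) / sizeof(nodes[0]);

        node *embd = nullptr;
        // Walk from the last node backwards and stop at the pooled-embeddings tensor.
        for (int i = n_nodes - 1; i >= 0; --i) {
            embd = &nodes[i];
            if (strcmp(embd->name, "result_embd_pooled") == 0) {
                break;
            }
        }
        printf("embeddings tensor: %s\n", embd->name);
        return 0;
    }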