llm: update llama.cpp commit to 8962422 (ollama#6618)
jmorganca authored Sep 4, 2024
1 parent f29b167 commit 5e2653f
Showing 7 changed files with 32 additions and 422 deletions.
19 changes: 12 additions & 7 deletions llm/ext_server/server.cpp
@@ -425,7 +425,7 @@ struct llama_server_context

n_ctx = llama_n_ctx(ctx);

- add_bos_token = llama_should_add_bos_token(model);
+ add_bos_token = llama_add_bos_token(model);

return true;
}
@@ -1031,7 +1031,7 @@ struct llama_server_context
continue;
}

- if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+ if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG_TEE("Error processing the given image");
return false;
}
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
@@ -2287,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
- params.n_threads = std::stoi(argv[i]);
+ params.cpuparams.n_threads = std::stoi(argv[i]);
}
else if (arg == "--grp-attn-n" || arg == "-gan")
{
@@ -2315,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
invalid_param = true;
break;
}
- params.n_threads_batch = std::stoi(argv[i]);
+ params.cpuparams_batch.n_threads = std::stoi(argv[i]);
}
else if (arg == "--threads-http")
{
@@ -2626,6 +2626,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
params.kv_overrides.back().key[0] = 0;
}

+ postprocess_cpu_params(params.cpuparams, nullptr);
+ postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+ postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+ postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);

if (invalid_param)
{
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@@ -2775,8 +2780,8 @@ int main(int argc, char **argv) {
{"commit", LLAMA_COMMIT}});

LOG_INFO("system info", {
{"n_threads", params.n_threads},
{"n_threads_batch", params.n_threads_batch},
{"n_threads", params.cpuparams.n_threads},
{"n_threads_batch", params.cpuparams_batch.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});
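Note: the server.cpp changes above track an upstream llama.cpp refactor in which the per-task thread counts moved from flat gpt_params fields (n_threads, n_threads_batch) into nested CPU-parameter structs, and llama_should_add_bos_token was renamed to llama_add_bos_token. A minimal standalone sketch of the assumed parameter layout (simplified and hypothetical; not the real gpt_params definition):

    // Sketch only: simplified stand-in for the nested CPU-parameter structs
    // that the updated server.cpp reads; the real gpt_params has many more fields.
    #include <cstdio>
    #include <string>

    struct cpu_params {
        int n_threads = 4;
    };

    struct gpt_params {
        cpu_params cpuparams;        // generation threads (was params.n_threads)
        cpu_params cpuparams_batch;  // batch/prompt threads (was params.n_threads_batch)
    };

    int main(int argc, char **argv) {
        gpt_params params;
        // "-t N" now fills the nested struct rather than a flat field.
        if (argc > 2 && std::string(argv[1]) == "-t") {
            params.cpuparams.n_threads = std::stoi(argv[2]);
        }
        printf("n_threads=%d n_threads_batch=%d\n",
               params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
        return 0;
    }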
2 changes: 1 addition & 1 deletion llm/generate/gen_darwin.sh
@@ -19,7 +19,7 @@ sign() {
fi
}

- COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+ COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"

case "${GOARCH}" in
"amd64")
2 changes: 1 addition & 1 deletion llm/llama.cpp
10 changes: 5 additions & 5 deletions llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
diff --git a/src/llama.cpp b/src/llama.cpp
- index a207451f..2ddf431d 100644
+ index 88355971..dd7d41ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
- @@ -5347,16 +5347,7 @@ static void llm_load_vocab(
+ @@ -6083,16 +6083,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
- @@ -5443,7 +5434,8 @@ static void llm_load_vocab(
- tokenizer_pre == "codeshell") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+ @@ -6188,7 +6179,8 @@ static void llm_load_vocab(
+ tokenizer_pre == "exaone") {
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
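The refreshed pre-tokenizer patch keeps the same behavior as before the rebase: instead of aborting on an unrecognized tokenizer_pre value, llama.cpp logs a warning and falls back to the default pre-tokenizer. A small self-contained illustration of that fallback pattern (hypothetical names; not the llm_load_vocab implementation):

    // Sketch of the throw-vs-fallback change the patch applies.
    #include <cstdio>
    #include <string>

    enum pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3 };

    static pre_type resolve_pre_tokenizer(const std::string &tokenizer_pre) {
        if (tokenizer_pre == "llama3") {
            return PRE_TYPE_LLAMA3;
        }
        // Upstream behavior: throw std::runtime_error("unknown pre-tokenizer type: ...").
        // Patched behavior: warn and continue with the default pre-tokenizer.
        fprintf(stderr, "missing or unrecognized pre-tokenizer type, using: 'default'\n");
        return PRE_TYPE_DEFAULT;
    }

    int main() {
        return resolve_pre_tokenizer("some-new-tokenizer") == PRE_TYPE_DEFAULT ? 0 : 1;
    }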
28 changes: 13 additions & 15 deletions llm/patches/06-embeddings.diff
@@ -1,45 +1,43 @@
diff --git a/src/llama.cpp b/src/llama.cpp
- index 1fe2b9f7..a43312a7 100644
+ index 88355971..d7db689b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
- @@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+ @@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = !cparams.embeddings;
+ const bool has_logits = cparams.causal_attn;
- const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
- @@ -13959,17 +13959,25 @@ static int llama_decode_internal(
+ @@ -16175,20 +16175,23 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = gf->nodes[gf->n_nodes - 1];
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
- embd = gf->nodes[gf->n_nodes - 2];
- res = nullptr; // do not extract logits for embedding case
- embd = nullptr;
+ }
+
+ if (cparams.embeddings) {
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
for (int i = gf->n_nodes - 1; i >= 0; --i) {
- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
- embd = gf->nodes[i];
+ embd = gf->nodes[i];
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
+ break;
+ }
break;
}
}
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
- } else {
+ } else {
- GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
+
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

ggml_backend_sched_alloc_graph(lctx.sched, gf);
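The rebased embeddings patch keeps two behaviors on top of upstream: the pooled-embeddings tensor is located by walking the graph backwards for a node named "result_embd_pooled", and logits are only reserved and extracted when causal attention is enabled (has_logits = cparams.causal_attn). A simplified, self-contained sketch of that backward search (plain structs; not the actual ggml graph types):

    // Sketch only: mimics the backward node search in the patched llama_decode_internal.
    #include <cstdio>
    #include <cstring>

    struct node { const char *name; };

    int main() {
        node nodes[] = {{"inp_tokens"}, {"result_norm"}, {"result_embd_pooled"}, {"result_output"}};
        const int n_nodes = sizeof(nodes) / sizeof(nodes[0]);

        node *embd = nullptr;
        // Walk from the last node backwards and stop at the pooled-embeddings tensor.
        for (int i = n_nodes - 1; i >= 0; --i) {
            embd = &nodes[i];
            if (strcmp(embd->name, "result_embd_pooled") == 0) {
                break;
            }
        }
        printf("embeddings tensor: %s\n", embd->name);
        return 0;
    }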