diff --git a/llama.cpp b/llama.cpp
index b2e8be1ee43a39..57b37f6efe2cb0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3828,6 +3828,9 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
+#if defined(GGML_USE_KOMPUTE)
+    struct ggml_tensor * toDeviceTensor = nullptr;
+#endif
 
     if (batch.token) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 
@@ -3837,7 +3840,9 @@ static struct ggml_cgraph * llm_build_falcon(
             memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
-
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inp_tokens;
+#endif
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
 #ifdef GGML_USE_MPI
@@ -3850,6 +3855,9 @@ static struct ggml_cgraph * llm_build_falcon(
         if (!ggml_allocr_is_measure(lctx.alloc)) {
             memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
         }
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inpL;
+#endif
     }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -4142,6 +4150,16 @@ static struct ggml_cgraph * llm_build_falcon(
 
     ggml_free(ctx0);
 
+#if defined(GGML_USE_KOMPUTE)
+    if (lctx.ctx_kompute) {
+        if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
+            ggml_vk_h2d_all(lctx.ctx_kompute);
+        } else {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
+        }
+    }
+#endif
+
     return gf;
 }
 
@@ -7442,9 +7460,8 @@ struct llama_context * llama_new_context_with_model(
 #undef LLAMA_METAL_CHECK_BUF
         }
 #elif defined(GGML_USE_KOMPUTE)
-        // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported
         if (ggml_vk_has_device() && model->n_gpu_layers > 0
-            && model->arch == LLM_ARCH_LLAMA
+            && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
             && (model->ftype == LLAMA_FTYPE_ALL_F32
                 || model->ftype == LLAMA_FTYPE_MOSTLY_F16
                 || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0