Commit 96cee4f

Explicitly clear the kv cache each time we eval tokens to match n_past. (nomic-ai#1808)
1 parent 2d56671 commit 96cee4f

1 file changed (+2 −0):

gpt4all-backend/llamamodel.cpp
@@ -298,6 +298,8 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 
 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
+    llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);
+
     llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
 
     batch.n_tokens = tokens.size();
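
The added call truncates the KV cache for sequence 0 at positions >= ctx.n_past (passing -1 as the end position means "to the end of the sequence"), so the cache can never hold stale entries beyond the point the caller believes has been evaluated. Below is a minimal sketch of how an evalTokens-style routine might combine that truncation with batch decoding in the llama.cpp C API of this era. Only the llama_kv_cache_seq_rm call and the llama_batch_init / n_tokens lines mirror the diff; the eval_tokens wrapper and the batch-filling loop are illustrative assumptions, not the project's actual implementation.

// Minimal sketch, assuming the llama.cpp C API circa this commit.
// Only the llama_kv_cache_seq_rm call and the llama_batch_init /
// n_tokens lines come from the diff; the rest is illustrative.
#include <cstdint>
#include <vector>
#include "llama.h"

// Hypothetical helper: evaluate `tokens` on sequence 0, continuing at n_past.
static bool eval_tokens(llama_context *ctx, int32_t n_past,
                        const std::vector<int32_t> &tokens)
{
    // Remove cached entries for sequence 0 at positions [n_past, end),
    // so the KV cache agrees with n_past before decoding.
    llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
    batch.n_tokens = tokens.size();

    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        batch.token   [i] = tokens[i];
        batch.pos     [i] = n_past + i;  // positions continue from n_past
        batch.n_seq_id[i] = 1;
        batch.seq_id  [i][0] = 0;        // everything lives on sequence 0
        batch.logits  [i] = false;
    }
    batch.logits[batch.n_tokens - 1] = true;  // logits for the last token only

    const bool ok = llama_decode(ctx, batch) == 0;
    llama_batch_free(batch);
    return ok;
}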
