forked from nomic-ai/gpt4all
-
Notifications
You must be signed in to change notification settings - Fork 0
/
llmodel_shared.h
46 lines (37 loc) · 916 Bytes
/
llmodel_shared.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#pragma once
#include <cstdint>
#include <cstddef>
#include <vector>
#include <ggml.h>
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
void resize(size_t size) {
delete[] addr;
addr = new uint8_t[size];
this->size = size;
}
~llm_buffer() {
delete[] addr;
}
};
struct llm_kv_cache {
struct ggml_tensor * k;
struct ggml_tensor * v;
struct ggml_context * ctx = NULL;
llm_buffer buf;
int n; // number of tokens currently in the cache
~llm_kv_cache() {
if (ctx) {
ggml_free(ctx);
}
}
};
inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.addr;
}
ggml_graph_compute(graph, &plan);
}