Skip to content

Commit

Permalink
ggml-backend: Don't recreate the scheduler for each context
Browse files Browse the repository at this point in the history
We don't need to create and destroy the GGML scheduler for every
context. This introduces extra CPU overhead for every forward
pass and extra memory for contexts that don't actually get scheduled
(for example, KV caches). We can instead just have one scheduler
for the backend and reset it each time we call Compute.

This improves token generation performance by 1-2% and removes
scheduler create/destroy from profile traces.
  • Loading branch information
jessegross committed Feb 20, 2025
1 parent bd6a7d5 commit e5bcc51
Showing 1 changed file with 21 additions and 13 deletions.
34 changes: 21 additions & 13 deletions ml/backend/ggml/ggml.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ type Backend struct {
meta *fs.GGML
cpus, gpus []Context
tensors map[string]*Context

sched *C.struct_ggml_backend_sched
}

func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
Expand Down Expand Up @@ -182,10 +184,24 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
return nil, err
}

backends := make([]*C.struct_ggml_backend, len(gpus)+len(cpus))
bufts := make([]*C.struct_ggml_backend_buffer_type, len(gpus)+len(cpus))
for i, c := range append(gpus, cpus...) {
backends[i] = c.backend
bufts[i] = C.ggml_backend_get_default_buffer_type(c.backend)
}

return &Backend{
meta: meta,
cpus: cpus,
gpus: gpus,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
C.int(len(backends)),
C.size_t(max(8192, len(meta.Tensors().Items())*5)),
true,
),
}, nil
}

Expand Down Expand Up @@ -219,31 +235,23 @@ func (b *Backend) NewContext() ml.Context {
})

backends := make([]*C.struct_ggml_backend, len(b.gpus)+len(b.cpus))
bufts := make([]*C.struct_ggml_backend_buffer_type, len(b.gpus)+len(b.cpus))
for i, c := range append(b.gpus, b.cpus...) {
backends[i] = c.backend
bufts[i] = C.ggml_backend_get_default_buffer_type(c.backend)
}

return &Context{
b: b,
ctx: c,
backend: backends[0],
nodes: nodes,
sched: C.ggml_backend_sched_new(
(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
C.int(len(backends)),
C.size_t(nodes),
true,
),
}
}

// Context is a per-computation ggml context owned by a Backend. It
// accumulates tensor operations into graph and executes them through
// the scheduler (per the commit message above, the scheduler shared by
// the Backend rather than one created per Context).
//
// NOTE(review): this is a rendered diff with +/- markers stripped, so
// removed and added lines appear together below — verify against the
// actual post-commit file.
type Context struct {
b *Backend
ctx *C.struct_ggml_context
backend *C.struct_ggml_backend

// NOTE(review): per this commit the per-context sched field was
// removed (Compute now uses c.b.sched); this line is likely the
// diff's deleted side.
sched *C.struct_ggml_backend_sched
graph *C.struct_ggml_cgraph
nodes int
}
Expand All @@ -257,12 +265,13 @@ func (c *Context) Forward(t ml.Tensor) {
}

func (c *Context) Compute(tensors ...ml.Tensor) {
C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph)
C.ggml_backend_sched_reset(c.b.sched)

needSync := true
sync := func() {
if needSync {
C.ggml_backend_sched_synchronize(c.sched)
C.ggml_backend_sched_synchronize(c.b.sched)
needSync = false
}
}
Expand Down Expand Up @@ -350,7 +359,6 @@ func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {

// Close frees the C-allocated resources owned by this Context.
// It is safe to call on a nil receiver (nil check below).
func (c *Context) Close() {
if c != nil {
// NOTE(review): rendered diff — per this commit the sched_free call
// below was deleted, since the scheduler is now owned by the Backend
// and must outlive individual contexts; confirm against the real file.
C.ggml_backend_sched_free(c.sched)
C.ggml_free(c.ctx)
}
}
Expand Down

0 comments on commit e5bcc51

Please sign in to comment.