Skip to content

Commit

Permalink
io_uring: add blkcg accounting to offloaded operations
Browse files Browse the repository at this point in the history
A few operations are offloaded to the worker threads. In that case we
lose the process context and end up in kthread context. As a result,
the I/Os are not accounted to the issuing cgroup and consequently end
up looking as if they were issued by root. Just like the other
contexts, adopt the issuer's blkcg too when issuing via the workqueues.

The SQPOLL thread will live in, and attach to, the context of the cgroup
it was initialized in.

Signed-off-by: Dennis Zhou <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
  • Loading branch information
dennisszhou authored and axboe committed Oct 1, 2020
1 parent de29393 commit 91d8f51
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 0 deletions.
23 changes: 23 additions & 0 deletions fs/io-wq.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
#include <linux/task_work.h>
#include <linux/blk-cgroup.h>

#include "io-wq.h"

Expand Down Expand Up @@ -57,6 +58,9 @@ struct io_worker {

struct rcu_head rcu;
struct mm_struct *mm;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
Expand Down Expand Up @@ -177,6 +181,13 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
worker->mm = NULL;
}

#ifdef CONFIG_BLK_CGROUP
if (worker->blkcg_css) {
kthread_associate_blkcg(NULL);
worker->blkcg_css = NULL;
}
#endif

return dropped_lock;
}

Expand Down Expand Up @@ -439,6 +450,17 @@ static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
work->flags |= IO_WQ_WORK_CANCEL;
}

/*
 * Make the worker issue @work under the blkcg css recorded on the work item.
 *
 * Nothing is done when the worker already tracks that css; otherwise the
 * current kthread is associated with the work's css and the worker remembers
 * it in worker->blkcg_css. Compiles to an empty function when
 * CONFIG_BLK_CGROUP is not set.
 */
static inline void io_wq_switch_blkcg(struct io_worker *worker,
				      struct io_wq_work *work)
{
#ifdef CONFIG_BLK_CGROUP
	struct cgroup_subsys_state *css = work->blkcg_css;

	/* already running in the requested blkcg context */
	if (css == worker->blkcg_css)
		return;

	kthread_associate_blkcg(css);
	worker->blkcg_css = css;
#endif
}

static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
Expand Down Expand Up @@ -467,6 +489,7 @@ static void io_impersonate_work(struct io_worker *worker,
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
io_wq_switch_blkcg(worker, work);
}

static void io_assign_current_work(struct io_worker *worker,
Expand Down
3 changes: 3 additions & 0 deletions fs/io-wq.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ struct io_wq_work {
struct io_wq_work_node list;
struct files_struct *files;
struct mm_struct *mm;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
const struct cred *creds;
struct nsproxy *nsproxy;
struct fs_struct *fs;
Expand Down
92 changes: 92 additions & 0 deletions fs/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/blk-cgroup.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
Expand Down Expand Up @@ -300,6 +301,10 @@ struct io_ring_ctx {
/* Only used for accounting purposes */
struct mm_struct *mm_account;

#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *sqo_blkcg_css;
#endif

struct io_sq_data *sq_data; /* if using sq thread polling */

struct wait_queue_head sqo_sq_wait;
Expand Down Expand Up @@ -748,6 +753,8 @@ struct io_op_def {
unsigned needs_fsize : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
/* needs blkcg context, issues async io potentially */
unsigned needs_blkcg : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
Expand All @@ -761,6 +768,7 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
.needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
Expand All @@ -771,15 +779,18 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
.pollout = 1,
.needs_fsize = 1,
.needs_async_data = 1,
.needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
.needs_blkcg = 1,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
Expand All @@ -788,6 +799,7 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_fsize = 1,
.needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
Expand All @@ -797,6 +809,7 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
.needs_blkcg = 1,
},
[IORING_OP_SENDMSG] = {
.needs_mm = 1,
Expand All @@ -805,6 +818,7 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
.needs_fs = 1,
.pollout = 1,
.needs_async_data = 1,
.needs_blkcg = 1,
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_RECVMSG] = {
Expand All @@ -815,6 +829,7 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
.needs_blkcg = 1,
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
Expand Down Expand Up @@ -847,15 +862,18 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
.needs_fsize = 1,
.needs_blkcg = 1,
},
[IORING_OP_OPENAT] = {
.file_table = 1,
.needs_fs = 1,
.needs_blkcg = 1,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
.needs_file_no_error = 1,
.file_table = 1,
.needs_blkcg = 1,
},
[IORING_OP_FILES_UPDATE] = {
.needs_mm = 1,
Expand All @@ -865,13 +883,15 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
.needs_mm = 1,
.needs_fs = 1,
.file_table = 1,
.needs_blkcg = 1,
},
[IORING_OP_READ] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
Expand All @@ -880,30 +900,36 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_fsize = 1,
.needs_blkcg = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
.needs_blkcg = 1,
},
[IORING_OP_MADVISE] = {
.needs_mm = 1,
.needs_blkcg = 1,
},
[IORING_OP_SEND] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_blkcg = 1,
},
[IORING_OP_RECV] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.needs_blkcg = 1,
},
[IORING_OP_OPENAT2] = {
.file_table = 1,
.needs_fs = 1,
.needs_blkcg = 1,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
Expand All @@ -913,6 +939,7 @@ static const struct io_op_def io_op_defs[] __read_mostly = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.needs_blkcg = 1,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
Expand Down Expand Up @@ -1011,6 +1038,26 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
return __io_sq_thread_acquire_mm(ctx);
}

/*
 * Associate the SQ poll thread with the blkcg css stored in @ctx.
 *
 * @cur_css is the caller's record of the css the thread is currently
 * associated with; it is updated to ctx->sqo_blkcg_css whenever a switch is
 * made, and the call is a no-op when the two already match. Compiles to an
 * empty function when CONFIG_BLK_CGROUP is not set.
 */
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
					 struct cgroup_subsys_state **cur_css)
{
#ifdef CONFIG_BLK_CGROUP
	struct cgroup_subsys_state *target = ctx->sqo_blkcg_css;

	if (*cur_css == target)
		return;

	/* puts the old one when swapping */
	kthread_associate_blkcg(target);
	*cur_css = target;
#endif
}

/*
 * Clear the SQ poll thread's blkcg association by associating the current
 * kthread with a NULL css. Compiles to an empty function when
 * CONFIG_BLK_CGROUP is not set.
 */
static void io_sq_thread_unassociate_blkcg(void)
{
#if defined(CONFIG_BLK_CGROUP)
	kthread_associate_blkcg(NULL);
#endif
}

static inline void req_set_fail_links(struct io_kiocb *req)
{
if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
Expand Down Expand Up @@ -1148,6 +1195,10 @@ static bool io_req_clean_work(struct io_kiocb *req)
mmdrop(req->work.mm);
req->work.mm = NULL;
}
#ifdef CONFIG_BLK_CGROUP
if (req->work.blkcg_css)
css_put(req->work.blkcg_css);
#endif
if (req->work.creds) {
put_cred(req->work.creds);
req->work.creds = NULL;
Expand Down Expand Up @@ -1187,6 +1238,19 @@ static void io_prep_async_work(struct io_kiocb *req)
mmgrab(current->mm);
req->work.mm = current->mm;
}
#ifdef CONFIG_BLK_CGROUP
if (!req->work.blkcg_css && def->needs_blkcg) {
rcu_read_lock();
req->work.blkcg_css = blkcg_css();
/*
* This should be rare, either the cgroup is dying or the task
* is moving cgroups. Just punt to root for the handful of ios.
*/
if (!css_tryget_online(req->work.blkcg_css))
req->work.blkcg_css = NULL;
rcu_read_unlock();
}
#endif
if (!req->work.creds)
req->work.creds = get_current_cred();
if (!req->work.fs && def->needs_fs) {
Expand Down Expand Up @@ -6789,6 +6853,7 @@ static void io_sqd_init_new(struct io_sq_data *sqd)

static int io_sq_thread(void *data)
{
struct cgroup_subsys_state *cur_css = NULL;
const struct cred *old_cred = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
Expand Down Expand Up @@ -6818,6 +6883,7 @@ static int io_sq_thread(void *data)
revert_creds(old_cred);
old_cred = override_creds(ctx->creds);
}
io_sq_thread_associate_blkcg(ctx, &cur_css);

ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);

Expand All @@ -6841,6 +6907,8 @@ static int io_sq_thread(void *data)

io_run_task_work();

if (cur_css)
io_sq_thread_unassociate_blkcg();
if (old_cred)
revert_creds(old_cred);

Expand Down Expand Up @@ -8304,6 +8372,11 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}

#ifdef CONFIG_BLK_CGROUP
if (ctx->sqo_blkcg_css)
css_put(ctx->sqo_blkcg_css);
#endif

io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
Expand Down Expand Up @@ -9288,6 +9361,25 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
mmgrab(current->mm);
ctx->mm_account = current->mm;

#ifdef CONFIG_BLK_CGROUP
/*
* The sq thread will belong to the original cgroup it was inited in.
* If the cgroup goes offline (e.g. disabling the io controller), then
* issued bios will be associated with the closest cgroup later in the
* block layer.
*/
rcu_read_lock();
ctx->sqo_blkcg_css = blkcg_css();
ret = css_tryget_online(ctx->sqo_blkcg_css);
rcu_read_unlock();
if (!ret) {
/* don't init against a dying cgroup, have the user try again */
ctx->sqo_blkcg_css = NULL;
ret = -ENODEV;
goto err;
}
#endif

/*
* Account memory _before_ installing the file descriptor. Once
* the descriptor is installed, it can get closed at any time. Also
Expand Down

0 comments on commit 91d8f51

Please sign in to comment.