From 5769a351b89cd4d82016f18fa5f6c4077403564d Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 17 Jun 2020 17:53:55 +0800 Subject: [PATCH 001/127] io_uring: change the poll type to be 32-bits poll events should be 32-bits to cover EPOLLEXCLUSIVE. Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed. We call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should check the feature bit first. Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- include/uapi/linux/io_uring.h | 4 +++- tools/io_uring/liburing.h | 6 +++++- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a78201b9617965..0eb063daa9b5fe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4589,7 +4589,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u16 events; + u32 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4598,7 +4598,10 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (!poll->file) return -EBADF; - events = READ_ONCE(sqe->poll_events); + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; io_get_req_task(req); @@ -7928,7 +7931,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | + IORING_FEAT_POLL_32BITS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -8217,7 +8221,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); - BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 92c22699a5a74e..8d033961cb7841 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -31,7 +31,8 @@ struct io_uring_sqe { union { __kernel_rwf_t rw_flags; __u32 fsync_flags; - __u16 poll_events; + __u16 poll_events; /* compatibility */ + __u32 poll32_events; /* word-reversed for BE */ __u32 sync_range_flags; __u32 msg_flags; __u32 timeout_flags; @@ -248,6 +249,7 @@ struct io_uring_params { #define IORING_FEAT_RW_CUR_POS (1U << 3) #define IORING_FEAT_CUR_PERSONALITY (1U << 4) #define IORING_FEAT_FAST_POLL (1U << 5) +#define IORING_FEAT_POLL_32BITS (1U << 6) /* * io_uring_register(2) opcodes and arguments diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h index 5f305c86b89200..28a837b6069dfd 100644 --- a/tools/io_uring/liburing.h +++ b/tools/io_uring/liburing.h @@ -10,6 +10,7 @@ extern "C" { #include #include "../../include/uapi/linux/io_uring.h" #include +#include #include "barrier.h" /* @@ -145,11 +146,14 @@ static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, } static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, - short poll_mask) + unsigned poll_mask) { memset(sqe, 0, sizeof(*sqe)); sqe->opcode = IORING_OP_POLL_ADD; sqe->fd = fd; +#if __BYTE_ORDER == __BIG_ENDIAN + poll_mask = __swahw32(poll_mask); +#endif sqe->poll_events = poll_mask; } From a31eb4a2f1650fa578082ad9e9845487ecd90abe Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Wed, 17 Jun 2020 17:53:56 +0800 Subject: [PATCH 002/127] io_uring: use EPOLLEXCLUSIVE flag to aoid thundering herd type behavior Applications can pass this flag in to avoid accept thundering herd. Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0eb063daa9b5fe..311e8038ae5855 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4245,7 +4245,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = 0; poll->head = head; - add_wait_queue(head, &poll->wait); + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); } static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, @@ -4602,7 +4606,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe #ifdef __BIG_ENDIAN events = swahw32(events); #endif - poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | + (events & EPOLLEXCLUSIVE); io_get_req_task(req); return 0; From a087e2b519929152fdde8299457e32d5a8994a7c Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:07 -0700 Subject: [PATCH 003/127] io_uring: add wrappers for memory accounting Facilitate separation of locked memory usage reporting vs. limiting for upcoming patches. No functional changes. Signed-off-by: Bijan Mottahedeh [axboe: kill unnecessary () around return in io_account_mem()] Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 311e8038ae5855..9db9f09499d160 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6968,12 +6968,14 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return ret; } -static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) { atomic_long_sub(nr_pages, &user->locked_vm); } -static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +static inline int __io_account_mem(struct user_struct *user, + unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -6991,6 +6993,20 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->account_mem) + __io_unaccount_mem(ctx->user, nr_pages); +} + +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->account_mem) + return __io_account_mem(ctx->user, nr_pages); + + return 0; +} + static void io_mem_free(void *ptr) { struct page *page; @@ -7065,8 +7081,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, imu->nr_bvecs); + io_unaccount_mem(ctx, imu->nr_bvecs); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7149,11 +7164,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - if (ctx->account_mem) { - ret = io_account_mem(ctx->user, nr_pages); - if (ret) - goto err; - } + ret = io_account_mem(ctx, nr_pages); + if (ret) + goto err; ret = 0; if (!pages || nr_pages > got_pages) { @@ -7166,8 +7179,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); goto err; } got_pages = nr_pages; @@ -7177,8 +7189,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); goto err; } @@ -7209,8 +7220,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, nr_pages); + io_unaccount_mem(ctx, nr_pages); kvfree(imu->bvec); goto err; } @@ -7315,9 +7325,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, - ring_pages(ctx->sq_entries, ctx->cq_entries)); + io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->cancel_hash); @@ -7887,7 +7895,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, account_mem = !capable(CAP_IPC_LOCK); if (account_mem) { - ret = io_account_mem(user, + ret = __io_account_mem(user, ring_pages(p->sq_entries, p->cq_entries)); if (ret) { free_uid(user); @@ -7898,7 +7906,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) { if (account_mem) - io_unaccount_mem(user, ring_pages(p->sq_entries, + __io_unaccount_mem(user, ring_pages(p->sq_entries, p->cq_entries)); free_uid(user); return -ENOMEM; From aad5d8da1b301fe399d65f2dcb84df2ec60caaa3 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:08 -0700 Subject: [PATCH 004/127] io_uring: rename ctx->account_mem field Rename account_mem to limit_name to clarify its purpose. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9db9f09499d160..fcaf9eee3420df 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -226,7 +226,7 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int account_mem: 1; + unsigned int limit_mem: 1; unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; @@ -6995,13 +6995,13 @@ static inline int __io_account_mem(struct user_struct *user, static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->account_mem) + if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->account_mem) + if (ctx->limit_mem) return __io_account_mem(ctx->user, nr_pages); return 0; @@ -7853,7 +7853,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, { struct user_struct *user = NULL; struct io_ring_ctx *ctx; - bool account_mem; + bool limit_mem; int ret; if (!entries) @@ -7892,9 +7892,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, } user = get_uid(current_user()); - account_mem = !capable(CAP_IPC_LOCK); + limit_mem = !capable(CAP_IPC_LOCK); - if (account_mem) { + if (limit_mem) { ret = __io_account_mem(user, ring_pages(p->sq_entries, p->cq_entries)); if (ret) { @@ -7905,14 +7905,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) { - if (account_mem) + if (limit_mem) __io_unaccount_mem(user, ring_pages(p->sq_entries, p->cq_entries)); free_uid(user); return -ENOMEM; } ctx->compat = in_compat_syscall(); - ctx->account_mem = account_mem; + ctx->limit_mem = limit_mem; ctx->user = user; ctx->creds = get_current_cred(); From 309758254ea62e07471abcaeca5b5c2173f4ebc2 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:09 -0700 Subject: [PATCH 005/127] io_uring: report pinned memory usage Report pinned memory usage always, regardless of whether locked memory limit is enforced. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fcaf9eee3420df..5ea55de3edefb9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6997,12 +6997,23 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); + + if (ctx->sqo_mm) + atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->limit_mem) - return __io_account_mem(ctx->user, nr_pages); + int ret; + + if (ctx->limit_mem) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + + if (ctx->sqo_mm) + atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); return 0; } @@ -7304,8 +7315,10 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); - if (ctx->sqo_mm) + if (ctx->sqo_mm) { mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); @@ -7912,7 +7925,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, return -ENOMEM; } ctx->compat = in_compat_syscall(); - ctx->limit_mem = limit_mem; ctx->user = user; ctx->creds = get_current_cred(); @@ -7960,6 +7972,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries)); + ctx->limit_mem = limit_mem; return ret; err: io_ring_ctx_wait_and_kill(ctx); From 2e0464d48f32a9e78e2aa85cbbedc77ecbb6ed60 Mon Sep 17 00:00:00 2001 From: Bijan Mottahedeh Date: Tue, 16 Jun 2020 16:36:10 -0700 Subject: [PATCH 006/127] io_uring: separate reporting of ring pages from registered pages Ring pages are not pinned so it is more appropriate to report them as locked. Signed-off-by: Bijan Mottahedeh Signed-off-by: Jens Axboe --- fs/io_uring.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5ea55de3edefb9..10b293780703a0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -880,6 +880,11 @@ static const struct io_op_def io_op_defs[] = { }, }; +enum io_mem_account { + ACCT_LOCKED, + ACCT_PINNED, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); @@ -6993,16 +6998,22 @@ static inline int __io_account_mem(struct user_struct *user, return 0; } -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) { if (ctx->limit_mem) __io_unaccount_mem(ctx->user, nr_pages); - if (ctx->sqo_mm) - atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm -= nr_pages; + else if (acct == ACCT_PINNED) + atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm); + } } -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, + enum io_mem_account acct) { int ret; @@ -7012,8 +7023,12 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return ret; } - if (ctx->sqo_mm) - atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + if (ctx->sqo_mm) { + if (acct == ACCT_LOCKED) + ctx->sqo_mm->locked_vm += nr_pages; + else if (acct == ACCT_PINNED) + atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm); + } return 0; } @@ -7092,7 +7107,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) for (j = 0; j < imu->nr_bvecs; j++) unpin_user_page(imu->bvec[j].bv_page); - io_unaccount_mem(ctx, imu->nr_bvecs); + io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED); kvfree(imu->bvec); imu->nr_bvecs = 0; } @@ -7175,7 +7190,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - ret = io_account_mem(ctx, nr_pages); + ret = io_account_mem(ctx, nr_pages, ACCT_PINNED); if (ret) goto err; @@ -7190,7 +7205,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); if (!pages || !vmas) { ret = -ENOMEM; - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } got_pages = nr_pages; @@ -7200,7 +7215,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, GFP_KERNEL); ret = -ENOMEM; if (!imu->bvec) { - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); goto err; } @@ -7231,7 +7246,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, */ if (pret > 0) unpin_user_pages(pages, pret); - io_unaccount_mem(ctx, nr_pages); + io_unaccount_mem(ctx, nr_pages, ACCT_PINNED); kvfree(imu->bvec); goto err; } @@ -7338,7 +7353,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries)); + io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries), + ACCT_LOCKED); free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->cancel_hash); @@ -7972,7 +7988,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries)); + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); ctx->limit_mem = limit_mem; return ret; err: From 5a473e8311b582a40c10409a0f4bb39f42aa8123 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Jun 2020 11:23:39 -0600 Subject: [PATCH 007/127] block: provide plug based way of signaling forced no-wait semantics Provide a way for the caller to specify that IO should be marked with REQ_NOWAIT to avoid blocking on allocation. Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++++ include/linux/blkdev.h | 1 + 2 files changed, 7 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 03252af8c82c82..62a4904db921cf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -958,6 +958,7 @@ generic_make_request_checks(struct bio *bio) struct request_queue *q; int nr_sectors = bio_sectors(bio); blk_status_t status = BLK_STS_IOERR; + struct blk_plug *plug; char b[BDEVNAME_SIZE]; might_sleep(); @@ -971,6 +972,10 @@ generic_make_request_checks(struct bio *bio) goto end_io; } + plug = blk_mq_plug(q, bio); + if (plug && plug->nowait) + bio->bi_opf |= REQ_NOWAIT; + /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. @@ -1800,6 +1805,7 @@ void blk_start_plug(struct blk_plug *plug) INIT_LIST_HEAD(&plug->cb_list); plug->rq_count = 0; plug->multiple_queues = false; + plug->nowait = false; /* * Store ordering should not be needed here, since a potential diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8fd900998b4e2e..6e067dca94cf23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1189,6 +1189,7 @@ struct blk_plug { struct list_head cb_list; /* md requires an unplug callback */ unsigned short rq_count; bool multiple_queues; + bool nowait; }; #define BLK_MAX_REQUEST_COUNT 16 #define BLK_PLUG_FLUSH_SIZE (128 * 1024) From ac8691c415e0ce0b8734cb6d9df2df18608eebed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jun 2020 08:30:41 -0600 Subject: [PATCH 008/127] io_uring: always plug for any number of IOs Currently we only plug if we're doing more than two request. We're going to be relying on always having the plug there to pass down information, so plug unconditionally. Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 10b293780703a0..de894455f6bd2e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -676,7 +676,6 @@ struct io_kiocb { }; }; -#define IO_PLUG_THRESHOLD 2 #define IO_IOPOLL_BATCH 8 struct io_submit_state { @@ -5914,7 +5913,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct file *ring_file, int ring_fd) { - struct io_submit_state state, *statep = NULL; + struct io_submit_state state; struct io_kiocb *link = NULL; int i, submitted = 0; @@ -5931,10 +5930,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, nr); - statep = &state; - } + io_submit_state_start(&state, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; @@ -5949,14 +5945,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, io_consume_sqe(ctx); break; } - req = io_alloc_req(ctx, statep); + req = io_alloc_req(ctx, &state); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - err = io_init_req(ctx, req, sqe, statep); + err = io_init_req(ctx, req, sqe, &state); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; @@ -5982,8 +5978,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } if (link) io_queue_link_head(link); - if (statep) - io_submit_state_end(&state); + io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); From 4503b7676a2e0abe69c2f2c0d8b03aec53f2f048 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jun 2020 10:00:27 -0600 Subject: [PATCH 009/127] io_uring: catch -EIO from buffered issue request failure -EIO bubbles up like -EAGAIN if we fail to allocate a request at the lower level. Play it safe and treat it like -EAGAIN in terms of sync retry, to avoid passing back an errant -EIO. Catch some of these early for block based file, as non-mq devices generally do not support NOWAIT. That saves us some overhead by not first trying, then retrying from async context. We can go straight to async punt instead. Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index de894455f6bd2e..c5ee6d1a92d304 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2088,6 +2088,15 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return state->file; } +static bool io_bdev_nowait(struct block_device *bdev) +{ +#ifdef CONFIG_BLOCK + return !bdev || queue_is_mq(bdev_get_queue(bdev)); +#else + return true; +#endif +} + /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. For now, just ensure that anything potentially problematic is done @@ -2097,10 +2106,19 @@ static bool io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) - return true; - if (S_ISREG(mode) && file->f_op != &io_uring_fops) + if (S_ISBLK(mode)) { + if (io_bdev_nowait(file->f_inode->i_bdev)) + return true; + return false; + } + if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; + if (S_ISREG(mode)) { + if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + file->f_op != &io_uring_fops) + return true; + return false; + } /* any ->read/write should understand O_NONBLOCK */ if (file->f_flags & O_NONBLOCK) @@ -2650,7 +2668,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { - ssize_t ret2; + ssize_t ret2 = 0; if (req->file->f_op->read_iter) ret2 = call_read_iter(req->file, kiocb, &iter); @@ -2658,7 +2676,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) { + if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2); } else { copy_iov: From b63534c41e20b474483b4ddf47efc858c17352e0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Jun 2020 11:28:00 -0600 Subject: [PATCH 010/127] io_uring: re-issue block requests that failed because of resources Mark the plug with nowait == true, which will cause requests to avoid blocking on request allocation. If they do, we catch them and reissue them from a task_work based handler. Normally we can catch -EAGAIN directly, but the hard case is for split requests. As an example, the application issues a 512KB request. The block core will split this into 128KB if that's the max size for the device. The first request issues just fine, but we run into -EAGAIN for some latter splits for the same request. As the bio is split, we don't get to see the -EAGAIN until one of the actual reads complete, and hence we cannot handle it inline as part of submission. This does potentially cause re-reads of parts of the range, as the whole request is reissued. There's currently no better way to handle this. Signed-off-by: Jens Axboe --- fs/io_uring.c | 148 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 124 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c5ee6d1a92d304..f3dbf83fabf381 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -900,6 +900,13 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe); +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter, + bool needs_lock); +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1978,12 +1985,115 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) __io_cqring_add_event(req, res, cflags); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req, int error) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + ssize_t ret = -ECANCELED; + struct iov_iter iter; + int rw; + + if (error) { + ret = error; + goto end_req; + } + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + rw = READ; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + rw = WRITE; + break; + default: + printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n", + req->opcode); + goto end_req; + } + + ret = io_import_iovec(rw, req, &iovec, &iter, false); + if (ret < 0) + goto end_req; + ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); + if (!ret) + return true; + kfree(iovec); +end_req: + io_cqring_add_event(req, ret); + req_set_fail_links(req); + io_put_req(req); + return false; +} + +static void io_rw_resubmit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; + int err; + + __set_current_state(TASK_RUNNING); + + err = io_sq_thread_acquire_mm(ctx, req); + + if (io_resubmit_prep(req, err)) { + refcount_inc(&req->refs); + io_queue_async_work(req); + } +} +#endif + +static bool io_rw_reissue(struct io_kiocb *req, long res) +{ +#ifdef CONFIG_BLOCK + struct task_struct *tsk; + int ret; + + if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) + return false; + + tsk = req->task; + init_task_work(&req->task_work, io_rw_resubmit); + ret = task_work_add(tsk, &req->task_work, true); + if (!ret) + return true; +#endif + return false; +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - io_complete_rw_common(kiocb, res); - io_put_req(req); + if (!io_rw_reissue(req, res)) { + io_complete_rw_common(kiocb, res); + io_put_req(req); + } } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2169,6 +2279,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (kiocb->ki_flags & IOCB_NOWAIT) req->flags |= REQ_F_NOWAIT; + if (kiocb->ki_flags & IOCB_DIRECT) + io_get_req_task(req); + if (force_nonblock) kiocb->ki_flags |= IOCB_NOWAIT; @@ -2668,6 +2781,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2 = 0; if (req->file->f_op->read_iter) @@ -2679,6 +2793,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { kiocb_done(kiocb, ret2); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -2765,6 +2881,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { + unsigned long nr_segs = iter.nr_segs; ssize_t ret2; /* @@ -2802,6 +2919,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2); } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); @@ -4282,28 +4401,6 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&pt->req->apoll->poll, pt, head); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } - - return 0; -} - static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); @@ -5814,6 +5911,9 @@ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { blk_start_plug(&state->plug); +#ifdef CONFIG_BLOCK + state->plug.nowait = true; +#endif state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; From 2e85abf053b99a6488f1b529d7aa3b8d7478adae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 08:59:42 -0600 Subject: [PATCH 011/127] mm: allow read-ahead with IOCB_NOWAIT set The read-ahead shouldn't block, so allow it to be done even if IOCB_NOWAIT is set in the kiocb. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- mm/filemap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb4d..3378d4fca88317 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2028,8 +2028,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) - goto would_block; page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); From c7510ab2cf5ccd997fe7f194edfe09cc511abf99 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 23 May 2020 08:22:14 -0600 Subject: [PATCH 012/127] mm: abstract out wake_page_match() from wake_page_function() No functional changes in this patch, just in preparation for allowing more callers. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- include/linux/pagemap.h | 37 +++++++++++++++++++++++++++++++++++++ mm/filemap.c | 35 ++++------------------------------- 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cf2468da68e91e..2f18221bb5c8f0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -496,6 +496,43 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, return pgoff; } +/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ +struct wait_page_key { + struct page *page; + int bit_nr; + int page_match; +}; + +struct wait_page_queue { + struct page *page; + int bit_nr; + wait_queue_entry_t wait; +}; + +static inline int wake_page_match(struct wait_page_queue *wait_page, + struct wait_page_key *key) +{ + if (wait_page->page != key->page) + return 0; + key->page_match = 1; + + if (wait_page->bit_nr != key->bit_nr) + return 0; + + /* + * Stop walking if it's locked. + * Is this safe if put_and_wait_on_page_locked() is in use? + * Yes: the waker must hold a reference to this page, and if PG_locked + * has now already been set by another task, that task must also hold + * a reference to the *same usage* of this page; so there is no need + * to walk on to wake even the put_and_wait_on_page_locked() callers. + */ + if (test_bit(key->bit_nr, &key->page->flags)) + return -1; + + return 1; +} + extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, diff --git a/mm/filemap.c b/mm/filemap.c index 3378d4fca88317..c3175dbd8fbac5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -987,43 +987,16 @@ void __init pagecache_init(void) page_writeback_init(); } -/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ -struct wait_page_key { - struct page *page; - int bit_nr; - int page_match; -}; - -struct wait_page_queue { - struct page *page; - int bit_nr; - wait_queue_entry_t wait; -}; - static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); + int ret; - if (wait_page->page != key->page) - return 0; - key->page_match = 1; - - if (wait_page->bit_nr != key->bit_nr) - return 0; - - /* - * Stop walking if it's locked. - * Is this safe if put_and_wait_on_page_locked() is in use? - * Yes: the waker must hold a reference to this page, and if PG_locked - * has now already been set by another task, that task must also hold - * a reference to the *same usage* of this page; so there is no need - * to walk on to wake even the put_and_wait_on_page_locked() callers. - */ - if (test_bit(key->bit_nr, &key->page->flags)) - return -1; - + ret = wake_page_match(wait_page, key); + if (ret != 1) + return ret; return autoremove_wake_function(wait, mode, sync, key); } From dd3e6d5039de1cbff4e20e2b34390ff44cdb182f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:12:09 -0600 Subject: [PATCH 013/127] mm: add support for async page locking Normally waiting for a page to become unlocked, or locking the page, requires waiting for IO to complete. Add support for lock_page_async() and wait_on_page_locked_async(), which are callback based instead. This allows a caller to get notified when a page becomes unlocked, rather than wait for it. We add a new iocb field, ki_waitq, to pass in the necessary data for this to happen. We can unionize this with ki_cookie, since that is only used for polled IO. Polled IO can never co-exist with async callbacks, as it is (by definition) polled completions. struct wait_page_key is made public, and we define struct wait_page_async as the interface between the caller and the core. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- include/linux/fs.h | 7 ++++++- include/linux/pagemap.h | 17 ++++++++++++++++ mm/filemap.c | 45 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f881a892ea746..2a5cf6080e6882 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,8 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +/* iocb->ki_waitq is valid */ +#define IOCB_WAITQ (1 << 8) struct kiocb { struct file *ki_filp; @@ -328,7 +330,10 @@ struct kiocb { int ki_flags; u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ - unsigned int ki_cookie; /* for ->iopoll */ + union { + unsigned int ki_cookie; /* for ->iopoll */ + struct wait_page_queue *ki_waitq; /* for async buffered IO */ + }; randomized_struct_fields_end }; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2f18221bb5c8f0..e053e1d9a4d71c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -535,6 +535,7 @@ static inline int wake_page_match(struct wait_page_queue *wait_page, extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); +extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); @@ -571,6 +572,22 @@ static inline int lock_page_killable(struct page *page) return 0; } +/* + * lock_page_async - Lock the page, unless this would block. If the page + * is already locked, then queue a callback when the page becomes unlocked. + * This callback can then retry the operation. + * + * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page + * was already locked and the callback defined in 'wait' was queued. + */ +static inline int lock_page_async(struct page *page, + struct wait_page_queue *wait) +{ + if (!trylock_page(page)) + return __lock_page_async(page, wait); + return 0; +} + /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. diff --git a/mm/filemap.c b/mm/filemap.c index c3175dbd8fbac5..e8aaf43bee9f80 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1180,6 +1180,36 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit_killable); +static int __wait_on_page_locked_async(struct page *page, + struct wait_page_queue *wait, bool set) +{ + struct wait_queue_head *q = page_waitqueue(page); + int ret = 0; + + wait->page = page; + wait->bit_nr = PG_locked; + + spin_lock_irq(&q->lock); + __add_wait_queue_entry_tail(q, &wait->wait); + SetPageWaiters(page); + if (set) + ret = !trylock_page(page); + else + ret = PageLocked(page); + /* + * If we were succesful now, we know we're still on the + * waitqueue as we're still under the lock. This means it's + * safe to remove and return success, we know the callback + * isn't going to trigger. + */ + if (!ret) + __remove_wait_queue(q, &wait->wait); + else + ret = -EIOCBQUEUED; + spin_unlock_irq(&q->lock); + return ret; +} + /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. @@ -1342,6 +1372,11 @@ int __lock_page_killable(struct page *__page) } EXPORT_SYMBOL_GPL(__lock_page_killable); +int __lock_page_async(struct page *page, struct wait_page_queue *wait) +{ + return __wait_on_page_locked_async(page, wait, true); +} + /* * Return values: * 1 - page is locked; mmap_lock is still held. @@ -2131,6 +2166,11 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } readpage: + if (iocb->ki_flags & IOCB_NOWAIT) { + unlock_page(page); + put_page(page); + goto would_block; + } /* * A previous I/O error may have been due to temporary * failures, eg. multipath errors. @@ -2150,7 +2190,10 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } if (!PageUptodate(page)) { - error = lock_page_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) + error = lock_page_async(page, iocb->ki_waitq); + else + error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; if (!PageUptodate(page)) { From 1a0a7853b901c35a742b3bf176cf4701a5c5817c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:18:38 -0600 Subject: [PATCH 014/127] mm: support async buffered reads in generic_file_buffered_read() Use the async page locking infrastructure, if IOCB_WAITQ is set in the passed in iocb. The caller must expect an -EIOCBQUEUED return value, which means that IO is started but not done yet. This is similar to how O_DIRECT signals the same operation. Once the callback is received by the caller for IO completion, the caller must retry the operation. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- mm/filemap.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index e8aaf43bee9f80..a5b1fa8f7ce478 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1210,6 +1210,14 @@ static int __wait_on_page_locked_async(struct page *page, return ret; } +static int wait_on_page_locked_async(struct page *page, + struct wait_page_queue *wait) +{ + if (!PageLocked(page)) + return 0; + return __wait_on_page_locked_async(compound_head(page), wait, false); +} + /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. @@ -2049,17 +2057,25 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, index, last_index - index); } if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - goto would_block; - } - /* * See comment in do_read_cache_page on why * wait_on_page_locked is used to avoid unnecessarily * serialisations and why it's safe. */ - error = wait_on_page_locked_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) { + if (written) { + put_page(page); + goto out; + } + error = wait_on_page_locked_async(page, + iocb->ki_waitq); + } else { + if (iocb->ki_flags & IOCB_NOWAIT) { + put_page(page); + goto would_block; + } + error = wait_on_page_locked_killable(page); + } if (unlikely(error)) goto readpage_error; if (PageUptodate(page)) @@ -2147,7 +2163,10 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, page_not_up_to_date: /* Get exclusive access to the page ... */ - error = lock_page_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) + error = lock_page_async(page, iocb->ki_waitq); + else + error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; @@ -2190,10 +2209,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_WAITQ) - error = lock_page_async(page, iocb->ki_waitq); - else - error = lock_page_killable(page); + error = lock_page_killable(page); if (unlikely(error)) goto readpage_error; if (!PageUptodate(page)) { From c2a25ec0f1005dde004cd671484f578a9c8ca7de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:12:51 -0600 Subject: [PATCH 015/127] fs: add FMODE_BUF_RASYNC If set, this indicates that the file system supports IOCB_WAITQ for buffered reads. Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a5cf6080e6882..4090320360f44f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) +/* File supports async buffered reads */ +#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are From a304f0744824fd37d6e1aab4f9715f907724ad11 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:14:08 -0600 Subject: [PATCH 016/127] block: flag block devices as supporting IOCB_WAITQ Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 0ae656e022fd57..679d9346b87147 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1851,7 +1851,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; From f89fb730aa02f451fba1f8d5964dfec244d2e2d1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:27:33 -0600 Subject: [PATCH 017/127] xfs: flag files as supporting buffered async reads XFS uses generic_file_read_iter(), which already supports this. Acked-by: Darrick J. Wong Signed-off-by: Jens Axboe --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 00db81eac80d6c..fdbff4860d61c6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1080,7 +1080,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return 0; } From 8730f12b7962b21ea9ad2756abce1e205d22db84 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 10:19:22 -0600 Subject: [PATCH 018/127] btrfs: flag files as supporting buffered async reads btrfs uses generic_file_read_iter(), which already supports this. Acked-by: Chris Mason Signed-off-by: Jens Axboe --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2c14312b05e8ee..234a418eb6dad8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3472,7 +3472,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) static int btrfs_file_open(struct inode *inode, struct file *filp) { - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return generic_file_open(inode, filp); } From d1932dc3dc268f8dd5201c64971324d06ba977cc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 10:18:23 -0600 Subject: [PATCH 019/127] mm: add kiocb_wait_page_queue_init() helper Checks if the file supports it, and initializes the values that we need. Caller passes in 'data' pointer, if any, and the callback function to be used. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- include/linux/pagemap.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e053e1d9a4d71c..7386bc67cc5a76 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -533,6 +533,27 @@ static inline int wake_page_match(struct wait_page_queue *wait_page, return 1; } +static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb, + struct wait_page_queue *wait, + wait_queue_func_t func, + void *data) +{ + /* Can't support async wakeup with polled IO */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) { + wait->wait.func = func; + wait->wait.private = data; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_waitq = wait; + return 0; + } + + return -EOPNOTSUPP; +} + extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); From bcf5a06304d69a3bb194a494d87b532d5e90b01c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 May 2020 09:24:42 -0600 Subject: [PATCH 020/127] io_uring: support true async buffered reads, if file provides it If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt the buffered read to an io-wq worker. Instead we can rely on page unlocking callbacks to support retry based async IO. This is a lot more efficient than doing async thread offload. The retry is done similarly to how we handle poll based retry. From the unlock callback, we simply queue the retry to a task_work based handler. Signed-off-by: Jens Axboe --- fs/io_uring.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 135 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f3dbf83fabf381..5d1685e206c15c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,6 +78,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -503,6 +504,8 @@ struct io_async_rw { struct iovec *iov; ssize_t nr_segs; ssize_t size; + struct wait_page_queue wpq; + struct callback_head task_work; }; struct io_async_ctx { @@ -2750,6 +2753,126 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static void __io_async_buf_error(struct io_kiocb *req, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(req, error); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail_links(req); + io_double_put_req(req); +} + +static void io_async_buf_cancel(struct callback_head *cb) +{ + struct io_async_rw *rw; + struct io_kiocb *req; + + rw = container_of(cb, struct io_async_rw, task_work); + req = rw->wpq.wait.private; + __io_async_buf_error(req, -ECANCELED); +} + +static void io_async_buf_retry(struct callback_head *cb) +{ + struct io_async_rw *rw; + struct io_ring_ctx *ctx; + struct io_kiocb *req; + + rw = container_of(cb, struct io_async_rw, task_work); + req = rw->wpq.wait.private; + ctx = req->ctx; + + __set_current_state(TASK_RUNNING); + if (!io_sq_thread_acquire_mm(ctx, req)) { + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL); + mutex_unlock(&ctx->uring_lock); + } else { + __io_async_buf_error(req, -EFAULT); + } +} + +static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, + int sync, void *arg) +{ + struct wait_page_queue *wpq; + struct io_kiocb *req = wait->private; + struct io_async_rw *rw = &req->io->rw; + struct wait_page_key *key = arg; + struct task_struct *tsk; + int ret; + + wpq = container_of(wait, struct wait_page_queue, wait); + + ret = wake_page_match(wpq, key); + if (ret != 1) + return ret; + + list_del_init(&wait->entry); + + init_task_work(&rw->task_work, io_async_buf_retry); + /* submit ref gets dropped, acquire a new one */ + refcount_inc(&req->refs); + tsk = req->task; + ret = task_work_add(tsk, &rw->task_work, true); + if (unlikely(ret)) { + /* queue just for cancelation */ + init_task_work(&rw->task_work, io_async_buf_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &rw->task_work, true); + } + wake_up_process(tsk); + return 1; +} + +static bool io_rw_should_retry(struct io_kiocb *req) +{ + struct kiocb *kiocb = &req->rw.kiocb; + int ret; + + /* never retry for NOWAIT, we just complete with -EAGAIN */ + if (req->flags & REQ_F_NOWAIT) + return false; + + /* already tried, or we're doing O_DIRECT */ + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) + return false; + /* + * just use poll if we can, and don't attempt if the fs doesn't + * support callback based unlocks + */ + if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + return false; + + /* + * If request type doesn't require req->io to defer in general, + * we need to allocate it here + */ + if (!req->io && __io_alloc_async_ctx(req)) + return false; + + ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, + io_async_buf_func, req); + if (!ret) { + io_get_req_task(req); + return true; + } + + return false; +} + +static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +{ + if (req->file->f_op->read_iter) + return call_read_iter(req->file, &req->rw.kiocb, iter); + return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); +} + static int io_read(struct io_kiocb *req, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; @@ -2784,10 +2907,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) unsigned long nr_segs = iter.nr_segs; ssize_t ret2 = 0; - if (req->file->f_op->read_iter) - ret2 = call_read_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); + ret2 = io_iter_do_read(req, &iter); /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { @@ -2804,6 +2924,17 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (!(req->flags & REQ_F_NOWAIT) && !file_can_poll(req->file)) req->flags |= REQ_F_MUST_PUNT; + /* if we can retry, do so with the callbacks armed */ + if (io_rw_should_retry(req)) { + ret2 = io_iter_do_read(req, &iter); + if (ret2 == -EIOCBQUEUED) { + goto out_free; + } else if (ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2); + goto out_free; + } + } + kiocb->ki_flags &= ~IOCB_WAITQ; return -EAGAIN; } } From 62ef73165091476d31f31e33d9d0d48b088c129d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:50 +0300 Subject: [PATCH 021/127] io_uring: remove setting REQ_F_MUST_PUNT in rw io_{read,write}() { ... copy_iov: // prep async if (!(flags & REQ_F_NOWAIT) && !file_can_poll(file)) flags |= REQ_F_MUST_PUNT; } REQ_F_MUST_PUNT there is pointless, because if it happens then REQ_F_NOWAIT is known to be _not_ set, and the request will go async path in __io_queue_sqe() anyway. file_can_poll() check is also repeated in arm_poll*(), so don't need it. Remove the mentioned assignment REQ_F_MUST_PUNT in preparation for killing the flag. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d1685e206c15c..13f72d2a3fecee 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2920,10 +2920,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) inline_vecs, &iter); if (ret) goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; /* if we can retry, do so with the callbacks armed */ if (io_rw_should_retry(req)) { ret2 = io_iter_do_read(req, &iter); @@ -3057,10 +3053,6 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) inline_vecs, &iter); if (ret) goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } From 24c74678634b3cbdb325b3b7706366c83811b311 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:51 +0300 Subject: [PATCH 022/127] io_uring: remove REQ_F_MUST_PUNT REQ_F_MUST_PUNT may seem looking good and clear, but it's the same as not having REQ_F_NOWAIT set. That rather creates more confusion. Moreover, it doesn't even affect any behaviour (e.g. see the patch removing it from io_{read,write}). Kill theg flag and update already outdated comments. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 13f72d2a3fecee..93af915a98e68c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -534,7 +534,6 @@ enum { REQ_F_LINK_TIMEOUT_BIT, REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_MUST_PUNT_BIT, REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, @@ -582,8 +581,6 @@ enum { REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* must be punted even for NONBLOCK */ - REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), /* no timeout sequence */ REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ @@ -2894,10 +2891,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (req->flags & REQ_F_LINK_HEAD) req->result = io_size; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ + /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; @@ -2993,10 +2987,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (req->flags & REQ_F_LINK_HEAD) req->result = io_size; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ + /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; @@ -3717,8 +3708,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) /* if the file has a flush method, be safe and punt to async */ if (close->put_file->f_op->flush && force_nonblock) { + /* was never set, but play safe */ + req->flags &= ~REQ_F_NOWAIT; /* avoid grabbing files - we don't need the files */ - req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT; + req->flags |= REQ_F_NO_FILE_TABLE; return -EAGAIN; } @@ -4645,7 +4638,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (!req->file || !file_can_poll(req->file)) return false; - if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + if (req->flags & REQ_F_POLLED) return false; if (!def->pollin && !def->pollout) return false; @@ -5852,8 +5845,7 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || - (req->flags & REQ_F_MUST_PUNT))) { + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { if (io_arm_poll_handler(req)) { if (linked_timeout) io_queue_linked_timeout(linked_timeout); From b90cd197f9315f968d5ee4e6ee9f4e3067f2c883 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:52 +0300 Subject: [PATCH 023/127] io_uring: set @poll->file after @poll init It's a good practice to modify fields of a struct after but not before it was initialised. Even though io_init_poll_iocb() doesn't touch poll->file, call it first. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 93af915a98e68c..cc1f2f3b7bfa89 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4596,8 +4596,8 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, struct io_ring_ctx *ctx = req->ctx; bool cancel = false; - poll->file = req->file; io_init_poll_iocb(poll, mask, wake_func); + poll->file = req->file; poll->wait.private = req; ipt->pt._key = mask; From f6b6c7d6a9600bdbf5826f57137630e1670e2d87 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 21 Jun 2020 13:09:53 +0300 Subject: [PATCH 024/127] io_uring: kill NULL checks for submit state After recent changes, io_submit_sqes() always passes valid submit state, so kill leftovers checking it for NULL. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cc1f2f3b7bfa89..c686061c376277 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1376,11 +1376,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!state) { - req = kmem_cache_alloc(req_cachep, gfp); - if (unlikely(!req)) - goto fallback; - } else if (!state->free_reqs) { + if (!state->free_reqs) { size_t sz; int ret; From d3cac64c498c4fb2df46b97ee6f4c7d6d75f5e3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 12:38:13 +0300 Subject: [PATCH 025/127] io_uring: fix NULL-mm for linked reqs __io_queue_sqe() tries to handle all request of a link, so it's not enough to grab mm in io_sq_thread_acquire_mm() based just on the head. Don't check req->needs_mm and do it always. Signed-off-by: Pavel Begunkov --- fs/io_uring.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c686061c376277..72739188b2ffa3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1991,10 +1991,9 @@ static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) } } -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { - if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (!current->mm) { if (unlikely(!mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); @@ -2003,6 +2002,14 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, return 0; } +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_mm) + return 0; + return __io_sq_thread_acquire_mm(ctx); +} + #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req, int error) { @@ -2781,7 +2788,7 @@ static void io_async_buf_retry(struct callback_head *cb) ctx = req->ctx; __set_current_state(TASK_RUNNING); - if (!io_sq_thread_acquire_mm(ctx, req)) { + if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); From e1e16097e265daac918ce355bf1a0d1677adf0c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:17:17 -0600 Subject: [PATCH 026/127] io_uring: provide generic io_req_complete() helper We have lots of callers of: io_cqring_add_event(req, result); io_put_req(req); Provide a helper that does this for us. It helps clean up the code, and also provides a more convenient location for us to change the completion handling. Signed-off-by: Jens Axboe --- fs/io_uring.c | 106 ++++++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 63 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 72739188b2ffa3..17d7bafaf8cf07 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1335,7 +1335,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) __io_cqring_fill_event(req, res, 0); } -static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) +static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; @@ -1348,9 +1348,15 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) io_cqring_ev_posted(ctx); } -static void io_cqring_add_event(struct io_kiocb *req, long res) +static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags) { - __io_cqring_add_event(req, res, 0); + io_cqring_add_event(req, res, cflags); + io_put_req(req); +} + +static void io_req_complete(struct io_kiocb *req, long res) +{ + __io_req_complete(req, res, 0); } static inline bool io_is_fallback_req(struct io_kiocb *req) @@ -1978,7 +1984,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); - __io_cqring_add_event(req, res, cflags); + io_cqring_add_event(req, res, cflags); } static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) @@ -2048,9 +2054,8 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) return true; kfree(iovec); end_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return false; } @@ -3117,10 +3122,9 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3154,10 +3158,9 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3171,8 +3174,7 @@ static int io_nop(struct io_kiocb *req) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(req, 0); - io_put_req(req); + io_req_complete(req, 0); return 0; } @@ -3211,8 +3213,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock) req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3245,8 +3246,7 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3342,8 +3342,7 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3416,8 +3415,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3504,8 +3502,7 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3548,8 +3545,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -3585,8 +3581,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) ret = do_madvise(ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -3625,8 +3620,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3665,8 +3659,7 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3722,10 +3715,9 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) ret = filp_close(close->put_file, req->work.files); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); fput(close->put_file); close->put_file = NULL; - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3759,8 +3751,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) req->sync.flags); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3859,10 +3850,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3902,10 +3892,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; } - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -4102,10 +4091,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags); return 0; } @@ -4159,10 +4147,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags); return 0; } @@ -4201,8 +4188,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; req_set_fail_links(req); } - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -4262,8 +4248,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock) out: if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } #else /* !CONFIG_NET */ @@ -4555,7 +4540,7 @@ static void io_async_task_func(struct callback_head *cb) if (!canceled) { __set_current_state(TASK_RUNNING); if (io_sq_thread_acquire_mm(ctx, req)) { - io_cqring_add_event(req, -EFAULT); + io_cqring_add_event(req, -EFAULT, 0); goto end_req; } mutex_lock(&ctx->uring_lock); @@ -4804,10 +4789,9 @@ static int io_poll_remove(struct io_kiocb *req) ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -5163,8 +5147,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -5657,8 +5640,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (ret) { req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); } io_steal_work(req, workptr); @@ -5775,8 +5757,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req(prev); } else { - io_cqring_add_event(req, -ETIME); - io_put_req(req); + io_req_complete(req, -ETIME); } return HRTIMER_NORESTART; } @@ -5885,9 +5866,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) /* and drop final reference, if we failed */ if (ret) { - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); } if (nxt) { req = nxt; @@ -5909,9 +5889,9 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) { if (ret != -EIOCBQUEUED) { fail_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { if (!req->io) { @@ -5937,8 +5917,8 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) static inline void io_queue_link_head(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_cqring_add_event(req, -ECANCELED); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, -ECANCELED); } else io_queue_sqe(req, NULL); } @@ -6195,8 +6175,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (unlikely(err)) { fail_req: - io_cqring_add_event(req, err); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, err); break; } From 013538bd65fd3cdbf3ca8b0c99b962c70473c803 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:29:15 -0600 Subject: [PATCH 027/127] io_uring: add 'io_comp_state' to struct io_submit_state No functional changes in this patch, just in preparation for passing back pending completions to the caller and completing them in a batched fashion. Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 17d7bafaf8cf07..002ab5eae20fd3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -678,6 +678,12 @@ struct io_kiocb { #define IO_IOPOLL_BATCH 8 +struct io_comp_state { + unsigned int nr; + struct list_head list; + struct io_ring_ctx *ctx; +}; + struct io_submit_state { struct blk_plug plug; @@ -687,6 +693,11 @@ struct io_submit_state { void *reqs[IO_IOPOLL_BATCH]; unsigned int free_reqs; + /* + * Batch completion logic + */ + struct io_comp_state comp; + /* * File reference cache */ @@ -6006,12 +6017,15 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - unsigned int max_ios) + struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); #ifdef CONFIG_BLOCK state->plug.nowait = true; #endif + state->comp.nr = 0; + INIT_LIST_HEAD(&state->comp.list); + state->comp.ctx = ctx; state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; @@ -6146,7 +6160,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - io_submit_state_start(&state, nr); + io_submit_state_start(&state, ctx, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; From f13fad7ba41cef806358885fbb3f9004f3214b2d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 09:34:30 -0600 Subject: [PATCH 028/127] io_uring: pass down completion state on the issue side No functional changes in this patch, just in preparation for having the completion state be available on the issue side. Later on, this will allow requests that complete inline to be completed in batches. Signed-off-by: Jens Axboe --- fs/io_uring.c | 67 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 002ab5eae20fd3..46241c1ad1b882 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -909,7 +909,8 @@ static void io_cleanup_req(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); static void __io_queue_sqe(struct io_kiocb *req, - const struct io_uring_sqe *sqe); + const struct io_uring_sqe *sqe, + struct io_comp_state *cs); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, @@ -2806,7 +2807,7 @@ static void io_async_buf_retry(struct callback_head *cb) __set_current_state(TASK_RUNNING); if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); + __io_queue_sqe(req, NULL, NULL); mutex_unlock(&ctx->uring_lock); } else { __io_async_buf_error(req, -EFAULT); @@ -4430,7 +4431,7 @@ static void io_poll_task_func(struct callback_head *cb) struct io_ring_ctx *ctx = nxt->ctx; mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL); + __io_queue_sqe(nxt, NULL, NULL); mutex_unlock(&ctx->uring_lock); } } @@ -4555,7 +4556,7 @@ static void io_async_task_func(struct callback_head *cb) goto end_req; } mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); + __io_queue_sqe(req, NULL, NULL); mutex_unlock(&ctx->uring_lock); } else { io_cqring_ev_posted(ctx); @@ -5352,7 +5353,7 @@ static void io_cleanup_req(struct io_kiocb *req) } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + bool force_nonblock, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5637,7 +5638,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (!ret) { do { - ret = io_issue_sqe(req, NULL, false); + ret = io_issue_sqe(req, NULL, false, NULL); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -5814,7 +5815,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt; @@ -5834,7 +5836,7 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, true); + ret = io_issue_sqe(req, sqe, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -5892,7 +5894,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) revert_creds(old_creds); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { int ret; @@ -5921,21 +5924,22 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe); + __io_queue_sqe(req, sqe, cs); } } -static inline void io_queue_link_head(struct io_kiocb *req) +static inline void io_queue_link_head(struct io_kiocb *req, + struct io_comp_state *cs) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { io_put_req(req); io_req_complete(req, -ECANCELED); } else - io_queue_sqe(req, NULL); + io_queue_sqe(req, NULL, cs); } static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **link) + struct io_kiocb **link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5975,7 +5979,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head); + io_queue_link_head(head, cs); *link = NULL; } } else { @@ -5995,18 +5999,47 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req, sqe, cs); } } return 0; } +static void io_submit_flush_completions(struct io_comp_state *cs) +{ + struct io_ring_ctx *ctx = cs->ctx; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&cs->list)) { + struct io_kiocb *req; + + req = list_first_entry(&cs->list, struct io_kiocb, list); + list_del(&req->list); + io_cqring_fill_event(req, req->result); + if (!(req->flags & REQ_F_LINK_HEAD)) { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } else { + spin_unlock_irq(&ctx->completion_lock); + io_put_req(req); + spin_lock_irq(&ctx->completion_lock); + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + cs->nr = 0; +} + /* * Batched submission is done, ensure local IO is flushed out. */ static void io_submit_state_end(struct io_submit_state *state) { + if (!list_empty(&state->comp.list)) + io_submit_flush_completions(&state->comp); blk_finish_plug(&state->plug); io_state_file_put(state); if (state->free_reqs) @@ -6196,7 +6229,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true, io_async_submit(ctx)); - err = io_submit_sqe(req, sqe, &link); + err = io_submit_sqe(req, sqe, &link, &state.comp); if (err) goto fail_req; } @@ -6207,7 +6240,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, percpu_ref_put_many(&ctx->refs, nr - ref_used); } if (link) - io_queue_link_head(link); + io_queue_link_head(link, &state.comp); io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ From 229a7b63507a3e84afb17c3bbb67505a81d28a1d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 10:13:11 -0600 Subject: [PATCH 029/127] io_uring: pass in completion state to appropriate issue side handlers Provide the completion state to the handlers that we know can complete inline, so they can utilize this for batching completions. Cap the max batch count at 32. This should be enough to provide a good amortization of the cost of the lock+commit dance for completions, while still being low enough not to cause any real latency issues for SQPOLL applications. Xuan Zhuo reports that this changes his profile from: 17.97% [kernel] [k] copy_user_generic_unrolled 13.92% [kernel] [k] io_commit_cqring 11.04% [kernel] [k] __io_cqring_fill_event 10.33% [kernel] [k] udp_recvmsg 5.94% [kernel] [k] skb_release_data 4.31% [kernel] [k] udp_rmem_release 2.68% [kernel] [k] __check_object_size 2.24% [kernel] [k] __slab_free 2.22% [kernel] [k] _raw_spin_lock_bh 2.21% [kernel] [k] kmem_cache_free 2.13% [kernel] [k] free_pcppages_bulk 1.83% [kernel] [k] io_submit_sqes 1.38% [kernel] [k] page_frag_free 1.31% [kernel] [k] inet_recvmsg to 19.99% [kernel] [k] copy_user_generic_unrolled 11.63% [kernel] [k] skb_release_data 9.36% [kernel] [k] udp_rmem_release 8.64% [kernel] [k] udp_recvmsg 6.21% [kernel] [k] __slab_free 4.39% [kernel] [k] __check_object_size 3.64% [kernel] [k] free_pcppages_bulk 2.41% [kernel] [k] kmem_cache_free 2.00% [kernel] [k] io_submit_sqes 1.95% [kernel] [k] page_frag_free 1.54% [kernel] [k] io_put_req [...] 0.07% [kernel] [k] io_commit_cqring 0.44% [kernel] [k] __io_cqring_fill_event Signed-off-by: Jens Axboe --- fs/io_uring.c | 153 ++++++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 67 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 46241c1ad1b882..6c9ca4fcbc310e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1360,15 +1360,50 @@ static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags) io_cqring_ev_posted(ctx); } -static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags) +static void io_submit_flush_completions(struct io_comp_state *cs) { - io_cqring_add_event(req, res, cflags); - io_put_req(req); + struct io_ring_ctx *ctx = cs->ctx; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&cs->list)) { + struct io_kiocb *req; + + req = list_first_entry(&cs->list, struct io_kiocb, list); + list_del(&req->list); + io_cqring_fill_event(req, req->result); + if (!(req->flags & REQ_F_LINK_HEAD)) { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + } else { + spin_unlock_irq(&ctx->completion_lock); + io_put_req(req); + spin_lock_irq(&ctx->completion_lock); + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + cs->nr = 0; +} + +static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, + struct io_comp_state *cs) +{ + if (!cs) { + io_cqring_add_event(req, res, cflags); + io_put_req(req); + } else { + req->result = res; + list_add_tail(&req->list, &cs->list); + if (++cs->nr >= 32) + io_submit_flush_completions(cs); + } } static void io_req_complete(struct io_kiocb *req, long res) { - __io_req_complete(req, res, 0); + __io_req_complete(req, res, 0, NULL); } static inline bool io_is_fallback_req(struct io_kiocb *req) @@ -3179,14 +3214,14 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) /* * IORING_OP_NOP just posts a completion event, nothing else. */ -static int io_nop(struct io_kiocb *req) +static int io_nop(struct io_kiocb *req, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_req_complete(req, 0); + __io_req_complete(req, 0, 0, cs); return 0; } @@ -3408,7 +3443,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return i; } -static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3427,7 +3463,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3485,7 +3521,8 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) return i ? i : -ENOMEM; } -static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3514,7 +3551,7 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3545,7 +3582,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) +static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; @@ -3557,7 +3595,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; #else return -EOPNOTSUPP; @@ -3702,7 +3740,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_close(struct io_kiocb *req, bool force_nonblock) +static int io_close(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_close *close = &req->close; int ret; @@ -3729,7 +3768,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) req_set_fail_links(req); fput(close->put_file); close->put_file = NULL; - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3815,7 +3854,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_msghdr *kmsg = NULL; struct socket *sock; @@ -3864,11 +3904,12 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct socket *sock; int ret; @@ -3906,7 +3947,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4049,7 +4090,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, return ret; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_msghdr *kmsg = NULL; struct socket *sock; @@ -4105,11 +4147,12 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, cflags); + __io_req_complete(req, ret, cflags, cs); return 0; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_buffer *kbuf = NULL; struct socket *sock; @@ -4161,7 +4204,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, cflags); + __io_req_complete(req, ret, cflags, cs); return 0; } @@ -4181,7 +4224,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_accept *accept = &req->accept; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -4200,7 +4244,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; req_set_fail_links(req); } - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4224,7 +4268,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) &io->connect.address); } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_ctx __io, *io; unsigned file_flags; @@ -4260,7 +4305,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock) out: if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } #else /* !CONFIG_NET */ @@ -5141,7 +5186,8 @@ static int io_files_update_prep(struct io_kiocb *req, return 0; } -static int io_files_update(struct io_kiocb *req, bool force_nonblock) +static int io_files_update(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_files_update up; @@ -5159,7 +5205,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_req_complete(req, ret); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -5360,7 +5406,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, switch (req->opcode) { case IORING_OP_NOP: - ret = io_nop(req); + ret = io_nop(req, cs); break; case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -5422,9 +5468,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, force_nonblock); + ret = io_sendmsg(req, force_nonblock, cs); else - ret = io_send(req, force_nonblock); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: case IORING_OP_RECV: @@ -5434,9 +5480,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, force_nonblock); + ret = io_recvmsg(req, force_nonblock, cs); else - ret = io_recv(req, force_nonblock); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -5460,7 +5506,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_accept(req, force_nonblock); + ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: if (sqe) { @@ -5468,7 +5514,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_connect(req, force_nonblock); + ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: if (sqe) { @@ -5500,7 +5546,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_close(req, force_nonblock); + ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: if (sqe) { @@ -5508,7 +5554,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_files_update(req, force_nonblock); + ret = io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: if (sqe) { @@ -5548,7 +5594,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_epoll_ctl(req, force_nonblock); + ret = io_epoll_ctl(req, force_nonblock, cs); break; case IORING_OP_SPLICE: if (sqe) { @@ -5564,7 +5610,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_provide_buffers(req, force_nonblock); + ret = io_provide_buffers(req, force_nonblock, cs); break; case IORING_OP_REMOVE_BUFFERS: if (sqe) { @@ -5572,7 +5618,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_remove_buffers(req, force_nonblock); + ret = io_remove_buffers(req, force_nonblock, cs); break; case IORING_OP_TEE: if (sqe) { @@ -6006,33 +6052,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_submit_flush_completions(struct io_comp_state *cs) -{ - struct io_ring_ctx *ctx = cs->ctx; - - spin_lock_irq(&ctx->completion_lock); - while (!list_empty(&cs->list)) { - struct io_kiocb *req; - - req = list_first_entry(&cs->list, struct io_kiocb, list); - list_del(&req->list); - io_cqring_fill_event(req, req->result); - if (!(req->flags & REQ_F_LINK_HEAD)) { - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - } else { - spin_unlock_irq(&ctx->completion_lock); - io_put_req(req); - spin_lock_irq(&ctx->completion_lock); - } - } - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - cs->nr = 0; -} - /* * Batched submission is done, ensure local IO is flushed out. */ From a1d7c393c4711a9ce6c239c3ab053a50dc96505a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 22 Jun 2020 11:09:46 -0600 Subject: [PATCH 030/127] io_uring: enable READ/WRITE to use deferred completions A bit more surgery required here, as completions are generally done through the kiocb->ki_complete() callback, even if they complete inline. This enables the regular read/write path to use the io_comp_state logic to batch inline completions. Signed-off-by: Jens Axboe --- fs/io_uring.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6c9ca4fcbc310e..0bba12e4e5598b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2019,7 +2019,8 @@ static inline void req_set_fail_links(struct io_kiocb *req) req->flags |= REQ_F_FAIL_LINK; } -static void io_complete_rw_common(struct kiocb *kiocb, long res) +static void io_complete_rw_common(struct kiocb *kiocb, long res, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); int cflags = 0; @@ -2031,7 +2032,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); - io_cqring_add_event(req, res, cflags); + __io_req_complete(req, res, cflags, cs); } static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) @@ -2141,14 +2142,18 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) return false; } +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs) +{ + if (!io_rw_reissue(req, res)) + io_complete_rw_common(&req->rw.kiocb, res, cs); +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - if (!io_rw_reissue(req, res)) { - io_complete_rw_common(kiocb, res); - io_put_req(req); - } + __io_complete_rw(req, res, res2, NULL); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2382,14 +2387,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static void kiocb_done(struct kiocb *kiocb, ssize_t ret) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret, + struct io_comp_state *cs) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; if (ret >= 0 && kiocb->ki_complete == io_complete_rw) - io_complete_rw(kiocb, ret, 0); + __io_complete_rw(req, ret, 0, cs); else io_rw_done(kiocb, ret); } @@ -2925,7 +2931,8 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); } -static int io_read(struct io_kiocb *req, bool force_nonblock) +static int io_read(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -2960,7 +2967,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); } else { iter.count = iov_count; iter.nr_segs = nr_segs; @@ -2975,7 +2982,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (ret2 == -EIOCBQUEUED) { goto out_free; } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); goto out_free; } } @@ -3021,7 +3028,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static int io_write(struct io_kiocb *req, bool force_nonblock) +static int io_write(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct kiocb *kiocb = &req->rw.kiocb; @@ -3090,7 +3098,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); + kiocb_done(kiocb, ret2, cs); } else { iter.count = iov_count; iter.nr_segs = nr_segs; @@ -5416,7 +5424,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_read(req, force_nonblock); + ret = io_read(req, force_nonblock, cs); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: @@ -5426,7 +5434,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_write(req, force_nonblock); + ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: if (sqe) { From c40f63790ec957e9449056fb78d8c2523eff96b5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Jun 2020 15:39:59 -0600 Subject: [PATCH 031/127] io_uring: use task_work for links if possible Currently links are always done in an async fashion, unless we catch them inline after we successfully complete a request without having to resort to blocking. This isn't necessarily the most efficient approach, it'd be more ideal if we could just use the task_work handling for this. Outside of saving an async jump, we can also do less prep work for these kinds of requests. Running dependent links from the task_work handler yields some nice performance benefits. As an example, examples/link-cp from the liburing repository uses read+write links to implement a copy operation. Without this patch, the a cache fold 4G file read from a VM runs in about 3 seconds: $ time examples/link-cp /data/file /dev/null real 0m2.986s user 0m0.051s sys 0m2.843s and a subsequent cache hot run looks like this: $ time examples/link-cp /data/file /dev/null real 0m0.898s user 0m0.069s sys 0m0.797s With this patch in place, the cold case takes about 2.4 seconds: $ time examples/link-cp /data/file /dev/null real 0m2.400s user 0m0.020s sys 0m2.366s and the cache hot case looks like this: $ time examples/link-cp /data/file /dev/null real 0m0.676s user 0m0.010s sys 0m0.665s As expected, the (mostly) cache hot case yields the biggest improvement, running about 25% faster with this change, while the cache cold case yields about a 20% increase in performance. Outside of the performance increase, we're using less CPU as well, as we're not using the async offload threads at all for this anymore. Signed-off-by: Jens Axboe --- fs/io_uring.c | 191 +++++++++++++++++++++++++++++++------------------- 1 file changed, 117 insertions(+), 74 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0bba12e4e5598b..b628e4429b75e0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -898,6 +898,7 @@ enum io_mem_account { static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); +static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); @@ -951,6 +952,41 @@ static void __io_put_req_task(struct io_kiocb *req) put_task_struct(req->task); } +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) +{ + if (!current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_mm) + return 0; + return __io_sq_thread_acquire_mm(ctx); +} + +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + static void io_file_put_work(struct work_struct *work); /* @@ -1664,6 +1700,64 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) } } +static void __io_req_task_cancel(struct io_kiocb *req, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(req, error); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail_links(req); + io_double_put_req(req); +} + +static void io_req_task_cancel(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + __io_req_task_cancel(req, -ECANCELED); +} + +static void __io_req_task_submit(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + __set_current_state(TASK_RUNNING); + if (!__io_sq_thread_acquire_mm(ctx)) { + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL, NULL); + mutex_unlock(&ctx->uring_lock); + } else { + __io_req_task_cancel(req, -EFAULT); + } +} + +static void io_req_task_submit(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + + __io_req_task_submit(req); +} + +static void io_req_task_queue(struct io_kiocb *req) +{ + struct task_struct *tsk = req->task; + int ret; + + init_task_work(&req->task_work, io_req_task_submit); + + ret = task_work_add(tsk, &req->task_work, true); + if (unlikely(ret)) { + init_task_work(&req->task_work, io_req_task_cancel); + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, true); + } + wake_up_process(tsk); +} + static void io_free_req(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; @@ -1671,8 +1765,12 @@ static void io_free_req(struct io_kiocb *req) io_req_find_next(req, &nxt); __io_free_req(req); - if (nxt) - io_queue_async_work(nxt); + if (nxt) { + if (nxt->flags & REQ_F_WORK_INITIALIZED) + io_queue_async_work(nxt); + else + io_req_task_queue(nxt); + } } static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) @@ -2013,12 +2111,6 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } -static inline void req_set_fail_links(struct io_kiocb *req) -{ - if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; -} - static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs) { @@ -2035,35 +2127,6 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res, __io_req_complete(req, res, cflags, cs); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - -static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) -{ - if (!current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } - - return 0; -} - -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (!io_op_defs[req->opcode].needs_mm) - return 0; - return __io_sq_thread_acquire_mm(ctx); -} - #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req, int error) { @@ -2811,20 +2874,6 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void __io_async_buf_error(struct io_kiocb *req, int error) -{ - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->completion_lock); - io_cqring_fill_event(req, error); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); - req_set_fail_links(req); - io_double_put_req(req); -} - static void io_async_buf_cancel(struct callback_head *cb) { struct io_async_rw *rw; @@ -2832,27 +2881,18 @@ static void io_async_buf_cancel(struct callback_head *cb) rw = container_of(cb, struct io_async_rw, task_work); req = rw->wpq.wait.private; - __io_async_buf_error(req, -ECANCELED); + __io_req_task_cancel(req, -ECANCELED); } static void io_async_buf_retry(struct callback_head *cb) { struct io_async_rw *rw; - struct io_ring_ctx *ctx; struct io_kiocb *req; rw = container_of(cb, struct io_async_rw, task_work); req = rw->wpq.wait.private; - ctx = req->ctx; - __set_current_state(TASK_RUNNING); - if (!__io_sq_thread_acquire_mm(ctx)) { - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } else { - __io_async_buf_error(req, -EFAULT); - } + __io_req_task_submit(req); } static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, @@ -5218,22 +5258,24 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, } static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) + const struct io_uring_sqe *sqe, bool for_async) { ssize_t ret = 0; if (!sqe) return 0; - io_req_init_async(req); + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { + io_req_init_async(req); - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } - io_req_work_grab_env(req, &io_op_defs[req->opcode]); + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + } switch (req->opcode) { case IORING_OP_NOP: @@ -5347,7 +5389,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req->io) { if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (ret < 0) return ret; } @@ -5966,7 +6008,7 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EAGAIN; if (io_alloc_async_ctx(req)) goto fail_req; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (unlikely(ret < 0)) goto fail_req; } @@ -6022,13 +6064,14 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, false); if (ret) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; return ret; } trace_io_uring_link(ctx, req, head); + io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, enqueue the link */ @@ -6048,7 +6091,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe); + ret = io_req_defer_prep(req, sqe, true); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; From e883a79d8ced8e123f8c4042a29a7524c39935ab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 18:20:53 +0300 Subject: [PATCH 032/127] io-wq: compact io-wq flags numbers Renumerate IO_WQ flags, so they take adjacent bits Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index 071f1a99780026..04239dfb12b072 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -5,10 +5,10 @@ struct io_wq; enum { IO_WQ_WORK_CANCEL = 1, - IO_WQ_WORK_HASHED = 4, - IO_WQ_WORK_UNBOUND = 32, - IO_WQ_WORK_NO_CANCEL = 256, - IO_WQ_WORK_CONCURRENT = 512, + IO_WQ_WORK_HASHED = 2, + IO_WQ_WORK_UNBOUND = 4, + IO_WQ_WORK_NO_CANCEL = 8, + IO_WQ_WORK_CONCURRENT = 16, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; From f4db7182e0de981a3f1b356e0cf43c6815423055 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Jun 2020 18:20:54 +0300 Subject: [PATCH 033/127] io-wq: return next work from ->do_work() directly It's easier to return next work from ->do_work() than having an in-out argument. Looks nicer and easier to compile. Also, merge io_wq_assign_next() into its only user. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 8 +++----- fs/io-wq.h | 2 +- fs/io_uring.c | 53 ++++++++++++++++++++------------------------------- 3 files changed, 25 insertions(+), 38 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 47c5f3aeb46007..72f759e1d6eb0b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -523,9 +523,8 @@ static void io_worker_handle_work(struct io_worker *worker) work->flags |= IO_WQ_WORK_CANCEL; hash = io_get_work_hash(work); - linked = old_work = work; - wq->do_work(&linked); - linked = (old_work == linked) ? NULL : linked; + old_work = work; + linked = wq->do_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { @@ -781,8 +780,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) struct io_wq_work *old_work = work; work->flags |= IO_WQ_WORK_CANCEL; - wq->do_work(&work); - work = (work == old_work) ? NULL : work; + work = wq->do_work(work); wq->free_work(old_work); } while (work); } diff --git a/fs/io-wq.h b/fs/io-wq.h index 04239dfb12b072..114f12ec2d6572 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -101,7 +101,7 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) } typedef void (free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work **); +typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *); struct io_wq_data { struct user_struct *user; diff --git a/fs/io_uring.c b/fs/io_uring.c index b628e4429b75e0..2e44b378826564 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -895,7 +895,6 @@ enum io_mem_account { ACCT_PINNED, }; -static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -1773,20 +1772,6 @@ static void io_free_req(struct io_kiocb *req) } } -static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) -{ - struct io_kiocb *link; - const struct io_op_def *def = &io_op_defs[nxt->opcode]; - - if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) - io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - - *workptr = &nxt->work; - link = io_prep_linked_timeout(nxt); - if (link) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; -} - /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1806,24 +1791,29 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_steal_work(struct io_kiocb *req, - struct io_wq_work **workptr) +static struct io_wq_work *io_steal_work(struct io_kiocb *req) { + struct io_kiocb *link, *nxt = NULL; + /* - * It's in an io-wq worker, so there always should be at least - * one reference, which will be dropped in io_put_work() just - * after the current handler returns. - * - * It also means, that if the counter dropped to 1, then there is - * no asynchronous users left, so it's safe to steal the next work. + * A ref is owned by io-wq in which context we're. So, if that's the + * last one, it's safe to steal next work. False negatives are Ok, + * it just will be re-punted async in io_put_work() */ - if (refcount_read(&req->refs) == 1) { - struct io_kiocb *nxt = NULL; + if (refcount_read(&req->refs) != 1) + return NULL; - io_req_find_next(req, &nxt); - if (nxt) - io_wq_assign_next(workptr, nxt); - } + io_req_find_next(req, &nxt); + if (!nxt) + return NULL; + + if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) + io_wq_hash_work(&nxt->work, file_inode(nxt->file)); + + link = io_prep_linked_timeout(nxt); + if (link) + nxt->flags |= REQ_F_QUEUE_TIMEOUT; + return &nxt->work; } /* @@ -5718,9 +5708,8 @@ static void io_arm_async_linked_timeout(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_wq_submit_work(struct io_wq_work **workptr) +static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { - struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); int ret = 0; @@ -5751,7 +5740,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_req_complete(req, ret); } - io_steal_work(req, workptr); + return io_steal_work(req); } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, From 1e16c2f917a59d27fb6b540c44d66978c8ad29ef Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 26 Jun 2020 16:32:50 -0700 Subject: [PATCH 034/127] io_uring: fix function args for !CONFIG_NET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build errors when CONFIG_NET is not set/enabled: ../fs/io_uring.c:5472:10: error: too many arguments to function ‘io_sendmsg’ ../fs/io_uring.c:5474:10: error: too many arguments to function ‘io_send’ ../fs/io_uring.c:5484:10: error: too many arguments to function ‘io_recvmsg’ ../fs/io_uring.c:5486:10: error: too many arguments to function ‘io_recv’ ../fs/io_uring.c:5510:9: error: too many arguments to function ‘io_accept’ ../fs/io_uring.c:5518:9: error: too many arguments to function ‘io_connect’ Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: io-uring@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index af4d7a5c49f415..43ddda2a3d4901 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4360,12 +4360,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4376,12 +4378,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EOPNOTSUPP; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4391,7 +4395,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4401,7 +4406,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } From 8ef77766ba8694968ed4ba24311b4bacee14f235 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:59 +0300 Subject: [PATCH 035/127] io_uring: fix req->work corruption req->work and req->task_work are in a union, so io_req_task_queue() screws everything that was in work. De-union them for now. [ 704.367253] BUG: unable to handle page fault for address: ffffffffaf7330d0 [ 704.367256] #PF: supervisor write access in kernel mode [ 704.367256] #PF: error_code(0x0003) - permissions violation [ 704.367261] CPU: 6 PID: 1654 Comm: io_wqe_worker-0 Tainted: G I 5.8.0-rc2-00038-ge28d0bdc4863-dirty #498 [ 704.367265] RIP: 0010:_raw_spin_lock+0x1e/0x36 ... [ 704.367276] __alloc_fd+0x35/0x150 [ 704.367279] __get_unused_fd_flags+0x25/0x30 [ 704.367280] io_openat2+0xcb/0x1b0 [ 704.367283] io_issue_sqe+0x36a/0x1320 [ 704.367294] io_wq_submit_work+0x58/0x160 [ 704.367295] io_worker_handle_work+0x2a3/0x430 [ 704.367296] io_wqe_worker+0x2a0/0x350 [ 704.367301] kthread+0x136/0x180 [ 704.367304] ret_from_fork+0x22/0x30 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 43ddda2a3d4901..dcf3ffb5ecf316 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -668,12 +668,12 @@ struct io_kiocb { * restore the work, if needed. */ struct { - struct callback_head task_work; struct hlist_node hash_node; struct async_poll *apoll; }; struct io_wq_work work; }; + struct callback_head task_work; }; #define IO_IOPOLL_BATCH 8 From 906a8c3fdbc367325d4200e39212a2a7715b7b0e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:55 +0300 Subject: [PATCH 036/127] io_uring: fix punting req w/o grabbed env It's not enough to check for REQ_F_WORK_INITIALIZED and punt async assuming that io_req_work_grab_env() was called, it may not have been. E.g. io_close_prep() and personality path set the flag without further async init. As a quick fix, always pass next work through io_req_task_queue(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dcf3ffb5ecf316..483457f6a7df21 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1766,12 +1766,8 @@ static void io_free_req(struct io_kiocb *req) io_req_find_next(req, &nxt); __io_free_req(req); - if (nxt) { - if (nxt->flags & REQ_F_WORK_INITIALIZED) - io_queue_async_work(nxt); - else - io_req_task_queue(nxt); - } + if (nxt) + io_req_task_queue(nxt); } /* From 1bcb8c5d65a845e0ecb9e82237c399b29b8d15ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:56 +0300 Subject: [PATCH 037/127] io_uring: fix feeding io-wq with uninit reqs io_steal_work() can't be sure that @nxt has req->work properly set, so we can't pass it to io-wq as is. A dirty quick fix -- drag it through io_req_task_queue(), and always return NULL from io_steal_work(). e.g. [ 50.770161] BUG: kernel NULL pointer dereference, address: 00000000 [ 50.770164] #PF: supervisor write access in kernel mode [ 50.770164] #PF: error_code(0x0002) - not-present page [ 50.770168] CPU: 1 PID: 1448 Comm: io_wqe_worker-0 Tainted: G I 5.8.0-rc2-00035-g2237d76530eb-dirty #494 [ 50.770172] RIP: 0010:override_creds+0x19/0x30 ... [ 50.770183] io_worker_handle_work+0x25c/0x430 [ 50.770185] io_wqe_worker+0x2a0/0x350 [ 50.770190] kthread+0x136/0x180 [ 50.770194] ret_from_fork+0x22/0x30 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 483457f6a7df21..658949bed77fe5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1791,7 +1791,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *link, *nxt = NULL; + struct io_kiocb *nxt = NULL; /* * A ref is owned by io-wq in which context we're. So, if that's the @@ -1808,10 +1808,15 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - link = io_prep_linked_timeout(nxt); - if (link) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; - return &nxt->work; + io_req_task_queue(nxt); + /* + * If we're going to return actual work, here should be timeout prep: + * + * link = io_prep_linked_timeout(nxt); + * if (link) + * nxt->flags |= REQ_F_QUEUE_TIMEOUT; + */ + return NULL; } /* From a6d45dd0d43e6d1275e002704540688b6768bc22 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:57 +0300 Subject: [PATCH 038/127] io_uring: don't mark link's head for_async No reason to mark a head of a link as for-async in io_req_defer_prep(). grab_env(), etc. That will be done further during submission if neccessary. Mark for_async=false saving extra grab_env() in many cases. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 658949bed77fe5..545b137c7b4a26 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6092,7 +6092,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe, false); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; From 710c2bfb66474a186b0196e3342d43db0e6c04e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:58 +0300 Subject: [PATCH 039/127] io_uring: fix missing io_grab_files() We won't have valid ring_fd, ring_file in task work. Grab files early. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 545b137c7b4a26..4a9929c0b4ad0a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5270,15 +5270,15 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; - if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { + if (io_op_defs[req->opcode].file_table) { io_req_init_async(req); + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } - + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { + io_req_init_async(req); io_req_work_grab_env(req, &io_op_defs[req->opcode]); } From 8c9cb6cd9a46ae6fb7cb6c39cf6a48a53440feef Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:29 +0300 Subject: [PATCH 040/127] io_uring: fix refs underflow in io_iopoll_queue() Now io_complete_rw_common() puts a ref, extra io_req_put() in io_iopoll_queue() causes undeflow. Remove it. [ 455.998620] refcount_t: underflow; use-after-free. [ 455.998743] WARNING: CPU: 6 PID: 285394 at lib/refcount.c:28 refcount_warn_saturate+0xae/0xf0 [ 455.998772] CPU: 6 PID: 285394 Comm: read-write2 Tainted: G I E 5.8.0-rc2-00048-g1b1aa738f167-dirty #509 [ 455.998772] RIP: 0010:refcount_warn_saturate+0xae/0xf0 ... [ 455.998778] Call Trace: [ 455.998778] io_put_req+0x44/0x50 [ 455.998778] io_iopoll_complete+0x245/0x370 [ 455.998779] io_iopoll_getevents+0x12f/0x1a0 [ 455.998779] io_iopoll_reap_events.part.0+0x5e/0xa0 [ 455.998780] io_ring_ctx_wait_and_kill+0x132/0x1c0 [ 455.998780] io_uring_release+0x20/0x30 [ 455.998780] __fput+0xcd/0x230 [ 455.998781] ____fput+0xe/0x10 [ 455.998781] task_work_run+0x67/0xa0 [ 455.998781] do_exit+0x35d/0xb70 [ 455.998782] do_group_exit+0x43/0xa0 [ 455.998783] get_signal+0x140/0x900 [ 455.998783] do_signal+0x37/0x780 [ 455.998784] __prepare_exit_to_usermode+0x126/0x1c0 [ 455.998785] __syscall_return_slowpath+0x3b/0x1c0 [ 455.998785] do_syscall_64+0x5f/0xa0 [ 455.998785] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: a1d7c393c47 ("io_uring: enable READ/WRITE to use deferred completions") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4a9929c0b4ad0a..ab9f2f3a9b5675 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1904,7 +1904,6 @@ static void io_iopoll_queue(struct list_head *again) /* shouldn't happen unless io_uring is dying, cancel reqs */ if (unlikely(!current->mm)) { io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); - io_put_req(req); continue; } From e6543a816edca00b6b4c48625d142059d7211059 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:30 +0300 Subject: [PATCH 041/127] io_uring: remove inflight batching in free_many() io_free_req_many() is used only for iopoll requests, i.e. reads/writes. Hence no need to batch inflight unhooking. For safety, it'll be done by io_dismantle_req(), which replaces __io_req_aux_free(), and looks more solid and cleaner. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 41 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ab9f2f3a9b5675..9863cec8020fb3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1504,7 +1504,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, fput(file); } -static void __io_req_aux_free(struct io_kiocb *req) +static void io_dismantle_req(struct io_kiocb *req) { if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); @@ -1514,11 +1514,6 @@ static void __io_req_aux_free(struct io_kiocb *req) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); io_req_work_drop_env(req); -} - -static void __io_free_req(struct io_kiocb *req) -{ - __io_req_aux_free(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1530,7 +1525,11 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } +} +static void __io_free_req(struct io_kiocb *req) +{ + io_dismantle_req(req); percpu_ref_put(&req->ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -1549,35 +1548,11 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) if (!rb->to_free) return; if (rb->need_iter) { - int i, inflight = 0; - unsigned long flags; - - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; - - if (req->flags & REQ_F_INFLIGHT) - inflight++; - __io_req_aux_free(req); - } - if (!inflight) - goto do_free; - - spin_lock_irqsave(&ctx->inflight_lock, flags); - for (i = 0; i < rb->to_free; i++) { - struct io_kiocb *req = rb->reqs[i]; - - if (req->flags & REQ_F_INFLIGHT) { - list_del(&req->inflight_entry); - if (!--inflight) - break; - } - } - spin_unlock_irqrestore(&ctx->inflight_lock, flags); + int i; - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); + for (i = 0; i < rb->to_free; i++) + io_dismantle_req(rb->reqs[i]); } -do_free: kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); percpu_ref_put_many(&ctx->refs, rb->to_free); rb->to_free = rb->need_iter = 0; From 2757a23e7f6441eabf605ca59eeb88c34071757d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:31 +0300 Subject: [PATCH 042/127] io_uring: dismantle req early and remove need_iter Every request in io_req_multi_free() is has ->file set. Instead of pointlessly defering and counting reqs with file, dismantle it on place and save for batch dealloc. It also saves us from potentially skipping io_cleanup_req(), put_task(), etc. Never happens though, becacuse ->file is always there. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9863cec8020fb3..8cb5252269d747 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1540,22 +1540,16 @@ static void __io_free_req(struct io_kiocb *req) struct req_batch { void *reqs[IO_IOPOLL_BATCH]; int to_free; - int need_iter; }; static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { if (!rb->to_free) return; - if (rb->need_iter) { - int i; - for (i = 0; i < rb->to_free; i++) - io_dismantle_req(rb->reqs[i]); - } kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = rb->need_iter = 0; + rb->to_free = 0; } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1846,9 +1840,7 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) return false; - if (req->file || req->io) - rb->need_iter++; - + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) io_free_req_many(req->ctx, rb); @@ -1900,7 +1892,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, /* order with ->result store in io_complete_rw_iopoll() */ smp_rmb(); - rb.to_free = rb.need_iter = 0; + rb.to_free = 0; while (!list_empty(done)) { int cflags = 0; From c3524383333e4ff2f720ab0c02b3a329f72de78b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:32 +0300 Subject: [PATCH 043/127] io_uring: batch-free linked requests as well There is no reason to not batch deallocation of linked requests. Take away its next req first and handle it as everything else in io_req_multi_free(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cb5252269d747..af8d1d64f858cd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1728,17 +1728,21 @@ static void io_req_task_queue(struct io_kiocb *req) wake_up_process(tsk); } -static void io_free_req(struct io_kiocb *req) +static void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = NULL; io_req_find_next(req, &nxt); - __io_free_req(req); - if (nxt) io_req_task_queue(nxt); } +static void io_free_req(struct io_kiocb *req) +{ + io_queue_next(req); + __io_free_req(req); +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1835,16 +1839,19 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +static inline void io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) { - if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) - return false; + if (unlikely(io_is_fallback_req(req))) { + io_free_req(req); + return; + } + if (req->flags & REQ_F_LINK_HEAD) + io_queue_next(req); io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) io_free_req_many(req->ctx, rb); - return true; } static int io_put_kbuf(struct io_kiocb *req) @@ -1910,9 +1917,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; - if (refcount_dec_and_test(&req->refs) && - !io_req_multi_free(&rb, req)) - io_free_req(req); + if (refcount_dec_and_test(&req->refs)) + io_req_multi_free(&rb, req); } io_commit_cqring(ctx); From 2d6500d44c1374808040d120e625a22b013c9f0d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:33 +0300 Subject: [PATCH 044/127] io_uring: cosmetic changes for batch free Move all batch free bits close to each other and rename in a consistent way. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 +++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index af8d1d64f858cd..18a452ac81ccc1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1537,21 +1537,6 @@ static void __io_free_req(struct io_kiocb *req) clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); } -struct req_batch { - void *reqs[IO_IOPOLL_BATCH]; - int to_free; -}; - -static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) -{ - if (!rb->to_free) - return; - - kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - percpu_ref_put_many(&ctx->refs, rb->to_free); - rb->to_free = 0; -} - static bool io_link_cancel_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1743,6 +1728,41 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; +}; + +static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = 0; +} + +static void io_req_free_batch_finish(struct io_ring_ctx *ctx, + struct req_batch *rb) +{ + if (rb->to_free) + __io_req_free_batch_flush(ctx, rb); +} + +static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) +{ + if (unlikely(io_is_fallback_req(req))) { + io_free_req(req); + return; + } + if (req->flags & REQ_F_LINK_HEAD) + io_queue_next(req); + + io_dismantle_req(req); + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + __io_req_free_batch_flush(req->ctx, rb); +} + /* * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. @@ -1839,21 +1859,6 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static inline void io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) -{ - if (unlikely(io_is_fallback_req(req))) { - io_free_req(req); - return; - } - if (req->flags & REQ_F_LINK_HEAD) - io_queue_next(req); - - io_dismantle_req(req); - rb->reqs[rb->to_free++] = req; - if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) - io_free_req_many(req->ctx, rb); -} - static int io_put_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; @@ -1918,13 +1923,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, (*nr_events)++; if (refcount_dec_and_test(&req->refs)) - io_req_multi_free(&rb, req); + io_req_free_batch(&rb, req); } io_commit_cqring(ctx); if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_ev_posted(ctx); - io_free_req_many(ctx, &rb); + io_req_free_batch_finish(ctx, &rb); if (!list_empty(&again)) io_iopoll_queue(&again); From 9b0d911acce00b67f7e7336f838b732de7d917d6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:34 +0300 Subject: [PATCH 045/127] io_uring: kill REQ_F_LINK_NEXT After pulling nxt from a request, it's no more a links head, so clear REQ_F_LINK_HEAD. Absence of this flag also indicates that there are no linked requests, so replacing REQ_F_LINK_NEXT, which can be killed. Linked timeouts also behave leaving the flag intact when necessary. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 18a452ac81ccc1..14c5655c043415 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -526,7 +526,6 @@ enum { REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, REQ_F_LINK_HEAD_BIT, - REQ_F_LINK_NEXT_BIT, REQ_F_FAIL_LINK_BIT, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, @@ -565,8 +564,6 @@ enum { /* head of a link */ REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), - /* already grabbed next link */ - REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), /* on inflight list */ @@ -1559,10 +1556,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) struct io_ring_ctx *ctx = req->ctx; bool wake_ev = false; - /* Already got next link */ - if (req->flags & REQ_F_LINK_NEXT) - return; - /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the @@ -1587,7 +1580,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) break; } - req->flags |= REQ_F_LINK_NEXT; if (wake_ev) io_cqring_ev_posted(ctx); } @@ -1628,6 +1620,7 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) { if (likely(!(req->flags & REQ_F_LINK_HEAD))) return; + req->flags &= ~REQ_F_LINK_HEAD; /* * If LINK is set, we have dependent requests in this chain. If we From 6795c5aba247653f99d1f336ff496dd74659b322 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:35 +0300 Subject: [PATCH 046/127] io_uring: clean up req->result setting by rw Assign req->result to io_size early in io_{read,write}(), it's enough and makes it more straightforward. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 14c5655c043415..f283d111666b76 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2384,7 +2384,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; - req->result = 0; req->iopoll_completed = 0; } else { if (kiocb->ki_flags & IOCB_HIPRI) @@ -2957,10 +2956,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; - req->result = 0; io_size = ret; - if (req->flags & REQ_F_LINK_HEAD) - req->result = io_size; + req->result = io_size; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) @@ -3054,10 +3051,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - req->result = 0; io_size = ret; - if (req->flags & REQ_F_LINK_HEAD) - req->result = io_size; + req->result = io_size; /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) From 3adfecaa647ff8afa4b6f5907193cf751a0f8351 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:37 +0300 Subject: [PATCH 047/127] io_uring: do task_work_run() during iopoll There are a lot of new users of task_work, and some of task_work_add() may happen while we do io polling, thus make iopoll from time to time to do task_work_run(), so it doesn't poll for sitting there reqs. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index f283d111666b76..c514a520970317 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2052,6 +2052,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); + if (current->task_works) + task_work_run(); mutex_lock(&ctx->uring_lock); } From f3a6fa2267480d7f19fbde8316372be46055e548 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 28 Jun 2020 12:52:38 +0300 Subject: [PATCH 048/127] io_uring: fix iopoll -EAGAIN handling req->iopoll() is not necessarily called by a task that submitted a request. Because of that, it's dangerous to grab_env() and punt async on -EGAIN, potentially grabbing another task's mm and corrupting its memory. Do resubmit from the submitter task context. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c514a520970317..9d3d8d3866cc18 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -892,6 +892,7 @@ enum io_mem_account { ACCT_PINNED, }; +static bool io_rw_reissue(struct io_kiocb *req, long res); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -1873,14 +1874,9 @@ static void io_iopoll_queue(struct list_head *again) req = list_first_entry(again, struct io_kiocb, list); list_del(&req->list); - /* shouldn't happen unless io_uring is dying, cancel reqs */ - if (unlikely(!current->mm)) { + /* should have ->mm unless io_uring is dying, kill reqs then */ + if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); - continue; - } - - refcount_inc(&req->refs); - io_queue_async_work(req); } while (!list_empty(again)); } @@ -2387,6 +2383,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; + io_get_req_task(req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; From fb49278624f75e15d36c3c43d322ca8961fb40e9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 12:59:48 +0300 Subject: [PATCH 049/127] io_uring: fix missing wake_up io_rw_reissue() Don't forget to wake up a process to which io_rw_reissue() added task_work. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d3d8d3866cc18..92c7e2a96912b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2168,8 +2168,10 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) tsk = req->task; init_task_work(&req->task_work, io_rw_resubmit); ret = task_work_add(tsk, &req->task_work, true); - if (!ret) + if (!ret) { + wake_up_process(tsk); return true; + } #endif return false; } From 7c86ffeeed303187f266ed17bd87a9b375955709 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:12:59 +0300 Subject: [PATCH 050/127] io_uring: deduplicate freeing linked timeouts Linked timeout cancellation code is repeated in in io_req_link_next() and io_fail_links(), and they differ in details even though shouldn't. Basing on the fact that there is maximum one armed linked timeout in a link, and it immediately follows the head, extract a function that will check for it and defuse. Justification: - DRY and cleaner - better inlining for io_req_link_next() (just 1 call site now) - isolates linked_timeouts from common path - reduces time under spinlock for failed links - actually less code Signed-off-by: Pavel Begunkov [axboe: fold in locking fix for io_fail_links()] Signed-off-by: Jens Axboe --- fs/io_uring.c | 107 +++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 49 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 92c7e2a96912b7..a0aea78162a6f4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1552,48 +1552,57 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) return false; } -static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link; bool wake_ev = false; + unsigned long flags = 0; /* false positive warning */ + + if (!(req->flags & REQ_F_COMP_LOCKED)) + spin_lock_irqsave(&ctx->completion_lock, flags); + + if (list_empty(&req->link_list)) + goto out; + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + if (link->opcode != IORING_OP_LINK_TIMEOUT) + goto out; + + list_del_init(&link->link_list); + wake_ev = io_link_cancel_timeout(link); + req->flags &= ~REQ_F_LINK_TIMEOUT; +out: + if (!(req->flags & REQ_F_COMP_LOCKED)) + spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (wake_ev) + io_cqring_ev_posted(ctx); +} + +static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +{ + struct io_kiocb *nxt; /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the * safe side. */ - while (!list_empty(&req->link_list)) { - struct io_kiocb *nxt = list_first_entry(&req->link_list, - struct io_kiocb, link_list); - - if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT))) { - list_del_init(&nxt->link_list); - wake_ev |= io_link_cancel_timeout(nxt); - req->flags &= ~REQ_F_LINK_TIMEOUT; - continue; - } - - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK_HEAD; - *nxtptr = nxt; - break; - } + if (unlikely(list_empty(&req->link_list))) + return; - if (wake_ev) - io_cqring_ev_posted(ctx); + nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK_HEAD; + *nxtptr = nxt; } /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void io_fail_links(struct io_kiocb *req) +static void __io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, @@ -1602,18 +1611,29 @@ static void io_fail_links(struct io_kiocb *req) list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->opcode == IORING_OP_LINK_TIMEOUT) { - io_link_cancel_timeout(link); - } else { - io_cqring_fill_event(link, -ECANCELED); - __io_double_put_req(link); - } + io_cqring_fill_event(link, -ECANCELED); + __io_double_put_req(link); req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); +} + +static void io_fail_links(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + __io_fail_links(req); + } + io_cqring_ev_posted(ctx); } @@ -1623,30 +1643,19 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) return; req->flags &= ~REQ_F_LINK_HEAD; + if (req->flags & REQ_F_LINK_TIMEOUT) + io_kill_linked_timeout(req); + /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_FAIL_LINK) { + if (req->flags & REQ_F_FAIL_LINK) io_fail_links(req); - } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == - REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - /* - * If this is a timeout link, we could be racing with the - * timeout timer. Grab the completion lock for this case to - * protect against that. - */ - spin_lock_irqsave(&ctx->completion_lock, flags); - io_req_link_next(req, nxt); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else { + else io_req_link_next(req, nxt); - } } static void __io_req_task_cancel(struct io_kiocb *req, int error) From 9b5f7bd93272689ec8dc2cfd40a812265c23414e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:00 +0300 Subject: [PATCH 051/127] io_uring: replace find_next() out param with ret Generally, it's better to return a value directly than having out parameter. It's cleaner and saves from some kinds of ugly bugs. May also be faster. Return next request from io_req_find_next() and friends directly instead of passing out parameter. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a0aea78162a6f4..0234dc2c962579 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1578,7 +1578,7 @@ static void io_kill_linked_timeout(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static struct io_kiocb *io_req_link_next(struct io_kiocb *req) { struct io_kiocb *nxt; @@ -1588,13 +1588,13 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * safe side. */ if (unlikely(list_empty(&req->link_list))) - return; + return NULL; nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); list_del_init(&req->link_list); if (!list_empty(&nxt->link_list)) nxt->flags |= REQ_F_LINK_HEAD; - *nxtptr = nxt; + return nxt; } /* @@ -1637,10 +1637,10 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static struct io_kiocb *io_req_find_next(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_LINK_HEAD))) - return; + return NULL; req->flags &= ~REQ_F_LINK_HEAD; if (req->flags & REQ_F_LINK_TIMEOUT) @@ -1652,10 +1652,10 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_FAIL_LINK) - io_fail_links(req); - else - io_req_link_next(req, nxt); + if (likely(!(req->flags & REQ_F_FAIL_LINK))) + return io_req_link_next(req); + io_fail_links(req); + return NULL; } static void __io_req_task_cancel(struct io_kiocb *req, int error) @@ -1718,9 +1718,8 @@ static void io_req_task_queue(struct io_kiocb *req) static void io_queue_next(struct io_kiocb *req) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *nxt = io_req_find_next(req); - io_req_find_next(req, &nxt); if (nxt) io_req_task_queue(nxt); } @@ -1770,13 +1769,15 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ -__attribute__((nonnull)) -static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) { + struct io_kiocb *nxt = NULL; + if (refcount_dec_and_test(&req->refs)) { - io_req_find_next(req, nxtptr); + nxt = io_req_find_next(req); __io_free_req(req); } + return nxt; } static void io_put_req(struct io_kiocb *req) @@ -1797,7 +1798,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if (refcount_read(&req->refs) != 1) return NULL; - io_req_find_next(req, &nxt); + nxt = io_req_find_next(req); if (!nxt) return NULL; @@ -4488,7 +4489,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); req->flags |= REQ_F_COMP_LOCKED; - io_put_req_find_next(req, nxt); + *nxt = io_put_req_find_next(req); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5938,9 +5939,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } err: - nxt = NULL; /* drop submission reference */ - io_put_req_find_next(req, &nxt); + nxt = io_put_req_find_next(req); if (linked_timeout) { if (!ret) From a1a4661691c5f1a3af4c04f56ad68e2d1dbee3af Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:01 +0300 Subject: [PATCH 052/127] io_uring: kill REQ_F_TIMEOUT Now REQ_F_TIMEOUT is set but never used, kill it Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0234dc2c962579..e9c8f52daf8fca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -531,7 +531,6 @@ enum { REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, - REQ_F_TIMEOUT_BIT, REQ_F_ISREG_BIT, REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, @@ -574,8 +573,6 @@ enum { REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), /* has linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* timeout request */ - REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* no timeout sequence */ @@ -5063,7 +5060,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = &req->io->timeout; data->req = req; - req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; From 8eb7e2d00763367f345ef0b2a2eb4f8001ae40ce Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:02 +0300 Subject: [PATCH 053/127] io_uring: kill REQ_F_TIMEOUT_NOSEQ There are too many useless flags, kill REQ_F_TIMEOUT_NOSEQ, which can be easily infered from req.timeout itself. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e9c8f52daf8fca..8495c17b53d6c8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -532,7 +532,6 @@ enum { REQ_F_NOWAIT_BIT, REQ_F_LINK_TIMEOUT_BIT, REQ_F_ISREG_BIT, - REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, REQ_F_OVERFLOW_BIT, @@ -575,8 +574,6 @@ enum { REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* no timeout sequence */ - REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ @@ -1010,6 +1007,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } +static inline bool io_is_timeout_noseq(struct io_kiocb *req) +{ + return !req->timeout.off; +} + static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -1222,7 +1224,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) struct io_kiocb *req = list_first_entry(&ctx->timeout_list, struct io_kiocb, list); - if (req->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(req)) break; if (req->timeout.target_seq != ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts)) @@ -5087,8 +5089,7 @@ static int io_timeout(struct io_kiocb *req) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ - if (!off) { - req->flags |= REQ_F_TIMEOUT_NOSEQ; + if (io_is_timeout_noseq(req)) { entry = ctx->timeout_list.prev; goto add; } @@ -5103,7 +5104,7 @@ static int io_timeout(struct io_kiocb *req) list_for_each_prev(entry, &ctx->timeout_list) { struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); - if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ if (off >= nxt->timeout.target_seq - tail) From ecfc51777487da4da530710e0b13de4c8cb4a6d2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 13:13:03 +0300 Subject: [PATCH 054/127] io_uring: fix potential use after free on fallback request free After __io_free_req() puts a ctx ref, it should be assumed that the ctx may already be gone. However, it can be accessed when putting the fallback req. Free the req first and then put the ctx. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8495c17b53d6c8..b54e358e6b31e7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1526,12 +1526,15 @@ static void io_dismantle_req(struct io_kiocb *req) static void __io_free_req(struct io_kiocb *req) { + struct io_ring_ctx *ctx; + io_dismantle_req(req); - percpu_ref_put(&req->ctx->refs); + ctx = req->ctx; if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else - clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); + clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req); + percpu_ref_put(&ctx->refs); } static bool io_link_cancel_timeout(struct io_kiocb *req) From 351fd53595a3ceb88756a005e3b864f7c8cb86e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:40 +0300 Subject: [PATCH 055/127] io_uring: don't pass def into io_req_work_grab_env Remove struct io_op_def *def parameter from io_req_work_grab_env(), it's trivially deducible from req->opcode and fast. The API is cleaner this way, and also helps the complier to understand that it's a real constant and could be register-cached. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b54e358e6b31e7..2b7666e81c131f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1101,9 +1101,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_req_work_grab_env(struct io_kiocb *req, - const struct io_op_def *def) +static inline void io_req_work_grab_env(struct io_kiocb *req) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; @@ -1161,7 +1162,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, } io_req_init_async(req); - io_req_work_grab_env(req, def); + io_req_work_grab_env(req); *link = io_prep_linked_timeout(req); } @@ -5255,7 +5256,7 @@ static int io_req_defer_prep(struct io_kiocb *req, if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { io_req_init_async(req); - io_req_work_grab_env(req, &io_op_defs[req->opcode]); + io_req_work_grab_env(req); } switch (req->opcode) { From edcdfcc149a8d0c11d4dd2b23b5338af22e31a5f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:41 +0300 Subject: [PATCH 056/127] io_uring: do init work in grab_env() Place io_req_init_async() in io_req_work_grab_env() so it won't be forgotten. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b7666e81c131f..3b2f6fd8f58fd5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1105,6 +1105,8 @@ static inline void io_req_work_grab_env(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + io_req_init_async(req); + if (!req->work.mm && def->needs_mm) { mmgrab(current->mm); req->work.mm = current->mm; @@ -1161,9 +1163,7 @@ static inline void io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; } - io_req_init_async(req); io_req_work_grab_env(req); - *link = io_prep_linked_timeout(req); } @@ -5254,10 +5254,8 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } - if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) { - io_req_init_async(req); + if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) io_req_work_grab_env(req); - } switch (req->opcode) { case IORING_OP_NOP: From debb85f496c9cc70663eac31d3ad9153839c844c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:42 +0300 Subject: [PATCH 057/127] io_uring: factor out grab_env() from defer_prep() Remove io_req_work_grab_env() call from io_req_defer_prep(), just call it when neccessary. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3b2f6fd8f58fd5..caf908382cdb3d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5240,7 +5240,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock, } static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe, bool for_async) + const struct io_uring_sqe *sqe) { ssize_t ret = 0; @@ -5254,9 +5254,6 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } - if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) - io_req_work_grab_env(req); - switch (req->opcode) { case IORING_OP_NOP: break; @@ -5369,9 +5366,10 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req->io) { if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe); if (ret < 0) return ret; + io_req_work_grab_env(req); } spin_lock_irq(&ctx->completion_lock); @@ -5983,9 +5981,10 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = -EAGAIN; if (io_alloc_async_ctx(req)) goto fail_req; - ret = io_req_defer_prep(req, sqe, true); + ret = io_req_defer_prep(req, sqe); if (unlikely(ret < 0)) goto fail_req; + io_req_work_grab_env(req); } /* @@ -6039,7 +6038,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, false); + ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; @@ -6066,7 +6065,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, sqe, false); + ret = io_req_defer_prep(req, sqe); if (ret) req->flags |= REQ_F_FAIL_LINK; *link = req; From cbdcb4357c000861b77369c34e110fa893d23607 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 29 Jun 2020 19:18:43 +0300 Subject: [PATCH 058/127] io_uring: do grab_env() just before punting Currently io_steal_work() is disabled, and every linked request should go through task_work for initialisation. Do io_req_work_grab_env() just before io-wq punting and for the whole link, so any request reachable by io_steal_work() is prepared. This is also interesting for another reason -- it localises io_req_work_grab_env() into one place just before io-wq punting, helping to to better manage req->work lifetime and add some neat cleanup/optimisations later. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 ++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index caf908382cdb3d..9bc4339057ef82 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1101,7 +1101,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_req_work_grab_env(struct io_kiocb *req) +static void io_req_work_grab_env(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1150,8 +1150,7 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) } } -static inline void io_prep_async_work(struct io_kiocb *req, - struct io_kiocb **link) +static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1164,15 +1163,22 @@ static inline void io_prep_async_work(struct io_kiocb *req, } io_req_work_grab_env(req); - *link = io_prep_linked_timeout(req); } -static inline void io_queue_async_work(struct io_kiocb *req) +static void io_prep_async_link(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; + struct io_kiocb *cur; - io_prep_async_work(req, &link); + io_prep_async_work(req); + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(cur, &req->link_list, link_list) + io_prep_async_work(cur); +} + +static void __io_queue_async_work(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link = io_prep_linked_timeout(req); trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); @@ -1182,6 +1188,13 @@ static inline void io_queue_async_work(struct io_kiocb *req) io_queue_linked_timeout(link); } +static void io_queue_async_work(struct io_kiocb *req) +{ + /* init ->work of the whole link before punting */ + io_prep_async_link(req); + __io_queue_async_work(req); +} + static void io_kill_timeout(struct io_kiocb *req) { int ret; @@ -1215,7 +1228,8 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) if (req_need_defer(req)) break; list_del_init(&req->list); - io_queue_async_work(req); + /* punt-init is done before queueing for defer */ + __io_queue_async_work(req); } while (!list_empty(&ctx->defer_list)); } @@ -1791,7 +1805,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *nxt = NULL; + struct io_kiocb *timeout, *nxt = NULL; /* * A ref is owned by io-wq in which context we're. So, if that's the @@ -1805,18 +1819,10 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) if (!nxt) return NULL; - if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file) - io_wq_hash_work(&nxt->work, file_inode(nxt->file)); - - io_req_task_queue(nxt); - /* - * If we're going to return actual work, here should be timeout prep: - * - * link = io_prep_linked_timeout(nxt); - * if (link) - * nxt->flags |= REQ_F_QUEUE_TIMEOUT; - */ - return NULL; + timeout = io_prep_linked_timeout(nxt); + if (timeout) + nxt->flags |= REQ_F_QUEUE_TIMEOUT; + return &nxt->work; } /* @@ -5369,8 +5375,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) ret = io_req_defer_prep(req, sqe); if (ret < 0) return ret; - io_req_work_grab_env(req); } + io_prep_async_link(req); spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { @@ -5984,7 +5990,6 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_req_defer_prep(req, sqe); if (unlikely(ret < 0)) goto fail_req; - io_req_work_grab_env(req); } /* From ab0b6451db2a8ed630b89ef3826b8ea994149444 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 30 Jun 2020 08:43:15 -0600 Subject: [PATCH 059/127] io_uring: clean up io_kill_linked_timeout() locking Avoid jumping through hoops to silence unused variable warnings, and also fix sparse rightfully complaining about the locking context: fs/io_uring.c:1593:39: warning: context imbalance in 'io_kill_linked_timeout' - unexpected unlock Provide the functional helper as __io_kill_linked_timeout(), and have separate the locking from it. Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9bc4339057ef82..3c12221f549e94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1569,28 +1569,38 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) return false; } -static void io_kill_linked_timeout(struct io_kiocb *req) +static bool __io_kill_linked_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link; - bool wake_ev = false; - unsigned long flags = 0; /* false positive warning */ - - if (!(req->flags & REQ_F_COMP_LOCKED)) - spin_lock_irqsave(&ctx->completion_lock, flags); + bool wake_ev; if (list_empty(&req->link_list)) - goto out; + return false; link = list_first_entry(&req->link_list, struct io_kiocb, link_list); if (link->opcode != IORING_OP_LINK_TIMEOUT) - goto out; + return false; list_del_init(&link->link_list); wake_ev = io_link_cancel_timeout(link); req->flags &= ~REQ_F_LINK_TIMEOUT; -out: - if (!(req->flags & REQ_F_COMP_LOCKED)) + return wake_ev; +} + +static void io_kill_linked_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + bool wake_ev; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + wake_ev = __io_kill_linked_timeout(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + wake_ev = __io_kill_linked_timeout(req); + } + if (wake_ev) io_cqring_ev_posted(ctx); } From cf2f54255d0342cfbd273cbb964ad6bc7674f587 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:40 +0300 Subject: [PATCH 060/127] io_uring: don't fail iopoll requeue without ->mm Actually, io_iopoll_queue() may have NULL ->mm, that's if SQ thread didn't grabbed mm before doing iopoll. Don't fail reqs there, as after recent changes it won't be punted directly but rather through task_work. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3c12221f549e94..43419f5bef8cd7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1902,9 +1902,7 @@ static void io_iopoll_queue(struct list_head *again) do { req = list_first_entry(again, struct io_kiocb, list); list_del(&req->list); - - /* should have ->mm unless io_uring is dying, kill reqs then */ - if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN)) + if (!io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); } while (!list_empty(again)); } From ea1164e574e9af0a15ab730ead0861a4c7724142 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:41 +0300 Subject: [PATCH 061/127] io_uring: fix NULL mm in io_poll_task_func() io_poll_task_func() hand-coded link submission forgetting to set TASK_RUNNING, acquire mm, etc. Call existing helper for that, i.e. __io_req_task_submit(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 43419f5bef8cd7..2c17c2613205b7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4518,13 +4518,8 @@ static void io_poll_task_func(struct callback_head *cb) struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); - if (nxt) { - struct io_ring_ctx *ctx = nxt->ctx; - - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } + if (nxt) + __io_req_task_submit(nxt); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, From 0be0b0e33b0bfd08264b108512e44b3907fe987b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:42 +0300 Subject: [PATCH 062/127] io_uring: simplify io_async_task_func() Greatly simplify io_async_task_func() removing duplicated functionality of __io_req_task_submit(). This do one extra spin lock/unlock for cancelled poll case, but that shouldn't happen often. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2c17c2613205b7..82b35948ac5bf9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4608,7 +4608,6 @@ static void io_async_task_func(struct callback_head *cb) struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - bool canceled = false; trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); @@ -4618,15 +4617,8 @@ static void io_async_task_func(struct callback_head *cb) } /* If req is still hashed, it cannot have been canceled. Don't check. */ - if (hash_hashed(&req->hash_node)) { + if (hash_hashed(&req->hash_node)) hash_del(&req->hash_node); - } else { - canceled = READ_ONCE(apoll->poll.canceled); - if (canceled) { - io_cqring_fill_event(req, -ECANCELED); - io_commit_cqring(ctx); - } - } spin_unlock_irq(&ctx->completion_lock); @@ -4635,21 +4627,10 @@ static void io_async_task_func(struct callback_head *cb) memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll); - if (!canceled) { - __set_current_state(TASK_RUNNING); - if (io_sq_thread_acquire_mm(ctx, req)) { - io_cqring_add_event(req, -EFAULT, 0); - goto end_req; - } - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL, NULL); - mutex_unlock(&ctx->uring_lock); - } else { - io_cqring_ev_posted(ctx); -end_req: - req_set_fail_links(req); - io_double_put_req(req); - } + if (!READ_ONCE(apoll->poll.canceled)) + __io_req_task_submit(req); + else + __io_req_task_cancel(req, -ECANCELED); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, From 3fa5e0f331280237af918ab2e7a160f5a68d3e7d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:43 +0300 Subject: [PATCH 063/127] io_uring: optimise io_req_find_next() fast check gcc 9.2.0 compiles io_req_find_next() as a separate function leaving the first REQ_F_LINK_HEAD fast check not inlined. Help it by splitting out the check from the function. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 82b35948ac5bf9..9a43847c682315 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1664,12 +1664,9 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK_HEAD))) - return NULL; req->flags &= ~REQ_F_LINK_HEAD; - if (req->flags & REQ_F_LINK_TIMEOUT) io_kill_linked_timeout(req); @@ -1685,6 +1682,13 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return NULL; } +static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_LINK_HEAD))) + return NULL; + return __io_req_find_next(req); +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; From 8eb06d7e8dd853d70668617dda57de4f6cebe651 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Jun 2020 15:20:39 +0300 Subject: [PATCH 064/127] io_uring: fix missing ->mm on exit There is a fancy bug, where exiting user task may not have ->mm, that makes task_works to try to do kthread_use_mm(ctx->sqo_mm). Don't do that if sqo_mm is NULL. [ 290.460558] WARNING: CPU: 6 PID: 150933 at kernel/kthread.c:1238 kthread_use_mm+0xf3/0x110 [ 290.460579] CPU: 6 PID: 150933 Comm: read-write2 Tainted: G I E 5.8.0-rc2-00066-g9b21720607cf #531 [ 290.460580] RIP: 0010:kthread_use_mm+0xf3/0x110 ... [ 290.460584] Call Trace: [ 290.460584] __io_sq_thread_acquire_mm.isra.0.part.0+0x25/0x30 [ 290.460584] __io_req_task_submit+0x64/0x80 [ 290.460584] io_req_task_submit+0x15/0x20 [ 290.460585] task_work_run+0x67/0xa0 [ 290.460585] do_exit+0x35d/0xb70 [ 290.460585] do_group_exit+0x43/0xa0 [ 290.460585] get_signal+0x140/0x900 [ 290.460586] do_signal+0x37/0x780 [ 290.460586] __prepare_exit_to_usermode+0x126/0x1c0 [ 290.460586] __syscall_return_slowpath+0x3b/0x1c0 [ 290.460587] do_syscall_64+0x5f/0xa0 [ 290.460587] entry_SYSCALL_64_after_hwframe+0x44/0xa9 following with faults. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a43847c682315..cfad2acd4d8693 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -958,7 +958,7 @@ static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + if (unlikely(!ctx->sqo_mm || !mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); } @@ -7216,10 +7216,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - mmgrab(current->mm); - ctx->sqo_mm = current->mm; - if (ctx->flags & IORING_SETUP_SQPOLL) { + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; @@ -7263,8 +7263,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return 0; err: io_finish_async(ctx); - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; + if (ctx->sqo_mm) { + mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + } return ret; } From 4c6e277c4cc4a6b3b2b9c66a7b014787ae757cc1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jul 2020 11:29:10 -0600 Subject: [PATCH 065/127] io_uring: abstract out task work running Provide a helper to run task_work instead of checking and running manually in a bunch of different spots. While doing so, also move the task run state setting where we run the task work. Then we can move it out of the callback helpers. This also helps ensure we only do this once per task_work list run, not per task_work item. Suggested-by: Oleg Nesterov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7426e4f23f9be3..65a6978e17958b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1714,7 +1714,6 @@ static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - __set_current_state(TASK_RUNNING); if (!__io_sq_thread_acquire_mm(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL, NULL); @@ -1899,6 +1898,17 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static inline bool io_run_task_work(void) +{ + if (current->task_works) { + __set_current_state(TASK_RUNNING); + task_work_run(); + return true; + } + + return false; +} + static void io_iopoll_queue(struct list_head *again) { struct io_kiocb *req; @@ -2079,8 +2089,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); - if (current->task_works) - task_work_run(); + io_run_task_work(); mutex_lock(&ctx->uring_lock); } @@ -2176,8 +2185,6 @@ static void io_rw_resubmit(struct callback_head *cb) struct io_ring_ctx *ctx = req->ctx; int err; - __set_current_state(TASK_RUNNING); - err = io_sq_thread_acquire_mm(ctx, req); if (io_resubmit_prep(req, err)) { @@ -6361,8 +6368,7 @@ static int io_sq_thread(void *data) if (!list_empty(&ctx->poll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { - if (current->task_works) - task_work_run(); + io_run_task_work(); cond_resched(); continue; } @@ -6394,8 +6400,7 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } - if (current->task_works) { - task_work_run(); + if (io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); continue; } @@ -6420,8 +6425,7 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (current->task_works) - task_work_run(); + io_run_task_work(); io_sq_thread_drop_mm(ctx); revert_creds(old_cred); @@ -6486,9 +6490,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { if (io_cqring_events(ctx, false) >= min_events) return 0; - if (!current->task_works) + if (!io_run_task_work()) break; - task_work_run(); } while (1); if (sig) { @@ -6510,8 +6513,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (current->task_works) - task_work_run(); + if (io_run_task_work()) + continue; if (signal_pending(current)) { if (current->jobctl & JOBCTL_TASK_WORK) { spin_lock_irq(¤t->sighand->siglock); @@ -7953,8 +7956,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; - if (current->task_works) - task_work_run(); + io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; From c2c4c83c58cbca23527fee93b49738a5a84272a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jul 2020 15:37:11 -0600 Subject: [PATCH 066/127] io_uring: use new io_req_task_work_add() helper throughout Since we now have that in the 5.9 branch, convert the existing users of task_work_add() to use this new helper. Signed-off-by: Jens Axboe --- fs/io_uring.c | 77 +++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 65a6978e17958b..2b849984bae551 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1689,6 +1689,29 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) +{ + struct task_struct *tsk = req->task; + struct io_ring_ctx *ctx = req->ctx; + int ret, notify = TWA_RESUME; + + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. + * If we're not using an eventfd, then TWA_RESUME is always fine, + * as we won't have dependencies between request completions for + * other kernel wait conditions. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) + notify = 0; + else if (ctx->cq_ev_fd) + notify = TWA_SIGNAL; + + ret = task_work_add(tsk, cb, notify); + if (!ret) + wake_up_process(tsk); + return ret; +} + static void __io_req_task_cancel(struct io_kiocb *req, int error) { struct io_ring_ctx *ctx = req->ctx; @@ -1732,18 +1755,19 @@ static void io_req_task_submit(struct callback_head *cb) static void io_req_task_queue(struct io_kiocb *req) { - struct task_struct *tsk = req->task; int ret; init_task_work(&req->task_work, io_req_task_submit); - ret = task_work_add(tsk, &req->task_work, true); + ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, true); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); } - wake_up_process(tsk); } static void io_queue_next(struct io_kiocb *req) @@ -2197,19 +2221,15 @@ static void io_rw_resubmit(struct callback_head *cb) static bool io_rw_reissue(struct io_kiocb *req, long res) { #ifdef CONFIG_BLOCK - struct task_struct *tsk; int ret; if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - tsk = req->task; init_task_work(&req->task_work, io_rw_resubmit); - ret = task_work_add(tsk, &req->task_work, true); - if (!ret) { - wake_up_process(tsk); + ret = io_req_task_work_add(req, &req->task_work); + if (!ret) return true; - } #endif return false; } @@ -2909,7 +2929,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, struct io_kiocb *req = wait->private; struct io_async_rw *rw = &req->io->rw; struct wait_page_key *key = arg; - struct task_struct *tsk; int ret; wpq = container_of(wait, struct wait_page_queue, wait); @@ -2923,15 +2942,16 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, init_task_work(&rw->task_work, io_async_buf_retry); /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - tsk = req->task; - ret = task_work_add(tsk, &rw->task_work, true); + ret = io_req_task_work_add(req, &rw->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + /* queue just for cancelation */ init_task_work(&rw->task_work, io_async_buf_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &rw->task_work, true); + task_work_add(tsk, &rw->task_work, 0); + wake_up_process(tsk); } - wake_up_process(tsk); return 1; } @@ -4424,33 +4444,9 @@ struct io_poll_table { int error; }; -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) -{ - struct task_struct *tsk = req->task; - struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; - - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. - */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) - notify = TWA_SIGNAL; - - ret = task_work_add(tsk, cb, notify); - if (!ret) - wake_up_process(tsk); - return ret; -} - static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { - struct task_struct *tsk; int ret; /* for instances that support it check for an event match first: */ @@ -4461,7 +4457,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, list_del_init(&poll->wait.entry); - tsk = req->task; req->result = mask; init_task_work(&req->task_work, func); /* @@ -4472,6 +4467,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); task_work_add(tsk, &req->task_work, 0); From 6df1db6b542436c6d429caa66e1045862fa36155 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:06 +0300 Subject: [PATCH 067/127] io_uring: fix mis-refcounting linked timeouts io_prep_linked_timeout() sets REQ_F_LINK_TIMEOUT altering refcounting of the following linked request. After that someone should call io_queue_linked_timeout(), otherwise a submission reference of the linked timeout won't be ever dropped. That's what happens in io_steal_work() if io-wq decides to postpone linked request with io_wqe_enqueue(). io_queue_linked_timeout() can also be potentially called twice without synchronisation during re-submission, e.g. io_rw_resubmit(). There are the rules, whoever did io_prep_linked_timeout() must also call io_queue_linked_timeout(). To not do it twice, io_prep_linked_timeout() will return non NULL only for the first call. That's controlled by REQ_F_LINK_TIMEOUT flag. Also kill REQ_F_QUEUE_TIMEOUT. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2b849984bae551..cf1b3d4ac241e5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -538,7 +538,6 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, - REQ_F_QUEUE_TIMEOUT_BIT, REQ_F_WORK_INITIALIZED_BIT, REQ_F_TASK_PINNED_BIT, @@ -586,8 +585,6 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* doesn't need file table for this request */ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), - /* needs to queue linked timeout */ - REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), /* req->task is refcounted */ @@ -1842,7 +1839,7 @@ static void io_put_req(struct io_kiocb *req) static struct io_wq_work *io_steal_work(struct io_kiocb *req) { - struct io_kiocb *timeout, *nxt = NULL; + struct io_kiocb *nxt; /* * A ref is owned by io-wq in which context we're. So, if that's the @@ -1853,13 +1850,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req) return NULL; nxt = io_req_find_next(req); - if (!nxt) - return NULL; - - timeout = io_prep_linked_timeout(nxt); - if (timeout) - nxt->flags |= REQ_F_QUEUE_TIMEOUT; - return &nxt->work; + return nxt ? &nxt->work : NULL; } /* @@ -5702,24 +5693,15 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_arm_async_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link; - - /* link head's timeout is queued in io_queue_async_work() */ - if (!(req->flags & REQ_F_QUEUE_TIMEOUT)) - return; - - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - io_queue_linked_timeout(link); -} - static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *timeout; int ret = 0; - io_arm_async_linked_timeout(req); + timeout = io_prep_linked_timeout(req); + if (timeout) + io_queue_linked_timeout(timeout); /* if NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == @@ -5893,8 +5875,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK_HEAD)) return NULL; - /* for polled retry, if flag is set, we already went through here */ - if (req->flags & REQ_F_POLLED) + if (req->flags & REQ_F_LINK_TIMEOUT) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, From 652532ad459524d32c6bf1522e0b88d83b084d1a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:07 +0300 Subject: [PATCH 068/127] io_uring: keep queue_sqe()'s fail path separately A preparation path, extracts error path into a separate block. It looks saner then calling req_set_fail_links() after io_put_req_find_next(), even though it have been working well. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cf1b3d4ac241e5..7147e87a24b509 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5937,22 +5937,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, goto exit; } + if (unlikely(ret)) { err: - /* drop submission reference */ - nxt = io_put_req_find_next(req); - - if (linked_timeout) { - if (!ret) - io_queue_linked_timeout(linked_timeout); - else - io_put_req(linked_timeout); - } - - /* and drop final reference, if we failed */ - if (ret) { + /* un-prep timeout, so it'll be killed as any other linked */ + req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); + io_put_req(req); io_req_complete(req, ret); + goto exit; } + + /* drop submission reference */ + nxt = io_put_req_find_next(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); + if (nxt) { req = nxt; From 8b3656af2a37dc538d21e144a5a94bacae05e9f1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 3 Jul 2020 22:15:08 +0300 Subject: [PATCH 069/127] io_uring: fix lost cqe->flags Don't forget to fill cqe->flags properly in io_submit_flush_completions() Fixes: a1d7c393c4711 ("io_uring: enable READ/WRITE to use deferred completions") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7147e87a24b509..9464f9470bbc24 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1416,7 +1416,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(req, req->result); + __io_cqring_fill_event(req, req->result, req->cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -1441,6 +1441,7 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, io_put_req(req); } else { req->result = res; + req->cflags = cflags; list_add_tail(&req->list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); From 3aadc23e6054353ca056bf14e87250c79efbd7ed Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:29 +0300 Subject: [PATCH 070/127] io_uring: don't delay iopoll'ed req completion ->iopoll() may have completed current request, but instead of reaping it, io_do_iopoll() just continues with the next request in the list. As a result it can leave just polled and completed request in the list up until next syscall. Even outer loop in io_iopoll_getevents() doesn't help the situation. E.g. poll_list: req0 -> req1 If req0->iopoll() completed both requests, and @min<=1, then @req0 will be left behind. Check whether a req was completed after ->iopoll(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9464f9470bbc24..60f1a81c6c354f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2015,6 +2015,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ret < 0) break; + /* iopoll may have completed current req */ + if (READ_ONCE(req->iopoll_completed)) + list_move_tail(&req->list, &done); + if (ret && spin) spin = false; ret = 0; From eba0a4dd2aa5c47ca5b0c56ffb6d6665e047ff72 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:30 +0300 Subject: [PATCH 071/127] io_uring: fix stopping iopoll'ing too early Nobody adjusts *nr_events (number of completed requests) before calling io_iopoll_getevents(), so the passed @min shouldn't be adjusted as well. Othewise it can return less than initially asked @min without hitting need_resched(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 60f1a81c6c354f..332008f346e31a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2044,7 +2044,7 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, ret = io_do_iopoll(ctx, nr_events, min); if (ret < 0) return ret; - if (!min || *nr_events >= min) + if (*nr_events >= min) return 0; } @@ -2087,8 +2087,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ mutex_lock(&ctx->uring_lock); do { - int tmin = 0; - /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that @@ -2113,10 +2111,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, mutex_lock(&ctx->uring_lock); } - if (*nr_events < min) - tmin = min - *nr_events; - - ret = io_iopoll_getevents(ctx, nr_events, tmin); + ret = io_iopoll_getevents(ctx, nr_events, min); if (ret <= 0) break; ret = 0; From 3fcee5a6d5414df8ff4ee22f2477bde76d34527c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 6 Jul 2020 17:59:31 +0300 Subject: [PATCH 072/127] io_uring: briefly loose locks while reaping events It's not nice to hold @uring_lock for too long io_iopoll_reap_events(). For instance, the lock is needed to publish requests to @poll_list, and that locks out tasks doing that for no good reason. Loose it occasionally. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 332008f346e31a..6e3169834bf7fb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2069,8 +2069,13 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. + * Also let task_work, etc. to progress by releasing the mutex */ - cond_resched(); + if (need_resched()) { + mutex_unlock(&ctx->uring_lock); + cond_resched(); + mutex_lock(&ctx->uring_lock); + } } mutex_unlock(&ctx->uring_lock); } From 9dedd56301564acdbb1dd37cf09250a4c7b783c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:20 +0300 Subject: [PATCH 073/127] io_uring: partially inline io_iopoll_getevents() io_iopoll_reap_events() doesn't care about returned valued of io_iopoll_getevents() and does the same checks for list emptiness and need_resched(). Just use io_do_iopoll(). io_sq_thread() doesn't check return value as well. It also passes min=0, so there never be the second iteration inside io_poll_getevents(). Inline it there too. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e3169834bf7fb..104af675f6fbed 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2064,7 +2064,7 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->poll_list)) { unsigned int nr_events = 0; - io_iopoll_getevents(ctx, &nr_events, 1); + io_do_iopoll(ctx, &nr_events, 1); /* * Ensure we allow local-to-the-cpu processing to take place, @@ -6318,8 +6318,8 @@ static int io_sq_thread(void *data) unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list)) - io_iopoll_getevents(ctx, &nr_events, 0); + if (!list_empty(&ctx->poll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; mutex_unlock(&ctx->uring_lock); From 7668b92a69b8201e2dd16a47a08efb93e909f419 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:21 +0300 Subject: [PATCH 074/127] io_uring: remove nr_events arg from iopoll_check() Nobody checks io_iopoll_check()'s output parameter @nr_events. Remove the parameter and declare it further down the stack. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 104af675f6fbed..38bf42320f5682 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2080,9 +2080,9 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { + unsigned int nr_events = 0; int iters = 0, ret = 0; /* @@ -2116,11 +2116,11 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, mutex_lock(&ctx->uring_lock); } - ret = io_iopoll_getevents(ctx, nr_events, min); + ret = io_iopoll_getevents(ctx, &nr_events, min); if (ret <= 0) break; ret = 0; - } while (min && !*nr_events && !need_resched()); + } while (min && !nr_events && !need_resched()); mutex_unlock(&ctx->uring_lock); return ret; @@ -7977,8 +7977,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - unsigned nr_events = 0; - min_complete = min(min_complete, ctx->cq_entries); /* @@ -7989,7 +7987,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_iopoll_check(ctx, &nr_events, min_complete); + ret = io_iopoll_check(ctx, min_complete); } else { ret = io_cqring_wait(ctx, min_complete, sig, sigsz); } From b2edc0a77fac19bbdef63cedb2ea34aec1a9a499 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 7 Jul 2020 16:36:22 +0300 Subject: [PATCH 075/127] io_uring: don't burn CPU for iopoll on exit First of all don't spin in io_ring_ctx_wait_and_kill() on iopoll. Requests won't complete faster because of that, but only lengthen io_uring_release(). The same goes for offloaded cleanup in io_ring_exit_work() -- it already has waiting loop, don't do blocking active spinning. For that, pass min=0 into io_iopoll_[try_]reap_events(), so it won't actively spin. Leave the function if io_do_iopoll() there can't complete a request to sleep in io_ring_exit_work(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 38bf42320f5682..4c9a494c9f9fa3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2055,7 +2055,7 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static void io_iopoll_reap_events(struct io_ring_ctx *ctx) +static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; @@ -2064,8 +2064,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->poll_list)) { unsigned int nr_events = 0; - io_do_iopoll(ctx, &nr_events, 1); + io_do_iopoll(ctx, &nr_events, 0); + /* let it sleep and repeat later if can't complete a request */ + if (nr_events == 0) + break; /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. @@ -7648,7 +7651,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->sqo_mm = NULL; } - io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); @@ -7715,11 +7717,8 @@ static int io_remove_personalities(int id, void *p, void *data) static void io_ring_exit_work(struct work_struct *work) { - struct io_ring_ctx *ctx; - - ctx = container_of(work, struct io_ring_ctx, exit_work); - if (ctx->rings) - io_cqring_overflow_flush(ctx, true); + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + exit_work); /* * If we're doing polled IO and end up having requests being @@ -7727,11 +7726,11 @@ static void io_ring_exit_work(struct work_struct *work) * we're waiting for refs to drop. We need to reap these manually, * as nobody else will be looking for them. */ - while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { - io_iopoll_reap_events(ctx); + do { if (ctx->rings) io_cqring_overflow_flush(ctx, true); - } + io_iopoll_try_reap_events(ctx); + } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); } @@ -7747,10 +7746,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); - io_iopoll_reap_events(ctx); /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); INIT_WORK(&ctx->exit_work, io_ring_exit_work); queue_work(system_wq, &ctx->exit_work); From aa340845ae6f019e0a12321a1741c14679bb0664 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jul 2020 21:47:11 +0300 Subject: [PATCH 076/127] io_uring: fix a use after free in io_async_task_func() The "apoll" variable is freed and then used on the next line. We need to move the free down a few lines. Fixes: 0be0b0e33b0b ("io_uring: simplify io_async_task_func()") Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c9a494c9f9fa3..14168fbc7d797d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4655,12 +4655,13 @@ static void io_async_task_func(struct callback_head *cb) /* restore ->work in case we need to retry again */ if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); - kfree(apoll); if (!READ_ONCE(apoll->poll.canceled)) __io_req_task_submit(req); else __io_req_task_cancel(req, -ECANCELED); + + kfree(apoll); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, From 5acbbc8ed3a9aef71c6eb5f19ba24f7321200220 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Jul 2020 15:15:26 -0600 Subject: [PATCH 077/127] io_uring: only call kfree() for a non-zero pointer It's safe to call kfree() with a NULL pointer, but it's also pointless. Most of the time we don't have any data to free, and at millions of requests per second, the redundant function call adds noticeable overhead (about 1.3% of the runtime). Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 14168fbc7d797d..51ff88330f9a88 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1519,7 +1519,8 @@ static void io_dismantle_req(struct io_kiocb *req) if (req->flags & REQ_F_NEED_CLEANUP) io_cleanup_req(req); - kfree(req->io); + if (req->io) + kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); From 2bc9930e78fe0cb3e7b7e3169de0a40baee38d29 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2020 09:43:27 -0600 Subject: [PATCH 078/127] io_uring: get rid of __req_need_defer() We just have one caller of this, req_need_defer(), just inline the code in there instead. Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 51ff88330f9a88..7f2a2cb5c05681 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1069,18 +1069,14 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return NULL; } -static inline bool __req_need_defer(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - - return req->sequence != ctx->cached_cq_tail - + atomic_read(&ctx->cached_cq_overflow); -} - static inline bool req_need_defer(struct io_kiocb *req) { - if (unlikely(req->flags & REQ_F_IO_DRAIN)) - return __req_need_defer(req); + if (unlikely(req->flags & REQ_F_IO_DRAIN)) { + struct io_ring_ctx *ctx = req->ctx; + + return req->sequence != ctx->cached_cq_tail + + atomic_read(&ctx->cached_cq_overflow); + } return false; } From 4349f30ecb8068d146a1e57bb12f46e745323b4c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2020 15:07:01 -0600 Subject: [PATCH 079/127] io_uring: remove dead 'ctx' argument and move forward declaration We don't use 'ctx' at all in io_sq_thread_drop_mm(), it just works on the mm of the current task. Drop the argument. Move io_file_put_work() to where we have the other forward declarations of functions. Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7f2a2cb5c05681..3ce02a1613ccc2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -902,6 +902,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_comp_state *cs); +static void io_file_put_work(struct work_struct *work); static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, @@ -942,7 +943,7 @@ static void __io_put_req_task(struct io_kiocb *req) put_task_struct(req->task); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +static void io_sq_thread_drop_mm(void) { struct mm_struct *mm = current->mm; @@ -977,8 +978,6 @@ static inline void req_set_fail_links(struct io_kiocb *req) req->flags |= REQ_F_FAIL_LINK; } -static void io_file_put_work(struct work_struct *work); - /* * Note: must call io_req_init_async() for the first time you * touch any members of io_wq_work. @@ -6339,7 +6338,7 @@ static int io_sq_thread(void *data) * adding ourselves to the waitqueue, as the unuse/drop * may sleep. */ - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); /* * We're polling. If we're within the defined idle @@ -6410,7 +6409,7 @@ static int io_sq_thread(void *data) io_run_task_work(); - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); revert_creds(old_cred); kthread_parkme(); From b36200f543ff07a1cb346aa582349141df2c8068 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Sat, 11 Jul 2020 11:31:11 +0200 Subject: [PATCH 080/127] io_uring: fix sq array offset calculation rings_size() sets sq_offset to the total size of the rings (the returned value which is used for memory allocation). This is wrong: sq array should be located within the rings, not after them. Set sq_offset to where it should be. Fixes: 75b28affdd6a ("io_uring: allocate the two rings together") Signed-off-by: Dmitry Vyukov Acked-by: Hristo Venev Cc: io-uring@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ff3851d40df4fe..ca932fb3c67d88 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7416,6 +7416,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return SIZE_MAX; #endif + if (sq_offset) + *sq_offset = off; + sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; @@ -7423,9 +7426,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; - if (sq_offset) - *sq_offset = off; - return off; } From 270a5940700bb6cf9abf36ea10cf1fa0d453aa7a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:04 +0300 Subject: [PATCH 081/127] io_uring: rename sr->msg into umsg Every second field in send/recv is called msg, make it a bit more understandable by renaming ->msg, which is a user provided ptr, to ->umsg. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ca932fb3c67d88..f17f098c403a3e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -414,7 +414,7 @@ struct io_connect { struct io_sr_msg { struct file *file; union { - struct user_msghdr __user *msg; + struct user_msghdr __user *umsg; void __user *buf; }; int msg_flags; @@ -3903,7 +3903,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); #ifdef CONFIG_COMPAT @@ -3919,7 +3919,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io->msg.msg.msg_name = &io->msg.addr; io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = sendmsg_copy_msghdr(&io->msg.msg, sr->umsg, sr->msg_flags, &io->msg.iov); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; @@ -3952,7 +3952,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, kmsg->msg.msg_name = &io.msg.addr; io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, + ret = sendmsg_copy_msghdr(&io.msg.msg, sr->umsg, sr->msg_flags, &io.msg.iov); if (ret) return ret; @@ -4030,8 +4030,8 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) size_t iov_len; int ret; - ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr, - &uiov, &iov_len); + ret = __copy_msghdr_from_user(&io->msg.msg, sr->umsg, + &io->msg.uaddr, &uiov, &iov_len); if (ret) return ret; @@ -4065,7 +4065,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, compat_size_t len; int ret; - msg_compat = (struct compat_msghdr __user *) sr->msg; + msg_compat = (struct compat_msghdr __user *) sr->umsg; ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, &ptr, &len); if (ret) @@ -4142,7 +4142,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->bgid = READ_ONCE(sqe->buf_group); @@ -4207,7 +4207,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, else if (force_nonblock) flags |= MSG_DONTWAIT; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); if (force_nonblock && ret == -EAGAIN) { ret = io_setup_async_msg(req, kmsg); From 1400e69705baf98d1c9cb73b592a3a68aab1d852 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:05 +0300 Subject: [PATCH 082/127] io_uring: use more specific type in rcv/snd msg cp send/recv msghdr initialisation works with struct io_async_msghdr, but pulls the whole struct io_async_ctx for no reason. That complicates it with composite accessing, e.g. io->msg. Use and pass the most specific type, which is struct io_async_msghdr. It is the larget field in union io_async_ctx and doesn't save stack space, but looks clearer. The most of the changes are replacing "io->msg." with "iomsg->" Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 63 +++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f17f098c403a3e..8acbaddaebb726 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3935,7 +3935,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, sock = sock_from_file(req->file, &ret); if (sock) { - struct io_async_ctx io; + struct io_async_msghdr iomsg; unsigned flags; if (req->io) { @@ -3948,14 +3948,13 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, } else { struct io_sr_msg *sr = &req->sr_msg; - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; - - io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->umsg, - sr->msg_flags, &io.msg.iov); + iomsg.msg.msg_name = &iomsg.addr; + iomsg.iov = iomsg.fast_iov; + ret = sendmsg_copy_msghdr(&iomsg.msg, sr->umsg, + sr->msg_flags, &iomsg.iov); if (ret) return ret; + kmsg = &iomsg; } flags = req->sr_msg.msg_flags; @@ -4023,30 +4022,31 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, return 0; } -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = &req->sr_msg; struct iovec __user *uiov; size_t iov_len; int ret; - ret = __copy_msghdr_from_user(&io->msg.msg, sr->umsg, - &io->msg.uaddr, &uiov, &iov_len); + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { if (iov_len > 1) return -EINVAL; - if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov))) return -EFAULT; - sr->len = io->msg.iov[0].iov_len; - iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1, + sr->len = iomsg->iov[0].iov_len; + iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1, sr->len); - io->msg.iov = NULL; + iomsg->iov = NULL; } else { ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &io->msg.iov, &io->msg.msg.msg_iter); + &iomsg->iov, &iomsg->msg.msg_iter); if (ret > 0) ret = 0; } @@ -4056,7 +4056,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) #ifdef CONFIG_COMPAT static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_ctx *io) + struct io_async_msghdr *iomsg) { struct compat_msghdr __user *msg_compat; struct io_sr_msg *sr = &req->sr_msg; @@ -4066,7 +4066,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, int ret; msg_compat = (struct compat_msghdr __user *) sr->umsg; - ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, + ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr, &ptr, &len); if (ret) return ret; @@ -4083,12 +4083,12 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, return -EFAULT; if (clen < 0) return -EINVAL; - sr->len = io->msg.iov[0].iov_len; - io->msg.iov = NULL; + sr->len = iomsg->iov[0].iov_len; + iomsg->iov = NULL; } else { ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, - &io->msg.iov, - &io->msg.msg.msg_iter); + &iomsg->iov, + &iomsg->msg.msg_iter); if (ret < 0) return ret; } @@ -4097,17 +4097,18 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, } #endif -static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + iomsg->iov = iomsg->fast_iov; #ifdef CONFIG_COMPAT if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, io); + return __io_compat_recvmsg_copy_hdr(req, iomsg); #endif - return __io_recvmsg_copy_hdr(req, io); + return __io_recvmsg_copy_hdr(req, iomsg); } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, @@ -4157,7 +4158,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; - ret = io_recvmsg_copy_hdr(req, io); + ret = io_recvmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -4173,7 +4174,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, sock = sock_from_file(req->file, &ret); if (sock) { struct io_buffer *kbuf; - struct io_async_ctx io; + struct io_async_msghdr iomsg; unsigned flags; if (req->io) { @@ -4184,12 +4185,10 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; - - ret = io_recvmsg_copy_hdr(req, &io); + ret = io_recvmsg_copy_hdr(req, &iomsg); if (ret) return ret; + kmsg = &iomsg; } kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); From 2ae523ed07f14391d685651f671a7858fe8c368a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:41:06 +0300 Subject: [PATCH 083/127] io_uring: extract io_sendmsg_copy_hdr() Don't repeat send msg initialisation code, it's error prone. Extract and use a helper function. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8acbaddaebb726..a198466544e766 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3893,6 +3893,15 @@ static int io_setup_async_msg(struct io_kiocb *req, return -EAGAIN; } +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->iov = iomsg->fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, + req->sr_msg.msg_flags, &iomsg->iov); +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; @@ -3917,10 +3926,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->umsg, sr->msg_flags, - &io->msg.iov); + ret = io_sendmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -3946,12 +3952,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, kmsg->iov = kmsg->fast_iov; kmsg->msg.msg_iter.iov = kmsg->iov; } else { - struct io_sr_msg *sr = &req->sr_msg; - - iomsg.msg.msg_name = &iomsg.addr; - iomsg.iov = iomsg.fast_iov; - ret = sendmsg_copy_msghdr(&iomsg.msg, sr->umsg, - sr->msg_flags, &iomsg.iov); + ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; kmsg = &iomsg; From e73751225bae1e9b67e957afb273366fbb6ca136 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 12 Jul 2020 20:42:04 +0300 Subject: [PATCH 084/127] io_uring: replace rw->task_work with rq->task_work io_kiocb::task_work was de-unionised, and is not planned to be shared back, because it's too useful and commonly used. Hence, instead of keeping a separate task_work in struct io_async_rw just reuse req->task_work. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a198466544e766..ddff3abff363ae 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -505,7 +505,6 @@ struct io_async_rw { ssize_t nr_segs; ssize_t size; struct wait_page_queue wpq; - struct callback_head task_work; }; struct io_async_ctx { @@ -2901,33 +2900,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_async_buf_cancel(struct callback_head *cb) -{ - struct io_async_rw *rw; - struct io_kiocb *req; - - rw = container_of(cb, struct io_async_rw, task_work); - req = rw->wpq.wait.private; - __io_req_task_cancel(req, -ECANCELED); -} - -static void io_async_buf_retry(struct callback_head *cb) -{ - struct io_async_rw *rw; - struct io_kiocb *req; - - rw = container_of(cb, struct io_async_rw, task_work); - req = rw->wpq.wait.private; - - __io_req_task_submit(req); -} - static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, int sync, void *arg) { struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; - struct io_async_rw *rw = &req->io->rw; struct wait_page_key *key = arg; int ret; @@ -2939,17 +2916,17 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); - init_task_work(&rw->task_work, io_async_buf_retry); + init_task_work(&req->task_work, io_req_task_submit); /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, &rw->task_work); + ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { struct task_struct *tsk; /* queue just for cancelation */ - init_task_work(&rw->task_work, io_async_buf_cancel); + init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &rw->task_work, 0); + task_work_add(tsk, &req->task_work, 0); wake_up_process(tsk); } return 1; From b64e3444d4e1c71fe148a4f4535395b1fdd73200 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:18 +0300 Subject: [PATCH 085/127] io_uring: simplify io_req_map_rw() Don't deref req->io->rw every time, but put it in a local variable. This looks prettier, generates less instructions, and doesn't break alias analysis. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ddff3abff363ae..3b8465dd0214ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2828,15 +2828,17 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - req->io->rw.nr_segs = iter->nr_segs; - req->io->rw.size = io_size; - req->io->rw.iov = iovec; - if (!req->io->rw.iov) { - req->io->rw.iov = req->io->rw.fast_iov; - if (req->io->rw.iov != fast_iov) - memcpy(req->io->rw.iov, fast_iov, + struct io_async_rw *rw = &req->io->rw; + + rw->nr_segs = iter->nr_segs; + rw->size = io_size; + if (!iovec) { + rw->iov = rw->fast_iov; + if (rw->iov != fast_iov) + memcpy(rw->iov, fast_iov, sizeof(struct iovec) * iter->nr_segs); } else { + rw->iov = iovec; req->flags |= REQ_F_NEED_CLEANUP; } } From c3e330a493740a2a8312dcb7b1cffceaec7f619a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:19 +0300 Subject: [PATCH 086/127] io_uring: add a helper for async rw iovec prep Preparing reads/writes for async is a bit tricky. Extract a helper to not repeat it twice. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3b8465dd0214ef..31466bcd833e1e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2872,11 +2872,27 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, return 0; } +static inline int io_rw_prep_async(struct io_kiocb *req, int rw, + bool force_nonblock) +{ + struct io_async_ctx *io = req->io; + struct iov_iter iter; + ssize_t ret; + + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock); + req->io = io; + if (unlikely(ret < 0)) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; +} + static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -2889,17 +2905,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; - - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; - if (ret < 0) - return ret; - - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + return io_rw_prep_async(req, READ, force_nonblock); } static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, @@ -3043,8 +3049,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { - struct io_async_ctx *io; - struct iov_iter iter; ssize_t ret; ret = io_prep_rw(req, sqe, force_nonblock); @@ -3059,17 +3063,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; - - io = req->io; - io->rw.iov = io->rw.fast_iov; - req->io = NULL; - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); - req->io = io; - if (ret < 0) - return ret; - - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); - return 0; + return io_rw_prep_async(req, WRITE, force_nonblock); } static int io_write(struct io_kiocb *req, bool force_nonblock, From 252917c30f551e8e4377faac81d7fcf8e9629df1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 22:59:20 +0300 Subject: [PATCH 087/127] io_uring: follow **iovec idiom in io_import_iovec As for import_iovec(), return !=NULL iovec from io_import_iovec() only when it should be freed. That includes returning NULL when iovec is already in req->io, because it should be deallocated by other means, e.g. inside op handler. After io_setup_async_rw() local iovec to ->io, just mark it NULL, to follow the idea in io_{read,write} as well. That's easier to follow, and especially useful if we want to reuse per-op space for completion data. Signed-off-by: Pavel Begunkov [axboe: only call kfree() on non-NULL pointer] Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 31466bcd833e1e..64ae5b681c628c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2740,10 +2740,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->io) { struct io_async_rw *iorw = &req->io->rw; - *iovec = iorw->iov; - iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); - if (iorw->iov == iorw->fast_iov) - *iovec = NULL; + iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size); + *iovec = NULL; return iorw->size; } @@ -3026,6 +3024,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, inline_vecs, &iter); if (ret) goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; /* if we can retry, do so with the callbacks armed */ if (io_rw_should_retry(req)) { ret2 = io_iter_do_read(req, &iter); @@ -3041,7 +3041,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, } } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } @@ -3143,11 +3143,13 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, inline_vecs, &iter); if (ret) goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; return -EAGAIN; } } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } From 3ca405ebfc1c3445b049dd25ca3338cbc99837d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:08 +0300 Subject: [PATCH 088/127] io_uring: share completion list w/ per-op space Calling io_req_complete(req) means that the request is done, and there is nothing left but to clean it up. That also means that per-op data after that should not be used, so we're free to reuse it in completion path, e.g. to store overflow_list as done in this patch. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 64ae5b681c628c..3cadd5f963b74f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -487,6 +487,11 @@ struct io_statx { struct statx __user *buffer; }; +struct io_completion { + struct file *file; + struct list_head list; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -622,6 +627,8 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; + /* use only after cleaning per-op data, see io_clean_op() */ + struct io_completion compl; }; struct io_async_ctx *io; @@ -896,7 +903,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, static int io_grab_files(struct io_kiocb *req); static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs); -static void io_cleanup_req(struct io_kiocb *req); +static void __io_clean_op(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); static void __io_queue_sqe(struct io_kiocb *req, @@ -936,6 +943,12 @@ static void io_get_req_task(struct io_kiocb *req) req->flags |= REQ_F_TASK_PINNED; } +static inline void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & REQ_F_NEED_CLEANUP) + __io_clean_op(req); +} + /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ static void __io_put_req_task(struct io_kiocb *req) { @@ -1413,8 +1426,8 @@ static void io_submit_flush_completions(struct io_comp_state *cs) while (!list_empty(&cs->list)) { struct io_kiocb *req; - req = list_first_entry(&cs->list, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(&cs->list, struct io_kiocb, compl.list); + list_del(&req->compl.list); __io_cqring_fill_event(req, req->result, req->cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= REQ_F_COMP_LOCKED; @@ -1439,9 +1452,10 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, io_cqring_add_event(req, res, cflags); io_put_req(req); } else { + io_clean_op(req); req->result = res; req->cflags = cflags; - list_add_tail(&req->list, &cs->list); + list_add_tail(&req->compl.list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); } @@ -1515,8 +1529,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file, static void io_dismantle_req(struct io_kiocb *req) { - if (req->flags & REQ_F_NEED_CLEANUP) - io_cleanup_req(req); + io_clean_op(req); if (req->io) kfree(req->io); @@ -5402,7 +5415,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } -static void io_cleanup_req(struct io_kiocb *req) +static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; From 540e32a0855e700affa29b1112bf2dbb1fa7702a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:09 +0300 Subject: [PATCH 089/127] io_uring: rename ctx->poll into ctx->iopoll It supports both polling and I/O polling. Rename ctx->poll to clearly show that it's only in I/O poll case. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3cadd5f963b74f..c8ebd227c837c6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -320,12 +320,12 @@ struct io_ring_ctx { spinlock_t completion_lock; /* - * ->poll_list is protected by the ctx->uring_lock for + * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. */ - struct list_head poll_list; + struct list_head iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_file; @@ -1064,7 +1064,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); - INIT_LIST_HEAD(&ctx->poll_list); + INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -2009,7 +2009,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min; ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, list) { struct kiocb *kiocb = &req->rw.kiocb; /* @@ -2051,7 +2051,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { - while (!list_empty(&ctx->poll_list) && !need_resched()) { + while (!list_empty(&ctx->iopoll_list) && !need_resched()) { int ret; ret = io_do_iopoll(ctx, nr_events, min); @@ -2074,7 +2074,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) return; mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->poll_list)) { + while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; io_do_iopoll(ctx, &nr_events, 0); @@ -2291,12 +2291,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (list_empty(&ctx->poll_list)) { + if (list_empty(&ctx->iopoll_list)) { ctx->poll_multi_file = false; } else if (!ctx->poll_multi_file) { struct io_kiocb *list_req; - list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, + list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, list); if (list_req->file != req->file) ctx->poll_multi_file = true; @@ -2307,9 +2307,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->list, &ctx->poll_list); + list_add(&req->list, &ctx->iopoll_list); else - list_add_tail(&req->list, &ctx->poll_list); + list_add_tail(&req->list, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sqo_wait)) @@ -6329,11 +6329,11 @@ static int io_sq_thread(void *data) while (!kthread_should_park()) { unsigned int to_submit; - if (!list_empty(&ctx->poll_list)) { + if (!list_empty(&ctx->iopoll_list)) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list) && !need_resched()) + if (!list_empty(&ctx->iopoll_list) && !need_resched()) io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; @@ -6362,7 +6362,7 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->poll_list) || need_resched() || + if (!list_empty(&ctx->iopoll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { io_run_task_work(); @@ -6375,13 +6375,13 @@ static int io_sq_thread(void *data) /* * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to poll_list, it - * is because reqs may have been punted to io worker and - * will be added to poll_list later, hence check the - * poll_list again. + * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. */ if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->poll_list)) { + !list_empty_careful(&ctx->iopoll_list)) { finish_wait(&ctx->sqo_wait, &wait); continue; } From d21ffe7eca82d47b489760899912f81e30456e2e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:10 +0300 Subject: [PATCH 090/127] io_uring: use inflight_entry list for iopoll'ing req->inflight_entry is used to track requests that grabbed files_struct. Let's share it with iopoll list, because the only iopoll'ed ops are reads and writes, which don't need a file table. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c8ebd227c837c6..8a89480a57ec60 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -651,6 +651,10 @@ struct io_kiocb { struct list_head link_list; + /* + * 1. used with ctx->iopoll_list with reads/writes + * 2. to track reqs with ->files (see io_op_def::file_table) + */ struct list_head inflight_entry; struct percpu_ref *fixed_file_refs; @@ -1943,8 +1947,8 @@ static void io_iopoll_queue(struct list_head *again) struct io_kiocb *req; do { - req = list_first_entry(again, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(again, struct io_kiocb, inflight_entry); + list_del(&req->inflight_entry); if (!io_rw_reissue(req, -EAGAIN)) io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); } while (!list_empty(again)); @@ -1967,13 +1971,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, while (!list_empty(done)) { int cflags = 0; - req = list_first_entry(done, struct io_kiocb, list); + req = list_first_entry(done, struct io_kiocb, inflight_entry); if (READ_ONCE(req->result) == -EAGAIN) { req->iopoll_completed = 0; - list_move_tail(&req->list, &again); + list_move_tail(&req->inflight_entry, &again); continue; } - list_del(&req->list); + list_del(&req->inflight_entry); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_kbuf(req); @@ -2009,7 +2013,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min; ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, list) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; /* @@ -2018,7 +2022,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * and complete those lists first, if we have entries there. */ if (READ_ONCE(req->iopoll_completed)) { - list_move_tail(&req->list, &done); + list_move_tail(&req->inflight_entry, &done); continue; } if (!list_empty(&done)) @@ -2030,7 +2034,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, /* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) - list_move_tail(&req->list, &done); + list_move_tail(&req->inflight_entry, &done); if (ret && spin) spin = false; @@ -2297,7 +2301,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) struct io_kiocb *list_req; list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, - list); + inflight_entry); if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -2307,9 +2311,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->list, &ctx->iopoll_list); + list_add(&req->inflight_entry, &ctx->iopoll_list); else - list_add_tail(&req->list, &ctx->iopoll_list); + list_add_tail(&req->inflight_entry, &ctx->iopoll_list); if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sqo_wait)) From 40d8ddd4facb80760d5a0c61a7cf026d5ff73ff0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:11 +0300 Subject: [PATCH 091/127] io_uring: use completion list for CQ overflow As with the completion path, also use compl.list for overflowed requests. If cleaned up properly, nobody needs per-op data there anymore. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a89480a57ec60..2122b37e68e324 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1339,8 +1339,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) break; req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, - list); - list_move(&req->list, &list); + compl.list); + list_move(&req->compl.list, &list); req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); @@ -1362,8 +1362,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) io_cqring_ev_posted(ctx); while (!list_empty(&list)) { - req = list_first_entry(&list, struct io_kiocb, list); - list_del(&req->list); + req = list_first_entry(&list, struct io_kiocb, compl.list); + list_del(&req->compl.list); io_put_req(req); } @@ -1396,11 +1396,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) set_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } + io_clean_op(req); req->flags |= REQ_F_OVERFLOW; - refcount_inc(&req->refs); req->result = res; req->cflags = cflags; - list_add_tail(&req->list, &ctx->cq_overflow_list); + refcount_inc(&req->refs); + list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } } @@ -7835,7 +7836,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (cancel_req->flags & REQ_F_OVERFLOW) { spin_lock_irq(&ctx->completion_lock); - list_del(&cancel_req->list); + list_del(&cancel_req->compl.list); cancel_req->flags &= ~REQ_F_OVERFLOW; if (list_empty(&ctx->cq_overflow_list)) { clear_bit(0, &ctx->sq_check_overflow); From 135fcde8496b03d31648171dbc038990112e41d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:12 +0300 Subject: [PATCH 092/127] io_uring: add req->timeout.list Instead of using shared req->list, hang timeouts up on their own list entry. struct io_timeout have enough extra space for it, but if that will be a problem ->inflight_entry can reused for that. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2122b37e68e324..2544795cfd3053 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -396,6 +396,7 @@ struct io_timeout { int flags; u32 off; u32 target_seq; + struct list_head list; }; struct io_rw { @@ -1213,7 +1214,7 @@ static void io_kill_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); - list_del_init(&req->list); + list_del_init(&req->timeout.list); req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); io_put_req(req); @@ -1225,7 +1226,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) struct io_kiocb *req, *tmp; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) io_kill_timeout(req); spin_unlock_irq(&ctx->completion_lock); } @@ -1248,7 +1249,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->timeout_list)) { struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, list); + struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1256,7 +1257,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) - atomic_read(&ctx->cq_timeouts)) break; - list_del_init(&req->list); + list_del_init(&req->timeout.list); io_kill_timeout(req); } } @@ -4997,8 +4998,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. */ - if (!list_empty(&req->list)) - list_del_init(&req->list); + if (!list_empty(&req->timeout.list)) + list_del_init(&req->timeout.list); io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); @@ -5015,9 +5016,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) struct io_kiocb *req; int ret = -ENOENT; - list_for_each_entry(req, &ctx->timeout_list, list) { + list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (user_data == req->user_data) { - list_del_init(&req->list); + list_del_init(&req->timeout.list); ret = 0; break; } @@ -5139,7 +5140,8 @@ static int io_timeout(struct io_kiocb *req) * the one we need first. */ list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, + timeout.list); if (io_is_timeout_noseq(nxt)) continue; @@ -5148,7 +5150,7 @@ static int io_timeout(struct io_kiocb *req) break; } add: - list_add(&req->list, entry); + list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); From 7d6ddea6beaf6639cf3a2b291dcdac6fe1edc584 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:13 +0300 Subject: [PATCH 093/127] io_uring: remove init for unused list poll*() doesn't use req->list, don't init it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2544795cfd3053..1e4ac48b15572a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4964,7 +4964,6 @@ static int io_poll_add(struct io_kiocb *req) req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); - INIT_LIST_HEAD(&req->list); ipt.pt._qproc = io_poll_queue_proc; mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, From 27dc8338e5fb0e0ed5b272e792f4ffad7f3bc03e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:14 +0300 Subject: [PATCH 094/127] io_uring: use non-intrusive list for defer The only left user of req->list is DRAIN, hence instead of keeping a separate per request list for it, do that with old fashion non-intrusive lists allocated on demand. That's a really slow path, so that's OK. This removes req->list and so sheds 16 bytes from io_kiocb. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1e4ac48b15572a..6e6e7131078566 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -641,7 +641,6 @@ struct io_kiocb { u16 buf_index; struct io_ring_ctx *ctx; - struct list_head list; unsigned int flags; refcount_t refs; struct task_struct *task; @@ -676,6 +675,11 @@ struct io_kiocb { struct callback_head task_work; }; +struct io_defer_entry { + struct list_head list; + struct io_kiocb *req; +}; + #define IO_IOPOLL_BATCH 8 struct io_comp_state { @@ -1234,14 +1238,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) static void __io_queue_deferred(struct io_ring_ctx *ctx) { do { - struct io_kiocb *req = list_first_entry(&ctx->defer_list, - struct io_kiocb, list); + struct io_defer_entry *de = list_first_entry(&ctx->defer_list, + struct io_defer_entry, list); - if (req_need_defer(req)) + if (req_need_defer(de->req)) break; - list_del_init(&req->list); + list_del_init(&de->list); /* punt-init is done before queueing for defer */ - __io_queue_async_work(req); + __io_queue_async_work(de->req); + kfree(de); } while (!list_empty(&ctx->defer_list)); } @@ -5394,6 +5399,7 @@ static int io_req_defer_prep(struct io_kiocb *req, static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; + struct io_defer_entry *de; int ret; /* Still need defer if there is pending req in defer list. */ @@ -5408,15 +5414,20 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } io_prep_async_link(req); + de = kmalloc(sizeof(*de), GFP_KERNEL); + if (!de) + return -ENOMEM; spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); + kfree(de); return 0; } trace_io_uring_defer(ctx, req, req->user_data); - list_add_tail(&req->list, &ctx->defer_list); + de->req = req; + list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } From 9cf7c104deaef52d6fd7c103a716e31d9815ede8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:15 +0300 Subject: [PATCH 095/127] io_uring: remove sequence from io_kiocb req->sequence is used only for deferred (i.e. DRAIN) requests, but initialised for every request. Remove req->sequence from io_kiocb together with its initialisation in io_init_req(). Replace it with a new field in struct io_defer_entry, that will be calculated only when needed in io_req_defer(), which is a slow path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e6e7131078566..efa132831f3d28 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -639,6 +639,7 @@ struct io_kiocb { u8 iopoll_completed; u16 buf_index; + u32 result; struct io_ring_ctx *ctx; unsigned int flags; @@ -646,8 +647,6 @@ struct io_kiocb { struct task_struct *task; unsigned long fsize; u64 user_data; - u32 result; - u32 sequence; struct list_head link_list; @@ -678,6 +677,7 @@ struct io_kiocb { struct io_defer_entry { struct list_head list; struct io_kiocb *req; + u32 seq; }; #define IO_IOPOLL_BATCH 8 @@ -1090,13 +1090,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return NULL; } -static inline bool req_need_defer(struct io_kiocb *req) +static bool req_need_defer(struct io_kiocb *req, u32 seq) { if (unlikely(req->flags & REQ_F_IO_DRAIN)) { struct io_ring_ctx *ctx = req->ctx; - return req->sequence != ctx->cached_cq_tail - + atomic_read(&ctx->cached_cq_overflow); + return seq != ctx->cached_cq_tail + + atomic_read(&ctx->cached_cq_overflow); } return false; @@ -1241,7 +1241,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req)) + if (req_need_defer(de->req, de->seq)) break; list_del_init(&de->list); /* punt-init is done before queueing for defer */ @@ -5396,14 +5396,35 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } +static u32 io_get_sequence(struct io_kiocb *req) +{ + struct io_kiocb *pos; + struct io_ring_ctx *ctx = req->ctx; + u32 total_submitted, nr_reqs = 1; + + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(pos, &req->link_list, link_list) + nr_reqs++; + + total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; + return total_submitted - nr_reqs; +} + static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; + u32 seq; /* Still need defer if there is pending req in defer list. */ - if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list)) + if (likely(list_empty_careful(&ctx->defer_list) && + !(req->flags & REQ_F_IO_DRAIN))) + return 0; + + seq = io_get_sequence(req); + /* Still a chance to pass the sequence check */ + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; if (!req->io) { @@ -5419,7 +5440,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; spin_lock_irq(&ctx->completion_lock); - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { + if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(de); return 0; @@ -5427,6 +5448,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) trace_io_uring_defer(ctx, req, req->user_data); de->req = req; + de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; @@ -6204,12 +6226,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags; int id; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped; req->opcode = READ_ONCE(sqe->opcode); req->user_data = READ_ONCE(sqe->user_data); req->io = NULL; From 0f7e466b393abab86be96ffcf00af383afddc0d1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 13 Jul 2020 23:37:16 +0300 Subject: [PATCH 096/127] io_uring: place cflags into completion data req->cflags is used only for defer-completion path, just use completion data to store it. With the 4 bytes from the ->sequence patch and compacting io_kiocb, this frees 8 bytes. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index efa132831f3d28..4d0fd9ddd3dc2a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -491,6 +491,7 @@ struct io_statx { struct io_completion { struct file *file; struct list_head list; + int cflags; }; struct io_async_connect { @@ -633,7 +634,6 @@ struct io_kiocb { }; struct io_async_ctx *io; - int cflags; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; @@ -1351,7 +1351,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); - WRITE_ONCE(cqe->flags, req->cflags); + WRITE_ONCE(cqe->flags, req->compl.cflags); } else { WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); @@ -1405,7 +1405,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) io_clean_op(req); req->flags |= REQ_F_OVERFLOW; req->result = res; - req->cflags = cflags; + req->compl.cflags = cflags; refcount_inc(&req->refs); list_add_tail(&req->compl.list, &ctx->cq_overflow_list); } @@ -1439,7 +1439,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs) req = list_first_entry(&cs->list, struct io_kiocb, compl.list); list_del(&req->compl.list); - __io_cqring_fill_event(req, req->result, req->cflags); + __io_cqring_fill_event(req, req->result, req->compl.cflags); if (!(req->flags & REQ_F_LINK_HEAD)) { req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -1465,7 +1465,7 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags, } else { io_clean_op(req); req->result = res; - req->cflags = cflags; + req->compl.cflags = cflags; list_add_tail(&req->compl.list, &cs->list); if (++cs->nr >= 32) io_submit_flush_completions(cs); From dca9cf8b87f55c96f072c1fc6bc90e2b97a8e19f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:49 +0300 Subject: [PATCH 097/127] io_uring: inline io_req_work_grab_env() The only caller of io_req_work_grab_env() is io_prep_async_work(), and they are both initialising req->work. Inline grab_env(), it's easier to keep this way, moreover there already were bugs with misplacing io_req_init_async(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 50 ++++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4d0fd9ddd3dc2a..a06d5b9cc04647 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1115,31 +1115,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static void io_req_work_grab_env(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - io_req_init_async(req); - - if (!req->work.mm && def->needs_mm) { - mmgrab(current->mm); - req->work.mm = current->mm; - } - if (!req->work.creds) - req->work.creds = get_current_cred(); - if (!req->work.fs && def->needs_fs) { - spin_lock(¤t->fs->lock); - if (!current->fs->in_exec) { - req->work.fs = current->fs; - req->work.fs->users++; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(¤t->fs->lock); - } -} - -static inline void io_req_work_drop_env(struct io_kiocb *req) +static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) return; @@ -1177,8 +1153,22 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - io_req_work_grab_env(req); + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); + if (!req->work.fs && def->needs_fs) { + spin_lock(¤t->fs->lock); + if (!current->fs->in_exec) { + req->work.fs = current->fs; + req->work.fs->users++; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } + spin_unlock(¤t->fs->lock); + } } static void io_prep_async_link(struct io_kiocb *req) @@ -1547,7 +1537,7 @@ static void io_dismantle_req(struct io_kiocb *req) if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); __io_put_req_task(req); - io_req_work_drop_env(req); + io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -4825,7 +4815,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_put_req(req); /* * restore ->work because we will call - * io_req_work_drop_env below when dropping the + * io_req_clean_work below when dropping the * final reference. */ if (req->flags & REQ_F_WORK_INITIALIZED) @@ -4965,7 +4955,7 @@ static int io_poll_add(struct io_kiocb *req) __poll_t mask; /* ->work is in union with hash_node and others */ - io_req_work_drop_env(req); + io_req_clean_work(req); req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); From 1c2da9e8839d6437b43f2c805411d1a0cbd70165 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:50 +0300 Subject: [PATCH 098/127] io_uring: remove empty cleanup of OP_OPEN* reqs A switch in __io_clean_op() doesn't have default, it's pointless to list opcodes that doesn't do any cleanup. Remove IORING_OP_OPEN* from there. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a06d5b9cc04647..8d6f1c4e8dac5a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5473,9 +5473,6 @@ static void __io_clean_op(struct io_kiocb *req) if (req->flags & REQ_F_BUFFER_SELECTED) kfree(req->sr_msg.kbuf); break; - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - break; case IORING_OP_SPLICE: case IORING_OP_TEE: io_put_file(req, req->splice.file_in, From 327d6d968b195cfc48ff97c49b56520aac922f65 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:51 +0300 Subject: [PATCH 099/127] io_uring: alloc ->io in io_req_defer_prep() Every call to io_req_defer_prep() is prepended with allocating ->io, just do that in the function. And while we're at it, mark error paths with unlikey and replace "if (ret < 0)" with "if (ret)". There is only one change in the observable behaviour, that's instead of killing the head request right away on error, it postpones it until the link is assembled, that looks more preferable. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8d6f1c4e8dac5a..6a1cd2aea0185d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5279,6 +5279,9 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; + if (io_alloc_async_ctx(req)) + return -EAGAIN; + if (io_op_defs[req->opcode].file_table) { io_req_init_async(req); ret = io_grab_files(req); @@ -5418,10 +5421,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) - return -EAGAIN; ret = io_req_defer_prep(req, sqe); - if (ret < 0) + if (ret) return ret; } io_prep_async_link(req); @@ -6024,11 +6025,8 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } } else if (req->flags & REQ_F_FORCE_ASYNC) { if (!req->io) { - ret = -EAGAIN; - if (io_alloc_async_ctx(req)) - goto fail_req; ret = io_req_defer_prep(req, sqe); - if (unlikely(ret < 0)) + if (unlikely(ret)) goto fail_req; } @@ -6081,11 +6079,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) { + if (unlikely(ret)) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; return ret; @@ -6108,11 +6103,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_LINK_HEAD; INIT_LIST_HEAD(&req->link_list); - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) + if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; *link = req; } else { From 57f1a64958543fe18a7fe0addbfb31bb2ceeaea2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jul 2020 12:46:52 +0300 Subject: [PATCH 100/127] io_uring/io-wq: move RLIMIT_FSIZE to io-wq RLIMIT_SIZE in needed only for execution from an io-wq context, hence move all preparations from hot path to io-wq work setup. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 1 + fs/io-wq.h | 1 + fs/io_uring.c | 22 +++++++++------------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 72f759e1d6eb0b..8702d3c3b291a0 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -462,6 +462,7 @@ static void io_impersonate_work(struct io_worker *worker, io_wq_switch_mm(worker, work); if (worker->cur_creds != work->creds) io_wq_switch_creds(worker, work); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize; } static void io_assign_current_work(struct io_worker *worker, diff --git a/fs/io-wq.h b/fs/io-wq.h index 114f12ec2d6572..ddaf9614cf9bc1 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -89,6 +89,7 @@ struct io_wq_work { struct mm_struct *mm; const struct cred *creds; struct fs_struct *fs; + unsigned long fsize; unsigned flags; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a1cd2aea0185d..8b2f7a1bbd06d5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -645,7 +645,6 @@ struct io_kiocb { unsigned int flags; refcount_t refs; struct task_struct *task; - unsigned long fsize; u64 user_data; struct list_head link_list; @@ -736,6 +735,7 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; + unsigned needs_fsize : 1; }; static const struct io_op_def io_op_defs[] = { @@ -755,6 +755,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -769,6 +770,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -821,6 +823,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .needs_fsize = 1, }, [IORING_OP_OPENAT] = { .file_table = 1, @@ -852,6 +855,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .needs_fsize = 1, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -1169,6 +1173,10 @@ static void io_prep_async_work(struct io_kiocb *req) } spin_unlock(¤t->fs->lock); } + if (def->needs_fsize) + req->work.fsize = rlimit(RLIMIT_FSIZE); + else + req->work.fsize = RLIM_INFINITY; } static void io_prep_async_link(struct io_kiocb *req) @@ -3072,8 +3080,6 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - req->fsize = rlimit(RLIMIT_FSIZE); - /* either don't need iovec imported or already have it */ if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -3130,17 +3136,11 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, } kiocb->ki_flags |= IOCB_WRITE; - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; - if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. @@ -3335,7 +3335,6 @@ static int io_fallocate_prep(struct io_kiocb *req, req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); req->sync.mode = READ_ONCE(sqe->len); - req->fsize = rlimit(RLIMIT_FSIZE); return 0; } @@ -3346,11 +3345,8 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) /* fallocate always requiring blocking context */ if (force_nonblock) return -EAGAIN; - - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); io_req_complete(req, ret); From 06ef3608b0eed673fcbc62cf74c8d3ad0007a337 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:33 +0300 Subject: [PATCH 101/127] io_uring: simplify file ref tracking in submission state Currently, file refs in struct io_submit_state are tracked with 2 vars: @has_refs -- how many refs were initially taken @used_refs -- number of refs used Replace it with a single variable counting how many refs left at the current moment. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8b2f7a1bbd06d5..28b47533454a5a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -707,7 +707,6 @@ struct io_submit_state { struct file *file; unsigned int fd; unsigned int has_refs; - unsigned int used_refs; unsigned int ios_left; }; @@ -2327,10 +2326,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req) static void __io_state_file_put(struct io_submit_state *state) { - int diff = state->has_refs - state->used_refs; - - if (diff) - fput_many(state->file, diff); + if (state->has_refs) + fput_many(state->file, state->has_refs); state->file = NULL; } @@ -2352,7 +2349,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (state->file) { if (state->fd == fd) { - state->used_refs++; + state->has_refs--; state->ios_left--; return state->file; } @@ -2363,9 +2360,8 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) return NULL; state->fd = fd; - state->has_refs = state->ios_left; - state->used_refs = 1; state->ios_left--; + state->has_refs = state->ios_left; return state->file; } From 7a7cacba8b4560403615b04d57bdcd1f93f90f10 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:27:59 +0300 Subject: [PATCH 102/127] io_uring: indent left {send,recv}[msg]() Flip over "if (sock)" condition with return on error, the upper layer will take care. That change will be handy later, but already removes an extra jump from hot path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 265 +++++++++++++++++++++++++------------------------- 1 file changed, 131 insertions(+), 134 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 28b47533454a5a..264b1e5e2d5408 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3916,42 +3916,41 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg = NULL; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_async_msghdr iomsg; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + if (unlikely(!sock)) + return ret; - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; } + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -3964,39 +3963,38 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, static int io_send(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, - &msg.msg_iter); - if (ret) - return ret; + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; if (ret < 0) req_set_fail_links(req); @@ -4149,62 +4147,62 @@ static int io_recvmsg_prep(struct io_kiocb *req, static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg = NULL; struct socket *sock; + struct io_buffer *kbuf; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_buffer *kbuf; - struct io_async_msghdr iomsg; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } + if (unlikely(!sock)) + return ret; - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) { - return PTR_ERR(kbuf); - } else if (kbuf) { - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, - 1, req->sr_msg.len); - } + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) { + return PTR_ERR(kbuf); + } else if (kbuf) { + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + 1, req->sr_msg.len); + } - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, - kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - ret = io_setup_async_msg(req, kmsg); - if (ret != -EAGAIN) - kfree(kbuf); - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (kbuf) + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, + kmsg->uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + ret = io_setup_async_msg(req, kmsg); + if (ret != -EAGAIN) kfree(kbuf); + return ret; } + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (kbuf) + kfree(kbuf); if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); @@ -4215,51 +4213,50 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { struct io_buffer *kbuf = NULL; + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + void __user *buf = sr->buf; struct socket *sock; + struct iovec iov; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - void __user *buf = sr->buf; - struct msghdr msg; - struct iovec iov; - unsigned flags; - - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - else if (kbuf) - buf = u64_to_user_ptr(kbuf->addr); + if (unlikely(!sock)) + return ret; - ret = import_single_range(READ, buf, sr->len, &iov, - &msg.msg_iter); - if (ret) { - kfree(kbuf); - return ret; - } + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + else if (kbuf) + buf = u64_to_user_ptr(kbuf->addr); - req->flags |= REQ_F_NEED_CLEANUP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = sock_recvmsg(sock, &msg, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; + ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) { + kfree(kbuf); + return ret; } + req->flags |= REQ_F_NEED_CLEANUP; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = sock_recvmsg(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) From 6b754c8b912a164fbb15b7b839d51709c3d9ee6f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:00 +0300 Subject: [PATCH 103/127] io_uring: remove extra checks in send/recv With the return on a bad socket, kmsg is always non-null by the end of the function, prune left extra checks and initialisations. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 264b1e5e2d5408..ac3c16ea7d23d1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3916,7 +3916,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr iomsg, *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; unsigned flags; int ret; @@ -3951,7 +3951,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) @@ -4147,7 +4147,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_async_msghdr iomsg, *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; struct io_buffer *kbuf; unsigned flags; @@ -4199,7 +4199,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (kbuf) kfree(kbuf); - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4212,7 +4212,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, static int io_recv(struct io_kiocb *req, bool force_nonblock, struct io_comp_state *cs) { - struct io_buffer *kbuf = NULL; + struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; void __user *buf = sr->buf; From 14c32eee9286621dd437b53460e44bd11e5bc08d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:01 +0300 Subject: [PATCH 104/127] io_uring: don't forget cflags in io_recv() Instead of returning error from io_recv(), go through generic cleanup path, because it'll retain cflags for userspace. Do the same for io_send() for consistency. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ac3c16ea7d23d1..2ffacfbf9094ac 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3976,7 +3976,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) - return ret; + return ret;; msg.msg_name = NULL; msg.msg_control = NULL; @@ -4232,10 +4232,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, buf = u64_to_user_ptr(kbuf->addr); ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) { - kfree(kbuf); - return ret; - } + if (unlikely(ret)) + goto out_free; req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; @@ -4256,7 +4254,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; - +out_free: kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) From 0e1b6fe3d1e5f1b79c5bec37881c98febfba7718 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:02 +0300 Subject: [PATCH 105/127] io_uring: free selected-bufs if error'ed io_clean_op() may be skipped even if there is a selected io_buffer, that's because *select_buffer() funcions never set REQ_F_NEED_CLEANUP. Trigger io_clean_op() when REQ_F_BUFFER_SELECTED is set as well, and and clear the flag if was freed out of it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 85 ++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2ffacfbf9094ac..4448b1e9a75489 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -957,7 +957,7 @@ static void io_get_req_task(struct io_kiocb *req) static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_NEED_CLEANUP) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) __io_clean_op(req); } @@ -1931,6 +1931,7 @@ static int io_put_kbuf(struct io_kiocb *req) cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; cflags |= IORING_CQE_F_BUFFER; req->rw.addr = 0; + req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(kbuf); return cflags; } @@ -4188,20 +4189,16 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - ret = io_setup_async_msg(req, kmsg); - if (ret != -EAGAIN) - kfree(kbuf); - return ret; - } + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (kbuf) kfree(kbuf); - if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); - req->flags &= ~REQ_F_NEED_CLEANUP; + req->flags &= ~(REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED); if (ret < 0) req_set_fail_links(req); @@ -4235,7 +4232,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (unlikely(ret)) goto out_free; - req->flags |= REQ_F_NEED_CLEANUP; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; @@ -4255,7 +4251,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; out_free: - kfree(kbuf); + if (kbuf) + kfree(kbuf); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); @@ -5436,39 +5433,45 @@ static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - if (req->flags & REQ_F_BUFFER_SELECTED) + if (req->flags & REQ_F_BUFFER_SELECTED) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: kfree((void *)(unsigned long)req->rw.addr); - /* fallthrough */ - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); - break; - case IORING_OP_RECVMSG: - if (req->flags & REQ_F_BUFFER_SELECTED) - kfree(req->sr_msg.kbuf); - /* fallthrough */ - case IORING_OP_SENDMSG: - if (io->msg.iov != io->msg.fast_iov) - kfree(io->msg.iov); - break; - case IORING_OP_RECV: - if (req->flags & REQ_F_BUFFER_SELECTED) + break; + case IORING_OP_RECVMSG: + case IORING_OP_RECV: kfree(req->sr_msg.kbuf); - break; - case IORING_OP_SPLICE: - case IORING_OP_TEE: - io_put_file(req, req->splice.file_in, - (req->splice.flags & SPLICE_F_FD_IN_FIXED)); - break; + break; + } + req->flags &= ~REQ_F_BUFFER_SELECTED; + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (io->rw.iov != io->rw.fast_iov) + kfree(io->rw.iov); + break; + case IORING_OP_RECVMSG: + case IORING_OP_SENDMSG: + if (io->msg.iov != io->msg.fast_iov) + kfree(io->msg.iov); + break; + case IORING_OP_SPLICE: + case IORING_OP_TEE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; + } + req->flags &= ~REQ_F_NEED_CLEANUP; } - - req->flags &= ~REQ_F_NEED_CLEANUP; } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, From bc02ef3325e3ef524ef29b65681ca4207b781224 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:03 +0300 Subject: [PATCH 106/127] io_uring: move BUFFER_SELECT check into *recv[msg] Move REQ_F_BUFFER_SELECT flag check out of io_recv_buffer_select(), and do that in its call sites That saves us from double error checking and possibly an extra function call. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4448b1e9a75489..8dd9037e332ec1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4098,9 +4098,6 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, struct io_sr_msg *sr = &req->sr_msg; struct io_buffer *kbuf; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return NULL; - kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); if (IS_ERR(kbuf)) return kbuf; @@ -4150,7 +4147,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, { struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - struct io_buffer *kbuf; + struct io_buffer *kbuf = NULL; unsigned flags; int ret, cflags = 0; @@ -4172,10 +4169,10 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, kmsg = &iomsg; } - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) { - return PTR_ERR(kbuf); - } else if (kbuf) { + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, 1, req->sr_msg.len); @@ -4222,11 +4219,12 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (unlikely(!sock)) return ret; - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - else if (kbuf) + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); + } ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) From 8ff069bf2efd7b7aeb90b56ea8edc165c93d8940 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:04 +0300 Subject: [PATCH 107/127] io_uring: extract io_put_kbuf() helper Extract a common helper for cleaning up a selected buffer, this will be used shortly. By the way, correct cflags types to unsigned and, as kbufs are anyway tracked by a flag, remove useless zeroing req->rw.addr. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8dd9037e332ec1..871ada2a29c3ed 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1922,20 +1922,25 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } -static int io_put_kbuf(struct io_kiocb *req) +static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) { - struct io_buffer *kbuf; - int cflags; + unsigned int cflags; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; cflags |= IORING_CQE_F_BUFFER; - req->rw.addr = 0; req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(kbuf); return cflags; } +static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) +{ + struct io_buffer *kbuf; + + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; + return io_put_kbuf(req, kbuf); +} + static inline bool io_run_task_work(void) { if (current->task_works) { @@ -1985,7 +1990,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, list_del(&req->inflight_entry); if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_kbuf(req); + cflags = io_put_rw_kbuf(req); __io_cqring_fill_event(req, req->result, cflags); (*nr_events)++; @@ -2177,7 +2182,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res, if (res != req->result) req_set_fail_links(req); if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_kbuf(req); + cflags = io_put_rw_kbuf(req); __io_req_complete(req, res, cflags, cs); } From 7fbb1b541f4286cc337b9bca1e5bad0ce4ee978c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jul 2020 23:28:05 +0300 Subject: [PATCH 108/127] io_uring: don't open-code recv kbuf managment Don't implement fast path of kbuf freeing and management inlined into io_recv{,msg}(), that's error prone and duplicates handling. Replace it with a helper io_put_recv_kbuf(), which mimics io_put_rw_kbuf() in the io_read/write(). This also keeps cflags calculation in one place, removing duplication between rw and recv/send. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 871ada2a29c3ed..6e5ea7991c0813 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4098,7 +4098,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - int *cflags, bool needs_lock) + bool needs_lock) { struct io_sr_msg *sr = &req->sr_msg; struct io_buffer *kbuf; @@ -4109,12 +4109,14 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, sr->kbuf = kbuf; req->flags |= REQ_F_BUFFER_SELECTED; - - *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - *cflags |= IORING_CQE_F_BUFFER; return kbuf; } +static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) +{ + return io_put_kbuf(req, req->sr_msg.kbuf); +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4152,7 +4154,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, { struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - struct io_buffer *kbuf = NULL; + struct io_buffer *kbuf; unsigned flags; int ret, cflags = 0; @@ -4175,7 +4177,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, } if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + kbuf = io_recv_buffer_select(req, !force_nonblock); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); @@ -4196,12 +4198,11 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; - if (kbuf) - kfree(kbuf); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); - req->flags &= ~(REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED); - + req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); @@ -4225,7 +4226,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + kbuf = io_recv_buffer_select(req, !force_nonblock); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); @@ -4254,9 +4255,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, if (ret == -ERESTARTSYS) ret = -EINTR; out_free: - if (kbuf) - kfree(kbuf); - req->flags &= ~REQ_F_NEED_CLEANUP; + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (ret < 0) req_set_fail_links(req); __io_req_complete(req, ret, cflags, cs); From 5dbcad51f78434e782d0470b8b5fc4380700c35f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:31:20 +0300 Subject: [PATCH 109/127] io_uring: don't miscount pinned memory io_sqe_buffer_unregister() uses cxt->sqo_mm for memory accounting, but io_ring_ctx_free() drops ->sqo_mm before leaving pinned_vm over-accounted. Postpone mm cleanup for when it's not needed anymore. Fixes: 309758254ea62 ("io_uring: report pinned memory usage") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e5ea7991c0813..ba7ce103667b5d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7670,12 +7670,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) static void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_finish_async(ctx); + io_sqe_buffer_unregister(ctx); if (ctx->sqo_mm) { mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; } - io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); From cbcf72148da4af55ea81cfb351ea7c026ff1014f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:31:21 +0300 Subject: [PATCH 110/127] io_uring: return locked and pinned page accounting Locked and pinned memory accounting in io_{,un}account_mem() depends on having ->sqo_mm, which is NULL after a recent change for non SQPOLL'ed io_ring. That disables the accounting. Return ->sqo_mm initialisation back, and do __io_sq_thread_acquire_mm() based on IORING_SETUP_SQPOLL flag. Fixes: 8eb06d7e8dd85 ("io_uring: fix missing ->mm on exit") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ba7ce103667b5d..680b16f71a036f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -981,7 +981,8 @@ static void io_sq_thread_drop_mm(void) static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { if (!current->mm) { - if (unlikely(!ctx->sqo_mm || !mmget_not_zero(ctx->sqo_mm))) + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || + !mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); } @@ -7259,10 +7260,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - if (ctx->flags & IORING_SETUP_SQPOLL) { - mmgrab(current->mm); - ctx->sqo_mm = current->mm; + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto err; From dd6f843a9fca8f225c86fee5f50da429c369c045 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:32:51 +0300 Subject: [PATCH 111/127] tasks: add put_task_struct_many() put_task_struct_many() is as put_task_struct() but puts several references at once. Useful to batching it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/sched/task.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 38359071236ad7..1301077f9c24cd 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -126,6 +126,12 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } +static inline void put_task_struct_many(struct task_struct *t, int nr) +{ + if (refcount_sub_and_test(nr, &t->usage)) + __put_task_struct(t); +} + void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT From 5af1d13e8f0d8839db04a71ec786f369b0e67234 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 18 Jul 2020 11:32:52 +0300 Subject: [PATCH 112/127] io_uring: batch put_task_struct() As every iopoll request have a task ref, it becomes expensive to put them one by one, instead we can put several at once integrating that into io_req_free_batch(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 680b16f71a036f..3a415d924b93e8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1544,7 +1544,6 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - __io_put_req_task(req); io_req_clean_work(req); if (req->flags & REQ_F_INFLIGHT) { @@ -1564,6 +1563,7 @@ static void __io_free_req(struct io_kiocb *req) struct io_ring_ctx *ctx; io_dismantle_req(req); + __io_put_req_task(req); ctx = req->ctx; if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -1807,8 +1807,18 @@ static void io_free_req(struct io_kiocb *req) struct req_batch { void *reqs[IO_IOPOLL_BATCH]; int to_free; + + struct task_struct *task; + int task_refs; }; +static inline void io_init_req_batch(struct req_batch *rb) +{ + rb->to_free = 0; + rb->task_refs = 0; + rb->task = NULL; +} + static void __io_req_free_batch_flush(struct io_ring_ctx *ctx, struct req_batch *rb) { @@ -1822,6 +1832,10 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, { if (rb->to_free) __io_req_free_batch_flush(ctx, rb); + if (rb->task) { + put_task_struct_many(rb->task, rb->task_refs); + rb->task = NULL; + } } static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) @@ -1833,6 +1847,17 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) if (req->flags & REQ_F_LINK_HEAD) io_queue_next(req); + if (req->flags & REQ_F_TASK_PINNED) { + if (req->task != rb->task) { + if (rb->task) + put_task_struct_many(rb->task, rb->task_refs); + rb->task = req->task; + rb->task_refs = 0; + } + rb->task_refs++; + req->flags &= ~REQ_F_TASK_PINNED; + } + io_dismantle_req(req); rb->reqs[rb->to_free++] = req; if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) @@ -1978,7 +2003,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, /* order with ->result store in io_complete_rw_iopoll() */ smp_rmb(); - rb.to_free = 0; + io_init_req_batch(&rb); while (!list_empty(done)) { int cflags = 0; From 23b3628e45924419399da48c2b3a522b05557c91 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 23 Jul 2020 20:57:24 +0800 Subject: [PATCH 113/127] io_uring: clear IORING_SQ_NEED_WAKEUP after executing task works In io_sq_thread(), if there are task works to handle, current codes will skip schedule() and go on polling sq again, but forget to clear IORING_SQ_NEED_WAKEUP flag, fix this issue. Also add two helpers to set and clear IORING_SQ_NEED_WAKEUP flag, Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3a415d924b93e8..6f3f18a99f4f42 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6344,6 +6344,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return submitted; } +static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) +{ + /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + +static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; @@ -6417,10 +6432,7 @@ static int io_sq_thread(void *data) continue; } - /* Tell userspace we may need a wakeup call */ - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_set_wakeup_flag(ctx); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -6430,6 +6442,7 @@ static int io_sq_thread(void *data) } if (io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); + io_ring_clear_wakeup_flag(ctx); continue; } if (signal_pending(current)) @@ -6437,17 +6450,13 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_clear_wakeup_flag(ctx); ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + io_ring_clear_wakeup_flag(ctx); } mutex_lock(&ctx->uring_lock); From ae34817bd93e373a03203a4c6892735c430a14e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jul 2020 20:25:20 +0300 Subject: [PATCH 114/127] io_uring: don't do opcode prep twice Calling into opcode prep handlers may be dangerous, as they re-read SQE but might not re-initialise requests completely. If io_req_defer() passed fast checks and is done with preparations, punt it async. As all other cases are covered with nulling @sqe, this guarantees that io_[opcode]_prep() are visited only once per request. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f3f18a99f4f42..38e4c39029638c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5447,7 +5447,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(de); - return 0; + io_queue_async_work(req); + return -EIOCBQUEUED; } trace_io_uring_defer(ctx, req, req->user_data); From f56040b81999871973d21f334b4657957422c90e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jul 2020 20:25:21 +0300 Subject: [PATCH 115/127] io_uring: deduplicate io_grab_files() calls Move io_req_init_async() into io_grab_files(), it's safer this way. Note that io_queue_async_work() does *init_async(), so it's valid to move out of __io_queue_sqe() punt path. Also, add a helper around io_grab_files(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 38e4c39029638c..c7e8e9a1b27bd9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -912,7 +912,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); -static int io_grab_files(struct io_kiocb *req); +static int io_prep_work_files(struct io_kiocb *req); static void io_complete_rw_common(struct kiocb *kiocb, long res, struct io_comp_state *cs); static void __io_clean_op(struct io_kiocb *req); @@ -5294,13 +5294,9 @@ static int io_req_defer_prep(struct io_kiocb *req, if (io_alloc_async_ctx(req)) return -EAGAIN; - - if (io_op_defs[req->opcode].file_table) { - io_req_init_async(req); - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } + ret = io_prep_work_files(req); + if (unlikely(ret)) + return ret; switch (req->opcode) { case IORING_OP_NOP: @@ -5851,6 +5847,8 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + io_req_init_async(req); + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; if (!ctx->ring_file) @@ -5876,6 +5874,13 @@ static int io_grab_files(struct io_kiocb *req) return ret; } +static inline int io_prep_work_files(struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].file_table) + return 0; + return io_grab_files(req); +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5987,14 +5992,9 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, goto exit; } punt: - io_req_init_async(req); - - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (ret) - goto err; - } - + ret = io_prep_work_files(req); + if (unlikely(ret)) + goto err; /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. From b65e0dd6a2de050d3fc4c0db4969a245f4e7273e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jul 2020 14:41:58 +0300 Subject: [PATCH 116/127] io_uring: mark ->work uninitialised after cleanup Remove REQ_F_WORK_INITIALIZED after io_req_clean_work(). That's a cold path but is safer for those using io_req_clean_work() out of *dismantle_req()/*io_free(). And for the same reason zero work.fs Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c7e8e9a1b27bd9..59f1f473ffc720 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1141,7 +1141,9 @@ static void io_req_clean_work(struct io_kiocb *req) spin_unlock(&req->work.fs->lock); if (fs) free_fs_struct(fs); + req->work.fs = NULL; } + req->flags &= ~REQ_F_WORK_INITIALIZED; } static void io_prep_async_work(struct io_kiocb *req) @@ -4969,7 +4971,6 @@ static int io_poll_add(struct io_kiocb *req) /* ->work is in union with hash_node and others */ io_req_clean_work(req); - req->flags &= ~REQ_F_WORK_INITIALIZED; INIT_HLIST_NODE(&req->hash_node); ipt.pt._qproc = io_poll_queue_proc; From f063c5477eb392c315aa25ad538b4920b367ea05 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jul 2020 14:41:59 +0300 Subject: [PATCH 117/127] io_uring: fix missing io_queue_linked_timeout() Whoever called io_prep_linked_timeout() should also do io_queue_linked_timeout(). __io_queue_sqe() doesn't follow that for the punting path leaving linked timeouts prepared but never queued. Fixes: 6df1db6b54243 ("io_uring: fix mis-refcounting linked timeouts") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 59f1f473ffc720..3e406bc1f85501 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5987,20 +5987,20 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, * doesn't support non-blocking read/write attempts */ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - if (io_arm_poll_handler(req)) { - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - goto exit; - } + if (!io_arm_poll_handler(req)) { punt: - ret = io_prep_work_files(req); - if (unlikely(ret)) - goto err; - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); + ret = io_prep_work_files(req); + if (unlikely(ret)) + goto err; + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); + } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); goto exit; } From b089ed390b5c9bc248a32168709cfa01099caf9d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jul 2020 14:42:00 +0300 Subject: [PATCH 118/127] io-wq: update hash bits Linked requests are hashed, remove a comment stating otherwise. Also move hash bits to emphasise that we don't carry it through loop iteration and set it every time. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 8702d3c3b291a0..e92c4724480cad 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -490,7 +490,6 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; - unsigned int hash; get_next: /* * If we got some work, mark us as busy. If we didn't, but @@ -513,6 +512,7 @@ static void io_worker_handle_work(struct io_worker *worker) /* handle a whole dependent link */ do { struct io_wq_work *old_work, *next_hashed, *linked; + unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); io_impersonate_work(worker, work); @@ -523,7 +523,6 @@ static void io_worker_handle_work(struct io_worker *worker) if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; - hash = io_get_work_hash(work); old_work = work; linked = wq->do_work(work); @@ -542,8 +541,6 @@ static void io_worker_handle_work(struct io_worker *worker) spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; - /* dependent work is not hashed */ - hash = -1U; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; From 010e8e6be2194678f7e4bb3044c088bbee779f57 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:45 +0300 Subject: [PATCH 119/127] io_uring: de-unionise io_kiocb As io_kiocb have enough space, move ->work out of a union. It's safer this way and removes ->work memcpy bouncing. By the way make tabulation in struct io_kiocb consistent. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 59 ++++++++++++--------------------------------------- 1 file changed, 14 insertions(+), 45 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3e406bc1f85501..86ec5669fe50be 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -600,7 +600,6 @@ enum { struct async_poll { struct io_poll_iocb poll; struct io_poll_iocb *double_poll; - struct io_wq_work work; }; /* @@ -641,36 +640,26 @@ struct io_kiocb { u16 buf_index; u32 result; - struct io_ring_ctx *ctx; - unsigned int flags; - refcount_t refs; - struct task_struct *task; - u64 user_data; + struct io_ring_ctx *ctx; + unsigned int flags; + refcount_t refs; + struct task_struct *task; + u64 user_data; - struct list_head link_list; + struct list_head link_list; /* * 1. used with ctx->iopoll_list with reads/writes * 2. to track reqs with ->files (see io_op_def::file_table) */ - struct list_head inflight_entry; - - struct percpu_ref *fixed_file_refs; - - union { - /* - * Only commands that never go async can use the below fields, - * obviously. Right now only IORING_OP_POLL_ADD uses them, and - * async armed poll handlers for regular commands. The latter - * restore the work, if needed. - */ - struct { - struct hlist_node hash_node; - struct async_poll *apoll; - }; - struct io_wq_work work; - }; - struct callback_head task_work; + struct list_head inflight_entry; + + struct percpu_ref *fixed_file_refs; + struct callback_head task_work; + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ + struct hlist_node hash_node; + struct async_poll *apoll; + struct io_wq_work work; }; struct io_defer_entry { @@ -4668,10 +4657,6 @@ static void io_async_task_func(struct callback_head *cb) io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - /* restore ->work in case we need to retry again */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); - if (!READ_ONCE(apoll->poll.canceled)) __io_req_task_submit(req); else @@ -4763,9 +4748,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&apoll->work, &req->work, sizeof(req->work)); - io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4784,8 +4766,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret) { io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); return false; @@ -4828,14 +4808,6 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { io_put_req(req); - /* - * restore ->work because we will call - * io_req_clean_work below when dropping the - * final reference. - */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, - sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); } @@ -4969,9 +4941,6 @@ static int io_poll_add(struct io_kiocb *req) struct io_poll_table ipt; __poll_t mask; - /* ->work is in union with hash_node and others */ - io_req_clean_work(req); - INIT_HLIST_NODE(&req->hash_node); ipt.pt._qproc = io_poll_queue_proc; From 81b68a5ca0ab5d92229a7b76332b9ce88bd6dbd1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:46 +0300 Subject: [PATCH 120/127] io_uring: deduplicate __io_complete_rw() Call __io_complete_rw() in io_iopoll_queue() instead of hand coding it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 86ec5669fe50be..11f4ab87e08f0c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -891,7 +891,8 @@ enum io_mem_account { ACCT_PINNED, }; -static bool io_rw_reissue(struct io_kiocb *req, long res); +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); @@ -902,8 +903,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_prep_work_files(struct io_kiocb *req); -static void io_complete_rw_common(struct kiocb *kiocb, long res, - struct io_comp_state *cs); static void __io_clean_op(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); @@ -1976,8 +1975,7 @@ static void io_iopoll_queue(struct list_head *again) do { req = list_first_entry(again, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (!io_rw_reissue(req, -EAGAIN)) - io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL); + __io_complete_rw(req, -EAGAIN, 0, NULL); } while (!list_empty(again)); } From b2bd1cf99f3e7c8fbf12ea07af2c6998e1209e25 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:47 +0300 Subject: [PATCH 121/127] io_uring: fix racy overflow count reporting All ->cq_overflow modifications should be under completion_lock, otherwise it can report a wrong number to the userspace. Fix it in io_uring_cancel_files(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 11f4ab87e08f0c..6e2322525da64f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7847,10 +7847,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, clear_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } - spin_unlock_irq(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + spin_unlock_irq(&ctx->completion_lock); /* * Put inflight ref and overflow ref. If that's From dd9dfcdf5a603680458f5e7b0d2273c66e5417db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:48 +0300 Subject: [PATCH 122/127] io_uring: fix stalled deferred requests Always do io_commit_cqring() after completing a request, even if it was accounted as overflowed on the CQ side. Failing to do that may lead to not to pushing deferred requests when needed, and so stalling the whole ring. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e2322525da64f..11c1abe8bd1a6f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7849,6 +7849,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); /* From 4693014340808e7f099e302c1dc40e9d79ff7667 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:49 +0300 Subject: [PATCH 123/127] io_uring: consolidate *_check_overflow accounting Add a helper to mark ctx->{cq,sq}_check_overflow to get rid of duplicates, and it's clearer to check cq_overflow_list directly anyway. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 11c1abe8bd1a6f..efec290c6b08c7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1303,6 +1303,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } +static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) +{ + if (list_empty(&ctx->cq_overflow_list)) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + } +} + /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -1347,11 +1356,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); - if (cqe) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } + io_cqring_mark_overflow(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -7842,11 +7848,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_del(&cancel_req->compl.list); cancel_req->flags &= ~REQ_F_OVERFLOW; - if (list_empty(&ctx->cq_overflow_list)) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } + + io_cqring_mark_overflow(ctx); WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); io_commit_cqring(ctx); From 01cec8c18f5ad9c27eee9f21439072832181039e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:50 +0300 Subject: [PATCH 124/127] io_uring: get rid of atomic FAA for cq_timeouts If ->cq_timeouts modifications are done under ->completion_lock, we don't really nee any fetch-and-add and other complex atomics. Replace it with non-atomic FAA, that saves an implicit full memory barrier. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index efec290c6b08c7..fabf0b69238498 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1205,7 +1205,8 @@ static void io_kill_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { - atomic_inc(&req->ctx->cq_timeouts); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); @@ -4972,9 +4973,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - atomic_inc(&ctx->cq_timeouts); - spin_lock_irqsave(&ctx->completion_lock, flags); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + /* * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. From d1719f70d0a5b83b12786a7dbc5b9fe396469016 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Jul 2020 13:43:53 -0600 Subject: [PATCH 125/127] io_uring: don't touch 'ctx' after installing file descriptor As soon as we install the file descriptor, we have to assume that it can get arbitrarily closed. We currently account memory (and note that we did) after installing the ring fd, which means that it could be a potential use-after-free condition if the fd is closed right after being installed, but before we fiddle with the ctx. In fact, syzbot reported this exact scenario: BUG: KASAN: use-after-free in io_account_mem fs/io_uring.c:7397 [inline] BUG: KASAN: use-after-free in io_uring_create fs/io_uring.c:8369 [inline] BUG: KASAN: use-after-free in io_uring_setup+0x2797/0x2910 fs/io_uring.c:8400 Read of size 1 at addr ffff888087a41044 by task syz-executor.5/18145 CPU: 0 PID: 18145 Comm: syz-executor.5 Not tainted 5.8.0-rc7-next-20200729-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 io_account_mem fs/io_uring.c:7397 [inline] io_uring_create fs/io_uring.c:8369 [inline] io_uring_setup+0x2797/0x2910 fs/io_uring.c:8400 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45c429 Code: 8d b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 5b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f8f121d0c78 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9 RAX: ffffffffffffffda RBX: 0000000000008540 RCX: 000000000045c429 RDX: 0000000000000000 RSI: 0000000020000040 RDI: 0000000000000196 RBP: 000000000078bf38 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000078bf0c R13: 00007fff86698cff R14: 00007f8f121d19c0 R15: 000000000078bf0c Move the accounting of the ring used locked memory before we get and install the ring file descriptor. Cc: stable@vger.kernel.org Reported-by: syzbot+9d46305e76057f30c74e@syzkaller.appspotmail.com Fixes: 309758254ea6 ("io_uring: report pinned memory usage") Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fabf0b69238498..33702f3b5af845 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8329,6 +8329,15 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ret = -EFAULT; goto err; } + + /* + * Account memory _before_ installing the file descriptor. Once + * the descriptor is installed, it can get closed at any time. + */ + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), + ACCT_LOCKED); + ctx->limit_mem = limit_mem; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup @@ -8338,9 +8347,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), - ACCT_LOCKED); - ctx->limit_mem = limit_mem; return ret; err: io_ring_ctx_wait_and_kill(ctx); From 1752f0adea98ef859978c090e0726844348758f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 1 Aug 2020 13:36:33 +0300 Subject: [PATCH 126/127] fs: optimise kiocb_set_rw_flags() Use a local var to collect flags in kiocb_set_rw_flags(). That spares some memory writes and allows to replace most of the jumps with MOVEcc. Signed-off-by: Pavel Begunkov Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Jens Axboe --- include/linux/fs.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4090320360f44f..e535543d31d975 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3446,22 +3446,28 @@ static inline int iocb_flags(struct file *file) static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) { + int kiocb_flags = 0; + + if (!flags) + return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; - ki->ki_flags |= IOCB_NOWAIT; + kiocb_flags |= IOCB_NOWAIT; } if (flags & RWF_HIPRI) - ki->ki_flags |= IOCB_HIPRI; + kiocb_flags |= IOCB_HIPRI; if (flags & RWF_DSYNC) - ki->ki_flags |= IOCB_DSYNC; + kiocb_flags |= IOCB_DSYNC; if (flags & RWF_SYNC) - ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + kiocb_flags |= (IOCB_DSYNC | IOCB_SYNC); if (flags & RWF_APPEND) - ki->ki_flags |= IOCB_APPEND; + kiocb_flags |= IOCB_APPEND; + + ki->ki_flags |= kiocb_flags; return 0; } From fa15bafb71fd7a4d6018dae87cfaf890fd4ab47f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 1 Aug 2020 13:50:02 +0300 Subject: [PATCH 127/127] io_uring: flip if handling after io_setup_async_rw As recently done with with send/recv, flip the if after rw_verify_aread() in io_{read,write}() and tabulise left bits left. This removes mispredicted by a compiler jump on the success/fast path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 146 +++++++++++++++++++++++++------------------------- 1 file changed, 72 insertions(+), 74 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 33702f3b5af845..6fd0b0f5df68b2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3034,57 +3034,56 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; size_t iov_count; - ssize_t io_size, ret; + ssize_t io_size, ret, ret2; + unsigned long nr_segs; ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; + io_size = ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) kiocb->ki_flags &= ~IOCB_NOWAIT; - io_size = ret; - req->result = io_size; - /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; iov_count = iov_iter_count(&iter); + nr_segs = iter.nr_segs; ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - unsigned long nr_segs = iter.nr_segs; - ssize_t ret2 = 0; + if (unlikely(ret)) + goto out_free; - ret2 = io_iter_do_read(req, &iter); + ret2 = io_iter_do_read(req, &iter); - /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { - kiocb_done(kiocb, ret2, cs); - } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; + /* Catch -EAGAIN return for forced non-blocking submission */ + if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + /* if we can retry, do so with the callbacks armed */ + if (io_rw_should_retry(req)) { + ret2 = io_iter_do_read(req, &iter); + if (ret2 == -EIOCBQUEUED) { + goto out_free; + } else if (ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - /* if we can retry, do so with the callbacks armed */ - if (io_rw_should_retry(req)) { - ret2 = io_iter_do_read(req, &iter); - if (ret2 == -EIOCBQUEUED) { - goto out_free; - } else if (ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - goto out_free; - } } - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; } + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; } out_free: if (iovec) @@ -3117,19 +3116,19 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; size_t iov_count; - ssize_t ret, io_size; + ssize_t ret, ret2, io_size; + unsigned long nr_segs; ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); if (ret < 0) return ret; + io_size = ret; + req->result = io_size; /* Ensure we clear previously set non-block flag */ if (!force_nonblock) req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; - io_size = ret; - req->result = io_size; - /* If the file doesn't support async, just async punt */ if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; @@ -3140,51 +3139,50 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, goto copy_iov; iov_count = iov_iter_count(&iter); + nr_segs = iter.nr_segs; ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); - if (!ret) { - unsigned long nr_segs = iter.nr_segs; - ssize_t ret2; + if (unlikely(ret)) + goto out_free; - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (req->flags & REQ_F_ISREG) { + __sb_start_write(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. - */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2, cs); - } else { - iter.count = iov_count; - iter.nr_segs = nr_segs; + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. + */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + if (!force_nonblock || ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) - goto out_free; - /* it's copied and will be cleaned with ->io */ - iovec = NULL; - return -EAGAIN; - } + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + return -EAGAIN; } out_free: if (iovec)