From fd7d6de2241453fc7d042336d366a939a25bc5a9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 23 Aug 2020 11:00:37 -0600 Subject: [PATCH 01/10] io_uring: don't recurse on tsk->sighand->siglock with signalfd If an application is doing reads on signalfd, and we arm the poll handler because there's no data available, then the wakeup can recurse on the tasks sighand->siglock as the signal delivery from task_work_add() will use TWA_SIGNAL and that attempts to lock it again. We can detect the signalfd case pretty easily by comparing the poll->head wait_queue_head_t with the target task signalfd wait queue. Just use normal task wakeup for this case. Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 91e2cc8414f9a2..c9d526ff55e088 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1746,7 +1746,8 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb, + bool twa_signal_ok) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; @@ -1759,7 +1760,7 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) * will do the job. */ notify = 0; - if (!(ctx->flags & IORING_SETUP_SQPOLL)) + if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) notify = TWA_SIGNAL; ret = task_work_add(tsk, cb, notify); @@ -1819,7 +1820,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_submit); percpu_ref_get(&req->ctx->refs); - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, &req->task_work, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -2322,7 +2323,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) init_task_work(&req->task_work, io_rw_resubmit); percpu_ref_get(&req->ctx->refs); - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, &req->task_work, true); if (!ret) return true; #endif @@ -3044,7 +3045,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, &req->task_work, true); if (unlikely(ret)) { struct task_struct *tsk; @@ -4566,6 +4567,7 @@ struct io_poll_table { static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { + bool twa_signal_ok; int ret; /* for instances that support it check for an event match first: */ @@ -4580,13 +4582,21 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, init_task_work(&req->task_work, func); percpu_ref_get(&req->ctx->refs); + /* + * If we using the signalfd wait_queue_head for this wakeup, then + * it's not safe to use TWA_SIGNAL as we could be recursing on the + * tsk->sighand->siglock on doing the wakeup. Should not be needed + * either, as the normal wakeup will suffice. + */ + twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh); + /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = io_req_task_work_add(req, &req->task_work); + ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok); if (unlikely(ret)) { struct task_struct *tsk; From 204361a77f4018627addd4a06877448f088ddfc0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 23 Aug 2020 20:33:10 +0300 Subject: [PATCH 02/10] io-wq: fix hang after cancelling pending hashed work Don't forget to update wqe->hash_tail after cancelling a pending work item, if it was hashed. Cc: stable@vger.kernel.org # 5.7+ Reported-by: Dmitry Shulyak Fixes: 86f3cd1b589a1 ("io-wq: handle hashed writes in chains") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index e92c4724480cad..414beb5438836c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -925,6 +925,24 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return match->nr_running && !match->cancel_all; } +static inline void io_wqe_remove_pending(struct io_wqe *wqe, + struct io_wq_work *work, + struct io_wq_work_node *prev) +{ + unsigned int hash = io_get_work_hash(work); + struct io_wq_work *prev_work = NULL; + + if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { + if (prev) + prev_work = container_of(prev, struct io_wq_work, list); + if (prev_work && io_get_work_hash(prev_work) == hash) + wqe->hash_tail[hash] = prev_work; + else + wqe->hash_tail[hash] = NULL; + } + wq_list_del(&wqe->work_list, &work->list, prev); +} + static void io_wqe_cancel_pending_work(struct io_wqe *wqe, struct io_cb_cancel_data *match) { @@ -938,8 +956,7 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - - wq_list_del(&wqe->work_list, node, prev); + io_wqe_remove_pending(wqe, work, prev); spin_unlock_irqrestore(&wqe->lock, flags); io_run_cancel(work, wqe); match->nr_pending++; From 842163154b87b01d8f516af15ad8916eb1661016 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Aug 2020 11:45:26 -0600 Subject: [PATCH 03/10] io_uring: revert consumed iov_iter bytes on error Some consumers of the iov_iter will return an error, but still have bytes consumed in the iterator. This is an issue for -EAGAIN, since we rely on a sane iov_iter state across retries. Fix this by ensuring that we revert consumed bytes, if any, if the file operations have consumed any bytes from iterator. This is similar to what generic_file_read_iter() does, and is always safe as we have the previous bytes count handy already. Fixes: ff6165b2d7f6 ("io_uring: retain iov_iter state over io_read/io_write calls") Reported-by: Dmitry Shulyak Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index c9d526ff55e088..e030b33fa53e12 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3153,6 +3153,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, } else if (ret == -EAGAIN) { if (!force_nonblock) goto done; + /* some cases will consume bytes even on error returns */ + iov_iter_revert(iter, iov_count - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (ret) goto out_free; @@ -3294,6 +3296,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, cs); } else { + /* some cases will consume bytes even on error returns */ + iov_iter_revert(iter, iov_count - iov_iter_count(iter)); copy_iov: ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) From 6b7898eb180df12767933466b7855b23103ad489 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Aug 2020 07:58:00 -0600 Subject: [PATCH 04/10] io_uring: fix imbalanced sqo_mm accounting We do the initial accounting of locked_vm and pinned_vm before we have setup ctx->sqo_mm, which means we can end up having not accounted the memory at setup time, but still decrement it when we exit. This causes an imbalance in the accounting. Setup ctx->sqo_mm earlier in io_uring_create(), before we do the first accounting of mm->{locked,pinned}_vm. This also unifies the state grabbing for the ctx, and eliminates a failure case in io_sq_offload_start(). Fixes: f74441e6311a ("io_uring: account locked memory before potential error case") Reported-by: Robert M. Muncrief Reported-by: Niklas Schnelle Tested-by: Niklas Schnelle Tested-by: Robert M. Muncrief Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e030b33fa53e12..384df86dfc6921 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7447,9 +7447,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - mmgrab(current->mm); - ctx->sqo_mm = current->mm; - if (ctx->flags & IORING_SETUP_SQPOLL) { ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) @@ -7494,10 +7491,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, return 0; err: io_finish_async(ctx); - if (ctx->sqo_mm) { - mmdrop(ctx->sqo_mm); - ctx->sqo_mm = NULL; - } return ret; } @@ -8547,6 +8540,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->user = user; ctx->creds = get_current_cred(); + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + /* * Account memory _before_ installing the file descriptor. Once * the descriptor is installed, it can get closed at any time. Also From 9dab14b81807a40dab8e464ec87043935c562c2c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Aug 2020 12:27:50 -0600 Subject: [PATCH 05/10] io_uring: don't use poll handler if file can't be nonblocking read/written There's no point in using the poll handler if we can't do a nonblocking IO attempt of the operation, since we'll need to go async anyway. In fact this is actively harmful, as reading from eg pipes won't return 0 to indicate EOF. Cc: stable@vger.kernel.org # v5.7+ Reported-by: Benedikt Ames Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 384df86dfc6921..d15139088e4c18 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4889,12 +4889,20 @@ static bool io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask, ret; + int rw; if (!req->file || !file_can_poll(req->file)) return false; if (req->flags & REQ_F_POLLED) return false; - if (!def->pollin && !def->pollout) + if (def->pollin) + rw = READ; + else if (def->pollout) + rw = WRITE; + else + return false; + /* if we can't nonblock try, then no point in arming a poll handler */ + if (!io_file_supports_async(req->file, rw)) return false; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); From 00d23d516e2e7900cd1bd577c1f84794ae7ff3a7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Aug 2020 12:59:22 -0600 Subject: [PATCH 06/10] io_uring: ensure read requests go through -ERESTART* transformation We need to call kiocb_done() for any ret < 0 to ensure that we always get the proper -ERESTARTSYS (and friends) transformation done. At some point this should be tied into general error handling, so we can get rid of the various (mostly network) related commands that check and perform this substitution. Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d15139088e4c18..d9b88644d5e8a0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3160,7 +3160,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, goto out_free; return -EAGAIN; } else if (ret < 0) { - goto out_free; + /* make sure -ERESTARTSYS -> -EINTR is done */ + goto done; } /* read it all, or we did blocking attempt. no retry. */ From 0fef948363f62494d779cf9dc3c0a86ea1e5f7cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Aug 2020 10:36:20 -0600 Subject: [PATCH 07/10] io_uring: make offset == -1 consistent with preadv2/pwritev2 The man page for io_uring generally claims were consistent with what preadv2 and pwritev2 accept, but turns out there's a slight discrepancy in how offset == -1 is handled for pipes/streams. preadv doesn't allow it, but preadv2 does. This currently causes io_uring to return -EINVAL if that is attempted, but we should allow that as documented. This change makes us consistent with preadv2/pwritev2 for just passing in a NULL ppos for streams if the offset is -1. Cc: stable@vger.kernel.org # v5.7+ Reported-by: Benedikt Ames Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d9b88644d5e8a0..bd2d8de3f2e8f8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2866,6 +2866,11 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return iov_iter_count(&req->io->rw.iter); } +static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) +{ + return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos; +} + /* * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. @@ -2901,10 +2906,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, &kiocb->ki_pos); + iovec.iov_len, io_kiocb_ppos(kiocb)); } else { nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, &kiocb->ki_pos); + iovec.iov_len, io_kiocb_ppos(kiocb)); } if (iov_iter_is_bvec(iter)) @@ -3139,7 +3144,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, goto copy_iov; iov_count = iov_iter_count(iter); - ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); if (unlikely(ret)) goto out_free; @@ -3262,7 +3267,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, goto copy_iov; iov_count = iov_iter_count(iter); - ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); if (unlikely(ret)) goto out_free; From 56450c20fe10d4d93f58019109aa4e06fc0b9206 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Aug 2020 18:58:26 -0600 Subject: [PATCH 08/10] io_uring: clear req->result on IOPOLL re-issue Make sure we clear req->result, which was set to -EAGAIN for retry purposes, when moving it to the reissue list. Otherwise we can end up retrying a request more than once, which leads to weird results in the io-wq handling (and other spots). Cc: stable@vger.kernel.org Reported-by: Andres Freund Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index bd2d8de3f2e8f8..6df08287c59e14 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2049,6 +2049,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, inflight_entry); if (READ_ONCE(req->result) == -EAGAIN) { + req->result = 0; req->iopoll_completed = 0; list_move_tail(&req->inflight_entry, &again); continue; From eefdf30f3dcb5c1d47bee2b3afdb9d4d05343ff3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Aug 2020 16:40:19 -0600 Subject: [PATCH 09/10] io_uring: fix IOPOLL -EAGAIN retries This normally isn't hit, as polling is mostly done on NVMe with deep queue depths. But if we do run into request starvation, we need to ensure that retries are properly serialized. Reported-by: Andres Freund Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6df08287c59e14..8c77ad4a65f063 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1150,7 +1150,7 @@ static void io_prep_async_work(struct io_kiocb *req) io_req_init_async(req); if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file) + if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else { if (def->unbound_nonreg_file) @@ -3132,6 +3132,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; + iov_count = iov_iter_count(iter); io_size = ret; req->result = io_size; ret = 0; @@ -3144,7 +3145,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; - iov_count = iov_iter_count(iter); ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); if (unlikely(ret)) goto out_free; @@ -3157,7 +3157,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = 0; goto out_free; } else if (ret == -EAGAIN) { - if (!force_nonblock) + /* IOPOLL retry should happen for io-wq threads */ + if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; /* some cases will consume bytes even on error returns */ iov_iter_revert(iter, iov_count - iov_iter_count(iter)); @@ -3251,6 +3252,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; + iov_count = iov_iter_count(iter); io_size = ret; req->result = io_size; @@ -3267,7 +3269,6 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - iov_count = iov_iter_count(iter); ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); if (unlikely(ret)) goto out_free; @@ -3301,11 +3302,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { + /* IOPOLL retry should happen for io-wq threads */ + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) + goto copy_iov; kiocb_done(kiocb, ret2, cs); } else { +copy_iov: /* some cases will consume bytes even on error returns */ iov_iter_revert(iter, iov_count - iov_iter_count(iter)); -copy_iov: ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) return -EAGAIN; From fdee946d0925f971f167d2606984426763355e4f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Aug 2020 16:46:24 -0600 Subject: [PATCH 10/10] io_uring: don't bounce block based -EAGAIN retry off task_work These events happen inline from submission, so there's no need to bounce them through the original task. Just set them up for retry and issue retry directly instead of going over task_work. Signed-off-by: Jens Axboe --- fs/io_uring.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8c77ad4a65f063..852c2eaf1a9ac7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2295,22 +2295,6 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error) io_req_complete(req, ret); return false; } - -static void io_rw_resubmit(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct io_ring_ctx *ctx = req->ctx; - int err; - - err = io_sq_thread_acquire_mm(ctx, req); - - if (io_resubmit_prep(req, err)) { - refcount_inc(&req->refs); - io_queue_async_work(req); - } - - percpu_ref_put(&ctx->refs); -} #endif static bool io_rw_reissue(struct io_kiocb *req, long res) @@ -2321,12 +2305,14 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - init_task_work(&req->task_work, io_rw_resubmit); - percpu_ref_get(&req->ctx->refs); + ret = io_sq_thread_acquire_mm(req->ctx, req); - ret = io_req_task_work_add(req, &req->task_work, true); - if (!ret) + if (io_resubmit_prep(req, ret)) { + refcount_inc(&req->refs); + io_queue_async_work(req); return true; + } + #endif return false; }