Skip to content

Commit

Permalink
Merge tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux
Browse files Browse the repository at this point in the history
Pull io_uring fixes from Jens Axboe:

 - Fix an issue with discontig page checking for IORING_SETUP_NO_MMAP

 - Fix an issue where IORING_SETUP_NO_MMAP also incorrectly disallowed
   mmap'ed buffer rings

 - Fix an issue with deferred release of memory mapped pages

 - Fix a lockdep issue with IORING_SETUP_NO_MMAP

 - Use fget/fput consistently, even from our sync system calls. No real
   issue here, but if we were ever to allow closing io_uring descriptors
   it would be required. Let's play it safe and just use the full ref
   counted versions upfront. Most uses of io_uring are threaded anyway,
   and hence already doing the full version underneath.

* tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux:
  io_uring: use fget/fput consistently
  io_uring: free io_buffer_list entries via RCU
  io_uring/kbuf: prune deferred locked cache when tearing down
  io_uring/kbuf: recycle freed mapped buffer ring entries
  io_uring/kbuf: defer release of mapped buffer rings
  io_uring: enable io_mem_alloc/free to be used in other parts
  io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP
  io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP
  • Loading branch information
torvalds committed Dec 1, 2023
2 parents ee0c8a9 + 73363c2 commit c9a925b
Show file tree
Hide file tree
Showing 6 changed files with 224 additions and 70 deletions.
3 changes: 3 additions & 0 deletions include/linux/io_uring_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,9 @@ struct io_ring_ctx {

struct list_head io_buffers_cache;

/* deferred free list, protected by ->uring_lock */
struct hlist_head io_buf_list;

/* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
Expand Down
11 changes: 6 additions & 5 deletions io_uring/cancel.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
};
ktime_t timeout = KTIME_MAX;
struct io_uring_sync_cancel_reg sc;
struct fd f = { };
struct file *file = NULL;
DEFINE_WAIT(wait);
int ret, i;

Expand All @@ -295,10 +295,10 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
/* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
!(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
f = fdget(sc.fd);
if (!f.file)
file = fget(sc.fd);
if (!file)
return -EBADF;
cd.file = f.file;
cd.file = file;
}

ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
Expand Down Expand Up @@ -348,6 +348,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
if (ret == -ENOENT || ret > 0)
ret = 0;
out:
fdput(f);
if (file)
fput(file);
return ret;
}
95 changes: 51 additions & 44 deletions io_uring/io_uring.c
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
INIT_HLIST_HEAD(&ctx->io_buf_list);
io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
Expand Down Expand Up @@ -2666,7 +2667,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}

static void io_mem_free(void *ptr)
void io_mem_free(void *ptr)
{
if (!ptr)
return;
Expand Down Expand Up @@ -2697,6 +2698,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
{
struct page **page_array;
unsigned int nr_pages;
void *page_addr;
int ret, i;

*npages = 0;
Expand All @@ -2718,27 +2720,29 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
io_pages_free(&page_array, ret > 0 ? ret : 0);
return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
}
/*
* Should be a single page. If the ring is small enough that we can
* use a normal page, that is fine. If we need multiple pages, then
* userspace should use a huge page. That's the only way to guarantee
 * that we get contiguous memory, outside of just being lucky or
* (currently) having low memory fragmentation.
*/
if (page_array[0] != page_array[ret - 1])
goto err;

/*
* Can't support mapping user allocated ring memory on 32-bit archs
* where it could potentially reside in highmem. Just fail those with
* -EINVAL, just like we did on kernels that didn't support this
* feature.
*/
page_addr = page_address(page_array[0]);
for (i = 0; i < nr_pages; i++) {
if (PageHighMem(page_array[i])) {
ret = -EINVAL;
ret = -EINVAL;

/*
* Can't support mapping user allocated ring memory on 32-bit
* archs where it could potentially reside in highmem. Just
* fail those with -EINVAL, just like we did on kernels that
* didn't support this feature.
*/
if (PageHighMem(page_array[i]))
goto err;
}

/*
* No support for discontig pages for now, should either be a
* single normal page, or a huge page. Later on we can add
* support for remapping discontig pages, for now we will
* just fail them with EINVAL.
*/
if (page_address(page_array[i]) != page_addr)
goto err;
page_addr += PAGE_SIZE;
}

*pages = page_array;
Expand Down Expand Up @@ -2775,7 +2779,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
}
}

static void *io_mem_alloc(size_t size)
void *io_mem_alloc(size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
void *ret;
Expand Down Expand Up @@ -2947,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
io_rings_free(ctx);
io_kbuf_mmap_list_free(ctx);

percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
Expand Down Expand Up @@ -3475,25 +3480,27 @@ static void *io_uring_validate_mmap_request(struct file *file,
struct page *page;
void *ptr;

/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);

switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
ptr = ctx->rings;
break;
case IORING_OFF_SQES:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
unsigned int bgid;

bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
mutex_lock(&ctx->uring_lock);
rcu_read_lock();
ptr = io_pbuf_get_address(ctx, bgid);
mutex_unlock(&ctx->uring_lock);
rcu_read_unlock();
if (!ptr)
return ERR_PTR(-EINVAL);
break;
Expand Down Expand Up @@ -3645,7 +3652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
size_t, argsz)
{
struct io_ring_ctx *ctx;
struct fd f;
struct file *file;
long ret;

if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
Expand All @@ -3663,20 +3670,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
f.file = tctx->registered_rings[fd];
f.flags = 0;
if (unlikely(!f.file))
file = tctx->registered_rings[fd];
if (unlikely(!file))
return -EBADF;
} else {
f = fdget(fd);
if (unlikely(!f.file))
file = fget(fd);
if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
if (unlikely(!io_is_uring_fops(f.file)))
if (unlikely(!io_is_uring_fops(file)))
goto out;
}

ctx = f.file->private_data;
ctx = file->private_data;
ret = -EBADFD;
if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
goto out;
Expand Down Expand Up @@ -3770,7 +3776,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
}
out:
fdput(f);
if (!(flags & IORING_ENTER_REGISTERED_RING))
fput(file);
return ret;
}

Expand Down Expand Up @@ -4611,7 +4618,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct fd f;
struct file *file;
bool use_registered_ring;

use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
Expand All @@ -4630,27 +4637,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
f.file = tctx->registered_rings[fd];
f.flags = 0;
if (unlikely(!f.file))
file = tctx->registered_rings[fd];
if (unlikely(!file))
return -EBADF;
} else {
f = fdget(fd);
if (unlikely(!f.file))
file = fget(fd);
if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
if (!io_is_uring_fops(f.file))
if (!io_is_uring_fops(file))
goto out_fput;
}

ctx = f.file->private_data;
ctx = file->private_data;

mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
fdput(f);
if (!use_registered_ring)
fput(file);
return ret;
}

Expand Down
3 changes: 3 additions & 0 deletions io_uring/io_uring.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);

void *io_mem_alloc(size_t size);
void io_mem_free(void *ptr);

#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
Expand Down
Loading

0 comments on commit c9a925b

Please sign in to comment.