Skip to content

Commit

Permalink
Merge tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.k…
Browse files Browse the repository at this point in the history
…ernel.dk/linux-block

Pull io_uring zerocopy support from Jens Axboe:
 "This adds support for efficient support for zerocopy sends through
  io_uring. Both ipv4 and ipv6 is supported, as well as both TCP and
  UDP.

  The core network changes to support this is in a stable branch from
  Jakub that both io_uring and net-next has pulled in, and the io_uring
  changes are layered on top of that.

  All of the work has been done by Pavel"

* tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.kernel.dk/linux-block: (34 commits)
  io_uring: notification completion optimisation
  io_uring: export req alloc from core
  io_uring/net: use unsigned for flags
  io_uring/net: make page accounting more consistent
  io_uring/net: checks errors of zc mem accounting
  io_uring/net: improve io_get_notif_slot types
  selftests/io_uring: test zerocopy send
  io_uring: enable managed frags with register buffers
  io_uring: add zc notification flush requests
  io_uring: rename IORING_OP_FILES_UPDATE
  io_uring: flush notifiers after sendzc
  io_uring: sendzc with fixed buffers
  io_uring: allow to pass addr into sendzc
  io_uring: account locked pages for non-fixed zc
  io_uring: wire send zc request type
  io_uring: add notification slot registration
  io_uring: add rsrc referencing for notifiers
  io_uring: complete notifiers in tw
  io_uring: cache struct io_notif
  io_uring: add zc notification infrastructure
  ...
  • Loading branch information
torvalds committed Aug 2, 2022
2 parents 98e2474 + 14b146b commit 42df1cb
Show file tree
Hide file tree
Showing 25 changed files with 1,604 additions and 158 deletions.
30 changes: 30 additions & 0 deletions include/linux/io_uring_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>

struct io_wq_work_node {
Expand Down Expand Up @@ -33,6 +34,9 @@ struct io_file_table {
unsigned int alloc_hint;
};

struct io_notif;
struct io_notif_slot;

struct io_hash_bucket {
spinlock_t lock;
struct hlist_head list;
Expand All @@ -43,6 +47,30 @@ struct io_hash_table {
unsigned hash_bits;
};

/*
* Arbitrary limit, can be raised if need be
*/
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
/* submission side */
int cached_refs;
const struct io_ring_ctx *last;
struct io_wq *io_wq;
struct file *registered_rings[IO_RINGFD_REG_MAX];

struct xarray xa;
struct wait_queue_head wait;
atomic_t in_idle;
atomic_t inflight_tracked;
struct percpu_counter inflight;

struct { /* task_work */
struct llist_head task_list;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};

struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
Expand Down Expand Up @@ -212,6 +240,8 @@ struct io_ring_ctx {
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
struct io_notif_slot *notif_slots;
unsigned nr_notif_slots;

struct io_submit_state submit_state;

Expand Down
66 changes: 48 additions & 18 deletions include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -686,10 +686,18 @@ enum {
* charged to the kernel memory.
*/
SKBFL_PURE_ZEROCOPY = BIT(2),

SKBFL_DONT_ORPHAN = BIT(3),

/* page references are managed by the ubuf_info, so it's safe to
* use frags only up until ubuf_info is released
*/
SKBFL_MANAGED_FRAG_REFS = BIT(4),
};

#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)

/*
* The callback notifies userspace to release buffers when skb DMA is done in
Expand Down Expand Up @@ -1773,13 +1781,14 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
bool success);

int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length);
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, struct iov_iter *from,
size_t length);

static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
struct msghdr *msg, int len)
{
return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
}

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
Expand All @@ -1806,6 +1815,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb)
return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
}

static inline bool skb_zcopy_managed(const struct sk_buff *skb)
{
return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
}

static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
const struct sk_buff *skb2)
{
Expand Down Expand Up @@ -1880,6 +1894,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
}
}

void __skb_zcopy_downgrade_managed(struct sk_buff *skb);

static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
if (unlikely(skb_zcopy_managed(skb)))
__skb_zcopy_downgrade_managed(skb);
}

static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
skb->next = NULL;
Expand Down Expand Up @@ -2528,6 +2550,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
return skb_headlen(skb) + __skb_pagelen(skb);
}

static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
int i, struct page *page,
int off, int size)
{
skb_frag_t *frag = &shinfo->frags[i];

/*
* Propagate page pfmemalloc to the skb if we can. The problem is
* that not all callers have unique ownership of the page but rely
* on page_is_pfmemalloc doing the right thing(tm).
*/
frag->bv_page = page;
frag->bv_offset = off;
skb_frag_size_set(frag, size);
}

/**
* __skb_fill_page_desc - initialise a paged fragment in an skb
* @skb: buffer containing fragment to be initialised
Expand All @@ -2544,17 +2582,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
struct page *page, int off, int size)
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

/*
* Propagate page pfmemalloc to the skb if we can. The problem is
* that not all callers have unique ownership of the page but rely
* on page_is_pfmemalloc doing the right thing(tm).
*/
frag->bv_page = page;
frag->bv_offset = off;
skb_frag_size_set(frag, size);

__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
page = compound_head(page);
if (page_is_pfmemalloc(page))
skb->pfmemalloc = true;
Expand Down Expand Up @@ -3182,8 +3210,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
if (likely(!skb_zcopy(skb)))
return 0;
if (!skb_zcopy_is_nouarg(skb) &&
skb_uarg(skb)->callback == msg_zerocopy_callback)
if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
return 0;
return skb_copy_ubufs(skb, gfp_mask);
}
Expand Down Expand Up @@ -3496,7 +3523,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
*/
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
struct skb_shared_info *shinfo = skb_shinfo(skb);

if (!skb_zcopy_managed(skb))
__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}

/**
Expand Down
5 changes: 5 additions & 0 deletions include/linux/socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ struct file;
struct pid;
struct cred;
struct socket;
struct sock;
struct sk_buff;

#define __sockaddr_check_size(size) \
BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
Expand Down Expand Up @@ -69,6 +71,9 @@ struct msghdr {
unsigned int msg_flags; /* flags on received message */
__kernel_size_t msg_controllen; /* ancillary data buffer length */
struct kiocb *msg_iocb; /* ptr to iocb for async requests */
struct ubuf_info *msg_ubuf;
int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length);
};

struct user_msghdr {
Expand Down
45 changes: 43 additions & 2 deletions include/uapi/linux/io_uring.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ struct io_uring_sqe {
union {
__s32 splice_fd_in;
__u32 file_index;
struct {
__u16 notification_idx;
__u16 addr_len;
};
};
union {
struct {
Expand Down Expand Up @@ -170,7 +174,8 @@ enum io_uring_op {
IORING_OP_FALLOCATE,
IORING_OP_OPENAT,
IORING_OP_CLOSE,
IORING_OP_FILES_UPDATE,
IORING_OP_RSRC_UPDATE,
IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE,
IORING_OP_STATX,
IORING_OP_READ,
IORING_OP_WRITE,
Expand All @@ -197,6 +202,7 @@ enum io_uring_op {
IORING_OP_GETXATTR,
IORING_OP_SOCKET,
IORING_OP_URING_CMD,
IORING_OP_SENDZC_NOTIF,

/* this goes last, obviously */
IORING_OP_LAST,
Expand All @@ -218,6 +224,7 @@ enum io_uring_op {
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)

/*
* sqe->splice_flags
* extends splice(2) flags
Expand Down Expand Up @@ -267,15 +274,32 @@ enum io_uring_op {
* IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if
* the handler will continue to report
* CQEs on behalf of the same SQE.
*
* IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in
* the buf_index field.
*
* IORING_RECVSEND_NOTIF_FLUSH Flush a notification after a successful
* successful. Only for zerocopy sends.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
#define IORING_RECV_MULTISHOT (1U << 1)
#define IORING_RECV_MULTISHOT (1U << 1)
#define IORING_RECVSEND_FIXED_BUF (1U << 2)
#define IORING_RECVSEND_NOTIF_FLUSH (1U << 3)

/*
* accept flags stored in sqe->ioprio
*/
#define IORING_ACCEPT_MULTISHOT (1U << 0)


/*
* IORING_OP_RSRC_UPDATE flags
*/
enum {
IORING_RSRC_UPDATE_FILES,
IORING_RSRC_UPDATE_NOTIF,
};

/*
* IORING_OP_MSG_RING command types, stored in sqe->addr
*/
Expand Down Expand Up @@ -457,6 +481,10 @@ enum {
/* register a range of fixed file slots for automatic slot allocation */
IORING_REGISTER_FILE_ALLOC_RANGE = 25,

/* zerocopy notification API */
IORING_REGISTER_NOTIFIERS = 26,
IORING_UNREGISTER_NOTIFIERS = 27,

/* this goes last */
IORING_REGISTER_LAST
};
Expand Down Expand Up @@ -503,6 +531,19 @@ struct io_uring_rsrc_update2 {
__u32 resv2;
};

struct io_uring_notification_slot {
__u64 tag;
__u64 resv[3];
};

struct io_uring_notification_register {
__u32 nr_slots;
__u32 resv;
__u64 resv2;
__u64 data;
__u64 resv3;
};

/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)

Expand Down
2 changes: 1 addition & 1 deletion io_uring/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o
cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
obj-$(CONFIG_IO_WQ) += io-wq.o
Loading

0 comments on commit 42df1cb

Please sign in to comment.