Skip to content

Commit

Permalink
Merge branch 'tun-skb_array'
Browse files Browse the repository at this point in the history
Jason Wang says:

====================
switch to use tx skb array in tun

This series tries to switch to use skb array in tun. This is used to
eliminate the spinlock contention between producer and consumer. The
conversion was straightforward: just introduce a tx skb array and use
it instead of sk_receive_queue.

A minor issue is to keep the tx_queue_len behaviour, since tun used to
use it for the length of sk_receive_queue. This is done through:

- add the ability to resize multiple rings at once to avoid handling
  partial resize failure for multiple rings.
- add the support for zero length ring.
- introduce a notifier which is triggered when tx_queue_len is
  changed for a netdev.
- resize all queues when tx_queue_len changes.

Tests show about 15% improvement on guest rx pps:

Before: ~1300000pps
After : ~1500000pps

Changes from V3:
- fix kbuild warnings
- call NETDEV_CHANGE_TX_QUEUE_LEN on IFLA_TXQLEN

Changes from V2:
- add multiple rings resizing support for ptr_ring/skb_array
- add zero length ring support
- introduce a NETDEV_CHANGE_TX_QUEUE_LEN
- drop new flags

Changes from V1:
- switch to use skb array instead of a customized circular buffer
- add non-blocking support
- rename .peek to .peek_len
- drop lockless peeking since tests show only a very minor improvement
====================

Acked-by: Michael S. Tsirkin <[email protected]>
Acked-from-altitude: 34697 feet.
Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
davem330 committed Jul 1, 2016
2 parents 8dc7243 + 1576d98 commit beb528d
Show file tree
Hide file tree
Showing 9 changed files with 255 additions and 27 deletions.
138 changes: 130 additions & 8 deletions drivers/net/tun.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
#include <net/sock.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>

#include <asm/uaccess.h>

Expand Down Expand Up @@ -167,6 +168,7 @@ struct tun_file {
};
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
};

struct tun_flow_entry {
Expand Down Expand Up @@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)

/*
 * Drop every packet still queued on @tfile: the tx skb array is drained
 * one skb at a time with kfree_skb(), then the socket error queue is
 * purged.
 */
static void tun_queue_purge(struct tun_file *tfile)
{
/*
 * NOTE(review): this text comes from a rendered diff without +/- markers;
 * the sk_receive_queue purge below looks like the removed side of the
 * hunk, superseded by the tx_array drain loop - confirm against the tree.
 */
skb_queue_purge(&tfile->sk.sk_receive_queue);
struct sk_buff *skb;

/* skb_array_consume() returns NULL once the array is empty. */
while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
kfree_skb(skb);

skb_queue_purge(&tfile->sk.sk_error_queue);
}

Expand Down Expand Up @@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun->dev->reg_state == NETREG_REGISTERED)
unregister_netdevice(tun->dev);
}
if (tun)
skb_array_cleanup(&tfile->tx_array);
sock_put(&tfile->sk);
}
}
Expand Down Expand Up @@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev)
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
{
struct tun_file *tfile = file->private_data;
struct net_device *dev = tun->dev;
int err;

err = security_tun_dev_attach(tfile->socket.sk, tun->security);
Expand Down Expand Up @@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
if (!err)
goto out;
}

if (!tfile->detached &&
skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
err = -ENOMEM;
goto out;
}

tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
rcu_assign_pointer(tfile->tun, tun);
Expand Down Expand Up @@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)

nf_reset(skb);

/* Enqueue packet */
skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
if (skb_array_produce(&tfile->tx_array, skb))
goto drop;

/* Notify and wake up reader process */
if (tfile->flags & TUN_FASYNC)
Expand Down Expand Up @@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)

poll_wait(file, sk_sleep(sk), wait);

if (!skb_queue_empty(&sk->sk_receive_queue))
if (!skb_array_empty(&tfile->tx_array))
mask |= POLLIN | POLLRDNORM;

if (sock_writeable(sk) ||
Expand Down Expand Up @@ -1426,22 +1442,61 @@ static ssize_t tun_put_user(struct tun_struct *tun,
return total;
}

/*
 * Fetch the next skb from the tx skb array of a tun file.
 *
 * @tfile:   tun file whose tx_array is consumed
 * @noblock: non-zero to return immediately when the array is empty
 * @err:     written only when NULL is returned: -EAGAIN (array empty and
 *           @noblock set), -ERESTARTSYS (signal pending) or -EFAULT
 *           (RCV_SHUTDOWN set on the socket)
 *
 * Returns the consumed skb, or NULL with *@err set.
 */
static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
				     int *err)
{
	DECLARE_WAITQUEUE(wait, current);
	struct sk_buff *skb;

	/* Fast path: skip the wait queue entirely when data is ready. */
	skb = skb_array_consume(&tfile->tx_array);
	if (skb)
		goto out;
	if (noblock) {
		*err = -EAGAIN;
		goto out;
	}

	add_wait_queue(&tfile->wq.wait, &wait);

	while (1) {
		/*
		 * The sleeping state must be re-armed on every pass: a
		 * wakeup leaves the task in TASK_RUNNING, so setting it
		 * only once before the loop would turn every later
		 * schedule() into a non-sleeping call.  It must also be
		 * set *before* the condition is re-checked, with the
		 * barrier set_current_state() provides, so that a wakeup
		 * racing with the check is not lost.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		skb = skb_array_consume(&tfile->tx_array);
		if (skb)
			break;
		if (signal_pending(current)) {
			*err = -ERESTARTSYS;
			break;
		}
		if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
			*err = -EFAULT;
			break;
		}

		schedule();
	}

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&tfile->wq.wait, &wait);

out:
	return skb;
}

static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
struct iov_iter *to,
int noblock)
{
struct sk_buff *skb;
ssize_t ret;
int peeked, err, off = 0;
int err;

tun_debug(KERN_INFO, tun, "tun_do_read\n");

if (!iov_iter_count(to))
return 0;

/* Read frames from queue */
skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
&peeked, &off, &err);
/* Read frames from ring */
skb = tun_ring_recv(tfile, noblock, &err);
if (!skb)
return err;

Expand Down Expand Up @@ -1574,8 +1629,25 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
return ret;
}

/*
 * .peek_len socket operation for tun: report skb_array_peek_len() of the
 * tun file's tx array, or 0 when __tun_get() yields no tun device.
 */
static int tun_peek_len(struct socket *sock)
{
	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
	struct tun_struct *tun = __tun_get(tfile);
	int len;

	if (!tun)
		return 0;

	len = skb_array_peek_len(&tfile->tx_array);
	/* Release the reference taken by __tun_get(). */
	tun_put(tun);

	return len;
}

/*
 * Socket operations that let a tun file be driven like a raw socket:
 * message send/receive plus a peek_len hook for the tx skb array.
 */
static const struct proto_ops tun_socket_ops = {
	.sendmsg	= tun_sendmsg,
	.recvmsg	= tun_recvmsg,
	.peek_len	= tun_peek_len,
};
Expand Down Expand Up @@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
};

/*
 * Resize the tx skb array of every queue of @tun to dev->tx_queue_len.
 *
 * The tx_array of each enabled queue (tun->tfiles[0..numqueues)) and of
 * each entry on the tun->disabled list is collected into a temporary
 * pointer table, which is handed to skb_array_resize_multiple() in a
 * single call.
 *
 * NOTE(review): tfiles[] is read with rtnl_dereference(), so this is
 * presumably only called with RTNL held - confirm against callers.
 *
 * Returns -ENOMEM if the pointer table cannot be allocated, otherwise
 * whatever skb_array_resize_multiple() returns.
 */
static int tun_queue_resize(struct tun_struct *tun)
{
	struct net_device *dev = tun->dev;
	struct tun_file *tfile;
	struct skb_array **arrays;
	int n = tun->numqueues + tun->numdisabled;
	int ret, i;

	/* kmalloc_array() checks the n * size multiplication for overflow. */
	arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
	if (!arrays)
		return -ENOMEM;

	for (i = 0; i < tun->numqueues; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		arrays[i] = &tfile->tx_array;
	}
	/* i continues from numqueues: disabled queues fill the tail. */
	list_for_each_entry(tfile, &tun->disabled, next)
		arrays[i++] = &tfile->tx_array;

	ret = skb_array_resize_multiple(arrays, n,
					dev->tx_queue_len, GFP_KERNEL);

	kfree(arrays);
	return ret;
}

static int tun_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tun_struct *tun = netdev_priv(dev);

switch (event) {
case NETDEV_CHANGE_TX_QUEUE_LEN:
if (tun_queue_resize(tun))
return NOTIFY_BAD;
break;
default:
break;
}

return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to tun_device_event(). */
static struct notifier_block tun_notifier_block __read_mostly = {
.notifier_call = tun_device_event,
};

static int __init tun_init(void)
{
Expand All @@ -2416,6 +2535,8 @@ static int __init tun_init(void)
pr_err("Can't register misc device %d\n", TUN_MINOR);
goto err_misc;
}

register_netdevice_notifier(&tun_notifier_block);
return 0;
err_misc:
rtnl_link_unregister(&tun_link_ops);
Expand All @@ -2427,6 +2548,7 @@ static void tun_cleanup(void)
{
misc_deregister(&tun_miscdev);
rtnl_link_unregister(&tun_link_ops);
unregister_netdevice_notifier(&tun_notifier_block);
}

/* Get an underlying socket object from tun file. Returns error unless file is
Expand Down
16 changes: 15 additions & 1 deletion drivers/vhost/net.c
Original file line number Diff line number Diff line change
Expand Up @@ -481,10 +481,14 @@ static void handle_tx(struct vhost_net *net)

static int peek_head_len(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
struct sk_buff *head;
int len = 0;
unsigned long flags;

if (sock->ops->peek_len)
return sock->ops->peek_len(sock);

spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
if (likely(head)) {
Expand All @@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk)
return len;
}

/*
 * Report whether @sk has receive data pending.
 *
 * Sockets whose proto_ops provide ->peek_len are asked through that
 * hook; its return value is passed through unchanged.  All other
 * sockets are judged by sk_receive_queue.
 *
 * Returns non-zero when data is pending, 0 otherwise.
 */
static int sk_has_rx_data(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (sock->ops->peek_len)
		return sock->ops->peek_len(sock);

	/*
	 * "Has data" is the negation of "queue is empty"; returning
	 * skb_queue_empty() directly would invert the answer for every
	 * socket without a ->peek_len hook.
	 */
	return !skb_queue_empty(&sk->sk_receive_queue);
}

static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
Expand All @@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
endtime = busy_clock() + vq->busyloop_timeout;

while (vhost_can_busy_poll(&net->dev, endtime) &&
skb_queue_empty(&sk->sk_receive_queue) &&
!sk_has_rx_data(sk) &&
vhost_vq_avail_empty(&net->dev, vq))
cpu_relax_lowlatency();

Expand Down
1 change: 1 addition & 0 deletions include/linux/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ struct proto_ops {
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
int (*set_peek_off)(struct sock *sk, int val);
int (*peek_len)(struct socket *sock);
};

#define DECLARE_SOCKADDR(type, dst, src) \
Expand Down
1 change: 1 addition & 0 deletions include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -2237,6 +2237,7 @@ struct netdev_lag_lower_state_info {
#define NETDEV_PRECHANGEUPPER 0x001A
#define NETDEV_CHANGELOWERSTATE 0x001B
#define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C
#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E

int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
Expand Down
Loading

0 comments on commit beb528d

Please sign in to comment.