Skip to content

Commit

Permalink
net: accept UFO datagrams from tuntap and packet
Browse files Browse the repository at this point in the history
Tuntap and similar devices can inject GSO packets. Accept type
VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively.

Processes are expected to use feature negotiation such as TUNSETOFFLOAD
to detect supported offload types and refrain from injecting other
packets. This process breaks down with live migration: guest kernels
do not renegotiate flags, so destination hosts need to expose all
features that the source host does.

Partially revert the UFO removal from 182e0b6~1..d9d30ad.
This patch introduces nearly(*) no new code to simplify verification.
It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP
insertion and software UFO segmentation.

It does not reinstate protocol stack support, hardware offload
(NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception
of VIRTIO_NET_HDR_GSO_UDP packets in tuntap.

To support SKB_GSO_UDP reappearing in the stack, also reinstate
logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD
by squashing in commit 9399122 ("net: skb_needs_check() removes
CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee
("net: avoid skb_warn_bad_offload false positives on UFO").

(*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id,
ipv6_proxy_select_ident is changed to return a __be32 and this is
assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted
at the end of the enum to minimize code churn.

Tested
  Booted a v4.13 guest kernel with QEMU. On a host kernel before this
  patch `ethtool -k eth0` shows UFO disabled. After the patch, it is
  enabled, same as on a v4.13 host kernel.

  A UFO packet sent from the guest appears on the tap device:
    host:
      nc -l -u -p 8000 &
      tcpdump -n -i tap0

    guest:
      dd if=/dev/zero of=payload.txt bs=1 count=2000
      nc -u 192.16.1.1 8000 < payload.txt

  Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds,
  packets arriving fragmented:

    ./with_tap_pair.sh ./tap_send_ufo tap0 tap1
    (from https://github.com/wdebruij/kerneltools/tree/master/tests)

Changes
  v1 -> v2
    - simplified set_offload change (review comment)
    - documented test procedure

Link: http://lkml.kernel.org/r/<CAF=yD-LuUeDuL9YWPJD9ykOZ0QCjNeznPDr6whqZ9NGMNF12Mw@mail.gmail.com>
Fixes: fb652fd ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.")
Reported-by: Michal Kubecek <[email protected]>
Signed-off-by: Willem de Bruijn <[email protected]>
Acked-by: Jason Wang <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
wdebruij authored and davem330 committed Nov 23, 2017
1 parent 9e77d7a commit 0c19f84
Show file tree
Hide file tree
Showing 15 changed files with 209 additions and 14 deletions.
2 changes: 1 addition & 1 deletion drivers/net/tap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1077,7 +1077,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
case TUNSETOFFLOAD:
/* let the user check for future flags */
if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
TUN_F_TSO_ECN))
TUN_F_TSO_ECN | TUN_F_UFO))
return -EINVAL;

rtnl_lock();
Expand Down
2 changes: 2 additions & 0 deletions drivers/net/tun.c
Original file line number Diff line number Diff line change
Expand Up @@ -2370,6 +2370,8 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
features |= NETIF_F_TSO6;
arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
}

arg &= ~TUN_F_UFO;
}

/* This gives the user a way to test for new features in future by
Expand Down
4 changes: 3 additions & 1 deletion include/linux/netdev_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,9 @@ enum {
NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */
NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */
NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */
/**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */
NETIF_F_GSO_ESP_BIT,
NETIF_F_GSO_UDP_BIT,

NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */
NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */
Expand Down Expand Up @@ -132,6 +133,7 @@ enum {
#define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
#define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP)
#define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP)
#define NETIF_F_GSO_UDP __NETIF_F(GSO_UDP)
#define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
#define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX)
#define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX)
Expand Down
1 change: 1 addition & 0 deletions include/linux/netdevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -4140,6 +4140,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));

return (features & feature) == feature;
}
Expand Down
2 changes: 2 additions & 0 deletions include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,8 @@ enum {
SKB_GSO_SCTP = 1 << 14,

SKB_GSO_ESP = 1 << 15,

SKB_GSO_UDP = 1 << 16,
};

#if BITS_PER_LONG > 32
Expand Down
5 changes: 4 additions & 1 deletion include/linux/virtio_net.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
const struct virtio_net_hdr *hdr,
bool little_endian)
{
unsigned short gso_type = 0;
unsigned int gso_type = 0;

if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
Expand All @@ -19,6 +19,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
case VIRTIO_NET_HDR_GSO_TCPV6:
gso_type = SKB_GSO_TCPV6;
break;
case VIRTIO_NET_HDR_GSO_UDP:
gso_type = SKB_GSO_UDP;
break;
default:
return -EINVAL;
}
Expand Down
1 change: 1 addition & 0 deletions include/net/ipv6.h
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,7 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
__be32 ipv6_select_ident(struct net *net,
const struct in6_addr *daddr,
const struct in6_addr *saddr);
__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);

int ip6_dst_hoplimit(struct dst_entry *dst);

Expand Down
3 changes: 2 additions & 1 deletion net/core/dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -2746,7 +2746,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
if (tx_path)
return skb->ip_summed != CHECKSUM_PARTIAL;
return skb->ip_summed != CHECKSUM_PARTIAL &&
skb->ip_summed != CHECKSUM_UNNECESSARY;

return skb->ip_summed == CHECKSUM_NONE;
}
Expand Down
12 changes: 10 additions & 2 deletions net/ipv4/af_inet.c
Original file line number Diff line number Diff line change
Expand Up @@ -1223,9 +1223,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
bool fixedid = false, gso_partial, encap;
bool udpfrag = false, fixedid = false, gso_partial, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
struct iphdr *iph;
int proto, tot_len;
int nhoff;
Expand Down Expand Up @@ -1260,6 +1261,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);

if (!skb->encapsulation || encap) {
udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);

/* fixed ID is invalid if DF bit is not set */
Expand All @@ -1279,7 +1281,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
skb = segs;
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
if (skb_is_gso(skb)) {
if (udpfrag) {
iph->frag_off = htons(offset >> 3);
if (skb->next)
iph->frag_off |= htons(IP_MF);
offset += skb->len - nhoff - ihl;
tot_len = skb->len - nhoff;
} else if (skb_is_gso(skb)) {
if (!fixedid) {
iph->id = htons(id);
id += skb_shinfo(skb)->gso_segs;
Expand Down
49 changes: 45 additions & 4 deletions net/ipv4/udp_offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,16 +187,57 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);

static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb,
netdev_features_t features)
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
__wsum csum;
struct udphdr *uh;
struct iphdr *iph;

if (skb->encapsulation &&
(skb_shinfo(skb)->gso_type &
(SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)))
(SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) {
segs = skb_udp_tunnel_segment(skb, features, false);
goto out;
}

if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;

mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;

/* Do software UFO. Complete and fill in the UDP checksum as
* HW cannot do checksum of UDP packets sent as multiple
* IP fragments.
*/

uh = udp_hdr(skb);
iph = ip_hdr(skb);

uh->check = 0;
csum = skb_checksum(skb, 0, skb->len, 0);
uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;

skb->ip_summed = CHECKSUM_UNNECESSARY;

/* If there is no outer header we can fake a checksum offload
* due to the fact that we have already done the checksum in
* software prior to segmenting the frame.
*/
if (!skb->encap_hdr_csum)
features |= NETIF_F_HW_CSUM;

/* Fragment the skb. IP headers of the fragments are updated in
* inet_gso_segment()
*/
segs = skb_segment(skb, features);
out:
return segs;
}

Expand Down Expand Up @@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)

static const struct net_offload udpv4_offload = {
.callbacks = {
.gso_segment = udp4_tunnel_segment,
.gso_segment = udp4_ufo_fragment,
.gro_receive = udp4_gro_receive,
.gro_complete = udp4_gro_complete,
},
Expand Down
31 changes: 31 additions & 0 deletions net/ipv6/output_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,37 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
return id;
}

/* Choose an IPv6 fragment identification on behalf of a tap client.
 *
 * Needed only by tap drivers, which have to tolerate broken clients that
 * request UFO without supplying an IPv6 fragment ID themselves.
 *
 * Behaves like ipv6_select_ident(), except that a separate hash seed is
 * used in order to limit information leakage.
 *
 * The caller must have set the skb's network header beforehand.
 *
 * Returns the identification converted with htonl(), or 0 when the
 * address pair cannot be read out of the skb.
 */
__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
{
	static u32 ip6_proxy_idents_hashrnd __read_mostly;
	struct in6_addr addr_buf[2];
	struct in6_addr *pair;

	/* Fetch two consecutive in6_addr values starting at the header's
	 * saddr field; addr_buf is the fallback storage handed to
	 * skb_header_pointer().
	 */
	pair = skb_header_pointer(skb,
				  skb_network_offset(skb) +
				  offsetof(struct ipv6hdr, saddr),
				  sizeof(addr_buf), addr_buf);
	if (!pair)
		return 0;

	/* Seed the proxy-specific hash the first time through. */
	net_get_random_once(&ip6_proxy_idents_hashrnd,
			    sizeof(ip6_proxy_idents_hashrnd));

	/* Second address of the pair is passed first, then the first. */
	return htonl(__ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
					 &pair[1], &pair[0]));
}
EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);

__be32 ipv6_select_ident(struct net *net,
const struct in6_addr *daddr,
const struct in6_addr *saddr)
Expand Down
85 changes: 82 additions & 3 deletions net/ipv6/udp_offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,94 @@
#include <net/ip6_checksum.h>
#include "ip6_offload.h"

static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb,
netdev_features_t features)
static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
unsigned int unfrag_ip6hlen, unfrag_len;
struct frag_hdr *fptr;
u8 *packet_start, *prevhdr;
u8 nexthdr;
u8 frag_hdr_sz = sizeof(struct frag_hdr);
__wsum csum;
int tnl_hlen;
int err;

mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
goto out;

if (skb->encapsulation && skb_shinfo(skb)->gso_type &
(SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
segs = skb_udp_tunnel_segment(skb, features, true);
else {
const struct ipv6hdr *ipv6h;
struct udphdr *uh;

if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto out;

/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
* do checksum of UDP packets sent as multiple IP fragments.
*/

uh = udp_hdr(skb);
ipv6h = ipv6_hdr(skb);

uh->check = 0;
csum = skb_checksum(skb, 0, skb->len, 0);
uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
&ipv6h->daddr, csum);
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;

skb->ip_summed = CHECKSUM_UNNECESSARY;

/* If there is no outer header we can fake a checksum offload
* due to the fact that we have already done the checksum in
* software prior to segmenting the frame.
*/
if (!skb->encap_hdr_csum)
features |= NETIF_F_HW_CSUM;

/* Check if there is enough headroom to insert fragment header. */
tnl_hlen = skb_tnl_header_len(skb);
if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
goto out;
}

/* Find the unfragmentable header and shift it left by frag_hdr_sz
* bytes to insert fragment header.
*/
err = ip6_find_1stfragopt(skb, &prevhdr);
if (err < 0)
return ERR_PTR(err);
unfrag_ip6hlen = err;
nexthdr = *prevhdr;
*prevhdr = NEXTHDR_FRAGMENT;
unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
unfrag_ip6hlen + tnl_hlen;
packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);

SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
skb->mac_header -= frag_hdr_sz;
skb->network_header -= frag_hdr_sz;

fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
fptr->nexthdr = nexthdr;
fptr->reserved = 0;
fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb);

/* Fragment the skb. ipv6 header and the remaining fields of the
* fragment header are updated in ipv6_gso_segment()
*/
segs = skb_segment(skb, features);
}

out:
return segs;
}

Expand Down Expand Up @@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)

static const struct net_offload udpv6_offload = {
.callbacks = {
.gso_segment = udp6_tunnel_segment,
.gso_segment = udp6_ufo_fragment,
.gro_receive = udp6_gro_receive,
.gro_complete = udp6_gro_complete,
},
Expand Down
14 changes: 14 additions & 0 deletions net/openvswitch/datapath.c
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
const struct dp_upcall_info *upcall_info,
uint32_t cutlen)
{
unsigned short gso_type = skb_shinfo(skb)->gso_type;
struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
int err;

Expand All @@ -318,9 +320,21 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
if (segs == NULL)
return -EINVAL;

if (gso_type & SKB_GSO_UDP) {
/* The initial flow key extracted by ovs_flow_key_extract()
* in this case is for a first fragment, so we need to
* properly mark later fragments.
*/
later_key = *key;
later_key.ip.frag = OVS_FRAG_TYPE_LATER;
}

/* Queue all of the segments. */
skb = segs;
do {
if (gso_type & SKB_GSO_UDP && skb != segs)
key = &later_key;

err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
if (err)
break;
Expand Down
6 changes: 5 additions & 1 deletion net/openvswitch/flow.c
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
key->ip.frag = OVS_FRAG_TYPE_LATER;
return 0;
}
if (nh->frag_off & htons(IP_MF))
if (nh->frag_off & htons(IP_MF) ||
skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
key->ip.frag = OVS_FRAG_TYPE_FIRST;
else
key->ip.frag = OVS_FRAG_TYPE_NONE;
Expand Down Expand Up @@ -747,6 +748,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)

if (key->ip.frag == OVS_FRAG_TYPE_LATER)
return 0;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
key->ip.frag = OVS_FRAG_TYPE_FIRST;

/* Transport layer. */
if (key->ip.proto == NEXTHDR_TCP) {
if (tcphdr_ok(skb)) {
Expand Down
Loading

0 comments on commit 0c19f84

Please sign in to comment.