Skip to content

Commit

Permalink
Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
Browse files Browse the repository at this point in the history
Pablo Neira Ayuso says:

====================
Netfilter/IPVS/OVS updates for net-next

The following patchset contains Netfilter/IPVS fixes and OVS NAT
support. More specifically, this batch is composed of:

1) Fix a crash in ipset when performing a parallel flush/dump with
   the list:set type, from Jozsef Kadlecsik.

2) Make sure NFACCT_FILTER_* netlink attributes are in place before
   accessing them, from Phil Turnbull.

3) Check return error code from ip_vs_fill_iph_skb_off() in IPVS SIP
   helper, from Arnd Bergmann.

4) Add a workaround to IPVS to reschedule existing connections to a new
   destination server by dropping the packet and waiting for retransmission
   of the TCP SYN packet, from Julian Anastasov.

5) Allow connection rescheduling in IPVS when in CLOSE state, also
   from Julian.

6) Fix wrong offset of SIP Call-ID in IPVS helper, from Marco Angaroni.

7) Validate IPSET_ATTR_ETHER netlink attribute length, from Jozsef.

8) Check match/targetinfo netlink attribute size in nft_compat,
   patch from Florian Westphal.

9) Check for integer overflow on 32-bit systems in x_tables, from
   Florian Westphal.

Several patches from Jarno Rajahalme to prepare the introduction of
NAT support to OVS based on the Netfilter infrastructure:

10) Schedule IP_CT_NEW_REPLY definition for removal in
    nf_conntrack_common.h.

11) Simplify checksumming recalculation in nf_nat.

12) Add comments to the openvswitch conntrack code, from Jarno.

13) Update the CT state key only after successful nf_conntrack_in()
    invocation.

14) Find existing conntrack entry after upcall.

15) Handle NF_REPEAT case due to templates in nf_conntrack_in().

16) Call the conntrack helper functions once the conntrack has been
    confirmed.

17) And finally, add the NAT interface to OVS.

The batch closes with:

18) Cleanup to use spin_unlock_wait() instead of
    spin_lock()/spin_unlock(), from Nicholas Mc Guire.
====================

Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
davem330 committed Mar 15, 2016
2 parents acffb58 + e39365b commit 1cdba55
Show file tree
Hide file tree
Showing 18 changed files with 795 additions and 134 deletions.
17 changes: 17 additions & 0 deletions include/net/ip_vs.h
Original file line number Diff line number Diff line change
Expand Up @@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
}
#endif /* CONFIG_IP_VS_NFCT */

/* Is this connection really handled by conntrack?
 *
 * Returns true only when CONFIG_IP_VS_NFCT is enabled, @cp has the
 * IP_VS_CONN_F_NFCT flag set, and nf_ct_get() yields a conntrack entry
 * for @skb that is not the untracked one. Returns false in every other
 * case, including when CONFIG_IP_VS_NFCT is not enabled.
 */
static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp,
					     struct sk_buff *skb)
{
#ifdef CONFIG_IP_VS_NFCT
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	if (cp->flags & IP_VS_CONN_F_NFCT) {
		/* ctinfo is filled in by nf_ct_get() but not used here. */
		ct = nf_ct_get(skb, &ctinfo);
		return ct && !nf_ct_is_untracked(ct);
	}
#endif
	return false;
}

static inline int
ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
{
Expand Down
12 changes: 9 additions & 3 deletions include/uapi/linux/netfilter/nf_conntrack_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@ enum ip_conntrack_info {

IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,
/* Number of distinct IP_CT types (no NEW in reply dirn). */
IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
/* No NEW in reply direction. */

/* Number of distinct IP_CT types. */
IP_CT_NUMBER,

/* only for userspace compatibility */
#ifndef __KERNEL__
IP_CT_NEW_REPLY = IP_CT_NUMBER,
#endif
};

#define NF_CT_STATE_INVALID_BIT (1 << 0)
Expand Down
49 changes: 49 additions & 0 deletions include/uapi/linux/openvswitch.h
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,14 @@ struct ovs_key_ct_labels {
#define OVS_CS_F_REPLY_DIR 0x08 /* Flow is in the reply direction. */
#define OVS_CS_F_INVALID 0x10 /* Could not track connection. */
#define OVS_CS_F_TRACKED 0x20 /* Conntrack has occurred. */
#define OVS_CS_F_SRC_NAT 0x40 /* Packet's source address/port was
* mangled by NAT.
*/
#define OVS_CS_F_DST_NAT 0x80 /* Packet's destination address/port
* was mangled by NAT.
*/

#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)

/**
* enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
Expand Down Expand Up @@ -632,6 +640,8 @@ struct ovs_action_hash {
* mask. For each bit set in the mask, the corresponding bit in the value is
* copied to the connection tracking label field in the connection.
* @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
* @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
* translation (NAT) on the packet.
*/
enum ovs_ct_attr {
OVS_CT_ATTR_UNSPEC,
Expand All @@ -641,11 +651,50 @@ enum ovs_ct_attr {
OVS_CT_ATTR_LABELS, /* labels to associate with this connection. */
OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of
related connections. */
OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */
__OVS_CT_ATTR_MAX
};

#define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)

/**
* enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
*
* @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
* @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
* address/port). Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
* specified. Effective only for packets for ct_state NEW connections.
* Packets of committed connections are mangled by the NAT action according to
* the committed NAT type regardless of the flags specified. As a corollary, a
* NAT action without a NAT type flag will only mangle packets of committed
* connections. The following NAT attributes only apply for NEW
* (non-committed) connections, and they may be included only when the CT
* action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
* @OVS_NAT_ATTR_DST is also included.
* @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
* @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
* @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
* @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
* @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
* @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
* @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
*/
enum ovs_nat_attr {
OVS_NAT_ATTR_UNSPEC, /* First enumerator, implicit value 0. */
OVS_NAT_ATTR_SRC, /* Flag: source NAT (mangle source address/port). */
OVS_NAT_ATTR_DST, /* Flag: destination NAT (mangle destination address/port). */
OVS_NAT_ATTR_IP_MIN, /* struct in_addr or struct in6_addr. */
OVS_NAT_ATTR_IP_MAX, /* struct in_addr or struct in6_addr. */
OVS_NAT_ATTR_PROTO_MIN, /* u16 L4 protocol specific lower boundary (port). */
OVS_NAT_ATTR_PROTO_MAX, /* u16 L4 protocol specific upper boundary (port). */
OVS_NAT_ATTR_PERSISTENT, /* Flag: persistent IP mapping across reboots. */
OVS_NAT_ATTR_PROTO_HASH, /* Flag: pseudo random L4 port mapping (MD5). */
OVS_NAT_ATTR_PROTO_RANDOM, /* Flag: fully randomized L4 port mapping. */
__OVS_NAT_ATTR_MAX, /* Last enumerator; OVS_NAT_ATTR_MAX is this value minus 1. */
};

#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)

/**
* enum ovs_action_attr - Action types.
*
Expand Down
30 changes: 8 additions & 22 deletions net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
u8 proto, void *data, __sum16 *check,
int datalen, int oldlen)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt = skb_rtable(skb);

if (skb->ip_summed != CHECKSUM_PARTIAL) {
if (!(rt->rt_flags & RTCF_LOCAL) &&
(!skb->dev || skb->dev->features &
(NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_headroom(skb) +
skb_network_offset(skb) +
ip_hdrlen(skb);
skb->csum_offset = (void *)check - data;
*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
datalen, proto, 0);
} else {
*check = 0;
*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
datalen, proto,
csum_partial(data, datalen,
0));
if (proto == IPPROTO_UDP && !*check)
*check = CSUM_MANGLED_0;
}
const struct iphdr *iph = ip_hdr(skb);

skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
ip_hdrlen(skb);
skb->csum_offset = (void *)check - data;
*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
proto, 0);
} else
inet_proto_csum_replace2(check, skb,
htons(oldlen), htons(datalen), true);
Expand Down
30 changes: 8 additions & 22 deletions net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -131,29 +131,15 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
u8 proto, void *data, __sum16 *check,
int datalen, int oldlen)
{
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);

if (skb->ip_summed != CHECKSUM_PARTIAL) {
if (!(rt->rt6i_flags & RTF_LOCAL) &&
(!skb->dev || skb->dev->features &
(NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))) {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_headroom(skb) +
skb_network_offset(skb) +
(data - (void *)skb->data);
skb->csum_offset = (void *)check - data;
*check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
datalen, proto, 0);
} else {
*check = 0;
*check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
datalen, proto,
csum_partial(data, datalen,
0));
if (proto == IPPROTO_UDP && !*check)
*check = CSUM_MANGLED_0;
}
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);

skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
(data - (void *)skb->data);
skb->csum_offset = (void *)check - data;
*check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
datalen, proto, 0);
} else
inet_proto_csum_replace2(check, skb,
htons(oldlen), htons(datalen), true);
Expand Down
2 changes: 2 additions & 0 deletions net/netfilter/ipset/ip_set_bitmap_ipmac.c
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,8 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],

e.id = ip_to_id(map, ip);
if (tb[IPSET_ATTR_ETHER]) {
if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)
return -IPSET_ERR_PROTOCOL;
memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
e.add_mac = 1;
}
Expand Down
3 changes: 3 additions & 0 deletions net/netfilter/ipset/ip_set_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,9 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
if (unlikely(protocol_failed(attr)))
return -IPSET_ERR_PROTOCOL;

/* Must wait for flush to be really finished in list:set */
rcu_barrier();

/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
* External systems (i.e. xt_set) must call
Expand Down
3 changes: 2 additions & 1 deletion net/netfilter/ipset/ip_set_hash_mac.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);

if (unlikely(!tb[IPSET_ATTR_ETHER]))
if (unlikely(!tb[IPSET_ATTR_ETHER] ||
nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN))
return -IPSET_ERR_PROTOCOL;

ret = ip_set_get_extensions(set, tb, &ext);
Expand Down
55 changes: 25 additions & 30 deletions net/netfilter/ipset/ip_set_list_set.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ MODULE_ALIAS("ip_set_list:set");
struct set_elem {
struct rcu_head rcu;
struct list_head list;
struct ip_set *set; /* Sigh, in order to cleanup reference */
ip_set_id_t id;
} __aligned(__alignof__(u64));

Expand Down Expand Up @@ -151,30 +152,29 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
/* Userspace interfaces: we are protected by the nfnl mutex */

static void
__list_set_del(struct ip_set *set, struct set_elem *e)
__list_set_del_rcu(struct rcu_head * rcu)
{
struct set_elem *e = container_of(rcu, struct set_elem, rcu);
struct ip_set *set = e->set;
struct list_set *map = set->data;

ip_set_put_byindex(map->net, e->id);
/* We may call it, because we don't have a to be destroyed
* extension which is used by the kernel.
*/
ip_set_ext_destroy(set, e);
kfree_rcu(e, rcu);
kfree(e);
}

static inline void
list_set_del(struct ip_set *set, struct set_elem *e)
{
list_del_rcu(&e->list);
__list_set_del(set, e);
call_rcu(&e->rcu, __list_set_del_rcu);
}

static inline void
list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
list_set_replace(struct set_elem *e, struct set_elem *old)
{
list_replace_rcu(&old->list, &e->list);
__list_set_del(set, old);
call_rcu(&old->rcu, __list_set_del_rcu);
}

static void
Expand Down Expand Up @@ -244,9 +244,6 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct set_elem *e, *n, *prev, *next;
bool flag_exist = flags & IPSET_FLAG_EXIST;

if (SET_WITH_TIMEOUT(set))
set_cleanup_entries(set);

/* Find where to add the new entry */
n = prev = next = NULL;
list_for_each_entry(e, &map->members, list) {
Expand Down Expand Up @@ -301,10 +298,11 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (!e)
return -ENOMEM;
e->id = d->id;
e->set = set;
INIT_LIST_HEAD(&e->list);
list_set_init_extensions(set, ext, e);
if (n)
list_set_replace(set, e, n);
list_set_replace(e, n);
else if (next)
list_add_tail_rcu(&e->list, &next->list);
else if (prev)
Expand Down Expand Up @@ -431,6 +429,7 @@ list_set_destroy(struct ip_set *set)

if (SET_WITH_TIMEOUT(set))
del_timer_sync(&map->gc);

list_for_each_entry_safe(e, n, &map->members, list) {
list_del(&e->list);
ip_set_put_byindex(map->net, e->id);
Expand All @@ -450,8 +449,10 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
struct set_elem *e;
u32 n = 0;

list_for_each_entry(e, &map->members, list)
rcu_read_lock();
list_for_each_entry_rcu(e, &map->members, list)
n++;
rcu_read_unlock();

nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
Expand Down Expand Up @@ -483,33 +484,25 @@ list_set_list(const struct ip_set *set,
atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
list_for_each_entry(e, &map->members, list) {
if (i == first)
break;
i++;
}

rcu_read_lock();
list_for_each_entry_from(e, &map->members, list) {
i++;
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
list_for_each_entry_rcu(e, &map->members, list) {
if (i < first ||
(SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))) {
i++;
continue;
}
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested) {
if (i == first) {
nla_nest_cancel(skb, atd);
ret = -EMSGSIZE;
goto out;
}
if (!nested)
goto nla_put_failure;
}
if (nla_put_string(skb, IPSET_ATTR_NAME,
ip_set_name_byindex(map->net, e->id)))
goto nla_put_failure;
if (ip_set_put_extensions(skb, set, e, true))
goto nla_put_failure;
ipset_nest_end(skb, nested);
i++;
}

ipset_nest_end(skb, atd);
Expand All @@ -520,10 +513,12 @@ list_set_list(const struct ip_set *set,
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(i == first)) {
nla_nest_cancel(skb, atd);
cb->args[IPSET_CB_ARG0] = 0;
ret = -EMSGSIZE;
} else {
cb->args[IPSET_CB_ARG0] = i;
}
cb->args[IPSET_CB_ARG0] = i - 1;
ipset_nest_end(skb, atd);
out:
rcu_read_unlock();
Expand Down
Loading

0 comments on commit 1cdba55

Please sign in to comment.