Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf
Pablo Neira Ayuso says:

====================
Netfilter fixes for net

This batch contains Netfilter fixes for net:

1) Restrict hashlimit size to 1048576, from Cong Wang.

2) Check for offload flags from nf_flow_table_offload_setup(); this
   fixes a crash when hardware offload is disabled. From Florian
   Westphal.

3) Three preparation patches to extend the conntrack clash resolution,
   from Florian.

4) Extend clash resolution to deal with DNS packets from the same flow
   racing to set up the NAT configuration; a sketch of this traffic
   pattern appears before the diffs below.

5) Small documentation fix in pipapo, from Stefano Brivio.

6) Remove misleading unlikely() from pipapo_refill(), also from Stefano.

7) Reduce hashlimit mutex scope, from Cong Wang. This patch actually
   triggers another problem, still under discussion; a follow-up patch
   will address it.
====================

Signed-off-by: David S. Miller <[email protected]>
davem330 committed Feb 18, 2020
2 parents a2cfb96 + 9a77120 commit 7c8c169
Showing 7 changed files with 220 additions and 45 deletions.
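
For context on item 4: the typical trigger is a resolver sending two UDP
queries back-to-back on one socket (e.g. parallel A and AAAA lookups), so
both datagrams share one 4-tuple and can race conntrack insertion on
different CPUs. Below is a minimal userspace sketch of that pattern; the
server address and payloads are hypothetical placeholders, not part of
this commit.

/* two datagrams, one 4-tuple: the clash-prone traffic pattern */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        struct sockaddr_in dns = { .sin_family = AF_INET, .sin_port = htons(53) };
        char q1[] = "placeholder A query";    /* hypothetical payload */
        char q2[] = "placeholder AAAA query"; /* hypothetical payload */

        if (fd < 0)
                return 1;
        inet_pton(AF_INET, "192.0.2.53", &dns.sin_addr); /* documentation address */

        /* Same source/destination pair for both packets; if each is
         * conntracked on a different CPU before either entry is
         * confirmed, the second insertion clashes with the first.
         */
        sendto(fd, q1, sizeof(q1), 0, (struct sockaddr *)&dns, sizeof(dns));
        sendto(fd, q2, sizeof(q2), 0, (struct sockaddr *)&dns, sizeof(dns));

        close(fd);
        return 0;
}

Previously a clash in this situation could drop the second packet; with
the extended resolution it is merged into the existing entry or kept
reply-direction-only instead.
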
7 changes: 7 additions & 0 deletions include/linux/rculist_nulls.h
@@ -145,6 +145,13 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
}
}

+/* after that hlist_nulls_del will work */
+static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
+{
+        n->pprev = &n->next;
+        n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
+}
+
/**
* hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
* @tpos: the type * to use as a loop cursor.
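
The hlist_nulls_add_fake() helper added above gives a node a self-referential
pprev, so a later hlist_nulls_del() works even though the node was never on
any list, and no list traversal can ever reach it. A minimal userspace model
of the trick -- simplified stand-ins for the kernel types; the real
hlist_nulls_del() additionally poisons the pointers:

#include <stdio.h>

struct hlist_nulls_node {
        struct hlist_nulls_node *next, **pprev;
};

/* same encoding as the kernel macro: low bit set marks a "nulls" list end */
#define NULLS_MARKER(value) (1UL | (((long)(value)) << 1))

static void hlist_nulls_add_fake(struct hlist_nulls_node *n)
{
        n->pprev = &n->next;
        n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
}

static void hlist_nulls_del_simplified(struct hlist_nulls_node *n)
{
        *n->pprev = n->next; /* self-referential: stores n->next onto itself */
}

int main(void)
{
        struct hlist_nulls_node n;

        hlist_nulls_add_fake(&n); /* deletable, yet reachable from no list head */
        hlist_nulls_del_simplified(&n);
        printf("fake node unlinked without ever being hashed\n");
        return 0;
}
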
12 changes: 11 additions & 1 deletion include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -97,6 +97,15 @@ enum ip_conntrack_status {
IPS_UNTRACKED_BIT = 12,
IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT),

+#ifdef __KERNEL__
+        /* Re-purposed for in-kernel use:
+         * Tags a conntrack entry that clashed with an existing entry
+         * on insert.
+         */
+        IPS_NAT_CLASH_BIT = IPS_UNTRACKED_BIT,
+        IPS_NAT_CLASH = IPS_UNTRACKED,
+#endif
+
/* Conntrack got a helper explicitly attached via CT target. */
IPS_HELPER_BIT = 13,
IPS_HELPER = (1 << IPS_HELPER_BIT),
@@ -110,7 +119,8 @@ enum ip_conntrack_status {
*/
IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
-                                 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
+                                 IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED |
+                                 IPS_OFFLOAD),

__IPS_MAX_BIT = 15,
};
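
Reusing bit 12 is safe because the two meanings cannot meet: IPS_UNTRACKED is
never set on an entry that reaches the hash table, while IPS_NAT_CLASH is only
set on hashed entries; adding IPS_UNTRACKED to IPS_UNCHANGEABLE_MASK also keeps
ctnetlink from flipping the bit from userspace. A small userspace restatement
of the aliasing (the enum is redefined here, since the #ifdef __KERNEL__ block
above is invisible to userspace):

#include <assert.h>

enum {
        IPS_UNTRACKED_BIT = 12,
        IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT),
        IPS_NAT_CLASH_BIT = IPS_UNTRACKED_BIT, /* alias, not a new bit */
        IPS_NAT_CLASH = IPS_UNTRACKED,
};

int main(void)
{
        /* same storage, disjoint lifetimes: untracked entries are never
         * hashed, clash-tagged entries always are
         */
        assert(IPS_NAT_CLASH == IPS_UNTRACKED);
        assert(IPS_NAT_CLASH_BIT < 15); /* stays below __IPS_MAX_BIT */
        return 0;
}
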
192 changes: 164 additions & 28 deletions net/netfilter/nf_conntrack_core.c
@@ -894,32 +894,175 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
}
}

-/* Resolve race on insertion if this protocol allows this. */
+static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
+{
+        struct nf_conn_tstamp *tstamp;
+
+        atomic_inc(&ct->ct_general.use);
+        ct->status |= IPS_CONFIRMED;
+
+        /* set conntrack timestamp, if enabled. */
+        tstamp = nf_conn_tstamp_find(ct);
+        if (tstamp)
+                tstamp->start = ktime_get_real_ns();
+}
+
+static int __nf_ct_resolve_clash(struct sk_buff *skb,
+                                 struct nf_conntrack_tuple_hash *h)
+{
+        /* This is the conntrack entry already in hashes that won race. */
+        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+        enum ip_conntrack_info ctinfo;
+        struct nf_conn *loser_ct;
+
+        loser_ct = nf_ct_get(skb, &ctinfo);
+
+        if (nf_ct_is_dying(ct))
+                return NF_DROP;
+
+        if (!atomic_inc_not_zero(&ct->ct_general.use))
+                return NF_DROP;
+
+        if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
+            nf_ct_match(ct, loser_ct)) {
+                struct net *net = nf_ct_net(ct);
+
+                nf_ct_acct_merge(ct, ctinfo, loser_ct);
+                nf_ct_add_to_dying_list(loser_ct);
+                nf_conntrack_put(&loser_ct->ct_general);
+                nf_ct_set(skb, ct, ctinfo);
+
+                NF_CT_STAT_INC(net, insert_failed);
+                return NF_ACCEPT;
+        }
+
+        nf_ct_put(ct);
+        return NF_DROP;
+}
+
+/**
+ * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
+ *
+ * @skb: skb that causes the collision
+ * @repl_idx: hash slot for reply direction
+ *
+ * Called when origin or reply direction had a clash.
+ * The skb can be handled without packet drop provided the reply direction
+ * is unique or the existing entry has the identical tuple in both
+ * directions.
+ *
+ * Caller must hold conntrack table locks to prevent concurrent updates.
+ *
+ * Returns NF_DROP if the clash could not be handled.
+ */
+static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
+{
+        struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
+        const struct nf_conntrack_zone *zone;
+        struct nf_conntrack_tuple_hash *h;
+        struct hlist_nulls_node *n;
+        struct net *net;
+
+        zone = nf_ct_zone(loser_ct);
+        net = nf_ct_net(loser_ct);
+
+        /* Reply direction must never result in a clash, unless both origin
+         * and reply tuples are identical.
+         */
+        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
+                if (nf_ct_key_equal(h,
+                                    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+                                    zone, net))
+                        return __nf_ct_resolve_clash(skb, h);
+        }
+
+        /* We want the clashing entry to go away real soon: 1 second timeout. */
+        loser_ct->timeout = nfct_time_stamp + HZ;
+
+        /* IPS_NAT_CLASH removes the entry automatically on the first
+         * reply. Also prevents UDP tracker from moving the entry to
+         * ASSURED state, i.e. the entry can always be evicted under
+         * pressure.
+         */
+        loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
+
+        __nf_conntrack_insert_prepare(loser_ct);
+
+        /* fake add for ORIGINAL dir: we want lookups to only find the entry
+         * already in the table. This also hides the clashing entry from
+         * ctnetlink iteration, i.e. conntrack -L won't show them.
+         */
+        hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+
+        hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
+                                 &nf_conntrack_hash[repl_idx]);
+        return NF_ACCEPT;
+}
+
+/**
+ * nf_ct_resolve_clash - attempt to handle clash without packet drop
+ *
+ * @skb: skb that causes the clash
+ * @h: tuplehash of the clashing entry already in table
+ * @reply_hash: hash slot for reply direction
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple.
+ *
+ * If there is one, @skb (and the associated, unconfirmed conntrack) has
+ * to be dropped. In case @skb is retransmitted, next conntrack lookup
+ * will find the already-existing entry.
+ *
+ * The major problem with such packet drop is the extra delay added by
+ * the packet loss -- it will take some time for a retransmit to occur
+ * (or the sender to time out when waiting for a reply).
+ *
+ * This function attempts to handle the situation without packet drop.
+ *
+ * If @skb has no NAT transformation or if the colliding entries are
+ * exactly the same, only the to-be-confirmed conntrack entry is discarded
+ * and @skb is associated with the conntrack entry already in the table.
+ *
+ * Failing that, the new, unconfirmed conntrack is still added to the table
+ * provided that the collision only occurs in the ORIGINAL direction.
+ * The new entry will be added after the existing one in the hash list,
+ * so packets in the ORIGINAL direction will continue to match the existing
+ * entry. The new entry will also have a fixed timeout so it expires --
+ * due to the collision, it will not see bidirectional traffic.
+ *
+ * Returns NF_DROP if the clash could not be resolved.
+ */
static __cold noinline int
-nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
-                    enum ip_conntrack_info ctinfo,
-                    struct nf_conntrack_tuple_hash *h)
+nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
+                    u32 reply_hash)
{
/* This is the conntrack entry already in hashes that won race. */
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
const struct nf_conntrack_l4proto *l4proto;
-        enum ip_conntrack_info oldinfo;
-        struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+        enum ip_conntrack_info ctinfo;
+        struct nf_conn *loser_ct;
+        struct net *net;
+        int ret;
+
+        loser_ct = nf_ct_get(skb, &ctinfo);
+        net = nf_ct_net(loser_ct);

l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
-        if (l4proto->allow_clash &&
-            !nf_ct_is_dying(ct) &&
-            atomic_inc_not_zero(&ct->ct_general.use)) {
-                if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
-                    nf_ct_match(ct, loser_ct)) {
-                        nf_ct_acct_merge(ct, ctinfo, loser_ct);
-                        nf_conntrack_put(&loser_ct->ct_general);
-                        nf_ct_set(skb, ct, oldinfo);
-                        return NF_ACCEPT;
-                }
-                nf_ct_put(ct);
-        }
+        if (!l4proto->allow_clash)
+                goto drop;

+        ret = __nf_ct_resolve_clash(skb, h);
+        if (ret == NF_ACCEPT)
+                return ret;
+
+        ret = nf_ct_resolve_clash_harder(skb, reply_hash);
+        if (ret == NF_ACCEPT)
+                return ret;
+
+drop:
+        nf_ct_add_to_dying_list(loser_ct);
NF_CT_STAT_INC(net, drop);
+        NF_CT_STAT_INC(net, insert_failed);
return NF_DROP;
}

@@ -932,7 +1075,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
-        struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
@@ -989,6 +1131,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)

if (unlikely(nf_ct_is_dying(ct))) {
nf_ct_add_to_dying_list(ct);
+        NF_CT_STAT_INC(net, insert_failed);
goto dying;
}

@@ -1009,13 +1152,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout += nfct_time_stamp;
-        atomic_inc(&ct->ct_general.use);
-        ct->status |= IPS_CONFIRMED;
-
-        /* set conntrack timestamp, if enabled. */
-        tstamp = nf_conn_tstamp_find(ct);
-        if (tstamp)
-                tstamp->start = ktime_get_real_ns();
+        __nf_conntrack_insert_prepare(ct);

/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
@@ -1035,11 +1173,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;

out:
-        nf_ct_add_to_dying_list(ct);
-        ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+        ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
nf_conntrack_double_unlock(hash, reply_hash);
-        NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
return ret;
}
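
Condensed, the confirm path now tries three outcomes instead of one. A toy
userspace model of the decision order -- hash table, locking, RCU and
refcounting all omitted, and every name here is a stand-in, not kernel API:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct tuple { unsigned int saddr, daddr; unsigned short sport, dport; };

struct entry {
        struct tuple orig, reply;
        bool nat_clash;       /* models IPS_NAT_CLASH */
        unsigned int timeout; /* seconds */
};

static bool tuple_eq(const struct tuple *a, const struct tuple *b)
{
        return memcmp(a, b, sizeof(*a)) == 0;
}

/* models nf_ct_resolve_clash(): "winner" already owns the hash slot */
static const char *resolve_clash(struct entry *winner, struct entry *loser,
                                 bool reply_slot_free, bool allow_clash)
{
        if (!allow_clash) /* only protocols like UDP opt in */
                return "drop";

        /* stage 1, __nf_ct_resolve_clash(): same reply tuple (same NAT
         * outcome, or no NAT at all) -- discard the loser, re-point the
         * skb at the winner
         */
        if (tuple_eq(&winner->reply, &loser->reply))
                return "merge into existing entry";

        /* stage 2, nf_ct_resolve_clash_harder(): reply tuple still unique
         * -- keep the loser reply-direction-only with a 1s fixed timeout
         */
        if (reply_slot_free) {
                loser->nat_clash = true;
                loser->timeout = 1;
                return "insert reply direction only";
        }

        return "drop";
}

int main(void)
{
        struct tuple t  = { 0x0a000001, 0xc0000235, 53000, 53 };
        struct tuple r1 = { 0xc0000235, 0xc6336453, 53, 1025 }; /* NAT'd reply A */
        struct tuple r2 = { 0xc0000235, 0xc6336453, 53, 1026 }; /* NAT'd reply B */
        struct entry winner = { .orig = t, .reply = r1 };
        struct entry same   = { .orig = t, .reply = r1 };
        struct entry other  = { .orig = t, .reply = r2 };

        printf("identical NAT: %s\n", resolve_clash(&winner, &same, true, true));
        printf("different NAT: %s\n", resolve_clash(&winner, &other, true, true));
        return 0;
}

Stage 2 is what makes the racing-DNS case lossless: the loser entry answers
only reply-direction lookups and expires within a second unless the UDP
tracker kills it first (see nf_conntrack_proto_udp.c below).
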
20 changes: 16 additions & 4 deletions net/netfilter/nf_conntrack_proto_udp.c
@@ -81,6 +81,18 @@ static bool udp_error(struct sk_buff *skb,
return false;
}

+static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct,
+                                               struct sk_buff *skb,
+                                               enum ip_conntrack_info ctinfo,
+                                               u32 extra_jiffies)
+{
+        if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY &&
+                     ct->status & IPS_NAT_CLASH))
+                nf_ct_kill(ct);
+        else
+                nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies);
+}
+
/* Returns verdict for packet, and may modify conntracktype */
int nf_conntrack_udp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@@ -116,8 +128,8 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
-                nf_ct_refresh_acct(ct, ctinfo, skb,
-                                   timeouts[UDP_CT_UNREPLIED]);
+                nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
+                                                   timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
@@ -198,8 +210,8 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct,
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
-                nf_ct_refresh_acct(ct, ctinfo, skb,
-                                   timeouts[UDP_CT_UNREPLIED]);
+                nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
+                                                   timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
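
The new helper is the other half of the NAT_CLASH contract: a clash-tagged
entry only ever sees reply-direction packets, and the first such reply kills
it rather than refreshing it. A toy restatement of the refresh policy's branch
structure -- userspace stand-ins, not the kernel helpers; the real code works
in jiffies and also manages the ASSURED bit:

#include <stdbool.h>
#include <stdio.h>

enum action { REFRESH_STREAM, REFRESH_UNREPLIED, KILL };

static enum action udp_refresh_policy(bool seen_reply_bit, bool is_reply_pkt,
                                      bool nat_clash)
{
        if (seen_reply_bit)
                return REFRESH_STREAM;   /* two-way traffic: long timeout */
        if (is_reply_pkt && nat_clash)
                return KILL;             /* first reply removes the clash entry */
        return REFRESH_UNREPLIED;        /* short timeout, never ASSURED */
}

int main(void)
{
        printf("clash entry, reply arrives: %s\n",
               udp_refresh_policy(false, true, true) == KILL ? "kill" : "keep");
        printf("normal entry, still one-way: %s\n",
               udp_refresh_policy(false, false, false) == REFRESH_UNREPLIED
                       ? "refresh unreplied" : "?");
        return 0;
}
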
6 changes: 3 additions & 3 deletions net/netfilter/nf_flow_table_offload.c
@@ -847,9 +847,6 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
{
int err;

-        if (!nf_flowtable_hw_offload(flowtable))
-                return 0;
-
if (!dev->netdev_ops->ndo_setup_tc)
return -EOPNOTSUPP;

@@ -876,6 +873,9 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
struct flow_block_offload bo;
int err;

+        if (!nf_flowtable_hw_offload(flowtable))
+                return 0;
+
err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack);
if (err < 0)
return err;
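
This fix is a guard move: nf_flow_table_offload_cmd() used to return 0 early
when hardware offload was off, so its caller continued past the error check
and ran the rest of the setup on a flow_block_offload it had never
initialized. Checking once in nf_flow_table_offload_setup() lets everything
below the guard assume offload is enabled. A generic sketch of the pattern,
with hypothetical names rather than the netfilter API:

#include <stdbool.h>
#include <stdio.h>

struct block_cfg { int num_cmds; };

static bool hw_offload_enabled; /* false: feature off */

static int build_cfg(struct block_cfg *cfg)
{
        /* may now assume the feature is on; must fully fill *cfg on success */
        cfg->num_cmds = 1;
        return 0;
}

static int setup(void)
{
        struct block_cfg cfg; /* deliberately uninitialized, as in the bug */

        if (!hw_offload_enabled)
                return 0; /* guard at the entry point: cfg is never touched */

        if (build_cfg(&cfg) < 0)
                return -1;

        printf("applying %d command(s)\n", cfg.num_cmds);
        return 0;
}

int main(void)
{
        return setup();
}
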
6 changes: 3 additions & 3 deletions net/netfilter/nft_set_pipapo.c
@@ -203,7 +203,7 @@
* ::
*
* rule indices in last field: 0 1
- *        map to elements:             0x42   0x66
+ *        map to elements:             0x66   0x42
*
*
* Matching
@@ -298,7 +298,7 @@
* ::
*
* rule indices in last field: 0 1
- *        map to elements:             0x42   0x66
+ *        map to elements:             0x66   0x42
*
* the matching element is at 0x42.
*
@@ -503,7 +503,7 @@ static int pipapo_refill(unsigned long *map, int len, int rules,
return -1;
}

-                if (unlikely(match_only)) {
+                if (match_only) {
bitmap_clear(map, i, 1);
return i;
}