netfilter: revised locking for x_tables
The x_tables are organized with a table structure and per-cpu copies
of the counters and rules. On older kernels there was a reader/writer
lock per table which was a performance bottleneck. In 2.6.30-rc, this
was converted to use RCU for the counters/rules, which solved the
performance problem for do_table but made replacing rules much slower
because of the necessary RCU grace period.

This version uses a per-cpu set of spinlocks and counters so that
table processing can proceed without the cache thrashing of a global
reader lock, and keeps the same performance for table updates.

Signed-off-by: Stephen Hemminger <[email protected]>
Acked-by: Linus Torvalds <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
Stephen Hemminger authored and davem330 committed Apr 29, 2009
1 parent bf368e4 commit 942e4a2
Showing 5 changed files with 204 additions and 296 deletions.
73 changes: 68 additions & 5 deletions include/linux/netfilter/x_tables.h
@@ -354,9 +354,6 @@ struct xt_table
/* What hooks you will enter on */
unsigned int valid_hooks;

/* Lock for the curtain */
struct mutex lock;

/* Man behind the curtain... */
struct xt_table_info *private;

@@ -434,8 +431,74 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);

extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
extern void xt_free_table_info(struct xt_table_info *info);
extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
struct xt_table_info *new);

/*
* Per-CPU spinlock associated with per-cpu table entries, and
* with a counter for the "reading" side that allows a recursive
* reader to avoid taking the lock and deadlocking.
*
* "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
* It needs to ensure that the rules are not being changed while the packet
* is being processed. In some cases, the read lock will be acquired
* twice on the same CPU; this is okay because of the count.
*
* "writing" is used when reading counters.
* During replace any readers that are using the old tables have to complete
* before freeing the old table. This is handled by the write locking
* necessary for reading the counters.
*/
struct xt_info_lock {
spinlock_t lock;
unsigned char readers;
};
DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);

/*
* Note: we need to ensure that preemption is disabled before acquiring
* the per-cpu-variable, so we do it as a two step process rather than
* using "spin_lock_bh()".
*
* We _also_ need to disable bottom half processing before updating our
* nesting count, to make sure that the only kind of re-entrancy is this
* code being called by itself: since the count+lock is not an atomic
* operation, we can allow no races.
*
* _Only_ that special combination of being per-cpu and never getting
* re-entered asynchronously means that the count is safe.
*/
static inline void xt_info_rdlock_bh(void)
{
struct xt_info_lock *lock;

local_bh_disable();
lock = &__get_cpu_var(xt_info_locks);
if (!lock->readers++)
spin_lock(&lock->lock);
}

static inline void xt_info_rdunlock_bh(void)
{
struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);

if (!--lock->readers)
spin_unlock(&lock->lock);
local_bh_enable();
}

/*
* The "writer" side needs to get exclusive access to the lock,
* regardless of readers. This must be called with bottom half
* processing (and thus also preemption) disabled.
*/
static inline void xt_info_wrlock(unsigned int cpu)
{
spin_lock(&per_cpu(xt_info_locks, cpu).lock);
}

static inline void xt_info_wrunlock(unsigned int cpu)
{
spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
}

/*
* This helper is performance critical and must be inlined
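The read-side helpers above are intended to be used like this. A minimal sketch, not part of the commit, assuming <linux/netfilter.h> and <linux/netfilter/x_tables.h> are included; example_nested_lookup() and example_do_table() are hypothetical names standing in for real callers such as arpt_do_table() in the hunk below:

	/* A target (e.g. one that builds a reply packet) may re-enter
	 * table processing on the same CPU; the readers count makes the
	 * second xt_info_rdlock_bh() safe instead of deadlocking. */
	static void example_nested_lookup(struct sk_buff *skb)
	{
		xt_info_rdlock_bh();	/* readers 1 -> 2: lock already held, only bump the count */
		/* ... nested rule walk ... */
		xt_info_rdunlock_bh();	/* readers 2 -> 1: keep holding this CPU's spinlock */
	}

	static unsigned int example_do_table(struct sk_buff *skb)
	{
		xt_info_rdlock_bh();	/* readers 0 -> 1: take this CPU's spinlock, BH off */
		/* ... walk this CPU's copy of the rules ... */
		example_nested_lookup(skb);
		xt_info_rdunlock_bh();	/* readers 1 -> 0: drop the spinlock, BH back on */

		return NF_ACCEPT;
	}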
125 changes: 36 additions & 89 deletions net/ipv4/netfilter/arp_tables.c
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;

rcu_read_lock_bh();
private = rcu_dereference(table->private);
table_base = rcu_dereference(private->entries[smp_processor_id()]);
xt_info_rdlock_bh();
private = table->private;
table_base = private->entries[smp_processor_id()];

e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,

hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
(2 * skb->dev->addr_len);

ADD_COUNTER(e->counters, hdr_len, 1);

t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
e = (void *)e + e->next_offset;
}
} while (!hotdrop);

rcu_read_unlock_bh();
xt_info_rdunlock_bh();

if (hotdrop)
return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
* with data used by 'current' CPU
* We dont care about preemption here.
*
* Bottom half has to be disabled to prevent deadlock
* if new softirq were to run and call ipt_do_table
*/
curcpu = raw_smp_processor_id();
local_bh_disable();
curcpu = smp_processor_id();

i = 0;
ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ if (cpu == curcpu)
if (cpu == curcpu)
continue;
i = 0;
xt_info_wrlock(cpu);
ARPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
&i);
xt_info_wrunlock(cpu);
}
}


/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK. */
static int
add_counter_to_entry(struct arpt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
{
ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

(*i)++;
return 0;
}

/* Take values from counters and add them back onto the current cpu */
static void put_counters(struct xt_table_info *t,
const struct xt_counters counters[])
{
unsigned int i, cpu;

local_bh_disable();
cpu = smp_processor_id();
i = 0;
ARPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_counter_to_entry,
counters,
&i);
local_bh_enable();
}

static inline int
zero_entry_counter(struct arpt_entry *e, void *arg)
{
e->counters.bcnt = 0;
e->counters.pcnt = 0;
return 0;
}

static void
clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
{
unsigned int cpu;
const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];

memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
for_each_possible_cpu(cpu) {
memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
zero_entry_counter, NULL);
}
}

static struct xt_counters *alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
struct xt_table_info *private = table->private;
struct xt_table_info *info;

/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
counters = vmalloc_node(countersize, numa_node_id());

if (counters == NULL)
goto nomem;

info = xt_alloc_table_info(private->size);
if (!info)
goto free_counters;

clone_counters(info, private);

mutex_lock(&table->lock);
xt_table_entry_swap_rcu(private, info);
synchronize_net(); /* Wait until smoke has cleared */
return ERR_PTR(-ENOMEM);

get_counters(info, counters);
put_counters(private, counters);
mutex_unlock(&table->lock);

xt_free_table_info(info);
get_counters(private, counters);

return counters;

free_counters:
vfree(counters);
nomem:
return ERR_PTR(-ENOMEM);
}

static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);

/* Get the old counters. */
/* Get the old counters, and synchronize with replace */
get_counters(oldinfo, counters);

/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}

/* We're lazy, and add to the first CPU; overflow works its fey magic
* and everything is OK. */
static int
add_counter_to_entry(struct arpt_entry *e,
const struct xt_counters addme[],
unsigned int *i)
{
ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);

(*i)++;
return 0;
}

static int do_add_counters(struct net *net, void __user *user, unsigned int len,
int compat)
{
unsigned int i;
unsigned int i, curcpu;
struct xt_counters_info tmp;
struct xt_counters *paddc;
unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
goto free;
}

mutex_lock(&t->lock);
local_bh_disable();
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}

preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[smp_processor_id()];
curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu);
ARPT_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
paddc,
&i);
preempt_enable();
xt_info_wrunlock(curcpu);
unlock_up_free:
mutex_unlock(&t->lock);

local_bh_enable();
xt_table_unlock(t);
module_put(t->me);
free:
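Taken together, the counter paths in this file now follow one pattern: disable bottom halves, handle the local CPU's entries directly, and take each other CPU's xt_info lock while walking that CPU's private copy. A condensed sketch of the pattern used by get_counters() above (the ARPT_ENTRY_ITERATE callbacks and counter seeding are elided; example_sum_counters() is a hypothetical name, not a function added by this commit):

	static void example_sum_counters(const struct xt_table_info *t,
					 struct xt_counters *counters)
	{
		unsigned int cpu, curcpu;

		local_bh_disable();		/* keep a softirq from re-entering table processing */
		curcpu = smp_processor_id();
		/* ... set counters[] from t->entries[curcpu]; BH-off protects the local copy ... */

		for_each_possible_cpu(cpu) {
			if (cpu == curcpu)
				continue;
			xt_info_wrlock(cpu);	/* exclude readers running on that CPU */
			/* ... add t->entries[cpu]'s counters into counters[] ... */
			xt_info_wrunlock(cpu);
		}
		local_bh_enable();
	}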