Skip to content

Commit

Permalink
net: sched: do not acquire qdisc spinlock in qdisc/class stats dump
Browse files Browse the repository at this point in the history
Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by Google BwE host
agent [1] are problematic at scale:

For each qdisc/class found in the dump, we currently lock the root qdisc
spinlock in order to get stats. Sampling stats every 5 seconds from
thousands of HTB classes is a challenge when the root qdisc spinlock is
under high pressure. Not only do the dumps take time, they also slow
down the fast path (queue/dequeue packets) by 10% to 20% in some cases.

An audit of existing qdiscs showed that sch_fq_codel is the only qdisc
that might need the qdisc lock in fq_codel_dump_stats() and
fq_codel_dump_class_stats().

In v2 of this patch, I now use the Qdisc running seqcount to provide
consistent reads of packets/bytes counters, regardless of 32/64 bit arches.

I also changed the rate estimators to use the same infrastructure
so that they no longer need to take the root qdisc lock.

[1]
http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf

Signed-off-by: Eric Dumazet <[email protected]>
Cc: Cong Wang <[email protected]>
Cc: Jamal Hadi Salim <[email protected]>
Cc: John Fastabend <[email protected]>
Cc: Kevin Athey <[email protected]>
Cc: Xiaotian Pei <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
Eric Dumazet authored and davem330 committed Jun 7, 2016
1 parent f9eb8ae commit edb09eb
Show file tree
Hide file tree
Showing 20 changed files with 126 additions and 69 deletions.
2 changes: 1 addition & 1 deletion Documentation/networking/gen_stats.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct mystruct {
...
};

Update statistics:
Update statistics, in dequeue() methods only, (while owning qdisc->running)
mystruct->tstats.packet++;
mystruct->qstats.backlog += skb->pkt_len;

Expand Down
12 changes: 8 additions & 4 deletions include/net/gen_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
spinlock_t *lock, struct gnet_dump *d,
int padattr);

int gnet_stats_copy_basic(struct gnet_dump *d,
int gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
void __gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
int gnet_stats_copy_rate_est(struct gnet_dump *d,
Expand All @@ -52,13 +54,15 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock, struct nlattr *opt);
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_rate_est64 *rate_est);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock, struct nlattr *opt);
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
const struct gnet_stats_rate_est64 *rate_est);
#endif
8 changes: 8 additions & 0 deletions include/net/sch_generic.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,14 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
return qdisc_lock(root);
}

/*
 * Return a pointer to the 'running' seqcount of the qdisc that
 * qdisc_root_sleeping() yields for @qdisc.
 *
 * Parallels qdisc_root_sleeping_lock(), but hands back the seqcount
 * instead of the spinlock.
 * NOTE(review): ASSERT_RTNL() presumably checks that the caller holds
 * RTNL -- confirm against its definition.
 */
static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
{
struct Qdisc *root = qdisc_root_sleeping(qdisc);

ASSERT_RTNL();
return &root->running;
}

static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
{
return qdisc->dev_queue->dev;
Expand Down
24 changes: 16 additions & 8 deletions net/core/gen_estimator.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ struct gen_estimator
struct gnet_stats_basic_packed *bstats;
struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
seqcount_t *running;
int ewma_log;
u32 last_packets;
unsigned long avpps;
Expand Down Expand Up @@ -121,26 +122,28 @@ static void est_timer(unsigned long arg)
unsigned long rate;
u64 brate;

spin_lock(e->stats_lock);
if (e->stats_lock)
spin_lock(e->stats_lock);
read_lock(&est_lock);
if (e->bstats == NULL)
goto skip;

__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
__gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);

brate = (b.bytes - e->last_bytes)<<(7 - idx);
e->last_bytes = b.bytes;
e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
e->rate_est->bps = (e->avbps+0xF)>>5;
WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);

rate = b.packets - e->last_packets;
rate <<= (7 - idx);
e->last_packets = b.packets;
e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
e->rate_est->pps = (e->avpps + 0xF) >> 5;
WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
skip:
read_unlock(&est_lock);
spin_unlock(e->stats_lock);
if (e->stats_lock)
spin_unlock(e->stats_lock);
}

if (!list_empty(&elist[idx].list))
Expand Down Expand Up @@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @running: qdisc running seqcount
* @opt: rate estimator configuration TLV
*
* Creates a new rate estimator with &bstats as source and &rate_est
Expand All @@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
struct gen_estimator *est;
Expand All @@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
if (est == NULL)
return -ENOBUFS;

__gnet_stats_copy_basic(&b, cpu_bstats, bstats);
__gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);

idx = parm->interval + 2;
est->bstats = bstats;
est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->running = running;
est->ewma_log = parm->ewma_log;
est->last_bytes = b.bytes;
est->avbps = rate_est->bps<<5;
Expand Down Expand Up @@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @running: qdisc running seqcount (might be NULL)
* @opt: rate estimator configuration TLV
*
* Replaces the configuration of a rate estimator by calling
Expand All @@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock, struct nlattr *opt)
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
gen_kill_estimator(bstats, rate_est);
return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);

Expand Down
34 changes: 23 additions & 11 deletions net/core/gen_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
return 0;

nla_put_failure:
if (d->lock)
spin_unlock_bh(d->lock);
kfree(d->xstats);
d->xstats = NULL;
d->xstats_len = 0;
spin_unlock_bh(d->lock);
return -1;
}

Expand Down Expand Up @@ -65,15 +66,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
{
memset(d, 0, sizeof(*d));

spin_lock_bh(lock);
d->lock = lock;
if (type)
d->tail = (struct nlattr *)skb_tail_pointer(skb);
d->skb = skb;
d->compat_tc_stats = tc_stats_type;
d->compat_xstats = xstats_type;
d->padattr = padattr;

if (lock) {
d->lock = lock;
spin_lock_bh(lock);
}
if (d->tail)
return gnet_stats_copy(d, type, NULL, 0, padattr);

Expand Down Expand Up @@ -126,16 +128,23 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
}

void
__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
__gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
unsigned int seq;

if (cpu) {
__gnet_stats_copy_basic_cpu(bstats, cpu);
} else {
return;
}
do {
if (running)
seq = read_seqcount_begin(running);
bstats->bytes = b->bytes;
bstats->packets = b->packets;
}
} while (running && read_seqcount_retry(running, seq));
}
EXPORT_SYMBOL(__gnet_stats_copy_basic);

Expand All @@ -152,13 +161,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_basic(struct gnet_dump *d,
gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
struct gnet_stats_basic_packed bstats = {0};

__gnet_stats_copy_basic(&bstats, cpu, b);
__gnet_stats_copy_basic(running, &bstats, cpu, b);

if (d->compat_tc_stats) {
d->tc_stats.bytes = bstats.bytes;
Expand Down Expand Up @@ -328,8 +338,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
return 0;

err_out:
if (d->lock)
spin_unlock_bh(d->lock);
d->xstats_len = 0;
spin_unlock_bh(d->lock);
return -1;
}
EXPORT_SYMBOL(gnet_stats_copy_app);
Expand Down Expand Up @@ -363,10 +374,11 @@ gnet_stats_finish_copy(struct gnet_dump *d)
return -1;
}

if (d->lock)
spin_unlock_bh(d->lock);
kfree(d->xstats);
d->xstats = NULL;
d->xstats_len = 0;
spin_unlock_bh(d->lock);
return 0;
}
EXPORT_SYMBOL(gnet_stats_finish_copy);
2 changes: 1 addition & 1 deletion net/netfilter/xt_RATEEST.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.ewma_log = info->ewma_log;

ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
&est->lock, &cfg.opt);
&est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;

Expand Down
4 changes: 2 additions & 2 deletions net/sched/act_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
if (est) {
err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats,
&p->tcfc_rate_est,
&p->tcfc_lock, est);
&p->tcfc_lock, NULL, est);
if (err) {
free_percpu(p->cpu_qstats);
goto err2;
Expand Down Expand Up @@ -671,7 +671,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
if (err < 0)
goto errout;

if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfc_bstats) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfc_bstats,
&p->tcfc_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
Expand Down
3 changes: 2 additions & 1 deletion net/sched/act_police.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,8 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
if (est) {
err = gen_replace_estimator(&police->tcf_bstats, NULL,
&police->tcf_rate_est,
&police->tcf_lock, est);
&police->tcf_lock,
NULL, est);
if (err)
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
Expand Down
21 changes: 11 additions & 10 deletions net/sched/sch_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -982,7 +982,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
rcu_assign_pointer(sch->stab, stab);
}
if (tca[TCA_RATE]) {
spinlock_t *root_lock;
seqcount_t *running;

err = -EOPNOTSUPP;
if (sch->flags & TCQ_F_MQROOT)
Expand All @@ -991,14 +991,15 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
if ((sch->parent != TC_H_ROOT) &&
!(sch->flags & TCQ_F_INGRESS) &&
(!p || !(p->flags & TCQ_F_MQROOT)))
root_lock = qdisc_root_sleeping_lock(sch);
running = qdisc_root_sleeping_running(sch);
else
root_lock = qdisc_lock(sch);
running = &sch->running;

err = gen_new_estimator(&sch->bstats,
sch->cpu_bstats,
&sch->rate_est,
root_lock,
NULL,
running,
tca[TCA_RATE]);
if (err)
goto err_out4;
Expand Down Expand Up @@ -1061,7 +1062,8 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
gen_replace_estimator(&sch->bstats,
sch->cpu_bstats,
&sch->rate_est,
qdisc_root_sleeping_lock(sch),
NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
}
out:
Expand Down Expand Up @@ -1369,8 +1371,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
goto nla_put_failure;

if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
qdisc_root_sleeping_lock(q), &d,
TCA_PAD) < 0)
NULL, &d, TCA_PAD) < 0)
goto nla_put_failure;

if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
Expand All @@ -1381,7 +1382,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
cpu_qstats = q->cpu_qstats;
}

if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
&d, cpu_bstats, &q->bstats) < 0 ||
gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
Expand Down Expand Up @@ -1684,8 +1686,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
goto nla_put_failure;

if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
qdisc_root_sleeping_lock(q), &d,
TCA_PAD) < 0)
NULL, &d, TCA_PAD) < 0)
goto nla_put_failure;

if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
Expand Down
3 changes: 2 additions & 1 deletion net/sched/sch_atm.c
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,8 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
{
struct atm_flow_data *flow = (struct atm_flow_data *)arg;

if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &flow->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
return -1;

Expand Down
9 changes: 6 additions & 3 deletions net/sched/sch_cbq.c
Original file line number Diff line number Diff line change
Expand Up @@ -1600,7 +1600,8 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (cl->undertime != PSCHED_PASTPERFECT)
cl->xstats.undertime = cl->undertime - q->now;

if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
return -1;
Expand Down Expand Up @@ -1755,7 +1756,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
qdisc_root_sleeping_lock(sch),
NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
qdisc_put_rtab(rtab);
Expand Down Expand Up @@ -1848,7 +1850,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t

if (tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
qdisc_root_sleeping_lock(sch),
NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
kfree(cl);
Expand Down
Loading

0 comments on commit edb09eb

Please sign in to comment.