Skip to content

Commit

Permalink
neigh: new unresolved queue limits
Browse files Browse the repository at this point in the history
Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <[email protected]>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <[email protected]>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> >  ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit.  The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.

Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.

[PATCH V5 net-next] neigh: new unresolved queue limits

unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.

$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms

Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
Eric Dumazet authored and davem330 committed Nov 14, 2011
1 parent 292d139 commit 8b5c171
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 56 deletions.
10 changes: 10 additions & 0 deletions Documentation/networking/ip-sysctl.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ neigh/default/gc_thresh3 - INTEGER
when using large numbers of interfaces and when communicating
with large numbers of directly-connected peers.

neigh/default/unres_qlen_bytes - INTEGER
The maximum number of bytes which may be used by packets
queued for each unresolved address by other network layers.
(added in linux 3.3)

neigh/default/unres_qlen - INTEGER
The maximum number of packets which may be queued for each
unresolved address by other network layers.
(deprecated in linux 3.3) : use unres_qlen_bytes instead.

mtu_expires - INTEGER
Time, in seconds, that cached PMTU information is kept.

Expand Down
1 change: 1 addition & 0 deletions include/linux/neighbour.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ enum {
NDTPA_PROXY_DELAY, /* u64, msecs */
NDTPA_PROXY_QLEN, /* u32 */
NDTPA_LOCKTIME, /* u64, msecs */
NDTPA_QUEUE_LENBYTES, /* u32 */
__NDTPA_MAX
};
#define NDTPA_MAX (__NDTPA_MAX - 1)
Expand Down
3 changes: 2 additions & 1 deletion include/net/neighbour.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ struct neigh_parms {
int reachable_time;
int delay_probe_time;

int queue_len;
int queue_len_bytes;
int ucast_probes;
int app_probes;
int mcast_probes;
Expand Down Expand Up @@ -99,6 +99,7 @@ struct neighbour {
rwlock_t lock;
atomic_t refcnt;
struct sk_buff_head arp_queue;
unsigned int arp_queue_len_bytes;
struct timer_list timer;
unsigned long used;
atomic_t probes;
Expand Down
2 changes: 1 addition & 1 deletion net/atm/clip.c
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ static struct neigh_table clip_tbl = {
.gc_staletime = 60 * HZ,
.reachable_time = 30 * HZ,
.delay_probe_time = 5 * HZ,
.queue_len = 3,
.queue_len_bytes = 64 * 1024,
.ucast_probes = 3,
.mcast_probes = 3,
.anycast_delay = 1 * HZ,
Expand Down
162 changes: 111 additions & 51 deletions net/core/neighbour.c
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
it to safe state.
*/
skb_queue_purge(&n->arp_queue);
n->arp_queue_len_bytes = 0;
n->output = neigh_blackhole;
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
Expand Down Expand Up @@ -702,6 +703,7 @@ void neigh_destroy(struct neighbour *neigh)
printk(KERN_WARNING "Impossible event.\n");

skb_queue_purge(&neigh->arp_queue);
neigh->arp_queue_len_bytes = 0;

dev_put(neigh->dev);
neigh_parms_put(neigh->parms);
Expand Down Expand Up @@ -842,6 +844,7 @@ static void neigh_invalidate(struct neighbour *neigh)
write_lock(&neigh->lock);
}
skb_queue_purge(&neigh->arp_queue);
neigh->arp_queue_len_bytes = 0;
}

static void neigh_probe(struct neighbour *neigh)
Expand Down Expand Up @@ -980,15 +983,20 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)

if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
if (skb_queue_len(&neigh->arp_queue) >=
neigh->parms->queue_len) {
while (neigh->arp_queue_len_bytes + skb->truesize >
neigh->parms->queue_len_bytes) {
struct sk_buff *buff;

buff = __skb_dequeue(&neigh->arp_queue);
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
Expand Down Expand Up @@ -1175,6 +1183,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
write_lock_bh(&neigh->lock);
}
skb_queue_purge(&neigh->arp_queue);
neigh->arp_queue_len_bytes = 0;
}
out:
if (update_isrouter) {
Expand Down Expand Up @@ -1747,7 +1756,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);

NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes);
/* approximative value for deprecated QUEUE_LEN (in packets) */
NLA_PUT_U32(skb, NDTPA_QUEUE_LEN,
DIV_ROUND_UP(parms->queue_len_bytes,
SKB_TRUESIZE(ETH_FRAME_LEN)));
NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
Expand Down Expand Up @@ -1974,7 +1987,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)

switch (i) {
case NDTPA_QUEUE_LEN:
p->queue_len = nla_get_u32(tbp[i]);
p->queue_len_bytes = nla_get_u32(tbp[i]) *
SKB_TRUESIZE(ETH_FRAME_LEN);
break;
case NDTPA_QUEUE_LENBYTES:
p->queue_len_bytes = nla_get_u32(tbp[i]);
break;
case NDTPA_PROXY_QLEN:
p->proxy_qlen = nla_get_u32(tbp[i]);
Expand Down Expand Up @@ -2635,117 +2652,158 @@ EXPORT_SYMBOL(neigh_app_ns);

#ifdef CONFIG_SYSCTL

#define NEIGH_VARS_MAX 19
static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int size, ret;
ctl_table tmp = *ctl;

tmp.data = &size;
size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN));
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (write && !ret)
*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
return ret;
}

enum {
NEIGH_VAR_MCAST_PROBE,
NEIGH_VAR_UCAST_PROBE,
NEIGH_VAR_APP_PROBE,
NEIGH_VAR_RETRANS_TIME,
NEIGH_VAR_BASE_REACHABLE_TIME,
NEIGH_VAR_DELAY_PROBE_TIME,
NEIGH_VAR_GC_STALETIME,
NEIGH_VAR_QUEUE_LEN,
NEIGH_VAR_QUEUE_LEN_BYTES,
NEIGH_VAR_PROXY_QLEN,
NEIGH_VAR_ANYCAST_DELAY,
NEIGH_VAR_PROXY_DELAY,
NEIGH_VAR_LOCKTIME,
NEIGH_VAR_RETRANS_TIME_MS,
NEIGH_VAR_BASE_REACHABLE_TIME_MS,
NEIGH_VAR_GC_INTERVAL,
NEIGH_VAR_GC_THRESH1,
NEIGH_VAR_GC_THRESH2,
NEIGH_VAR_GC_THRESH3,
NEIGH_VAR_MAX
};

static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
struct ctl_table neigh_vars[NEIGH_VARS_MAX];
struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
char *dev_name;
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
{
[NEIGH_VAR_MCAST_PROBE] = {
.procname = "mcast_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
[NEIGH_VAR_UCAST_PROBE] = {
.procname = "ucast_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
[NEIGH_VAR_APP_PROBE] = {
.procname = "app_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
[NEIGH_VAR_RETRANS_TIME] = {
.procname = "retrans_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
{
[NEIGH_VAR_BASE_REACHABLE_TIME] = {
.procname = "base_reachable_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
[NEIGH_VAR_DELAY_PROBE_TIME] = {
.procname = "delay_first_probe_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
[NEIGH_VAR_GC_STALETIME] = {
.procname = "gc_stale_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
[NEIGH_VAR_QUEUE_LEN] = {
.procname = "unres_qlen",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_unres_qlen,
},
[NEIGH_VAR_QUEUE_LEN_BYTES] = {
.procname = "unres_qlen_bytes",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
[NEIGH_VAR_PROXY_QLEN] = {
.procname = "proxy_qlen",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
[NEIGH_VAR_ANYCAST_DELAY] = {
.procname = "anycast_delay",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
{
[NEIGH_VAR_PROXY_DELAY] = {
.procname = "proxy_delay",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
{
[NEIGH_VAR_LOCKTIME] = {
.procname = "locktime",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
{
[NEIGH_VAR_RETRANS_TIME_MS] = {
.procname = "retrans_time_ms",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
{
[NEIGH_VAR_BASE_REACHABLE_TIME_MS] = {
.procname = "base_reachable_time_ms",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
{
[NEIGH_VAR_GC_INTERVAL] = {
.procname = "gc_interval",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
[NEIGH_VAR_GC_THRESH1] = {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
[NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
[NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
Expand Down Expand Up @@ -2778,47 +2836,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
if (!t)
goto err;

t->neigh_vars[0].data = &p->mcast_probes;
t->neigh_vars[1].data = &p->ucast_probes;
t->neigh_vars[2].data = &p->app_probes;
t->neigh_vars[3].data = &p->retrans_time;
t->neigh_vars[4].data = &p->base_reachable_time;
t->neigh_vars[5].data = &p->delay_probe_time;
t->neigh_vars[6].data = &p->gc_staletime;
t->neigh_vars[7].data = &p->queue_len;
t->neigh_vars[8].data = &p->proxy_qlen;
t->neigh_vars[9].data = &p->anycast_delay;
t->neigh_vars[10].data = &p->proxy_delay;
t->neigh_vars[11].data = &p->locktime;
t->neigh_vars[12].data = &p->retrans_time;
t->neigh_vars[13].data = &p->base_reachable_time;
t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes;
t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes;
t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time;
t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time;
t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime;
t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes;
t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes;
t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen;
t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay;
t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay;
t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time;

if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));
memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;
t->neigh_vars[14].data = (int *)(p + 1);
t->neigh_vars[15].data = (int *)(p + 1) + 1;
t->neigh_vars[16].data = (int *)(p + 1) + 2;
t->neigh_vars[17].data = (int *)(p + 1) + 3;
t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
}


if (handler) {
/* RetransTime */
t->neigh_vars[3].proc_handler = handler;
t->neigh_vars[3].extra1 = dev;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;
/* ReachableTime */
t->neigh_vars[4].proc_handler = handler;
t->neigh_vars[4].extra1 = dev;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;
/* RetransTime (in milliseconds)*/
t->neigh_vars[12].proc_handler = handler;
t->neigh_vars[12].extra1 = dev;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;
/* ReachableTime (in milliseconds) */
t->neigh_vars[13].proc_handler = handler;
t->neigh_vars[13].extra1 = dev;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;
}

t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
Expand Down
2 changes: 1 addition & 1 deletion net/decnet/dn_neigh.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ struct neigh_table dn_neigh_table = {
.gc_staletime = 60 * HZ,
.reachable_time = 30 * HZ,
.delay_probe_time = 5 * HZ,
.queue_len = 3,
.queue_len_bytes = 64*1024,
.ucast_probes = 0,
.app_probes = 0,
.mcast_probes = 0,
Expand Down
Loading

0 comments on commit 8b5c171

Please sign in to comment.