From a294865978b701e4d0d90135672749531b9a900d Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Fri, 6 May 2011 03:27:18 +0000 Subject: [PATCH 01/24] dccp: handle invalid feature options length A length of zero (after subtracting two for the type and len fields) for the DCCPO_{CHANGE,CONFIRM}_{L,R} options will cause an underflow due to the subtraction. The subsequent code may read past the end of the options value buffer when parsing. I'm unsure of what the consequences of this might be, but it's probably not good. Signed-off-by: Dan Rosenberg Cc: stable@kernel.org Acked-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/options.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/dccp/options.c b/net/dccp/options.c index f06ffcfc8d7124..4b2ab657ac8e61 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -123,6 +123,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ break; + if (len == 0) + goto out_invalid_option; rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, *value, value + 1, len - 1); if (rc) From e328d410826d52e9ee348aff9064c4a207f2adb1 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 6 May 2011 08:32:53 +0000 Subject: [PATCH 02/24] vmxnet3: Consistently disable irqs when taking adapter->cmd_lock Using the vmxnet3 driver produces a lockdep warning because vmxnet3_set_mc(), which is called with mc->mca_lock held, takes adapter->cmd_lock. However, there are a couple of places where adapter->cmd_lock is taken with softirqs enabled, lockdep warns that a softirq that tries to take mc->mca_lock could happen while adapter->cmd_lock is held, leading to an AB-BA deadlock. I'm not sure if this is a real potential deadlock or not, but the simplest and best fix seems to be simply to make sure we take cmd_lock with spin_lock_irqsave() everywhere -- the places with plain spin_lock just look like oversights. The full enormous lockdep warning is: ========================================================= [ INFO: possible irq lock inversion dependency detected ] 2.6.39-rc6+ #1 --------------------------------------------------------- ifconfig/567 just changed the state of lock: (&(&mc->mca_lock)->rlock){+.-...}, at: [] mld_ifc_timer_expire+0xff/0x280 but this lock took another, SOFTIRQ-unsafe lock in the past: (&(&adapter->cmd_lock)->rlock){+.+...} and interrupts could create inverse lock ordering between them. other info that might help us debug this: 4 locks held by ifconfig/567: #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 #1: ((inetaddr_chain).rwsem){.+.+.+}, at: [] __blocking_notifier_call_chain+0x5f/0xb0 #2: (&idev->mc_ifc_timer){+.-...}, at: [] run_timer_softirq+0xeb/0x3f0 #3: (&ndev->lock){++.-..}, at: [] mld_ifc_timer_expire+0x32/0x280 the shortest dependencies between 2nd lock and 1st lock: -> (&(&adapter->cmd_lock)->rlock){+.+...} ops: 11 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b SOFTIRQ-ON-W at: [] __lock_acquire+0x827/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b } ... key at: [] __key.42516+0x0/0xffffffffffffda70 [vmxnet3] ... acquired at: [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_irqsave+0x55/0xa0 [] vmxnet3_set_mc+0x97/0x1a0 [vmxnet3] [] __dev_set_rx_mode+0x40/0xb0 [] dev_set_rx_mode+0x30/0x50 [] __dev_open+0xc7/0x100 [] __dev_change_flags+0xa1/0x180 [] dev_change_flags+0x28/0x70 [] devinet_ioctl+0x730/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b -> (_xmit_ETHER){+.....} ops: 6 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b } ... key at: [] netdev_addr_lock_key+0x8/0x1e0 ... acquired at: [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b -> (&(&mc->mca_lock)->rlock){+.-...} ops: 6 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] igmp6_group_added+0x45/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_init+0x4e/0x183 [] inet6_init+0x191/0x2a6 [] do_one_initcall+0x45/0x190 [] kernel_init+0xe3/0x168 [] kernel_thread_helper+0x4/0x10 IN-SOFTIRQ-W at: [] __lock_acquire+0x7ce/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] __do_softirq+0xc0/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] igmp6_group_added+0x45/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_init+0x4e/0x183 [] inet6_init+0x191/0x2a6 [] do_one_initcall+0x45/0x190 [] kernel_init+0xe3/0x168 [] kernel_thread_helper+0x4/0x10 } ... key at: [] __key.40877+0x0/0x8 ... acquired at: [] check_usage_forwards+0x9c/0x110 [] mark_lock+0x19c/0x400 [] __lock_acquire+0x7ce/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] __do_softirq+0xc0/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b stack backtrace: Pid: 567, comm: ifconfig Not tainted 2.6.39-rc6+ #1 Call Trace: [] print_irq_inversion_bug+0x146/0x170 [] ? print_irq_inversion_bug+0x170/0x170 [] check_usage_forwards+0x9c/0x110 [] mark_lock+0x19c/0x400 [] __lock_acquire+0x7ce/0x1e10 [] ? mark_lock+0x1f3/0x400 [] ? __lock_acquire+0xf07/0x1e10 [] ? native_sched_clock+0x15/0x70 [] lock_acquire+0x9d/0x130 [] ? mld_ifc_timer_expire+0xff/0x280 [] ? lock_release_holdtime+0x3d/0x1a0 [] _raw_spin_lock_bh+0x3b/0x70 [] ? mld_ifc_timer_expire+0xff/0x280 [] ? _raw_spin_unlock+0x2b/0x40 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] ? run_timer_softirq+0xeb/0x3f0 [] ? sched_clock+0x9/0x10 [] ? mld_gq_timer_expire+0x30/0x30 [] __do_softirq+0xc0/0x210 [] ? tick_program_event+0x1f/0x30 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] ? retint_restore_args+0x13/0x13 [] ? lock_is_held+0x17/0xd0 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] ? local_clock+0x6f/0x80 [] ? do_page_fault+0x268/0x560 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] ? __call_rcu+0xa7/0x190 [] do_vfs_ioctl+0x98/0x570 [] ? fget_light+0x33e/0x430 [] ? retint_swapgs+0x13/0x1b [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Roland Dreier Signed-off-by: Shreyas N Bhatewara Signed-off-by: Scott J. Goldman Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_drv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 0d47c3a0530788..c16ed961153a77 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -178,6 +178,7 @@ static void vmxnet3_process_events(struct vmxnet3_adapter *adapter) { int i; + unsigned long flags; u32 events = le32_to_cpu(adapter->shared->ecr); if (!events) return; @@ -190,10 +191,10 @@ vmxnet3_process_events(struct vmxnet3_adapter *adapter) /* Check if there is an error on xmit/recv queues */ if (events & (VMXNET3_ECR_TQERR | VMXNET3_ECR_RQERR)) { - spin_lock(&adapter->cmd_lock); + spin_lock_irqsave(&adapter->cmd_lock, flags); VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_QUEUE_STATUS); - spin_unlock(&adapter->cmd_lock); + spin_unlock_irqrestore(&adapter->cmd_lock, flags); for (i = 0; i < adapter->num_tx_queues; i++) if (adapter->tqd_start[i].status.stopped) @@ -2733,13 +2734,14 @@ static void vmxnet3_alloc_intr_resources(struct vmxnet3_adapter *adapter) { u32 cfg; + unsigned long flags; /* intr settings */ - spin_lock(&adapter->cmd_lock); + spin_lock_irqsave(&adapter->cmd_lock, flags); VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_CONF_INTR); cfg = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD); - spin_unlock(&adapter->cmd_lock); + spin_unlock_irqrestore(&adapter->cmd_lock, flags); adapter->intr.type = cfg & 0x3; adapter->intr.mask_mode = (cfg >> 2) & 0x3; From 9c412942a0bb19ba18f7bd939d42eff1e132a901 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 3 May 2011 07:49:25 +0000 Subject: [PATCH 03/24] ipheth: Properly distinguish length and alignment in URBs and skbs The USB protocol this driver implements appears to require 2 bytes of padding in front of each received packet. This used to be equal to the value of NET_IP_ALIGN on x86, so the driver abused that constant and mostly worked, but this is no longer the case. The driver also mixed up the URB and packet lengths, resulting in 2 bytes of junk at the end of the skb. Introduce a private constant for the 2 bytes of padding; fix this confusion and check for the under-length case. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 7d42f9a2c06868..81126ff85e0576 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -65,6 +65,7 @@ #define IPHETH_USBINTF_PROTO 1 #define IPHETH_BUF_SIZE 1516 +#define IPHETH_IP_ALIGN 2 /* padding at front of URB */ #define IPHETH_TX_TIMEOUT (5 * HZ) #define IPHETH_INTFNUM 2 @@ -202,18 +203,21 @@ static void ipheth_rcvbulk_callback(struct urb *urb) return; } - len = urb->actual_length; - buf = urb->transfer_buffer; + if (urb->actual_length <= IPHETH_IP_ALIGN) { + dev->net->stats.rx_length_errors++; + return; + } + len = urb->actual_length - IPHETH_IP_ALIGN; + buf = urb->transfer_buffer + IPHETH_IP_ALIGN; - skb = dev_alloc_skb(NET_IP_ALIGN + len); + skb = dev_alloc_skb(len); if (!skb) { err("%s: dev_alloc_skb: -ENOMEM", __func__); dev->net->stats.rx_dropped++; return; } - skb_reserve(skb, NET_IP_ALIGN); - memcpy(skb_put(skb, len), buf + NET_IP_ALIGN, len - NET_IP_ALIGN); + memcpy(skb_put(skb, len), buf, len); skb->dev = dev->net; skb->protocol = eth_type_trans(skb, dev->net); From b9f47a3aaeabdce3b42829bbb27765fa340f76ba Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 4 May 2011 10:04:56 +0000 Subject: [PATCH 04/24] tcp_cubic: limit delayed_ack ratio to prevent divide error TCP Cubic keeps a metric that estimates the amount of delayed acknowledgements to use in adjusting the window. If an abnormally large number of packets are acknowledged at once, then the update could wrap and reach zero. This kind of ACK could only happen when there was a large window and huge number of ACK's were lost. This patch limits the value of delayed ack ratio. The choice of 32 is just a conservative value since normally it should be range of 1 to 4 packets. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 34340c9c95fab4..f376b05cca818f 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -93,6 +93,7 @@ struct bictcp { u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ #define ACK_RATIO_SHIFT 4 +#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ @@ -398,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) u32 delay; if (icsk->icsk_ca_state == TCP_CA_Open) { - cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ca->delayed_ack += cnt; + u32 ratio = ca->delayed_ack; + + ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ratio += cnt; + + ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ From dcbe14b91a920657ff3a9ba0efb7c5b5562f956a Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Wed, 4 May 2011 13:05:11 +0000 Subject: [PATCH 05/24] ehea: fix wrongly reported speed and port Currently EHEA reports to ethtool as supporting 10M, 100M, 1G and 10G and connected to FIBRE independent of the hardware configuration. However, when connected to FIBRE the only supported speed is 10G full-duplex, and the other speeds and modes are only supported when connected to twisted pair. Signed-off-by: Kleber Sacilotto de Souza Acked-by: Breno Leitao Signed-off-by: David S. Miller --- drivers/net/ehea/ehea_ethtool.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/ehea/ehea_ethtool.c b/drivers/net/ehea/ehea_ethtool.c index 3e2e734fecb73b..f3bbdcef338cfb 100644 --- a/drivers/net/ehea/ehea_ethtool.c +++ b/drivers/net/ehea/ehea_ethtool.c @@ -55,15 +55,20 @@ static int ehea_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) cmd->duplex = -1; } - cmd->supported = (SUPPORTED_10000baseT_Full | SUPPORTED_1000baseT_Full - | SUPPORTED_100baseT_Full | SUPPORTED_100baseT_Half - | SUPPORTED_10baseT_Full | SUPPORTED_10baseT_Half - | SUPPORTED_Autoneg | SUPPORTED_FIBRE); - - cmd->advertising = (ADVERTISED_10000baseT_Full | ADVERTISED_Autoneg - | ADVERTISED_FIBRE); + if (cmd->speed == SPEED_10000) { + cmd->supported = (SUPPORTED_10000baseT_Full | SUPPORTED_FIBRE); + cmd->advertising = (ADVERTISED_10000baseT_Full | ADVERTISED_FIBRE); + cmd->port = PORT_FIBRE; + } else { + cmd->supported = (SUPPORTED_1000baseT_Full | SUPPORTED_100baseT_Full + | SUPPORTED_100baseT_Half | SUPPORTED_10baseT_Full + | SUPPORTED_10baseT_Half | SUPPORTED_Autoneg + | SUPPORTED_TP); + cmd->advertising = (ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg + | ADVERTISED_TP); + cmd->port = PORT_TP; + } - cmd->port = PORT_FIBRE; cmd->autoneg = port->autoneg == 1 ? AUTONEG_ENABLE : AUTONEG_DISABLE; return 0; From 6709d9521df05c105343473ab8b147e2ef1e13d8 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Wed, 4 May 2011 22:40:46 +0000 Subject: [PATCH 06/24] be2net: Fixed bugs related to PVID. Fixed bug to make sure 'pvid' retrieval will work on big endian hosts. Fixed incorrect comparison between the Rx Completion's 16-bit VLAN TCI and the PVID. Now comparing only the relevant 12 bits corresponding to the VID. Renamed 'vid' field under Rx Completion to 'vlan_tag' to reflect accurate description. Signed-off-by: Somnath Kotur Signed-off-by: David S. Miller --- drivers/net/benet/be.h | 2 +- drivers/net/benet/be_cmds.c | 2 +- drivers/net/benet/be_main.c | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/benet/be.h b/drivers/net/benet/be.h index 66823eded7a3b7..2353eca32593f7 100644 --- a/drivers/net/benet/be.h +++ b/drivers/net/benet/be.h @@ -213,7 +213,7 @@ struct be_rx_stats { struct be_rx_compl_info { u32 rss_hash; - u16 vid; + u16 vlan_tag; u16 pkt_size; u16 rxq_idx; u16 mac_id; diff --git a/drivers/net/benet/be_cmds.c b/drivers/net/benet/be_cmds.c index 1e2d825bb94a2b..9dc9394fd4cae5 100644 --- a/drivers/net/benet/be_cmds.c +++ b/drivers/net/benet/be_cmds.c @@ -132,7 +132,7 @@ static void be_async_grp5_pvid_state_process(struct be_adapter *adapter, struct be_async_event_grp5_pvid_state *evt) { if (evt->enabled) - adapter->pvid = evt->tag; + adapter->pvid = le16_to_cpu(evt->tag); else adapter->pvid = 0; } diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c index 02a0443d1821d5..9187fb4e08f116 100644 --- a/drivers/net/benet/be_main.c +++ b/drivers/net/benet/be_main.c @@ -1018,7 +1018,8 @@ static void be_rx_compl_process(struct be_adapter *adapter, kfree_skb(skb); return; } - vlan_hwaccel_receive_skb(skb, adapter->vlan_grp, rxcp->vid); + vlan_hwaccel_receive_skb(skb, adapter->vlan_grp, + rxcp->vlan_tag); } else { netif_receive_skb(skb); } @@ -1076,7 +1077,8 @@ static void be_rx_compl_process_gro(struct be_adapter *adapter, if (likely(!rxcp->vlanf)) napi_gro_frags(&eq_obj->napi); else - vlan_gro_frags(&eq_obj->napi, adapter->vlan_grp, rxcp->vid); + vlan_gro_frags(&eq_obj->napi, adapter->vlan_grp, + rxcp->vlan_tag); } static void be_parse_rx_compl_v1(struct be_adapter *adapter, @@ -1102,7 +1104,8 @@ static void be_parse_rx_compl_v1(struct be_adapter *adapter, rxcp->pkt_type = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, cast_enc, compl); rxcp->vtm = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vtm, compl); - rxcp->vid = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vlan_tag, compl); + rxcp->vlan_tag = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vlan_tag, + compl); } static void be_parse_rx_compl_v0(struct be_adapter *adapter, @@ -1128,7 +1131,8 @@ static void be_parse_rx_compl_v0(struct be_adapter *adapter, rxcp->pkt_type = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, cast_enc, compl); rxcp->vtm = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vtm, compl); - rxcp->vid = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vlan_tag, compl); + rxcp->vlan_tag = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vlan_tag, + compl); } static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo) @@ -1155,9 +1159,11 @@ static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo) rxcp->vlanf = 0; if (!lancer_chip(adapter)) - rxcp->vid = swab16(rxcp->vid); + rxcp->vlan_tag = swab16(rxcp->vlan_tag); - if ((adapter->pvid == rxcp->vid) && !adapter->vlan_tag[rxcp->vid]) + if (((adapter->pvid & VLAN_VID_MASK) == + (rxcp->vlan_tag & VLAN_VID_MASK)) && + !adapter->vlan_tag[rxcp->vlan_tag]) rxcp->vlanf = 0; /* As the compl has been parsed, reset it; we wont touch it again */ From 057bef938896e6266ae24ec4266d24792d27c29a Mon Sep 17 00:00:00 2001 From: Matvejchikov Ilya Date: Fri, 6 May 2011 06:23:09 +0000 Subject: [PATCH 07/24] NET: slip, fix ldisc->open retval TTY layer expects 0 if the ldisc->open operation succeeded. Signed-off-by : Matvejchikov Ilya Acked-by: Oliver Hartkopp Acked-by: Alan Cox Signed-off-by: David S. Miller --- drivers/net/slip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/slip.c b/drivers/net/slip.c index 86cbb9ea2f269e..8ec1a9a0bb9ae0 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -853,7 +853,9 @@ static int slip_open(struct tty_struct *tty) /* Done. We have linked the TTY line to a channel. */ rtnl_unlock(); tty->receive_room = 65536; /* We don't flow control */ - return sl->dev->base_addr; + + /* TTY layer expects 0 on success */ + return 0; err_free_bufs: sl_free_bufs(sl); From ce3dad0f74e6b240f0b1dedbd8ea268a3f298d82 Mon Sep 17 00:00:00 2001 From: Toshiharu Okada Date: Fri, 6 May 2011 02:53:51 +0000 Subject: [PATCH 08/24] PCH_GbE : Fixed the issue of collision detection The collision detection setting was invalid. When collision occurred, because data was not resent, there was an issue to which a transmitting throughput falls. This patch enables the collision detection. Signed-off-by: Toshiharu Okada Signed-off-by: David S. Miller --- drivers/net/pch_gbe/pch_gbe_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 2ef2f9cdefa6ff..4ebd1d4ad3ed24 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -43,8 +43,7 @@ const char pch_driver_version[] = DRV_VERSION; #define PCH_GBE_MAC_RGMII_CTRL_SETTING ( \ PCH_GBE_CHIP_TYPE_INTERNAL | \ - PCH_GBE_RGMII_MODE_RGMII | \ - PCH_GBE_CRS_SEL \ + PCH_GBE_RGMII_MODE_RGMII \ ) /* Ethertype field values */ From 5d05a04d283061b586e8dc819cfa6f4b8cfd5948 Mon Sep 17 00:00:00 2001 From: Toshiharu Okada Date: Fri, 6 May 2011 02:53:56 +0000 Subject: [PATCH 09/24] PCH_GbE : Fixed the issue of checksum judgment The checksum judgment was mistaken. Judgment result 0:Correct 1:Wrong This patch fixes the issue. Signed-off-by: Toshiharu Okada Signed-off-by: David S. Miller --- drivers/net/pch_gbe/pch_gbe_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 4ebd1d4ad3ed24..9f6c4025c6eaed 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -1493,12 +1493,11 @@ pch_gbe_clean_rx(struct pch_gbe_adapter *adapter, /* Write meta date of skb */ skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); - if ((tcp_ip_status & PCH_GBE_RXD_ACC_STAT_TCPIPOK) == - PCH_GBE_RXD_ACC_STAT_TCPIPOK) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { + if (tcp_ip_status & PCH_GBE_RXD_ACC_STAT_TCPIPOK) skb->ip_summed = CHECKSUM_NONE; - } + else + skb->ip_summed = CHECKSUM_UNNECESSARY; + napi_gro_receive(&adapter->napi, skb); (*work_done)++; pr_debug("Receive skb->ip_summed: %d length: %d\n", From b0e6baf5619a6fa3eaf43b55fdb4daa362c3c916 Mon Sep 17 00:00:00 2001 From: Tomoya Date: Mon, 9 May 2011 01:19:37 +0000 Subject: [PATCH 10/24] pch_gbe: support ML7223 IOH Support new device OKI SEMICONDUCTOR ML7223 IOH(Input/Output Hub). The ML7223 IOH is for MP(Media Phone) use. The ML7223 is companion chip for Intel Atom E6xx series. The ML7223 is completely compatible for Intel EG20T PCH. Signed-off-by: Tomoya MORINAGA Signed-off-by: David S. Miller --- drivers/net/Kconfig | 8 +++++++- drivers/net/pch_gbe/pch_gbe_main.c | 11 +++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index dc280bc8eba2e0..6c884ef1b0690e 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2536,7 +2536,7 @@ config S6GMAC source "drivers/net/stmmac/Kconfig" config PCH_GBE - tristate "PCH Gigabit Ethernet" + tristate "Intel EG20T PCH / OKI SEMICONDUCTOR ML7223 IOH GbE" depends on PCI select MII ---help--- @@ -2548,6 +2548,12 @@ config PCH_GBE to Gigabit Ethernet. This driver enables Gigabit Ethernet function. + This driver also can be used for OKI SEMICONDUCTOR IOH(Input/ + Output Hub), ML7223. + ML7223 IOH is for MP(Media Phone) use. + ML7223 is companion chip for Intel Atom E6xx series. + ML7223 is completely compatible for Intel EG20T PCH. + endif # NETDEV_1000 # diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 9f6c4025c6eaed..56d049a472da4e 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -34,6 +34,10 @@ const char pch_driver_version[] = DRV_VERSION; #define PCH_GBE_COPYBREAK_DEFAULT 256 #define PCH_GBE_PCI_BAR 1 +/* Macros for ML7223 */ +#define PCI_VENDOR_ID_ROHM 0x10db +#define PCI_DEVICE_ID_ROHM_ML7223_GBE 0x8013 + #define PCH_GBE_TX_WEIGHT 64 #define PCH_GBE_RX_WEIGHT 64 #define PCH_GBE_RX_BUFFER_WRITE 16 @@ -2418,6 +2422,13 @@ static DEFINE_PCI_DEVICE_TABLE(pch_gbe_pcidev_id) = { .class = (PCI_CLASS_NETWORK_ETHERNET << 8), .class_mask = (0xFFFF00) }, + {.vendor = PCI_VENDOR_ID_ROHM, + .device = PCI_DEVICE_ID_ROHM_ML7223_GBE, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .class = (PCI_CLASS_NETWORK_ETHERNET << 8), + .class_mask = (0xFFFF00) + }, /* required last entry */ {0} }; From 315c34dae0069d0c67abd714bb846cd466289c7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Apr 2011 10:55:07 +0200 Subject: [PATCH 11/24] netfilter: ctnetlink: fix timestamp support for new conntracks This patch fixes the missing initialization of the start time if the timestamp support is enabled. libnetfilter_conntrack/utils# conntrack -E & libnetfilter_conntrack/utils# ./conntrack_create tcp 6 109 ESTABLISHED src=1.1.1.1 dst=2.2.2.2 sport=1025 dport=21 packets=0 bytes=0 [UNREPLIED] src=2.2.2.2 dst=1.1.1.1 sport=21 dport=1025 packets=0 bytes=0 mark=0 delta-time=1303296401 use=2 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_netlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 30bf8a167fc880..482e90c6185087 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1334,6 +1334,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, struct nf_conn *ct; int err = -EINVAL; struct nf_conntrack_helper *helper; + struct nf_conn_tstamp *tstamp; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) @@ -1451,6 +1452,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, __set_bit(IPS_EXPECTED_BIT, &ct->status); ct->master = master_ct; } + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_to_ns(ktime_get_real()); add_timer(&ct->timeout); nf_conntrack_hash_insert(ct); From 5a6351eecf8c87afed9c883bb6341d09406d74ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Apr 2011 10:57:21 +0200 Subject: [PATCH 12/24] netfilter: fix ebtables compat support commit 255d0dc34068a976 (netfilter: x_table: speedup compat operations) made ebtables not working anymore. 1) xt_compat_calc_jump() is not an exact match lookup 2) compat_table_info() has a typo in xt_compat_init_offsets() call 3) compat_do_replace() misses a xt_compat_init_offsets() call Reported-by: dann frazier Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 3 ++- net/netfilter/x_tables.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 893669caa8de6c..9707079bc40a82 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1766,7 +1766,7 @@ static int compat_table_info(const struct ebt_table_info *info, newinfo->entries_size = size; - xt_compat_init_offsets(AF_INET, info->nentries); + xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } @@ -2240,6 +2240,7 @@ static int compat_do_replace(struct net *net, void __user *user, xt_compat_lock(NFPROTO_BRIDGE); + xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (ret < 0) goto out_unlock; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a9adf4c6b299ee..8a025a585d2f7c 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -455,6 +455,7 @@ void xt_compat_flush_offsets(u_int8_t af) vfree(xt[af].compat_tab); xt[af].compat_tab = NULL; xt[af].number = 0; + xt[af].cur = 0; } } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); @@ -473,8 +474,7 @@ int xt_compat_calc_jump(u_int8_t af, unsigned int offset) else return mid ? tmp[mid - 1].delta : 0; } - WARN_ON_ONCE(1); - return 0; + return left ? tmp[left - 1].delta : 0; } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); From 103a9778e07bcc0cd34b5c35a87281454eec719e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Apr 2011 10:58:25 +0200 Subject: [PATCH 13/24] netfilter: ebtables: only call xt_compat_add_offset once per rule The optimizations in commit 255d0dc34068a976 (netfilter: x_table: speedup compat operations) assume that xt_compat_add_offset is called once per rule. ebtables however called it for each match/target found in a rule. The match/watcher/target parser already returns the needed delta, so it is sufficient to move the xt_compat_add_offset call to a more reasonable location. While at it, also get rid of the unused COMPAT iterator macros. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 61 +++++---------------------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9707079bc40a82..1a92b369c8202c 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1882,7 +1882,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, struct xt_match *match; struct xt_target *wt; void *dst = NULL; - int off, pad = 0, ret = 0; + int off, pad = 0; unsigned int size_kern, entry_offset, match_size = mwt->match_size; strlcpy(name, mwt->u.name, sizeof(name)); @@ -1935,13 +1935,6 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, break; } - if (!dst) { - ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, - off + ebt_compat_entry_padsize()); - if (ret < 0) - return ret; - } - state->buf_kern_offset += match_size + off; state->buf_user_offset += match_size; pad = XT_ALIGN(size_kern) - size_kern; @@ -2016,50 +2009,6 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, return growth; } -#define EBT_COMPAT_WATCHER_ITERATE(e, fn, args...) \ -({ \ - unsigned int __i; \ - int __ret = 0; \ - struct compat_ebt_entry_mwt *__watcher; \ - \ - for (__i = e->watchers_offset; \ - __i < (e)->target_offset; \ - __i += __watcher->watcher_size + \ - sizeof(struct compat_ebt_entry_mwt)) { \ - __watcher = (void *)(e) + __i; \ - __ret = fn(__watcher , ## args); \ - if (__ret != 0) \ - break; \ - } \ - if (__ret == 0) { \ - if (__i != (e)->target_offset) \ - __ret = -EINVAL; \ - } \ - __ret; \ -}) - -#define EBT_COMPAT_MATCH_ITERATE(e, fn, args...) \ -({ \ - unsigned int __i; \ - int __ret = 0; \ - struct compat_ebt_entry_mwt *__match; \ - \ - for (__i = sizeof(struct ebt_entry); \ - __i < (e)->watchers_offset; \ - __i += __match->match_size + \ - sizeof(struct compat_ebt_entry_mwt)) { \ - __match = (void *)(e) + __i; \ - __ret = fn(__match , ## args); \ - if (__ret != 0) \ - break; \ - } \ - if (__ret == 0) { \ - if (__i != (e)->watchers_offset) \ - __ret = -EINVAL; \ - } \ - __ret; \ -}) - /* called for all ebt_entry structures. */ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, unsigned int *total, @@ -2132,6 +2081,14 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, } } + if (state->buf_kern_start == NULL) { + unsigned int offset = buf_start - (char *) base; + + ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset); + if (ret < 0) + return ret; + } + startoff = state->buf_user_offset - startoff; BUG_ON(*total < startoff); From 1ae132b0347907ac95b8bc9dba37934f59d2a508 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 3 May 2011 22:09:30 +0200 Subject: [PATCH 14/24] IPVS: Change of socket usage to enable name space exit. If the sync daemons run in a name space while it crashes or get killed, there is no way to stop them except for a reboot. When all patches are there, ip_vs_core will handle register_pernet_(), i.e. ip_vs_sync_init() and ip_vs_sync_cleanup() will be removed. Kernel threads should not increment the use count of a socket. By calling sk_change_net() after creating a socket this is avoided. sock_release cant be used intead sk_release_kernel() should be used. Thanks Eric W Biederman for your advices. Signed-off-by: Hans Schillstrom [horms@verge.net.au: minor edit to changelog] Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 58 +++++++++++++++++++++------------ 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 07accf6b2401a0..a0791dc05a2758 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1896,7 +1896,7 @@ static int __net_init __ip_vs_init(struct net *net) static void __net_exit __ip_vs_cleanup(struct net *net) { - IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen); + IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); } static struct pernet_operations ipvs_core_ops = { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3e7961e85e9caa..0cce95310820e4 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1303,13 +1303,18 @@ static struct socket *make_send_sock(struct net *net) struct socket *sock; int result; - /* First create a socket */ - result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); + /* First create a socket move it to right name space later */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1334,8 +1339,8 @@ static struct socket *make_send_sock(struct net *net) return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -1350,12 +1355,17 @@ static struct socket *make_receive_sock(struct net *net) int result; /* First create a socket */ - result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = 1; @@ -1377,8 +1387,8 @@ static struct socket *make_receive_sock(struct net *net) return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -1473,7 +1483,7 @@ static int sync_thread_master(void *data) ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo); return 0; @@ -1513,7 +1523,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo->buf); kfree(tinfo); @@ -1601,7 +1611,7 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) outbuf: kfree(buf); outsocket: - sock_release(sock); + sk_release_kernel(sock->sk); out: return result; } @@ -1610,6 +1620,7 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) int stop_sync_thread(struct net *net, int state) { struct netns_ipvs *ipvs = net_ipvs(net); + int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); @@ -1629,7 +1640,7 @@ int stop_sync_thread(struct net *net, int state) spin_lock_bh(&ipvs->sync_lock); ipvs->sync_state &= ~IP_VS_STATE_MASTER; spin_unlock_bh(&ipvs->sync_lock); - kthread_stop(ipvs->master_thread); + retc = kthread_stop(ipvs->master_thread); ipvs->master_thread = NULL; } else if (state == IP_VS_STATE_BACKUP) { if (!ipvs->backup_thread) @@ -1639,16 +1650,14 @@ int stop_sync_thread(struct net *net, int state) task_pid_nr(ipvs->backup_thread)); ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(ipvs->backup_thread); + retc = kthread_stop(ipvs->backup_thread); ipvs->backup_thread = NULL; - } else { - return -EINVAL; } /* decrease the module use count */ ip_vs_use_count_dec(); - return 0; + return retc; } /* @@ -1670,8 +1679,15 @@ static int __net_init __ip_vs_sync_init(struct net *net) static void __ip_vs_sync_cleanup(struct net *net) { - stop_sync_thread(net, IP_VS_STATE_MASTER); - stop_sync_thread(net, IP_VS_STATE_BACKUP); + int retc; + + retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); } static struct pernet_operations ipvs_sync_ops = { @@ -1682,10 +1698,10 @@ static struct pernet_operations ipvs_sync_ops = { int __init ip_vs_sync_init(void) { - return register_pernet_subsys(&ipvs_sync_ops); + return register_pernet_device(&ipvs_sync_ops); } void ip_vs_sync_cleanup(void) { - unregister_pernet_subsys(&ipvs_sync_ops); + unregister_pernet_device(&ipvs_sync_ops); } From 7a4f0761fce32ff4918a7c23b08db564ad33092d Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 3 May 2011 22:09:31 +0200 Subject: [PATCH 15/24] IPVS: init and cleanup restructuring DESCRIPTION This patch tries to restore the initial init and cleanup sequences that was before namspace patch. Netns also requires action when net devices unregister which has never been implemented. I.e this patch also covers when a device moves into a network namespace, and has to be released. IMPLEMENTATION The number of calls to register_pernet_device have been reduced to one for the ip_vs.ko Schedulers still have their own calls. This patch adds a function __ip_vs_service_cleanup() and an enable flag for the netfilter hooks. The nf hooks will be enabled when the first service is loaded and never disabled again, except when a namespace exit starts. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov [horms@verge.net.au: minor edit to changelog] Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 +++++ net/netfilter/ipvs/ip_vs_app.c | 15 +--- net/netfilter/ipvs/ip_vs_conn.c | 12 +--- net/netfilter/ipvs/ip_vs_core.c | 101 +++++++++++++++++++++++--- net/netfilter/ipvs/ip_vs_ctl.c | 120 ++++++++++++++++++++++++++----- net/netfilter/ipvs/ip_vs_est.c | 14 +--- net/netfilter/ipvs/ip_vs_proto.c | 11 +-- net/netfilter/ipvs/ip_vs_sync.c | 13 +--- 8 files changed, 223 insertions(+), 80 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d516f00c8e0fc3..86aefed6140b34 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -791,6 +791,7 @@ struct ip_vs_app { /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ + int enable; /* enable like nf_hooks do */ /* * Hash table: for real service lookups */ @@ -1089,6 +1090,22 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) atomic_inc(&ctl_cp->n_control); } +/* + * IPVS netns init & cleanup functions + */ +extern int __ip_vs_estimator_init(struct net *net); +extern int __ip_vs_control_init(struct net *net); +extern int __ip_vs_protocol_init(struct net *net); +extern int __ip_vs_app_init(struct net *net); +extern int __ip_vs_conn_init(struct net *net); +extern int __ip_vs_sync_init(struct net *net); +extern void __ip_vs_conn_cleanup(struct net *net); +extern void __ip_vs_app_cleanup(struct net *net); +extern void __ip_vs_protocol_cleanup(struct net *net); +extern void __ip_vs_control_cleanup(struct net *net); +extern void __ip_vs_estimator_cleanup(struct net *net); +extern void __ip_vs_sync_cleanup(struct net *net); +extern void __ip_vs_service_cleanup(struct net *net); /* * IPVS application functions diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 2dc6de13ac1890..51f3af7c474301 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -576,7 +576,7 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -static int __net_init __ip_vs_app_init(struct net *net) +int __net_init __ip_vs_app_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -585,26 +585,17 @@ static int __net_init __ip_vs_app_init(struct net *net) return 0; } -static void __net_exit __ip_vs_app_cleanup(struct net *net) +void __net_exit __ip_vs_app_cleanup(struct net *net) { proc_net_remove(net, "ip_vs_app"); } -static struct pernet_operations ip_vs_app_ops = { - .init = __ip_vs_app_init, - .exit = __ip_vs_app_cleanup, -}; - int __init ip_vs_app_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_app_ops); - return rv; + return 0; } void ip_vs_app_cleanup(void) { - unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c97bd45975bed6..d3fd91bbba4905 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1258,22 +1258,17 @@ int __net_init __ip_vs_conn_init(struct net *net) return 0; } -static void __net_exit __ip_vs_conn_cleanup(struct net *net) +void __net_exit __ip_vs_conn_cleanup(struct net *net) { /* flush all the connection entries first */ ip_vs_conn_flush(net); proc_net_remove(net, "ip_vs_conn"); proc_net_remove(net, "ip_vs_conn_sync"); } -static struct pernet_operations ipvs_conn_ops = { - .init = __ip_vs_conn_init, - .exit = __ip_vs_conn_cleanup, -}; int __init ip_vs_conn_init(void) { int idx; - int retc; /* Compute size and mask */ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; @@ -1309,17 +1304,14 @@ int __init ip_vs_conn_init(void) rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - retc = register_pernet_subsys(&ipvs_conn_ops); - /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return retc; + return 0; } void ip_vs_conn_cleanup(void) { - unregister_pernet_subsys(&ipvs_conn_ops); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a0791dc05a2758..a74dae6c5dbce9 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1113,6 +1113,9 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) return NF_ACCEPT; net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1343,6 +1346,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) return NF_ACCEPT; /* The packet looks wrong, ignore */ net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->protocol); if (!pd) return NF_ACCEPT; @@ -1529,6 +1533,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); /* Bad... Do not break raw sockets */ @@ -1562,7 +1571,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } - net = skb_net(skb); /* Protocol supported? */ pd = ip_vs_proto_data_get(net, iph.protocol); if (unlikely(!pd)) @@ -1588,7 +1596,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - net = skb_net(skb); ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { @@ -1743,10 +1750,16 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + return ip_vs_in_icmp(skb, &r, hooknum); } @@ -1757,10 +1770,16 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) return NF_ACCEPT; + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + return ip_vs_in_icmp_v6(skb, &r, hooknum); } #endif @@ -1884,21 +1903,72 @@ static int __net_init __ip_vs_init(struct net *net) pr_err("%s(): no memory.\n", __func__); return -ENOMEM; } + /* Hold the beast until a service is registerd */ + ipvs->enable = 0; ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; + + if (__ip_vs_estimator_init(net) < 0) + goto estimator_fail; + + if (__ip_vs_control_init(net) < 0) + goto control_fail; + + if (__ip_vs_protocol_init(net) < 0) + goto protocol_fail; + + if (__ip_vs_app_init(net) < 0) + goto app_fail; + + if (__ip_vs_conn_init(net) < 0) + goto conn_fail; + + if (__ip_vs_sync_init(net) < 0) + goto sync_fail; + printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", sizeof(struct netns_ipvs), ipvs->gen); return 0; +/* + * Error handling + */ + +sync_fail: + __ip_vs_conn_cleanup(net); +conn_fail: + __ip_vs_app_cleanup(net); +app_fail: + __ip_vs_protocol_cleanup(net); +protocol_fail: + __ip_vs_control_cleanup(net); +control_fail: + __ip_vs_estimator_cleanup(net); +estimator_fail: + return -ENOMEM; } static void __net_exit __ip_vs_cleanup(struct net *net) { + __ip_vs_service_cleanup(net); /* ip_vs_flush() with locks */ + __ip_vs_conn_cleanup(net); + __ip_vs_app_cleanup(net); + __ip_vs_protocol_cleanup(net); + __ip_vs_control_cleanup(net); + __ip_vs_estimator_cleanup(net); IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); } +static void __net_exit __ip_vs_dev_cleanup(struct net *net) +{ + EnterFunction(2); + net_ipvs(net)->enable = 0; /* Disable packet reception */ + __ip_vs_sync_cleanup(net); + LeaveFunction(2); +} + static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, .exit = __ip_vs_cleanup, @@ -1906,6 +1976,10 @@ static struct pernet_operations ipvs_core_ops = { .size = sizeof(struct netns_ipvs), }; +static struct pernet_operations ipvs_core_dev_ops = { + .exit = __ip_vs_dev_cleanup, +}; + /* * Initialize IP Virtual Server */ @@ -1913,10 +1987,6 @@ static int __init ip_vs_init(void) { int ret; - ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ - if (ret < 0) - return ret; - ip_vs_estimator_init(); ret = ip_vs_control_init(); if (ret < 0) { @@ -1944,15 +2014,28 @@ static int __init ip_vs_init(void) goto cleanup_conn; } + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + goto cleanup_sync; + + ret = register_pernet_device(&ipvs_core_dev_ops); + if (ret < 0) + goto cleanup_sub; + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { pr_err("can't register hooks.\n"); - goto cleanup_sync; + goto cleanup_dev; } pr_info("ipvs loaded.\n"); + return ret; +cleanup_dev: + unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: + unregister_pernet_subsys(&ipvs_core_ops); cleanup_sync: ip_vs_sync_cleanup(); cleanup_conn: @@ -1964,20 +2047,20 @@ static int __init ip_vs_init(void) ip_vs_control_cleanup(); cleanup_estimator: ip_vs_estimator_cleanup(); - unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ return ret; } static void __exit ip_vs_cleanup(void) { nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + unregister_pernet_device(&ipvs_core_dev_ops); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_sync_cleanup(); ip_vs_conn_cleanup(); ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); ip_vs_estimator_cleanup(); - unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ae47090bf45fe1..ea722810faf3c2 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -69,6 +69,11 @@ int ip_vs_get_debug_level(void) } #endif + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc); + + #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ static int __ip_vs_addr_is_local_v6(struct net *net, @@ -1214,6 +1219,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; return 0; @@ -1472,6 +1479,84 @@ static int ip_vs_flush(struct net *net) return 0; } +/* + * Delete service by {netns} in the service table. + * Called by __ip_vs_cleanup() + */ +void __ip_vs_service_cleanup(struct net *net) +{ + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + ip_vs_flush(net); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} +/* + * Release dst hold by dst_cache + */ +static inline void +__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +{ + spin_lock_bh(&dest->dst_lock); + if (dest->dst_cache && dest->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + ip_vs_dst_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* + * Netdev event receiver + * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to + * a device that is "unregister" it must be released. + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + } + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + + } + } + + list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { + __ip_vs_dev_reset(dest, dev); + } + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} /* * Zero counters in a service or all services @@ -3588,6 +3673,10 @@ void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { } #endif +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +}; + int __net_init __ip_vs_control_init(struct net *net) { int idx; @@ -3626,7 +3715,7 @@ int __net_init __ip_vs_control_init(struct net *net) return -ENOMEM; } -static void __net_exit __ip_vs_control_cleanup(struct net *net) +void __net_exit __ip_vs_control_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -3639,11 +3728,6 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) free_percpu(ipvs->tot_stats.cpustats); } -static struct pernet_operations ipvs_control_ops = { - .init = __ip_vs_control_init, - .exit = __ip_vs_control_cleanup, -}; - int __init ip_vs_control_init(void) { int idx; @@ -3657,33 +3741,32 @@ int __init ip_vs_control_init(void) INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); } - ret = register_pernet_subsys(&ipvs_control_ops); - if (ret) { - pr_err("cannot register namespace.\n"); - goto err; - } - smp_wmb(); /* Do we really need it now ? */ ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); - goto err_net; + goto err_sock; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); - nf_unregister_sockopt(&ip_vs_sockopts); - goto err_net; + goto err_genl; } + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + goto err_notf; + LeaveFunction(2); return 0; -err_net: - unregister_pernet_subsys(&ipvs_control_ops); -err: +err_notf: + ip_vs_genl_unregister(); +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: return ret; } @@ -3691,7 +3774,6 @@ int __init ip_vs_control_init(void) void ip_vs_control_cleanup(void) { EnterFunction(2); - unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 8c8766ca56ada4..508cce98777c74 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -192,7 +192,7 @@ void ip_vs_read_estimator(struct ip_vs_stats_user *dst, dst->outbps = (e->outbps + 0xF) >> 5; } -static int __net_init __ip_vs_estimator_init(struct net *net) +int __net_init __ip_vs_estimator_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -203,24 +203,16 @@ static int __net_init __ip_vs_estimator_init(struct net *net) return 0; } -static void __net_exit __ip_vs_estimator_exit(struct net *net) +void __net_exit __ip_vs_estimator_cleanup(struct net *net) { del_timer_sync(&net_ipvs(net)->est_timer); } -static struct pernet_operations ip_vs_app_ops = { - .init = __ip_vs_estimator_init, - .exit = __ip_vs_estimator_exit, -}; int __init ip_vs_estimator_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_app_ops); - return rv; + return 0; } void ip_vs_estimator_cleanup(void) { - unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 17484a4416ef91..eb86028536fc94 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -316,7 +316,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, /* * per network name-space init */ -static int __net_init __ip_vs_protocol_init(struct net *net) +int __net_init __ip_vs_protocol_init(struct net *net) { #ifdef CONFIG_IP_VS_PROTO_TCP register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); @@ -336,7 +336,7 @@ static int __net_init __ip_vs_protocol_init(struct net *net) return 0; } -static void __net_exit __ip_vs_protocol_cleanup(struct net *net) +void __net_exit __ip_vs_protocol_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; @@ -349,11 +349,6 @@ static void __net_exit __ip_vs_protocol_cleanup(struct net *net) } } -static struct pernet_operations ipvs_proto_ops = { - .init = __ip_vs_protocol_init, - .exit = __ip_vs_protocol_cleanup, -}; - int __init ip_vs_protocol_init(void) { char protocols[64]; @@ -382,7 +377,6 @@ int __init ip_vs_protocol_init(void) REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); - return register_pernet_subsys(&ipvs_proto_ops); return 0; } @@ -393,7 +387,6 @@ void ip_vs_protocol_cleanup(void) struct ip_vs_protocol *pp; int i; - unregister_pernet_subsys(&ipvs_proto_ops); /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 0cce95310820e4..e292e5bddc7000 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1663,7 +1663,7 @@ int stop_sync_thread(struct net *net, int state) /* * Initialize data struct for each netns */ -static int __net_init __ip_vs_sync_init(struct net *net) +int __net_init __ip_vs_sync_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1677,7 +1677,7 @@ static int __net_init __ip_vs_sync_init(struct net *net) return 0; } -static void __ip_vs_sync_cleanup(struct net *net) +void __ip_vs_sync_cleanup(struct net *net) { int retc; @@ -1690,18 +1690,11 @@ static void __ip_vs_sync_cleanup(struct net *net) pr_err("Failed to stop Backup Daemon\n"); } -static struct pernet_operations ipvs_sync_ops = { - .init = __ip_vs_sync_init, - .exit = __ip_vs_sync_cleanup, -}; - - int __init ip_vs_sync_init(void) { - return register_pernet_device(&ipvs_sync_ops); + return 0; } void ip_vs_sync_cleanup(void) { - unregister_pernet_device(&ipvs_sync_ops); } From 4319cc0cf5bb894b7368008cdf6dd20eb8868018 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 10 May 2011 09:55:44 +0200 Subject: [PATCH 16/24] netfilter: IPv6: initialize TOS field in REJECT target module The IPv6 header is not zeroed out in alloc_skb so we must initialize it properly unless we want to see IPv6 packets with random TOS fields floating around. The current implementation resets the flow label but this could be changed if deemed necessary. We stumbled upon this issue when trying to apply a mangle rule to the RST packet generated by the REJECT target module. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_REJECT.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 28e74488a32977..a5a4c5dd53961c 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -45,6 +45,8 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) int tcphoff, needs_ack; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; struct dst_entry *dst = NULL; u8 proto; struct flowi6 fl6; @@ -124,7 +126,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) skb_put(nskb, sizeof(struct ipv6hdr)); skb_reset_network_header(nskb); ip6h = ipv6_hdr(nskb); - ip6h->version = 6; + *(__be32 *)ip6h = htonl(0x60000000 | (tclass << 20)); ip6h->hop_limit = ip6_dst_hoplimit(dst); ip6h->nexthdr = IPPROTO_TCP; ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); From 1ed2f73d90fb49bcf5704aee7e9084adb882bfc5 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 10 May 2011 10:00:21 +0200 Subject: [PATCH 17/24] netfilter: IPv6: fix DSCP mangle code The mask indicates the bits one wants to zero out, so it needs to be inverted before applying to the original TOS field. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_DSCP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 0a229191e55b2f..ae8271652efa9c 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t orig, nv; orig = ipv6_get_dsfield(iph); - nv = (orig & info->tos_mask) ^ info->tos_value; + nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (!skb_make_writable(skb, sizeof(struct iphdr))) From 93bbce1ad0cd788190dd7d6c17d289f771fe3d0d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 May 2011 12:13:36 +0200 Subject: [PATCH 18/24] netfilter: revert a2361c8735e07322023aedc36e4938b35af31eb0 This patch reverts a2361c8735e07322023aedc36e4938b35af31eb0: "[PATCH] netfilter: xt_conntrack: warn about use in raw table" Florian Wesphal says: "... when the packet was sent from the local machine the skb already has ->nfct attached, and -m conntrack seems to do the right thing." Acked-by: Jan Engelhardt Reported-by: Florian Wesphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_conntrack.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 481a86fdc40973..61805d7b38aa62 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -272,11 +272,6 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; - if (strcmp(par->table, "raw") == 0) { - pr_info("state is undetermined at the time of raw table\n"); - return -EINVAL; - } - ret = nf_ct_l3proto_try_module_get(par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", From 55aee10dec477254241e4f72968f92e0543b33ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 May 2011 12:22:54 -0700 Subject: [PATCH 19/24] vlan: fix GVRP at dismantle time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ip link add link eth2 eth2.103 type vlan id 103 gvrp on loose_binding on ip link set eth2.103 up rmmod tg3 # driver providing eth2 BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] garp_request_leave+0x3e/0xc0 [garp] PGD 11d251067 PUD 11b9e0067 PMD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/virtual/net/eth2.104/ifindex CPU 0 Modules linked in: tg3(-) 8021q garp nfsd lockd auth_rpcgss sunrpc libphy sg [last unloaded: x_tables] Pid: 11494, comm: rmmod Tainted: G W 2.6.39-rc6-00261-gfd71257-dirty #580 HP ProLiant BL460c G6 RIP: 0010:[] [] garp_request_leave+0x3e/0xc0 [garp] RSP: 0018:ffff88007a19bae8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88011b5e2000 RCX: 0000000000000002 RDX: 0000000000000000 RSI: 0000000000000175 RDI: ffffffffa0030d5b RBP: ffff88007a19bb18 R08: 0000000000000001 R09: ffff88011bd64a00 R10: ffff88011d34ec00 R11: 0000000000000000 R12: 0000000000000002 R13: ffff88007a19bc48 R14: ffff88007a19bb88 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88011fc00000(0063) knlGS:00000000f77d76c0 CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000011a675000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process rmmod (pid: 11494, threadinfo ffff88007a19a000, task ffff8800798595c0) Stack: ffff88007a19bb36 ffff88011c84b800 ffff88011b5e2000 ffff88007a19bc48 ffff88007a19bb88 0000000000000006 ffff88007a19bb38 ffffffffa003a5f6 ffff88007a19bb38 670088007a19bba8 ffff88007a19bb58 ffffffffa00397e7 Call Trace: [] vlan_gvrp_request_leave+0x46/0x50 [8021q] [] vlan_dev_stop+0xb7/0xc0 [8021q] [] __dev_close_many+0x87/0xe0 [] dev_close_many+0x87/0x110 [] rollback_registered_many+0xa0/0x240 [] unregister_netdevice_many+0x19/0x60 [] vlan_device_event+0x53b/0x550 [8021q] [] ? ip6mr_device_event+0xa8/0xd0 [] notifier_call_chain+0x53/0x80 [] __raw_notifier_call_chain+0x9/0x10 [] raw_notifier_call_chain+0x11/0x20 [] call_netdevice_notifiers+0x32/0x60 [] rollback_registered_many+0x10f/0x240 [] rollback_registered+0x2f/0x40 [] unregister_netdevice_queue+0x58/0x90 [] unregister_netdev+0x1b/0x30 [] tg3_remove_one+0x6f/0x10b [tg3] We should call vlan_gvrp_request_leave() from unregister_vlan_dev(), not from vlan_dev_stop(), because vlan_gvrp_uninit_applicant() is called right after unregister_netdevice_queue(). In batch mode, unregister_netdevice_queue() doesn’t immediately call vlan_dev_stop(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 +++ net/8021q/vlan_dev.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 7850412f52b7e2..0eb1a886b370bb 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -124,6 +124,9 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) grp->nr_vlans--; + if (vlan->flags & VLAN_FLAG_GVRP) + vlan_gvrp_request_leave(dev); + vlan_group_set_device(grp, vlan_id, NULL); if (!grp->killall) synchronize_net(); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e34ea9e5e28bce..b2ff6c8d3603da 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -487,9 +487,6 @@ static int vlan_dev_stop(struct net_device *dev) struct vlan_dev_info *vlan = vlan_dev_info(dev); struct net_device *real_dev = vlan->real_dev; - if (vlan->flags & VLAN_FLAG_GVRP) - vlan_gvrp_request_leave(dev); - dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) From e14a599335427f81bbb0008963e59aa9c6449dce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 May 2011 12:26:06 -0700 Subject: [PATCH 20/24] net: dev_close() should check IFF_UP Commit 443457242beb (factorize sync-rcu call in unregister_netdevice_many) mistakenly removed one test from dev_close() Following actions trigger a BUG : modprobe bonding modprobe dummy ifconfig bond0 up ifenslave bond0 dummy0 rmmod dummy dev_close() must not close a non IFF_UP device. With help from Frank Blaschka and Einar EL Lueck Reported-by: Frank Blaschka Reported-by: Einar EL Lueck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 856b6ee9a1d5d5..92009440d28bed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1284,11 +1284,13 @@ static int dev_close_many(struct list_head *head) */ int dev_close(struct net_device *dev) { - LIST_HEAD(single); + if (dev->flags & IFF_UP) { + LIST_HEAD(single); - list_add(&dev->unreg_list, &single); - dev_close_many(&single); - list_del(&single); + list_add(&dev->unreg_list, &single); + dev_close_many(&single); + list_del(&single); + } return 0; } EXPORT_SYMBOL(dev_close); From 43a4dea4c9d44baae38ddc14b9b6d86fde4c8b88 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 May 2011 19:36:38 +0000 Subject: [PATCH 21/24] xfrm: Assign the inner mode output function to the dst entry As it is, we assign the outer modes output function to the dst entry when we create the xfrm bundle. This leads to two problems on interfamily scenarios. We might insert ipv4 packets into ip6_fragment when called from xfrm6_output. The system crashes if we try to fragment an ipv4 packet with ip6_fragment. This issue was introduced with git commit ad0081e4 (ipv6: Fragment locally generated tunnel-mode IPSec6 packets as needed). The second issue is, that we might insert ipv4 packets in netfilter6 and vice versa on interfamily scenarios. With this patch we assign the inner mode output function to the dst entry when we create the xfrm bundle. So xfrm4_output/xfrm6_output from the inner mode is used and the right fragmentation and netfilter functions are called. We switch then to outer mode with the output_finish functions. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/xfrm.h | 3 +++ net/ipv4/xfrm4_output.c | 8 ++++++-- net/ipv4/xfrm4_state.c | 1 + net/ipv6/xfrm6_output.c | 6 +++--- net/ipv6/xfrm6_state.c | 1 + net/xfrm/xfrm_policy.c | 14 +++++++++++++- 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6ae4bc5ce8a712..20afeaa39395ec 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -324,6 +324,7 @@ struct xfrm_state_afinfo { int (*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n); int (*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n); int (*output)(struct sk_buff *skb); + int (*output_finish)(struct sk_buff *skb); int (*extract_input)(struct xfrm_state *x, struct sk_buff *skb); int (*extract_output)(struct xfrm_state *x, @@ -1454,6 +1455,7 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) extern int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm4_output(struct sk_buff *skb); +extern int xfrm4_output_finish(struct sk_buff *skb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); extern int xfrm6_extract_header(struct sk_buff *skb); @@ -1470,6 +1472,7 @@ extern __be32 xfrm6_tunnel_spi_lookup(struct net *net, xfrm_address_t *saddr); extern int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm6_output(struct sk_buff *skb); +extern int xfrm6_output_finish(struct sk_buff *skb); extern int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 571aa96a175c95..2d51840e53a114 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -69,7 +69,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm4_prepare_output); -static int xfrm4_output_finish(struct sk_buff *skb) +int xfrm4_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER if (!skb_dst(skb)->xfrm) { @@ -86,7 +86,11 @@ static int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, skb_dst(skb)->dev, xfrm4_output_finish, + NULL, dst->dev, + x->outer_mode->afinfo->output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 1717c64628d1c5..805d63ef4340fa 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -78,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .init_tempsel = __xfrm4_init_tempsel, .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, + .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8e688b3de9abc6..49a91c5f5623b8 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -79,7 +79,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm6_prepare_output); -static int xfrm6_output_finish(struct sk_buff *skb) +int xfrm6_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; @@ -97,9 +97,9 @@ static int __xfrm6_output(struct sk_buff *skb) if ((x && x->props.mode == XFRM_MODE_TUNNEL) && ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)))) { - return ip6_fragment(skb, xfrm6_output_finish); + return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); } - return xfrm6_output_finish(skb); + return x->outer_mode->afinfo->output_finish(skb); } int xfrm6_output(struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index afe941e9415cdd..248f0b2a7ee934 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -178,6 +178,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .tmpl_sort = __xfrm6_tmpl_sort, .state_sort = __xfrm6_state_sort, .output = xfrm6_output, + .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 15792d8b62721f..b4d745ea8ee14a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1406,6 +1406,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; + struct xfrm_mode *inner_mode; struct dst_entry *dst_prev = NULL; struct dst_entry *dst0 = NULL; int i = 0; @@ -1436,6 +1437,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } + if (xfrm[i]->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(xfrm[i], + xfrm_af2proto(family)); + if (!inner_mode) { + err = -EAFNOSUPPORT; + dst_release(dst); + goto put_states; + } + } else + inner_mode = xfrm[i]->inner_mode; + if (!dst_prev) dst0 = dst1; else { @@ -1464,7 +1476,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->lastuse = now; dst1->input = dst_discard; - dst1->output = xfrm[i]->outer_mode->afinfo->output; + dst1->output = inner_mode->afinfo->output; dst1->next = dst_prev; dst_prev = dst1; From 6fa5ddcc675b937f94d05628e8997c07a80c6cb9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 May 2011 19:43:05 +0000 Subject: [PATCH 22/24] xfrm: Don't allow esn with disabled anti replay detection Unlike the standard case, disabled anti replay detection needs some nontrivial extra treatment on ESN. RFC 4303 states: Note: If a receiver chooses to not enable anti-replay for an SA, then the receiver SHOULD NOT negotiate ESN in an SA management protocol. Use of ESN creates a need for the receiver to manage the anti-replay window (in order to determine the correct value for the high-order bits of the ESN, which are employed in the ICV computation), which is generally contrary to the notion of disabling anti-replay for an SA. So return an error if an ESN state with disabled anti replay detection is inserted for now and add the extra treatment later if we need it. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_replay.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index e8a781422feb46..47f1b8638df998 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -535,6 +535,9 @@ int xfrm_init_replay(struct xfrm_state *x) replay_esn->bmp_len * sizeof(__u32) * 8) return -EINVAL; + if ((x->props.flags & XFRM_STATE_ESN) && replay_esn->replay_window == 0) + return -EINVAL; + if ((x->props.flags & XFRM_STATE_ESN) && x->replay_esn) x->repl = &xfrm_replay_esn; else From aae1e743fee2b5523fb31ee050295f062cb26a31 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 May 2011 07:43:20 +0000 Subject: [PATCH 23/24] net/usb: mark LG VL600 LTE modem ethernet interface as WWAN Like other mobile broadband device ethernet interfaces, mark the LG VL600 with the 'wwan' devtype so userspace knows it needs additional configuration via the AT port before the interface can be used. Signed-off-by: Dan Williams Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index a301479ecc60a5..c924ea2bce07a4 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -567,7 +567,7 @@ static const struct usb_device_id products [] = { { USB_DEVICE_AND_INTERFACE_INFO(0x1004, 0x61aa, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), - .driver_info = 0, + .driver_info = (unsigned long)&wwan_info, }, /* From 0d4420a90b51abdea71585f571bad6d789ff8eb7 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 May 2011 13:12:30 -0700 Subject: [PATCH 24/24] slcan: fix ldisc->open retval TTY layer expects 0 if the ldisc->open operation succeeded. Reported-by: Matvejchikov Ilya Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- drivers/net/can/slcan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index b423965a78d16f..1b49df6b247087 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -583,7 +583,9 @@ static int slcan_open(struct tty_struct *tty) /* Done. We have linked the TTY line to a channel. */ rtnl_unlock(); tty->receive_room = 65536; /* We don't flow control */ - return sl->dev->base_addr; + + /* TTY layer expects 0 on success */ + return 0; err_free_chan: sl->tty = NULL;