From 8f4d19aacb64f2b3d65c8cf7974c3d153224b5f2 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 30 May 2018 10:29:31 +0800 Subject: [PATCH 01/81] netfilter: xt_CT: Reject the non-null terminated string from user space The helper and timeout strings are from user-space, we need to make sure they are null terminated. If not, evil user could make kernel read the unexpected memory, even print it when fail to find by the following codes. pr_info_ratelimited("No such helper \"%s\"\n", helper_name); Signed-off-by: Gao Feng Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_CT.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 8790190c6feb..03b9a50ec93b 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -245,12 +245,22 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, } if (info->helper[0]) { + if (strnlen(info->helper, sizeof(info->helper)) == sizeof(info->helper)) { + ret = -ENAMETOOLONG; + goto err3; + } + ret = xt_ct_set_helper(ct, info->helper, par); if (ret < 0) goto err3; } if (info->timeout[0]) { + if (strnlen(info->timeout, sizeof(info->timeout)) == sizeof(info->timeout)) { + ret = -ENAMETOOLONG; + goto err4; + } + ret = xt_ct_set_timeout(ct, par, info->timeout); if (ret < 0) goto err4; From 9c7f96fd77b0dbe1fe7ed1f9c462c45dc48a1076 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 31 May 2018 19:53:33 +0300 Subject: [PATCH 02/81] netfilter: nf_tables: check msg_type before nft_trans_set(trans) The patch moves the "trans->msg_type == NFT_MSG_NEWSET" check before using nft_trans_set(trans). Otherwise we can get out of bounds read. For example, KASAN reported the one when running 0001_cache_handling_0 nft test. In this case "trans->msg_type" was NFT_MSG_NEWTABLE: [75517.177808] BUG: KASAN: slab-out-of-bounds in nft_set_lookup_global+0x22f/0x270 [nf_tables] [75517.279094] Read of size 8 at addr ffff881bdb643fc8 by task nft/7356 ... [75517.375605] CPU: 26 PID: 7356 Comm: nft Tainted: G E 4.17.0-rc7.1.x86_64 #1 [75517.489587] Hardware name: Oracle Corporation SUN SERVER X4-2 [75517.618129] Call Trace: [75517.648821] dump_stack+0xd1/0x13b [75517.691040] ? show_regs_print_info+0x5/0x5 [75517.742519] ? kmsg_dump_rewind_nolock+0xf5/0xf5 [75517.799300] ? lock_acquire+0x143/0x310 [75517.846738] print_address_description+0x85/0x3a0 [75517.904547] kasan_report+0x18d/0x4b0 [75517.949892] ? nft_set_lookup_global+0x22f/0x270 [nf_tables] [75518.019153] ? nft_set_lookup_global+0x22f/0x270 [nf_tables] [75518.088420] ? nft_set_lookup_global+0x22f/0x270 [nf_tables] [75518.157689] nft_set_lookup_global+0x22f/0x270 [nf_tables] [75518.224869] nf_tables_newsetelem+0x1a5/0x5d0 [nf_tables] [75518.291024] ? nft_add_set_elem+0x2280/0x2280 [nf_tables] [75518.357154] ? nla_parse+0x1a5/0x300 [75518.401455] ? kasan_kmalloc+0xa6/0xd0 [75518.447842] nfnetlink_rcv+0xc43/0x1bdf [nfnetlink] [75518.507743] ? nfnetlink_rcv+0x7a5/0x1bdf [nfnetlink] [75518.569745] ? nfnl_err_reset+0x3c0/0x3c0 [nfnetlink] [75518.631711] ? lock_acquire+0x143/0x310 [75518.679133] ? netlink_deliver_tap+0x9b/0x1070 [75518.733840] ? kasan_unpoison_shadow+0x31/0x40 [75518.788542] netlink_unicast+0x45d/0x680 [75518.837111] ? __isolate_free_page+0x890/0x890 [75518.891913] ? netlink_attachskb+0x6b0/0x6b0 [75518.944542] netlink_sendmsg+0x6fa/0xd30 [75518.993107] ? netlink_unicast+0x680/0x680 [75519.043758] ? netlink_unicast+0x680/0x680 [75519.094402] sock_sendmsg+0xd9/0x160 [75519.138810] ___sys_sendmsg+0x64d/0x980 [75519.186234] ? copy_msghdr_from_user+0x350/0x350 [75519.243118] ? lock_downgrade+0x650/0x650 [75519.292738] ? do_raw_spin_unlock+0x5d/0x250 [75519.345456] ? _raw_spin_unlock+0x24/0x30 [75519.395065] ? __handle_mm_fault+0xbde/0x3410 [75519.448830] ? sock_setsockopt+0x3d2/0x1940 [75519.500516] ? __lock_acquire.isra.25+0xdc/0x19d0 [75519.558448] ? lock_downgrade+0x650/0x650 [75519.608057] ? __audit_syscall_entry+0x317/0x720 [75519.664960] ? __fget_light+0x58/0x250 [75519.711325] ? __sys_sendmsg+0xde/0x170 [75519.758850] __sys_sendmsg+0xde/0x170 [75519.804193] ? __ia32_sys_shutdown+0x90/0x90 [75519.856725] ? syscall_trace_enter+0x897/0x10e0 [75519.912354] ? trace_event_raw_event_sys_enter+0x920/0x920 [75519.979432] ? __audit_syscall_entry+0x720/0x720 [75520.036118] do_syscall_64+0xa3/0x3d0 [75520.081248] ? prepare_exit_to_usermode+0x47/0x1d0 [75520.139904] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [75520.201680] RIP: 0033:0x7fc153320ba0 [75520.245772] RSP: 002b:00007ffe294c3638 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [75520.337708] RAX: ffffffffffffffda RBX: 00007ffe294c4820 RCX: 00007fc153320ba0 [75520.424547] RDX: 0000000000000000 RSI: 00007ffe294c46b0 RDI: 0000000000000003 [75520.511386] RBP: 00007ffe294c47b0 R08: 0000000000000004 R09: 0000000002114090 [75520.598225] R10: 00007ffe294c30a0 R11: 0000000000000246 R12: 00007ffe294c3660 [75520.684961] R13: 0000000000000001 R14: 00007ffe294c3650 R15: 0000000000000001 [75520.790946] Allocated by task 7356: [75520.833994] kasan_kmalloc+0xa6/0xd0 [75520.878088] __kmalloc+0x189/0x450 [75520.920107] nft_trans_alloc_gfp+0x20/0x190 [nf_tables] [75520.983961] nf_tables_newtable+0xcd0/0x1bd0 [nf_tables] [75521.048857] nfnetlink_rcv+0xc43/0x1bdf [nfnetlink] [75521.108655] netlink_unicast+0x45d/0x680 [75521.157013] netlink_sendmsg+0x6fa/0xd30 [75521.205271] sock_sendmsg+0xd9/0x160 [75521.249365] ___sys_sendmsg+0x64d/0x980 [75521.296686] __sys_sendmsg+0xde/0x170 [75521.341822] do_syscall_64+0xa3/0x3d0 [75521.386957] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [75521.467867] Freed by task 23454: [75521.507804] __kasan_slab_free+0x132/0x180 [75521.558137] kfree+0x14d/0x4d0 [75521.596005] free_rt_sched_group+0x153/0x280 [75521.648410] sched_autogroup_create_attach+0x19a/0x520 [75521.711330] ksys_setsid+0x2ba/0x400 [75521.755529] __ia32_sys_setsid+0xa/0x10 [75521.802850] do_syscall_64+0xa3/0x3d0 [75521.848090] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [75521.929000] The buggy address belongs to the object at ffff881bdb643f80 which belongs to the cache kmalloc-96 of size 96 [75522.079797] The buggy address is located 72 bytes inside of 96-byte region [ffff881bdb643f80, ffff881bdb643fe0) [75522.221234] The buggy address belongs to the page: [75522.280100] page:ffffea006f6d90c0 count:1 mapcount:0 mapping:0000000000000000 index:0x0 [75522.377443] flags: 0x2fffff80000100(slab) [75522.426956] raw: 002fffff80000100 0000000000000000 0000000000000000 0000000180200020 [75522.521275] raw: ffffea006e6fafc0 0000000c0000000c ffff881bf180f400 0000000000000000 [75522.615601] page dumped because: kasan: bad access detected Fixes: 37a9cc525525 ("netfilter: nf_tables: add generation mask to sets") Signed-off-by: Alexey Kodanev Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 501e48a7965b..8d8dfe417014 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2728,12 +2728,13 @@ static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, u32 id = ntohl(nla_get_be32(nla)); list_for_each_entry(trans, &net->nft.commit_list, list) { - struct nft_set *set = nft_trans_set(trans); + if (trans->msg_type == NFT_MSG_NEWSET) { + struct nft_set *set = nft_trans_set(trans); - if (trans->msg_type == NFT_MSG_NEWSET && - id == nft_trans_set_id(trans) && - nft_active_genmask(set, genmask)) - return set; + if (id == nft_trans_set_id(trans) && + nft_active_genmask(set, genmask)) + return set; + } } return ERR_PTR(-ENOENT); } From 31875d4970baa02e08b719fdfea6f43e9e2f7e77 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 24 May 2018 23:40:12 +0300 Subject: [PATCH 03/81] ipvs: register conntrack hooks for ftp ip_vs_ftp requires conntrack modules for mangling of FTP command responses in passive mode. Make sure the conntrack hooks are registered when real servers use NAT method in FTP virtual service. The hooks will be registered while the service is present. Fixes: 0c66dc1ea3f0 ("netfilter: conntrack: register hooks in netns when needed by ruleset") Signed-off-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 30 ++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 4 ++++ 2 files changed, 34 insertions(+) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..ae72d9057eda 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -643,6 +643,7 @@ struct ip_vs_service { /* alternate persistence engine */ struct ip_vs_pe __rcu *pe; + int conntrack_afmask; struct rcu_head rcu_head; }; @@ -1620,6 +1621,35 @@ static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp, return false; } +static inline int ip_vs_register_conntrack(struct ip_vs_service *svc) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + int afmask = (svc->af == AF_INET6) ? 2 : 1; + int ret = 0; + + if (!(svc->conntrack_afmask & afmask)) { + ret = nf_ct_netns_get(svc->ipvs->net, svc->af); + if (ret >= 0) + svc->conntrack_afmask |= afmask; + } + return ret; +#else + return 0; +#endif +} + +static inline void ip_vs_unregister_conntrack(struct ip_vs_service *svc) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + int afmask = (svc->af == AF_INET6) ? 2 : 1; + + if (svc->conntrack_afmask & afmask) { + nf_ct_netns_put(svc->ipvs->net, svc->af); + svc->conntrack_afmask &= ~afmask; + } +#endif +} + static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 3ecca0616d8c..ee0ab278f1f1 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -835,6 +835,9 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, * For now only for NAT! */ ip_vs_rs_hash(ipvs, dest); + /* FTP-NAT requires conntrack for mangling */ + if (svc->port == FTPPORT) + ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); @@ -1458,6 +1461,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) */ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { + ip_vs_unregister_conntrack(svc); /* Hold svc to avoid double release from dest_trash */ atomic_inc(&svc->refcnt); /* From 0cafa3926f0d8d72a2a814843f4db8cfef66d4ce Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 1 Jun 2018 19:12:28 +0900 Subject: [PATCH 04/81] netfilter: nft_reject_bridge: fix skb allocation size in nft_reject_br_send_v6_unreach In order to allocate icmpv6 skb, sizeof(struct ipv6hdr) should be used. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_reject_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index eaf05de37f75..6de981270566 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -261,7 +261,7 @@ static void nft_reject_br_send_v6_unreach(struct net *net, if (!reject6_br_csum_ok(oldskb, hook)) return; - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmp6hdr) + + nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) + LL_MAX_HEADER + len, GFP_ATOMIC); if (!nskb) return; From 6fcc02e3c2bddeaf628fde3c6a5ab3216d45691a Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 2 Jun 2018 21:52:15 +0300 Subject: [PATCH 05/81] ipvs: fix check on xmit to non-local addresses There is mistake in the rt_mode_allow_non_local assignment. It should be used to check if sending to non-local addresses is allowed, now it checks if local addresses are allowed. As local addresses are allowed for most of the cases, the only places that are affected are for traffic to transparent cache servers: - bypass connections when cache server is not available - related ICMP in FORWARD hook when sent to cache server Fixes: 4a4739d56b00 ("ipvs: Pull out crosses_local_route_boundary logic") Signed-off-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 4527921b1c3a..8f7fff774283 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -168,7 +168,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, bool new_rt_is_local) { bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); - bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); + bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL); bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); bool source_is_loopback; bool old_rt_is_local; From 9e8c8dabb78e886ace989729e763d28c76f5169e Mon Sep 17 00:00:00 2001 From: Alin Nastac Date: Wed, 30 May 2018 15:19:36 +0200 Subject: [PATCH 06/81] netfilter: ebtables: fix compat entry padding On arm64, ebt_entry_{match,watcher,target} structs are 40 bytes long while on 32-bit arm these structs have a size of 36 bytes. COMPAT_XT_ALIGN() macro cannot be used here to determine the necessary padding for the CONFIG_COMPAT because it imposes an 8-byte boundary alignment, condition that is not found in 32-bit ebtables application. Signed-off-by: Alin Nastac Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 6ba639f6c51d..5f459c8b7937 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1610,16 +1610,16 @@ struct compat_ebt_entry_mwt { compat_uptr_t ptr; } u; compat_uint_t match_size; - compat_uint_t data[0]; + compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace)))); }; /* account for possible padding between match_size and ->data */ static int ebt_compat_entry_padsize(void) { - BUILD_BUG_ON(XT_ALIGN(sizeof(struct ebt_entry_match)) < - COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt))); - return (int) XT_ALIGN(sizeof(struct ebt_entry_match)) - - COMPAT_XT_ALIGN(sizeof(struct compat_ebt_entry_mwt)); + BUILD_BUG_ON(sizeof(struct ebt_entry_match) < + sizeof(struct compat_ebt_entry_mwt)); + return (int) sizeof(struct ebt_entry_match) - + sizeof(struct compat_ebt_entry_mwt); } static int ebt_compat_match_offset(const struct xt_match *match, From 9dcceb1378b6d66633f613805b2d5a22af4d5383 Mon Sep 17 00:00:00 2001 From: Serhey Popovych Date: Tue, 5 Jun 2018 11:46:13 +0200 Subject: [PATCH 07/81] netfilter: xt_set: Check hook mask correctly Inserting rule before one with SET target we get error with warning in dmesg(1) output: # iptables -A FORWARD -t mangle -j SET --map-set test src --map-prio # iptables -I FORWARD 1 -t mangle -j ACCEPT iptables: Invalid argument. Run `dmesg' for more information. # dmesg |tail -n1 [268578.026643] mapping of prio or/and queue is allowed only from \ OUTPUT/FORWARD/POSTROUTING chains Rather than checking for supported hook bits for SET target check for unsupported one as done in all rest of matches and targets. Signed-off-by: Serhey Popovych Signed-off-by: Jozsef Kadlecsik --- net/netfilter/xt_set.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 6f4c5217d835..07af7dbf7a30 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -470,7 +470,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) } if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && - !(par->hook_mask & (1 << NF_INET_FORWARD | + (par->hook_mask & ~(1 << NF_INET_FORWARD | 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); From bd975e691486ba52790ba23cc9b4fecab7bc0d31 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Thu, 31 May 2018 18:45:21 +0200 Subject: [PATCH 08/81] netfilter: ipset: List timing out entries with "timeout 1" instead of zero When listing sets with timeout support, there's a probability that just timing out entries with "0" timeout value is listed/saved. However when restoring the saved list, the zero timeout value means permanent elelements. The new behaviour is that timing out entries are listed with "timeout 1" instead of zero. Fixes netfilter bugzilla #1258. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set_timeout.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h index bfb3531fd88a..7ad8ddf9ca8a 100644 --- a/include/linux/netfilter/ipset/ip_set_timeout.h +++ b/include/linux/netfilter/ipset/ip_set_timeout.h @@ -65,8 +65,14 @@ ip_set_timeout_set(unsigned long *timeout, u32 value) static inline u32 ip_set_timeout_get(const unsigned long *timeout) { - return *timeout == IPSET_ELEM_PERMANENT ? 0 : - jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC; + u32 t; + + if (*timeout == IPSET_ELEM_PERMANENT) + return 0; + + t = jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC; + /* Zero value in userspace means no timeout */ + return t == 0 ? 1 : t; } #endif /* __KERNEL__ */ From 30a2e107108c66cbcb7776b58cbcd7db223a1cc9 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 5 Jun 2018 11:53:35 +0200 Subject: [PATCH 09/81] netfilter: ipset: Limit max timeout value Due to the negative value condition in msecs_to_jiffies(), the real max possible timeout value must be set to (UINT_MAX >> 1)/MSEC_PER_SEC. Neutron Soutmun proposed the proper fix, but an insufficient one was applied, see https://patchwork.ozlabs.org/patch/400405/. Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set_timeout.h | 10 ++++++---- net/netfilter/xt_set.c | 8 ++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h index 7ad8ddf9ca8a..8ce271e187b6 100644 --- a/include/linux/netfilter/ipset/ip_set_timeout.h +++ b/include/linux/netfilter/ipset/ip_set_timeout.h @@ -23,6 +23,9 @@ /* Set is defined with timeout support: timeout value may be 0 */ #define IPSET_NO_TIMEOUT UINT_MAX +/* Max timeout value, see msecs_to_jiffies() in jiffies.h */ +#define IPSET_MAX_TIMEOUT (UINT_MAX >> 1)/MSEC_PER_SEC + #define ip_set_adt_opt_timeout(opt, set) \ ((opt)->ext.timeout != IPSET_NO_TIMEOUT ? (opt)->ext.timeout : (set)->timeout) @@ -32,11 +35,10 @@ ip_set_timeout_uget(struct nlattr *tb) unsigned int timeout = ip_set_get_h32(tb); /* Normalize to fit into jiffies */ - if (timeout > UINT_MAX/MSEC_PER_SEC) - timeout = UINT_MAX/MSEC_PER_SEC; + if (timeout > IPSET_MAX_TIMEOUT) + timeout = IPSET_MAX_TIMEOUT; - /* Userspace supplied TIMEOUT parameter: adjust crazy size */ - return timeout == IPSET_NO_TIMEOUT ? IPSET_NO_TIMEOUT - 1 : timeout; + return timeout; } static inline bool diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 07af7dbf7a30..bf2890b13212 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -372,8 +372,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; + add_opt.ext.timeout > IPSET_MAX_TIMEOUT) + add_opt.ext.timeout = IPSET_MAX_TIMEOUT; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) @@ -407,8 +407,8 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) /* Normalize to fit into jiffies */ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && - add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC) - add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC; + add_opt.ext.timeout > IPSET_MAX_TIMEOUT) + add_opt.ext.timeout = IPSET_MAX_TIMEOUT; if (info->add_set.index != IPSET_INVALID_ID) ip_set_add(info->add_set.index, skb, par, &add_opt); if (info->del_set.index != IPSET_INVALID_ID) From cbdebe481a14b42c45aa9f4ceb5ff19b55de2c57 Mon Sep 17 00:00:00 2001 From: Florent Fourcot Date: Mon, 4 Jun 2018 16:51:19 +0200 Subject: [PATCH 10/81] netfilter: ipset: forbid family for hash:mac sets Userspace `ipset` command forbids family option for hash:mac type: ipset create test hash:mac family inet4 ipset v6.30: Unknown argument: `family' However, this check is not done in kernel itself. When someone use external netlink applications (pyroute2 python library for example), one can create hash:mac with invalid family and inconsistant results from userspace (`ipset` command cannot read set content anymore). This patch enforce the logic in kernel, and forbids insertion of hash:mac with a family set. Since IP_SET_PROTO_UNDEF is defined only for hash:mac, this patch has no impact on other hash:* sets Signed-off-by: Florent Fourcot Signed-off-by: Victorien Molle Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index bbad940c0137..8a33dac4e805 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1234,7 +1234,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, pr_debug("Create set %s with family %s\n", set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); -#ifndef IP_SET_PROTO_UNDEF +#ifdef IP_SET_PROTO_UNDEF + if (set->family != NFPROTO_UNSPEC) + return -IPSET_ERR_INVALID_FAMILY; +#else if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) return -IPSET_ERR_INVALID_FAMILY; #endif From 11ff7288beb2b7da889a014aff0a7b80bf8efcf3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Jun 2018 12:14:56 +0200 Subject: [PATCH 11/81] netfilter: ebtables: reject non-bridge targets the ebtables evaluation loop expects targets to return positive values (jumps), or negative values (absolute verdicts). This is completely different from what xtables does. In xtables, targets are expected to return the standard netfilter verdicts, i.e. NF_DROP, NF_ACCEPT, etc. ebtables will consider these as jumps. Therefore reject any target found due to unspec fallback. v2: also reject watchers. ebtables ignores their return value, so a target that assumes skb ownership (and returns NF_STOLEN) causes use-after-free. The only watchers in the 'ebtables' front-end are log and nflog; both have AF_BRIDGE specific wrappers on kernel side. Reported-by: syzbot+2b43f681169a2a0d306a@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5f459c8b7937..08a65e4a77d0 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -396,6 +396,12 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par, watcher = xt_request_find_target(NFPROTO_BRIDGE, w->u.name, 0); if (IS_ERR(watcher)) return PTR_ERR(watcher); + + if (watcher->family != NFPROTO_BRIDGE) { + module_put(watcher->me); + return -ENOENT; + } + w->u.watcher = watcher; par->target = watcher; @@ -715,6 +721,13 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, goto cleanup_watchers; } + /* Reject UNSPEC, xtables verdicts/return values are incompatible */ + if (target->family != NFPROTO_BRIDGE) { + module_put(target->me); + ret = -ENOENT; + goto cleanup_watchers; + } + t->u.target = target; if (t->u.target == &ebt_standard_target) { if (gap < sizeof(struct ebt_standard_target)) { From 82e20b44477ffe90a5866caa209ecc9df818c6a1 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 7 Jun 2018 02:05:12 +0900 Subject: [PATCH 12/81] netfilter: nft_set_rbtree: fix parameter of __nft_rbtree_lookup() The parameter this doesn't have a flags value. so that it can't be used by nft_rbtree_interval_end(). test commands: %nft add table ip filter %nft add set ip filter s { type ipv4_addr \; flags interval \; } %nft add element ip filter s {0-1} %nft add element ip filter s {2-10} %nft add chain ip filter input { type filter hook input priority 0\; } %nft add rule ip filter input ip saddr @s Splat looks like: [ 246.752502] BUG: KASAN: slab-out-of-bounds in __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree] [ 246.752502] Read of size 1 at addr ffff88010d9efa47 by task http/1092 [ 246.752502] CPU: 1 PID: 1092 Comm: http Not tainted 4.17.0-rc6+ #185 [ 246.752502] Call Trace: [ 246.752502] [ 246.752502] dump_stack+0x74/0xbb [ 246.752502] ? __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree] [ 246.752502] print_address_description+0xc7/0x290 [ 246.752502] ? __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree] [ 246.752502] kasan_report+0x22c/0x350 [ 246.752502] __nft_rbtree_lookup+0x677/0x6a0 [nft_set_rbtree] [ 246.752502] nft_rbtree_lookup+0xc9/0x2d2 [nft_set_rbtree] [ 246.752502] ? sched_clock_cpu+0x144/0x180 [ 246.752502] nft_lookup_eval+0x149/0x3a0 [nf_tables] [ 246.752502] ? __lock_acquire+0xcea/0x4ed0 [ 246.752502] ? nft_lookup_init+0x6b0/0x6b0 [nf_tables] [ 246.752502] nft_do_chain+0x263/0xf50 [nf_tables] [ 246.752502] ? __nft_trace_packet+0x1a0/0x1a0 [nf_tables] [ 246.752502] ? sched_clock_cpu+0x144/0x180 [ ... ] Fixes: f9121355eb6f ("netfilter: nft_set_rbtree: incorrect assumption on lower interval lookups") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e6f08bc5f359..26fa93b23805 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -65,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(parent->rb_left); if (interval && nft_rbtree_equal(set, this, interval) && - nft_rbtree_interval_end(this) && + nft_rbtree_interval_end(rbe) && !nft_rbtree_interval_end(interval)) continue; interval = rbe; From 64e6dd1fb2f1ed799f317dc34aa6e251c64f4981 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 7 Jun 2018 18:15:14 +0800 Subject: [PATCH 13/81] netfilter: nf_conntrack: Increase __IPS_MAX_BIT with new bit IPS_OFFLOAD_BIT The __IPS_MAX_BIT is used in __ctnetlink_change_status as the max bit value. When add new bit IPS_OFFLOAD_BIT whose value is 14, we should increase the __IPS_MAX_BIT too, from 14 to 15. There is no any bug in current codes, although it lost one loop in __ctnetlink_change_status. Because the new bit IPS_OFFLOAD_BIT belongs the IPS_UNCHANGEABLE_MASK. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index c712eb6879f1..336014bf8868 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -112,7 +112,7 @@ enum ip_conntrack_status { IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD), - __IPS_MAX_BIT = 14, + __IPS_MAX_BIT = 15, }; /* Connection tracking event types */ From c568503ef02030f169c9e19204def610a3510918 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 Jun 2018 21:34:43 +0200 Subject: [PATCH 14/81] netfilter: x_tables: initialise match/target check parameter struct syzbot reports following splat: BUG: KMSAN: uninit-value in ebt_stp_mt_check+0x24b/0x450 net/bridge/netfilter/ebt_stp.c:162 ebt_stp_mt_check+0x24b/0x450 net/bridge/netfilter/ebt_stp.c:162 xt_check_match+0x1438/0x1650 net/netfilter/x_tables.c:506 ebt_check_match net/bridge/netfilter/ebtables.c:372 [inline] ebt_check_entry net/bridge/netfilter/ebtables.c:702 [inline] The uninitialised access is xt_mtchk_param->nft_compat ... which should be set to 0. Fix it by zeroing the struct beforehand, same for tgchk. ip(6)tables targetinfo uses c99-style initialiser, so no change needed there. Reported-by: syzbot+da4494182233c23a5fcf@syzkaller.appspotmail.com Fixes: 55917a21d0cc0 ("netfilter: x_tables: add context to know if extension runs from nft_compat") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 2 ++ net/ipv4/netfilter/ip_tables.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + 3 files changed, 4 insertions(+) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 08a65e4a77d0..ead123dab05e 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -700,6 +700,8 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, } i = 0; + memset(&mtpar, 0, sizeof(mtpar)); + memset(&tgpar, 0, sizeof(tgpar)); mtpar.net = tgpar.net = net; mtpar.table = tgpar.table = name; mtpar.entryinfo = tgpar.entryinfo = e; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e85f35b89c49..f6130704f052 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -531,6 +531,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, return -ENOMEM; j = 0; + memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 97f79dc943d7..685c2168f524 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -551,6 +551,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, return -ENOMEM; j = 0; + memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; From d8e87fc6d11c31525430a388317b52f4a98a5328 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 7 Jun 2018 19:38:09 +0000 Subject: [PATCH 15/81] netfilter: remove include/net/netfilter/nft_dup.h include/net/netfilter/nft_dup.h was introduced in d877f07112f1 ("netfilter: nf_tables: add nft_dup expression") but was never user since this date. Furthermore, the only struct in this file is unused elsewhere. Signed-off-by: Corentin Labbe Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_dup.h | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 include/net/netfilter/nft_dup.h diff --git a/include/net/netfilter/nft_dup.h b/include/net/netfilter/nft_dup.h deleted file mode 100644 index 4d9d512984b2..000000000000 --- a/include/net/netfilter/nft_dup.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NFT_DUP_H_ -#define _NFT_DUP_H_ - -struct nft_dup_inet { - enum nft_registers sreg_addr:8; - enum nft_registers sreg_dev:8; -}; - -#endif /* _NFT_DUP_H_ */ From b16558579576c8f2781062b638600f68954b1827 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Jun 2018 18:10:34 +0200 Subject: [PATCH 16/81] bpf: implement dummy fops for bpf objects syzkaller was able to trigger the following warning in do_dentry_open(): WARNING: CPU: 1 PID: 4508 at fs/open.c:778 do_dentry_open+0x4ad/0xe40 fs/open.c:778 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 4508 Comm: syz-executor867 Not tainted 4.17.0+ #90 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: [...] vfs_open+0x139/0x230 fs/open.c:908 do_last fs/namei.c:3370 [inline] path_openat+0x1717/0x4dc0 fs/namei.c:3511 do_filp_open+0x249/0x350 fs/namei.c:3545 do_sys_open+0x56f/0x740 fs/open.c:1101 __do_sys_openat fs/open.c:1128 [inline] __se_sys_openat fs/open.c:1122 [inline] __x64_sys_openat+0x9d/0x100 fs/open.c:1122 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe Problem was that prog and map inodes in bpf fs did not implement a dummy file open operation that would return an error. The patch in do_dentry_open() checks whether f_ops are present and if not bails out with an error. While this may be fine, we really shouldn't be throwing a warning though. Thus follow the model similar to bad_file_ops and reject the request unconditionally with -EIO. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Reported-by: syzbot+2e7fcab0f56fdbb330b8@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index ed13645bd80c..76efe9a183f5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -295,6 +295,15 @@ static const struct file_operations bpffs_map_fops = { .release = bpffs_map_release, }; +static int bpffs_obj_open(struct inode *inode, struct file *file) +{ + return -EIO; +} + +static const struct file_operations bpffs_obj_fops = { + .open = bpffs_obj_open, +}; + static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, const struct inode_operations *iops, const struct file_operations *fops) @@ -314,7 +323,8 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL); + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, + &bpffs_obj_fops); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) @@ -322,7 +332,7 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : NULL); + map->btf ? &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * From 1c9ca7e9836a4df1518568ea47461c5ef7c2cf8b Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 8 Jun 2018 08:51:27 +0200 Subject: [PATCH 17/81] selftests: bpf: fix urandom_read build issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc complains that urandom_read gets built twice. gcc -o tools/testing/selftests/bpf/urandom_read -static urandom_read.c -Wl,--build-id gcc -Wall -O2 -I../../../include/uapi -I../../../lib -I../../../lib/bpf -I../../../../include/generated -I../../../include urandom_read.c urandom_read -lcap -lelf -lrt -lpthread -o tools/testing/selftests/bpf/urandom_read gcc: fatal error: input file ‘tools/testing/selftests/bpf/urandom_read’ is the same as output file compilation terminated. ../lib.mk:110: recipe for target 'tools/testing/selftests/bpf/urandom_read' failed To fix this issue remove the urandom_read target and so target TEST_CUSTOM_PROGS gets used. Fixes: 81f77fd0deeb ("bpf: add selftest for stackmap with BPF_F_STACK_BUILD_ID") Signed-off-by: Anders Roxell Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 607ed8729c06..7a6214e9ae58 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -16,9 +16,7 @@ LDLIBS += -lcap -lelf -lrt -lpthread TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read all: $(TEST_CUSTOM_PROGS) -$(TEST_CUSTOM_PROGS): urandom_read - -urandom_read: urandom_read.c +$(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c $(CC) -o $(TEST_CUSTOM_PROGS) -static $< -Wl,--build-id # Order correspond to 'make run_tests' order From 646bb57ce86e4d7b0bd9d33244450ae009411e48 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 Jun 2018 11:07:24 -0400 Subject: [PATCH 18/81] ixgbe: Fix setting of TC configuration for macvlan case When we were enabling macvlan interfaces we weren't correctly configuring things until ixgbe_setup_tc was called a second time either by tweaking the number of queues or increasing the macvlan count past 15. The issue came down to the fact that num_rx_pools is not populated until after the queues and interrupts are reinitialized. Instead of trying to set it sooner we can just move the call to setup at least 1 traffic class to the SR-IOV/VMDq setup function so that we just set it for this one case. We already had a spot that was configuring the queues for TC 0 in the code here anyway so it makes sense to also set the number of TCs here as well. Fixes: 49cfbeb7a95c ("ixgbe: Fix handling of macvlan Tx offload") Signed-off-by: Alexander Duyck Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 8 ++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c index 893a9206e718..d361f570ca37 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c @@ -593,6 +593,14 @@ static bool ixgbe_set_sriov_queues(struct ixgbe_adapter *adapter) } #endif + /* To support macvlan offload we have to use num_tc to + * restrict the queues that can be used by the device. + * By doing this we can avoid reporting a false number of + * queues. + */ + if (vmdq_i > 1) + netdev_set_num_tc(adapter->netdev, 1); + /* populate TC0 for use by pool 0 */ netdev_set_tc_queue(adapter->netdev, 0, adapter->num_rx_queues_per_pool, 0); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4929f7265598..f9e0dc041cfb 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8822,14 +8822,6 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc) } else { netdev_reset_tc(dev); - /* To support macvlan offload we have to use num_tc to - * restrict the queues that can be used by the device. - * By doing this we can avoid reporting a false number of - * queues. - */ - if (!tc && adapter->num_rx_pools > 1) - netdev_set_num_tc(dev, 1); - if (adapter->hw.mac.type == ixgbe_mac_82598EB) adapter->hw.fc.requested_mode = adapter->last_lfc_mode; From e433f3a5e272625c166d780f79ecc8fe456a5fc9 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 Jun 2018 16:51:20 -0400 Subject: [PATCH 19/81] ixgbe: Use CONFIG_XFRM_OFFLOAD instead of CONFIG_XFRM There is no point in adding code if CONFIG_XFRM is defined that we won't use unless CONFIG_XFRM_OFFLOAD is defined. So instead of leaving this code floating around I am replacing the ifdef with what I believe is the correct one so that we only include the code and variables if they will actually be used. Signed-off-by: Alexander Duyck Acked-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 4 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index fc534e91c6b2..144d5fe6b944 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -760,9 +760,9 @@ struct ixgbe_adapter { #define IXGBE_RSS_KEY_SIZE 40 /* size of RSS Hash Key in bytes */ u32 *rss_key; -#ifdef CONFIG_XFRM +#ifdef CONFIG_XFRM_OFFLOAD struct ixgbe_ipsec *ipsec; -#endif /* CONFIG_XFRM */ +#endif /* CONFIG_XFRM_OFFLOAD */ }; static inline u8 ixgbe_max_rss_indices(struct ixgbe_adapter *adapter) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f9e0dc041cfb..a925f05ec342 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9896,7 +9896,7 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev, * the TSO, so it's the exception. */ if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID)) { -#ifdef CONFIG_XFRM +#ifdef CONFIG_XFRM_OFFLOAD if (!skb->sp) #endif features &= ~NETIF_F_TSO; From de7a7e34e27c029fbb3c4e764db045548629b834 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 4 Jun 2018 16:51:25 -0400 Subject: [PATCH 20/81] ixgbe: Move ipsec init function to before reset call This patch moves the IPsec init function in ixgbe_sw_init. This way it is a bit more consistent with the placement of similar initialization functions and is placed before the reset_hw call which should allow us to clean up any link issues that may be introduced by the fact that we force the link up if somehow the device had IPsec still enabled before the driver was loaded. In addition to the function move it is necessary to change the assignment of netdev->features. The easiest way to do this is to just test for the existence of adapter->ipsec and if it is present we set the feature bits. Fixes: 49a94d74d948 ("ixgbe: add ipsec engine start and stop routines") Reported-by: Andre Tomt Signed-off-by: Alexander Duyck Acked-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 7 ------- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 11 +++++++++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 344a1f213a5f..38d8cf75e9ad 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -1001,13 +1001,6 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter) adapter->netdev->xfrmdev_ops = &ixgbe_xfrmdev_ops; -#define IXGBE_ESP_FEATURES (NETIF_F_HW_ESP | \ - NETIF_F_HW_ESP_TX_CSUM | \ - NETIF_F_GSO_ESP) - - adapter->netdev->features |= IXGBE_ESP_FEATURES; - adapter->netdev->hw_enc_features |= IXGBE_ESP_FEATURES; - return; err2: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index a925f05ec342..8d061af276d3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6117,6 +6117,7 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, #ifdef CONFIG_IXGBE_DCB ixgbe_init_dcb(adapter); #endif + ixgbe_init_ipsec_offload(adapter); /* default flow control settings */ hw->fc.requested_mode = ixgbe_fc_full; @@ -10429,6 +10430,14 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (hw->mac.type >= ixgbe_mac_82599EB) netdev->features |= NETIF_F_SCTP_CRC; +#ifdef CONFIG_XFRM_OFFLOAD +#define IXGBE_ESP_FEATURES (NETIF_F_HW_ESP | \ + NETIF_F_HW_ESP_TX_CSUM | \ + NETIF_F_GSO_ESP) + + if (adapter->ipsec) + netdev->features |= IXGBE_ESP_FEATURES; +#endif /* copy netdev features into list of user selectable features */ netdev->hw_features |= netdev->features | NETIF_F_HW_VLAN_CTAG_FILTER | @@ -10491,8 +10500,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_FCOE_MTU; } #endif /* IXGBE_FCOE */ - ixgbe_init_ipsec_offload(adapter); - if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE) netdev->hw_features |= NETIF_F_LRO; if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) From e9f655ee97f14b4f5eba7b6b5a56a7c298573e67 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 5 Jun 2018 11:11:08 -0400 Subject: [PATCH 21/81] ixgbe: Avoid loopback and fix boolean logic in ipsec_stop_data This patch fixes two issues. First we add an early test for the Tx and Rx security block ready bits. By doing this we can avoid the need for waits or loopback in the event that the security block is already flushed out. Secondly we fix the boolean logic that was testing for the Tx OR Rx ready bits being set and change it so that we only exit if the Tx AND Rx ready bits are both set. Signed-off-by: Alexander Duyck Acked-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 38d8cf75e9ad..7b23fb0c2d07 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -158,7 +158,16 @@ static void ixgbe_ipsec_stop_data(struct ixgbe_adapter *adapter) reg |= IXGBE_SECRXCTRL_RX_DIS; IXGBE_WRITE_REG(hw, IXGBE_SECRXCTRL, reg); - IXGBE_WRITE_FLUSH(hw); + /* If both Tx and Rx are ready there are no packets + * that we need to flush so the loopback configuration + * below is not necessary. + */ + t_rdy = IXGBE_READ_REG(hw, IXGBE_SECTXSTAT) & + IXGBE_SECTXSTAT_SECTX_RDY; + r_rdy = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT) & + IXGBE_SECRXSTAT_SECRX_RDY; + if (t_rdy && r_rdy) + return; /* If the tx fifo doesn't have link, but still has data, * we can't clear the tx sec block. Set the MAC loopback @@ -185,7 +194,7 @@ static void ixgbe_ipsec_stop_data(struct ixgbe_adapter *adapter) IXGBE_SECTXSTAT_SECTX_RDY; r_rdy = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT) & IXGBE_SECRXSTAT_SECRX_RDY; - } while (!t_rdy && !r_rdy && limit--); + } while (!(t_rdy && r_rdy) && limit--); /* undo loopback if we played with it earlier */ if (!link) { From 421d954c4f1e9afd55bc65398bfc64ceba38df21 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 5 Jun 2018 11:11:14 -0400 Subject: [PATCH 22/81] ixgbe: Fix bit definitions and add support for testing for ipsec support This patch addresses two issues. First it adds the correct bit definitions for the SECTXSTAT and SECRXSTAT registers. Then it makes use of those definitions to test for if IPsec has been disabled on the part and if so we do not enable it. Signed-off-by: Alexander Duyck Reported-by: Andre Tomt Acked-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 14 +++++++++++++- drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 6 ++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 7b23fb0c2d07..c116f459945d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -975,10 +975,22 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring, **/ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter) { + struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_ipsec *ipsec; + u32 t_dis, r_dis; size_t size; - if (adapter->hw.mac.type == ixgbe_mac_82598EB) + if (hw->mac.type == ixgbe_mac_82598EB) + return; + + /* If there is no support for either Tx or Rx offload + * we should not be advertising support for IPsec. + */ + t_dis = IXGBE_READ_REG(hw, IXGBE_SECTXSTAT) & + IXGBE_SECTXSTAT_SECTX_OFF_DIS; + r_dis = IXGBE_READ_REG(hw, IXGBE_SECRXSTAT) & + IXGBE_SECRXSTAT_SECRX_OFF_DIS; + if (t_dis || r_dis) return; ipsec = kzalloc(sizeof(*ipsec), GFP_KERNEL); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h index e8ed37749ab1..44cfb2021145 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h @@ -599,13 +599,15 @@ struct ixgbe_nvm_version { #define IXGBE_SECTXCTRL_STORE_FORWARD 0x00000004 #define IXGBE_SECTXSTAT_SECTX_RDY 0x00000001 -#define IXGBE_SECTXSTAT_ECC_TXERR 0x00000002 +#define IXGBE_SECTXSTAT_SECTX_OFF_DIS 0x00000002 +#define IXGBE_SECTXSTAT_ECC_TXERR 0x00000004 #define IXGBE_SECRXCTRL_SECRX_DIS 0x00000001 #define IXGBE_SECRXCTRL_RX_DIS 0x00000002 #define IXGBE_SECRXSTAT_SECRX_RDY 0x00000001 -#define IXGBE_SECRXSTAT_ECC_RXERR 0x00000002 +#define IXGBE_SECRXSTAT_SECRX_OFF_DIS 0x00000002 +#define IXGBE_SECRXSTAT_ECC_RXERR 0x00000004 /* LinkSec (MacSec) Registers */ #define IXGBE_LSECTXCAP 0x08A00 From 0975764684487bf3f7a47eef009e750ea41bd514 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 11 Jun 2018 02:02:54 +0300 Subject: [PATCH 23/81] ipv6: allow PMTU exceptions to local routes IPVS setups with local client and remote tunnel server need to create exception for the local virtual IP. What we do is to change PMTU from 64KB (on "lo") to 1460 in the common case. Suggested-by: Martin KaFai Lau Fixes: 45e4fd26683c ("ipv6: Only create RTF_CACHE routes after encountering pmtu exception") Fixes: 7343ff31ebf0 ("ipv6: Don't create clones of host routes.") Signed-off-by: Julian Anastasov Acked-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fb956989adaf..86a0e4333d42 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2307,9 +2307,6 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = (struct rt6_info *)dst; - if (rt6->rt6i_flags & RTF_LOCAL) - return; - if (dst_metric_locked(dst, RTAX_MTU)) return; From 349b71d6f427ff8211adf50839dbbff3f27c1805 Mon Sep 17 00:00:00 2001 From: Zhouyang Jia Date: Mon, 11 Jun 2018 13:26:35 +0800 Subject: [PATCH 24/81] net: dsa: add error handling for pskb_trim_rcsum When pskb_trim_rcsum fails, the lack of error-handling code may cause unexpected results. This patch adds error-handling code after calling pskb_trim_rcsum. Signed-off-by: Zhouyang Jia Signed-off-by: David S. Miller --- net/dsa/tag_trailer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 7d20e1f3de28..56197f0d9608 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -75,7 +75,8 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; - pskb_trim_rcsum(skb, skb->len - 4); + if (pskb_trim_rcsum(skb, skb->len - 4)) + return NULL; return skb; } From a343993c518ce252b62ec00ac06bccfb1d17129d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 11 Jun 2018 13:57:12 +0200 Subject: [PATCH 25/81] xsk: silence warning on memory allocation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller reported a warning from xdp_umem_pin_pages(): WARNING: CPU: 1 PID: 4537 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996 ... __do_kmalloc mm/slab.c:3713 [inline] __kmalloc+0x25/0x760 mm/slab.c:3727 kmalloc_array include/linux/slab.h:634 [inline] kcalloc include/linux/slab.h:645 [inline] xdp_umem_pin_pages net/xdp/xdp_umem.c:205 [inline] xdp_umem_reg net/xdp/xdp_umem.c:318 [inline] xdp_umem_create+0x5c9/0x10f0 net/xdp/xdp_umem.c:349 xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531 __sys_setsockopt+0x1bd/0x390 net/socket.c:1935 __do_sys_setsockopt net/socket.c:1946 [inline] __se_sys_setsockopt net/socket.c:1943 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe This is a warning about attempting to allocate more than KMALLOC_MAX_SIZE memory. The request originates from userspace, and if the request is too big, the kernel is free to deny its allocation. In this patch, the failed allocation attempt is silenced with __GFP_NOWARN. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Reported-by: syzbot+4abadc5d69117b346506@syzkaller.appspotmail.com Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index b9ef487c4618..f47abb46c587 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -204,7 +204,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) long npgs; int err; - umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), + GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; From f6fadff33e8b09373eedf99822b89d9dd84545b8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Jun 2018 23:22:04 +0200 Subject: [PATCH 26/81] tls: fix NULL pointer dereference on poll While hacking on kTLS, I ran into the following panic from an unprivileged netserver / netperf TCP session: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 800000037f378067 P4D 800000037f378067 PUD 3c0e61067 PMD 0 Oops: 0010 [#1] SMP KASAN PTI CPU: 1 PID: 2289 Comm: netserver Not tainted 4.17.0+ #139 Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016 RIP: 0010: (null) Code: Bad RIP value. RSP: 0018:ffff88036abcf740 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88036f5f6800 RCX: 1ffff1006debed26 RDX: ffff88036abcf920 RSI: ffff8803cb1a4f00 RDI: ffff8803c258c280 RBP: ffff8803c258c280 R08: ffff8803c258c280 R09: ffffed006f559d48 R10: ffff88037aacea43 R11: ffffed006f559d49 R12: ffff8803c258c280 R13: ffff8803cb1a4f20 R14: 00000000000000db R15: ffffffffc168a350 FS: 00007f7e631f4700(0000) GS:ffff8803d1c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000003ccf64005 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? tls_sw_poll+0xa4/0x160 [tls] ? sock_poll+0x20a/0x680 ? do_select+0x77b/0x11a0 ? poll_schedule_timeout.constprop.12+0x130/0x130 ? pick_link+0xb00/0xb00 ? read_word_at_a_time+0x13/0x20 ? vfs_poll+0x270/0x270 ? deref_stack_reg+0xad/0xe0 ? __read_once_size_nocheck.constprop.6+0x10/0x10 [...] Debugging further, it turns out that calling into ctx->sk_poll() is invalid since sk_poll itself is NULL which was saved from the original TCP socket in order for tls_sw_poll() to invoke it. Looks like the recent conversion from poll to poll_mask callback started in 152524231023 ("net: add support for ->poll_mask in proto_ops") missed to eventually convert kTLS, too: TCP's ->poll was converted over to the ->poll_mask in commit 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask") and therefore kTLS wrongly saved the ->poll old one which is now NULL. Convert kTLS over to use ->poll_mask instead. Also instead of POLLIN | POLLRDNORM use the proper EPOLLIN | EPOLLRDNORM bits as the case in tcp_poll_mask() as well that is mangled here. Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask") Signed-off-by: Daniel Borkmann Cc: Christoph Hellwig Cc: Dave Watson Tested-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 6 ++---- net/tls/tls_main.c | 2 +- net/tls/tls_sw.c | 19 +++++++++---------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 70c273777fe9..7f84ea3e217c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -109,8 +109,7 @@ struct tls_sw_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); - unsigned int (*sk_poll)(struct file *file, struct socket *sock, - struct poll_table_struct *wait); + __poll_t (*sk_poll_mask)(struct socket *sock, __poll_t events); struct sk_buff *recv_pkt; u8 control; bool decrypted; @@ -225,8 +224,7 @@ void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); +__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 301f22430469..a127d61e8af9 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -712,7 +712,7 @@ static int __init tls_register(void) build_protos(tls_prots[TLSV4], &tcp_prot); tls_sw_proto_ops = inet_stream_ops; - tls_sw_proto_ops.poll = tls_sw_poll; + tls_sw_proto_ops.poll_mask = tls_sw_poll_mask; tls_sw_proto_ops.splice_read = tls_sw_splice_read; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8ca57d01b18f..34895b7c132d 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -915,23 +915,22 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, return copied ? : err; } -unsigned int tls_sw_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait) +__poll_t tls_sw_poll_mask(struct socket *sock, __poll_t events) { - unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + __poll_t mask; - /* Grab POLLOUT and POLLHUP from the underlying socket */ - ret = ctx->sk_poll(file, sock, wait); + /* Grab EPOLLOUT and EPOLLHUP from the underlying socket */ + mask = ctx->sk_poll_mask(sock, events); - /* Clear POLLIN bits, and set based on recv_pkt */ - ret &= ~(POLLIN | POLLRDNORM); + /* Clear EPOLLIN bits, and set based on recv_pkt */ + mask &= ~(EPOLLIN | EPOLLRDNORM); if (ctx->recv_pkt) - ret |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; - return ret; + return mask; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1188,7 +1187,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; + sw_ctx_rx->sk_poll_mask = sk->sk_socket->ops->poll_mask; strp_check_rcv(&sw_ctx_rx->strp); } From 3f2d67b6bd24c615cfe3a6d793613bb2838dd9b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 11 Jun 2018 07:12:12 -0700 Subject: [PATCH 27/81] net/ipv6: Ensure cfg is properly initialized in ipv6_create_tempaddr Valdis reported a BUG in ipv6_add_addr: [ 1820.832682] BUG: unable to handle kernel NULL pointer dereference at 0000000000000209 [ 1820.832728] RIP: 0010:ipv6_add_addr+0x280/0xd10 [ 1820.832732] Code: 49 8b 1f 0f 84 6a 0a 00 00 48 85 db 0f 84 4e 0a 00 00 48 8b 03 48 8b 53 08 49 89 45 00 49 8b 47 10 49 89 55 08 48 85 c0 74 15 <48> 8b 50 08 48 8b 00 49 89 95 b8 01 00 00 49 89 85 b0 01 00 00 4c [ 1820.832847] RSP: 0018:ffffaa07c2fd7880 EFLAGS: 00010202 [ 1820.832853] RAX: 0000000000000201 RBX: ffffaa07c2fd79b0 RCX: 0000000000000000 [ 1820.832858] RDX: a4cfbfba2cbfa64c RSI: 0000000000000000 RDI: ffffffff8a8e9fa0 [ 1820.832862] RBP: ffffaa07c2fd7920 R08: 000000000000017a R09: ffffffff8a555300 [ 1820.832866] R10: 0000000000000000 R11: 0000000000000000 R12: ffff888d18e71c00 [ 1820.832871] R13: ffff888d0a9b1200 R14: 0000000000000000 R15: ffffaa07c2fd7980 [ 1820.832876] FS: 00007faa51bdb800(0000) GS:ffff888d1d400000(0000) knlGS:0000000000000000 [ 1820.832880] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1820.832885] CR2: 0000000000000209 CR3: 000000021e8f8001 CR4: 00000000001606e0 [ 1820.832888] Call Trace: [ 1820.832898] ? __local_bh_enable_ip+0x119/0x260 [ 1820.832904] ? ipv6_create_tempaddr+0x259/0x5a0 [ 1820.832912] ? __local_bh_enable_ip+0x139/0x260 [ 1820.832921] ipv6_create_tempaddr+0x2da/0x5a0 [ 1820.832926] ? ipv6_create_tempaddr+0x2da/0x5a0 [ 1820.832941] manage_tempaddrs+0x1a5/0x240 [ 1820.832951] inet6_addr_del+0x20b/0x3b0 [ 1820.832959] ? nla_parse+0xce/0x1e0 [ 1820.832968] inet6_rtm_deladdr+0xd9/0x210 [ 1820.832981] rtnetlink_rcv_msg+0x1d4/0x5f0 Looking at the code I found 1 element (peer_pfx) of the newly introduced ifa6_config struct that is not initialized. Use a memset rather than hard coding an init for each struct element. Reported-by: Valdis Kletnieks Fixes: e6464b8c63619 ("net/ipv6: Convert ipv6_add_addr to struct ifa6_config") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 89019bf59f46..c134286d6a41 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1324,6 +1324,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, } } + memset(&cfg, 0, sizeof(cfg)); cfg.valid_lft = min_t(__u32, ifp->valid_lft, idev->cnf.temp_valid_lft + age); cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor; @@ -1357,7 +1358,6 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, cfg.pfx = &addr; cfg.scope = ipv6_addr_scope(cfg.pfx); - cfg.rt_priority = 0; ift = ipv6_add_addr(idev, &cfg, block, NULL); if (IS_ERR(ift)) { From 6892286e9c09925780fe2cb6db3585b56b71fe8e Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 11 Jun 2018 18:00:13 -0700 Subject: [PATCH 28/81] tcp: Do not reload skb pointer after skb_gro_receive(). This is not necessary. skb_gro_receive() will never change what 'head' points to. In it's original implementation (see commit 71d93b39e52e ("net: Add skb_gro_receive")), it did: ==================== + *head = nskb; + nskb->next = p->next; + p->next = NULL; ==================== This sequence was removed in commit 58025e46ea2d ("net: gro: remove obsolete code from skb_gro_receive()") Signed-off-by: David S. Miller Signed-off-by: Eric Dumazet --- net/ipv4/tcp_offload.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 4d58e2ce0b5b..8cc7c3487330 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -268,8 +268,6 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto out_check_final; } - p = *head; - th2 = tcp_hdr(p); tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: From 155fb5c5fae72d1faa2067d6fa0a5be12279c689 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Mon, 28 May 2018 18:14:49 +0900 Subject: [PATCH 29/81] netfilter: fix null-ptr-deref in nf_nat_decode_session Add null check for nat_hook in nf_nat_decode_session() [ 195.648098] UBSAN: Undefined behaviour in ./include/linux/netfilter.h:348:14 [ 195.651366] BUG: KASAN: null-ptr-deref in __xfrm_policy_check+0x208/0x1d70 [ 195.653888] member access within null pointer of type 'struct nf_nat_hook' [ 195.653896] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.17.0-rc6+ #5 [ 195.656320] Read of size 8 at addr 0000000000000008 by task ping/2469 [ 195.658715] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 195.658721] Call Trace: [ 195.661087] [ 195.669341] [ 195.670574] dump_stack+0xc6/0x150 [ 195.672156] ? dump_stack_print_info.cold.0+0x1b/0x1b [ 195.674121] ? ubsan_prologue+0x31/0x92 [ 195.676546] ubsan_epilogue+0x9/0x49 [ 195.678159] handle_null_ptr_deref+0x11a/0x130 [ 195.679800] ? sprint_OID+0x1a0/0x1a0 [ 195.681322] __ubsan_handle_type_mismatch_v1+0xd5/0x11d [ 195.683146] ? ubsan_prologue+0x92/0x92 [ 195.684642] __xfrm_policy_check+0x18ef/0x1d70 [ 195.686294] ? rt_cache_valid+0x118/0x180 [ 195.687804] ? __xfrm_route_forward+0x410/0x410 [ 195.689463] ? fib_multipath_hash+0x700/0x700 [ 195.691109] ? kvm_sched_clock_read+0x23/0x40 [ 195.692805] ? pvclock_clocksource_read+0xf6/0x280 [ 195.694409] ? graph_lock+0xa0/0xa0 [ 195.695824] ? pvclock_clocksource_read+0xf6/0x280 [ 195.697508] ? pvclock_read_flags+0x80/0x80 [ 195.698981] ? kvm_sched_clock_read+0x23/0x40 [ 195.700347] ? sched_clock+0x5/0x10 [ 195.701525] ? sched_clock_cpu+0x18/0x1a0 [ 195.702846] tcp_v4_rcv+0x1d32/0x1de0 [ 195.704115] ? lock_repin_lock+0x70/0x270 [ 195.707072] ? pvclock_read_flags+0x80/0x80 [ 195.709302] ? tcp_v4_early_demux+0x4b0/0x4b0 [ 195.711833] ? lock_acquire+0x195/0x380 [ 195.714222] ? ip_local_deliver_finish+0xfc/0x770 [ 195.716967] ? raw_rcv+0x2b0/0x2b0 [ 195.718856] ? lock_release+0xa00/0xa00 [ 195.720938] ip_local_deliver_finish+0x1b9/0x770 [...] Fixes: 2c205dd3981f ("netfilter: add struct nf_nat_hook and use it") Signed-off-by: Prashant Bhole Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 04551af2ff23..dd2052f0efb7 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -345,7 +345,7 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); - if (nat_hook->decode_session) + if (nat_hook && nat_hook->decode_session) nat_hook->decode_session(skb, fl); rcu_read_unlock(); #endif From 3fb61eca185cc65a1be23d9a5a11347eef79f597 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 Jun 2018 17:56:08 +0200 Subject: [PATCH 30/81] netfilter: nft_socket: fix module autoload Add alias definition for module autoload when adding socket rules. Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index f28a0b944087..74e1b3bd6954 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -142,3 +142,4 @@ module_exit(nft_socket_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Máté Eckl"); MODULE_DESCRIPTION("nf_tables socket match module"); +MODULE_ALIAS_NFT_EXPR("socket"); From 215a31f19dedd4e92a67cf5a9717ee898d012b3a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 11 Jun 2018 17:18:29 +0200 Subject: [PATCH 31/81] netfilter: nft_dynset: do not reject set updates with NFT_SET_EVAL NFT_SET_EVAL is signalling the kernel that this sets can be updated from the evaluation path, even if there are no expressions attached to the element. Otherwise, set updates with no expressions fail. Update description to describe the right semantics. Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates") Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nft_dynset.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c9bf74b94f37..89438e68dc03 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -266,7 +266,7 @@ enum nft_rule_compat_attributes { * @NFT_SET_INTERVAL: set contains intervals * @NFT_SET_MAP: set is used as a dictionary * @NFT_SET_TIMEOUT: set uses timeouts - * @NFT_SET_EVAL: set contains expressions for evaluation + * @NFT_SET_EVAL: set can be updated from the evaluation path * @NFT_SET_OBJECT: set contains stateful objects */ enum nft_set_flags { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 4d49529cff61..27d7e4598ab6 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -203,9 +203,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, goto err1; set->ops->gc_init(set); } - - } else if (set->flags & NFT_SET_EVAL) - return -EINVAL; + } nft_set_ext_prepare(&priv->tmpl); nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); From 71ad00c50d77e507138c792a9646b53c16f22e11 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Jun 2018 13:20:35 +0200 Subject: [PATCH 32/81] netfilter: nf_tables: fix module unload race We must first remove the nfnetlink protocol handler when nf_tables module is unloaded -- we don't want userspace to submit new change requests once we've started to tear down nft state. Furthermore, nfnetlink must not call any subsystem function after call_batch returned -EAGAIN. EAGAIN means the subsys mutex was dropped, so its unlikely but possible that nf_tables subsystem was removed due to 'rmmod nf_tables' on another cpu. Therefore, we must abort batch completely and not move on to next part of the batch. Last, we can't invoke ->abort unless we've checked that the subsystem is still registered. Change netns exit path of nf_tables to make sure any incompleted transaction gets removed on exit. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 12 +++++++++--- net/netfilter/nfnetlink.c | 10 +++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7979095b69b0..ae312b31db28 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6439,7 +6439,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } -static int nf_tables_abort(struct net *net, struct sk_buff *skb) +static int __nf_tables_abort(struct net *net) { struct nft_trans *trans, *next; struct nft_trans_elem *te; @@ -6555,6 +6555,11 @@ static void nf_tables_cleanup(struct net *net) nft_validate_state_update(net, NFT_VALIDATE_SKIP); } +static int nf_tables_abort(struct net *net, struct sk_buff *skb) +{ + return __nf_tables_abort(net); +} + static bool nf_tables_valid_genid(struct net *net, u32 genid) { return net->nft.base_seq == genid; @@ -7149,9 +7154,10 @@ static int __net_init nf_tables_init_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { + if (!list_empty(&net->nft.commit_list)) + __nf_tables_abort(net); __nft_release_tables(net); WARN_ON_ONCE(!list_empty(&net->nft.tables)); - WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); } static struct pernet_operations nf_tables_net_ops = { @@ -7193,9 +7199,9 @@ static int __init nf_tables_module_init(void) static void __exit nf_tables_module_exit(void) { - unregister_pernet_subsys(&nf_tables_net_ops); nfnetlink_subsys_unregister(&nf_tables_subsys); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); + unregister_pernet_subsys(&nf_tables_net_ops); rcu_barrier(); nf_tables_core_module_exit(); kfree(info); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4d0da7042aff..e1b6be29848d 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -429,7 +429,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, */ if (err == -EAGAIN) { status |= NFNL_BATCH_REPLAY; - goto next; + goto done; } } ack: @@ -456,7 +456,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) status |= NFNL_BATCH_FAILURE; } -next: + msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; @@ -464,7 +464,11 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, } done: if (status & NFNL_BATCH_REPLAY) { - ss->abort(net, oskb); + const struct nfnetlink_subsystem *ss2; + + ss2 = nfnl_dereference_protected(subsys_id); + if (ss2 == ss) + ss->abort(net, oskb); nfnl_err_reset(&err_list); nfnl_unlock(subsys_id); kfree_skb(skb); From 0a2cf5ee432c2e8718af3553a56a3760d767b736 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Jun 2018 13:20:36 +0200 Subject: [PATCH 33/81] netfilter: nf_tables: close race between netns exit and rmmod If net namespace is exiting while nf_tables module is being removed we can oops: BUG: unable to handle kernel NULL pointer dereference at 0000000000000040 IP: nf_tables_flowtable_event+0x43/0xf0 [nf_tables] PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI Modules linked in: nf_tables(-) nfnetlink [..] unregister_netdevice_notifier+0xdd/0x130 nf_tables_module_exit+0x24/0x3a [nf_tables] SyS_delete_module+0x1c5/0x240 do_syscall_64+0x74/0x190 Avoid this by attempting to take reference on the net namespace from the notifiers. If it fails the namespace is exiting already, and nft core is taking care of cleanup work. We also need to make sure the netdev hook type gets removed before netns ops removal, else notifier might be invoked with device event for a netns where net->nft was never initialised (because pernet ops was removed beforehand). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 ++++++++++--- net/netfilter/nft_chain_filter.c | 5 +++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ae312b31db28..d23a5c269c44 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5837,18 +5837,23 @@ static int nf_tables_flowtable_event(struct notifier_block *this, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_flowtable *flowtable; struct nft_table *table; + struct net *net; if (event != NETDEV_UNREGISTER) return 0; + net = maybe_get_net(dev_net(dev)); + if (!net) + return 0; + nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(table, &dev_net(dev)->nft.tables, list) { + list_for_each_entry(table, &net->nft.tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { nft_flowtable_event(event, dev, flowtable); } } nfnl_unlock(NFNL_SUBSYS_NFTABLES); - + put_net(net); return NOTIFY_DONE; } @@ -7154,9 +7159,11 @@ static int __net_init nf_tables_init_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net); __nft_release_tables(net); + nfnl_unlock(NFNL_SUBSYS_NFTABLES); WARN_ON_ONCE(!list_empty(&net->nft.tables)); } @@ -7201,11 +7208,11 @@ static void __exit nf_tables_module_exit(void) { nfnetlink_subsys_unregister(&nf_tables_subsys); unregister_netdevice_notifier(&nf_tables_flowtable_notifier); + nft_chain_filter_fini(); unregister_pernet_subsys(&nf_tables_net_ops); rcu_barrier(); nf_tables_core_module_exit(); kfree(info); - nft_chain_filter_fini(); } module_init(nf_tables_module_init); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 84c902477a91..d21834bed805 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -318,6 +318,10 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; + ctx.net = maybe_get_net(ctx.net); + if (!ctx.net) + return NOTIFY_DONE; + nfnl_lock(NFNL_SUBSYS_NFTABLES); list_for_each_entry(table, &ctx.net->nft.tables, list) { if (table->family != NFPROTO_NETDEV) @@ -334,6 +338,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, } } nfnl_unlock(NFNL_SUBSYS_NFTABLES); + put_net(ctx.net); return NOTIFY_DONE; } From adc972c5b88829d38ede08b1069718661c7330ae Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 11 Jun 2018 22:16:33 +0900 Subject: [PATCH 34/81] netfilter: nf_tables: use WARN_ON_ONCE instead of BUG_ON in nft_do_chain() When depth of chain is bigger than NFT_JUMP_STACK_SIZE, the nft_do_chain crashes. But there is no need to crash hard here. Suggested-by: Florian Westphal Signed-off-by: Taehee Yoo Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index deff10adef9c..8de912ca53d3 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -183,7 +183,8 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) switch (regs.verdict.code) { case NFT_JUMP: - BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); + if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE)) + return NF_DROP; jumpstack[stackptr].chain = chain; jumpstack[stackptr].rules = rules + 1; stackptr++; From c05a45c0865d986a8aea373cd5297dbfded6882e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Jun 2018 22:22:19 +0200 Subject: [PATCH 35/81] netfilter: ctnetlink: avoid null pointer dereference Dan Carpenter points out that deref occurs after NULL check, we should re-fetch the pointer and check that instead. Fixes: 2c205dd3981f7 ("netfilter: add struct nf_nat_hook and use it") Reported-by: Dan Carpenter Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 39327a42879f..20a2e37c76d1 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1446,7 +1446,8 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); - if (nat_hook->parse_nat_setup) + nat_hook = rcu_dereference(nf_nat_hook); + if (nat_hook) return -EAGAIN; #endif return -EOPNOTSUPP; From fc6ddbecce440df74fb4491c17c372b52cf5be83 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 12 Jun 2018 18:36:19 +0200 Subject: [PATCH 36/81] netfilter: xt_connmark: fix list corruption on rmmod This needs to use xt_unregister_targets, else new revision is left on the list which then causes list to point to a target struct that has been free'd. Fixes: 472a73e00757 ("netfilter: xt_conntrack: Support bit-shifting for CONNMARK & MARK targets.") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_connmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 94df000abb92..29c38aa7f726 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -211,7 +211,7 @@ static int __init connmark_mt_init(void) static void __exit connmark_mt_exit(void) { xt_unregister_match(&connmark_mt_reg); - xt_unregister_target(connmark_tg_reg); + xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg)); } module_init(connmark_mt_init); From 21ba8847f857028dc83a0f341e16ecc616e34740 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Tue, 12 Jun 2018 10:51:34 -0700 Subject: [PATCH 37/81] netfilter: nf_conncount: Fix garbage collection with zones MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, we use check_hlist() for garbage colleciton. However, we use the ‘zone’ from the counted entry to query the existence of existing entries in the hlist. This could be wrong when they are in different zones, and this patch fixes this issue. Fixes: e59ea3df3fc2 ("netfilter: xt_connlimit: honor conntrack zone if available") Signed-off-by: Yi-Hung Wei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 3 ++- net/netfilter/nf_conncount.c | 13 +++++++++---- net/netfilter/nft_connlimit.c | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 1910b6572430..3a188a0923a3 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -20,7 +20,8 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, bool *addit); bool nf_conncount_add(struct hlist_head *head, - const struct nf_conntrack_tuple *tuple); + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); void nf_conncount_cache_free(struct hlist_head *hhead); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 3b5059a8dcdd..d8383609fe28 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -46,6 +46,7 @@ struct nf_conncount_tuple { struct hlist_node node; struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone zone; }; struct nf_conncount_rb { @@ -80,7 +81,8 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) } bool nf_conncount_add(struct hlist_head *head, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { struct nf_conncount_tuple *conn; @@ -88,6 +90,7 @@ bool nf_conncount_add(struct hlist_head *head, if (conn == NULL) return false; conn->tuple = *tuple; + conn->zone = *zone; hlist_add_head(&conn->node, head); return true; } @@ -108,7 +111,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, /* check the saved connections */ hlist_for_each_entry_safe(conn, n, head, node) { - found = nf_conntrack_find_get(net, zone, &conn->tuple); + found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found == NULL) { hlist_del(&conn->node); kmem_cache_free(conncount_conn_cachep, conn); @@ -117,7 +120,8 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple)) { + if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && + nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks @@ -196,7 +200,7 @@ count_tree(struct net *net, struct rb_root *root, if (!addit) return count; - if (!nf_conncount_add(&rbconn->hhead, tuple)) + if (!nf_conncount_add(&rbconn->hhead, tuple, zone)) return 0; /* hotdrop */ return count + 1; @@ -238,6 +242,7 @@ count_tree(struct net *net, struct rb_root *root, } conn->tuple = *tuple; + conn->zone = *zone; memcpy(rbconn->key, key, sizeof(u32) * keylen); INIT_HLIST_HEAD(&rbconn->hhead); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 50c068d660e5..a832c59f0a9c 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -52,7 +52,7 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, if (!addit) goto out; - if (!nf_conncount_add(&priv->hhead, tuple_ptr)) { + if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; spin_unlock_bh(&priv->lock); return; From cdb8744d80352b55c622d049a6c91f449cd291f8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 12 Jun 2018 10:05:55 -0700 Subject: [PATCH 38/81] Revert "net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert the patch mentioned in the subject because it breaks at least the Avahi mDNS daemon. That patch namely causes the Ubuntu 18.04 Avahi daemon to fail to start: Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Successfully called chroot(). Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Successfully dropped remaining capabilities. Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: No service file found in /etc/avahi/services. Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: SO_REUSEADDR failed: Structure needs cleaning Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: SO_REUSEADDR failed: Structure needs cleaning Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: Failed to create server: No suitable network protocol available Jun 12 09:49:24 ubuntu-vm avahi-daemon[529]: avahi-daemon 0.7 exiting. Jun 12 09:49:24 ubuntu-vm systemd[1]: avahi-daemon.service: Main process exited, code=exited, status=255/n/a Jun 12 09:49:24 ubuntu-vm systemd[1]: avahi-daemon.service: Failed with result 'exit-code'. Jun 12 09:49:24 ubuntu-vm systemd[1]: Failed to start Avahi mDNS/DNS-SD Stack. Fixes: f396922d862a ("net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets") Cc: Maciej Żenczykowski Cc: Eric Dumazet Signed-off-by: Bart Van Assche Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index f333d75ef1a9..bcc41829a16d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -728,22 +728,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: - val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); - if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) && - inet_sk(sk)->inet_num && - (sk->sk_reuse != val)) { - ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN; - break; - } - sk->sk_reuse = val; + sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) && - inet_sk(sk)->inet_num && - (sk->sk_reuseport != valbool)) { - ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN; - break; - } sk->sk_reuseport = valbool; break; case SO_TYPE: From f8d0efb112275444c03b76ee2376f0055d12aeba Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 Jun 2018 21:33:35 -0700 Subject: [PATCH 39/81] nfp: don't pad strings in nfp_cpp_resource_find() to avoid gcc 8 warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once upon a time nfp_cpp_resource_find() took a name parameter, which could be any user-chosen string. Resources are identified by a CRC32 hash of a 8 byte string, so we had to pad user input with zeros to make sure CRC32 gave the correct result. Since then nfp_cpp_resource_find() was made to operate on allocated resources only (struct nfp_resource). We kzalloc those so there is no need to pad the strings and use memcmp. This avoids a GCC 8 stringop-truncation warning: In function ‘nfp_cpp_resource_find’, inlined from ‘nfp_resource_try_acquire’ at .../nfpcore/nfp_resource.c:153:8, inlined from ‘nfp_resource_acquire’ at .../nfpcore/nfp_resource.c:206:9: .../nfpcore/nfp_resource.c:108:2: warning: strncpy’ output may be truncated copying 8 bytes from a string of length 8 [-Wstringop-truncation] strncpy(name_pad, res->name, sizeof(name_pad)); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c index 2dd89dba9311..d32af598da90 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c @@ -98,21 +98,18 @@ struct nfp_resource { static int nfp_cpp_resource_find(struct nfp_cpp *cpp, struct nfp_resource *res) { - char name_pad[NFP_RESOURCE_ENTRY_NAME_SZ] = {}; struct nfp_resource_entry entry; u32 cpp_id, key; int ret, i; cpp_id = NFP_CPP_ID(NFP_RESOURCE_TBL_TARGET, 3, 0); /* Atomic read */ - strncpy(name_pad, res->name, sizeof(name_pad)); - /* Search for a matching entry */ - if (!memcmp(name_pad, NFP_RESOURCE_TBL_NAME "\0\0\0\0\0\0\0\0", 8)) { + if (!strcmp(res->name, NFP_RESOURCE_TBL_NAME)) { nfp_err(cpp, "Grabbing device lock not supported\n"); return -EOPNOTSUPP; } - key = crc32_posix(name_pad, sizeof(name_pad)); + key = crc32_posix(res->name, NFP_RESOURCE_ENTRY_NAME_SZ); for (i = 0; i < NFP_RESOURCE_TBL_ENTRIES; i++) { u64 addr = NFP_RESOURCE_TBL_BASE + From 29f534c4bbfc1f66faec04575148be80def66c2b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 Jun 2018 21:33:36 -0700 Subject: [PATCH 40/81] nfp: include all ring counters in interface stats We are gathering software statistics on per-ring basis. .ndo_get_stats64 handler adds the rings up. Unfortunately we are currently only adding up active rings, which means that if user decreases the number of active rings the statistics from deactivated rings will no longer be counted and total interface statistics may go backwards. Always sum all possible rings, the stats are allocated statically for max number of rings, so we don't have to worry about them being removed. We could add the stats up when user changes the ring count, but it seems unnecessary.. Adding up inactive rings will be very quick since no datapath will be touching them. Fixes: 164d1e9e5d52 ("nfp: add support for ethtool .set_channels") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 75110c8d6a90..ed27176c2bce 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3121,7 +3121,7 @@ static void nfp_net_stat64(struct net_device *netdev, struct nfp_net *nn = netdev_priv(netdev); int r; - for (r = 0; r < nn->dp.num_r_vecs; r++) { + for (r = 0; r < nn->max_r_vecs; r++) { struct nfp_net_r_vector *r_vec = &nn->r_vecs[r]; u64 data[3]; unsigned int start; From fe06a64e0de718d59ae10263180aca02b84245d6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 Jun 2018 21:33:37 -0700 Subject: [PATCH 41/81] nfp: remove phys_port_name on flower's vNIC .ndo_get_phys_port_name was recently extended to support multi-vNIC FWs. These are firmwares which can have more than one vNIC per PF without associated port (e.g. Adaptive Buffer Management FW), therefore we need a way of distinguishing the vNICs. Unfortunately, it's too late to make flower use the same naming. Flower users may depend on .ndo_get_phys_port_name returning -EOPNOTSUPP, for example the name udev gave the PF vNIC was just the bare PCI device-based name before the change, and will have 'nn0' appended after. To ensure flower's vNIC doesn't have phys_port_name attribute, add a flag to vNIC struct and set it in flower code. New projects will not set the flag adhere to the naming scheme from the start. Fixes: 51c1df83e35c ("nfp: assign vNIC id as phys_port_name of vNICs which are not ports") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/main.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_net.h | 4 ++++ drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c index 19cfa162ac65..1decf3a1cad3 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.c +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c @@ -455,6 +455,7 @@ static int nfp_flower_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, eth_hw_addr_random(nn->dp.netdev); netif_keep_dst(nn->dp.netdev); + nn->vnic_no_name = true; return 0; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 57cb035dcc6d..2a71a9ffd095 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -590,6 +590,8 @@ struct nfp_net_dp { * @vnic_list: Entry on device vNIC list * @pdev: Backpointer to PCI device * @app: APP handle if available + * @vnic_no_name: For non-port PF vNIC make ndo_get_phys_port_name return + * -EOPNOTSUPP to keep backwards compatibility (set by app) * @port: Pointer to nfp_port structure if vNIC is a port * @app_priv: APP private data for this vNIC */ @@ -663,6 +665,8 @@ struct nfp_net { struct pci_dev *pdev; struct nfp_app *app; + bool vnic_no_name; + struct nfp_port *port; void *app_priv; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index ed27176c2bce..d4c27f849f9b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3286,7 +3286,7 @@ nfp_net_get_phys_port_name(struct net_device *netdev, char *name, size_t len) if (nn->port) return nfp_port_get_phys_port_name(netdev, name, len); - if (nn->dp.is_vf) + if (nn->dp.is_vf || nn->vnic_no_name) return -EOPNOTSUPP; n = snprintf(name, len, "n%d", nn->id); From e62e51af3430745630f0cf76bb41a28d20c4ebdc Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Mon, 11 Jun 2018 21:33:38 -0700 Subject: [PATCH 42/81] nfp: flower: free dst_entry in route table We need to release the refcnt on dst_entry in the route table, otherwise we will leak the route. Fixes: 8e6a9046b66a ("nfp: flower vxlan neighbour offload") Signed-off-by: Pieter Jansen van Vuuren Signed-off-by: Louis Peens Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c index ec524d97869d..78afe75129ab 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c @@ -381,6 +381,8 @@ nfp_tun_neigh_event_handler(struct notifier_block *nb, unsigned long event, err = PTR_ERR_OR_ZERO(rt); if (err) return NOTIFY_DONE; + + ip_rt_put(rt); #else return NOTIFY_DONE; #endif From 8cde8f0c0c03f9f7440f3d71a74d7cc35083f281 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jun 2018 12:44:54 -0700 Subject: [PATCH 43/81] hv_netvsc: drop common code until callback model fixed The callback model of handling network failover is not suitable in the current form. 1. It was merged without addressing all the review feedback. 2. It was merged without approval of any of the netvsc maintainers. 3. Design discussion on how to handle PV/VF fallback is still not complete. 4. IMHO the code model using callbacks is trying to make something common which isn't. Revert the netvsc specific changes for now. Does not impact ongoing development of failover model for virtio. Revisit this after a simpler library based failover kernel routines are extracted. This reverts commit 9c6ffbacdb57 ("hv_netvsc: fix error return code in netvsc_probe()") and commit 1ff78076d8dd ("netvsc: refactor notifier/event handling code to use the failover framework") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/Kconfig | 1 - drivers/net/hyperv/hyperv_net.h | 2 - drivers/net/hyperv/netvsc_drv.c | 224 +++++++++++++++++++++++--------- 3 files changed, 165 insertions(+), 62 deletions(-) diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig index 23a2d145813a..0765d5f61714 100644 --- a/drivers/net/hyperv/Kconfig +++ b/drivers/net/hyperv/Kconfig @@ -2,6 +2,5 @@ config HYPERV_NET tristate "Microsoft Hyper-V virtual network driver" depends on HYPERV select UCS2_STRING - select FAILOVER help Select this option to enable the Hyper-V virtual network driver. diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 23304aca25f9..9246e4562830 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -931,8 +931,6 @@ struct net_device_context { u32 vf_alloc; /* Serial number of the VF to team with */ u32 vf_serial; - - struct failover *failover; }; /* Per channel data */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 7b18a8c267c2..3ec79eb183ad 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -42,7 +42,6 @@ #include #include #include -#include #include "hyperv_net.h" @@ -1780,6 +1779,46 @@ static void netvsc_link_change(struct work_struct *w) rtnl_unlock(); } +static struct net_device *get_netvsc_bymac(const u8 *mac) +{ + struct net_device *dev; + + ASSERT_RTNL(); + + for_each_netdev(&init_net, dev) { + if (dev->netdev_ops != &device_ops) + continue; /* not a netvsc device */ + + if (ether_addr_equal(mac, dev->perm_addr)) + return dev; + } + + return NULL; +} + +static struct net_device *get_netvsc_byref(struct net_device *vf_netdev) +{ + struct net_device *dev; + + ASSERT_RTNL(); + + for_each_netdev(&init_net, dev) { + struct net_device_context *net_device_ctx; + + if (dev->netdev_ops != &device_ops) + continue; /* not a netvsc device */ + + net_device_ctx = netdev_priv(dev); + if (!rtnl_dereference(net_device_ctx->nvdev)) + continue; /* device is removed */ + + if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev) + return dev; /* a match */ + } + + return NULL; +} + /* Called when VF is injecting data into network stack. * Change the associated network device from VF to netvsc. * note: already called with rcu_read_lock @@ -1802,6 +1841,46 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb) return RX_HANDLER_ANOTHER; } +static int netvsc_vf_join(struct net_device *vf_netdev, + struct net_device *ndev) +{ + struct net_device_context *ndev_ctx = netdev_priv(ndev); + int ret; + + ret = netdev_rx_handler_register(vf_netdev, + netvsc_vf_handle_frame, ndev); + if (ret != 0) { + netdev_err(vf_netdev, + "can not register netvsc VF receive handler (err = %d)\n", + ret); + goto rx_handler_failed; + } + + ret = netdev_master_upper_dev_link(vf_netdev, ndev, + NULL, NULL, NULL); + if (ret != 0) { + netdev_err(vf_netdev, + "can not set master device %s (err = %d)\n", + ndev->name, ret); + goto upper_link_failed; + } + + /* set slave flag before open to prevent IPv6 addrconf */ + vf_netdev->flags |= IFF_SLAVE; + + schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); + + call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); + + netdev_info(vf_netdev, "joined to %s\n", ndev->name); + return 0; + +upper_link_failed: + netdev_rx_handler_unregister(vf_netdev); +rx_handler_failed: + return ret; +} + static void __netvsc_vf_setup(struct net_device *ndev, struct net_device *vf_netdev) { @@ -1852,95 +1931,85 @@ static void netvsc_vf_setup(struct work_struct *w) rtnl_unlock(); } -static int netvsc_pre_register_vf(struct net_device *vf_netdev, - struct net_device *ndev) +static int netvsc_register_vf(struct net_device *vf_netdev) { + struct net_device *ndev; struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; + if (vf_netdev->addr_len != ETH_ALEN) + return NOTIFY_DONE; + + /* + * We will use the MAC address to locate the synthetic interface to + * associate with the VF interface. If we don't find a matching + * synthetic interface, move on. + */ + ndev = get_netvsc_bymac(vf_netdev->perm_addr); + if (!ndev) + return NOTIFY_DONE; + net_device_ctx = netdev_priv(ndev); netvsc_dev = rtnl_dereference(net_device_ctx->nvdev); if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev)) - return -ENODEV; - - return 0; -} - -static int netvsc_register_vf(struct net_device *vf_netdev, - struct net_device *ndev) -{ - struct net_device_context *ndev_ctx = netdev_priv(ndev); - - /* set slave flag before open to prevent IPv6 addrconf */ - vf_netdev->flags |= IFF_SLAVE; + return NOTIFY_DONE; - schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); + if (netvsc_vf_join(vf_netdev, ndev) != 0) + return NOTIFY_DONE; - call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); - - netdev_info(vf_netdev, "joined to %s\n", ndev->name); + netdev_info(ndev, "VF registering: %s\n", vf_netdev->name); dev_hold(vf_netdev); - rcu_assign_pointer(ndev_ctx->vf_netdev, vf_netdev); - - return 0; + rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev); + return NOTIFY_OK; } /* VF up/down change detected, schedule to change data path */ -static int netvsc_vf_changed(struct net_device *vf_netdev, - struct net_device *ndev) +static int netvsc_vf_changed(struct net_device *vf_netdev) { struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; + struct net_device *ndev; bool vf_is_up = netif_running(vf_netdev); + ndev = get_netvsc_byref(vf_netdev); + if (!ndev) + return NOTIFY_DONE; + net_device_ctx = netdev_priv(ndev); netvsc_dev = rtnl_dereference(net_device_ctx->nvdev); if (!netvsc_dev) - return -ENODEV; + return NOTIFY_DONE; netvsc_switch_datapath(ndev, vf_is_up); netdev_info(ndev, "Data path switched %s VF: %s\n", vf_is_up ? "to" : "from", vf_netdev->name); - return 0; + return NOTIFY_OK; } -static int netvsc_pre_unregister_vf(struct net_device *vf_netdev, - struct net_device *ndev) +static int netvsc_unregister_vf(struct net_device *vf_netdev) { + struct net_device *ndev; struct net_device_context *net_device_ctx; - net_device_ctx = netdev_priv(ndev); - cancel_delayed_work_sync(&net_device_ctx->vf_takeover); - - return 0; -} - -static int netvsc_unregister_vf(struct net_device *vf_netdev, - struct net_device *ndev) -{ - struct net_device_context *net_device_ctx; + ndev = get_netvsc_byref(vf_netdev); + if (!ndev) + return NOTIFY_DONE; net_device_ctx = netdev_priv(ndev); + cancel_delayed_work_sync(&net_device_ctx->vf_takeover); netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name); + netdev_rx_handler_unregister(vf_netdev); + netdev_upper_dev_unlink(vf_netdev, ndev); RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL); dev_put(vf_netdev); - return 0; + return NOTIFY_OK; } -static struct failover_ops netvsc_failover_ops = { - .slave_pre_register = netvsc_pre_register_vf, - .slave_register = netvsc_register_vf, - .slave_pre_unregister = netvsc_pre_unregister_vf, - .slave_unregister = netvsc_unregister_vf, - .slave_link_change = netvsc_vf_changed, - .slave_handle_frame = netvsc_vf_handle_frame, -}; - static int netvsc_probe(struct hv_device *dev, const struct hv_vmbus_device_id *dev_id) { @@ -2030,16 +2099,8 @@ static int netvsc_probe(struct hv_device *dev, goto register_failed; } - net_device_ctx->failover = failover_register(net, &netvsc_failover_ops); - if (IS_ERR(net_device_ctx->failover)) { - ret = PTR_ERR(net_device_ctx->failover); - goto err_failover; - } - return ret; -err_failover: - unregister_netdev(net); register_failed: rndis_filter_device_remove(dev, nvdev); rndis_failed: @@ -2080,15 +2141,13 @@ static int netvsc_remove(struct hv_device *dev) rtnl_lock(); vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev); if (vf_netdev) - failover_slave_unregister(vf_netdev); + netvsc_unregister_vf(vf_netdev); if (nvdev) rndis_filter_device_remove(dev, nvdev); unregister_netdevice(net); - failover_unregister(ndev_ctx->failover); - rtnl_unlock(); rcu_read_unlock(); @@ -2115,8 +2174,54 @@ static struct hv_driver netvsc_drv = { .remove = netvsc_remove, }; +/* + * On Hyper-V, every VF interface is matched with a corresponding + * synthetic interface. The synthetic interface is presented first + * to the guest. When the corresponding VF instance is registered, + * we will take care of switching the data path. + */ +static int netvsc_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + /* Skip our own events */ + if (event_dev->netdev_ops == &device_ops) + return NOTIFY_DONE; + + /* Avoid non-Ethernet type devices */ + if (event_dev->type != ARPHRD_ETHER) + return NOTIFY_DONE; + + /* Avoid Vlan dev with same MAC registering as VF */ + if (is_vlan_dev(event_dev)) + return NOTIFY_DONE; + + /* Avoid Bonding master dev with same MAC registering as VF */ + if ((event_dev->priv_flags & IFF_BONDING) && + (event_dev->flags & IFF_MASTER)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + return netvsc_register_vf(event_dev); + case NETDEV_UNREGISTER: + return netvsc_unregister_vf(event_dev); + case NETDEV_UP: + case NETDEV_DOWN: + return netvsc_vf_changed(event_dev); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block netvsc_netdev_notifier = { + .notifier_call = netvsc_netdev_event, +}; + static void __exit netvsc_drv_exit(void) { + unregister_netdevice_notifier(&netvsc_netdev_notifier); vmbus_driver_unregister(&netvsc_drv); } @@ -2135,6 +2240,7 @@ static int __init netvsc_drv_init(void) if (ret) return ret; + register_netdevice_notifier(&netvsc_netdev_notifier); return 0; } From 7bf7bb37f16a80465ee3bd7c6c966f96f5a075a6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jun 2018 12:44:55 -0700 Subject: [PATCH 44/81] hv_netvsc: fix network namespace issues with VF support When finding the parent netvsc device, the search needs to be across all netvsc device instances (independent of network namespace). Find parent device of VF using upper_dev_get routine which searches only adjacent list. Fixes: e8ff40d4bff1 ("hv_netvsc: improve VF device matching") Signed-off-by: Stephen Hemminger netns aware byref Signed-off-by: David S. Miller --- drivers/net/hyperv/hyperv_net.h | 2 ++ drivers/net/hyperv/netvsc_drv.c | 43 +++++++++++++++------------------ 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 9246e4562830..d31c0cd329a1 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -901,6 +901,8 @@ struct net_device_context { struct hv_device *device_ctx; /* netvsc_device */ struct netvsc_device __rcu *nvdev; + /* list of netvsc net_devices */ + struct list_head list; /* reconfigure work */ struct delayed_work dwork; /* last reconfig time */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 3ec79eb183ad..309696b5cd14 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -67,6 +67,8 @@ static int debug = -1; module_param(debug, int, 0444); MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); +static LIST_HEAD(netvsc_dev_list); + static void netvsc_change_rx_flags(struct net_device *net, int change) { struct net_device_context *ndev_ctx = netdev_priv(net); @@ -1781,13 +1783,10 @@ static void netvsc_link_change(struct work_struct *w) static struct net_device *get_netvsc_bymac(const u8 *mac) { - struct net_device *dev; - - ASSERT_RTNL(); + struct net_device_context *ndev_ctx; - for_each_netdev(&init_net, dev) { - if (dev->netdev_ops != &device_ops) - continue; /* not a netvsc device */ + list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) { + struct net_device *dev = hv_get_drvdata(ndev_ctx->device_ctx); if (ether_addr_equal(mac, dev->perm_addr)) return dev; @@ -1798,25 +1797,18 @@ static struct net_device *get_netvsc_bymac(const u8 *mac) static struct net_device *get_netvsc_byref(struct net_device *vf_netdev) { + struct net_device_context *net_device_ctx; struct net_device *dev; - ASSERT_RTNL(); - - for_each_netdev(&init_net, dev) { - struct net_device_context *net_device_ctx; + dev = netdev_master_upper_dev_get(vf_netdev); + if (!dev || dev->netdev_ops != &device_ops) + return NULL; /* not a netvsc device */ - if (dev->netdev_ops != &device_ops) - continue; /* not a netvsc device */ + net_device_ctx = netdev_priv(dev); + if (!rtnl_dereference(net_device_ctx->nvdev)) + return NULL; /* device is removed */ - net_device_ctx = netdev_priv(dev); - if (!rtnl_dereference(net_device_ctx->nvdev)) - continue; /* device is removed */ - - if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev) - return dev; /* a match */ - } - - return NULL; + return dev; } /* Called when VF is injecting data into network stack. @@ -2093,15 +2085,19 @@ static int netvsc_probe(struct hv_device *dev, else net->max_mtu = ETH_DATA_LEN; - ret = register_netdev(net); + rtnl_lock(); + ret = register_netdevice(net); if (ret != 0) { pr_err("Unable to register netdev.\n"); goto register_failed; } - return ret; + list_add(&net_device_ctx->list, &netvsc_dev_list); + rtnl_unlock(); + return 0; register_failed: + rtnl_unlock(); rndis_filter_device_remove(dev, nvdev); rndis_failed: free_percpu(net_device_ctx->vf_stats); @@ -2147,6 +2143,7 @@ static int netvsc_remove(struct hv_device *dev) rndis_filter_device_remove(dev, nvdev); unregister_netdevice(net); + list_del(&ndev_ctx->list); rtnl_unlock(); rcu_read_unlock(); From c0a41b887ce614279c51964509e8d715979ce1f2 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jun 2018 12:44:56 -0700 Subject: [PATCH 45/81] hv_netvsc: move VF to same namespace as netvsc device When VF is added, the paravirtual device is already present and may have been moved to another network namespace. For example, sometimes the management interface is put in another net namespace in some environments. The VF should get moved to where the netvsc device is when the VF is discovered. The user can move it later (if desired). Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 309696b5cd14..fe2256bf1d13 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1928,6 +1928,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev) struct net_device *ndev; struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; + int ret; if (vf_netdev->addr_len != ETH_ALEN) return NOTIFY_DONE; @@ -1946,11 +1947,29 @@ static int netvsc_register_vf(struct net_device *vf_netdev) if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev)) return NOTIFY_DONE; - if (netvsc_vf_join(vf_netdev, ndev) != 0) + /* if syntihetic interface is a different namespace, + * then move the VF to that namespace; join will be + * done again in that context. + */ + if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) { + ret = dev_change_net_namespace(vf_netdev, + dev_net(ndev), "eth%d"); + if (ret) + netdev_err(vf_netdev, + "could not move to same namespace as %s: %d\n", + ndev->name, ret); + else + netdev_info(vf_netdev, + "VF moved to namespace with: %s\n", + ndev->name); return NOTIFY_DONE; + } netdev_info(ndev, "VF registering: %s\n", vf_netdev->name); + if (netvsc_vf_join(vf_netdev, ndev) != 0) + return NOTIFY_DONE; + dev_hold(vf_netdev); rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev); return NOTIFY_OK; From 909f1edc49953dbc0bc0512e1300691b9c2f432d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 11 Jun 2018 13:19:03 +0200 Subject: [PATCH 46/81] net: phy: mdio-gpio: Cut surplus includes The GPIO MDIO driver now needs only so cut the legacy and includes that are no longer used. Signed-off-by: Linus Walleij Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio-gpio.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 4e4c8daf44c3..33265747bf39 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -26,10 +26,7 @@ #include #include #include -#include #include - -#include #include struct mdio_gpio_info { From 469998c861faaa9f228701557fe8454f75a12e5c Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Fri, 8 Jun 2018 02:27:59 -0700 Subject: [PATCH 47/81] net: thunderx: prevent concurrent data re-writing by nicvf_set_rx_mode For each network interface linux network stack issue ndo_set_rx_mode call in order to configure MAC address filters (e.g. for multicast filtering). Currently ThunderX NICVF driver has only one ordered workqueue to process such requests for all VFs. And because of that it is possible that subsequent call to ndo_set_rx_mode would corrupt data which is currently in use by nicvf_set_rx_mode_task. Which in turn could cause following issue: [...] [ 48.978341] Unable to handle kernel paging request at virtual address 1fffff0000000000 [ 48.986275] Mem abort info: [ 48.989058] Exception class = DABT (current EL), IL = 32 bits [ 48.994965] SET = 0, FnV = 0 [ 48.998020] EA = 0, S1PTW = 0 [ 49.001152] Data abort info: [ 49.004022] ISV = 0, ISS = 0x00000004 [ 49.007869] CM = 0, WnR = 0 [ 49.010826] [1fffff0000000000] address between user and kernel address ranges [ 49.017963] Internal error: Oops: 96000004 [#1] SMP [...] [ 49.072138] task: ffff800fdd675400 task.stack: ffff000026440000 [ 49.078051] PC is at prefetch_freepointer.isra.37+0x28/0x3c [ 49.083613] LR is at kmem_cache_alloc_trace+0xc8/0x1fc [...] [ 49.272684] [] prefetch_freepointer.isra.37+0x28/0x3c [ 49.279286] [] kmem_cache_alloc_trace+0xc8/0x1fc [ 49.285455] [] alloc_fdtable+0x78/0x134 [ 49.290841] [] dup_fd+0x254/0x2f4 [ 49.295709] [] copy_process.isra.38.part.39+0x64c/0x1168 [ 49.302572] [] _do_fork+0xfc/0x3b0 [ 49.307524] [] SyS_clone+0x44/0x50 [...] This patch is to prevent such concurrent data write with spinlock. Reported-by: Dean Nelson Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nic.h | 2 + .../net/ethernet/cavium/thunder/nicvf_main.c | 50 +++++++++++++------ 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index 448d1fafc827..f4d81765221e 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -325,6 +325,8 @@ struct nicvf { struct tasklet_struct qs_err_task; struct work_struct reset_task; struct nicvf_work rx_mode_work; + /* spinlock to protect workqueue arguments from concurrent access */ + spinlock_t rx_mode_wq_lock; /* PTP timestamp */ struct cavium_ptp *ptp_clock; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 7135db45927e..135766c4296b 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1923,17 +1923,12 @@ static int nicvf_ioctl(struct net_device *netdev, struct ifreq *req, int cmd) } } -static void nicvf_set_rx_mode_task(struct work_struct *work_arg) +static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, + struct nicvf *nic) { - struct nicvf_work *vf_work = container_of(work_arg, struct nicvf_work, - work.work); - struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work); union nic_mbx mbx = {}; int idx; - if (!vf_work) - return; - /* From the inside of VM code flow we have only 128 bits memory * available to send message to host's PF, so send all mc addrs * one by one, starting from flush command in case if kernel @@ -1944,7 +1939,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg) mbx.xcast.msg = NIC_MBOX_MSG_RESET_XCAST; nicvf_send_msg_to_pf(nic, &mbx); - if (vf_work->mode & BGX_XCAST_MCAST_FILTER) { + if (mode & BGX_XCAST_MCAST_FILTER) { /* once enabling filtering, we need to signal to PF to add * its' own LMAC to the filter to accept packets for it. */ @@ -1954,23 +1949,46 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg) } /* check if we have any specific MACs to be added to PF DMAC filter */ - if (vf_work->mc) { + if (mc_addrs) { /* now go through kernel list of MACs and add them one by one */ - for (idx = 0; idx < vf_work->mc->count; idx++) { + for (idx = 0; idx < mc_addrs->count; idx++) { mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST; - mbx.xcast.data.mac = vf_work->mc->mc[idx]; + mbx.xcast.data.mac = mc_addrs->mc[idx]; nicvf_send_msg_to_pf(nic, &mbx); } - kfree(vf_work->mc); + kfree(mc_addrs); } /* and finally set rx mode for PF accordingly */ mbx.xcast.msg = NIC_MBOX_MSG_SET_XCAST; - mbx.xcast.data.mode = vf_work->mode; + mbx.xcast.data.mode = mode; nicvf_send_msg_to_pf(nic, &mbx); } +static void nicvf_set_rx_mode_task(struct work_struct *work_arg) +{ + struct nicvf_work *vf_work = container_of(work_arg, struct nicvf_work, + work.work); + struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work); + u8 mode; + struct xcast_addr_list *mc; + + if (!vf_work) + return; + + /* Save message data locally to prevent them from + * being overwritten by next ndo_set_rx_mode call(). + */ + spin_lock(&nic->rx_mode_wq_lock); + mode = vf_work->mode; + mc = vf_work->mc; + vf_work->mc = NULL; + spin_unlock(&nic->rx_mode_wq_lock); + + __nicvf_set_rx_mode_task(mode, mc, nic); +} + static void nicvf_set_rx_mode(struct net_device *netdev) { struct nicvf *nic = netdev_priv(netdev); @@ -2004,9 +2022,12 @@ static void nicvf_set_rx_mode(struct net_device *netdev) } } } + spin_lock(&nic->rx_mode_wq_lock); + kfree(nic->rx_mode_work.mc); nic->rx_mode_work.mc = mc_list; nic->rx_mode_work.mode = mode; - queue_delayed_work(nicvf_rx_mode_wq, &nic->rx_mode_work.work, 2 * HZ); + queue_delayed_work(nicvf_rx_mode_wq, &nic->rx_mode_work.work, 0); + spin_unlock(&nic->rx_mode_wq_lock); } static const struct net_device_ops nicvf_netdev_ops = { @@ -2163,6 +2184,7 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&nic->reset_task, nicvf_reset_task); INIT_DELAYED_WORK(&nic->rx_mode_work.work, nicvf_set_rx_mode_task); + spin_lock_init(&nic->rx_mode_wq_lock); err = register_netdev(netdev); if (err) { From 31962c8c78b3cb480e28120a20b45811b76e207d Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 11 Jun 2018 16:02:36 +0200 Subject: [PATCH 48/81] tc-testing: ife: fix wrong teardown command in test b7b8 fix failures in the 'teardown' stage of test b7b8, probably a leftover of commit 7c5995b33d6e ("tc-testing: fixed copy-pasting error in ife tests") Fixes: a56e6bcd34b55 ("tc-testing: updated ife test cases") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- tools/testing/selftests/tc-testing/tc-tests/actions/ife.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json index de97e4ff705c..637ea0219617 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json @@ -568,7 +568,7 @@ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*use tcindex 65535.*index 1", "matchCount": "1", "teardown": [ - "$TC actions flush action skbedit" + "$TC actions flush action ife" ] }, { From 760a6ed6b6f29c48f97ff5a94ba0dbc639a2e677 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 11 Jun 2018 19:52:27 +0200 Subject: [PATCH 49/81] net: stmmac: dwmac-meson8b: Fix an error handling path in 'meson8b_dwmac_probe()' If 'of_device_get_match_data()' fails, we need to release some resources as done in the other error handling path of this function. Fixes: efacb568c962 ("net: stmmac: dwmac-meson: extend phy mode setting") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index 4ff231df7322..c5979569fd60 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -334,9 +334,10 @@ static int meson8b_dwmac_probe(struct platform_device *pdev) dwmac->data = (const struct meson8b_dwmac_data *) of_device_get_match_data(&pdev->dev); - if (!dwmac->data) - return -EINVAL; - + if (!dwmac->data) { + ret = -EINVAL; + goto err_remove_config_dt; + } res = platform_get_resource(pdev, IORESOURCE_MEM, 1); dwmac->regs = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(dwmac->regs)) { From c0129a0614428e5e4350fa963eecd1fbe19e57e9 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Jun 2018 14:07:14 -0700 Subject: [PATCH 50/81] smc: convert to ->poll_mask smc->clcsock is an internal TCP socket, after TCP socket converts to ->poll_mask, ->poll doesn't exist any more. So just convert smc socket to ->poll_mask too. Fixes: 2c7d3dacebd4 ("net/tcp: convert to ->poll_mask") Reported-by: syzbot+f5066e369b2d5fff630f@syzkaller.appspotmail.com Cc: Christoph Hellwig Cc: Ursula Braun Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/smc/af_smc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 973b4471b532..da7f02edcd37 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1273,8 +1273,7 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -1290,7 +1289,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ release_sock(sk); - mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); + mask = smc->clcsock->ops->poll_mask(smc->clcsock, events); lock_sock(sk); sk->sk_err = smc->clcsock->sk->sk_err; if (sk->sk_err) { @@ -1308,11 +1307,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, } } } else { - if (sk->sk_state != SMC_CLOSED) { - release_sock(sk); - sock_poll_wait(file, sk_sleep(sk), wait); - lock_sock(sk); - } if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || @@ -1625,7 +1619,7 @@ static const struct proto_ops smc_sock_ops = { .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, - .poll = smc_poll, + .poll_mask = smc_poll_mask, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, From 57f230ab04d2910a06d17d988f1c4d7586a59113 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 12 Jun 2018 08:57:53 +0200 Subject: [PATCH 51/81] xen/netfront: raise max number of slots in xennet_get_responses() The max number of slots used in xennet_get_responses() is set to MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD). In old kernel-xen MAX_SKB_FRAGS was 18, while nowadays it is 17. This difference is resulting in frequent messages "too many slots" and a reduced network throughput for some workloads (factor 10 below that of a kernel-xen based guest). Replacing MAX_SKB_FRAGS by XEN_NETIF_NR_SLOTS_MIN for calculation of the max number of slots to use solves that problem (tests showed no more messages "too many slots" and throughput was as high as with the kernel-xen based guest system). Replace MAX_SKB_FRAGS-2 by XEN_NETIF_NR_SLOTS_MIN-1 in netfront_tx_slot_available() for making it clearer what is really being tested without actually modifying the tested value. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: David S. Miller --- drivers/net/xen-netfront.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 679da1abd73c..922ce0abf5cf 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -239,7 +239,7 @@ static void rx_refill_timeout(struct timer_list *t) static int netfront_tx_slot_available(struct netfront_queue *queue) { return (queue->tx.req_prod_pvt - queue->tx.rsp_cons) < - (NET_TX_RING_SIZE - MAX_SKB_FRAGS - 2); + (NET_TX_RING_SIZE - XEN_NETIF_NR_SLOTS_MIN - 1); } static void xennet_maybe_wake_tx(struct netfront_queue *queue) @@ -790,7 +790,7 @@ static int xennet_get_responses(struct netfront_queue *queue, RING_IDX cons = queue->rx.rsp_cons; struct sk_buff *skb = xennet_get_rx_skb(queue, cons); grant_ref_t ref = xennet_get_rx_ref(queue, cons); - int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD); + int max = XEN_NETIF_NR_SLOTS_MIN + (rx->status <= RX_COPY_THRESHOLD); int slots = 1; int err = 0; unsigned long ret; From 995191220056300c51ed870a5d5321f91f3eef89 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 14 Jun 2018 07:37:02 +0800 Subject: [PATCH 52/81] sctp: define sctp_packet_gso_append to build GSO frames Now sctp GSO uses skb_gro_receive() to append the data into head skb frag_list. However it actually only needs very few code from skb_gro_receive(). Besides, NAPI_GRO_CB has to be set while most of its members are not needed here. This patch is to add sctp_packet_gso_append() to build GSO frames instead of skb_gro_receive(), and it would avoid many unnecessary checks and make the code clearer. Note that sctp will use page frags instead of frag_list to build GSO frames in another patch. But it may take time, as sctp's GSO frames may have different size. skb_segment() can only split it into the frags with the same size, which would break the border of sctp chunks. Signed-off-by: Xin Long Reviewed-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 5 +++++ net/sctp/output.c | 28 ++++++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ebf809eed33a..dbe1b911a24d 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1133,6 +1133,11 @@ struct sctp_input_cb { }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) +struct sctp_output_cb { + struct sk_buff *last; +}; +#define SCTP_OUTPUT_CB(__skb) ((struct sctp_output_cb *)&((__skb)->cb[0])) + static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb) { const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; diff --git a/net/sctp/output.c b/net/sctp/output.c index e672dee302c7..7f849b01ec8e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -409,6 +409,21 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) refcount_inc(&sk->sk_wmem_alloc); } +static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) +{ + if (SCTP_OUTPUT_CB(head)->last == head) + skb_shinfo(head)->frag_list = skb; + else + SCTP_OUTPUT_CB(head)->last->next = skb; + SCTP_OUTPUT_CB(head)->last = skb; + + head->truesize += skb->truesize; + head->data_len += skb->len; + head->len += skb->len; + + __skb_header_release(skb); +} + static int sctp_packet_pack(struct sctp_packet *packet, struct sk_buff *head, int gso, gfp_t gfp) { @@ -422,7 +437,7 @@ static int sctp_packet_pack(struct sctp_packet *packet, if (gso) { skb_shinfo(head)->gso_type = sk->sk_gso_type; - NAPI_GRO_CB(head)->last = head; + SCTP_OUTPUT_CB(head)->last = head; } else { nskb = head; pkt_size = packet->size; @@ -503,15 +518,8 @@ static int sctp_packet_pack(struct sctp_packet *packet, &packet->chunk_list); } - if (gso) { - if (skb_gro_receive(&head, nskb)) { - kfree_skb(nskb); - return 0; - } - if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= - sk->sk_gso_max_segs)) - return 0; - } + if (gso) + sctp_packet_gso_append(head, nskb); pkt_count++; } while (!list_empty(&packet->chunk_list)); From bdf767cae3dddcb50a9ca09d01bb79df3e384f7b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 11 Jun 2018 21:03:45 +0800 Subject: [PATCH 53/81] net: qcom/emac: Add missing of_node_put() Add missing of_node_put() call for device node returned by of_parse_phandle(). Signed-off-by: YueHaibing Acked-by: Timur Tabi Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/emac/emac-sgmii.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c index e78e5db39458..c694e3428dfc 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c @@ -384,6 +384,7 @@ int emac_sgmii_config(struct platform_device *pdev, struct emac_adapter *adpt) } sgmii_pdev = of_find_device_by_node(np); + of_node_put(np); if (!sgmii_pdev) { dev_err(&pdev->dev, "invalid internal-phy property\n"); return -ENODEV; From 4fd44a98ffe0d048246efef67ed640fdf2098a62 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 12 Jun 2018 23:09:37 +0000 Subject: [PATCH 54/81] tcp: verify the checksum of the first data segment in a new connection commit 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") introduced an optimization for the handling of child sockets created for a new TCP connection. But this optimization passes any data associated with the last ACK of the connection handshake up the stack without verifying its checksum, because it calls tcp_child_process(), which in turn calls tcp_rcv_state_process() directly. These lower-level processing functions do not do any checksum verification. Insert a tcp_checksum_complete call in the TCP_NEW_SYN_RECEIVE path to fix this. Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Frank van der Linden Signed-off-by: Eric Dumazet Tested-by: Balbir Singh Reviewed-by: Balbir Singh Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++++ net/ipv6/tcp_ipv6.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fed3f1c66167..bea17f1e8302 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1730,6 +1730,10 @@ int tcp_v4_rcv(struct sk_buff *skb) reqsk_put(req); goto discard_it; } + if (tcp_checksum_complete(skb)) { + reqsk_put(req); + goto csum_error; + } if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b620d9b72e59..7efa9fd7e109 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1479,6 +1479,10 @@ static int tcp_v6_rcv(struct sk_buff *skb) reqsk_put(req); goto discard_it; } + if (tcp_checksum_complete(skb)) { + reqsk_put(req); + goto csum_error; + } if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; From 087fca595a0a30804fd7896e77ba11aa46e5d708 Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Wed, 13 Jun 2018 12:05:16 +0530 Subject: [PATCH 55/81] net: emaclite: Fix position of lp->mii_bus assignment To ensure MDIO bus is not double freed in remove() path assign lp->mii_bus after MDIO bus registration. Signed-off-by: Radhey Shyam Pandey Signed-off-by: Michal Simek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 69e31ceccfae..37989ce543ba 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -863,14 +863,14 @@ static int xemaclite_mdio_setup(struct net_local *lp, struct device *dev) bus->write = xemaclite_mdio_write; bus->parent = dev; - lp->mii_bus = bus; - rc = of_mdiobus_register(bus, np); if (rc) { dev_err(dev, "Failed to register mdio bus.\n"); goto err_register; } + lp->mii_bus = bus; + return 0; err_register: From 27cad008406600822ab638980412ceea740e7fc8 Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Wed, 13 Jun 2018 12:05:17 +0530 Subject: [PATCH 56/81] net: emaclite: Fix MDIO bus unregister bug Since 'has_mdio' flag is not used,sequence insmod->rmmod-> insmod leads to failure as MDIO unregister doesn't happen in .remove(). Fix it by checking MII bus pointer instead. Signed-off-by: Radhey Shyam Pandey Signed-off-by: Michal Simek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 37989ce543ba..06eb6c886388 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1191,7 +1191,7 @@ static int xemaclite_of_remove(struct platform_device *of_dev) struct net_local *lp = netdev_priv(ndev); /* Un-register the mii_bus, if configured */ - if (lp->has_mdio) { + if (lp->mii_bus) { mdiobus_unregister(lp->mii_bus); mdiobus_free(lp->mii_bus); lp->mii_bus = NULL; From bd45cbf5451dcbba16c19aafd6dd99bc3e1e9644 Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Wed, 13 Jun 2018 12:05:18 +0530 Subject: [PATCH 57/81] net: emaclite: Remove unused 'has_mdio' flag. Remove unused 'has_mdio' flag. Signed-off-by: Radhey Shyam Pandey Signed-off-by: Michal Simek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 06eb6c886388..ec4608e8ab1b 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -123,7 +123,6 @@ * @phy_node: pointer to the PHY device node * @mii_bus: pointer to the MII bus * @last_link: last link status - * @has_mdio: indicates whether MDIO is included in the HW */ struct net_local { @@ -144,7 +143,6 @@ struct net_local { struct mii_bus *mii_bus; int last_link; - bool has_mdio; }; From 560c5bddba72cc4fd9b77731b64b7937fde3b340 Mon Sep 17 00:00:00 2001 From: Radhey Shyam Pandey Date: Wed, 13 Jun 2018 12:05:19 +0530 Subject: [PATCH 58/81] net: emaclite: Remove xemaclite_mdio_setup return check Errors are already reported in xemaclite_mdio_setup so avoid reporting it again. Signed-off-by: Radhey Shyam Pandey Signed-off-by: Michal Simek Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index ec4608e8ab1b..2a0c06e0f730 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1143,9 +1143,7 @@ static int xemaclite_of_probe(struct platform_device *ofdev) xemaclite_update_address(lp, ndev->dev_addr); lp->phy_node = of_parse_phandle(ofdev->dev.of_node, "phy-handle", 0); - rc = xemaclite_mdio_setup(lp, &ofdev->dev); - if (rc) - dev_warn(&ofdev->dev, "error registering MDIO bus\n"); + xemaclite_mdio_setup(lp, &ofdev->dev); dev_info(dev, "MAC address is now %pM\n", ndev->dev_addr); From 90904ff5f958a215cc3d26f957a46e80fa178470 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 13 Jun 2018 15:09:18 +0200 Subject: [PATCH 59/81] l2tp: fix pseudo-wire type for sessions created by pppol2tp_connect() Define cfg.pw_type so that the new session is created with its .pwtype field properly set (L2TP_PWTYPE_PPP). Not setting the pseudo-wire type had several annoying effects: * Invalid value returned in the L2TP_ATTR_PW_TYPE attribute when dumping sessions with the netlink API. * Impossibility to delete the session using the netlink API (because l2tp_nl_cmd_session_delete() gets the deletion callback function from an array indexed by the session's pseudo-wire type). Also, there are several cases where we should check a session's pseudo-wire type. For example, pppol2tp_connect() should refuse to connect a session that is not PPPoL2TP, but that requires the session's .pwtype field to be properly set. Fixes: f7faffa3ff8e ("l2tp: Add L2TPv3 protocol support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index b56cb1df4fc0..270a0a999eaf 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -751,6 +751,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, /* Default MTU must allow space for UDP/L2TP/PPP headers */ cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD; cfg.mru = cfg.mtu; + cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, session_id, From 7ac6ab1f8a38ba7f8d97f95475bb6a2575db4658 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 13 Jun 2018 15:09:19 +0200 Subject: [PATCH 60/81] l2tp: only accept PPP sessions in pppol2tp_connect() l2tp_session_priv() returns a struct pppol2tp_session pointer only for PPPoL2TP sessions. In particular, if the session is an L2TP_PWTYPE_ETH pseudo-wire, l2tp_session_priv() returns a pointer to an l2tp_eth_sess structure, which is much smaller than struct pppol2tp_session. This leads to invalid memory dereference when trying to lock ps->sk_lock. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 270a0a999eaf..8b3b6947a07d 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -734,6 +734,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, session = l2tp_session_get(sock_net(sk), tunnel, session_id); if (session) { drop_refcnt = true; + + if (session->pwtype != L2TP_PWTYPE_PPP) { + error = -EPROTOTYPE; + goto end; + } + ps = l2tp_session_priv(session); /* Using a pre-existing session is fine as long as it hasn't From 3e1bc8bf974e2d4e7beb842a4c801c2542eff3bd Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 13 Jun 2018 15:09:20 +0200 Subject: [PATCH 61/81] l2tp: prevent pppol2tp_connect() from creating kernel sockets If 'fd' is negative, l2tp_tunnel_create() creates a tunnel socket using the configuration passed in 'tcfg'. Currently, pppol2tp_connect() sets the relevant fields to zero, tricking l2tp_tunnel_create() into setting up an unusable kernel socket. We can't set 'tcfg' with the required fields because there's no way to get them from the current connect() parameters. So let's restrict kernel sockets creation to the netlink API, which is the original use case. Fixes: 789a4a2c61d8 ("l2tp: Add support for static unmanaged L2TPv3 tunnels") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 8b3b6947a07d..1b24f76ae210 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -701,6 +701,15 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, .encap = L2TP_ENCAPTYPE_UDP, .debug = 0, }; + + /* Prevent l2tp_tunnel_register() from trying to set up + * a kernel socket. + */ + if (fd < 0) { + error = -EBADF; + goto end; + } + error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel); if (error < 0) goto end; From bda06be2158c7aa7e41b15500c4d3840369c19a6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 13 Jun 2018 15:09:21 +0200 Subject: [PATCH 62/81] l2tp: clean up stale tunnel or session in pppol2tp_connect's error path pppol2tp_connect() may create a tunnel or a session. Remove them in case of error. Fixes: fd558d186df2 ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 1b24f76ae210..f429fed06a1e 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -612,6 +612,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, u32 session_id, peer_session_id; bool drop_refcnt = false; bool drop_tunnel = false; + bool new_session = false; + bool new_tunnel = false; int ver = 2; int fd; @@ -722,6 +724,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } drop_tunnel = true; + new_tunnel = true; } } else { /* Error if we can't find the tunnel */ @@ -788,6 +791,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } drop_refcnt = true; + new_session = true; } /* Special case: if source & dest session_id == 0x0000, this @@ -834,6 +838,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, session->name); end: + if (error) { + if (new_session) + l2tp_session_delete(session); + if (new_tunnel) + l2tp_tunnel_delete(tunnel); + } if (drop_refcnt) l2tp_session_dec_refcount(session); if (drop_tunnel) From f1693c63ab133d16994cc50f773982b5905af264 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 14 Jun 2018 11:52:34 -0700 Subject: [PATCH 63/81] rds: avoid unenecessary cong_update in loop transport Loop transport which is self loopback, remote port congestion update isn't relevant. Infact the xmit path already ignores it. Receive path needs to do the same. Reported-by: syzbot+4c20b3866171ce8441d2@syzkaller.appspotmail.com Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/loop.c | 1 + net/rds/rds.h | 5 +++++ net/rds/recv.c | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/net/rds/loop.c b/net/rds/loop.c index f2bf78de5688..dac6218a460e 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -193,4 +193,5 @@ struct rds_transport rds_loop_transport = { .inc_copy_to_user = rds_message_inc_copy_to_user, .inc_free = rds_loop_inc_free, .t_name = "loopback", + .t_type = RDS_TRANS_LOOP, }; diff --git a/net/rds/rds.h b/net/rds/rds.h index b04c333d9d1c..f2272fb8cd45 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -479,6 +479,11 @@ struct rds_notifier { int n_status; }; +/* Available as part of RDS core, so doesn't need to participate + * in get_preferred transport etc + */ +#define RDS_TRANS_LOOP 3 + /** * struct rds_transport - transport specific behavioural hooks * diff --git a/net/rds/recv.c b/net/rds/recv.c index dc67458b52f0..192ac6f78ded 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -103,6 +103,11 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, rds_stats_add(s_recv_bytes_added_to_socket, delta); else rds_stats_add(s_recv_bytes_removed_from_socket, -delta); + + /* loop transport doesn't send/recv congestion updates */ + if (rs->rs_transport->t_type == RDS_TRANS_LOOP) + return; + now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " From 06bdf2803cae82c66c666b932f21b7c01d7b2ef9 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Thu, 14 Jun 2018 18:29:09 -0700 Subject: [PATCH 64/81] hv_netvsc: Fix the variable sizes in ipsecv2 and rsc offload These fields in struct ndis_ipsecv2_offload and struct ndis_rsc_offload are one byte according to the specs. This patch defines them with the right size. These structs are not in use right now, but will be used soon. Signed-off-by: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/hyperv/hyperv_net.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index d31c0cd329a1..1a924b867b07 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -1277,17 +1277,17 @@ struct ndis_lsov2_offload { struct ndis_ipsecv2_offload { u32 encap; - u16 ip6; - u16 ip4opt; - u16 ip6ext; - u16 ah; - u16 esp; - u16 ah_esp; - u16 xport; - u16 tun; - u16 xport_tun; - u16 lso; - u16 extseq; + u8 ip6; + u8 ip4opt; + u8 ip6ext; + u8 ah; + u8 esp; + u8 ah_esp; + u8 xport; + u8 tun; + u8 xport_tun; + u8 lso; + u8 extseq; u32 udp_esp; u32 auth; u32 crypto; @@ -1295,8 +1295,8 @@ struct ndis_ipsecv2_offload { }; struct ndis_rsc_offload { - u16 ip4; - u16 ip6; + u8 ip4; + u8 ip6; }; struct ndis_encap_offload { From badbc27df3a934e0025be238754f9ca6a852c006 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 8 Jun 2018 10:04:47 +0300 Subject: [PATCH 65/81] nl80211: fix some kernel doc tag mistakes There is a bunch of tags marking constants with &, which means struct or enum name. Replace them with %, which is the correct tag for constants. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 28b36545de24..27e4e441caac 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -981,18 +981,18 @@ * only the %NL80211_ATTR_IE data is used and updated with this command. * * @NL80211_CMD_SET_PMK: For offloaded 4-Way handshake, set the PMK or PMK-R0 - * for the given authenticator address (specified with &NL80211_ATTR_MAC). - * When &NL80211_ATTR_PMKR0_NAME is set, &NL80211_ATTR_PMK specifies the + * for the given authenticator address (specified with %NL80211_ATTR_MAC). + * When %NL80211_ATTR_PMKR0_NAME is set, %NL80211_ATTR_PMK specifies the * PMK-R0, otherwise it specifies the PMK. * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by - * &NL80211_ATTR_MAC. + * %NL80211_ATTR_MAC. * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way * handshake was completed successfully by the driver. The BSSID is - * specified with &NL80211_ATTR_MAC. Drivers that support 4 way handshake + * specified with %NL80211_ATTR_MAC. Drivers that support 4 way handshake * offload should send this event after indicating 802.11 association with - * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed - * &NL80211_CMD_DISCONNECT should be indicated instead. + * %NL80211_CMD_CONNECT or %NL80211_CMD_ROAM. If the 4 way handshake failed + * %NL80211_CMD_DISCONNECT should be indicated instead. * * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request * and RX notification. This command is used both as a request to transmit @@ -1029,9 +1029,9 @@ * initiated the connection through the connect request. * * @NL80211_CMD_STA_OPMODE_CHANGED: An event that notify station's - * ht opmode or vht opmode changes using any of &NL80211_ATTR_SMPS_MODE, - * &NL80211_ATTR_CHANNEL_WIDTH,&NL80211_ATTR_NSS attributes with its - * address(specified in &NL80211_ATTR_MAC). + * ht opmode or vht opmode changes using any of %NL80211_ATTR_SMPS_MODE, + * %NL80211_ATTR_CHANNEL_WIDTH,%NL80211_ATTR_NSS attributes with its + * address(specified in %NL80211_ATTR_MAC). * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -2218,7 +2218,7 @@ enum nl80211_commands { * @NL80211_ATTR_EXTERNAL_AUTH_ACTION: Identify the requested external * authentication operation (u32 attribute with an * &enum nl80211_external_auth_action value). This is used with the - * &NL80211_CMD_EXTERNAL_AUTH request event. + * %NL80211_CMD_EXTERNAL_AUTH request event. * @NL80211_ATTR_EXTERNAL_AUTH_SUPPORT: Flag attribute indicating that the user * space supports external authentication. This attribute shall be used * only with %NL80211_CMD_CONNECT request. The driver may offload @@ -3491,7 +3491,7 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_AUTO_BW: maximum available bandwidth should be calculated * base on contiguous rules and wider channels will be allowed to cross * multiple contiguous/overlapping frequency ranges. - * @NL80211_RRF_IR_CONCURRENT: See &NL80211_FREQUENCY_ATTR_IR_CONCURRENT + * @NL80211_RRF_IR_CONCURRENT: See %NL80211_FREQUENCY_ATTR_IR_CONCURRENT * @NL80211_RRF_NO_HT40MINUS: channels can't be used in HT40- operation * @NL80211_RRF_NO_HT40PLUS: channels can't be used in HT40+ operation * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed @@ -5643,11 +5643,11 @@ enum nl80211_nan_func_attributes { * @NL80211_NAN_SRF_INCLUDE: present if the include bit of the SRF set. * This is a flag. * @NL80211_NAN_SRF_BF: Bloom Filter. Present if and only if - * &NL80211_NAN_SRF_MAC_ADDRS isn't present. This attribute is binary. + * %NL80211_NAN_SRF_MAC_ADDRS isn't present. This attribute is binary. * @NL80211_NAN_SRF_BF_IDX: index of the Bloom Filter. Mandatory if - * &NL80211_NAN_SRF_BF is present. This is a u8. + * %NL80211_NAN_SRF_BF is present. This is a u8. * @NL80211_NAN_SRF_MAC_ADDRS: list of MAC addresses for the SRF. Present if - * and only if &NL80211_NAN_SRF_BF isn't present. This is a nested + * and only if %NL80211_NAN_SRF_BF isn't present. This is a nested * attribute. Each nested attribute is a MAC address. * @NUM_NL80211_NAN_SRF_ATTR: internal * @NL80211_NAN_SRF_ATTR_MAX: highest NAN SRF attribute From 3c12d0486856b9eb89c2a9ac336713cba90813e3 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 Jun 2018 10:53:55 +0200 Subject: [PATCH 66/81] cfg80211: initialize sinfo in cfg80211_get_station Most of the implementations behind cfg80211_get_station will not initialize sinfo to zero before manipulating it. For example, the member "filled", which indicates the filled in parts of this struct, is often only modified by enabling certain bits in the bitfield while keeping the remaining bits in their original state. A caller without a preinitialized sinfo.filled can then no longer decide which parts of sinfo were filled in by cfg80211_get_station (or actually the underlying implementations). cfg80211_get_station must therefore take care that sinfo is initialized to zero. Otherwise, the caller may tries to read information which was not filled in and which must therefore also be considered uninitialized. In batadv_v_elp_get_throughput's case, an invalid "random" expected throughput may be stored for this neighbor and thus the B.A.T.M.A.N V algorithm may switch to non-optimal neighbors for certain destinations. Fixes: 7406353d43c8 ("cfg80211: implement cfg80211_get_station cfg80211 API") Reported-by: Thomas Lauer Reported-by: Marcel Schmidt Cc: b.a.t.m.a.n@lists.open-mesh.org Signed-off-by: Sven Eckelmann Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index b5bb1c309914..3c654cd7ba56 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1746,6 +1746,8 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, if (!rdev->ops->get_station) return -EOPNOTSUPP; + memset(sinfo, 0, sizeof(*sinfo)); + return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); From 3f61b7a30a6a8fd917d7570cb00a26a054d84ab4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 29 May 2018 12:04:51 +0200 Subject: [PATCH 67/81] mac80211_hwsim: fix module init error paths We didn't free the workqueue on any errors, nor did we correctly check for rhashtable allocation errors, nor did we free the hashtable on error. Reported-by: Colin King Reported-by: Dan Carpenter Signed-off-by: Johannes Berg Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 9825bfd42abc..18e819d964f1 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3572,11 +3572,14 @@ static int __init init_mac80211_hwsim(void) hwsim_wq = alloc_workqueue("hwsim_wq", 0, 0); if (!hwsim_wq) return -ENOMEM; - rhashtable_init(&hwsim_radios_rht, &hwsim_rht_params); + + err = rhashtable_init(&hwsim_radios_rht, &hwsim_rht_params); + if (err) + goto out_free_wq; err = register_pernet_device(&hwsim_net_ops); if (err) - return err; + goto out_free_rht; err = platform_driver_register(&mac80211_hwsim_driver); if (err) @@ -3701,6 +3704,10 @@ static int __init init_mac80211_hwsim(void) platform_driver_unregister(&mac80211_hwsim_driver); out_unregister_pernet: unregister_pernet_device(&hwsim_net_ops); +out_free_rht: + rhashtable_destroy(&hwsim_radios_rht); +out_free_wq: + destroy_workqueue(hwsim_wq); return err; } module_init(init_mac80211_hwsim); From dc8b274f0952f604d72b10698cde6887321a669f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 25 May 2018 14:29:21 +0200 Subject: [PATCH 68/81] mac80211: Move up init of TXQs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On init, ieee80211_if_add() dumps the interface. Since that now includes a dump of the TXQ state, we need to initialise that before the dump happens. So move up the TXQ initialisation to to before the call to ieee80211_if_add(). Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace") Reported-by: Niklas Cassel Signed-off-by: Toke Høiland-Jørgensen Tested-by: Niklas Cassel Signed-off-by: Johannes Berg --- net/mac80211/main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 4d2e797e3f16..722f3d9fb416 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1098,6 +1098,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_led_init(local); + result = ieee80211_txq_setup_flows(local); + if (result) + goto fail_flows; + rtnl_lock(); result = ieee80211_init_rate_ctrl_alg(local, @@ -1120,10 +1124,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); - result = ieee80211_txq_setup_flows(local); - if (result) - goto fail_flows; - #ifdef CONFIG_INET local->ifa_notifier.notifier_call = ieee80211_ifa_changed; result = register_inetaddr_notifier(&local->ifa_notifier); @@ -1149,8 +1149,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) #if defined(CONFIG_INET) || defined(CONFIG_IPV6) fail_ifa: #endif - ieee80211_txq_teardown_flows(local); - fail_flows: rtnl_lock(); rate_control_deinitialize(local); ieee80211_remove_interfaces(local); @@ -1158,6 +1156,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); ieee80211_led_exit(local); ieee80211_wep_free(local); + ieee80211_txq_teardown_flows(local); + fail_flows: destroy_workqueue(local->workqueue); fail_workqueue: wiphy_unregister(local->hw.wiphy); From bf2b61a6838f19cbc33f6732715012c483fa3795 Mon Sep 17 00:00:00 2001 From: Dedy Lansky Date: Fri, 15 Jun 2018 13:05:01 +0200 Subject: [PATCH 69/81] cfg80211: fix rcu in cfg80211_unregister_wdev Callers of cfg80211_unregister_wdev can free the wdev object immediately after this function returns. This may crash the kernel because this wdev object is still in use by other threads. Add synchronize_rcu() after list_del_rcu to make sure wdev object can be safely freed. Signed-off-by: Dedy Lansky Signed-off-by: Johannes Berg --- net/wireless/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/core.c b/net/wireless/core.c index 5fe35aafdd9c..48e8097339ab 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1012,6 +1012,7 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); list_del_rcu(&wdev->list); + synchronize_rcu(); rdev->devlist_generation++; switch (wdev->iftype) { From ab188e8f4aad9845589ed050bde9514550a23ea5 Mon Sep 17 00:00:00 2001 From: Elad Nachman Date: Fri, 15 Jun 2018 09:57:39 +0300 Subject: [PATCH 70/81] stmmac: added support for 802.1ad vlan stripping stmmac reception handler calls stmmac_rx_vlan() to strip the vlan before calling napi_gro_receive(). The function assumes VLAN tagged frames are always tagged with 802.1Q protocol, and assigns ETH_P_8021Q to the skb by hard-coding the parameter on call to __vlan_hwaccel_put_tag() . This causes packets not to be passed to the VLAN slave if it was created with 802.1AD protocol (ip link add link eth0 eth0.100 type vlan proto 802.1ad id 100). This fix passes the protocol from the VLAN header into __vlan_hwaccel_put_tag() instead of using the hard-coded value of ETH_P_8021Q. NETIF_F_HW_VLAN_STAG_RX check was added and the strip action is now dependent on the correct combination of features and the detected vlan tag. NETIF_F_HW_VLAN_STAG_RX feature was added to be in line with the driver actual abilities. Signed-off-by: Elad Nachman Reviewed-by: Toshiaki Makita Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 11fb7c777d89..5e6d4fe2f4ef 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3182,17 +3182,22 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb) { - struct ethhdr *ehdr; + struct vlan_ethhdr *veth; + __be16 vlan_proto; u16 vlanid; - if ((dev->features & NETIF_F_HW_VLAN_CTAG_RX) == - NETIF_F_HW_VLAN_CTAG_RX && - !__vlan_get_tag(skb, &vlanid)) { + veth = (struct vlan_ethhdr *)skb->data; + vlan_proto = veth->h_vlan_proto; + + if ((vlan_proto == htons(ETH_P_8021Q) && + dev->features & NETIF_F_HW_VLAN_CTAG_RX) || + (vlan_proto == htons(ETH_P_8021AD) && + dev->features & NETIF_F_HW_VLAN_STAG_RX)) { /* pop the vlan tag */ - ehdr = (struct ethhdr *)skb->data; - memmove(skb->data + VLAN_HLEN, ehdr, ETH_ALEN * 2); + vlanid = ntohs(veth->h_vlan_TCI); + memmove(skb->data + VLAN_HLEN, veth, ETH_ALEN * 2); skb_pull(skb, VLAN_HLEN); - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid); + __vlan_hwaccel_put_tag(skb, vlan_proto, vlanid); } } @@ -4235,7 +4240,7 @@ int stmmac_dvr_probe(struct device *device, ndev->watchdog_timeo = msecs_to_jiffies(watchdog); #ifdef STMMAC_VLAN_TAG_USED /* Both mac100 and gmac support receive VLAN tag detection */ - ndev->features |= NETIF_F_HW_VLAN_CTAG_RX; + ndev->features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX; #endif priv->msg_enable = netif_msg_init(debug, default_msg_level); From 6eba08c3626bca42b3cb0c9d43ac37ab11b4be1a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jun 2018 16:23:35 +0300 Subject: [PATCH 71/81] ipv6: Only emit append events for appended routes Current code will emit an append event in the FIB notification chain for any route added with NLM_F_APPEND set, even if the route was not appended to any existing route. This is inconsistent with IPv4 where such an event is only emitted when the new route is appended after an existing one. Align IPv6 behavior with IPv4, thereby allowing listeners to more easily handle these events. Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 7aa4c41a3bd9..39d1d487eca2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -934,6 +934,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); + enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct fib6_info *iter = NULL, *match = NULL; struct fib6_info __rcu **ins; int replace = (info->nlh && @@ -1013,6 +1014,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, "Can not append to a REJECT route"); return -EINVAL; } + event = FIB_EVENT_ENTRY_APPEND; rt->fib6_nsiblings = match->fib6_nsiblings; list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); match->fib6_nsiblings++; @@ -1034,15 +1036,12 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, * insert node */ if (!replace) { - enum fib_event_type event; - if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; - event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD; err = call_fib6_entry_notifiers(info->nl_net, event, rt, extack); if (err) From 53b562df8c203e189fc69f7af0d8668e8dec5a8a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jun 2018 16:23:36 +0300 Subject: [PATCH 72/81] mlxsw: spectrum_router: Allow appending to dev-only routes Commit f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") changed the IPv6 route append logic so that dev-only routes can be appended and not only gatewayed routes. Align mlxsw with the new behaviour. Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 77b2adb29341..c8956ab224ea 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4771,11 +4771,11 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, - const struct fib6_info *nrt, bool replace) + const struct fib6_info *nrt, bool append) { struct mlxsw_sp_fib6_entry *fib6_entry; - if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace) + if (!append) return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { @@ -4790,8 +4790,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, break; if (rt->fib6_metric < nrt->fib6_metric) continue; - if (rt->fib6_metric == nrt->fib6_metric && - mlxsw_sp_fib6_rt_can_mp(rt)) + if (rt->fib6_metric == nrt->fib6_metric) return fib6_entry; if (rt->fib6_metric > nrt->fib6_metric) break; @@ -5316,7 +5315,8 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, - struct fib6_info *rt, bool replace) + struct fib6_info *rt, bool replace, + bool append) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5342,7 +5342,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, /* Before creating a new entry, try to append route to an existing * multipath entry. */ - fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace); + fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, append); if (fib6_entry) { err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt); if (err) @@ -5350,6 +5350,14 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, return 0; } + /* We received an append event, yet did not find any route to + * append to. + */ + if (WARN_ON(append)) { + err = -EINVAL; + goto err_fib6_entry_append; + } + fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt); if (IS_ERR(fib6_entry)) { err = PTR_ERR(fib6_entry); @@ -5367,6 +5375,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, err_fib6_node_entry_link: mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry); err_fib6_entry_create: +err_fib6_entry_append: err_fib6_entry_nexthop_add: mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); return err; @@ -5717,7 +5726,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) struct mlxsw_sp_fib_event_work *fib_work = container_of(work, struct mlxsw_sp_fib_event_work, work); struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; - bool replace; + bool replace, append; int err; rtnl_lock(); @@ -5728,8 +5737,10 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) case FIB_EVENT_ENTRY_APPEND: /* fall through */ case FIB_EVENT_ENTRY_ADD: replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE; + append = fib_work->event == FIB_EVENT_ENTRY_APPEND; err = mlxsw_sp_router_fib6_add(mlxsw_sp, - fib_work->fen6_info.rt, replace); + fib_work->fen6_info.rt, replace, + append); if (err) mlxsw_sp_router_fib_abort(mlxsw_sp); mlxsw_sp_rt6_release(fib_work->fen6_info.rt); From ce45bded6435aa7d1e34be3b47e9c04c72f85742 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Jun 2018 16:23:37 +0300 Subject: [PATCH 73/81] mlxsw: spectrum_router: Align with new route replace logic Commit f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") changed the IPv6 route replace logic so that the first matching route (i.e., same metric) is replaced. Have mlxsw replace the first matching route as well. Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_router.c | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index c8956ab224ea..6aaaf3d9ba31 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4756,12 +4756,6 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) kfree(mlxsw_sp_rt6); } -static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) -{ - /* RTF_CACHE routes are ignored */ - return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; -} - static struct fib6_info * mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) { @@ -5169,7 +5163,7 @@ static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, const struct fib6_info *nrt, bool replace) { - struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL; + struct mlxsw_sp_fib6_entry *fib6_entry; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); @@ -5178,18 +5172,13 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, continue; if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id) break; - if (replace && rt->fib6_metric == nrt->fib6_metric) { - if (mlxsw_sp_fib6_rt_can_mp(rt) == - mlxsw_sp_fib6_rt_can_mp(nrt)) - return fib6_entry; - if (mlxsw_sp_fib6_rt_can_mp(nrt)) - fallback = fallback ?: fib6_entry; - } + if (replace && rt->fib6_metric == nrt->fib6_metric) + return fib6_entry; if (rt->fib6_metric > nrt->fib6_metric) - return fallback ?: fib6_entry; + return fib6_entry; } - return fallback; + return NULL; } static int From 9e25826ffc942e985b8595b2f1cf2065d3880514 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 15 Jun 2018 16:23:38 +0300 Subject: [PATCH 74/81] mlxsw: spectrum_switchdev: Fix port_vlan refcounting Switchdev notifications for addition of SWITCHDEV_OBJ_ID_PORT_VLAN are distributed not only on clean addition, but also when flags on an existing VLAN are changed. mlxsw_sp_bridge_port_vlan_add() calls mlxsw_sp_port_vlan_get() to get at the port_vlan in question, which implicitly references the object. This then leads to discrepancies in reference counting when the VLAN is removed. spectrum.c warns about the problem when the module is removed: [13578.493090] WARNING: CPU: 0 PID: 2454 at drivers/net/ethernet/mellanox/mlxsw/spectrum.c:2973 mlxsw_sp_port_remove+0xfd/0x110 [mlxsw_spectrum] [...] [13578.627106] Call Trace: [13578.629617] mlxsw_sp_fini+0x2a/0xe0 [mlxsw_spectrum] [13578.634748] mlxsw_core_bus_device_unregister+0x3e/0x130 [mlxsw_core] [13578.641290] mlxsw_pci_remove+0x13/0x40 [mlxsw_pci] [13578.646238] pci_device_remove+0x31/0xb0 [13578.650244] device_release_driver_internal+0x14f/0x220 [13578.655562] driver_detach+0x32/0x70 [13578.659183] bus_remove_driver+0x47/0xa0 [13578.663134] pci_unregister_driver+0x1e/0x80 [13578.667486] mlxsw_sp_module_exit+0xc/0x3fa [mlxsw_spectrum] [13578.673207] __x64_sys_delete_module+0x13b/0x1e0 [13578.677888] ? exit_to_usermode_loop+0x78/0x80 [13578.682374] do_syscall_64+0x39/0xe0 [13578.685976] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix by putting the port_vlan when mlxsw_sp_port_vlan_bridge_join() determines it's a flag-only change. Fixes: b3529af6bb0d ("spectrum: Reference count VLAN entries") Signed-off-by: Petr Machata Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index e97652c40d13..eea5666a86b2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1018,8 +1018,10 @@ mlxsw_sp_port_vlan_bridge_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan, int err; /* No need to continue if only VLAN flags were changed */ - if (mlxsw_sp_port_vlan->bridge_port) + if (mlxsw_sp_port_vlan->bridge_port) { + mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan); return 0; + } err = mlxsw_sp_port_vlan_fid_join(mlxsw_sp_port_vlan, bridge_port); if (err) From de9bada5d389903f4faf33980e6a95a2911c7e6d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 15 Jun 2018 15:39:17 +0200 Subject: [PATCH 75/81] l2tp: reject creation of non-PPP sessions on L2TPv2 tunnels The /proc/net/pppol2tp handlers (pppol2tp_seq_*()) iterate over all L2TPv2 tunnels, and rightfully expect that only PPP sessions can be found there. However, l2tp_netlink accepts creating Ethernet sessions regardless of the underlying tunnel version. This confuses pppol2tp_seq_session_show(), which expects that l2tp_session_priv() returns a pppol2tp_session structure. When the session is an Ethernet pseudo-wire, a struct l2tp_eth_sess is returned instead. This leads to invalid memory access when pppol2tp_session_get_sock() later tries to dereference ps->sk. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_netlink.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 6616c9fd292f..5b9900889e31 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -553,6 +553,12 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf goto out_tunnel; } + /* L2TPv2 only accepts PPP pseudo-wires */ + if (tunnel->version == 2 && cfg.pw_type != L2TP_PWTYPE_PPP) { + ret = -EPROTONOSUPPORT; + goto out_tunnel; + } + if (tunnel->version > 2) { if (info->attrs[L2TP_ATTR_DATA_SEQ]) cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]); From ecd012e45ab5fd76ed57546865897ce35920f56b Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 15 Jun 2018 15:39:19 +0200 Subject: [PATCH 76/81] l2tp: filter out non-PPP sessions in pppol2tp_tunnel_ioctl() pppol2tp_tunnel_ioctl() can act on an L2TPv3 tunnel, in which case 'session' may be an Ethernet pseudo-wire. However, pppol2tp_session_ioctl() expects a PPP pseudo-wire, as it assumes l2tp_session_priv() points to a pppol2tp_session structure. For an Ethernet pseudo-wire l2tp_session_priv() points to an l2tp_eth_sess structure instead, making pppol2tp_session_ioctl() access invalid memory. Fixes: d9e31d17ceba ("l2tp: Add L2TP ethernet pseudowire support") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index f429fed06a1e..55188382845c 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1201,7 +1201,7 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel, l2tp_session_get(sock_net(sk), tunnel, stats.session_id); - if (session) { + if (session && session->pwtype == L2TP_PWTYPE_PPP) { err = pppol2tp_session_ioctl(session, cmd, arg); l2tp_session_dec_refcount(session); From a447da7d00410278c90d3576782a43f8b675d7be Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 03:07:45 +0200 Subject: [PATCH 77/81] tls: fix use-after-free in tls_push_record syzkaller managed to trigger a use-after-free in tls like the following: BUG: KASAN: use-after-free in tls_push_record.constprop.15+0x6a2/0x810 [tls] Write of size 1 at addr ffff88037aa08000 by task a.out/2317 CPU: 3 PID: 2317 Comm: a.out Not tainted 4.17.0+ #144 Hardware name: LENOVO 20FBCTO1WW/20FBCTO1WW, BIOS N1FET47W (1.21 ) 11/28/2016 Call Trace: dump_stack+0x71/0xab print_address_description+0x6a/0x280 kasan_report+0x258/0x380 ? tls_push_record.constprop.15+0x6a2/0x810 [tls] tls_push_record.constprop.15+0x6a2/0x810 [tls] tls_sw_push_pending_record+0x2e/0x40 [tls] tls_sk_proto_close+0x3fe/0x710 [tls] ? tcp_check_oom+0x4c0/0x4c0 ? tls_write_space+0x260/0x260 [tls] ? kmem_cache_free+0x88/0x1f0 inet_release+0xd6/0x1b0 __sock_release+0xc0/0x240 sock_close+0x11/0x20 __fput+0x22d/0x660 task_work_run+0x114/0x1a0 do_exit+0x71a/0x2780 ? mm_update_next_owner+0x650/0x650 ? handle_mm_fault+0x2f5/0x5f0 ? __do_page_fault+0x44f/0xa50 ? mm_fault_error+0x2d0/0x2d0 do_group_exit+0xde/0x300 __x64_sys_exit_group+0x3a/0x50 do_syscall_64+0x9a/0x300 ? page_fault+0x8/0x30 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This happened through fault injection where aead_req allocation in tls_do_encryption() eventually failed and we returned -ENOMEM from the function. Turns out that the use-after-free is triggered from tls_sw_sendmsg() in the second tls_push_record(). The error then triggers a jump to waiting for memory in sk_stream_wait_memory() resp. returning immediately in case of MSG_DONTWAIT. What follows is the trim_both_sgl(sk, orig_size), which drops elements from the sg list added via tls_sw_sendmsg(). Now the use-after-free gets triggered when the socket is being closed, where tls_sk_proto_close() callback is invoked. The tls_complete_pending_work() will figure that there's a pending closed tls record to be flushed and thus calls into the tls_push_pending_closed_record() from there. ctx->push_pending_record() is called from the latter, which is the tls_sw_push_pending_record() from sw path. This again calls into tls_push_record(). And here the tls_fill_prepend() will panic since the buffer address has been freed earlier via trim_both_sgl(). One way to fix it is to move the aead request allocation out of tls_do_encryption() early into tls_push_record(). This means we don't prep the tls header and advance state to the TLS_PENDING_CLOSED_RECORD before allocation which could potentially fail happened. That fixes the issue on my side. Fixes: 3c4d7559159b ("tls: kernel TLS support") Reported-by: syzbot+5c74af81c547738e1684@syzkaller.appspotmail.com Reported-by: syzbot+709f2810a6a05f11d4d3@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 34895b7c132d..2945a3bd538c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -191,18 +191,12 @@ static void tls_free_both_sg(struct sock *sk) } static int tls_do_encryption(struct tls_context *tls_ctx, - struct tls_sw_context_tx *ctx, size_t data_len, - gfp_t flags) + struct tls_sw_context_tx *ctx, + struct aead_request *aead_req, + size_t data_len) { - unsigned int req_size = sizeof(struct aead_request) + - crypto_aead_reqsize(ctx->aead_send); - struct aead_request *aead_req; int rc; - aead_req = kzalloc(req_size, flags); - if (!aead_req) - return -ENOMEM; - ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; @@ -219,7 +213,6 @@ static int tls_do_encryption(struct tls_context *tls_ctx, ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; - kfree(aead_req); return rc; } @@ -228,8 +221,14 @@ static int tls_push_record(struct sock *sk, int flags, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct aead_request *req; int rc; + req = kzalloc(sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation); + if (!req) + return -ENOMEM; + sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); @@ -245,15 +244,14 @@ static int tls_push_record(struct sock *sk, int flags, tls_ctx->pending_open_record_frags = 0; set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags); - rc = tls_do_encryption(tls_ctx, ctx, ctx->sg_plaintext_size, - sk->sk_allocation); + rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size); if (rc < 0) { /* If we are called from write_space and * we fail, we need to set this SOCK_NOSPACE * to trigger another write_space in the future. */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - return rc; + goto out_req; } free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, @@ -268,6 +266,8 @@ static int tls_push_record(struct sock *sk, int flags, tls_err_abort(sk, EBADMSG); tls_advance_record_sn(sk, &tls_ctx->tx); +out_req: + kfree(req); return rc; } From 06030dbaf3b6c5801dcdb7fe4fbab3b91c8da84a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 03:07:46 +0200 Subject: [PATCH 78/81] tls: fix waitall behavior in tls_sw_recvmsg Current behavior in tls_sw_recvmsg() is to wait for incoming tls messages and copy up to exactly len bytes of data that the user provided. This is problematic in the sense that i) if no packet is currently queued in strparser we keep waiting until one has been processed and pushed into tls receive layer for tls_wait_data() to wake up and push the decrypted bits to user space. Given after tls decryption, we're back at streaming data, use sock_rcvlowat() hint from tcp socket instead. Retain current behavior with MSG_WAITALL flag and otherwise use the hint target for breaking the loop and returning to application. This is done if currently no ctx->recv_pkt is ready, otherwise continue to process it from our strparser backlog. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Signed-off-by: Daniel Borkmann Acked-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2945a3bd538c..f127fac88acf 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -754,7 +754,7 @@ int tls_sw_recvmsg(struct sock *sk, struct sk_buff *skb; ssize_t copied = 0; bool cmsg = false; - int err = 0; + int target, err = 0; long timeo; flags |= nonblock; @@ -764,6 +764,7 @@ int tls_sw_recvmsg(struct sock *sk, lock_sock(sk); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { bool zc = false; @@ -856,6 +857,9 @@ int tls_sw_recvmsg(struct sock *sk, goto recv_end; } } + /* If we have a new message from strparser, continue now. */ + if (copied >= target && !ctx->recv_pkt) + break; } while (len); recv_end: From 7c099773b08634df9db0f5be40f0fcc06baa2e1b Mon Sep 17 00:00:00 2001 From: Zhouyang Jia Date: Fri, 15 Jun 2018 11:06:17 +0800 Subject: [PATCH 79/81] net: cxgb3: add error handling for sysfs_create_group When sysfs_create_group fails, the lack of error-handling code may cause unexpected results. This patch adds error-handling code after calling sysfs_create_group. Signed-off-by: Zhouyang Jia Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 2edfdbdaae48..7b795edd9d3a 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -3362,10 +3362,17 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) err = sysfs_create_group(&adapter->port[0]->dev.kobj, &cxgb3_attr_group); + if (err) { + dev_err(&pdev->dev, "cannot create sysfs group\n"); + goto out_close_led; + } print_port_info(adapter, ai); return 0; +out_close_led: + t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, 0); + out_free_dev: iounmap(adapter->regs); for (i = ai->nports0 + ai->nports1 - 1; i >= 0; --i) From f6a6f203d507aae3a06a8de79c6f0ecc4658b81c Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 12 Jun 2018 21:26:10 -0700 Subject: [PATCH 80/81] neighbour: skip NTF_EXT_LEARNED entries during forced gc Commit 9ce33e46531d ("neighbour: support for NTF_EXT_LEARNED flag") added support for NTF_EXT_LEARNED for neighbour entries. NTF_EXT_LEARNED entries are neigh entries managed by control plane (eg: Ethernet VPN implementation in FRR routing suite). Periodic gc already excludes these entries. This patch extends it to forced gc which the earlier patch missed. Fixes: 9ce33e46531d ("neighbour: support for NTF_EXT_LEARNED flag") Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/neighbour.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a7a9c3d738ba..8e3fda9e725c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -119,13 +119,14 @@ unsigned long neigh_rand_reach_time(unsigned long base) EXPORT_SYMBOL(neigh_rand_reach_time); -static bool neigh_del(struct neighbour *n, __u8 state, +static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, struct neighbour __rcu **np, struct neigh_table *tbl) { bool retval = false; write_lock(&n->lock); - if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) { + if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && + !(n->flags & flags)) { struct neighbour *neigh; neigh = rcu_dereference_protected(n->next, @@ -157,7 +158,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) while ((n = rcu_dereference_protected(*np, lockdep_is_held(&tbl->lock)))) { if (n == ndel) - return neigh_del(n, 0, np, tbl); + return neigh_del(n, 0, 0, np, tbl); np = &n->next; } return false; @@ -185,7 +186,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) * - nobody refers to it. * - it is not permanent */ - if (neigh_del(n, NUD_PERMANENT, np, tbl)) { + if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, + tbl)) { shrunk = 1; continue; } From 7cfde0af731c14664e3882c7ba77ace1059f2c5e Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Fri, 15 Jun 2018 16:17:27 +0100 Subject: [PATCH 81/81] net: stmmac: Run HWIF Quirks after getting HW caps Currently we were running HWIF quirks before getting HW capabilities. This is not right because some HWIF callbacks depend on HW caps. Lets save the quirks callback and use it in a later stage. This fixes Altera socfpga. Signed-off-by: Jose Abreu Fixes: 5f0456b43140 ("net: stmmac: Implement logic to automatically select HW Interface") Reported-by: Dinh Nguyen Cc: David S. Miller Cc: Joao Pinto Cc: Vitor Soares Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Cc: Dinh Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/hwif.c | 9 ++------- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 +++++++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index 14770fc8865e..1f50e83cafb2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -252,13 +252,8 @@ int stmmac_hwif_init(struct stmmac_priv *priv) return ret; } - /* Run quirks, if needed */ - if (entry->quirks) { - ret = entry->quirks(priv); - if (ret) - return ret; - } - + /* Save quirks, if needed for posterior use */ + priv->hwif_quirks = entry->quirks; return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 025efbf6145c..76649adf8fb0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -129,6 +129,7 @@ struct stmmac_priv { struct net_device *dev; struct device *device; struct mac_device_info *hw; + int (*hwif_quirks)(struct stmmac_priv *priv); struct mutex lock; /* RX Queue */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 5e6d4fe2f4ef..e79b0d7b388a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4135,6 +4135,13 @@ static int stmmac_hw_init(struct stmmac_priv *priv) if (priv->dma_cap.tsoen) dev_info(priv->device, "TSO supported\n"); + /* Run HW quirks, if any */ + if (priv->hwif_quirks) { + ret = priv->hwif_quirks(priv); + if (ret) + return ret; + } + return 0; }