Merge branch 'tcp-introduce-optional-per-netns-ehash'
Kuniyuki Iwashima says:

====================
tcp: Introduce optional per-netns ehash.

The more sockets we have in the hash table, the longer we spend looking
up the socket.  While running a number of small workloads on the same
host, they penalise each other and cause performance degradation.

The root cause might be a single workload that consumes much more
resources than the others.  It often happens on a cloud service where
different workloads share the same computing resource.

On EC2 c5.24xlarge instance (196 GiB memory and 524288 (1Mi / 2) ehash
entries), after running iperf3 in different netns, creating 24Mi sockets
without data transfer in the root netns causes about 10% performance
regression for the iperf3's connection.

 thash_entries		sockets		length		Gbps
	524288		      1		     1		50.7
			   24Mi		    48		45.1

It is basically related to the length of the list of each hash bucket.
For testing purposes to see how performance drops along the length,
I set 131072 (1Mi / 8) to thash_entries, and here's the result.

 thash_entries		sockets		length		Gbps
        131072		      1		     1		50.7
			    1Mi		     8		49.9
			    2Mi		    16		48.9
			    4Mi		    32		47.3
			    8Mi		    64		44.6
			   16Mi		   128		40.6
			   24Mi		   192		36.3
			   32Mi		   256		32.5
			   40Mi		   320		27.0
			   48Mi		   384		25.0

To resolve the socket lookup degradation, we introduce an optional
per-netns hash table for TCP, but it's just ehash, and we still share
the global bhash, bhash2 and lhash2.

With a smaller ehash, we can look up non-listener sockets faster and
isolate such noisy neighbours.  Also, we can reduce lock contention.

For details, please see the last patch.

  patch 1 - 4: prep for per-netns ehash
  patch     5: small optimisation for netns dismantle without TIME_WAIT sockets
  patch     6: add per-netns ehash

Many thanks to Eric Dumazet for reviewing and advising.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
kuba-moo committed Sep 20, 2022
2 parents 17df341 + d1e5e64 commit 4fa37e4
Showing 28 changed files with 361 additions and 161 deletions.
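The bucket lengths in the tables above follow directly from the socket count
divided by thash_entries (the number of ehash buckets): 24Mi sockets over
524288 buckets gives the 48-entry chains measured in the first table. A quick
sanity check of that arithmetic, as a small hypothetical user-space snippet
(not part of the series):

	/* Average ehash chain length ~= established sockets / thash_entries. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long thash_entries = 524288;	/* 1Mi / 2, as in the first table */
		unsigned long sockets = 24UL << 20;	/* 24Mi idle sockets */

		printf("avg chain length: %lu\n", sockets / thash_entries); /* prints 48 */
		return 0;
	}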
29 changes: 29 additions & 0 deletions Documentation/networking/ip-sysctl.rst
@@ -1040,6 +1040,35 @@ tcp_challenge_ack_limit - INTEGER
TCP stack implements per TCP socket limits anyway.
Default: INT_MAX (unlimited)

tcp_ehash_entries - INTEGER
Show the number of hash buckets for TCP sockets in the current
networking namespace.

A negative value means the networking namespace does not own its
hash buckets and shares the initial networking namespace's one.

tcp_child_ehash_entries - INTEGER
Control the number of hash buckets for TCP sockets in the child
networking namespace, which must be set before clone() or unshare().

If the value is not 0, the kernel uses a value rounded up to 2^n
as the actual hash bucket size. 0 is a special value, meaning
the child networking namespace will share the initial networking
namespace's hash buckets.

Note that the child will use the global one in case the kernel
fails to allocate enough memory. In addition, the global hash
buckets are spread over available NUMA nodes, but the allocation
of the child hash table depends on the current process's NUMA
policy, which could result in performance differences.

Note also that the default value of tcp_max_tw_buckets and
tcp_max_syn_backlog depend on the hash bucket size.

Possible values: 0, 2^n (n: 0 - 24 (16Mi))

Default: 0

UDP variables
=============

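A minimal user-space sketch of the workflow described by the new
tcp_child_ehash_entries / tcp_ehash_entries knobs documented above: set the
child size in the parent, create the child namespace, then read back the
resulting table size from inside it. This is not code from the patch set; the
bucket count of 8192 is an arbitrary example, and unsharing the netns needs
CAP_SYS_ADMIN/CAP_NET_ADMIN.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	static void write_sysctl(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0 || write(fd, val, strlen(val)) < 0) {
			perror(path);
			exit(1);
		}
		close(fd);
	}

	int main(void)
	{
		char buf[32] = {};
		int fd;

		/* Must be set before clone()/unshare() creates the child netns. */
		write_sysctl("/proc/sys/net/ipv4/tcp_child_ehash_entries", "8192");

		/* The child namespace allocates its own ehash at creation time. */
		if (unshare(CLONE_NEWNET)) {
			perror("unshare");
			return 1;
		}

		/* Read-only; a positive value means this netns owns its ehash. */
		fd = open("/proc/sys/net/ipv4/tcp_ehash_entries", O_RDONLY);
		if (fd < 0 || read(fd, buf, sizeof(buf) - 1) < 0) {
			perror("tcp_ehash_entries");
			return 1;
		}
		printf("tcp_ehash_entries in child netns: %s", buf);
		close(fd);
		return 0;
	}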
5 changes: 2 additions & 3 deletions drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
@@ -1069,8 +1069,7 @@ static void chtls_pass_accept_rpl(struct sk_buff *skb,
cxgb4_l2t_send(csk->egress_dev, skb, csk->l2t_entry);
}

static void inet_inherit_port(struct inet_hashinfo *hash_info,
struct sock *lsk, struct sock *newsk)
static void inet_inherit_port(struct sock *lsk, struct sock *newsk)
{
local_bh_disable();
__inet_inherit_port(lsk, newsk);
@@ -1240,7 +1239,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk,
ipv4.sysctl_tcp_window_scaling),
tp->window_clamp);
neigh_release(n);
inet_inherit_port(&tcp_hashinfo, lsk, newsk);
inet_inherit_port(lsk, newsk);
csk_set_flag(csk, CSK_CONN_INLINE);
bh_unlock_sock(newsk); /* tcp_create_openreq_child ->sk_clone_lock */

5 changes: 3 additions & 2 deletions drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
@@ -461,6 +461,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb)
{
struct ethhdr *eth = (struct ethhdr *)(skb->data);
struct net_device *netdev = rq->netdev;
struct net *net = dev_net(netdev);
struct sock *sk = NULL;
unsigned int datalen;
struct iphdr *iph;
@@ -475,7 +476,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb)
depth += sizeof(struct iphdr);
th = (void *)iph + sizeof(struct iphdr);

sk = inet_lookup_established(dev_net(netdev), &tcp_hashinfo,
sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->saddr, th->source, iph->daddr,
th->dest, netdev->ifindex);
#if IS_ENABLED(CONFIG_IPV6)
@@ -485,7 +486,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb)
depth += sizeof(struct ipv6hdr);
th = (void *)ipv6h + sizeof(struct ipv6hdr);

sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo,
sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&ipv6h->saddr, th->source,
&ipv6h->daddr, ntohs(th->dest),
netdev->ifindex, 0);
5 changes: 3 additions & 2 deletions drivers/net/ethernet/netronome/nfp/crypto/tls.c
@@ -474,6 +474,7 @@ int nfp_net_tls_rx_resync_req(struct net_device *netdev,
{
struct nfp_net *nn = netdev_priv(netdev);
struct nfp_net_tls_offload_ctx *ntls;
struct net *net = dev_net(netdev);
struct ipv6hdr *ipv6h;
struct tcphdr *th;
struct iphdr *iph;
@@ -494,13 +495,13 @@ int nfp_net_tls_rx_resync_req(struct net_device *netdev,

switch (ipv6h->version) {
case 4:
sk = inet_lookup_established(dev_net(netdev), &tcp_hashinfo,
sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->saddr, th->source, iph->daddr,
th->dest, netdev->ifindex);
break;
#if IS_ENABLED(CONFIG_IPV6)
case 6:
sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo,
sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&ipv6h->saddr, th->source,
&ipv6h->daddr, ntohs(th->dest),
netdev->ifindex, 0);
16 changes: 16 additions & 0 deletions include/net/inet_hashtables.h
@@ -168,8 +168,20 @@ struct inet_hashinfo {
/* The 2nd listener table hashed by local port and address */
unsigned int lhash2_mask;
struct inet_listen_hashbucket *lhash2;

bool pernet;
};

static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IP_DCCP)
return sk->sk_prot->h.hashinfo ? :
sock_net(sk)->ipv4.tcp_death_row.hashinfo;
#else
return sock_net(sk)->ipv4.tcp_death_row.hashinfo;
#endif
}

static inline struct inet_listen_hashbucket *
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
{
@@ -204,6 +216,10 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
}

struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
unsigned int ehash_entries);
void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo);

struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
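The tcp_or_dccp_get_hashinfo() helper added above is what keeps shared call
sites working: DCCP still publishes its global table through
sk->sk_prot->h.hashinfo, while for TCP the (possibly per-netns) table now
comes from net->ipv4.tcp_death_row.hashinfo. A hypothetical lookup site
converted the same way as the driver and BPF changes below might look like
this (illustrative only, not from the series):

	#include <linux/ip.h>
	#include <linux/tcp.h>
	#include <net/inet_hashtables.h>

	/* Take the namespace-owned ehash instead of the global &tcp_hashinfo. */
	static struct sock *example_lookup(struct net *net,
					   const struct iphdr *iph,
					   const struct tcphdr *th,
					   int ifindex)
	{
		struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;

		return inet_lookup_established(net, hinfo,
					       iph->saddr, th->source,
					       iph->daddr, th->dest, ifindex);
	}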
4 changes: 3 additions & 1 deletion include/net/netns/ipv4.h
@@ -34,14 +34,15 @@ struct inet_hashinfo;
struct inet_timewait_death_row {
refcount_t tw_refcount;

/* Padding to avoid false sharing, tw_refcount can be often written */
struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp;
int sysctl_max_tw_buckets;
};

struct tcp_fastopen_context;

struct netns_ipv4 {
struct inet_timewait_death_row *tcp_death_row;
struct inet_timewait_death_row tcp_death_row;

#ifdef CONFIG_SYSCTL
struct ctl_table_header *forw_hdr;
@@ -170,6 +171,7 @@ struct netns_ipv4 {
int sysctl_tcp_pacing_ca_ratio;
int sysctl_tcp_wmem[3];
int sysctl_tcp_rmem[3];
unsigned int sysctl_tcp_child_ehash_entries;
unsigned long sysctl_tcp_comp_sack_delay_ns;
unsigned long sysctl_tcp_comp_sack_slack_ns;
int sysctl_max_syn_backlog;
1 change: 1 addition & 0 deletions include/net/tcp.h
@@ -346,6 +346,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
void tcp_twsk_purge(struct list_head *net_exit_list, int family);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
5 changes: 3 additions & 2 deletions net/core/filter.c
@@ -6373,6 +6373,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
{
struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
bool refcounted = false;
struct sock *sk = NULL;

@@ -6381,7 +6382,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
__be32 dst4 = tuple->ipv4.daddr;

if (proto == IPPROTO_TCP)
sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0,
sk = __inet_lookup(net, hinfo, NULL, 0,
src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &refcounted);
@@ -6395,7 +6396,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;

if (proto == IPPROTO_TCP)
sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0,
sk = __inet6_lookup(net, hinfo, NULL, 0,
src6, tuple->ipv6.sport,
dst6, ntohs(tuple->ipv6.dport),
dif, sdif, &refcounted);
2 changes: 2 additions & 0 deletions net/dccp/proto.c
@@ -1197,6 +1197,8 @@ static int __init dccp_init(void)
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
}

dccp_hashinfo.pernet = false;

rc = dccp_mib_init();
if (rc)
goto out_free_dccp_bhash2;
2 changes: 1 addition & 1 deletion net/ipv4/af_inet.c
@@ -1250,7 +1250,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
}

prev_addr_hashbucket =
inet_bhashfn_portaddr(sk->sk_prot->h.hashinfo, sk,
inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk,
sock_net(sk), inet->inet_num);

inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
3 changes: 2 additions & 1 deletion net/ipv4/esp4.c
@@ -134,6 +134,7 @@ static void esp_free_tcp_sk(struct rcu_head *head)
static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
{
struct xfrm_encap_tmpl *encap = x->encap;
struct net *net = xs_net(x);
struct esp_tcp_sk *esk;
__be16 sport, dport;
struct sock *nsk;
@@ -160,7 +161,7 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
}
spin_unlock_bh(&x->lock);

sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4,
sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4,
dport, x->props.saddr.a4, sport, 0);
if (!sk)
return ERR_PTR(-ENOENT);
22 changes: 10 additions & 12 deletions net/ipv4/inet_connection_sock.c
@@ -285,16 +285,14 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
struct inet_bind2_bucket **tb2_ret,
struct inet_bind_hashbucket **head2_ret, int *port_ret)
{
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int port = 0;
struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
int i, low, high, attempt_half, port, l3mdev;
struct inet_bind_hashbucket *head, *head2;
struct net *net = sock_net(sk);
bool relax = false;
int i, low, high, attempt_half;
struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
u32 remaining, offset;
int l3mdev;
bool relax = false;

l3mdev = inet_sk_bound_l3mdev(sk);
ports_exhausted:
@@ -469,17 +467,16 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int ret = 1, port = snum;
struct net *net = sock_net(sk);
bool found_port = false, check_bind_conflict = true;
bool bhash_created = false, bhash2_created = false;
struct inet_bind_hashbucket *head, *head2;
struct inet_bind2_bucket *tb2 = NULL;
struct inet_bind_bucket *tb = NULL;
bool head2_lock_acquired = false;
int l3mdev;
int ret = 1, port = snum, l3mdev;
struct net *net = sock_net(sk);

l3mdev = inet_sk_bound_l3mdev(sk);

@@ -909,14 +906,15 @@ static void reqsk_migrate_reset(struct request_sock *req)
/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock *req)
{
struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
struct sock *sk = req_to_sk(req);
bool found = false;

if (sk_hashed(req_to_sk(req))) {
if (sk_hashed(sk)) {
struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);

spin_lock(lock);
found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
found = __sk_nulls_del_node_init_rcu(sk);
spin_unlock(lock);
}
if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))