Skip to content

Commit

Permalink
rds: Use RCU for the bind lookup searches
Browse files Browse the repository at this point in the history
The RDS bind lookups are somewhat expensive in terms of CPU
time and locking overhead.  This commit changes them into a
faster RCU-based hash table instead of the rbtree they were using
before.

On large NUMA systems it is a significant improvement.

Signed-off-by: Chris Mason <[email protected]>
  • Loading branch information
chrismason-xx authored and Andy Grover committed Sep 9, 2010
1 parent e4c52c9 commit 38a4e5e
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 45 deletions.
8 changes: 8 additions & 0 deletions net/rds/af_rds.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,15 @@ static int rds_release(struct socket *sock)
* with the socket. */
rds_clear_recv_queue(rs);
rds_cong_remove_socket(rs);

/*
* the binding lookup hash uses rcu, so we need to
* make sure we synchronize_rcu before we free our
* entry
*/
rds_remove_bound(rs);
synchronize_rcu();

rds_send_drop_to(rs, NULL);
rds_rdma_drop_keys(rs);
rds_notify_queue_get(rs, NULL);
Expand Down
90 changes: 46 additions & 44 deletions net/rds/bind.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,45 +34,52 @@
#include <net/sock.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/jhash.h>
#include "rds.h"

/*
* XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
* particularly zippy.
*
* This is now called for every incoming frame so we arguably care much more
* about it than we used to.
*/
static DEFINE_RWLOCK(rds_bind_lock);
static struct rb_root rds_bind_tree = RB_ROOT;
#define BIND_HASH_SIZE 1024
static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
static DEFINE_SPINLOCK(rds_bind_lock);

/*
 * Map an (address, port) pair to its bucket in bind_hash_table.
 *
 * The two values are mixed with jhash_2words() (initval 0) and the result
 * is masked with BIND_HASH_SIZE - 1 to pick a slot, so the mask covers the
 * whole table only while BIND_HASH_SIZE stays a power of two.
 *
 * Returns a pointer to the selected hlist_head; never NULL.
 */
static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
{
	u32 hash = jhash_2words((u32)addr, (u32)port, 0);
	u32 slot = hash & (BIND_HASH_SIZE - 1);

	return &bind_hash_table[slot];
}

static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
struct rds_sock *insert)
static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
struct rds_sock *insert)
{
struct rb_node **p = &rds_bind_tree.rb_node;
struct rb_node *parent = NULL;
struct rds_sock *rs;
struct hlist_node *node;
struct hlist_head *head = hash_to_bucket(addr, port);
u64 cmp;
u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);

while (*p) {
parent = *p;
rs = rb_entry(parent, struct rds_sock, rs_bound_node);

rcu_read_lock();
hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
be16_to_cpu(rs->rs_bound_port);

if (needle < cmp)
p = &(*p)->rb_left;
else if (needle > cmp)
p = &(*p)->rb_right;
else
if (cmp == needle) {
rcu_read_unlock();
return rs;
}
}
rcu_read_unlock();

if (insert) {
rb_link_node(&insert->rs_bound_node, parent, p);
rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
/*
* make sure our addr and port are set before
* we are added to the list; rcu readers
* will find us as soon as the
* hlist_add_head_rcu is done
*/
insert->rs_bound_addr = addr;
insert->rs_bound_port = port;
rds_sock_addref(insert);

hlist_add_head_rcu(&insert->rs_bound_node, head);
}
return NULL;
}
Expand All @@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
struct rds_sock *rs;
unsigned long flags;

read_lock_irqsave(&rds_bind_lock, flags);
rs = rds_bind_tree_walk(addr, port, NULL);
rs = rds_bind_lookup(addr, port, NULL);

if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
read_unlock_irqrestore(&rds_bind_lock, flags);

rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
ntohs(port));
Expand All @@ -116,28 +121,21 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
last = rover - 1;
}

write_lock_irqsave(&rds_bind_lock, flags);
spin_lock_irqsave(&rds_bind_lock, flags);

do {
if (rover == 0)
rover++;
if (!rds_bind_tree_walk(addr, cpu_to_be16(rover), rs)) {
*port = cpu_to_be16(rover);
if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
*port = rs->rs_bound_port;
ret = 0;
rdsdebug("rs %p binding to %pI4:%d\n",
rs, &addr, (int)ntohs(*port));
break;
}
} while (rover++ != last);

if (ret == 0) {
rs->rs_bound_addr = addr;
rs->rs_bound_port = *port;
rds_sock_addref(rs);

rdsdebug("rs %p binding to %pI4:%d\n",
rs, &addr, (int)ntohs(*port));
}

write_unlock_irqrestore(&rds_bind_lock, flags);
spin_unlock_irqrestore(&rds_bind_lock, flags);

return ret;
}
Expand All @@ -146,19 +144,19 @@ void rds_remove_bound(struct rds_sock *rs)
{
unsigned long flags;

write_lock_irqsave(&rds_bind_lock, flags);
spin_lock_irqsave(&rds_bind_lock, flags);

if (rs->rs_bound_addr) {
rdsdebug("rs %p unbinding from %pI4:%d\n",
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port));

rb_erase(&rs->rs_bound_node, &rds_bind_tree);
hlist_del_init_rcu(&rs->rs_bound_node);
rds_sock_put(rs);
rs->rs_bound_addr = 0;
}

write_unlock_irqrestore(&rds_bind_lock, flags);
spin_unlock_irqrestore(&rds_bind_lock, flags);
}

int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
Expand Down Expand Up @@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)

out:
release_sock(sk);

/* we might have called rds_remove_bound on error */
if (ret)
synchronize_rcu();
return ret;
}
2 changes: 2 additions & 0 deletions net/rds/ib_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
goto out_no_cigar;
}

memset(ibmr, 0, sizeof(*ibmr));

ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
Expand Down
2 changes: 1 addition & 1 deletion net/rds/rds.h
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ struct rds_sock {
* bound_addr used for both incoming and outgoing, no INADDR_ANY
* support.
*/
struct rb_node rs_bound_node;
struct hlist_node rs_bound_node;
__be32 rs_bound_addr;
__be32 rs_conn_addr;
__be16 rs_bound_port;
Expand Down

0 comments on commit 38a4e5e

Please sign in to comment.