Skip to content

Commit

Permalink
RDS: IB: split mr pool to improve 8K messages performance
Browse files Browse the repository at this point in the history
8K message sizes are a pretty important use case for current RDS
workloads, so we make provision to have 8K MRs available from the pool.
Based on the number of SGs in the RDS message, we pick a pool to use.

Also, to make sure that we don't underutilise MRs when, say, 8K messages
are dominating, which could lead to the 8K pool being exhausted, we fall
back to the 1M pool until the 8K pool recovers for use.

This helps to at least push ~55 kB/s bidirectional data which
is a nice improvement.

Signed-off-by: Santosh Shilimkar <[email protected]>
Signed-off-by: Santosh Shilimkar <[email protected]>
  • Loading branch information
SantoshShilimkar committed Oct 5, 2015
1 parent 41a4e96 commit 0676651
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 62 deletions.
47 changes: 33 additions & 14 deletions net/rds/ib.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,14 @@
#include "rds.h"
#include "ib.h"

static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;

module_param(fmr_pool_size, int, 0444);
MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
module_param(fmr_message_size, int, 0444);
MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
module_param(rds_ib_fmr_1m_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
module_param(rds_ib_fmr_8k_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");

Expand Down Expand Up @@ -97,8 +97,10 @@ static void rds_ib_dev_free(struct work_struct *work)
struct rds_ib_device *rds_ibdev = container_of(work,
struct rds_ib_device, free_work);

if (rds_ibdev->mr_pool)
rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
if (rds_ibdev->mr_8k_pool)
rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
if (rds_ibdev->mr_1m_pool)
rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
if (rds_ibdev->pd)
ib_dealloc_pd(rds_ibdev->pd);

Expand Down Expand Up @@ -148,9 +150,13 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);

rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
rds_ibdev->max_fmrs = dev_attr->max_mr ?
min_t(unsigned int, dev_attr->max_mr, fmr_pool_size) :
fmr_pool_size;
rds_ibdev->max_1m_fmrs = dev_attr->max_mr ?
min_t(unsigned int, (dev_attr->max_mr / 2),
rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;

rds_ibdev->max_8k_fmrs = dev_attr->max_mr ?
min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE),
rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;

rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
Expand All @@ -162,12 +168,25 @@ static void rds_ib_add_one(struct ib_device *device)
goto put_dev;
}

rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
if (IS_ERR(rds_ibdev->mr_pool)) {
rds_ibdev->mr_pool = NULL;
rds_ibdev->mr_1m_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
if (IS_ERR(rds_ibdev->mr_1m_pool)) {
rds_ibdev->mr_1m_pool = NULL;
goto put_dev;
}

rds_ibdev->mr_8k_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL);
if (IS_ERR(rds_ibdev->mr_8k_pool)) {
rds_ibdev->mr_8k_pool = NULL;
goto put_dev;
}

rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
rds_ibdev->max_8k_fmrs);

INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);

Expand Down
43 changes: 31 additions & 12 deletions net/rds/ib.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
#include "rds.h"
#include "rdma_transport.h"

#define RDS_FMR_SIZE 256
#define RDS_FMR_POOL_SIZE 8192
#define RDS_FMR_1M_POOL_SIZE (8192 / 2)
#define RDS_FMR_1M_MSG_SIZE 256
#define RDS_FMR_8K_MSG_SIZE 2
#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1))
#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))

#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
Expand Down Expand Up @@ -189,15 +192,23 @@ struct rds_ib_ipaddr {
struct rcu_head rcu;
};

/*
 * MR pool type selector.
 *
 * Passed as the pool_type argument of rds_ib_create_mr_pool() and stored in
 * the pool's pool_type field; later code compares against these values to
 * pick the matching per-pool statistics counter.
 */
enum {
/* Pool created with fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1 */
RDS_IB_MR_8K_POOL,
/* Pool created with fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1 */
RDS_IB_MR_1M_POOL,
};

struct rds_ib_device {
struct list_head list;
struct list_head ipaddr_list;
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
struct rds_ib_mr_pool *mr_pool;
unsigned int fmr_max_remaps;
unsigned int max_fmrs;
struct rds_ib_mr_pool *mr_1m_pool;
struct rds_ib_mr_pool *mr_8k_pool;
unsigned int fmr_max_remaps;
unsigned int max_8k_fmrs;
unsigned int max_1m_fmrs;
int max_sge;
unsigned int max_wrs;
unsigned int max_initiator_depth;
Expand Down Expand Up @@ -239,12 +250,18 @@ struct rds_ib_statistics {
uint64_t s_ib_ack_send_delayed;
uint64_t s_ib_ack_send_piggybacked;
uint64_t s_ib_ack_received;
uint64_t s_ib_rdma_mr_alloc;
uint64_t s_ib_rdma_mr_free;
uint64_t s_ib_rdma_mr_used;
uint64_t s_ib_rdma_mr_pool_flush;
uint64_t s_ib_rdma_mr_pool_wait;
uint64_t s_ib_rdma_mr_pool_depleted;
uint64_t s_ib_rdma_mr_8k_alloc;
uint64_t s_ib_rdma_mr_8k_free;
uint64_t s_ib_rdma_mr_8k_used;
uint64_t s_ib_rdma_mr_8k_pool_flush;
uint64_t s_ib_rdma_mr_8k_pool_wait;
uint64_t s_ib_rdma_mr_8k_pool_depleted;
uint64_t s_ib_rdma_mr_1m_alloc;
uint64_t s_ib_rdma_mr_1m_free;
uint64_t s_ib_rdma_mr_1m_used;
uint64_t s_ib_rdma_mr_1m_pool_flush;
uint64_t s_ib_rdma_mr_1m_pool_wait;
uint64_t s_ib_rdma_mr_1m_pool_depleted;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
};
Expand Down Expand Up @@ -296,7 +313,8 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;

extern unsigned int fmr_message_size;
extern unsigned int rds_ib_fmr_1m_pool_size;
extern unsigned int rds_ib_fmr_8k_pool_size;
extern unsigned int rds_ib_retry_count;

extern spinlock_t ib_nodev_conns_lock;
Expand Down Expand Up @@ -326,7 +344,8 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
int npages);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
Expand Down
101 changes: 71 additions & 30 deletions net/rds/ib_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ struct rds_ib_mr {
* Our own little FMR pool
*/
struct rds_ib_mr_pool {
unsigned int pool_type;
struct mutex flush_lock; /* serialize fmr invalidate */
struct delayed_work flush_worker; /* flush worker */

Expand Down Expand Up @@ -234,43 +235,47 @@ void rds_ib_destroy_nodev_conns(void)
rds_conn_destroy(ic->conn);
}

struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
int pool_type)
{
struct rds_ib_mr_pool *pool;

pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
return ERR_PTR(-ENOMEM);

pool->pool_type = pool_type;
init_llist_head(&pool->free_list);
init_llist_head(&pool->drop_list);
init_llist_head(&pool->clean_list);
mutex_init(&pool->flush_lock);
init_waitqueue_head(&pool->flush_wait);
INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);

pool->fmr_attr.max_pages = fmr_message_size;
if (pool_type == RDS_IB_MR_1M_POOL) {
/* +1 allows for unaligned MRs */
pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
pool->max_items = RDS_FMR_1M_POOL_SIZE;
} else {
/* pool_type == RDS_IB_MR_8K_POOL */
pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
pool->max_items = RDS_FMR_8K_POOL_SIZE;
}

pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
pool->fmr_attr.page_shift = PAGE_SHIFT;
pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;

/* We never allow more than max_items MRs to be allocated.
* When we exceed more than max_items_soft, we start freeing
* items more aggressively.
* Make sure that max_items > max_items_soft > max_items / 2
*/
pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
pool->max_items = rds_ibdev->max_fmrs;

return pool;
}

void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;

iinfo->rdma_mr_max = pool->max_items;
iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
iinfo->rdma_mr_max = pool_1m->max_items;
iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
}

void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
Expand Down Expand Up @@ -312,15 +317,29 @@ static inline void wait_clean_list_grace(void)
}
}

static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
int npages)
{
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
struct rds_ib_mr_pool *pool;
struct rds_ib_mr *ibmr = NULL;
int err = 0, iter = 0;

if (npages <= RDS_FMR_8K_MSG_SIZE)
pool = rds_ibdev->mr_8k_pool;
else
pool = rds_ibdev->mr_1m_pool;

if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);

/* Switch pools if one of the pool is reaching upper limit */
if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) {
if (pool->pool_type == RDS_IB_MR_8K_POOL)
pool = rds_ibdev->mr_1m_pool;
else
pool = rds_ibdev->mr_8k_pool;
}

while (1) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr)
Expand All @@ -341,12 +360,18 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
atomic_dec(&pool->item_count);

if (++iter > 2) {
rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
if (pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
return ERR_PTR(-EAGAIN);
}

/* We do have some empty MRs. Flush them out. */
rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
if (pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
rds_ib_flush_mr_pool(pool, 0, &ibmr);
if (ibmr)
return ibmr;
Expand All @@ -371,7 +396,12 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
goto out_no_cigar;
}

rds_ib_stats_inc(s_ib_rdma_mr_alloc);
ibmr->pool = pool;
if (pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);

return ibmr;

out_no_cigar:
Expand Down Expand Up @@ -427,7 +457,7 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
}

page_cnt += len >> PAGE_SHIFT;
if (page_cnt > fmr_message_size)
if (page_cnt > ibmr->pool->fmr_attr.max_pages)
return -EINVAL;

dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
Expand Down Expand Up @@ -459,7 +489,10 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
ibmr->sg_dma_len = sg_dma_len;
ibmr->remap_count++;

rds_ib_stats_inc(s_ib_rdma_mr_used);
if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
ret = 0;

out:
Expand Down Expand Up @@ -591,7 +624,7 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
* to free as many MRs as needed to get back to this limit.
*/
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
int free_all, struct rds_ib_mr **ibmr_ret)
int free_all, struct rds_ib_mr **ibmr_ret)
{
struct rds_ib_mr *ibmr, *next;
struct llist_node *clean_nodes;
Expand All @@ -602,11 +635,14 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
int ret = 0;

rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
if (pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush);

if (ibmr_ret) {
DEFINE_WAIT(wait);
while(!mutex_trylock(&pool->flush_lock)) {
while (!mutex_trylock(&pool->flush_lock)) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
Expand Down Expand Up @@ -663,8 +699,12 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
unpinned += ibmr->sg_len;
__rds_ib_teardown_mr(ibmr);
if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
rds_ib_stats_inc(s_ib_rdma_mr_free);
if (nfreed < free_goal ||
ibmr->remap_count >= pool->fmr_attr.max_maps) {
if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
list_del(&ibmr->unmap_list);
ib_dealloc_fmr(ibmr->fmr);
kfree(ibmr);
Expand Down Expand Up @@ -756,10 +796,11 @@ void rds_ib_flush_mrs(void)

down_read(&rds_ib_devices_lock);
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
if (rds_ibdev->mr_8k_pool)
rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL);

if (pool)
rds_ib_flush_mr_pool(pool, 0, NULL);
if (rds_ibdev->mr_1m_pool)
rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL);
}
up_read(&rds_ib_devices_lock);
}
Expand All @@ -777,12 +818,12 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
goto out;
}

if (!rds_ibdev->mr_pool) {
if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
ret = -ENODEV;
goto out;
}

ibmr = rds_ib_alloc_fmr(rds_ibdev);
ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
if (IS_ERR(ibmr)) {
rds_ib_dev_put(rds_ibdev);
return ibmr;
Expand Down
Loading

0 comments on commit 0676651

Please sign in to comment.