Skip to content

Commit

Permalink
Merge tag 'rds-odp-for-5.5' of https://git.kernel.org/pub/scm/linux/k…
Browse files Browse the repository at this point in the history
…ernel/git/leon/linux-rdma

Leon Romanovsky says:

====================
Use ODP MRs for kernel ULPs

The following series extends MR creation routines to allow creation of
user MRs through kernel ULPs as a proxy. The immediate use case is to
allow RDS to work over FS-DAX, which requires ODP (on-demand-paging)
MRs to be created and such MRs were not possible to create prior this
series.

The first part of this patchset extends RDMA to have special verb
ib_reg_user_mr(). The common use case that uses this function is a
userspace application that allocates memory for HCA access but the
responsibility to register the memory at the HCA is on an kernel ULP.
This ULP acts as an agent for the userspace application.

The second part provides advise MR functionality for ULPs. This is
integral part of ODP flows and used to trigger pagefaults in advance
to prepare memory before running working set.

The third part is actual user of those in-kernel APIs.
====================

Signed-off-by: David S. Miller <[email protected]>
  • Loading branch information
davem330 committed Jan 21, 2020
2 parents 17e10a1 + b2dfc67 commit ad06307
Show file tree
Hide file tree
Showing 45 changed files with 561 additions and 256 deletions.
27 changes: 9 additions & 18 deletions drivers/infiniband/core/umem.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,14 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz);
/**
* ib_umem_get - Pin and DMA map userspace memory.
*
* @udata: userspace context to pin memory for
* @device: IB device to connect UMEM
* @addr: userspace virtual address to start at
* @size: length of region to pin
* @access: IB_ACCESS_xxx flags for memory being pinned
*/
struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
size_t size, int access)
{
struct ib_ucontext *context;
struct ib_umem *umem;
struct page **page_list;
unsigned long lock_limit;
Expand All @@ -201,14 +200,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
struct scatterlist *sg;
unsigned int gup_flags = FOLL_WRITE;

if (!udata)
return ERR_PTR(-EIO);

context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
->context;
if (!context)
return ERR_PTR(-EIO);

/*
* If the combination of the addr and size requested for this memory
* region causes an integer overflow, return error.
Expand All @@ -226,7 +217,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
umem = kzalloc(sizeof(*umem), GFP_KERNEL);
if (!umem)
return ERR_PTR(-ENOMEM);
umem->ibdev = context->device;
umem->ibdev = device;
umem->length = size;
umem->address = addr;
umem->writable = ib_access_writable(access);
Expand Down Expand Up @@ -281,18 +272,18 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
npages -= ret;

sg = ib_umem_add_sg_table(sg, page_list, ret,
dma_get_max_seg_size(context->device->dma_device),
dma_get_max_seg_size(device->dma_device),
&umem->sg_nents);

up_read(&mm->mmap_sem);
}

sg_mark_end(sg);

umem->nmap = ib_dma_map_sg(context->device,
umem->sg_head.sgl,
umem->sg_nents,
DMA_BIDIRECTIONAL);
umem->nmap = ib_dma_map_sg(device,
umem->sg_head.sgl,
umem->sg_nents,
DMA_BIDIRECTIONAL);

if (!umem->nmap) {
ret = -ENOMEM;
Expand All @@ -303,7 +294,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
goto out;

umem_release:
__ib_umem_release(context->device, umem, 0);
__ib_umem_release(device, umem, 0);
vma:
atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
Expand Down
29 changes: 7 additions & 22 deletions drivers/infiniband/core/umem_odp.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,30 +110,24 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
* They exist only to hold the per_mm reference to help the driver create
* children umems.
*
* @udata: udata from the syscall being used to create the umem
* @device: IB device to create UMEM
* @access: ib_reg_mr access flags
*/
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
int access)
{
struct ib_ucontext *context =
container_of(udata, struct uverbs_attr_bundle, driver_udata)
->context;
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
int ret;

if (access & IB_ACCESS_HUGETLB)
return ERR_PTR(-EINVAL);

if (!context)
return ERR_PTR(-EIO);

umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
if (!umem_odp)
return ERR_PTR(-ENOMEM);
umem = &umem_odp->umem;
umem->ibdev = context->device;
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
umem_odp->is_implicit_odp = 1;
Expand Down Expand Up @@ -201,7 +195,7 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
/**
* ib_umem_odp_get - Create a umem_odp for a userspace va
*
* @udata: userspace context to pin memory for
* @device: IB device struct to get UMEM
* @addr: userspace virtual address to start at
* @size: length of region to pin
* @access: IB_ACCESS_xxx flags for memory being pinned
Expand All @@ -210,31 +204,22 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
* pinning, instead, stores the mm for future page fault handling in
* conjunction with MMU notifiers.
*/
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
size_t size, int access,
struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
unsigned long addr, size_t size, int access,
const struct mmu_interval_notifier_ops *ops)
{
struct ib_umem_odp *umem_odp;
struct ib_ucontext *context;
struct mm_struct *mm;
int ret;

if (!udata)
return ERR_PTR(-EIO);

context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
->context;
if (!context)
return ERR_PTR(-EIO);

if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
return ERR_PTR(-EINVAL);

umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
if (!umem_odp)
return ERR_PTR(-ENOMEM);

umem_odp->umem.ibdev = context->device;
umem_odp->umem.ibdev = device;
umem_odp->umem.length = size;
umem_odp->umem.address = addr;
umem_odp->umem.writable = ib_access_writable(access);
Expand Down
41 changes: 41 additions & 0 deletions drivers/infiniband/core/verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1990,6 +1990,47 @@ EXPORT_SYMBOL(ib_resize_cq);

/* Memory regions */

struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags)
{
struct ib_mr *mr;

if (access_flags & IB_ACCESS_ON_DEMAND) {
if (!(pd->device->attrs.device_cap_flags &
IB_DEVICE_ON_DEMAND_PAGING)) {
pr_debug("ODP support not available\n");
return ERR_PTR(-EINVAL);
}
}

mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
access_flags, NULL);

if (IS_ERR(mr))
return mr;

mr->device = pd->device;
mr->pd = pd;
mr->dm = NULL;
atomic_inc(&pd->usecnt);
mr->res.type = RDMA_RESTRACK_MR;
rdma_restrack_kadd(&mr->res);

return mr;
}
EXPORT_SYMBOL(ib_reg_user_mr);

int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
if (!pd->device->ops.advise_mr)
return -EOPNOTSUPP;

return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
NULL);
}
EXPORT_SYMBOL(ib_advise_mr);

int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
{
struct ib_pd *pd = mr->pd;
Expand Down
12 changes: 7 additions & 5 deletions drivers/infiniband/hw/bnxt_re/ib_verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -837,7 +837,8 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
bytes += (qplib_qp->sq.max_wqe * psn_sz);
}
bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE);
umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem))
return PTR_ERR(umem);

Expand All @@ -850,7 +851,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
if (!qp->qplib_qp.srq) {
bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.qprva, bytes,
umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem))
goto rqfail;
Expand Down Expand Up @@ -1304,7 +1305,8 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,

bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE);
umem = ib_umem_get(&rdev->ibdev, ureq.srqva, bytes,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem))
return PTR_ERR(umem);

Expand Down Expand Up @@ -2545,7 +2547,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
goto fail;
}

cq->umem = ib_umem_get(udata, req.cq_va,
cq->umem = ib_umem_get(&rdev->ibdev, req.cq_va,
entries * sizeof(struct cq_base),
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(cq->umem)) {
Expand Down Expand Up @@ -3514,7 +3516,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
/* The fixed portion of the rkey is the same as the lkey */
mr->ib_mr.rkey = mr->qplib_mr.rkey;

umem = ib_umem_get(udata, start, length, mr_access_flags);
umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags);
if (IS_ERR(umem)) {
dev_err(rdev_to_dev(rdev), "Failed to get umem");
rc = -EFAULT;
Expand Down
2 changes: 1 addition & 1 deletion drivers/infiniband/hw/cxgb4/mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,

mhp->rhp = rhp;

mhp->umem = ib_umem_get(udata, start, length, acc);
mhp->umem = ib_umem_get(pd->device, start, length, acc);
if (IS_ERR(mhp->umem))
goto err_free_skb;

Expand Down
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/efa/efa_verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1358,7 +1358,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
int inline_size;
int err;

if (udata->inlen &&
if (udata && udata->inlen &&
!ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
ibdev_dbg(&dev->ibdev,
"Incompatible ABI params, udata not cleared\n");
Expand All @@ -1384,7 +1384,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
goto err_out;
}

mr->umem = ib_umem_get(udata, start, length, access_flags);
mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem);
ibdev_dbg(&dev->ibdev,
Expand Down
2 changes: 1 addition & 1 deletion drivers/infiniband/hw/hns/hns_roce_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
u32 npages;
int ret;

*umem = ib_umem_get(udata, ucmd.buf_addr, buf->size,
*umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, buf->size,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(*umem))
return PTR_ERR(*umem);
Expand Down
3 changes: 2 additions & 1 deletion drivers/infiniband/hw/hns/hns_roce_db.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context,

refcount_set(&page->refcount, 1);
page->user_virt = page_addr;
page->umem = ib_umem_get(udata, page_addr, PAGE_SIZE, 0);
page->umem = ib_umem_get(context->ibucontext.device, page_addr,
PAGE_SIZE, 0);
if (IS_ERR(page->umem)) {
ret = PTR_ERR(page->umem);
kfree(page);
Expand Down
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/hns/hns_roce_mr.c
Original file line number Diff line number Diff line change
Expand Up @@ -1145,7 +1145,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr)
return ERR_PTR(-ENOMEM);

mr->umem = ib_umem_get(udata, start, length, access_flags);
mr->umem = ib_umem_get(pd->device, start, length, access_flags);
if (IS_ERR(mr->umem)) {
ret = PTR_ERR(mr->umem);
goto err_free;
Expand Down Expand Up @@ -1230,7 +1230,7 @@ static int rereg_mr_trans(struct ib_mr *ibmr, int flags,
}
ib_umem_release(mr->umem);

mr->umem = ib_umem_get(udata, start, length, mr_access_flags);
mr->umem = ib_umem_get(ibmr->device, start, length, mr_access_flags);
if (IS_ERR(mr->umem)) {
ret = PTR_ERR(mr->umem);
mr->umem = NULL;
Expand Down
2 changes: 1 addition & 1 deletion drivers/infiniband/hw/hns/hns_roce_qp.c
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
goto err_alloc_rq_inline_buf;
}

hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr,
hr_qp->umem = ib_umem_get(ib_pd->device, ucmd.buf_addr,
hr_qp->buff_size, 0);
if (IS_ERR(hr_qp->umem)) {
dev_err(dev, "ib_umem_get error for create qp\n");
Expand Down
5 changes: 3 additions & 2 deletions drivers/infiniband/hw/hns/hns_roce_srq.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
return -EFAULT;

srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0);
srq->umem =
ib_umem_get(srq->ibsrq.device, ucmd.buf_addr, srq_buf_size, 0);
if (IS_ERR(srq->umem))
return PTR_ERR(srq->umem);

Expand All @@ -205,7 +206,7 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
goto err_user_srq_mtt;

/* config index queue BA */
srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
srq->idx_que.umem = ib_umem_get(srq->ibsrq.device, ucmd.que_addr,
srq->idx_que.buf_size, 0);
if (IS_ERR(srq->idx_que.umem)) {
dev_err(hr_dev->dev, "ib_umem_get error for index queue\n");
Expand Down
5 changes: 4 additions & 1 deletion drivers/infiniband/hw/i40iw/i40iw_verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1756,12 +1756,15 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
int ret;
int pg_shift;

if (!udata)
return ERR_PTR(-EOPNOTSUPP);

if (iwdev->closing)
return ERR_PTR(-ENODEV);

if (length > I40IW_MAX_MR_SIZE)
return ERR_PTR(-EINVAL);
region = ib_umem_get(udata, start, length, acc);
region = ib_umem_get(pd->device, start, length, acc);
if (IS_ERR(region))
return (struct ib_mr *)region;

Expand Down
2 changes: 1 addition & 1 deletion drivers/infiniband/hw/mlx4/cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata,
int shift;
int n;

*umem = ib_umem_get(udata, buf_addr, cqe * cqe_size,
*umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(*umem))
return PTR_ERR(*umem);
Expand Down
3 changes: 2 additions & 1 deletion drivers/infiniband/hw/mlx4/doorbell.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,

page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0;
page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0);
page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
PAGE_SIZE, 0);
if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem);
kfree(page);
Expand Down
Loading

0 comments on commit ad06307

Please sign in to comment.