hw/rdma: Add support for MAD packets
MAD (Management Datagram) packets are widely used by various modules,
both in kernel and in user space. For example, the rdma_* API, which is
used to create and maintain a "connection" layer on top of RDMA, uses
several types of MAD packets.

For more information, please refer to chapter 13.4 of the InfiniBand
Architecture Specification, Volume 1, Release 1.1, available here:
https://www.infinibandta.org/ibta-specifications-download/

To support MAD packets, the device uses an external utility
(contrib/rdmacm-mux) to relay packets to and from the guest driver.
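
A note on the relay protocol implemented below (see mad_init() and
mad_send() in the diff): each message exchanged with the mux is a
backend_umad, i.e. a umad header optionally followed by the MAD payload.
The following stand-alone sketch mirrors only what this commit sends; the
socket path and the dummy GID are illustrative assumptions, and the real
device talks to the mux through a QEMU chardev rather than a raw socket.

    /* Hypothetical stand-alone rdmacm-mux client -- a sketch of the wire
     * format only. MUX_SOCK_PATH and the GID below are made up; the real
     * device goes through a QEMU chardev (see mad_init()/mad_send()). */
    #include <endian.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <sys/un.h>
    #include <infiniband/umad.h>      /* struct ib_user_mad */
    #include <rdma/rdma_user_cm.h>    /* RDMA_MAX_PRIVATE_DATA */

    #define MUX_SOCK_PATH "/var/run/rdmacm-mux"   /* assumption */

    struct backend_umad {             /* same layout as hw/rdma/rdma_backend.c */
        struct ib_user_mad hdr;
        char mad[RDMA_MAX_PRIVATE_DATA];
    };

    int main(void)
    {
        struct sockaddr_un addr = { .sun_family = AF_UNIX };
        struct backend_umad umad = {0};
        uint8_t gid[16] = { 0xfe, 0x80 };         /* dummy link-local GID */
        int fd = socket(AF_UNIX, SOCK_STREAM, 0);

        strncpy(addr.sun_path, MUX_SOCK_PATH, sizeof(addr.sun_path) - 1);
        if (fd < 0 || connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
            perror("connect");
            return 1;
        }

        /* Registration: header-only message carrying our GID (cf. mad_init) */
        memcpy(umad.hdr.addr.gid, gid, sizeof(umad.hdr.addr.gid));
        if (write(fd, &umad, sizeof(umad.hdr)) != (ssize_t)sizeof(umad.hdr)) {
            perror("register");
            return 1;
        }

        /* Relay: full struct; hdr.length is the MAD payload size (cf. mad_send) */
        umad.hdr.length = 64;                     /* example payload size */
        umad.hdr.addr.qpn = htobe32(1);           /* QP1, the GSI */
        umad.hdr.addr.grh_present = 1;
        umad.hdr.addr.hop_limit = 0xFF;
        if (write(fd, &umad, sizeof(umad)) != (ssize_t)sizeof(umad)) {
            perror("send");
            return 1;
        }

        close(fd);
        return 0;
    }

The mux relays MADs that arrive for a registered GID back over the same
connection as backend_umad messages; mad_read() then copies each one into
a guest buffer previously parked by save_mad_recv_buffer().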

Signed-off-by: Yuval Shaia <[email protected]>
Reviewed-by: Marcel Apfelbaum <[email protected]>
Signed-off-by: Marcel Apfelbaum <[email protected]>
Yuval Shaia authored and marcel-apf committed Dec 22, 2018
1 parent 305bdd7 commit 605ec16
Showing 5 changed files with 260 additions and 10 deletions.
250 changes: 243 additions & 7 deletions hw/rdma/rdma_backend.c
@@ -16,8 +16,13 @@
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qmp/qlist.h"
#include "qapi/qmp/qnum.h"

#include <infiniband/verbs.h>
#include <infiniband/umad_types.h>
#include <infiniband/umad.h>
#include <rdma/rdma_user_cm.h>

#include "trace.h"
#include "rdma_utils.h"
@@ -33,16 +38,25 @@
#define VENDOR_ERR_MAD_SEND 0x206
#define VENDOR_ERR_INVLKEY 0x207
#define VENDOR_ERR_MR_SMALL 0x208
#define VENDOR_ERR_INV_MAD_BUFF 0x209
#define VENDOR_ERR_INV_NUM_SGE 0x210

#define THR_NAME_LEN 16
#define THR_POLL_TO 5000 /* ms */

#define MAD_HDR_SIZE sizeof(struct ibv_grh) /* UD receive buffers start with a 40-byte GRH */

typedef struct BackendCtx {
uint64_t req_id;
void *up_ctx;
bool is_tx_req;
struct ibv_sge sge; /* Used to save MAD recv buffer */
} BackendCtx;

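/* Wire format exchanged with the rdmacm-mux: a umad header followed by the MAD payload */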
struct backend_umad {
struct ib_user_mad hdr;
char mad[RDMA_MAX_PRIVATE_DATA];
};

static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);

static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
@@ -286,6 +300,61 @@ static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
return 0;
}

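/* Relay an outgoing guest MAD to the rdmacm-mux; sge[0] holds the MAD header, sge[1] the payload */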
static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
uint32_t num_sge)
{
struct backend_umad umad = {0};
char *hdr, *msg;
int ret;

pr_dbg("num_sge=%d\n", num_sge);

if (num_sge != 2) {
return -EINVAL;
}

umad.hdr.length = sge[0].length + sge[1].length;
pr_dbg("msg_len=%d\n", umad.hdr.length);

if (umad.hdr.length > sizeof(umad.mad)) {
return -ENOMEM;
}

umad.hdr.addr.qpn = htobe32(1); /* MADs are sent on QP1, the GSI */
umad.hdr.addr.grh_present = 1;
umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
umad.hdr.addr.hop_limit = 0xFF;

hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
if (!hdr) {
pr_dbg("Fail to map to sge[0]\n");
return -ENOMEM;
}
msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
if (!msg) {
pr_dbg("Fail to map to sge[1]\n");
rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
return -ENOMEM;
}

pr_dbg_buf("mad_hdr", hdr, sge[0].length);
pr_dbg_buf("mad_data", data, sge[1].length);

memcpy(&umad.mad[0], hdr, sge[0].length);
memcpy(&umad.mad[sge[0].length], msg, sge[1].length);

rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);

ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad,
sizeof(umad));

pr_dbg("qemu_chr_fe_write=%d\n", ret);

return (ret == sizeof(umad)) ? 0 : -EIO;
}

void rdma_backend_post_send(RdmaBackendDev *backend_dev,
RdmaBackendQP *qp, uint8_t qp_type,
struct ibv_sge *sge, uint32_t num_sge,
@@ -304,9 +373,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
} else if (qp_type == IBV_QPT_GSI) {
pr_dbg("QP1\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
rc = mad_send(backend_dev, sge, num_sge);
if (rc) {
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
} else {
comp_handler(IBV_WC_SUCCESS, 0, ctx);
}
}
pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
return;
}

@@ -370,6 +443,48 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
g_free(bctx);
}

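/* Park a guest receive buffer until mad_read() completes it with a MAD from the mux */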
static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
struct ibv_sge *sge, uint32_t num_sge,
void *ctx)
{
BackendCtx *bctx;
int rc;
uint32_t bctx_id;

if (num_sge != 1) {
pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
return VENDOR_ERR_INV_NUM_SGE;
}

if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
pr_dbg("Too small buffer for MAD\n");
return VENDOR_ERR_INV_MAD_BUFF;
}

pr_dbg("addr=0x%" PRIx64"\n", sge[0].addr);
pr_dbg("length=%d\n", sge[0].length);
pr_dbg("lkey=%d\n", sge[0].lkey);

bctx = g_malloc0(sizeof(*bctx));

rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
if (unlikely(rc)) {
g_free(bctx);
pr_dbg("Fail to allocate cqe_ctx\n");
return VENDOR_ERR_NOMEM;
}

pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id, bctx, ctx);
bctx->up_ctx = ctx;
bctx->sge = *sge;

qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
qlist_append_int(backend_dev->recv_mads_list.list, bctx_id);
qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);

return 0;
}

void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
RdmaDeviceResources *rdma_dev_res,
RdmaBackendQP *qp, uint8_t qp_type,
@@ -388,7 +503,10 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
}
if (qp_type == IBV_QPT_GSI) {
pr_dbg("QP1\n");
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
if (rc) {
comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
}
}
return;
}
@@ -517,7 +635,6 @@ int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,

switch (qp_type) {
case IBV_QPT_GSI:
pr_dbg("QP1 unsupported\n");
return 0;

case IBV_QPT_RC:
@@ -748,11 +865,122 @@ static int init_device_caps(RdmaBackendDev *backend_dev,
return 0;
}

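/* Build the GRH that prefixes the MAD payload in the guest's UD receive buffer */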
static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
union ibv_gid *my_gid, int paylen)
{
grh->paylen = htons(paylen);
grh->sgid = *sgid;
grh->dgid = *my_gid;

pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen);
pr_dbg("my_gid=0x%llx\n", my_gid->global.interface_id);
pr_dbg("gid=0x%llx\n", sgid->global.interface_id);
}

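/* Chardev callback: how many bytes we can consume in a single read */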
static inline int mad_can_receive(void *opaque)
{
return sizeof(struct backend_umad);
}

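/* Chardev read handler: deliver a MAD from the mux into a parked guest receive buffer */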
static void mad_read(void *opaque, const uint8_t *buf, int size)
{
RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
QObject *o_ctx_id;
unsigned long cqe_ctx_id;
BackendCtx *bctx;
char *mad;
struct backend_umad *umad;

/* The mux relays complete backend_umad messages */
assert(size == sizeof(struct backend_umad));
umad = (struct backend_umad *)buf;

pr_dbg("Got %d bytes\n", size);
pr_dbg("umad->hdr.length=%d\n", umad->hdr.length);

#ifdef PVRDMA_DEBUG
struct umad_hdr *hdr = (struct umad_hdr *)&umad->mad;
pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n",
hdr->base_version, hdr->mgmt_class, hdr->class_version,
hdr->method, hdr->status, be64toh(hdr->tid),
hdr->attr_id, hdr->attr_mod);
#endif

qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list);
qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
if (!o_ctx_id) {
pr_dbg("No more free MAD buffers, waiting for a while\n");
g_usleep(THR_POLL_TO * 1000); /* THR_POLL_TO is in milliseconds */
return;
}

cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id));
bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
if (unlikely(!bctx)) {
pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id);
return;
}

pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx);

mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
bctx->sge.length);
if (!mad || umad->hdr.length > sizeof(umad->mad) ||
bctx->sge.length < umad->hdr.length + MAD_HDR_SIZE) {
comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
bctx->up_ctx);
} else {
/* Guest buffer layout: a GRH followed by the MAD payload */
memset(mad, 0, bctx->sge.length);
build_mad_hdr((struct ibv_grh *)mad,
(union ibv_gid *)&umad->hdr.addr.gid,
&backend_dev->gid, umad->hdr.length);
memcpy(&mad[MAD_HDR_SIZE], umad->mad, umad->hdr.length);

comp_handler(IBV_WC_SUCCESS, 0, bctx->up_ctx);
}
/* rdma_pci_dma_unmap tolerates a NULL buffer, so unmap unconditionally */
rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);

g_free(bctx);
rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
}

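/* Connect the chardev to the rdmacm-mux and register our GID with it */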
static int mad_init(RdmaBackendDev *backend_dev)
{
struct backend_umad umad = {0};
int ret;

if (!qemu_chr_fe_backend_connected(backend_dev->mad_chr_be)) {
pr_dbg("Missing chardev for MAD multiplexer\n");
return -EIO;
}

qemu_chr_fe_set_handlers(backend_dev->mad_chr_be, mad_can_receive,
mad_read, NULL, NULL, backend_dev, NULL, true);

/* Register ourselves with the mux: a header-only message carrying our GID */
memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *)&umad,
sizeof(umad.hdr));
if (ret != sizeof(umad.hdr)) {
pr_dbg("Failed to register with rdmacm-mux (%d)\n", ret);
return -EIO;
}

qemu_mutex_init(&backend_dev->recv_mads_list.lock);
backend_dev->recv_mads_list.list = qlist_new();

return 0;
}

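/* Undo mad_init: release the receive-buffer bookkeeping */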
static void mad_fini(RdmaBackendDev *backend_dev)
{
qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list));
qemu_mutex_destroy(&backend_dev->recv_mads_list.lock);
}

int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
RdmaDeviceResources *rdma_dev_res,
const char *backend_device_name, uint8_t port_num,
uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
Error **errp)
CharBackend *mad_chr_be, Error **errp)
{
int i;
int ret = 0;
@@ -763,7 +991,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
memset(backend_dev, 0, sizeof(*backend_dev));

backend_dev->dev = pdev;

backend_dev->mad_chr_be = mad_chr_be;
backend_dev->backend_gid_idx = backend_gid_idx;
backend_dev->port_num = port_num;
backend_dev->rdma_dev_res = rdma_dev_res;
@@ -854,6 +1082,13 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
pr_dbg("interface_id=0x%" PRIx64 "\n",
be64_to_cpu(backend_dev->gid.global.interface_id));

ret = mad_init(backend_dev);
if (ret) {
error_setg(errp, "Failed to initialize mad");
ret = -EIO;
goto out_destroy_comm_channel;
}

backend_dev->comp_thread.run = false;
backend_dev->comp_thread.is_running = false;

@@ -890,6 +1125,7 @@ void rdma_backend_stop(RdmaBackendDev *backend_dev)
void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
rdma_backend_stop(backend_dev);
mad_fini(backend_dev);
g_hash_table_destroy(ah_hash);
ibv_destroy_comp_channel(backend_dev->channel);
ibv_close_device(backend_dev->context);
4 changes: 3 additions & 1 deletion hw/rdma/rdma_backend.h
@@ -17,6 +17,8 @@
#define RDMA_BACKEND_H

#include "qapi/error.h"
#include "chardev/char-fe.h"

#include "rdma_rm_defs.h"
#include "rdma_backend_defs.h"

@@ -50,7 +52,7 @@ int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
RdmaDeviceResources *rdma_dev_res,
const char *backend_device_name, uint8_t port_num,
uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
Error **errp);
CharBackend *mad_chr_be, Error **errp);
void rdma_backend_fini(RdmaBackendDev *backend_dev);
void rdma_backend_start(RdmaBackendDev *backend_dev);
void rdma_backend_stop(RdmaBackendDev *backend_dev);
10 changes: 9 additions & 1 deletion hw/rdma/rdma_backend_defs.h
@@ -16,8 +16,9 @@
#ifndef RDMA_BACKEND_DEFS_H
#define RDMA_BACKEND_DEFS_H

#include <infiniband/verbs.h>
#include "qemu/thread.h"
#include "chardev/char-fe.h"
#include <infiniband/verbs.h>

typedef struct RdmaDeviceResources RdmaDeviceResources;

@@ -28,6 +29,11 @@ typedef struct RdmaBackendThread {
bool is_running; /* Set by the thread to report its status */
} RdmaBackendThread;

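/* Receive buffers posted on QP1, waiting for MADs relayed by the mux */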
typedef struct RecvMadList {
QemuMutex lock;
QList *list;
} RecvMadList;

typedef struct RdmaBackendDev {
struct ibv_device_attr dev_attr;
RdmaBackendThread comp_thread;
@@ -39,6 +45,8 @@ typedef struct RdmaBackendDev {
struct ibv_comp_channel *channel;
uint8_t port_num;
uint8_t backend_gid_idx;
RecvMadList recv_mads_list;
CharBackend *mad_chr_be;
} RdmaBackendDev;

typedef struct RdmaBackendPD {
