Skip to content

Commit

Permalink
IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support
Browse files Browse the repository at this point in the history
Currently the driver doesn't support completion vectors. These
are used to indicate which sets of CQs should be grouped together
into the same vector. A vector is a CQ processing thread that
runs on a specific CPU.

If an application has several CQs bound to different completion
vectors, and each completion vector runs on different CPUs, then
the completion queue workload is balanced. This helps scale as more
nodes are used.

Implement CQ completion vector support using a global workqueue
where a CQ entry is queued to the CPU corresponding to the CQ's
completion vector. Since the workqueue is global, it's guaranteed
to always be there when queueing CQ entries; therefore, the RCU
locking for cq->rdi->worker in the hot path is superfluous.

Each completion vector is assigned to a different CPU. The number of
completion vectors available is computed by taking the number of
online, physical CPUs from the local NUMA node and subtracting the
CPUs used for kernel receive queues and the general interrupt.
Special use cases:

  * If there are no CPUs left for completion vectors, the same CPU
    that serves the general interrupt is used; therefore, there would
    only be one completion vector available.

  * For multi-HFI systems, the number of completion vectors available
    for each device is the total number of completion vectors in
    the local NUMA node divided by the number of devices in the same
    NUMA node. If there's a division remainder, the first device to
    get initialized gets an extra completion vector.

Upon a CQ creation, an invalid completion vector could be specified.
Handle it as follows:

  * If the completion vector is less than 0, set it to 0.

  * Set the completion vector to the passed completion vector
    modulo the number of device completion vectors
    available.

Reviewed-by: Mike Marciniszyn <[email protected]>
Signed-off-by: Sebastian Sanchez <[email protected]>
Signed-off-by: Dennis Dalessandro <[email protected]>
Signed-off-by: Doug Ledford <[email protected]>
  • Loading branch information
ssanchez11 authored and dledford committed May 9, 2018
1 parent cf38ea1 commit 5d18ee6
Show file tree
Hide file tree
Showing 15 changed files with 534 additions and 101 deletions.
414 changes: 407 additions & 7 deletions drivers/infiniband/hw/hfi1/affinity.c

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions drivers/infiniband/hw/hfi1/affinity.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright(c) 2015 - 2017 Intel Corporation.
* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
Expand Down Expand Up @@ -98,9 +98,11 @@ void hfi1_put_proc_affinity(int cpu);

/*
 * Affinity bookkeeping entry; it embeds a list_head so entries can be
 * linked into a list.
 * NOTE(review): the field roles noted below are inferred from the field
 * names and the commit description; the code that uses them (affinity.c)
 * is not visible here - confirm before relying on them.
 */
struct hfi1_affinity_node {
/* presumably the NUMA node id this entry describes - TODO confirm */
int node;
/* per-CPU u16 pointer (see __percpu); presumably tracks completion vector use per CPU - TODO confirm */
u16 __percpu *comp_vect_affinity;
/* presumably the CPU set for default interrupts - TODO confirm */
struct cpu_mask_set def_intr;
/* presumably the CPU set for kernel receive interrupts - TODO confirm */
struct cpu_mask_set rcv_intr;
/* presumably the CPU(s) that service the general interrupt - TODO confirm */
struct cpumask general_intr_mask;
/* presumably the CPUs available for completion vectors on this node - TODO confirm */
struct cpumask comp_vect_mask;
/* list linkage for this entry */
struct list_head list;
};

Expand All @@ -116,7 +118,11 @@ struct hfi1_affinity_node_list {
};

int node_affinity_init(void);
void node_affinity_destroy(void);
void node_affinity_destroy_all(void);
extern struct hfi1_affinity_node_list node_affinity;
void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd);
int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect);
int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd);
void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd);

#endif /* _HFI1_AFFINITY_H */
5 changes: 5 additions & 0 deletions drivers/infiniband/hw/hfi1/chip.c
Original file line number Diff line number Diff line change
Expand Up @@ -15233,6 +15233,10 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
if (ret)
goto bail_cleanup;

ret = hfi1_comp_vectors_set_up(dd);
if (ret)
goto bail_clear_intr;

/* set up LCB access - must be after set_up_interrupts() */
init_lcb_access(dd);

Expand Down Expand Up @@ -15275,6 +15279,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
bail_free_cntrs:
free_cntrs(dd);
bail_clear_intr:
hfi1_comp_vectors_clean_up(dd);
hfi1_clean_up_interrupts(dd);
bail_cleanup:
hfi1_pcie_ddcleanup(dd);
Expand Down
3 changes: 3 additions & 0 deletions drivers/infiniband/hw/hfi1/hfi.h
Original file line number Diff line number Diff line change
Expand Up @@ -1263,6 +1263,9 @@ struct hfi1_devdata {

/* Save the enabled LCB error bits */
u64 lcb_err_en;
struct cpu_mask_set *comp_vect;
int *comp_vect_mappings;
u32 comp_vect_possible_cpus;

/*
* Capability to have different send engines simply by changing a
Expand Down
15 changes: 13 additions & 2 deletions drivers/infiniband/hw/hfi1/init.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright(c) 2015-2017 Intel Corporation.
* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
Expand Down Expand Up @@ -1244,6 +1244,8 @@ static void hfi1_clean_devdata(struct hfi1_devdata *dd)
dd->rcv_limit = NULL;
dd->send_schedule = NULL;
dd->tx_opstats = NULL;
kfree(dd->comp_vect);
dd->comp_vect = NULL;
sdma_clean(dd, dd->num_sdma);
rvt_dealloc_device(&dd->verbs_dev.rdi);
}
Expand Down Expand Up @@ -1300,6 +1302,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
dd->unit = ret;
list_add(&dd->list, &hfi1_dev_list);
}
dd->node = -1;

spin_unlock_irqrestore(&hfi1_devs_lock, flags);
idr_preload_end();
Expand Down Expand Up @@ -1352,6 +1355,12 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
goto bail;
}

dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL);
if (!dd->comp_vect) {
ret = -ENOMEM;
goto bail;
}

kobject_init(&dd->kobj, &hfi1_devdata_type);
return dd;

Expand Down Expand Up @@ -1521,7 +1530,7 @@ module_init(hfi1_mod_init);
static void __exit hfi1_mod_cleanup(void)
{
pci_unregister_driver(&hfi1_pci_driver);
node_affinity_destroy();
node_affinity_destroy_all();
hfi1_wss_exit();
hfi1_dbg_exit();

Expand Down Expand Up @@ -1605,6 +1614,8 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
static void postinit_cleanup(struct hfi1_devdata *dd)
{
hfi1_start_cleanup(dd);
hfi1_comp_vectors_clean_up(dd);
hfi1_dev_affinity_clean_up(dd);

hfi1_pcie_ddcleanup(dd);
hfi1_pcie_cleanup(dd->pcidev);
Expand Down
3 changes: 2 additions & 1 deletion drivers/infiniband/hw/hfi1/trace.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright(c) 2015 - 2017 Intel Corporation.
* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
Expand Down Expand Up @@ -374,6 +374,7 @@ const char *print_u32_array(
return ret;
}

__hfi1_trace_fn(AFFINITY);
__hfi1_trace_fn(PKT);
__hfi1_trace_fn(PROC);
__hfi1_trace_fn(SDMA);
Expand Down
3 changes: 2 additions & 1 deletion drivers/infiniband/hw/hfi1/trace_dbg.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright(c) 2015, 2016 Intel Corporation.
* Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
Expand Down Expand Up @@ -113,6 +113,7 @@ void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \
* hfi1_cdbg(LVL, fmt, ...); as well as take care of all
* the debugfs stuff.
*/
__hfi1_trace_def(AFFINITY);
__hfi1_trace_def(PKT);
__hfi1_trace_def(PROC);
__hfi1_trace_def(SDMA);
Expand Down
7 changes: 4 additions & 3 deletions drivers/infiniband/hw/hfi1/verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
#include "debugfs.h"
#include "vnic.h"
#include "fault.h"
#include "affinity.h"

static unsigned int hfi1_lkey_table_size = 16;
module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
Expand Down Expand Up @@ -1934,11 +1935,11 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
hfi1_comp_vect_mappings_lookup;

/* completion queue */
snprintf(dd->verbs_dev.rdi.dparms.cq_name,
sizeof(dd->verbs_dev.rdi.dparms.cq_name),
"hfi1_cq%d", dd->unit);
dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus;
dd->verbs_dev.rdi.dparms.node = dd->node;

/* misc settings */
Expand Down
6 changes: 1 addition & 5 deletions drivers/infiniband/hw/qib/qib_verbs.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
* Copyright (c) 2012 - 2018 Intel Corporation. All rights reserved.
* Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
* Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
*
Expand Down Expand Up @@ -1631,10 +1631,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB;
dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE;

snprintf(dd->verbs_dev.rdi.dparms.cq_name,
sizeof(dd->verbs_dev.rdi.dparms.cq_name),
"qib_cq%d", dd->unit);

qib_fill_device_attr(dd);

ppd = dd->pport;
Expand Down
81 changes: 32 additions & 49 deletions drivers/infiniband/sw/rdmavt/cq.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright(c) 2016 Intel Corporation.
* Copyright(c) 2016 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
Expand Down Expand Up @@ -47,11 +47,12 @@

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include "cq.h"
#include "vt.h"
#include "trace.h"

static struct workqueue_struct *comp_vector_wq;

/**
* rvt_cq_enter - add a new entry to the completion queue
* @cq: completion queue
Expand Down Expand Up @@ -120,27 +121,21 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
if (cq->notify == IB_CQ_NEXT_COMP ||
(cq->notify == IB_CQ_SOLICITED &&
(solicited || entry->status != IB_WC_SUCCESS))) {
struct kthread_worker *worker;

/*
* This will cause send_complete() to be called in
* another thread.
*/
rcu_read_lock();
worker = rcu_dereference(cq->rdi->worker);
if (likely(worker)) {
cq->notify = RVT_CQ_NONE;
cq->triggered++;
kthread_queue_work(worker, &cq->comptask);
}
rcu_read_unlock();
cq->notify = RVT_CQ_NONE;
cq->triggered++;
queue_work_on(cq->comp_vector_cpu, comp_vector_wq,
&cq->comptask);
}

spin_unlock_irqrestore(&cq->lock, flags);
}
EXPORT_SYMBOL(rvt_cq_enter);

static void send_complete(struct kthread_work *work)
static void send_complete(struct work_struct *work)
{
struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask);

Expand Down Expand Up @@ -192,13 +187,19 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
struct ib_cq *ret;
u32 sz;
unsigned int entries = attr->cqe;
int comp_vector = attr->comp_vector;

if (attr->flags)
return ERR_PTR(-EINVAL);

if (entries < 1 || entries > rdi->dparms.props.max_cqe)
return ERR_PTR(-EINVAL);

if (comp_vector < 0)
comp_vector = 0;

comp_vector = comp_vector % rdi->ibdev.num_comp_vectors;

/* Allocate the completion queue structure. */
cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node);
if (!cq)
Expand Down Expand Up @@ -267,14 +268,22 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
* an error.
*/
cq->rdi = rdi;
if (rdi->driver_f.comp_vect_cpu_lookup)
cq->comp_vector_cpu =
rdi->driver_f.comp_vect_cpu_lookup(rdi, comp_vector);
else
cq->comp_vector_cpu =
cpumask_first(cpumask_of_node(rdi->dparms.node));

cq->ibcq.cqe = entries;
cq->notify = RVT_CQ_NONE;
spin_lock_init(&cq->lock);
kthread_init_work(&cq->comptask, send_complete);
INIT_WORK(&cq->comptask, send_complete);
cq->queue = wc;

ret = &cq->ibcq;

trace_rvt_create_cq(cq, attr);
goto done;

bail_ip:
Expand All @@ -300,7 +309,7 @@ int rvt_destroy_cq(struct ib_cq *ibcq)
struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
struct rvt_dev_info *rdi = cq->rdi;

kthread_flush_work(&cq->comptask);
flush_work(&cq->comptask);
spin_lock_irq(&rdi->n_cqs_lock);
rdi->n_cqs_allocated--;
spin_unlock_irq(&rdi->n_cqs_lock);
Expand Down Expand Up @@ -510,48 +519,22 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
*
* Return: 0 on success
*/
int rvt_driver_cq_init(struct rvt_dev_info *rdi)
int rvt_driver_cq_init(void)
{
int cpu;
struct kthread_worker *worker;

if (rcu_access_pointer(rdi->worker))
return 0;

spin_lock_init(&rdi->n_cqs_lock);

cpu = cpumask_first(cpumask_of_node(rdi->dparms.node));
worker = kthread_create_worker_on_cpu(cpu, 0,
"%s", rdi->dparms.cq_name);
if (IS_ERR(worker))
return PTR_ERR(worker);
comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE,
0, "rdmavt_cq");
if (!comp_vector_wq)
return -ENOMEM;

set_user_nice(worker->task, MIN_NICE);
RCU_INIT_POINTER(rdi->worker, worker);
return 0;
}

/**
* rvt_cq_exit - tear down cq resources
* @rdi: rvt dev structure
*/
void rvt_cq_exit(struct rvt_dev_info *rdi)
void rvt_cq_exit(void)
{
struct kthread_worker *worker;

if (!rcu_access_pointer(rdi->worker))
return;

spin_lock(&rdi->n_cqs_lock);
worker = rcu_dereference_protected(rdi->worker,
lockdep_is_held(&rdi->n_cqs_lock));
if (!worker) {
spin_unlock(&rdi->n_cqs_lock);
return;
}
RCU_INIT_POINTER(rdi->worker, NULL);
spin_unlock(&rdi->n_cqs_lock);
synchronize_rcu();

kthread_destroy_worker(worker);
destroy_workqueue(comp_vector_wq);
comp_vector_wq = NULL;
}
6 changes: 3 additions & 3 deletions drivers/infiniband/sw/rdmavt/cq.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#define DEF_RVTCQ_H

/*
* Copyright(c) 2016 Intel Corporation.
* Copyright(c) 2016 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
Expand Down Expand Up @@ -59,6 +59,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq);
int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
int rvt_driver_cq_init(struct rvt_dev_info *rdi);
void rvt_cq_exit(struct rvt_dev_info *rdi);
int rvt_driver_cq_init(void);
void rvt_cq_exit(void);
#endif /* DEF_RVTCQ_H */
Loading

0 comments on commit 5d18ee6

Please sign in to comment.