Skip to content

Commit

Permalink
blk-mq: avoid inserting requests before establishing new mapping
Browse files Browse the repository at this point in the history
Notifier callbacks for the CPU_ONLINE action can run on a CPU other
than the one which was just onlined.  So it is possible for a
process running on the just-onlined CPU to insert a request and run
the hw queue before the new mapping is established by
blk_mq_queue_reinit_notify().

This can cause a problem when the CPU has just been onlined for the
first time since the request queue was initialized.  At this time
ctx->index_hw for the CPU, which is the index in hctx->ctxs[] for this
ctx, is still zero before blk_mq_queue_reinit_notify() is called by the
notifier callbacks for the CPU_ONLINE action.

For example, suppose there is a single hw queue (hctx) and two CPU
queues (ctx0 for CPU0, and ctx1 for CPU1).  Now CPU1 is just onlined, a
request is inserted into ctx1->rq_list, and bit0 is set in the pending
bitmap because ctx1->index_hw is still zero.

Then, while running the hw queue, flush_busy_ctxs() finds that bit0 is
set in the pending bitmap and tries to retrieve requests from
hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to ctx0, so the
request in ctx1->rq_list is ignored.

Fix it by ensuring that the new mapping is established before the
onlined CPU starts running.

Signed-off-by: Akinobu Mita <[email protected]>
Reviewed-by: Ming Lei <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Ming Lei <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
  • Loading branch information
mita authored and axboe committed Sep 29, 2015
1 parent 0e62636 commit 5778322
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 19 deletions.
9 changes: 5 additions & 4 deletions block/blk-mq-cpumap.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
return cpu;
}

int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
const struct cpumask *online_mask)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
Expand All @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)

cpumask_clear(cpus);
nr_cpus = nr_uniq_cpus = 0;
for_each_online_cpu(i) {
for_each_cpu(i, online_mask) {
nr_cpus++;
first_sibling = get_first_sibling(i);
if (!cpumask_test_cpu(first_sibling, cpus))
Expand All @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)

queue = 0;
for_each_possible_cpu(i) {
if (!cpu_online(i)) {
if (!cpumask_test_cpu(i, online_mask)) {
map[i] = 0;
continue;
}
Expand Down Expand Up @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
if (!map)
return NULL;

if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
return map;

kfree(map);
Expand Down
59 changes: 45 additions & 14 deletions block/blk-mq.c
Original file line number Diff line number Diff line change
Expand Up @@ -1789,7 +1789,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}

static void blk_mq_map_swqueue(struct request_queue *q)
static void blk_mq_map_swqueue(struct request_queue *q,
const struct cpumask *online_mask)
{
unsigned int i;
struct blk_mq_hw_ctx *hctx;
Expand All @@ -1806,7 +1807,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
*/
queue_for_each_ctx(q, ctx, i) {
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
if (!cpumask_test_cpu(i, online_mask))
continue;

hctx = q->mq_ops->map_queue(q, i);
Expand Down Expand Up @@ -1852,7 +1853,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}

queue_for_each_ctx(q, ctx, i) {
if (!cpu_online(i))
if (!cpumask_test_cpu(i, online_mask))
continue;

hctx = q->mq_ops->map_queue(q, i);
Expand Down Expand Up @@ -2037,13 +2038,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
if (blk_mq_init_hw_queues(q, set))
goto err_hctxs;

get_online_cpus();
mutex_lock(&all_q_mutex);

list_add_tail(&q->all_q_node, &all_q_list);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
blk_mq_map_swqueue(q, cpu_online_mask);

mutex_unlock(&all_q_mutex);
put_online_cpus();

return q;

Expand Down Expand Up @@ -2080,21 +2083,22 @@ void blk_mq_free_queue(struct request_queue *q)
}

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
static void blk_mq_queue_reinit(struct request_queue *q,
const struct cpumask *online_mask)
{
WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));

blk_mq_sysfs_unregister(q);

blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);

/*
* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
* we should change hctx numa_node according to new topology (this
* involves free and re-allocate memory, worthy doing?)
*/

blk_mq_map_swqueue(q);
blk_mq_map_swqueue(q, online_mask);

blk_mq_sysfs_register(q);
}
Expand All @@ -2103,16 +2107,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
unsigned long action, void *hcpu)
{
struct request_queue *q;
int cpu = (unsigned long)hcpu;
/*
* New online cpumask which is going to be set in this hotplug event.
* Declare this cpumasks as global as cpu-hotplug operation is invoked
* one-by-one and dynamically allocating this could result in a failure.
*/
static struct cpumask online_new;

/*
* Before new mappings are established, hotadded cpu might already
* start handling requests. This doesn't break anything as we map
* offline CPUs to first hardware queue. We will re-init the queue
* below to get optimal settings.
* Before hotadded cpu starts handling requests, new mappings must
* be established. Otherwise, these requests in hw queue might
* never be dispatched.
*
* For example, there is a single hw queue (hctx) and two CPU queues
* (ctx0 for CPU0, and ctx1 for CPU1).
*
* Now CPU1 is just onlined and a request is inserted into
* ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is
* still zero.
*
* And then while running hw queue, flush_busy_ctxs() finds bit0 is
* set in pending bitmap and tries to retrieve requests in
* hctx->ctxs[0]->rq_list. But hctx->ctxs[0] is a pointer to ctx0,
* so the request in ctx1->rq_list is ignored.
*/
if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DEAD:
case CPU_UP_CANCELED:
cpumask_copy(&online_new, cpu_online_mask);
break;
case CPU_UP_PREPARE:
cpumask_copy(&online_new, cpu_online_mask);
cpumask_set_cpu(cpu, &online_new);
break;
default:
return NOTIFY_OK;
}

mutex_lock(&all_q_mutex);

Expand All @@ -2136,7 +2167,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
}

list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_queue_reinit(q);
blk_mq_queue_reinit(q, &online_new);

list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_unfreeze_queue(q);
Expand Down
3 changes: 2 additions & 1 deletion block/blk-mq.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
* CPU -> queue mappings
*/
extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
const struct cpumask *online_mask);
extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);

/*
Expand Down

0 comments on commit 5778322

Please sign in to comment.