From f72b8792d180948b4b3898374998f5ac8c02e539 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Aug 2016 15:51:50 -0600 Subject: [PATCH 01/54] workqueue: add cancel_work() Like cancel_delayed_work(), but for regular work. Signed-off-by: Jens Axboe Mehed-by: Tejun Heo Acked-by: Tejun Heo --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 40 +++++++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 26cc1df280d65a..fc6e2218640587 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -442,6 +442,7 @@ extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); +extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc325..bd81f039027782 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +static bool __cancel_work(struct work_struct *work, bool is_dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(work, is_dwork, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); + local_irq_restore(flags); + return ret; +} + +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(&dwork->work, true, &flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - - set_work_pool_and_clear_pending(&dwork->work, - get_work_pool_id(&dwork->work)); - local_irq_restore(flags); - return ret; + return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work); From ee63cfa7fc197b63669623721b8009cce5b0659b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Aug 2016 15:52:48 -0600 Subject: [PATCH 02/54] block: add kblockd_schedule_work_on() Add a helper to schedule a regular struct work on a particular CPU. 
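A minimal usage sketch, purely for illustration (the example_dev type, field, and function names below are made up and are not part of this patch):

	struct example_dev {
		struct work_struct complete_work;	/* INIT_WORK()ed at probe time */
	};

	static void example_kick_completion(struct example_dev *edev, int cpu)
	{
		/* run the completion work item from kblockd on the chosen CPU */
		kblockd_schedule_work_on(cpu, &edev->complete_work);
	}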
Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++++ include/linux/blkdev.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 36c7ac328d8c17..2d08597533a438 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3097,6 +3097,12 @@ int kblockd_schedule_work(struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, kblockd_workqueue, work); +} +EXPORT_SYMBOL(kblockd_schedule_work_on); + int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e79055c8b57799..69aae720f4ef38 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1440,8 +1440,8 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) return bio_will_gap(req->q, bio, req->bio); } -struct work_struct; int kblockd_schedule_work(struct work_struct *work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); From 27489a3c827b7eebba26eda0320bb0f100bef167 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Aug 2016 15:54:25 -0600 Subject: [PATCH 03/54] blk-mq: turn hctx->run_work into a regular work struct We don't need the larger delayed work struct, since we always run it immediately. Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 9 ++++----- include/linux/blk-mq.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2d08597533a438..34ff8088eebee1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -288,7 +288,7 @@ void blk_sync_queue(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - cancel_delayed_work_sync(&hctx->run_work); + cancel_work_sync(&hctx->run_work); cancel_delayed_work_sync(&hctx->delay_work); } } else { diff --git a/block/blk-mq.c b/block/blk-mq.c index 13f5a6c1de7682..b68fdcbe58f6fe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -936,8 +936,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, 0); + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -958,7 +957,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->run_work); + cancel_work(&hctx->run_work); cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -1011,7 +1010,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work); __blk_mq_run_hw_queue(hctx); } @@ -1722,7 +1721,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (node == NUMA_NO_NODE) node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 
e43bbffb5b7a35..d579252e6463b2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -25,7 +25,7 @@ struct blk_mq_hw_ctx { } ____cacheline_aligned_in_smp; unsigned long state; /* BLK_MQ_S_* flags */ - struct delayed_work run_work; + struct work_struct run_work; struct delayed_work delay_work; cpumask_var_t cpumask; int next_cpu; From 8d354f133e86dd03ea7885a91df398c55ff699ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Aug 2016 08:00:28 -0600 Subject: [PATCH 04/54] blk-mq: improve layout of blk_mq_hw_ctx Various cache line optimizations: - Move delay_work towards the end. It's huge, and we don't use it a lot (only SCSI). - Move the atomic state into the same cacheline as the the dispatch list and lock. - Rearrange a few members to pack it better. - Shrink the max-order for dispatch accounting from 10 to 7. This means that ->dispatched[] and ->run now take up their own cacheline. This shrinks struct blk_mq_hw_ctx down to 8 cachelines. Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d579252e6463b2..e1544f0f8c214b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -22,11 +22,10 @@ struct blk_mq_hw_ctx { struct { spinlock_t lock; struct list_head dispatch; + unsigned long state; /* BLK_MQ_S_* flags */ } ____cacheline_aligned_in_smp; - unsigned long state; /* BLK_MQ_S_* flags */ struct work_struct run_work; - struct delayed_work delay_work; cpumask_var_t cpumask; int next_cpu; int next_cpu_batch; @@ -40,8 +39,8 @@ struct blk_mq_hw_ctx { struct blk_mq_ctxmap ctx_map; - unsigned int nr_ctx; struct blk_mq_ctx **ctxs; + unsigned int nr_ctx; atomic_t wait_index; @@ -49,7 +48,7 @@ struct blk_mq_hw_ctx { unsigned long queued; unsigned long run; -#define BLK_MQ_MAX_DISPATCH_ORDER 10 +#define BLK_MQ_MAX_DISPATCH_ORDER 7 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned int numa_node; @@ -57,6 +56,8 @@ struct blk_mq_hw_ctx { atomic_t nr_active; + struct delayed_work delay_work; + struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; From 88c7b2b75132c3ff8180b71e4f06cf043a00eac8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Aug 2016 08:07:30 -0600 Subject: [PATCH 05/54] blk-mq: prefetch request in blk_mq_tag_to_rq() When drivers or the core calls this function, they usually dereference the request shortly there after. Prefetch the first cache line. Profiling IO workloads shows that this is the most common cache miss on the block side of things. 
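The common pattern this helps is a driver completion path that looks up a request by tag and dereferences it straight away; a rough, hypothetical sketch (the function name is invented for illustration):

	static void example_complete_tag(struct blk_mq_tags *tags, unsigned int tag)
	{
		struct request *rq = blk_mq_tag_to_rq(tags, tag);

		/* rq is used immediately, so the prefetch hides part of the miss */
		if (rq)
			blk_mq_complete_request(rq, 0);
	}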
Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index b68fdcbe58f6fe..eea0d230faa123 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -588,8 +589,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - if (tag < tags->nr_tags) + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); return tags->rqs[tag]; + } return NULL; } From 99e6b87ec2102b10e190c92ea9560bafcb744f86 Mon Sep 17 00:00:00 2001 From: Baoyou Xie Date: Fri, 26 Aug 2016 14:08:53 +0800 Subject: [PATCH 06/54] mtip32xx: mark symbols static where possible We get 1 warning when biuld kernel with W=1: drivers/block/mtip32xx/mtip32xx.c:3689:6: warning: no previous prototype for 'mtip_block_release' [-Wmissing-prototypes] In fact, this function is only used in the file in which it is declared and don't need a declaration, but can be made static. so this patch marks it 'static'. Signed-off-by: Baoyou Xie Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 2aca98e8e42739..88c46853dbb54b 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3686,7 +3686,7 @@ static int mtip_block_open(struct block_device *dev, fmode_t mode) return -ENODEV; } -void mtip_block_release(struct gendisk *disk, fmode_t mode) +static void mtip_block_release(struct gendisk *disk, fmode_t mode) { } From fd8383fd88a2fd842a9431df5ed353bd7129eecc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Sep 2016 12:33:37 -0700 Subject: [PATCH 07/54] nbd: convert to blkmq This moves NBD over to using blkmq, which allows us to get rid of the NBD wide queue lock and the async submit kthread. We will start with 1 hw queue for now, but I plan to add multiple tcp connection support in the future and we'll fix how we set the hwqueue's. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 337 +++++++++++++++++--------------------------- 1 file changed, 129 insertions(+), 208 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a9e398019f3815..15e7c6740873bc 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -45,12 +46,8 @@ struct nbd_device { struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; - spinlock_t queue_lock; - struct list_head queue_head; /* Requests waiting result */ - struct request *active_req; - wait_queue_head_t active_wq; - struct list_head waiting_queue; /* Requests to be sent */ - wait_queue_head_t waiting_wq; + atomic_t outstanding_cmds; + struct blk_mq_tag_set tag_set; struct mutex tx_lock; struct gendisk *disk; @@ -71,6 +68,11 @@ struct nbd_device { #endif }; +struct nbd_cmd { + struct nbd_device *nbd; + struct list_head list; +}; + #if IS_ENABLED(CONFIG_DEBUG_FS) static struct dentry *nbd_dbg_dir; #endif @@ -83,18 +85,6 @@ static unsigned int nbds_max = 16; static struct nbd_device *nbd_dev; static int max_part; -/* - * Use just one lock (or at most 1 per NIC). Two arguments for this: - * 1. Each NIC is essentially a synchronization point for all servers - * accessed through that NIC so there's no need to have more locks - * than NICs anyway. - * 2. 
More locks lead to more "Dirty cache line bouncing" which will slow - * down each lock to the point where they're actually slower than just - * a single lock. - * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this! - */ -static DEFINE_SPINLOCK(nbd_lock); - static inline struct device *nbd_to_dev(struct nbd_device *nbd) { return disk_to_dev(nbd->disk); @@ -153,18 +143,17 @@ static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev, return 0; } -static void nbd_end_request(struct nbd_device *nbd, struct request *req) +static void nbd_end_request(struct nbd_cmd *cmd) { + struct nbd_device *nbd = cmd->nbd; + struct request *req = blk_mq_rq_from_pdu(cmd); int error = req->errors ? -EIO : 0; - struct request_queue *q = req->q; - unsigned long flags; - dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req, + dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd, error ? "failed" : "done"); - spin_lock_irqsave(q->queue_lock, flags); - __blk_end_request_all(req, error); - spin_unlock_irqrestore(q->queue_lock, flags); + atomic_dec(&nbd->outstanding_cmds); + blk_mq_complete_request(req, error); } /* @@ -193,7 +182,7 @@ static void nbd_xmit_timeout(unsigned long arg) struct nbd_device *nbd = (struct nbd_device *)arg; unsigned long flags; - if (list_empty(&nbd->queue_head)) + if (!atomic_read(&nbd->outstanding_cmds)) return; spin_lock_irqsave(&nbd->sock_lock, flags); @@ -273,8 +262,9 @@ static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, } /* always call with the tx_lock held */ -static int nbd_send_req(struct nbd_device *nbd, struct request *req) +static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) { + struct request *req = blk_mq_rq_from_pdu(cmd); int result, flags; struct nbd_request request; unsigned long size = blk_rq_bytes(req); @@ -298,10 +288,10 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } - memcpy(request.handle, &req, sizeof(req)); + memcpy(request.handle, &req->tag, sizeof(req->tag)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", - req, nbdcmd_to_ascii(type), + cmd, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); result = sock_xmit(nbd, 1, &request, sizeof(request), (type == NBD_CMD_WRITE) ? 
MSG_MORE : 0); @@ -323,7 +313,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) if (!rq_iter_last(bvec, iter)) flags = MSG_MORE; dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", - req, bvec.bv_len); + cmd, bvec.bv_len); result = sock_send_bvec(nbd, &bvec, flags); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), @@ -336,29 +326,6 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) return 0; } -static struct request *nbd_find_request(struct nbd_device *nbd, - struct request *xreq) -{ - struct request *req, *tmp; - int err; - - err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); - if (unlikely(err)) - return ERR_PTR(err); - - spin_lock(&nbd->queue_lock); - list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { - if (req != xreq) - continue; - list_del_init(&req->queuelist); - spin_unlock(&nbd->queue_lock); - return req; - } - spin_unlock(&nbd->queue_lock); - - return ERR_PTR(-ENOENT); -} - static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) { int result; @@ -370,11 +337,14 @@ static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) } /* NULL returned = something went wrong, inform userspace */ -static struct request *nbd_read_stat(struct nbd_device *nbd) +static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) { int result; struct nbd_reply reply; - struct request *req; + struct nbd_cmd *cmd; + struct request *req = NULL; + u16 hwq; + int tag; reply.magic = 0; result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); @@ -390,25 +360,27 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) return ERR_PTR(-EPROTO); } - req = nbd_find_request(nbd, *(struct request **)reply.handle); - if (IS_ERR(req)) { - result = PTR_ERR(req); - if (result != -ENOENT) - return ERR_PTR(result); + memcpy(&tag, reply.handle, sizeof(int)); - dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", - reply.handle); - return ERR_PTR(-EBADR); + hwq = blk_mq_unique_tag_to_hwq(tag); + if (hwq < nbd->tag_set.nr_hw_queues) + req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq], + blk_mq_unique_tag_to_tag(tag)); + if (!req || !blk_mq_request_started(req)) { + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n", + tag, req); + return ERR_PTR(-ENOENT); } + cmd = blk_mq_rq_to_pdu(req); if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); req->errors++; - return req; + return cmd; } - dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); + dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd); if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; @@ -419,13 +391,13 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); req->errors++; - return req; + return cmd; } dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", - req, bvec.bv_len); + cmd, bvec.bv_len); } } - return req; + return cmd; } static ssize_t pid_show(struct device *dev, @@ -444,7 +416,7 @@ static struct device_attribute pid_attr = { static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) { - struct request *req; + struct nbd_cmd *cmd; int ret; BUG_ON(nbd->magic != NBD_MAGIC); @@ -460,13 +432,13 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) nbd_size_update(nbd, bdev); while (1) { - req = nbd_read_stat(nbd); - if (IS_ERR(req)) { - ret = PTR_ERR(req); + cmd = 
nbd_read_stat(nbd); + if (IS_ERR(cmd)) { + ret = PTR_ERR(cmd); break; } - nbd_end_request(nbd, req); + nbd_end_request(cmd); } nbd_size_clear(nbd, bdev); @@ -475,44 +447,37 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) return ret; } -static void nbd_clear_que(struct nbd_device *nbd) +static void nbd_clear_req(struct request *req, void *data, bool reserved) { - struct request *req; + struct nbd_cmd *cmd; + if (!blk_mq_request_started(req)) + return; + cmd = blk_mq_rq_to_pdu(req); + req->errors++; + nbd_end_request(cmd); +} + +static void nbd_clear_que(struct nbd_device *nbd) +{ BUG_ON(nbd->magic != NBD_MAGIC); /* * Because we have set nbd->sock to NULL under the tx_lock, all - * modifications to the list must have completed by now. For - * the same reason, the active_req must be NULL. - * - * As a consequence, we don't need to take the spin lock while - * purging the list here. + * modifications to the list must have completed by now. */ BUG_ON(nbd->sock); - BUG_ON(nbd->active_req); - while (!list_empty(&nbd->queue_head)) { - req = list_entry(nbd->queue_head.next, struct request, - queuelist); - list_del_init(&req->queuelist); - req->errors++; - nbd_end_request(nbd, req); - } - - while (!list_empty(&nbd->waiting_queue)) { - req = list_entry(nbd->waiting_queue.next, struct request, - queuelist); - list_del_init(&req->queuelist); - req->errors++; - nbd_end_request(nbd, req); - } + blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); } -static void nbd_handle_req(struct nbd_device *nbd, struct request *req) +static void nbd_handle_cmd(struct nbd_cmd *cmd) { + struct request *req = blk_mq_rq_from_pdu(cmd); + struct nbd_device *nbd = cmd->nbd; + if (req->cmd_type != REQ_TYPE_FS) goto error_out; @@ -526,6 +491,7 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) req->errors = 0; mutex_lock(&nbd->tx_lock); + nbd->task_send = current; if (unlikely(!nbd->sock)) { mutex_unlock(&nbd->tx_lock); dev_err(disk_to_dev(nbd->disk), @@ -533,106 +499,34 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) goto error_out; } - nbd->active_req = req; - - if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head)) + if (nbd->xmit_timeout && !atomic_read(&nbd->outstanding_cmds)) mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); - if (nbd_send_req(nbd, req) != 0) { + atomic_inc(&nbd->outstanding_cmds); + if (nbd_send_cmd(nbd, cmd) != 0) { dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; - nbd_end_request(nbd, req); - } else { - spin_lock(&nbd->queue_lock); - list_add_tail(&req->queuelist, &nbd->queue_head); - spin_unlock(&nbd->queue_lock); + nbd_end_request(cmd); } - nbd->active_req = NULL; + nbd->task_send = NULL; mutex_unlock(&nbd->tx_lock); - wake_up_all(&nbd->active_wq); return; error_out: req->errors++; - nbd_end_request(nbd, req); -} - -static int nbd_thread_send(void *data) -{ - struct nbd_device *nbd = data; - struct request *req; - - nbd->task_send = current; - - set_user_nice(current, MIN_NICE); - while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { - /* wait for something to do */ - wait_event_interruptible(nbd->waiting_wq, - kthread_should_stop() || - !list_empty(&nbd->waiting_queue)); - - /* extract request */ - if (list_empty(&nbd->waiting_queue)) - continue; - - spin_lock_irq(&nbd->queue_lock); - req = list_entry(nbd->waiting_queue.next, struct request, - queuelist); - list_del_init(&req->queuelist); - 
spin_unlock_irq(&nbd->queue_lock); - - /* handle request */ - nbd_handle_req(nbd, req); - } - - nbd->task_send = NULL; - - return 0; + nbd_end_request(cmd); } -/* - * We always wait for result of write, for now. It would be nice to make it optional - * in future - * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) - * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } - */ - -static void nbd_request_handler(struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) +static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct request *req; - - while ((req = blk_fetch_request(q)) != NULL) { - struct nbd_device *nbd; - - spin_unlock_irq(q->queue_lock); - - nbd = req->rq_disk->private_data; - - BUG_ON(nbd->magic != NBD_MAGIC); + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n", - req, req->cmd_type); - - if (unlikely(!nbd->sock)) { - dev_err_ratelimited(disk_to_dev(nbd->disk), - "Attempted send on closed socket\n"); - req->errors++; - nbd_end_request(nbd, req); - spin_lock_irq(q->queue_lock); - continue; - } - - spin_lock_irq(&nbd->queue_lock); - list_add_tail(&req->queuelist, &nbd->waiting_queue); - spin_unlock_irq(&nbd->queue_lock); - - wake_up(&nbd->waiting_wq); - - spin_lock_irq(q->queue_lock); - } + blk_mq_start_request(bd->rq); + nbd_handle_cmd(cmd); + return BLK_MQ_RQ_QUEUE_OK; } static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) @@ -700,33 +594,37 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, { switch (cmd) { case NBD_DISCONNECT: { - struct request sreq; + struct request *sreq; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); if (!nbd->sock) return -EINVAL; + sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0); + if (!sreq) + return -ENOMEM; + mutex_unlock(&nbd->tx_lock); fsync_bdev(bdev); mutex_lock(&nbd->tx_lock); - blk_rq_init(NULL, &sreq); - sreq.cmd_type = REQ_TYPE_DRV_PRIV; + sreq->cmd_type = REQ_TYPE_DRV_PRIV; /* Check again after getting mutex back. */ - if (!nbd->sock) + if (!nbd->sock) { + blk_mq_free_request(sreq); return -EINVAL; + } nbd->disconnect = true; - nbd_send_req(nbd, &sreq); + nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq)); + blk_mq_free_request(sreq); return 0; } case NBD_CLEAR_SOCK: sock_shutdown(nbd); nbd_clear_que(nbd); - BUG_ON(!list_empty(&nbd->queue_head)); - BUG_ON(!list_empty(&nbd->waiting_queue)); kill_bdev(bdev); return 0; @@ -772,7 +670,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return 0; case NBD_DO_IT: { - struct task_struct *thread; int error; if (nbd->task_recv) @@ -786,18 +683,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_parse_flags(nbd, bdev); - thread = kthread_run(nbd_thread_send, nbd, "%s", - nbd_name(nbd)); - if (IS_ERR(thread)) { - mutex_lock(&nbd->tx_lock); - nbd->task_recv = NULL; - return PTR_ERR(thread); - } - nbd_dev_dbg_init(nbd); error = nbd_thread_recv(nbd, bdev); nbd_dev_dbg_close(nbd); - kthread_stop(thread); mutex_lock(&nbd->tx_lock); nbd->task_recv = NULL; @@ -825,10 +713,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return 0; case NBD_PRINT_DEBUG: - dev_info(disk_to_dev(nbd->disk), - "next = %p, prev = %p, head = %p\n", - nbd->queue_head.next, nbd->queue_head.prev, - &nbd->queue_head); + /* + * For compatibility only, we no longer keep a list of + * outstanding requests. 
+ */ return 0; } return -ENOTTY; @@ -987,6 +875,23 @@ static void nbd_dbg_close(void) #endif +static int nbd_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->nbd = data; + INIT_LIST_HEAD(&cmd->list); + return 0; +} + +static struct blk_mq_ops nbd_mq_ops = { + .queue_rq = nbd_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = nbd_init_request, +}; + /* * And here should be modules and kernel interface * (Just smiley confuses emacs :-) @@ -1035,16 +940,34 @@ static int __init nbd_init(void) if (!disk) goto out; nbd_dev[i].disk = disk; + + nbd_dev[i].tag_set.ops = &nbd_mq_ops; + nbd_dev[i].tag_set.nr_hw_queues = 1; + nbd_dev[i].tag_set.queue_depth = 128; + nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE; + nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd); + nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_SG_MERGE; + nbd_dev[i].tag_set.driver_data = &nbd_dev[i]; + + err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set); + if (err) { + put_disk(disk); + goto out; + } + /* * The new linux 2.5 block layer implementation requires * every gendisk to have its very own request_queue struct. * These structs are big so we dynamically allocate them. */ - disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock); + disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set); if (!disk->queue) { + blk_mq_free_tag_set(&nbd_dev[i].tag_set); put_disk(disk); goto out; } + /* * Tell the block layer that we are not a rotational device */ @@ -1069,16 +992,12 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].magic = NBD_MAGIC; - INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); - spin_lock_init(&nbd_dev[i].queue_lock); spin_lock_init(&nbd_dev[i].sock_lock); - INIT_LIST_HEAD(&nbd_dev[i].queue_head); mutex_init(&nbd_dev[i].tx_lock); init_timer(&nbd_dev[i].timeout_timer); nbd_dev[i].timeout_timer.function = nbd_xmit_timeout; nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; - init_waitqueue_head(&nbd_dev[i].active_wq); - init_waitqueue_head(&nbd_dev[i].waiting_wq); + atomic_set(&nbd_dev[i].outstanding_cmds, 0); disk->major = NBD_MAJOR; disk->first_minor = i << part_shift; disk->fops = &nbd_fops; @@ -1091,6 +1010,7 @@ static int __init nbd_init(void) return 0; out: while (i--) { + blk_mq_free_tag_set(&nbd_dev[i].tag_set); blk_cleanup_queue(nbd_dev[i].disk->queue); put_disk(nbd_dev[i].disk); } @@ -1110,6 +1030,7 @@ static void __exit nbd_cleanup(void) if (disk) { del_gendisk(disk); blk_cleanup_queue(disk->queue); + blk_mq_free_tag_set(&nbd_dev[i].tag_set); put_disk(disk); } } From c261189862c6f65117eb3b1748622a08ef49c262 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Sep 2016 12:33:38 -0700 Subject: [PATCH 08/54] nbd: don't shutdown sock with irq's disabled We hit a warning when shutting down the nbd connection because we have irq's disabled. We don't really need to do the shutdown under the lock, just clear the nbd->sock. So do the shutdown outside of the irq. This gets rid of the warning. 
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 15e7c6740873bc..4b7d0f3c35217e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -161,6 +161,8 @@ static void nbd_end_request(struct nbd_cmd *cmd) */ static void sock_shutdown(struct nbd_device *nbd) { + struct socket *sock; + spin_lock_irq(&nbd->sock_lock); if (!nbd->sock) { @@ -168,18 +170,21 @@ static void sock_shutdown(struct nbd_device *nbd) return; } + sock = nbd->sock; dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); - sockfd_put(nbd->sock); nbd->sock = NULL; spin_unlock_irq(&nbd->sock_lock); + kernel_sock_shutdown(sock, SHUT_RDWR); + sockfd_put(sock); + del_timer(&nbd->timeout_timer); } static void nbd_xmit_timeout(unsigned long arg) { struct nbd_device *nbd = (struct nbd_device *)arg; + struct socket *sock = NULL; unsigned long flags; if (!atomic_read(&nbd->outstanding_cmds)) @@ -189,10 +194,16 @@ static void nbd_xmit_timeout(unsigned long arg) nbd->timedout = true; - if (nbd->sock) - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + if (nbd->sock) { + sock = nbd->sock; + get_file(sock->file); + } spin_unlock_irqrestore(&nbd->sock_lock, flags); + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sockfd_put(sock); + } dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); } From 9b4a6ba9185ac1c398f2db69c491989452ce7018 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Sep 2016 12:33:39 -0700 Subject: [PATCH 09/54] nbd: use flags instead of bool In preparation for some future changes, change a few of the state bools over to normal bits to set/clear properly. 
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4b7d0f3c35217e..cf855a1b3729ba 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -41,8 +41,12 @@ #include +#define NBD_TIMEDOUT 0 +#define NBD_DISCONNECT_REQUESTED 1 + struct nbd_device { u32 flags; + unsigned long runtime_flags; struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; @@ -54,8 +58,6 @@ struct nbd_device { int blksize; loff_t bytesize; int xmit_timeout; - bool timedout; - bool disconnect; /* a disconnect has been requested by user */ struct timer_list timeout_timer; /* protects initialization and shutdown of the socket */ @@ -192,7 +194,7 @@ static void nbd_xmit_timeout(unsigned long arg) spin_lock_irqsave(&nbd->sock_lock, flags); - nbd->timedout = true; + set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); if (nbd->sock) { sock = nbd->sock; @@ -562,8 +564,7 @@ static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) /* Reset all properties of an NBD device */ static void nbd_reset(struct nbd_device *nbd) { - nbd->disconnect = false; - nbd->timedout = false; + nbd->runtime_flags = 0; nbd->blksize = 1024; nbd->bytesize = 0; set_capacity(nbd->disk, 0); @@ -626,7 +627,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return -EINVAL; } - nbd->disconnect = true; + set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags); nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq)); blk_mq_free_request(sreq); @@ -706,9 +707,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, kill_bdev(bdev); nbd_bdev_reset(bdev); - if (nbd->disconnect) /* user requested, ignore socket errors */ + /* user requested, ignore socket errors */ + if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) error = 0; - if (nbd->timedout) + if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags)) error = -ETIMEDOUT; nbd_reset(nbd); From 0eadf37afc2500e1162c9040ec26a705b9af8d47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Sep 2016 12:33:40 -0700 Subject: [PATCH 10/54] nbd: allow block mq to deal with timeouts Instead of rolling our own timer, just utilize the blk mq req timeout and do the disconnect if any of our commands timeout. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 51 +++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index cf855a1b3729ba..4c6dd1a85eade5 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -50,16 +50,13 @@ struct nbd_device { struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; - atomic_t outstanding_cmds; struct blk_mq_tag_set tag_set; struct mutex tx_lock; struct gendisk *disk; int blksize; loff_t bytesize; - int xmit_timeout; - struct timer_list timeout_timer; /* protects initialization and shutdown of the socket */ spinlock_t sock_lock; struct task_struct *task_recv; @@ -154,7 +151,6 @@ static void nbd_end_request(struct nbd_cmd *cmd) dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd, error ? 
"failed" : "done"); - atomic_dec(&nbd->outstanding_cmds); blk_mq_complete_request(req, error); } @@ -165,7 +161,7 @@ static void sock_shutdown(struct nbd_device *nbd) { struct socket *sock; - spin_lock_irq(&nbd->sock_lock); + spin_lock(&nbd->sock_lock); if (!nbd->sock) { spin_unlock_irq(&nbd->sock_lock); @@ -175,24 +171,20 @@ static void sock_shutdown(struct nbd_device *nbd) sock = nbd->sock; dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); nbd->sock = NULL; - spin_unlock_irq(&nbd->sock_lock); + spin_unlock(&nbd->sock_lock); kernel_sock_shutdown(sock, SHUT_RDWR); sockfd_put(sock); - - del_timer(&nbd->timeout_timer); } -static void nbd_xmit_timeout(unsigned long arg) +static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, + bool reserved) { - struct nbd_device *nbd = (struct nbd_device *)arg; + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); + struct nbd_device *nbd = cmd->nbd; struct socket *sock = NULL; - unsigned long flags; - - if (!atomic_read(&nbd->outstanding_cmds)) - return; - spin_lock_irqsave(&nbd->sock_lock, flags); + spin_lock(&nbd->sock_lock); set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); @@ -201,13 +193,15 @@ static void nbd_xmit_timeout(unsigned long arg) get_file(sock->file); } - spin_unlock_irqrestore(&nbd->sock_lock, flags); + spin_unlock(&nbd->sock_lock); if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sockfd_put(sock); } + req->errors++; dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); + return BLK_EH_HANDLED; } /* @@ -257,9 +251,6 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, tsk_restore_flags(current, pflags, PF_MEMALLOC); - if (!send && nbd->xmit_timeout) - mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); - return result; } @@ -512,10 +503,6 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd) goto error_out; } - if (nbd->xmit_timeout && !atomic_read(&nbd->outstanding_cmds)) - mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); - - atomic_inc(&nbd->outstanding_cmds); if (nbd_send_cmd(nbd, cmd) != 0) { dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; @@ -569,9 +556,8 @@ static void nbd_reset(struct nbd_device *nbd) nbd->bytesize = 0; set_capacity(nbd->disk, 0); nbd->flags = 0; - nbd->xmit_timeout = 0; + nbd->tag_set.timeout = 0; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); - del_timer_sync(&nbd->timeout_timer); } static void nbd_bdev_reset(struct block_device *bdev) @@ -668,13 +654,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return nbd_size_set(nbd, bdev, nbd->blksize, arg); case NBD_SET_TIMEOUT: - nbd->xmit_timeout = arg * HZ; - if (arg) - mod_timer(&nbd->timeout_timer, - jiffies + nbd->xmit_timeout); - else - del_timer_sync(&nbd->timeout_timer); - + nbd->tag_set.timeout = arg * HZ; return 0; case NBD_SET_FLAGS: @@ -836,7 +816,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); - debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); + debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); @@ -903,6 +883,7 @@ static struct blk_mq_ops nbd_mq_ops = { .queue_rq = nbd_queue_rq, .map_queue = blk_mq_map_queue, .init_request = nbd_init_request, + .timeout = nbd_xmit_timeout, }; /* @@ -1007,10 +988,6 @@ static int __init 
nbd_init(void) nbd_dev[i].magic = NBD_MAGIC; spin_lock_init(&nbd_dev[i].sock_lock); mutex_init(&nbd_dev[i].tx_lock); - init_timer(&nbd_dev[i].timeout_timer); - nbd_dev[i].timeout_timer.function = nbd_xmit_timeout; - nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; - atomic_set(&nbd_dev[i].outstanding_cmds, 0); disk->major = NBD_MAJOR; disk->first_minor = i << part_shift; disk->fops = &nbd_fops; From 6e219353afa1f67f453141f7462b01708ebf5574 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Tue, 13 Sep 2016 12:23:15 -0600 Subject: [PATCH 11/54] block: add poll_considered statistic In order to help determine the effectiveness of polling in a running system it is usful to determine the ratio of how often the poll function is called vs how often the completion is checked. For this reason we add a poll_considered variable and add it to the sysfs entry for io_poll. Signed-off-by: Stephen Bates Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++-- block/blk-mq-sysfs.c | 4 +++- include/linux/blk-mq.h | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 34ff8088eebee1..14d7c0740dc07a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3307,19 +3307,23 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_plug *plug; long state; + unsigned int queue_num; + struct blk_mq_hw_ctx *hctx; if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; + queue_num = blk_qc_t_to_queue_num(cookie); + hctx = q->queue_hw_ctx[queue_num]; + hctx->poll_considered++; + plug = current->plug; if (plug) blk_flush_plug_list(plug, false); state = current->state; while (!need_resched()) { - unsigned int queue_num = blk_qc_t_to_queue_num(cookie); - struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; int ret; hctx->poll_invoked++; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index fe822aa5b8e4a1..ea8c3f58afbdc6 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -176,7 +176,9 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) { - return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); + return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n", + hctx->poll_considered, hctx->poll_invoked, + hctx->poll_success); } static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e1544f0f8c214b..7710f795d7c284 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -61,6 +61,7 @@ struct blk_mq_hw_ctx { struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; + unsigned long poll_considered; unsigned long poll_invoked; unsigned long poll_success; }; From d21ea4bc0f6afbc852f1436c7c691c7b6fed0eb8 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Tue, 13 Sep 2016 12:23:16 -0600 Subject: [PATCH 12/54] block: enable zeroing of io_poll statistics Allow the io_poll statistics to be zeroed to make for easier logging of polling event. 
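For example (device name and counter values here are illustrative only), the counters can now be reset between test runs:

	# cat /sys/block/nvme0n1/mq/0/io_poll
	considered=1006, invoked=1006, success=997
	# echo 0 > /sys/block/nvme0n1/mq/0/io_poll
	# cat /sys/block/nvme0n1/mq/0/io_poll
	considered=0, invoked=0, success=0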
Signed-off-by: Stephen Bates Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index ea8c3f58afbdc6..ac5160eb686253 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -181,6 +181,14 @@ static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) hctx->poll_success); } +static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t size) +{ + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + + return size; +} + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -303,8 +311,9 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .show = blk_mq_hw_sysfs_cpus_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IRUGO }, + .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, + .store = blk_mq_hw_sysfs_poll_store, }; static struct attribute *default_hw_ctx_attrs[] = { From 223757016837d5bc8546c5683e13fbafe6cb374d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2016 11:56:13 +0200 Subject: [PATCH 13/54] block_dev: remove DAX leftovers DAX support for block devices was removed in commits 03cdad ("block: disable block device DAX by default") and 99a01cd ("block: remove BLK_DEV_DAX config option"), but we still kept a call to dax_do_io and some uneeded i_flags manipulations introduced in commit bbab37 ("block: Add support for DAX reads/writes to block devices"). Remove those leftovers. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Acked-by: Dan Williams Signed-off-by: Jens Axboe --- fs/block_dev.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 08ae99343d92f9..a516568f63b6e9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -180,9 +180,6 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); - if (IS_DAX(inode)) - return dax_do_io(iocb, inode, iter, blkdev_get_block, - NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -1275,7 +1272,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - bdev->bd_inode->i_flags = 0; if (!partno) { ret = -ENXIO; @@ -1303,11 +1299,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; - } /* * If the device is invalidated, rescan partition @@ -1342,8 +1335,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; } } else { if (bdev->bd_contains == bdev) { From abe47114b192a9e0167905a3418d815b4fcf87de Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 14 Sep 2016 14:33:15 +0200 Subject: [PATCH 14/54] block: remove blk_mq_alloc_single_hw_queue() prototype The blk_mq_alloc_single_hw_queue() is a prototype artifact that should have been removed with commit 
cdef54dd85ad66e77262ea57796a3e81683dd5d6 "blk-mq: remove alloc_hctx and free_hctx methods" where the last users of it were deleted. Fixes: cdef54dd85ad ("blk-mq: remove alloc_hctx and free_hctx methods") Signed-off-by: Linus Walleij Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7710f795d7c284..ff14f68067aaa1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -223,7 +223,6 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) } struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); int blk_mq_request_started(struct request *rq); void blk_mq_start_request(struct request *rq); From a441b0d093b3690b7cc2cb30998358904d051db4 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 14 Sep 2016 14:32:52 +0200 Subject: [PATCH 15/54] block: remove remnant refs to hardsect commit e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 "block: Do away with the notion of hardsect_size" removed the notion of "hardware sector size" from the kernel in favor of logical block size, but references remain in comments and documentation. Update the remaining sites mentioning hardsect. Signed-off-by: Linus Walleij Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 ++-- block/bio.c | 2 +- fs/befs/linuxvfs.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index bcdb2b4c1f12dc..918e1e0d0e78b2 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -115,7 +115,7 @@ i. Per-queue limits/values exported to the generic layer by the driver Various parameters that the generic i/o scheduler logic uses are set at a per-queue level (e.g maximum request size, maximum number of segments in -a scatter-gather list, hardsect size) +a scatter-gather list, logical block size) Some parameters that were earlier available as global arrays indexed by major/minor are now directly associated with the queue. Some of these may @@ -156,7 +156,7 @@ Some new queue property settings: blk_queue_max_segment_size(q, max_seg_size) Maximum size of a clustered segment, 64kB default. - blk_queue_hardsect_size(q, hardsect_size) + blk_queue_logical_block_size(q, logical_block_size) Lowest possible sector size that the hardware can operate on, 512 bytes default. diff --git a/block/bio.c b/block/bio.c index aa7354088008ba..a6d279e1ea9e96 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1274,7 +1274,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, nr_pages += end - start; /* - * buffer must be aligned to at least hardsector size for now + * buffer must be aligned to at least logical block size for now */ if (uaddr & queue_dma_alignment(q)) return ERR_PTR(-EINVAL); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 7da05b159ade2d..bfe9f999493531 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -789,7 +789,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * Will be set to real fs blocksize later. * * Linux 2.4.10 and later refuse to read blocks smaller than - * the hardsect size for the device. But we also need to read at + * the logical block size for the device. But we also need to read at * least 1k to get the second 512 bytes of the volume. 
* -WD 10-26-01 */ From 637ca77bd1f7950538956c61dcd0c2e559905dbf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 14 Sep 2016 10:44:12 +0200 Subject: [PATCH 16/54] block: Document that bio_op() uses the data type of bio.bi_opf Make it clear that the sizeof(unsigned int) expression in BIO_OP_SHIFT refers to the bi_opf member of struct bio. Signed-off-by: Bart Van Assche Cc: Mike Christie Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 436f43f87da928..1e1ef210ae91b0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -89,7 +89,7 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; -#define BIO_OP_SHIFT (8 * sizeof(unsigned int) - REQ_OP_BITS) +#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) #define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) #define bio_set_op_attrs(bio, op, op_flags) do { \ From 4382e33ad374862eacf62003bb02c750391ada05 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 14 Sep 2016 10:45:36 +0200 Subject: [PATCH 17/54] block, dm-crypt, btrfs: Introduce bio_flags() Introduce the bio_flags() macro. Ensure that the second argument of bio_set_op_attrs() only contains flags and no operation. This patch does not change any functionality. Signed-off-by: Bart Van Assche Cc: Mike Christie Cc: Chris Mason (maintainer:BTRFS FILE SYSTEM) Cc: Josef Bacik (maintainer:BTRFS FILE SYSTEM) Cc: Mike Snitzer Cc: Hannes Reinecke Cc: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 2 +- fs/btrfs/inode.c | 5 +++-- include/linux/blk_types.h | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index eedba67b0e3ef6..9ba0f0724d28fe 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1136,7 +1136,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf); + bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio)); } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e6811c42e41ef3..ca01106795ead9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8412,7 +8412,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8450,7 +8450,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), + bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1e1ef210ae91b0..311fa2f478b843 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -90,11 +90,12 @@ struct bio { }; #define BIO_OP_SHIFT (8 * 
FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) +#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) #define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) #define bio_set_op_attrs(bio, op, op_flags) do { \ WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1); \ + (bio)->bi_opf = bio_flags(bio); \ (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \ (bio)->bi_opf |= op_flags; \ } while (0) From 3e1de31b9bf608c5b35e2d0d134eb87f2a9ba4ae Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 14 Sep 2016 10:46:22 +0200 Subject: [PATCH 18/54] block: Improve bio_set_op_attrs() robustness Since REQ_OP_BITS == 3 and __REQ_NR_BITS == 30 it is not that hard to pass an op_flags argument to bio_set_op_attrs() that is larger than the number of bits reserved for the op_flags argument. Complain if this happens. Additionally, ensure that negative arguments trigger a complaint (1 << ... is signed while 1U << ... is unsigned; adding 0U to an integer expression causes it to be promoted to an unsigned type). Signed-off-by: Bart Van Assche Cc: Mike Christie Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 311fa2f478b843..53ee1a2acd4fde 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -93,11 +93,18 @@ struct bio { #define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) #define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) -#define bio_set_op_attrs(bio, op, op_flags) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (bio)->bi_opf = bio_flags(bio); \ - (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \ - (bio)->bi_opf |= op_flags; \ +#define bio_set_op_attrs(bio, op, op_flags) do { \ + if (__builtin_constant_p(op)) \ + BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ + else \ + WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ + if (__builtin_constant_p(op_flags)) \ + BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ + else \ + WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ + (bio)->bi_opf = bio_flags(bio); \ + (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ + (bio)->bi_opf |= (op_flags); \ } while (0) #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) From 3f7c624aa58f769e0313ca3310704c5d88ac99ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Sep 2016 16:03:02 +0200 Subject: [PATCH 19/54] block: remove bio_destructor_t Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 53ee1a2acd4fde..cd395ecec99d01 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -16,7 +16,6 @@ struct block_device; struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); -typedef void (bio_destructor_t) (struct bio *); #ifdef CONFIG_BLOCK /* From fc95db3edeaf924e9ad16592d9c1b06c730a49c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Sep 2016 16:03:03 +0200 Subject: [PATCH 20/54] bio.h: remove a very outdated comment Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 
23ddf4b46a9b01..e00721a2dce133 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -1,6 +1,4 @@ /* - * 2.5 block I/O model - * * Copyright (C) 2001 Jens Axboe * * This program is free software; you can redistribute it and/or modify From c5c5ca777469f0ff854f1da0aff9b3a9051b3ef7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Sep 2016 16:03:04 +0200 Subject: [PATCH 21/54] block: remove IOPRIO_BITS Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index beb9ce1c2c2335..8c1239020d79d3 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -7,7 +7,6 @@ /* * Gives us 8 prio classes with 13-bits of data for each class */ -#define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) From 2849450ad39d2e699fda2d5c6f41e05d87fd7004 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 14 Sep 2016 13:28:30 -0400 Subject: [PATCH 22/54] blk-mq: introduce blk_mq_delay_kick_requeue_list() blk_mq_delay_kick_requeue_list() provides the ability to kick the q->requeue_list after a specified time. To do this the request_queue's 'requeue_work' member was changed to a delayed_work. blk_mq_delay_kick_requeue_list() allows DM to defer processing requeued requests while it doesn't make sense to immediately requeue them (e.g. when all paths in a DM multipath have failed). Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++++---- include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index eea0d230faa123..7ddc7969fba43b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -502,7 +502,7 @@ EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = - container_of(work, struct request_queue, requeue_work); + container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); struct request *rq, *next; unsigned long flags; @@ -557,16 +557,24 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_cancel_requeue_work(struct request_queue *q) { - cancel_work_sync(&q->requeue_work); + cancel_delayed_work_sync(&q->requeue_work); } EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_work(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); +void blk_mq_delay_kick_requeue_list(struct request_queue *q, + unsigned long msecs) +{ + kblockd_schedule_delayed_work(&q->requeue_work, + msecs_to_jiffies(msecs)); +} +EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); + void blk_mq_abort_requeue_list(struct request_queue *q) { unsigned long flags; @@ -2084,7 +2092,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->sg_reserved_size = INT_MAX; - INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ff14f68067aaa1..60ef14cbcd2da8 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -233,6 +233,7 @@ void blk_mq_requeue_request(struct request *rq); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); 
void blk_mq_cancel_requeue_work(struct request_queue *q); void blk_mq_kick_requeue_list(struct request_queue *q); +void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq, int error); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 69aae720f4ef38..c47c358ba0529c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -449,7 +449,7 @@ struct request_queue { struct list_head requeue_list; spinlock_t requeue_lock; - struct work_struct requeue_work; + struct delayed_work requeue_work; struct mutex sysfs_lock; From 703fd1c0f177219e3a84e6c095c31dc566514d81 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Sep 2016 13:59:14 -0600 Subject: [PATCH 23/54] blk-mq: account higher order dispatch We currently account a '0' dispatch, and anything above that still falls below the range set by BLK_MQ_MAX_DISPATCH_ORDER. If we dispatch more, we don't account it. Change the last bucket to be inclusive of anything above the range we track, and have the sysfs file reflect that by including a '+' in the output: $ cat /sys/block/nvme0n1/mq/0/dispatched 0 1006 1 20229 2 1 4 0 8 0 16 0 32+ 0 Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval --- block/blk-mq-sysfs.c | 8 +++++--- block/blk-mq.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index ac5160eb686253..3c385b196bc711 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -208,12 +208,14 @@ static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { - unsigned long d = 1U << (i - 1); + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); - page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); + page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]); } + page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1), + hctx->dispatched[i]); return page - start_page; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 7ddc7969fba43b..0cb93625d8121d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -789,6 +789,14 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } } +static inline unsigned int queued_to_index(unsigned int queued) +{ + if (!queued) + return 0; + + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); +} + /* * Run this hardware queue, pulling any software queues mapped to it in. * Note that this function currently has various problems around ordering @@ -877,10 +885,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) dptr = &driver_list; } - if (!queued) - hctx->dispatched[0]++; - else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) - hctx->dispatched[ilog2(queued) + 1]++; + hctx->dispatched[queued_to_index(queued)]++; /* * Any items that need requeuing? Stuff them into hctx->dispatch, From 88459642cba452630326b9cab1c651e09577d4e4 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 08:38:44 -0600 Subject: [PATCH 24/54] blk-mq: abstract tag allocation out into sbitmap library This is a generally useful data structure, so make it available to anyone else who might want to use it. It's also a nice cleanup separating the allocation logic from the rest of the tag handling logic. 
The code is behind a new Kconfig option, CONFIG_SBITMAP, which is only selected by CONFIG_BLOCK for now. This should be a complete noop functionality-wise. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + block/Kconfig | 1 + block/blk-mq-tag.c | 463 ++++++++++------------------------------ block/blk-mq-tag.h | 37 +--- block/blk-mq.c | 112 +++------- block/blk-mq.h | 9 - include/linux/blk-mq.h | 9 +- include/linux/sbitmap.h | 327 ++++++++++++++++++++++++++++ lib/Kconfig | 3 + lib/Makefile | 2 + lib/sbitmap.c | 301 ++++++++++++++++++++++++++ 11 files changed, 789 insertions(+), 476 deletions(-) create mode 100644 include/linux/sbitmap.h create mode 100644 lib/sbitmap.c diff --git a/MAINTAINERS b/MAINTAINERS index 71aa5daeae8f43..157b1ca3e19d25 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2451,6 +2451,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git S: Maintained F: block/ F: kernel/trace/blktrace.c +F: lib/sbitmap.c BLOCK2MTD DRIVER M: Joern Engel diff --git a/block/Kconfig b/block/Kconfig index 161491d0a879ed..5136ad4bb6d557 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -4,6 +4,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select SBITMAP help Provide block layer support for the kernel. diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 729bac3a673b7a..2cbdecd594e970 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,12 +1,7 @@ /* - * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread - * over multiple cachelines to avoid ping-pong between multiple submitters - * or submitter and completer. Uses rolling wakeups to avoid falling of - * the scaling cliff when we run out of tags and have to start putting - * submitters to sleep. - * - * Uses active queue tracking to support fairer distribution of tags - * between multiple submitters when a shared tag map is used. + * Tag allocation using scalable bitmaps. Uses active queue tracking to support + * fairer distribution of tags between multiple submitters when a shared tag map + * is used. * * Copyright (C) 2013-2014 Jens Axboe */ @@ -19,40 +14,12 @@ #include "blk-mq.h" #include "blk-mq-tag.h" -static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) -{ - int i; - - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - int ret; - - ret = find_first_zero_bit(&bm->word, bm->depth); - if (ret < bm->depth) - return true; - } - - return false; -} - bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { if (!tags) return true; - return bt_has_free_tags(&tags->bitmap_tags); -} - -static inline int bt_index_inc(int index) -{ - return (index + 1) & (BT_WAIT_QUEUES - 1); -} - -static inline void bt_index_atomic_inc(atomic_t *index) -{ - int old = atomic_read(index); - int new = bt_index_inc(old); - atomic_cmpxchg(index, old, new); + return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); } /* @@ -72,29 +39,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - struct blk_mq_bitmap_tags *bt; - int i, wake_index; - - /* - * Make sure all changes prior to this are visible from other CPUs. 
- */ - smp_mb(); - bt = &tags->bitmap_tags; - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) - wake_up(&bs->wait); - - wake_index = bt_index_inc(wake_index); - } - - if (include_reserve) { - bt = &tags->breserved_tags; - if (waitqueue_active(&bt->bs[0].wait)) - wake_up(&bt->bs[0].wait); - } + sbitmap_queue_wake_all(&tags->bitmap_tags); + if (include_reserve) + sbitmap_queue_wake_all(&tags->breserved_tags); } /* @@ -118,7 +65,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt) + struct sbitmap_queue *bt) { unsigned int depth, users; @@ -130,7 +77,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Don't try dividing an ant */ - if (bt->depth == 1) + if (bt->sb.depth == 1) return true; users = atomic_read(&hctx->tags->active_queues); @@ -140,127 +87,42 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Allow at least some tags */ - depth = max((bt->depth + users - 1) / users, 4U); + depth = max((bt->sb.depth + users - 1) / users, 4U); return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, - bool nowrap) -{ - int tag, org_last_tag = last_tag; - - while (1) { - tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); - if (unlikely(tag >= bm->depth)) { - /* - * We started with an offset, and we didn't reset the - * offset to 0 in a failure case, so start from 0 to - * exhaust the map. - */ - if (org_last_tag && last_tag && !nowrap) { - last_tag = org_last_tag = 0; - continue; - } - return -1; - } - - if (!test_and_set_bit(tag, &bm->word)) - break; - - last_tag = tag + 1; - if (last_tag >= bm->depth - 1) - last_tag = 0; - } - - return tag; -} - #define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) -/* - * Straight forward bitmap tag implementation, where each bit is a tag - * (cleared == free, and set == busy). The small twist is using per-cpu - * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue - * contexts. This enables us to drastically limit the space searched, - * without dirtying an extra shared cacheline like we would if we stored - * the cache value inside the shared blk_mq_bitmap_tags structure. On top - * of that, each word of tags is in a separate cacheline. This means that - * multiple users will tend to stick to different cachelines, at least - * until the map is exhausted. 
- */ -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, unsigned int *tag_cache, struct blk_mq_tags *tags) { - unsigned int last_tag, org_last_tag; - int index, i, tag; + unsigned int last_tag; + int tag; if (!hctx_may_queue(hctx, bt)) return -1; - last_tag = org_last_tag = *tag_cache; - index = TAG_TO_INDEX(bt, last_tag); + last_tag = *tag_cache; + tag = sbitmap_get(&bt->sb, last_tag, BT_ALLOC_RR(tags)); - for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), - BT_ALLOC_RR(tags)); - if (tag != -1) { - tag += (index << bt->bits_per_word); - goto done; - } - - /* - * Jump to next index, and reset the last tag to be the - * first tag of that index - */ - index++; - last_tag = (index << bt->bits_per_word); - - if (index >= bt->map_nr) { - index = 0; - last_tag = 0; - } - } - - *tag_cache = 0; - return -1; - - /* - * Only update the cache from the allocation path, if we ended - * up using the specific cached tag. - */ -done: - if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { + if (tag == -1) { + *tag_cache = 0; + } else if (tag == last_tag || unlikely(BT_ALLOC_RR(tags))) { last_tag = tag + 1; - if (last_tag >= bt->depth - 1) + if (last_tag >= bt->sb.depth - 1) last_tag = 0; - *tag_cache = last_tag; } return tag; } -static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx) -{ - struct bt_wait_state *bs; - int wait_index; - - if (!hctx) - return &bt->bs[0]; - - wait_index = atomic_read(&hctx->wait_index); - bs = &bt->bs[wait_index]; - bt_index_atomic_inc(&hctx->wait_index); - return bs; -} - static int bt_get(struct blk_mq_alloc_data *data, - struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag, struct blk_mq_tags *tags) + struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, struct blk_mq_tags *tags) { - struct bt_wait_state *bs; + struct sbq_wait_state *ws; DEFINE_WAIT(wait); int tag; @@ -271,9 +133,9 @@ static int bt_get(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; - bs = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, hctx); do { - prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) @@ -310,11 +172,11 @@ static int bt_get(struct blk_mq_alloc_data *data, hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); + finish_wait(&ws->wait, &wait); + ws = bt_wait_ptr(bt, hctx); } while (1); - finish_wait(&bs->wait, &wait); + finish_wait(&ws->wait, &wait); return tag; } @@ -354,53 +216,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return __blk_mq_get_tag(data); } -static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) -{ - int i, wake_index; - - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) { - int o = atomic_read(&bt->wake_index); - if (wake_index != o) - atomic_cmpxchg(&bt->wake_index, o, wake_index); - - return bs; - } - - wake_index = bt_index_inc(wake_index); - } - - return NULL; -} - -static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) -{ - const int index = TAG_TO_INDEX(bt, tag); - struct bt_wait_state *bs; - int wait_cnt; - - 
clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); - - /* Ensure that the wait list checks occur after clear_bit(). */ - smp_mb(); - - bs = bt_wake_ptr(bt); - if (!bs) - return; - - wait_cnt = atomic_dec_return(&bs->wait_cnt); - if (unlikely(wait_cnt < 0)) - wait_cnt = atomic_inc_return(&bs->wait_cnt); - if (wait_cnt == 0) { - atomic_add(bt->wake_cnt, &bs->wait_cnt); - bt_index_atomic_inc(&bt->wake_index); - wake_up(&bs->wait); - } -} - void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag) { @@ -410,67 +225,94 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - bt_clear_tag(&tags->bitmap_tags, real_tag); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag); if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) *last_tag = real_tag; } else { BUG_ON(tag >= tags->nr_reserved_tags); - bt_clear_tag(&tags->breserved_tags, tag); + sbitmap_queue_clear(&tags->breserved_tags, tag); } } -static void bt_for_each(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_iter_fn *fn, void *data, bool reserved) +struct bt_iter_data { + struct blk_mq_hw_ctx *hctx; + busy_iter_fn *fn; + void *data; + bool reserved; +}; + +static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_iter_data *iter_data = data; + struct blk_mq_hw_ctx *hctx = iter_data->hctx; + struct blk_mq_tags *tags = hctx->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = hctx->tags->rqs[off + bit]; - if (rq->q == hctx->queue) - fn(hctx, rq, data, reserved); - } + if (rq->q == hctx->queue) + iter_data->fn(hctx, rq, iter_data->data, reserved); + return true; +} - off += (1 << bt->bits_per_word); - } +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, + busy_iter_fn *fn, void *data, bool reserved) +{ + struct bt_iter_data iter_data = { + .hctx = hctx, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } -static void bt_tags_for_each(struct blk_mq_tags *tags, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_tag_iter_fn *fn, void *data, bool reserved) +struct bt_tags_iter_data { + struct blk_mq_tags *tags; + busy_tag_iter_fn *fn; + void *data; + bool reserved; +}; + +static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_tags_iter_data *iter_data = data; + struct blk_mq_tags *tags = iter_data->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - if (!tags->rqs) - return; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = tags->rqs[off + bit]; - fn(rq, data, reserved); - } + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - off += (1 << bt->bits_per_word); - } + iter_data->fn(rq, iter_data->data, reserved); + return true; +} + +static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, + busy_tag_iter_fn *fn, void *data, bool reserved) +{ + struct 
bt_tags_iter_data iter_data = { + .tags = tags, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + if (tags->rqs) + sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { if (tags->nr_reserved_tags) - bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); - bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false); } void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, @@ -529,107 +371,20 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); - } - -} - -static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) -{ - unsigned int i, used; - - for (i = 0, used = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - used += bitmap_weight(&bm->word, bm->depth); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } - return bt->depth - used; } -static void bt_update_count(struct blk_mq_bitmap_tags *bt, - unsigned int depth) +static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) { - unsigned int tags_per_word = 1U << bt->bits_per_word; - unsigned int map_depth = depth; - - if (depth) { - int i; - - for (i = 0; i < bt->map_nr; i++) { - bt->map[i].depth = min(map_depth, tags_per_word); - map_depth -= bt->map[i].depth; - } - } - - bt->wake_cnt = BT_WAIT_BATCH; - if (bt->wake_cnt > depth / BT_WAIT_QUEUES) - bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); - - bt->depth = depth; + return bt->sb.depth - sbitmap_weight(&bt->sb); } -static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, - int node, bool reserved) +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, int node) { - int i; - - bt->bits_per_word = ilog2(BITS_PER_LONG); - - /* - * Depth can be zero for reserved tags, that's not a failure - * condition. - */ - if (depth) { - unsigned int nr, tags_per_word; - - tags_per_word = (1 << bt->bits_per_word); - - /* - * If the tag space is small, shrink the number of tags - * per word so we spread over a few cachelines, at least. - * If less than 4 tags, just forget about it, it's not - * going to work optimally anyway. 
- */ - if (depth >= 4) { - while (tags_per_word * 4 > depth) { - bt->bits_per_word--; - tags_per_word = (1 << bt->bits_per_word); - } - } - - nr = ALIGN(depth, tags_per_word) / tags_per_word; - bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bt->map) - return -ENOMEM; - - bt->map_nr = nr; - } - - bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); - if (!bt->bs) { - kfree(bt->map); - bt->map = NULL; - return -ENOMEM; - } - - bt_update_count(bt, depth); - - for (i = 0; i < BT_WAIT_QUEUES; i++) { - init_waitqueue_head(&bt->bs[i].wait); - atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); - } - - return 0; -} - -static void bt_free(struct blk_mq_bitmap_tags *bt) -{ - kfree(bt->map); - kfree(bt->bs); + return sbitmap_queue_init_node(bt, depth, -1, GFP_KERNEL, node); } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, @@ -639,14 +394,15 @@ static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, tags->alloc_policy = alloc_policy; - if (bt_alloc(&tags->bitmap_tags, depth, node, false)) - goto enomem; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) - goto enomem; + if (bt_alloc(&tags->bitmap_tags, depth, node)) + goto free_tags; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node)) + goto free_bitmap_tags; return tags; -enomem: - bt_free(&tags->bitmap_tags); +free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +free_tags: kfree(tags); return NULL; } @@ -679,8 +435,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags) { - bt_free(&tags->bitmap_tags); - bt_free(&tags->breserved_tags); + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); free_cpumask_var(tags->cpumask); kfree(tags); } @@ -702,7 +458,8 @@ int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) * Don't need (or can't) update reserved tags here, they remain * static and should never need resizing. */ - bt_update_count(&tags->bitmap_tags, tdepth); + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags, false); return 0; } @@ -746,7 +503,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " "bits_per_word=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->bitmap_tags.bits_per_word); + 1U << tags->bitmap_tags.sb.shift); free = bt_unused_tags(&tags->bitmap_tags); res = bt_unused_tags(&tags->breserved_tags); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d468a79f2c4a2c..3215c08c63cc96 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -3,31 +3,6 @@ #include "blk-mq.h" -enum { - BT_WAIT_QUEUES = 8, - BT_WAIT_BATCH = 8, -}; - -struct bt_wait_state { - atomic_t wait_cnt; - wait_queue_head_t wait; -} ____cacheline_aligned_in_smp; - -#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) -#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) - -struct blk_mq_bitmap_tags { - unsigned int depth; - unsigned int wake_cnt; - unsigned int bits_per_word; - - unsigned int map_nr; - struct blk_align_bitmap *map; - - atomic_t wake_index; - struct bt_wait_state *bs; -}; - /* * Tag address space map. 
*/ @@ -37,8 +12,8 @@ struct blk_mq_tags { atomic_t active_queues; - struct blk_mq_bitmap_tags bitmap_tags; - struct blk_mq_bitmap_tags breserved_tags; + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags; struct request **rqs; struct list_head page_list; @@ -61,6 +36,14 @@ extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + enum { BLK_MQ_TAG_CACHE_MIN = 1, BLK_MQ_TAG_CACHE_MAX = 64, diff --git a/block/blk-mq.c b/block/blk-mq.c index 0cb93625d8121d..6603be18064e3d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -41,42 +41,23 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - unsigned int i; - - for (i = 0; i < hctx->ctx_map.size; i++) - if (hctx->ctx_map.map[i].word) - return true; - - return false; -} - -static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx) -{ - return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; + return sbitmap_any_bit_set(&hctx->ctx_map); } -#define CTX_TO_BIT(hctx, ctx) \ - ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) - /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) - set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) + sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } void blk_mq_freeze_queue_start(struct request_queue *q) @@ -755,38 +736,36 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } +struct flush_busy_ctx_data { + struct blk_mq_hw_ctx *hctx; + struct list_head *list; +}; + +static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_busy_ctx_data *flush_data = data; + struct blk_mq_hw_ctx *hctx = flush_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + sbitmap_clear_bit(sb, bitnr); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, flush_data->list); + spin_unlock(&ctx->lock); + return true; +} + /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - struct blk_mq_ctx *ctx; - int i; - - for (i = 0; i < hctx->ctx_map.size; i++) { - struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; - unsigned int off, bit; - - if (!bm->word) - continue; - - bit = 0; - off = i * hctx->ctx_map.bits_per_word; - do { - bit = find_next_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; - - ctx = hctx->ctxs[bit + off]; - clear_bit(bit, &bm->word); - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, list); - spin_unlock(&ctx->lock); + struct flush_busy_ctx_data data = { + .hctx = hctx, + .list = list, + }; - bit++; - } while (1); - } + sbitmap_for_each_set(&hctx->ctx_map, 
flush_busy_ctx, &data); } static inline unsigned int queued_to_index(unsigned int queued) @@ -1609,32 +1588,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return NULL; } -static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) -{ - kfree(bitmap->map); -} - -static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) -{ - unsigned int bpw = 8, total, num_maps, i; - - bitmap->bits_per_word = bpw; - - num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; - bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bitmap->map) - return -ENOMEM; - - total = nr_cpu_ids; - for (i = 0; i < num_maps; i++) { - bitmap->map[i].depth = min(total, bitmap->bits_per_word); - total -= bitmap->map[i].depth; - } - - return 0; -} - /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it @@ -1700,7 +1653,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); blk_free_flush_queue(hctx->fq); - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -1760,7 +1713,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->ctxs) goto unregister_cpu_notifier; - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, + node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -1787,7 +1741,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); unregister_cpu_notifier: @@ -1863,8 +1817,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, mutex_unlock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_ctxmap *map = &hctx->ctx_map; - /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -1890,7 +1842,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ - map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Initialize batch roundrobin counts diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b70a..71831f970fd372 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -63,15 +63,6 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); -/* - * Basic implementation of sparser bitmap, allowing the user to spread - * the bits over more cachelines. 
- */ -struct blk_align_bitmap { - unsigned long word; - unsigned long depth; -} ____cacheline_aligned_in_smp; - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 60ef14cbcd2da8..2575779cf13f16 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -2,6 +2,7 @@ #define BLK_MQ_H #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -12,12 +13,6 @@ struct blk_mq_cpu_notifier { int (*notify)(void *data, unsigned long action, unsigned int cpu); }; -struct blk_mq_ctxmap { - unsigned int size; - unsigned int bits_per_word; - struct blk_align_bitmap *map; -}; - struct blk_mq_hw_ctx { struct { spinlock_t lock; @@ -37,7 +32,7 @@ struct blk_mq_hw_ctx { void *driver_data; - struct blk_mq_ctxmap ctx_map; + struct sbitmap ctx_map; struct blk_mq_ctx **ctxs; unsigned int nr_ctx; diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h new file mode 100644 index 00000000000000..1a3b836042e152 --- /dev/null +++ b/include/linux/sbitmap.h @@ -0,0 +1,327 @@ +/* + * Fast and scalable bitmaps. + * + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __LINUX_SCALE_BITMAP_H +#define __LINUX_SCALE_BITMAP_H + +#include +#include + +/** + * struct sbitmap_word - Word in a &struct sbitmap. + */ +struct sbitmap_word { + /** + * @word: The bitmap word itself. + */ + unsigned long word; + + /** + * @depth: Number of bits being used in @word. + */ + unsigned long depth; +} ____cacheline_aligned_in_smp; + +/** + * struct sbitmap - Scalable bitmap. + * + * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This + * trades off higher memory usage for better scalability. + */ +struct sbitmap { + /** + * @depth: Number of bits used in the whole bitmap. + */ + unsigned int depth; + + /** + * @shift: log2(number of bits used per word) + */ + unsigned int shift; + + /** + * @map_nr: Number of words (cachelines) being used for the bitmap. + */ + unsigned int map_nr; + + /** + * @map: Allocated bitmap. + */ + struct sbitmap_word *map; +}; + +#define SBQ_WAIT_QUEUES 8 +#define SBQ_WAKE_BATCH 8 + +/** + * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. + */ +struct sbq_wait_state { + /** + * @wait_cnt: Number of frees remaining before we wake up. + */ + atomic_t wait_cnt; + + /** + * @wait: Wait queue. + */ + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +/** + * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free + * bits. + * + * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to + * avoid contention on the wait queue spinlock. This ensures that we don't hit a + * scalability wall when we run out of free bits and have to start putting tasks + * to sleep. + */ +struct sbitmap_queue { + /** + * @sb: Scalable bitmap. 
+ */ + struct sbitmap sb; + + /** + * @wake_batch: Number of bits which must be freed before we wake up any + * waiters. + */ + unsigned int wake_batch; + + /** + * @wake_index: Next wait queue in @ws to wake up. + */ + atomic_t wake_index; + + /** + * @ws: Wait queues. + */ + struct sbq_wait_state *ws; +}; + +/** + * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. + * @sb: Bitmap to initialize. + * @depth: Number of bits to allocate. + * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if + * given, a good default is chosen. + * @flags: Allocation flags. + * @node: Memory node to allocate on. + * + * Return: Zero on success or negative errno on failure. + */ +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node); + +/** + * sbitmap_free() - Free memory used by a &struct sbitmap. + * @sb: Bitmap to free. + */ +static inline void sbitmap_free(struct sbitmap *sb) +{ + kfree(sb->map); + sb->map = NULL; +} + +/** + * sbitmap_resize() - Resize a &struct sbitmap. + * @sb: Bitmap to resize. + * @depth: New number of bits to resize to. + * + * Doesn't reallocate anything. It's up to the caller to ensure that the new + * depth doesn't exceed the depth that the sb was initialized with. + */ +void sbitmap_resize(struct sbitmap *sb, unsigned int depth); + +/** + * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. + * @sb: Bitmap to allocate from. + * @alloc_hint: Hint for where to start searching for a free bit. + * @round_robin: If true, be stricter about allocation order; always allocate + * starting from the last allocated bit. This is less efficient + * than the default behavior (false). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); + +/** + * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. + * @sb: Bitmap to check. + * + * Return: true if any bit in the bitmap is set, false otherwise. + */ +bool sbitmap_any_bit_set(const struct sbitmap *sb); + +/** + * sbitmap_any_bit_clear() - Check for an unset bit in a &struct + * sbitmap. + * @sb: Bitmap to check. + * + * Return: true if any bit in the bitmap is clear, false otherwise. + */ +bool sbitmap_any_bit_clear(const struct sbitmap *sb); + +typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); + +/** + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + * + * This is inline even though it's non-trivial so that the function calls to the + * callback will hopefully get optimized away. 
+ */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + struct sbitmap_word *word = &sb->map[i]; + unsigned int off, nr; + + if (!word->word) + continue; + + nr = 0; + off = i << sb->shift; + while (1) { + nr = find_next_bit(&word->word, word->depth, nr); + if (nr >= word->depth) + break; + + if (!fn(sb, off + nr, data)) + return; + + nr++; + } + } +} + +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + +static inline unsigned long *__sbitmap_word(struct sbitmap *sb, + unsigned int bitnr) +{ + return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; +} + +/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ + +static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) +{ + set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) +{ + clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) +{ + return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +unsigned int sbitmap_weight(const struct sbitmap *sb); + +/** + * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific + * memory node. + * @sbq: Bitmap queue to initialize. + * @depth: See sbitmap_init_node(). + * @shift: See sbitmap_init_node(). + * @flags: Allocation flags. + * @node: Memory node to allocate on. + * + * Return: Zero on success or negative errno on failure. + */ +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, gfp_t flags, int node); + +/** + * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. + * + * @sbq: Bitmap queue to free. + */ +static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) +{ + kfree(sbq->ws); + sbitmap_free(&sbq->sb); +} + +/** + * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. + * @sbq: Bitmap queue to resize. + * @depth: New number of bits to resize to. + * + * Like sbitmap_resize(), this doesn't reallocate anything. It has to do + * some extra work on the &struct sbitmap_queue, so it's not safe to just + * resize the underlying &struct sbitmap. + */ +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); + +/** + * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a + * &struct sbitmap_queue. + * @sbq: Bitmap to free from. + * @nr: Bit number to free. + */ +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr); + +static inline int sbq_index_inc(int index) +{ + return (index + 1) & (SBQ_WAIT_QUEUES - 1); +} + +static inline void sbq_index_atomic_inc(atomic_t *index) +{ + int old = atomic_read(index); + int new = sbq_index_inc(old); + atomic_cmpxchg(index, old, new); +} + +/** + * sbq_wait_ptr() - Get the next wait queue to use for a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to wait on. + * @wait_index: A counter per "user" of @sbq. + */ +static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, + atomic_t *wait_index) +{ + struct sbq_wait_state *ws; + + ws = &sbq->ws[atomic_read(wait_index)]; + sbq_index_atomic_inc(wait_index); + return ws; +} + +/** + * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to wake up. 
+ */ +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); + +#endif /* __LINUX_SCALE_BITMAP_H */ diff --git a/lib/Kconfig b/lib/Kconfig index d79909dc01ec83..942fb8091a8607 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -550,4 +550,7 @@ config STACKDEPOT bool select STACKTRACE +config SBITMAP + bool + endmenu diff --git a/lib/Makefile b/lib/Makefile index cfa68eb269e4b6..2cbfd2904994b0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -228,3 +228,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n + +obj-$(CONFIG_SBITMAP) += sbitmap.o diff --git a/lib/sbitmap.c b/lib/sbitmap.c new file mode 100644 index 00000000000000..dfc084ac6937cc --- /dev/null +++ b/lib/sbitmap.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node) +{ + unsigned int bits_per_word; + unsigned int i; + + if (shift < 0) { + shift = ilog2(BITS_PER_LONG); + /* + * If the bitmap is small, shrink the number of bits per word so + * we spread over a few cachelines, at least. If less than 4 + * bits, just forget about it, it's not going to work optimally + * anyway. + */ + if (depth >= 4) { + while ((4U << shift) > depth) + shift--; + } + } + bits_per_word = 1U << shift; + if (bits_per_word > BITS_PER_LONG) + return -EINVAL; + + sb->shift = shift; + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + if (depth == 0) { + sb->map = NULL; + return 0; + } + + sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); + if (!sb->map) + return -ENOMEM; + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_init_node); + +void sbitmap_resize(struct sbitmap *sb, unsigned int depth) +{ + unsigned int bits_per_word = 1U << sb->shift; + unsigned int i; + + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } +} +EXPORT_SYMBOL_GPL(sbitmap_resize); + +static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, + bool wrap) +{ + unsigned int orig_hint = hint; + int nr; + + while (1) { + nr = find_next_zero_bit(&word->word, word->depth, hint); + if (unlikely(nr >= word->depth)) { + /* + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to + * exhaust the map. 
+ */ + if (orig_hint && hint && wrap) { + hint = orig_hint = 0; + continue; + } + return -1; + } + + if (!test_and_set_bit(nr, &word->word)) + break; + + hint = nr + 1; + if (hint >= word->depth - 1) + hint = 0; + } + + return nr; +} + +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) +{ + unsigned int i, index; + int nr = -1; + + index = SB_NR_TO_INDEX(sb, alloc_hint); + + for (i = 0; i < sb->map_nr; i++) { + nr = __sbitmap_get_word(&sb->map[index], + SB_NR_TO_BIT(sb, alloc_hint), + !round_robin); + if (nr != -1) { + nr += index << sb->shift; + break; + } + + /* Jump to next index. */ + index++; + alloc_hint = index << sb->shift; + + if (index >= sb->map_nr) { + index = 0; + alloc_hint = 0; + } + } + + return nr; +} +EXPORT_SYMBOL_GPL(sbitmap_get); + +bool sbitmap_any_bit_set(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + if (sb->map[i].word) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); + +bool sbitmap_any_bit_clear(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + unsigned long ret; + + ret = find_first_zero_bit(&word->word, word->depth); + if (ret < word->depth) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); + +unsigned int sbitmap_weight(const struct sbitmap *sb) +{ + unsigned int i, weight; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + + weight += bitmap_weight(&word->word, word->depth); + } + return weight; +} +EXPORT_SYMBOL_GPL(sbitmap_weight); + +static unsigned int sbq_calc_wake_batch(unsigned int depth) +{ + unsigned int wake_batch; + + /* + * For each batch, we wake up one queue. We need to make sure that our + * batch size is small enough that the full depth of the bitmap is + * enough to wake up all of the queues. 
+ */ + wake_batch = SBQ_WAKE_BATCH; + if (wake_batch > depth / SBQ_WAIT_QUEUES) + wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); + + return wake_batch; +} + +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, gfp_t flags, int node) +{ + int ret; + int i; + + ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node); + if (ret) + return ret; + + sbq->wake_batch = sbq_calc_wake_batch(depth); + atomic_set(&sbq->wake_index, 0); + + sbq->ws = kzalloc(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags); + if (!sbq->ws) { + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + init_waitqueue_head(&sbq->ws[i].wait); + atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); + } + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); + +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) +{ + sbq->wake_batch = sbq_calc_wake_batch(depth); + sbitmap_resize(&sbq->sb, depth); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_resize); + +static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) { + int o = atomic_read(&sbq->wake_index); + + if (wake_index != o) + atomic_cmpxchg(&sbq->wake_index, o, wake_index); + return ws; + } + + wake_index = sbq_index_inc(wake_index); + } + + return NULL; +} + +static void sbq_wake_up(struct sbitmap_queue *sbq) +{ + struct sbq_wait_state *ws; + int wait_cnt; + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); + + ws = sbq_wake_ptr(sbq); + if (!ws) + return; + + wait_cnt = atomic_dec_return(&ws->wait_cnt); + if (unlikely(wait_cnt < 0)) + wait_cnt = atomic_inc_return(&ws->wait_cnt); + if (wait_cnt == 0) { + atomic_add(sbq->wake_batch, &ws->wait_cnt); + sbq_index_atomic_inc(&sbq->wake_index); + wake_up(&ws->wait); + } +} + +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr) +{ + sbitmap_clear_bit(&sbq->sb, nr); + sbq_wake_up(sbq); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_clear); + +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) + wake_up(&ws->wait); + + wake_index = sbq_index_inc(wake_index); + } +} +EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); From 48e28166a7b608e19a6aea3acadd81cdfe660f6b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:22 -0700 Subject: [PATCH 25/54] sbitmap: allocate wait queues on a specific node The original bt_alloc() we converted from was using kzalloc(), not kzalloc_node(), to allocate the wait queues. This was probably an oversight, so fix it for sbitmap_queue_init_node(). 
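
For illustration (this sketch is not part of the patch itself), a hypothetical NUMA-aware caller of the sbitmap_queue API as it stands at this point in the series would look roughly like the following; the example_* helper names and the explicit node argument are invented for illustration only:

#include <linux/sbitmap.h>
#include <linux/slab.h>

/*
 * Illustrative only: initialize a queue on a specific memory node, then
 * allocate and free one bit. Signatures match the API introduced above.
 */
static int example_setup(struct sbitmap_queue *sbq, unsigned int depth, int node)
{
	/* shift < 0 lets sbitmap pick a sensible bits-per-word value */
	return sbitmap_queue_init_node(sbq, depth, -1, GFP_KERNEL, node);
}

static void example_use(struct sbitmap_queue *sbq)
{
	/* sbitmap_get() returns a free bit number, or -1 if the map is full */
	int nr = sbitmap_get(&sbq->sb, 0, false);

	if (nr >= 0)
		sbitmap_queue_clear(sbq, nr);	/* frees the bit, wakes waiters */
}
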
Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- lib/sbitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index dfc084ac6937cc..4d8e97e470ee93 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -208,7 +208,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, sbq->wake_batch = sbq_calc_wake_batch(depth); atomic_set(&sbq->wake_index, 0); - sbq->ws = kzalloc(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags); + sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { sbitmap_free(&sbq->sb); return -ENOMEM; From 40aabb67464d5aad9ca3d2a5fedee56e2ff45aa0 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:23 -0700 Subject: [PATCH 26/54] sbitmap: push per-cpu last_tag into sbitmap_queue Allocating your own per-cpu allocation hint separately makes for an awkward API. Instead, allocate the per-cpu hint as part of the struct sbitmap_queue. There's no point for a struct sbitmap_queue without the cache, but you can still use a bare struct sbitmap. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 53 +++++++++++++---------------------------- block/blk-mq-tag.h | 3 ++- block/blk-mq.c | 2 +- block/blk-mq.h | 2 -- include/linux/sbitmap.h | 45 +++++++++++++++++++++++++++++++++- lib/sbitmap.c | 35 ++++++++++++++++++++++++++- 6 files changed, 98 insertions(+), 42 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2cbdecd594e970..c9a22dbbbda1e7 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -94,39 +94,21 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, #define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - unsigned int *tag_cache, struct blk_mq_tags *tags) + struct blk_mq_tags *tags) { - unsigned int last_tag; - int tag; - if (!hctx_may_queue(hctx, bt)) return -1; - - last_tag = *tag_cache; - tag = sbitmap_get(&bt->sb, last_tag, BT_ALLOC_RR(tags)); - - if (tag == -1) { - *tag_cache = 0; - } else if (tag == last_tag || unlikely(BT_ALLOC_RR(tags))) { - last_tag = tag + 1; - if (last_tag >= bt->sb.depth - 1) - last_tag = 0; - *tag_cache = last_tag; - } - - return tag; + return __sbitmap_queue_get(bt, BT_ALLOC_RR(tags)); } -static int bt_get(struct blk_mq_alloc_data *data, - struct sbitmap_queue *bt, - struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag, struct blk_mq_tags *tags) +static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags) { struct sbq_wait_state *ws; DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt, tags); if (tag != -1) return tag; @@ -137,7 +119,7 @@ static int bt_get(struct blk_mq_alloc_data *data, do { prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt, tags); if (tag != -1) break; @@ -154,7 +136,7 @@ static int bt_get(struct blk_mq_alloc_data *data, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
*/ - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt, tags); if (tag != -1) break; @@ -168,7 +150,6 @@ static int bt_get(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { - last_tag = &data->ctx->last_tag; hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } @@ -185,7 +166,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) int tag; tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag, data->hctx->tags); + data->hctx->tags); if (tag >= 0) return tag + data->hctx->tags->nr_reserved_tags; @@ -194,15 +175,15 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) { - int tag, zero = 0; + int tag; if (unlikely(!data->hctx->tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, - data->hctx->tags); + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, + data->hctx->tags); if (tag < 0) return BLK_MQ_TAG_FAIL; @@ -216,8 +197,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return __blk_mq_get_tag(data); } -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, - unsigned int *last_tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + unsigned int tag) { struct blk_mq_tags *tags = hctx->tags; @@ -225,12 +206,12 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - sbitmap_queue_clear(&tags->bitmap_tags, real_tag); - if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) - *last_tag = real_tag; + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, + BT_ALLOC_RR(tags), ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - sbitmap_queue_clear(&tags->breserved_tags, tag); + sbitmap_queue_clear(&tags->breserved_tags, tag, + BT_ALLOC_RR(tags), ctx->cpu); } } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 3215c08c63cc96..2b1d52ed82e018 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -27,7 +27,8 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); diff --git a/block/blk-mq.c b/block/blk-mq.c index 6603be18064e3d..e0a69daddbd8ce 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -303,7 +303,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx, tag, &ctx->last_tag); + blk_mq_put_tag(hctx, ctx, tag); blk_queue_exit(q); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 71831f970fd372..9b15d2ef7f7bf2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -12,8 +12,6 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int last_tag ____cacheline_aligned_in_smp; - /* incremented at dispatch time */ unsigned long 
rq_dispatched[2]; unsigned long rq_merged; diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 1a3b836042e152..6745545e0b2253 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -99,6 +99,14 @@ struct sbitmap_queue { */ struct sbitmap sb; + /* + * @alloc_hint: Cache of last successfully allocated or freed bit. + * + * This is per-cpu, which allows multiple users to stick to different + * cachelines until the map is exhausted. + */ + unsigned int __percpu *alloc_hint; + /** * @wake_batch: Number of bits which must be freed before we wake up any * waiters. @@ -267,6 +275,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) { kfree(sbq->ws); + free_percpu(sbq->alloc_hint); sbitmap_free(&sbq->sb); } @@ -281,13 +290,47 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) */ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); +/** + * __sbitmap_queue_get() - Try to allocate a free bit from a &struct + * sbitmap_queue with preemption already disabled. + * @sbq: Bitmap queue to allocate from. + * @round_robin: See sbitmap_get(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin); + +/** + * sbitmap_queue_get() - Try to allocate a free bit from a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to allocate from. + * @round_robin: See sbitmap_get(). + * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to + * sbitmap_queue_clear()). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin, + unsigned int *cpu) +{ + int nr; + + *cpu = get_cpu(); + nr = __sbitmap_queue_get(sbq, round_robin); + put_cpu(); + return nr; +} + /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. + * @round_robin: See sbitmap_get(). + * @cpu: CPU the bit was allocated on. */ -void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr); +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + bool round_robin, unsigned int cpu); static inline int sbq_index_inc(int index) { diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 4d8e97e470ee93..1651ad9d553095 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -205,11 +205,18 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, if (ret) return ret; + sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags); + if (!sbq->alloc_hint) { + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + sbq->wake_batch = sbq_calc_wake_batch(depth); atomic_set(&sbq->wake_index, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { + free_percpu(sbq->alloc_hint); sbitmap_free(&sbq->sb); return -ENOMEM; } @@ -229,6 +236,29 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); +int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin) +{ + unsigned int hint; + int nr; + + hint = this_cpu_read(*sbq->alloc_hint); + nr = sbitmap_get(&sbq->sb, hint, round_robin); + + if (nr == -1) { + /* If the map is full, a hint won't do us much good. 
*/ + this_cpu_write(*sbq->alloc_hint, 0); + } else if (nr == hint || unlikely(round_robin)) { + /* Only update the hint if we used it. */ + hint = nr + 1; + if (hint >= sbq->sb.depth - 1) + hint = 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + + return nr; +} +EXPORT_SYMBOL_GPL(__sbitmap_queue_get); + static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) { int i, wake_index; @@ -273,10 +303,13 @@ static void sbq_wake_up(struct sbitmap_queue *sbq) } } -void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr) +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + bool round_robin, unsigned int cpu) { sbitmap_clear_bit(&sbq->sb, nr); sbq_wake_up(sbq); + if (likely(!round_robin)) + *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); From f4a644db86669d938c71f19560aebf69d4720d63 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:24 -0700 Subject: [PATCH 27/54] sbitmap: push alloc policy into sbitmap_queue Again, there's no point in passing this in every time. Make it part of struct sbitmap_queue and clean up the API. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 33 +++++++++++++++------------------ block/blk-mq-tag.h | 1 - include/linux/sbitmap.h | 19 +++++++++++-------- lib/sbitmap.c | 14 ++++++++------ 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c9a22dbbbda1e7..e1c2bedb0bf9cf 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -91,14 +91,11 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) - -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - struct blk_mq_tags *tags) +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { if (!hctx_may_queue(hctx, bt)) return -1; - return __sbitmap_queue_get(bt, BT_ALLOC_RR(tags)); + return __sbitmap_queue_get(bt); } static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, @@ -108,7 +105,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, tags); + tag = __bt_get(hctx, bt); if (tag != -1) return tag; @@ -119,7 +116,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, do { prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, tags); + tag = __bt_get(hctx, bt); if (tag != -1) break; @@ -136,7 +133,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
*/ - tag = __bt_get(hctx, bt, tags); + tag = __bt_get(hctx, bt); if (tag != -1) break; @@ -206,12 +203,10 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - sbitmap_queue_clear(&tags->bitmap_tags, real_tag, - BT_ALLOC_RR(tags), ctx->cpu); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - sbitmap_queue_clear(&tags->breserved_tags, tag, - BT_ALLOC_RR(tags), ctx->cpu); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } @@ -363,21 +358,23 @@ static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) return bt->sb.depth - sbitmap_weight(&bt->sb); } -static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, int node) +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, + bool round_robin, int node) { - return sbitmap_queue_init_node(bt, depth, -1, GFP_KERNEL, node); + return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, + node); } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - tags->alloc_policy = alloc_policy; - - if (bt_alloc(&tags->bitmap_tags, depth, node)) + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) goto free_tags; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node)) + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, + node)) goto free_bitmap_tags; return tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 2b1d52ed82e018..f90b850ce43d99 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -18,7 +18,6 @@ struct blk_mq_tags { struct request **rqs; struct list_head page_list; - int alloc_policy; cpumask_var_t cpumask; }; diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 6745545e0b2253..f017fd6e69c4f6 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -122,6 +122,11 @@ struct sbitmap_queue { * @ws: Wait queues. */ struct sbq_wait_state *ws; + + /** + * @round_robin: Allocate bits in strict round-robin order. + */ + bool round_robin; }; /** @@ -259,13 +264,14 @@ unsigned int sbitmap_weight(const struct sbitmap *sb); * @sbq: Bitmap queue to initialize. * @depth: See sbitmap_init_node(). * @shift: See sbitmap_init_node(). + * @round_robin: See sbitmap_get(). * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, - int shift, gfp_t flags, int node); + int shift, bool round_robin, gfp_t flags, int node); /** * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. @@ -294,29 +300,27 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); * __sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue with preemption already disabled. * @sbq: Bitmap queue to allocate from. - * @round_robin: See sbitmap_get(). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ -int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin); +int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. * @sbq: Bitmap queue to allocate from. - * @round_robin: See sbitmap_get(). 
* @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ -static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin, +static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) { int nr; *cpu = get_cpu(); - nr = __sbitmap_queue_get(sbq, round_robin); + nr = __sbitmap_queue_get(sbq); put_cpu(); return nr; } @@ -326,11 +330,10 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin, * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. - * @round_robin: See sbitmap_get(). * @cpu: CPU the bit was allocated on. */ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, - bool round_robin, unsigned int cpu); + unsigned int cpu); static inline int sbq_index_inc(int index) { diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 1651ad9d553095..be55f744b7713e 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -196,7 +196,7 @@ static unsigned int sbq_calc_wake_batch(unsigned int depth) } int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, - int shift, gfp_t flags, int node) + int shift, bool round_robin, gfp_t flags, int node) { int ret; int i; @@ -225,6 +225,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, init_waitqueue_head(&sbq->ws[i].wait); atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); } + + sbq->round_robin = round_robin; return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); @@ -236,18 +238,18 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); -int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin) +int __sbitmap_queue_get(struct sbitmap_queue *sbq) { unsigned int hint; int nr; hint = this_cpu_read(*sbq->alloc_hint); - nr = sbitmap_get(&sbq->sb, hint, round_robin); + nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); if (nr == -1) { /* If the map is full, a hint won't do us much good. */ this_cpu_write(*sbq->alloc_hint, 0); - } else if (nr == hint || unlikely(round_robin)) { + } else if (nr == hint || unlikely(sbq->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; if (hint >= sbq->sb.depth - 1) @@ -304,11 +306,11 @@ static void sbq_wake_up(struct sbitmap_queue *sbq) } void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, - bool round_robin, unsigned int cpu) + unsigned int cpu) { sbitmap_clear_bit(&sbq->sb, nr); sbq_wake_up(sbq); - if (likely(!round_robin)) + if (likely(!sbq->round_robin)) *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); From 98d95416dbfaf4910caadfb4ddc75e4aacbdff8c Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:25 -0700 Subject: [PATCH 28/54] sbitmap: randomize initial alloc_hint values In order to get good cache behavior from a sbitmap, we want each CPU to stick to its own cacheline(s) as much as possible. This might happen naturally as the bitmap gets filled up and the alloc_hint values spread out, but we really want this behavior from the start. blk-mq apparently intended to do this, but the code to do this was never wired up. Get rid of the dead code and make it part of the sbitmap library. 
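To illustrate the idea, the seeding amounts to the following minimal sketch (not the exact hunk applied below; the loop variable is only for illustration):

        /*
         * Sketch: give each possible CPU a random starting hint so that
         * allocations begin spread across the bitmap (and thus across
         * different cachelines) instead of all starting at bit 0.
         */
        if (depth && !round_robin) {
                unsigned int cpu;

                for_each_possible_cpu(cpu)
                        *per_cpu_ptr(sbq->alloc_hint, cpu) = prandom_u32() % depth;
        }
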
Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 8 -------- block/blk-mq-tag.h | 1 - lib/sbitmap.c | 6 ++++++ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e1c2bedb0bf9cf..cef618f6fc9213 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -7,7 +7,6 @@ */ #include #include -#include #include #include "blk.h" @@ -419,13 +418,6 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) kfree(tags); } -void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) -{ - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; - - *tag = prandom_u32() % depth; -} - int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) { tdepth -= tags->nr_reserved_tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index f90b850ce43d99..09f4cc0aaa84c9 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -30,7 +30,6 @@ extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); -extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/lib/sbitmap.c b/lib/sbitmap.c index be55f744b7713e..928b82a733f2cc 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -15,6 +15,7 @@ * along with this program. If not, see . */ +#include #include int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, @@ -211,6 +212,11 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, return -ENOMEM; } + if (depth && !round_robin) { + for_each_possible_cpu(i) + *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth; + } + sbq->wake_batch = sbq_calc_wake_batch(depth); atomic_set(&sbq->wake_index, 0); From 05fd095d53b979878f016c3a7080d3683cc89d72 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:26 -0700 Subject: [PATCH 29/54] sbitmap: re-initialize allocation hints after resize After a struct sbitmap_queue is resized smaller, the allocation hints may still be set to bits beyond the new depth of the bitmap. This means that, for example, if the number of blk-mq tags is reduced through sysfs, more requests than the nominal queue depth may be in flight. It's tempting to fix this at resize time by doing a one-time reinitialization of the hints, but this can race with __sbitmap_queue_get() updating the hint. Instead, check the hint before we use it. This caused no measurable performance difference in my synthetic benchmarks. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- lib/sbitmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 928b82a733f2cc..f736c52a712c06 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -246,10 +246,15 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_resize); int __sbitmap_queue_get(struct sbitmap_queue *sbq) { - unsigned int hint; + unsigned int hint, depth; int nr; hint = this_cpu_read(*sbq->alloc_hint); + depth = READ_ONCE(sbq->sb.depth); + if (unlikely(hint >= depth)) { + hint = depth ? 
prandom_u32() % depth : 0; + this_cpu_write(*sbq->alloc_hint, hint); + } nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); if (nr == -1) { @@ -258,7 +263,7 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq) } else if (nr == hint || unlikely(sbq->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; - if (hint >= sbq->sb.depth - 1) + if (hint >= depth - 1) hint = 0; this_cpu_write(*sbq->alloc_hint, hint); } From 5c64a8df0ca88c79c9cb74674c2481e5f7ede511 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 12:20:54 -0700 Subject: [PATCH 30/54] sbitmap: don't update the allocation hint on clear after resize If we have a bunch of high-numbered bits allocated and then we resize the struct sbitmap_queue, when those bits get cleared, we'll update the hint and then have to re-randomize it repeatedly. Avoid that by checking that the cleared bit is still a valid hint. No measurable performance difference in the common case. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- lib/sbitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index f736c52a712c06..e408089215445c 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -321,7 +321,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, { sbitmap_clear_bit(&sbq->sb, nr); sbq_wake_up(sbq); - if (likely(!sbq->round_robin)) + if (likely(!sbq->round_robin && nr < sbq->sb.depth)) *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); From 60658e0dc1df058607990278fdf9d831e0c2c71a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 19 Sep 2016 14:34:08 +0100 Subject: [PATCH 31/54] sbitmap: initialize weight to zero Variable weight is not being initialized to zero before it is used to compute the weight sum. Ensure it is initialized to zero. Found with static analysis with cppcheck: [lib/sbitmap.c:177]: (error) Uninitialized variable: weight Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- lib/sbitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index e408089215445c..2cecf05c82fd83 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); unsigned int sbitmap_weight(const struct sbitmap *sb) { - unsigned int i, weight; + unsigned int i, weight = 0; for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; From e105ddb4a2a13d779311349df2c32fa22a87c406 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 16 Sep 2016 14:25:03 +0200 Subject: [PATCH 32/54] lightnvm: NVM should depend on HAS_DMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If NO_DMA=y: drivers/built-in.o: In function `nvme_nvm_dev_dma_free': lightnvm.c:(.text+0x23df1a): undefined reference to `dma_pool_free' drivers/built-in.o: In function `nvme_nvm_dev_dma_alloc': lightnvm.c:(.text+0x23df38): undefined reference to `dma_pool_alloc' drivers/built-in.o: In function `nvme_nvm_destroy_dma_pool': lightnvm.c:(.text+0x23df4c): undefined reference to `dma_pool_destroy' drivers/built-in.o: In function `nvme_nvm_create_dma_pool': lightnvm.c:(.text+0x23df7e): undefined reference to `dma_pool_create' and ERROR: "dma_pool_destroy" [drivers/nvme/host/nvme-core.ko] undefined! ERROR: "dma_pool_free" [drivers/nvme/host/nvme-core.ko] undefined! ERROR: "dma_pool_alloc" [drivers/nvme/host/nvme-core.ko] undefined! ERROR: "dma_pool_create" [drivers/nvme/host/nvme-core.ko] undefined! 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 61c68a1f054ae5..2f5d5f4a4c75b3 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -4,7 +4,7 @@ menuconfig NVM bool "Open-Channel SSD target support" - depends on BLOCK + depends on BLOCK && HAS_DMA help Say Y here to get to enable Open-channel SSDs. From ac81bfa9867103c9d50996ec21fa9179b81b727e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 16 Sep 2016 14:25:04 +0200 Subject: [PATCH 33/54] nvme: refactor namespaces to support non-gendisk devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With LightNVM enabled namespaces, the gendisk structure is not exposed to the user. This prevents LightNVM users from accessing the NVMe device driver specific sysfs entries, and LightNVM namespace geometry. Refactor the revalidation process, so that a namespace, instead of a gendisk, is revalidated. This later allows patches to wire up the sysfs entries up to a non-gendisk namespace. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 + drivers/nvme/host/core.c | 114 ++++++++++++++++++++++------------- drivers/nvme/host/lightnvm.c | 5 +- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 9ebd2cfbd8490d..25c5df92032687 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -581,6 +581,8 @@ static int nvm_core_init(struct nvm_dev *dev) mutex_init(&dev->mlock); spin_lock_init(&dev->lock); + blk_queue_logical_block_size(dev->q, dev->sec_size); + return 0; err_fmtype: kfree(dev->lun_map); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2feacc70bf61f1..2c3da3315a021b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -888,42 +888,33 @@ static void nvme_config_discard(struct nvme_ns *ns) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); } -static int nvme_revalidate_disk(struct gendisk *disk) +static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) { - struct nvme_ns *ns = disk->private_data; - struct nvme_id_ns *id; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - if (test_bit(NVME_NS_DEAD, &ns->flags)) { - set_capacity(disk, 0); - return -ENODEV; - } - if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { + if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", __func__); return -ENODEV; } - if (id->ncap == 0) { - kfree(id); - return -ENODEV; - } - if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { - if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(disk_to_dev(ns->disk), - "%s: LightNVM init failure\n", __func__); - kfree(id); - return -ENODEV; - } - ns->type = NVME_NS_LIGHTNVM; + if ((*id)->ncap == 0) { + kfree(*id); + return -ENODEV; } if (ns->ctrl->vs >= NVME_VS(1, 1)) - memcpy(ns->eui, id->eui64, sizeof(ns->eui)); + memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); if (ns->ctrl->vs >= NVME_VS(1, 2)) - memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); + memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); + + return 0; +} + +static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) +{ + struct nvme_ns *ns = disk->private_data; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; 
old_ms = ns->ms; lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; @@ -962,8 +953,26 @@ static int nvme_revalidate_disk(struct gendisk *disk) if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); blk_mq_unfreeze_queue(disk->queue); +} +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_id_ns *id = NULL; + int ret; + + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } + + ret = nvme_revalidate_ns(ns, &id); + if (ret) + return ret; + + __nvme_revalidate_disk(disk, id); kfree(id); + return 0; } @@ -1642,6 +1651,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; + struct nvme_id_ns *id; + char disk_name[DISK_NAME_LEN]; int node = dev_to_node(ctrl->dev); ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); @@ -1659,33 +1670,54 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->queue->queuedata = ns; ns->ctrl = ctrl; - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_queue; - kref_init(&ns->kref); ns->ns_id = nsid; - ns->disk = disk; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); - if (nvme_revalidate_disk(ns->disk)) - goto out_free_disk; + if (nvme_revalidate_ns(ns, &id)) + goto out_free_queue; + + if (nvme_nvm_ns_supported(ns, id)) { + if (nvme_nvm_register(ns->queue, disk_name)) { + dev_warn(ctrl->dev, + "%s: LightNVM init failure\n", __func__); + goto out_free_id; + } + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_id; + memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + ns->disk = disk; + ns->type = NVME_NS_LIGHTNVM; + } else { + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_id; + + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->flags = GENHD_FL_EXT_DEVT; + memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + ns->disk = disk; + + __nvme_revalidate_disk(disk, id); + } mutex_lock(&ctrl->namespaces_mutex); list_add_tail(&ns->list, &ctrl->namespaces); mutex_unlock(&ctrl->namespaces_mutex); kref_get(&ctrl->kref); + + kfree(id); + if (ns->type == NVME_NS_LIGHTNVM) return; @@ -1695,8 +1727,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to create sysfs group for identification\n", ns->disk->disk_name); return; - out_free_disk: - kfree(disk); + out_free_id: + kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); out_release_instance: diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 63f483daf9301f..7268a7a1a19a4c 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -474,8 +474,9 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) - c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, - rqd->bio->bi_iter.bi_sector)); + /* momentarily hardcode the shift configuration. 
lba_shift from + * nvm_dev will be available in a follow-up patch */ + c->hb_rw.slba = cpu_to_le64(rqd->bio->bi_iter.bi_sector >> 3); } static void nvme_nvm_end_io(struct request *rq, int error) From 9ae2d0aa5046c67dd37cf4b70998ad296e718835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 16 Sep 2016 14:25:05 +0200 Subject: [PATCH 34/54] null_blk: refactor to support non-gendisk devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With LightNVM enabled devices, the gendisk structure is not exposed to the user. This hides the device driver specific sysfs entries, and prevents binding of LightNVM geometry information to the device. Refactor the device registration process, so that gendisk and non-gendisk devices are easily managed. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 110 ++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 49 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 75a7f88d67176a..895867a8a78358 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -414,23 +414,6 @@ static void cleanup_queues(struct nullb *nullb) kfree(nullb->queues); } -static void null_del_dev(struct nullb *nullb) -{ - list_del_init(&nullb->list); - - if (use_lightnvm) - nvm_unregister(nullb->disk_name); - else - del_gendisk(nullb->disk); - blk_cleanup_queue(nullb->q); - if (queue_mode == NULL_Q_MQ) - blk_mq_free_tag_set(&nullb->tag_set); - if (!use_lightnvm) - put_disk(nullb->disk); - cleanup_queues(nullb); - kfree(nullb); -} - #ifdef CONFIG_NVM static void null_lnvm_end_io(struct request *rq, int error) @@ -564,10 +547,41 @@ static struct nvm_dev_ops null_lnvm_dev_ops = { /* Simulate nvme protocol restriction */ .max_phys_sect = 64, }; + +static int null_nvm_register(struct nullb *nullb) +{ + return nvm_register(nullb->q, nullb->disk_name, &null_lnvm_dev_ops); +} + +static void null_nvm_unregister(struct nullb *nullb) +{ + nvm_unregister(nullb->disk_name); +} #else -static struct nvm_dev_ops null_lnvm_dev_ops; +static int null_nvm_register(struct nullb *nullb) +{ + return -EINVAL; +} +static void null_nvm_unregister(struct nullb *nullb) {} #endif /* CONFIG_NVM */ +static void null_del_dev(struct nullb *nullb) +{ + list_del_init(&nullb->list); + + if (use_lightnvm) + null_nvm_unregister(nullb); + else + del_gendisk(nullb->disk); + blk_cleanup_queue(nullb->q); + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); + if (!use_lightnvm) + put_disk(nullb->disk); + cleanup_queues(nullb); + kfree(nullb); +} + static int null_open(struct block_device *bdev, fmode_t mode) { return 0; @@ -640,11 +654,32 @@ static int init_driver_queues(struct nullb *nullb) return 0; } -static int null_add_dev(void) +static int null_gendisk_register(struct nullb *nullb) { struct gendisk *disk; - struct nullb *nullb; sector_t size; + + disk = nullb->disk = alloc_disk_node(1, home_node); + if (!disk) + return -ENOMEM; + size = gb * 1024 * 1024 * 1024ULL; + set_capacity(disk, size >> 9); + + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->major = null_major; + disk->first_minor = nullb->index; + disk->fops = &null_fops; + disk->private_data = nullb; + disk->queue = nullb->q; + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + + add_disk(disk); + return 0; +} + +static int null_add_dev(void) +{ + struct nullb *nullb; int rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); @@ 
-716,42 +751,19 @@ static int null_add_dev(void) sprintf(nullb->disk_name, "nullb%d", nullb->index); - if (use_lightnvm) { - rv = nvm_register(nullb->q, nullb->disk_name, - &null_lnvm_dev_ops); - if (rv) - goto out_cleanup_blk_queue; - goto done; - } - - disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) { - rv = -ENOMEM; - goto out_cleanup_lightnvm; - } - size = gb * 1024 * 1024 * 1024ULL; - set_capacity(disk, size >> 9); - - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->major = null_major; - disk->first_minor = nullb->index; - disk->fops = &null_fops; - disk->private_data = nullb; - disk->queue = nullb->q; - strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + if (use_lightnvm) + rv = null_nvm_register(nullb); + else + rv = null_gendisk_register(nullb); - add_disk(disk); + if (rv) + goto out_cleanup_blk_queue; -done: mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); return 0; - -out_cleanup_lightnvm: - if (use_lightnvm) - nvm_unregister(nullb->disk_name); out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: From b21d5b301794ae332eaa6e177d71fe8b77d3664c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 16 Sep 2016 14:25:06 +0200 Subject: [PATCH 35/54] blk-mq: register device instead of disk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable devices without a gendisk instance to register itself with blk-mq and expose the associated multi-queue sysfs entries. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 17 +++++++---------- block/blk-sysfs.c | 4 ++-- drivers/md/dm-rq.c | 2 +- include/linux/blk-mq.h | 4 ++-- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 3c385b196bc711..01fb455d337747 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -393,9 +393,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_disk(struct gendisk *disk) +static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i, j; @@ -413,15 +412,15 @@ static void __blk_mq_unregister_disk(struct gendisk *disk) kobject_del(&q->mq_kobj); kobject_put(&q->mq_kobj); - kobject_put(&disk_to_dev(disk)->kobj); + kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; } -void blk_mq_unregister_disk(struct gendisk *disk) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { blk_mq_disable_hotplug(); - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); blk_mq_enable_hotplug(); } @@ -443,10 +442,8 @@ static void blk_mq_sysfs_init(struct request_queue *q) } } -int blk_mq_register_disk(struct gendisk *disk) +int blk_mq_register_dev(struct device *dev, struct request_queue *q) { - struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; int ret, i; @@ -467,7 +464,7 @@ int blk_mq_register_disk(struct gendisk *disk) } if (ret) - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); else q->mq_sysfs_init_done = true; out: @@ -475,7 +472,7 @@ int blk_mq_register_disk(struct gendisk *disk) return ret; } -EXPORT_SYMBOL_GPL(blk_mq_register_disk); +EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { diff --git a/block/blk-sysfs.c 
b/block/blk-sysfs.c index f87a7e747d3600..9cc8d7c5439a98 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -704,7 +704,7 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_ADD); if (q->mq_ops) - blk_mq_register_disk(disk); + blk_mq_register_dev(dev, q); if (!q->request_fn) return 0; @@ -729,7 +729,7 @@ void blk_unregister_queue(struct gendisk *disk) return; if (q->mq_ops) - blk_mq_unregister_disk(disk); + blk_mq_unregister_dev(disk_to_dev(disk), q); if (q->request_fn) elv_unregister_queue(q); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1ca7463e8bb2b2..ee48230a2952dd 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -955,7 +955,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - blk_mq_register_disk(md->disk); + blk_mq_register_dev(disk_to_dev(md->disk), q); return 0; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2575779cf13f16..fbcfdf3232437b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -175,8 +175,8 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); -int blk_mq_register_disk(struct gendisk *); -void blk_mq_unregister_disk(struct gendisk *); +int blk_mq_register_dev(struct device *, struct request_queue *); +void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); From b0b4e09c1ae71c4ec33df0616b830ae050006e9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 16 Sep 2016 14:25:07 +0200 Subject: [PATCH 36/54] lightnvm: control life of nvm_dev in driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LightNVM compatible device drivers does not have a method to expose LightNVM specific sysfs entries. To enable LightNVM sysfs entries to be exposed, lightnvm device drivers require a struct device to attach it to. To allow both the actual device driver and lightnvm sysfs entries to coexist, the device driver tracks the lifetime of the nvm_dev structure. This patch refactors NVMe and null_blk to handle the lifetime of struct nvm_dev, which eliminates the need for struct gendisk when a lightnvm compatible device is provided. 
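Roughly, a LightNVM-capable driver now follows this registration pattern (a trimmed sketch based on the null_blk and NVMe hunks below; my_dev_ops is a placeholder name and error handling is abbreviated):

        struct nvm_dev *ndev;
        int ret;

        ndev = nvm_alloc_dev(node);             /* driver allocates and keeps the nvm_dev */
        if (!ndev)
                return -ENOMEM;

        ndev->q = q;                            /* backing request queue */
        ndev->ops = &my_dev_ops;                /* driver's struct nvm_dev_ops (placeholder) */
        memcpy(ndev->name, disk_name, DISK_NAME_LEN);

        ret = nvm_register(ndev);
        if (ret) {
                kfree(ndev);
                return ret;
        }

        /* ... on teardown the driver calls nvm_unregister(ndev) ... */
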
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 22 ++++++++++++++++++++-- drivers/lightnvm/core.c | 35 ++++++++--------------------------- drivers/nvme/host/core.c | 36 +++++++++++++++--------------------- drivers/nvme/host/lightnvm.c | 31 ++++++++++++++++++++++++------- drivers/nvme/host/nvme.h | 12 +++++++----- include/linux/lightnvm.h | 15 +++++++++------ 6 files changed, 83 insertions(+), 68 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 895867a8a78358..91e1de898dafd8 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -34,6 +34,7 @@ struct nullb { unsigned int index; struct request_queue *q; struct gendisk *disk; + struct nvm_dev *ndev; struct blk_mq_tag_set tag_set; struct hrtimer timer; unsigned int queue_depth; @@ -550,12 +551,29 @@ static struct nvm_dev_ops null_lnvm_dev_ops = { static int null_nvm_register(struct nullb *nullb) { - return nvm_register(nullb->q, nullb->disk_name, &null_lnvm_dev_ops); + struct nvm_dev *dev; + int rv; + + dev = nvm_alloc_dev(0); + if (!dev) + return -ENOMEM; + + dev->q = nullb->q; + memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN); + dev->ops = &null_lnvm_dev_ops; + + rv = nvm_register(dev); + if (rv) { + kfree(dev); + return rv; + } + nullb->ndev = dev; + return 0; } static void null_nvm_unregister(struct nullb *nullb) { - nvm_unregister(nullb->disk_name); + nvm_unregister(nullb->ndev); } #else static int null_nvm_register(struct nullb *nullb) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 25c5df92032687..a99b59d1eb36c2 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -660,22 +660,15 @@ static void nvm_exit(struct nvm_dev *dev) pr_info("nvm: successfully unloaded\n"); } -int nvm_register(struct request_queue *q, char *disk_name, - struct nvm_dev_ops *ops) +struct nvm_dev *nvm_alloc_dev(int node) { - struct nvm_dev *dev; - int ret; - - if (!ops->identity) - return -EINVAL; - - dev = kzalloc(sizeof(struct nvm_dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; + return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); +} +EXPORT_SYMBOL(nvm_alloc_dev); - dev->q = q; - dev->ops = ops; - strncpy(dev->name, disk_name, DISK_NAME_LEN); +int nvm_register(struct nvm_dev *dev) +{ + int ret; ret = nvm_init(dev); if (ret) @@ -714,29 +707,17 @@ int nvm_register(struct request_queue *q, char *disk_name, return 0; err_init: kfree(dev->lun_map); - kfree(dev); return ret; } EXPORT_SYMBOL(nvm_register); -void nvm_unregister(char *disk_name) +void nvm_unregister(struct nvm_dev *dev) { - struct nvm_dev *dev; - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(disk_name); - if (!dev) { - pr_err("nvm: could not find device %s to unregister\n", - disk_name); - up_write(&nvm_lock); - return; - } - list_del(&dev->devices); up_write(&nvm_lock); nvm_exit(dev); - kfree(dev); } EXPORT_SYMBOL(nvm_unregister); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2c3da3315a021b..3c707d83b1da8f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -156,12 +156,14 @@ static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - if (ns->type == NVME_NS_LIGHTNVM) - nvme_nvm_unregister(ns->queue, ns->disk->disk_name); + if (ns->ndev) + nvme_nvm_unregister(ns); - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); + if (ns->disk) { + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + 
} put_disk(ns->disk); ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); @@ -891,8 +893,7 @@ static void nvme_config_discard(struct nvme_ns *ns) static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) { if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { - dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", - __func__); + dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__); return -ENODEV; } @@ -1683,18 +1684,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) goto out_free_queue; if (nvme_nvm_ns_supported(ns, id)) { - if (nvme_nvm_register(ns->queue, disk_name)) { - dev_warn(ctrl->dev, - "%s: LightNVM init failure\n", __func__); + if (nvme_nvm_register(ns, disk_name, node)) { + dev_warn(ctrl->dev, "%s: LightNVM init failure\n", + __func__); goto out_free_id; } - - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_id; - memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); - ns->disk = disk; - ns->type = NVME_NS_LIGHTNVM; } else { disk = alloc_disk_node(0, node); if (!disk) @@ -1718,7 +1712,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) kfree(id); - if (ns->type == NVME_NS_LIGHTNVM) + if (ns->ndev) return; device_add_disk(ctrl->device, ns->disk); @@ -1742,7 +1736,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; - if (ns->disk->flags & GENHD_FL_UP) { + if (ns->disk && ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, @@ -1765,7 +1759,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = nvme_find_get_ns(ctrl, nsid); if (ns) { - if (revalidate_disk(ns->disk)) + if (ns->disk && revalidate_disk(ns->disk)) nvme_ns_remove(ns); nvme_put_ns(ns); } else @@ -2070,7 +2064,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * Revalidating a dead namespace sets capacity to 0. This will * end buffered writers dirtying pages that can't be synced. */ - if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags)) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 7268a7a1a19a4c..798fcd9f5d1f7d 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -474,9 +474,8 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) - /* momentarily hardcode the shift configuration. 
lba_shift from - * nvm_dev will be available in a follow-up patch */ - c->hb_rw.slba = cpu_to_le64(rqd->bio->bi_iter.bi_sector >> 3); + c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, + rqd->bio->bi_iter.bi_sector)); } static void nvme_nvm_end_io(struct request *rq, int error) @@ -593,14 +592,32 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; -int nvme_nvm_register(struct request_queue *q, char *disk_name) +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { - return nvm_register(q, disk_name, &nvme_nvm_dev_ops); + struct request_queue *q = ns->queue; + struct nvm_dev *dev; + int ret; + + dev = nvm_alloc_dev(node); + if (!dev) + return -ENOMEM; + + dev->q = q; + memcpy(dev->name, disk_name, DISK_NAME_LEN); + dev->ops = &nvme_nvm_dev_ops; + ns->ndev = dev; + + ret = nvm_register(dev); + + ns->lba_shift = ilog2(dev->sec_size) - 9; + + return ret; } -void nvme_nvm_unregister(struct request_queue *q, char *disk_name) +void nvme_nvm_unregister(struct nvme_ns *ns) { - nvm_unregister(disk_name); + nvm_unregister(ns->ndev); + kfree(ns->ndev); } /* move to shared place when used in multiple places. */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ab18b78102bf21..e0535c14e5387a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -18,6 +18,7 @@ #include #include #include +#include enum { /* @@ -154,6 +155,7 @@ struct nvme_ns { struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; + struct nvm_dev *ndev; struct kref kref; int instance; @@ -165,7 +167,6 @@ struct nvme_ns { u16 ms; bool ext; u8 pi_type; - int type; unsigned long flags; #define NVME_NS_REMOVING 0 @@ -307,15 +308,16 @@ int nvme_sg_get_version_num(int __user *ip); #ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); -int nvme_nvm_register(struct request_queue *q, char *disk_name); -void nvme_nvm_unregister(struct request_queue *q, char *disk_name); +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); +void nvme_nvm_unregister(struct nvme_ns *ns); #else -static inline int nvme_nvm_register(struct request_queue *q, char *disk_name) +static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, + int node) { return 0; } -static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {}; +static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) { diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ba78b830667435..5afc2634f332d3 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -524,9 +524,9 @@ extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); -extern int nvm_register(struct request_queue *, char *, - struct nvm_dev_ops *); -extern void nvm_unregister(char *); +extern struct nvm_dev *nvm_alloc_dev(int); +extern int nvm_register(struct nvm_dev *); +extern void nvm_unregister(struct nvm_dev *); void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); @@ -575,11 +575,14 @@ extern int nvm_dev_factory(struct nvm_dev *, int flags); #else /* CONFIG_NVM */ struct nvm_dev_ops; -static inline int nvm_register(struct request_queue *q, char *disk_name, - struct nvm_dev_ops *ops) +static inline struct nvm_dev *nvm_alloc_dev(int node) +{ + return ERR_PTR(-EINVAL); +} +static inline int nvm_register(struct nvm_dev *dev) { return 
-EINVAL; } -static inline void nvm_unregister(char *disk_name) {} +static inline void nvm_unregister(struct nvm_dev *dev) {} #endif /* CONFIG_NVM */ #endif /* LIGHTNVM.H */ From 40267efddc296190d50c61d96daf277151447cf6 Mon Sep 17 00:00:00 2001 From: "Simon A. F. Lund" Date: Fri, 16 Sep 2016 14:25:08 +0200 Subject: [PATCH 37/54] lightnvm: expose device geometry through sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For a host to access an Open-Channel SSD, it has to know its geometry, so that it writes and reads at the appropriate device bounds. Currently, the geometry information is kept within the kernel, and not exported to user-space for consumption. This patch exposes the configuration through sysfs and enables user-space libraries, such as liblightnvm, to use the sysfs implementation to get the geometry of an Open-Channel SSD. The sysfs entries are stored within the device hierarchy, and can be found using the "lightnvm" device type. An example configuration looks like this: /sys/class/nvme/ └── nvme0n1 ├── capabilities: 3 ├── device_mode: 1 ├── erase_max: 1000000 ├── erase_typ: 1000000 ├── flash_media_type: 0 ├── media_capabilities: 0x00000001 ├── media_type: 0 ├── multiplane: 0x00010101 ├── num_blocks: 1022 ├── num_channels: 1 ├── num_luns: 4 ├── num_pages: 64 ├── num_planes: 1 ├── page_size: 4096 ├── prog_max: 100000 ├── prog_typ: 100000 ├── read_max: 10000 ├── read_typ: 10000 ├── sector_oob_size: 0 ├── sector_size: 4096 ├── media_manager: gennvm ├── ppa_format: 0x380830082808001010102008 ├── vendor_opcode: 0 ├── max_phys_secs: 64 └── version: 1 Signed-off-by: Simon A. F. Lund Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Makefile | 2 +- drivers/lightnvm/core.c | 20 ++-- drivers/lightnvm/lightnvm.h | 35 +++++++ drivers/lightnvm/sysfs.c | 195 +++++++++++++++++++++++++++++++++++ drivers/nvme/host/core.c | 13 +-- drivers/nvme/host/lightnvm.c | 9 +- drivers/nvme/host/nvme.h | 18 +++- include/linux/lightnvm.h | 3 + 8 files changed, 278 insertions(+), 17 deletions(-) create mode 100644 drivers/lightnvm/lightnvm.h create mode 100644 drivers/lightnvm/sysfs.c diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index a7a0a22cf1a596..1f6b6521016aaf 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,6 @@ # Makefile for Open-Channel SSDs. 
# -obj-$(CONFIG_NVM) := core.o sysblk.o +obj-$(CONFIG_NVM) := core.o sysblk.o sysfs.o obj-$(CONFIG_NVM_GENNVM) += gennvm.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index a99b59d1eb36c2..a2393e1ef82ed9 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -27,6 +27,8 @@ #include #include +#include "lightnvm.h" + static LIST_HEAD(nvm_tgt_types); static DECLARE_RWSEM(nvm_tgtt_lock); static LIST_HEAD(nvm_mgrs); @@ -598,15 +600,19 @@ static void nvm_free_mgr(struct nvm_dev *dev) dev->mt = NULL; } -static void nvm_free(struct nvm_dev *dev) +void nvm_free(struct nvm_dev *dev) { if (!dev) return; nvm_free_mgr(dev); + if (dev->dma_pool) + dev->ops->destroy_dma_pool(dev->dma_pool); + kfree(dev->lptbl); kfree(dev->lun_map); + kfree(dev); } static int nvm_init(struct nvm_dev *dev) @@ -653,11 +659,7 @@ static int nvm_init(struct nvm_dev *dev) static void nvm_exit(struct nvm_dev *dev) { - if (dev->dma_pool) - dev->ops->destroy_dma_pool(dev->dma_pool); - nvm_free(dev); - - pr_info("nvm: successfully unloaded\n"); + nvm_sysfs_unregister_dev(dev); } struct nvm_dev *nvm_alloc_dev(int node) @@ -689,6 +691,10 @@ int nvm_register(struct nvm_dev *dev) } } + ret = nvm_sysfs_register_dev(dev); + if (ret) + goto err_ppalist; + if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { ret = nvm_get_sysblock(dev, &dev->sb); if (!ret) @@ -705,6 +711,8 @@ int nvm_register(struct nvm_dev *dev) up_write(&nvm_lock); return 0; +err_ppalist: + dev->ops->destroy_dma_pool(dev->dma_pool); err_init: kfree(dev->lun_map); return ret; diff --git a/drivers/lightnvm/lightnvm.h b/drivers/lightnvm/lightnvm.h new file mode 100644 index 00000000000000..93f1aacc9f02bd --- /dev/null +++ b/drivers/lightnvm/lightnvm.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 CNEX Labs. All rights reserved. + * Initial release: Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. 
+ * + */ + +#ifndef LIGHTNVM_H +#define LIGHTNVM_H + +#include + +/* core -> sysfs.c */ +int nvm_sysfs_register_dev(struct nvm_dev *); +void nvm_sysfs_unregister_dev(struct nvm_dev *); +int nvm_sysfs_register(void); +void nvm_sysfs_unregister(void); + +/* sysfs > core */ +void nvm_free(struct nvm_dev *); + +#endif diff --git a/drivers/lightnvm/sysfs.c b/drivers/lightnvm/sysfs.c new file mode 100644 index 00000000000000..72ad089c0269dc --- /dev/null +++ b/drivers/lightnvm/sysfs.c @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include + +#include "lightnvm.h" + +static ssize_t nvm_dev_attr_show(struct device *dev, + struct device_attribute *dattr, char *page) +{ + struct nvm_dev *ndev = container_of(dev, struct nvm_dev, dev); + struct nvm_id *id = &ndev->identity; + struct nvm_id_group *grp = &id->groups[0]; + struct attribute *attr = &dattr->attr; + + if (strcmp(attr->name, "version") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id); + } else if (strcmp(attr->name, "vendor_opcode") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt); + } else if (strcmp(attr->name, "capabilities") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->cap); + } else if (strcmp(attr->name, "device_mode") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->dom); + } else if (strcmp(attr->name, "media_manager") == 0) { + if (!ndev->mt) + return scnprintf(page, PAGE_SIZE, "%s\n", "none"); + return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name); + } else if (strcmp(attr->name, "ppa_format") == 0) { + return scnprintf(page, PAGE_SIZE, + "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + id->ppaf.ch_offset, id->ppaf.ch_len, + id->ppaf.lun_offset, id->ppaf.lun_len, + id->ppaf.pln_offset, id->ppaf.pln_len, + id->ppaf.blk_offset, id->ppaf.blk_len, + id->ppaf.pg_offset, id->ppaf.pg_len, + id->ppaf.sect_offset, id->ppaf.sect_len); + } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype); + } else if (strcmp(attr->name, "flash_media_type") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype); + } else if (strcmp(attr->name, "num_channels") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch); + } else if (strcmp(attr->name, "num_luns") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun); + } else if (strcmp(attr->name, "num_planes") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); + } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); + } else if (strcmp(attr->name, "num_pages") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); + } else if (strcmp(attr->name, "page_size") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz); + } else if (strcmp(attr->name, "hw_sector_size") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs); + } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos); + } else if (strcmp(attr->name, "read_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt); + } else if (strcmp(attr->name, "read_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm); + } else if (strcmp(attr->name, "prog_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt); + } else if (strcmp(attr->name, "prog_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm); + } else if (strcmp(attr->name, "erase_typ") == 0) { + return 
scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet); + } else if (strcmp(attr->name, "erase_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem); + } else if (strcmp(attr->name, "multiplane_modes") == 0) { + return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos); + } else if (strcmp(attr->name, "media_capabilities") == 0) { + return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap); + } else if (strcmp(attr->name, "max_phys_secs") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", + ndev->ops->max_phys_sect); + } else { + return scnprintf(page, + PAGE_SIZE, + "Unhandled attr(%s) in `nvm_dev_attr_show`\n", + attr->name); + } +} + +#define NVM_DEV_ATTR_RO(_name) \ + DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) + +static NVM_DEV_ATTR_RO(version); +static NVM_DEV_ATTR_RO(vendor_opcode); +static NVM_DEV_ATTR_RO(capabilities); +static NVM_DEV_ATTR_RO(device_mode); +static NVM_DEV_ATTR_RO(ppa_format); +static NVM_DEV_ATTR_RO(media_manager); + +static NVM_DEV_ATTR_RO(media_type); +static NVM_DEV_ATTR_RO(flash_media_type); +static NVM_DEV_ATTR_RO(num_channels); +static NVM_DEV_ATTR_RO(num_luns); +static NVM_DEV_ATTR_RO(num_planes); +static NVM_DEV_ATTR_RO(num_blocks); +static NVM_DEV_ATTR_RO(num_pages); +static NVM_DEV_ATTR_RO(page_size); +static NVM_DEV_ATTR_RO(hw_sector_size); +static NVM_DEV_ATTR_RO(oob_sector_size); +static NVM_DEV_ATTR_RO(read_typ); +static NVM_DEV_ATTR_RO(read_max); +static NVM_DEV_ATTR_RO(prog_typ); +static NVM_DEV_ATTR_RO(prog_max); +static NVM_DEV_ATTR_RO(erase_typ); +static NVM_DEV_ATTR_RO(erase_max); +static NVM_DEV_ATTR_RO(multiplane_modes); +static NVM_DEV_ATTR_RO(media_capabilities); +static NVM_DEV_ATTR_RO(max_phys_secs); + +#define NVM_DEV_ATTR(_name) (dev_attr_##_name##) + +static struct attribute *nvm_dev_attrs[] = { + &dev_attr_version.attr, + &dev_attr_vendor_opcode.attr, + &dev_attr_capabilities.attr, + &dev_attr_device_mode.attr, + &dev_attr_media_manager.attr, + + &dev_attr_ppa_format.attr, + &dev_attr_media_type.attr, + &dev_attr_flash_media_type.attr, + &dev_attr_num_channels.attr, + &dev_attr_num_luns.attr, + &dev_attr_num_planes.attr, + &dev_attr_num_blocks.attr, + &dev_attr_num_pages.attr, + &dev_attr_page_size.attr, + &dev_attr_hw_sector_size.attr, + &dev_attr_oob_sector_size.attr, + &dev_attr_read_typ.attr, + &dev_attr_read_max.attr, + &dev_attr_prog_typ.attr, + &dev_attr_prog_max.attr, + &dev_attr_erase_typ.attr, + &dev_attr_erase_max.attr, + &dev_attr_multiplane_modes.attr, + &dev_attr_media_capabilities.attr, + &dev_attr_max_phys_secs.attr, + NULL, +}; + +static struct attribute_group nvm_dev_attr_group = { + .name = "lightnvm", + .attrs = nvm_dev_attrs, +}; + +static const struct attribute_group *nvm_dev_attr_groups[] = { + &nvm_dev_attr_group, + NULL, +}; + +static void nvm_dev_release(struct device *device) +{ + struct nvm_dev *dev = container_of(device, struct nvm_dev, dev); + struct request_queue *q = dev->q; + + pr_debug("nvm/sysfs: `nvm_dev_release`\n"); + + blk_mq_unregister_dev(device, q); + + nvm_free(dev); +} + +static struct device_type nvm_type = { + .name = "lightnvm", + .groups = nvm_dev_attr_groups, + .release = nvm_dev_release, +}; + +int nvm_sysfs_register_dev(struct nvm_dev *dev) +{ + if (!dev->parent_dev) + return 0; + + dev->dev.parent = dev->parent_dev; + dev_set_name(&dev->dev, "%s", dev->name); + dev->dev.type = &nvm_type; + device_initialize(&dev->dev); + device_add(&dev->dev); + + blk_mq_register_dev(&dev->dev, dev->q); + + return 0; +} + +void nvm_sysfs_unregister_dev(struct nvm_dev *dev) +{ + if (dev 
&& dev->parent_dev) + kobject_put(&dev->dev.kobj); +} diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3c707d83b1da8f..bd2156cbfc6cb9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1435,7 +1435,7 @@ static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); struct nvme_ctrl *ctrl = ns->ctrl; int serial_len = sizeof(ctrl->serial); int model_len = sizeof(ctrl->model); @@ -1459,7 +1459,7 @@ static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%pU\n", ns->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -1467,7 +1467,7 @@ static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%8phd\n", ns->eui); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -1475,7 +1475,7 @@ static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%d\n", ns->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); @@ -1492,7 +1492,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); if (a == &dev_attr_uuid.attr) { if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) @@ -1684,7 +1684,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) goto out_free_queue; if (nvme_nvm_ns_supported(ns, id)) { - if (nvme_nvm_register(ns, disk_name, node)) { + if (nvme_nvm_register(ns, disk_name, node, + &nvme_ns_attr_group)) { dev_warn(ctrl->dev, "%s: LightNVM init failure\n", __func__); goto out_free_id; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 798fcd9f5d1f7d..f5e3011e31fcdf 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -592,7 +592,8 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node, + const struct attribute_group *attrs) { struct request_queue *q = ns->queue; struct nvm_dev *dev; @@ -605,19 +606,23 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) dev->q = q; memcpy(dev->name, disk_name, DISK_NAME_LEN); dev->ops = &nvme_nvm_dev_ops; + dev->parent_dev = ns->ctrl->device; + dev->private_data = ns; ns->ndev = dev; ret = nvm_register(dev); ns->lba_shift = ilog2(dev->sec_size) - 9; + if (sysfs_create_group(&dev->dev.kobj, attrs)) + pr_warn("%s: failed to create sysfs group for identification\n", + disk_name); return ret; } void nvme_nvm_unregister(struct nvme_ns *ns) { nvm_unregister(ns->ndev); - kfree(ns->ndev); } /* move to shared place when used in multiple places. 
*/ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e0535c14e5387a..bfd25dd73bca2c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -308,11 +308,21 @@ int nvme_sg_get_version_num(int __user *ip); #ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node, + const struct attribute_group *attrs); void nvme_nvm_unregister(struct nvme_ns *ns); + +static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) +{ + if (dev->type->devnode) + return dev_to_disk(dev)->private_data; + + return (container_of(dev, struct nvm_dev, dev))->private_data; +} #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, - int node) + int node, + const struct attribute_group *attrs) { return 0; } @@ -323,6 +333,10 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i { return 0; } +static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) +{ + return dev_to_disk(dev)->private_data; +} #endif /* CONFIG_NVM */ int __init nvme_core_init(void); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 5afc2634f332d3..d190786e4ad812 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -352,7 +352,10 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; + struct device dev; + struct device *parent_dev; char name[DISK_NAME_LEN]; + void *private_data; struct mutex mlock; spinlock_t lock; From 1e3aeae4ea710023dda2a6b780183ee371d1a796 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Sep 2016 14:25:09 +0200 Subject: [PATCH 38/54] lightnvm: propagate device_add() error code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit device_add() may fail, and all callers are supposed to check the return value, but one new user in lightnvm doesn't: drivers/lightnvm/sysfs.c: In function 'nvm_sysfs_register_dev': drivers/lightnvm/sysfs.c:184:2: error: ignoring return value of 'device_add', declared with attribute warn_unused_result [-Werror=unused-result] This changes the caller to propagate any error codes, which avoids the warning. 
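The corrected pattern is simply to check device_add() and hand its result back to the caller (sketch of the change below):

        device_initialize(&dev->dev);
        ret = device_add(&dev->dev);
        if (!ret)
                blk_mq_register_dev(&dev->dev, dev->q);

        return ret;
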
Signed-off-by: Arnd Bergmann Fixes: 38c9e260b9f9 ("lightnvm: expose device geometry through sysfs") Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/lightnvm.h | 2 +- drivers/lightnvm/sysfs.c | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/lightnvm.h b/drivers/lightnvm/lightnvm.h index 93f1aacc9f02bd..305c181509a63a 100644 --- a/drivers/lightnvm/lightnvm.h +++ b/drivers/lightnvm/lightnvm.h @@ -24,7 +24,7 @@ #include /* core -> sysfs.c */ -int nvm_sysfs_register_dev(struct nvm_dev *); +int __must_check nvm_sysfs_register_dev(struct nvm_dev *); void nvm_sysfs_unregister_dev(struct nvm_dev *); int nvm_sysfs_register(void); void nvm_sysfs_unregister(void); diff --git a/drivers/lightnvm/sysfs.c b/drivers/lightnvm/sysfs.c index 72ad089c0269dc..0338c27ab95a92 100644 --- a/drivers/lightnvm/sysfs.c +++ b/drivers/lightnvm/sysfs.c @@ -174,6 +174,8 @@ static struct device_type nvm_type = { int nvm_sysfs_register_dev(struct nvm_dev *dev) { + int ret; + if (!dev->parent_dev) return 0; @@ -181,11 +183,12 @@ int nvm_sysfs_register_dev(struct nvm_dev *dev) dev_set_name(&dev->dev, "%s", dev->name); dev->dev.type = &nvm_type; device_initialize(&dev->dev); - device_add(&dev->dev); + ret = device_add(&dev->dev); - blk_mq_register_dev(&dev->dev, dev->q); + if (!ret) + blk_mq_register_dev(&dev->dev, dev->q); - return 0; + return ret; } void nvm_sysfs_unregister_dev(struct nvm_dev *dev) From 491221f88d00651e449c9caf7415b6453c8a77b7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 22 Sep 2016 03:10:01 -0400 Subject: [PATCH 39/54] block: export bio_free_pages to other modules bio_free_pages was introduced in commit 1dfa0f68c040 ("block: add a helper to free bio bounce buffer pages"); we can reuse the function in other modules now that it is exported.
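As a usage illustration (not part of the patch), a module built on this export can now write its completion handler as below; the handler name is hypothetical, and it assumes the bio's pages were allocated by the submitter, e.g. via bio_alloc_pages():

#include <linux/bio.h>

/* Frees every page attached to the bio, replacing the open-coded
 * bio_for_each_segment_all() + __free_page() loops removed from the
 * bcache, dm-log-writes and raid1 call sites below. */
static void my_bounce_end_io(struct bio *bio)
{
	bio_free_pages(bio);
	bio_put(bio);
}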
Cc: Christoph Hellwig Cc: Jens Axboe Cc: Mike Snitzer Cc: Shaohua Li Signed-off-by: Guoqing Jiang Acked-by: Kent Overstreet Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- drivers/md/bcache/btree.c | 6 +----- drivers/md/bcache/debug.c | 6 ++---- drivers/md/bcache/movinggc.c | 5 +---- drivers/md/bcache/request.c | 9 ++------- drivers/md/bcache/writeback.c | 5 +---- drivers/md/dm-log-writes.c | 6 +----- drivers/md/raid1.c | 8 ++------ include/linux/bio.h | 1 + 9 files changed, 13 insertions(+), 36 deletions(-) diff --git a/block/bio.c b/block/bio.c index a6d279e1ea9e96..db85c5753a7656 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1068,7 +1068,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) return 0; } -static void bio_free_pages(struct bio *bio) +void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; int i; @@ -1076,6 +1076,7 @@ static void bio_free_pages(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } +EXPORT_SYMBOL(bio_free_pages); /** * bio_uncopy_user - finish previously mapped bio diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 76f7534d1dd158..81d3db40cd7be6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -361,12 +361,8 @@ static void __btree_node_write_done(struct closure *cl) static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io); - struct bio_vec *bv; - int n; - - bio_for_each_segment_all(bv, b->bio, n) - __free_page(bv->bv_page); + bio_free_pages(b->bio); __btree_node_write_done(cl); } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c28df164701e7b..333a1e5f6ae66c 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -107,9 +107,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; struct bio *check; - struct bio_vec bv, *bv2; + struct bio_vec bv; struct bvec_iter iter; - int i; check = bio_clone(bio, GFP_NOIO); if (!check) @@ -136,8 +135,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) kunmap_atomic(p1); } - bio_for_each_segment_all(bv2, check, i) - __free_page(bv2->bv_page); + bio_free_pages(check); out_put: bio_put(check); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 1881319f2298f6..5c4bddecfaf092 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -44,11 +44,8 @@ static void write_moving_finish(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; - struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, bio, i) - __free_page(bv->bv_page); + bio_free_pages(bio); if (io->op.replace_collision) trace_bcache_gc_copy_collision(&io->w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 4b177fe11ebb99..40ffe5e424b302 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -694,13 +694,8 @@ static void cached_dev_cache_miss_done(struct closure *cl) if (s->iop.replace_collision) bch_mark_cache_miss_collision(s->iop.c, s->d); - if (s->iop.bio) { - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, s->iop.bio, i) - __free_page(bv->bv_page); - } + if (s->iop.bio) + bio_free_pages(s->iop.bio); cached_dev_bio_complete(cl); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d9fd2a62e5f6f9..e51644e503a53b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -128,11 
+128,8 @@ static void write_dirty_finish(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; - struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, &io->bio, i) - __free_page(bv->bv_page); + bio_free_pages(&io->bio); /* This is kind of a dumb way of signalling errors. */ if (KEY_DIRTY(&w->key)) { diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 4ab68033f9d10e..b52404159ccf96 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -149,8 +149,6 @@ static void put_io_block(struct log_writes_c *lc) static void log_end_io(struct bio *bio) { struct log_writes_c *lc = bio->bi_private; - struct bio_vec *bvec; - int i; if (bio->bi_error) { unsigned long flags; @@ -161,9 +159,7 @@ static void log_end_io(struct bio *bio) spin_unlock_irqrestore(&lc->blocks_lock, flags); } - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - + bio_free_pages(bio); put_io_block(lc); bio_put(bio); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 21dc00eb1989fd..1961d827dbd19b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -145,12 +145,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; out_free_pages: - while (--j >= 0) { - struct bio_vec *bv; - - bio_for_each_segment_all(bv, r1_bio->bios[j], i) - __free_page(bv->bv_page); - } + while (--j >= 0) + bio_free_pages(r1_bio->bios[j]); out_free_bio: while (++j < pi->raid_disks) diff --git a/include/linux/bio.h b/include/linux/bio.h index e00721a2dce133..97cb48f03dc73c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -459,6 +459,7 @@ static inline void bio_flush_dcache_pages(struct bio *bi) extern void bio_copy_data(struct bio *dst, struct bio *src); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); +extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, From 841bac2c87fc21c3ecf3bc3354855921735aeec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2016 10:08:43 -0600 Subject: [PATCH 40/54] blk-mq: get rid of manual run of queue with __blk_mq_run_hw_queue() Two cases: 1) blk_mq_alloc_request() needlessly re-runs the queue, after calling into the tag allocation without NOWAIT set. We don't need to do that. 2) blk_mq_map_request() should just use blk_mq_run_hw_queue() with the async flag set to false. 
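For case 1, a hedged caller-side sketch (assuming the 4.8-era prototype blk_mq_alloc_request(q, rw, flags), visible in the hunk below): when BLK_MQ_REQ_NOWAIT is not set, the tag allocator itself runs the queue and sleeps for a free tag, so the removed __blk_mq_run_hw_queue() retry added nothing.

/* Illustrative helper, not from the patch. */
static struct request *my_get_request(struct request_queue *q, bool nowait)
{
	unsigned int flags = nowait ? BLK_MQ_REQ_NOWAIT : 0;
	struct request *rq;

	/* The blocking path sleeps in the tag allocator until a tag frees
	 * up; the NOWAIT path returns ERR_PTR(-EWOULDBLOCK) immediately. */
	rq = blk_mq_alloc_request(q, WRITE, flags);
	if (IS_ERR(rq))
		return NULL;
	return rq;
}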
Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- block/blk-mq.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e0a69daddbd8ce..c29700010b5c82 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -34,8 +34,6 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); - /* * Check if any of the ctx's have pending work in this hardware queue */ @@ -228,19 +226,9 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - ctx = alloc_data.ctx; - } blk_mq_put_ctx(ctx); + if (!rq) { blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); @@ -1225,7 +1213,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); if (unlikely(!rq)) { - __blk_mq_run_hw_queue(hctx); + blk_mq_run_hw_queue(hctx, false); blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, op); From 63581af3f31e0dbea112b83f77c4fbb6a10e1406 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Sep 2016 11:38:23 -0700 Subject: [PATCH 41/54] blk-mq: remove non-blocking pass in blk_mq_map_request bt_get already does a non-blocking pass as well as running the queue when scheduling internally, no need to duplicate it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c29700010b5c82..80d483864247ff 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1210,20 +1210,8 @@ static struct request *blk_mq_map_request(struct request_queue *q, op_flags |= REQ_SYNC; trace_block_getrq(q, bio, op); - blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - if (unlikely(!rq)) { - blk_mq_run_hw_queue(hctx, false); - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, op); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - ctx = alloc_data.ctx; - hctx = alloc_data.hctx; - } hctx->queued++; data->hctx = hctx; From 1b792f2f92784c00db2e6431496e437855d6f12a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2016 10:12:13 -0600 Subject: [PATCH 42/54] blk-mq: add flag for drivers wanting blocking ->queue_rq() If a driver sets BLK_MQ_F_BLOCKING, it is allowed to block in its ->queue_rq() handler. For that case, blk-mq ensures that we always calls it from a safe context. 
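A hedged sketch of a driver opting in; the structure and function names are invented for illustration (nbd, converted later in this series, is the first real user):

#include <linux/blk-mq.h>
#include <linux/mutex.h>

struct my_dev {
	struct mutex send_lock;
	struct blk_mq_tag_set tag_set;
};

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
		       const struct blk_mq_queue_data *bd)
{
	struct my_dev *dev = hctx->queue->queuedata;

	blk_mq_start_request(bd->rq);

	/* Sleeping is allowed here: with BLK_MQ_F_BLOCKING set, blk-mq never
	 * invokes ->queue_rq() from the get_cpu()/put_cpu() fast path in
	 * blk_mq_run_hw_queue(); it always bounces through kblockd. */
	mutex_lock(&dev->send_lock);
	/* issue bd->rq to the backend; may block on I/O or a socket */
	mutex_unlock(&dev->send_lock);

	return BLK_MQ_RQ_QUEUE_OK;
}

static int my_dev_setup_tags(struct my_dev *dev, const struct blk_mq_ops *ops)
{
	dev->tag_set.ops = ops;
	dev->tag_set.nr_hw_queues = 1;
	dev->tag_set.queue_depth = 128;
	dev->tag_set.numa_node = NUMA_NO_NODE;
	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
	dev->tag_set.driver_data = dev;
	return blk_mq_alloc_tag_set(&dev->tag_set);
}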
Signed-off-by: Jens Axboe Tested-by: Josef Bacik --- block/blk-mq.c | 2 +- include/linux/blk-mq.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 80d483864247ff..e9ebe9864cc450 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -908,7 +908,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) !blk_mq_hw_queue_mapped(hctx))) return; - if (!async) { + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fbcfdf3232437b..5daa0ef756dd29 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -155,6 +155,7 @@ enum { BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_DEFER_ISSUE = 1 << 4, + BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, From 3932a86b4b9d1f0b049d64d4591ce58ad18b44ec Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 22 Sep 2016 20:59:59 -0400 Subject: [PATCH 43/54] cfq: fix starvation of asynchronous writes While debugging timeouts happening in my application workload (ScyllaDB), I have observed calls to open() taking a long time, ranging everywhere from 2 seconds - the first ones that are enough to time out my application - to more than 30 seconds. The problem seems to happen because XFS may block on pending metadata updates under certain circumnstances, and that's confirmed with the following backtrace taken by the offcputime tool (iovisor/bcc): ffffffffb90c57b1 finish_task_switch ffffffffb97dffb5 schedule ffffffffb97e310c schedule_timeout ffffffffb97e1f12 __down ffffffffb90ea821 down ffffffffc046a9dc xfs_buf_lock ffffffffc046abfb _xfs_buf_find ffffffffc046ae4a xfs_buf_get_map ffffffffc046babd xfs_buf_read_map ffffffffc0499931 xfs_trans_read_buf_map ffffffffc044a561 xfs_da_read_buf ffffffffc0451390 xfs_dir3_leaf_read.constprop.16 ffffffffc0452b90 xfs_dir2_leaf_lookup_int ffffffffc0452e0f xfs_dir2_leaf_lookup ffffffffc044d9d3 xfs_dir_lookup ffffffffc047d1d9 xfs_lookup ffffffffc0479e53 xfs_vn_lookup ffffffffb925347a path_openat ffffffffb9254a71 do_filp_open ffffffffb9242a94 do_sys_open ffffffffb9242b9e sys_open ffffffffb97e42b2 entry_SYSCALL_64_fastpath 00007fb0698162ed [unknown] Inspecting my run with blktrace, I can see that the xfsaild kthread exhibit very high "Dispatch wait" times, on the dozens of seconds range and consistent with the open() times I have saw in that run. Still from the blktrace output, we can after searching a bit, identify the request that wasn't dispatched: 8,0 11 152 81.092472813 804 A WM 141698288 + 8 <- (8,1) 141696240 8,0 11 153 81.092472889 804 Q WM 141698288 + 8 [xfsaild/sda1] 8,0 11 154 81.092473207 804 G WM 141698288 + 8 [xfsaild/sda1] 8,0 11 206 81.092496118 804 I WM 141698288 + 8 ( 22911) [xfsaild/sda1] <==== 'I' means Inserted (into the IO scheduler) ===================================> 8,0 0 289372 96.718761435 0 D WM 141698288 + 8 (15626265317) [swapper/0] <==== Only 15s later the CFQ scheduler dispatches the request ======================> As we can see above, in this particular example CFQ took 15 seconds to dispatch this request. Going back to the full trace, we can see that the xfsaild queue had plenty of opportunity to run, and it was selected as the active queue many times. 
It would just always be preempted by something else (example): 8,0 1 0 81.117912979 0 m N cfq1618SN / insert_request 8,0 1 0 81.117913419 0 m N cfq1618SN / add_to_rr 8,0 1 0 81.117914044 0 m N cfq1618SN / preempt 8,0 1 0 81.117914398 0 m N cfq767A / slice expired t=1 8,0 1 0 81.117914755 0 m N cfq767A / resid=40 8,0 1 0 81.117915340 0 m N / served: vt=1948520448 min_vt=1948520448 8,0 1 0 81.117915858 0 m N cfq767A / sl_used=1 disp=0 charge=0 iops=1 sect=0 where cfq767 is the xfsaild queue and cfq1618 corresponds to one of the ScyllaDB IO dispatchers. The requests preempting the xfsaild queue are synchronous requests. That's a characteristic of ScyllaDB workloads, as we only ever issue O_DIRECT requests. While it can be argued that preempting ASYNC requests in favor of SYNC is part of the CFQ logic, I don't believe that doing so for 15+ seconds is anyone's goal. Moreover, unless I am misunderstanding something, that breaks the expectation set by the "fifo_expire_async" tunable, which in my system is set to the default. Looking at the code, it seems to me that the issue is that after we make an async queue active, there is no guarantee that it will execute any request. When the queue itself tests if it cfq_may_dispatch() it can bail if it sees SYNC requests in flight. An incoming request from another queue can also preempt it in such situation before we have the chance to execute anything (as seen in the trace above). This patch sets the must_dispatch flag if we notice that we have requests that are already fifo_expired. This flag is always cleared after cfq_dispatch_request() returns from cfq_dispatch_requests(), so it won't pin the queue for subsequent requests (unless they are themselves expired) Care is taken during preempt to still allow rt requests to preempt us regardless. Testing my workload with this patch applied produces much better results. 
From the application side I see no timeouts, and the open() latency histogram generated by systemtap looks much better, with the worst outlier at 131ms: Latency histogram of xfs_buf_lock acquisition (microseconds): value |-------------------------------------------------- count 0 | 11 1 |@@@@ 161 2 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1966 4 |@ 54 8 | 36 16 | 7 32 | 0 64 | 0 ~ 1024 | 0 2048 | 0 4096 | 1 8192 | 1 16384 | 2 32768 | 0 65536 | 0 131072 | 1 262144 | 0 524288 | 0 Signed-off-by: Glauber Costa CC: Jens Axboe CC: linux-block@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Glauber Costa Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cc2f6dbd430326..5e24d880306c2a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (ktime_get_ns() < rq->fifo_time) rq = NULL; - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; + if (cfq_cfqq_must_dispatch(cfqq)) + return true; + /* * Drain async requests before we start sync IO */ @@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); + rq = cfq_check_fifo(cfqq); + if (rq) + cfq_mark_cfqq_must_dispatch(cfqq); + if (!cfq_may_dispatch(cfqd, cfqq)) return false; /* * follow expired path, else get first next available */ - rq = cfq_check_fifo(cfqq); if (!rq) rq = cfqq->next_rq; + else + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); /* * insert request into driver dispatch list @@ -3989,7 +3996,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if the new request is sync, but the currently running queue is * not, let the sync request have priority. */ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) return true; /* From 55679c8d23d191c24ad133abc5647e3054ca8de1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 23 Sep 2016 09:07:56 -0700 Subject: [PATCH 44/54] blkcg: Annotate blkg_hint correctly Avoid that sparse complains about blkg_hint manipulations. Fixes: a637120e4902 ("blkcg: use radix tree to index blkgs from blkcg") Signed-off-by: Bart Van Assche Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 10648e300c930e..cbdbf34de5b607 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -45,7 +45,7 @@ struct blkcg { spinlock_t lock; struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; + struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; From 005043ac31ba5bf6721b4ddca10ff2066e2ee2fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Sep 2016 16:55:31 -0400 Subject: [PATCH 45/54] nbd: use BLK_MQ_F_BLOCKING We take a mutex when sending commands and send stuff over the network, we need to have queue_rq called asynchronously. 
Signed-off-by: Josef Bacik Fixes: fd8383fd88a2 ("nbd: convert to blkmq") Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4c6dd1a85eade5..ccfcfc11399abd 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -941,7 +941,7 @@ static int __init nbd_init(void) nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE; nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd); nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_SG_MERGE; + BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; nbd_dev[i].tag_set.driver_data = &nbd_dev[i]; err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set); From 0fe51ff269da745078e0ab2b90dfdf2a58d6f3e7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 2 Aug 2016 10:40:01 +0300 Subject: [PATCH 46/54] nvme-fabrics: rework nvmf_get_address() for variable options Revise nvmf_get_address() string to account for not all options being present. Signed-off-by: James Smart Acked-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index dc996761042ffb..2867b92cbdfba1 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -109,8 +109,16 @@ static void nvmf_host_put(struct nvmf_host *host) */ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) { - return snprintf(buf, size, "traddr=%s,trsvcid=%s\n", - ctrl->opts->traddr, ctrl->opts->trsvcid); + int len = 0; + + if (ctrl->opts->mask & NVMF_OPT_TRADDR) + len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr); + if (ctrl->opts->mask & NVMF_OPT_TRSVCID) + len += snprintf(buf + len, size - len, "%strsvcid=%s", + (len) ? "," : "", ctrl->opts->trsvcid); + len += snprintf(buf + len, size - len, "\n"); + + return len; } EXPORT_SYMBOL_GPL(nvmf_get_address); From 4a9f05c57f98e794763650056731a0023ebfab5f Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 2 Aug 2016 10:41:20 +0300 Subject: [PATCH 47/54] nvme-fabrics: revise host transport option descriptions Revise some of the comments so not so ethernet-network centric Signed-off-by: James Smart Acked-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 89df52c8be978c..2755efde16ac1e 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -64,9 +64,10 @@ enum { * being added. * @subsysnqn: Hold the fully qualified NQN subystem name (format defined * in the NVMe specification, "NVMe Qualified Names"). - * @traddr: network address that will be used by the host to communicate - * to the added NVMe controller. - * @trsvcid: network port used for host-controller communication. + * @traddr: The transport-specific TRADDR field for a port on the + * subsystem which is adding a controller. + * @trsvcid: The transport-specific TRSVCID field for a port on the + * subsystem which is adding a controller. * @queue_size: Number of IO queue elements. * @nr_io_queues: Number of controller IO queues that will be established. * @reconnect_delay: Time between two consecutive reconnect attempts. 
From 478bcb9388f2c3eedba34ed5811793400047f95d Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 2 Aug 2016 10:42:10 +0300 Subject: [PATCH 48/54] nvme-fabrics: Add host_traddr options field to host infrastructure Add the host_traddr field to allow specification of the host-port connection info for the transport. Will be used by FC transport. Signed-off-by: James Smart Acked-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 13 +++++++++++++ drivers/nvme/host/fabrics.h | 4 ++++ 2 files changed, 17 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2867b92cbdfba1..1c07b76cbdeb8e 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -116,6 +116,9 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) if (ctrl->opts->mask & NVMF_OPT_TRSVCID) len += snprintf(buf + len, size - len, "%strsvcid=%s", (len) ? "," : "", ctrl->opts->trsvcid); + if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) + len += snprintf(buf + len, size - len, "%shost_traddr=%s", + (len) ? "," : "", ctrl->opts->host_traddr); len += snprintf(buf + len, size - len, "\n"); return len; @@ -518,6 +521,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, + { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_ERR, NULL } }; @@ -674,6 +678,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->reconnect_delay = token; break; + case NVMF_OPT_HOST_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->host_traddr = p; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -740,6 +752,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts) kfree(opts->traddr); kfree(opts->trsvcid); kfree(opts->subsysnqn); + kfree(opts->host_traddr); kfree(opts); } EXPORT_SYMBOL_GPL(nvmf_free_options); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 2755efde16ac1e..8f08c3a3406b04 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -52,6 +52,7 @@ enum { NVMF_OPT_KATO = 1 << 7, NVMF_OPT_HOSTNQN = 1 << 8, NVMF_OPT_RECONNECT_DELAY = 1 << 9, + NVMF_OPT_HOST_TRADDR = 1 << 10, }; /** @@ -68,6 +69,8 @@ enum { * subsystem which is adding a controller. * @trsvcid: The transport-specific TRSVCID field for a port on the * subsystem which is adding a controller. + * @host_traddr: A transport-specific field identifying the NVME host port + * to use for the connection to the controller. * @queue_size: Number of IO queue elements. * @nr_io_queues: Number of controller IO queues that will be established. * @reconnect_delay: Time between two consecutive reconnect attempts. @@ -81,6 +84,7 @@ struct nvmf_ctrl_options { char *subsysnqn; char *traddr; char *trsvcid; + char *host_traddr; size_t queue_size; unsigned int nr_io_queues; unsigned int reconnect_delay; From 2d79c7dc8fe5cf1158250a5fd25c02d781324cd3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 1 Sep 2016 20:45:03 +0100 Subject: [PATCH 49/54] admin-cmd: Added smart-log command support. This patch implements the support for smart-log command (NVM Express 1.2.1-section 5.10.1.2 SMART / Health Information (Log Identifier 02h)) on the target for NVMe over Fabric. In current implementation host can retrieve following statistics:- 1. Data Units Read. 2. Data Units Written. 3. 
Host Read Commands. 4. Host Write Commands. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/target/admin-cmd.c | 88 +++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 47c564b5a28951..7ab9c9381b9895 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include "nvmet.h" u32 nvmet_get_log_page_len(struct nvme_command *cmd) @@ -29,8 +30,84 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) return len; } +static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + struct nvmet_ns *ns; + u64 host_reads, host_writes, data_units_read, data_units_written; + + status = NVME_SC_SUCCESS; + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS; + pr_err("nvmet : Counld not find namespace id : %d\n", + le32_to_cpu(req->cmd->get_log_page.nsid)); + goto out; + } + + host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); + data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); + host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); + data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + nvmet_put_namespace(ns); +out: + return status; +} + +static u16 nvmet_get_smart_log_all(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + u64 host_reads = 0, host_writes = 0; + u64 data_units_read = 0, data_units_written = 0; + struct nvmet_ns *ns; + struct nvmet_ctrl *ctrl; + + status = NVME_SC_SUCCESS; + ctrl = req->sq->ctrl; + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); + data_units_read += + part_stat_read(ns->bdev->bd_part, sectors[READ]); + host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); + data_units_written += + part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + + } + rcu_read_unlock(); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + + return status; +} + +static u16 nvmet_get_smart_log(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + + WARN_ON(req == NULL || slog == NULL); + if (req->cmd->get_log_page.nsid == 0xFFFFFFFF) + status = nvmet_get_smart_log_all(req, slog); + else + status = nvmet_get_smart_log_nsid(req, slog); + return status; +} + static void nvmet_execute_get_log_page(struct nvmet_req *req) { + struct nvme_smart_log *smart_log; size_t data_len = nvmet_get_log_page_len(req->cmd); void *buf; u16 status = 0; @@ -59,6 +136,16 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) * available (e.g. units or commands read/written) those aren't * persistent over power loss. 
*/ + if (data_len != sizeof(*smart_log)) { + status = NVME_SC_INTERNAL; + goto err; + } + smart_log = buf; + status = nvmet_get_smart_log(req, smart_log); + if (status) { + memset(buf, '\0', data_len); + goto err; + } break; case 0x03: /* @@ -73,6 +160,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) status = nvmet_copy_to_sgl(req, 0, buf, data_len); +err: kfree(buf); out: nvmet_req_complete(req, status); From 9b349b080ca9777f6dab3da06d7fa4577f7d4c29 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 21 Sep 2016 11:06:32 -0700 Subject: [PATCH 50/54] nvmet: Use direct IO for writes We're designed to work with high-end devices where direct IO makes perfect sense. We noticed that we context switch by scheduling kblockd instead of going directly to the device without REQ_SYNC for writes. Signed-off-by: Sagi Grimberg Reviewed-by: Jens Axboe --- drivers/nvme/target/io-cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 2cd069b691ae54..4132b6b9818233 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -58,6 +58,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) if (req->cmd->rw.opcode == nvme_cmd_write) { op = REQ_OP_WRITE; + op_flags = WRITE_ODIRECT; if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) op_flags |= REQ_FUA; } else { From 2e5d0baa04845dc3a3d7dfa33d7663de270bb146 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Wed, 21 Sep 2016 14:12:38 -0700 Subject: [PATCH 51/54] nvmet: Make dsm number of ranges zero based This caused the nvmet request data length to be incorrect. Signed-off-by: Alexander Solganik Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig --- drivers/nvme/target/io-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 4132b6b9818233..4a96c2049b7b6b 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -206,7 +206,7 @@ int nvmet_parse_io_cmd(struct nvmet_req *req) return 0; case nvme_cmd_dsm: req->execute = nvmet_execute_dsm; - req->data_len = le32_to_cpu(cmd->dsm.nr) * + req->data_len = le32_to_cpu(cmd->dsm.nr + 1) * sizeof(struct nvme_dsm_range); return 0; default: From 26501db8dcbc3c63c0d8fb6c5bb098bc7d35d741 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 16 Sep 2016 11:16:09 -0700 Subject: [PATCH 52/54] nvme/scsi: Remove power management support As far as I can tell, there is basically nothing correct about this code. It misinterprets npss (off-by-one). It hardcodes a bunch of power states, which is nonsense, because they're all just indices into a table that software needs to parse. It completely ignores the distinction between operational and non-operational states. And, until 4.8, if all of the above magically succeeded, it would dereference a NULL pointer and OOPS. Since this code appears to be useless, just delete it. 
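For background only, a hedged sketch of what a correct translation would have needed to do instead of hardcoding indices: walk the identify-controller power state descriptor table (npss is zero's based, so npss + 1 entries) and honour the operational/non-operational distinction. The patch itself simply deletes the translation; the helper below is not proposed code, and it ignores the MXPS power-scale flag for brevity.

/* Illustrative only: choose the lowest-power *operational* state. */
static int nvme_lowest_operational_ps(struct nvme_id_ctrl *id)
{
	int ps, best = 0;
	u16 lowest = 0xffff;

	for (ps = 0; ps <= id->npss; ps++) {
		u16 mp = le16_to_cpu(id->psd[ps].max_power);

		if (id->psd[ps].flags & NVME_PS_FLAGS_NON_OP_STATE)
			continue;	/* cannot service I/O in this state */
		if (mp < lowest) {
			lowest = mp;
			best = ps;
		}
	}
	return best;
}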
Signed-off-by: Andy Lutomirski Reviewed-by: Christoph Hellwig Acked-by: Jay Freyensee Tested-by: Jay Freyensee Signed-off-by: Jens Axboe --- drivers/nvme/host/scsi.c | 74 ++-------------------------------------- 1 file changed, 3 insertions(+), 71 deletions(-) diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index e947e298a737b1..44009105f8c808 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -72,15 +72,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define ALL_LUNS_RETURNED 0x02 #define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 #define RESTRICTED_LUNS_RETURNED 0x00 -#define NVME_POWER_STATE_START_VALID 0x00 -#define NVME_POWER_STATE_ACTIVE 0x01 -#define NVME_POWER_STATE_IDLE 0x02 -#define NVME_POWER_STATE_STANDBY 0x03 -#define NVME_POWER_STATE_LU_CONTROL 0x07 -#define POWER_STATE_0 0 -#define POWER_STATE_1 1 -#define POWER_STATE_2 2 -#define POWER_STATE_3 3 #define DOWNLOAD_SAVE_ACTIVATE 0x05 #define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E #define ACTIVATE_DEFERRED_MICROCODE 0x0F @@ -1229,64 +1220,6 @@ static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, /* Start Stop Unit Helper Functions */ -static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 pc, u8 pcmod, u8 start) -{ - int res; - int nvme_sc; - struct nvme_id_ctrl *id_ctrl; - int lowest_pow_st; /* max npss = lowest power consumption */ - unsigned ps_desired = 0; - - nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); - kfree(id_ctrl); - - switch (pc) { - case NVME_POWER_STATE_START_VALID: - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ - if (pcmod == 0 && start == 0x1) - ps_desired = POWER_STATE_0; - if (pcmod == 0 && start == 0x0) - ps_desired = lowest_pow_st; - break; - case NVME_POWER_STATE_ACTIVE: - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ - if (pcmod == 0) - ps_desired = POWER_STATE_0; - break; - case NVME_POWER_STATE_IDLE: - /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ - if (pcmod == 0x0) - ps_desired = POWER_STATE_1; - else if (pcmod == 0x1) - ps_desired = POWER_STATE_2; - else if (pcmod == 0x2) - ps_desired = POWER_STATE_3; - break; - case NVME_POWER_STATE_STANDBY: - /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ - if (pcmod == 0x0) - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); - else if (pcmod == 0x1) - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); - break; - case NVME_POWER_STATE_LU_CONTROL: - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, - NULL); - return nvme_trans_status_code(hdr, nvme_sc); -} - static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 buffer_id) { @@ -2235,11 +2168,10 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - u8 immed, pcmod, pc, no_flush, start; + u8 immed, pcmod, no_flush, start; immed = cmd[1] & 0x01; pcmod = cmd[3] & 0x0f; - pc = (cmd[4] & 0xf0) >> 4; no_flush = cmd[4] & 0x04; start = cmd[4] & 0x01; @@ -2254,8 +2186,8 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (res) return res; } - /* Setup the expected power state 
transition */ - return nvme_trans_power_state(ns, hdr, pc, pcmod, start); + + return 0; } } From 1a6fe74dfd1bb10afb41cbbbdc14890604be42a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 16 Sep 2016 11:16:10 -0700 Subject: [PATCH 53/54] nvme: Pass pointers, not dma addresses, to nvme_get/set_features() Any user I can imagine that needs a buffer at all will want to pass a pointer directly. There are no currently callers that use buffers, so this change is painless, and it will make it much easier to start using features that use buffers (e.g. APST). Signed-off-by: Andy Lutomirski Reviewed-by: Christoph Hellwig Acked-by: Jay Freyensee Tested-by: Jay Freyensee Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 14 ++++++-------- drivers/nvme/host/nvme.h | 4 ++-- drivers/nvme/host/scsi.c | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bd2156cbfc6cb9..4669c052239ed2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -599,7 +599,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, } int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) + void *buffer, size_t buflen, u32 *result) { struct nvme_command c; struct nvme_completion cqe; @@ -608,10 +608,9 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; c.features.nsid = cpu_to_le32(nsid); - c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, buffer, buflen, 0, NVME_QID_ANY, 0, 0); if (ret >= 0 && result) *result = le32_to_cpu(cqe.result); @@ -619,7 +618,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, } int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result) + void *buffer, size_t buflen, u32 *result) { struct nvme_command c; struct nvme_completion cqe; @@ -627,12 +626,11 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_set_features; - c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, + buffer, buflen, 0, NVME_QID_ANY, 0, 0); if (ret >= 0 && result) *result = le32_to_cpu(cqe.result); return ret; @@ -666,7 +664,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) u32 result; int status, nr_io_queues; - status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, &result); if (status < 0) return status; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index bfd25dd73bca2c..b0a9ec681685c8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -293,9 +293,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result); + void *buffer, size_t buflen, u32 *result); int nvme_set_features(struct nvme_ctrl *dev, 
unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result); + void *buffer, size_t buflen, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index 44009105f8c808..c2a0a1c7d05d15 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -906,7 +906,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, kfree(smart_log); /* Get Features for Temp Threshold */ - res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0, &feature_resp); if (res != NVME_SC_SUCCESS) temp_c_thresh = LOG_TEMP_UNKNOWN; @@ -1039,7 +1039,7 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns, if (len < MODE_PAGE_CACHING_LEN) return -EINVAL; - nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, NULL, 0, &feature_resp); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -1328,7 +1328,7 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, case MODE_PAGE_CACHING: dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, - dword11, 0, NULL); + dword11, NULL, 0, NULL); res = nvme_trans_status_code(hdr, nvme_sc); break; case MODE_PAGE_CONTROL: From 997198ba1ed691c09457120576c27dbd953d0557 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Tue, 4 Oct 2016 10:53:40 +0200 Subject: [PATCH 54/54] fs/block_dev.c: return the right error in thaw_bdev() When triggering thaw-filesystems via magic sysrq, the system enters a loop in do_thaw_one(), as thaw_bdev() still returns success if bd_fsfreeze_count == 0. To fix this, let thaw_bdev() always return error (and simplify the code a bit at the same time). Reviewed-by: Eric Farman Reviewed-by: Cornelia Huck Signed-off-by: Pierre Morel Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index a516568f63b6e9..376e4e42632416 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -299,14 +299,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) error = sb->s_op->thaw_super(sb); else error = thaw_super(sb); - if (error) { + if (error) bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } out: mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; + return error; } EXPORT_SYMBOL(thaw_bdev);
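For context, the emergency-thaw path that exposed the bug retries based on the return value; approximately as below (paraphrasing fs/buffer.c from memory, so treat the exact body as an assumption):

static void do_thaw_one(struct super_block *sb, void *unused)
{
	char b[BDEVNAME_SIZE];

	/* Loops as long as thaw_bdev() reports success. Before this fix,
	 * an already-thawed bdev (bd_fsfreeze_count == 0) also returned 0,
	 * so this spun forever; returning the error breaks the loop. */
	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
		printk(KERN_WARNING "Emergency Thaw on %s\n",
		       bdevname(sb->s_bdev, b));
}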