From f72b8792d180948b4b3898374998f5ac8c02e539 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Aug 2016 15:51:50 -0600 Subject: [PATCH 01/54] workqueue: add cancel_work() Like cancel_delayed_work(), but for regular work. Signed-off-by: Jens Axboe Mehed-by: Tejun Heo Acked-by: Tejun Heo --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 40 +++++++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 26cc1df280d65a..fc6e2218640587 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -442,6 +442,7 @@ extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); +extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc325..bd81f039027782 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +static bool __cancel_work(struct work_struct *work, bool is_dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(work, is_dwork, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); + local_irq_restore(flags); + return ret; +} + +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(&dwork->work, true, &flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - - set_work_pool_and_clear_pending(&dwork->work, - get_work_pool_id(&dwork->work)); - local_irq_restore(flags); - return ret; + return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work); From ee63cfa7fc197b63669623721b8009cce5b0659b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Aug 2016 15:52:48 -0600 Subject: [PATCH 02/54] block: add kblockd_schedule_work_on() Add a helper to schedule a regular struct work on a particular CPU. 
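A minimal usage sketch, purely for illustration (the example_dev type, field, and function names below are made up and are not part of this patch):

	struct example_dev {
		struct work_struct complete_work;	/* INIT_WORK()ed at probe time */
	};

	static void example_kick_completion(struct example_dev *edev, int cpu)
	{
		/* run the completion work item from kblockd on the chosen CPU */
		kblockd_schedule_work_on(cpu, &edev->complete_work);
	}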
Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++++ include/linux/blkdev.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 36c7ac328d8c17..2d08597533a438 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3097,6 +3097,12 @@ int kblockd_schedule_work(struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, kblockd_workqueue, work); +} +EXPORT_SYMBOL(kblockd_schedule_work_on); + int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e79055c8b57799..69aae720f4ef38 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1440,8 +1440,8 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) return bio_will_gap(req->q, bio, req->bio); } -struct work_struct; int kblockd_schedule_work(struct work_struct *work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); From 27489a3c827b7eebba26eda0320bb0f100bef167 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 24 Aug 2016 15:54:25 -0600 Subject: [PATCH 03/54] blk-mq: turn hctx->run_work into a regular work struct We don't need the larger delayed work struct, since we always run it immediately. Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 9 ++++----- include/linux/blk-mq.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2d08597533a438..34ff8088eebee1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -288,7 +288,7 @@ void blk_sync_queue(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - cancel_delayed_work_sync(&hctx->run_work); + cancel_work_sync(&hctx->run_work); cancel_delayed_work_sync(&hctx->delay_work); } } else { diff --git a/block/blk-mq.c b/block/blk-mq.c index 13f5a6c1de7682..b68fdcbe58f6fe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -936,8 +936,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, 0); + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -958,7 +957,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->run_work); + cancel_work(&hctx->run_work); cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -1011,7 +1010,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work); __blk_mq_run_hw_queue(hctx); } @@ -1722,7 +1721,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (node == NUMA_NO_NODE) node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 
e43bbffb5b7a35..d579252e6463b2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -25,7 +25,7 @@ struct blk_mq_hw_ctx { } ____cacheline_aligned_in_smp; unsigned long state; /* BLK_MQ_S_* flags */ - struct delayed_work run_work; + struct work_struct run_work; struct delayed_work delay_work; cpumask_var_t cpumask; int next_cpu; From 8d354f133e86dd03ea7885a91df398c55ff699ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Aug 2016 08:00:28 -0600 Subject: [PATCH 04/54] blk-mq: improve layout of blk_mq_hw_ctx Various cache line optimizations: - Move delay_work towards the end. It's huge, and we don't use it a lot (only SCSI). - Move the atomic state into the same cacheline as the the dispatch list and lock. - Rearrange a few members to pack it better. - Shrink the max-order for dispatch accounting from 10 to 7. This means that ->dispatched[] and ->run now take up their own cacheline. This shrinks struct blk_mq_hw_ctx down to 8 cachelines. Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d579252e6463b2..e1544f0f8c214b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -22,11 +22,10 @@ struct blk_mq_hw_ctx { struct { spinlock_t lock; struct list_head dispatch; + unsigned long state; /* BLK_MQ_S_* flags */ } ____cacheline_aligned_in_smp; - unsigned long state; /* BLK_MQ_S_* flags */ struct work_struct run_work; - struct delayed_work delay_work; cpumask_var_t cpumask; int next_cpu; int next_cpu_batch; @@ -40,8 +39,8 @@ struct blk_mq_hw_ctx { struct blk_mq_ctxmap ctx_map; - unsigned int nr_ctx; struct blk_mq_ctx **ctxs; + unsigned int nr_ctx; atomic_t wait_index; @@ -49,7 +48,7 @@ struct blk_mq_hw_ctx { unsigned long queued; unsigned long run; -#define BLK_MQ_MAX_DISPATCH_ORDER 10 +#define BLK_MQ_MAX_DISPATCH_ORDER 7 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned int numa_node; @@ -57,6 +56,8 @@ struct blk_mq_hw_ctx { atomic_t nr_active; + struct delayed_work delay_work; + struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; From 88c7b2b75132c3ff8180b71e4f06cf043a00eac8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Aug 2016 08:07:30 -0600 Subject: [PATCH 05/54] blk-mq: prefetch request in blk_mq_tag_to_rq() When drivers or the core calls this function, they usually dereference the request shortly there after. Prefetch the first cache line. Profiling IO workloads shows that this is the most common cache miss on the block side of things. 
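The common pattern this helps is a driver completion path that looks up a request by tag and dereferences it straight away; a rough, hypothetical sketch (the function name is invented for illustration):

	static void example_complete_tag(struct blk_mq_tags *tags, unsigned int tag)
	{
		struct request *rq = blk_mq_tag_to_rq(tags, tag);

		/* rq is used immediately, so the prefetch hides part of the miss */
		if (rq)
			blk_mq_complete_request(rq, 0);
	}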
Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index b68fdcbe58f6fe..eea0d230faa123 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -588,8 +589,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - if (tag < tags->nr_tags) + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); return tags->rqs[tag]; + } return NULL; } From 99e6b87ec2102b10e190c92ea9560bafcb744f86 Mon Sep 17 00:00:00 2001 From: Baoyou Xie Date: Fri, 26 Aug 2016 14:08:53 +0800 Subject: [PATCH 06/54] mtip32xx: mark symbols static where possible We get 1 warning when biuld kernel with W=1: drivers/block/mtip32xx/mtip32xx.c:3689:6: warning: no previous prototype for 'mtip_block_release' [-Wmissing-prototypes] In fact, this function is only used in the file in which it is declared and don't need a declaration, but can be made static. so this patch marks it 'static'. Signed-off-by: Baoyou Xie Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 2aca98e8e42739..88c46853dbb54b 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3686,7 +3686,7 @@ static int mtip_block_open(struct block_device *dev, fmode_t mode) return -ENODEV; } -void mtip_block_release(struct gendisk *disk, fmode_t mode) +static void mtip_block_release(struct gendisk *disk, fmode_t mode) { } From fd8383fd88a2fd842a9431df5ed353bd7129eecc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Sep 2016 12:33:37 -0700 Subject: [PATCH 07/54] nbd: convert to blkmq This moves NBD over to using blkmq, which allows us to get rid of the NBD wide queue lock and the async submit kthread. We will start with 1 hw queue for now, but I plan to add multiple tcp connection support in the future and we'll fix how we set the hwqueue's. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 337 +++++++++++++++++--------------------------- 1 file changed, 129 insertions(+), 208 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a9e398019f3815..15e7c6740873bc 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -45,12 +46,8 @@ struct nbd_device { struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; - spinlock_t queue_lock; - struct list_head queue_head; /* Requests waiting result */ - struct request *active_req; - wait_queue_head_t active_wq; - struct list_head waiting_queue; /* Requests to be sent */ - wait_queue_head_t waiting_wq; + atomic_t outstanding_cmds; + struct blk_mq_tag_set tag_set; struct mutex tx_lock; struct gendisk *disk; @@ -71,6 +68,11 @@ struct nbd_device { #endif }; +struct nbd_cmd { + struct nbd_device *nbd; + struct list_head list; +}; + #if IS_ENABLED(CONFIG_DEBUG_FS) static struct dentry *nbd_dbg_dir; #endif @@ -83,18 +85,6 @@ static unsigned int nbds_max = 16; static struct nbd_device *nbd_dev; static int max_part; -/* - * Use just one lock (or at most 1 per NIC). Two arguments for this: - * 1. Each NIC is essentially a synchronization point for all servers - * accessed through that NIC so there's no need to have more locks - * than NICs anyway. - * 2. 
More locks lead to more "Dirty cache line bouncing" which will slow - * down each lock to the point where they're actually slower than just - * a single lock. - * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this! - */ -static DEFINE_SPINLOCK(nbd_lock); - static inline struct device *nbd_to_dev(struct nbd_device *nbd) { return disk_to_dev(nbd->disk); @@ -153,18 +143,17 @@ static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev, return 0; } -static void nbd_end_request(struct nbd_device *nbd, struct request *req) +static void nbd_end_request(struct nbd_cmd *cmd) { + struct nbd_device *nbd = cmd->nbd; + struct request *req = blk_mq_rq_from_pdu(cmd); int error = req->errors ? -EIO : 0; - struct request_queue *q = req->q; - unsigned long flags; - dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req, + dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd, error ? "failed" : "done"); - spin_lock_irqsave(q->queue_lock, flags); - __blk_end_request_all(req, error); - spin_unlock_irqrestore(q->queue_lock, flags); + atomic_dec(&nbd->outstanding_cmds); + blk_mq_complete_request(req, error); } /* @@ -193,7 +182,7 @@ static void nbd_xmit_timeout(unsigned long arg) struct nbd_device *nbd = (struct nbd_device *)arg; unsigned long flags; - if (list_empty(&nbd->queue_head)) + if (!atomic_read(&nbd->outstanding_cmds)) return; spin_lock_irqsave(&nbd->sock_lock, flags); @@ -273,8 +262,9 @@ static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, } /* always call with the tx_lock held */ -static int nbd_send_req(struct nbd_device *nbd, struct request *req) +static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) { + struct request *req = blk_mq_rq_from_pdu(cmd); int result, flags; struct nbd_request request; unsigned long size = blk_rq_bytes(req); @@ -298,10 +288,10 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } - memcpy(request.handle, &req, sizeof(req)); + memcpy(request.handle, &req->tag, sizeof(req->tag)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", - req, nbdcmd_to_ascii(type), + cmd, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); result = sock_xmit(nbd, 1, &request, sizeof(request), (type == NBD_CMD_WRITE) ? 
MSG_MORE : 0); @@ -323,7 +313,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) if (!rq_iter_last(bvec, iter)) flags = MSG_MORE; dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", - req, bvec.bv_len); + cmd, bvec.bv_len); result = sock_send_bvec(nbd, &bvec, flags); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), @@ -336,29 +326,6 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) return 0; } -static struct request *nbd_find_request(struct nbd_device *nbd, - struct request *xreq) -{ - struct request *req, *tmp; - int err; - - err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); - if (unlikely(err)) - return ERR_PTR(err); - - spin_lock(&nbd->queue_lock); - list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { - if (req != xreq) - continue; - list_del_init(&req->queuelist); - spin_unlock(&nbd->queue_lock); - return req; - } - spin_unlock(&nbd->queue_lock); - - return ERR_PTR(-ENOENT); -} - static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) { int result; @@ -370,11 +337,14 @@ static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) } /* NULL returned = something went wrong, inform userspace */ -static struct request *nbd_read_stat(struct nbd_device *nbd) +static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) { int result; struct nbd_reply reply; - struct request *req; + struct nbd_cmd *cmd; + struct request *req = NULL; + u16 hwq; + int tag; reply.magic = 0; result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); @@ -390,25 +360,27 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) return ERR_PTR(-EPROTO); } - req = nbd_find_request(nbd, *(struct request **)reply.handle); - if (IS_ERR(req)) { - result = PTR_ERR(req); - if (result != -ENOENT) - return ERR_PTR(result); + memcpy(&tag, reply.handle, sizeof(int)); - dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", - reply.handle); - return ERR_PTR(-EBADR); + hwq = blk_mq_unique_tag_to_hwq(tag); + if (hwq < nbd->tag_set.nr_hw_queues) + req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq], + blk_mq_unique_tag_to_tag(tag)); + if (!req || !blk_mq_request_started(req)) { + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n", + tag, req); + return ERR_PTR(-ENOENT); } + cmd = blk_mq_rq_to_pdu(req); if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); req->errors++; - return req; + return cmd; } - dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); + dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd); if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; @@ -419,13 +391,13 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); req->errors++; - return req; + return cmd; } dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", - req, bvec.bv_len); + cmd, bvec.bv_len); } } - return req; + return cmd; } static ssize_t pid_show(struct device *dev, @@ -444,7 +416,7 @@ static struct device_attribute pid_attr = { static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) { - struct request *req; + struct nbd_cmd *cmd; int ret; BUG_ON(nbd->magic != NBD_MAGIC); @@ -460,13 +432,13 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) nbd_size_update(nbd, bdev); while (1) { - req = nbd_read_stat(nbd); - if (IS_ERR(req)) { - ret = PTR_ERR(req); + cmd = 
nbd_read_stat(nbd); + if (IS_ERR(cmd)) { + ret = PTR_ERR(cmd); break; } - nbd_end_request(nbd, req); + nbd_end_request(cmd); } nbd_size_clear(nbd, bdev); @@ -475,44 +447,37 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) return ret; } -static void nbd_clear_que(struct nbd_device *nbd) +static void nbd_clear_req(struct request *req, void *data, bool reserved) { - struct request *req; + struct nbd_cmd *cmd; + if (!blk_mq_request_started(req)) + return; + cmd = blk_mq_rq_to_pdu(req); + req->errors++; + nbd_end_request(cmd); +} + +static void nbd_clear_que(struct nbd_device *nbd) +{ BUG_ON(nbd->magic != NBD_MAGIC); /* * Because we have set nbd->sock to NULL under the tx_lock, all - * modifications to the list must have completed by now. For - * the same reason, the active_req must be NULL. - * - * As a consequence, we don't need to take the spin lock while - * purging the list here. + * modifications to the list must have completed by now. */ BUG_ON(nbd->sock); - BUG_ON(nbd->active_req); - while (!list_empty(&nbd->queue_head)) { - req = list_entry(nbd->queue_head.next, struct request, - queuelist); - list_del_init(&req->queuelist); - req->errors++; - nbd_end_request(nbd, req); - } - - while (!list_empty(&nbd->waiting_queue)) { - req = list_entry(nbd->waiting_queue.next, struct request, - queuelist); - list_del_init(&req->queuelist); - req->errors++; - nbd_end_request(nbd, req); - } + blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); } -static void nbd_handle_req(struct nbd_device *nbd, struct request *req) +static void nbd_handle_cmd(struct nbd_cmd *cmd) { + struct request *req = blk_mq_rq_from_pdu(cmd); + struct nbd_device *nbd = cmd->nbd; + if (req->cmd_type != REQ_TYPE_FS) goto error_out; @@ -526,6 +491,7 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) req->errors = 0; mutex_lock(&nbd->tx_lock); + nbd->task_send = current; if (unlikely(!nbd->sock)) { mutex_unlock(&nbd->tx_lock); dev_err(disk_to_dev(nbd->disk), @@ -533,106 +499,34 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) goto error_out; } - nbd->active_req = req; - - if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head)) + if (nbd->xmit_timeout && !atomic_read(&nbd->outstanding_cmds)) mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); - if (nbd_send_req(nbd, req) != 0) { + atomic_inc(&nbd->outstanding_cmds); + if (nbd_send_cmd(nbd, cmd) != 0) { dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; - nbd_end_request(nbd, req); - } else { - spin_lock(&nbd->queue_lock); - list_add_tail(&req->queuelist, &nbd->queue_head); - spin_unlock(&nbd->queue_lock); + nbd_end_request(cmd); } - nbd->active_req = NULL; + nbd->task_send = NULL; mutex_unlock(&nbd->tx_lock); - wake_up_all(&nbd->active_wq); return; error_out: req->errors++; - nbd_end_request(nbd, req); -} - -static int nbd_thread_send(void *data) -{ - struct nbd_device *nbd = data; - struct request *req; - - nbd->task_send = current; - - set_user_nice(current, MIN_NICE); - while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { - /* wait for something to do */ - wait_event_interruptible(nbd->waiting_wq, - kthread_should_stop() || - !list_empty(&nbd->waiting_queue)); - - /* extract request */ - if (list_empty(&nbd->waiting_queue)) - continue; - - spin_lock_irq(&nbd->queue_lock); - req = list_entry(nbd->waiting_queue.next, struct request, - queuelist); - list_del_init(&req->queuelist); - 
spin_unlock_irq(&nbd->queue_lock); - - /* handle request */ - nbd_handle_req(nbd, req); - } - - nbd->task_send = NULL; - - return 0; + nbd_end_request(cmd); } -/* - * We always wait for result of write, for now. It would be nice to make it optional - * in future - * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) - * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } - */ - -static void nbd_request_handler(struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) +static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct request *req; - - while ((req = blk_fetch_request(q)) != NULL) { - struct nbd_device *nbd; - - spin_unlock_irq(q->queue_lock); - - nbd = req->rq_disk->private_data; - - BUG_ON(nbd->magic != NBD_MAGIC); + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n", - req, req->cmd_type); - - if (unlikely(!nbd->sock)) { - dev_err_ratelimited(disk_to_dev(nbd->disk), - "Attempted send on closed socket\n"); - req->errors++; - nbd_end_request(nbd, req); - spin_lock_irq(q->queue_lock); - continue; - } - - spin_lock_irq(&nbd->queue_lock); - list_add_tail(&req->queuelist, &nbd->waiting_queue); - spin_unlock_irq(&nbd->queue_lock); - - wake_up(&nbd->waiting_wq); - - spin_lock_irq(q->queue_lock); - } + blk_mq_start_request(bd->rq); + nbd_handle_cmd(cmd); + return BLK_MQ_RQ_QUEUE_OK; } static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) @@ -700,33 +594,37 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, { switch (cmd) { case NBD_DISCONNECT: { - struct request sreq; + struct request *sreq; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); if (!nbd->sock) return -EINVAL; + sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0); + if (!sreq) + return -ENOMEM; + mutex_unlock(&nbd->tx_lock); fsync_bdev(bdev); mutex_lock(&nbd->tx_lock); - blk_rq_init(NULL, &sreq); - sreq.cmd_type = REQ_TYPE_DRV_PRIV; + sreq->cmd_type = REQ_TYPE_DRV_PRIV; /* Check again after getting mutex back. */ - if (!nbd->sock) + if (!nbd->sock) { + blk_mq_free_request(sreq); return -EINVAL; + } nbd->disconnect = true; - nbd_send_req(nbd, &sreq); + nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq)); + blk_mq_free_request(sreq); return 0; } case NBD_CLEAR_SOCK: sock_shutdown(nbd); nbd_clear_que(nbd); - BUG_ON(!list_empty(&nbd->queue_head)); - BUG_ON(!list_empty(&nbd->waiting_queue)); kill_bdev(bdev); return 0; @@ -772,7 +670,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return 0; case NBD_DO_IT: { - struct task_struct *thread; int error; if (nbd->task_recv) @@ -786,18 +683,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_parse_flags(nbd, bdev); - thread = kthread_run(nbd_thread_send, nbd, "%s", - nbd_name(nbd)); - if (IS_ERR(thread)) { - mutex_lock(&nbd->tx_lock); - nbd->task_recv = NULL; - return PTR_ERR(thread); - } - nbd_dev_dbg_init(nbd); error = nbd_thread_recv(nbd, bdev); nbd_dev_dbg_close(nbd); - kthread_stop(thread); mutex_lock(&nbd->tx_lock); nbd->task_recv = NULL; @@ -825,10 +713,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return 0; case NBD_PRINT_DEBUG: - dev_info(disk_to_dev(nbd->disk), - "next = %p, prev = %p, head = %p\n", - nbd->queue_head.next, nbd->queue_head.prev, - &nbd->queue_head); + /* + * For compatibility only, we no longer keep a list of + * outstanding requests. 
+ */ return 0; } return -ENOTTY; @@ -987,6 +875,23 @@ static void nbd_dbg_close(void) #endif +static int nbd_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->nbd = data; + INIT_LIST_HEAD(&cmd->list); + return 0; +} + +static struct blk_mq_ops nbd_mq_ops = { + .queue_rq = nbd_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = nbd_init_request, +}; + /* * And here should be modules and kernel interface * (Just smiley confuses emacs :-) @@ -1035,16 +940,34 @@ static int __init nbd_init(void) if (!disk) goto out; nbd_dev[i].disk = disk; + + nbd_dev[i].tag_set.ops = &nbd_mq_ops; + nbd_dev[i].tag_set.nr_hw_queues = 1; + nbd_dev[i].tag_set.queue_depth = 128; + nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE; + nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd); + nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_SG_MERGE; + nbd_dev[i].tag_set.driver_data = &nbd_dev[i]; + + err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set); + if (err) { + put_disk(disk); + goto out; + } + /* * The new linux 2.5 block layer implementation requires * every gendisk to have its very own request_queue struct. * These structs are big so we dynamically allocate them. */ - disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock); + disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set); if (!disk->queue) { + blk_mq_free_tag_set(&nbd_dev[i].tag_set); put_disk(disk); goto out; } + /* * Tell the block layer that we are not a rotational device */ @@ -1069,16 +992,12 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].magic = NBD_MAGIC; - INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); - spin_lock_init(&nbd_dev[i].queue_lock); spin_lock_init(&nbd_dev[i].sock_lock); - INIT_LIST_HEAD(&nbd_dev[i].queue_head); mutex_init(&nbd_dev[i].tx_lock); init_timer(&nbd_dev[i].timeout_timer); nbd_dev[i].timeout_timer.function = nbd_xmit_timeout; nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; - init_waitqueue_head(&nbd_dev[i].active_wq); - init_waitqueue_head(&nbd_dev[i].waiting_wq); + atomic_set(&nbd_dev[i].outstanding_cmds, 0); disk->major = NBD_MAJOR; disk->first_minor = i << part_shift; disk->fops = &nbd_fops; @@ -1091,6 +1010,7 @@ static int __init nbd_init(void) return 0; out: while (i--) { + blk_mq_free_tag_set(&nbd_dev[i].tag_set); blk_cleanup_queue(nbd_dev[i].disk->queue); put_disk(nbd_dev[i].disk); } @@ -1110,6 +1030,7 @@ static void __exit nbd_cleanup(void) if (disk) { del_gendisk(disk); blk_cleanup_queue(disk->queue); + blk_mq_free_tag_set(&nbd_dev[i].tag_set); put_disk(disk); } } From c261189862c6f65117eb3b1748622a08ef49c262 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Sep 2016 12:33:38 -0700 Subject: [PATCH 08/54] nbd: don't shutdown sock with irq's disabled We hit a warning when shutting down the nbd connection because we have irq's disabled. We don't really need to do the shutdown under the lock, just clear the nbd->sock. So do the shutdown outside of the irq. This gets rid of the warning. 
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 15e7c6740873bc..4b7d0f3c35217e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -161,6 +161,8 @@ static void nbd_end_request(struct nbd_cmd *cmd) */ static void sock_shutdown(struct nbd_device *nbd) { + struct socket *sock; + spin_lock_irq(&nbd->sock_lock); if (!nbd->sock) { @@ -168,18 +170,21 @@ static void sock_shutdown(struct nbd_device *nbd) return; } + sock = nbd->sock; dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); - sockfd_put(nbd->sock); nbd->sock = NULL; spin_unlock_irq(&nbd->sock_lock); + kernel_sock_shutdown(sock, SHUT_RDWR); + sockfd_put(sock); + del_timer(&nbd->timeout_timer); } static void nbd_xmit_timeout(unsigned long arg) { struct nbd_device *nbd = (struct nbd_device *)arg; + struct socket *sock = NULL; unsigned long flags; if (!atomic_read(&nbd->outstanding_cmds)) @@ -189,10 +194,16 @@ static void nbd_xmit_timeout(unsigned long arg) nbd->timedout = true; - if (nbd->sock) - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + if (nbd->sock) { + sock = nbd->sock; + get_file(sock->file); + } spin_unlock_irqrestore(&nbd->sock_lock, flags); + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sockfd_put(sock); + } dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); } From 9b4a6ba9185ac1c398f2db69c491989452ce7018 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Sep 2016 12:33:39 -0700 Subject: [PATCH 09/54] nbd: use flags instead of bool In preparation for some future changes, change a few of the state bools over to normal bits to set/clear properly. 
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4b7d0f3c35217e..cf855a1b3729ba 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -41,8 +41,12 @@ #include +#define NBD_TIMEDOUT 0 +#define NBD_DISCONNECT_REQUESTED 1 + struct nbd_device { u32 flags; + unsigned long runtime_flags; struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; @@ -54,8 +58,6 @@ struct nbd_device { int blksize; loff_t bytesize; int xmit_timeout; - bool timedout; - bool disconnect; /* a disconnect has been requested by user */ struct timer_list timeout_timer; /* protects initialization and shutdown of the socket */ @@ -192,7 +194,7 @@ static void nbd_xmit_timeout(unsigned long arg) spin_lock_irqsave(&nbd->sock_lock, flags); - nbd->timedout = true; + set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); if (nbd->sock) { sock = nbd->sock; @@ -562,8 +564,7 @@ static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) /* Reset all properties of an NBD device */ static void nbd_reset(struct nbd_device *nbd) { - nbd->disconnect = false; - nbd->timedout = false; + nbd->runtime_flags = 0; nbd->blksize = 1024; nbd->bytesize = 0; set_capacity(nbd->disk, 0); @@ -626,7 +627,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return -EINVAL; } - nbd->disconnect = true; + set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags); nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq)); blk_mq_free_request(sreq); @@ -706,9 +707,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, kill_bdev(bdev); nbd_bdev_reset(bdev); - if (nbd->disconnect) /* user requested, ignore socket errors */ + /* user requested, ignore socket errors */ + if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) error = 0; - if (nbd->timedout) + if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags)) error = -ETIMEDOUT; nbd_reset(nbd); From 0eadf37afc2500e1162c9040ec26a705b9af8d47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 8 Sep 2016 12:33:40 -0700 Subject: [PATCH 10/54] nbd: allow block mq to deal with timeouts Instead of rolling our own timer, just utilize the blk mq req timeout and do the disconnect if any of our commands timeout. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 51 +++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index cf855a1b3729ba..4c6dd1a85eade5 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -50,16 +50,13 @@ struct nbd_device { struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; - atomic_t outstanding_cmds; struct blk_mq_tag_set tag_set; struct mutex tx_lock; struct gendisk *disk; int blksize; loff_t bytesize; - int xmit_timeout; - struct timer_list timeout_timer; /* protects initialization and shutdown of the socket */ spinlock_t sock_lock; struct task_struct *task_recv; @@ -154,7 +151,6 @@ static void nbd_end_request(struct nbd_cmd *cmd) dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd, error ? 
"failed" : "done"); - atomic_dec(&nbd->outstanding_cmds); blk_mq_complete_request(req, error); } @@ -165,7 +161,7 @@ static void sock_shutdown(struct nbd_device *nbd) { struct socket *sock; - spin_lock_irq(&nbd->sock_lock); + spin_lock(&nbd->sock_lock); if (!nbd->sock) { spin_unlock_irq(&nbd->sock_lock); @@ -175,24 +171,20 @@ static void sock_shutdown(struct nbd_device *nbd) sock = nbd->sock; dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); nbd->sock = NULL; - spin_unlock_irq(&nbd->sock_lock); + spin_unlock(&nbd->sock_lock); kernel_sock_shutdown(sock, SHUT_RDWR); sockfd_put(sock); - - del_timer(&nbd->timeout_timer); } -static void nbd_xmit_timeout(unsigned long arg) +static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, + bool reserved) { - struct nbd_device *nbd = (struct nbd_device *)arg; + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); + struct nbd_device *nbd = cmd->nbd; struct socket *sock = NULL; - unsigned long flags; - - if (!atomic_read(&nbd->outstanding_cmds)) - return; - spin_lock_irqsave(&nbd->sock_lock, flags); + spin_lock(&nbd->sock_lock); set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); @@ -201,13 +193,15 @@ static void nbd_xmit_timeout(unsigned long arg) get_file(sock->file); } - spin_unlock_irqrestore(&nbd->sock_lock, flags); + spin_unlock(&nbd->sock_lock); if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sockfd_put(sock); } + req->errors++; dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); + return BLK_EH_HANDLED; } /* @@ -257,9 +251,6 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, tsk_restore_flags(current, pflags, PF_MEMALLOC); - if (!send && nbd->xmit_timeout) - mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); - return result; } @@ -512,10 +503,6 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd) goto error_out; } - if (nbd->xmit_timeout && !atomic_read(&nbd->outstanding_cmds)) - mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); - - atomic_inc(&nbd->outstanding_cmds); if (nbd_send_cmd(nbd, cmd) != 0) { dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; @@ -569,9 +556,8 @@ static void nbd_reset(struct nbd_device *nbd) nbd->bytesize = 0; set_capacity(nbd->disk, 0); nbd->flags = 0; - nbd->xmit_timeout = 0; + nbd->tag_set.timeout = 0; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); - del_timer_sync(&nbd->timeout_timer); } static void nbd_bdev_reset(struct block_device *bdev) @@ -668,13 +654,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return nbd_size_set(nbd, bdev, nbd->blksize, arg); case NBD_SET_TIMEOUT: - nbd->xmit_timeout = arg * HZ; - if (arg) - mod_timer(&nbd->timeout_timer, - jiffies + nbd->xmit_timeout); - else - del_timer_sync(&nbd->timeout_timer); - + nbd->tag_set.timeout = arg * HZ; return 0; case NBD_SET_FLAGS: @@ -836,7 +816,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); - debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); + debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); @@ -903,6 +883,7 @@ static struct blk_mq_ops nbd_mq_ops = { .queue_rq = nbd_queue_rq, .map_queue = blk_mq_map_queue, .init_request = nbd_init_request, + .timeout = nbd_xmit_timeout, }; /* @@ -1007,10 +988,6 @@ static int __init 
nbd_init(void) nbd_dev[i].magic = NBD_MAGIC; spin_lock_init(&nbd_dev[i].sock_lock); mutex_init(&nbd_dev[i].tx_lock); - init_timer(&nbd_dev[i].timeout_timer); - nbd_dev[i].timeout_timer.function = nbd_xmit_timeout; - nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; - atomic_set(&nbd_dev[i].outstanding_cmds, 0); disk->major = NBD_MAJOR; disk->first_minor = i << part_shift; disk->fops = &nbd_fops; From 6e219353afa1f67f453141f7462b01708ebf5574 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Tue, 13 Sep 2016 12:23:15 -0600 Subject: [PATCH 11/54] block: add poll_considered statistic In order to help determine the effectiveness of polling in a running system it is usful to determine the ratio of how often the poll function is called vs how often the completion is checked. For this reason we add a poll_considered variable and add it to the sysfs entry for io_poll. Signed-off-by: Stephen Bates Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++-- block/blk-mq-sysfs.c | 4 +++- include/linux/blk-mq.h | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 34ff8088eebee1..14d7c0740dc07a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3307,19 +3307,23 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_plug *plug; long state; + unsigned int queue_num; + struct blk_mq_hw_ctx *hctx; if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; + queue_num = blk_qc_t_to_queue_num(cookie); + hctx = q->queue_hw_ctx[queue_num]; + hctx->poll_considered++; + plug = current->plug; if (plug) blk_flush_plug_list(plug, false); state = current->state; while (!need_resched()) { - unsigned int queue_num = blk_qc_t_to_queue_num(cookie); - struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; int ret; hctx->poll_invoked++; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index fe822aa5b8e4a1..ea8c3f58afbdc6 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -176,7 +176,9 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) { - return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); + return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n", + hctx->poll_considered, hctx->poll_invoked, + hctx->poll_success); } static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e1544f0f8c214b..7710f795d7c284 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -61,6 +61,7 @@ struct blk_mq_hw_ctx { struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; + unsigned long poll_considered; unsigned long poll_invoked; unsigned long poll_success; }; From d21ea4bc0f6afbc852f1436c7c691c7b6fed0eb8 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Tue, 13 Sep 2016 12:23:16 -0600 Subject: [PATCH 12/54] block: enable zeroing of io_poll statistics Allow the io_poll statistics to be zeroed to make for easier logging of polling event. 
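For example (device name and counter values here are illustrative only), the counters can now be reset between test runs:

	# cat /sys/block/nvme0n1/mq/0/io_poll
	considered=1006, invoked=1006, success=997
	# echo 0 > /sys/block/nvme0n1/mq/0/io_poll
	# cat /sys/block/nvme0n1/mq/0/io_poll
	considered=0, invoked=0, success=0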
Signed-off-by: Stephen Bates Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index ea8c3f58afbdc6..ac5160eb686253 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -181,6 +181,14 @@ static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) hctx->poll_success); } +static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t size) +{ + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + + return size; +} + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -303,8 +311,9 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .show = blk_mq_hw_sysfs_cpus_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IRUGO }, + .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, + .store = blk_mq_hw_sysfs_poll_store, }; static struct attribute *default_hw_ctx_attrs[] = { From 223757016837d5bc8546c5683e13fbafe6cb374d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Sep 2016 11:56:13 +0200 Subject: [PATCH 13/54] block_dev: remove DAX leftovers DAX support for block devices was removed in commits 03cdad ("block: disable block device DAX by default") and 99a01cd ("block: remove BLK_DEV_DAX config option"), but we still kept a call to dax_do_io and some uneeded i_flags manipulations introduced in commit bbab37 ("block: Add support for DAX reads/writes to block devices"). Remove those leftovers. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Acked-by: Dan Williams Signed-off-by: Jens Axboe --- fs/block_dev.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 08ae99343d92f9..a516568f63b6e9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -180,9 +180,6 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); - if (IS_DAX(inode)) - return dax_do_io(iocb, inode, iter, blkdev_get_block, - NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -1275,7 +1272,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - bdev->bd_inode->i_flags = 0; if (!partno) { ret = -ENXIO; @@ -1303,11 +1299,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; - } /* * If the device is invalidated, rescan partition @@ -1342,8 +1335,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; } } else { if (bdev->bd_contains == bdev) { From abe47114b192a9e0167905a3418d815b4fcf87de Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 14 Sep 2016 14:33:15 +0200 Subject: [PATCH 14/54] block: remove blk_mq_alloc_single_hw_queue() prototype The blk_mq_alloc_single_hw_queue() is a prototype artifact that should have been removed with commit 
cdef54dd85ad66e77262ea57796a3e81683dd5d6 "blk-mq: remove alloc_hctx and free_hctx methods" where the last users of it were deleted. Fixes: cdef54dd85ad ("blk-mq: remove alloc_hctx and free_hctx methods") Signed-off-by: Linus Walleij Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7710f795d7c284..ff14f68067aaa1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -223,7 +223,6 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) } struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); int blk_mq_request_started(struct request *rq); void blk_mq_start_request(struct request *rq); From a441b0d093b3690b7cc2cb30998358904d051db4 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 14 Sep 2016 14:32:52 +0200 Subject: [PATCH 15/54] block: remove remnant refs to hardsect commit e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 "block: Do away with the notion of hardsect_size" removed the notion of "hardware sector size" from the kernel in favor of logical block size, but references remain in comments and documentation. Update the remaining sites mentioning hardsect. Signed-off-by: Linus Walleij Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 ++-- block/bio.c | 2 +- fs/befs/linuxvfs.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index bcdb2b4c1f12dc..918e1e0d0e78b2 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -115,7 +115,7 @@ i. Per-queue limits/values exported to the generic layer by the driver Various parameters that the generic i/o scheduler logic uses are set at a per-queue level (e.g maximum request size, maximum number of segments in -a scatter-gather list, hardsect size) +a scatter-gather list, logical block size) Some parameters that were earlier available as global arrays indexed by major/minor are now directly associated with the queue. Some of these may @@ -156,7 +156,7 @@ Some new queue property settings: blk_queue_max_segment_size(q, max_seg_size) Maximum size of a clustered segment, 64kB default. - blk_queue_hardsect_size(q, hardsect_size) + blk_queue_logical_block_size(q, logical_block_size) Lowest possible sector size that the hardware can operate on, 512 bytes default. diff --git a/block/bio.c b/block/bio.c index aa7354088008ba..a6d279e1ea9e96 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1274,7 +1274,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, nr_pages += end - start; /* - * buffer must be aligned to at least hardsector size for now + * buffer must be aligned to at least logical block size for now */ if (uaddr & queue_dma_alignment(q)) return ERR_PTR(-EINVAL); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 7da05b159ade2d..bfe9f999493531 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -789,7 +789,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * Will be set to real fs blocksize later. * * Linux 2.4.10 and later refuse to read blocks smaller than - * the hardsect size for the device. But we also need to read at + * the logical block size for the device. But we also need to read at * least 1k to get the second 512 bytes of the volume. 
* -WD 10-26-01 */ From 637ca77bd1f7950538956c61dcd0c2e559905dbf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 14 Sep 2016 10:44:12 +0200 Subject: [PATCH 16/54] block: Document that bio_op() uses the data type of bio.bi_opf Make it clear that the sizeof(unsigned int) expression in BIO_OP_SHIFT refers to the bi_opf member of struct bio. Signed-off-by: Bart Van Assche Cc: Mike Christie Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 436f43f87da928..1e1ef210ae91b0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -89,7 +89,7 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; -#define BIO_OP_SHIFT (8 * sizeof(unsigned int) - REQ_OP_BITS) +#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) #define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) #define bio_set_op_attrs(bio, op, op_flags) do { \ From 4382e33ad374862eacf62003bb02c750391ada05 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 14 Sep 2016 10:45:36 +0200 Subject: [PATCH 17/54] block, dm-crypt, btrfs: Introduce bio_flags() Introduce the bio_flags() macro. Ensure that the second argument of bio_set_op_attrs() only contains flags and no operation. This patch does not change any functionality. Signed-off-by: Bart Van Assche Cc: Mike Christie Cc: Chris Mason (maintainer:BTRFS FILE SYSTEM) Cc: Josef Bacik (maintainer:BTRFS FILE SYSTEM) Cc: Mike Snitzer Cc: Hannes Reinecke Cc: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 2 +- fs/btrfs/inode.c | 5 +++-- include/linux/blk_types.h | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index eedba67b0e3ef6..9ba0f0724d28fe 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1136,7 +1136,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf); + bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio)); } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e6811c42e41ef3..ca01106795ead9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8412,7 +8412,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8450,7 +8450,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), + bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1e1ef210ae91b0..311fa2f478b843 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -90,11 +90,12 @@ struct bio { }; #define BIO_OP_SHIFT (8 * 
FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) +#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) #define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) #define bio_set_op_attrs(bio, op, op_flags) do { \ WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1); \ + (bio)->bi_opf = bio_flags(bio); \ (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \ (bio)->bi_opf |= op_flags; \ } while (0) From 3e1de31b9bf608c5b35e2d0d134eb87f2a9ba4ae Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 14 Sep 2016 10:46:22 +0200 Subject: [PATCH 18/54] block: Improve bio_set_op_attrs() robustness Since REQ_OP_BITS == 3 and __REQ_NR_BITS == 30 it is not that hard to pass an op_flags argument to bio_set_op_attrs() that is larger than the number of bits reserved for the op_flags argument. Complain if this happens. Additionally, ensure that negative arguments trigger a complaint (1 << ... is signed while 1U << ... is unsigned; adding 0U to an integer expression causes it to be promoted to an unsigned type). Signed-off-by: Bart Van Assche Cc: Mike Christie Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 311fa2f478b843..53ee1a2acd4fde 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -93,11 +93,18 @@ struct bio { #define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) #define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) -#define bio_set_op_attrs(bio, op, op_flags) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (bio)->bi_opf = bio_flags(bio); \ - (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \ - (bio)->bi_opf |= op_flags; \ +#define bio_set_op_attrs(bio, op, op_flags) do { \ + if (__builtin_constant_p(op)) \ + BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ + else \ + WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ + if (__builtin_constant_p(op_flags)) \ + BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ + else \ + WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ + (bio)->bi_opf = bio_flags(bio); \ + (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ + (bio)->bi_opf |= (op_flags); \ } while (0) #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) From 3f7c624aa58f769e0313ca3310704c5d88ac99ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Sep 2016 16:03:02 +0200 Subject: [PATCH 19/54] block: remove bio_destructor_t Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 53ee1a2acd4fde..cd395ecec99d01 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -16,7 +16,6 @@ struct block_device; struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); -typedef void (bio_destructor_t) (struct bio *); #ifdef CONFIG_BLOCK /* From fc95db3edeaf924e9ad16592d9c1b06c730a49c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Sep 2016 16:03:03 +0200 Subject: [PATCH 20/54] bio.h: remove a very outdated comment Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 
23ddf4b46a9b01..e00721a2dce133 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -1,6 +1,4 @@ /* - * 2.5 block I/O model - * * Copyright (C) 2001 Jens Axboe * * This program is free software; you can redistribute it and/or modify From c5c5ca777469f0ff854f1da0aff9b3a9051b3ef7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 11 Sep 2016 16:03:04 +0200 Subject: [PATCH 21/54] block: remove IOPRIO_BITS Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index beb9ce1c2c2335..8c1239020d79d3 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -7,7 +7,6 @@ /* * Gives us 8 prio classes with 13-bits of data for each class */ -#define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) From 2849450ad39d2e699fda2d5c6f41e05d87fd7004 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 14 Sep 2016 13:28:30 -0400 Subject: [PATCH 22/54] blk-mq: introduce blk_mq_delay_kick_requeue_list() blk_mq_delay_kick_requeue_list() provides the ability to kick the q->requeue_list after a specified time. To do this the request_queue's 'requeue_work' member was changed to a delayed_work. blk_mq_delay_kick_requeue_list() allows DM to defer processing requeued requests while it doesn't make sense to immediately requeue them (e.g. when all paths in a DM multipath have failed). Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++++---- include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index eea0d230faa123..7ddc7969fba43b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -502,7 +502,7 @@ EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = - container_of(work, struct request_queue, requeue_work); + container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); struct request *rq, *next; unsigned long flags; @@ -557,16 +557,24 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_cancel_requeue_work(struct request_queue *q) { - cancel_work_sync(&q->requeue_work); + cancel_delayed_work_sync(&q->requeue_work); } EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_work(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); +void blk_mq_delay_kick_requeue_list(struct request_queue *q, + unsigned long msecs) +{ + kblockd_schedule_delayed_work(&q->requeue_work, + msecs_to_jiffies(msecs)); +} +EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); + void blk_mq_abort_requeue_list(struct request_queue *q) { unsigned long flags; @@ -2084,7 +2092,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->sg_reserved_size = INT_MAX; - INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ff14f68067aaa1..60ef14cbcd2da8 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -233,6 +233,7 @@ void blk_mq_requeue_request(struct request *rq); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); 
void blk_mq_cancel_requeue_work(struct request_queue *q); void blk_mq_kick_requeue_list(struct request_queue *q); +void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq, int error); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 69aae720f4ef38..c47c358ba0529c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -449,7 +449,7 @@ struct request_queue { struct list_head requeue_list; spinlock_t requeue_lock; - struct work_struct requeue_work; + struct delayed_work requeue_work; struct mutex sysfs_lock; From 703fd1c0f177219e3a84e6c095c31dc566514d81 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Sep 2016 13:59:14 -0600 Subject: [PATCH 23/54] blk-mq: account higher order dispatch We currently account a '0' dispatch, and anything above that still falls below the range set by BLK_MQ_MAX_DISPATCH_ORDER. If we dispatch more, we don't account it. Change the last bucket to be inclusive of anything above the range we track, and have the sysfs file reflect that by including a '+' in the output: $ cat /sys/block/nvme0n1/mq/0/dispatched 0 1006 1 20229 2 1 4 0 8 0 16 0 32+ 0 Signed-off-by: Jens Axboe Reviewed-by: Omar Sandoval --- block/blk-mq-sysfs.c | 8 +++++--- block/blk-mq.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index ac5160eb686253..3c385b196bc711 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -208,12 +208,14 @@ static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { - unsigned long d = 1U << (i - 1); + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); - page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); + page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]); } + page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1), + hctx->dispatched[i]); return page - start_page; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 7ddc7969fba43b..0cb93625d8121d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -789,6 +789,14 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } } +static inline unsigned int queued_to_index(unsigned int queued) +{ + if (!queued) + return 0; + + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); +} + /* * Run this hardware queue, pulling any software queues mapped to it in. * Note that this function currently has various problems around ordering @@ -877,10 +885,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) dptr = &driver_list; } - if (!queued) - hctx->dispatched[0]++; - else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) - hctx->dispatched[ilog2(queued) + 1]++; + hctx->dispatched[queued_to_index(queued)]++; /* * Any items that need requeuing? Stuff them into hctx->dispatch, From 88459642cba452630326b9cab1c651e09577d4e4 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 08:38:44 -0600 Subject: [PATCH 24/54] blk-mq: abstract tag allocation out into sbitmap library This is a generally useful data structure, so make it available to anyone else who might want to use it. It's also a nice cleanup separating the allocation logic from the rest of the tag handling logic. 
The code is behind a new Kconfig option, CONFIG_SBITMAP, which is only selected by CONFIG_BLOCK for now. This should be a complete noop functionality-wise. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + block/Kconfig | 1 + block/blk-mq-tag.c | 463 ++++++++++------------------------------ block/blk-mq-tag.h | 37 +--- block/blk-mq.c | 112 +++------- block/blk-mq.h | 9 - include/linux/blk-mq.h | 9 +- include/linux/sbitmap.h | 327 ++++++++++++++++++++++++++++ lib/Kconfig | 3 + lib/Makefile | 2 + lib/sbitmap.c | 301 ++++++++++++++++++++++++++ 11 files changed, 789 insertions(+), 476 deletions(-) create mode 100644 include/linux/sbitmap.h create mode 100644 lib/sbitmap.c diff --git a/MAINTAINERS b/MAINTAINERS index 71aa5daeae8f43..157b1ca3e19d25 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2451,6 +2451,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git S: Maintained F: block/ F: kernel/trace/blktrace.c +F: lib/sbitmap.c BLOCK2MTD DRIVER M: Joern Engel diff --git a/block/Kconfig b/block/Kconfig index 161491d0a879ed..5136ad4bb6d557 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -4,6 +4,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select SBITMAP help Provide block layer support for the kernel. diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 729bac3a673b7a..2cbdecd594e970 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,12 +1,7 @@ /* - * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread - * over multiple cachelines to avoid ping-pong between multiple submitters - * or submitter and completer. Uses rolling wakeups to avoid falling of - * the scaling cliff when we run out of tags and have to start putting - * submitters to sleep. - * - * Uses active queue tracking to support fairer distribution of tags - * between multiple submitters when a shared tag map is used. + * Tag allocation using scalable bitmaps. Uses active queue tracking to support + * fairer distribution of tags between multiple submitters when a shared tag map + * is used. * * Copyright (C) 2013-2014 Jens Axboe */ @@ -19,40 +14,12 @@ #include "blk-mq.h" #include "blk-mq-tag.h" -static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) -{ - int i; - - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - int ret; - - ret = find_first_zero_bit(&bm->word, bm->depth); - if (ret < bm->depth) - return true; - } - - return false; -} - bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { if (!tags) return true; - return bt_has_free_tags(&tags->bitmap_tags); -} - -static inline int bt_index_inc(int index) -{ - return (index + 1) & (BT_WAIT_QUEUES - 1); -} - -static inline void bt_index_atomic_inc(atomic_t *index) -{ - int old = atomic_read(index); - int new = bt_index_inc(old); - atomic_cmpxchg(index, old, new); + return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); } /* @@ -72,29 +39,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - struct blk_mq_bitmap_tags *bt; - int i, wake_index; - - /* - * Make sure all changes prior to this are visible from other CPUs. 
- */ - smp_mb(); - bt = &tags->bitmap_tags; - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) - wake_up(&bs->wait); - - wake_index = bt_index_inc(wake_index); - } - - if (include_reserve) { - bt = &tags->breserved_tags; - if (waitqueue_active(&bt->bs[0].wait)) - wake_up(&bt->bs[0].wait); - } + sbitmap_queue_wake_all(&tags->bitmap_tags); + if (include_reserve) + sbitmap_queue_wake_all(&tags->breserved_tags); } /* @@ -118,7 +65,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt) + struct sbitmap_queue *bt) { unsigned int depth, users; @@ -130,7 +77,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Don't try dividing an ant */ - if (bt->depth == 1) + if (bt->sb.depth == 1) return true; users = atomic_read(&hctx->tags->active_queues); @@ -140,127 +87,42 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Allow at least some tags */ - depth = max((bt->depth + users - 1) / users, 4U); + depth = max((bt->sb.depth + users - 1) / users, 4U); return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, - bool nowrap) -{ - int tag, org_last_tag = last_tag; - - while (1) { - tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); - if (unlikely(tag >= bm->depth)) { - /* - * We started with an offset, and we didn't reset the - * offset to 0 in a failure case, so start from 0 to - * exhaust the map. - */ - if (org_last_tag && last_tag && !nowrap) { - last_tag = org_last_tag = 0; - continue; - } - return -1; - } - - if (!test_and_set_bit(tag, &bm->word)) - break; - - last_tag = tag + 1; - if (last_tag >= bm->depth - 1) - last_tag = 0; - } - - return tag; -} - #define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) -/* - * Straight forward bitmap tag implementation, where each bit is a tag - * (cleared == free, and set == busy). The small twist is using per-cpu - * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue - * contexts. This enables us to drastically limit the space searched, - * without dirtying an extra shared cacheline like we would if we stored - * the cache value inside the shared blk_mq_bitmap_tags structure. On top - * of that, each word of tags is in a separate cacheline. This means that - * multiple users will tend to stick to different cachelines, at least - * until the map is exhausted. 
- */ -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, unsigned int *tag_cache, struct blk_mq_tags *tags) { - unsigned int last_tag, org_last_tag; - int index, i, tag; + unsigned int last_tag; + int tag; if (!hctx_may_queue(hctx, bt)) return -1; - last_tag = org_last_tag = *tag_cache; - index = TAG_TO_INDEX(bt, last_tag); + last_tag = *tag_cache; + tag = sbitmap_get(&bt->sb, last_tag, BT_ALLOC_RR(tags)); - for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), - BT_ALLOC_RR(tags)); - if (tag != -1) { - tag += (index << bt->bits_per_word); - goto done; - } - - /* - * Jump to next index, and reset the last tag to be the - * first tag of that index - */ - index++; - last_tag = (index << bt->bits_per_word); - - if (index >= bt->map_nr) { - index = 0; - last_tag = 0; - } - } - - *tag_cache = 0; - return -1; - - /* - * Only update the cache from the allocation path, if we ended - * up using the specific cached tag. - */ -done: - if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { + if (tag == -1) { + *tag_cache = 0; + } else if (tag == last_tag || unlikely(BT_ALLOC_RR(tags))) { last_tag = tag + 1; - if (last_tag >= bt->depth - 1) + if (last_tag >= bt->sb.depth - 1) last_tag = 0; - *tag_cache = last_tag; } return tag; } -static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx) -{ - struct bt_wait_state *bs; - int wait_index; - - if (!hctx) - return &bt->bs[0]; - - wait_index = atomic_read(&hctx->wait_index); - bs = &bt->bs[wait_index]; - bt_index_atomic_inc(&hctx->wait_index); - return bs; -} - static int bt_get(struct blk_mq_alloc_data *data, - struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag, struct blk_mq_tags *tags) + struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx, + unsigned int *last_tag, struct blk_mq_tags *tags) { - struct bt_wait_state *bs; + struct sbq_wait_state *ws; DEFINE_WAIT(wait); int tag; @@ -271,9 +133,9 @@ static int bt_get(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; - bs = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, hctx); do { - prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); tag = __bt_get(hctx, bt, last_tag, tags); if (tag != -1) @@ -310,11 +172,11 @@ static int bt_get(struct blk_mq_alloc_data *data, hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); + finish_wait(&ws->wait, &wait); + ws = bt_wait_ptr(bt, hctx); } while (1); - finish_wait(&bs->wait, &wait); + finish_wait(&ws->wait, &wait); return tag; } @@ -354,53 +216,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return __blk_mq_get_tag(data); } -static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) -{ - int i, wake_index; - - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) { - int o = atomic_read(&bt->wake_index); - if (wake_index != o) - atomic_cmpxchg(&bt->wake_index, o, wake_index); - - return bs; - } - - wake_index = bt_index_inc(wake_index); - } - - return NULL; -} - -static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) -{ - const int index = TAG_TO_INDEX(bt, tag); - struct bt_wait_state *bs; - int wait_cnt; - - 
clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); - - /* Ensure that the wait list checks occur after clear_bit(). */ - smp_mb(); - - bs = bt_wake_ptr(bt); - if (!bs) - return; - - wait_cnt = atomic_dec_return(&bs->wait_cnt); - if (unlikely(wait_cnt < 0)) - wait_cnt = atomic_inc_return(&bs->wait_cnt); - if (wait_cnt == 0) { - atomic_add(bt->wake_cnt, &bs->wait_cnt); - bt_index_atomic_inc(&bt->wake_index); - wake_up(&bs->wait); - } -} - void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag) { @@ -410,67 +225,94 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - bt_clear_tag(&tags->bitmap_tags, real_tag); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag); if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) *last_tag = real_tag; } else { BUG_ON(tag >= tags->nr_reserved_tags); - bt_clear_tag(&tags->breserved_tags, tag); + sbitmap_queue_clear(&tags->breserved_tags, tag); } } -static void bt_for_each(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_iter_fn *fn, void *data, bool reserved) +struct bt_iter_data { + struct blk_mq_hw_ctx *hctx; + busy_iter_fn *fn; + void *data; + bool reserved; +}; + +static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_iter_data *iter_data = data; + struct blk_mq_hw_ctx *hctx = iter_data->hctx; + struct blk_mq_tags *tags = hctx->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = hctx->tags->rqs[off + bit]; - if (rq->q == hctx->queue) - fn(hctx, rq, data, reserved); - } + if (rq->q == hctx->queue) + iter_data->fn(hctx, rq, iter_data->data, reserved); + return true; +} - off += (1 << bt->bits_per_word); - } +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, + busy_iter_fn *fn, void *data, bool reserved) +{ + struct bt_iter_data iter_data = { + .hctx = hctx, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } -static void bt_tags_for_each(struct blk_mq_tags *tags, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_tag_iter_fn *fn, void *data, bool reserved) +struct bt_tags_iter_data { + struct blk_mq_tags *tags; + busy_tag_iter_fn *fn; + void *data; + bool reserved; +}; + +static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_tags_iter_data *iter_data = data; + struct blk_mq_tags *tags = iter_data->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - if (!tags->rqs) - return; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = tags->rqs[off + bit]; - fn(rq, data, reserved); - } + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - off += (1 << bt->bits_per_word); - } + iter_data->fn(rq, iter_data->data, reserved); + return true; +} + +static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, + busy_tag_iter_fn *fn, void *data, bool reserved) +{ + struct 
bt_tags_iter_data iter_data = { + .tags = tags, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + if (tags->rqs) + sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { if (tags->nr_reserved_tags) - bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); - bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false); } void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, @@ -529,107 +371,20 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); - } - -} - -static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) -{ - unsigned int i, used; - - for (i = 0, used = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - used += bitmap_weight(&bm->word, bm->depth); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } - return bt->depth - used; } -static void bt_update_count(struct blk_mq_bitmap_tags *bt, - unsigned int depth) +static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) { - unsigned int tags_per_word = 1U << bt->bits_per_word; - unsigned int map_depth = depth; - - if (depth) { - int i; - - for (i = 0; i < bt->map_nr; i++) { - bt->map[i].depth = min(map_depth, tags_per_word); - map_depth -= bt->map[i].depth; - } - } - - bt->wake_cnt = BT_WAIT_BATCH; - if (bt->wake_cnt > depth / BT_WAIT_QUEUES) - bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); - - bt->depth = depth; + return bt->sb.depth - sbitmap_weight(&bt->sb); } -static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, - int node, bool reserved) +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, int node) { - int i; - - bt->bits_per_word = ilog2(BITS_PER_LONG); - - /* - * Depth can be zero for reserved tags, that's not a failure - * condition. - */ - if (depth) { - unsigned int nr, tags_per_word; - - tags_per_word = (1 << bt->bits_per_word); - - /* - * If the tag space is small, shrink the number of tags - * per word so we spread over a few cachelines, at least. - * If less than 4 tags, just forget about it, it's not - * going to work optimally anyway. 
- */ - if (depth >= 4) { - while (tags_per_word * 4 > depth) { - bt->bits_per_word--; - tags_per_word = (1 << bt->bits_per_word); - } - } - - nr = ALIGN(depth, tags_per_word) / tags_per_word; - bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bt->map) - return -ENOMEM; - - bt->map_nr = nr; - } - - bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); - if (!bt->bs) { - kfree(bt->map); - bt->map = NULL; - return -ENOMEM; - } - - bt_update_count(bt, depth); - - for (i = 0; i < BT_WAIT_QUEUES; i++) { - init_waitqueue_head(&bt->bs[i].wait); - atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); - } - - return 0; -} - -static void bt_free(struct blk_mq_bitmap_tags *bt) -{ - kfree(bt->map); - kfree(bt->bs); + return sbitmap_queue_init_node(bt, depth, -1, GFP_KERNEL, node); } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, @@ -639,14 +394,15 @@ static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, tags->alloc_policy = alloc_policy; - if (bt_alloc(&tags->bitmap_tags, depth, node, false)) - goto enomem; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) - goto enomem; + if (bt_alloc(&tags->bitmap_tags, depth, node)) + goto free_tags; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node)) + goto free_bitmap_tags; return tags; -enomem: - bt_free(&tags->bitmap_tags); +free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +free_tags: kfree(tags); return NULL; } @@ -679,8 +435,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags) { - bt_free(&tags->bitmap_tags); - bt_free(&tags->breserved_tags); + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); free_cpumask_var(tags->cpumask); kfree(tags); } @@ -702,7 +458,8 @@ int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) * Don't need (or can't) update reserved tags here, they remain * static and should never need resizing. */ - bt_update_count(&tags->bitmap_tags, tdepth); + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags, false); return 0; } @@ -746,7 +503,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " "bits_per_word=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->bitmap_tags.bits_per_word); + 1U << tags->bitmap_tags.sb.shift); free = bt_unused_tags(&tags->bitmap_tags); res = bt_unused_tags(&tags->breserved_tags); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d468a79f2c4a2c..3215c08c63cc96 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -3,31 +3,6 @@ #include "blk-mq.h" -enum { - BT_WAIT_QUEUES = 8, - BT_WAIT_BATCH = 8, -}; - -struct bt_wait_state { - atomic_t wait_cnt; - wait_queue_head_t wait; -} ____cacheline_aligned_in_smp; - -#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) -#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) - -struct blk_mq_bitmap_tags { - unsigned int depth; - unsigned int wake_cnt; - unsigned int bits_per_word; - - unsigned int map_nr; - struct blk_align_bitmap *map; - - atomic_t wake_index; - struct bt_wait_state *bs; -}; - /* * Tag address space map. 
*/ @@ -37,8 +12,8 @@ struct blk_mq_tags { atomic_t active_queues; - struct blk_mq_bitmap_tags bitmap_tags; - struct blk_mq_bitmap_tags breserved_tags; + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags; struct request **rqs; struct list_head page_list; @@ -61,6 +36,14 @@ extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + enum { BLK_MQ_TAG_CACHE_MIN = 1, BLK_MQ_TAG_CACHE_MAX = 64, diff --git a/block/blk-mq.c b/block/blk-mq.c index 0cb93625d8121d..6603be18064e3d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -41,42 +41,23 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - unsigned int i; - - for (i = 0; i < hctx->ctx_map.size; i++) - if (hctx->ctx_map.map[i].word) - return true; - - return false; -} - -static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx) -{ - return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; + return sbitmap_any_bit_set(&hctx->ctx_map); } -#define CTX_TO_BIT(hctx, ctx) \ - ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) - /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) - set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) + sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } void blk_mq_freeze_queue_start(struct request_queue *q) @@ -755,38 +736,36 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } +struct flush_busy_ctx_data { + struct blk_mq_hw_ctx *hctx; + struct list_head *list; +}; + +static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_busy_ctx_data *flush_data = data; + struct blk_mq_hw_ctx *hctx = flush_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + sbitmap_clear_bit(sb, bitnr); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, flush_data->list); + spin_unlock(&ctx->lock); + return true; +} + /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - struct blk_mq_ctx *ctx; - int i; - - for (i = 0; i < hctx->ctx_map.size; i++) { - struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; - unsigned int off, bit; - - if (!bm->word) - continue; - - bit = 0; - off = i * hctx->ctx_map.bits_per_word; - do { - bit = find_next_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; - - ctx = hctx->ctxs[bit + off]; - clear_bit(bit, &bm->word); - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, list); - spin_unlock(&ctx->lock); + struct flush_busy_ctx_data data = { + .hctx = hctx, + .list = list, + }; - bit++; - } while (1); - } + sbitmap_for_each_set(&hctx->ctx_map, 
flush_busy_ctx, &data); } static inline unsigned int queued_to_index(unsigned int queued) @@ -1609,32 +1588,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, return NULL; } -static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) -{ - kfree(bitmap->map); -} - -static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) -{ - unsigned int bpw = 8, total, num_maps, i; - - bitmap->bits_per_word = bpw; - - num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; - bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bitmap->map) - return -ENOMEM; - - total = nr_cpu_ids; - for (i = 0; i < num_maps; i++) { - bitmap->map[i].depth = min(total, bitmap->bits_per_word); - total -= bitmap->map[i].depth; - } - - return 0; -} - /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it @@ -1700,7 +1653,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); blk_free_flush_queue(hctx->fq); - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -1760,7 +1713,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->ctxs) goto unregister_cpu_notifier; - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, + node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -1787,7 +1741,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); unregister_cpu_notifier: @@ -1863,8 +1817,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, mutex_unlock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_ctxmap *map = &hctx->ctx_map; - /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -1890,7 +1842,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ - map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Initialize batch roundrobin counts diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b70a..71831f970fd372 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -63,15 +63,6 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); -/* - * Basic implementation of sparser bitmap, allowing the user to spread - * the bits over more cachelines. 
- */ -struct blk_align_bitmap { - unsigned long word; - unsigned long depth; -} ____cacheline_aligned_in_smp; - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 60ef14cbcd2da8..2575779cf13f16 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -2,6 +2,7 @@ #define BLK_MQ_H #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -12,12 +13,6 @@ struct blk_mq_cpu_notifier { int (*notify)(void *data, unsigned long action, unsigned int cpu); }; -struct blk_mq_ctxmap { - unsigned int size; - unsigned int bits_per_word; - struct blk_align_bitmap *map; -}; - struct blk_mq_hw_ctx { struct { spinlock_t lock; @@ -37,7 +32,7 @@ struct blk_mq_hw_ctx { void *driver_data; - struct blk_mq_ctxmap ctx_map; + struct sbitmap ctx_map; struct blk_mq_ctx **ctxs; unsigned int nr_ctx; diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h new file mode 100644 index 00000000000000..1a3b836042e152 --- /dev/null +++ b/include/linux/sbitmap.h @@ -0,0 +1,327 @@ +/* + * Fast and scalable bitmaps. + * + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __LINUX_SCALE_BITMAP_H +#define __LINUX_SCALE_BITMAP_H + +#include +#include + +/** + * struct sbitmap_word - Word in a &struct sbitmap. + */ +struct sbitmap_word { + /** + * @word: The bitmap word itself. + */ + unsigned long word; + + /** + * @depth: Number of bits being used in @word. + */ + unsigned long depth; +} ____cacheline_aligned_in_smp; + +/** + * struct sbitmap - Scalable bitmap. + * + * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This + * trades off higher memory usage for better scalability. + */ +struct sbitmap { + /** + * @depth: Number of bits used in the whole bitmap. + */ + unsigned int depth; + + /** + * @shift: log2(number of bits used per word) + */ + unsigned int shift; + + /** + * @map_nr: Number of words (cachelines) being used for the bitmap. + */ + unsigned int map_nr; + + /** + * @map: Allocated bitmap. + */ + struct sbitmap_word *map; +}; + +#define SBQ_WAIT_QUEUES 8 +#define SBQ_WAKE_BATCH 8 + +/** + * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. + */ +struct sbq_wait_state { + /** + * @wait_cnt: Number of frees remaining before we wake up. + */ + atomic_t wait_cnt; + + /** + * @wait: Wait queue. + */ + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +/** + * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free + * bits. + * + * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to + * avoid contention on the wait queue spinlock. This ensures that we don't hit a + * scalability wall when we run out of free bits and have to start putting tasks + * to sleep. + */ +struct sbitmap_queue { + /** + * @sb: Scalable bitmap. 
+ */ + struct sbitmap sb; + + /** + * @wake_batch: Number of bits which must be freed before we wake up any + * waiters. + */ + unsigned int wake_batch; + + /** + * @wake_index: Next wait queue in @ws to wake up. + */ + atomic_t wake_index; + + /** + * @ws: Wait queues. + */ + struct sbq_wait_state *ws; +}; + +/** + * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. + * @sb: Bitmap to initialize. + * @depth: Number of bits to allocate. + * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if + * given, a good default is chosen. + * @flags: Allocation flags. + * @node: Memory node to allocate on. + * + * Return: Zero on success or negative errno on failure. + */ +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node); + +/** + * sbitmap_free() - Free memory used by a &struct sbitmap. + * @sb: Bitmap to free. + */ +static inline void sbitmap_free(struct sbitmap *sb) +{ + kfree(sb->map); + sb->map = NULL; +} + +/** + * sbitmap_resize() - Resize a &struct sbitmap. + * @sb: Bitmap to resize. + * @depth: New number of bits to resize to. + * + * Doesn't reallocate anything. It's up to the caller to ensure that the new + * depth doesn't exceed the depth that the sb was initialized with. + */ +void sbitmap_resize(struct sbitmap *sb, unsigned int depth); + +/** + * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. + * @sb: Bitmap to allocate from. + * @alloc_hint: Hint for where to start searching for a free bit. + * @round_robin: If true, be stricter about allocation order; always allocate + * starting from the last allocated bit. This is less efficient + * than the default behavior (false). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); + +/** + * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. + * @sb: Bitmap to check. + * + * Return: true if any bit in the bitmap is set, false otherwise. + */ +bool sbitmap_any_bit_set(const struct sbitmap *sb); + +/** + * sbitmap_any_bit_clear() - Check for an unset bit in a &struct + * sbitmap. + * @sb: Bitmap to check. + * + * Return: true if any bit in the bitmap is clear, false otherwise. + */ +bool sbitmap_any_bit_clear(const struct sbitmap *sb); + +typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); + +/** + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + * + * This is inline even though it's non-trivial so that the function calls to the + * callback will hopefully get optimized away. 
+ */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + struct sbitmap_word *word = &sb->map[i]; + unsigned int off, nr; + + if (!word->word) + continue; + + nr = 0; + off = i << sb->shift; + while (1) { + nr = find_next_bit(&word->word, word->depth, nr); + if (nr >= word->depth) + break; + + if (!fn(sb, off + nr, data)) + return; + + nr++; + } + } +} + +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + +static inline unsigned long *__sbitmap_word(struct sbitmap *sb, + unsigned int bitnr) +{ + return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; +} + +/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ + +static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) +{ + set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) +{ + clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) +{ + return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +unsigned int sbitmap_weight(const struct sbitmap *sb); + +/** + * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific + * memory node. + * @sbq: Bitmap queue to initialize. + * @depth: See sbitmap_init_node(). + * @shift: See sbitmap_init_node(). + * @flags: Allocation flags. + * @node: Memory node to allocate on. + * + * Return: Zero on success or negative errno on failure. + */ +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, gfp_t flags, int node); + +/** + * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. + * + * @sbq: Bitmap queue to free. + */ +static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) +{ + kfree(sbq->ws); + sbitmap_free(&sbq->sb); +} + +/** + * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. + * @sbq: Bitmap queue to resize. + * @depth: New number of bits to resize to. + * + * Like sbitmap_resize(), this doesn't reallocate anything. It has to do + * some extra work on the &struct sbitmap_queue, so it's not safe to just + * resize the underlying &struct sbitmap. + */ +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); + +/** + * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a + * &struct sbitmap_queue. + * @sbq: Bitmap to free from. + * @nr: Bit number to free. + */ +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr); + +static inline int sbq_index_inc(int index) +{ + return (index + 1) & (SBQ_WAIT_QUEUES - 1); +} + +static inline void sbq_index_atomic_inc(atomic_t *index) +{ + int old = atomic_read(index); + int new = sbq_index_inc(old); + atomic_cmpxchg(index, old, new); +} + +/** + * sbq_wait_ptr() - Get the next wait queue to use for a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to wait on. + * @wait_index: A counter per "user" of @sbq. + */ +static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, + atomic_t *wait_index) +{ + struct sbq_wait_state *ws; + + ws = &sbq->ws[atomic_read(wait_index)]; + sbq_index_atomic_inc(wait_index); + return ws; +} + +/** + * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to wake up. 
+ */ +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); + +#endif /* __LINUX_SCALE_BITMAP_H */ diff --git a/lib/Kconfig b/lib/Kconfig index d79909dc01ec83..942fb8091a8607 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -550,4 +550,7 @@ config STACKDEPOT bool select STACKTRACE +config SBITMAP + bool + endmenu diff --git a/lib/Makefile b/lib/Makefile index cfa68eb269e4b6..2cbfd2904994b0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -228,3 +228,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n + +obj-$(CONFIG_SBITMAP) += sbitmap.o diff --git a/lib/sbitmap.c b/lib/sbitmap.c new file mode 100644 index 00000000000000..dfc084ac6937cc --- /dev/null +++ b/lib/sbitmap.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include + +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node) +{ + unsigned int bits_per_word; + unsigned int i; + + if (shift < 0) { + shift = ilog2(BITS_PER_LONG); + /* + * If the bitmap is small, shrink the number of bits per word so + * we spread over a few cachelines, at least. If less than 4 + * bits, just forget about it, it's not going to work optimally + * anyway. + */ + if (depth >= 4) { + while ((4U << shift) > depth) + shift--; + } + } + bits_per_word = 1U << shift; + if (bits_per_word > BITS_PER_LONG) + return -EINVAL; + + sb->shift = shift; + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + if (depth == 0) { + sb->map = NULL; + return 0; + } + + sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); + if (!sb->map) + return -ENOMEM; + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_init_node); + +void sbitmap_resize(struct sbitmap *sb, unsigned int depth) +{ + unsigned int bits_per_word = 1U << sb->shift; + unsigned int i; + + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } +} +EXPORT_SYMBOL_GPL(sbitmap_resize); + +static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, + bool wrap) +{ + unsigned int orig_hint = hint; + int nr; + + while (1) { + nr = find_next_zero_bit(&word->word, word->depth, hint); + if (unlikely(nr >= word->depth)) { + /* + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to + * exhaust the map. 
+ */ + if (orig_hint && hint && wrap) { + hint = orig_hint = 0; + continue; + } + return -1; + } + + if (!test_and_set_bit(nr, &word->word)) + break; + + hint = nr + 1; + if (hint >= word->depth - 1) + hint = 0; + } + + return nr; +} + +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) +{ + unsigned int i, index; + int nr = -1; + + index = SB_NR_TO_INDEX(sb, alloc_hint); + + for (i = 0; i < sb->map_nr; i++) { + nr = __sbitmap_get_word(&sb->map[index], + SB_NR_TO_BIT(sb, alloc_hint), + !round_robin); + if (nr != -1) { + nr += index << sb->shift; + break; + } + + /* Jump to next index. */ + index++; + alloc_hint = index << sb->shift; + + if (index >= sb->map_nr) { + index = 0; + alloc_hint = 0; + } + } + + return nr; +} +EXPORT_SYMBOL_GPL(sbitmap_get); + +bool sbitmap_any_bit_set(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + if (sb->map[i].word) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); + +bool sbitmap_any_bit_clear(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + unsigned long ret; + + ret = find_first_zero_bit(&word->word, word->depth); + if (ret < word->depth) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); + +unsigned int sbitmap_weight(const struct sbitmap *sb) +{ + unsigned int i, weight; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + + weight += bitmap_weight(&word->word, word->depth); + } + return weight; +} +EXPORT_SYMBOL_GPL(sbitmap_weight); + +static unsigned int sbq_calc_wake_batch(unsigned int depth) +{ + unsigned int wake_batch; + + /* + * For each batch, we wake up one queue. We need to make sure that our + * batch size is small enough that the full depth of the bitmap is + * enough to wake up all of the queues. 
+ */ + wake_batch = SBQ_WAKE_BATCH; + if (wake_batch > depth / SBQ_WAIT_QUEUES) + wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); + + return wake_batch; +} + +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, gfp_t flags, int node) +{ + int ret; + int i; + + ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node); + if (ret) + return ret; + + sbq->wake_batch = sbq_calc_wake_batch(depth); + atomic_set(&sbq->wake_index, 0); + + sbq->ws = kzalloc(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags); + if (!sbq->ws) { + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + init_waitqueue_head(&sbq->ws[i].wait); + atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); + } + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); + +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) +{ + sbq->wake_batch = sbq_calc_wake_batch(depth); + sbitmap_resize(&sbq->sb, depth); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_resize); + +static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) { + int o = atomic_read(&sbq->wake_index); + + if (wake_index != o) + atomic_cmpxchg(&sbq->wake_index, o, wake_index); + return ws; + } + + wake_index = sbq_index_inc(wake_index); + } + + return NULL; +} + +static void sbq_wake_up(struct sbitmap_queue *sbq) +{ + struct sbq_wait_state *ws; + int wait_cnt; + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); + + ws = sbq_wake_ptr(sbq); + if (!ws) + return; + + wait_cnt = atomic_dec_return(&ws->wait_cnt); + if (unlikely(wait_cnt < 0)) + wait_cnt = atomic_inc_return(&ws->wait_cnt); + if (wait_cnt == 0) { + atomic_add(sbq->wake_batch, &ws->wait_cnt); + sbq_index_atomic_inc(&sbq->wake_index); + wake_up(&ws->wait); + } +} + +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr) +{ + sbitmap_clear_bit(&sbq->sb, nr); + sbq_wake_up(sbq); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_clear); + +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) + wake_up(&ws->wait); + + wake_index = sbq_index_inc(wake_index); + } +} +EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); From 48e28166a7b608e19a6aea3acadd81cdfe660f6b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:22 -0700 Subject: [PATCH 25/54] sbitmap: allocate wait queues on a specific node The original bt_alloc() we converted from was using kzalloc(), not kzalloc_node(), to allocate the wait queues. This was probably an oversight, so fix it for sbitmap_queue_init_node(). 
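
For illustration (this sketch is not part of the patch itself), a hypothetical NUMA-aware caller of the sbitmap_queue API as it stands at this point in the series would look roughly like the following; the example_* helper names and the explicit node argument are invented for illustration only:

#include <linux/sbitmap.h>
#include <linux/slab.h>

/*
 * Illustrative only: initialize a queue on a specific memory node, then
 * allocate and free one bit. Signatures match the API introduced above.
 */
static int example_setup(struct sbitmap_queue *sbq, unsigned int depth, int node)
{
	/* shift < 0 lets sbitmap pick a sensible bits-per-word value */
	return sbitmap_queue_init_node(sbq, depth, -1, GFP_KERNEL, node);
}

static void example_use(struct sbitmap_queue *sbq)
{
	/* sbitmap_get() returns a free bit number, or -1 if the map is full */
	int nr = sbitmap_get(&sbq->sb, 0, false);

	if (nr >= 0)
		sbitmap_queue_clear(sbq, nr);	/* frees the bit, wakes waiters */
}
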
Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- lib/sbitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index dfc084ac6937cc..4d8e97e470ee93 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -208,7 +208,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, sbq->wake_batch = sbq_calc_wake_batch(depth); atomic_set(&sbq->wake_index, 0); - sbq->ws = kzalloc(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags); + sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { sbitmap_free(&sbq->sb); return -ENOMEM; From 40aabb67464d5aad9ca3d2a5fedee56e2ff45aa0 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:23 -0700 Subject: [PATCH 26/54] sbitmap: push per-cpu last_tag into sbitmap_queue Allocating your own per-cpu allocation hint separately makes for an awkward API. Instead, allocate the per-cpu hint as part of the struct sbitmap_queue. There's no point for a struct sbitmap_queue without the cache, but you can still use a bare struct sbitmap. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 53 +++++++++++++---------------------------- block/blk-mq-tag.h | 3 ++- block/blk-mq.c | 2 +- block/blk-mq.h | 2 -- include/linux/sbitmap.h | 45 +++++++++++++++++++++++++++++++++- lib/sbitmap.c | 35 ++++++++++++++++++++++++++- 6 files changed, 98 insertions(+), 42 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2cbdecd594e970..c9a22dbbbda1e7 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -94,39 +94,21 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, #define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - unsigned int *tag_cache, struct blk_mq_tags *tags) + struct blk_mq_tags *tags) { - unsigned int last_tag; - int tag; - if (!hctx_may_queue(hctx, bt)) return -1; - - last_tag = *tag_cache; - tag = sbitmap_get(&bt->sb, last_tag, BT_ALLOC_RR(tags)); - - if (tag == -1) { - *tag_cache = 0; - } else if (tag == last_tag || unlikely(BT_ALLOC_RR(tags))) { - last_tag = tag + 1; - if (last_tag >= bt->sb.depth - 1) - last_tag = 0; - *tag_cache = last_tag; - } - - return tag; + return __sbitmap_queue_get(bt, BT_ALLOC_RR(tags)); } -static int bt_get(struct blk_mq_alloc_data *data, - struct sbitmap_queue *bt, - struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag, struct blk_mq_tags *tags) +static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags) { struct sbq_wait_state *ws; DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt, tags); if (tag != -1) return tag; @@ -137,7 +119,7 @@ static int bt_get(struct blk_mq_alloc_data *data, do { prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt, tags); if (tag != -1) break; @@ -154,7 +136,7 @@ static int bt_get(struct blk_mq_alloc_data *data, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
*/ - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt, tags); if (tag != -1) break; @@ -168,7 +150,6 @@ static int bt_get(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { - last_tag = &data->ctx->last_tag; hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } @@ -185,7 +166,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) int tag; tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag, data->hctx->tags); + data->hctx->tags); if (tag >= 0) return tag + data->hctx->tags->nr_reserved_tags; @@ -194,15 +175,15 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) { - int tag, zero = 0; + int tag; if (unlikely(!data->hctx->tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, - data->hctx->tags); + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, + data->hctx->tags); if (tag < 0) return BLK_MQ_TAG_FAIL; @@ -216,8 +197,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return __blk_mq_get_tag(data); } -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, - unsigned int *last_tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + unsigned int tag) { struct blk_mq_tags *tags = hctx->tags; @@ -225,12 +206,12 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - sbitmap_queue_clear(&tags->bitmap_tags, real_tag); - if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) - *last_tag = real_tag; + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, + BT_ALLOC_RR(tags), ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - sbitmap_queue_clear(&tags->breserved_tags, tag); + sbitmap_queue_clear(&tags->breserved_tags, tag, + BT_ALLOC_RR(tags), ctx->cpu); } } diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 3215c08c63cc96..2b1d52ed82e018 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -27,7 +27,8 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); diff --git a/block/blk-mq.c b/block/blk-mq.c index 6603be18064e3d..e0a69daddbd8ce 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -303,7 +303,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx, tag, &ctx->last_tag); + blk_mq_put_tag(hctx, ctx, tag); blk_queue_exit(q); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 71831f970fd372..9b15d2ef7f7bf2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -12,8 +12,6 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int last_tag ____cacheline_aligned_in_smp; - /* incremented at dispatch time */ unsigned long 
rq_dispatched[2]; unsigned long rq_merged; diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 1a3b836042e152..6745545e0b2253 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -99,6 +99,14 @@ struct sbitmap_queue { */ struct sbitmap sb; + /* + * @alloc_hint: Cache of last successfully allocated or freed bit. + * + * This is per-cpu, which allows multiple users to stick to different + * cachelines until the map is exhausted. + */ + unsigned int __percpu *alloc_hint; + /** * @wake_batch: Number of bits which must be freed before we wake up any * waiters. @@ -267,6 +275,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) { kfree(sbq->ws); + free_percpu(sbq->alloc_hint); sbitmap_free(&sbq->sb); } @@ -281,13 +290,47 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) */ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); +/** + * __sbitmap_queue_get() - Try to allocate a free bit from a &struct + * sbitmap_queue with preemption already disabled. + * @sbq: Bitmap queue to allocate from. + * @round_robin: See sbitmap_get(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin); + +/** + * sbitmap_queue_get() - Try to allocate a free bit from a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to allocate from. + * @round_robin: See sbitmap_get(). + * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to + * sbitmap_queue_clear()). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin, + unsigned int *cpu) +{ + int nr; + + *cpu = get_cpu(); + nr = __sbitmap_queue_get(sbq, round_robin); + put_cpu(); + return nr; +} + /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. + * @round_robin: See sbitmap_get(). + * @cpu: CPU the bit was allocated on. */ -void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr); +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + bool round_robin, unsigned int cpu); static inline int sbq_index_inc(int index) { diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 4d8e97e470ee93..1651ad9d553095 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -205,11 +205,18 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, if (ret) return ret; + sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags); + if (!sbq->alloc_hint) { + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + sbq->wake_batch = sbq_calc_wake_batch(depth); atomic_set(&sbq->wake_index, 0); sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { + free_percpu(sbq->alloc_hint); sbitmap_free(&sbq->sb); return -ENOMEM; } @@ -229,6 +236,29 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); +int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin) +{ + unsigned int hint; + int nr; + + hint = this_cpu_read(*sbq->alloc_hint); + nr = sbitmap_get(&sbq->sb, hint, round_robin); + + if (nr == -1) { + /* If the map is full, a hint won't do us much good. 
*/ + this_cpu_write(*sbq->alloc_hint, 0); + } else if (nr == hint || unlikely(round_robin)) { + /* Only update the hint if we used it. */ + hint = nr + 1; + if (hint >= sbq->sb.depth - 1) + hint = 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + + return nr; +} +EXPORT_SYMBOL_GPL(__sbitmap_queue_get); + static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) { int i, wake_index; @@ -273,10 +303,13 @@ static void sbq_wake_up(struct sbitmap_queue *sbq) } } -void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr) +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + bool round_robin, unsigned int cpu) { sbitmap_clear_bit(&sbq->sb, nr); sbq_wake_up(sbq); + if (likely(!round_robin)) + *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); From f4a644db86669d938c71f19560aebf69d4720d63 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:24 -0700 Subject: [PATCH 27/54] sbitmap: push alloc policy into sbitmap_queue Again, there's no point in passing this in every time. Make it part of struct sbitmap_queue and clean up the API. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 33 +++++++++++++++------------------ block/blk-mq-tag.h | 1 - include/linux/sbitmap.h | 19 +++++++++++-------- lib/sbitmap.c | 14 ++++++++------ 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c9a22dbbbda1e7..e1c2bedb0bf9cf 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -91,14 +91,11 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) - -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - struct blk_mq_tags *tags) +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { if (!hctx_may_queue(hctx, bt)) return -1; - return __sbitmap_queue_get(bt, BT_ALLOC_RR(tags)); + return __sbitmap_queue_get(bt); } static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, @@ -108,7 +105,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, tags); + tag = __bt_get(hctx, bt); if (tag != -1) return tag; @@ -119,7 +116,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, do { prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, tags); + tag = __bt_get(hctx, bt); if (tag != -1) break; @@ -136,7 +133,7 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. 
*/ - tag = __bt_get(hctx, bt, tags); + tag = __bt_get(hctx, bt); if (tag != -1) break; @@ -206,12 +203,10 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - sbitmap_queue_clear(&tags->bitmap_tags, real_tag, - BT_ALLOC_RR(tags), ctx->cpu); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - sbitmap_queue_clear(&tags->breserved_tags, tag, - BT_ALLOC_RR(tags), ctx->cpu); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } @@ -363,21 +358,23 @@ static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) return bt->sb.depth - sbitmap_weight(&bt->sb); } -static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, int node) +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, + bool round_robin, int node) { - return sbitmap_queue_init_node(bt, depth, -1, GFP_KERNEL, node); + return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, + node); } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - tags->alloc_policy = alloc_policy; - - if (bt_alloc(&tags->bitmap_tags, depth, node)) + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) goto free_tags; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node)) + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, + node)) goto free_bitmap_tags; return tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 2b1d52ed82e018..f90b850ce43d99 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -18,7 +18,6 @@ struct blk_mq_tags { struct request **rqs; struct list_head page_list; - int alloc_policy; cpumask_var_t cpumask; }; diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 6745545e0b2253..f017fd6e69c4f6 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -122,6 +122,11 @@ struct sbitmap_queue { * @ws: Wait queues. */ struct sbq_wait_state *ws; + + /** + * @round_robin: Allocate bits in strict round-robin order. + */ + bool round_robin; }; /** @@ -259,13 +264,14 @@ unsigned int sbitmap_weight(const struct sbitmap *sb); * @sbq: Bitmap queue to initialize. * @depth: See sbitmap_init_node(). * @shift: See sbitmap_init_node(). + * @round_robin: See sbitmap_get(). * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, - int shift, gfp_t flags, int node); + int shift, bool round_robin, gfp_t flags, int node); /** * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. @@ -294,29 +300,27 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); * __sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue with preemption already disabled. * @sbq: Bitmap queue to allocate from. - * @round_robin: See sbitmap_get(). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ -int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin); +int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. * @sbq: Bitmap queue to allocate from. - * @round_robin: See sbitmap_get(). 
* @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ -static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin, +static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) { int nr; *cpu = get_cpu(); - nr = __sbitmap_queue_get(sbq, round_robin); + nr = __sbitmap_queue_get(sbq); put_cpu(); return nr; } @@ -326,11 +330,10 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin, * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. - * @round_robin: See sbitmap_get(). * @cpu: CPU the bit was allocated on. */ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, - bool round_robin, unsigned int cpu); + unsigned int cpu); static inline int sbq_index_inc(int index) { diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 1651ad9d553095..be55f744b7713e 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -196,7 +196,7 @@ static unsigned int sbq_calc_wake_batch(unsigned int depth) } int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, - int shift, gfp_t flags, int node) + int shift, bool round_robin, gfp_t flags, int node) { int ret; int i; @@ -225,6 +225,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, init_waitqueue_head(&sbq->ws[i].wait); atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); } + + sbq->round_robin = round_robin; return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); @@ -236,18 +238,18 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) } EXPORT_SYMBOL_GPL(sbitmap_queue_resize); -int __sbitmap_queue_get(struct sbitmap_queue *sbq, bool round_robin) +int __sbitmap_queue_get(struct sbitmap_queue *sbq) { unsigned int hint; int nr; hint = this_cpu_read(*sbq->alloc_hint); - nr = sbitmap_get(&sbq->sb, hint, round_robin); + nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); if (nr == -1) { /* If the map is full, a hint won't do us much good. */ this_cpu_write(*sbq->alloc_hint, 0); - } else if (nr == hint || unlikely(round_robin)) { + } else if (nr == hint || unlikely(sbq->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; if (hint >= sbq->sb.depth - 1) @@ -304,11 +306,11 @@ static void sbq_wake_up(struct sbitmap_queue *sbq) } void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, - bool round_robin, unsigned int cpu) + unsigned int cpu) { sbitmap_clear_bit(&sbq->sb, nr); sbq_wake_up(sbq); - if (likely(!round_robin)) + if (likely(!sbq->round_robin)) *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); From 98d95416dbfaf4910caadfb4ddc75e4aacbdff8c Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:25 -0700 Subject: [PATCH 28/54] sbitmap: randomize initial alloc_hint values In order to get good cache behavior from a sbitmap, we want each CPU to stick to its own cacheline(s) as much as possible. This might happen naturally as the bitmap gets filled up and the alloc_hint values spread out, but we really want this behavior from the start. blk-mq apparently intended to do this, but the code to do this was never wired up. Get rid of the dead code and make it part of the sbitmap library. 
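To illustrate the idea, the seeding amounts to the following minimal sketch (not the exact hunk applied below; the loop variable is only for illustration):

        /*
         * Sketch: give each possible CPU a random starting hint so that
         * allocations begin spread across the bitmap (and thus across
         * different cachelines) instead of all starting at bit 0.
         */
        if (depth && !round_robin) {
                unsigned int cpu;

                for_each_possible_cpu(cpu)
                        *per_cpu_ptr(sbq->alloc_hint, cpu) = prandom_u32() % depth;
        }
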
Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 8 -------- block/blk-mq-tag.h | 1 - lib/sbitmap.c | 6 ++++++ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e1c2bedb0bf9cf..cef618f6fc9213 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -7,7 +7,6 @@ */ #include #include -#include #include #include "blk.h" @@ -419,13 +418,6 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) kfree(tags); } -void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) -{ - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; - - *tag = prandom_u32() % depth; -} - int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) { tdepth -= tags->nr_reserved_tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index f90b850ce43d99..09f4cc0aaa84c9 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -30,7 +30,6 @@ extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); -extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, diff --git a/lib/sbitmap.c b/lib/sbitmap.c index be55f744b7713e..928b82a733f2cc 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -15,6 +15,7 @@ * along with this program. If not, see . */ +#include #include int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, @@ -211,6 +212,11 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, return -ENOMEM; } + if (depth && !round_robin) { + for_each_possible_cpu(i) + *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth; + } + sbq->wake_batch = sbq_calc_wake_batch(depth); atomic_set(&sbq->wake_index, 0); From 05fd095d53b979878f016c3a7080d3683cc89d72 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 01:28:26 -0700 Subject: [PATCH 29/54] sbitmap: re-initialize allocation hints after resize After a struct sbitmap_queue is resized smaller, the allocation hints may still be set to bits beyond the new depth of the bitmap. This means that, for example, if the number of blk-mq tags is reduced through sysfs, more requests than the nominal queue depth may be in flight. It's tempting to fix this at resize time by doing a one-time reinitialization of the hints, but this can race with __sbitmap_queue_get() updating the hint. Instead, check the hint before we use it. This caused no measurable performance difference in my synthetic benchmarks. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- lib/sbitmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 928b82a733f2cc..f736c52a712c06 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -246,10 +246,15 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_resize); int __sbitmap_queue_get(struct sbitmap_queue *sbq) { - unsigned int hint; + unsigned int hint, depth; int nr; hint = this_cpu_read(*sbq->alloc_hint); + depth = READ_ONCE(sbq->sb.depth); + if (unlikely(hint >= depth)) { + hint = depth ? 
prandom_u32() % depth : 0; + this_cpu_write(*sbq->alloc_hint, hint); + } nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); if (nr == -1) { @@ -258,7 +263,7 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq) } else if (nr == hint || unlikely(sbq->round_robin)) { /* Only update the hint if we used it. */ hint = nr + 1; - if (hint >= sbq->sb.depth - 1) + if (hint >= depth - 1) hint = 0; this_cpu_write(*sbq->alloc_hint, hint); } From 5c64a8df0ca88c79c9cb74674c2481e5f7ede511 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Sat, 17 Sep 2016 12:20:54 -0700 Subject: [PATCH 30/54] sbitmap: don't update the allocation hint on clear after resize If we have a bunch of high-numbered bits allocated and then we resize the struct sbitmap_queue, when those bits get cleared, we'll update the hint and then have to re-randomize it repeatedly. Avoid that by checking that the cleared bit is still a valid hint. No measurable performance difference in the common case. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- lib/sbitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index f736c52a712c06..e408089215445c 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -321,7 +321,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, { sbitmap_clear_bit(&sbq->sb, nr); sbq_wake_up(sbq); - if (likely(!sbq->round_robin)) + if (likely(!sbq->round_robin && nr < sbq->sb.depth)) *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; } EXPORT_SYMBOL_GPL(sbitmap_queue_clear); From 60658e0dc1df058607990278fdf9d831e0c2c71a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 19 Sep 2016 14:34:08 +0100 Subject: [PATCH 31/54] sbitmap: initialize weight to zero Variable weight is not being initialized to zero before it is used to compute the weight sum. Ensure it is initialized to zero. Found with static analysis with cppcheck: [lib/sbitmap.c:177]: (error) Uninitialized variable: weight Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- lib/sbitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index e408089215445c..2cecf05c82fd83 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); unsigned int sbitmap_weight(const struct sbitmap *sb) { - unsigned int i, weight; + unsigned int i, weight = 0; for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i]; From e105ddb4a2a13d779311349df2c32fa22a87c406 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 16 Sep 2016 14:25:03 +0200 Subject: [PATCH 32/54] lightnvm: NVM should depend on HAS_DMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If NO_DMA=y: drivers/built-in.o: In function `nvme_nvm_dev_dma_free': lightnvm.c:(.text+0x23df1a): undefined reference to `dma_pool_free' drivers/built-in.o: In function `nvme_nvm_dev_dma_alloc': lightnvm.c:(.text+0x23df38): undefined reference to `dma_pool_alloc' drivers/built-in.o: In function `nvme_nvm_destroy_dma_pool': lightnvm.c:(.text+0x23df4c): undefined reference to `dma_pool_destroy' drivers/built-in.o: In function `nvme_nvm_create_dma_pool': lightnvm.c:(.text+0x23df7e): undefined reference to `dma_pool_create' and ERROR: "dma_pool_destroy" [drivers/nvme/host/nvme-core.ko] undefined! ERROR: "dma_pool_free" [drivers/nvme/host/nvme-core.ko] undefined! ERROR: "dma_pool_alloc" [drivers/nvme/host/nvme-core.ko] undefined! ERROR: "dma_pool_create" [drivers/nvme/host/nvme-core.ko] undefined! 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 61c68a1f054ae5..2f5d5f4a4c75b3 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -4,7 +4,7 @@ menuconfig NVM bool "Open-Channel SSD target support" - depends on BLOCK + depends on BLOCK && HAS_DMA help Say Y here to get to enable Open-channel SSDs. From ac81bfa9867103c9d50996ec21fa9179b81b727e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 16 Sep 2016 14:25:04 +0200 Subject: [PATCH 33/54] nvme: refactor namespaces to support non-gendisk devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With LightNVM enabled namespaces, the gendisk structure is not exposed to the user. This prevents LightNVM users from accessing the NVMe device driver specific sysfs entries, and LightNVM namespace geometry. Refactor the revalidation process, so that a namespace, instead of a gendisk, is revalidated. This later allows patches to wire up the sysfs entries up to a non-gendisk namespace. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 + drivers/nvme/host/core.c | 114 ++++++++++++++++++++++------------- drivers/nvme/host/lightnvm.c | 5 +- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 9ebd2cfbd8490d..25c5df92032687 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -581,6 +581,8 @@ static int nvm_core_init(struct nvm_dev *dev) mutex_init(&dev->mlock); spin_lock_init(&dev->lock); + blk_queue_logical_block_size(dev->q, dev->sec_size); + return 0; err_fmtype: kfree(dev->lun_map); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2feacc70bf61f1..2c3da3315a021b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -888,42 +888,33 @@ static void nvme_config_discard(struct nvme_ns *ns) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); } -static int nvme_revalidate_disk(struct gendisk *disk) +static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) { - struct nvme_ns *ns = disk->private_data; - struct nvme_id_ns *id; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - if (test_bit(NVME_NS_DEAD, &ns->flags)) { - set_capacity(disk, 0); - return -ENODEV; - } - if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { + if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", __func__); return -ENODEV; } - if (id->ncap == 0) { - kfree(id); - return -ENODEV; - } - if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { - if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(disk_to_dev(ns->disk), - "%s: LightNVM init failure\n", __func__); - kfree(id); - return -ENODEV; - } - ns->type = NVME_NS_LIGHTNVM; + if ((*id)->ncap == 0) { + kfree(*id); + return -ENODEV; } if (ns->ctrl->vs >= NVME_VS(1, 1)) - memcpy(ns->eui, id->eui64, sizeof(ns->eui)); + memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); if (ns->ctrl->vs >= NVME_VS(1, 2)) - memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); + memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); + + return 0; +} + +static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) +{ + struct nvme_ns *ns = disk->private_data; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; 
old_ms = ns->ms; lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; @@ -962,8 +953,26 @@ static int nvme_revalidate_disk(struct gendisk *disk) if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); blk_mq_unfreeze_queue(disk->queue); +} +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_id_ns *id = NULL; + int ret; + + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } + + ret = nvme_revalidate_ns(ns, &id); + if (ret) + return ret; + + __nvme_revalidate_disk(disk, id); kfree(id); + return 0; } @@ -1642,6 +1651,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; + struct nvme_id_ns *id; + char disk_name[DISK_NAME_LEN]; int node = dev_to_node(ctrl->dev); ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); @@ -1659,33 +1670,54 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->queue->queuedata = ns; ns->ctrl = ctrl; - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_queue; - kref_init(&ns->kref); ns->ns_id = nsid; - ns->disk = disk; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); - if (nvme_revalidate_disk(ns->disk)) - goto out_free_disk; + if (nvme_revalidate_ns(ns, &id)) + goto out_free_queue; + + if (nvme_nvm_ns_supported(ns, id)) { + if (nvme_nvm_register(ns->queue, disk_name)) { + dev_warn(ctrl->dev, + "%s: LightNVM init failure\n", __func__); + goto out_free_id; + } + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_id; + memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + ns->disk = disk; + ns->type = NVME_NS_LIGHTNVM; + } else { + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_id; + + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->flags = GENHD_FL_EXT_DEVT; + memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + ns->disk = disk; + + __nvme_revalidate_disk(disk, id); + } mutex_lock(&ctrl->namespaces_mutex); list_add_tail(&ns->list, &ctrl->namespaces); mutex_unlock(&ctrl->namespaces_mutex); kref_get(&ctrl->kref); + + kfree(id); + if (ns->type == NVME_NS_LIGHTNVM) return; @@ -1695,8 +1727,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to create sysfs group for identification\n", ns->disk->disk_name); return; - out_free_disk: - kfree(disk); + out_free_id: + kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); out_release_instance: diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 63f483daf9301f..7268a7a1a19a4c 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -474,8 +474,9 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) - c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, - rqd->bio->bi_iter.bi_sector)); + /* momentarily hardcode the shift configuration. 
lba_shift from + * nvm_dev will be available in a follow-up patch */ + c->hb_rw.slba = cpu_to_le64(rqd->bio->bi_iter.bi_sector >> 3); } static void nvme_nvm_end_io(struct request *rq, int error) From 9ae2d0aa5046c67dd37cf4b70998ad296e718835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 16 Sep 2016 14:25:05 +0200 Subject: [PATCH 34/54] null_blk: refactor to support non-gendisk devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With LightNVM enabled devices, the gendisk structure is not exposed to the user. This hides the device driver specific sysfs entries, and prevents binding of LightNVM geometry information to the device. Refactor the device registration process, so that gendisk and non-gendisk devices are easily managed. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 110 ++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 49 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 75a7f88d67176a..895867a8a78358 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -414,23 +414,6 @@ static void cleanup_queues(struct nullb *nullb) kfree(nullb->queues); } -static void null_del_dev(struct nullb *nullb) -{ - list_del_init(&nullb->list); - - if (use_lightnvm) - nvm_unregister(nullb->disk_name); - else - del_gendisk(nullb->disk); - blk_cleanup_queue(nullb->q); - if (queue_mode == NULL_Q_MQ) - blk_mq_free_tag_set(&nullb->tag_set); - if (!use_lightnvm) - put_disk(nullb->disk); - cleanup_queues(nullb); - kfree(nullb); -} - #ifdef CONFIG_NVM static void null_lnvm_end_io(struct request *rq, int error) @@ -564,10 +547,41 @@ static struct nvm_dev_ops null_lnvm_dev_ops = { /* Simulate nvme protocol restriction */ .max_phys_sect = 64, }; + +static int null_nvm_register(struct nullb *nullb) +{ + return nvm_register(nullb->q, nullb->disk_name, &null_lnvm_dev_ops); +} + +static void null_nvm_unregister(struct nullb *nullb) +{ + nvm_unregister(nullb->disk_name); +} #else -static struct nvm_dev_ops null_lnvm_dev_ops; +static int null_nvm_register(struct nullb *nullb) +{ + return -EINVAL; +} +static void null_nvm_unregister(struct nullb *nullb) {} #endif /* CONFIG_NVM */ +static void null_del_dev(struct nullb *nullb) +{ + list_del_init(&nullb->list); + + if (use_lightnvm) + null_nvm_unregister(nullb); + else + del_gendisk(nullb->disk); + blk_cleanup_queue(nullb->q); + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); + if (!use_lightnvm) + put_disk(nullb->disk); + cleanup_queues(nullb); + kfree(nullb); +} + static int null_open(struct block_device *bdev, fmode_t mode) { return 0; @@ -640,11 +654,32 @@ static int init_driver_queues(struct nullb *nullb) return 0; } -static int null_add_dev(void) +static int null_gendisk_register(struct nullb *nullb) { struct gendisk *disk; - struct nullb *nullb; sector_t size; + + disk = nullb->disk = alloc_disk_node(1, home_node); + if (!disk) + return -ENOMEM; + size = gb * 1024 * 1024 * 1024ULL; + set_capacity(disk, size >> 9); + + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->major = null_major; + disk->first_minor = nullb->index; + disk->fops = &null_fops; + disk->private_data = nullb; + disk->queue = nullb->q; + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + + add_disk(disk); + return 0; +} + +static int null_add_dev(void) +{ + struct nullb *nullb; int rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); @@ 
-716,42 +751,19 @@ static int null_add_dev(void) sprintf(nullb->disk_name, "nullb%d", nullb->index); - if (use_lightnvm) { - rv = nvm_register(nullb->q, nullb->disk_name, - &null_lnvm_dev_ops); - if (rv) - goto out_cleanup_blk_queue; - goto done; - } - - disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) { - rv = -ENOMEM; - goto out_cleanup_lightnvm; - } - size = gb * 1024 * 1024 * 1024ULL; - set_capacity(disk, size >> 9); - - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->major = null_major; - disk->first_minor = nullb->index; - disk->fops = &null_fops; - disk->private_data = nullb; - disk->queue = nullb->q; - strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + if (use_lightnvm) + rv = null_nvm_register(nullb); + else + rv = null_gendisk_register(nullb); - add_disk(disk); + if (rv) + goto out_cleanup_blk_queue; -done: mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); return 0; - -out_cleanup_lightnvm: - if (use_lightnvm) - nvm_unregister(nullb->disk_name); out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: From b21d5b301794ae332eaa6e177d71fe8b77d3664c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 16 Sep 2016 14:25:06 +0200 Subject: [PATCH 35/54] blk-mq: register device instead of disk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable devices without a gendisk instance to register itself with blk-mq and expose the associated multi-queue sysfs entries. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 17 +++++++---------- block/blk-sysfs.c | 4 ++-- drivers/md/dm-rq.c | 2 +- include/linux/blk-mq.h | 4 ++-- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 3c385b196bc711..01fb455d337747 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -393,9 +393,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_disk(struct gendisk *disk) +static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i, j; @@ -413,15 +412,15 @@ static void __blk_mq_unregister_disk(struct gendisk *disk) kobject_del(&q->mq_kobj); kobject_put(&q->mq_kobj); - kobject_put(&disk_to_dev(disk)->kobj); + kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; } -void blk_mq_unregister_disk(struct gendisk *disk) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { blk_mq_disable_hotplug(); - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); blk_mq_enable_hotplug(); } @@ -443,10 +442,8 @@ static void blk_mq_sysfs_init(struct request_queue *q) } } -int blk_mq_register_disk(struct gendisk *disk) +int blk_mq_register_dev(struct device *dev, struct request_queue *q) { - struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; int ret, i; @@ -467,7 +464,7 @@ int blk_mq_register_disk(struct gendisk *disk) } if (ret) - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); else q->mq_sysfs_init_done = true; out: @@ -475,7 +472,7 @@ int blk_mq_register_disk(struct gendisk *disk) return ret; } -EXPORT_SYMBOL_GPL(blk_mq_register_disk); +EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { diff --git a/block/blk-sysfs.c 
b/block/blk-sysfs.c index f87a7e747d3600..9cc8d7c5439a98 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -704,7 +704,7 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_ADD); if (q->mq_ops) - blk_mq_register_disk(disk); + blk_mq_register_dev(dev, q); if (!q->request_fn) return 0; @@ -729,7 +729,7 @@ void blk_unregister_queue(struct gendisk *disk) return; if (q->mq_ops) - blk_mq_unregister_disk(disk); + blk_mq_unregister_dev(disk_to_dev(disk), q); if (q->request_fn) elv_unregister_queue(q); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1ca7463e8bb2b2..ee48230a2952dd 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -955,7 +955,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - blk_mq_register_disk(md->disk); + blk_mq_register_dev(disk_to_dev(md->disk), q); return 0; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2575779cf13f16..fbcfdf3232437b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -175,8 +175,8 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); -int blk_mq_register_disk(struct gendisk *); -void blk_mq_unregister_disk(struct gendisk *); +int blk_mq_register_dev(struct device *, struct request_queue *); +void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); From b0b4e09c1ae71c4ec33df0616b830ae050006e9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 16 Sep 2016 14:25:07 +0200 Subject: [PATCH 36/54] lightnvm: control life of nvm_dev in driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LightNVM compatible device drivers does not have a method to expose LightNVM specific sysfs entries. To enable LightNVM sysfs entries to be exposed, lightnvm device drivers require a struct device to attach it to. To allow both the actual device driver and lightnvm sysfs entries to coexist, the device driver tracks the lifetime of the nvm_dev structure. This patch refactors NVMe and null_blk to handle the lifetime of struct nvm_dev, which eliminates the need for struct gendisk when a lightnvm compatible device is provided. 
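Roughly, a LightNVM-capable driver now follows this registration pattern (a trimmed sketch based on the null_blk and NVMe hunks below; my_dev_ops is a placeholder name and error handling is abbreviated):

        struct nvm_dev *ndev;
        int ret;

        ndev = nvm_alloc_dev(node);             /* driver allocates and keeps the nvm_dev */
        if (!ndev)
                return -ENOMEM;

        ndev->q = q;                            /* backing request queue */
        ndev->ops = &my_dev_ops;                /* driver's struct nvm_dev_ops (placeholder) */
        memcpy(ndev->name, disk_name, DISK_NAME_LEN);

        ret = nvm_register(ndev);
        if (ret) {
                kfree(ndev);
                return ret;
        }

        /* ... on teardown the driver calls nvm_unregister(ndev) ... */
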
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 22 ++++++++++++++++++++-- drivers/lightnvm/core.c | 35 ++++++++--------------------------- drivers/nvme/host/core.c | 36 +++++++++++++++--------------------- drivers/nvme/host/lightnvm.c | 31 ++++++++++++++++++++++++------- drivers/nvme/host/nvme.h | 12 +++++++----- include/linux/lightnvm.h | 15 +++++++++------ 6 files changed, 83 insertions(+), 68 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 895867a8a78358..91e1de898dafd8 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -34,6 +34,7 @@ struct nullb { unsigned int index; struct request_queue *q; struct gendisk *disk; + struct nvm_dev *ndev; struct blk_mq_tag_set tag_set; struct hrtimer timer; unsigned int queue_depth; @@ -550,12 +551,29 @@ static struct nvm_dev_ops null_lnvm_dev_ops = { static int null_nvm_register(struct nullb *nullb) { - return nvm_register(nullb->q, nullb->disk_name, &null_lnvm_dev_ops); + struct nvm_dev *dev; + int rv; + + dev = nvm_alloc_dev(0); + if (!dev) + return -ENOMEM; + + dev->q = nullb->q; + memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN); + dev->ops = &null_lnvm_dev_ops; + + rv = nvm_register(dev); + if (rv) { + kfree(dev); + return rv; + } + nullb->ndev = dev; + return 0; } static void null_nvm_unregister(struct nullb *nullb) { - nvm_unregister(nullb->disk_name); + nvm_unregister(nullb->ndev); } #else static int null_nvm_register(struct nullb *nullb) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 25c5df92032687..a99b59d1eb36c2 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -660,22 +660,15 @@ static void nvm_exit(struct nvm_dev *dev) pr_info("nvm: successfully unloaded\n"); } -int nvm_register(struct request_queue *q, char *disk_name, - struct nvm_dev_ops *ops) +struct nvm_dev *nvm_alloc_dev(int node) { - struct nvm_dev *dev; - int ret; - - if (!ops->identity) - return -EINVAL; - - dev = kzalloc(sizeof(struct nvm_dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; + return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); +} +EXPORT_SYMBOL(nvm_alloc_dev); - dev->q = q; - dev->ops = ops; - strncpy(dev->name, disk_name, DISK_NAME_LEN); +int nvm_register(struct nvm_dev *dev) +{ + int ret; ret = nvm_init(dev); if (ret) @@ -714,29 +707,17 @@ int nvm_register(struct request_queue *q, char *disk_name, return 0; err_init: kfree(dev->lun_map); - kfree(dev); return ret; } EXPORT_SYMBOL(nvm_register); -void nvm_unregister(char *disk_name) +void nvm_unregister(struct nvm_dev *dev) { - struct nvm_dev *dev; - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(disk_name); - if (!dev) { - pr_err("nvm: could not find device %s to unregister\n", - disk_name); - up_write(&nvm_lock); - return; - } - list_del(&dev->devices); up_write(&nvm_lock); nvm_exit(dev); - kfree(dev); } EXPORT_SYMBOL(nvm_unregister); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2c3da3315a021b..3c707d83b1da8f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -156,12 +156,14 @@ static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - if (ns->type == NVME_NS_LIGHTNVM) - nvme_nvm_unregister(ns->queue, ns->disk->disk_name); + if (ns->ndev) + nvme_nvm_unregister(ns); - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); + if (ns->disk) { + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + 
} put_disk(ns->disk); ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); @@ -891,8 +893,7 @@ static void nvme_config_discard(struct nvme_ns *ns) static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) { if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { - dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", - __func__); + dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__); return -ENODEV; } @@ -1683,18 +1684,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) goto out_free_queue; if (nvme_nvm_ns_supported(ns, id)) { - if (nvme_nvm_register(ns->queue, disk_name)) { - dev_warn(ctrl->dev, - "%s: LightNVM init failure\n", __func__); + if (nvme_nvm_register(ns, disk_name, node)) { + dev_warn(ctrl->dev, "%s: LightNVM init failure\n", + __func__); goto out_free_id; } - - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_id; - memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); - ns->disk = disk; - ns->type = NVME_NS_LIGHTNVM; } else { disk = alloc_disk_node(0, node); if (!disk) @@ -1718,7 +1712,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) kfree(id); - if (ns->type == NVME_NS_LIGHTNVM) + if (ns->ndev) return; device_add_disk(ctrl->device, ns->disk); @@ -1742,7 +1736,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; - if (ns->disk->flags & GENHD_FL_UP) { + if (ns->disk && ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, @@ -1765,7 +1759,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = nvme_find_get_ns(ctrl, nsid); if (ns) { - if (revalidate_disk(ns->disk)) + if (ns->disk && revalidate_disk(ns->disk)) nvme_ns_remove(ns); nvme_put_ns(ns); } else @@ -2070,7 +2064,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * Revalidating a dead namespace sets capacity to 0. This will * end buffered writers dirtying pages that can't be synced. */ - if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags)) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 7268a7a1a19a4c..798fcd9f5d1f7d 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -474,9 +474,8 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) - /* momentarily hardcode the shift configuration. 
lba_shift from - * nvm_dev will be available in a follow-up patch */ - c->hb_rw.slba = cpu_to_le64(rqd->bio->bi_iter.bi_sector >> 3); + c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, + rqd->bio->bi_iter.bi_sector)); } static void nvme_nvm_end_io(struct request *rq, int error) @@ -593,14 +592,32 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; -int nvme_nvm_register(struct request_queue *q, char *disk_name) +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) { - return nvm_register(q, disk_name, &nvme_nvm_dev_ops); + struct request_queue *q = ns->queue; + struct nvm_dev *dev; + int ret; + + dev = nvm_alloc_dev(node); + if (!dev) + return -ENOMEM; + + dev->q = q; + memcpy(dev->name, disk_name, DISK_NAME_LEN); + dev->ops = &nvme_nvm_dev_ops; + ns->ndev = dev; + + ret = nvm_register(dev); + + ns->lba_shift = ilog2(dev->sec_size) - 9; + + return ret; } -void nvme_nvm_unregister(struct request_queue *q, char *disk_name) +void nvme_nvm_unregister(struct nvme_ns *ns) { - nvm_unregister(disk_name); + nvm_unregister(ns->ndev); + kfree(ns->ndev); } /* move to shared place when used in multiple places. */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ab18b78102bf21..e0535c14e5387a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -18,6 +18,7 @@ #include #include #include +#include enum { /* @@ -154,6 +155,7 @@ struct nvme_ns { struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; + struct nvm_dev *ndev; struct kref kref; int instance; @@ -165,7 +167,6 @@ struct nvme_ns { u16 ms; bool ext; u8 pi_type; - int type; unsigned long flags; #define NVME_NS_REMOVING 0 @@ -307,15 +308,16 @@ int nvme_sg_get_version_num(int __user *ip); #ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); -int nvme_nvm_register(struct request_queue *q, char *disk_name); -void nvme_nvm_unregister(struct request_queue *q, char *disk_name); +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); +void nvme_nvm_unregister(struct nvme_ns *ns); #else -static inline int nvme_nvm_register(struct request_queue *q, char *disk_name) +static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, + int node) { return 0; } -static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {}; +static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) { diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ba78b830667435..5afc2634f332d3 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -524,9 +524,9 @@ extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); -extern int nvm_register(struct request_queue *, char *, - struct nvm_dev_ops *); -extern void nvm_unregister(char *); +extern struct nvm_dev *nvm_alloc_dev(int); +extern int nvm_register(struct nvm_dev *); +extern void nvm_unregister(struct nvm_dev *); void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); @@ -575,11 +575,14 @@ extern int nvm_dev_factory(struct nvm_dev *, int flags); #else /* CONFIG_NVM */ struct nvm_dev_ops; -static inline int nvm_register(struct request_queue *q, char *disk_name, - struct nvm_dev_ops *ops) +static inline struct nvm_dev *nvm_alloc_dev(int node) +{ + return ERR_PTR(-EINVAL); +} +static inline int nvm_register(struct nvm_dev *dev) { return 
-EINVAL; } -static inline void nvm_unregister(char *disk_name) {} +static inline void nvm_unregister(struct nvm_dev *dev) {} #endif /* CONFIG_NVM */ #endif /* LIGHTNVM.H */ From 40267efddc296190d50c61d96daf277151447cf6 Mon Sep 17 00:00:00 2001 From: "Simon A. F. Lund" Date: Fri, 16 Sep 2016 14:25:08 +0200 Subject: [PATCH 37/54] lightnvm: expose device geometry through sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For a host to access an Open-Channel SSD, it has to know its geometry, so that it writes and reads at the appropriate device bounds. Currently, the geometry information is kept within the kernel, and not exported to user-space for consumption. This patch exposes the configuration through sysfs and enables user-space libraries, such as liblightnvm, to use the sysfs implementation to get the geometry of an Open-Channel SSD. The sysfs entries are stored within the device hierarchy, and can be found using the "lightnvm" device type. An example configuration looks like this: /sys/class/nvme/ └── nvme0n1 ├── capabilities: 3 ├── device_mode: 1 ├── erase_max: 1000000 ├── erase_typ: 1000000 ├── flash_media_type: 0 ├── media_capabilities: 0x00000001 ├── media_type: 0 ├── multiplane: 0x00010101 ├── num_blocks: 1022 ├── num_channels: 1 ├── num_luns: 4 ├── num_pages: 64 ├── num_planes: 1 ├── page_size: 4096 ├── prog_max: 100000 ├── prog_typ: 100000 ├── read_max: 10000 ├── read_typ: 10000 ├── sector_oob_size: 0 ├── sector_size: 4096 ├── media_manager: gennvm ├── ppa_format: 0x380830082808001010102008 ├── vendor_opcode: 0 ├── max_phys_secs: 64 └── version: 1 Signed-off-by: Simon A. F. Lund Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Makefile | 2 +- drivers/lightnvm/core.c | 20 ++-- drivers/lightnvm/lightnvm.h | 35 +++++++ drivers/lightnvm/sysfs.c | 195 +++++++++++++++++++++++++++++++++++ drivers/nvme/host/core.c | 13 +-- drivers/nvme/host/lightnvm.c | 9 +- drivers/nvme/host/nvme.h | 18 +++- include/linux/lightnvm.h | 3 + 8 files changed, 278 insertions(+), 17 deletions(-) create mode 100644 drivers/lightnvm/lightnvm.h create mode 100644 drivers/lightnvm/sysfs.c diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index a7a0a22cf1a596..1f6b6521016aaf 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,6 @@ # Makefile for Open-Channel SSDs. 
# -obj-$(CONFIG_NVM) := core.o sysblk.o +obj-$(CONFIG_NVM) := core.o sysblk.o sysfs.o obj-$(CONFIG_NVM_GENNVM) += gennvm.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index a99b59d1eb36c2..a2393e1ef82ed9 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -27,6 +27,8 @@ #include #include +#include "lightnvm.h" + static LIST_HEAD(nvm_tgt_types); static DECLARE_RWSEM(nvm_tgtt_lock); static LIST_HEAD(nvm_mgrs); @@ -598,15 +600,19 @@ static void nvm_free_mgr(struct nvm_dev *dev) dev->mt = NULL; } -static void nvm_free(struct nvm_dev *dev) +void nvm_free(struct nvm_dev *dev) { if (!dev) return; nvm_free_mgr(dev); + if (dev->dma_pool) + dev->ops->destroy_dma_pool(dev->dma_pool); + kfree(dev->lptbl); kfree(dev->lun_map); + kfree(dev); } static int nvm_init(struct nvm_dev *dev) @@ -653,11 +659,7 @@ static int nvm_init(struct nvm_dev *dev) static void nvm_exit(struct nvm_dev *dev) { - if (dev->dma_pool) - dev->ops->destroy_dma_pool(dev->dma_pool); - nvm_free(dev); - - pr_info("nvm: successfully unloaded\n"); + nvm_sysfs_unregister_dev(dev); } struct nvm_dev *nvm_alloc_dev(int node) @@ -689,6 +691,10 @@ int nvm_register(struct nvm_dev *dev) } } + ret = nvm_sysfs_register_dev(dev); + if (ret) + goto err_ppalist; + if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { ret = nvm_get_sysblock(dev, &dev->sb); if (!ret) @@ -705,6 +711,8 @@ int nvm_register(struct nvm_dev *dev) up_write(&nvm_lock); return 0; +err_ppalist: + dev->ops->destroy_dma_pool(dev->dma_pool); err_init: kfree(dev->lun_map); return ret; diff --git a/drivers/lightnvm/lightnvm.h b/drivers/lightnvm/lightnvm.h new file mode 100644 index 00000000000000..93f1aacc9f02bd --- /dev/null +++ b/drivers/lightnvm/lightnvm.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 CNEX Labs. All rights reserved. + * Initial release: Matias Bjorling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. 
+ * + */ + +#ifndef LIGHTNVM_H +#define LIGHTNVM_H + +#include + +/* core -> sysfs.c */ +int nvm_sysfs_register_dev(struct nvm_dev *); +void nvm_sysfs_unregister_dev(struct nvm_dev *); +int nvm_sysfs_register(void); +void nvm_sysfs_unregister(void); + +/* sysfs > core */ +void nvm_free(struct nvm_dev *); + +#endif diff --git a/drivers/lightnvm/sysfs.c b/drivers/lightnvm/sysfs.c new file mode 100644 index 00000000000000..72ad089c0269dc --- /dev/null +++ b/drivers/lightnvm/sysfs.c @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include + +#include "lightnvm.h" + +static ssize_t nvm_dev_attr_show(struct device *dev, + struct device_attribute *dattr, char *page) +{ + struct nvm_dev *ndev = container_of(dev, struct nvm_dev, dev); + struct nvm_id *id = &ndev->identity; + struct nvm_id_group *grp = &id->groups[0]; + struct attribute *attr = &dattr->attr; + + if (strcmp(attr->name, "version") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id); + } else if (strcmp(attr->name, "vendor_opcode") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt); + } else if (strcmp(attr->name, "capabilities") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->cap); + } else if (strcmp(attr->name, "device_mode") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->dom); + } else if (strcmp(attr->name, "media_manager") == 0) { + if (!ndev->mt) + return scnprintf(page, PAGE_SIZE, "%s\n", "none"); + return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name); + } else if (strcmp(attr->name, "ppa_format") == 0) { + return scnprintf(page, PAGE_SIZE, + "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + id->ppaf.ch_offset, id->ppaf.ch_len, + id->ppaf.lun_offset, id->ppaf.lun_len, + id->ppaf.pln_offset, id->ppaf.pln_len, + id->ppaf.blk_offset, id->ppaf.blk_len, + id->ppaf.pg_offset, id->ppaf.pg_len, + id->ppaf.sect_offset, id->ppaf.sect_len); + } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype); + } else if (strcmp(attr->name, "flash_media_type") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype); + } else if (strcmp(attr->name, "num_channels") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch); + } else if (strcmp(attr->name, "num_luns") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun); + } else if (strcmp(attr->name, "num_planes") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); + } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); + } else if (strcmp(attr->name, "num_pages") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); + } else if (strcmp(attr->name, "page_size") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz); + } else if (strcmp(attr->name, "hw_sector_size") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs); + } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos); + } else if (strcmp(attr->name, "read_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt); + } else if (strcmp(attr->name, "read_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm); + } else if (strcmp(attr->name, "prog_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt); + } else if (strcmp(attr->name, "prog_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm); + } else if (strcmp(attr->name, "erase_typ") == 0) { + return 
scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet); + } else if (strcmp(attr->name, "erase_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem); + } else if (strcmp(attr->name, "multiplane_modes") == 0) { + return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos); + } else if (strcmp(attr->name, "media_capabilities") == 0) { + return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap); + } else if (strcmp(attr->name, "max_phys_secs") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", + ndev->ops->max_phys_sect); + } else { + return scnprintf(page, + PAGE_SIZE, + "Unhandled attr(%s) in `nvm_dev_attr_show`\n", + attr->name); + } +} + +#define NVM_DEV_ATTR_RO(_name) \ + DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) + +static NVM_DEV_ATTR_RO(version); +static NVM_DEV_ATTR_RO(vendor_opcode); +static NVM_DEV_ATTR_RO(capabilities); +static NVM_DEV_ATTR_RO(device_mode); +static NVM_DEV_ATTR_RO(ppa_format); +static NVM_DEV_ATTR_RO(media_manager); + +static NVM_DEV_ATTR_RO(media_type); +static NVM_DEV_ATTR_RO(flash_media_type); +static NVM_DEV_ATTR_RO(num_channels); +static NVM_DEV_ATTR_RO(num_luns); +static NVM_DEV_ATTR_RO(num_planes); +static NVM_DEV_ATTR_RO(num_blocks); +static NVM_DEV_ATTR_RO(num_pages); +static NVM_DEV_ATTR_RO(page_size); +static NVM_DEV_ATTR_RO(hw_sector_size); +static NVM_DEV_ATTR_RO(oob_sector_size); +static NVM_DEV_ATTR_RO(read_typ); +static NVM_DEV_ATTR_RO(read_max); +static NVM_DEV_ATTR_RO(prog_typ); +static NVM_DEV_ATTR_RO(prog_max); +static NVM_DEV_ATTR_RO(erase_typ); +static NVM_DEV_ATTR_RO(erase_max); +static NVM_DEV_ATTR_RO(multiplane_modes); +static NVM_DEV_ATTR_RO(media_capabilities); +static NVM_DEV_ATTR_RO(max_phys_secs); + +#define NVM_DEV_ATTR(_name) (dev_attr_##_name##) + +static struct attribute *nvm_dev_attrs[] = { + &dev_attr_version.attr, + &dev_attr_vendor_opcode.attr, + &dev_attr_capabilities.attr, + &dev_attr_device_mode.attr, + &dev_attr_media_manager.attr, + + &dev_attr_ppa_format.attr, + &dev_attr_media_type.attr, + &dev_attr_flash_media_type.attr, + &dev_attr_num_channels.attr, + &dev_attr_num_luns.attr, + &dev_attr_num_planes.attr, + &dev_attr_num_blocks.attr, + &dev_attr_num_pages.attr, + &dev_attr_page_size.attr, + &dev_attr_hw_sector_size.attr, + &dev_attr_oob_sector_size.attr, + &dev_attr_read_typ.attr, + &dev_attr_read_max.attr, + &dev_attr_prog_typ.attr, + &dev_attr_prog_max.attr, + &dev_attr_erase_typ.attr, + &dev_attr_erase_max.attr, + &dev_attr_multiplane_modes.attr, + &dev_attr_media_capabilities.attr, + &dev_attr_max_phys_secs.attr, + NULL, +}; + +static struct attribute_group nvm_dev_attr_group = { + .name = "lightnvm", + .attrs = nvm_dev_attrs, +}; + +static const struct attribute_group *nvm_dev_attr_groups[] = { + &nvm_dev_attr_group, + NULL, +}; + +static void nvm_dev_release(struct device *device) +{ + struct nvm_dev *dev = container_of(device, struct nvm_dev, dev); + struct request_queue *q = dev->q; + + pr_debug("nvm/sysfs: `nvm_dev_release`\n"); + + blk_mq_unregister_dev(device, q); + + nvm_free(dev); +} + +static struct device_type nvm_type = { + .name = "lightnvm", + .groups = nvm_dev_attr_groups, + .release = nvm_dev_release, +}; + +int nvm_sysfs_register_dev(struct nvm_dev *dev) +{ + if (!dev->parent_dev) + return 0; + + dev->dev.parent = dev->parent_dev; + dev_set_name(&dev->dev, "%s", dev->name); + dev->dev.type = &nvm_type; + device_initialize(&dev->dev); + device_add(&dev->dev); + + blk_mq_register_dev(&dev->dev, dev->q); + + return 0; +} + +void nvm_sysfs_unregister_dev(struct nvm_dev *dev) +{ + if (dev 
&& dev->parent_dev) + kobject_put(&dev->dev.kobj); +} diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3c707d83b1da8f..bd2156cbfc6cb9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1435,7 +1435,7 @@ static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); struct nvme_ctrl *ctrl = ns->ctrl; int serial_len = sizeof(ctrl->serial); int model_len = sizeof(ctrl->model); @@ -1459,7 +1459,7 @@ static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%pU\n", ns->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -1467,7 +1467,7 @@ static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%8phd\n", ns->eui); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -1475,7 +1475,7 @@ static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%d\n", ns->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); @@ -1492,7 +1492,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); if (a == &dev_attr_uuid.attr) { if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) @@ -1684,7 +1684,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) goto out_free_queue; if (nvme_nvm_ns_supported(ns, id)) { - if (nvme_nvm_register(ns, disk_name, node)) { + if (nvme_nvm_register(ns, disk_name, node, + &nvme_ns_attr_group)) { dev_warn(ctrl->dev, "%s: LightNVM init failure\n", __func__); goto out_free_id; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 798fcd9f5d1f7d..f5e3011e31fcdf 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -592,7 +592,8 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node, + const struct attribute_group *attrs) { struct request_queue *q = ns->queue; struct nvm_dev *dev; @@ -605,19 +606,23 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) dev->q = q; memcpy(dev->name, disk_name, DISK_NAME_LEN); dev->ops = &nvme_nvm_dev_ops; + dev->parent_dev = ns->ctrl->device; + dev->private_data = ns; ns->ndev = dev; ret = nvm_register(dev); ns->lba_shift = ilog2(dev->sec_size) - 9; + if (sysfs_create_group(&dev->dev.kobj, attrs)) + pr_warn("%s: failed to create sysfs group for identification\n", + disk_name); return ret; } void nvme_nvm_unregister(struct nvme_ns *ns) { nvm_unregister(ns->ndev); - kfree(ns->ndev); } /* move to shared place when used in multiple places. 
*/ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e0535c14e5387a..bfd25dd73bca2c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -308,11 +308,21 @@ int nvme_sg_get_version_num(int __user *ip); #ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node, + const struct attribute_group *attrs); void nvme_nvm_unregister(struct nvme_ns *ns); + +static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) +{ + if (dev->type->devnode) + return dev_to_disk(dev)->private_data; + + return (container_of(dev, struct nvm_dev, dev))->private_data; +} #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, - int node) + int node, + const struct attribute_group *attrs) { return 0; } @@ -323,6 +333,10 @@ static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i { return 0; } +static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) +{ + return dev_to_disk(dev)->private_data; +} #endif /* CONFIG_NVM */ int __init nvme_core_init(void); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 5afc2634f332d3..d190786e4ad812 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -352,7 +352,10 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; + struct device dev; + struct device *parent_dev; char name[DISK_NAME_LEN]; + void *private_data; struct mutex mlock; spinlock_t lock; From 1e3aeae4ea710023dda2a6b780183ee371d1a796 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Sep 2016 14:25:09 +0200 Subject: [PATCH 38/54] lightnvm: propagate device_add() error code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit device_add() may fail, and all callers are supposed to check the return value, but one new user in lightnvm doesn't: drivers/lightnvm/sysfs.c: In function 'nvm_sysfs_register_dev': drivers/lightnvm/sysfs.c:184:2: error: ignoring return value of 'device_add', declared with attribute warn_unused_result [-Werror=unused-result] This changes the caller to propagate any error codes, which avoids the warning. 
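The corrected pattern is simply to check device_add() and hand its result back to the caller (sketch of the change below):

        device_initialize(&dev->dev);
        ret = device_add(&dev->dev);
        if (!ret)
                blk_mq_register_dev(&dev->dev, dev->q);

        return ret;
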
Signed-off-by: Arnd Bergmann Fixes: 38c9e260b9f9 ("lightnvm: expose device geometry through sysfs") Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/lightnvm.h | 2 +- drivers/lightnvm/sysfs.c | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/lightnvm/lightnvm.h b/drivers/lightnvm/lightnvm.h index 93f1aacc9f02bd..305c181509a63a 100644 --- a/drivers/lightnvm/lightnvm.h +++ b/drivers/lightnvm/lightnvm.h @@ -24,7 +24,7 @@ #include /* core -> sysfs.c */ -int nvm_sysfs_register_dev(struct nvm_dev *); +int __must_check nvm_sysfs_register_dev(struct nvm_dev *); void nvm_sysfs_unregister_dev(struct nvm_dev *); int nvm_sysfs_register(void); void nvm_sysfs_unregister(void); diff --git a/drivers/lightnvm/sysfs.c b/drivers/lightnvm/sysfs.c index 72ad089c0269dc..0338c27ab95a92 100644 --- a/drivers/lightnvm/sysfs.c +++ b/drivers/lightnvm/sysfs.c @@ -174,6 +174,8 @@ static struct device_type nvm_type = { int nvm_sysfs_register_dev(struct nvm_dev *dev) { + int ret; + if (!dev->parent_dev) return 0; @@ -181,11 +183,12 @@ int nvm_sysfs_register_dev(struct nvm_dev *dev) dev_set_name(&dev->dev, "%s", dev->name); dev->dev.type = &nvm_type; device_initialize(&dev->dev); - device_add(&dev->dev); + ret = device_add(&dev->dev); - blk_mq_register_dev(&dev->dev, dev->q); + if (!ret) + blk_mq_register_dev(&dev->dev, dev->q); - return 0; + return ret; } void nvm_sysfs_unregister_dev(struct nvm_dev *dev) From 491221f88d00651e449c9caf7415b6453c8a77b7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 22 Sep 2016 03:10:01 -0400 Subject: [PATCH 39/54] block: export bio_free_pages to other modules bio_free_pages was introduced in commit 1dfa0f68c040 ("block: add a helper to free bio bounce buffer pages"); we can reuse the function in other modules now that it is exported.
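As a usage illustration (not part of the patch), a module built on this export can now write its completion handler as below; the handler name is hypothetical, and it assumes the bio's pages were allocated by the submitter, e.g. via bio_alloc_pages():

#include <linux/bio.h>

/* Frees every page attached to the bio, replacing the open-coded
 * bio_for_each_segment_all() + __free_page() loops removed from the
 * bcache, dm-log-writes and raid1 call sites below. */
static void my_bounce_end_io(struct bio *bio)
{
	bio_free_pages(bio);
	bio_put(bio);
}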
Cc: Christoph Hellwig Cc: Jens Axboe Cc: Mike Snitzer Cc: Shaohua Li Signed-off-by: Guoqing Jiang Acked-by: Kent Overstreet Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- drivers/md/bcache/btree.c | 6 +----- drivers/md/bcache/debug.c | 6 ++---- drivers/md/bcache/movinggc.c | 5 +---- drivers/md/bcache/request.c | 9 ++------- drivers/md/bcache/writeback.c | 5 +---- drivers/md/dm-log-writes.c | 6 +----- drivers/md/raid1.c | 8 ++------ include/linux/bio.h | 1 + 9 files changed, 13 insertions(+), 36 deletions(-) diff --git a/block/bio.c b/block/bio.c index a6d279e1ea9e96..db85c5753a7656 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1068,7 +1068,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) return 0; } -static void bio_free_pages(struct bio *bio) +void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; int i; @@ -1076,6 +1076,7 @@ static void bio_free_pages(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } +EXPORT_SYMBOL(bio_free_pages); /** * bio_uncopy_user - finish previously mapped bio diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 76f7534d1dd158..81d3db40cd7be6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -361,12 +361,8 @@ static void __btree_node_write_done(struct closure *cl) static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io); - struct bio_vec *bv; - int n; - - bio_for_each_segment_all(bv, b->bio, n) - __free_page(bv->bv_page); + bio_free_pages(b->bio); __btree_node_write_done(cl); } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c28df164701e7b..333a1e5f6ae66c 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -107,9 +107,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; struct bio *check; - struct bio_vec bv, *bv2; + struct bio_vec bv; struct bvec_iter iter; - int i; check = bio_clone(bio, GFP_NOIO); if (!check) @@ -136,8 +135,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) kunmap_atomic(p1); } - bio_for_each_segment_all(bv2, check, i) - __free_page(bv2->bv_page); + bio_free_pages(check); out_put: bio_put(check); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 1881319f2298f6..5c4bddecfaf092 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -44,11 +44,8 @@ static void write_moving_finish(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; - struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, bio, i) - __free_page(bv->bv_page); + bio_free_pages(bio); if (io->op.replace_collision) trace_bcache_gc_copy_collision(&io->w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 4b177fe11ebb99..40ffe5e424b302 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -694,13 +694,8 @@ static void cached_dev_cache_miss_done(struct closure *cl) if (s->iop.replace_collision) bch_mark_cache_miss_collision(s->iop.c, s->d); - if (s->iop.bio) { - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, s->iop.bio, i) - __free_page(bv->bv_page); - } + if (s->iop.bio) + bio_free_pages(s->iop.bio); cached_dev_bio_complete(cl); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d9fd2a62e5f6f9..e51644e503a53b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -128,11 
+128,8 @@ static void write_dirty_finish(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; - struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, &io->bio, i) - __free_page(bv->bv_page); + bio_free_pages(&io->bio); /* This is kind of a dumb way of signalling errors. */ if (KEY_DIRTY(&w->key)) { diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 4ab68033f9d10e..b52404159ccf96 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -149,8 +149,6 @@ static void put_io_block(struct log_writes_c *lc) static void log_end_io(struct bio *bio) { struct log_writes_c *lc = bio->bi_private; - struct bio_vec *bvec; - int i; if (bio->bi_error) { unsigned long flags; @@ -161,9 +159,7 @@ static void log_end_io(struct bio *bio) spin_unlock_irqrestore(&lc->blocks_lock, flags); } - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - + bio_free_pages(bio); put_io_block(lc); bio_put(bio); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 21dc00eb1989fd..1961d827dbd19b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -145,12 +145,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; out_free_pages: - while (--j >= 0) { - struct bio_vec *bv; - - bio_for_each_segment_all(bv, r1_bio->bios[j], i) - __free_page(bv->bv_page); - } + while (--j >= 0) + bio_free_pages(r1_bio->bios[j]); out_free_bio: while (++j < pi->raid_disks) diff --git a/include/linux/bio.h b/include/linux/bio.h index e00721a2dce133..97cb48f03dc73c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -459,6 +459,7 @@ static inline void bio_flush_dcache_pages(struct bio *bi) extern void bio_copy_data(struct bio *dst, struct bio *src); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); +extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, From 841bac2c87fc21c3ecf3bc3354855921735aeec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2016 10:08:43 -0600 Subject: [PATCH 40/54] blk-mq: get rid of manual run of queue with __blk_mq_run_hw_queue() Two cases: 1) blk_mq_alloc_request() needlessly re-runs the queue, after calling into the tag allocation without NOWAIT set. We don't need to do that. 2) blk_mq_map_request() should just use blk_mq_run_hw_queue() with the async flag set to false. 
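For case 1, a hedged caller-side sketch (assuming the 4.8-era prototype blk_mq_alloc_request(q, rw, flags), visible in the hunk below): when BLK_MQ_REQ_NOWAIT is not set, the tag allocator itself runs the queue and sleeps for a free tag, so the removed __blk_mq_run_hw_queue() retry added nothing.

/* Illustrative helper, not from the patch. */
static struct request *my_get_request(struct request_queue *q, bool nowait)
{
	unsigned int flags = nowait ? BLK_MQ_REQ_NOWAIT : 0;
	struct request *rq;

	/* The blocking path sleeps in the tag allocator until a tag frees
	 * up; the NOWAIT path returns ERR_PTR(-EWOULDBLOCK) immediately. */
	rq = blk_mq_alloc_request(q, WRITE, flags);
	if (IS_ERR(rq))
		return NULL;
	return rq;
}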
Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- block/blk-mq.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e0a69daddbd8ce..c29700010b5c82 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -34,8 +34,6 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); - /* * Check if any of the ctx's have pending work in this hardware queue */ @@ -228,19 +226,9 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - ctx = alloc_data.ctx; - } blk_mq_put_ctx(ctx); + if (!rq) { blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); @@ -1225,7 +1213,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); if (unlikely(!rq)) { - __blk_mq_run_hw_queue(hctx); + blk_mq_run_hw_queue(hctx, false); blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, op); From 63581af3f31e0dbea112b83f77c4fbb6a10e1406 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Sep 2016 11:38:23 -0700 Subject: [PATCH 41/54] blk-mq: remove non-blocking pass in blk_mq_map_request bt_get already does a non-blocking pass as well as running the queue when scheduling internally, no need to duplicate it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c29700010b5c82..80d483864247ff 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1210,20 +1210,8 @@ static struct request *blk_mq_map_request(struct request_queue *q, op_flags |= REQ_SYNC; trace_block_getrq(q, bio, op); - blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - if (unlikely(!rq)) { - blk_mq_run_hw_queue(hctx, false); - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, op); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - ctx = alloc_data.ctx; - hctx = alloc_data.hctx; - } hctx->queued++; data->hctx = hctx; From 1b792f2f92784c00db2e6431496e437855d6f12a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 21 Sep 2016 10:12:13 -0600 Subject: [PATCH 42/54] blk-mq: add flag for drivers wanting blocking ->queue_rq() If a driver sets BLK_MQ_F_BLOCKING, it is allowed to block in its ->queue_rq() handler. For that case, blk-mq ensures that we always calls it from a safe context. 
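A hedged sketch of a driver opting in; the structure and function names are invented for illustration (nbd, converted later in this series, is the first real user):

#include <linux/blk-mq.h>
#include <linux/mutex.h>

struct my_dev {
	struct mutex send_lock;
	struct blk_mq_tag_set tag_set;
};

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
		       const struct blk_mq_queue_data *bd)
{
	struct my_dev *dev = hctx->queue->queuedata;

	blk_mq_start_request(bd->rq);

	/* Sleeping is allowed here: with BLK_MQ_F_BLOCKING set, blk-mq never
	 * invokes ->queue_rq() from the get_cpu()/put_cpu() fast path in
	 * blk_mq_run_hw_queue(); it always bounces through kblockd. */
	mutex_lock(&dev->send_lock);
	/* issue bd->rq to the backend; may block on I/O or a socket */
	mutex_unlock(&dev->send_lock);

	return BLK_MQ_RQ_QUEUE_OK;
}

static int my_dev_setup_tags(struct my_dev *dev, const struct blk_mq_ops *ops)
{
	dev->tag_set.ops = ops;
	dev->tag_set.nr_hw_queues = 1;
	dev->tag_set.queue_depth = 128;
	dev->tag_set.numa_node = NUMA_NO_NODE;
	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
	dev->tag_set.driver_data = dev;
	return blk_mq_alloc_tag_set(&dev->tag_set);
}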
Signed-off-by: Jens Axboe Tested-by: Josef Bacik --- block/blk-mq.c | 2 +- include/linux/blk-mq.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 80d483864247ff..e9ebe9864cc450 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -908,7 +908,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) !blk_mq_hw_queue_mapped(hctx))) return; - if (!async) { + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fbcfdf3232437b..5daa0ef756dd29 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -155,6 +155,7 @@ enum { BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_DEFER_ISSUE = 1 << 4, + BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, From 3932a86b4b9d1f0b049d64d4591ce58ad18b44ec Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 22 Sep 2016 20:59:59 -0400 Subject: [PATCH 43/54] cfq: fix starvation of asynchronous writes While debugging timeouts happening in my application workload (ScyllaDB), I have observed calls to open() taking a long time, ranging everywhere from 2 seconds - the first ones that are enough to time out my application - to more than 30 seconds. The problem seems to happen because XFS may block on pending metadata updates under certain circumnstances, and that's confirmed with the following backtrace taken by the offcputime tool (iovisor/bcc): ffffffffb90c57b1 finish_task_switch ffffffffb97dffb5 schedule ffffffffb97e310c schedule_timeout ffffffffb97e1f12 __down ffffffffb90ea821 down ffffffffc046a9dc xfs_buf_lock ffffffffc046abfb _xfs_buf_find ffffffffc046ae4a xfs_buf_get_map ffffffffc046babd xfs_buf_read_map ffffffffc0499931 xfs_trans_read_buf_map ffffffffc044a561 xfs_da_read_buf ffffffffc0451390 xfs_dir3_leaf_read.constprop.16 ffffffffc0452b90 xfs_dir2_leaf_lookup_int ffffffffc0452e0f xfs_dir2_leaf_lookup ffffffffc044d9d3 xfs_dir_lookup ffffffffc047d1d9 xfs_lookup ffffffffc0479e53 xfs_vn_lookup ffffffffb925347a path_openat ffffffffb9254a71 do_filp_open ffffffffb9242a94 do_sys_open ffffffffb9242b9e sys_open ffffffffb97e42b2 entry_SYSCALL_64_fastpath 00007fb0698162ed [unknown] Inspecting my run with blktrace, I can see that the xfsaild kthread exhibit very high "Dispatch wait" times, on the dozens of seconds range and consistent with the open() times I have saw in that run. Still from the blktrace output, we can after searching a bit, identify the request that wasn't dispatched: 8,0 11 152 81.092472813 804 A WM 141698288 + 8 <- (8,1) 141696240 8,0 11 153 81.092472889 804 Q WM 141698288 + 8 [xfsaild/sda1] 8,0 11 154 81.092473207 804 G WM 141698288 + 8 [xfsaild/sda1] 8,0 11 206 81.092496118 804 I WM 141698288 + 8 ( 22911) [xfsaild/sda1] <==== 'I' means Inserted (into the IO scheduler) ===================================> 8,0 0 289372 96.718761435 0 D WM 141698288 + 8 (15626265317) [swapper/0] <==== Only 15s later the CFQ scheduler dispatches the request ======================> As we can see above, in this particular example CFQ took 15 seconds to dispatch this request. Going back to the full trace, we can see that the xfsaild queue had plenty of opportunity to run, and it was selected as the active queue many times. 
It would just always be preempted by something else (example): 8,0 1 0 81.117912979 0 m N cfq1618SN / insert_request 8,0 1 0 81.117913419 0 m N cfq1618SN / add_to_rr 8,0 1 0 81.117914044 0 m N cfq1618SN / preempt 8,0 1 0 81.117914398 0 m N cfq767A / slice expired t=1 8,0 1 0 81.117914755 0 m N cfq767A / resid=40 8,0 1 0 81.117915340 0 m N / served: vt=1948520448 min_vt=1948520448 8,0 1 0 81.117915858 0 m N cfq767A / sl_used=1 disp=0 charge=0 iops=1 sect=0 where cfq767 is the xfsaild queue and cfq1618 corresponds to one of the ScyllaDB IO dispatchers. The requests preempting the xfsaild queue are synchronous requests. That's a characteristic of ScyllaDB workloads, as we only ever issue O_DIRECT requests. While it can be argued that preempting ASYNC requests in favor of SYNC is part of the CFQ logic, I don't believe that doing so for 15+ seconds is anyone's goal. Moreover, unless I am misunderstanding something, that breaks the expectation set by the "fifo_expire_async" tunable, which in my system is set to the default. Looking at the code, it seems to me that the issue is that after we make an async queue active, there is no guarantee that it will execute any request. When the queue itself tests if it cfq_may_dispatch() it can bail if it sees SYNC requests in flight. An incoming request from another queue can also preempt it in such situation before we have the chance to execute anything (as seen in the trace above). This patch sets the must_dispatch flag if we notice that we have requests that are already fifo_expired. This flag is always cleared after cfq_dispatch_request() returns from cfq_dispatch_requests(), so it won't pin the queue for subsequent requests (unless they are themselves expired) Care is taken during preempt to still allow rt requests to preempt us regardless. Testing my workload with this patch applied produces much better results. 
From the application side I see no timeouts, and the open() latency histogram generated by systemtap looks much better, with the worst outlier at 131ms: Latency histogram of xfs_buf_lock acquisition (microseconds): value |-------------------------------------------------- count 0 | 11 1 |@@@@ 161 2 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1966 4 |@ 54 8 | 36 16 | 7 32 | 0 64 | 0 ~ 1024 | 0 2048 | 0 4096 | 1 8192 | 1 16384 | 2 32768 | 0 65536 | 0 131072 | 1 262144 | 0 524288 | 0 Signed-off-by: Glauber Costa CC: Jens Axboe CC: linux-block@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Glauber Costa Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cc2f6dbd430326..5e24d880306c2a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (ktime_get_ns() < rq->fifo_time) rq = NULL; - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; + if (cfq_cfqq_must_dispatch(cfqq)) + return true; + /* * Drain async requests before we start sync IO */ @@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); + rq = cfq_check_fifo(cfqq); + if (rq) + cfq_mark_cfqq_must_dispatch(cfqq); + if (!cfq_may_dispatch(cfqd, cfqq)) return false; /* * follow expired path, else get first next available */ - rq = cfq_check_fifo(cfqq); if (!rq) rq = cfqq->next_rq; + else + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); /* * insert request into driver dispatch list @@ -3989,7 +3996,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if the new request is sync, but the currently running queue is * not, let the sync request have priority. */ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) return true; /* From 55679c8d23d191c24ad133abc5647e3054ca8de1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 23 Sep 2016 09:07:56 -0700 Subject: [PATCH 44/54] blkcg: Annotate blkg_hint correctly Avoid that sparse complains about blkg_hint manipulations. Fixes: a637120e4902 ("blkcg: use radix tree to index blkgs from blkcg") Signed-off-by: Bart Van Assche Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 10648e300c930e..cbdbf34de5b607 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -45,7 +45,7 @@ struct blkcg { spinlock_t lock; struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; + struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; From 005043ac31ba5bf6721b4ddca10ff2066e2ee2fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Sep 2016 16:55:31 -0400 Subject: [PATCH 45/54] nbd: use BLK_MQ_F_BLOCKING We take a mutex when sending commands and send stuff over the network, we need to have queue_rq called asynchronously. 
Signed-off-by: Josef Bacik Fixes: fd8383fd88a2 ("nbd: convert to blkmq") Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4c6dd1a85eade5..ccfcfc11399abd 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -941,7 +941,7 @@ static int __init nbd_init(void) nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE; nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd); nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_SG_MERGE; + BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; nbd_dev[i].tag_set.driver_data = &nbd_dev[i]; err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set); From 0fe51ff269da745078e0ab2b90dfdf2a58d6f3e7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 2 Aug 2016 10:40:01 +0300 Subject: [PATCH 46/54] nvme-fabrics: rework nvmf_get_address() for variable options Revise nvmf_get_address() string to account for not all options being present. Signed-off-by: James Smart Acked-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index dc996761042ffb..2867b92cbdfba1 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -109,8 +109,16 @@ static void nvmf_host_put(struct nvmf_host *host) */ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) { - return snprintf(buf, size, "traddr=%s,trsvcid=%s\n", - ctrl->opts->traddr, ctrl->opts->trsvcid); + int len = 0; + + if (ctrl->opts->mask & NVMF_OPT_TRADDR) + len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr); + if (ctrl->opts->mask & NVMF_OPT_TRSVCID) + len += snprintf(buf + len, size - len, "%strsvcid=%s", + (len) ? "," : "", ctrl->opts->trsvcid); + len += snprintf(buf + len, size - len, "\n"); + + return len; } EXPORT_SYMBOL_GPL(nvmf_get_address); From 4a9f05c57f98e794763650056731a0023ebfab5f Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 2 Aug 2016 10:41:20 +0300 Subject: [PATCH 47/54] nvme-fabrics: revise host transport option descriptions Revise some of the comments so not so ethernet-network centric Signed-off-by: James Smart Acked-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 89df52c8be978c..2755efde16ac1e 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -64,9 +64,10 @@ enum { * being added. * @subsysnqn: Hold the fully qualified NQN subystem name (format defined * in the NVMe specification, "NVMe Qualified Names"). - * @traddr: network address that will be used by the host to communicate - * to the added NVMe controller. - * @trsvcid: network port used for host-controller communication. + * @traddr: The transport-specific TRADDR field for a port on the + * subsystem which is adding a controller. + * @trsvcid: The transport-specific TRSVCID field for a port on the + * subsystem which is adding a controller. * @queue_size: Number of IO queue elements. * @nr_io_queues: Number of controller IO queues that will be established. * @reconnect_delay: Time between two consecutive reconnect attempts. 
From 478bcb9388f2c3eedba34ed5811793400047f95d Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 2 Aug 2016 10:42:10 +0300 Subject: [PATCH 48/54] nvme-fabrics: Add host_traddr options field to host infrastructure Add the host_traddr field to allow specification of the host-port connection info for the transport. Will be used by FC transport. Signed-off-by: James Smart Acked-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fabrics.c | 13 +++++++++++++ drivers/nvme/host/fabrics.h | 4 ++++ 2 files changed, 17 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2867b92cbdfba1..1c07b76cbdeb8e 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -116,6 +116,9 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) if (ctrl->opts->mask & NVMF_OPT_TRSVCID) len += snprintf(buf + len, size - len, "%strsvcid=%s", (len) ? "," : "", ctrl->opts->trsvcid); + if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) + len += snprintf(buf + len, size - len, "%shost_traddr=%s", + (len) ? "," : "", ctrl->opts->host_traddr); len += snprintf(buf + len, size - len, "\n"); return len; @@ -518,6 +521,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, + { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_ERR, NULL } }; @@ -674,6 +678,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->reconnect_delay = token; break; + case NVMF_OPT_HOST_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->host_traddr = p; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -740,6 +752,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts) kfree(opts->traddr); kfree(opts->trsvcid); kfree(opts->subsysnqn); + kfree(opts->host_traddr); kfree(opts); } EXPORT_SYMBOL_GPL(nvmf_free_options); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 2755efde16ac1e..8f08c3a3406b04 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -52,6 +52,7 @@ enum { NVMF_OPT_KATO = 1 << 7, NVMF_OPT_HOSTNQN = 1 << 8, NVMF_OPT_RECONNECT_DELAY = 1 << 9, + NVMF_OPT_HOST_TRADDR = 1 << 10, }; /** @@ -68,6 +69,8 @@ enum { * subsystem which is adding a controller. * @trsvcid: The transport-specific TRSVCID field for a port on the * subsystem which is adding a controller. + * @host_traddr: A transport-specific field identifying the NVME host port + * to use for the connection to the controller. * @queue_size: Number of IO queue elements. * @nr_io_queues: Number of controller IO queues that will be established. * @reconnect_delay: Time between two consecutive reconnect attempts. @@ -81,6 +84,7 @@ struct nvmf_ctrl_options { char *subsysnqn; char *traddr; char *trsvcid; + char *host_traddr; size_t queue_size; unsigned int nr_io_queues; unsigned int reconnect_delay; From 2d79c7dc8fe5cf1158250a5fd25c02d781324cd3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 1 Sep 2016 20:45:03 +0100 Subject: [PATCH 49/54] admin-cmd: Added smart-log command support. This patch implements the support for smart-log command (NVM Express 1.2.1-section 5.10.1.2 SMART / Health Information (Log Identifier 02h)) on the target for NVMe over Fabric. In current implementation host can retrieve following statistics:- 1. Data Units Read. 2. Data Units Written. 3. 
Host Read Commands. 4. Host Write Commands. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/target/admin-cmd.c | 88 +++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 47c564b5a28951..7ab9c9381b9895 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include "nvmet.h" u32 nvmet_get_log_page_len(struct nvme_command *cmd) @@ -29,8 +30,84 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) return len; } +static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + struct nvmet_ns *ns; + u64 host_reads, host_writes, data_units_read, data_units_written; + + status = NVME_SC_SUCCESS; + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS; + pr_err("nvmet : Counld not find namespace id : %d\n", + le32_to_cpu(req->cmd->get_log_page.nsid)); + goto out; + } + + host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); + data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); + host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); + data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + nvmet_put_namespace(ns); +out: + return status; +} + +static u16 nvmet_get_smart_log_all(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + u64 host_reads = 0, host_writes = 0; + u64 data_units_read = 0, data_units_written = 0; + struct nvmet_ns *ns; + struct nvmet_ctrl *ctrl; + + status = NVME_SC_SUCCESS; + ctrl = req->sq->ctrl; + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); + data_units_read += + part_stat_read(ns->bdev->bd_part, sectors[READ]); + host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); + data_units_written += + part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + + } + rcu_read_unlock(); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + + return status; +} + +static u16 nvmet_get_smart_log(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + + WARN_ON(req == NULL || slog == NULL); + if (req->cmd->get_log_page.nsid == 0xFFFFFFFF) + status = nvmet_get_smart_log_all(req, slog); + else + status = nvmet_get_smart_log_nsid(req, slog); + return status; +} + static void nvmet_execute_get_log_page(struct nvmet_req *req) { + struct nvme_smart_log *smart_log; size_t data_len = nvmet_get_log_page_len(req->cmd); void *buf; u16 status = 0; @@ -59,6 +136,16 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) * available (e.g. units or commands read/written) those aren't * persistent over power loss. 
*/ + if (data_len != sizeof(*smart_log)) { + status = NVME_SC_INTERNAL; + goto err; + } + smart_log = buf; + status = nvmet_get_smart_log(req, smart_log); + if (status) { + memset(buf, '\0', data_len); + goto err; + } break; case 0x03: /* @@ -73,6 +160,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) status = nvmet_copy_to_sgl(req, 0, buf, data_len); +err: kfree(buf); out: nvmet_req_complete(req, status); From 9b349b080ca9777f6dab3da06d7fa4577f7d4c29 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 21 Sep 2016 11:06:32 -0700 Subject: [PATCH 50/54] nvmet: Use direct IO for writes We're designed to work with high-end devices where direct IO makes perfect sense. We noticed that we context switch by scheduling kblockd instead of going directly to the device without REQ_SYNC for writes. Signed-off-by: Sagi Grimberg Reviewed-by: Jens Axboe --- drivers/nvme/target/io-cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 2cd069b691ae54..4132b6b9818233 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -58,6 +58,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) if (req->cmd->rw.opcode == nvme_cmd_write) { op = REQ_OP_WRITE; + op_flags = WRITE_ODIRECT; if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) op_flags |= REQ_FUA; } else { From 2e5d0baa04845dc3a3d7dfa33d7663de270bb146 Mon Sep 17 00:00:00 2001 From: Alexander Solganik Date: Wed, 21 Sep 2016 14:12:38 -0700 Subject: [PATCH 51/54] nvmet: Make dsm number of ranges zero based This caused the nvmet request data length to be incorrect. Signed-off-by: Alexander Solganik Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig --- drivers/nvme/target/io-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 4132b6b9818233..4a96c2049b7b6b 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -206,7 +206,7 @@ int nvmet_parse_io_cmd(struct nvmet_req *req) return 0; case nvme_cmd_dsm: req->execute = nvmet_execute_dsm; - req->data_len = le32_to_cpu(cmd->dsm.nr) * + req->data_len = le32_to_cpu(cmd->dsm.nr + 1) * sizeof(struct nvme_dsm_range); return 0; default: From 26501db8dcbc3c63c0d8fb6c5bb098bc7d35d741 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 16 Sep 2016 11:16:09 -0700 Subject: [PATCH 52/54] nvme/scsi: Remove power management support As far as I can tell, there is basically nothing correct about this code. It misinterprets npss (off-by-one). It hardcodes a bunch of power states, which is nonsense, because they're all just indices into a table that software needs to parse. It completely ignores the distinction between operational and non-operational states. And, until 4.8, if all of the above magically succeeded, it would dereference a NULL pointer and OOPS. Since this code appears to be useless, just delete it. 
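For background only, a hedged sketch of what a correct translation would have needed to do instead of hardcoding indices: walk the identify-controller power state descriptor table (npss is zero's based, so npss + 1 entries) and honour the operational/non-operational distinction. The patch itself simply deletes the translation; the helper below is not proposed code, and it ignores the MXPS power-scale flag for brevity.

/* Illustrative only: choose the lowest-power *operational* state. */
static int nvme_lowest_operational_ps(struct nvme_id_ctrl *id)
{
	int ps, best = 0;
	u16 lowest = 0xffff;

	for (ps = 0; ps <= id->npss; ps++) {
		u16 mp = le16_to_cpu(id->psd[ps].max_power);

		if (id->psd[ps].flags & NVME_PS_FLAGS_NON_OP_STATE)
			continue;	/* cannot service I/O in this state */
		if (mp < lowest) {
			lowest = mp;
			best = ps;
		}
	}
	return best;
}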
Signed-off-by: Andy Lutomirski Reviewed-by: Christoph Hellwig Acked-by: Jay Freyensee Tested-by: Jay Freyensee Signed-off-by: Jens Axboe --- drivers/nvme/host/scsi.c | 74 ++-------------------------------------- 1 file changed, 3 insertions(+), 71 deletions(-) diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index e947e298a737b1..44009105f8c808 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -72,15 +72,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define ALL_LUNS_RETURNED 0x02 #define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 #define RESTRICTED_LUNS_RETURNED 0x00 -#define NVME_POWER_STATE_START_VALID 0x00 -#define NVME_POWER_STATE_ACTIVE 0x01 -#define NVME_POWER_STATE_IDLE 0x02 -#define NVME_POWER_STATE_STANDBY 0x03 -#define NVME_POWER_STATE_LU_CONTROL 0x07 -#define POWER_STATE_0 0 -#define POWER_STATE_1 1 -#define POWER_STATE_2 2 -#define POWER_STATE_3 3 #define DOWNLOAD_SAVE_ACTIVATE 0x05 #define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E #define ACTIVATE_DEFERRED_MICROCODE 0x0F @@ -1229,64 +1220,6 @@ static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, /* Start Stop Unit Helper Functions */ -static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 pc, u8 pcmod, u8 start) -{ - int res; - int nvme_sc; - struct nvme_id_ctrl *id_ctrl; - int lowest_pow_st; /* max npss = lowest power consumption */ - unsigned ps_desired = 0; - - nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); - kfree(id_ctrl); - - switch (pc) { - case NVME_POWER_STATE_START_VALID: - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ - if (pcmod == 0 && start == 0x1) - ps_desired = POWER_STATE_0; - if (pcmod == 0 && start == 0x0) - ps_desired = lowest_pow_st; - break; - case NVME_POWER_STATE_ACTIVE: - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ - if (pcmod == 0) - ps_desired = POWER_STATE_0; - break; - case NVME_POWER_STATE_IDLE: - /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ - if (pcmod == 0x0) - ps_desired = POWER_STATE_1; - else if (pcmod == 0x1) - ps_desired = POWER_STATE_2; - else if (pcmod == 0x2) - ps_desired = POWER_STATE_3; - break; - case NVME_POWER_STATE_STANDBY: - /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ - if (pcmod == 0x0) - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); - else if (pcmod == 0x1) - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); - break; - case NVME_POWER_STATE_LU_CONTROL: - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, - NULL); - return nvme_trans_status_code(hdr, nvme_sc); -} - static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 buffer_id) { @@ -2235,11 +2168,10 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - u8 immed, pcmod, pc, no_flush, start; + u8 immed, pcmod, no_flush, start; immed = cmd[1] & 0x01; pcmod = cmd[3] & 0x0f; - pc = (cmd[4] & 0xf0) >> 4; no_flush = cmd[4] & 0x04; start = cmd[4] & 0x01; @@ -2254,8 +2186,8 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (res) return res; } - /* Setup the expected power state 
transition */ - return nvme_trans_power_state(ns, hdr, pc, pcmod, start); + + return 0; } } From 1a6fe74dfd1bb10afb41cbbbdc14890604be42a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 16 Sep 2016 11:16:10 -0700 Subject: [PATCH 53/54] nvme: Pass pointers, not dma addresses, to nvme_get/set_features() Any user I can imagine that needs a buffer at all will want to pass a pointer directly. There are no currently callers that use buffers, so this change is painless, and it will make it much easier to start using features that use buffers (e.g. APST). Signed-off-by: Andy Lutomirski Reviewed-by: Christoph Hellwig Acked-by: Jay Freyensee Tested-by: Jay Freyensee Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 14 ++++++-------- drivers/nvme/host/nvme.h | 4 ++-- drivers/nvme/host/scsi.c | 6 +++--- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bd2156cbfc6cb9..4669c052239ed2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -599,7 +599,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, } int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) + void *buffer, size_t buflen, u32 *result) { struct nvme_command c; struct nvme_completion cqe; @@ -608,10 +608,9 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; c.features.nsid = cpu_to_le32(nsid); - c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, buffer, buflen, 0, NVME_QID_ANY, 0, 0); if (ret >= 0 && result) *result = le32_to_cpu(cqe.result); @@ -619,7 +618,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, } int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result) + void *buffer, size_t buflen, u32 *result) { struct nvme_command c; struct nvme_completion cqe; @@ -627,12 +626,11 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_set_features; - c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, + buffer, buflen, 0, NVME_QID_ANY, 0, 0); if (ret >= 0 && result) *result = le32_to_cpu(cqe.result); return ret; @@ -666,7 +664,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) u32 result; int status, nr_io_queues; - status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, &result); if (status < 0) return status; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index bfd25dd73bca2c..b0a9ec681685c8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -293,9 +293,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result); + void *buffer, size_t buflen, u32 *result); int nvme_set_features(struct nvme_ctrl *dev, 
unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result); + void *buffer, size_t buflen, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index 44009105f8c808..c2a0a1c7d05d15 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -906,7 +906,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, kfree(smart_log); /* Get Features for Temp Threshold */ - res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0, &feature_resp); if (res != NVME_SC_SUCCESS) temp_c_thresh = LOG_TEMP_UNKNOWN; @@ -1039,7 +1039,7 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns, if (len < MODE_PAGE_CACHING_LEN) return -EINVAL; - nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, NULL, 0, &feature_resp); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -1328,7 +1328,7 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, case MODE_PAGE_CACHING: dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, - dword11, 0, NULL); + dword11, NULL, 0, NULL); res = nvme_trans_status_code(hdr, nvme_sc); break; case MODE_PAGE_CONTROL: From 997198ba1ed691c09457120576c27dbd953d0557 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Tue, 4 Oct 2016 10:53:40 +0200 Subject: [PATCH 54/54] fs/block_dev.c: return the right error in thaw_bdev() When triggering thaw-filesystems via magic sysrq, the system enters a loop in do_thaw_one(), as thaw_bdev() still returns success if bd_fsfreeze_count == 0. To fix this, let thaw_bdev() always return error (and simplify the code a bit at the same time). Reviewed-by: Eric Farman Reviewed-by: Cornelia Huck Signed-off-by: Pierre Morel Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index a516568f63b6e9..376e4e42632416 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -299,14 +299,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) error = sb->s_op->thaw_super(sb); else error = thaw_super(sb); - if (error) { + if (error) bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } out: mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; + return error; } EXPORT_SYMBOL(thaw_bdev);
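For context, the emergency-thaw path that exposed the bug retries based on the return value; approximately as below (paraphrasing fs/buffer.c from memory, so treat the exact body as an assumption):

static void do_thaw_one(struct super_block *sb, void *unused)
{
	char b[BDEVNAME_SIZE];

	/* Loops as long as thaw_bdev() reports success. Before this fix,
	 * an already-thawed bdev (bd_fsfreeze_count == 0) also returned 0,
	 * so this spun forever; returning the error breaks the loop. */
	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
		printk(KERN_WARNING "Emergency Thaw on %s\n",
		       bdevname(sb->s_bdev, b));
}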