From 797476b88bde2a6001f9552f383f147e58c1a330 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@hgst.com>
Date: Tue, 18 Oct 2016 15:40:29 +0900
Subject: [PATCH 001/182] block: Add 'zoned' queue limit

Add the zoned queue limit to indicate the zoning model of a block device.
Defined values are 0 (BLK_ZONED_NONE) for regular block devices,
1 (BLK_ZONED_HA) for host-aware zone block devices and 2 (BLK_ZONED_HM)
for host-managed zone block devices. The standards defined drive managed
model is not defined here since these block devices do not provide any
command for accessing zone information. Drive managed model devices will
be reported as BLK_ZONED_NONE.

The helper functions blk_queue_zoned_model and bdev_zoned_model return
the zoned limit and the functions blk_queue_is_zoned and bdev_is_zoned
return a boolean for callers to test if a block device is zoned.

The zoned attribute is also exported as a string to applications via
sysfs. BLK_ZONED_NONE shows as "none", BLK_ZONED_HA as "host-aware" and
BLK_ZONED_HM as "host-managed".

Signed-off-by: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Tested-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/ABI/testing/sysfs-block | 16 +++++++++
 block/blk-settings.c                  |  1 +
 block/blk-sysfs.c                     | 18 ++++++++++
 include/linux/blkdev.h                | 47 +++++++++++++++++++++++++++
 4 files changed, 82 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 71d184dbb70d29..75a5055a722b72 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -235,3 +235,19 @@ Description:
 		write_same_max_bytes is 0, write same is not supported
 		by the device.
 
+What:		/sys/block/<disk>/queue/zoned
+Date:		September 2016
+Contact:	Damien Le Moal <damien.lemoal@hgst.com>
+Description:
+		zoned indicates if the device is a zoned block device
+		and the zone model of the device if it is indeed zoned.
+		The possible values indicated by zoned are "none" for
+		regular block devices and "host-aware" or "host-managed"
+		for zoned block devices. The characteristics of
+		host-aware and host-managed zoned block devices are
+		described in the ZBC (Zoned Block Commands) and ZAC
+		(Zoned Device ATA Command Set) standards. These standards
+		also define the "drive-managed" zone model. However,
+		since drive-managed zoned block devices do not support
+		zone commands, they will be treated as regular block
+		devices and zoned will report "none".
diff --git a/block/blk-settings.c b/block/blk-settings.c
index f679ae12284351..b1d5b7fa4d0766 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -107,6 +107,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->cluster = 1;
+	lim->zoned = BLK_ZONED_NONE;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9cc8d7c5439a98..ff9cd9cd35a351 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -257,6 +257,18 @@ QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
 QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
 #undef QUEUE_SYSFS_BIT_FNS
 
+static ssize_t queue_zoned_show(struct request_queue *q, char *page)
+{
+	switch (blk_queue_zoned_model(q)) {
+	case BLK_ZONED_HA:
+		return sprintf(page, "host-aware\n");
+	case BLK_ZONED_HM:
+		return sprintf(page, "host-managed\n");
+	default:
+		return sprintf(page, "none\n");
+	}
+}
+
 static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
 {
 	return queue_var_show((blk_queue_nomerges(q) << 1) |
@@ -485,6 +497,11 @@ static struct queue_sysfs_entry queue_nonrot_entry = {
 	.store = queue_store_nonrot,
 };
 
+static struct queue_sysfs_entry queue_zoned_entry = {
+	.attr = {.name = "zoned", .mode = S_IRUGO },
+	.show = queue_zoned_show,
+};
+
 static struct queue_sysfs_entry queue_nomerges_entry = {
 	.attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nomerges_show,
@@ -546,6 +563,7 @@ static struct attribute *default_attrs[] = {
 	&queue_discard_zeroes_data_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_nonrot_entry.attr,
+	&queue_zoned_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
 	&queue_iostats_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c47c358ba0529c..f19e16bb43d1cd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -261,6 +261,15 @@ struct blk_queue_tag {
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
 
+/*
+ * Zoned block device models (zoned limit).
+ */
+enum blk_zoned_model {
+	BLK_ZONED_NONE,	/* Regular block device */
+	BLK_ZONED_HA,	/* Host-aware zoned block device */
+	BLK_ZONED_HM,	/* Host-managed zoned block device */
+};
+
 struct queue_limits {
 	unsigned long		bounce_pfn;
 	unsigned long		seg_boundary_mask;
@@ -290,6 +299,7 @@ struct queue_limits {
 	unsigned char		cluster;
 	unsigned char		discard_zeroes_data;
 	unsigned char		raid_partial_stripes_expensive;
+	enum blk_zoned_model	zoned;
 };
 
 struct request_queue {
@@ -627,6 +637,23 @@ static inline unsigned int blk_queue_cluster(struct request_queue *q)
 	return q->limits.cluster;
 }
 
+static inline enum blk_zoned_model
+blk_queue_zoned_model(struct request_queue *q)
+{
+	return q->limits.zoned;
+}
+
+static inline bool blk_queue_is_zoned(struct request_queue *q)
+{
+	switch (blk_queue_zoned_model(q)) {
+	case BLK_ZONED_HA:
+	case BLK_ZONED_HM:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /*
  * We regard a request as sync, if either a read or a sync write
  */
@@ -1354,6 +1381,26 @@ static inline unsigned int bdev_write_same(struct block_device *bdev)
 	return 0;
 }
 
+static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return blk_queue_zoned_model(q);
+
+	return BLK_ZONED_NONE;
+}
+
+static inline bool bdev_is_zoned(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return blk_queue_is_zoned(q);
+
+	return false;
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;

From 87caf97cf54b5082e56af241b88a2b8a30d17ef3 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 18 Oct 2016 15:40:30 +0900
Subject: [PATCH 002/182] blk-sysfs: Add 'chunk_sectors' to sysfs attributes

The queue limits already have a 'chunk_sectors' setting, so
we should be presenting it via sysfs.

Signed-off-by: Hannes Reinecke <hare@suse.de>

[Damien: Updated Documentation/ABI/testing/sysfs-block]

Signed-off-by: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Tested-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/ABI/testing/sysfs-block | 13 +++++++++++++
 block/blk-sysfs.c                     | 11 +++++++++++
 2 files changed, 24 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 75a5055a722b72..ee2d5cd26bfe2d 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -251,3 +251,16 @@ Description:
 		since drive-managed zoned block devices do not support
 		zone commands, they will be treated as regular block
 		devices and zoned will report "none".
+
+What:		/sys/block/<disk>/queue/chunk_sectors
+Date:		September 2016
+Contact:	Hannes Reinecke <hare@suse.com>
+Description:
+		chunk_sectors has different meaning depending on the type
+		of the disk. For a RAID device (dm-raid), chunk_sectors
+		indicates the size in 512B sectors of the RAID volume
+		stripe segment. For a zoned block device, either
+		host-aware or host-managed, chunk_sectors indicates the
+		size of 512B sectors of the zones of the device, with
+		the eventual exception of the last zone of the device
+		which may be smaller.
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index ff9cd9cd35a351..488c2e28feb823 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -130,6 +130,11 @@ static ssize_t queue_physical_block_size_show(struct request_queue *q, char *pag
 	return queue_var_show(queue_physical_block_size(q), page);
 }
 
+static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.chunk_sectors, page);
+}
+
 static ssize_t queue_io_min_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(queue_io_min(q), page);
@@ -455,6 +460,11 @@ static struct queue_sysfs_entry queue_physical_block_size_entry = {
 	.show = queue_physical_block_size_show,
 };
 
+static struct queue_sysfs_entry queue_chunk_sectors_entry = {
+	.attr = {.name = "chunk_sectors", .mode = S_IRUGO },
+	.show = queue_chunk_sectors_show,
+};
+
 static struct queue_sysfs_entry queue_io_min_entry = {
 	.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
 	.show = queue_io_min_show,
@@ -555,6 +565,7 @@ static struct attribute *default_attrs[] = {
 	&queue_hw_sector_size_entry.attr,
 	&queue_logical_block_size_entry.attr,
 	&queue_physical_block_size_entry.attr,
+	&queue_chunk_sectors_entry.attr,
 	&queue_io_min_entry.attr,
 	&queue_io_opt_entry.attr,
 	&queue_discard_granularity_entry.attr,

From 987b3b26eb7b19960160505faf9b2f50ae77e14d Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 18 Oct 2016 15:40:31 +0900
Subject: [PATCH 003/182] block: update chunk_sectors in blk_stack_limits()

Signed-off-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Tested-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-settings.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index b1d5b7fa4d0766..55369a65dea2a8 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -631,6 +631,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 			t->discard_granularity;
 	}
 
+	if (b->chunk_sectors)
+		t->chunk_sectors = min_not_zero(t->chunk_sectors,
+						b->chunk_sectors);
+
 	return ret;
 }
 EXPORT_SYMBOL(blk_stack_limits);

From 2d253440b5afb128d22ccdae812dde9ba77a2cca Mon Sep 17 00:00:00 2001
From: Shaun Tancheff <shaun.tancheff@seagate.com>
Date: Tue, 18 Oct 2016 15:40:32 +0900
Subject: [PATCH 004/182] block: Define zoned block device operations

Define REQ_OP_ZONE_REPORT and REQ_OP_ZONE_RESET for handling zones of
host-managed and host-aware zoned block devices. With with these two
new operations, the total number of operations defined reaches 8 and
still fits with the 3 bits definition of REQ_OP_BITS.

Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c          | 4 ++++
 include/linux/blk_types.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 14d7c0740dc07a..e4eda5d2aa5617 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1941,6 +1941,10 @@ generic_make_request_checks(struct bio *bio)
 	case REQ_OP_WRITE_SAME:
 		if (!bdev_write_same(bio->bi_bdev))
 			goto not_supported;
+	case REQ_OP_ZONE_REPORT:
+	case REQ_OP_ZONE_RESET:
+		if (!bdev_is_zoned(bio->bi_bdev))
+			goto not_supported;
 		break;
 	default:
 		break;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cd395ecec99d01..dd50dce89a8091 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -243,6 +243,8 @@ enum req_op {
 	REQ_OP_SECURE_ERASE,	/* request to securely erase sectors */
 	REQ_OP_WRITE_SAME,	/* write same block many times */
 	REQ_OP_FLUSH,		/* request for cache flush */
+	REQ_OP_ZONE_REPORT,	/* Get zone information */
+	REQ_OP_ZONE_RESET,	/* Reset a zone write pointer */
 };
 
 #define REQ_OP_BITS 3

From 6a0cb1bc106fc07ce0443303bcdb7f7da5131e5c Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 18 Oct 2016 15:40:33 +0900
Subject: [PATCH 005/182] block: Implement support for zoned block devices

Implement zoned block device zone information reporting and reset.
Zone information are reported as struct blk_zone. This implementation
does not differentiate between host-aware and host-managed device
models and is valid for both. Two functions are provided:
blkdev_report_zones for discovering the zone configuration of a
zoned block device, and blkdev_reset_zones for resetting the write
pointer of sequential zones. The helper function blk_queue_zone_size
and bdev_zone_size are also provided for, as the name suggest,
obtaining the zone size (in 512B sectors) of the zones of the device.

Signed-off-by: Hannes Reinecke <hare@suse.de>

[Damien: * Removed the zone cache
         * Implement report zones operation based on earlier proposal
           by Shaun Tancheff <shaun.tancheff@seagate.com>]
Signed-off-by: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Tested-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Kconfig                 |   8 ++
 block/Makefile                |   1 +
 block/blk-zoned.c             | 257 ++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h        |  31 ++++
 include/uapi/linux/Kbuild     |   1 +
 include/uapi/linux/blkzoned.h | 103 ++++++++++++++
 6 files changed, 401 insertions(+)
 create mode 100644 block/blk-zoned.c
 create mode 100644 include/uapi/linux/blkzoned.h

diff --git a/block/Kconfig b/block/Kconfig
index 1d4d624492fcd0..6b0ad08f0677d2 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -89,6 +89,14 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
+config BLK_DEV_ZONED
+	bool "Zoned block device support"
+	---help---
+	Block layer zoned block device support. This option enables
+	support for ZAC/ZBC host-managed and host-aware zoned block devices.
+
+	Say yes here if you have a ZAC or ZBC storage device.
+
 config BLK_DEV_THROTTLING
 	bool "Block layer bio throttling support"
 	depends on BLK_CGROUP=y
diff --git a/block/Makefile b/block/Makefile
index 36acdd7545bee0..934dac73fb3741 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
 obj-$(CONFIG_BLK_MQ_PCI)	+= blk-mq-pci.o
+obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
new file mode 100644
index 00000000000000..1603573f9605d7
--- /dev/null
+++ b/block/blk-zoned.c
@@ -0,0 +1,257 @@
+/*
+ * Zoned block device handling
+ *
+ * Copyright (c) 2015, Hannes Reinecke
+ * Copyright (c) 2015, SUSE Linux GmbH
+ *
+ * Copyright (c) 2016, Damien Le Moal
+ * Copyright (c) 2016, Western Digital
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/blkdev.h>
+
+static inline sector_t blk_zone_start(struct request_queue *q,
+				      sector_t sector)
+{
+	sector_t zone_mask = blk_queue_zone_size(q) - 1;
+
+	return sector & ~zone_mask;
+}
+
+/*
+ * Check that a zone report belongs to the partition.
+ * If yes, fix its start sector and write pointer, copy it in the
+ * zone information array and return true. Return false otherwise.
+ */
+static bool blkdev_report_zone(struct block_device *bdev,
+			       struct blk_zone *rep,
+			       struct blk_zone *zone)
+{
+	sector_t offset = get_start_sect(bdev);
+
+	if (rep->start < offset)
+		return false;
+
+	rep->start -= offset;
+	if (rep->start + rep->len > bdev->bd_part->nr_sects)
+		return false;
+
+	if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		rep->wp = rep->start + rep->len;
+	else
+		rep->wp -= offset;
+	memcpy(zone, rep, sizeof(struct blk_zone));
+
+	return true;
+}
+
+/**
+ * blkdev_report_zones - Get zones information
+ * @bdev:	Target block device
+ * @sector:	Sector from which to report zones
+ * @zones:	Array of zone structures where to return the zones information
+ * @nr_zones:	Number of zone structures in the zone array
+ * @gfp_mask:	Memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *    Get zone information starting from the zone containing @sector.
+ *    The number of zone information reported may be less than the number
+ *    requested by @nr_zones. The number of zones actually reported is
+ *    returned in @nr_zones.
+ */
+int blkdev_report_zones(struct block_device *bdev,
+			sector_t sector,
+			struct blk_zone *zones,
+			unsigned int *nr_zones,
+			gfp_t gfp_mask)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct blk_zone_report_hdr *hdr;
+	unsigned int nrz = *nr_zones;
+	struct page *page;
+	unsigned int nr_rep;
+	size_t rep_bytes;
+	unsigned int nr_pages;
+	struct bio *bio;
+	struct bio_vec *bv;
+	unsigned int i, n, nz;
+	unsigned int ofst;
+	void *addr;
+	int ret = 0;
+
+	if (!q)
+		return -ENXIO;
+
+	if (!blk_queue_is_zoned(q))
+		return -EOPNOTSUPP;
+
+	if (!nrz)
+		return 0;
+
+	if (sector > bdev->bd_part->nr_sects) {
+		*nr_zones = 0;
+		return 0;
+	}
+
+	/*
+	 * The zone report has a header. So make room for it in the
+	 * payload. Also make sure that the report fits in a single BIO
+	 * that will not be split down the stack.
+	 */
+	rep_bytes = sizeof(struct blk_zone_report_hdr) +
+		sizeof(struct blk_zone) * nrz;
+	rep_bytes = (rep_bytes + PAGE_SIZE - 1) & PAGE_MASK;
+	if (rep_bytes > (queue_max_sectors(q) << 9))
+		rep_bytes = queue_max_sectors(q) << 9;
+
+	nr_pages = min_t(unsigned int, BIO_MAX_PAGES,
+			 rep_bytes >> PAGE_SHIFT);
+	nr_pages = min_t(unsigned int, nr_pages,
+			 queue_max_segments(q));
+
+	bio = bio_alloc(gfp_mask, nr_pages);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_bdev = bdev;
+	bio->bi_iter.bi_sector = blk_zone_start(q, sector);
+	bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0);
+
+	for (i = 0; i < nr_pages; i++) {
+		page = alloc_page(gfp_mask);
+		if (!page) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
+			__free_page(page);
+			break;
+		}
+	}
+
+	if (i == 0)
+		ret = -ENOMEM;
+	else
+		ret = submit_bio_wait(bio);
+	if (ret)
+		goto out;
+
+	/*
+	 * Process the report result: skip the header and go through the
+	 * reported zones to fixup and fixup the zone information for
+	 * partitions. At the same time, return the zone information into
+	 * the zone array.
+	 */
+	n = 0;
+	nz = 0;
+	nr_rep = 0;
+	bio_for_each_segment_all(bv, bio, i) {
+
+		if (!bv->bv_page)
+			break;
+
+		addr = kmap_atomic(bv->bv_page);
+
+		/* Get header in the first page */
+		ofst = 0;
+		if (!nr_rep) {
+			hdr = (struct blk_zone_report_hdr *) addr;
+			nr_rep = hdr->nr_zones;
+			ofst = sizeof(struct blk_zone_report_hdr);
+		}
+
+		/* Fixup and report zones */
+		while (ofst < bv->bv_len &&
+		       n < nr_rep && nz < nrz) {
+			if (blkdev_report_zone(bdev, addr + ofst, &zones[nz]))
+				nz++;
+			ofst += sizeof(struct blk_zone);
+			n++;
+		}
+
+		kunmap_atomic(addr);
+
+		if (n >= nr_rep || nz >= nrz)
+			break;
+
+	}
+
+out:
+	bio_for_each_segment_all(bv, bio, i)
+		__free_page(bv->bv_page);
+	bio_put(bio);
+
+	if (ret == 0)
+		*nr_zones = nz;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_report_zones);
+
+/**
+ * blkdev_reset_zones - Reset zones write pointer
+ * @bdev:	Target block device
+ * @sector:	Start sector of the first zone to reset
+ * @nr_sectors:	Number of sectors, at least the length of one zone
+ * @gfp_mask:	Memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *    Reset the write pointer of the zones contained in the range
+ *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
+ *    is valid, but the specified range should not contain conventional zones.
+ */
+int blkdev_reset_zones(struct block_device *bdev,
+		       sector_t sector, sector_t nr_sectors,
+		       gfp_t gfp_mask)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	sector_t zone_sectors;
+	sector_t end_sector = sector + nr_sectors;
+	struct bio *bio;
+	int ret;
+
+	if (!q)
+		return -ENXIO;
+
+	if (!blk_queue_is_zoned(q))
+		return -EOPNOTSUPP;
+
+	if (end_sector > bdev->bd_part->nr_sects)
+		/* Out of range */
+		return -EINVAL;
+
+	/* Check alignment (handle eventual smaller last zone) */
+	zone_sectors = blk_queue_zone_size(q);
+	if (sector & (zone_sectors - 1))
+		return -EINVAL;
+
+	if ((nr_sectors & (zone_sectors - 1)) &&
+	    end_sector != bdev->bd_part->nr_sects)
+		return -EINVAL;
+
+	while (sector < end_sector) {
+
+		bio = bio_alloc(gfp_mask, 0);
+		bio->bi_iter.bi_sector = sector;
+		bio->bi_bdev = bdev;
+		bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);
+
+		ret = submit_bio_wait(bio);
+		bio_put(bio);
+
+		if (ret)
+			return ret;
+
+		sector += zone_sectors;
+
+		/* This may take a while, so be nice to others */
+		cond_resched();
+
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkdev_reset_zones);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f19e16bb43d1cd..252043f7cd2c8e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -24,6 +24,7 @@
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
+#include <linux/blkzoned.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -302,6 +303,21 @@ struct queue_limits {
 	enum blk_zoned_model	zoned;
 };
 
+#ifdef CONFIG_BLK_DEV_ZONED
+
+struct blk_zone_report_hdr {
+	unsigned int	nr_zones;
+	u8		padding[60];
+};
+
+extern int blkdev_report_zones(struct block_device *bdev,
+			       sector_t sector, struct blk_zone *zones,
+			       unsigned int *nr_zones, gfp_t gfp_mask);
+extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
+			      sector_t nr_sectors, gfp_t gfp_mask);
+
+#endif /* CONFIG_BLK_DEV_ZONED */
+
 struct request_queue {
 	/*
 	 * Together with queue_head for cacheline sharing
@@ -654,6 +670,11 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 	}
 }
 
+static inline unsigned int blk_queue_zone_size(struct request_queue *q)
+{
+	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
+}
+
 /*
  * We regard a request as sync, if either a read or a sync write
  */
@@ -1401,6 +1422,16 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
 	return false;
 }
 
+static inline unsigned int bdev_zone_size(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return blk_queue_zone_size(q);
+
+	return 0;
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 6965d090955457..b2166f283da9b2 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -70,6 +70,7 @@ header-y += bfs_fs.h
 header-y += binfmts.h
 header-y += blkpg.h
 header-y += blktrace_api.h
+header-y += blkzoned.h
 header-y += bpf_common.h
 header-y += bpf_perf_event.h
 header-y += bpf.h
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
new file mode 100644
index 00000000000000..a3817214b0e01e
--- /dev/null
+++ b/include/uapi/linux/blkzoned.h
@@ -0,0 +1,103 @@
+/*
+ * Zoned block devices handling.
+ *
+ * Copyright (C) 2015 Seagate Technology PLC
+ *
+ * Written by: Shaun Tancheff <shaun.tancheff@seagate.com>
+ *
+ * Modified by: Damien Le Moal <damien.lemoal@hgst.com>
+ * Copyright (C) 2016 Western Digital
+ *
+ * This file is licensed under  the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef _UAPI_BLKZONED_H
+#define _UAPI_BLKZONED_H
+
+#include <linux/types.h>
+
+/**
+ * enum blk_zone_type - Types of zones allowed in a zoned device.
+ *
+ * @BLK_ZONE_TYPE_CONVENTIONAL: The zone has no write pointer and can be writen
+ *                              randomly. Zone reset has no effect on the zone.
+ * @BLK_ZONE_TYPE_SEQWRITE_REQ: The zone must be written sequentially
+ * @BLK_ZONE_TYPE_SEQWRITE_PREF: The zone can be written non-sequentially
+ *
+ * Any other value not defined is reserved and must be considered as invalid.
+ */
+enum blk_zone_type {
+	BLK_ZONE_TYPE_CONVENTIONAL	= 0x1,
+	BLK_ZONE_TYPE_SEQWRITE_REQ	= 0x2,
+	BLK_ZONE_TYPE_SEQWRITE_PREF	= 0x3,
+};
+
+/**
+ * enum blk_zone_cond - Condition [state] of a zone in a zoned device.
+ *
+ * @BLK_ZONE_COND_NOT_WP: The zone has no write pointer, it is conventional.
+ * @BLK_ZONE_COND_EMPTY: The zone is empty.
+ * @BLK_ZONE_COND_IMP_OPEN: The zone is open, but not explicitly opened.
+ * @BLK_ZONE_COND_EXP_OPEN: The zones was explicitly opened by an
+ *                          OPEN ZONE command.
+ * @BLK_ZONE_COND_CLOSED: The zone was [explicitly] closed after writing.
+ * @BLK_ZONE_COND_FULL: The zone is marked as full, possibly by a zone
+ *                      FINISH ZONE command.
+ * @BLK_ZONE_COND_READONLY: The zone is read-only.
+ * @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written).
+ *
+ * The Zone Condition state machine in the ZBC/ZAC standards maps the above
+ * deinitions as:
+ *   - ZC1: Empty         | BLK_ZONE_EMPTY
+ *   - ZC2: Implicit Open | BLK_ZONE_COND_IMP_OPEN
+ *   - ZC3: Explicit Open | BLK_ZONE_COND_EXP_OPEN
+ *   - ZC4: Closed        | BLK_ZONE_CLOSED
+ *   - ZC5: Full          | BLK_ZONE_FULL
+ *   - ZC6: Read Only     | BLK_ZONE_READONLY
+ *   - ZC7: Offline       | BLK_ZONE_OFFLINE
+ *
+ * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should
+ * be considered invalid.
+ */
+enum blk_zone_cond {
+	BLK_ZONE_COND_NOT_WP	= 0x0,
+	BLK_ZONE_COND_EMPTY	= 0x1,
+	BLK_ZONE_COND_IMP_OPEN	= 0x2,
+	BLK_ZONE_COND_EXP_OPEN	= 0x3,
+	BLK_ZONE_COND_CLOSED	= 0x4,
+	BLK_ZONE_COND_READONLY	= 0xD,
+	BLK_ZONE_COND_FULL	= 0xE,
+	BLK_ZONE_COND_OFFLINE	= 0xF,
+};
+
+/**
+ * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl.
+ *
+ * @start: Zone start in 512 B sector units
+ * @len: Zone length in 512 B sector units
+ * @wp: Zone write pointer location in 512 B sector units
+ * @type: see enum blk_zone_type for possible values
+ * @cond: see enum blk_zone_cond for possible values
+ * @non_seq: Flag indicating that the zone is using non-sequential resources
+ *           (for host-aware zoned block devices only).
+ * @reset: Flag indicating that a zone reset is recommended.
+ * @reserved: Padding to 64 B to match the ZBC/ZAC defined zone descriptor size.
+ *
+ * start, len and wp use the regular 512 B sector unit, regardless of the
+ * device logical block size. The overall structure size is 64 B to match the
+ * ZBC/ZAC defined zone descriptor and allow support for future additional
+ * zone information.
+ */
+struct blk_zone {
+	__u64	start;		/* Zone start sector */
+	__u64	len;		/* Zone length in number of sectors */
+	__u64	wp;		/* Zone write pointer position */
+	__u8	type;		/* Zone type */
+	__u8	cond;		/* Zone condition */
+	__u8	non_seq;	/* Non-sequential write resources active */
+	__u8	reset;		/* Reset write pointer recommended */
+	__u8	reserved[36];
+};
+
+#endif /* _UAPI_BLKZONED_H */

From 3ed05a987e0f63b21e634101e0b460d32f3581c3 Mon Sep 17 00:00:00 2001
From: Shaun Tancheff <shaun@tancheff.com>
Date: Tue, 18 Oct 2016 15:40:35 +0900
Subject: [PATCH 006/182] blk-zoned: implement ioctls

Adds the new BLKREPORTZONE and BLKRESETZONE ioctls for respectively
obtaining the zone configuration of a zoned block device and resetting
the write pointer of sequential zones of a zoned block device.

The BLKREPORTZONE ioctl maps directly to a single call of the function
blkdev_report_zones. The zone information result is passed as an array
of struct blk_zone identical to the structure used internally for
processing the REQ_OP_ZONE_REPORT operation.  The BLKRESETZONE ioctl
maps to a single call of the blkdev_reset_zones function.

Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-zoned.c             | 93 +++++++++++++++++++++++++++++++++++
 block/ioctl.c                 |  4 ++
 include/linux/blkdev.h        | 21 ++++++++
 include/uapi/linux/blkzoned.h | 40 +++++++++++++++
 include/uapi/linux/fs.h       |  4 ++
 5 files changed, 162 insertions(+)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 1603573f9605d7..667f95d8669590 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -255,3 +255,96 @@ int blkdev_reset_zones(struct block_device *bdev,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(blkdev_reset_zones);
+
+/**
+ * BLKREPORTZONE ioctl processing.
+ * Called from blkdev_ioctl.
+ */
+int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
+			      unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct request_queue *q;
+	struct blk_zone_report rep;
+	struct blk_zone *zones;
+	int ret;
+
+	if (!argp)
+		return -EINVAL;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	if (!blk_queue_is_zoned(q))
+		return -ENOTTY;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
+		return -EFAULT;
+
+	if (!rep.nr_zones)
+		return -EINVAL;
+
+	zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL);
+	if (!zones)
+		return -ENOMEM;
+
+	ret = blkdev_report_zones(bdev, rep.sector,
+				  zones, &rep.nr_zones,
+				  GFP_KERNEL);
+	if (ret)
+		goto out;
+
+	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (rep.nr_zones) {
+		if (copy_to_user(argp + sizeof(struct blk_zone_report), zones,
+				 sizeof(struct blk_zone) * rep.nr_zones))
+			ret = -EFAULT;
+	}
+
+ out:
+	kfree(zones);
+
+	return ret;
+}
+
+/**
+ * BLKRESETZONE ioctl processing.
+ * Called from blkdev_ioctl.
+ */
+int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct request_queue *q;
+	struct blk_zone_range zrange;
+
+	if (!argp)
+		return -EINVAL;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	if (!blk_queue_is_zoned(q))
+		return -ENOTTY;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!(mode & FMODE_WRITE))
+		return -EBADF;
+
+	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
+		return -EFAULT;
+
+	return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
+				  GFP_KERNEL);
+}
diff --git a/block/ioctl.c b/block/ioctl.c
index 755119c3c1b912..f856963204f494 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -519,6 +519,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 				BLKDEV_DISCARD_SECURE);
 	case BLKZEROOUT:
 		return blk_ioctl_zeroout(bdev, mode, arg);
+	case BLKREPORTZONE:
+		return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
+	case BLKRESETZONE:
+		return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg);
 	case HDIO_GETGEO:
 		return blkdev_getgeo(bdev, argp);
 	case BLKRAGET:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 252043f7cd2c8e..90097dd8b8edbb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -316,6 +316,27 @@ extern int blkdev_report_zones(struct block_device *bdev,
 extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
 			      sector_t nr_sectors, gfp_t gfp_mask);
 
+extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
+				     unsigned int cmd, unsigned long arg);
+extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
+				    unsigned int cmd, unsigned long arg);
+
+#else /* CONFIG_BLK_DEV_ZONED */
+
+static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
+					    fmode_t mode, unsigned int cmd,
+					    unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
+					   fmode_t mode, unsigned int cmd,
+					   unsigned long arg)
+{
+	return -ENOTTY;
+}
+
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 struct request_queue {
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index a3817214b0e01e..40d1d7bff5373e 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -16,6 +16,7 @@
 #define _UAPI_BLKZONED_H
 
 #include <linux/types.h>
+#include <linux/ioctl.h>
 
 /**
  * enum blk_zone_type - Types of zones allowed in a zoned device.
@@ -100,4 +101,43 @@ struct blk_zone {
 	__u8	reserved[36];
 };
 
+/**
+ * struct blk_zone_report - BLKREPORTZONE ioctl request/reply
+ *
+ * @sector: starting sector of report
+ * @nr_zones: IN maximum / OUT actual
+ * @reserved: padding to 16 byte alignment
+ * @zones: Space to hold @nr_zones @zones entries on reply.
+ *
+ * The array of at most @nr_zones must follow this structure in memory.
+ */
+struct blk_zone_report {
+	__u64		sector;
+	__u32		nr_zones;
+	__u8		reserved[4];
+	struct blk_zone zones[0];
+} __packed;
+
+/**
+ * struct blk_zone_range - BLKRESETZONE ioctl request
+ * @sector: starting sector of the first zone to issue reset write pointer
+ * @nr_sectors: Total number of sectors of 1 or more zones to reset
+ */
+struct blk_zone_range {
+	__u64		sector;
+	__u64		nr_sectors;
+};
+
+/**
+ * Zoned block device ioctl's:
+ *
+ * @BLKREPORTZONE: Get zone information. Takes a zone report as argument.
+ *                 The zone report will start from the zone containing the
+ *                 sector specified in the report request structure.
+ * @BLKRESETZONE: Reset the write pointer of the zones in the specified
+ *                sector range. The sector range must be zone aligned.
+ */
+#define BLKREPORTZONE	_IOWR(0x12, 130, struct blk_zone_report)
+#define BLKRESETZONE	_IOW(0x12, 131, struct blk_zone_range)
+
 #endif /* _UAPI_BLKZONED_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index acb2b6152ba012..c1d11df07b289f 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -225,6 +225,10 @@ struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+/*
+ * A jump here: 130-131 are reserved for zoned block devices
+ * (see uapi/linux/blkzoned.h)
+ */
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */

From 89d9475610771b5e5fe1879075f0fc9ba6e3755f Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 18 Oct 2016 15:40:34 +0900
Subject: [PATCH 007/182] sd: Implement support for ZBC devices

Implement ZBC support functions to setup zoned disks, both
host-managed and host-aware models. Only zoned disks that satisfy
the following conditions are supported:
1) All zones are the same size, with the exception of an eventual
   last smaller runt zone.
2) For host-managed disks, reads are unrestricted (reads are not
   failed due to zone or write pointer alignement constraints).
Zoned disks that do not satisfy these 2 conditions are setup with
a capacity of 0 to prevent their use.

The function sd_zbc_read_zones, called from sd_revalidate_disk,
checks that the device satisfies the above two constraints. This
function may also change the disk capacity previously set by
sd_read_capacity for devices reporting only the capacity of
conventional zones at the beginning of the LBA range (i.e. devices
reporting rc_basis set to 0).

The capacity message output was moved out of sd_read_capacity into
a new function sd_print_capacity to include this eventual capacity
change by sd_zbc_read_zones. This new function also includes a call
to sd_zbc_print_zones to display the number of zones and zone size
of the device.

Signed-off-by: Hannes Reinecke <hare@suse.de>

[Damien: * Removed zone cache support
         * Removed mapping of discard to reset write pointer command
         * Modified sd_zbc_read_zones to include checks that the
           device satisfies the kernel constraints
         * Implemeted REPORT ZONES setup and post-processing based
           on code from Shaun Tancheff <shaun.tancheff@seagate.com>
         * Removed confusing use of 512B sector units in functions
           interface]
Signed-off-by: Damien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Tested-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/scsi/Makefile     |   1 +
 drivers/scsi/sd.c         | 148 ++++++---
 drivers/scsi/sd.h         |  70 +++++
 drivers/scsi/sd_zbc.c     | 642 ++++++++++++++++++++++++++++++++++++++
 include/scsi/scsi_proto.h |  17 +
 5 files changed, 843 insertions(+), 35 deletions(-)
 create mode 100644 drivers/scsi/sd_zbc.c

diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 38d938d7fe679c..1520596f54a6c1 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -173,6 +173,7 @@ hv_storvsc-y			:= storvsc_drv.o
 
 sd_mod-objs	:= sd.o
 sd_mod-$(CONFIG_BLK_DEV_INTEGRITY) += sd_dif.o
+sd_mod-$(CONFIG_BLK_DEV_ZONED) += sd_zbc.o
 
 sr_mod-objs	:= sr.o sr_ioctl.o sr_vendor.o
 ncr53c8xx-flags-$(CONFIG_SCSI_ZALON) \
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 51e56296f465a8..b9618ffca829bf 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -93,6 +93,7 @@ MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
+MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC);
 
 #if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
 #define SD_MINORS	16
@@ -163,7 +164,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr,
 	static const char temp[] = "temporary ";
 	int len;
 
-	if (sdp->type != TYPE_DISK)
+	if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC)
 		/* no cache control on RBC devices; theoretically they
 		 * can do it, but there's probably so many exceptions
 		 * it's not worth the risk */
@@ -262,7 +263,7 @@ allow_restart_store(struct device *dev, struct device_attribute *attr,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
-	if (sdp->type != TYPE_DISK)
+	if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC)
 		return -EINVAL;
 
 	sdp->allow_restart = simple_strtoul(buf, NULL, 10);
@@ -392,6 +393,11 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
+	if (sd_is_zoned(sdkp)) {
+		sd_config_discard(sdkp, SD_LBP_DISABLE);
+		return count;
+	}
+
 	if (sdp->type != TYPE_DISK)
 		return -EINVAL;
 
@@ -459,7 +465,7 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
-	if (sdp->type != TYPE_DISK)
+	if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC)
 		return -EINVAL;
 
 	err = kstrtoul(buf, 10, &max);
@@ -844,6 +850,12 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 
 	BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
 
+	if (sd_is_zoned(sdkp)) {
+		ret = sd_zbc_setup_write_cmnd(cmd);
+		if (ret != BLKPREP_OK)
+			return ret;
+	}
+
 	sector >>= ilog2(sdp->sector_size) - 9;
 	nr_sectors >>= ilog2(sdp->sector_size) - 9;
 
@@ -901,19 +913,25 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 	struct request *rq = SCpnt->request;
 	struct scsi_device *sdp = SCpnt->device;
 	struct gendisk *disk = rq->rq_disk;
-	struct scsi_disk *sdkp;
+	struct scsi_disk *sdkp = scsi_disk(disk);
 	sector_t block = blk_rq_pos(rq);
 	sector_t threshold;
 	unsigned int this_count = blk_rq_sectors(rq);
 	unsigned int dif, dix;
+	bool zoned_write = sd_is_zoned(sdkp) && rq_data_dir(rq) == WRITE;
 	int ret;
 	unsigned char protect;
 
+	if (zoned_write) {
+		ret = sd_zbc_setup_write_cmnd(SCpnt);
+		if (ret != BLKPREP_OK)
+			return ret;
+	}
+
 	ret = scsi_init_io(SCpnt);
 	if (ret != BLKPREP_OK)
 		goto out;
 	SCpnt = rq->special;
-	sdkp = scsi_disk(disk);
 
 	/* from here on until we're complete, any goto out
 	 * is used for a killable error condition */
@@ -1132,6 +1150,9 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 	 */
 	ret = BLKPREP_OK;
  out:
+	if (zoned_write && ret != BLKPREP_OK)
+		sd_zbc_cancel_write_cmnd(SCpnt);
+
 	return ret;
 }
 
@@ -1149,6 +1170,10 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 	case REQ_OP_READ:
 	case REQ_OP_WRITE:
 		return sd_setup_read_write_cmnd(cmd);
+	case REQ_OP_ZONE_REPORT:
+		return sd_zbc_setup_report_cmnd(cmd);
+	case REQ_OP_ZONE_RESET:
+		return sd_zbc_setup_reset_cmnd(cmd);
 	default:
 		BUG();
 	}
@@ -1780,7 +1805,10 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 	unsigned char op = SCpnt->cmnd[0];
 	unsigned char unmap = SCpnt->cmnd[1] & 8;
 
-	if (req_op(req) == REQ_OP_DISCARD || req_op(req) == REQ_OP_WRITE_SAME) {
+	switch (req_op(req)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_ZONE_RESET:
 		if (!result) {
 			good_bytes = blk_rq_bytes(req);
 			scsi_set_resid(SCpnt, 0);
@@ -1788,6 +1816,17 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 			good_bytes = 0;
 			scsi_set_resid(SCpnt, blk_rq_bytes(req));
 		}
+		break;
+	case REQ_OP_ZONE_REPORT:
+		if (!result) {
+			good_bytes = scsi_bufflen(SCpnt)
+				- scsi_get_resid(SCpnt);
+			scsi_set_resid(SCpnt, 0);
+		} else {
+			good_bytes = 0;
+			scsi_set_resid(SCpnt, blk_rq_bytes(req));
+		}
+		break;
 	}
 
 	if (result) {
@@ -1848,7 +1887,11 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 	default:
 		break;
 	}
+
  out:
+	if (sd_is_zoned(sdkp))
+		sd_zbc_complete(SCpnt, good_bytes, &sshdr);
+
 	SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
 					   "sd_done: completed %d of %d bytes\n",
 					   good_bytes, scsi_bufflen(SCpnt)));
@@ -1983,7 +2026,6 @@ sd_spinup_disk(struct scsi_disk *sdkp)
 	}
 }
 
-
 /*
  * Determine whether disk supports Data Integrity Field.
  */
@@ -2133,6 +2175,9 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
 	/* Logical blocks per physical block exponent */
 	sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size;
 
+	/* RC basis */
+	sdkp->rc_basis = (buffer[12] >> 4) & 0x3;
+
 	/* Lowest aligned logical block */
 	alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size;
 	blk_queue_alignment_offset(sdp->request_queue, alignment);
@@ -2242,7 +2287,6 @@ sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer)
 {
 	int sector_size;
 	struct scsi_device *sdp = sdkp->device;
-	sector_t old_capacity = sdkp->capacity;
 
 	if (sd_try_rc16_first(sdp)) {
 		sector_size = read_capacity_16(sdkp, sdp, buffer);
@@ -2323,35 +2367,44 @@ sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer)
 		sector_size = 512;
 	}
 	blk_queue_logical_block_size(sdp->request_queue, sector_size);
+	blk_queue_physical_block_size(sdp->request_queue,
+				      sdkp->physical_block_size);
+	sdkp->device->sector_size = sector_size;
 
-	{
-		char cap_str_2[10], cap_str_10[10];
+	if (sdkp->capacity > 0xffffffff)
+		sdp->use_16_for_rw = 1;
 
-		string_get_size(sdkp->capacity, sector_size,
-				STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
-		string_get_size(sdkp->capacity, sector_size,
-				STRING_UNITS_10, cap_str_10,
-				sizeof(cap_str_10));
+}
 
-		if (sdkp->first_scan || old_capacity != sdkp->capacity) {
-			sd_printk(KERN_NOTICE, sdkp,
-				  "%llu %d-byte logical blocks: (%s/%s)\n",
-				  (unsigned long long)sdkp->capacity,
-				  sector_size, cap_str_10, cap_str_2);
+/*
+ * Print disk capacity
+ */
+static void
+sd_print_capacity(struct scsi_disk *sdkp,
+		  sector_t old_capacity)
+{
+	int sector_size = sdkp->device->sector_size;
+	char cap_str_2[10], cap_str_10[10];
 
-			if (sdkp->physical_block_size != sector_size)
-				sd_printk(KERN_NOTICE, sdkp,
-					  "%u-byte physical blocks\n",
-					  sdkp->physical_block_size);
-		}
-	}
+	string_get_size(sdkp->capacity, sector_size,
+			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
+	string_get_size(sdkp->capacity, sector_size,
+			STRING_UNITS_10, cap_str_10,
+			sizeof(cap_str_10));
 
-	if (sdkp->capacity > 0xffffffff)
-		sdp->use_16_for_rw = 1;
+	if (sdkp->first_scan || old_capacity != sdkp->capacity) {
+		sd_printk(KERN_NOTICE, sdkp,
+			  "%llu %d-byte logical blocks: (%s/%s)\n",
+			  (unsigned long long)sdkp->capacity,
+			  sector_size, cap_str_10, cap_str_2);
 
-	blk_queue_physical_block_size(sdp->request_queue,
-				      sdkp->physical_block_size);
-	sdkp->device->sector_size = sector_size;
+		if (sdkp->physical_block_size != sector_size)
+			sd_printk(KERN_NOTICE, sdkp,
+				  "%u-byte physical blocks\n",
+				  sdkp->physical_block_size);
+
+		sd_zbc_print_zones(sdkp);
+	}
 }
 
 /* called with buffer of length 512 */
@@ -2613,7 +2666,7 @@ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer)
 	struct scsi_mode_data data;
 	struct scsi_sense_hdr sshdr;
 
-	if (sdp->type != TYPE_DISK)
+	if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC)
 		return;
 
 	if (sdkp->protection_type == 0)
@@ -2720,6 +2773,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
  */
 static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 {
+	struct request_queue *q = sdkp->disk->queue;
 	unsigned char *buffer;
 	u16 rot;
 	const int vpd_len = 64;
@@ -2734,10 +2788,21 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
 	rot = get_unaligned_be16(&buffer[4]);
 
 	if (rot == 1) {
-		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, sdkp->disk->queue);
-		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, sdkp->disk->queue);
+		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
 	}
 
+	sdkp->zoned = (buffer[8] >> 4) & 3;
+	if (sdkp->zoned == 1)
+		q->limits.zoned = BLK_ZONED_HA;
+	else if (sdkp->device->type == TYPE_ZBC)
+		q->limits.zoned = BLK_ZONED_HM;
+	else
+		q->limits.zoned = BLK_ZONED_NONE;
+	if (blk_queue_is_zoned(q) && sdkp->first_scan)
+		sd_printk(KERN_NOTICE, sdkp, "Host-%s zoned block device\n",
+		      q->limits.zoned == BLK_ZONED_HM ? "managed" : "aware");
+
  out:
 	kfree(buffer);
 }
@@ -2809,6 +2874,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	struct scsi_disk *sdkp = scsi_disk(disk);
 	struct scsi_device *sdp = sdkp->device;
 	struct request_queue *q = sdkp->disk->queue;
+	sector_t old_capacity = sdkp->capacity;
 	unsigned char *buffer;
 	unsigned int dev_max, rw_max;
 
@@ -2842,8 +2908,11 @@ static int sd_revalidate_disk(struct gendisk *disk)
 			sd_read_block_provisioning(sdkp);
 			sd_read_block_limits(sdkp);
 			sd_read_block_characteristics(sdkp);
+			sd_zbc_read_zones(sdkp, buffer);
 		}
 
+		sd_print_capacity(sdkp, old_capacity);
+
 		sd_read_write_protect_flag(sdkp, buffer);
 		sd_read_cache_type(sdkp, buffer);
 		sd_read_app_tag_own(sdkp, buffer);
@@ -3041,9 +3110,16 @@ static int sd_probe(struct device *dev)
 
 	scsi_autopm_get_device(sdp);
 	error = -ENODEV;
-	if (sdp->type != TYPE_DISK && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC)
+	if (sdp->type != TYPE_DISK &&
+	    sdp->type != TYPE_ZBC &&
+	    sdp->type != TYPE_MOD &&
+	    sdp->type != TYPE_RBC)
 		goto out;
 
+#ifndef CONFIG_BLK_DEV_ZONED
+	if (sdp->type == TYPE_ZBC)
+		goto out;
+#endif
 	SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp,
 					"sd_probe\n"));
 
@@ -3147,6 +3223,8 @@ static int sd_remove(struct device *dev)
 	del_gendisk(sdkp->disk);
 	sd_shutdown(dev);
 
+	sd_zbc_remove(sdkp);
+
 	blk_register_region(devt, SD_MINORS, NULL,
 			    sd_default_probe, NULL, NULL);
 
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index c8d986368da9e6..4dac35e96a75ba 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -64,6 +64,15 @@ struct scsi_disk {
 	struct scsi_device *device;
 	struct device	dev;
 	struct gendisk	*disk;
+#ifdef CONFIG_BLK_DEV_ZONED
+	unsigned int	nr_zones;
+	unsigned int	zone_blocks;
+	unsigned int	zone_shift;
+	unsigned long	*zones_wlock;
+	unsigned int	zones_optimal_open;
+	unsigned int	zones_optimal_nonseq;
+	unsigned int	zones_max_open;
+#endif
 	atomic_t	openers;
 	sector_t	capacity;	/* size in logical blocks */
 	u32		max_xfer_blocks;
@@ -94,6 +103,9 @@ struct scsi_disk {
 	unsigned	lbpvpd : 1;
 	unsigned	ws10 : 1;
 	unsigned	ws16 : 1;
+	unsigned	rc_basis: 2;
+	unsigned	zoned: 2;
+	unsigned	urswrz : 1;
 };
 #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev)
 
@@ -156,6 +168,11 @@ static inline unsigned int logical_to_bytes(struct scsi_device *sdev, sector_t b
 	return blocks * sdev->sector_size;
 }
 
+static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sector)
+{
+	return sector >> (ilog2(sdev->sector_size) - 9);
+}
+
 /*
  * Look up the DIX operation based on whether the command is read or
  * write and whether dix and dif are enabled.
@@ -239,4 +256,57 @@ static inline void sd_dif_complete(struct scsi_cmnd *cmd, unsigned int a)
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
+static inline int sd_is_zoned(struct scsi_disk *sdkp)
+{
+	return sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC;
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+
+extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
+extern void sd_zbc_remove(struct scsi_disk *sdkp);
+extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
+extern int sd_zbc_setup_write_cmnd(struct scsi_cmnd *cmd);
+extern void sd_zbc_cancel_write_cmnd(struct scsi_cmnd *cmd);
+extern int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd);
+extern int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
+extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
+			    struct scsi_sense_hdr *sshdr);
+
+#else /* CONFIG_BLK_DEV_ZONED */
+
+static inline int sd_zbc_read_zones(struct scsi_disk *sdkp,
+				    unsigned char *buf)
+{
+	return 0;
+}
+
+static inline void sd_zbc_remove(struct scsi_disk *sdkp) {}
+
+static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {}
+
+static inline int sd_zbc_setup_write_cmnd(struct scsi_cmnd *cmd)
+{
+	/* Let the drive fail requests */
+	return BLKPREP_OK;
+}
+
+static inline void sd_zbc_cancel_write_cmnd(struct scsi_cmnd *cmd) {}
+
+static inline int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd)
+{
+	return BLKPREP_INVALID;
+}
+
+static inline int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
+{
+	return BLKPREP_INVALID;
+}
+
+static inline void sd_zbc_complete(struct scsi_cmnd *cmd,
+				   unsigned int good_bytes,
+				   struct scsi_sense_hdr *sshdr) {}
+
+#endif /* CONFIG_BLK_DEV_ZONED */
+
 #endif /* _SCSI_DISK_H */
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
new file mode 100644
index 00000000000000..16d3fa62d8acbf
--- /dev/null
+++ b/drivers/scsi/sd_zbc.c
@@ -0,0 +1,642 @@
+/*
+ * SCSI Zoned Block commands
+ *
+ * Copyright (C) 2014-2015 SUSE Linux GmbH
+ * Written by: Hannes Reinecke <hare@suse.de>
+ * Modified by: Damien Le Moal <damien.lemoal@hgst.com>
+ * Modified by: Shaun Tancheff <shaun.tancheff@seagate.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+
+#include <asm/unaligned.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_dbg.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_driver.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_eh.h>
+
+#include "sd.h"
+#include "scsi_priv.h"
+
+enum zbc_zone_type {
+	ZBC_ZONE_TYPE_CONV = 0x1,
+	ZBC_ZONE_TYPE_SEQWRITE_REQ,
+	ZBC_ZONE_TYPE_SEQWRITE_PREF,
+	ZBC_ZONE_TYPE_RESERVED,
+};
+
+enum zbc_zone_cond {
+	ZBC_ZONE_COND_NO_WP,
+	ZBC_ZONE_COND_EMPTY,
+	ZBC_ZONE_COND_IMP_OPEN,
+	ZBC_ZONE_COND_EXP_OPEN,
+	ZBC_ZONE_COND_CLOSED,
+	ZBC_ZONE_COND_READONLY = 0xd,
+	ZBC_ZONE_COND_FULL,
+	ZBC_ZONE_COND_OFFLINE,
+};
+
+/**
+ * Convert a zone descriptor to a zone struct.
+ */
+static void sd_zbc_parse_report(struct scsi_disk *sdkp,
+				u8 *buf,
+				struct blk_zone *zone)
+{
+	struct scsi_device *sdp = sdkp->device;
+
+	memset(zone, 0, sizeof(struct blk_zone));
+
+	zone->type = buf[0] & 0x0f;
+	zone->cond = (buf[1] >> 4) & 0xf;
+	if (buf[1] & 0x01)
+		zone->reset = 1;
+	if (buf[1] & 0x02)
+		zone->non_seq = 1;
+
+	zone->len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8]));
+	zone->start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16]));
+	zone->wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24]));
+	if (zone->type != ZBC_ZONE_TYPE_CONV &&
+	    zone->cond == ZBC_ZONE_COND_FULL)
+		zone->wp = zone->start + zone->len;
+}
+
+/**
+ * Issue a REPORT ZONES scsi command.
+ */
+static int sd_zbc_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
+			       unsigned int buflen, sector_t lba)
+{
+	struct scsi_device *sdp = sdkp->device;
+	const int timeout = sdp->request_queue->rq_timeout;
+	struct scsi_sense_hdr sshdr;
+	unsigned char cmd[16];
+	unsigned int rep_len;
+	int result;
+
+	memset(cmd, 0, 16);
+	cmd[0] = ZBC_IN;
+	cmd[1] = ZI_REPORT_ZONES;
+	put_unaligned_be64(lba, &cmd[2]);
+	put_unaligned_be32(buflen, &cmd[10]);
+	memset(buf, 0, buflen);
+
+	result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
+				  buf, buflen, &sshdr,
+				  timeout, SD_MAX_RETRIES, NULL);
+	if (result) {
+		sd_printk(KERN_ERR, sdkp,
+			  "REPORT ZONES lba %llu failed with %d/%d\n",
+			  (unsigned long long)lba,
+			  host_byte(result), driver_byte(result));
+		return -EIO;
+	}
+
+	rep_len = get_unaligned_be32(&buf[0]);
+	if (rep_len < 64) {
+		sd_printk(KERN_ERR, sdkp,
+			  "REPORT ZONES report invalid length %u\n",
+			  rep_len);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd)
+{
+	struct request *rq = cmd->request;
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	sector_t lba, sector = blk_rq_pos(rq);
+	unsigned int nr_bytes = blk_rq_bytes(rq);
+	int ret;
+
+	WARN_ON(nr_bytes == 0);
+
+	if (!sd_is_zoned(sdkp))
+		/* Not a zoned device */
+		return BLKPREP_KILL;
+
+	ret = scsi_init_io(cmd);
+	if (ret != BLKPREP_OK)
+		return ret;
+
+	cmd->cmd_len = 16;
+	memset(cmd->cmnd, 0, cmd->cmd_len);
+	cmd->cmnd[0] = ZBC_IN;
+	cmd->cmnd[1] = ZI_REPORT_ZONES;
+	lba = sectors_to_logical(sdkp->device, sector);
+	put_unaligned_be64(lba, &cmd->cmnd[2]);
+	put_unaligned_be32(nr_bytes, &cmd->cmnd[10]);
+	/* Do partial report for speeding things up */
+	cmd->cmnd[14] = ZBC_REPORT_ZONE_PARTIAL;
+
+	cmd->sc_data_direction = DMA_FROM_DEVICE;
+	cmd->sdb.length = nr_bytes;
+	cmd->transfersize = sdkp->device->sector_size;
+	cmd->allowed = 0;
+
+	/*
+	 * Report may return less bytes than requested. Make sure
+	 * to report completion on the entire initial request.
+	 */
+	rq->__data_len = nr_bytes;
+
+	return BLKPREP_OK;
+}
+
+static void sd_zbc_report_zones_complete(struct scsi_cmnd *scmd,
+					 unsigned int good_bytes)
+{
+	struct request *rq = scmd->request;
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	struct sg_mapping_iter miter;
+	struct blk_zone_report_hdr hdr;
+	struct blk_zone zone;
+	unsigned int offset, bytes = 0;
+	unsigned long flags;
+	u8 *buf;
+
+	if (good_bytes < 64)
+		return;
+
+	memset(&hdr, 0, sizeof(struct blk_zone_report_hdr));
+
+	sg_miter_start(&miter, scsi_sglist(scmd), scsi_sg_count(scmd),
+		       SG_MITER_TO_SG | SG_MITER_ATOMIC);
+
+	local_irq_save(flags);
+	while (sg_miter_next(&miter) && bytes < good_bytes) {
+
+		buf = miter.addr;
+		offset = 0;
+
+		if (bytes == 0) {
+			/* Set the report header */
+			hdr.nr_zones = min_t(unsigned int,
+					 (good_bytes - 64) / 64,
+					 get_unaligned_be32(&buf[0]) / 64);
+			memcpy(buf, &hdr, sizeof(struct blk_zone_report_hdr));
+			offset += 64;
+			bytes += 64;
+		}
+
+		/* Parse zone descriptors */
+		while (offset < miter.length && hdr.nr_zones) {
+			WARN_ON(offset > miter.length);
+			buf = miter.addr + offset;
+			sd_zbc_parse_report(sdkp, buf, &zone);
+			memcpy(buf, &zone, sizeof(struct blk_zone));
+			offset += 64;
+			bytes += 64;
+			hdr.nr_zones--;
+		}
+
+		if (!hdr.nr_zones)
+			break;
+
+	}
+	sg_miter_stop(&miter);
+	local_irq_restore(flags);
+}
+
+static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
+{
+	return logical_to_sectors(sdkp->device, sdkp->zone_blocks);
+}
+
+static inline unsigned int sd_zbc_zone_no(struct scsi_disk *sdkp,
+					  sector_t sector)
+{
+	return sectors_to_logical(sdkp->device, sector) >> sdkp->zone_shift;
+}
+
+int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
+{
+	struct request *rq = cmd->request;
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	sector_t sector = blk_rq_pos(rq);
+	sector_t block = sectors_to_logical(sdkp->device, sector);
+	unsigned int zno = block >> sdkp->zone_shift;
+
+	if (!sd_is_zoned(sdkp))
+		/* Not a zoned device */
+		return BLKPREP_KILL;
+
+	if (sdkp->device->changed)
+		return BLKPREP_KILL;
+
+	if (sector & (sd_zbc_zone_sectors(sdkp) - 1))
+		/* Unaligned request */
+		return BLKPREP_KILL;
+
+	/* Do not allow concurrent reset and writes */
+	if (sdkp->zones_wlock &&
+	    test_and_set_bit(zno, sdkp->zones_wlock))
+		return BLKPREP_DEFER;
+
+	cmd->cmd_len = 16;
+	memset(cmd->cmnd, 0, cmd->cmd_len);
+	cmd->cmnd[0] = ZBC_OUT;
+	cmd->cmnd[1] = ZO_RESET_WRITE_POINTER;
+	put_unaligned_be64(block, &cmd->cmnd[2]);
+
+	rq->timeout = SD_TIMEOUT;
+	cmd->sc_data_direction = DMA_NONE;
+	cmd->transfersize = 0;
+	cmd->allowed = 0;
+
+	return BLKPREP_OK;
+}
+
+int sd_zbc_setup_write_cmnd(struct scsi_cmnd *cmd)
+{
+	struct request *rq = cmd->request;
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	sector_t sector = blk_rq_pos(rq);
+	sector_t zone_sectors = sd_zbc_zone_sectors(sdkp);
+	unsigned int zno = sd_zbc_zone_no(sdkp, sector);
+
+	/*
+	 * Note: Checks of the alignment of the write command on
+	 * logical blocks is done in sd.c
+	 */
+
+	/* Do not allow zone boundaries crossing on host-managed drives */
+	if (blk_queue_zoned_model(sdkp->disk->queue) == BLK_ZONED_HM &&
+	    (sector & (zone_sectors - 1)) + blk_rq_sectors(rq) > zone_sectors)
+		return BLKPREP_KILL;
+
+	/*
+	 * Do not issue more than one write at a time per
+	 * zone. This solves write ordering problems due to
+	 * the unlocking of the request queue in the dispatch
+	 * path in the non scsi-mq case. For scsi-mq, this
+	 * also avoids potential write reordering when multiple
+	 * threads running on different CPUs write to the same
+	 * zone (with a synchronized sequential pattern).
+	 */
+	if (sdkp->zones_wlock &&
+	    test_and_set_bit(zno, sdkp->zones_wlock))
+		return BLKPREP_DEFER;
+
+	return BLKPREP_OK;
+}
+
+static void sd_zbc_unlock_zone(struct request *rq)
+{
+	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+
+	if (sdkp->zones_wlock) {
+		unsigned int zno = sd_zbc_zone_no(sdkp, blk_rq_pos(rq));
+		WARN_ON_ONCE(!test_bit(zno, sdkp->zones_wlock));
+		clear_bit_unlock(zno, sdkp->zones_wlock);
+		smp_mb__after_atomic();
+	}
+}
+
+void sd_zbc_cancel_write_cmnd(struct scsi_cmnd *cmd)
+{
+	sd_zbc_unlock_zone(cmd->request);
+}
+
+void sd_zbc_complete(struct scsi_cmnd *cmd,
+		     unsigned int good_bytes,
+		     struct scsi_sense_hdr *sshdr)
+{
+	int result = cmd->result;
+	struct request *rq = cmd->request;
+
+	switch (req_op(rq)) {
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_ZONE_RESET:
+
+		/* Unlock the zone */
+		sd_zbc_unlock_zone(rq);
+
+		if (!result ||
+		    sshdr->sense_key != ILLEGAL_REQUEST)
+			break;
+
+		switch (sshdr->asc) {
+		case 0x24:
+			/*
+			 * INVALID FIELD IN CDB error: For a zone reset,
+			 * this means that a reset of a conventional
+			 * zone was attempted. Nothing to worry about in
+			 * this case, so be quiet about the error.
+			 */
+			if (req_op(rq) == REQ_OP_ZONE_RESET)
+				rq->cmd_flags |= REQ_QUIET;
+			break;
+		case 0x21:
+			/*
+			 * INVALID ADDRESS FOR WRITE error: It is unlikely that
+			 * retrying write requests failed with any kind of
+			 * alignement error will result in success. So don't.
+			 */
+			cmd->allowed = 0;
+			break;
+		}
+
+		break;
+
+	case REQ_OP_ZONE_REPORT:
+
+		if (!result)
+			sd_zbc_report_zones_complete(cmd, good_bytes);
+		break;
+
+	}
+}
+
+/**
+ * Read zoned block device characteristics (VPD page B6).
+ */
+static int sd_zbc_read_zoned_characteristics(struct scsi_disk *sdkp,
+					     unsigned char *buf)
+{
+
+	if (scsi_get_vpd_page(sdkp->device, 0xb6, buf, 64)) {
+		sd_printk(KERN_NOTICE, sdkp,
+			  "Unconstrained-read check failed\n");
+		return -ENODEV;
+	}
+
+	if (sdkp->device->type != TYPE_ZBC) {
+		/* Host-aware */
+		sdkp->urswrz = 1;
+		sdkp->zones_optimal_open = get_unaligned_be64(&buf[8]);
+		sdkp->zones_optimal_nonseq = get_unaligned_be64(&buf[12]);
+		sdkp->zones_max_open = 0;
+	} else {
+		/* Host-managed */
+		sdkp->urswrz = buf[4] & 1;
+		sdkp->zones_optimal_open = 0;
+		sdkp->zones_optimal_nonseq = 0;
+		sdkp->zones_max_open = get_unaligned_be64(&buf[16]);
+	}
+
+	return 0;
+}
+
+/**
+ * Check reported capacity.
+ */
+static int sd_zbc_check_capacity(struct scsi_disk *sdkp,
+				 unsigned char *buf)
+{
+	sector_t lba;
+	int ret;
+
+	if (sdkp->rc_basis != 0)
+		return 0;
+
+	/* Do a report zone to get the maximum LBA to check capacity */
+	ret = sd_zbc_report_zones(sdkp, buf, SD_BUF_SIZE, 0);
+	if (ret)
+		return ret;
+
+	/* The max_lba field is the capacity of this device */
+	lba = get_unaligned_be64(&buf[8]);
+	if (lba + 1 == sdkp->capacity)
+		return 0;
+
+	if (sdkp->first_scan)
+		sd_printk(KERN_WARNING, sdkp,
+			  "Changing capacity from %llu to max LBA+1 %llu\n",
+			  (unsigned long long)sdkp->capacity,
+			  (unsigned long long)lba + 1);
+	sdkp->capacity = lba + 1;
+
+	return 0;
+}
+
+#define SD_ZBC_BUF_SIZE 131072
+
+static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
+{
+	u64 zone_blocks;
+	sector_t block = 0;
+	unsigned char *buf;
+	unsigned char *rec;
+	unsigned int buf_len;
+	unsigned int list_length;
+	int ret;
+	u8 same;
+
+	sdkp->zone_blocks = 0;
+
+	/* Get a buffer */
+	buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/* Do a report zone to get the same field */
+	ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0);
+	if (ret)
+		goto out;
+
+	same = buf[4] & 0x0f;
+	if (same > 0) {
+		rec = &buf[64];
+		zone_blocks = get_unaligned_be64(&rec[8]);
+		goto out;
+	}
+
+	/*
+	 * Check the size of all zones: all zones must be of
+	 * equal size, except the last zone which can be smaller
+	 * than other zones.
+	 */
+	do {
+
+		/* Parse REPORT ZONES header */
+		list_length = get_unaligned_be32(&buf[0]) + 64;
+		rec = buf + 64;
+		if (list_length < SD_ZBC_BUF_SIZE)
+			buf_len = list_length;
+		else
+			buf_len = SD_ZBC_BUF_SIZE;
+
+		/* Parse zone descriptors */
+		while (rec < buf + buf_len) {
+			zone_blocks = get_unaligned_be64(&rec[8]);
+			if (sdkp->zone_blocks == 0) {
+				sdkp->zone_blocks = zone_blocks;
+			} else if (zone_blocks != sdkp->zone_blocks &&
+				   (block + zone_blocks < sdkp->capacity
+				    || zone_blocks > sdkp->zone_blocks)) {
+				zone_blocks = 0;
+				goto out;
+			}
+			block += zone_blocks;
+			rec += 64;
+		}
+
+		if (block < sdkp->capacity) {
+			ret = sd_zbc_report_zones(sdkp, buf,
+						  SD_ZBC_BUF_SIZE, block);
+			if (ret)
+				return ret;
+		}
+
+	} while (block < sdkp->capacity);
+
+	zone_blocks = sdkp->zone_blocks;
+
+out:
+	kfree(buf);
+
+	if (!zone_blocks) {
+		if (sdkp->first_scan)
+			sd_printk(KERN_NOTICE, sdkp,
+				  "Devices with non constant zone "
+				  "size are not supported\n");
+		return -ENODEV;
+	}
+
+	if (!is_power_of_2(zone_blocks)) {
+		if (sdkp->first_scan)
+			sd_printk(KERN_NOTICE, sdkp,
+				  "Devices with non power of 2 zone "
+				  "size are not supported\n");
+		return -ENODEV;
+	}
+
+	if (logical_to_sectors(sdkp->device, zone_blocks) > UINT_MAX) {
+		if (sdkp->first_scan)
+			sd_printk(KERN_NOTICE, sdkp,
+				  "Zone size too large\n");
+		return -ENODEV;
+	}
+
+	sdkp->zone_blocks = zone_blocks;
+
+	return 0;
+}
+
+static int sd_zbc_setup(struct scsi_disk *sdkp)
+{
+
+	/* chunk_sectors indicates the zone size */
+	blk_queue_chunk_sectors(sdkp->disk->queue,
+			logical_to_sectors(sdkp->device, sdkp->zone_blocks));
+	sdkp->zone_shift = ilog2(sdkp->zone_blocks);
+	sdkp->nr_zones = sdkp->capacity >> sdkp->zone_shift;
+	if (sdkp->capacity & (sdkp->zone_blocks - 1))
+		sdkp->nr_zones++;
+
+	if (!sdkp->zones_wlock) {
+		sdkp->zones_wlock = kcalloc(BITS_TO_LONGS(sdkp->nr_zones),
+					    sizeof(unsigned long),
+					    GFP_KERNEL);
+		if (!sdkp->zones_wlock)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int sd_zbc_read_zones(struct scsi_disk *sdkp,
+		      unsigned char *buf)
+{
+	sector_t capacity;
+	int ret = 0;
+
+	if (!sd_is_zoned(sdkp))
+		/*
+		 * Device managed or normal SCSI disk,
+		 * no special handling required
+		 */
+		return 0;
+
+
+	/* Get zoned block device characteristics */
+	ret = sd_zbc_read_zoned_characteristics(sdkp, buf);
+	if (ret)
+		goto err;
+
+	/*
+	 * Check for unconstrained reads: host-managed devices with
+	 * constrained reads (drives failing read after write pointer)
+	 * are not supported.
+	 */
+	if (!sdkp->urswrz) {
+		if (sdkp->first_scan)
+			sd_printk(KERN_NOTICE, sdkp,
+			  "constrained reads devices are not supported\n");
+		ret = -ENODEV;
+		goto err;
+	}
+
+	/* Check capacity */
+	ret = sd_zbc_check_capacity(sdkp, buf);
+	if (ret)
+		goto err;
+	capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
+
+	/*
+	 * Check zone size: only devices with a constant zone size (except
+	 * an eventual last runt zone) that is a power of 2 are supported.
+	 */
+	ret = sd_zbc_check_zone_size(sdkp);
+	if (ret)
+		goto err;
+
+	/* The drive satisfies the kernel restrictions: set it up */
+	ret = sd_zbc_setup(sdkp);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	sdkp->capacity = 0;
+
+	return ret;
+}
+
+void sd_zbc_remove(struct scsi_disk *sdkp)
+{
+	kfree(sdkp->zones_wlock);
+	sdkp->zones_wlock = NULL;
+}
+
+void sd_zbc_print_zones(struct scsi_disk *sdkp)
+{
+	if (!sd_is_zoned(sdkp) || !sdkp->capacity)
+		return;
+
+	if (sdkp->capacity & (sdkp->zone_blocks - 1))
+		sd_printk(KERN_NOTICE, sdkp,
+			  "%u zones of %u logical blocks + 1 runt zone\n",
+			  sdkp->nr_zones - 1,
+			  sdkp->zone_blocks);
+	else
+		sd_printk(KERN_NOTICE, sdkp,
+			  "%u zones of %u logical blocks\n",
+			  sdkp->nr_zones,
+			  sdkp->zone_blocks);
+}
diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h
index d1defd1ebd95c6..6ba66e01f6df33 100644
--- a/include/scsi/scsi_proto.h
+++ b/include/scsi/scsi_proto.h
@@ -299,4 +299,21 @@ struct scsi_lun {
 #define SCSI_ACCESS_STATE_MASK        0x0f
 #define SCSI_ACCESS_STATE_PREFERRED   0x80
 
+/* Reporting options for REPORT ZONES */
+enum zbc_zone_reporting_options {
+	ZBC_ZONE_REPORTING_OPTION_ALL = 0,
+	ZBC_ZONE_REPORTING_OPTION_EMPTY,
+	ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN,
+	ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN,
+	ZBC_ZONE_REPORTING_OPTION_CLOSED,
+	ZBC_ZONE_REPORTING_OPTION_FULL,
+	ZBC_ZONE_REPORTING_OPTION_READONLY,
+	ZBC_ZONE_REPORTING_OPTION_OFFLINE,
+	ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP = 0x10,
+	ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE,
+	ZBC_ZONE_REPORTING_OPTION_NON_WP = 0x3f,
+};
+
+#define ZBC_REPORT_ZONE_PARTIAL 0x80
+
 #endif /* _SCSI_PROTO_H_ */

From 3c4da75814c4b8871116940eb32d3a5243026918 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 21 Oct 2016 17:42:33 +0200
Subject: [PATCH 008/182] block: zoned: fix harmless maybe-uninitialized
 warning

The blkdev_report_zones produces a harmless warning when
-Wmaybe-uninitialized is set, after gcc gets a little confused
about the multiple 'goto' here:

block/blk-zoned.c: In function 'blkdev_report_zones':
block/blk-zoned.c:188:13: error: 'nz' may be used uninitialized in this function [-Werror=maybe-uninitialized]

Moving the assignment to nr_zones makes this a little simpler
while also avoiding the warning reliably. I'm removing the
extraneous initialization of 'int ret' in the same patch, as
that is semi-related and could cause an uninitialized use of
that variable to not produce a warning.

Fixes: 6a0cb1bc106f ("block: Implement support for zoned block devices")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-zoned.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 667f95d8669590..472211fa183a64 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -80,7 +80,7 @@ int blkdev_report_zones(struct block_device *bdev,
 	unsigned int i, n, nz;
 	unsigned int ofst;
 	void *addr;
-	int ret = 0;
+	int ret;
 
 	if (!q)
 		return -ENXIO;
@@ -179,14 +179,12 @@ int blkdev_report_zones(struct block_device *bdev,
 
 	}
 
+	*nr_zones = nz;
 out:
 	bio_for_each_segment_all(bv, bio, i)
 		__free_page(bv->bv_page);
 	bio_put(bio);
 
-	if (ret == 0)
-		*nr_zones = nz;
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_report_zones);

From 5f2808ff1582202175d3908973eb9e05a5e6625d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 21 Oct 2016 17:32:24 +0200
Subject: [PATCH 009/182] sd: fix uninitialized variable access in error
 handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If sd_zbc_report_zones fails, the check for 'zone_blocks == 0'
later in the function accesses uninitialized data:

drivers/scsi/sd_zbc.c: In function ‘sd_zbc_read_zones’:
drivers/scsi/sd_zbc.c:520:7: error: ‘zone_blocks’ may be used uninitialized in this function [-Werror=maybe-uninitialized]

This sets it to zero, which has the desired effect of leaving
the sd_zbc_read_zones successfully with sdkp->zone_blocks = 0.

Fixes: 89d947561077 ("sd: Implement support for ZBC devices")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/scsi/sd_zbc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 16d3fa62d8acbf..d5b3bd915d9eb4 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -455,8 +455,10 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 
 	/* Do a report zone to get the same field */
 	ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0);
-	if (ret)
+	if (ret) {
+		zone_blocks = 0;
 		goto out;
+	}
 
 	same = buf[4] & 0x0f;
 	if (same > 0) {

From 366f4aea649a65c3735d91b4409d84c771811290 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Oct 2016 13:53:27 +0200
Subject: [PATCH 010/182] brd: Switch rd_size to unsigned long

Currently rd_size was int which lead to overflow and bogus device size
once the requested ramdisk size was 1 TB or more. Although these days
ramdisks with 1 TB size are mostly a mistake, the days when they are
useful are not far.

Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/brd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 0c76d4016eebeb..9d533585719169 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -443,8 +443,8 @@ static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
 module_param(rd_nr, int, S_IRUGO);
 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
 
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-module_param(rd_size, int, S_IRUGO);
+unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
+module_param(rd_size, ulong, S_IRUGO);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
 
 static int max_part = 1;

From ff26956875c2f05e12ecec9938411a2c7dfc767d Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 25 Oct 2016 08:46:23 -0600
Subject: [PATCH 011/182] brd: remove support for BLKFLSBUF

Discontinue having the brd driver destructively free all pages in the
ramdisk in response to the BLKFLSBUF ioctl.  Doing so allows a BLKFLSBUF
ioctl issued to a logical partition to destroy pages of the parent brd
device (and all other partitions of that brd device).

This change breaks compatibility - but in this case the compatibility
breaks more than it helps.

Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/brd.c | 35 -----------------------------------
 1 file changed, 35 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 9d533585719169..ad793f35632c6f 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -395,44 +395,9 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
 #define brd_direct_access NULL
 #endif
 
-static int brd_ioctl(struct block_device *bdev, fmode_t mode,
-			unsigned int cmd, unsigned long arg)
-{
-	int error;
-	struct brd_device *brd = bdev->bd_disk->private_data;
-
-	if (cmd != BLKFLSBUF)
-		return -ENOTTY;
-
-	/*
-	 * ram device BLKFLSBUF has special semantics, we want to actually
-	 * release and destroy the ramdisk data.
-	 */
-	mutex_lock(&brd_mutex);
-	mutex_lock(&bdev->bd_mutex);
-	error = -EBUSY;
-	if (bdev->bd_openers <= 1) {
-		/*
-		 * Kill the cache first, so it isn't written back to the
-		 * device.
-		 *
-		 * Another thread might instantiate more buffercache here,
-		 * but there is not much we can do to close that race.
-		 */
-		kill_bdev(bdev);
-		brd_free_pages(brd);
-		error = 0;
-	}
-	mutex_unlock(&bdev->bd_mutex);
-	mutex_unlock(&brd_mutex);
-
-	return error;
-}
-
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		brd_rw_page,
-	.ioctl =		brd_ioctl,
 	.direct_access =	brd_direct_access,
 };
 

From 7dd2fb6877b601a0b9c7284534ca4916cd7ac4bd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 27 Oct 2016 09:49:19 -0600
Subject: [PATCH 012/182] blk-mq: update hardware and software queues for
 sleeping alloc

If we end up sleeping due to running out of requests, we should
update the hardware and software queues in the map ctx structure.
Otherwise we could end up having rq->mq_ctx point to the pre-sleep
context, and risk corrupting ctx->rq_list since we'll be
grabbing the wrong lock when inserting the request.

Reported-by: Dave Jones <davej@codemonkey.org.uk>
Reported-by: Chris Mason <clm@fb.com>
Tested-by: Chris Mason <clm@fb.com>
Fixes: 63581af3f31e ("blk-mq: remove non-blocking pass in blk_mq_map_request")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ddc2eed6477146..f3d27a6dee09df 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1217,9 +1217,9 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
 	rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
 
-	hctx->queued++;
-	data->hctx = hctx;
-	data->ctx = ctx;
+	data->hctx = alloc_data.hctx;
+	data->ctx = alloc_data.ctx;
+	data->hctx->queued++;
 	return rq;
 }
 

From 2552e3f878c2b43b41d7728a328821d8220c28da Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 27 Oct 2016 19:03:55 -0600
Subject: [PATCH 013/182] blk-mq: get rid of confusing blk_map_ctx structure

We can just use struct blk_mq_alloc_data - it has a few more
members, but we allocate it further down the stack anyway. So
this cleans up the code, and reduces the stack overhead a bit.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f3d27a6dee09df..d74a74a9f9efe9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1190,21 +1190,15 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
-struct blk_map_ctx {
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-};
-
 static struct request *blk_mq_map_request(struct request_queue *q,
 					  struct bio *bio,
-					  struct blk_map_ctx *data)
+					  struct blk_mq_alloc_data *data)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	struct request *rq;
 	int op = bio_data_dir(bio);
 	int op_flags = 0;
-	struct blk_mq_alloc_data alloc_data;
 
 	blk_queue_enter_live(q);
 	ctx = blk_mq_get_ctx(q);
@@ -1214,11 +1208,9 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 		op_flags |= REQ_SYNC;
 
 	trace_block_getrq(q, bio, op);
-	blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
-	rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
+	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+	rq = __blk_mq_alloc_request(data, op, op_flags);
 
-	data->hctx = alloc_data.hctx;
-	data->ctx = alloc_data.ctx;
 	data->hctx->queued++;
 	return rq;
 }
@@ -1267,7 +1259,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
 	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
-	struct blk_map_ctx data;
+	struct blk_mq_alloc_data data;
 	struct request *rq;
 	unsigned int request_count = 0;
 	struct blk_plug *plug;
@@ -1363,7 +1355,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_plug *plug;
 	unsigned int request_count = 0;
-	struct blk_map_ctx data;
+	struct blk_mq_alloc_data data;
 	struct request *rq;
 	blk_qc_t cookie;
 

From c4aebd0332da831a3403faf2035af45059ab6b7c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:12:09 +0200
Subject: [PATCH 014/182] block: remove bio_is_rw

With the addition of the zoned operations the tests in this function
became incorrect.  But I think it's much better to just open code the
allow operations in the only caller anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c |  2 +-
 include/linux/bio.h   | 11 -----------
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 63f72f00c72eef..5384713d48bc99 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -172,7 +172,7 @@ bool bio_integrity_enabled(struct bio *bio)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 
-	if (!bio_is_rw(bio))
+	if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
 		return false;
 
 	/* Already protected? */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 97cb48f03dc73c..87ce64dafb93ad 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -83,17 +83,6 @@ static inline bool bio_no_advance_iter(struct bio *bio)
 	       bio_op(bio) == REQ_OP_WRITE_SAME;
 }
 
-static inline bool bio_is_rw(struct bio *bio)
-{
-	if (!bio_has_data(bio))
-		return false;
-
-	if (bio_no_advance_iter(bio))
-		return false;
-
-	return true;
-}
-
 static inline bool bio_mergeable(struct bio *bio)
 {
 	if (bio->bi_opf & REQ_NOMERGE_FLAGS)

From bd1c1c21741cbd6e894960bcbc8b36f719590064 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:12:10 +0200
Subject: [PATCH 015/182] block: REQ_NOMERGE is common to the bio and request

So move it into the common setion of the request flags.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dd50dce89a8091..b54142534793b1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -158,6 +158,7 @@ enum rq_flag_bits {
 	__REQ_META,		/* metadata io request */
 	__REQ_PRIO,		/* boost priority in cfq */
 
+	__REQ_NOMERGE,		/* don't touch this for merging */
 	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
 	__REQ_FUA,		/* forced unit access */
@@ -171,7 +172,6 @@ enum rq_flag_bits {
 	/* request only flags */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
-	__REQ_NOMERGE,		/* don't touch this for merging */
 	__REQ_STARTED,		/* drive already may have started this one */
 	__REQ_DONTPREP,		/* don't call prep for this one */
 	__REQ_QUEUED,		/* uses queueing */

From 188bd2b16b3c6ea87a90df20f33db0adcdb75f0c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:12:11 +0200
Subject: [PATCH 016/182] block: move REQ_RAHEAD to common flags

The information that am I/O is a read-ahead can be useful for drivers.
In fact the NVMe driver already checks it, even if it won't ever be set
at the moment.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index b54142534793b1..44f9bca332e514 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -163,9 +163,9 @@ enum rq_flag_bits {
 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
 	__REQ_FUA,		/* forced unit access */
 	__REQ_PREFLUSH,		/* request for cache flush */
+	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 
 	/* bio only flags */
-	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_THROTTLED,	/* This bio has already been subjected to
 				 * throttling rules. Don't do it again. */
 
@@ -205,7 +205,7 @@ enum rq_flag_bits {
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
-	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
+	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_RAHEAD)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 /* This mask is used for both bio and request merge checking */

From 8d2bbd4c8236e9e38e6b36ac9e2c54fdcfe5b335 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:12:12 +0200
Subject: [PATCH 017/182] block: replace REQ_THROTTLED with a bio flag

It's the last bio-only REQ_* flag, and we have space for it in the bio
bi_flags field.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-throttle.c      | 10 +++++-----
 include/linux/blk_types.h |  8 ++------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a3ea8260c94c89..a6bb4fe326c39b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -818,13 +818,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	tg->io_disp[rw]++;
 
 	/*
-	 * REQ_THROTTLED is used to prevent the same bio to be throttled
+	 * BIO_THROTTLED is used to prevent the same bio to be throttled
 	 * more than once as a throttled bio will go through blk-throtl the
 	 * second time when it eventually gets issued.  Set it when a bio
 	 * is being charged to a tg.
 	 */
-	if (!(bio->bi_opf & REQ_THROTTLED))
-		bio->bi_opf |= REQ_THROTTLED;
+	if (!bio_flagged(bio, BIO_THROTTLED))
+		bio_set_flag(bio, BIO_THROTTLED);
 }
 
 /**
@@ -1401,7 +1401,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* see throtl_charge_bio() */
-	if ((bio->bi_opf & REQ_THROTTLED) || !tg->has_rules[rw])
+	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
 		goto out;
 
 	spin_lock_irq(q->queue_lock);
@@ -1480,7 +1480,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	 * being issued.
 	 */
 	if (!throttled)
-		bio->bi_opf &= ~REQ_THROTTLED;
+		bio_clear_flag(bio, BIO_THROTTLED);
 	return throttled;
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 44f9bca332e514..6df722de2e2267 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -119,6 +119,8 @@ struct bio {
 #define BIO_QUIET	6	/* Make BIO Quiet */
 #define BIO_CHAIN	7	/* chained bio, ->bi_remaining in effect */
 #define BIO_REFFED	8	/* bio has elevated ->bi_cnt */
+#define BIO_THROTTLED	9	/* This bio has already been subjected to
+				 * throttling rules. Don't do it again. */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
@@ -165,10 +167,6 @@ enum rq_flag_bits {
 	__REQ_PREFLUSH,		/* request for cache flush */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 
-	/* bio only flags */
-	__REQ_THROTTLED,	/* This bio has already been subjected to
-				 * throttling rules. Don't do it again. */
-
 	/* request only flags */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
@@ -213,8 +211,6 @@ enum rq_flag_bits {
 	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_PREFLUSH | REQ_FUA | REQ_FLUSH_SEQ)
 
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
-#define REQ_THROTTLED		(1ULL << __REQ_THROTTLED)
-
 #define REQ_SORTED		(1ULL << __REQ_SORTED)
 #define REQ_SOFTBARRIER		(1ULL << __REQ_SOFTBARRIER)
 #define REQ_FUA			(1ULL << __REQ_FUA)

From e806402130c9c494e22c73ae9ead4e79d2a5811c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:12:13 +0200
Subject: [PATCH 018/182] block: split out request-only flags into a new
 namespace

A lot of the REQ_* flags are only used on struct requests, and only of
use to the block layer and a few drivers that dig into struct request
internals.

This patch adds a new req_flags_t rq_flags field to struct request for
them, and thus dramatically shrinks the number of common requests.  It
also removes the unfortunate situation where we have to fit the fields
from the same enum into 32 bits for struct bio and 64 bits for
struct request.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/biodoc.txt              |  2 +-
 block/blk-core.c                            | 71 +++++++++----------
 block/blk-exec.c                            |  2 +-
 block/blk-flush.c                           |  9 +--
 block/blk-map.c                             |  4 +-
 block/blk-merge.c                           |  8 +--
 block/blk-mq.c                              | 19 +++---
 block/blk-tag.c                             |  6 +-
 block/blk.h                                 |  4 +-
 block/elevator.c                            | 32 +++++----
 drivers/block/pktcdvd.c                     |  2 +-
 drivers/ide/ide-atapi.c                     |  6 +-
 drivers/ide/ide-cd.c                        | 46 ++++++-------
 drivers/ide/ide-cd.h                        |  2 +-
 drivers/ide/ide-cd_ioctl.c                  |  6 +-
 drivers/ide/ide-io.c                        |  6 +-
 drivers/ide/ide-pm.c                        |  4 +-
 drivers/md/dm-rq.c                          | 12 ++--
 drivers/memstick/core/ms_block.c            |  2 +-
 drivers/memstick/core/mspro_block.c         |  2 +-
 drivers/mmc/card/block.c                    |  4 +-
 drivers/mmc/card/queue.c                    |  4 +-
 drivers/nvme/host/pci.c                     |  4 +-
 drivers/scsi/device_handler/scsi_dh_alua.c  |  8 ++-
 drivers/scsi/device_handler/scsi_dh_emc.c   |  2 +-
 drivers/scsi/device_handler/scsi_dh_hp_sw.c |  2 +-
 drivers/scsi/device_handler/scsi_dh_rdac.c  |  2 +-
 drivers/scsi/osd/osd_initiator.c            |  2 +-
 drivers/scsi/osst.c                         |  2 +-
 drivers/scsi/scsi_error.c                   |  2 +-
 drivers/scsi/scsi_lib.c                     | 75 ++++++++++++---------
 drivers/scsi/sd.c                           |  6 +-
 drivers/scsi/sd_zbc.c                       |  2 +-
 drivers/scsi/st.c                           |  2 +-
 drivers/scsi/ufs/ufshcd.c                   |  6 +-
 include/linux/blk_types.h                   | 39 +----------
 include/linux/blkdev.h                      | 49 +++++++++++++-
 include/scsi/scsi_device.h                  |  4 +-
 38 files changed, 242 insertions(+), 218 deletions(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 918e1e0d0e78b2..6acea160298cc9 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -348,7 +348,7 @@ Drivers can now specify a request prepare function (q->prep_rq_fn) that the
 block layer would invoke to pre-build device commands for a given request,
 or perform other preparatory processing for the request. This is routine is
 called by elv_next_request(), i.e. typically just before servicing a request.
-(The prepare function would not be called for requests that have REQ_DONTPREP
+(The prepare function would not be called for requests that have RQF_DONTPREP
 enabled)
 
 Aside:
diff --git a/block/blk-core.c b/block/blk-core.c
index e4eda5d2aa5617..fd416651a6765f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -145,13 +145,13 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	if (error)
 		bio->bi_error = error;
 
-	if (unlikely(rq->cmd_flags & REQ_QUIET))
+	if (unlikely(rq->rq_flags & RQF_QUIET))
 		bio_set_flag(bio, BIO_QUIET);
 
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
 		bio_endio(bio);
 }
 
@@ -899,7 +899,7 @@ EXPORT_SYMBOL(blk_get_queue);
 
 static inline void blk_free_request(struct request_list *rl, struct request *rq)
 {
-	if (rq->cmd_flags & REQ_ELVPRIV) {
+	if (rq->rq_flags & RQF_ELVPRIV) {
 		elv_put_request(rl->q, rq);
 		if (rq->elv.icq)
 			put_io_context(rq->elv.icq->ioc);
@@ -961,14 +961,14 @@ static void __freed_request(struct request_list *rl, int sync)
  * A request has just been released.  Account for it, update the full and
  * congestion status, wake up any waiters.   Called under q->queue_lock.
  */
-static void freed_request(struct request_list *rl, int op, unsigned int flags)
+static void freed_request(struct request_list *rl, bool sync,
+		req_flags_t rq_flags)
 {
 	struct request_queue *q = rl->q;
-	int sync = rw_is_sync(op, flags);
 
 	q->nr_rqs[sync]--;
 	rl->count[sync]--;
-	if (flags & REQ_ELVPRIV)
+	if (rq_flags & RQF_ELVPRIV)
 		q->nr_rqs_elvpriv--;
 
 	__freed_request(rl, sync);
@@ -1079,6 +1079,7 @@ static struct request *__get_request(struct request_list *rl, int op,
 	struct io_cq *icq = NULL;
 	const bool is_sync = rw_is_sync(op, op_flags) != 0;
 	int may_queue;
+	req_flags_t rq_flags = RQF_ALLOCED;
 
 	if (unlikely(blk_queue_dying(q)))
 		return ERR_PTR(-ENODEV);
@@ -1127,7 +1128,7 @@ static struct request *__get_request(struct request_list *rl, int op,
 
 	/*
 	 * Decide whether the new request will be managed by elevator.  If
-	 * so, mark @op_flags and increment elvpriv.  Non-zero elvpriv will
+	 * so, mark @rq_flags and increment elvpriv.  Non-zero elvpriv will
 	 * prevent the current elevator from being destroyed until the new
 	 * request is freed.  This guarantees icq's won't be destroyed and
 	 * makes creating new ones safe.
@@ -1136,14 +1137,14 @@ static struct request *__get_request(struct request_list *rl, int op,
 	 * it will be created after releasing queue_lock.
 	 */
 	if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
-		op_flags |= REQ_ELVPRIV;
+		rq_flags |= RQF_ELVPRIV;
 		q->nr_rqs_elvpriv++;
 		if (et->icq_cache && ioc)
 			icq = ioc_lookup_icq(ioc, q);
 	}
 
 	if (blk_queue_io_stat(q))
-		op_flags |= REQ_IO_STAT;
+		rq_flags |= RQF_IO_STAT;
 	spin_unlock_irq(q->queue_lock);
 
 	/* allocate and init request */
@@ -1153,10 +1154,11 @@ static struct request *__get_request(struct request_list *rl, int op,
 
 	blk_rq_init(q, rq);
 	blk_rq_set_rl(rq, rl);
-	req_set_op_attrs(rq, op, op_flags | REQ_ALLOCED);
+	req_set_op_attrs(rq, op, op_flags);
+	rq->rq_flags = rq_flags;
 
 	/* init elvpriv */
-	if (op_flags & REQ_ELVPRIV) {
+	if (rq_flags & RQF_ELVPRIV) {
 		if (unlikely(et->icq_cache && !icq)) {
 			if (ioc)
 				icq = ioc_create_icq(ioc, q, gfp_mask);
@@ -1195,7 +1197,7 @@ static struct request *__get_request(struct request_list *rl, int op,
 	printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
 			   __func__, dev_name(q->backing_dev_info.dev));
 
-	rq->cmd_flags &= ~REQ_ELVPRIV;
+	rq->rq_flags &= ~RQF_ELVPRIV;
 	rq->elv.icq = NULL;
 
 	spin_lock_irq(q->queue_lock);
@@ -1212,7 +1214,7 @@ static struct request *__get_request(struct request_list *rl, int op,
 	 * queue, but this is pretty rare.
 	 */
 	spin_lock_irq(q->queue_lock);
-	freed_request(rl, op, op_flags);
+	freed_request(rl, is_sync, rq_flags);
 
 	/*
 	 * in the very unlikely event that allocation failed and no
@@ -1347,7 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
 
-	if (rq->cmd_flags & REQ_QUEUED)
+	if (rq->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, rq);
 
 	BUG_ON(blk_queued_rq(rq));
@@ -1409,7 +1411,7 @@ EXPORT_SYMBOL_GPL(part_round_stats);
 #ifdef CONFIG_PM
 static void blk_pm_put_request(struct request *rq)
 {
-	if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
+	if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending)
 		pm_runtime_mark_last_busy(rq->q->dev);
 }
 #else
@@ -1421,6 +1423,8 @@ static inline void blk_pm_put_request(struct request *rq) {}
  */
 void __blk_put_request(struct request_queue *q, struct request *req)
 {
+	req_flags_t rq_flags = req->rq_flags;
+
 	if (unlikely(!q))
 		return;
 
@@ -1440,16 +1444,15 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	 * Request may not have originated from ll_rw_blk. if not,
 	 * it didn't come out of our reserved rq pools
 	 */
-	if (req->cmd_flags & REQ_ALLOCED) {
-		unsigned int flags = req->cmd_flags;
-		int op = req_op(req);
+	if (rq_flags & RQF_ALLOCED) {
 		struct request_list *rl = blk_rq_rl(req);
+		bool sync = rw_is_sync(req_op(req), req->cmd_flags);
 
 		BUG_ON(!list_empty(&req->queuelist));
 		BUG_ON(ELV_ON_HASH(req));
 
 		blk_free_request(rl, req);
-		freed_request(rl, op, flags);
+		freed_request(rl, sync, rq_flags);
 		blk_put_rl(rl);
 	}
 }
@@ -2214,7 +2217,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
 	unsigned int bytes = 0;
 	struct bio *bio;
 
-	if (!(rq->cmd_flags & REQ_MIXED_MERGE))
+	if (!(rq->rq_flags & RQF_MIXED_MERGE))
 		return blk_rq_bytes(rq);
 
 	/*
@@ -2257,7 +2260,7 @@ void blk_account_io_done(struct request *req)
 	 * normal IO on queueing nor completion.  Accounting the
 	 * containing request is enough.
 	 */
-	if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
+	if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
@@ -2285,7 +2288,7 @@ static struct request *blk_pm_peek_request(struct request_queue *q,
 					   struct request *rq)
 {
 	if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
-	    (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
+	    (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM))))
 		return NULL;
 	else
 		return rq;
@@ -2361,13 +2364,13 @@ struct request *blk_peek_request(struct request_queue *q)
 		if (!rq)
 			break;
 
-		if (!(rq->cmd_flags & REQ_STARTED)) {
+		if (!(rq->rq_flags & RQF_STARTED)) {
 			/*
 			 * This is the first time the device driver
 			 * sees this request (possibly after
 			 * requeueing).  Notify IO scheduler.
 			 */
-			if (rq->cmd_flags & REQ_SORTED)
+			if (rq->rq_flags & RQF_SORTED)
 				elv_activate_rq(q, rq);
 
 			/*
@@ -2375,7 +2378,7 @@ struct request *blk_peek_request(struct request_queue *q)
 			 * it, a request that has been delayed should
 			 * not be passed by new incoming requests
 			 */
-			rq->cmd_flags |= REQ_STARTED;
+			rq->rq_flags |= RQF_STARTED;
 			trace_block_rq_issue(q, rq);
 		}
 
@@ -2384,7 +2387,7 @@ struct request *blk_peek_request(struct request_queue *q)
 			q->boundary_rq = NULL;
 		}
 
-		if (rq->cmd_flags & REQ_DONTPREP)
+		if (rq->rq_flags & RQF_DONTPREP)
 			break;
 
 		if (q->dma_drain_size && blk_rq_bytes(rq)) {
@@ -2407,11 +2410,11 @@ struct request *blk_peek_request(struct request_queue *q)
 			/*
 			 * the request may have been (partially) prepped.
 			 * we need to keep this request in the front to
-			 * avoid resource deadlock.  REQ_STARTED will
+			 * avoid resource deadlock.  RQF_STARTED will
 			 * prevent other fs requests from passing this one.
 			 */
 			if (q->dma_drain_size && blk_rq_bytes(rq) &&
-			    !(rq->cmd_flags & REQ_DONTPREP)) {
+			    !(rq->rq_flags & RQF_DONTPREP)) {
 				/*
 				 * remove the space for the drain we added
 				 * so that we don't add it again
@@ -2424,7 +2427,7 @@ struct request *blk_peek_request(struct request_queue *q)
 		} else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
 			int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO;
 
-			rq->cmd_flags |= REQ_QUIET;
+			rq->rq_flags |= RQF_QUIET;
 			/*
 			 * Mark this request as started so we don't trigger
 			 * any debug logic in the end I/O path.
@@ -2561,7 +2564,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		req->errors = 0;
 
 	if (error && req->cmd_type == REQ_TYPE_FS &&
-	    !(req->cmd_flags & REQ_QUIET)) {
+	    !(req->rq_flags & RQF_QUIET)) {
 		char *error_type;
 
 		switch (error) {
@@ -2634,7 +2637,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		req->__sector += total_bytes >> 9;
 
 	/* mixed attributes always follow the first bio */
-	if (req->cmd_flags & REQ_MIXED_MERGE) {
+	if (req->rq_flags & RQF_MIXED_MERGE) {
 		req->cmd_flags &= ~REQ_FAILFAST_MASK;
 		req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
 	}
@@ -2687,7 +2690,7 @@ void blk_unprep_request(struct request *req)
 {
 	struct request_queue *q = req->q;
 
-	req->cmd_flags &= ~REQ_DONTPREP;
+	req->rq_flags &= ~RQF_DONTPREP;
 	if (q->unprep_rq_fn)
 		q->unprep_rq_fn(q, req);
 }
@@ -2698,7 +2701,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
  */
 void blk_finish_request(struct request *req, int error)
 {
-	if (req->cmd_flags & REQ_QUEUED)
+	if (req->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(req->q, req);
 
 	BUG_ON(blk_queued_rq(req));
@@ -2708,7 +2711,7 @@ void blk_finish_request(struct request *req, int error)
 
 	blk_delete_timer(req);
 
-	if (req->cmd_flags & REQ_DONTPREP)
+	if (req->rq_flags & RQF_DONTPREP)
 		blk_unprep_request(req);
 
 	blk_account_io_done(req);
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 7ea04325d02f56..3ecb00a6cf45dd 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -72,7 +72,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	spin_lock_irq(q->queue_lock);
 
 	if (unlikely(blk_queue_dying(q))) {
-		rq->cmd_flags |= REQ_QUIET; 
+		rq->rq_flags |= RQF_QUIET;
 		rq->errors = -ENXIO;
 		__blk_end_request_all(rq, rq->errors);
 		spin_unlock_irq(q->queue_lock);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 6a14b68b91358b..3990b9cfbda51b 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -56,7 +56,7 @@
  * Once while executing DATA and again after the whole sequence is
  * complete.  The first completion updates the contained bio but doesn't
  * finish it so that the bio submitter is notified only after the whole
- * sequence is complete.  This is implemented by testing REQ_FLUSH_SEQ in
+ * sequence is complete.  This is implemented by testing RQF_FLUSH_SEQ in
  * req_bio_endio().
  *
  * The above peculiarity requires that each FLUSH/FUA request has only one
@@ -127,7 +127,7 @@ static void blk_flush_restore_request(struct request *rq)
 	rq->bio = rq->biotail;
 
 	/* make @rq a normal request */
-	rq->cmd_flags &= ~REQ_FLUSH_SEQ;
+	rq->rq_flags &= ~RQF_FLUSH_SEQ;
 	rq->end_io = rq->flush.saved_end_io;
 }
 
@@ -330,7 +330,8 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	}
 
 	flush_rq->cmd_type = REQ_TYPE_FS;
-	req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH | REQ_FLUSH_SEQ);
+	req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH);
+	flush_rq->rq_flags |= RQF_FLUSH_SEQ;
 	flush_rq->rq_disk = first_rq->rq_disk;
 	flush_rq->end_io = flush_end_io;
 
@@ -433,7 +434,7 @@ void blk_insert_flush(struct request *rq)
 	 */
 	memset(&rq->flush, 0, sizeof(rq->flush));
 	INIT_LIST_HEAD(&rq->flush.list);
-	rq->cmd_flags |= REQ_FLUSH_SEQ;
+	rq->rq_flags |= RQF_FLUSH_SEQ;
 	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
 	if (q->mq_ops) {
 		rq->end_io = mq_flush_data_end_io;
diff --git a/block/blk-map.c b/block/blk-map.c
index b8657fa8dc9af7..2c5ae5fef473ce 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -135,7 +135,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	} while (iov_iter_count(&i));
 
 	if (!bio_flagged(bio, BIO_USER_MAPPED))
-		rq->cmd_flags |= REQ_COPY_USER;
+		rq->rq_flags |= RQF_COPY_USER;
 	return 0;
 
 unmap_rq:
@@ -232,7 +232,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 
 	if (do_copy)
-		rq->cmd_flags |= REQ_COPY_USER;
+		rq->rq_flags |= RQF_COPY_USER;
 
 	ret = blk_rq_append_bio(rq, bio);
 	if (unlikely(ret)) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2642e5fc8b69a0..fda6a12fc776b1 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -456,7 +456,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	if (rq->bio)
 		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
 
-	if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
+	if (unlikely(rq->rq_flags & RQF_COPY_USER) &&
 	    (blk_rq_bytes(rq) & q->dma_pad_mask)) {
 		unsigned int pad_len =
 			(q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
@@ -634,7 +634,7 @@ void blk_rq_set_mixed_merge(struct request *rq)
 	unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
 	struct bio *bio;
 
-	if (rq->cmd_flags & REQ_MIXED_MERGE)
+	if (rq->rq_flags & RQF_MIXED_MERGE)
 		return;
 
 	/*
@@ -647,7 +647,7 @@ void blk_rq_set_mixed_merge(struct request *rq)
 			     (bio->bi_opf & REQ_FAILFAST_MASK) != ff);
 		bio->bi_opf |= ff;
 	}
-	rq->cmd_flags |= REQ_MIXED_MERGE;
+	rq->rq_flags |= RQF_MIXED_MERGE;
 }
 
 static void blk_account_io_merge(struct request *req)
@@ -709,7 +709,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	 * makes sure that all involved bios have mixable attributes
 	 * set properly.
 	 */
-	if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
+	if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) ||
 	    (req->cmd_flags & REQ_FAILFAST_MASK) !=
 	    (next->cmd_flags & REQ_FAILFAST_MASK)) {
 		blk_rq_set_mixed_merge(req);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d74a74a9f9efe9..b49c6658eb05f9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -142,14 +142,13 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 			       struct request *rq, int op,
 			       unsigned int op_flags)
 {
-	if (blk_queue_io_stat(q))
-		op_flags |= REQ_IO_STAT;
-
 	INIT_LIST_HEAD(&rq->queuelist);
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = q;
 	rq->mq_ctx = ctx;
 	req_set_op_attrs(rq, op, op_flags);
+	if (blk_queue_io_stat(q))
+		rq->rq_flags |= RQF_IO_STAT;
 	/* do not touch atomic flags, it needs atomic ops against the timer */
 	rq->cpu = -1;
 	INIT_HLIST_NODE(&rq->hash);
@@ -198,7 +197,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
 		rq = data->hctx->tags->rqs[tag];
 
 		if (blk_mq_tag_busy(data->hctx)) {
-			rq->cmd_flags = REQ_MQ_INFLIGHT;
+			rq->rq_flags = RQF_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
 		}
 
@@ -298,9 +297,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
-	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+	if (rq->rq_flags & RQF_MQ_INFLIGHT)
 		atomic_dec(&hctx->nr_active);
-	rq->cmd_flags = 0;
+	rq->rq_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	blk_mq_put_tag(hctx, ctx, tag);
@@ -489,10 +488,10 @@ static void blk_mq_requeue_work(struct work_struct *work)
 	spin_unlock_irqrestore(&q->requeue_lock, flags);
 
 	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
-		if (!(rq->cmd_flags & REQ_SOFTBARRIER))
+		if (!(rq->rq_flags & RQF_SOFTBARRIER))
 			continue;
 
-		rq->cmd_flags &= ~REQ_SOFTBARRIER;
+		rq->rq_flags &= ~RQF_SOFTBARRIER;
 		list_del_init(&rq->queuelist);
 		blk_mq_insert_request(rq, true, false, false);
 	}
@@ -519,11 +518,11 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
 	 * We abuse this flag that is otherwise used by the I/O scheduler to
 	 * request head insertation from the workqueue.
 	 */
-	BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
+	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 
 	spin_lock_irqsave(&q->requeue_lock, flags);
 	if (at_head) {
-		rq->cmd_flags |= REQ_SOFTBARRIER;
+		rq->rq_flags |= RQF_SOFTBARRIER;
 		list_add(&rq->queuelist, &q->requeue_list);
 	} else {
 		list_add_tail(&rq->queuelist, &q->requeue_list);
diff --git a/block/blk-tag.c b/block/blk-tag.c
index f0344e6939d537..bae1decb6ec39c 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -270,7 +270,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 	BUG_ON(tag >= bqt->real_max_depth);
 
 	list_del_init(&rq->queuelist);
-	rq->cmd_flags &= ~REQ_QUEUED;
+	rq->rq_flags &= ~RQF_QUEUED;
 	rq->tag = -1;
 
 	if (unlikely(bqt->tag_index[tag] == NULL))
@@ -316,7 +316,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	unsigned max_depth;
 	int tag;
 
-	if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
+	if (unlikely((rq->rq_flags & RQF_QUEUED))) {
 		printk(KERN_ERR
 		       "%s: request %p for device [%s] already tagged %d",
 		       __func__, rq,
@@ -371,7 +371,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	 */
 
 	bqt->next_tag = (tag + 1) % bqt->max_depth;
-	rq->cmd_flags |= REQ_QUEUED;
+	rq->rq_flags |= RQF_QUEUED;
 	rq->tag = tag;
 	bqt->tag_index[tag] = rq;
 	blk_start_request(rq);
diff --git a/block/blk.h b/block/blk.h
index 74444c49078fc7..aa132dea598c5e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -130,7 +130,7 @@ static inline void blk_clear_rq_complete(struct request *rq)
 /*
  * Internal elevator interface
  */
-#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED)
+#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
 
 void blk_insert_flush(struct request *rq);
 
@@ -247,7 +247,7 @@ extern int blk_update_nr_requests(struct request_queue *, unsigned int);
 static inline int blk_do_io_stat(struct request *rq)
 {
 	return rq->rq_disk &&
-	       (rq->cmd_flags & REQ_IO_STAT) &&
+	       (rq->rq_flags & RQF_IO_STAT) &&
 		(rq->cmd_type == REQ_TYPE_FS);
 }
 
diff --git a/block/elevator.c b/block/elevator.c
index f7d973a56fd750..ac80f89a08424c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -245,7 +245,7 @@ EXPORT_SYMBOL(elevator_exit);
 static inline void __elv_rqhash_del(struct request *rq)
 {
 	hash_del(&rq->hash);
-	rq->cmd_flags &= ~REQ_HASHED;
+	rq->rq_flags &= ~RQF_HASHED;
 }
 
 static void elv_rqhash_del(struct request_queue *q, struct request *rq)
@@ -260,7 +260,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 
 	BUG_ON(ELV_ON_HASH(rq));
 	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
-	rq->cmd_flags |= REQ_HASHED;
+	rq->rq_flags |= RQF_HASHED;
 }
 
 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
@@ -352,7 +352,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 {
 	sector_t boundary;
 	struct list_head *entry;
-	int stop_flags;
 
 	if (q->last_merge == rq)
 		q->last_merge = NULL;
@@ -362,7 +361,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 	q->nr_sorted--;
 
 	boundary = q->end_sector;
-	stop_flags = REQ_SOFTBARRIER | REQ_STARTED;
 	list_for_each_prev(entry, &q->queue_head) {
 		struct request *pos = list_entry_rq(entry);
 
@@ -370,7 +368,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 			break;
 		if (rq_data_dir(rq) != rq_data_dir(pos))
 			break;
-		if (pos->cmd_flags & stop_flags)
+		if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
 			break;
 		if (blk_rq_pos(rq) >= boundary) {
 			if (blk_rq_pos(pos) < boundary)
@@ -510,7 +508,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 			     struct request *next)
 {
 	struct elevator_queue *e = q->elevator;
-	const int next_sorted = next->cmd_flags & REQ_SORTED;
+	const int next_sorted = next->rq_flags & RQF_SORTED;
 
 	if (next_sorted && e->type->ops.elevator_merge_req_fn)
 		e->type->ops.elevator_merge_req_fn(q, rq, next);
@@ -537,13 +535,13 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
 #ifdef CONFIG_PM
 static void blk_pm_requeue_request(struct request *rq)
 {
-	if (rq->q->dev && !(rq->cmd_flags & REQ_PM))
+	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
 		rq->q->nr_pending--;
 }
 
 static void blk_pm_add_request(struct request_queue *q, struct request *rq)
 {
-	if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 &&
+	if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
 	    (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
 		pm_request_resume(q->dev);
 }
@@ -563,11 +561,11 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 	 */
 	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]--;
-		if (rq->cmd_flags & REQ_SORTED)
+		if (rq->rq_flags & RQF_SORTED)
 			elv_deactivate_rq(q, rq);
 	}
 
-	rq->cmd_flags &= ~REQ_STARTED;
+	rq->rq_flags &= ~RQF_STARTED;
 
 	blk_pm_requeue_request(rq);
 
@@ -597,13 +595,13 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 
 	rq->q = q;
 
-	if (rq->cmd_flags & REQ_SOFTBARRIER) {
+	if (rq->rq_flags & RQF_SOFTBARRIER) {
 		/* barriers are scheduling boundary, update end_sector */
 		if (rq->cmd_type == REQ_TYPE_FS) {
 			q->end_sector = rq_end_sector(rq);
 			q->boundary_rq = rq;
 		}
-	} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
+	} else if (!(rq->rq_flags & RQF_ELVPRIV) &&
 		    (where == ELEVATOR_INSERT_SORT ||
 		     where == ELEVATOR_INSERT_SORT_MERGE))
 		where = ELEVATOR_INSERT_BACK;
@@ -611,12 +609,12 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 	switch (where) {
 	case ELEVATOR_INSERT_REQUEUE:
 	case ELEVATOR_INSERT_FRONT:
-		rq->cmd_flags |= REQ_SOFTBARRIER;
+		rq->rq_flags |= RQF_SOFTBARRIER;
 		list_add(&rq->queuelist, &q->queue_head);
 		break;
 
 	case ELEVATOR_INSERT_BACK:
-		rq->cmd_flags |= REQ_SOFTBARRIER;
+		rq->rq_flags |= RQF_SOFTBARRIER;
 		elv_drain_elevator(q);
 		list_add_tail(&rq->queuelist, &q->queue_head);
 		/*
@@ -642,7 +640,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 			break;
 	case ELEVATOR_INSERT_SORT:
 		BUG_ON(rq->cmd_type != REQ_TYPE_FS);
-		rq->cmd_flags |= REQ_SORTED;
+		rq->rq_flags |= RQF_SORTED;
 		q->nr_sorted++;
 		if (rq_mergeable(rq)) {
 			elv_rqhash_add(q, rq);
@@ -659,7 +657,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 		break;
 
 	case ELEVATOR_INSERT_FLUSH:
-		rq->cmd_flags |= REQ_SOFTBARRIER;
+		rq->rq_flags |= RQF_SOFTBARRIER;
 		blk_insert_flush(rq);
 		break;
 	default:
@@ -735,7 +733,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	 */
 	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]--;
-		if ((rq->cmd_flags & REQ_SORTED) &&
+		if ((rq->rq_flags & RQF_SORTED) &&
 		    e->type->ops.elevator_completed_req_fn)
 			e->type->ops.elevator_completed_req_fn(q, rq);
 	}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 90fa4ac149dbe0..7cf795e0fc8d63 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -721,7 +721,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 
 	rq->timeout = 60*HZ;
 	if (cgc->quiet)
-		rq->cmd_flags |= REQ_QUIET;
+		rq->rq_flags |= RQF_QUIET;
 
 	blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
 	if (rq->errors)
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 05352f490d6088..f90ea221f7f270 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -211,7 +211,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
 	sense_rq->cmd[4] = cmd_len;
 	sense_rq->cmd_type = REQ_TYPE_ATA_SENSE;
-	sense_rq->cmd_flags |= REQ_PREEMPT;
+	sense_rq->rq_flags |= RQF_PREEMPT;
 
 	if (drive->media == ide_tape)
 		sense_rq->cmd[13] = REQ_IDETAPE_PC1;
@@ -295,7 +295,7 @@ int ide_cd_expiry(ide_drive_t *drive)
 		wait = ATAPI_WAIT_PC;
 		break;
 	default:
-		if (!(rq->cmd_flags & REQ_QUIET))
+		if (!(rq->rq_flags & RQF_QUIET))
 			printk(KERN_INFO PFX "cmd 0x%x timed out\n",
 					 rq->cmd[0]);
 		wait = 0;
@@ -375,7 +375,7 @@ int ide_check_ireason(ide_drive_t *drive, struct request *rq, int len,
 	}
 
 	if (dev_is_idecd(drive) && rq->cmd_type == REQ_TYPE_ATA_PC)
-		rq->cmd_flags |= REQ_FAILED;
+		rq->rq_flags |= RQF_FAILED;
 
 	return 1;
 }
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index bf9a2ad296edea..9cbd217bc0c920 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -98,7 +98,7 @@ static int cdrom_log_sense(ide_drive_t *drive, struct request *rq)
 	struct request_sense *sense = &drive->sense_data;
 	int log = 0;
 
-	if (!sense || !rq || (rq->cmd_flags & REQ_QUIET))
+	if (!sense || !rq || (rq->rq_flags & RQF_QUIET))
 		return 0;
 
 	ide_debug_log(IDE_DBG_SENSE, "sense_key: 0x%x", sense->sense_key);
@@ -291,7 +291,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 		 * (probably while trying to recover from a former error).
 		 * Just give up.
 		 */
-		rq->cmd_flags |= REQ_FAILED;
+		rq->rq_flags |= RQF_FAILED;
 		return 2;
 	}
 
@@ -311,7 +311,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 			cdrom_saw_media_change(drive);
 
 			if (rq->cmd_type == REQ_TYPE_FS &&
-			    !(rq->cmd_flags & REQ_QUIET))
+			    !(rq->rq_flags & RQF_QUIET))
 				printk(KERN_ERR PFX "%s: tray open\n",
 					drive->name);
 		}
@@ -346,7 +346,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 		 * No point in retrying after an illegal request or data
 		 * protect error.
 		 */
-		if (!(rq->cmd_flags & REQ_QUIET))
+		if (!(rq->rq_flags & RQF_QUIET))
 			ide_dump_status(drive, "command error", stat);
 		do_end_request = 1;
 		break;
@@ -355,14 +355,14 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 		 * No point in re-trying a zillion times on a bad sector.
 		 * If we got here the error is not correctable.
 		 */
-		if (!(rq->cmd_flags & REQ_QUIET))
+		if (!(rq->rq_flags & RQF_QUIET))
 			ide_dump_status(drive, "media error "
 					"(bad sector)", stat);
 		do_end_request = 1;
 		break;
 	case BLANK_CHECK:
 		/* disk appears blank? */
-		if (!(rq->cmd_flags & REQ_QUIET))
+		if (!(rq->rq_flags & RQF_QUIET))
 			ide_dump_status(drive, "media error (blank)",
 					stat);
 		do_end_request = 1;
@@ -380,7 +380,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 	}
 
 	if (rq->cmd_type != REQ_TYPE_FS) {
-		rq->cmd_flags |= REQ_FAILED;
+		rq->rq_flags |= RQF_FAILED;
 		do_end_request = 1;
 	}
 
@@ -422,19 +422,19 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		    int write, void *buffer, unsigned *bufflen,
 		    struct request_sense *sense, int timeout,
-		    unsigned int cmd_flags)
+		    req_flags_t rq_flags)
 {
 	struct cdrom_info *info = drive->driver_data;
 	struct request_sense local_sense;
 	int retries = 10;
-	unsigned int flags = 0;
+	req_flags_t flags = 0;
 
 	if (!sense)
 		sense = &local_sense;
 
 	ide_debug_log(IDE_DBG_PC, "cmd[0]: 0x%x, write: 0x%x, timeout: %d, "
-				  "cmd_flags: 0x%x",
-				  cmd[0], write, timeout, cmd_flags);
+				  "rq_flags: 0x%x",
+				  cmd[0], write, timeout, rq_flags);
 
 	/* start of retry loop */
 	do {
@@ -446,7 +446,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		memcpy(rq->cmd, cmd, BLK_MAX_CDB);
 		rq->cmd_type = REQ_TYPE_ATA_PC;
 		rq->sense = sense;
-		rq->cmd_flags |= cmd_flags;
+		rq->rq_flags |= rq_flags;
 		rq->timeout = timeout;
 		if (buffer) {
 			error = blk_rq_map_kern(drive->queue, rq, buffer,
@@ -462,14 +462,14 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		if (buffer)
 			*bufflen = rq->resid_len;
 
-		flags = rq->cmd_flags;
+		flags = rq->rq_flags;
 		blk_put_request(rq);
 
 		/*
 		 * FIXME: we should probably abort/retry or something in case of
 		 * failure.
 		 */
-		if (flags & REQ_FAILED) {
+		if (flags & RQF_FAILED) {
 			/*
 			 * The request failed.  Retry if it was due to a unit
 			 * attention status (usually means media was changed).
@@ -494,10 +494,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		}
 
 		/* end of retry loop */
-	} while ((flags & REQ_FAILED) && retries >= 0);
+	} while ((flags & RQF_FAILED) && retries >= 0);
 
 	/* return an error if the command failed */
-	return (flags & REQ_FAILED) ? -EIO : 0;
+	return (flags & RQF_FAILED) ? -EIO : 0;
 }
 
 /*
@@ -589,7 +589,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 					"(%u bytes)\n", drive->name, __func__,
 					cmd->nleft);
 				if (!write)
-					rq->cmd_flags |= REQ_FAILED;
+					rq->rq_flags |= RQF_FAILED;
 				uptodate = 0;
 			}
 		} else if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
@@ -607,7 +607,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			}
 
 			if (!uptodate)
-				rq->cmd_flags |= REQ_FAILED;
+				rq->rq_flags |= RQF_FAILED;
 		}
 		goto out_end;
 	}
@@ -745,9 +745,9 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 				  rq->cmd[0], rq->cmd_type);
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
-		rq->cmd_flags |= REQ_QUIET;
+		rq->rq_flags |= RQF_QUIET;
 	else
-		rq->cmd_flags &= ~REQ_FAILED;
+		rq->rq_flags &= ~RQF_FAILED;
 
 	drive->dma = 0;
 
@@ -867,7 +867,7 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
 	 */
 	cmd[7] = cdi->sanyo_slot % 3;
 
-	return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, REQ_QUIET);
+	return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET);
 }
 
 static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
@@ -890,7 +890,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
 	cmd[0] = GPCMD_READ_CDVD_CAPACITY;
 
 	stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, sense, 0,
-			       REQ_QUIET);
+			       RQF_QUIET);
 	if (stat)
 		return stat;
 
@@ -943,7 +943,7 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
 	if (msf_flag)
 		cmd[1] = 2;
 
-	return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, REQ_QUIET);
+	return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, RQF_QUIET);
 }
 
 /* Try to read the entire TOC for the disk into our internal buffer. */
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 1efc936f5b6672..eea60c986c4fad 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -101,7 +101,7 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *);
 
 /* ide-cd.c functions used by ide-cd_ioctl.c */
 int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *,
-		    unsigned *, struct request_sense *, int, unsigned int);
+		    unsigned *, struct request_sense *, int, req_flags_t);
 int ide_cd_read_toc(ide_drive_t *, struct request_sense *);
 int ide_cdrom_get_capabilities(ide_drive_t *, u8 *);
 void ide_cdrom_update_speed(ide_drive_t *, u8 *);
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 5887a7a09e3702..f085e3a2e1d69e 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -305,7 +305,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	rq->cmd_type = REQ_TYPE_DRV_PRIV;
-	rq->cmd_flags = REQ_QUIET;
+	rq->rq_flags = RQF_QUIET;
 	ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
 	blk_put_request(rq);
 	/*
@@ -449,7 +449,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
 			    struct packet_command *cgc)
 {
 	ide_drive_t *drive = cdi->handle;
-	unsigned int flags = 0;
+	req_flags_t flags = 0;
 	unsigned len = cgc->buflen;
 
 	if (cgc->timeout <= 0)
@@ -463,7 +463,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
 		memset(cgc->sense, 0, sizeof(struct request_sense));
 
 	if (cgc->quiet)
-		flags |= REQ_QUIET;
+		flags |= RQF_QUIET;
 
 	cgc->stat = ide_cd_queue_pc(drive, cgc->cmd,
 				    cgc->data_direction == CGC_DATA_WRITE,
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 669ea1e4579586..6360bbd37efe52 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -307,7 +307,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 {
 	ide_startstop_t startstop;
 
-	BUG_ON(!(rq->cmd_flags & REQ_STARTED));
+	BUG_ON(!(rq->rq_flags & RQF_STARTED));
 
 #ifdef DEBUG
 	printk("%s: start_request: current=0x%08lx\n",
@@ -316,7 +316,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 
 	/* bail early if we've exceeded max_failures */
 	if (drive->max_failures && (drive->failures > drive->max_failures)) {
-		rq->cmd_flags |= REQ_FAILED;
+		rq->rq_flags |= RQF_FAILED;
 		goto kill_rq;
 	}
 
@@ -539,7 +539,7 @@ void do_ide_request(struct request_queue *q)
 		 */
 		if ((drive->dev_flags & IDE_DFLAG_BLOCKED) &&
 		    ata_pm_request(rq) == 0 &&
-		    (rq->cmd_flags & REQ_PREEMPT) == 0) {
+		    (rq->rq_flags & RQF_PREEMPT) == 0) {
 			/* there should be no pending command at this point */
 			ide_unlock_port(hwif);
 			goto plug_device;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index e34af488693a62..a015acdffb3933 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -53,7 +53,7 @@ static int ide_pm_execute_rq(struct request *rq)
 
 	spin_lock_irq(q->queue_lock);
 	if (unlikely(blk_queue_dying(q))) {
-		rq->cmd_flags |= REQ_QUIET;
+		rq->rq_flags |= RQF_QUIET;
 		rq->errors = -ENXIO;
 		__blk_end_request_all(rq, rq->errors);
 		spin_unlock_irq(q->queue_lock);
@@ -90,7 +90,7 @@ int generic_ide_resume(struct device *dev)
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
 	rq->cmd_type = REQ_TYPE_ATA_PM_RESUME;
-	rq->cmd_flags |= REQ_PREEMPT;
+	rq->rq_flags |= RQF_PREEMPT;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
 	rqpm.pm_state = PM_EVENT_ON;
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index dc75bea0d541b3..f76cc36b8546d7 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -313,7 +313,7 @@ static void dm_unprep_request(struct request *rq)
 
 	if (!rq->q->mq_ops) {
 		rq->special = NULL;
-		rq->cmd_flags &= ~REQ_DONTPREP;
+		rq->rq_flags &= ~RQF_DONTPREP;
 	}
 
 	if (clone)
@@ -431,7 +431,7 @@ static void dm_softirq_done(struct request *rq)
 		return;
 	}
 
-	if (rq->cmd_flags & REQ_FAILED)
+	if (rq->rq_flags & RQF_FAILED)
 		mapped = false;
 
 	dm_done(clone, tio->error, mapped);
@@ -460,7 +460,7 @@ static void dm_complete_request(struct request *rq, int error)
  */
 static void dm_kill_unmapped_request(struct request *rq, int error)
 {
-	rq->cmd_flags |= REQ_FAILED;
+	rq->rq_flags |= RQF_FAILED;
 	dm_complete_request(rq, error);
 }
 
@@ -476,7 +476,7 @@ static void end_clone_request(struct request *clone, int error)
 		 * For just cleaning up the information of the queue in which
 		 * the clone was dispatched.
 		 * The clone is *NOT* freed actually here because it is alloced
-		 * from dm own mempool (REQ_ALLOCED isn't set).
+		 * from dm own mempool (RQF_ALLOCED isn't set).
 		 */
 		__blk_put_request(clone->q, clone);
 	}
@@ -497,7 +497,7 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 	int r;
 
 	if (blk_queue_io_stat(clone->q))
-		clone->cmd_flags |= REQ_IO_STAT;
+		clone->rq_flags |= RQF_IO_STAT;
 
 	clone->start_time = jiffies;
 	r = blk_insert_cloned_request(clone->q, clone);
@@ -633,7 +633,7 @@ static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
 		return BLKPREP_DEFER;
 
 	rq->special = tio;
-	rq->cmd_flags |= REQ_DONTPREP;
+	rq->rq_flags |= RQF_DONTPREP;
 
 	return BLKPREP_OK;
 }
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index aacf584f2a42ed..f3512404bc5282 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2006,7 +2006,7 @@ static int msb_prepare_req(struct request_queue *q, struct request *req)
 		blk_dump_rq_flags(req, "MS unsupported request");
 		return BLKPREP_KILL;
 	}
-	req->cmd_flags |= REQ_DONTPREP;
+	req->rq_flags |= RQF_DONTPREP;
 	return BLKPREP_OK;
 }
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index c1472275fe57a7..fa0746d182ff50 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -834,7 +834,7 @@ static int mspro_block_prepare_req(struct request_queue *q, struct request *req)
 		return BLKPREP_KILL;
 	}
 
-	req->cmd_flags |= REQ_DONTPREP;
+	req->rq_flags |= RQF_DONTPREP;
 
 	return BLKPREP_OK;
 }
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index c3335112e68c29..f8190dd4a35cb4 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -2117,7 +2117,7 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
 		mmc_blk_abort_packed_req(mq_rq);
 	} else {
 		if (mmc_card_removed(card))
-			req->cmd_flags |= REQ_QUIET;
+			req->rq_flags |= RQF_QUIET;
 		while (ret)
 			ret = blk_end_request(req, -EIO,
 					blk_rq_cur_bytes(req));
@@ -2126,7 +2126,7 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
  start_new_req:
 	if (rqc) {
 		if (mmc_card_removed(card)) {
-			rqc->cmd_flags |= REQ_QUIET;
+			rqc->rq_flags |= RQF_QUIET;
 			blk_end_request_all(rqc, -EIO);
 		} else {
 			/*
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 8037f73a109a14..8a67f1c2ce2102 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -44,7 +44,7 @@ static int mmc_prep_request(struct request_queue *q, struct request *req)
 	if (mq && (mmc_card_removed(mq->card) || mmc_access_rpmb(mq)))
 		return BLKPREP_KILL;
 
-	req->cmd_flags |= REQ_DONTPREP;
+	req->rq_flags |= RQF_DONTPREP;
 
 	return BLKPREP_OK;
 }
@@ -120,7 +120,7 @@ static void mmc_request_fn(struct request_queue *q)
 
 	if (!mq) {
 		while ((req = blk_fetch_request(q)) != NULL) {
-			req->cmd_flags |= REQ_QUIET;
+			req->rq_flags |= RQF_QUIET;
 			__blk_end_request_all(req, -EIO);
 		}
 		return;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0fc99f0f257110..0955e9d2202035 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -323,9 +323,9 @@ static int nvme_init_iod(struct request *rq, unsigned size,
 	iod->nents = 0;
 	iod->length = size;
 
-	if (!(rq->cmd_flags & REQ_DONTPREP)) {
+	if (!(rq->rq_flags & RQF_DONTPREP)) {
 		rq->retries = 0;
-		rq->cmd_flags |= REQ_DONTPREP;
+		rq->rq_flags |= RQF_DONTPREP;
 	}
 	return 0;
 }
diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
index 241829e596680f..05813a42018882 100644
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
@@ -154,7 +154,8 @@ static int submit_rtpg(struct scsi_device *sdev, unsigned char *buff,
 	return scsi_execute_req_flags(sdev, cdb, DMA_FROM_DEVICE,
 				      buff, bufflen, sshdr,
 				      ALUA_FAILOVER_TIMEOUT * HZ,
-				      ALUA_FAILOVER_RETRIES, NULL, req_flags);
+				      ALUA_FAILOVER_RETRIES, NULL,
+				      req_flags, 0);
 }
 
 /*
@@ -187,7 +188,8 @@ static int submit_stpg(struct scsi_device *sdev, int group_id,
 	return scsi_execute_req_flags(sdev, cdb, DMA_TO_DEVICE,
 				      stpg_data, stpg_len,
 				      sshdr, ALUA_FAILOVER_TIMEOUT * HZ,
-				      ALUA_FAILOVER_RETRIES, NULL, req_flags);
+				      ALUA_FAILOVER_RETRIES, NULL,
+				      req_flags, 0);
 }
 
 static struct alua_port_group *alua_find_get_pg(char *id_str, size_t id_size,
@@ -1063,7 +1065,7 @@ static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
 		 state != SCSI_ACCESS_STATE_ACTIVE &&
 		 state != SCSI_ACCESS_STATE_LBA) {
 		ret = BLKPREP_KILL;
-		req->cmd_flags |= REQ_QUIET;
+		req->rq_flags |= RQF_QUIET;
 	}
 	return ret;
 
diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c
index 375d81850f15ad..5b80746980b8ff 100644
--- a/drivers/scsi/device_handler/scsi_dh_emc.c
+++ b/drivers/scsi/device_handler/scsi_dh_emc.c
@@ -452,7 +452,7 @@ static int clariion_prep_fn(struct scsi_device *sdev, struct request *req)
 
 	if (h->lun_state != CLARIION_LUN_OWNED) {
 		ret = BLKPREP_KILL;
-		req->cmd_flags |= REQ_QUIET;
+		req->rq_flags |= RQF_QUIET;
 	}
 	return ret;
 
diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
index 9406d5f4a3d389..308e87195dc1ec 100644
--- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c
+++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
@@ -266,7 +266,7 @@ static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
 
 	if (h->path_state != HP_SW_PATH_ACTIVE) {
 		ret = BLKPREP_KILL;
-		req->cmd_flags |= REQ_QUIET;
+		req->rq_flags |= RQF_QUIET;
 	}
 	return ret;
 
diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c
index 06fbd0b0c68a30..00d9c326158eba 100644
--- a/drivers/scsi/device_handler/scsi_dh_rdac.c
+++ b/drivers/scsi/device_handler/scsi_dh_rdac.c
@@ -724,7 +724,7 @@ static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
 
 	if (h->state != RDAC_STATE_ACTIVE) {
 		ret = BLKPREP_KILL;
-		req->cmd_flags |= REQ_QUIET;
+		req->rq_flags |= RQF_QUIET;
 	}
 	return ret;
 
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 2f2a9910e30e82..ef99f62831fb85 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1595,7 +1595,7 @@ static int _init_blk_request(struct osd_request *or,
 	}
 
 	or->request = req;
-	req->cmd_flags |= REQ_QUIET;
+	req->rq_flags |= RQF_QUIET;
 
 	req->timeout = or->timeout;
 	req->retries = or->retries;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 5033223f628770..a2960f5d98eca5 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -368,7 +368,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
 		return DRIVER_ERROR << 24;
 
 	blk_rq_set_block_pc(req);
-	req->cmd_flags |= REQ_QUIET;
+	req->rq_flags |= RQF_QUIET;
 
 	SRpnt->bio = NULL;
 
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 106a6adbd6f12d..996e134d79faaf 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 
 	req->cmd_len = COMMAND_SIZE(req->cmd[0]);
 
-	req->cmd_flags |= REQ_QUIET;
+	req->rq_flags |= RQF_QUIET;
 	req->timeout = 10 * HZ;
 	req->retries = 5;
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2cca9cffc63fde..8c52622ac2577e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -163,26 +163,11 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
 {
 	__scsi_queue_insert(cmd, reason, 1);
 }
-/**
- * scsi_execute - insert request and wait for the result
- * @sdev:	scsi device
- * @cmd:	scsi command
- * @data_direction: data direction
- * @buffer:	data buffer
- * @bufflen:	len of buffer
- * @sense:	optional sense buffer
- * @timeout:	request timeout in seconds
- * @retries:	number of times to retry request
- * @flags:	or into request flags;
- * @resid:	optional residual length
- *
- * returns the req->errors value which is the scsi_cmnd result
- * field.
- */
-int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
+
+static int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 		 int data_direction, void *buffer, unsigned bufflen,
 		 unsigned char *sense, int timeout, int retries, u64 flags,
-		 int *resid)
+		 req_flags_t rq_flags, int *resid)
 {
 	struct request *req;
 	int write = (data_direction == DMA_TO_DEVICE);
@@ -203,7 +188,8 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	req->sense_len = 0;
 	req->retries = retries;
 	req->timeout = timeout;
-	req->cmd_flags |= flags | REQ_QUIET | REQ_PREEMPT;
+	req->cmd_flags |= flags;
+	req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT;
 
 	/*
 	 * head injection *required* here otherwise quiesce won't work
@@ -227,12 +213,37 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 
 	return ret;
 }
+
+/**
+ * scsi_execute - insert request and wait for the result
+ * @sdev:	scsi device
+ * @cmd:	scsi command
+ * @data_direction: data direction
+ * @buffer:	data buffer
+ * @bufflen:	len of buffer
+ * @sense:	optional sense buffer
+ * @timeout:	request timeout in seconds
+ * @retries:	number of times to retry request
+ * @flags:	or into request flags;
+ * @resid:	optional residual length
+ *
+ * returns the req->errors value which is the scsi_cmnd result
+ * field.
+ */
+int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
+		 int data_direction, void *buffer, unsigned bufflen,
+		 unsigned char *sense, int timeout, int retries, u64 flags,
+		 int *resid)
+{
+	return __scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense,
+			timeout, retries, flags, 0, resid);
+}
 EXPORT_SYMBOL(scsi_execute);
 
 int scsi_execute_req_flags(struct scsi_device *sdev, const unsigned char *cmd,
 		     int data_direction, void *buffer, unsigned bufflen,
 		     struct scsi_sense_hdr *sshdr, int timeout, int retries,
-		     int *resid, u64 flags)
+		     int *resid, u64 flags, req_flags_t rq_flags)
 {
 	char *sense = NULL;
 	int result;
@@ -242,8 +253,8 @@ int scsi_execute_req_flags(struct scsi_device *sdev, const unsigned char *cmd,
 		if (!sense)
 			return DRIVER_ERROR << 24;
 	}
-	result = scsi_execute(sdev, cmd, data_direction, buffer, bufflen,
-			      sense, timeout, retries, flags, resid);
+	result = __scsi_execute(sdev, cmd, data_direction, buffer, bufflen,
+			      sense, timeout, retries, flags, rq_flags, resid);
 	if (sshdr)
 		scsi_normalize_sense(sense, SCSI_SENSE_BUFFERSIZE, sshdr);
 
@@ -813,7 +824,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		 */
 		if ((sshdr.asc == 0x0) && (sshdr.ascq == 0x1d))
 			;
-		else if (!(req->cmd_flags & REQ_QUIET))
+		else if (!(req->rq_flags & RQF_QUIET))
 			scsi_print_sense(cmd);
 		result = 0;
 		/* BLOCK_PC may have set error */
@@ -943,7 +954,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 	switch (action) {
 	case ACTION_FAIL:
 		/* Give up and fail the remainder of the request */
-		if (!(req->cmd_flags & REQ_QUIET)) {
+		if (!(req->rq_flags & RQF_QUIET)) {
 			static DEFINE_RATELIMIT_STATE(_rs,
 					DEFAULT_RATELIMIT_INTERVAL,
 					DEFAULT_RATELIMIT_BURST);
@@ -972,7 +983,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		 * A new command will be prepared and issued.
 		 */
 		if (q->mq_ops) {
-			cmd->request->cmd_flags &= ~REQ_DONTPREP;
+			cmd->request->rq_flags &= ~RQF_DONTPREP;
 			scsi_mq_uninit_cmd(cmd);
 			scsi_mq_requeue_cmd(cmd);
 		} else {
@@ -1234,7 +1245,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
 			/*
 			 * If the devices is blocked we defer normal commands.
 			 */
-			if (!(req->cmd_flags & REQ_PREEMPT))
+			if (!(req->rq_flags & RQF_PREEMPT))
 				ret = BLKPREP_DEFER;
 			break;
 		default:
@@ -1243,7 +1254,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
 			 * special commands.  In particular any user initiated
 			 * command is not allowed.
 			 */
-			if (!(req->cmd_flags & REQ_PREEMPT))
+			if (!(req->rq_flags & RQF_PREEMPT))
 				ret = BLKPREP_KILL;
 			break;
 		}
@@ -1279,7 +1290,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret)
 			blk_delay_queue(q, SCSI_QUEUE_DELAY);
 		break;
 	default:
-		req->cmd_flags |= REQ_DONTPREP;
+		req->rq_flags |= RQF_DONTPREP;
 	}
 
 	return ret;
@@ -1736,7 +1747,7 @@ static void scsi_request_fn(struct request_queue *q)
 		 * we add the dev to the starved list so it eventually gets
 		 * a run when a tag is freed.
 		 */
-		if (blk_queue_tagged(q) && !(req->cmd_flags & REQ_QUEUED)) {
+		if (blk_queue_tagged(q) && !(req->rq_flags & RQF_QUEUED)) {
 			spin_lock_irq(shost->host_lock);
 			if (list_empty(&sdev->starved_entry))
 				list_add_tail(&sdev->starved_entry,
@@ -1903,11 +1914,11 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 		goto out_dec_target_busy;
 
 
-	if (!(req->cmd_flags & REQ_DONTPREP)) {
+	if (!(req->rq_flags & RQF_DONTPREP)) {
 		ret = prep_to_mq(scsi_mq_prep_fn(req));
 		if (ret)
 			goto out_dec_host_busy;
-		req->cmd_flags |= REQ_DONTPREP;
+		req->rq_flags |= RQF_DONTPREP;
 	} else {
 		blk_mq_start_request(req);
 	}
@@ -1952,7 +1963,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 		 * we hit an error, as we will never see this command
 		 * again.
 		 */
-		if (req->cmd_flags & REQ_DONTPREP)
+		if (req->rq_flags & RQF_DONTPREP)
 			scsi_mq_uninit_cmd(cmd);
 		break;
 	default:
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index b9618ffca829bf..cef1f78031d413 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1520,7 +1520,7 @@ static int sd_sync_cache(struct scsi_disk *sdkp)
 		 */
 		res = scsi_execute_req_flags(sdp, cmd, DMA_NONE, NULL, 0,
 					     &sshdr, timeout, SD_MAX_RETRIES,
-					     NULL, REQ_PM);
+					     NULL, 0, RQF_PM);
 		if (res == 0)
 			break;
 	}
@@ -1879,7 +1879,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 
 					good_bytes = 0;
 					req->__data_len = blk_rq_bytes(req);
-					req->cmd_flags |= REQ_QUIET;
+					req->rq_flags |= RQF_QUIET;
 				}
 			}
 		}
@@ -3278,7 +3278,7 @@ static int sd_start_stop_device(struct scsi_disk *sdkp, int start)
 		return -ENODEV;
 
 	res = scsi_execute_req_flags(sdp, cmd, DMA_NONE, NULL, 0, &sshdr,
-			       SD_TIMEOUT, SD_MAX_RETRIES, NULL, REQ_PM);
+			       SD_TIMEOUT, SD_MAX_RETRIES, NULL, 0, RQF_PM);
 	if (res) {
 		sd_print_result(sdkp, "Start/Stop Unit failed", res);
 		if (driver_byte(res) & DRIVER_SENSE)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index d5b3bd915d9eb4..394ab490919c6b 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -348,7 +348,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd,
 			 * this case, so be quiet about the error.
 			 */
 			if (req_op(rq) == REQ_OP_ZONE_RESET)
-				rq->cmd_flags |= REQ_QUIET;
+				rq->rq_flags |= RQF_QUIET;
 			break;
 		case 0x21:
 			/*
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 7af5226aa55ba0..3bc46a4abd4306 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -546,7 +546,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 		return DRIVER_ERROR << 24;
 
 	blk_rq_set_block_pc(req);
-	req->cmd_flags |= REQ_QUIET;
+	req->rq_flags |= RQF_QUIET;
 
 	mdata->null_mapped = 1;
 
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 05c745663c103a..cf549871c1ee14 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -5590,7 +5590,7 @@ ufshcd_send_request_sense(struct ufs_hba *hba, struct scsi_device *sdp)
 
 	ret = scsi_execute_req_flags(sdp, cmd, DMA_FROM_DEVICE, buffer,
 				SCSI_SENSE_BUFFERSIZE, NULL,
-				msecs_to_jiffies(1000), 3, NULL, REQ_PM);
+				msecs_to_jiffies(1000), 3, NULL, 0, RQF_PM);
 	if (ret)
 		pr_err("%s: failed with err %d\n", __func__, ret);
 
@@ -5652,11 +5652,11 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba,
 
 	/*
 	 * Current function would be generally called from the power management
-	 * callbacks hence set the REQ_PM flag so that it doesn't resume the
+	 * callbacks hence set the RQF_PM flag so that it doesn't resume the
 	 * already suspended childs.
 	 */
 	ret = scsi_execute_req_flags(sdp, cmd, DMA_NONE, NULL, 0, &sshdr,
-				     START_STOP_TIMEOUT, 0, NULL, REQ_PM);
+				     START_STOP_TIMEOUT, 0, NULL, 0, RQF_PM);
 	if (ret) {
 		sdev_printk(KERN_WARNING, sdp,
 			    "START_STOP failed for power mode: %d, result %x\n",
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6df722de2e2267..ec69a8fe3b2974 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -167,26 +167,6 @@ enum rq_flag_bits {
 	__REQ_PREFLUSH,		/* request for cache flush */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 
-	/* request only flags */
-	__REQ_SORTED,		/* elevator knows about this request */
-	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
-	__REQ_STARTED,		/* drive already may have started this one */
-	__REQ_DONTPREP,		/* don't call prep for this one */
-	__REQ_QUEUED,		/* uses queueing */
-	__REQ_ELVPRIV,		/* elevator private data attached */
-	__REQ_FAILED,		/* set if the request failed */
-	__REQ_QUIET,		/* don't worry about errors */
-	__REQ_PREEMPT,		/* set for "ide_preempt" requests and also
-				   for requests for which the SCSI "quiesce"
-				   state must be ignored. */
-	__REQ_ALLOCED,		/* request came from our alloc pool */
-	__REQ_COPY_USER,	/* contains copies of user pages */
-	__REQ_FLUSH_SEQ,	/* request for flush sequence */
-	__REQ_IO_STAT,		/* account I/O stat */
-	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
-	__REQ_PM,		/* runtime pm request */
-	__REQ_HASHED,		/* on IO scheduler merge hash */
-	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -208,29 +188,12 @@ enum rq_flag_bits {
 
 /* This mask is used for both bio and request merge checking */
 #define REQ_NOMERGE_FLAGS \
-	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_PREFLUSH | REQ_FUA | REQ_FLUSH_SEQ)
+	(REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
 
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
-#define REQ_SORTED		(1ULL << __REQ_SORTED)
-#define REQ_SOFTBARRIER		(1ULL << __REQ_SOFTBARRIER)
 #define REQ_FUA			(1ULL << __REQ_FUA)
 #define REQ_NOMERGE		(1ULL << __REQ_NOMERGE)
-#define REQ_STARTED		(1ULL << __REQ_STARTED)
-#define REQ_DONTPREP		(1ULL << __REQ_DONTPREP)
-#define REQ_QUEUED		(1ULL << __REQ_QUEUED)
-#define REQ_ELVPRIV		(1ULL << __REQ_ELVPRIV)
-#define REQ_FAILED		(1ULL << __REQ_FAILED)
-#define REQ_QUIET		(1ULL << __REQ_QUIET)
-#define REQ_PREEMPT		(1ULL << __REQ_PREEMPT)
-#define REQ_ALLOCED		(1ULL << __REQ_ALLOCED)
-#define REQ_COPY_USER		(1ULL << __REQ_COPY_USER)
 #define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
-#define REQ_FLUSH_SEQ		(1ULL << __REQ_FLUSH_SEQ)
-#define REQ_IO_STAT		(1ULL << __REQ_IO_STAT)
-#define REQ_MIXED_MERGE		(1ULL << __REQ_MIXED_MERGE)
-#define REQ_PM			(1ULL << __REQ_PM)
-#define REQ_HASHED		(1ULL << __REQ_HASHED)
-#define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 
 enum req_op {
 	REQ_OP_READ,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 90097dd8b8edbb..b4415feac6795d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -78,6 +78,50 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_DRV_PRIV,		/* driver defined types from here */
 };
 
+/*
+ * request flags */
+typedef __u32 __bitwise req_flags_t;
+
+/* elevator knows about this request */
+#define RQF_SORTED		((__force req_flags_t)(1 << 0))
+/* drive already may have started this one */
+#define RQF_STARTED		((__force req_flags_t)(1 << 1))
+/* uses tagged queueing */
+#define RQF_QUEUED		((__force req_flags_t)(1 << 2))
+/* may not be passed by ioscheduler */
+#define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
+/* request for flush sequence */
+#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
+/* merge of different types, fail separately */
+#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
+/* track inflight for MQ */
+#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
+/* don't call prep for this one */
+#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
+/* set for "ide_preempt" requests and also for requests for which the SCSI
+   "quiesce" state must be ignored. */
+#define RQF_PREEMPT		((__force req_flags_t)(1 << 8))
+/* contains copies of user pages */
+#define RQF_COPY_USER		((__force req_flags_t)(1 << 9))
+/* vaguely specified driver internal error.  Ignored by the block layer */
+#define RQF_FAILED		((__force req_flags_t)(1 << 10))
+/* don't warn about errors */
+#define RQF_QUIET		((__force req_flags_t)(1 << 11))
+/* elevator private data attached */
+#define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
+/* account I/O stat */
+#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
+/* request came from our alloc pool */
+#define RQF_ALLOCED		((__force req_flags_t)(1 << 14))
+/* runtime pm request */
+#define RQF_PM			((__force req_flags_t)(1 << 15))
+/* on IO scheduler merge hash */
+#define RQF_HASHED		((__force req_flags_t)(1 << 16))
+
+/* flags that prevent us from merging requests: */
+#define RQF_NOMERGE_FLAGS \
+	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ)
+
 #define BLK_MAX_CDB	16
 
 /*
@@ -99,6 +143,7 @@ struct request {
 	int cpu;
 	unsigned cmd_type;
 	u64 cmd_flags;
+	req_flags_t rq_flags;
 	unsigned long atomic_flags;
 
 	/* the following two fields are internal, NEVER access directly */
@@ -648,7 +693,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 			     REQ_FAILFAST_DRIVER))
 
 #define blk_account_rq(rq) \
-	(((rq)->cmd_flags & REQ_STARTED) && \
+	(((rq)->rq_flags & RQF_STARTED) && \
 	 ((rq)->cmd_type == REQ_TYPE_FS))
 
 #define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
@@ -740,6 +785,8 @@ static inline bool rq_mergeable(struct request *rq)
 
 	if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
 		return false;
+	if (rq->rq_flags & RQF_NOMERGE_FLAGS)
+		return false;
 
 	return true;
 }
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 8a956314489050..8990e580b278bd 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -414,14 +414,14 @@ extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 extern int scsi_execute_req_flags(struct scsi_device *sdev,
 	const unsigned char *cmd, int data_direction, void *buffer,
 	unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,
-	int retries, int *resid, u64 flags);
+	int retries, int *resid, u64 flags, req_flags_t rq_flags);
 static inline int scsi_execute_req(struct scsi_device *sdev,
 	const unsigned char *cmd, int data_direction, void *buffer,
 	unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,
 	int retries, int *resid)
 {
 	return scsi_execute_req_flags(sdev, cmd, data_direction, buffer,
-		bufflen, sshdr, timeout, retries, resid, 0);
+		bufflen, sshdr, timeout, retries, resid, 0, 0);
 }
 extern void sdev_disable_disk_events(struct scsi_device *sdev);
 extern void sdev_enable_disk_events(struct scsi_device *sdev);

From ef295ecf090d3e86e5b742fc6ab34f1122a43773 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 28 Oct 2016 08:48:16 -0600
Subject: [PATCH 019/182] block: better op and flags encoding

Now that we don't need the common flags to overflow outside the range
of a 32-bit type we can encode them the same way for both the bio and
request fields.  This in addition allows us to place the operation
first (and make some room for more ops while we're at it) and to
stop having to shift around the operation values.

In addition this allows passing around only one value in the block layer
instead of two (and eventuall also in the file systems, but we can do
that later) and thus clean up a lot of code.

Last but not least this allows decreasing the size of the cmd_flags
field in struct request to 32-bits.  Various functions passing this
value could also be updated, but I'd like to avoid the churn for now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/biodoc.txt |  4 +-
 block/blk-core.c               | 60 ++++++++----------------
 block/blk-flush.c              |  2 +-
 block/blk-lib.c                |  2 +-
 block/blk-map.c                |  2 +
 block/blk-mq.c                 | 28 +++++-------
 block/cfq-iosched.c            | 66 +++++++++++++--------------
 block/elevator.c               |  4 +-
 drivers/md/dm-crypt.c          |  2 +-
 drivers/scsi/sd.c              |  3 +-
 fs/btrfs/inode.c               |  5 +-
 fs/buffer.c                    |  2 +-
 fs/f2fs/f2fs.h                 |  2 +-
 fs/gfs2/lops.c                 |  2 +-
 include/linux/blk-cgroup.h     | 11 ++---
 include/linux/blk_types.h      | 83 +++++++++++++++-------------------
 include/linux/blkdev.h         | 26 +----------
 include/linux/blktrace_api.h   |  2 +-
 include/linux/dm-io.h          |  2 +-
 include/linux/elevator.h       |  4 +-
 include/trace/events/bcache.h  | 12 ++---
 include/trace/events/block.h   | 31 +++++--------
 kernel/trace/blktrace.c        | 14 +++---
 23 files changed, 148 insertions(+), 221 deletions(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 6acea160298cc9..01ddeaf64b0f6c 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -553,8 +553,8 @@ struct request {
 	struct request_list *rl;
 }
 	
-See the rq_flag_bits definitions for an explanation of the various flags
-available. Some bits are used by the block layer or i/o scheduler.
+See the req_ops and req_flag_bits definitions for an explanation of the various
+flags available. Some bits are used by the block layer or i/o scheduler.
 	
 The behaviour of the various sector counts are almost the same as before,
 except that since we have multi-segment bios, current_nr_sectors refers
diff --git a/block/blk-core.c b/block/blk-core.c
index fd416651a6765f..0bfaa54d3e9f5d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1056,8 +1056,7 @@ static struct io_context *rq_ioc(struct bio *bio)
 /**
  * __get_request - get a free request
  * @rl: request list to allocate from
- * @op: REQ_OP_READ/REQ_OP_WRITE
- * @op_flags: rq_flag_bits
+ * @op: operation and flags
  * @bio: bio to allocate request for (can be %NULL)
  * @gfp_mask: allocation mask
  *
@@ -1068,23 +1067,22 @@ static struct io_context *rq_ioc(struct bio *bio)
  * Returns ERR_PTR on failure, with @q->queue_lock held.
  * Returns request pointer on success, with @q->queue_lock *not held*.
  */
-static struct request *__get_request(struct request_list *rl, int op,
-				     int op_flags, struct bio *bio,
-				     gfp_t gfp_mask)
+static struct request *__get_request(struct request_list *rl, unsigned int op,
+		struct bio *bio, gfp_t gfp_mask)
 {
 	struct request_queue *q = rl->q;
 	struct request *rq;
 	struct elevator_type *et = q->elevator->type;
 	struct io_context *ioc = rq_ioc(bio);
 	struct io_cq *icq = NULL;
-	const bool is_sync = rw_is_sync(op, op_flags) != 0;
+	const bool is_sync = op_is_sync(op);
 	int may_queue;
 	req_flags_t rq_flags = RQF_ALLOCED;
 
 	if (unlikely(blk_queue_dying(q)))
 		return ERR_PTR(-ENODEV);
 
-	may_queue = elv_may_queue(q, op, op_flags);
+	may_queue = elv_may_queue(q, op);
 	if (may_queue == ELV_MQUEUE_NO)
 		goto rq_starved;
 
@@ -1154,7 +1152,7 @@ static struct request *__get_request(struct request_list *rl, int op,
 
 	blk_rq_init(q, rq);
 	blk_rq_set_rl(rq, rl);
-	req_set_op_attrs(rq, op, op_flags);
+	rq->cmd_flags = op;
 	rq->rq_flags = rq_flags;
 
 	/* init elvpriv */
@@ -1232,8 +1230,7 @@ static struct request *__get_request(struct request_list *rl, int op,
 /**
  * get_request - get a free request
  * @q: request_queue to allocate request from
- * @op: REQ_OP_READ/REQ_OP_WRITE
- * @op_flags: rq_flag_bits
+ * @op: operation and flags
  * @bio: bio to allocate request for (can be %NULL)
  * @gfp_mask: allocation mask
  *
@@ -1244,18 +1241,17 @@ static struct request *__get_request(struct request_list *rl, int op,
  * Returns ERR_PTR on failure, with @q->queue_lock held.
  * Returns request pointer on success, with @q->queue_lock *not held*.
  */
-static struct request *get_request(struct request_queue *q, int op,
-				   int op_flags, struct bio *bio,
-				   gfp_t gfp_mask)
+static struct request *get_request(struct request_queue *q, unsigned int op,
+		struct bio *bio, gfp_t gfp_mask)
 {
-	const bool is_sync = rw_is_sync(op, op_flags) != 0;
+	const bool is_sync = op_is_sync(op);
 	DEFINE_WAIT(wait);
 	struct request_list *rl;
 	struct request *rq;
 
 	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
-	rq = __get_request(rl, op, op_flags, bio, gfp_mask);
+	rq = __get_request(rl, op, bio, gfp_mask);
 	if (!IS_ERR(rq))
 		return rq;
 
@@ -1297,7 +1293,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 	create_io_context(gfp_mask, q->node);
 
 	spin_lock_irq(q->queue_lock);
-	rq = get_request(q, rw, 0, NULL, gfp_mask);
+	rq = get_request(q, rw, NULL, gfp_mask);
 	if (IS_ERR(rq)) {
 		spin_unlock_irq(q->queue_lock);
 		return rq;
@@ -1446,7 +1442,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	 */
 	if (rq_flags & RQF_ALLOCED) {
 		struct request_list *rl = blk_rq_rl(req);
-		bool sync = rw_is_sync(req_op(req), req->cmd_flags);
+		bool sync = op_is_sync(req->cmd_flags);
 
 		BUG_ON(!list_empty(&req->queuelist));
 		BUG_ON(ELV_ON_HASH(req));
@@ -1652,8 +1648,6 @@ unsigned int blk_plug_queued_count(struct request_queue *q)
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
 	req->cmd_type = REQ_TYPE_FS;
-
-	req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK;
 	if (bio->bi_opf & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 
@@ -1665,9 +1659,8 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 
 static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
-	const bool sync = !!(bio->bi_opf & REQ_SYNC);
 	struct blk_plug *plug;
-	int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT;
+	int el_ret, where = ELEVATOR_INSERT_SORT;
 	struct request *req;
 	unsigned int request_count = 0;
 
@@ -1722,24 +1715,11 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	}
 
 get_rq:
-	/*
-	 * This sync check and mask will be re-done in init_request_from_bio(),
-	 * but we need to set it earlier to expose the sync flag to the
-	 * rq allocator and io schedulers.
-	 */
-	if (sync)
-		rw_flags |= REQ_SYNC;
-
-	/*
-	 * Add in META/PRIO flags, if set, before we get to the IO scheduler
-	 */
-	rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO));
-
 	/*
 	 * Grab a free request. This is might sleep but can not fail.
 	 * Returns with the queue unlocked.
 	 */
-	req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO);
+	req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
 	if (IS_ERR(req)) {
 		bio->bi_error = PTR_ERR(req);
 		bio_endio(bio);
@@ -2946,8 +2926,6 @@ EXPORT_SYMBOL_GPL(__blk_end_request_err);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
-	req_set_op(rq, bio_op(bio));
-
 	if (bio_has_data(bio))
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
 
@@ -3031,8 +3009,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	req_set_op_attrs(dst, req_op(src),
-			 (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE);
+	dst->cmd_flags = src->cmd_flags | REQ_NOMERGE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
@@ -3537,8 +3514,11 @@ EXPORT_SYMBOL(blk_set_runtime_active);
 
 int __init blk_dev_init(void)
 {
-	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
+	BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
+	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
 			FIELD_SIZEOF(struct request, cmd_flags));
+	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
+			FIELD_SIZEOF(struct bio, bi_opf));
 
 	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
 	kblockd_workqueue = alloc_workqueue("kblockd",
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 3990b9cfbda51b..95f1d4d357df8e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	}
 
 	flush_rq->cmd_type = REQ_TYPE_FS;
-	req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH);
+	flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH;
 	flush_rq->rq_flags |= RQF_FLUSH_SEQ;
 	flush_rq->rq_disk = first_rq->rq_disk;
 	flush_rq->end_io = flush_end_io;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 46fe9248410d74..18abda862915ab 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	struct request_queue *q = bdev_get_queue(bdev);
 	struct bio *bio = *biop;
 	unsigned int granularity;
-	enum req_op op;
+	unsigned int op;
 	int alignment;
 	sector_t bs_mask;
 
diff --git a/block/blk-map.c b/block/blk-map.c
index 2c5ae5fef473ce..0173a72a8aa9ce 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -16,6 +16,8 @@
 int blk_rq_append_bio(struct request *rq, struct bio *bio)
 {
 	if (!rq->bio) {
+		rq->cmd_flags &= REQ_OP_MASK;
+		rq->cmd_flags |= (bio->bi_opf & REQ_OP_MASK);
 		blk_rq_bio_prep(rq->q, rq, bio);
 	} else {
 		if (!ll_back_merge_fn(rq->q, rq, bio))
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b49c6658eb05f9..2da1a0ee33182e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -139,14 +139,13 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 EXPORT_SYMBOL(blk_mq_can_queue);
 
 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-			       struct request *rq, int op,
-			       unsigned int op_flags)
+			       struct request *rq, unsigned int op)
 {
 	INIT_LIST_HEAD(&rq->queuelist);
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = q;
 	rq->mq_ctx = ctx;
-	req_set_op_attrs(rq, op, op_flags);
+	rq->cmd_flags = op;
 	if (blk_queue_io_stat(q))
 		rq->rq_flags |= RQF_IO_STAT;
 	/* do not touch atomic flags, it needs atomic ops against the timer */
@@ -183,11 +182,11 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	rq->end_io_data = NULL;
 	rq->next_rq = NULL;
 
-	ctx->rq_dispatched[rw_is_sync(op, op_flags)]++;
+	ctx->rq_dispatched[op_is_sync(op)]++;
 }
 
 static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
+__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
 {
 	struct request *rq;
 	unsigned int tag;
@@ -202,7 +201,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags)
 		}
 
 		rq->tag = tag;
-		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags);
+		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
 		return rq;
 	}
 
@@ -225,7 +224,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 	ctx = blk_mq_get_ctx(q);
 	hctx = blk_mq_map_queue(q, ctx->cpu);
 	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-	rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
+	rq = __blk_mq_alloc_request(&alloc_data, rw);
 	blk_mq_put_ctx(ctx);
 
 	if (!rq) {
@@ -277,7 +276,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 	ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
 
 	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-	rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
+	rq = __blk_mq_alloc_request(&alloc_data, rw);
 	if (!rq) {
 		ret = -EWOULDBLOCK;
 		goto out_queue_exit;
@@ -1196,19 +1195,14 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	struct request *rq;
-	int op = bio_data_dir(bio);
-	int op_flags = 0;
 
 	blk_queue_enter_live(q);
 	ctx = blk_mq_get_ctx(q);
 	hctx = blk_mq_map_queue(q, ctx->cpu);
 
-	if (rw_is_sync(bio_op(bio), bio->bi_opf))
-		op_flags |= REQ_SYNC;
-
-	trace_block_getrq(q, bio, op);
+	trace_block_getrq(q, bio, bio->bi_opf);
 	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
-	rq = __blk_mq_alloc_request(data, op, op_flags);
+	rq = __blk_mq_alloc_request(data, bio->bi_opf);
 
 	data->hctx->queued++;
 	return rq;
@@ -1256,7 +1250,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
  */
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
-	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+	const int is_sync = op_is_sync(bio->bi_opf);
 	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_mq_alloc_data data;
 	struct request *rq;
@@ -1350,7 +1344,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
  */
 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
-	const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf);
+	const int is_sync = op_is_sync(bio->bi_opf);
 	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_plug *plug;
 	unsigned int request_count = 0;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5e24d880306c2a..c96186adaa6680 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -667,10 +667,10 @@ static inline void cfqg_put(struct cfq_group *cfqg)
 } while (0)
 
 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
-					    struct cfq_group *curr_cfqg, int op,
-					    int op_flags)
+					    struct cfq_group *curr_cfqg,
+					    unsigned int op)
 {
-	blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1);
+	blkg_rwstat_add(&cfqg->stats.queued, op, 1);
 	cfqg_stats_end_empty_time(&cfqg->stats);
 	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
 }
@@ -684,30 +684,29 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 #endif
 }
 
-static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op,
-					       int op_flags)
+static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
+					       unsigned int op)
 {
-	blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1);
+	blkg_rwstat_add(&cfqg->stats.queued, op, -1);
 }
 
-static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op,
-					       int op_flags)
+static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
+					       unsigned int op)
 {
-	blkg_rwstat_add(&cfqg->stats.merged, op, op_flags, 1);
+	blkg_rwstat_add(&cfqg->stats.merged, op, 1);
 }
 
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
-			uint64_t start_time, uint64_t io_start_time, int op,
-			int op_flags)
+			uint64_t start_time, uint64_t io_start_time,
+			unsigned int op)
 {
 	struct cfqg_stats *stats = &cfqg->stats;
 	unsigned long long now = sched_clock();
 
 	if (time_after64(now, io_start_time))
-		blkg_rwstat_add(&stats->service_time, op, op_flags,
-				now - io_start_time);
+		blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
 	if (time_after64(io_start_time, start_time))
-		blkg_rwstat_add(&stats->wait_time, op, op_flags,
+		blkg_rwstat_add(&stats->wait_time, op,
 				io_start_time - start_time);
 }
 
@@ -786,16 +785,16 @@ static inline void cfqg_put(struct cfq_group *cfqg) { }
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
 
 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
-			struct cfq_group *curr_cfqg, int op, int op_flags) { }
+			struct cfq_group *curr_cfqg, unsigned int op) { }
 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 			uint64_t time, unsigned long unaccounted_time) { }
-static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op,
-			int op_flags) { }
-static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op,
-			int op_flags) { }
+static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
+			unsigned int op) { }
+static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
+			unsigned int op) { }
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
-			uint64_t start_time, uint64_t io_start_time, int op,
-			int op_flags) { }
+			uint64_t start_time, uint64_t io_start_time,
+			unsigned int op) { }
 
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 
@@ -2474,10 +2473,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
 	elv_rb_del(&cfqq->sort_list, rq);
 	cfqq->queued[rq_is_sync(rq)]--;
-	cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags);
+	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
 	cfq_add_rq_rb(rq);
 	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
-				 req_op(rq), rq->cmd_flags);
+				 rq->cmd_flags);
 }
 
 static struct request *
@@ -2530,7 +2529,7 @@ static void cfq_remove_request(struct request *rq)
 	cfq_del_rq_rb(rq);
 
 	cfqq->cfqd->rq_queued--;
-	cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags);
+	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
 	if (rq->cmd_flags & REQ_PRIO) {
 		WARN_ON(!cfqq->prio_pending);
 		cfqq->prio_pending--;
@@ -2565,7 +2564,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
 static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
-	cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf);
+	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
 }
 
 static void
@@ -2588,7 +2587,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
 	if (cfqq->next_rq == next)
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
-	cfqg_stats_update_io_merged(RQ_CFQG(rq), req_op(next), next->cmd_flags);
+	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
 
 	cfqq = RQ_CFQQ(next);
 	/*
@@ -4142,7 +4141,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	cfq_add_rq_rb(rq);
-	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, req_op(rq),
+	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
 				 rq->cmd_flags);
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
@@ -4240,8 +4239,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqq->dispatched--;
 	(RQ_CFQG(rq))->dispatched--;
 	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
-				     rq_io_start_time_ns(rq), req_op(rq),
-				     rq->cmd_flags);
+				     rq_io_start_time_ns(rq), rq->cmd_flags);
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
@@ -4319,14 +4317,14 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 		cfq_schedule_dispatch(cfqd);
 }
 
-static void cfqq_boost_on_prio(struct cfq_queue *cfqq, int op_flags)
+static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
 {
 	/*
 	 * If REQ_PRIO is set, boost class and prio level, if it's below
 	 * BE/NORM. If prio is not set, restore the potentially boosted
 	 * class/prio level.
 	 */
-	if (!(op_flags & REQ_PRIO)) {
+	if (!(op & REQ_PRIO)) {
 		cfqq->ioprio_class = cfqq->org_ioprio_class;
 		cfqq->ioprio = cfqq->org_ioprio;
 	} else {
@@ -4347,7 +4345,7 @@ static inline int __cfq_may_queue(struct cfq_queue *cfqq)
 	return ELV_MQUEUE_MAY;
 }
 
-static int cfq_may_queue(struct request_queue *q, int op, int op_flags)
+static int cfq_may_queue(struct request_queue *q, unsigned int op)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct task_struct *tsk = current;
@@ -4364,10 +4362,10 @@ static int cfq_may_queue(struct request_queue *q, int op, int op_flags)
 	if (!cic)
 		return ELV_MQUEUE_MAY;
 
-	cfqq = cic_to_cfqq(cic, rw_is_sync(op, op_flags));
+	cfqq = cic_to_cfqq(cic, op_is_sync(op));
 	if (cfqq) {
 		cfq_init_prio_data(cfqq, cic);
-		cfqq_boost_on_prio(cfqq, op_flags);
+		cfqq_boost_on_prio(cfqq, op);
 
 		return __cfq_may_queue(cfqq);
 	}
diff --git a/block/elevator.c b/block/elevator.c
index ac80f89a08424c..a18a5db274e4a0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -714,12 +714,12 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 		e->type->ops.elevator_put_req_fn(rq);
 }
 
-int elv_may_queue(struct request_queue *q, int op, int op_flags)
+int elv_may_queue(struct request_queue *q, unsigned int op)
 {
 	struct elevator_queue *e = q->elevator;
 
 	if (e->type->ops.elevator_may_queue_fn)
-		return e->type->ops.elevator_may_queue_fn(q, op, op_flags);
+		return e->type->ops.elevator_may_queue_fn(q, op);
 
 	return ELV_MQUEUE_MAY;
 }
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index a2768835d3948a..68a9eb4f3f3664 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1135,7 +1135,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_private = io;
 	clone->bi_end_io  = crypt_endio;
 	clone->bi_bdev    = cc->dev->bdev;
-	bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio));
+	clone->bi_opf	  = io->base_bio->bi_opf;
 }
 
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index cef1f78031d413..65738b0aad367d 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1031,8 +1031,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 	} else if (rq_data_dir(rq) == READ) {
 		SCpnt->cmnd[0] = READ_6;
 	} else {
-		scmd_printk(KERN_ERR, SCpnt, "Unknown command %llu,%llx\n",
-			    req_op(rq), (unsigned long long) rq->cmd_flags);
+		scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq));
 		goto out;
 	}
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2b790bda799880..9a377079af267f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8427,7 +8427,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 	if (!bio)
 		return -ENOMEM;
 
-	bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
+	bio->bi_opf = orig_bio->bi_opf;
 	bio->bi_private = dip;
 	bio->bi_end_io = btrfs_end_dio_bio;
 	btrfs_io_bio(bio)->logical = file_offset;
@@ -8465,8 +8465,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 						  start_sector, GFP_NOFS);
 			if (!bio)
 				goto out_err;
-			bio_set_op_attrs(bio, bio_op(orig_bio),
-					 bio_flags(orig_bio));
+			bio->bi_opf = orig_bio->bi_opf;
 			bio->bi_private = dip;
 			bio->bi_end_io = btrfs_end_dio_bio;
 			btrfs_io_bio(bio)->logical = file_offset;
diff --git a/fs/buffer.c b/fs/buffer.c
index b205a629001df1..a29335867e3038 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3118,7 +3118,7 @@ EXPORT_SYMBOL(submit_bh);
 /**
  * ll_rw_block: low-level access to block devices (DEPRECATED)
  * @op: whether to %READ or %WRITE
- * @op_flags: rq_flag_bits
+ * @op_flags: req_flag_bits
  * @nr: number of &struct buffer_heads in the array
  * @bhs: array of pointers to &struct buffer_head
  *
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9e8de18a168af6..2cf4f7f09e322c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -688,7 +688,7 @@ struct f2fs_io_info {
 	struct f2fs_sb_info *sbi;	/* f2fs_sb_info pointer */
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	int op;			/* contains REQ_OP_ */
-	int op_flags;		/* rq_flag_bits */
+	int op_flags;		/* req_flag_bits */
 	block_t new_blkaddr;	/* new block address to be written */
 	block_t old_blkaddr;	/* old block address before Cow */
 	struct page *page;	/* page to be written */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 49d5a1b61b0692..b1f9144b42c7fd 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio)
  * gfs2_log_flush_bio - Submit any pending log bio
  * @sdp: The superblock
  * @op: REQ_OP
- * @op_flags: rq_flag_bits
+ * @op_flags: req_flag_bits
  *
  * Submit any pending part-built or full bio to the block device. If
  * there is no pending bio, then this is a no-op.
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 3bf5d33800ab61..ddaf28d0988f9b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -581,15 +581,14 @@ static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
 /**
  * blkg_rwstat_add - add a value to a blkg_rwstat
  * @rwstat: target blkg_rwstat
- * @op: REQ_OP
- * @op_flags: rq_flag_bits
+ * @op: REQ_OP and flags
  * @val: value to add
  *
  * Add @val to @rwstat.  The counters are chosen according to @rw.  The
  * caller is responsible for synchronizing calls to this function.
  */
 static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
-				   int op, int op_flags, uint64_t val)
+				   unsigned int op, uint64_t val)
 {
 	struct percpu_counter *cnt;
 
@@ -600,7 +599,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 
 	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
 
-	if (op_flags & REQ_SYNC)
+	if (op & REQ_SYNC)
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
@@ -705,9 +704,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	if (!throtl) {
 		blkg = blkg ?: q->root_blkg;
-		blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_opf,
+		blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
 				bio->bi_iter.bi_size);
-		blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_opf, 1);
+		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
 	}
 
 	rcu_read_unlock();
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ec69a8fe3b2974..dca972d6754865 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -88,24 +88,6 @@ struct bio {
 	struct bio_vec		bi_inline_vecs[0];
 };
 
-#define BIO_OP_SHIFT	(8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS)
-#define bio_flags(bio)	((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1))
-#define bio_op(bio)	((bio)->bi_opf >> BIO_OP_SHIFT)
-
-#define bio_set_op_attrs(bio, op, op_flags) do {			\
-	if (__builtin_constant_p(op))					\
-		BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS));		\
-	else								\
-		WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS));		\
-	if (__builtin_constant_p(op_flags))				\
-		BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT));	\
-	else								\
-		WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT));	\
-	(bio)->bi_opf = bio_flags(bio);					\
-	(bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT);			\
-	(bio)->bi_opf |= (op_flags);					\
-} while (0)
-
 #define BIO_RESET_BYTES		offsetof(struct bio, bi_max_vecs)
 
 /*
@@ -147,26 +129,40 @@ struct bio {
 #endif /* CONFIG_BLOCK */
 
 /*
- * Request flags.  For use in the cmd_flags field of struct request, and in
- * bi_opf of struct bio.  Note that some flags are only valid in either one.
+ * Operations and flags common to the bio and request structures.
+ * We use 8 bits for encoding the operation, and the remaining 24 for flags.
  */
-enum rq_flag_bits {
-	/* common flags */
-	__REQ_FAILFAST_DEV,	/* no driver retries of device errors */
+#define REQ_OP_BITS	8
+#define REQ_OP_MASK	((1 << REQ_OP_BITS) - 1)
+#define REQ_FLAG_BITS	24
+
+enum req_opf {
+	REQ_OP_READ,
+	REQ_OP_WRITE,
+	REQ_OP_DISCARD,		/* request to discard sectors */
+	REQ_OP_SECURE_ERASE,	/* request to securely erase sectors */
+	REQ_OP_WRITE_SAME,	/* write same block many times */
+	REQ_OP_FLUSH,		/* request for cache flush */
+	REQ_OP_ZONE_REPORT,	/* Get zone information */
+	REQ_OP_ZONE_RESET,	/* Reset a zone write pointer */
+
+	REQ_OP_LAST,
+};
+
+enum req_flag_bits {
+	__REQ_FAILFAST_DEV =	/* no driver retries of device errors */
+		REQ_OP_BITS,
 	__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
 	__REQ_FAILFAST_DRIVER,	/* no driver retries of driver errors */
-
 	__REQ_SYNC,		/* request is sync (sync write or read) */
 	__REQ_META,		/* metadata io request */
 	__REQ_PRIO,		/* boost priority in cfq */
-
 	__REQ_NOMERGE,		/* don't touch this for merging */
 	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
 	__REQ_FUA,		/* forced unit access */
 	__REQ_PREFLUSH,		/* request for cache flush */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
-
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -176,37 +172,32 @@ enum rq_flag_bits {
 #define REQ_SYNC		(1ULL << __REQ_SYNC)
 #define REQ_META		(1ULL << __REQ_META)
 #define REQ_PRIO		(1ULL << __REQ_PRIO)
+#define REQ_NOMERGE		(1ULL << __REQ_NOMERGE)
 #define REQ_NOIDLE		(1ULL << __REQ_NOIDLE)
 #define REQ_INTEGRITY		(1ULL << __REQ_INTEGRITY)
+#define REQ_FUA			(1ULL << __REQ_FUA)
+#define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
+#define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
-#define REQ_COMMON_MASK \
-	(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
-	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_RAHEAD)
-#define REQ_CLONE_MASK		REQ_COMMON_MASK
 
-/* This mask is used for both bio and request merge checking */
 #define REQ_NOMERGE_FLAGS \
 	(REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
 
-#define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
-#define REQ_FUA			(1ULL << __REQ_FUA)
-#define REQ_NOMERGE		(1ULL << __REQ_NOMERGE)
-#define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
+#define bio_op(bio) \
+	((bio)->bi_opf & REQ_OP_MASK)
+#define req_op(req) \
+	((req)->cmd_flags & REQ_OP_MASK)
 
-enum req_op {
-	REQ_OP_READ,
-	REQ_OP_WRITE,
-	REQ_OP_DISCARD,		/* request to discard sectors */
-	REQ_OP_SECURE_ERASE,	/* request to securely erase sectors */
-	REQ_OP_WRITE_SAME,	/* write same block many times */
-	REQ_OP_FLUSH,		/* request for cache flush */
-	REQ_OP_ZONE_REPORT,	/* Get zone information */
-	REQ_OP_ZONE_RESET,	/* Reset a zone write pointer */
-};
+/* obsolete, don't use in new code */
+#define bio_set_op_attrs(bio, op, op_flags) \
+	((bio)->bi_opf |= (op | op_flags))
 
-#define REQ_OP_BITS 3
+static inline bool op_is_sync(unsigned int op)
+{
+	return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC);
+}
 
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE	-1U
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b4415feac6795d..8396da2bb6988d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -142,7 +142,7 @@ struct request {
 
 	int cpu;
 	unsigned cmd_type;
-	u64 cmd_flags;
+	unsigned int cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;
 	unsigned long atomic_flags;
 
@@ -244,20 +244,6 @@ struct request {
 	struct request *next_rq;
 };
 
-#define REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS)
-#define req_op(req)  ((req)->cmd_flags >> REQ_OP_SHIFT)
-
-#define req_set_op(req, op) do {				\
-	WARN_ON(op >= (1 << REQ_OP_BITS));			\
-	(req)->cmd_flags &= ((1ULL << REQ_OP_SHIFT) - 1);	\
-	(req)->cmd_flags |= ((u64) (op) << REQ_OP_SHIFT);	\
-} while (0)
-
-#define req_set_op_attrs(req, op, flags) do {	\
-	req_set_op(req, op);			\
-	(req)->cmd_flags |= flags;		\
-} while (0)
-
 static inline unsigned short req_get_ioprio(struct request *req)
 {
 	return req->ioprio;
@@ -741,17 +727,9 @@ static inline unsigned int blk_queue_zone_size(struct request_queue *q)
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
-/*
- * We regard a request as sync, if either a read or a sync write
- */
-static inline bool rw_is_sync(int op, unsigned int rw_flags)
-{
-	return op == REQ_OP_READ || (rw_flags & REQ_SYNC);
-}
-
 static inline bool rq_is_sync(struct request *rq)
 {
-	return rw_is_sync(req_op(rq), rq->cmd_flags);
+	return op_is_sync(rq->cmd_flags);
 }
 
 static inline bool blk_rl_full(struct request_list *rl, bool sync)
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index cceb72f9e29f53..e417f080219a68 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -118,7 +118,7 @@ static inline int blk_cmd_buf_len(struct request *rq)
 }
 
 extern void blk_dump_cmd(char *buf, struct request *rq);
-extern void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes);
+extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
 
 #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
 
diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h
index b91b023deffbb3..a52c6580cc9a93 100644
--- a/include/linux/dm-io.h
+++ b/include/linux/dm-io.h
@@ -58,7 +58,7 @@ struct dm_io_notify {
 struct dm_io_client;
 struct dm_io_request {
 	int bi_op;			/* REQ_OP */
-	int bi_op_flags;		/* rq_flag_bits */
+	int bi_op_flags;		/* req_flag_bits */
 	struct dm_io_memory mem;	/* Memory to use for io */
 	struct dm_io_notify notify;	/* Synchronous if notify.fn is NULL */
 	struct dm_io_client *client;	/* Client memory handler */
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index e7f358d2e5fc6e..f219c9aed360fe 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -30,7 +30,7 @@ typedef int (elevator_dispatch_fn) (struct request_queue *, int);
 typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
 typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
-typedef int (elevator_may_queue_fn) (struct request_queue *, int, int);
+typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int);
 
 typedef void (elevator_init_icq_fn) (struct io_cq *);
 typedef void (elevator_exit_icq_fn) (struct io_cq *);
@@ -139,7 +139,7 @@ extern struct request *elv_former_request(struct request_queue *, struct request
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
 extern int elv_register_queue(struct request_queue *q);
 extern void elv_unregister_queue(struct request_queue *q);
-extern int elv_may_queue(struct request_queue *, int, int);
+extern int elv_may_queue(struct request_queue *, unsigned int);
 extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *q, struct request *rq,
 			   struct bio *bio, gfp_t gfp_mask);
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index d336b890e31f52..df3e9ae5ad8d2b 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -27,8 +27,7 @@ DECLARE_EVENT_CLASS(bcache_request,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->orig_sector	= bio->bi_iter.bi_sector - 16;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 	),
 
 	TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
@@ -102,8 +101,7 @@ DECLARE_EVENT_CLASS(bcache_bio,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 	),
 
 	TP_printk("%d,%d  %s %llu + %u",
@@ -138,8 +136,7 @@ TRACE_EVENT(bcache_read,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 		__entry->cache_hit = hit;
 		__entry->bypass = bypass;
 	),
@@ -170,8 +167,7 @@ TRACE_EVENT(bcache_write,
 		__entry->inode		= inode;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 		__entry->writeback = writeback;
 		__entry->bypass = bypass;
 	),
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 8f3a163b8166da..3e02e3a2541373 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -84,8 +84,7 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
 					0 : blk_rq_sectors(rq);
 		__entry->errors    = rq->errors;
 
-		blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags,
-			      blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 		blk_dump_cmd(__get_str(cmd), rq);
 	),
 
@@ -163,7 +162,7 @@ TRACE_EVENT(block_rq_complete,
 		__entry->nr_sector = nr_bytes >> 9;
 		__entry->errors    = rq->errors;
 
-		blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, nr_bytes);
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
 		blk_dump_cmd(__get_str(cmd), rq);
 	),
 
@@ -199,8 +198,7 @@ DECLARE_EVENT_CLASS(block_rq,
 		__entry->bytes     = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
 					blk_rq_bytes(rq) : 0;
 
-		blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags,
-			      blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 		blk_dump_cmd(__get_str(cmd), rq);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -274,8 +272,7 @@ TRACE_EVENT(block_bio_bounce,
 					  bio->bi_bdev->bd_dev : 0;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -313,8 +310,7 @@ TRACE_EVENT(block_bio_complete,
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->error		= error;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 	),
 
 	TP_printk("%d,%d %s %llu + %u [%d]",
@@ -341,8 +337,7 @@ DECLARE_EVENT_CLASS(block_bio_merge,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -409,8 +404,7 @@ TRACE_EVENT(block_bio_queue,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->nr_sector	= bio_sectors(bio);
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -438,7 +432,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
 		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
 		__entry->sector		= bio ? bio->bi_iter.bi_sector : 0;
 		__entry->nr_sector	= bio ? bio_sectors(bio) : 0;
-		blk_fill_rwbs(__entry->rwbs, bio ? bio_op(bio) : 0,
+		blk_fill_rwbs(__entry->rwbs,
 			      bio ? bio->bi_opf : 0, __entry->nr_sector);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
         ),
@@ -573,8 +567,7 @@ TRACE_EVENT(block_split,
 		__entry->dev		= bio->bi_bdev->bd_dev;
 		__entry->sector		= bio->bi_iter.bi_sector;
 		__entry->new_sector	= new_sector;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
 
@@ -617,8 +610,7 @@ TRACE_EVENT(block_bio_remap,
 		__entry->nr_sector	= bio_sectors(bio);
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
-		blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf,
-			      bio->bi_iter.bi_size);
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
@@ -664,8 +656,7 @@ TRACE_EVENT(block_rq_remap,
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
 		__entry->nr_bios	= blk_rq_count_bios(rq);
-		blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags,
-			      blk_rq_bytes(rq));
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index dbafc5df03f3f0..95cecbf67f5ca0 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1777,14 +1777,14 @@ void blk_dump_cmd(char *buf, struct request *rq)
 	}
 }
 
-void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
+void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
 {
 	int i = 0;
 
-	if (rw & REQ_PREFLUSH)
+	if (op & REQ_PREFLUSH)
 		rwbs[i++] = 'F';
 
-	switch (op) {
+	switch (op & REQ_OP_MASK) {
 	case REQ_OP_WRITE:
 	case REQ_OP_WRITE_SAME:
 		rwbs[i++] = 'W';
@@ -1806,13 +1806,13 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
 		rwbs[i++] = 'N';
 	}
 
-	if (rw & REQ_FUA)
+	if (op & REQ_FUA)
 		rwbs[i++] = 'F';
-	if (rw & REQ_RAHEAD)
+	if (op & REQ_RAHEAD)
 		rwbs[i++] = 'A';
-	if (rw & REQ_SYNC)
+	if (op & REQ_SYNC)
 		rwbs[i++] = 'S';
-	if (rw & REQ_META)
+	if (op & REQ_META)
 		rwbs[i++] = 'M';
 
 	rwbs[i] = '\0';

From 87374179c535a98337569904727aa02f960fe79e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:12:15 +0200
Subject: [PATCH 020/182] block: add a proper block layer data direction
 encoding

Currently the block layer op_is_write, bio_data_dir and rq_data_dir
helper treat every operation that is not a READ as a data out operation.
This worked surprisingly long, but the new REQ_OP_ZONE_REPORT operation
actually adds a second operation that reads data from the device.
Surprisingly nothing critical relied on this direction, but this might
be a good opportunity to properly fix this issue up.

We take a little inspiration and use the least significant bit of the
operation number to encode the data direction, which just requires us
to renumber the operations to fix this scheme.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 38 ++++++++++++++++++++++++++++++--------
 include/linux/fs.h        |  5 -----
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dca972d6754865..3fa62cabe8d219 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -131,20 +131,37 @@ struct bio {
 /*
  * Operations and flags common to the bio and request structures.
  * We use 8 bits for encoding the operation, and the remaining 24 for flags.
+ *
+ * The least significant bit of the operation number indicates the data
+ * transfer direction:
+ *
+ *   - if the least significant bit is set transfers are TO the device
+ *   - if the least significant bit is not set transfers are FROM the device
+ *
+ * If a operation does not transfer data the least significant bit has no
+ * meaning.
  */
 #define REQ_OP_BITS	8
 #define REQ_OP_MASK	((1 << REQ_OP_BITS) - 1)
 #define REQ_FLAG_BITS	24
 
 enum req_opf {
-	REQ_OP_READ,
-	REQ_OP_WRITE,
-	REQ_OP_DISCARD,		/* request to discard sectors */
-	REQ_OP_SECURE_ERASE,	/* request to securely erase sectors */
-	REQ_OP_WRITE_SAME,	/* write same block many times */
-	REQ_OP_FLUSH,		/* request for cache flush */
-	REQ_OP_ZONE_REPORT,	/* Get zone information */
-	REQ_OP_ZONE_RESET,	/* Reset a zone write pointer */
+	/* read sectors from the device */
+	REQ_OP_READ		= 0,
+	/* write sectors to the device */
+	REQ_OP_WRITE		= 1,
+	/* flush the volatile write cache */
+	REQ_OP_FLUSH		= 2,
+	/* discard sectors */
+	REQ_OP_DISCARD		= 3,
+	/* get zone information */
+	REQ_OP_ZONE_REPORT	= 4,
+	/* securely erase sectors */
+	REQ_OP_SECURE_ERASE	= 5,
+	/* seset a zone write pointer */
+	REQ_OP_ZONE_RESET	= 6,
+	/* write the same sector many times */
+	REQ_OP_WRITE_SAME	= 7,
 
 	REQ_OP_LAST,
 };
@@ -194,6 +211,11 @@ enum req_flag_bits {
 #define bio_set_op_attrs(bio, op, op_flags) \
 	((bio)->bi_opf |= (op | op_flags))
 
+static inline bool op_is_write(unsigned int op)
+{
+	return (op & 1);
+}
+
 static inline bool op_is_sync(unsigned int op)
 {
 	return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 16d2b6e874d679..e3e878f12b2522 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2499,11 +2499,6 @@ extern void make_bad_inode(struct inode *);
 extern bool is_bad_inode(struct inode *);
 
 #ifdef CONFIG_BLOCK
-static inline bool op_is_write(unsigned int op)
-{
-	return op == REQ_OP_READ ? false : true;
-}
-
 /*
  * return data direction, READ or WRITE
  */

From aa39ebd404423e62f74cfd3e27e9ffe7e38b2a25 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:02 -0600
Subject: [PATCH 021/182] cfq-iosched: use op_is_sync instead of opencoding it

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c96186adaa6680..f28db97c3fe07e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -911,15 +911,6 @@ static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
 	return cic->icq.q->elevator->elevator_data;
 }
 
-/*
- * We regard a request as SYNC, if it's either a read or has the SYNC bit
- * set (in which case it could also be direct WRITE).
- */
-static inline bool cfq_bio_sync(struct bio *bio)
-{
-	return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC);
-}
-
 /*
  * scheduler run of queue, if there are requests pending and no one in the
  * driver that will restart queueing
@@ -2490,7 +2481,7 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 	if (!cic)
 		return NULL;
 
-	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
+	cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
 	if (cfqq)
 		return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
 
@@ -2604,13 +2595,14 @@ static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
 			       struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
+	bool is_sync = op_is_sync(bio->bi_opf);
 	struct cfq_io_cq *cic;
 	struct cfq_queue *cfqq;
 
 	/*
 	 * Disallow merge of a sync bio into an async request.
 	 */
-	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
+	if (is_sync && !rq_is_sync(rq))
 		return false;
 
 	/*
@@ -2621,7 +2613,7 @@ static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
 	if (!cic)
 		return false;
 
-	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
+	cfqq = cic_to_cfqq(cic, is_sync);
 	return cfqq == RQ_CFQQ(rq);
 }
 

From d71d9ae14a0942fae519d890a743b12679e3d153 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:03 -0600
Subject: [PATCH 022/182] blk-cgroup: use op_is_sync to check for synchronous
 requests

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index ddaf28d0988f9b..01b62e7bac74bb 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -599,7 +599,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 
 	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
 
-	if (op & REQ_SYNC)
+	if (op_is_sync(op))
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];

From 03ea4afa67566a3c4e83c0fde8bf0c48f16ac1a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:04 -0600
Subject: [PATCH 023/182] umem: use op_is_sync to check for synchronous
 requests

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/umem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index be90e15854ed5b..46f4c719fed963 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -535,7 +535,7 @@ static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio)
 	*card->biotail = bio;
 	bio->bi_next = NULL;
 	card->biotail = &bio->bi_next;
-	if (bio->bi_opf & REQ_SYNC || !mm_check_plugged(card))
+	if (op_is_sync(bio->bi_opf) || !mm_check_plugged(card))
 		activate(card);
 	spin_unlock_irq(&card->lock);
 

From 83b5df67c50995ed93b2b7adac3f002ad8bb6069 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:05 -0600
Subject: [PATCH 024/182] bcache: use op_is_sync to check for synchronous
 requests

(and remove one layer of masking for the op_is_write call next to it).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/request.c   | 4 ++--
 drivers/md/bcache/writeback.h | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 40ffe5e424b302..e8a2b693c92883 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -404,8 +404,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 
 	if (!congested &&
 	    mode == CACHE_MODE_WRITEBACK &&
-	    op_is_write(bio_op(bio)) &&
-	    (bio->bi_opf & REQ_SYNC))
+	    op_is_write(bio->bi_opf) &&
+	    op_is_sync(bio->bi_opf))
 		goto rescale;
 
 	spin_lock(&dc->io_lock);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 301eaf5651673f..629bd1a502fdf0 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -57,8 +57,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 	if (would_skip)
 		return false;
 
-	return bio->bi_opf & REQ_SYNC ||
-		in_use <= CUTOFF_WRITEBACK;
+	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
 }
 
 static inline void bch_writeback_queue(struct cached_dev *dc)

From 67f055c798c72c49ee0c844eae0cd6e9c83b1b16 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:06 -0600
Subject: [PATCH 025/182] btrfs: use op_is_sync to check for synchronous
 requests

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/btrfs/disk-io.c | 2 +-
 fs/btrfs/volumes.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3a57f99d96aa7a..c8454a8e35f2e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -930,7 +930,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	atomic_inc(&fs_info->nr_async_submits);
 
-	if (bio->bi_opf & REQ_SYNC)
+	if (op_is_sync(bio->bi_opf))
 		btrfs_set_work_high_priority(&async->work);
 
 	btrfs_queue_work(fs_info->workers, &async->work);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 71a60cc014519c..deda46cf1292f6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6100,7 +6100,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 	bio->bi_next = NULL;
 
 	spin_lock(&device->io_lock);
-	if (bio->bi_opf & REQ_SYNC)
+	if (op_is_sync(bio->bi_opf))
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;

From 6f6b29171a192e84b666c816e49d2175afbbb09f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:07 -0600
Subject: [PATCH 026/182] block: don't use REQ_SYNC in the READ_SYNC definition

Reads are synchronous per definition, don't add another flag for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3e878f12b2522..5e0078fceed798 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -196,7 +196,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define READ			REQ_OP_READ
 #define WRITE			REQ_OP_WRITE
 
-#define READ_SYNC		REQ_SYNC
+#define READ_SYNC		0
 #define WRITE_SYNC		(REQ_SYNC | REQ_NOIDLE)
 #define WRITE_ODIRECT		REQ_SYNC
 #define WRITE_FLUSH		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)

From b685d3d65ac791406e0dfd8779cc9b3707fea5a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:08 -0600
Subject: [PATCH 027/182] block: treat REQ_FUA and REQ_PREFLUSH as synchronous
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of requiring everyone to specify the REQ_SYNC flag aѕ well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 8 +++++++-
 include/linux/fs.h        | 6 +++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3fa62cabe8d219..107d23d1809664 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -216,9 +216,15 @@ static inline bool op_is_write(unsigned int op)
 	return (op & 1);
 }
 
+/*
+ * Reads are always treated as synchronous, as are requests with the FUA or
+ * PREFLUSH flag.  Other operations may be marked as synchronous using the
+ * REQ_SYNC flag.
+ */
 static inline bool op_is_sync(unsigned int op)
 {
-	return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC);
+	return (op & REQ_OP_MASK) == REQ_OP_READ ||
+		(op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
 }
 
 typedef unsigned int blk_qc_t;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5e0078fceed798..ccedccb28ec8bb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -199,9 +199,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define READ_SYNC		0
 #define WRITE_SYNC		(REQ_SYNC | REQ_NOIDLE)
 #define WRITE_ODIRECT		REQ_SYNC
-#define WRITE_FLUSH		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
-#define WRITE_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_FUA)
-#define WRITE_FLUSH_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
+#define WRITE_FLUSH		(REQ_NOIDLE | REQ_PREFLUSH)
+#define WRITE_FUA		(REQ_NOIDLE | REQ_FUA)
+#define WRITE_FLUSH_FUA		(REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
 
 /*
  * Attribute flags.  These should be or-ed together to figure out what

From a2b809672ee6fcb4d5756ea815725b3dbaea654e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:09 -0600
Subject: [PATCH 028/182] block: replace REQ_NOIDLE with REQ_IDLE

Noidle should be the default for writes as seen by all the compounds
definitions in fs.h using it.  In fact only direct I/O really should
be using NODILE, so turn the whole flag around to get the defaults
right, which will make our life much easier especially onces the
WRITE_* defines go away.

This assumes all the existing "raw" users of REQ_SYNC for writes
want noidle behavior, which seems to be spot on from a quick audit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/cfq-iosched.txt | 32 ++++++++++++++---------------
 block/cfq-iosched.c                 | 11 +++++++---
 drivers/block/drbd/drbd_actlog.c    |  2 +-
 include/linux/blk_types.h           |  4 ++--
 include/linux/fs.h                  | 10 ++++-----
 include/trace/events/f2fs.h         |  2 +-
 6 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index 1e4f835a659db7..895bd3813115f6 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -240,11 +240,11 @@ All cfq queues doing synchronous sequential IO go on to sync-idle tree.
 On this tree we idle on each queue individually.
 
 All synchronous non-sequential queues go on sync-noidle tree. Also any
-request which are marked with REQ_NOIDLE go on this service tree. On this
-tree we do not idle on individual queues instead idle on the whole group
-of queues or the tree. So if there are 4 queues waiting for IO to dispatch
-we will idle only once last queue has dispatched the IO and there is
-no more IO on this service tree.
+synchronous write request which is not marked with REQ_IDLE goes on this
+service tree. On this tree we do not idle on individual queues instead idle
+on the whole group of queues or the tree. So if there are 4 queues waiting
+for IO to dispatch we will idle only once last queue has dispatched the IO
+and there is no more IO on this service tree.
 
 All async writes go on async service tree. There is no idling on async
 queues.
@@ -257,17 +257,17 @@ tree idling provides isolation with buffered write queues on async tree.
 
 FAQ
 ===
-Q1. Why to idle at all on queues marked with REQ_NOIDLE.
+Q1. Why to idle at all on queues not marked with REQ_IDLE.
 
-A1. We only do tree idle (all queues on sync-noidle tree) on queues marked
-    with REQ_NOIDLE. This helps in providing isolation with all the sync-idle
+A1. We only do tree idle (all queues on sync-noidle tree) on queues not marked
+    with REQ_IDLE. This helps in providing isolation with all the sync-idle
     queues. Otherwise in presence of many sequential readers, other
     synchronous IO might not get fair share of disk.
 
     For example, if there are 10 sequential readers doing IO and they get
-    100ms each. If a REQ_NOIDLE request comes in, it will be scheduled
-    roughly after 1 second. If after completion of REQ_NOIDLE request we
-    do not idle, and after a couple of milli seconds a another REQ_NOIDLE
+    100ms each. If a !REQ_IDLE request comes in, it will be scheduled
+    roughly after 1 second. If after completion of !REQ_IDLE request we
+    do not idle, and after a couple of milli seconds a another !REQ_IDLE
     request comes in, again it will be scheduled after 1second. Repeat it
     and notice how a workload can lose its disk share and suffer due to
     multiple sequential readers.
@@ -276,16 +276,16 @@ A1. We only do tree idle (all queues on sync-noidle tree) on queues marked
     context of fsync, and later some journaling data is written. Journaling
     data comes in only after fsync has finished its IO (atleast for ext4
     that seemed to be the case). Now if one decides not to idle on fsync
-    thread due to REQ_NOIDLE, then next journaling write will not get
+    thread due to !REQ_IDLE, then next journaling write will not get
     scheduled for another second. A process doing small fsync, will suffer
     badly in presence of multiple sequential readers.
 
-    Hence doing tree idling on threads using REQ_NOIDLE flag on requests
+    Hence doing tree idling on threads using !REQ_IDLE flag on requests
     provides isolation from multiple sequential readers and at the same
     time we do not idle on individual threads.
 
-Q2. When to specify REQ_NOIDLE
-A2. I would think whenever one is doing synchronous write and not expecting
+Q2. When to specify REQ_IDLE
+A2. I would think whenever one is doing synchronous write and expecting
     more writes to be dispatched from same context soon, should be able
-    to specify REQ_NOIDLE on writes and that probably should work well for
+    to specify REQ_IDLE on writes and that probably should work well for
     most of the cases.
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f28db97c3fe07e..dcbed8c9c82cbb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3914,6 +3914,12 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
 }
 
+static inline bool req_noidle(struct request *req)
+{
+	return req_op(req) == REQ_OP_WRITE &&
+		(req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
+}
+
 /*
  * Disable idle window if the process thinks too long or seeks so much that
  * it doesn't matter
@@ -3935,7 +3941,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
 		cfq_mark_cfqq_deep(cfqq);
 
-	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
+	if (cfqq->next_rq && req_noidle(cfqq->next_rq))
 		enable_idle = 0;
 	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
 		 !cfqd->cfq_slice_idle ||
@@ -4220,8 +4226,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	const int sync = rq_is_sync(rq);
 	u64 now = ktime_get_ns();
 
-	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
-		     !!(rq->cmd_flags & REQ_NOIDLE));
+	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
 
 	cfq_update_hw_tag(cfqd);
 
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 2d3d50ab74bfb0..8d7bcfa49c1223 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -148,7 +148,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 
 	if ((op == REQ_OP_WRITE) && !test_bit(MD_NO_FUA, &device->flags))
 		op_flags |= REQ_FUA | REQ_PREFLUSH;
-	op_flags |= REQ_SYNC | REQ_NOIDLE;
+	op_flags |= REQ_SYNC;
 
 	bio = bio_alloc_drbd(GFP_NOIO);
 	bio->bi_bdev = bdev->md_bdev;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 107d23d1809664..63b750a3b16551 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -175,7 +175,7 @@ enum req_flag_bits {
 	__REQ_META,		/* metadata io request */
 	__REQ_PRIO,		/* boost priority in cfq */
 	__REQ_NOMERGE,		/* don't touch this for merging */
-	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
+	__REQ_IDLE,		/* anticipate more IO after this one */
 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
 	__REQ_FUA,		/* forced unit access */
 	__REQ_PREFLUSH,		/* request for cache flush */
@@ -190,7 +190,7 @@ enum req_flag_bits {
 #define REQ_META		(1ULL << __REQ_META)
 #define REQ_PRIO		(1ULL << __REQ_PRIO)
 #define REQ_NOMERGE		(1ULL << __REQ_NOMERGE)
-#define REQ_NOIDLE		(1ULL << __REQ_NOIDLE)
+#define REQ_IDLE		(1ULL << __REQ_IDLE)
 #define REQ_INTEGRITY		(1ULL << __REQ_INTEGRITY)
 #define REQ_FUA			(1ULL << __REQ_FUA)
 #define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ccedccb28ec8bb..46a74209917fa5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -197,11 +197,11 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define WRITE			REQ_OP_WRITE
 
 #define READ_SYNC		0
-#define WRITE_SYNC		(REQ_SYNC | REQ_NOIDLE)
-#define WRITE_ODIRECT		REQ_SYNC
-#define WRITE_FLUSH		(REQ_NOIDLE | REQ_PREFLUSH)
-#define WRITE_FUA		(REQ_NOIDLE | REQ_FUA)
-#define WRITE_FLUSH_FUA		(REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
+#define WRITE_SYNC		REQ_SYNC
+#define WRITE_ODIRECT		(REQ_SYNC | REQ_IDLE)
+#define WRITE_FLUSH		REQ_PREFLUSH
+#define WRITE_FUA		REQ_FUA
+#define WRITE_FLUSH_FUA		(REQ_PREFLUSH | REQ_FUA)
 
 /*
  * Attribute flags.  These should be or-ed together to figure out what
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 903a09165bb107..a9d34424450dd9 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -32,7 +32,7 @@ TRACE_DEFINE_ENUM(LFS);
 TRACE_DEFINE_ENUM(SSR);
 TRACE_DEFINE_ENUM(__REQ_RAHEAD);
 TRACE_DEFINE_ENUM(__REQ_SYNC);
-TRACE_DEFINE_ENUM(__REQ_NOIDLE);
+TRACE_DEFINE_ENUM(__REQ_IDLE);
 TRACE_DEFINE_ENUM(__REQ_PREFLUSH);
 TRACE_DEFINE_ENUM(__REQ_FUA);
 TRACE_DEFINE_ENUM(__REQ_PRIO);

From 70fd76140a6cb63262bd47b68d57b42e889c10ee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:10 -0600
Subject: [PATCH 029/182] block,fs: use REQ_* flags directly

Remove the WRITE_* and READ_SYNC wrappers, and just use the flags
directly.  Where applicable this also drops usage of the
bio_set_op_attrs wrapper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-flush.c                   |  4 +--
 drivers/block/drbd/drbd_receiver.c  |  2 +-
 drivers/block/xen-blkback/blkback.c | 10 +++---
 drivers/md/bcache/btree.c           |  4 +--
 drivers/md/bcache/debug.c           |  4 +--
 drivers/md/bcache/request.c         |  2 +-
 drivers/md/bcache/super.c           |  4 +--
 drivers/md/dm-bufio.c               |  2 +-
 drivers/md/dm-log.c                 |  2 +-
 drivers/md/dm-raid1.c               |  4 +--
 drivers/md/dm-snap-persistent.c     |  4 +--
 drivers/md/dm.c                     |  2 +-
 drivers/md/md.c                     |  4 +--
 drivers/md/raid5-cache.c            |  4 +--
 drivers/md/raid5.c                  |  2 +-
 drivers/nvme/target/io-cmd.c        |  4 +--
 drivers/target/target_core_iblock.c |  8 ++---
 fs/btrfs/disk-io.c                  |  6 ++--
 fs/btrfs/extent_io.c                | 16 +++++-----
 fs/btrfs/inode.c                    |  6 ++--
 fs/btrfs/scrub.c                    |  2 +-
 fs/btrfs/volumes.c                  |  2 +-
 fs/btrfs/volumes.h                  |  2 +-
 fs/buffer.c                         |  8 ++---
 fs/direct-io.c                      |  2 +-
 fs/ext4/mmp.c                       |  6 ++--
 fs/ext4/page-io.c                   |  2 +-
 fs/ext4/super.c                     |  2 +-
 fs/f2fs/checkpoint.c                |  4 +--
 fs/f2fs/data.c                      | 16 +++++-----
 fs/f2fs/gc.c                        |  6 ++--
 fs/f2fs/inline.c                    |  2 +-
 fs/f2fs/node.c                      |  4 +--
 fs/f2fs/segment.c                   |  8 ++---
 fs/f2fs/super.c                     |  2 +-
 fs/gfs2/log.c                       |  4 +--
 fs/gfs2/meta_io.c                   |  6 ++--
 fs/gfs2/ops_fstype.c                |  2 +-
 fs/hfsplus/super.c                  |  4 +--
 fs/jbd2/checkpoint.c                |  2 +-
 fs/jbd2/commit.c                    |  9 +++---
 fs/jbd2/journal.c                   | 15 ++++-----
 fs/jbd2/revoke.c                    |  2 +-
 fs/jfs/jfs_logmgr.c                 |  4 +--
 fs/mpage.c                          |  6 ++--
 fs/nilfs2/super.c                   |  2 +-
 fs/ocfs2/cluster/heartbeat.c        |  2 +-
 fs/reiserfs/journal.c               |  6 ++--
 fs/xfs/xfs_aops.c                   | 11 ++++---
 fs/xfs/xfs_buf.c                    |  2 +-
 include/linux/fs.h                  | 47 -----------------------------
 include/trace/events/f2fs.h         | 10 +++---
 kernel/power/swap.c                 | 19 ++++++------
 53 files changed, 133 insertions(+), 182 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 95f1d4d357df8e..d35beca1848167 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	}
 
 	flush_rq->cmd_type = REQ_TYPE_FS;
-	flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH;
+	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
 	flush_rq->rq_flags |= RQF_FLUSH_SEQ;
 	flush_rq->rq_disk = first_rq->rq_disk;
 	flush_rq->end_io = flush_end_io;
@@ -486,7 +486,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 
 	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_bdev = bdev;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 
 	ret = submit_bio_wait(bio);
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 942384f34e2284..a89538cb3eaabb 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1266,7 +1266,7 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont
 	bio->bi_bdev = device->ldev->backing_bdev;
 	bio->bi_private = octx;
 	bio->bi_end_io = one_flush_endio;
-	bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH);
+	bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH;
 
 	device->flush_jif = jiffies;
 	set_bit(FLUSH_PENDING, &device->flags);
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 4a80ee752597f0..726c32e35db9c5 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1253,14 +1253,14 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 	case BLKIF_OP_WRITE:
 		ring->st_wr_req++;
 		operation = REQ_OP_WRITE;
-		operation_flags = WRITE_ODIRECT;
+		operation_flags = REQ_SYNC | REQ_IDLE;
 		break;
 	case BLKIF_OP_WRITE_BARRIER:
 		drain = true;
 	case BLKIF_OP_FLUSH_DISKCACHE:
 		ring->st_f_req++;
 		operation = REQ_OP_WRITE;
-		operation_flags = WRITE_FLUSH;
+		operation_flags = REQ_PREFLUSH;
 		break;
 	default:
 		operation = 0; /* make gcc happy */
@@ -1272,7 +1272,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 	nseg = req->operation == BLKIF_OP_INDIRECT ?
 	       req->u.indirect.nr_segments : req->u.rw.nr_segments;
 
-	if (unlikely(nseg == 0 && operation_flags != WRITE_FLUSH) ||
+	if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
 	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
 		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
 	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
@@ -1334,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 	}
 
 	/* Wait on all outstanding I/O's and once that has been completed
-	 * issue the WRITE_FLUSH.
+	 * issue the flush.
 	 */
 	if (drain)
 		xen_blk_drain_io(pending_req->ring);
@@ -1380,7 +1380,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 
 	/* This will be hit if the operation was a flush or discard. */
 	if (!bio) {
-		BUG_ON(operation_flags != WRITE_FLUSH);
+		BUG_ON(operation_flags != REQ_PREFLUSH);
 
 		bio = bio_alloc(GFP_KERNEL, 0);
 		if (unlikely(bio == NULL))
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81d3db40cd7be6..6fdd8e252760cb 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -297,7 +297,7 @@ static void bch_btree_node_read(struct btree *b)
 	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
 	bio->bi_end_io	= btree_node_read_endio;
 	bio->bi_private	= &cl;
-	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
+	bio->bi_opf = REQ_OP_READ | REQ_META;
 
 	bch_bio_map(bio, b->keys.set[0].data);
 
@@ -393,7 +393,7 @@ static void do_btree_node_write(struct btree *b)
 	b->bio->bi_end_io	= btree_node_write_endio;
 	b->bio->bi_private	= cl;
 	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
-	bio_set_op_attrs(b->bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
+	b->bio->bi_opf		= REQ_OP_WRITE | REQ_META | REQ_FUA;
 	bch_bio_map(b->bio, i);
 
 	/*
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 333a1e5f6ae66c..1c9130ae00736c 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -52,7 +52,7 @@ void bch_btree_verify(struct btree *b)
 	bio->bi_bdev		= PTR_CACHE(b->c, &b->key, 0)->bdev;
 	bio->bi_iter.bi_sector	= PTR_OFFSET(&b->key, 0);
 	bio->bi_iter.bi_size	= KEY_SIZE(&v->key) << 9;
-	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
+	bio->bi_opf		= REQ_OP_READ | REQ_META;
 	bch_bio_map(bio, sorted);
 
 	submit_bio_wait(bio);
@@ -113,7 +113,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 	check = bio_clone(bio, GFP_NOIO);
 	if (!check)
 		return;
-	bio_set_op_attrs(check, REQ_OP_READ, READ_SYNC);
+	check->bi_opf = REQ_OP_READ;
 
 	if (bio_alloc_pages(check, GFP_NOIO))
 		goto out_put;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index e8a2b693c92883..0d99b5f4b3e613 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -923,7 +923,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			flush->bi_bdev	= bio->bi_bdev;
 			flush->bi_end_io = request_endio;
 			flush->bi_private = cl;
-			bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH);
+			flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 
 			closure_bio_submit(flush, cl);
 		}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 849ad441cd76ba..988edf92846603 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -381,7 +381,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
 		return "bad uuid pointer";
 
 	bkey_copy(&c->uuid_bucket, k);
-	uuid_io(c, REQ_OP_READ, READ_SYNC, k, cl);
+	uuid_io(c, REQ_OP_READ, 0, k, cl);
 
 	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
 		struct uuid_entry_v0	*u0 = (void *) c->uuids;
@@ -600,7 +600,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 			ca->prio_last_buckets[bucket_nr] = bucket;
 			bucket_nr++;
 
-			prio_io(ca, bucket, REQ_OP_READ, READ_SYNC);
+			prio_io(ca, bucket, REQ_OP_READ, 0);
 
 			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
 				pr_warn("bad csum reading priorities");
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 125aedc3875f4f..b3ba142e59a4cd 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1316,7 +1316,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c)
 {
 	struct dm_io_request io_req = {
 		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = WRITE_FLUSH,
+		.bi_op_flags = REQ_PREFLUSH,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = c->dm_io,
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 07fc1ad42ec57c..33e71ea6cc1435 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -308,7 +308,7 @@ static int flush_header(struct log_c *lc)
 	};
 
 	lc->io_req.bi_op = REQ_OP_WRITE;
-	lc->io_req.bi_op_flags = WRITE_FLUSH;
+	lc->io_req.bi_op_flags = REQ_PREFLUSH;
 
 	return dm_io(&lc->io_req, 1, &null_location, NULL);
 }
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index bdf1606f67bcfb..1a176d7c8b905d 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = WRITE_FLUSH,
+		.bi_op_flags = REQ_PREFLUSH,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = ms->io_client,
@@ -657,7 +657,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct mirror *m;
 	struct dm_io_request io_req = {
 		.bi_op = REQ_OP_WRITE,
-		.bi_op_flags = bio->bi_opf & WRITE_FLUSH_FUA,
+		.bi_op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH),
 		.mem.type = DM_IO_BIO,
 		.mem.ptr.bio = bio,
 		.notify.fn = write_callback,
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index b8cf956b577b4a..b93476c3ba3f97 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -741,7 +741,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 	/*
 	 * Commit exceptions to disk.
 	 */
-	if (ps->valid && area_io(ps, REQ_OP_WRITE, WRITE_FLUSH_FUA))
+	if (ps->valid && area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA))
 		ps->valid = 0;
 
 	/*
@@ -818,7 +818,7 @@ static int persistent_commit_merge(struct dm_exception_store *store,
 	for (i = 0; i < nr_merged; i++)
 		clear_exception(ps, ps->current_committed - 1 - i);
 
-	r = area_io(ps, REQ_OP_WRITE, WRITE_FLUSH_FUA);
+	r = area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA);
 	if (r < 0)
 		return r;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 147af9536d0c10..b2abfa41af3ef3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1527,7 +1527,7 @@ static struct mapped_device *alloc_dev(int minor)
 
 	bio_init(&md->flush_bio);
 	md->flush_bio.bi_bdev = md->bdev;
-	bio_set_op_attrs(&md->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
+	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 
 	dm_stats_init(&md->stats);
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index eac84d8ff7244b..b69ec7da4baeef 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -394,7 +394,7 @@ static void submit_flushes(struct work_struct *ws)
 			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
-			bio_set_op_attrs(bi, REQ_OP_WRITE, WRITE_FLUSH);
+			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 			atomic_inc(&mddev->flush_pending);
 			submit_bio(bi);
 			rcu_read_lock();
@@ -743,7 +743,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA);
+	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA;
 
 	atomic_inc(&mddev->pending_writes);
 	submit_bio(bio);
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 1b1ab4a1d132b3..28d015c6fffe1e 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -685,7 +685,7 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log)
 	bio_reset(&log->flush_bio);
 	log->flush_bio.bi_bdev = log->rdev->bdev;
 	log->flush_bio.bi_end_io = r5l_log_flush_endio;
-	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
+	log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 	submit_bio(&log->flush_bio);
 }
 
@@ -1053,7 +1053,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
 	mb->checksum = cpu_to_le32(crc);
 
 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
-			  WRITE_FUA, false)) {
+			  REQ_FUA, false)) {
 		__free_page(page);
 		return -EIO;
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 92ac251e91e62e..70acdd379e4490 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -913,7 +913,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 			op = REQ_OP_WRITE;
 			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
-				op_flags = WRITE_FUA;
+				op_flags = REQ_FUA;
 			if (test_bit(R5_Discard, &sh->dev[i].flags))
 				op = REQ_OP_DISCARD;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 4a96c2049b7b6b..c2784cfc5e298b 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -58,7 +58,7 @@ static void nvmet_execute_rw(struct nvmet_req *req)
 
 	if (req->cmd->rw.opcode == nvme_cmd_write) {
 		op = REQ_OP_WRITE;
-		op_flags = WRITE_ODIRECT;
+		op_flags = REQ_SYNC | REQ_IDLE;
 		if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
 			op_flags |= REQ_FUA;
 	} else {
@@ -109,7 +109,7 @@ static void nvmet_execute_flush(struct nvmet_req *req)
 	bio->bi_bdev = req->ns->bdev;
 	bio->bi_private = req;
 	bio->bi_end_io = nvmet_bio_done;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 
 	submit_bio(bio);
 }
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 372d744315f38e..d316ed537d5913 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -388,7 +388,7 @@ iblock_execute_sync_cache(struct se_cmd *cmd)
 	bio = bio_alloc(GFP_KERNEL, 0);
 	bio->bi_end_io = iblock_end_io_flush;
 	bio->bi_bdev = ib_dev->ibd_bd;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 	if (!immed)
 		bio->bi_private = cmd;
 	submit_bio(bio);
@@ -686,15 +686,15 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 		struct iblock_dev *ib_dev = IBLOCK_DEV(dev);
 		struct request_queue *q = bdev_get_queue(ib_dev->ibd_bd);
 		/*
-		 * Force writethrough using WRITE_FUA if a volatile write cache
+		 * Force writethrough using REQ_FUA if a volatile write cache
 		 * is not enabled, or if initiator set the Force Unit Access bit.
 		 */
 		op = REQ_OP_WRITE;
 		if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) {
 			if (cmd->se_cmd_flags & SCF_FUA)
-				op_flags = WRITE_FUA;
+				op_flags = REQ_FUA;
 			else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
-				op_flags = WRITE_FUA;
+				op_flags = REQ_FUA;
 		}
 	} else {
 		op = REQ_OP_READ;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c8454a8e35f2e3..fe10afd51e027f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3485,9 +3485,9 @@ static int write_dev_supers(struct btrfs_device *device,
 		 * to go down lazy.
 		 */
 		if (i == 0)
-			ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh);
+			ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh);
 		else
-			ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+			ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
 		if (ret)
 			errors++;
 	}
@@ -3551,7 +3551,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 	bio->bi_end_io = btrfs_end_empty_barrier;
 	bio->bi_bdev = device->bdev;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 	init_completion(&device->flush_wait);
 	bio->bi_private = &device->flush_wait;
 	device->flush_bio = bio;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 66a755150056ed..ff87bff7bdb656 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -127,7 +127,7 @@ struct extent_page_data {
 	 */
 	unsigned int extent_locked:1;
 
-	/* tells the submit_bio code to use a WRITE_SYNC */
+	/* tells the submit_bio code to use REQ_SYNC */
 	unsigned int sync_io:1;
 };
 
@@ -2047,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 		return -EIO;
 	}
 	bio->bi_bdev = dev->bdev;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
 	bio_add_page(bio, page, length, pg_offset);
 
 	if (btrfsic_submit_bio_wait(bio)) {
@@ -2388,7 +2388,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct bio *bio;
-	int read_mode;
+	int read_mode = 0;
 	int ret;
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2404,9 +2404,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	}
 
 	if (failed_bio->bi_vcnt > 1)
-		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
-	else
-		read_mode = READ_SYNC;
+		read_mode |= REQ_FAILFAST_DEV;
 
 	phy_offset >>= inode->i_sb->s_blocksize_bits;
 	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
@@ -3484,7 +3482,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	unsigned long nr_written = 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		write_flags = WRITE_SYNC;
+		write_flags = REQ_SYNC;
 
 	trace___extent_writepage(page, inode, wbc);
 
@@ -3729,7 +3727,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 	unsigned long i, num_pages;
 	unsigned long bio_flags = 0;
 	unsigned long start, end;
-	int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META;
+	int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META;
 	int ret = 0;
 
 	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
@@ -4076,7 +4074,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
 		int ret;
 
 		bio_set_op_attrs(epd->bio, REQ_OP_WRITE,
-				 epd->sync_io ? WRITE_SYNC : 0);
+				 epd->sync_io ? REQ_SYNC : 0);
 
 		ret = submit_one_bio(epd->bio, 0, epd->bio_flags);
 		BUG_ON(ret < 0); /* -ENOMEM */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9a377079af267f..c8eb82a416b334 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7917,7 +7917,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 	struct io_failure_record *failrec;
 	struct bio *bio;
 	int isector;
-	int read_mode;
+	int read_mode = 0;
 	int ret;
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -7936,9 +7936,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 	if ((failed_bio->bi_vcnt > 1)
 		|| (failed_bio->bi_io_vec->bv_len
 			> BTRFS_I(inode)->root->sectorsize))
-		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
-	else
-		read_mode = READ_SYNC;
+		read_mode |= REQ_FAILFAST_DEV;
 
 	isector = start - btrfs_io_bio(failed_bio)->logical;
 	isector >>= inode->i_sb->s_blocksize_bits;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index fffb9ab8526eb4..ff3078234d94d0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4440,7 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 	bio->bi_iter.bi_size = 0;
 	bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
 	bio->bi_bdev = dev->bdev;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
 	ret = bio_add_page(bio, page, PAGE_SIZE, 0);
 	if (ret != PAGE_SIZE) {
 leave_with_eio:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index deda46cf1292f6..0d7d635d8bfbea 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6023,7 +6023,7 @@ static void btrfs_end_bio(struct bio *bio)
 				else
 					btrfs_dev_stat_inc(dev,
 						BTRFS_DEV_STAT_READ_ERRS);
-				if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH)
+				if (bio->bi_opf & REQ_PREFLUSH)
 					btrfs_dev_stat_inc(dev,
 						BTRFS_DEV_STAT_FLUSH_ERRS);
 				btrfs_dev_stat_print_on_error(dev);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 09ed29c67848c4..f137ffe6654c5d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -62,7 +62,7 @@ struct btrfs_device {
 	int running_pending;
 	/* regular prio bios */
 	struct btrfs_pending_bios pending_bios;
-	/* WRITE_SYNC bios */
+	/* sync bios */
 	struct btrfs_pending_bios pending_sync_bios;
 
 	struct block_device *bdev;
diff --git a/fs/buffer.c b/fs/buffer.c
index a29335867e3038..bc7c2bb30a9bfa 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -753,7 +753,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * still in flight on potentially older
 				 * contents.
 				 */
-				write_dirty_buffer(bh, WRITE_SYNC);
+				write_dirty_buffer(bh, REQ_SYNC);
 
 				/*
 				 * Kick off IO for the previous mapping. Note
@@ -1684,7 +1684,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode *
  * prevents this contention from occurring.
  *
  * If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
  * causes the writes to be flagged as synchronous writes.
  */
 int __block_write_full_page(struct inode *inode, struct page *page,
@@ -1697,7 +1697,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
-	int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+	int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -3210,7 +3210,7 @@ EXPORT_SYMBOL(__sync_dirty_buffer);
 
 int sync_dirty_buffer(struct buffer_head *bh)
 {
-	return __sync_dirty_buffer(bh, WRITE_SYNC);
+	return __sync_dirty_buffer(bh, REQ_SYNC);
 }
 EXPORT_SYMBOL(sync_dirty_buffer);
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index fb9aa16a772728..a5138c564019a4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,7 +1209,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	dio->inode = inode;
 	if (iov_iter_rw(iter) == WRITE) {
 		dio->op = REQ_OP_WRITE;
-		dio->op_flags = WRITE_ODIRECT;
+		dio->op_flags = REQ_SYNC | REQ_IDLE;
 	} else {
 		dio->op = REQ_OP_READ;
 	}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index d89754ef1aab72..eb98356386808b 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -35,7 +35,7 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
 }
 
 /*
- * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * Write the MMP block using REQ_SYNC to try to get the block on-disk
  * faster.
  */
 static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
@@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
 	lock_buffer(bh);
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
-	submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh);
+	submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
 	sb_end_write(sb);
 	if (unlikely(!buffer_uptodate(bh)))
@@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
 	get_bh(*bh);
 	lock_buffer(*bh);
 	(*bh)->b_end_io = end_buffer_read_sync;
-	submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh);
+	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh);
 	wait_on_buffer(*bh);
 	if (!buffer_uptodate(*bh)) {
 		ret = -EIO;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0094923e5ebf56..e0b3b54cdef326 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -340,7 +340,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
 
 	if (bio) {
 		int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
-				  WRITE_SYNC : 0;
+				  REQ_SYNC : 0;
 		bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
 		submit_bio(io->io_bio);
 	}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6db81fbcbaa6cc..f31eb286af90a5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4553,7 +4553,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	unlock_buffer(sbh);
 	if (sync) {
 		error = __sync_dirty_buffer(sbh,
-			test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
+			test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
 		if (error)
 			return error;
 
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 7e9b504bd8b295..d935c06a84f038 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
 		.sbi = sbi,
 		.type = META,
 		.op = REQ_OP_READ,
-		.op_flags = READ_SYNC | REQ_META | REQ_PRIO,
+		.op_flags = REQ_META | REQ_PRIO,
 		.old_blkaddr = index,
 		.new_blkaddr = index,
 		.encrypted_page = NULL,
@@ -160,7 +160,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
 		.sbi = sbi,
 		.type = META,
 		.op = REQ_OP_READ,
-		.op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD,
+		.op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
 		.encrypted_page = NULL,
 	};
 	struct blk_plug plug;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9ae194fd2fdb81..b80bf10603d748 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -198,11 +198,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
 	if (type >= META_FLUSH) {
 		io->fio.type = META_FLUSH;
 		io->fio.op = REQ_OP_WRITE;
-		if (test_opt(sbi, NOBARRIER))
-			io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO;
-		else
-			io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META |
-								REQ_PRIO;
+		io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO;
+		if (!test_opt(sbi, NOBARRIER))
+			io->fio.op_flags |= REQ_FUA;
 	}
 	__submit_merged_bio(io);
 out:
@@ -483,7 +481,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
 		return page;
 	f2fs_put_page(page, 0);
 
-	page = get_read_data_page(inode, index, READ_SYNC, false);
+	page = get_read_data_page(inode, index, 0, false);
 	if (IS_ERR(page))
 		return page;
 
@@ -509,7 +507,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 repeat:
-	page = get_read_data_page(inode, index, READ_SYNC, for_write);
+	page = get_read_data_page(inode, index, 0, for_write);
 	if (IS_ERR(page))
 		return page;
 
@@ -1251,7 +1249,7 @@ static int f2fs_write_data_page(struct page *page,
 		.sbi = sbi,
 		.type = DATA,
 		.op = REQ_OP_WRITE,
-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? REQ_SYNC : 0,
 		.page = page,
 		.encrypted_page = NULL,
 	};
@@ -1663,7 +1661,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 			err = PTR_ERR(bio);
 			goto fail;
 		}
-		bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+		bio->bi_opf = REQ_OP_READ;
 		if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 			bio_put(bio);
 			err = -EFAULT;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 93985c64d8a8be..9eb11b2244eac8 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -550,7 +550,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
 		.sbi = F2FS_I_SB(inode),
 		.type = DATA,
 		.op = REQ_OP_READ,
-		.op_flags = READ_SYNC,
+		.op_flags = 0,
 		.encrypted_page = NULL,
 	};
 	struct dnode_of_data dn;
@@ -625,7 +625,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
 	f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
 
 	fio.op = REQ_OP_WRITE;
-	fio.op_flags = WRITE_SYNC;
+	fio.op_flags = REQ_SYNC;
 	fio.new_blkaddr = newaddr;
 	f2fs_submit_page_mbio(&fio);
 
@@ -663,7 +663,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
 			.sbi = F2FS_I_SB(inode),
 			.type = DATA,
 			.op = REQ_OP_WRITE,
-			.op_flags = WRITE_SYNC,
+			.op_flags = REQ_SYNC,
 			.page = page,
 			.encrypted_page = NULL,
 		};
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5f1a67f756afc4..2e7f54c191b441 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -111,7 +111,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
 		.sbi = F2FS_I_SB(dn->inode),
 		.type = DATA,
 		.op = REQ_OP_WRITE,
-		.op_flags = WRITE_SYNC | REQ_PRIO,
+		.op_flags = REQ_SYNC | REQ_PRIO,
 		.page = page,
 		.encrypted_page = NULL,
 	};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 01177ecdeab8e7..932f3f8bb57bba 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1134,7 +1134,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	err = read_node_page(page, READ_SYNC);
+	err = read_node_page(page, 0);
 	if (err < 0) {
 		f2fs_put_page(page, 1);
 		return ERR_PTR(err);
@@ -1570,7 +1570,7 @@ static int f2fs_write_node_page(struct page *page,
 		.sbi = sbi,
 		.type = NODE,
 		.op = REQ_OP_WRITE,
-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? REQ_SYNC : 0,
 		.page = page,
 		.encrypted_page = NULL,
 	};
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index fc886f0084495e..f1b4a1775ebebf 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -259,7 +259,7 @@ static int __commit_inmem_pages(struct inode *inode,
 		.sbi = sbi,
 		.type = DATA,
 		.op = REQ_OP_WRITE,
-		.op_flags = WRITE_SYNC | REQ_PRIO,
+		.op_flags = REQ_SYNC | REQ_PRIO,
 		.encrypted_page = NULL,
 	};
 	bool submit_bio = false;
@@ -420,7 +420,7 @@ static int issue_flush_thread(void *data)
 		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
 
 		bio->bi_bdev = sbi->sb->s_bdev;
-		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+		bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 		ret = submit_bio_wait(bio);
 
 		llist_for_each_entry_safe(cmd, next,
@@ -454,7 +454,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
 
 		atomic_inc(&fcc->submit_flush);
 		bio->bi_bdev = sbi->sb->s_bdev;
-		bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+		bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 		ret = submit_bio_wait(bio);
 		atomic_dec(&fcc->submit_flush);
 		bio_put(bio);
@@ -1515,7 +1515,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
 		.sbi = sbi,
 		.type = META,
 		.op = REQ_OP_WRITE,
-		.op_flags = WRITE_SYNC | REQ_META | REQ_PRIO,
+		.op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
 		.old_blkaddr = page->index,
 		.new_blkaddr = page->index,
 		.page = page,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6132b4ce4e4ce3..2cac6bb860808d 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1238,7 +1238,7 @@ static int __f2fs_commit_super(struct buffer_head *bh,
 	unlock_buffer(bh);
 
 	/* it's rare case, we can do fua all the time */
-	return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+	return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA);
 }
 
 static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e58ccef09c917b..27c00a16def0a5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
 	struct gfs2_log_header *lh;
 	unsigned int tail;
 	u32 hash;
-	int op_flags = WRITE_FLUSH_FUA | REQ_META;
+	int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META;
 	struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
 	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
 	lh = page_address(page);
@@ -682,7 +682,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
 		gfs2_ordered_wait(sdp);
 		log_flush_wait(sdp);
-		op_flags = WRITE_SYNC | REQ_META | REQ_PRIO;
+		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
 	}
 
 	sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 373639a597823c..e562b1191c9c8d 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -38,7 +38,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
 	int write_flags = REQ_META | REQ_PRIO |
-		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+		(wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!page_has_buffers(page));
@@ -285,7 +285,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		}
 	}
 
-	gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
+	gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num);
 	if (!(flags & DIO_WAIT))
 		return 0;
 
@@ -453,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (buffer_uptodate(first_bh))
 		goto out;
 	if (!buffer_locked(first_bh))
-		ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh);
+		ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh);
 
 	dblock++;
 	extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ff72ac6439c821..a34308df927f4f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -246,7 +246,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
 
 	bio->bi_end_io = end_bio_io_page;
 	bio->bi_private = page;
-	bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META);
+	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META);
 	submit_bio(bio);
 	wait_on_page_locked(page);
 	bio_put(bio);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 11854dd8457263..67aedf4c2e7c5b 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 	error2 = hfsplus_submit_bio(sb,
 				   sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
 				   sbi->s_vhdr_buf, NULL, REQ_OP_WRITE,
-				   WRITE_SYNC);
+				   REQ_SYNC);
 	if (!error)
 		error = error2;
 	if (!write_backup)
@@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 	error2 = hfsplus_submit_bio(sb,
 				  sbi->part_start + sbi->sect_count - 2,
 				  sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE,
-				  WRITE_SYNC);
+				  REQ_SYNC);
 	if (!error)
 		error2 = error;
 out:
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 684996c8a3a4a2..4055f51617eff9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -186,7 +186,7 @@ __flush_batch(journal_t *journal, int *batch_count)
 
 	blk_start_plug(&plug);
 	for (i = 0; i < *batch_count; i++)
-		write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
+		write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
 	blk_finish_plug(&plug);
 
 	for (i = 0; i < *batch_count; i++) {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 31f8ca0466392e..8c514367ba5acb 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -155,9 +155,10 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !jbd2_has_feature_async_commit(journal))
-		ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh);
+		ret = submit_bh(REQ_OP_WRITE,
+			REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
 	else
-		ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+		ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
 
 	*cbh = bh;
 	return ret;
@@ -402,7 +403,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		jbd2_journal_update_sb_log_tail(journal,
 						journal->j_tail_sequence,
 						journal->j_tail,
-						WRITE_SYNC);
+						REQ_SYNC);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	} else {
 		jbd_debug(3, "superblock not updated\n");
@@ -717,7 +718,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+				submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
 			}
 			cond_resched();
 			stats.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 927da4956a89e5..8ed971eeab44d6 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -913,7 +913,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 	 * space and if we lose sb update during power failure we'd replay
 	 * old transaction with possibly newly overwritten data.
 	 */
-	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
 	if (ret)
 		goto out;
 
@@ -1306,7 +1306,7 @@ static int journal_reset(journal_t *journal)
 		/* Lock here to make assertions happy... */
 		mutex_lock(&journal->j_checkpoint_mutex);
 		/*
-		 * Update log tail information. We use WRITE_FUA since new
+		 * Update log tail information. We use REQ_FUA since new
 		 * transaction will start reusing journal space and so we
 		 * must make sure information about current log tail is on
 		 * disk before that.
@@ -1314,7 +1314,7 @@ static int journal_reset(journal_t *journal)
 		jbd2_journal_update_sb_log_tail(journal,
 						journal->j_tail_sequence,
 						journal->j_tail,
-						WRITE_FUA);
+						REQ_FUA);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
 	return jbd2_journal_start_thread(journal);
@@ -1454,7 +1454,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	sb->s_errno    = cpu_to_be32(journal->j_errno);
 	read_unlock(&journal->j_state_lock);
 
-	jbd2_write_superblock(journal, WRITE_FUA);
+	jbd2_write_superblock(journal, REQ_FUA);
 }
 EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
 
@@ -1720,7 +1720,8 @@ int jbd2_journal_destroy(journal_t *journal)
 				++journal->j_transaction_sequence;
 			write_unlock(&journal->j_state_lock);
 
-			jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
+			jbd2_mark_journal_empty(journal,
+					REQ_PREFLUSH | REQ_FUA);
 			mutex_unlock(&journal->j_checkpoint_mutex);
 		} else
 			err = -EIO;
@@ -1979,7 +1980,7 @@ int jbd2_journal_flush(journal_t *journal)
 	 * the magic code for a fully-recovered superblock.  Any future
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
-	jbd2_mark_journal_empty(journal, WRITE_FUA);
+	jbd2_mark_journal_empty(journal, REQ_FUA);
 	mutex_unlock(&journal->j_checkpoint_mutex);
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(!journal->j_running_transaction);
@@ -2025,7 +2026,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 	if (write) {
 		/* Lock to make assertions happy... */
 		mutex_lock(&journal->j_checkpoint_mutex);
-		jbd2_mark_journal_empty(journal, WRITE_FUA);
+		jbd2_mark_journal_empty(journal, REQ_FUA);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
 
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 91171dc352cbd1..cfc38b5521189f 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -648,7 +648,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(descriptor);
 	BUFFER_TRACE(descriptor, "write");
 	set_buffer_dirty(descriptor);
-	write_dirty_buffer(descriptor, WRITE_SYNC);
+	write_dirty_buffer(descriptor, REQ_SYNC);
 }
 #endif
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index a21ea8b3e5fa67..bb1da1feafeb82 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2002,7 +2002,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+	bio->bi_opf = REQ_OP_READ;
 	/*check if journaling to disk has been disabled*/
 	if (log->no_integrity) {
 		bio->bi_iter.bi_size = 0;
@@ -2146,7 +2146,7 @@ static void lbmStartIO(struct lbuf * bp)
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
 
 	/* check if journaling to disk has been disabled */
 	if (log->no_integrity) {
diff --git a/fs/mpage.c b/fs/mpage.c
index d2413af0823a24..f35e2819d0c699 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
 	int ret = 0;
-	int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : 0);
+	int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0);
 
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
@@ -705,7 +705,7 @@ mpage_writepages(struct address_space *mapping,
 		ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
 		if (mpd.bio) {
 			int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
-				  WRITE_SYNC : 0);
+				  REQ_SYNC : 0);
 			mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
 		}
 	}
@@ -726,7 +726,7 @@ int mpage_writepage(struct page *page, get_block_t get_block,
 	int ret = __mpage_writepage(page, wbc, &mpd);
 	if (mpd.bio) {
 		int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
-			  WRITE_SYNC : 0);
+			  REQ_SYNC : 0);
 		mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
 	}
 	return ret;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c95d369e90aa94..12eeae62a2b1f7 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -189,7 +189,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
 	set_buffer_dirty(nilfs->ns_sbh[0]);
 	if (nilfs_test_opt(nilfs, BARRIER)) {
 		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
-					  WRITE_SYNC | WRITE_FLUSH_FUA);
+					  REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
 	} else {
 		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
 	}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 636abcbd46501b..52eef16edb01fc 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -627,7 +627,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
 	slot = o2nm_this_node();
 
 	bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE,
-				 WRITE_SYNC);
+				 REQ_SYNC);
 	if (IS_ERR(bio)) {
 		status = PTR_ERR(bio);
 		mlog_errno(status);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bc2dde2423c2eb..aa40c242f1db3d 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1111,7 +1111,8 @@ static int flush_commit_list(struct super_block *s,
 		mark_buffer_dirty(jl->j_commit_bh) ;
 		depth = reiserfs_write_unlock_nested(s);
 		if (reiserfs_barrier_flush(s))
-			__sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+			__sync_dirty_buffer(jl->j_commit_bh,
+					REQ_PREFLUSH | REQ_FUA);
 		else
 			sync_dirty_buffer(jl->j_commit_bh);
 		reiserfs_write_lock_nested(s, depth);
@@ -1269,7 +1270,8 @@ static int _update_journal_header_block(struct super_block *sb,
 		depth = reiserfs_write_unlock_nested(sb);
 
 		if (reiserfs_barrier_flush(sb))
-			__sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+			__sync_dirty_buffer(journal->j_header_bh,
+					REQ_PREFLUSH | REQ_FUA);
 		else
 			sync_dirty_buffer(journal->j_header_bh);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3e57a56cf82947..594e02c485b23d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -495,8 +495,10 @@ xfs_submit_ioend(
 
 	ioend->io_bio->bi_private = ioend;
 	ioend->io_bio->bi_end_io = xfs_end_bio;
-	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
-			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+	ioend->io_bio->bi_opf = REQ_OP_WRITE;
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ioend->io_bio->bi_opf |= REQ_SYNC;
+
 	/*
 	 * If we are failing the IO now, just mark the ioend with an
 	 * error and finish it. This will run IO completion immediately
@@ -567,8 +569,9 @@ xfs_chain_bio(
 
 	bio_chain(ioend->io_bio, new);
 	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
-	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
-			  (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+	ioend->io_bio->bi_opf = REQ_OP_WRITE;
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ioend->io_bio->bi_opf |= REQ_SYNC;
 	submit_bio(ioend->io_bio);
 	ioend->io_bio = new;
 }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b5b9bffe352074..33c435f3316c6f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1304,7 +1304,7 @@ _xfs_buf_ioapply(
 	if (bp->b_flags & XBF_WRITE) {
 		op = REQ_OP_WRITE;
 		if (bp->b_flags & XBF_SYNCIO)
-			op_flags = WRITE_SYNC;
+			op_flags = REQ_SYNC;
 		if (bp->b_flags & XBF_FUA)
 			op_flags |= REQ_FUA;
 		if (bp->b_flags & XBF_FLUSH)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 46a74209917fa5..7a1b78ab7c1527 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -151,58 +151,11 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
  */
 #define CHECK_IOVEC_ONLY -1
 
-/*
- * The below are the various read and write flags that we support. Some of
- * them include behavioral modifiers that send information down to the
- * block layer and IO scheduler. They should be used along with a req_op.
- * Terminology:
- *
- *	The block layer uses device plugging to defer IO a little bit, in
- *	the hope that we will see more IO very shortly. This increases
- *	coalescing of adjacent IO and thus reduces the number of IOs we
- *	have to send to the device. It also allows for better queuing,
- *	if the IO isn't mergeable. If the caller is going to be waiting
- *	for the IO, then he must ensure that the device is unplugged so
- *	that the IO is dispatched to the driver.
- *
- *	All IO is handled async in Linux. This is fine for background
- *	writes, but for reads or writes that someone waits for completion
- *	on, we want to notify the block layer and IO scheduler so that they
- *	know about it. That allows them to make better scheduling
- *	decisions. So when the below references 'sync' and 'async', it
- *	is referencing this priority hint.
- *
- * With that in mind, the available types are:
- *
- * READ			A normal read operation. Device will be plugged.
- * READ_SYNC		A synchronous read. Device is not plugged, caller can
- *			immediately wait on this read without caring about
- *			unplugging.
- * WRITE		A normal async write. Device will be plugged.
- * WRITE_SYNC		Synchronous write. Identical to WRITE, but passes down
- *			the hint that someone will be waiting on this IO
- *			shortly. The write equivalent of READ_SYNC.
- * WRITE_ODIRECT	Special case write for O_DIRECT only.
- * WRITE_FLUSH		Like WRITE_SYNC but with preceding cache flush.
- * WRITE_FUA		Like WRITE_SYNC but data is guaranteed to be on
- *			non-volatile media on completion.
- * WRITE_FLUSH_FUA	Combination of WRITE_FLUSH and FUA. The IO is preceded
- *			by a cache flush and data is guaranteed to be on
- *			non-volatile media on completion.
- *
- */
 #define RW_MASK			REQ_OP_WRITE
 
 #define READ			REQ_OP_READ
 #define WRITE			REQ_OP_WRITE
 
-#define READ_SYNC		0
-#define WRITE_SYNC		REQ_SYNC
-#define WRITE_ODIRECT		(REQ_SYNC | REQ_IDLE)
-#define WRITE_FLUSH		REQ_PREFLUSH
-#define WRITE_FUA		REQ_FUA
-#define WRITE_FLUSH_FUA		(REQ_PREFLUSH | REQ_FUA)
-
 /*
  * Attribute flags.  These should be or-ed together to figure out what
  * has been changed!
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index a9d34424450dd9..5da2c829a71899 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -55,7 +55,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 		{ IPU,		"IN-PLACE" },				\
 		{ OPU,		"OUT-OF-PLACE" })
 
-#define F2FS_BIO_FLAG_MASK(t)	(t & (REQ_RAHEAD | WRITE_FLUSH_FUA))
+#define F2FS_BIO_FLAG_MASK(t)	(t & (REQ_RAHEAD | REQ_PREFLUSH | REQ_FUA))
 #define F2FS_BIO_EXTRA_MASK(t)	(t & (REQ_META | REQ_PRIO))
 
 #define show_bio_type(op_flags)	show_bio_op_flags(op_flags), 		\
@@ -65,11 +65,9 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 	__print_symbolic(F2FS_BIO_FLAG_MASK(flags),			\
 		{ 0,			"WRITE" },			\
 		{ REQ_RAHEAD, 		"READAHEAD" },			\
-		{ READ_SYNC, 		"READ_SYNC" },			\
-		{ WRITE_SYNC, 		"WRITE_SYNC" },			\
-		{ WRITE_FLUSH,		"WRITE_FLUSH" },		\
-		{ WRITE_FUA, 		"WRITE_FUA" },			\
-		{ WRITE_FLUSH_FUA,	"WRITE_FLUSH_FUA" })
+		{ REQ_SYNC, 		"REQ_SYNC" },			\
+		{ REQ_PREFLUSH,		"REQ_PREFLUSH" },		\
+		{ REQ_FUA,		"REQ_FUA" })
 
 #define show_bio_extra(type)						\
 	__print_symbolic(F2FS_BIO_EXTRA_MASK(type),			\
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a3b1e617bcdc39..32e0c232efbafa 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -307,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 {
 	int error;
 
-	hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+	hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
 		      swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
@@ -317,7 +317,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		swsusp_header->flags = flags;
 		if (flags & SF_CRC32_MODE)
 			swsusp_header->crc32 = handle->crc32;
-		error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+		error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
 				      swsusp_resume_block, swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Swap header not found!\n");
@@ -397,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 	} else {
 		src = buf;
 	}
-	return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb);
+	return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb);
 }
 
 static void release_swap_writer(struct swap_map_handle *handle)
@@ -1000,8 +1000,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 			return -ENOMEM;
 		}
 
-		error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset,
-				      tmp->map, NULL);
+		error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL);
 		if (error) {
 			release_swap_reader(handle);
 			return error;
@@ -1025,7 +1024,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb);
+	error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb);
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1534,7 +1533,7 @@ int swsusp_check(void)
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
-		error = hib_submit_io(REQ_OP_READ, READ_SYNC,
+		error = hib_submit_io(REQ_OP_READ, 0,
 					swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
@@ -1543,7 +1542,7 @@ int swsusp_check(void)
 		if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
 			/* Reset swap signature now */
-			error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+			error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
 						swsusp_resume_block,
 						swsusp_header, NULL);
 		} else {
@@ -1588,11 +1587,11 @@ int swsusp_unmark(void)
 {
 	int error;
 
-	hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block,
+	hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block,
 		      swsusp_header, NULL);
 	if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
-		error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC,
+		error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
 					swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {

From d38499530e5f170d30f32d3841fade204e63081d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:11 -0600
Subject: [PATCH 030/182] fs: decouple READ and WRITE from the block layer ops

Move READ and WRITE to kernel.h and don't define them in terms of block
layer ops; they are our generic data direction indicators these days
and have no more resemblance with the block layer ops.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bio.h    |  6 ++++++
 include/linux/fs.h     | 13 -------------
 include/linux/kernel.h |  4 ++++
 include/linux/uio.h    |  2 +-
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 87ce64dafb93ad..fe9a1701760816 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -62,6 +62,12 @@
 #define bio_sectors(bio)	((bio)->bi_iter.bi_size >> 9)
 #define bio_end_sector(bio)	((bio)->bi_iter.bi_sector + bio_sectors((bio)))
 
+/*
+ * Return the data direction, READ or WRITE.
+ */
+#define bio_data_dir(bio) \
+	(op_is_write(bio_op(bio)) ? WRITE : READ)
+
 /*
  * Check whether this bio carries any data or not. A NULL bio is allowed.
  */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7a1b78ab7c1527..0ad36e0c7fa795 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -151,11 +151,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
  */
 #define CHECK_IOVEC_ONLY -1
 
-#define RW_MASK			REQ_OP_WRITE
-
-#define READ			REQ_OP_READ
-#define WRITE			REQ_OP_WRITE
-
 /*
  * Attribute flags.  These should be or-ed together to figure out what
  * has been changed!
@@ -2452,14 +2447,6 @@ extern void make_bad_inode(struct inode *);
 extern bool is_bad_inode(struct inode *);
 
 #ifdef CONFIG_BLOCK
-/*
- * return data direction, READ or WRITE
- */
-static inline int bio_data_dir(struct bio *bio)
-{
-	return op_is_write(bio_op(bio)) ? WRITE : READ;
-}
-
 extern void check_disk_size_change(struct gendisk *disk,
 				   struct block_device *bdev);
 extern int revalidate_disk(struct gendisk *);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index bc6ed52a39b967..01b6b460c34d2b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -50,6 +50,10 @@
 #define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
 #define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)
 
+/* generic data direction definitions */
+#define READ			0
+#define WRITE			1
+
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 
 #define u64_to_user_ptr(x) (		\
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 6e22b544d03913..d5aba1512b8bdd 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -125,7 +125,7 @@ static inline bool iter_is_iovec(const struct iov_iter *i)
  *
  * The ?: is just for type safety.
  */
-#define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & RW_MASK)
+#define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & (READ | WRITE))
 
 /*
  * Cap the iov_iter by given limit; note that the second argument is

From 1e3914d4cf4e14653b7917b0e965217465cb7a9c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:12 -0600
Subject: [PATCH 031/182] block, fs: move submit_bio to bio.h

This is where all the other bio operations live, so users must include
bio.h anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bio.h | 2 ++
 include/linux/fs.h  | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index fe9a1701760816..5c604b4914bfaa 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -404,6 +404,8 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
 
 }
 
+extern blk_qc_t submit_bio(struct bio *);
+
 extern void bio_endio(struct bio *);
 
 static inline void bio_io_error(struct bio *bio)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0ad36e0c7fa795..5b0a9b77534d20 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2717,7 +2717,6 @@ static inline void remove_inode_hash(struct inode *inode)
 extern void inode_sb_list_add(struct inode *inode);
 
 #ifdef CONFIG_BLOCK
-extern blk_qc_t submit_bio(struct bio *);
 extern int bdev_read_only(struct block_device *);
 #endif
 extern int set_blocksize(struct block_device *, int);

From 2f8b544477e627a42e66902e948d87f86554aeca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:13 -0600
Subject: [PATCH 032/182] block,fs: untangle fs.h and blk_types.h

Nothing in fs.h should require blk_types.h to be included.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/9p/vfs_addr.c          | 1 +
 fs/cifs/connect.c         | 1 +
 fs/cifs/transport.c       | 1 +
 fs/gfs2/dir.c             | 1 +
 fs/isofs/compress.c       | 1 +
 fs/ntfs/logfile.c         | 1 +
 fs/ocfs2/buffer_head_io.c | 1 +
 fs/orangefs/inode.c       | 1 +
 fs/reiserfs/stree.c       | 1 +
 fs/squashfs/block.c       | 1 +
 fs/udf/dir.c              | 1 +
 fs/udf/directory.c        | 1 +
 fs/udf/inode.c            | 1 +
 fs/ufs/balloc.c           | 1 +
 include/linux/fs.h        | 2 +-
 include/linux/swap.h      | 1 +
 include/linux/writeback.h | 2 ++
 lib/iov_iter.c            | 1 +
 18 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6181ad79e1a543..5ca1fb0043f650 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -34,6 +34,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
+#include <linux/bvec.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index aab5227979e2ee..db726e8311caec 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -41,6 +41,7 @@
 #include <keys/user-type.h>
 #include <net/ipv6.h>
 #include <linux/parser.h>
+#include <linux/bvec.h>
 
 #include "cifspdu.h"
 #include "cifsglob.h"
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 206a597b2293cb..5f02edc819af27 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -28,6 +28,7 @@
 #include <linux/delay.h>
 #include <linux/freezer.h>
 #include <linux/tcp.h>
+#include <linux/bvec.h>
 #include <linux/highmem.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 3cdde5f5d399c9..79113219be5f98 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -62,6 +62,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/vmalloc.h>
+#include <linux/bio.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 44af14b2e91663..9bb2fe35799d3d 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -18,6 +18,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/bio.h>
 
 #include <linux/vmalloc.h>
 #include <linux/zlib.h>
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 761f12f7f3efcf..353379ff6057d9 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
 #include <linux/log2.h>
+#include <linux/bio.h>
 
 #include "attrib.h"
 #include "aops.h"
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 8f040f88ade44d..d9ebe11c89909b 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -26,6 +26,7 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/highmem.h>
+#include <linux/bio.h>
 
 #include <cluster/masklog.h>
 
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index ef3b4eb54cf25d..551bc74ed2b822 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -8,6 +8,7 @@
  *  Linux VFS inode operations.
  */
 
+#include <linux/bvec.h>
 #include "protocol.h"
 #include "orangefs-kernel.h"
 #include "orangefs-bufmap.h"
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index a97e352d05d3b5..0037aea97d39a6 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -11,6 +11,7 @@
 #include <linux/time.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
+#include <linux/bio.h>
 #include "reiserfs.h"
 #include <linux/buffer_head.h>
 #include <linux/quotaops.h>
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index ce62a380314f09..2751476e6b6e85 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -31,6 +31,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
+#include <linux/bio.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index aaec13c952531d..2d0e028067eb91 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,6 +30,7 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/bio.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 988d5352bdb866..7aa48bd7cbaf31 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -16,6 +16,7 @@
 
 #include <linux/fs.h>
 #include <linux/string.h>
+#include <linux/bio.h>
 
 struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
 					 struct udf_fileident_bh *fibh,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index aad46401ede5d8..0f3db71753aa26 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,6 +38,7 @@
 #include <linux/crc-itu-t.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
+#include <linux/bio.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 67e085d591d839..b035af54f53832 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -15,6 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/bitops.h>
+#include <linux/bio.h>
 #include <asm/byteorder.h>
 
 #include "ufs_fs.h"
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b0a9b77534d20..8533e9d59c29da 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -28,7 +28,6 @@
 #include <linux/uidgid.h>
 #include <linux/lockdep.h>
 #include <linux/percpu-rwsem.h>
-#include <linux/blk_types.h>
 #include <linux/workqueue.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/delayed_call.h>
@@ -38,6 +37,7 @@
 
 struct backing_dev_info;
 struct bdi_writeback;
+struct bio;
 struct export_operations;
 struct hd_geometry;
 struct iovec;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a56523cefb9b7b..3a6aebc230015b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
+#include <linux/blk_types.h>
 #include <asm/page.h>
 
 struct notifier_block;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 797100e1001097..e4c38703bf4e5c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -10,6 +10,8 @@
 #include <linux/flex_proportions.h>
 #include <linux/backing-dev-defs.h>
 
+struct bio;
+
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
 
 /*
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f0c7f1481baeef..efc953c4757274 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1,4 +1,5 @@
 #include <linux/export.h>
+#include <linux/bvec.h>
 #include <linux/uio.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>

From fe8ecc86fa83e6dca25b055651811dff8d80bfed Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:14 -0600
Subject: [PATCH 033/182] arm, arm64: don't include blk_types.h in <asm/io.h>

No need for it - we only use struct bio_vec in prototypes and already have
forward declarations for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 arch/arm/include/asm/io.h   | 1 -
 arch/arm64/include/asm/io.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index 021692c64de3bf..42871fb8340e0f 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -25,7 +25,6 @@
 
 #include <linux/string.h>
 #include <linux/types.h>
-#include <linux/blk_types.h>
 #include <asm/byteorder.h>
 #include <asm/memory.h>
 #include <asm-generic/pci_iomap.h>
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 0bba427bb4c292..0c00c87bb9dd0c 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -22,7 +22,6 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
-#include <linux/blk_types.h>
 
 #include <asm/byteorder.h>
 #include <asm/barrier.h>

From 9f08217120568afdfb59973a89a675e649c0096d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:15 -0600
Subject: [PATCH 034/182] ceph: don't include blk_types.h in messenger.h

The file only needs the struct bvec_iter delcaration, which is available
from bvec.h.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/ceph/messenger.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 8dbd7879fdc6b7..67bcef2ecddb82 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -1,7 +1,7 @@
 #ifndef __FS_CEPH_MESSENGER_H
 #define __FS_CEPH_MESSENGER_H
 
-#include <linux/blk_types.h>
+#include <linux/bvec.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
 #include <linux/net.h>

From be297968da22cf40c9c419df51e71ba8856a2ec2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:16 -0600
Subject: [PATCH 035/182] mm: only include blk_types in swap.h if CONFIG_SWAP
 is enabled

It's only needed for the CONFIG_SWAP-only use of bio_end_io_t.

Because CONFIG_SWAP implies CONFIG_BLOCK this will allow to drop some
ifdefs in blk_types.h.

Instead we'll need to add a few explicit includes that were implicit
before, though.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/staging/lustre/include/linux/lnet/types.h | 1 +
 drivers/staging/lustre/lustre/llite/rw.c          | 1 +
 fs/ntfs/aops.c                                    | 1 +
 fs/ntfs/mft.c                                     | 1 +
 fs/reiserfs/inode.c                               | 1 +
 fs/splice.c                                       | 1 +
 include/linux/swap.h                              | 4 +++-
 7 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/staging/lustre/include/linux/lnet/types.h b/drivers/staging/lustre/include/linux/lnet/types.h
index f8be0e2f7bf7f7..8ca1e9d0cfe2b4 100644
--- a/drivers/staging/lustre/include/linux/lnet/types.h
+++ b/drivers/staging/lustre/include/linux/lnet/types.h
@@ -34,6 +34,7 @@
 #define __LNET_TYPES_H__
 
 #include <linux/types.h>
+#include <linux/bvec.h>
 
 /** \addtogroup lnet
  * @{
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
index 50c0152ba0224d..76a6836cdf70ca 100644
--- a/drivers/staging/lustre/lustre/llite/rw.c
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -47,6 +47,7 @@
 #include <linux/pagemap.h>
 /* current_is_kswapd() */
 #include <linux/swap.h>
+#include <linux/bvec.h>
 
 #define DEBUG_SUBSYSTEM S_LLITE
 
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fe251f187ff8ff..d0cf6fee5c77fe 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -29,6 +29,7 @@
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
+#include <linux/bio.h>
 
 #include "aops.h"
 #include "attrib.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index d3c009626032fe..b6f402194f02c2 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -23,6 +23,7 @@
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
+#include <linux/bio.h>
 
 #include "attrib.h"
 #include "aops.h"
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 58b2dedb2a3a66..cfeae9b0a2b77c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -19,6 +19,7 @@
 #include <linux/quotaops.h>
 #include <linux/swap.h>
 #include <linux/uio.h>
+#include <linux/bio.h>
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
diff --git a/fs/splice.c b/fs/splice.c
index 153d4f3bd441fe..51492f26915afb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -17,6 +17,7 @@
  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  *
  */
+#include <linux/bvec.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3a6aebc230015b..bfee1af1f54f0f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -11,7 +11,6 @@
 #include <linux/fs.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
-#include <linux/blk_types.h>
 #include <asm/page.h>
 
 struct notifier_block;
@@ -352,6 +351,9 @@ extern int kswapd_run(int nid);
 extern void kswapd_stop(int nid);
 
 #ifdef CONFIG_SWAP
+
+#include <linux/blk_types.h> /* for bio_end_io_t */
+
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);

From 7281b4526cefc898d180850b54d1369f38c6b202 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 1 Nov 2016 07:40:17 -0600
Subject: [PATCH 036/182] block: remove the CONFIG_BLOCK ifdef in blk_types.h

Now that we have a separate header for struct bio_vec there is absolutely
no excuse for including this header from non-block I/O code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 63b750a3b16551..bb921028e7c5b8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -17,7 +17,6 @@ struct io_context;
 struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
 
-#ifdef CONFIG_BLOCK
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -126,8 +125,6 @@ struct bio {
 #define BVEC_POOL_OFFSET	(16 - BVEC_POOL_BITS)
 #define BVEC_POOL_IDX(bio)	((bio)->bi_flags >> BVEC_POOL_OFFSET)
 
-#endif /* CONFIG_BLOCK */
-
 /*
  * Operations and flags common to the bio and request structures.
  * We use 8 bits for encoding the operation, and the remaining 24 for flags.

From 1d796d6a9641fbfcd90fcfaf6fb4894a13d0304f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 1 Nov 2016 09:52:57 -0600
Subject: [PATCH 037/182] block: add REQ_BACKGROUND

This adds a new request flag, REQ_BACKGROUND, that callers can use to
tell the block layer that this is background (non-urgent) IO.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/blk_types.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index bb921028e7c5b8..562ac46cb79023 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -177,6 +177,7 @@ enum req_flag_bits {
 	__REQ_FUA,		/* forced unit access */
 	__REQ_PREFLUSH,		/* request for cache flush */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
+	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -192,6 +193,7 @@ enum req_flag_bits {
 #define REQ_FUA			(1ULL << __REQ_FUA)
 #define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
+#define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)

From 7637241e651ec36e409412869f986dd5f097735f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 1 Nov 2016 10:00:38 -0600
Subject: [PATCH 038/182] writeback: add wbc_to_write_flags()

Add wbc_to_write_flags(), which returns the write modifier flags to use,
based on a struct writeback_control. No functional changes in this
patch, but it prepares us for factoring other wbc fields for write type.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/buffer.c               | 2 +-
 fs/f2fs/data.c            | 2 +-
 fs/f2fs/node.c            | 2 +-
 fs/gfs2/meta_io.c         | 3 +--
 fs/mpage.c                | 2 +-
 fs/xfs/xfs_aops.c         | 8 ++------
 include/linux/writeback.h | 9 +++++++++
 mm/page_io.c              | 5 +----
 8 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index bc7c2bb30a9bfa..af5776da814af7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1697,7 +1697,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
-	int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0);
+	int write_flags = wbc_to_write_flags(wbc);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b80bf10603d748..9e5561fa4cb638 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1249,7 +1249,7 @@ static int f2fs_write_data_page(struct page *page,
 		.sbi = sbi,
 		.type = DATA,
 		.op = REQ_OP_WRITE,
-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? REQ_SYNC : 0,
+		.op_flags = wbc_to_write_flags(wbc),
 		.page = page,
 		.encrypted_page = NULL,
 	};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 932f3f8bb57bba..d1e29deb4598f3 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1570,7 +1570,7 @@ static int f2fs_write_node_page(struct page *page,
 		.sbi = sbi,
 		.type = NODE,
 		.op = REQ_OP_WRITE,
-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? REQ_SYNC : 0,
+		.op_flags = wbc_to_write_flags(wbc),
 		.page = page,
 		.encrypted_page = NULL,
 	};
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index e562b1191c9c8d..49db8ef13fdff5 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 {
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
-	int write_flags = REQ_META | REQ_PRIO |
-		(wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0);
+	int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!page_has_buffers(page));
diff --git a/fs/mpage.c b/fs/mpage.c
index f35e2819d0c699..98fc11aa7e0b72 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
 	int ret = 0;
-	int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0);
+	int op_flags = wbc_to_write_flags(wbc);
 
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 594e02c485b23d..6be5204a06d3ac 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -495,9 +495,7 @@ xfs_submit_ioend(
 
 	ioend->io_bio->bi_private = ioend;
 	ioend->io_bio->bi_end_io = xfs_end_bio;
-	ioend->io_bio->bi_opf = REQ_OP_WRITE;
-	if (wbc->sync_mode == WB_SYNC_ALL)
-		ioend->io_bio->bi_opf |= REQ_SYNC;
+	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 
 	/*
 	 * If we are failing the IO now, just mark the ioend with an
@@ -569,9 +567,7 @@ xfs_chain_bio(
 
 	bio_chain(ioend->io_bio, new);
 	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
-	ioend->io_bio->bi_opf = REQ_OP_WRITE;
-	if (wbc->sync_mode == WB_SYNC_ALL)
-		ioend->io_bio->bi_opf |= REQ_SYNC;
+	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 	submit_bio(ioend->io_bio);
 	ioend->io_bio = new;
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e4c38703bf4e5c..50c96ee8108f90 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -9,6 +9,7 @@
 #include <linux/fs.h>
 #include <linux/flex_proportions.h>
 #include <linux/backing-dev-defs.h>
+#include <linux/blk_types.h>
 
 struct bio;
 
@@ -102,6 +103,14 @@ struct writeback_control {
 #endif
 };
 
+static inline int wbc_to_write_flags(struct writeback_control *wbc)
+{
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return REQ_SYNC;
+
+	return 0;
+}
+
 /*
  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
  * and are measured against each other in.  There always is one global
diff --git a/mm/page_io.c b/mm/page_io.c
index a2651f58c86a25..23f6d0d3470fb3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -320,10 +320,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		ret = -ENOMEM;
 		goto out;
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL)
-		bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC);
-	else
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);

From 13edd5e7315a26b448c5f7f33fc7721b1e0c17ef Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 1 Nov 2016 10:01:35 -0600
Subject: [PATCH 039/182] writeback: mark background writeback as such

If we're doing background type writes, then use the appropriate
background write flags for that.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/writeback.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 50c96ee8108f90..c78f9f0920b51b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -107,6 +107,8 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc)
 {
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		return REQ_SYNC;
+	else if (wbc->for_kupdate || wbc->for_background)
+		return REQ_BACKGROUND;
 
 	return 0;
 }

From 2cefe4dbaadf83b236caab46705b4b5a4958e3b6 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 31 Oct 2016 11:59:24 -0600
Subject: [PATCH 040/182] block: add bio_iov_iter_get_pages()

This is a helper that pins down a range from an iov_iter and adds it to
a bio without requiring a separate memory allocation for the page array.
It will be used for upcoming direct I/O implementations for block devices
and iomap based file systems.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
[hch: ported to the iov_iter interface, renamed and added comments.
      All blame should be directed to me and all fame should go to Kent
      after this!]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h |  1 +
 2 files changed, 50 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index db85c5753a7656..2cf6ebabc68c5c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -847,6 +847,55 @@ int bio_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_add_page);
 
+/**
+ * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
+ * @bio: bio to add pages to
+ * @iter: iov iterator describing the region to be mapped
+ *
+ * Pins as many pages from *iter and appends them to @bio's bvec array. The
+ * pages will have to be released using put_page() when done.
+ */
+int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+{
+	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
+	struct page **pages = (struct page **)bv;
+	size_t offset, diff;
+	ssize_t size;
+
+	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	if (unlikely(size <= 0))
+		return size ? size : -EFAULT;
+	nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
+
+	/*
+	 * Deep magic below:  We need to walk the pinned pages backwards
+	 * because we are abusing the space allocated for the bio_vecs
+	 * for the page array.  Because the bio_vecs are larger than the
+	 * page pointers by definition this will always work.  But it also
+	 * means we can't use bio_add_page, so any changes to it's semantics
+	 * need to be reflected here as well.
+	 */
+	bio->bi_iter.bi_size += size;
+	bio->bi_vcnt += nr_pages;
+
+	diff = (nr_pages * PAGE_SIZE - offset) - size;
+	while (nr_pages--) {
+		bv[nr_pages].bv_page = pages[nr_pages];
+		bv[nr_pages].bv_len = PAGE_SIZE;
+		bv[nr_pages].bv_offset = 0;
+	}
+
+	bv[0].bv_offset += offset;
+	bv[0].bv_len -= offset;
+	if (diff)
+		bv[bio->bi_vcnt - 1].bv_len -= diff;
+
+	iov_iter_advance(iter, size);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
+
 struct submit_bio_ret {
 	struct completion event;
 	int error;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5c604b4914bfaa..d367cd37a7f756 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -427,6 +427,7 @@ void bio_chain(struct bio *, struct bio *);
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
+int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
 				    const struct iov_iter *, gfp_t);

From bc27c01b5c46d3bfec42c96537c7a3fae0bb2cc4 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:18:48 -0700
Subject: [PATCH 041/182] blk-mq: Do not invoke .queue_rq() for a stopped queue

The meaning of the BLK_MQ_S_STOPPED flag is "do not call
.queue_rq()". Hence modify blk_mq_make_request() such that requests
are queued instead of issued if a queue has been stopped.

Reported-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Cc: <stable@vger.kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2da1a0ee33182e..92edd28b7e76c6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1317,9 +1317,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		if (!old_rq)
 			goto done;
-		if (!blk_mq_direct_issue_request(old_rq, &cookie))
-			goto done;
-		blk_mq_insert_request(old_rq, false, true, true);
+		if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) ||
+		    blk_mq_direct_issue_request(old_rq, &cookie) != 0)
+			blk_mq_insert_request(old_rq, false, true, true);
 		goto done;
 	}
 

From 5d1b25c1ecabb37f8eb58c8e9dd74f77f703e5d9 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:19:15 -0700
Subject: [PATCH 042/182] blk-mq: Introduce blk_mq_hctx_stopped()

Multiple functions test the BLK_MQ_S_STOPPED bit so introduce
a helper function that performs this test.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 12 ++++++------
 block/blk-mq.h |  5 +++++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 92edd28b7e76c6..2864e191cc86c3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -785,7 +785,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	struct list_head *dptr;
 	int queued;
 
-	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+	if (unlikely(blk_mq_hctx_stopped(hctx)))
 		return;
 
 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
@@ -910,8 +910,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
-	    !blk_mq_hw_queue_mapped(hctx)))
+	if (unlikely(blk_mq_hctx_stopped(hctx) ||
+		     !blk_mq_hw_queue_mapped(hctx)))
 		return;
 
 	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
@@ -936,7 +936,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if ((!blk_mq_hctx_has_pending(hctx) &&
 		    list_empty_careful(&hctx->dispatch)) ||
-		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
+		    blk_mq_hctx_stopped(hctx))
 			continue;
 
 		blk_mq_run_hw_queue(hctx, async);
@@ -986,7 +986,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
+		if (!blk_mq_hctx_stopped(hctx))
 			continue;
 
 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
@@ -1317,7 +1317,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		if (!old_rq)
 			goto done;
-		if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) ||
+		if (blk_mq_hctx_stopped(data.hctx) ||
 		    blk_mq_direct_issue_request(old_rq, &cookie) != 0)
 			blk_mq_insert_request(old_rq, false, true, true);
 		goto done;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e5d25249028c74..ac772dac7ce8c0 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -100,6 +100,11 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
 	data->hctx = hctx;
 }
 
+static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
+{
+	return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
+}
+
 static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 {
 	return hctx->nr_ctx && hctx->tags;

From fd00144301d64f1742541a3c5e64cd1c51f39c55 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:19:37 -0700
Subject: [PATCH 043/182] blk-mq: Introduce blk_mq_queue_stopped()

The function blk_queue_stopped() allows to test whether or not a
traditional request queue has been stopped. Introduce a helper
function that allows block drivers to query easily whether or not
one or more hardware contexts of a blk-mq queue have been stopped.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 20 ++++++++++++++++++++
 include/linux/blk-mq.h |  1 +
 2 files changed, 21 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2864e191cc86c3..28bf667bfe092b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -944,6 +944,26 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 }
 EXPORT_SYMBOL(blk_mq_run_hw_queues);
 
+/**
+ * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
+ * @q: request queue.
+ *
+ * The caller is responsible for serializing this function against
+ * blk_mq_{start,stop}_hw_queue().
+ */
+bool blk_mq_queue_stopped(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		if (blk_mq_hctx_stopped(hctx))
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL(blk_mq_queue_stopped);
+
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	cancel_work(&hctx->run_work);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 535ab2e13d2e74..aa930009fcd317 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -223,6 +223,7 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs
 void blk_mq_abort_requeue_list(struct request_queue *q);
 void blk_mq_complete_request(struct request *rq, int error);
 
+bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);

From 2253efc850c4cf690516bbc07854eeb1077202ba Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:20:02 -0700
Subject: [PATCH 044/182] blk-mq: Move more code into
 blk_mq_direct_issue_request()

Move the "hctx stopped" test and the insert request calls into
blk_mq_direct_issue_request(). Rename that function into
blk_mq_try_issue_directly() to reflect its new semantics. Pass
the hctx pointer to that function instead of looking it up a
second time. These changes avoid that code has to be duplicated
in the next patch.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 28bf667bfe092b..447c37f39e3217 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1228,11 +1228,11 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
-static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
+static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+				      struct request *rq, blk_qc_t *cookie)
 {
 	int ret;
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
 		.list = NULL,
@@ -1240,6 +1240,9 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
 	};
 	blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
 
+	if (blk_mq_hctx_stopped(hctx))
+		goto insert;
+
 	/*
 	 * For OK queue, we are done. For error, kill it. Any other
 	 * error (busy), just add it to our list as we previously
@@ -1248,7 +1251,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
 	ret = q->mq_ops->queue_rq(hctx, &bd);
 	if (ret == BLK_MQ_RQ_QUEUE_OK) {
 		*cookie = new_cookie;
-		return 0;
+		return;
 	}
 
 	__blk_mq_requeue_request(rq);
@@ -1257,10 +1260,11 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
 		*cookie = BLK_QC_T_NONE;
 		rq->errors = -EIO;
 		blk_mq_end_request(rq, rq->errors);
-		return 0;
+		return;
 	}
 
-	return -1;
+insert:
+	blk_mq_insert_request(rq, false, true, true);
 }
 
 /*
@@ -1337,9 +1341,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		if (!old_rq)
 			goto done;
-		if (blk_mq_hctx_stopped(data.hctx) ||
-		    blk_mq_direct_issue_request(old_rq, &cookie) != 0)
-			blk_mq_insert_request(old_rq, false, true, true);
+		blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
 		goto done;
 	}
 

From 52d7f1b5c2f33b5d34dc2b6af5175fb6a44999f6 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:20:32 -0700
Subject: [PATCH 045/182] blk-mq: Avoid that requeueing starts stopped queues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since blk_mq_requeue_work() starts stopped queues and since
execution of this function can be scheduled after a queue has
been stopped it is not possible to stop queues without using
an additional state variable to track whether or not the queue
has been stopped. Hence modify blk_mq_requeue_work() such that it
does not start stopped queues. My conclusion after a review of
the blk_mq_stop_hw_queues() and blk_mq_{delay_,}kick_requeue_list()
callers is as follows:
* In the dm driver starting and stopping queues should only happen
  if __dm_suspend() or __dm_resume() is called and not if the
  requeue list is processed.
* In the SCSI core queue stopping and starting should only be
  performed by the scsi_internal_device_block() and
  scsi_internal_device_unblock() functions but not by any other
  function. Although the blk_mq_stop_hw_queue() call in
  scsi_queue_rq() may help to reduce CPU load if a LLD queue is
  full, figuring out whether or not a queue should be restarted
  when requeueing a command would require to introduce additional
  locking in scsi_mq_requeue_cmd() to avoid a race with
  scsi_internal_device_block(). Avoid this complexity by removing
  the blk_mq_stop_hw_queue() call from scsi_queue_rq().
* In the NVMe core only the functions that call
  blk_mq_start_stopped_hw_queues() explicitly should start stopped
  queues.
* A blk_mq_start_stopped_hwqueues() call must be added in the
  xen-blkfront driver in its blkif_recover() function.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Roger Pau Monné <roger.pau@citrix.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: James Bottomley <jejb@linux.vnet.ibm.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c               | 6 +-----
 drivers/block/xen-blkfront.c | 1 +
 drivers/md/dm-rq.c           | 7 +------
 drivers/scsi/scsi_lib.c      | 1 -
 4 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 447c37f39e3217..d95034ae64f691 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -501,11 +501,7 @@ static void blk_mq_requeue_work(struct work_struct *work)
 		blk_mq_insert_request(rq, false, false, false);
 	}
 
-	/*
-	 * Use the start variant of queue running here, so that running
-	 * the requeue work will kick stopped queues.
-	 */
-	blk_mq_start_hw_queues(q);
+	blk_mq_run_hw_queues(q, false);
 }
 
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9908597c5209e1..71ca36eab558ed 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2045,6 +2045,7 @@ static int blkif_recover(struct blkfront_info *info)
 		BUG_ON(req->nr_phys_segments > segs);
 		blk_mq_requeue_request(req);
 	}
+	blk_mq_start_stopped_hw_queues(info->rq, true);
 	blk_mq_kick_requeue_list(info->rq);
 
 	while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index f76cc36b8546d7..a9e9e781bb770c 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -338,12 +338,7 @@ static void dm_old_requeue_request(struct request *rq)
 
 static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (!blk_queue_stopped(q))
-		blk_mq_delay_kick_requeue_list(q, msecs);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	blk_mq_delay_kick_requeue_list(q, msecs);
 }
 
 void dm_mq_kick_requeue_list(struct mapped_device *md)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 8c52622ac2577e..2b78ff12bf3cc9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1952,7 +1952,6 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 out:
 	switch (ret) {
 	case BLK_MQ_RQ_QUEUE_BUSY:
-		blk_mq_stop_hw_queue(hctx);
 		if (atomic_read(&sdev->device_busy) == 0 &&
 		    !scsi_device_blocked(sdev))
 			blk_mq_delay_queue(hctx, SCSI_QUEUE_DELAY);

From 9b7dd572cc439fa92e120290eb74d0295567c5a0 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:20:49 -0700
Subject: [PATCH 046/182] blk-mq: Remove blk_mq_cancel_requeue_work()

Since blk_mq_requeue_work() no longer restarts stopped queues
canceling requeue work is no longer needed to prevent that a
stopped queue would be restarted. Hence remove this function.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c           | 6 ------
 drivers/md/dm-rq.c       | 2 --
 drivers/nvme/host/core.c | 1 -
 include/linux/blk-mq.h   | 1 -
 4 files changed, 10 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d95034ae64f691..a461823644fbf5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -526,12 +526,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
 }
 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
-void blk_mq_cancel_requeue_work(struct request_queue *q)
-{
-	cancel_delayed_work_sync(&q->requeue_work);
-}
-EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
-
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
 	kblockd_schedule_delayed_work(&q->requeue_work, 0);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index a9e9e781bb770c..060ccc5a4b1ceb 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -116,8 +116,6 @@ static void dm_mq_stop_queue(struct request_queue *q)
 	queue_flag_set(QUEUE_FLAG_STOPPED, q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
-	/* Avoid that requeuing could restart the queue. */
-	blk_mq_cancel_requeue_work(q);
 	blk_mq_stop_hw_queues(q);
 }
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 329381a28edf8a..a764c2aa00a11c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2081,7 +2081,6 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
 		spin_unlock_irq(ns->queue->queue_lock);
 
-		blk_mq_cancel_requeue_work(ns->queue);
 		blk_mq_stop_hw_queues(ns->queue);
 	}
 	mutex_unlock(&ctrl->namespaces_mutex);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index aa930009fcd317..a85a20f80aaa37 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -217,7 +217,6 @@ void __blk_mq_end_request(struct request *rq, int error);
 
 void blk_mq_requeue_request(struct request *rq);
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
-void blk_mq_cancel_requeue_work(struct request_queue *q);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_abort_requeue_list(struct request_queue *q);

From 6a83e74d214a47a1371cd2e6a783264fcba7d428 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 2 Nov 2016 10:09:51 -0600
Subject: [PATCH 047/182] blk-mq: Introduce blk_mq_quiesce_queue()

blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations
have finished. This function does *not* wait until all outstanding
requests have finished (this means invocation of request.end_io()).
The algorithm used by blk_mq_quiesce_queue() is as follows:
* Hold either an RCU read lock or an SRCU read lock around
  .queue_rq() calls. The former is used if .queue_rq() does not
  block and the latter if .queue_rq() may block.
* blk_mq_quiesce_queue() first calls blk_mq_stop_hw_queues()
  followed by synchronize_srcu() or synchronize_rcu(). The latter
  call waits for .queue_rq() invocations that started before
  blk_mq_quiesce_queue() was called.
* The blk_mq_hctx_stopped() calls that control whether or not
  .queue_rq() will be called are called with the (S)RCU read lock
  held. This is necessary to avoid race conditions against
  blk_mq_quiesce_queue().

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Kconfig          |  1 +
 block/blk-mq.c         | 71 +++++++++++++++++++++++++++++++++++++-----
 include/linux/blk-mq.h |  3 ++
 include/linux/blkdev.h |  1 +
 4 files changed, 69 insertions(+), 7 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 6b0ad08f0677d2..3a024440a669e1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,6 +5,7 @@ menuconfig BLOCK
        bool "Enable the block layer" if EXPERT
        default y
        select SBITMAP
+       select SRCU
        help
 	 Provide block layer support for the kernel.
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a461823644fbf5..3dc32354329389 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -115,6 +115,33 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
+/**
+ * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
+ * @q: request queue.
+ *
+ * Note: this function does not prevent that the struct request end_io()
+ * callback function is invoked. Additionally, it is not prevented that
+ * new queue_rq() calls occur unless the queue has been stopped first.
+ */
+void blk_mq_quiesce_queue(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+	bool rcu = false;
+
+	blk_mq_stop_hw_queues(q);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (hctx->flags & BLK_MQ_F_BLOCKING)
+			synchronize_srcu(&hctx->queue_rq_srcu);
+		else
+			rcu = true;
+	}
+	if (rcu)
+		synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
+
 void blk_mq_wake_waiters(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
@@ -766,7 +793,7 @@ static inline unsigned int queued_to_index(unsigned int queued)
  * of IO. In particular, we'd like FIFO behaviour on handling existing
  * items on the hctx->dispatch list. Ignore that for now.
  */
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct request *rq;
@@ -778,9 +805,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	if (unlikely(blk_mq_hctx_stopped(hctx)))
 		return;
 
-	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-		cpu_online(hctx->next_cpu));
-
 	hctx->run++;
 
 	/*
@@ -871,6 +895,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	}
 }
 
+static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+	int srcu_idx;
+
+	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+		cpu_online(hctx->next_cpu));
+
+	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+		rcu_read_lock();
+		blk_mq_process_rq_list(hctx);
+		rcu_read_unlock();
+	} else {
+		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+		blk_mq_process_rq_list(hctx);
+		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+	}
+}
+
 /*
  * It'd be great if the workqueue API had a way to pass
  * in a mask and had some smarts for more clever placement.
@@ -1268,7 +1310,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
 	struct blk_mq_alloc_data data;
 	struct request *rq;
-	unsigned int request_count = 0;
+	unsigned int request_count = 0, srcu_idx;
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
 	blk_qc_t cookie;
@@ -1311,7 +1353,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_bio_to_request(rq, bio);
 
 		/*
-		 * We do limited pluging. If the bio can be merged, do that.
+		 * We do limited plugging. If the bio can be merged, do that.
 		 * Otherwise the existing request in the plug list will be
 		 * issued. So the plug list will have one request at most
 		 */
@@ -1331,7 +1373,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		if (!old_rq)
 			goto done;
-		blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+
+		if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
+			rcu_read_lock();
+			blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+			rcu_read_unlock();
+		} else {
+			srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
+			blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+			srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+		}
 		goto done;
 	}
 
@@ -1610,6 +1661,9 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 	if (set->ops->exit_hctx)
 		set->ops->exit_hctx(hctx, hctx_idx);
 
+	if (hctx->flags & BLK_MQ_F_BLOCKING)
+		cleanup_srcu_struct(&hctx->queue_rq_srcu);
+
 	blk_mq_remove_cpuhp(hctx);
 	blk_free_flush_queue(hctx->fq);
 	sbitmap_free(&hctx->ctx_map);
@@ -1690,6 +1744,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 				   flush_start_tag + hctx_idx, node))
 		goto free_fq;
 
+	if (hctx->flags & BLK_MQ_F_BLOCKING)
+		init_srcu_struct(&hctx->queue_rq_srcu);
+
 	return 0;
 
  free_fq:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a85a20f80aaa37..ed20ac74c62a9a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -3,6 +3,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
+#include <linux/srcu.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -35,6 +36,8 @@ struct blk_mq_hw_ctx {
 
 	struct blk_mq_tags	*tags;
 
+	struct srcu_struct	queue_rq_srcu;
+
 	unsigned long		queued;
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	7
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8396da2bb6988d..13d893a69b464d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -918,6 +918,7 @@ extern void __blk_run_queue(struct request_queue *q);
 extern void __blk_run_queue_uncond(struct request_queue *q);
 extern void blk_run_queue(struct request_queue *);
 extern void blk_run_queue_async(struct request_queue *q);
+extern void blk_mq_quiesce_queue(struct request_queue *q);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
 			   struct rq_map_data *, void __user *, unsigned long,
 			   gfp_t);

From 2b053aca76b48e681be57b34ca3a8c2c10b275c5 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:21:41 -0700
Subject: [PATCH 048/182] blk-mq: Add a kick_requeue_list argument to
 blk_mq_requeue_request()

Most blk_mq_requeue_request() and blk_mq_add_to_requeue_list() calls
are followed by kicking the requeue list. Hence add an argument to
these two functions that allows to kick the requeue list. This was
proposed by Christoph Hellwig.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-flush.c            |  5 +----
 block/blk-mq.c               | 10 +++++++---
 drivers/block/xen-blkfront.c |  2 +-
 drivers/md/dm-rq.c           |  2 +-
 drivers/nvme/host/core.c     |  2 +-
 drivers/scsi/scsi_lib.c      |  4 +---
 include/linux/blk-mq.h       |  5 +++--
 7 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index d35beca1848167..c486b7aa62eeb7 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -134,10 +134,7 @@ static void blk_flush_restore_request(struct request *rq)
 static bool blk_flush_queue_rq(struct request *rq, bool add_front)
 {
 	if (rq->q->mq_ops) {
-		struct request_queue *q = rq->q;
-
-		blk_mq_add_to_requeue_list(rq, add_front);
-		blk_mq_kick_requeue_list(q);
+		blk_mq_add_to_requeue_list(rq, add_front, true);
 		return false;
 	} else {
 		if (add_front)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3dc32354329389..8d3de5bd4d6f93 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -492,12 +492,12 @@ static void __blk_mq_requeue_request(struct request *rq)
 	}
 }
 
-void blk_mq_requeue_request(struct request *rq)
+void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 {
 	__blk_mq_requeue_request(rq);
 
 	BUG_ON(blk_queued_rq(rq));
-	blk_mq_add_to_requeue_list(rq, true);
+	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
@@ -531,7 +531,8 @@ static void blk_mq_requeue_work(struct work_struct *work)
 	blk_mq_run_hw_queues(q, false);
 }
 
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
+				bool kick_requeue_list)
 {
 	struct request_queue *q = rq->q;
 	unsigned long flags;
@@ -550,6 +551,9 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
 		list_add_tail(&rq->queuelist, &q->requeue_list);
 	}
 	spin_unlock_irqrestore(&q->requeue_lock, flags);
+
+	if (kick_requeue_list)
+		blk_mq_kick_requeue_list(q);
 }
 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 71ca36eab558ed..c000fdf048b224 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2043,7 +2043,7 @@ static int blkif_recover(struct blkfront_info *info)
 		/* Requeue pending requests (flush or discard) */
 		list_del_init(&req->queuelist);
 		BUG_ON(req->nr_phys_segments > segs);
-		blk_mq_requeue_request(req);
+		blk_mq_requeue_request(req, false);
 	}
 	blk_mq_start_stopped_hw_queues(info->rq, true);
 	blk_mq_kick_requeue_list(info->rq);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 060ccc5a4b1ceb..315257959fc0de 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -347,7 +347,7 @@ EXPORT_SYMBOL(dm_mq_kick_requeue_list);
 
 static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs)
 {
-	blk_mq_requeue_request(rq);
+	blk_mq_requeue_request(rq, false);
 	__dm_mq_kick_requeue_list(rq->q, msecs);
 }
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a764c2aa00a11c..e8070e3cc7c756 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -203,7 +203,7 @@ void nvme_requeue_req(struct request *req)
 {
 	unsigned long flags;
 
-	blk_mq_requeue_request(req);
+	blk_mq_requeue_request(req, false);
 	spin_lock_irqsave(req->q->queue_lock, flags);
 	if (!blk_queue_stopped(req->q))
 		blk_mq_kick_requeue_list(req->q);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2b78ff12bf3cc9..2e35132f8be17b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -86,10 +86,8 @@ scsi_set_blocked(struct scsi_cmnd *cmd, int reason)
 static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd)
 {
 	struct scsi_device *sdev = cmd->device;
-	struct request_queue *q = cmd->request->q;
 
-	blk_mq_requeue_request(cmd->request);
-	blk_mq_kick_requeue_list(q);
+	blk_mq_requeue_request(cmd->request, true);
 	put_device(&sdev->sdev_gendev);
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ed20ac74c62a9a..35a0af5ede6d1e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -218,8 +218,9 @@ void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, int error);
 void __blk_mq_end_request(struct request *rq, int error);
 
-void blk_mq_requeue_request(struct request *rq);
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
+void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
+				bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_abort_requeue_list(struct request_queue *q);

From f0d33ab76cfcd696328c1330e2462b5d62e74f00 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:22:00 -0700
Subject: [PATCH 049/182] dm: Use BLK_MQ_S_STOPPED instead of
 QUEUE_FLAG_STOPPED in blk-mq code

Instead of manipulating both QUEUE_FLAG_STOPPED and BLK_MQ_S_STOPPED
in the dm start and stop queue functions, only manipulate the latter
flag. Change blk_queue_stopped() tests into blk_mq_queue_stopped().

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm-rq.c | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 315257959fc0de..09c958b6f038f8 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -75,12 +75,6 @@ static void dm_old_start_queue(struct request_queue *q)
 
 static void dm_mq_start_queue(struct request_queue *q)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
 	blk_mq_start_stopped_hw_queues(q, true);
 	blk_mq_kick_requeue_list(q);
 }
@@ -105,16 +99,8 @@ static void dm_old_stop_queue(struct request_queue *q)
 
 static void dm_mq_stop_queue(struct request_queue *q)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (blk_queue_stopped(q)) {
-		spin_unlock_irqrestore(q->queue_lock, flags);
+	if (blk_mq_queue_stopped(q))
 		return;
-	}
-
-	queue_flag_set(QUEUE_FLAG_STOPPED, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	blk_mq_stop_hw_queues(q);
 }

From 7b17c2f7292ba1f3f98dae3f7077f9e569653276 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:22:16 -0700
Subject: [PATCH 050/182] dm: Fix a race condition related to stopping and
 starting queues

Ensure that all ongoing dm_mq_queue_rq() and dm_mq_requeue_request()
calls have stopped before setting the "queue stopped" flag. This
allows to remove the "queue stopped" test from dm_mq_queue_rq() and
dm_mq_requeue_request(). This patch fixes a race condition because
dm_mq_queue_rq() is called without holding the queue lock and hence
BLK_MQ_S_STOPPED can be set at any time while dm_mq_queue_rq() is
in progress. This patch prevents that the following hang occurs
sporadically when using dm-mq:

INFO: task systemd-udevd:10111 blocked for more than 480 seconds.
Call Trace:
 [<ffffffff8161f397>] schedule+0x37/0x90
 [<ffffffff816239ef>] schedule_timeout+0x27f/0x470
 [<ffffffff8161e76f>] io_schedule_timeout+0x9f/0x110
 [<ffffffff8161fb36>] bit_wait_io+0x16/0x60
 [<ffffffff8161f929>] __wait_on_bit_lock+0x49/0xa0
 [<ffffffff8114fe69>] __lock_page+0xb9/0xc0
 [<ffffffff81165d90>] truncate_inode_pages_range+0x3e0/0x760
 [<ffffffff81166120>] truncate_inode_pages+0x10/0x20
 [<ffffffff81212a20>] kill_bdev+0x30/0x40
 [<ffffffff81213d41>] __blkdev_put+0x71/0x360
 [<ffffffff81214079>] blkdev_put+0x49/0x170
 [<ffffffff812141c0>] blkdev_close+0x20/0x30
 [<ffffffff811d48e8>] __fput+0xe8/0x1f0
 [<ffffffff811d4a29>] ____fput+0x9/0x10
 [<ffffffff810842d3>] task_work_run+0x83/0xb0
 [<ffffffff8106606e>] do_exit+0x3ee/0xc40
 [<ffffffff8106694b>] do_group_exit+0x4b/0xc0
 [<ffffffff81073d9a>] get_signal+0x2ca/0x940
 [<ffffffff8101bf43>] do_signal+0x23/0x660
 [<ffffffff810022b3>] exit_to_usermode_loop+0x73/0xb0
 [<ffffffff81002cb0>] syscall_return_slowpath+0xb0/0xc0
 [<ffffffff81624e33>] entry_SYSCALL_64_fastpath+0xa6/0xa8

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm-rq.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 09c958b6f038f8..8b92e066bb6998 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -102,7 +102,7 @@ static void dm_mq_stop_queue(struct request_queue *q)
 	if (blk_mq_queue_stopped(q))
 		return;
 
-	blk_mq_stop_hw_queues(q);
+	blk_mq_quiesce_queue(q);
 }
 
 void dm_stop_queue(struct request_queue *q)
@@ -880,17 +880,6 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 		dm_put_live_table(md, srcu_idx);
 	}
 
-	/*
-	 * On suspend dm_stop_queue() handles stopping the blk-mq
-	 * request_queue BUT: even though the hw_queues are marked
-	 * BLK_MQ_S_STOPPED at that point there is still a race that
-	 * is allowing block/blk-mq.c to call ->queue_rq against a
-	 * hctx that it really shouldn't.  The following check guards
-	 * against this rarity (albeit _not_ race-free).
-	 */
-	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
-		return BLK_MQ_RQ_QUEUE_BUSY;
-
 	if (ti->type->busy && ti->type->busy(ti))
 		return BLK_MQ_RQ_QUEUE_BUSY;
 

From 3174dd33fa8316d29f238f093291ca9bb25e6f58 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:23:19 -0700
Subject: [PATCH 051/182] nvme: Fix a race condition related to stopping queues

Avoid that nvme_queue_rq() is still running when nvme_stop_queues()
returns.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e8070e3cc7c756..75565b5951d92a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2081,7 +2081,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
 		spin_unlock_irq(ns->queue->queue_lock);
 
-		blk_mq_stop_hw_queues(ns->queue);
+		blk_mq_quiesce_queue(ns->queue);
 	}
 	mutex_unlock(&ctrl->namespaces_mutex);
 }

From a6eaa8849f92a40f8894d6d0dcab2c16243aaf20 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Fri, 28 Oct 2016 17:23:40 -0700
Subject: [PATCH 052/182] nvme: Use BLK_MQ_S_STOPPED instead of
 QUEUE_FLAG_STOPPED in blk-mq code

Make nvme_requeue_req() check BLK_MQ_S_STOPPED instead of
QUEUE_FLAG_STOPPED. Remove the QUEUE_FLAG_STOPPED manipulations
that became superfluous because of this change. Change
blk_queue_stopped() tests into blk_mq_queue_stopped().

This patch fixes a race condition: using queue_flag_clear_unlocked()
is not safe if any other function that manipulates the queue flags
can be called concurrently, e.g. blk_cleanup_queue().

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 75565b5951d92a..ef34f2f3566ae6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -201,13 +201,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
 
 void nvme_requeue_req(struct request *req)
 {
-	unsigned long flags;
-
-	blk_mq_requeue_request(req, false);
-	spin_lock_irqsave(req->q->queue_lock, flags);
-	if (!blk_queue_stopped(req->q))
-		blk_mq_kick_requeue_list(req->q);
-	spin_unlock_irqrestore(req->q->queue_lock, flags);
+	blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
 }
 EXPORT_SYMBOL_GPL(nvme_requeue_req);
 
@@ -2076,13 +2070,8 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns;
 
 	mutex_lock(&ctrl->namespaces_mutex);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
-		spin_lock_irq(ns->queue->queue_lock);
-		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
-		spin_unlock_irq(ns->queue->queue_lock);
-
+	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_mq_quiesce_queue(ns->queue);
-	}
 	mutex_unlock(&ctrl->namespaces_mutex);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
@@ -2093,7 +2082,6 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 
 	mutex_lock(&ctrl->namespaces_mutex);
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
-		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
 		blk_mq_start_stopped_hw_queues(ns->queue, true);
 		blk_mq_kick_requeue_list(ns->queue);
 	}

From 46f3cc176210b2db5db89d9c0e62796cad7e3cdc Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Thu, 3 Nov 2016 03:38:54 -0600
Subject: [PATCH 053/182] block: drop q argument from bsg_validate_sgv4_hdr

bsg_validate_sgv4_hdr() doesn't care about the request_queue, so drop it
from it's arguments.

Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bsg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/bsg.c b/block/bsg.c
index d214e929ce1855..8a05a404ae7085 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -176,7 +176,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
  * Check if sg_io_v4 from user is allowed and valid
  */
 static int
-bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw)
+bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *rw)
 {
 	int ret = 0;
 
@@ -226,7 +226,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
 		hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
 		hdr->din_xfer_len);
 
-	ret = bsg_validate_sgv4_hdr(q, hdr, &rw);
+	ret = bsg_validate_sgv4_hdr(hdr, &rw);
 	if (ret)
 		return ERR_PTR(ret);
 

From 50d24c34403c62ad29e8b6db559d491bae20b4b7 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 3 Nov 2016 17:03:53 -0700
Subject: [PATCH 054/182] block: immediately dispatch big size request

Currently block plug holds up to 16 non-mergeable requests. This makes
sense if the request size is small, eg, reduce lock contention. But if
request size is big enough, we don't need to worry about lock
contention. Holding such request makes no sense and it lows the disk
utilization.

In practice, this improves 10% throughput for my raid5 sequential write
workload.

The size (128k) is arbitrary right now, but it makes sure lock
contention is small. This probably could be more intelligent, eg, check
average request size holded. Since this is mainly for sequential IO,
probably not worthy.

V2: check the last request instead of the first request, so as long as
there is one big size request we flush the plug.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 4 +++-
 include/linux/blkdev.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 0bfaa54d3e9f5d..2deca48a4a0545 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1746,7 +1746,9 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 		if (!request_count)
 			trace_block_plug(q);
 		else {
-			if (request_count >= BLK_MAX_REQUEST_COUNT) {
+			struct request *last = list_entry_rq(plug->list.prev);
+			if (request_count >= BLK_MAX_REQUEST_COUNT ||
+			    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
 				blk_flush_plug_list(plug, false);
 				trace_block_plug(q);
 			}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 13d893a69b464d..9189a2d5c392a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1173,6 +1173,7 @@ struct blk_plug {
 	struct list_head cb_list; /* md requires an unplug callback */
 };
 #define BLK_MAX_REQUEST_COUNT 16
+#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
 
 struct blk_plug_cb;
 typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool);

From 600271d9000027c013c01be87cbb90a5a18c5c3f Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 3 Nov 2016 17:03:54 -0700
Subject: [PATCH 055/182] blk-mq: immediately dispatch big size request

This is corresponding part for blk-mq. Disk with multiple hardware
queues doesn't need this as we only hold 1 request at most.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8d3de5bd4d6f93..077c2416a95565 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1453,13 +1453,18 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	 */
 	plug = current->plug;
 	if (plug) {
+		struct request *last = NULL;
+
 		blk_mq_bio_to_request(rq, bio);
 		if (!request_count)
 			trace_block_plug(q);
+		else
+			last = list_entry_rq(plug->mq_list.prev);
 
 		blk_mq_put_ctx(data.ctx);
 
-		if (request_count >= BLK_MAX_REQUEST_COUNT) {
+		if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
+		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
 			blk_flush_plug_list(plug, false);
 			trace_block_plug(q);
 		}

From d278d4a8892f13b6a9eb6102b356402f0e062324 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 30 Mar 2016 10:21:08 -0600
Subject: [PATCH 056/182] block: add code to track actual device queue depth

For blk-mq, ->nr_requests does track queue depth, at least at init
time. But for the older queue paths, it's simply a soft setting.
On top of that, it's generally larger than the hardware setting
on purpose, to allow backup of requests for merging.

Fill a hole in struct request with a 'queue_depth' member, that
drivers can call to more closely inform the block layer of the
real queue depth.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 block/blk-settings.c   | 12 ++++++++++++
 drivers/scsi/scsi.c    |  3 +++
 include/linux/blkdev.h | 11 +++++++++++
 3 files changed, 26 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 55369a65dea2a8..9cf0537593630f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -836,6 +836,18 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
 }
 EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 
+/**
+ * blk_set_queue_depth - tell the block layer about the device queue depth
+ * @q:		the request queue for the device
+ * @depth:		queue depth
+ *
+ */
+void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
+{
+	q->queue_depth = depth;
+}
+EXPORT_SYMBOL(blk_set_queue_depth);
+
 /**
  * blk_queue_write_cache - configure queue's write cache
  * @q:		the request queue for the device
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1deb6adc411f79..75455d4dab68b5 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
 		wmb();
 	}
 
+	if (sdev->request_queue)
+		blk_set_queue_depth(sdev->request_queue, depth);
+
 	return sdev->queue_depth;
 }
 EXPORT_SYMBOL(scsi_change_queue_depth);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9189a2d5c392a8..d364be6e695983 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -405,6 +405,8 @@ struct request_queue {
 	struct blk_mq_ctx __percpu	*queue_ctx;
 	unsigned int		nr_queues;
 
+	unsigned int		queue_depth;
+
 	/* hw dispatch queues */
 	struct blk_mq_hw_ctx	**queue_hw_ctx;
 	unsigned int		nr_hw_queues;
@@ -777,6 +779,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
 	return false;
 }
 
+static inline unsigned int blk_queue_depth(struct request_queue *q)
+{
+	if (q->queue_depth)
+		return q->queue_depth;
+
+	return q->nr_requests;
+}
+
 /*
  * q->prep_rq_fn return values
  */
@@ -1094,6 +1104,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
+extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
 extern void blk_set_default_limits(struct queue_limits *lim);
 extern void blk_set_stacking_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,

From c02ebfdddbafa9a6a0f52fbd715e6bfa229af9d3 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Date: Wed, 28 Sep 2016 00:24:24 -0300
Subject: [PATCH 057/182] blk-mq: Always schedule hctx->next_cpu

Commit 0e87e58bf60e ("blk-mq: improve warning for running a queue on the
wrong CPU") attempts to avoid triggering the WARN_ON in
__blk_mq_run_hw_queue when the expected CPU is dead.  Problem is, in the
last batch execution before round robin, blk_mq_hctx_next_cpu can
schedule a dead CPU and also update next_cpu to the next alive CPU in
the mask, which will trigger the WARN_ON despite the previous
workaround.

The following patch fixes this scenario by always scheduling the value
in hctx->next_cpu.  This changes the moment when we round-robin the CPU
running the hctx, but it really doesn't matter, since it still executes
BLK_MQ_CPU_WORK_BATCH times in a row before switching to another CPU.

Fixes: 0e87e58bf60e ("blk-mq: improve warning for running a queue on the wrong CPU")
Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 077c2416a95565..6f5cb3f3dcacff 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -929,7 +929,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 		return WORK_CPU_UNBOUND;
 
 	if (--hctx->next_cpu_batch <= 0) {
-		int cpu = hctx->next_cpu, next_cpu;
+		int next_cpu;
 
 		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
 		if (next_cpu >= nr_cpu_ids)
@@ -937,8 +937,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
 		hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
-		return cpu;
 	}
 
 	return hctx->next_cpu;

From feebd5687257750bb75eaf5188b93756994b8c41 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 27 Oct 2016 09:04:44 +0200
Subject: [PATCH 058/182] pktcdvd: don't scribble over the bvec array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hi Peter, hi Jens,

I've been looking over the multi page bio vec work again recently, and
one of the stumbling blocks is raw biovec access in the pktcdvd.

The first issue is that it directly sets up the page and offset pointers
in the biovec just before calling bio_add_page.  As bio_add_page already
does the setup it's trivial to just switch it to stack variables for the
arguments.

The second issue is the copy code in pkt_make_local_copy, which
effectively is an opencoded version of bio_copy_data except that it
skips pages that already are the same in the ѕource and destination.
But we look at the only calleer we just set up the bio using bio_add_page
to point exactly to the page array that pkt_make_local_copy compares,
so the pages will always be the same and we can just remove this function.

Note that all this is done based on code inspection, I don't have any
packet writing hardware myself.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/pktcdvd.c | 47 ++++++-----------------------------------
 1 file changed, 6 insertions(+), 41 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 7cf795e0fc8d63..95c98de929714c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -944,39 +944,6 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que
 	}
 }
 
-/*
- * Copy all data for this packet to pkt->pages[], so that
- * a) The number of required segments for the write bio is minimized, which
- *    is necessary for some scsi controllers.
- * b) The data can be used as cache to avoid read requests if we receive a
- *    new write request for the same zone.
- */
-static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
-{
-	int f, p, offs;
-
-	/* Copy all data to pkt->pages[] */
-	p = 0;
-	offs = 0;
-	for (f = 0; f < pkt->frames; f++) {
-		if (bvec[f].bv_page != pkt->pages[p]) {
-			void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset;
-			void *vto = page_address(pkt->pages[p]) + offs;
-			memcpy(vto, vfrom, CD_FRAMESIZE);
-			kunmap_atomic(vfrom);
-			bvec[f].bv_page = pkt->pages[p];
-			bvec[f].bv_offset = offs;
-		} else {
-			BUG_ON(bvec[f].bv_offset != offs);
-		}
-		offs += CD_FRAMESIZE;
-		if (offs >= PAGE_SIZE) {
-			offs = 0;
-			p++;
-		}
-	}
-}
-
 static void pkt_end_io_read(struct bio *bio)
 {
 	struct packet_data *pkt = bio->bi_private;
@@ -1298,7 +1265,6 @@ static int pkt_handle_queue(struct pktcdvd_device *pd)
 static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 {
 	int f;
-	struct bio_vec *bvec = pkt->w_bio->bi_io_vec;
 
 	bio_reset(pkt->w_bio);
 	pkt->w_bio->bi_iter.bi_sector = pkt->sector;
@@ -1308,9 +1274,10 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 
 	/* XXX: locking? */
 	for (f = 0; f < pkt->frames; f++) {
-		bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
-		bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
-		if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
+		struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
+		unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+
+		if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset))
 			BUG();
 	}
 	pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt);
@@ -1327,12 +1294,10 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 	pkt_dbg(2, pd, "Writing %d frames for zone %llx\n",
 		pkt->write_size, (unsigned long long)pkt->sector);
 
-	if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
-		pkt_make_local_copy(pkt, bvec);
+	if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames))
 		pkt->cache_valid = 1;
-	} else {
+	else
 		pkt->cache_valid = 0;
-	}
 
 	/* Start the write request */
 	atomic_set(&pkt->io_wait, 1);

From 180b0ae77d49dd63183b18609f3601430582c403 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 7 Nov 2016 11:14:07 -0800
Subject: [PATCH 059/182] skd: use pci_alloc_irq_vectors

Switch the skd driver to use pci_alloc_irq_vectors.  We need to two calls to
pci_alloc_irq_vectors as skd only supports multiple MSI-X vectors, but not
multiple MSI vectors.

Otherwise this cleans up a lot of cruft and allows to a lot more common code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/skd_main.c | 212 ++++++++++++---------------------------
 1 file changed, 66 insertions(+), 146 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 3822eae102db33..702f543a4a64ad 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -270,8 +270,6 @@ struct skd_device {
 	resource_size_t mem_phys[SKD_MAX_BARS];
 	u32 mem_size[SKD_MAX_BARS];
 
-	skd_irq_type_t irq_type;
-	u32 msix_count;
 	struct skd_msix_entry *msix_entries;
 
 	struct pci_dev *pdev;
@@ -3821,10 +3819,6 @@ static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data)
  */
 
 struct skd_msix_entry {
-	int have_irq;
-	u32 vector;
-	u32 entry;
-	struct skd_device *rsp;
 	char isr_name[30];
 };
 
@@ -3853,56 +3847,21 @@ static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = {
 	{ "(Queue Full 3)", skd_qfull_isr    },
 };
 
-static void skd_release_msix(struct skd_device *skdev)
-{
-	struct skd_msix_entry *qentry;
-	int i;
-
-	if (skdev->msix_entries) {
-		for (i = 0; i < skdev->msix_count; i++) {
-			qentry = &skdev->msix_entries[i];
-			skdev = qentry->rsp;
-
-			if (qentry->have_irq)
-				devm_free_irq(&skdev->pdev->dev,
-					      qentry->vector, qentry->rsp);
-		}
-
-		kfree(skdev->msix_entries);
-	}
-
-	if (skdev->msix_count)
-		pci_disable_msix(skdev->pdev);
-
-	skdev->msix_count = 0;
-	skdev->msix_entries = NULL;
-}
-
 static int skd_acquire_msix(struct skd_device *skdev)
 {
 	int i, rc;
 	struct pci_dev *pdev = skdev->pdev;
-	struct msix_entry *entries;
-	struct skd_msix_entry *qentry;
-
-	entries = kzalloc(sizeof(struct msix_entry) * SKD_MAX_MSIX_COUNT,
-			  GFP_KERNEL);
-	if (!entries)
-		return -ENOMEM;
 
-	for (i = 0; i < SKD_MAX_MSIX_COUNT; i++)
-		entries[i].entry = i;
-
-	rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT);
-	if (rc) {
+	rc = pci_alloc_irq_vectors(pdev, SKD_MAX_MSIX_COUNT, SKD_MAX_MSIX_COUNT,
+			PCI_IRQ_MSIX);
+	if (rc < 0) {
 		pr_err("(%s): failed to enable MSI-X %d\n",
 		       skd_name(skdev), rc);
 		goto msix_out;
 	}
 
-	skdev->msix_count = SKD_MAX_MSIX_COUNT;
-	skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) *
-				      skdev->msix_count, GFP_KERNEL);
+	skdev->msix_entries = kcalloc(SKD_MAX_MSIX_COUNT,
+			sizeof(struct skd_msix_entry), GFP_KERNEL);
 	if (!skdev->msix_entries) {
 		rc = -ENOMEM;
 		pr_err("(%s): msix table allocation error\n",
@@ -3910,136 +3869,98 @@ static int skd_acquire_msix(struct skd_device *skdev)
 		goto msix_out;
 	}
 
-	for (i = 0; i < skdev->msix_count; i++) {
-		qentry = &skdev->msix_entries[i];
-		qentry->vector = entries[i].vector;
-		qentry->entry = entries[i].entry;
-		qentry->rsp = NULL;
-		qentry->have_irq = 0;
-		pr_debug("%s:%s:%d %s: <%s> msix (%d) vec %d, entry %x\n",
-			 skdev->name, __func__, __LINE__,
-			 pci_name(pdev), skdev->name,
-			 i, qentry->vector, qentry->entry);
-	}
-
 	/* Enable MSI-X vectors for the base queue */
-	for (i = 0; i < skdev->msix_count; i++) {
-		qentry = &skdev->msix_entries[i];
+	for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) {
+		struct skd_msix_entry *qentry = &skdev->msix_entries[i];
+
 		snprintf(qentry->isr_name, sizeof(qentry->isr_name),
 			 "%s%d-msix %s", DRV_NAME, skdev->devno,
 			 msix_entries[i].name);
-		rc = devm_request_irq(&skdev->pdev->dev, qentry->vector,
-				      msix_entries[i].handler, 0,
-				      qentry->isr_name, skdev);
+
+		rc = devm_request_irq(&skdev->pdev->dev,
+				pci_irq_vector(skdev->pdev, i),
+				msix_entries[i].handler, 0,
+				qentry->isr_name, skdev);
 		if (rc) {
 			pr_err("(%s): Unable to register(%d) MSI-X "
 			       "handler %d: %s\n",
 			       skd_name(skdev), rc, i, qentry->isr_name);
 			goto msix_out;
-		} else {
-			qentry->have_irq = 1;
-			qentry->rsp = skdev;
 		}
 	}
+
 	pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n",
 		 skdev->name, __func__, __LINE__,
-		 pci_name(pdev), skdev->name, skdev->msix_count);
+		 pci_name(pdev), skdev->name, SKD_MAX_MSIX_COUNT);
 	return 0;
 
 msix_out:
-	if (entries)
-		kfree(entries);
-	skd_release_msix(skdev);
+	while (--i >= 0)
+		devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), skdev);
+	kfree(skdev->msix_entries);
+	skdev->msix_entries = NULL;
 	return rc;
 }
 
 static int skd_acquire_irq(struct skd_device *skdev)
 {
+	struct pci_dev *pdev = skdev->pdev;
+	unsigned int irq_flag = PCI_IRQ_LEGACY;
 	int rc;
-	struct pci_dev *pdev;
 
-	pdev = skdev->pdev;
-	skdev->msix_count = 0;
-
-RETRY_IRQ_TYPE:
-	switch (skdev->irq_type) {
-	case SKD_IRQ_MSIX:
+	if (skd_isr_type == SKD_IRQ_MSIX) {
 		rc = skd_acquire_msix(skdev);
 		if (!rc)
-			pr_info("(%s): MSI-X %d irqs enabled\n",
-			       skd_name(skdev), skdev->msix_count);
-		else {
-			pr_err(
-			       "(%s): failed to enable MSI-X, re-trying with MSI %d\n",
-			       skd_name(skdev), rc);
-			skdev->irq_type = SKD_IRQ_MSI;
-			goto RETRY_IRQ_TYPE;
-		}
-		break;
-	case SKD_IRQ_MSI:
-		snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d-msi",
-			 DRV_NAME, skdev->devno);
-		rc = pci_enable_msi_range(pdev, 1, 1);
-		if (rc > 0) {
-			rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, 0,
-					      skdev->isr_name, skdev);
-			if (rc) {
-				pci_disable_msi(pdev);
-				pr_err(
-				       "(%s): failed to allocate the MSI interrupt %d\n",
-				       skd_name(skdev), rc);
-				goto RETRY_IRQ_LEGACY;
-			}
-			pr_info("(%s): MSI irq %d enabled\n",
-			       skd_name(skdev), pdev->irq);
-		} else {
-RETRY_IRQ_LEGACY:
-			pr_err(
-			       "(%s): failed to enable MSI, re-trying with LEGACY %d\n",
-			       skd_name(skdev), rc);
-			skdev->irq_type = SKD_IRQ_LEGACY;
-			goto RETRY_IRQ_TYPE;
-		}
-		break;
-	case SKD_IRQ_LEGACY:
-		snprintf(skdev->isr_name, sizeof(skdev->isr_name),
-			 "%s%d-legacy", DRV_NAME, skdev->devno);
-		rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr,
-				      IRQF_SHARED, skdev->isr_name, skdev);
-		if (!rc)
-			pr_info("(%s): LEGACY irq %d enabled\n",
-			       skd_name(skdev), pdev->irq);
-		else
-			pr_err("(%s): request LEGACY irq error %d\n",
-			       skd_name(skdev), rc);
-		break;
-	default:
-		pr_info("(%s): irq_type %d invalid, re-set to %d\n",
-		       skd_name(skdev), skdev->irq_type, SKD_IRQ_DEFAULT);
-		skdev->irq_type = SKD_IRQ_LEGACY;
-		goto RETRY_IRQ_TYPE;
+			return 0;
+
+		pr_err("(%s): failed to enable MSI-X, re-trying with MSI %d\n",
+		       skd_name(skdev), rc);
 	}
-	return rc;
+
+	snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d", DRV_NAME,
+			skdev->devno);
+
+	if (skd_isr_type != SKD_IRQ_LEGACY)
+		irq_flag |= PCI_IRQ_MSI;
+	rc = pci_alloc_irq_vectors(pdev, 1, 1, irq_flag);
+	if (rc < 0) {
+		pr_err("(%s): failed to allocate the MSI interrupt %d\n",
+			skd_name(skdev), rc);
+		return rc;
+	}
+
+	rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr,
+			pdev->msi_enabled ? 0 : IRQF_SHARED,
+			skdev->isr_name, skdev);
+	if (rc) {
+		pci_free_irq_vectors(pdev);
+		pr_err("(%s): failed to allocate interrupt %d\n",
+			skd_name(skdev), rc);
+		return rc;
+	}
+
+	return 0;
 }
 
 static void skd_release_irq(struct skd_device *skdev)
 {
-	switch (skdev->irq_type) {
-	case SKD_IRQ_MSIX:
-		skd_release_msix(skdev);
-		break;
-	case SKD_IRQ_MSI:
-		devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
-		pci_disable_msi(skdev->pdev);
-		break;
-	case SKD_IRQ_LEGACY:
-		devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
-		break;
-	default:
-		pr_err("(%s): wrong irq type %d!",
-		       skd_name(skdev), skdev->irq_type);
-		break;
+	struct pci_dev *pdev = skdev->pdev;
+
+	if (skdev->msix_entries) {
+		int i;
+
+		for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) {
+			devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i),
+					skdev);
+		}
+
+		kfree(skdev->msix_entries);
+		skdev->msix_entries = NULL;
+	} else {
+		devm_free_irq(&pdev->dev, pdev->irq, skdev);
 	}
+
+	pci_free_irq_vectors(pdev);
 }
 
 /*
@@ -4402,7 +4323,6 @@ static struct skd_device *skd_construct(struct pci_dev *pdev)
 	skdev->pdev = pdev;
 	skdev->devno = skd_next_devno++;
 	skdev->major = blk_major;
-	skdev->irq_type = skd_isr_type;
 	sprintf(skdev->name, DRV_NAME "%d", skdev->devno);
 	skdev->dev_max_queue_depth = 0;
 

From 8e1de26cd5db192b52819f5ee97bab308964f3ab Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Sat, 22 Oct 2016 19:52:01 +0300
Subject: [PATCH 060/182] skd_main: use %*ph to dump small buffers

Replace custom approach by %*ph specifier to dump small buffers in hex format.

Unfortunately we can't use print_hex_dump_bytes() here since tha gap is
present, though one familiar with the code may change this.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/skd_main.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 702f543a4a64ad..a58256cd94d73d 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -2136,12 +2136,8 @@ static void skd_send_fitmsg(struct skd_device *skdev,
 		u8 *bp = (u8 *)skmsg->msg_buf;
 		int i;
 		for (i = 0; i < skmsg->length; i += 8) {
-			pr_debug("%s:%s:%d msg[%2d] %02x %02x %02x %02x "
-				 "%02x %02x %02x %02x\n",
-				 skdev->name, __func__, __LINE__,
-				 i, bp[i + 0], bp[i + 1], bp[i + 2],
-				 bp[i + 3], bp[i + 4], bp[i + 5],
-				 bp[i + 6], bp[i + 7]);
+			pr_debug("%s:%s:%d msg[%2d] %8ph\n",
+				 skdev->name, __func__, __LINE__, i, &bp[i]);
 			if (i == 0)
 				i = 64 - 8;
 		}
@@ -2162,7 +2158,6 @@ static void skd_send_fitmsg(struct skd_device *skdev,
 		qcmd |= FIT_QCMD_MSGSIZE_64;
 
 	SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
-
 }
 
 static void skd_send_special_fitmsg(struct skd_device *skdev,
@@ -2175,11 +2170,8 @@ static void skd_send_special_fitmsg(struct skd_device *skdev,
 		int i;
 
 		for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) {
-			pr_debug("%s:%s:%d  spcl[%2d] %02x %02x %02x %02x  "
-				 "%02x %02x %02x %02x\n",
-				 skdev->name, __func__, __LINE__, i,
-				 bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3],
-				 bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]);
+			pr_debug("%s:%s:%d  spcl[%2d] %8ph\n",
+				 skdev->name, __func__, __LINE__, i, &bp[i]);
 			if (i == 0)
 				i = 64 - 8;
 		}

From b57d74aff9ab92fbfb7c197c384d1adfa2827b2e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 1 Sep 2016 10:20:33 -0600
Subject: [PATCH 061/182] writeback: track if we're sleeping on progress in
 balance_dirty_pages()

Note in the bdi_writeback structure whenever a task ends up sleeping
waiting for progress. We can use that information in the lower layers
to increase the priority of writes.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 include/linux/backing-dev-defs.h | 2 ++
 mm/backing-dev.c                 | 1 +
 mm/page-writeback.c              | 1 +
 3 files changed, 4 insertions(+)

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c357f27d54835f..dc5f76d7f648b2 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -116,6 +116,8 @@ struct bdi_writeback {
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
 
+	unsigned long dirty_sleep;	/* last wait */
+
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fde443f36d778..3bfed5ab2475cb 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 	spin_lock_init(&wb->work_lock);
 	INIT_LIST_HEAD(&wb->work_list);
 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+	wb->dirty_sleep = jiffies;
 
 	wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
 	if (!wb->congested)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439cc63ad903fb..52e2f8e3b4721c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1778,6 +1778,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 					  pause,
 					  start_time);
 		__set_current_state(TASK_KILLABLE);
+		wb->dirty_sleep = now;
 		io_schedule_timeout(pause);
 
 		current->dirty_paused_when = now + pause;

From ae5b2ec8ad5e017126cd4552220f25ce8a6b92e9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 8 Nov 2016 19:39:28 -0700
Subject: [PATCH 062/182] block: set REQ_SYNC if we clear REQ_FUA|REQ_PREFLUSH

If we insert a flush request, we clear REQ_PREFLUSH and/or REQ_FUA,
depending on flush settings. Since op_is_sync() factors those flags
in for deciding whether this request is sync or not, we should
set REQ_SYNC to avoid screwing up this accounting.

This should be less fragile.

Reported-by: Logan Gunthorpe <logang@deltatee.com>
Fixes: b685d3d65ac ("block: treat REQ_FUA and REQ_PREFLUSH as synchronous")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-flush.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index c486b7aa62eeb7..1bdbb3d3e5f579 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -395,6 +395,13 @@ void blk_insert_flush(struct request *rq)
 	if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
 		rq->cmd_flags &= ~REQ_FUA;
 
+	/*
+	 * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any
+	 * of those flags, we have to set REQ_SYNC to avoid skewing
+	 * the request accounting.
+	 */
+	rq->cmd_flags |= REQ_SYNC;
+
 	/*
 	 * An empty flush handed down from a stacking driver may
 	 * translate into nothing if the underlying device does not

From 3bc8492f009c76a9e77cad6e200b0748d3972fa0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 9 Nov 2016 13:55:34 +0100
Subject: [PATCH 063/182] skd: fix msix error handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As reported by gcc -Wmaybe-uninitialized, the cleanup path for
skd_acquire_msix tries to free the already allocated msi-x vectors
in reverse order, but the index variable may not have been
used yet:

drivers/block/skd_main.c: In function ‘skd_acquire_irq’:
drivers/block/skd_main.c:3890:8: error: ‘i’ may be used uninitialized in this function [-Werror=maybe-uninitialized]

This changes the failure path to skip releasing the interrupts
if we have not started requesting them yet.

Fixes: 180b0ae77d49 ("skd: use pci_alloc_irq_vectors")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/skd_main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index a58256cd94d73d..66146b34922909 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -3849,7 +3849,7 @@ static int skd_acquire_msix(struct skd_device *skdev)
 	if (rc < 0) {
 		pr_err("(%s): failed to enable MSI-X %d\n",
 		       skd_name(skdev), rc);
-		goto msix_out;
+		goto out;
 	}
 
 	skdev->msix_entries = kcalloc(SKD_MAX_MSIX_COUNT,
@@ -3858,7 +3858,7 @@ static int skd_acquire_msix(struct skd_device *skdev)
 		rc = -ENOMEM;
 		pr_err("(%s): msix table allocation error\n",
 		       skd_name(skdev));
-		goto msix_out;
+		goto out;
 	}
 
 	/* Enable MSI-X vectors for the base queue */
@@ -3889,6 +3889,7 @@ static int skd_acquire_msix(struct skd_device *skdev)
 msix_out:
 	while (--i >= 0)
 		devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), skdev);
+out:
 	kfree(skdev->msix_entries);
 	skdev->msix_entries = NULL;
 	return rc;

From 41c9499b221ebe102f776589085cde43d0dadc46 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 9 Nov 2016 13:55:35 +0100
Subject: [PATCH 064/182] skd: fix function prototype
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Building with W=1 shows a harmless warning for the skd driver:

drivers/block/skd_main.c:2959:1: error: ‘static’ is not at beginning of declaration [-Werror=old-style-declaration]

This changes the prototype to the expected formatting.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/skd_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 66146b34922909..1e536b97f802a2 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -2945,8 +2945,8 @@ static void skd_completion_worker(struct work_struct *work)
 
 static void skd_isr_msg_from_dev(struct skd_device *skdev);
 
-irqreturn_t
-static skd_isr(int irq, void *ptr)
+static irqreturn_t
+skd_isr(int irq, void *ptr)
 {
 	struct skd_device *skdev;
 	u32 intstat;

From d49187e97e94e2eb613cb6fed810356972077cc3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Nov 2016 07:32:33 -0800
Subject: [PATCH 065/182] nvme: introduce struct nvme_request

This adds a shared per-request structure for all NVMe I/O.  This structure
is embedded as the first member in all NVMe transport drivers request
private data and allows to implement common functionality between the
drivers.

The first use is to replace the current abuse of the SCSI command
passthrough fields in struct request for the NVMe command passthrough,
but it will grow a field more fields to allow implementing things
like common abort handlers in the future.

The passthrough commands are handled by having a pointer to the SQE
(struct nvme_command) in struct nvme_request, and the union of the
possible result fields, which had to be turned from an anonymous
into a named union for that purpose.  This avoids having to pass
a reference to a full CQE around and thus makes checking the result
a lot more lightweight.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c          | 28 +++++++++++++---------------
 drivers/nvme/host/fabrics.c       | 26 +++++++++++++-------------
 drivers/nvme/host/lightnvm.c      | 31 +++++++------------------------
 drivers/nvme/host/nvme.h          | 16 +++++++++++++++-
 drivers/nvme/host/pci.c           |  4 ++--
 drivers/nvme/host/rdma.c          | 11 +++--------
 drivers/nvme/target/core.c        |  8 ++++----
 drivers/nvme/target/fabrics-cmd.c | 14 +++++++-------
 drivers/nvme/target/loop.c        | 12 ++++++------
 drivers/nvme/target/nvmet.h       |  2 +-
 include/linux/nvme.h              | 10 +++++-----
 11 files changed, 76 insertions(+), 86 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ef34f2f3566ae6..2fd632bcd97501 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -221,8 +221,7 @@ struct request *nvme_alloc_request(struct request_queue *q,
 
 	req->cmd_type = REQ_TYPE_DRV_PRIV;
 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
-	req->cmd = (unsigned char *)cmd;
-	req->cmd_len = sizeof(struct nvme_command);
+	nvme_req(req)->cmd = cmd;
 
 	return req;
 }
@@ -321,7 +320,7 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 	int ret = 0;
 
 	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
-		memcpy(cmd, req->cmd, sizeof(*cmd));
+		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
 	else if (req_op(req) == REQ_OP_FLUSH)
 		nvme_setup_flush(ns, cmd);
 	else if (req_op(req) == REQ_OP_DISCARD)
@@ -338,7 +337,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
  * if the result is positive, it's an NVM Express status code
  */
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
-		struct nvme_completion *cqe, void *buffer, unsigned bufflen,
+		union nvme_result *result, void *buffer, unsigned bufflen,
 		unsigned timeout, int qid, int at_head, int flags)
 {
 	struct request *req;
@@ -349,7 +348,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		return PTR_ERR(req);
 
 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
-	req->special = cqe;
 
 	if (buffer && bufflen) {
 		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
@@ -358,6 +356,8 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 	}
 
 	blk_execute_rq(req->q, NULL, req, at_head);
+	if (result)
+		*result = nvme_req(req)->result;
 	ret = req->errors;
  out:
 	blk_mq_free_request(req);
@@ -379,7 +379,6 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
 		u32 *result, unsigned timeout)
 {
 	bool write = nvme_is_write(cmd);
-	struct nvme_completion cqe;
 	struct nvme_ns *ns = q->queuedata;
 	struct gendisk *disk = ns ? ns->disk : NULL;
 	struct request *req;
@@ -392,7 +391,6 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
 		return PTR_ERR(req);
 
 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
-	req->special = &cqe;
 
 	if (ubuffer && bufflen) {
 		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
@@ -447,7 +445,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
 	blk_execute_rq(req->q, disk, req, 0);
 	ret = req->errors;
 	if (result)
-		*result = le32_to_cpu(cqe.result);
+		*result = le32_to_cpu(nvme_req(req)->result.u32);
 	if (meta && !ret && !write) {
 		if (copy_to_user(meta_buffer, meta, meta_len))
 			ret = -EFAULT;
@@ -596,7 +594,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
 		      void *buffer, size_t buflen, u32 *result)
 {
 	struct nvme_command c;
-	struct nvme_completion cqe;
+	union nvme_result res;
 	int ret;
 
 	memset(&c, 0, sizeof(c));
@@ -604,10 +602,10 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
 	c.features.nsid = cpu_to_le32(nsid);
 	c.features.fid = cpu_to_le32(fid);
 
-	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, buffer, buflen, 0,
+	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, buffer, buflen, 0,
 			NVME_QID_ANY, 0, 0);
 	if (ret >= 0 && result)
-		*result = le32_to_cpu(cqe.result);
+		*result = le32_to_cpu(res.u32);
 	return ret;
 }
 
@@ -615,7 +613,7 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
 		      void *buffer, size_t buflen, u32 *result)
 {
 	struct nvme_command c;
-	struct nvme_completion cqe;
+	union nvme_result res;
 	int ret;
 
 	memset(&c, 0, sizeof(c));
@@ -623,10 +621,10 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
 	c.features.fid = cpu_to_le32(fid);
 	c.features.dword11 = cpu_to_le32(dword11);
 
-	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe,
+	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
 			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
 	if (ret >= 0 && result)
-		*result = le32_to_cpu(cqe.result);
+		*result = le32_to_cpu(res.u32);
 	return ret;
 }
 
@@ -1901,7 +1899,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl,
 		struct nvme_completion *cqe)
 {
 	u16 status = le16_to_cpu(cqe->status) >> 1;
-	u32 result = le32_to_cpu(cqe->result);
+	u32 result = le32_to_cpu(cqe->result.u32);
 
 	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) {
 		++ctrl->event_limit;
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 5a3f008d348094..68fb26b3bfb9e7 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -161,7 +161,7 @@ EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn);
 int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
 {
 	struct nvme_command cmd;
-	struct nvme_completion cqe;
+	union nvme_result res;
 	int ret;
 
 	memset(&cmd, 0, sizeof(cmd));
@@ -169,11 +169,11 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
 	cmd.prop_get.fctype = nvme_fabrics_type_property_get;
 	cmd.prop_get.offset = cpu_to_le32(off);
 
-	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
 			NVME_QID_ANY, 0, 0);
 
 	if (ret >= 0)
-		*val = le64_to_cpu(cqe.result64);
+		*val = le64_to_cpu(res.u64);
 	if (unlikely(ret != 0))
 		dev_err(ctrl->device,
 			"Property Get error: %d, offset %#x\n",
@@ -207,7 +207,7 @@ EXPORT_SYMBOL_GPL(nvmf_reg_read32);
 int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
 {
 	struct nvme_command cmd;
-	struct nvme_completion cqe;
+	union nvme_result res;
 	int ret;
 
 	memset(&cmd, 0, sizeof(cmd));
@@ -216,11 +216,11 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
 	cmd.prop_get.attrib = 1;
 	cmd.prop_get.offset = cpu_to_le32(off);
 
-	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe, NULL, 0, 0,
+	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
 			NVME_QID_ANY, 0, 0);
 
 	if (ret >= 0)
-		*val = le64_to_cpu(cqe.result64);
+		*val = le64_to_cpu(res.u64);
 	if (unlikely(ret != 0))
 		dev_err(ctrl->device,
 			"Property Get error: %d, offset %#x\n",
@@ -368,7 +368,7 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl,
 int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 {
 	struct nvme_command cmd;
-	struct nvme_completion cqe;
+	union nvme_result res;
 	struct nvmf_connect_data *data;
 	int ret;
 
@@ -400,16 +400,16 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 	strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
 	strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
 
-	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &cqe,
+	ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
 			data, sizeof(*data), 0, NVME_QID_ANY, 1,
 			BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
 	if (ret) {
-		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result),
+		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
 				       &cmd, data);
 		goto out_free_data;
 	}
 
-	ctrl->cntlid = le16_to_cpu(cqe.result16);
+	ctrl->cntlid = le16_to_cpu(res.u16);
 
 out_free_data:
 	kfree(data);
@@ -441,7 +441,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 {
 	struct nvme_command cmd;
 	struct nvmf_connect_data *data;
-	struct nvme_completion cqe;
+	union nvme_result res;
 	int ret;
 
 	memset(&cmd, 0, sizeof(cmd));
@@ -459,11 +459,11 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 	strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
 	strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
 
-	ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &cqe,
+	ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
 			data, sizeof(*data), 0, qid, 1,
 			BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
 	if (ret) {
-		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(cqe.result),
+		nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
 				       &cmd, data);
 	}
 	kfree(data);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index f5e3011e31fcdf..442f67774ea9db 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -146,14 +146,6 @@ struct nvme_nvm_command {
 	};
 };
 
-struct nvme_nvm_completion {
-	__le64	result;		/* Used by LightNVM to return ppa completions */
-	__le16	sq_head;	/* how much of this queue may be reclaimed */
-	__le16	sq_id;		/* submission queue that generated this entry */
-	__u16	command_id;	/* of the command which completed */
-	__le16	status;		/* did the command fail, and if so, why? */
-};
-
 #define NVME_NVM_LP_MLC_PAIRS 886
 struct nvme_nvm_lp_mlc {
 	__le16			num_pairs;
@@ -481,11 +473,8 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
 static void nvme_nvm_end_io(struct request *rq, int error)
 {
 	struct nvm_rq *rqd = rq->end_io_data;
-	struct nvme_nvm_completion *cqe = rq->special;
-
-	if (cqe)
-		rqd->ppa_status = le64_to_cpu(cqe->result);
 
+	rqd->ppa_status = nvme_req(rq)->result.u64;
 	nvm_end_io(rqd, error);
 
 	kfree(rq->cmd);
@@ -500,20 +489,18 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 	struct bio *bio = rqd->bio;
 	struct nvme_nvm_command *cmd;
 
-	rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0);
-	if (IS_ERR(rq))
+	cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
+	if (!cmd)
 		return -ENOMEM;
 
-	cmd = kzalloc(sizeof(struct nvme_nvm_command) +
-				sizeof(struct nvme_nvm_completion), GFP_KERNEL);
-	if (!cmd) {
-		blk_mq_free_request(rq);
+	rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY);
+	if (IS_ERR(rq)) {
+		kfree(cmd);
 		return -ENOMEM;
 	}
+	rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
 
-	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->ioprio = bio_prio(bio);
-
 	if (bio_has_data(bio))
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
 
@@ -522,10 +509,6 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 
 	nvme_nvm_rqtocmd(rq, rqd, ns, cmd);
 
-	rq->cmd = (unsigned char *)cmd;
-	rq->cmd_len = sizeof(struct nvme_nvm_command);
-	rq->special = cmd + 1;
-
 	rq->end_io_data = rqd;
 
 	blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d47f5a5d18c7c8..5e64957a9b96ec 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -79,6 +79,20 @@ enum nvme_quirks {
 	NVME_QUIRK_DELAY_BEFORE_CHK_RDY		= (1 << 3),
 };
 
+/*
+ * Common request structure for NVMe passthrough.  All drivers must have
+ * this structure as the first member of their request-private data.
+ */
+struct nvme_request {
+	struct nvme_command	*cmd;
+	union nvme_result	result;
+};
+
+static inline struct nvme_request *nvme_req(struct request *req)
+{
+	return blk_mq_rq_to_pdu(req);
+}
+
 /* The below value is the specific amount of delay needed before checking
  * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
  * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
@@ -278,7 +292,7 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		void *buf, unsigned bufflen);
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
-		struct nvme_completion *cqe, void *buffer, unsigned bufflen,
+		union nvme_result *result, void *buffer, unsigned bufflen,
 		unsigned timeout, int qid, int at_head, int flags);
 int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
 		void __user *ubuffer, unsigned bufflen, u32 *result,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0955e9d2202035..de8e0505d97978 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -140,6 +140,7 @@ struct nvme_queue {
  * allocated to store the PRP list.
  */
 struct nvme_iod {
+	struct nvme_request req;
 	struct nvme_queue *nvmeq;
 	int aborted;
 	int npages;		/* In the PRP list. 0 means small pool in use */
@@ -707,8 +708,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 		}
 
 		req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
-		if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special)
-			memcpy(req->special, &cqe, sizeof(cqe));
+		nvme_req(req)->result = cqe.result;
 		blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1);
 
 	}
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 5a838817795991..0b8a161cf88105 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -66,6 +66,7 @@ struct nvme_rdma_qe {
 
 struct nvme_rdma_queue;
 struct nvme_rdma_request {
+	struct nvme_request	req;
 	struct ib_mr		*mr;
 	struct nvme_rdma_qe	sqe;
 	struct ib_sge		sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
@@ -1117,13 +1118,10 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
 static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 		struct nvme_completion *cqe, struct ib_wc *wc, int tag)
 {
-	u16 status = le16_to_cpu(cqe->status);
 	struct request *rq;
 	struct nvme_rdma_request *req;
 	int ret = 0;
 
-	status >>= 1;
-
 	rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
 	if (!rq) {
 		dev_err(queue->ctrl->ctrl.device,
@@ -1134,9 +1132,6 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 	}
 	req = blk_mq_rq_to_pdu(rq);
 
-	if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special)
-		memcpy(rq->special, cqe, sizeof(*cqe));
-
 	if (rq->tag == tag)
 		ret = 1;
 
@@ -1144,8 +1139,8 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 	    wc->ex.invalidate_rkey == req->mr->rkey)
 		req->mr->need_inval = false;
 
-	blk_mq_complete_request(rq, status);
-
+	req->req.result = cqe->result;
+	blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
 	return ret;
 }
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 6559d5afa7bfd9..c232552be2d807 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -617,7 +617,7 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
 	if (!subsys) {
 		pr_warn("connect request for invalid subsystem %s!\n",
 			subsysnqn);
-		req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn);
+		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
 		return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
 	}
 
@@ -638,7 +638,7 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
 
 	pr_warn("could not find controller %d for subsys %s / host %s\n",
 		cntlid, subsysnqn, hostnqn);
-	req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid);
+	req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
 	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
 
 out:
@@ -700,7 +700,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	if (!subsys) {
 		pr_warn("connect request for invalid subsystem %s!\n",
 			subsysnqn);
-		req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn);
+		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
 		goto out;
 	}
 
@@ -709,7 +709,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	if (!nvmet_host_allowed(req, subsys, hostnqn)) {
 		pr_info("connect by host %s for subsystem %s not allowed\n",
 			hostnqn, subsysnqn);
-		req->rsp->result = IPO_IATTR_CONNECT_DATA(hostnqn);
+		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
 		up_read(&nvmet_config_sem);
 		goto out_put_subsystem;
 	}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 9a97ae67e65645..f4088198cd0d0a 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -69,7 +69,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
 		}
 	}
 
-	req->rsp->result64 = cpu_to_le64(val);
+	req->rsp->result.u64 = cpu_to_le64(val);
 	nvmet_req_complete(req, status);
 }
 
@@ -125,7 +125,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 	d = kmap(sg_page(req->sg)) + req->sg->offset;
 
 	/* zero out initial completion result, assign values as needed */
-	req->rsp->result = 0;
+	req->rsp->result.u32 = 0;
 
 	if (c->recfmt != 0) {
 		pr_warn("invalid connect version (%d).\n",
@@ -138,7 +138,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 		pr_warn("connect attempt for invalid controller ID %#x\n",
 			d->cntlid);
 		status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
-		req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid);
+		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
 		goto out;
 	}
 
@@ -155,7 +155,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 
 	pr_info("creating controller %d for NQN %s.\n",
 			ctrl->cntlid, ctrl->hostnqn);
-	req->rsp->result16 = cpu_to_le16(ctrl->cntlid);
+	req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid);
 
 out:
 	kunmap(sg_page(req->sg));
@@ -173,7 +173,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
 	d = kmap(sg_page(req->sg)) + req->sg->offset;
 
 	/* zero out initial completion result, assign values as needed */
-	req->rsp->result = 0;
+	req->rsp->result.u32 = 0;
 
 	if (c->recfmt != 0) {
 		pr_warn("invalid connect version (%d).\n",
@@ -191,14 +191,14 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
 	if (unlikely(qid > ctrl->subsys->max_qid)) {
 		pr_warn("invalid queue id (%d)\n", qid);
 		status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
-		req->rsp->result = IPO_IATTR_CONNECT_SQE(qid);
+		req->rsp->result.u32 = IPO_IATTR_CONNECT_SQE(qid);
 		goto out_ctrl_put;
 	}
 
 	status = nvmet_install_queue(ctrl, req);
 	if (status) {
 		/* pass back cntlid that had the issue of installing queue */
-		req->rsp->result16 = cpu_to_le16(ctrl->cntlid);
+		req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid);
 		goto out_ctrl_put;
 	}
 
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index d5df77d686b26a..757e21a31128f2 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -36,6 +36,7 @@
 	(NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS)
 
 struct nvme_loop_iod {
+	struct nvme_request	nvme_req;
 	struct nvme_command	cmd;
 	struct nvme_completion	rsp;
 	struct nvmet_req	req;
@@ -112,10 +113,10 @@ static void nvme_loop_complete_rq(struct request *req)
 	blk_mq_end_request(req, error);
 }
 
-static void nvme_loop_queue_response(struct nvmet_req *nvme_req)
+static void nvme_loop_queue_response(struct nvmet_req *req)
 {
 	struct nvme_loop_iod *iod =
-		container_of(nvme_req, struct nvme_loop_iod, req);
+		container_of(req, struct nvme_loop_iod, req);
 	struct nvme_completion *cqe = &iod->rsp;
 
 	/*
@@ -128,11 +129,10 @@ static void nvme_loop_queue_response(struct nvmet_req *nvme_req)
 			cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
 		nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe);
 	} else {
-		struct request *req = blk_mq_rq_from_pdu(iod);
+		struct request *rq = blk_mq_rq_from_pdu(iod);
 
-		if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special)
-			memcpy(req->special, cqe, sizeof(*cqe));
-		blk_mq_complete_request(req, le16_to_cpu(cqe->status) >> 1);
+		iod->nvme_req.result = cqe->result;
+		blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
 	}
 }
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 76b6eedccaf92c..f9c76441e8c941 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -238,7 +238,7 @@ static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
 
 static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
 {
-	req->rsp->result = cpu_to_le32(result);
+	req->rsp->result.u32 = cpu_to_le32(result);
 }
 
 /*
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 7676557ce357d6..18ce9f7cc88160 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -949,11 +949,11 @@ struct nvme_completion {
 	/*
 	 * Used by Admin and Fabrics commands to return data:
 	 */
-	union {
-		__le16	result16;
-		__le32	result;
-		__le64	result64;
-	};
+	union nvme_result {
+		__le16	u16;
+		__le32	u32;
+		__le64	u64;
+	} result;
 	__le16	sq_head;	/* how much of this queue may be reclaimed */
 	__le16	sq_id;		/* submission queue that generated this entry */
 	__u16	command_id;	/* of the command which completed */

From 7bf58533a0bc257edff883619befe7e5a1e8caca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Nov 2016 07:32:34 -0800
Subject: [PATCH 066/182] nvme: don't pass the full CQE to
 nvme_complete_async_event

We only need the status and result fields, and passing them explicitly
makes life a lot easier for the Fibre Channel transport which doesn't
have a full CQE for the fast path case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c   | 19 +++++++++++++------
 drivers/nvme/host/nvme.h   |  4 ++--
 drivers/nvme/host/pci.c    |  3 ++-
 drivers/nvme/host/rdma.c   |  3 ++-
 drivers/nvme/target/loop.c |  3 ++-
 5 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2fd632bcd97501..53584d21c80522 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1895,18 +1895,25 @@ static void nvme_async_event_work(struct work_struct *work)
 	spin_unlock_irq(&ctrl->lock);
 }
 
-void nvme_complete_async_event(struct nvme_ctrl *ctrl,
-		struct nvme_completion *cqe)
+void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
+		union nvme_result *res)
 {
-	u16 status = le16_to_cpu(cqe->status) >> 1;
-	u32 result = le32_to_cpu(cqe->result.u32);
+	u32 result = le32_to_cpu(res->u32);
+	bool done = true;
 
-	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) {
+	switch (le16_to_cpu(status) >> 1) {
+	case NVME_SC_SUCCESS:
+		done = false;
+		/*FALLTHRU*/
+	case NVME_SC_ABORT_REQ:
 		++ctrl->event_limit;
 		schedule_work(&ctrl->async_event_work);
+		break;
+	default:
+		break;
 	}
 
-	if (status != NVME_SC_SUCCESS)
+	if (done)
 		return;
 
 	switch (result & 0xff07) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 5e64957a9b96ec..468fc445bf3512 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -275,8 +275,8 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl);
 void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
 
 #define NVME_NR_AERS	1
-void nvme_complete_async_event(struct nvme_ctrl *ctrl,
-		struct nvme_completion *cqe);
+void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
+		union nvme_result *res);
 void nvme_queue_async_events(struct nvme_ctrl *ctrl);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index de8e0505d97978..51d13d5ec7a84b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -703,7 +703,8 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 		 */
 		if (unlikely(nvmeq->qid == 0 &&
 				cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
-			nvme_complete_async_event(&nvmeq->dev->ctrl, &cqe);
+			nvme_complete_async_event(&nvmeq->dev->ctrl,
+					cqe.status, &cqe.result);
 			continue;
 		}
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0b8a161cf88105..c4700efc03906d 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1168,7 +1168,8 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
 	 */
 	if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
 			cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
-		nvme_complete_async_event(&queue->ctrl->ctrl, cqe);
+		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
+				&cqe->result);
 	else
 		ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
 	ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 757e21a31128f2..26aa3a5afb0dbf 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -127,7 +127,8 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
 	 */
 	if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 &&
 			cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
-		nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe);
+		nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe->status,
+				&cqe->result);
 	} else {
 		struct request *rq = blk_mq_rq_from_pdu(iod);
 

From ebc4ff661fbe76781c6b16dfb7b754a5d5073f8e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 10 Nov 2016 11:16:37 -0500
Subject: [PATCH 067/182] block: cfq_cpd_alloc() should use @gfp

cfq_cpd_alloc() which is the cpd_alloc_fn implementation for cfq was
incorrectly hard coding GFP_KERNEL instead of using the mask specified
through the @gfp parameter.  This currently doesn't cause any actual
issues because all current callers specify GFP_KERNEL.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Fixes: e4a9bde9589f ("blkcg: replace blkcg_policy->cpd_size with ->cpd_alloc/free_fn() methods")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index dcbed8c9c82cbb..61010511c5a028 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1586,7 +1586,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
 {
 	struct cfq_group_data *cgd;
 
-	cgd = kzalloc(sizeof(*cgd), GFP_KERNEL);
+	cgd = kzalloc(sizeof(*cgd), gfp);
 	if (!cgd)
 		return NULL;
 	return &cgd->cpd;

From cf43e6be865a582ba66ee4747ae27a0513f6bba1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 7 Nov 2016 21:32:37 -0700
Subject: [PATCH 068/182] block: add scalable completion tracking of requests

For legacy block, we simply track them in the request queue. For
blk-mq, we track them on a per-sw queue basis, which we can then
sum up through the hardware queues and finally to a per device
state.

The stats are tracked in, roughly, 0.1s interval windows.

Add sysfs files to display the stats.

The feature is off by default, to avoid any extra overhead. In-kernel
users of it can turn it on by setting QUEUE_FLAG_STATS in the queue
flags. We currently don't turn it on if someone just reads any of
the stats files, that is something we could add as well.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Makefile            |   2 +-
 block/blk-core.c          |  14 ++-
 block/blk-mq-sysfs.c      |  47 ++++++++
 block/blk-mq.c            |  25 ++++
 block/blk-mq.h            |   3 +
 block/blk-stat.c          | 248 ++++++++++++++++++++++++++++++++++++++
 block/blk-stat.h          |  42 +++++++
 block/blk-sysfs.c         |  26 ++++
 include/linux/blk_types.h |  16 +++
 include/linux/blkdev.h    |   7 ++
 10 files changed, 427 insertions(+), 3 deletions(-)
 create mode 100644 block/blk-stat.c
 create mode 100644 block/blk-stat.h

diff --git a/block/Makefile b/block/Makefile
index 934dac73fb3741..2528c596f7ecc3 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-lib.o blk-mq.o blk-mq-tag.o \
+			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
 			badblocks.o partitions/
diff --git a/block/blk-core.c b/block/blk-core.c
index 2deca48a4a0545..216372b01624cd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2464,6 +2464,11 @@ void blk_start_request(struct request *req)
 {
 	blk_dequeue_request(req);
 
+	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
+		blk_stat_set_issue_time(&req->issue_stat);
+		req->rq_flags |= RQF_STATS;
+	}
+
 	/*
 	 * We are now handing the request to the hardware, initialize
 	 * resid_len to full count and add the timeout handler.
@@ -2683,8 +2688,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
  */
 void blk_finish_request(struct request *req, int error)
 {
+	struct request_queue *q = req->q;
+
+	if (req->rq_flags & RQF_STATS)
+		blk_stat_add(&q->rq_stats[rq_data_dir(req)], req);
+
 	if (req->rq_flags & RQF_QUEUED)
-		blk_queue_end_tag(req->q, req);
+		blk_queue_end_tag(q, req);
 
 	BUG_ON(blk_queued_rq(req));
 
@@ -2704,7 +2714,7 @@ void blk_finish_request(struct request *req, int error)
 		if (blk_bidi_rq(req))
 			__blk_put_request(req->next_rq->q, req->next_rq);
 
-		__blk_put_request(req->q, req);
+		__blk_put_request(q, req);
 	}
 }
 EXPORT_SYMBOL(blk_finish_request);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 01fb455d337747..eacd3af7209901 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -259,6 +259,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 	return ret;
 }
 
+static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_ctx *ctx;
+	unsigned int i;
+
+	hctx_for_each_ctx(hctx, ctx, i) {
+		blk_stat_init(&ctx->stat[BLK_STAT_READ]);
+		blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
+	}
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
+					  const char *page, size_t count)
+{
+	blk_mq_stat_clear(hctx);
+	return count;
+}
+
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+			pre, (long long) stat->nr_samples,
+			(long long) stat->mean, (long long) stat->min,
+			(long long) stat->max);
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	struct blk_rq_stat stat[2];
+	ssize_t ret;
+
+	blk_stat_init(&stat[BLK_STAT_READ]);
+	blk_stat_init(&stat[BLK_STAT_WRITE]);
+
+	blk_hctx_stat_get(hctx, stat);
+
+	ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
+	ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
+	return ret;
+}
+
 static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
 	.attr = {.name = "dispatched", .mode = S_IRUGO },
 	.show = blk_mq_sysfs_dispatched_show,
@@ -317,6 +358,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
 	.show = blk_mq_hw_sysfs_poll_show,
 	.store = blk_mq_hw_sysfs_poll_store,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
+	.attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
+	.show = blk_mq_hw_sysfs_stat_show,
+	.store = blk_mq_hw_sysfs_stat_store,
+};
 
 static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_queued.attr,
@@ -327,6 +373,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_cpus.attr,
 	&blk_mq_hw_sysfs_active.attr,
 	&blk_mq_hw_sysfs_poll.attr,
+	&blk_mq_hw_sysfs_stat.attr,
 	NULL,
 };
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6f5cb3f3dcacff..19795886d46e8a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-stat.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -403,10 +404,27 @@ static void blk_mq_ipi_complete_request(struct request *rq)
 	put_cpu();
 }
 
+static void blk_mq_stat_add(struct request *rq)
+{
+	if (rq->rq_flags & RQF_STATS) {
+		/*
+		 * We could rq->mq_ctx here, but there's less of a risk
+		 * of races if we have the completion event add the stats
+		 * to the local software queue.
+		 */
+		struct blk_mq_ctx *ctx;
+
+		ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
+		blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
+	}
+}
+
 static void __blk_mq_complete_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
+	blk_mq_stat_add(rq);
+
 	if (!q->softirq_done_fn)
 		blk_mq_end_request(rq, rq->errors);
 	else
@@ -450,6 +468,11 @@ void blk_mq_start_request(struct request *rq)
 	if (unlikely(blk_bidi_rq(rq)))
 		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 
+	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+		blk_stat_set_issue_time(&rq->issue_stat);
+		rq->rq_flags |= RQF_STATS;
+	}
+
 	blk_add_timer(rq);
 
 	/*
@@ -1784,6 +1807,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		spin_lock_init(&__ctx->lock);
 		INIT_LIST_HEAD(&__ctx->rq_list);
 		__ctx->queue = q;
+		blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
+		blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
 
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
 		if (!cpu_online(i))
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ac772dac7ce8c0..b444370ae05ba0 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
 #ifndef INT_BLK_MQ_H
 #define INT_BLK_MQ_H
 
+#include "blk-stat.h"
+
 struct blk_mq_tag_set;
 
 struct blk_mq_ctx {
@@ -18,6 +20,7 @@ struct blk_mq_ctx {
 
 	/* incremented at completion time */
 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
+	struct blk_rq_stat	stat[2];
 
 	struct request_queue	*queue;
 	struct kobject		kobj;
diff --git a/block/blk-stat.c b/block/blk-stat.c
new file mode 100644
index 00000000000000..688c958367eed5
--- /dev/null
+++ b/block/blk-stat.c
@@ -0,0 +1,248 @@
+/*
+ * Block stat tracking code
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/blk-mq.h>
+
+#include "blk-stat.h"
+#include "blk-mq.h"
+
+static void blk_stat_flush_batch(struct blk_rq_stat *stat)
+{
+	const s32 nr_batch = READ_ONCE(stat->nr_batch);
+	const s32 nr_samples = READ_ONCE(stat->nr_batch);
+
+	if (!nr_batch)
+		return;
+	if (!nr_samples)
+		stat->mean = div64_s64(stat->batch, nr_batch);
+	else {
+		stat->mean = div64_s64((stat->mean * nr_samples) +
+					stat->batch,
+					nr_batch + nr_samples);
+	}
+
+	stat->nr_samples += nr_batch;
+	stat->nr_batch = stat->batch = 0;
+}
+
+static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+{
+	if (!src->nr_samples)
+		return;
+
+	blk_stat_flush_batch(src);
+
+	dst->min = min(dst->min, src->min);
+	dst->max = max(dst->max, src->max);
+
+	if (!dst->nr_samples)
+		dst->mean = src->mean;
+	else {
+		dst->mean = div64_s64((src->mean * src->nr_samples) +
+					(dst->mean * dst->nr_samples),
+					dst->nr_samples + src->nr_samples);
+	}
+	dst->nr_samples += src->nr_samples;
+}
+
+static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	uint64_t latest = 0;
+	int i, j, nr;
+
+	blk_stat_init(&dst[BLK_STAT_READ]);
+	blk_stat_init(&dst[BLK_STAT_WRITE]);
+
+	nr = 0;
+	do {
+		uint64_t newest = 0;
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			hctx_for_each_ctx(hctx, ctx, j) {
+				if (!ctx->stat[BLK_STAT_READ].nr_samples &&
+				    !ctx->stat[BLK_STAT_WRITE].nr_samples)
+					continue;
+				if (ctx->stat[BLK_STAT_READ].time > newest)
+					newest = ctx->stat[BLK_STAT_READ].time;
+				if (ctx->stat[BLK_STAT_WRITE].time > newest)
+					newest = ctx->stat[BLK_STAT_WRITE].time;
+			}
+		}
+
+		/*
+		 * No samples
+		 */
+		if (!newest)
+			break;
+
+		if (newest > latest)
+			latest = newest;
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			hctx_for_each_ctx(hctx, ctx, j) {
+				if (ctx->stat[BLK_STAT_READ].time == newest) {
+					blk_stat_sum(&dst[BLK_STAT_READ],
+						     &ctx->stat[BLK_STAT_READ]);
+					nr++;
+				}
+				if (ctx->stat[BLK_STAT_WRITE].time == newest) {
+					blk_stat_sum(&dst[BLK_STAT_WRITE],
+						     &ctx->stat[BLK_STAT_WRITE]);
+					nr++;
+				}
+			}
+		}
+		/*
+		 * If we race on finding an entry, just loop back again.
+		 * Should be very rare.
+		 */
+	} while (!nr);
+
+	dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest;
+}
+
+void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+	if (q->mq_ops)
+		blk_mq_stat_get(q, dst);
+	else {
+		memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ],
+				sizeof(struct blk_rq_stat));
+		memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE],
+				sizeof(struct blk_rq_stat));
+	}
+}
+
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+{
+	struct blk_mq_ctx *ctx;
+	unsigned int i, nr;
+
+	nr = 0;
+	do {
+		uint64_t newest = 0;
+
+		hctx_for_each_ctx(hctx, ctx, i) {
+			if (!ctx->stat[BLK_STAT_READ].nr_samples &&
+			    !ctx->stat[BLK_STAT_WRITE].nr_samples)
+				continue;
+
+			if (ctx->stat[BLK_STAT_READ].time > newest)
+				newest = ctx->stat[BLK_STAT_READ].time;
+			if (ctx->stat[BLK_STAT_WRITE].time > newest)
+				newest = ctx->stat[BLK_STAT_WRITE].time;
+		}
+
+		if (!newest)
+			break;
+
+		hctx_for_each_ctx(hctx, ctx, i) {
+			if (ctx->stat[BLK_STAT_READ].time == newest) {
+				blk_stat_sum(&dst[BLK_STAT_READ],
+						&ctx->stat[BLK_STAT_READ]);
+				nr++;
+			}
+			if (ctx->stat[BLK_STAT_WRITE].time == newest) {
+				blk_stat_sum(&dst[BLK_STAT_WRITE],
+						&ctx->stat[BLK_STAT_WRITE]);
+				nr++;
+			}
+		}
+		/*
+		 * If we race on finding an entry, just loop back again.
+		 * Should be very rare, as the window is only updated
+		 * occasionally
+		 */
+	} while (!nr);
+}
+
+static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+{
+	stat->min = -1ULL;
+	stat->max = stat->nr_samples = stat->mean = 0;
+	stat->batch = stat->nr_batch = 0;
+	stat->time = time_now & BLK_STAT_NSEC_MASK;
+}
+
+void blk_stat_init(struct blk_rq_stat *stat)
+{
+	__blk_stat_init(stat, ktime_to_ns(ktime_get()));
+}
+
+static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
+{
+	return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+}
+
+bool blk_stat_is_current(struct blk_rq_stat *stat)
+{
+	return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+}
+
+void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+{
+	s64 now, value;
+
+	now = __blk_stat_time(ktime_to_ns(ktime_get()));
+	if (now < blk_stat_time(&rq->issue_stat))
+		return;
+
+	if (!__blk_stat_is_current(stat, now))
+		__blk_stat_init(stat, now);
+
+	value = now - blk_stat_time(&rq->issue_stat);
+	if (value > stat->max)
+		stat->max = value;
+	if (value < stat->min)
+		stat->min = value;
+
+	if (stat->batch + value < stat->batch ||
+	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+		blk_stat_flush_batch(stat);
+
+	stat->batch += value;
+	stat->nr_batch++;
+}
+
+void blk_stat_clear(struct request_queue *q)
+{
+	if (q->mq_ops) {
+		struct blk_mq_hw_ctx *hctx;
+		struct blk_mq_ctx *ctx;
+		int i, j;
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			hctx_for_each_ctx(hctx, ctx, j) {
+				blk_stat_init(&ctx->stat[BLK_STAT_READ]);
+				blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
+			}
+		}
+	} else {
+		blk_stat_init(&q->rq_stats[BLK_STAT_READ]);
+		blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]);
+	}
+}
+
+void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+{
+	stat->time = (stat->time & BLK_STAT_MASK) |
+			(ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+}
+
+/*
+ * Enable stat tracking, return whether it was enabled
+ */
+bool blk_stat_enable(struct request_queue *q)
+{
+	if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+		set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+		return false;
+	}
+
+	return true;
+}
diff --git a/block/blk-stat.h b/block/blk-stat.h
new file mode 100644
index 00000000000000..a2050a0a5314bb
--- /dev/null
+++ b/block/blk-stat.h
@@ -0,0 +1,42 @@
+#ifndef BLK_STAT_H
+#define BLK_STAT_H
+
+/*
+ * ~0.13s window as a power-of-2 (2^27 nsecs)
+ */
+#define BLK_STAT_NSEC		134217728ULL
+#define BLK_STAT_NSEC_MASK	~(BLK_STAT_NSEC - 1)
+
+/*
+ * Upper 3 bits can be used elsewhere
+ */
+#define BLK_STAT_RES_BITS	3
+#define BLK_STAT_SHIFT		(64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_TIME_MASK	((1ULL << BLK_STAT_SHIFT) - 1)
+#define BLK_STAT_MASK		~BLK_STAT_TIME_MASK
+
+enum {
+	BLK_STAT_READ	= 0,
+	BLK_STAT_WRITE,
+};
+
+void blk_stat_add(struct blk_rq_stat *, struct request *);
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
+void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
+void blk_stat_clear(struct request_queue *);
+void blk_stat_init(struct blk_rq_stat *);
+bool blk_stat_is_current(struct blk_rq_stat *);
+void blk_stat_set_issue_time(struct blk_issue_stat *);
+bool blk_stat_enable(struct request_queue *);
+
+static inline u64 __blk_stat_time(u64 time)
+{
+	return time & BLK_STAT_TIME_MASK;
+}
+
+static inline u64 blk_stat_time(struct blk_issue_stat *stat)
+{
+	return __blk_stat_time(stat->time);
+}
+
+#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 488c2e28feb823..9cdb7247727a55 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -401,6 +401,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
 	return queue_var_show(blk_queue_dax(q), page);
 }
 
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+			pre, (long long) stat->nr_samples,
+			(long long) stat->mean, (long long) stat->min,
+			(long long) stat->max);
+}
+
+static ssize_t queue_stats_show(struct request_queue *q, char *page)
+{
+	struct blk_rq_stat stat[2];
+	ssize_t ret;
+
+	blk_queue_stat_get(q, stat);
+
+	ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
+	ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
+	return ret;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -553,6 +573,11 @@ static struct queue_sysfs_entry queue_dax_entry = {
 	.show = queue_dax_show,
 };
 
+static struct queue_sysfs_entry queue_stats_entry = {
+	.attr = {.name = "stats", .mode = S_IRUGO },
+	.show = queue_stats_show,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -582,6 +607,7 @@ static struct attribute *default_attrs[] = {
 	&queue_poll_entry.attr,
 	&queue_wc_entry.attr,
 	&queue_dax_entry.attr,
+	&queue_stats_entry.attr,
 	NULL,
 };
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 562ac46cb79023..4d0044d0998410 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -250,4 +250,20 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
 	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
 }
 
+struct blk_issue_stat {
+	u64 time;
+};
+
+#define BLK_RQ_STAT_BATCH	64
+
+struct blk_rq_stat {
+	s64 mean;
+	u64 min;
+	u64 max;
+	s32 nr_samples;
+	s32 nr_batch;
+	u64 batch;
+	s64 time;
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d364be6e695983..303723a2e5b8aa 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -117,6 +117,8 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_PM			((__force req_flags_t)(1 << 15))
 /* on IO scheduler merge hash */
 #define RQF_HASHED		((__force req_flags_t)(1 << 16))
+/* IO stats tracking on */
+#define RQF_STATS		((__force req_flags_t)(1 << 17))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -197,6 +199,7 @@ struct request {
 	struct gendisk *rq_disk;
 	struct hd_struct *part;
 	unsigned long start_time;
+	struct blk_issue_stat issue_stat;
 #ifdef CONFIG_BLK_CGROUP
 	struct request_list *rl;		/* rl this rq is alloced from */
 	unsigned long long start_time_ns;
@@ -492,6 +495,9 @@ struct request_queue {
 
 	unsigned int		nr_sorted;
 	unsigned int		in_flight[2];
+
+	struct blk_rq_stat	rq_stats[2];
+
 	/*
 	 * Number of active block driver functions for which blk_drain_queue()
 	 * must wait. Must be incremented around functions that unlock the
@@ -585,6 +591,7 @@ struct request_queue {
 #define QUEUE_FLAG_FUA	       24	/* device supports FUA writes */
 #define QUEUE_FLAG_FLUSH_NQ    25	/* flush not queueuable */
 #define QUEUE_FLAG_DAX         26	/* device supports DAX */
+#define QUEUE_FLAG_STATS       27	/* track rq completion times */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\

From e34cbd307477ae07c5d8a8d0bd15e65a9ddaba5c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 9 Nov 2016 12:36:15 -0700
Subject: [PATCH 069/182] blk-wbt: add general throttling mechanism

We can hook this up to the block layer, to help throttle buffered
writes.

wbt registers a few trace points that can be used to track what is
happening in the system:

wbt_lat: 259:0: latency 2446318
wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1,
               wmean=518866, wmin=15522, wmax=5330353, wsamples=57
wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, max=32

This shows a sync issue event (wbt_lat) that exceeded it's time. wbt_stat
dumps the current read/write stats for that window, and wbt_step shows a
step down event where we now scale back writes. Each trace includes the
device, 259:0 in this case.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Makefile             |   1 +
 block/blk-wbt.c            | 735 +++++++++++++++++++++++++++++++++++++
 block/blk-wbt.h            | 165 +++++++++
 include/trace/events/wbt.h | 153 ++++++++
 4 files changed, 1054 insertions(+)
 create mode 100644 block/blk-wbt.c
 create mode 100644 block/blk-wbt.h
 create mode 100644 include/trace/events/wbt.h

diff --git a/block/Makefile b/block/Makefile
index 2528c596f7ecc3..a827f988c4e674 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
 obj-$(CONFIG_BLK_MQ_PCI)	+= blk-mq-pci.o
 obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
+obj-$(CONFIG_BLK_WBT)		+= blk-wbt.o
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
new file mode 100644
index 00000000000000..889c17ff85031a
--- /dev/null
+++ b/block/blk-wbt.c
@@ -0,0 +1,735 @@
+/*
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
+ * packets for IO scheduling, so the logic is something like this:
+ *
+ * - Monitor latencies in a defined window of time.
+ * - If the minimum latency in the above window exceeds some target, increment
+ *   scaling step and scale down queue depth by a factor of 2x. The monitoring
+ *   window is then shrunk to 100 / sqrt(scaling step + 1).
+ * - For any window where we don't have solid data on what the latencies
+ *   look like, retain status quo.
+ * - If latencies look good, decrement scaling step.
+ * - If we're only doing writes, allow the scaling step to go negative. This
+ *   will temporarily boost write performance, snapping back to a stable
+ *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
+ *   positive scaling steps where we shrink the monitoring window, a negative
+ *   scaling step retains the default step==0 window size.
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/swap.h>
+
+#include "blk-wbt.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/wbt.h>
+
+enum {
+	/*
+	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
+	 * from here depending on device stats
+	 */
+	RWB_DEF_DEPTH	= 16,
+
+	/*
+	 * 100msec window
+	 */
+	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,
+
+	/*
+	 * Disregard stats, if we don't meet this minimum
+	 */
+	RWB_MIN_WRITE_SAMPLES	= 3,
+
+	/*
+	 * If we have this number of consecutive windows with not enough
+	 * information to scale up or down, scale up.
+	 */
+	RWB_UNKNOWN_BUMP	= 5,
+};
+
+static inline bool rwb_enabled(struct rq_wb *rwb)
+{
+	return rwb && rwb->wb_normal != 0;
+}
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+	int cur = atomic_read(v);
+
+	for (;;) {
+		int old;
+
+		if (cur >= below)
+			return false;
+		old = atomic_cmpxchg(v, cur, cur + 1);
+		if (old == cur)
+			break;
+		cur = old;
+	}
+
+	return true;
+}
+
+static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
+{
+	if (rwb_enabled(rwb)) {
+		const unsigned long cur = jiffies;
+
+		if (cur != *var)
+			*var = cur;
+	}
+}
+
+/*
+ * If a task was rate throttled in balance_dirty_pages() within the last
+ * second or so, use that to indicate a higher cleaning rate.
+ */
+static bool wb_recent_wait(struct rq_wb *rwb)
+{
+	struct bdi_writeback *wb = &rwb->bdi->wb;
+
+	return time_before(jiffies, wb->dirty_sleep + HZ);
+}
+
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+{
+	return &rwb->rq_wait[is_kswapd];
+}
+
+static void rwb_wake_all(struct rq_wb *rwb)
+{
+	int i;
+
+	for (i = 0; i < WBT_NUM_RWQ; i++) {
+		struct rq_wait *rqw = &rwb->rq_wait[i];
+
+		if (waitqueue_active(&rqw->wait))
+			wake_up_all(&rqw->wait);
+	}
+}
+
+void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+{
+	struct rq_wait *rqw;
+	int inflight, limit;
+
+	if (!(wb_acct & WBT_TRACKED))
+		return;
+
+	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+	inflight = atomic_dec_return(&rqw->inflight);
+
+	/*
+	 * wbt got disabled with IO in flight. Wake up any potential
+	 * waiters, we don't have to do more than that.
+	 */
+	if (unlikely(!rwb_enabled(rwb))) {
+		rwb_wake_all(rwb);
+		return;
+	}
+
+	/*
+	 * If the device does write back caching, drop further down
+	 * before we wake people up.
+	 */
+	if (rwb->wc && !wb_recent_wait(rwb))
+		limit = 0;
+	else
+		limit = rwb->wb_normal;
+
+	/*
+	 * Don't wake anyone up if we are above the normal limit.
+	 */
+	if (inflight && inflight >= limit)
+		return;
+
+	if (waitqueue_active(&rqw->wait)) {
+		int diff = limit - inflight;
+
+		if (!inflight || diff >= rwb->wb_background / 2)
+			wake_up_all(&rqw->wait);
+	}
+}
+
+/*
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged, when the request gets freed.
+ */
+void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+	if (!rwb)
+		return;
+
+	if (!wbt_is_tracked(stat)) {
+		if (rwb->sync_cookie == stat) {
+			rwb->sync_issue = 0;
+			rwb->sync_cookie = NULL;
+		}
+
+		if (wbt_is_read(stat))
+			wb_timestamp(rwb, &rwb->last_comp);
+		wbt_clear_state(stat);
+	} else {
+		WARN_ON_ONCE(stat == rwb->sync_cookie);
+		__wbt_done(rwb, wbt_stat_to_mask(stat));
+		wbt_clear_state(stat);
+	}
+}
+
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+static bool calc_wb_limits(struct rq_wb *rwb)
+{
+	unsigned int depth;
+	bool ret = false;
+
+	if (!rwb->min_lat_nsec) {
+		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
+		return false;
+	}
+
+	/*
+	 * For QD=1 devices, this is a special case. It's important for those
+	 * to have one request ready when one completes, so force a depth of
+	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+	 * since the device can't have more than that in flight. If we're
+	 * scaling down, then keep a setting of 1/1/1.
+	 */
+	if (rwb->queue_depth == 1) {
+		if (rwb->scale_step > 0)
+			rwb->wb_max = rwb->wb_normal = 1;
+		else {
+			rwb->wb_max = rwb->wb_normal = 2;
+			ret = true;
+		}
+		rwb->wb_background = 1;
+	} else {
+		/*
+		 * scale_step == 0 is our default state. If we have suffered
+		 * latency spikes, step will be > 0, and we shrink the
+		 * allowed write depths. If step is < 0, we're only doing
+		 * writes, and we allow a temporarily higher depth to
+		 * increase performance.
+		 */
+		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
+		if (rwb->scale_step > 0)
+			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
+		else if (rwb->scale_step < 0) {
+			unsigned int maxd = 3 * rwb->queue_depth / 4;
+
+			depth = 1 + ((depth - 1) << -rwb->scale_step);
+			if (depth > maxd) {
+				depth = maxd;
+				ret = true;
+			}
+		}
+
+		/*
+		 * Set our max/normal/bg queue depths based on how far
+		 * we have scaled down (->scale_step).
+		 */
+		rwb->wb_max = depth;
+		rwb->wb_normal = (rwb->wb_max + 1) / 2;
+		rwb->wb_background = (rwb->wb_max + 3) / 4;
+	}
+
+	return ret;
+}
+
+static bool inline stat_sample_valid(struct blk_rq_stat *stat)
+{
+	/*
+	 * We need at least one read sample, and a minimum of
+	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
+	 * that it's writes impacting us, and not just some sole read on
+	 * a device that is in a lower power state.
+	 */
+	return stat[0].nr_samples >= 1 &&
+		stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+}
+
+static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
+{
+	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+
+	if (!issue || !rwb->sync_cookie)
+		return 0;
+
+	now = ktime_to_ns(ktime_get());
+	return now - issue;
+}
+
+enum {
+	LAT_OK = 1,
+	LAT_UNKNOWN,
+	LAT_UNKNOWN_WRITES,
+	LAT_EXCEEDED,
+};
+
+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+{
+	u64 thislat;
+
+	/*
+	 * If our stored sync issue exceeds the window size, or it
+	 * exceeds our min target AND we haven't logged any entries,
+	 * flag the latency as exceeded. wbt works off completion latencies,
+	 * but for a flooded device, a single sync IO can take a long time
+	 * to complete after being issued. If this time exceeds our
+	 * monitoring window AND we didn't see any other completions in that
+	 * window, then count that sync IO as a violation of the latency.
+	 */
+	thislat = rwb_sync_issue_lat(rwb);
+	if (thislat > rwb->cur_win_nsec ||
+	    (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
+		trace_wbt_lat(rwb->bdi, thislat);
+		return LAT_EXCEEDED;
+	}
+
+	/*
+	 * No read/write mix, if stat isn't valid
+	 */
+	if (!stat_sample_valid(stat)) {
+		/*
+		 * If we had writes in this stat window and the window is
+		 * current, we're only doing writes. If a task recently
+		 * waited or still has writes in flights, consider us doing
+		 * just writes as well.
+		 */
+		if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
+		    wb_recent_wait(rwb) || wbt_inflight(rwb))
+			return LAT_UNKNOWN_WRITES;
+		return LAT_UNKNOWN;
+	}
+
+	/*
+	 * If the 'min' latency exceeds our target, step down.
+	 */
+	if (stat[0].min > rwb->min_lat_nsec) {
+		trace_wbt_lat(rwb->bdi, stat[0].min);
+		trace_wbt_stat(rwb->bdi, stat);
+		return LAT_EXCEEDED;
+	}
+
+	if (rwb->scale_step)
+		trace_wbt_stat(rwb->bdi, stat);
+
+	return LAT_OK;
+}
+
+static int latency_exceeded(struct rq_wb *rwb)
+{
+	struct blk_rq_stat stat[2];
+
+	rwb->stat_ops->get(rwb->ops_data, stat);
+	return __latency_exceeded(rwb, stat);
+}
+
+static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
+{
+	trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
+			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+}
+
+static void scale_up(struct rq_wb *rwb)
+{
+	/*
+	 * Hit max in previous round, stop here
+	 */
+	if (rwb->scaled_max)
+		return;
+
+	rwb->scale_step--;
+	rwb->unknown_cnt = 0;
+	rwb->stat_ops->clear(rwb->ops_data);
+
+	rwb->scaled_max = calc_wb_limits(rwb);
+
+	rwb_wake_all(rwb);
+
+	rwb_trace_step(rwb, "step up");
+}
+
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
+{
+	/*
+	 * Stop scaling down when we've hit the limit. This also prevents
+	 * ->scale_step from going to crazy values, if the device can't
+	 * keep up.
+	 */
+	if (rwb->wb_max == 1)
+		return;
+
+	if (rwb->scale_step < 0 && hard_throttle)
+		rwb->scale_step = 0;
+	else
+		rwb->scale_step++;
+
+	rwb->scaled_max = false;
+	rwb->unknown_cnt = 0;
+	rwb->stat_ops->clear(rwb->ops_data);
+	calc_wb_limits(rwb);
+	rwb_trace_step(rwb, "step down");
+}
+
+static void rwb_arm_timer(struct rq_wb *rwb)
+{
+	unsigned long expires;
+
+	if (rwb->scale_step > 0) {
+		/*
+		 * We should speed this up, using some variant of a fast
+		 * integer inverse square root calculation. Since we only do
+		 * this for every window expiration, it's not a huge deal,
+		 * though.
+		 */
+		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+					int_sqrt((rwb->scale_step + 1) << 8));
+	} else {
+		/*
+		 * For step < 0, we don't want to increase/decrease the
+		 * window size.
+		 */
+		rwb->cur_win_nsec = rwb->win_nsec;
+	}
+
+	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
+	mod_timer(&rwb->window_timer, expires);
+}
+
+static void wb_timer_fn(unsigned long data)
+{
+	struct rq_wb *rwb = (struct rq_wb *) data;
+	unsigned int inflight = wbt_inflight(rwb);
+	int status;
+
+	status = latency_exceeded(rwb);
+
+	trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
+
+	/*
+	 * If we exceeded the latency target, step down. If we did not,
+	 * step one level up. If we don't know enough to say either exceeded
+	 * or ok, then don't do anything.
+	 */
+	switch (status) {
+	case LAT_EXCEEDED:
+		scale_down(rwb, true);
+		break;
+	case LAT_OK:
+		scale_up(rwb);
+		break;
+	case LAT_UNKNOWN_WRITES:
+		/*
+		 * We started a the center step, but don't have a valid
+		 * read/write sample, but we do have writes going on.
+		 * Allow step to go negative, to increase write perf.
+		 */
+		scale_up(rwb);
+		break;
+	case LAT_UNKNOWN:
+		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
+			break;
+		/*
+		 * We get here when previously scaled reduced depth, and we
+		 * currently don't have a valid read/write sample. For that
+		 * case, slowly return to center state (step == 0).
+		 */
+		if (rwb->scale_step > 0)
+			scale_up(rwb);
+		else if (rwb->scale_step < 0)
+			scale_down(rwb, false);
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Re-arm timer, if we have IO in flight
+	 */
+	if (rwb->scale_step || inflight)
+		rwb_arm_timer(rwb);
+}
+
+void wbt_update_limits(struct rq_wb *rwb)
+{
+	rwb->scale_step = 0;
+	rwb->scaled_max = false;
+	calc_wb_limits(rwb);
+
+	rwb_wake_all(rwb);
+}
+
+static bool close_io(struct rq_wb *rwb)
+{
+	const unsigned long now = jiffies;
+
+	return time_before(now, rwb->last_issue + HZ / 10) ||
+		time_before(now, rwb->last_comp + HZ / 10);
+}
+
+#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)
+
+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+{
+	unsigned int limit;
+
+	/*
+	 * At this point we know it's a buffered write. If this is
+	 * kswapd trying to free memory, or REQ_SYNC is set, set, then
+	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
+	 * that. If the write is marked as a background write, then use
+	 * the idle limit, or go to normal if we haven't had competing
+	 * IO for a bit.
+	 */
+	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+		limit = rwb->wb_max;
+	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
+		/*
+		 * If less than 100ms since we completed unrelated IO,
+		 * limit us to half the depth for background writeback.
+		 */
+		limit = rwb->wb_background;
+	} else
+		limit = rwb->wb_normal;
+
+	return limit;
+}
+
+static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
+			     wait_queue_t *wait, unsigned long rw)
+{
+	/*
+	 * inc it here even if disabled, since we'll dec it at completion.
+	 * this only happens if the task was sleeping in __wbt_wait(),
+	 * and someone turned it off at the same time.
+	 */
+	if (!rwb_enabled(rwb)) {
+		atomic_inc(&rqw->inflight);
+		return true;
+	}
+
+	/*
+	 * If the waitqueue is already active and we are not the next
+	 * in line to be woken up, wait for our turn.
+	 */
+	if (waitqueue_active(&rqw->wait) &&
+	    rqw->wait.task_list.next != &wait->task_list)
+		return false;
+
+	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+{
+	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+	DEFINE_WAIT(wait);
+
+	if (may_queue(rwb, rqw, &wait, rw))
+		return;
+
+	do {
+		prepare_to_wait_exclusive(&rqw->wait, &wait,
+						TASK_UNINTERRUPTIBLE);
+
+		if (may_queue(rwb, rqw, &wait, rw))
+			break;
+
+		if (lock)
+			spin_unlock_irq(lock);
+
+		io_schedule();
+
+		if (lock)
+			spin_lock_irq(lock);
+	} while (1);
+
+	finish_wait(&rqw->wait, &wait);
+}
+
+static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
+{
+	const int op = bio_op(bio);
+
+	/*
+	 * If not a WRITE (or a discard), do nothing
+	 */
+	if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
+		return false;
+
+	/*
+	 * Don't throttle WRITE_ODIRECT
+	 */
+	if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
+		return false;
+
+	return true;
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep, if we have exceeded the writeback limits. Caller can pass
+ * in an irq held spinlock, if it holds one when calling this function.
+ * If we do sleep, we'll release and re-grab it.
+ */
+unsigned int wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
+{
+	unsigned int ret = 0;
+
+	if (!rwb_enabled(rwb))
+		return 0;
+
+	if (bio_op(bio) == REQ_OP_READ)
+		ret = WBT_READ;
+
+	if (!wbt_should_throttle(rwb, bio)) {
+		if (ret & WBT_READ)
+			wb_timestamp(rwb, &rwb->last_issue);
+		return ret;
+	}
+
+	__wbt_wait(rwb, bio->bi_opf, lock);
+
+	if (!timer_pending(&rwb->window_timer))
+		rwb_arm_timer(rwb);
+
+	if (current_is_kswapd())
+		ret |= WBT_KSWAPD;
+
+	return ret | WBT_TRACKED;
+}
+
+void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+	if (!rwb_enabled(rwb))
+		return;
+
+	/*
+	 * Track sync issue, in case it takes a long time to complete. Allows
+	 * us to react quicker, if a sync IO takes a long time to complete.
+	 * Note that this is just a hint. 'stat' can go away when the
+	 * request completes, so it's important we never dereference it. We
+	 * only use the address to compare with, which is why we store the
+	 * sync_issue time locally.
+	 */
+	if (wbt_is_read(stat) && !rwb->sync_issue) {
+		rwb->sync_cookie = stat;
+		rwb->sync_issue = blk_stat_time(stat);
+	}
+}
+
+void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+	if (!rwb_enabled(rwb))
+		return;
+	if (stat == rwb->sync_cookie) {
+		rwb->sync_issue = 0;
+		rwb->sync_cookie = NULL;
+	}
+}
+
+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+	if (rwb) {
+		rwb->queue_depth = depth;
+		wbt_update_limits(rwb);
+	}
+}
+
+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+{
+	if (rwb)
+		rwb->wc = write_cache_on;
+}
+
+void wbt_disable(struct rq_wb *rwb)
+{
+	if (rwb) {
+		del_timer_sync(&rwb->window_timer);
+		rwb->win_nsec = rwb->min_lat_nsec = 0;
+		wbt_update_limits(rwb);
+	}
+}
+EXPORT_SYMBOL_GPL(wbt_disable);
+
+int wbt_init(struct request_queue *q, struct wb_stat_ops *ops)
+{
+	struct rq_wb *rwb;
+	int i;
+
+	/*
+	 * For now, we depend on the stats window being larger than
+	 * our monitoring window. Ensure that this isn't inadvertently
+	 * violated.
+	 */
+	BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
+	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
+
+	if (!ops->get || !ops->is_current || !ops->clear)
+		return -EINVAL;
+
+	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+	if (!rwb)
+		return -ENOMEM;
+
+	for (i = 0; i < WBT_NUM_RWQ; i++) {
+		atomic_set(&rwb->rq_wait[i].inflight, 0);
+		init_waitqueue_head(&rwb->rq_wait[i].wait);
+	}
+
+	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
+	rwb->wc = 1;
+	rwb->queue_depth = RWB_DEF_DEPTH;
+	rwb->last_comp = rwb->last_issue = jiffies;
+	rwb->bdi = &q->backing_dev_info;
+	rwb->win_nsec = RWB_WINDOW_NSEC;
+	rwb->stat_ops = ops;
+	rwb->ops_data = q;
+	wbt_update_limits(rwb);
+
+	/*
+	 * Assign rwb, and turn on stats tracking for this queue
+	 */
+	q->rq_wb = rwb;
+	blk_stat_enable(q);
+
+	if (blk_queue_nonrot(q))
+		rwb->min_lat_nsec = 2000000ULL;
+	else
+		rwb->min_lat_nsec = 75000000ULL;
+
+	wbt_set_queue_depth(rwb, blk_queue_depth(q));
+	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+
+	return 0;
+}
+
+void wbt_exit(struct request_queue *q)
+{
+	struct rq_wb *rwb = q->rq_wb;
+
+	if (rwb) {
+		del_timer_sync(&rwb->window_timer);
+		q->rq_wb = NULL;
+		kfree(rwb);
+	}
+}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
new file mode 100644
index 00000000000000..e57700337c2656
--- /dev/null
+++ b/block/blk-wbt.h
@@ -0,0 +1,165 @@
+#ifndef WB_THROTTLE_H
+#define WB_THROTTLE_H
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
+#include <linux/ktime.h>
+
+#include "blk-stat.h"
+
+enum wbt_flags {
+	WBT_TRACKED		= 1,	/* write, tracked for throttling */
+	WBT_READ		= 2,	/* read */
+	WBT_KSWAPD		= 4,	/* write, from kswapd */
+
+	WBT_NR_BITS		= 3,	/* number of bits */
+};
+
+enum {
+	WBT_NUM_RWQ		= 2,
+};
+
+static inline void wbt_clear_state(struct blk_issue_stat *stat)
+{
+	stat->time &= BLK_STAT_TIME_MASK;
+}
+
+static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
+{
+	return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
+}
+
+static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
+{
+	stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
+}
+
+static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
+{
+	return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
+}
+
+static inline bool wbt_is_read(struct blk_issue_stat *stat)
+{
+	return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
+}
+
+struct wb_stat_ops {
+	void (*get)(void *, struct blk_rq_stat *);
+	bool (*is_current)(struct blk_rq_stat *);
+	void (*clear)(void *);
+};
+
+struct rq_wait {
+	wait_queue_head_t wait;
+	atomic_t inflight;
+};
+
+struct rq_wb {
+	/*
+	 * Settings that govern how we throttle
+	 */
+	unsigned int wb_background;		/* background writeback */
+	unsigned int wb_normal;			/* normal writeback */
+	unsigned int wb_max;			/* max throughput writeback */
+	int scale_step;
+	bool scaled_max;
+
+	/*
+	 * Number of consecutive periods where we don't have enough
+	 * information to make a firm scale up/down decision.
+	 */
+	unsigned int unknown_cnt;
+
+	u64 win_nsec;				/* default window size */
+	u64 cur_win_nsec;			/* current window size */
+
+	struct timer_list window_timer;
+
+	s64 sync_issue;
+	void *sync_cookie;
+
+	unsigned int wc;
+	unsigned int queue_depth;
+
+	unsigned long last_issue;		/* last non-throttled issue */
+	unsigned long last_comp;		/* last non-throttled comp */
+	unsigned long min_lat_nsec;
+	struct backing_dev_info *bdi;
+	struct rq_wait rq_wait[WBT_NUM_RWQ];
+
+	struct wb_stat_ops *stat_ops;
+	void *ops_data;
+};
+
+static inline unsigned int wbt_inflight(struct rq_wb *rwb)
+{
+	unsigned int i, ret = 0;
+
+	for (i = 0; i < WBT_NUM_RWQ; i++)
+		ret += atomic_read(&rwb->rq_wait[i].inflight);
+
+	return ret;
+}
+
+struct backing_dev_info;
+
+#ifdef CONFIG_BLK_WBT
+
+void __wbt_done(struct rq_wb *, enum wbt_flags);
+void wbt_done(struct rq_wb *, struct blk_issue_stat *);
+enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
+int wbt_init(struct request_queue *, struct wb_stat_ops *);
+void wbt_exit(struct request_queue *);
+void wbt_update_limits(struct rq_wb *);
+void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_disable(struct rq_wb *);
+
+void wbt_set_queue_depth(struct rq_wb *, unsigned int);
+void wbt_set_write_cache(struct rq_wb *, bool);
+
+#else
+
+static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
+{
+}
+static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
+				      spinlock_t *lock)
+{
+	return 0;
+}
+static inline int wbt_init(struct request_queue *q, struct wb_stat_ops *ops)
+{
+	return -EINVAL;
+}
+static inline void wbt_exit(struct request_queue *q)
+{
+}
+static inline void wbt_update_limits(struct rq_wb *rwb)
+{
+}
+static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+{
+}
+static inline void wbt_disable(struct rq_wb *rwb)
+{
+}
+static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+}
+static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
+{
+}
+
+#endif /* CONFIG_BLK_WBT */
+
+#endif
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
new file mode 100644
index 00000000000000..3c518e455680ab
--- /dev/null
+++ b/include/trace/events/wbt.h
@@ -0,0 +1,153 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM wbt
+
+#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WBT_H
+
+#include <linux/tracepoint.h>
+#include "../../../block/blk-wbt.h"
+
+/**
+ * wbt_stat - trace stats for blk_wb
+ * @stat: array of read/write stats
+ */
+TRACE_EVENT(wbt_stat,
+
+	TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
+
+	TP_ARGS(bdi, stat),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(s64, rmean)
+		__field(u64, rmin)
+		__field(u64, rmax)
+		__field(s64, rnr_samples)
+		__field(s64, rtime)
+		__field(s64, wmean)
+		__field(u64, wmin)
+		__field(u64, wmax)
+		__field(s64, wnr_samples)
+		__field(s64, wtime)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->rmean		= stat[0].mean;
+		__entry->rmin		= stat[0].min;
+		__entry->rmax		= stat[0].max;
+		__entry->rnr_samples	= stat[0].nr_samples;
+		__entry->wmean		= stat[1].mean;
+		__entry->wmin		= stat[1].min;
+		__entry->wmax		= stat[1].max;
+		__entry->wnr_samples	= stat[1].nr_samples;
+	),
+
+	TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
+		  "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
+		  __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
+		  __entry->rnr_samples, __entry->wmean, __entry->wmin,
+		  __entry->wmax, __entry->wnr_samples)
+);
+
+/**
+ * wbt_lat - trace latency event
+ * @lat: latency trigger
+ */
+TRACE_EVENT(wbt_lat,
+
+	TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
+
+	TP_ARGS(bdi, lat),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(unsigned long, lat)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->lat = div_u64(lat, 1000);
+	),
+
+	TP_printk("%s: latency %lluus\n", __entry->name,
+			(unsigned long long) __entry->lat)
+);
+
+/**
+ * wbt_step - trace wb event step
+ * @msg: context message
+ * @step: the current scale step count
+ * @window: the current monitoring window
+ * @bg: the current background queue limit
+ * @normal: the current normal writeback limit
+ * @max: the current max throughput writeback limit
+ */
+TRACE_EVENT(wbt_step,
+
+	TP_PROTO(struct backing_dev_info *bdi, const char *msg,
+		 int step, unsigned long window, unsigned int bg,
+		 unsigned int normal, unsigned int max),
+
+	TP_ARGS(bdi, msg, step, window, bg, normal, max),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(const char *, msg)
+		__field(int, step)
+		__field(unsigned long, window)
+		__field(unsigned int, bg)
+		__field(unsigned int, normal)
+		__field(unsigned int, max)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->msg	= msg;
+		__entry->step	= step;
+		__entry->window	= div_u64(window, 1000);
+		__entry->bg	= bg;
+		__entry->normal	= normal;
+		__entry->max	= max;
+	),
+
+	TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n",
+		  __entry->name, __entry->msg, __entry->step, __entry->window,
+		  __entry->bg, __entry->normal, __entry->max)
+);
+
+/**
+ * wbt_timer - trace wb timer event
+ * @status: timer state status
+ * @step: the current scale step count
+ * @inflight: tracked writes inflight
+ */
+TRACE_EVENT(wbt_timer,
+
+	TP_PROTO(struct backing_dev_info *bdi, unsigned int status,
+		 int step, unsigned int inflight),
+
+	TP_ARGS(bdi, status, step, inflight),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(unsigned int, status)
+		__field(int, step)
+		__field(unsigned int, inflight)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->status		= status;
+		__entry->step		= step;
+		__entry->inflight	= inflight;
+	),
+
+	TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name,
+		  __entry->status, __entry->step, __entry->inflight)
+);
+
+#endif /* _TRACE_WBT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

From 87760e5eef359788047d6fd54fc12eec74ce0d27 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 9 Nov 2016 12:38:14 -0700
Subject: [PATCH 070/182] block: hook up writeback throttling

Enable throttling of buffered writeback to make it a lot
more smooth, and has way less impact on other system activity.
Background writeback should be, by definition, background
activity. The fact that we flush huge bundles of it at the time
means that it potentially has heavy impacts on foreground workloads,
which isn't ideal. We can't easily limit the sizes of writes that
we do, since that would impact file system layout in the presence
of delayed allocation. So just throttle back buffered writeback,
unless someone is waiting for it.

The algorithm for when to throttle takes its inspiration in the
CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors
the minimum latencies of requests over a window of time. In that
window of time, if the minimum latency of any request exceeds a
given target, then a scale count is incremented and the queue depth
is shrunk. The next monitoring window is shrunk accordingly. Unlike
CoDel, if we hit a window that exhibits good behavior, then we
simply increment the scale count and re-calculate the limits for that
scale value. This prevents us from oscillating between a
close-to-ideal value and max all the time, instead remaining in the
windows where we get good behavior.

Unlike CoDel, blk-wb allows the scale count to to negative. This
happens if we primarily have writes going on. Unlike positive
scale counts, this doesn't change the size of the monitoring window.
When the heavy writers finish, blk-bw quickly snaps back to it's
stable state of a zero scale count.

The patch registers a sysfs entry, 'wb_lat_usec'. This sets the latency
target to me met. It defaults to 2 msec for non-rotational storage, and
75 msec for rotational storage. Setting this value to '0' disables
blk-wb. Generally, a user would not have to touch this setting.

We don't enable WBT on devices that are managed with CFQ, and have
a non-root block cgroup attached. If we have a proportional share setup
on this particular disk, then the wbt throttling will interfere with
that. We don't have a strong need for wbt for that case, since we will
rely on CFQ doing that for us.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/queue-sysfs.txt |  7 +++
 block/Kconfig                       | 26 +++++++++
 block/blk-core.c                    | 17 +++++-
 block/blk-mq.c                      | 26 ++++++++-
 block/blk-settings.c                |  4 ++
 block/blk-sysfs.c                   | 88 +++++++++++++++++++++++++++++
 block/cfq-iosched.c                 | 14 +++++
 include/linux/blkdev.h              |  3 +
 8 files changed, 181 insertions(+), 4 deletions(-)

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 2a3904030dea5d..87abf1ac29391e 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -169,5 +169,12 @@ This is the number of bytes the device can write in a single write-same
 command.  A value of '0' means write-same is not supported by this
 device.
 
+wb_lat_usec (RW)
+----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes.
+
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/block/Kconfig b/block/Kconfig
index 3a024440a669e1..8bf114a3858acd 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -121,6 +121,32 @@ config BLK_CMDLINE_PARSER
 
 	See Documentation/block/cmdline-partition.txt for more information.
 
+config BLK_WBT
+	bool "Enable support for block device writeback throttling"
+	default n
+	---help---
+	Enabling this option enables the block layer to throttle buffered
+	background writeback from the VM, making it more smooth and having
+	less impact on foreground operations. The throttling is done
+	dynamically on an algorithm loosely based on CoDel, factoring in
+	the realtime performance of the disk.
+
+config BLK_WBT_SQ
+	bool "Single queue writeback throttling"
+	default n
+	depends on BLK_WBT
+	---help---
+	Enable writeback throttling by default on legacy single queue devices
+
+config BLK_WBT_MQ
+	bool "Multiqueue writeback throttling"
+	default y
+	depends on BLK_WBT
+	---help---
+	Enable writeback throttling by default on multiqueue devices.
+	Multiqueue currently doesn't have support for IO scheduling,
+	enabling this option is recommended.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
diff --git a/block/blk-core.c b/block/blk-core.c
index 216372b01624cd..59f8129a42950e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-wbt.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -882,6 +883,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 
 fail:
 	blk_free_flush_queue(q->fq);
+	wbt_exit(q);
 	return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1344,6 +1346,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
+	wbt_requeue(q->rq_wb, &rq->issue_stat);
 
 	if (rq->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, rq);
@@ -1436,6 +1439,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	/* this is a bio leak */
 	WARN_ON(req->bio != NULL);
 
+	wbt_done(q->rq_wb, &req->issue_stat);
+
 	/*
 	 * Request may not have originated from ll_rw_blk. if not,
 	 * it didn't come out of our reserved rq pools
@@ -1663,6 +1668,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	int el_ret, where = ELEVATOR_INSERT_SORT;
 	struct request *req;
 	unsigned int request_count = 0;
+	unsigned int wb_acct;
 
 	/*
 	 * low level driver can indicate that it wants pages above a
@@ -1715,17 +1721,22 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	}
 
 get_rq:
+	wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+
 	/*
 	 * Grab a free request. This is might sleep but can not fail.
 	 * Returns with the queue unlocked.
 	 */
 	req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
 	if (IS_ERR(req)) {
+		__wbt_done(q->rq_wb, wb_acct);
 		bio->bi_error = PTR_ERR(req);
 		bio_endio(bio);
 		goto out_unlock;
 	}
 
+	wbt_track(&req->issue_stat, wb_acct);
+
 	/*
 	 * After dropping the lock and possibly sleeping here, our request
 	 * may now be mergeable after it had proven unmergeable (above).
@@ -2467,6 +2478,7 @@ void blk_start_request(struct request *req)
 	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
 		blk_stat_set_issue_time(&req->issue_stat);
 		req->rq_flags |= RQF_STATS;
+		wbt_issue(req->q->rq_wb, &req->issue_stat);
 	}
 
 	/*
@@ -2708,9 +2720,10 @@ void blk_finish_request(struct request *req, int error)
 
 	blk_account_io_done(req);
 
-	if (req->end_io)
+	if (req->end_io) {
+		wbt_done(req->q->rq_wb, &req->issue_stat);
 		req->end_io(req, error);
-	else {
+	} else {
 		if (blk_bidi_rq(req))
 			__blk_put_request(req->next_rq->q, req->next_rq);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 19795886d46e8a..d180c989a0e522 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -31,6 +31,7 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
+#include "blk-wbt.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -326,6 +327,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 
 	if (rq->rq_flags & RQF_MQ_INFLIGHT)
 		atomic_dec(&hctx->nr_active);
+
+	wbt_done(q->rq_wb, &rq->issue_stat);
 	rq->rq_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
@@ -354,6 +357,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
 	blk_account_io_done(rq);
 
 	if (rq->end_io) {
+		wbt_done(rq->q->rq_wb, &rq->issue_stat);
 		rq->end_io(rq, error);
 	} else {
 		if (unlikely(blk_bidi_rq(rq)))
@@ -471,6 +475,7 @@ void blk_mq_start_request(struct request *rq)
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
 		blk_stat_set_issue_time(&rq->issue_stat);
 		rq->rq_flags |= RQF_STATS;
+		wbt_issue(q->rq_wb, &rq->issue_stat);
 	}
 
 	blk_add_timer(rq);
@@ -508,6 +513,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 	struct request_queue *q = rq->q;
 
 	trace_block_rq_requeue(q, rq);
+	wbt_requeue(q->rq_wb, &rq->issue_stat);
 
 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 		if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -1339,6 +1345,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
 	blk_qc_t cookie;
+	unsigned int wb_acct;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1353,9 +1360,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return BLK_QC_T_NONE;
 
+	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
 	rq = blk_mq_map_request(q, bio, &data);
-	if (unlikely(!rq))
+	if (unlikely(!rq)) {
+		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
+	}
+
+	wbt_track(&rq->issue_stat, wb_acct);
 
 	cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
@@ -1439,6 +1452,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	struct blk_mq_alloc_data data;
 	struct request *rq;
 	blk_qc_t cookie;
+	unsigned int wb_acct;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1455,9 +1469,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	} else
 		request_count = blk_plug_queued_count(q);
 
+	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
 	rq = blk_mq_map_request(q, bio, &data);
-	if (unlikely(!rq))
+	if (unlikely(!rq)) {
+		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
+	}
+
+	wbt_track(&rq->issue_stat, wb_acct);
 
 	cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
@@ -2139,6 +2159,8 @@ void blk_mq_free_queue(struct request_queue *q)
 	list_del_init(&q->all_q_node);
 	mutex_unlock(&all_q_mutex);
 
+	wbt_exit(q);
+
 	blk_mq_del_queue_tag_set(q);
 
 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9cf0537593630f..c7ccabc0ec3ea6 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -13,6 +13,7 @@
 #include <linux/gfp.h>
 
 #include "blk.h"
+#include "blk-wbt.h"
 
 unsigned long blk_max_low_pfn;
 EXPORT_SYMBOL(blk_max_low_pfn);
@@ -845,6 +846,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
 	q->queue_depth = depth;
+	wbt_set_queue_depth(q->rq_wb, depth);
 }
 EXPORT_SYMBOL(blk_set_queue_depth);
 
@@ -868,6 +870,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
 	else
 		queue_flag_clear(QUEUE_FLAG_FUA, q);
 	spin_unlock_irq(q->queue_lock);
+
+	wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
 }
 EXPORT_SYMBOL_GPL(blk_queue_write_cache);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9cdb7247727a55..9262d2d60a0916 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -13,6 +13,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-wbt.h"
 
 struct queue_sysfs_entry {
 	struct attribute attr;
@@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
+static ssize_t queue_var_store64(u64 *var, const char *page)
+{
+	int err;
+	u64 v;
+
+	err = kstrtou64(page, 10, &v);
+	if (err < 0)
+		return err;
+
+	*var = v;
+	return 0;
+}
+
 static ssize_t queue_requests_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(q->nr_requests, (page));
@@ -364,6 +378,32 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	return ret;
 }
 
+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
+{
+	if (!q->rq_wb)
+		return -EINVAL;
+
+	return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+}
+
+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
+				  size_t count)
+{
+	ssize_t ret;
+	u64 val;
+
+	if (!q->rq_wb)
+		return -EINVAL;
+
+	ret = queue_var_store64(&val, page);
+	if (ret < 0)
+		return ret;
+
+	q->rq_wb->min_lat_nsec = val * 1000ULL;
+	wbt_update_limits(q->rq_wb);
+	return count;
+}
+
 static ssize_t queue_wc_show(struct request_queue *q, char *page)
 {
 	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
@@ -578,6 +618,12 @@ static struct queue_sysfs_entry queue_stats_entry = {
 	.show = queue_stats_show,
 };
 
+static struct queue_sysfs_entry queue_wb_lat_entry = {
+	.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_wb_lat_show,
+	.store = queue_wb_lat_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -608,6 +654,7 @@ static struct attribute *default_attrs[] = {
 	&queue_wc_entry.attr,
 	&queue_dax_entry.attr,
 	&queue_stats_entry.attr,
+	&queue_wb_lat_entry.attr,
 	NULL,
 };
 
@@ -682,6 +729,7 @@ static void blk_release_queue(struct kobject *kobj)
 	struct request_queue *q =
 		container_of(kobj, struct request_queue, kobj);
 
+	wbt_exit(q);
 	bdi_exit(&q->backing_dev_info);
 	blkcg_exit_queue(q);
 
@@ -722,6 +770,44 @@ struct kobj_type blk_queue_ktype = {
 	.release	= blk_release_queue,
 };
 
+static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
+{
+	blk_queue_stat_get(data, stat);
+}
+
+static void blk_wb_stat_clear(void *data)
+{
+	blk_stat_clear(data);
+}
+
+static bool blk_wb_stat_is_current(struct blk_rq_stat *stat)
+{
+	return blk_stat_is_current(stat);
+}
+
+static struct wb_stat_ops wb_stat_ops = {
+	.get		= blk_wb_stat_get,
+	.is_current	= blk_wb_stat_is_current,
+	.clear		= blk_wb_stat_clear,
+};
+
+static void blk_wb_init(struct request_queue *q)
+{
+#ifndef CONFIG_BLK_WBT_MQ
+	if (q->mq_ops)
+		return;
+#endif
+#ifndef CONFIG_BLK_WBT_SQ
+	if (q->request_fn)
+		return;
+#endif
+
+	/*
+	 * If this fails, we don't get throttling
+	 */
+	wbt_init(q, &wb_stat_ops);
+}
+
 int blk_register_queue(struct gendisk *disk)
 {
 	int ret;
@@ -761,6 +847,8 @@ int blk_register_queue(struct gendisk *disk)
 	if (q->mq_ops)
 		blk_mq_register_dev(dev, q);
 
+	blk_wb_init(q);
+
 	if (!q->request_fn)
 		return 0;
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 61010511c5a028..e280d08ef6d7ac 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -16,6 +16,7 @@
 #include <linux/blktrace_api.h>
 #include <linux/blk-cgroup.h>
 #include "blk.h"
+#include "blk-wbt.h"
 
 /*
  * tunables
@@ -3762,9 +3763,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
 	uint64_t serial_nr;
+	bool nonroot_cg;
 
 	rcu_read_lock();
 	serial_nr = bio_blkcg(bio)->css.serial_nr;
+	nonroot_cg = bio_blkcg(bio) != &blkcg_root;
 	rcu_read_unlock();
 
 	/*
@@ -3774,6 +3777,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
 		return;
 
+	/*
+	 * If we have a non-root cgroup, we can depend on that to
+	 * do proper throttling of writes. Turn off wbt for that
+	 * case.
+	 */
+	if (nonroot_cg) {
+		struct request_queue *q = cfqd->queue;
+
+		wbt_disable(q->rq_wb);
+	}
+
 	/*
 	 * Drop reference to queues.  New queues will be assigned in new
 	 * group upon arrival of fresh requests.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 303723a2e5b8aa..15da9e430f90a0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -38,6 +38,7 @@ struct bsg_job;
 struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
+struct rq_wb;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -383,6 +384,8 @@ struct request_queue {
 	int			nr_rqs[2];	/* # allocated [a]sync rqs */
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
+	struct rq_wb		*rq_wb;
+
 	/*
 	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
 	 * is used, root blkg allocates from @q->root_rl and all other

From 066a4a73cee9a44a906b98825e70c47de5bd8b5c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 11 Nov 2016 12:24:46 -0700
Subject: [PATCH 071/182] blk-mq: blk_mq_try_issue_directly() should lookup
 hardware queue

A previous commit changed this to pass in the hardware queue, but
it was using the wrong hardware queue. Hence a request that was
allocated on one hardware queue ended up being issued on another
one, and that caused IO timeouts and oopses on some drivers. Since
the request holds hardware queue private resources, like a tag,
we can't just issue it on a different hardware queue.

Fixes: 2253efc850c4 ("blk-mq: Move more code into blk_mq_direct_issue_request()")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index d180c989a0e522..77110aed24ea07 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1291,11 +1291,11 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
-static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-				      struct request *rq, blk_qc_t *cookie)
+static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
 	int ret;
 	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
 		.list = NULL,
@@ -1414,11 +1414,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 		if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
 			rcu_read_lock();
-			blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+			blk_mq_try_issue_directly(old_rq, &cookie);
 			rcu_read_unlock();
 		} else {
 			srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
-			blk_mq_try_issue_directly(data.hctx, old_rq, &cookie);
+			blk_mq_try_issue_directly(old_rq, &cookie);
 			srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
 		}
 		goto done;

From b425b0201e89e6509032985532a33f1f92ac62a6 Mon Sep 17 00:00:00 2001
From: Sachin Shukla <sachin.s5@samsung.com>
Date: Fri, 11 Nov 2016 14:34:51 +0530
Subject: [PATCH 072/182] Block: mtip32xx: Improvement in code readability when
 memdup_user() fails.

There is no need to call kfree() if memdup_user() fails, as no memory
was allocated and the error in the error-valued pointer should be returned.

Signed-off-by: Sachin Shukla <sachin.s5@samsung.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3cfd879267b2dd..cfe0108a7d30a2 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2035,18 +2035,14 @@ static int exec_drive_taskfile(struct driver_data *dd,
 	taskout = req_task->out_size;
 	taskin = req_task->in_size;
 	/* 130560 = 512 * 0xFF*/
-	if (taskin > 130560 || taskout > 130560) {
-		err = -EINVAL;
-		goto abort;
-	}
+	if (taskin > 130560 || taskout > 130560)
+		return -EINVAL;
 
 	if (taskout) {
 		outbuf = memdup_user(buf + outtotal, taskout);
-		if (IS_ERR(outbuf)) {
-			err = PTR_ERR(outbuf);
-			outbuf = NULL;
-			goto abort;
-		}
+		if (IS_ERR(outbuf))
+			return PTR_ERR(outbuf);
+
 		outbuf_dma = pci_map_single(dd->pdev,
 					 outbuf,
 					 taskout,

From bbd7bb7017d5c2b1e75f3818b4ce88fa58bb0eab Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 4 Nov 2016 09:34:34 -0600
Subject: [PATCH 073/182] block: move poll code to blk-mq

The poll code is blk-mq specific, let's move it to blk-mq.c. This
is a prep patch for improving the polling code.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-core.c             | 46 ------------------------------
 block/blk-mq.c               | 54 ++++++++++++++++++++++++++++++++++++
 drivers/nvme/target/io-cmd.c |  2 +-
 fs/direct-io.c               |  2 +-
 include/linux/blkdev.h       |  2 +-
 5 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 59f8129a42950e..eea246567884bc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3312,52 +3312,6 @@ void blk_finish_plug(struct blk_plug *plug)
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
-bool blk_poll(struct request_queue *q, blk_qc_t cookie)
-{
-	struct blk_plug *plug;
-	long state;
-	unsigned int queue_num;
-	struct blk_mq_hw_ctx *hctx;
-
-	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
-	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-		return false;
-
-	queue_num = blk_qc_t_to_queue_num(cookie);
-	hctx = q->queue_hw_ctx[queue_num];
-	hctx->poll_considered++;
-
-	plug = current->plug;
-	if (plug)
-		blk_flush_plug_list(plug, false);
-
-	state = current->state;
-	while (!need_resched()) {
-		int ret;
-
-		hctx->poll_invoked++;
-
-		ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
-		if (ret > 0) {
-			hctx->poll_success++;
-			set_current_state(TASK_RUNNING);
-			return true;
-		}
-
-		if (signal_pending_state(state, current))
-			set_current_state(TASK_RUNNING);
-
-		if (current->state == TASK_RUNNING)
-			return true;
-		if (ret < 0)
-			break;
-		cpu_relax();
-	}
-
-	return false;
-}
-EXPORT_SYMBOL_GPL(blk_poll);
-
 #ifdef CONFIG_PM
 /**
  * blk_pm_runtime_init - Block layer runtime PM initialization routine
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 77110aed24ea07..ae8df5ec20d386 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2461,6 +2461,60 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct request_queue *q = hctx->queue;
+	long state;
+
+	hctx->poll_considered++;
+
+	state = current->state;
+	while (!need_resched()) {
+		int ret;
+
+		hctx->poll_invoked++;
+
+		ret = q->mq_ops->poll(hctx, rq->tag);
+		if (ret > 0) {
+			hctx->poll_success++;
+			set_current_state(TASK_RUNNING);
+			return true;
+		}
+
+		if (signal_pending_state(state, current))
+			set_current_state(TASK_RUNNING);
+
+		if (current->state == TASK_RUNNING)
+			return true;
+		if (ret < 0)
+			break;
+		cpu_relax();
+	}
+
+	return false;
+}
+
+bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_plug *plug;
+	struct request *rq;
+
+	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
+	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+		return false;
+
+	plug = current->plug;
+	if (plug)
+		blk_flush_plug_list(plug, false);
+
+	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+	rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+
+	return __blk_mq_poll(hctx, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_poll);
+
 void blk_mq_disable_hotplug(void)
 {
 	mutex_lock(&all_q_mutex);
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index c2784cfc5e298b..ef52b1e701447e 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -96,7 +96,7 @@ static void nvmet_execute_rw(struct nvmet_req *req)
 
 	cookie = submit_bio(bio);
 
-	blk_poll(bdev_get_queue(req->ns->bdev), cookie);
+	blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie);
 }
 
 static void nvmet_execute_flush(struct nvmet_req *req)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a5138c564019a4..835e23a4ee4b44 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -457,7 +457,7 @@ static struct bio *dio_await_one(struct dio *dio)
 		dio->waiter = current;
 		spin_unlock_irqrestore(&dio->bio_lock, flags);
 		if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+		    !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
 			io_schedule();
 		/* wake up sets us TASK_RUNNING */
 		spin_lock_irqsave(&dio->bio_lock, flags);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 15da9e430f90a0..bab18ee5810d3a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -952,7 +952,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 
-bool blk_poll(struct request_queue *q, blk_qc_t cookie);
+bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {

From d8a0cbfd73cb7281120d1b49f90afeef26ad48a2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 10 Nov 2016 21:52:53 -0700
Subject: [PATCH 074/182] blk-wbt: store queue instead of bdi

The bdi was a leftover from when the code was block layer agnostic.
Now that we just support a block layer user, store the queue directly.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-wbt.c | 20 ++++++++++++--------
 block/blk-wbt.h |  4 +---
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 889c17ff85031a..4ab9cebc8003af 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -96,7 +96,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
  */
 static bool wb_recent_wait(struct rq_wb *rwb)
 {
-	struct bdi_writeback *wb = &rwb->bdi->wb;
+	struct bdi_writeback *wb = &rwb->queue->backing_dev_info.wb;
 
 	return time_before(jiffies, wb->dirty_sleep + HZ);
 }
@@ -279,6 +279,7 @@ enum {
 
 static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 {
+	struct backing_dev_info *bdi = &rwb->queue->backing_dev_info;
 	u64 thislat;
 
 	/*
@@ -293,7 +294,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 	thislat = rwb_sync_issue_lat(rwb);
 	if (thislat > rwb->cur_win_nsec ||
 	    (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
-		trace_wbt_lat(rwb->bdi, thislat);
+		trace_wbt_lat(bdi, thislat);
 		return LAT_EXCEEDED;
 	}
 
@@ -317,13 +318,13 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 	 * If the 'min' latency exceeds our target, step down.
 	 */
 	if (stat[0].min > rwb->min_lat_nsec) {
-		trace_wbt_lat(rwb->bdi, stat[0].min);
-		trace_wbt_stat(rwb->bdi, stat);
+		trace_wbt_lat(bdi, stat[0].min);
+		trace_wbt_stat(bdi, stat);
 		return LAT_EXCEEDED;
 	}
 
 	if (rwb->scale_step)
-		trace_wbt_stat(rwb->bdi, stat);
+		trace_wbt_stat(bdi, stat);
 
 	return LAT_OK;
 }
@@ -338,7 +339,9 @@ static int latency_exceeded(struct rq_wb *rwb)
 
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
-	trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
+	struct backing_dev_info *bdi = &rwb->queue->backing_dev_info;
+
+	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
 			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
 }
 
@@ -420,7 +423,8 @@ static void wb_timer_fn(unsigned long data)
 
 	status = latency_exceeded(rwb);
 
-	trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
+	trace_wbt_timer(&rwb->queue->backing_dev_info, status, rwb->scale_step,
+			inflight);
 
 	/*
 	 * If we exceeded the latency target, step down. If we did not,
@@ -700,7 +704,7 @@ int wbt_init(struct request_queue *q, struct wb_stat_ops *ops)
 	rwb->wc = 1;
 	rwb->queue_depth = RWB_DEF_DEPTH;
 	rwb->last_comp = rwb->last_issue = jiffies;
-	rwb->bdi = &q->backing_dev_info;
+	rwb->queue = q;
 	rwb->win_nsec = RWB_WINDOW_NSEC;
 	rwb->stat_ops = ops;
 	rwb->ops_data = q;
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index e57700337c2656..09c61a3f8295d9 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -87,7 +87,7 @@ struct rq_wb {
 	unsigned long last_issue;		/* last non-throttled issue */
 	unsigned long last_comp;		/* last non-throttled comp */
 	unsigned long min_lat_nsec;
-	struct backing_dev_info *bdi;
+	struct request_queue *queue;
 	struct rq_wait rq_wait[WBT_NUM_RWQ];
 
 	struct wb_stat_ops *stat_ops;
@@ -104,8 +104,6 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
 	return ret;
 }
 
-struct backing_dev_info;
-
 #ifdef CONFIG_BLK_WBT
 
 void __wbt_done(struct rq_wb *, enum wbt_flags);

From 8054b89f8fca75d514965ee627a15b47020d2053 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 10 Nov 2016 21:50:51 -0700
Subject: [PATCH 075/182] blk-wbt: remove stat ops

Again a leftover from when the throttling code was generic. Now that we
just have the block user, get rid of the stat ops and indirections.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-sysfs.c | 23 +----------------------
 block/blk-wbt.c   | 15 +++++----------
 block/blk-wbt.h   | 13 ++-----------
 3 files changed, 8 insertions(+), 43 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9262d2d60a0916..415e764807d03d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -770,27 +770,6 @@ struct kobj_type blk_queue_ktype = {
 	.release	= blk_release_queue,
 };
 
-static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
-{
-	blk_queue_stat_get(data, stat);
-}
-
-static void blk_wb_stat_clear(void *data)
-{
-	blk_stat_clear(data);
-}
-
-static bool blk_wb_stat_is_current(struct blk_rq_stat *stat)
-{
-	return blk_stat_is_current(stat);
-}
-
-static struct wb_stat_ops wb_stat_ops = {
-	.get		= blk_wb_stat_get,
-	.is_current	= blk_wb_stat_is_current,
-	.clear		= blk_wb_stat_clear,
-};
-
 static void blk_wb_init(struct request_queue *q)
 {
 #ifndef CONFIG_BLK_WBT_MQ
@@ -805,7 +784,7 @@ static void blk_wb_init(struct request_queue *q)
 	/*
 	 * If this fails, we don't get throttling
 	 */
-	wbt_init(q, &wb_stat_ops);
+	wbt_init(q);
 }
 
 int blk_register_queue(struct gendisk *disk)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 4ab9cebc8003af..f6ec7e587fa605 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -308,7 +308,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 		 * waited or still has writes in flights, consider us doing
 		 * just writes as well.
 		 */
-		if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
+		if ((stat[1].nr_samples && blk_stat_is_current(stat)) ||
 		    wb_recent_wait(rwb) || wbt_inflight(rwb))
 			return LAT_UNKNOWN_WRITES;
 		return LAT_UNKNOWN;
@@ -333,7 +333,7 @@ static int latency_exceeded(struct rq_wb *rwb)
 {
 	struct blk_rq_stat stat[2];
 
-	rwb->stat_ops->get(rwb->ops_data, stat);
+	blk_queue_stat_get(rwb->queue, stat);
 	return __latency_exceeded(rwb, stat);
 }
 
@@ -355,7 +355,7 @@ static void scale_up(struct rq_wb *rwb)
 
 	rwb->scale_step--;
 	rwb->unknown_cnt = 0;
-	rwb->stat_ops->clear(rwb->ops_data);
+	blk_stat_clear(rwb->queue);
 
 	rwb->scaled_max = calc_wb_limits(rwb);
 
@@ -385,7 +385,7 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 
 	rwb->scaled_max = false;
 	rwb->unknown_cnt = 0;
-	rwb->stat_ops->clear(rwb->ops_data);
+	blk_stat_clear(rwb->queue);
 	calc_wb_limits(rwb);
 	rwb_trace_step(rwb, "step down");
 }
@@ -675,7 +675,7 @@ void wbt_disable(struct rq_wb *rwb)
 }
 EXPORT_SYMBOL_GPL(wbt_disable);
 
-int wbt_init(struct request_queue *q, struct wb_stat_ops *ops)
+int wbt_init(struct request_queue *q)
 {
 	struct rq_wb *rwb;
 	int i;
@@ -688,9 +688,6 @@ int wbt_init(struct request_queue *q, struct wb_stat_ops *ops)
 	BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
 	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
 
-	if (!ops->get || !ops->is_current || !ops->clear)
-		return -EINVAL;
-
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
 		return -ENOMEM;
@@ -706,8 +703,6 @@ int wbt_init(struct request_queue *q, struct wb_stat_ops *ops)
 	rwb->last_comp = rwb->last_issue = jiffies;
 	rwb->queue = q;
 	rwb->win_nsec = RWB_WINDOW_NSEC;
-	rwb->stat_ops = ops;
-	rwb->ops_data = q;
 	wbt_update_limits(rwb);
 
 	/*
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 09c61a3f8295d9..44dc2173dc1f38 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -46,12 +46,6 @@ static inline bool wbt_is_read(struct blk_issue_stat *stat)
 	return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
 }
 
-struct wb_stat_ops {
-	void (*get)(void *, struct blk_rq_stat *);
-	bool (*is_current)(struct blk_rq_stat *);
-	void (*clear)(void *);
-};
-
 struct rq_wait {
 	wait_queue_head_t wait;
 	atomic_t inflight;
@@ -89,9 +83,6 @@ struct rq_wb {
 	unsigned long min_lat_nsec;
 	struct request_queue *queue;
 	struct rq_wait rq_wait[WBT_NUM_RWQ];
-
-	struct wb_stat_ops *stat_ops;
-	void *ops_data;
 };
 
 static inline unsigned int wbt_inflight(struct rq_wb *rwb)
@@ -109,7 +100,7 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
 void __wbt_done(struct rq_wb *, enum wbt_flags);
 void wbt_done(struct rq_wb *, struct blk_issue_stat *);
 enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
-int wbt_init(struct request_queue *, struct wb_stat_ops *);
+int wbt_init(struct request_queue *);
 void wbt_exit(struct request_queue *);
 void wbt_update_limits(struct rq_wb *);
 void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
@@ -132,7 +123,7 @@ static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
 {
 	return 0;
 }
-static inline int wbt_init(struct request_queue *q, struct wb_stat_ops *ops)
+static inline int wbt_init(struct request_queue *q)
 {
 	return -EINVAL;
 }

From 382cf633edcb0371a6dd506653014897c4ac2a4d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 11 Nov 2016 08:23:53 -0700
Subject: [PATCH 076/182] blk-wbt: use BLK_STAT_{READ,WRITE} instead of 0/1

Since we have proper enums for the stats directions, use them.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-wbt.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index f6ec7e587fa605..20712f0db6eab0 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -255,8 +255,8 @@ static bool inline stat_sample_valid(struct blk_rq_stat *stat)
 	 * that it's writes impacting us, and not just some sole read on
 	 * a device that is in a lower power state.
 	 */
-	return stat[0].nr_samples >= 1 &&
-		stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+	return stat[BLK_STAT_READ].nr_samples >= 1 &&
+		stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES;
 }
 
 static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
@@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 	 */
 	thislat = rwb_sync_issue_lat(rwb);
 	if (thislat > rwb->cur_win_nsec ||
-	    (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
+	    (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) {
 		trace_wbt_lat(bdi, thislat);
 		return LAT_EXCEEDED;
 	}
@@ -308,7 +308,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 		 * waited or still has writes in flights, consider us doing
 		 * just writes as well.
 		 */
-		if ((stat[1].nr_samples && blk_stat_is_current(stat)) ||
+		if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) ||
 		    wb_recent_wait(rwb) || wbt_inflight(rwb))
 			return LAT_UNKNOWN_WRITES;
 		return LAT_UNKNOWN;
@@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 	/*
 	 * If the 'min' latency exceeds our target, step down.
 	 */
-	if (stat[0].min > rwb->min_lat_nsec) {
-		trace_wbt_lat(bdi, stat[0].min);
+	if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) {
+		trace_wbt_lat(bdi, stat[BLK_STAT_READ].min);
 		trace_wbt_stat(bdi, stat);
 		return LAT_EXCEEDED;
 	}

From dbb3ab03563cbf0880b1801a238b3e3964e364a1 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Sun, 25 Sep 2016 19:54:38 -0700
Subject: [PATCH 077/182] bsg: Add sparse annotations to bsg_request_fn()

Avoid that sparse complains about unbalanced lock actions.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bsg-lib.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 650f427d915be2..b2a61e3ecb1472 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -161,6 +161,8 @@ static int bsg_create_job(struct device *dev, struct request *req)
  * Drivers/subsys should pass this to the queue init function.
  */
 void bsg_request_fn(struct request_queue *q)
+	__releases(q->queue_lock)
+	__acquires(q->queue_lock)
 {
 	struct device *dev = q->queuedata;
 	struct request *req;

From c6463c651d7ab599a4878115723a5bcb6ae5b470 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Fri, 11 Nov 2016 14:53:26 +0900
Subject: [PATCH 078/182] sd_zbc: Force use of READ16/WRITE16

Normally, sd_read_capacity sets sdp->use_16_for_rw to 1 based on the
disk capacity so that READ16/WRITE16 are used for large drives.
However, for a zoned disk with RC_BASIS set to 0, the capacity reported
through READ_CAPACITY may be very small, leading to use_16_for_rw not being
set and READ10/WRITE10 commands being used, even after the actual zoned disk
capacity is corrected in sd_zbc_read_zones. This causes LBA offset overflow for
accesses beyond 2TB.

As the ZBC standard makes it mandatory for ZBC drives to support
the READ16/WRITE16 commands anyway, make sure that use_16_for_rw is set.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
eviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/scsi/sd_zbc.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 394ab490919c6b..92620c8ea8ad93 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -612,6 +612,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp,
 	if (ret)
 		goto err;
 
+	/* READ16/WRITE16 is mandatory for ZBC disks */
+	sdkp->device->use_16_for_rw = 1;
+	sdkp->device->use_10_for_rw = 0;
+
 	return 0;
 
 err:

From b4a567e8114327518c09f5632339a5954ab975a3 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 14 Nov 2016 14:56:17 -0800
Subject: [PATCH 079/182] loop: return proper error from loop_queue_rq()

->queue_rq() should return one of the BLK_MQ_RQ_QUEUE_* constants, not
an errno.

f4aa4c7bbac6 ("block: loop: convert to per-device workqueue")
Signed-off-by: Omar Sandoval <osandov@fb.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fa1b7a90ba11de..4af81876679766 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1646,7 +1646,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	blk_mq_start_request(bd->rq);
 
 	if (lo->lo_state != Lo_bound)
-		return -EIO;
+		return BLK_MQ_RQ_QUEUE_ERROR;
 
 	switch (req_op(cmd->rq)) {
 	case REQ_OP_FLUSH:

From bac0000af5f8476a64ca7529a4243e23c016fc89 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 15 Nov 2016 11:11:58 -0800
Subject: [PATCH 080/182] nvme: untangle 0 and BLK_MQ_RQ_QUEUE_OK

Let's not depend on any of the BLK_MQ_RQ_QUEUE_* constants having
specific values. No functional change.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c   | 4 ++--
 drivers/nvme/host/pci.c    | 8 ++++----
 drivers/nvme/host/rdma.c   | 2 +-
 drivers/nvme/target/loop.c | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 53584d21c80522..e54bb10ead9cf9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -269,7 +269,7 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	 */
 	req->__data_len = nr_bytes;
 
-	return 0;
+	return BLK_MQ_RQ_QUEUE_OK;
 }
 
 static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
@@ -317,7 +317,7 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
 int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *cmd)
 {
-	int ret = 0;
+	int ret = BLK_MQ_RQ_QUEUE_OK;
 
 	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
 		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 51d13d5ec7a84b..d58f8e4e2c06f2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -328,7 +328,7 @@ static int nvme_init_iod(struct request *rq, unsigned size,
 		rq->retries = 0;
 		rq->rq_flags |= RQF_DONTPREP;
 	}
-	return 0;
+	return BLK_MQ_RQ_QUEUE_OK;
 }
 
 static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
@@ -598,17 +598,17 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	map_len = nvme_map_len(req);
 	ret = nvme_init_iod(req, map_len, dev);
-	if (ret)
+	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		return ret;
 
 	ret = nvme_setup_cmd(ns, req, &cmnd);
-	if (ret)
+	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		goto out;
 
 	if (req->nr_phys_segments)
 		ret = nvme_map_data(dev, req, map_len, &cmnd);
 
-	if (ret)
+	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		goto out;
 
 	cmnd.common.command_id = req->tag;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index c4700efc03906d..ff1b340606b277 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1395,7 +1395,7 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 			sizeof(struct nvme_command), DMA_TO_DEVICE);
 
 	ret = nvme_setup_cmd(ns, rq, c);
-	if (ret)
+	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		return ret;
 
 	c->common.command_id = rq->tag;
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 26aa3a5afb0dbf..be56d0567155a2 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -169,7 +169,7 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	int ret;
 
 	ret = nvme_setup_cmd(ns, req, &iod->cmd);
-	if (ret)
+	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		return ret;
 
 	iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
@@ -179,7 +179,7 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		nvme_cleanup_cmd(req);
 		blk_mq_start_request(req);
 		nvme_loop_queue_response(&iod->req);
-		return 0;
+		return BLK_MQ_RQ_QUEUE_OK;
 	}
 
 	if (blk_rq_bytes(req)) {
@@ -198,7 +198,7 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	blk_mq_start_request(req);
 
 	schedule_work(&iod->work);
-	return 0;
+	return BLK_MQ_RQ_QUEUE_OK;
 }
 
 static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx)

From 2868f13c303e1472bfd3941c62916c1f4128a713 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 15 Nov 2016 11:11:59 -0800
Subject: [PATCH 081/182] scsi_lib: untangle 0 and BLK_MQ_RQ_QUEUE_OK

Let's not depend on any of the BLK_MQ_RQ_QUEUE_* constants having
specific values. No functional change.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/scsi/scsi_lib.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2e35132f8be17b..47a5c8783b894c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1810,7 +1810,7 @@ static inline int prep_to_mq(int ret)
 {
 	switch (ret) {
 	case BLKPREP_OK:
-		return 0;
+		return BLK_MQ_RQ_QUEUE_OK;
 	case BLKPREP_DEFER:
 		return BLK_MQ_RQ_QUEUE_BUSY;
 	default:
@@ -1897,7 +1897,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	int reason;
 
 	ret = prep_to_mq(scsi_prep_state_check(sdev, req));
-	if (ret)
+	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		goto out;
 
 	ret = BLK_MQ_RQ_QUEUE_BUSY;
@@ -1914,7 +1914,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	if (!(req->rq_flags & RQF_DONTPREP)) {
 		ret = prep_to_mq(scsi_mq_prep_fn(req));
-		if (ret)
+		if (ret != BLK_MQ_RQ_QUEUE_OK)
 			goto out_dec_host_busy;
 		req->rq_flags |= RQF_DONTPREP;
 	} else {

From 0a6219a95f0b0690fb7094acb26002e7a4791197 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Wed, 16 Nov 2016 18:07:05 +0800
Subject: [PATCH 082/182] block: deal with stale req count of plug list

In both legacy and mq path, req count of plug list is computed
before allocating request, so the number can be stale when falling
back to slept allocation, also the new introduced wbt can sleep
too.

This patch deals with the case by checking if plug list becomes
empty, and fixes the KASAN report of 'BUG: KASAN: stack-out-of-bounds'
which is introduced by Shaohua's patches of dispatching big request.

Fixes: 600271d900002(blk-mq: immediately dispatch big size request)
Fixes: 50d24c34403c6(block: immediately dispatch big size request)
Cc: Shaohua Li <shli@fb.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 5 ++++-
 block/blk-mq.c   | 7 +++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index eea246567884bc..473dd698effdb8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1753,8 +1753,11 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 		/*
 		 * If this is the first request added after a plug, fire
 		 * of a plug trace.
+		 *
+		 * @request_count may become stale because of schedule
+		 * out, so check plug list again.
 		 */
-		if (!request_count)
+		if (!request_count || list_empty(&plug->list))
 			trace_block_plug(q);
 		else {
 			struct request *last = list_entry_rq(plug->list.prev);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ae8df5ec20d386..f39e69c732cc62 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1497,6 +1497,13 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		struct request *last = NULL;
 
 		blk_mq_bio_to_request(rq, bio);
+
+		/*
+		 * @request_count may become stale because of schedule
+		 * out, so check the list again.
+		 */
+		if (list_empty(&plug->mq_list))
+			request_count = 0;
 		if (!request_count)
 			trace_block_plug(q);
 		else

From 92153d30c7c8863022cad684926d71ffaed6a56d Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <yasu.isimatu@gmail.com>
Date: Wed, 16 Nov 2016 08:26:11 -0700
Subject: [PATCH 083/182] null_blk: add usage hints for NVM

If CONFIG_NVM is disabled, loading null_block module with use_lightnvm=1
fails. But there are no messages and documents related to the failure.

Add the appropriate error message.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

Massaged the text a bit.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/null_blk.txt | 2 +-
 drivers/block/null_blk.c         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index d8880ca30af4c3..3140dbd860d8c7 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -72,4 +72,4 @@ use_per_node_hctx=[0/1]: Default: 0
      queue for each CPU node in the system.
 
 use_lightnvm=[0/1]: Default: 0
-  Register device with LightNVM. Requires blk-mq to be used.
+  Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled.
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index ba6f4a2e73db50..4943ee22716e04 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -577,6 +577,7 @@ static void null_nvm_unregister(struct nullb *nullb)
 #else
 static int null_nvm_register(struct nullb *nullb)
 {
+	pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
 	return -EINVAL;
 }
 static void null_nvm_unregister(struct nullb *nullb) {}

From 4121d385f1457d9beb2067d4b5b4659ef3e6c316 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 16 Nov 2016 16:29:57 +0100
Subject: [PATCH 084/182] blk-wbt: fix old-style function declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The newly added driver causes a harmless warning in some configurations:

block/blk-wbt.c:250:1: error: ‘inline’ is not at beginning of declaration [-Werror=old-style-declaration]
 static bool inline stat_sample_valid(struct blk_rq_stat *stat)

This makes it use the expected format for the declaration.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-wbt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 20712f0db6eab0..9f97594e68ce73 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -247,7 +247,7 @@ static bool calc_wb_limits(struct rq_wb *rwb)
 	return ret;
 }
 
-static bool inline stat_sample_valid(struct blk_rq_stat *stat)
+static inline bool stat_sample_valid(struct blk_rq_stat *stat)
 {
 	/*
 	 * We need at least one read sample, and a minimum of

From 429a787be6793554ee02aacc7e1f11ebcecc4453 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 17 Nov 2016 12:30:37 -0700
Subject: [PATCH 085/182] nbd: fix use-after-free of rq/bio in the xmit path

For writes, we can get a completion in while we're still iterating
the request and bio chain. If that happens, we're reading freed
memory and we can crash.

Break out after the last segment and avoid having the iterator
read freed memory.

Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index ba405b55329fb7..807d24a2f1de7c 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -272,6 +272,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 	int result, flags;
 	struct nbd_request request;
 	unsigned long size = blk_rq_bytes(req);
+	struct bio *bio;
 	u32 type;
 
 	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
@@ -305,16 +306,20 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 		return -EIO;
 	}
 
-	if (type == NBD_CMD_WRITE) {
-		struct req_iterator iter;
+	if (type != NBD_CMD_WRITE)
+		return 0;
+
+	flags = 0;
+	bio = req->bio;
+	while (bio) {
+		struct bio *next = bio->bi_next;
+		struct bvec_iter iter;
 		struct bio_vec bvec;
-		/*
-		 * we are really probing at internals to determine
-		 * whether to set MSG_MORE or not...
-		 */
-		rq_for_each_segment(bvec, req, iter) {
-			flags = 0;
-			if (!rq_iter_last(bvec, iter))
+
+		bio_for_each_segment(bvec, bio, iter) {
+			bool is_last = !next && bio_iter_last(bvec, iter);
+
+			if (is_last)
 				flags = MSG_MORE;
 			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 				cmd, bvec.bv_len);
@@ -325,7 +330,16 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 					result);
 				return -EIO;
 			}
+			/*
+			 * The completion might already have come in,
+			 * so break for the last one instead of letting
+			 * the iterator do it. This prevents use-after-free
+			 * of the bio.
+			 */
+			if (is_last)
+				break;
 		}
+		bio = next;
 	}
 	return 0;
 }

From 189ce2b9dcc3494410a576fbecbedbb6b21e51e0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 31 Oct 2016 11:59:25 -0600
Subject: [PATCH 086/182] block: fast-path for small and simple direct I/O
 requests

This patch adds a small and simple fast patch for small direct I/O
requests on block devices that don't use AIO.  Between the neat
bio_iov_iter_get_pages helper that avoids allocating a page array
for get_user_pages and the on-stack bio and biovec this avoid memory
allocations and atomic operations entirely in the direct I/O code
(lower levels might still do memory allocations and will usually
have at least some atomic operations, though).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-By: Stephen Bates <sbates@raithlin.com>
Reviewed-By: Stephen Bates <sbates@raithlin.com>
---
 fs/block_dev.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 05b553368bb4f1..7c3ec604907378 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -30,6 +30,7 @@
 #include <linux/cleancache.h>
 #include <linux/dax.h>
 #include <linux/badblocks.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/falloc.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -175,12 +176,91 @@ static struct inode *bdev_file_inode(struct file *file)
 	return file->f_mapping->host;
 }
 
+#define DIO_INLINE_BIO_VECS 4
+
+static void blkdev_bio_end_io_simple(struct bio *bio)
+{
+	struct task_struct *waiter = bio->bi_private;
+
+	WRITE_ONCE(bio->bi_private, NULL);
+	wake_up_process(waiter);
+}
+
+static ssize_t
+__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
+		int nr_pages)
+{
+	struct file *file = iocb->ki_filp;
+	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
+	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *bvec;
+	loff_t pos = iocb->ki_pos;
+	bool should_dirty = false;
+	struct bio bio;
+	ssize_t ret;
+	blk_qc_t qc;
+	int i;
+
+	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	bio_init(&bio);
+	bio.bi_max_vecs = nr_pages;
+	bio.bi_io_vec = inline_vecs;
+	bio.bi_bdev = bdev;
+	bio.bi_iter.bi_sector = pos >> blkbits;
+	bio.bi_private = current;
+	bio.bi_end_io = blkdev_bio_end_io_simple;
+
+	ret = bio_iov_iter_get_pages(&bio, iter);
+	if (unlikely(ret))
+		return ret;
+	ret = bio.bi_iter.bi_size;
+
+	if (iov_iter_rw(iter) == READ) {
+		bio_set_op_attrs(&bio, REQ_OP_READ, 0);
+		if (iter_is_iovec(iter))
+			should_dirty = true;
+	} else {
+		bio_set_op_attrs(&bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+		task_io_account_write(ret);
+	}
+
+	qc = submit_bio(&bio);
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(bio.bi_private))
+			break;
+		if (!(iocb->ki_flags & IOCB_HIPRI) ||
+		    !blk_mq_poll(bdev_get_queue(bdev), qc))
+			io_schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	bio_for_each_segment_all(bvec, &bio, i) {
+		if (should_dirty && !PageCompound(bvec->bv_page))
+			set_page_dirty_lock(bvec->bv_page);
+		put_page(bvec->bv_page);
+	}
+
+	if (unlikely(bio.bi_error))
+		return bio.bi_error;
+	iocb->ki_pos += ret;
+	return ret;
+}
+
 static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
+	int nr_pages;
 
+	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+	if (!nr_pages)
+		return 0;
+	if (is_sync_kiocb(iocb) && nr_pages <= DIO_INLINE_BIO_VECS)
+		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
 	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
 				    blkdev_get_block, NULL, NULL,
 				    DIO_SKIP_DIO_COUNT);

From 06426adf072bca62ac31ea396ff2159a34f276c2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 14 Nov 2016 13:01:59 -0700
Subject: [PATCH 087/182] blk-mq: implement hybrid poll mode for sync O_DIRECT

This patch enables a hybrid polling mode. Instead of polling after IO
submission, we can induce an artificial delay, and then poll after that.
For example, if the IO is presumed to complete in 8 usecs from now, we
can sleep for 4 usecs, wake up, and then do our polling. This still puts
a sleep/wakeup cycle in the IO path, but instead of the wakeup happening
after the IO has completed, it'll happen before. With this hybrid
scheme, we can achieve big latency reductions while still using the same
(or less) amount of CPU.

Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-By: Stephen Bates <sbates@raithlin.com>
Reviewed-By: Stephen Bates <sbates@raithlin.com>
---
 block/blk-mq.c         | 50 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-sysfs.c      | 29 ++++++++++++++++++++++++
 block/blk.h            |  1 +
 include/linux/blkdev.h |  1 +
 4 files changed, 81 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f39e69c732cc62..8cb248fb6a6830 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -332,6 +332,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	rq->rq_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
 	blk_mq_put_tag(hctx, ctx, tag);
 	blk_queue_exit(q);
 }
@@ -2468,11 +2469,60 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+				     struct request *rq)
+{
+	struct hrtimer_sleeper hs;
+	enum hrtimer_mode mode;
+	ktime_t kt;
+
+	if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+		return false;
+
+	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+
+	/*
+	 * This will be replaced with the stats tracking code, using
+	 * 'avg_completion_time / 2' as the pre-sleep target.
+	 */
+	kt = ktime_set(0, q->poll_nsec);
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&hs.timer, kt);
+
+	hrtimer_init_sleeper(&hs, current);
+	do {
+		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+			break;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		hrtimer_start_expires(&hs.timer, mode);
+		if (hs.task)
+			io_schedule();
+		hrtimer_cancel(&hs.timer);
+		mode = HRTIMER_MODE_ABS;
+	} while (hs.task && !signal_pending(current));
+
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&hs.timer);
+	return true;
+}
+
 static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	struct request_queue *q = hctx->queue;
 	long state;
 
+	/*
+	 * If we sleep, have the caller restart the poll loop to reset
+	 * the state. Like for the other success return cases, the
+	 * caller is responsible for checking if the IO completed. If
+	 * the IO isn't complete, we'll get called again and will go
+	 * straight to the busy poll loop.
+	 */
+	if (blk_mq_poll_hybrid_sleep(q, rq))
+		return true;
+
 	hctx->poll_considered++;
 
 	state = current->state;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 415e764807d03d..dcdfcaa126539c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -350,6 +350,28 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 	return ret;
 }
 
+static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->poll_nsec / 1000, page);
+}
+
+static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
+				size_t count)
+{
+	unsigned long poll_usec;
+	ssize_t ret;
+
+	if (!q->mq_ops || !q->mq_ops->poll)
+		return -EINVAL;
+
+	ret = queue_var_store(&poll_usec, page, count);
+	if (ret < 0)
+		return ret;
+
+	q->poll_nsec = poll_usec * 1000;
+	return ret;
+}
+
 static ssize_t queue_poll_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
@@ -602,6 +624,12 @@ static struct queue_sysfs_entry queue_poll_entry = {
 	.store = queue_poll_store,
 };
 
+static struct queue_sysfs_entry queue_poll_delay_entry = {
+	.attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_poll_delay_show,
+	.store = queue_poll_delay_store,
+};
+
 static struct queue_sysfs_entry queue_wc_entry = {
 	.attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_wc_show,
@@ -655,6 +683,7 @@ static struct attribute *default_attrs[] = {
 	&queue_dax_entry.attr,
 	&queue_stats_entry.attr,
 	&queue_wb_lat_entry.attr,
+	&queue_poll_delay_entry.attr,
 	NULL,
 };
 
diff --git a/block/blk.h b/block/blk.h
index aa132dea598c5e..041185e5f12994 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -111,6 +111,7 @@ void blk_account_io_done(struct request *req);
 enum rq_atomic_flags {
 	REQ_ATOM_COMPLETE = 0,
 	REQ_ATOM_STARTED,
+	REQ_ATOM_POLL_SLEPT,
 };
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bab18ee5810d3a..37ed4ea705c840 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -509,6 +509,7 @@ struct request_queue {
 	unsigned int		request_fn_active;
 
 	unsigned int		rq_timeout;
+	unsigned int		poll_nsec;
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 	struct list_head	timeout_list;

From 64f1c21e86f7fe63337b5c23c129de3ec506431d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 14 Nov 2016 13:03:03 -0700
Subject: [PATCH 088/182] blk-mq: make the polling code adaptive

The previous commit introduced the hybrid sleep/poll mode. Take
that one step further, and use the completion latencies to
automatically sleep for half the mean completion time. This is
a good approximation.

This changes the 'io_poll_delay' sysfs file a bit to expose the
various options. Depending on the value, the polling code will
behave differently:

-1	Never enter hybrid sleep mode
 0	Use half of the completion mean for the sleep delay
>0	Use this specific value as the sleep delay

Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-By: Stephen Bates <sbates@raithlin.com>
Reviewed-By: Stephen Bates <sbates@raithlin.com>
---
 block/blk-mq.c         | 67 ++++++++++++++++++++++++++++++++++++++++--
 block/blk-sysfs.c      | 26 +++++++++++-----
 include/linux/blkdev.h |  2 +-
 3 files changed, 83 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8cb248fb6a6830..9d4a1d630d0b8b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2132,6 +2132,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	 */
 	q->nr_requests = set->queue_depth;
 
+	/*
+	 * Default to classic polling
+	 */
+	q->poll_nsec = -1;
+
 	if (set->ops->complete)
 		blk_queue_softirq_done(q, set->ops->complete);
 
@@ -2469,14 +2474,70 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
+				       struct blk_mq_hw_ctx *hctx,
+				       struct request *rq)
+{
+	struct blk_rq_stat stat[2];
+	unsigned long ret = 0;
+
+	/*
+	 * If stats collection isn't on, don't sleep but turn it on for
+	 * future users
+	 */
+	if (!blk_stat_enable(q))
+		return 0;
+
+	/*
+	 * We don't have to do this once per IO, should optimize this
+	 * to just use the current window of stats until it changes
+	 */
+	memset(&stat, 0, sizeof(stat));
+	blk_hctx_stat_get(hctx, stat);
+
+	/*
+	 * As an optimistic guess, use half of the mean service time
+	 * for this type of request. We can (and should) make this smarter.
+	 * For instance, if the completion latencies are tight, we can
+	 * get closer than just half the mean. This is especially
+	 * important on devices where the completion latencies are longer
+	 * than ~10 usec.
+	 */
+	if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
+		ret = (stat[BLK_STAT_READ].mean + 1) / 2;
+	else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
+		ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+
+	return ret;
+}
+
 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
+				     struct blk_mq_hw_ctx *hctx,
 				     struct request *rq)
 {
 	struct hrtimer_sleeper hs;
 	enum hrtimer_mode mode;
+	unsigned int nsecs;
 	ktime_t kt;
 
-	if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+		return false;
+
+	/*
+	 * poll_nsec can be:
+	 *
+	 * -1:	don't ever hybrid sleep
+	 *  0:	use half of prev avg
+	 * >0:	use this specific value
+	 */
+	if (q->poll_nsec == -1)
+		return false;
+	else if (q->poll_nsec > 0)
+		nsecs = q->poll_nsec;
+	else
+		nsecs = blk_mq_poll_nsecs(q, hctx, rq);
+
+	if (!nsecs)
 		return false;
 
 	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
@@ -2485,7 +2546,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 	 * This will be replaced with the stats tracking code, using
 	 * 'avg_completion_time / 2' as the pre-sleep target.
 	 */
-	kt = ktime_set(0, q->poll_nsec);
+	kt = ktime_set(0, nsecs);
 
 	mode = HRTIMER_MODE_REL;
 	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
@@ -2520,7 +2581,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	 * the IO isn't complete, we'll get called again and will go
 	 * straight to the busy poll loop.
 	 */
-	if (blk_mq_poll_hybrid_sleep(q, rq))
+	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
 		return true;
 
 	hctx->poll_considered++;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index dcdfcaa126539c..1855c6770045c0 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -352,24 +352,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(q->poll_nsec / 1000, page);
+	int val;
+
+	if (q->poll_nsec == -1)
+		val = -1;
+	else
+		val = q->poll_nsec / 1000;
+
+	return sprintf(page, "%d\n", val);
 }
 
 static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
 				size_t count)
 {
-	unsigned long poll_usec;
-	ssize_t ret;
+	int err, val;
 
 	if (!q->mq_ops || !q->mq_ops->poll)
 		return -EINVAL;
 
-	ret = queue_var_store(&poll_usec, page, count);
-	if (ret < 0)
-		return ret;
+	err = kstrtoint(page, 10, &val);
+	if (err < 0)
+		return err;
 
-	q->poll_nsec = poll_usec * 1000;
-	return ret;
+	if (val == -1)
+		q->poll_nsec = -1;
+	else
+		q->poll_nsec = val * 1000;
+
+	return count;
 }
 
 static ssize_t queue_poll_show(struct request_queue *q, char *page)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 37ed4ea705c840..85699bc90a5166 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -509,7 +509,7 @@ struct request_queue {
 	unsigned int		request_fn_active;
 
 	unsigned int		rq_timeout;
-	unsigned int		poll_nsec;
+	int			poll_nsec;
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 	struct list_head	timeout_list;

From 72ecad22d9f198aafee64218512e02ffa7818671 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 16 Nov 2016 23:11:42 -0700
Subject: [PATCH 089/182] block: support a full bio worth of IO for simplified
 bdev direct-io

Just alloc the bio_vec array if we exceed the inline limit.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7c3ec604907378..dad97e19bb1fd5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -193,7 +193,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	struct file *file = iocb->ki_filp;
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
 	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
-	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *bvec;
+	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
 	struct bio bio;
@@ -204,9 +204,17 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
 		return -EINVAL;
 
+	if (nr_pages <= DIO_INLINE_BIO_VECS)
+		vecs = inline_vecs;
+	else {
+		vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL);
+		if (!vecs)
+			return -ENOMEM;
+	}
+
 	bio_init(&bio);
 	bio.bi_max_vecs = nr_pages;
-	bio.bi_io_vec = inline_vecs;
+	bio.bi_io_vec = vecs;
 	bio.bi_bdev = bdev;
 	bio.bi_iter.bi_sector = pos >> blkbits;
 	bio.bi_private = current;
@@ -243,6 +251,9 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 		put_page(bvec->bv_page);
 	}
 
+	if (vecs != inline_vecs)
+		kfree(vecs);
+
 	if (unlikely(bio.bi_error))
 		return bio.bi_error;
 	iocb->ki_pos += ret;
@@ -256,10 +267,10 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	struct inode *inode = bdev_file_inode(file);
 	int nr_pages;
 
-	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
 	if (!nr_pages)
 		return 0;
-	if (is_sync_kiocb(iocb) && nr_pages <= DIO_INLINE_BIO_VECS)
+	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
 	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
 				    blkdev_get_block, NULL, NULL,

From 78250c02d9739b5b20f1884f8c1b30efc2f861e4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 17 Nov 2016 17:50:47 +0100
Subject: [PATCH 090/182] block: make __blkdev_direct_IO_sync() support
 O_SYNC/DSYNC

Split the op setting code into a helper, use it in both places.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index dad97e19bb1fd5..a1b9abe6231c19 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -176,6 +176,16 @@ static struct inode *bdev_file_inode(struct file *file)
 	return file->f_mapping->host;
 }
 
+static unsigned int dio_bio_write_op(struct kiocb *iocb)
+{
+	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+
+	/* avoid the need for a I/O completion work item */
+	if (iocb->ki_flags & IOCB_DSYNC)
+		op |= REQ_FUA;
+	return op;
+}
+
 #define DIO_INLINE_BIO_VECS 4
 
 static void blkdev_bio_end_io_simple(struct bio *bio)
@@ -226,11 +236,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	ret = bio.bi_iter.bi_size;
 
 	if (iov_iter_rw(iter) == READ) {
-		bio_set_op_attrs(&bio, REQ_OP_READ, 0);
+		bio.bi_opf = REQ_OP_READ;
 		if (iter_is_iovec(iter))
 			should_dirty = true;
 	} else {
-		bio_set_op_attrs(&bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+		bio.bi_opf = dio_bio_write_op(iocb);
 		task_io_account_write(ret);
 	}
 

From 542ff7bf18c63cf403e36a4a1c71d86dc120d924 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 16 Nov 2016 23:14:22 -0700
Subject: [PATCH 091/182] block: new direct I/O implementation

Similar to the simple fast path, but we now need a dio structure to
track multiple-bio completions.  It's basically a cut-down version
of the new iomap-based direct I/O code for filesystems, but without
all the logic to call into the filesystem for extent lookup or
allocation, and without the complex I/O completion workqueue handler
for AIO - instead we just use the FUA bit on the bios to ensure
data is flushed to stable storage.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 162 insertions(+), 4 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a1b9abe6231c19..35cc494f2e1a08 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -270,11 +270,161 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	return ret;
 }
 
+struct blkdev_dio {
+	union {
+		struct kiocb		*iocb;
+		struct task_struct	*waiter;
+	};
+	size_t			size;
+	atomic_t		ref;
+	bool			multi_bio : 1;
+	bool			should_dirty : 1;
+	bool			is_sync : 1;
+	struct bio		bio;
+};
+
+static struct bio_set *blkdev_dio_pool __read_mostly;
+
+static void blkdev_bio_end_io(struct bio *bio)
+{
+	struct blkdev_dio *dio = bio->bi_private;
+	bool should_dirty = dio->should_dirty;
+
+	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
+		if (bio->bi_error && !dio->bio.bi_error)
+			dio->bio.bi_error = bio->bi_error;
+	} else {
+		if (!dio->is_sync) {
+			struct kiocb *iocb = dio->iocb;
+			ssize_t ret = dio->bio.bi_error;
+
+			if (likely(!ret)) {
+				ret = dio->size;
+				iocb->ki_pos += ret;
+			}
+
+			dio->iocb->ki_complete(iocb, ret, 0);
+			bio_put(&dio->bio);
+		} else {
+			struct task_struct *waiter = dio->waiter;
+
+			WRITE_ONCE(dio->waiter, NULL);
+			wake_up_process(waiter);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(bio);
+	} else {
+		struct bio_vec *bvec;
+		int i;
+
+		bio_for_each_segment_all(bvec, bio, i)
+			put_page(bvec->bv_page);
+		bio_put(bio);
+	}
+}
+
 static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
+	struct block_device *bdev = I_BDEV(inode);
+	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
+	struct blkdev_dio *dio;
+	struct bio *bio;
+	bool is_read = (iov_iter_rw(iter) == READ);
+	loff_t pos = iocb->ki_pos;
+	blk_qc_t qc = BLK_QC_T_NONE;
+	int ret;
+
+	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
+	bio_get(bio); /* extra ref for the completion handler */
+
+	dio = container_of(bio, struct blkdev_dio, bio);
+	dio->is_sync = is_sync_kiocb(iocb);
+	if (dio->is_sync)
+		dio->waiter = current;
+	else
+		dio->iocb = iocb;
+
+	dio->size = 0;
+	dio->multi_bio = false;
+	dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+
+	for (;;) {
+		bio->bi_bdev = bdev;
+		bio->bi_iter.bi_sector = pos >> blkbits;
+		bio->bi_private = dio;
+		bio->bi_end_io = blkdev_bio_end_io;
+
+		ret = bio_iov_iter_get_pages(bio, iter);
+		if (unlikely(ret)) {
+			bio->bi_error = ret;
+			bio_endio(bio);
+			break;
+		}
+
+		if (is_read) {
+			bio->bi_opf = REQ_OP_READ;
+			if (dio->should_dirty)
+				bio_set_pages_dirty(bio);
+		} else {
+			bio->bi_opf = dio_bio_write_op(iocb);
+			task_io_account_write(bio->bi_iter.bi_size);
+		}
+
+		dio->size += bio->bi_iter.bi_size;
+		pos += bio->bi_iter.bi_size;
+
+		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+		if (!nr_pages) {
+			qc = submit_bio(bio);
+			break;
+		}
+
+		if (!dio->multi_bio) {
+			dio->multi_bio = true;
+			atomic_set(&dio->ref, 2);
+		} else {
+			atomic_inc(&dio->ref);
+		}
+
+		submit_bio(bio);
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
+	}
+
+	if (!dio->is_sync)
+		return -EIOCBQUEUED;
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(dio->waiter))
+			break;
+
+		if (!(iocb->ki_flags & IOCB_HIPRI) ||
+		    !blk_mq_poll(bdev_get_queue(bdev), qc))
+			io_schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	ret = dio->bio.bi_error;
+	if (likely(!ret)) {
+		ret = dio->size;
+		iocb->ki_pos += ret;
+	}
+
+	bio_put(&dio->bio);
+	return ret;
+}
+
+static ssize_t
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
 	int nr_pages;
 
 	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
@@ -282,10 +432,18 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		return 0;
 	if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
 		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
-				    blkdev_get_block, NULL, NULL,
-				    DIO_SKIP_DIO_COUNT);
+
+	return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
+}
+
+static __init int blkdev_init(void)
+{
+	blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
+	if (!blkdev_dio_pool)
+		return -ENOMEM;
+	return 0;
 }
+module_init(blkdev_init);
 
 int __sync_blockdev(struct block_device *bdev, int wait)
 {

From 10e6246e2275865ab917494e418f44b8c25ddd34 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 17 Nov 2016 22:23:02 -0700
Subject: [PATCH 092/182] block: document the 'io_poll_delay' queue sysfs file

This was documented in the original commit, 64f1c21e86f7, but it
never made it into the proper location for queue sysfs files.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/queue-sysfs.txt | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 87abf1ac29391e..14235e72a70274 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -58,6 +58,20 @@ When read, this file shows the total number of block IO polls and how
 many returned success.  Writing '0' to this file will disable polling
 for this device.  Writing any non-zero value will enable this feature.
 
+io_poll_delay (RW)
+------------------
+If polling is enabled, this controls what kind of polling will be
+performed. It defaults to -1, which is classic polling. In this mode,
+the CPU will repeatedly ask for completions without giving up any time.
+If set to 0, a hybrid polling mode is used, where the kernel will attempt
+to make an educated guess at when the IO will complete. Based on this
+guess, the kernel will put the process issuing IO to sleep for an amount
+of time, before entering a classic poll loop. This mode might be a
+little slower than pure classic polling, but it will be more efficient.
+If set to a value larger than 0, the kernel will put the process issuing
+IO to sleep for this amont of microseconds before entering classic
+polling.
+
 iostats (RW)
 -------------
 This file is used to control (on/off) the iostats accounting of the

From 55f958cc6ccf7eb21d05a7a3be0c3a92c0c804d3 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@gmail.com>
Date: Fri, 18 Nov 2016 22:21:16 +0800
Subject: [PATCH 093/182] skd_main: drop duplicate header scatterlist.h

Drop duplicate header scatterlist.h from skd_main.c.

Signed-off-by: Geliang Tang <geliangtang@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/skd_main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 1e536b97f802a2..abf805e332e2a1 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -36,7 +36,6 @@
 #include <linux/scatterlist.h>
 #include <linux/version.h>
 #include <linux/err.h>
-#include <linux/scatterlist.h>
 #include <linux/aer.h>
 #include <linux/ctype.h>
 #include <linux/wait.h>

From 9a05e7541c39680d28ecf91892338e074738d5fd Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Fri, 18 Nov 2016 15:16:06 +0100
Subject: [PATCH 094/182] block: Change extern inline to static inline

With compilers which follow the C99 standard (like modern versions of
gcc and clang), "extern inline" does the opposite thing from older
versions of gcc (emits code for an externally linkable version of the
inline function).

"static inline" does the intended behavior in all cases instead.

Description taken from commit 6d91857d4826 ("staging, rtl8192e,
LLVMLinux: Change extern inline to static inline").

This also fixes the following GCC warning when building with CONFIG_PM
disabled:

  ./include/linux/blkdev.h:1143:20: warning: no previous prototype for 'blk_set_runtime_active' [-Wmissing-prototypes]

Fixes: d07ab6d11477 ("block: Add blk_set_runtime_active()")
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 85699bc90a5166..541fdd8787a5b6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1174,7 +1174,7 @@ static inline int blk_pre_runtime_suspend(struct request_queue *q)
 static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {}
 static inline void blk_pre_runtime_resume(struct request_queue *q) {}
 static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
-extern inline void blk_set_runtime_active(struct request_queue *q) {}
+static inline void blk_set_runtime_active(struct request_queue *q) {}
 #endif
 
 /*

From 5a8b187c61e9cb1aa1e960fcbadb13beb9401e5e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 21 Nov 2016 09:33:17 -0700
Subject: [PATCH 095/182] pktcdvd: mark as unmaintained and deprecated

This driver is both orphaned, and not really useful anymore. Mark
it as such, and remove it in a future kernel after a release or
two.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 MAINTAINERS           | 4 ++--
 drivers/block/Kconfig | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7e0064e5..bbc2b39e67e957 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9551,8 +9551,8 @@ F:      arch/mips/boot/dts/pistachio/
 F:      arch/mips/configs/pistachio*_defconfig
 
 PKTCDVD DRIVER
-M:	Jiri Kosina <jikos@kernel.org>
-S:	Maintained
+S:	Orphan
+M:	linux-block@vger.kernel.org
 F:	drivers/block/pktcdvd.c
 F:	include/linux/pktcdvd.h
 F:	include/uapi/linux/pktcdvd.h
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 39dd30b6ef86e8..223ff2fcae7e06 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -384,9 +384,12 @@ config BLK_DEV_RAM_DAX
 	  allocated from highmem (only a problem for highmem systems).
 
 config CDROM_PKTCDVD
-	tristate "Packet writing on CD/DVD media"
+	tristate "Packet writing on CD/DVD media (DEPRECATED)"
 	depends on !UML
 	help
+	  Note: This driver is deprecated and will be removed from the
+	  kernel in the near future!
+
 	  If you have a CDROM/DVD drive that supports packet writing, say
 	  Y to include support. It should work with any MMC/Mt Fuji
 	  compliant ATAPI or SCSI drive, which is just about any newer

From 93c5bdf7ab71bbdae27f8f51fa175e06f000d69d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 21 Nov 2016 16:18:18 +0100
Subject: [PATCH 096/182] block: clear all of bi_opf in bio_set_op_attrs

Since commit 87374179 ("block: add a proper block layer data direction
encoding") we only or the new op and flags into bi_opf in bio_set_op_attrs
instead of clearing the old value.  I've not seen any breakage with the
new behavior, but it seems dangerous.

Also convert it to an inline function to make the argument passing
safer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4d0044d0998410..f57458a6a93bc3 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -207,8 +207,11 @@ enum req_flag_bits {
 	((req)->cmd_flags & REQ_OP_MASK)
 
 /* obsolete, don't use in new code */
-#define bio_set_op_attrs(bio, op, op_flags) \
-	((bio)->bi_opf |= (op | op_flags))
+static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
+		unsigned op_flags)
+{
+	bio->bi_opf = op | op_flags;
+}
 
 static inline bool op_is_write(unsigned int op)
 {

From 778889d8412e36e666b1e4ce108373613c84b428 Mon Sep 17 00:00:00 2001
From: Shaun Tancheff <shaun@tancheff.com>
Date: Mon, 21 Nov 2016 15:52:23 -0600
Subject: [PATCH 097/182] block: apply blk_partition_remap to REQ_OP_ZONE_RESET

If a ZBC device is partitioned and operations are performed on the partition
the zone information is rebased to the partition, however the zone reset
is not mapped from the partition to device as are other operations.

This causes the API (report zones / reset zone) to be unbalanced in this
regard. Checking for the zone reset op code explicitly will balance the
API.

Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 473dd698effdb8..6c4a425690fc7f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1787,7 +1787,12 @@ static inline void blk_partition_remap(struct bio *bio)
 {
 	struct block_device *bdev = bio->bi_bdev;
 
-	if (bio_sectors(bio) && bdev != bdev->bd_contains) {
+	/*
+	 * Zone reset does not include bi_size so bio_sectors() is always 0.
+	 * Include a test for the reset op code and perform the remap if needed.
+	 */
+	if (bdev != bdev->bd_contains &&
+	    (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)) {
 		struct hd_struct *p = bdev->bd_part;
 
 		bio->bi_iter.bi_sector += p->start_sect;

From 4d1a4765426fb57439164089b3cb537ec65356eb Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 22 Nov 2016 15:38:49 +0900
Subject: [PATCH 098/182] block_dev: Fixed direct I/O bio sector calculation

A direct I/O alignment must be always checked against the device blocks size,
but the I/O offset (bio->bi_iter.bi_sector must always use 512B sector unit, and
not the actual logical block size.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 35cc494f2e1a08..e49fb797e44763 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -226,7 +226,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	bio.bi_max_vecs = nr_pages;
 	bio.bi_io_vec = vecs;
 	bio.bi_bdev = bdev;
-	bio.bi_iter.bi_sector = pos >> blkbits;
+	bio.bi_iter.bi_sector = pos >> 9;
 	bio.bi_private = current;
 	bio.bi_end_io = blkdev_bio_end_io_simple;
 
@@ -358,7 +358,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 
 	for (;;) {
 		bio->bi_bdev = bdev;
-		bio->bi_iter.bi_sector = pos >> blkbits;
+		bio->bi_iter.bi_sector = pos >> 9;
 		bio->bi_private = dio;
 		bio->bi_end_io = blkdev_bio_end_io;
 

From 9a794fb9bddeda0b8c8c13858038318f3cbd4b7e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 22 Nov 2016 08:12:39 -0700
Subject: [PATCH 099/182] block_dev: get rid of blksize bits calculation

We store the bits in the bdev sector size locally, but we don't use
the calculation anymore. All we do with it is shift it back up to
the bdev sector size. So let's just use that directly and kill the
variable and bits calculation.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index e49fb797e44763..b0c790a19db9a3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -202,7 +202,6 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 {
 	struct file *file = iocb->ki_filp;
 	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
-	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
 	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
@@ -211,7 +210,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	blk_qc_t qc;
 	int i;
 
-	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
+	if ((pos | iov_iter_alignment(iter)) &
+	    (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
@@ -331,7 +331,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
 	struct block_device *bdev = I_BDEV(inode);
-	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
 	struct blkdev_dio *dio;
 	struct bio *bio;
 	bool is_read = (iov_iter_rw(iter) == READ);
@@ -339,7 +338,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	blk_qc_t qc = BLK_QC_T_NONE;
 	int ret;
 
-	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
+	if ((pos | iov_iter_alignment(iter)) &
+	    (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
 	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);

From 3a83f4677539bce8eaa2bca9ee9c20e172d7ab04 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Tue, 22 Nov 2016 08:57:21 -0700
Subject: [PATCH 100/182] block: bio: pass bvec table to bio_init()

Some drivers often use external bvec table, so introduce
this helper for this case. It is always safe to access the
bio->bi_io_vec in this way for this case.

After converting to this usage, it will becomes a bit easier
to evaluate the remaining direct access to bio->bi_io_vec,
so it can help to prepare for the following multipage bvec
support.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>

Fixed up the new O_DIRECT cases.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c                   |  8 ++++++--
 drivers/block/floppy.c        |  3 +--
 drivers/md/bcache/io.c        |  4 +---
 drivers/md/bcache/journal.c   |  4 +---
 drivers/md/bcache/movinggc.c  |  6 ++----
 drivers/md/bcache/request.c   |  2 +-
 drivers/md/bcache/super.c     | 12 +++---------
 drivers/md/bcache/writeback.c |  5 ++---
 drivers/md/dm-bufio.c         |  4 +---
 drivers/md/dm.c               |  2 +-
 drivers/md/multipath.c        |  2 +-
 drivers/md/raid5-cache.c      |  2 +-
 drivers/md/raid5.c            |  9 ++-------
 drivers/nvme/target/io-cmd.c  |  4 +---
 fs/block_dev.c                |  4 +---
 fs/logfs/dev_bdev.c           |  4 +---
 include/linux/bio.h           |  3 ++-
 17 files changed, 28 insertions(+), 50 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 2cf6ebabc68c5c..de257ced69b1df 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,11 +270,15 @@ static void bio_free(struct bio *bio)
 	}
 }
 
-void bio_init(struct bio *bio)
+void bio_init(struct bio *bio, struct bio_vec *table,
+	      unsigned short max_vecs)
 {
 	memset(bio, 0, sizeof(*bio));
 	atomic_set(&bio->__bi_remaining, 1);
 	atomic_set(&bio->__bi_cnt, 1);
+
+	bio->bi_io_vec = table;
+	bio->bi_max_vecs = max_vecs;
 }
 EXPORT_SYMBOL(bio_init);
 
@@ -480,7 +484,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		return NULL;
 
 	bio = p + front_pad;
-	bio_init(bio);
+	bio_init(bio, NULL, 0);
 
 	if (nr_iovecs > inline_vecs) {
 		unsigned long idx = 0;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index e3d8e4ced4a23c..6a3ff2b2e3ae7b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3806,8 +3806,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
 
 	cbdata.drive = drive;
 
-	bio_init(&bio);
-	bio.bi_io_vec = &bio_vec;
+	bio_init(&bio, &bio_vec, 1);
 	bio_vec.bv_page = page;
 	bio_vec.bv_len = size;
 	bio_vec.bv_offset = 0;
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index e97b0acf7b8dca..db45a88c0ce9d7 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -24,9 +24,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
 	struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
 	struct bio *bio = &b->bio;
 
-	bio_init(bio);
-	bio->bi_max_vecs	 = bucket_pages(c);
-	bio->bi_io_vec		 = bio->bi_inline_vecs;
+	bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));
 
 	return bio;
 }
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 6925023e12d456..1198e53d567031 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -448,13 +448,11 @@ static void do_journal_discard(struct cache *ca)
 
 		atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
 
-		bio_init(bio);
+		bio_init(bio, bio->bi_inline_vecs, 1);
 		bio_set_op_attrs(bio, REQ_OP_DISCARD, 0);
 		bio->bi_iter.bi_sector	= bucket_to_sector(ca->set,
 						ca->sb.d[ja->discard_idx]);
 		bio->bi_bdev		= ca->bdev;
-		bio->bi_max_vecs	= 1;
-		bio->bi_io_vec		= bio->bi_inline_vecs;
 		bio->bi_iter.bi_size	= bucket_bytes(ca);
 		bio->bi_end_io		= journal_discard_endio;
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 5c4bddecfaf092..13b8a907006dd2 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -77,15 +77,13 @@ static void moving_init(struct moving_io *io)
 {
 	struct bio *bio = &io->bio.bio;
 
-	bio_init(bio);
+	bio_init(bio, bio->bi_inline_vecs,
+		 DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS));
 	bio_get(bio);
 	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 
 	bio->bi_iter.bi_size	= KEY_SIZE(&io->w->key) << 9;
-	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&io->w->key),
-					       PAGE_SECTORS);
 	bio->bi_private		= &io->cl;
-	bio->bi_io_vec		= bio->bi_inline_vecs;
 	bch_bio_map(bio, NULL);
 }
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 0d99b5f4b3e613..f49c5417527dcb 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -623,7 +623,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio)
 {
 	struct bio *bio = &s->bio.bio;
 
-	bio_init(bio);
+	bio_init(bio, NULL, 0);
 	__bio_clone_fast(bio, orig_bio);
 	bio->bi_end_io		= request_endio;
 	bio->bi_private		= &s->cl;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 988edf92846603..2fb5bfeb43e2e1 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1152,9 +1152,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
 	dc->bdev = bdev;
 	dc->bdev->bd_holder = dc;
 
-	bio_init(&dc->sb_bio);
-	dc->sb_bio.bi_max_vecs	= 1;
-	dc->sb_bio.bi_io_vec	= dc->sb_bio.bi_inline_vecs;
+	bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
 	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
 	get_page(sb_page);
 
@@ -1814,9 +1812,7 @@ static int cache_alloc(struct cache *ca)
 	__module_get(THIS_MODULE);
 	kobject_init(&ca->kobj, &bch_cache_ktype);
 
-	bio_init(&ca->journal.bio);
-	ca->journal.bio.bi_max_vecs = 8;
-	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
+	bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
 
 	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
 
@@ -1852,9 +1848,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
 	ca->bdev = bdev;
 	ca->bdev->bd_holder = ca;
 
-	bio_init(&ca->sb_bio);
-	ca->sb_bio.bi_max_vecs	= 1;
-	ca->sb_bio.bi_io_vec	= ca->sb_bio.bi_inline_vecs;
+	bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
 	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
 	get_page(sb_page);
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index e51644e503a53b..69e1ae59cab8b9 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -106,14 +106,13 @@ static void dirty_init(struct keybuf_key *w)
 	struct dirty_io *io = w->private;
 	struct bio *bio = &io->bio;
 
-	bio_init(bio);
+	bio_init(bio, bio->bi_inline_vecs,
+		 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
 	if (!io->dc->writeback_percent)
 		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 
 	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
-	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
 	bio->bi_private		= w;
-	bio->bi_io_vec		= bio->bi_inline_vecs;
 	bch_bio_map(bio, NULL);
 }
 
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b3ba142e59a4cd..262e75365cc04b 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -611,9 +611,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
 	char *ptr;
 	int len;
 
-	bio_init(&b->bio);
-	b->bio.bi_io_vec = b->bio_vec;
-	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
+	bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS);
 	b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
 	b->bio.bi_bdev = b->c->bdev;
 	b->bio.bi_end_io = inline_endio;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b2abfa41af3ef3..7915467d372686 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1525,7 +1525,7 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->bdev)
 		goto bad;
 
-	bio_init(&md->flush_bio);
+	bio_init(&md->flush_bio, NULL, 0);
 	md->flush_bio.bi_bdev = md->bdev;
 	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 673efbd6fc4767..4da06d813b8fca 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -130,7 +130,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	}
 	multipath = conf->multipaths + mp_bh->path;
 
-	bio_init(&mp_bh->bio);
+	bio_init(&mp_bh->bio, NULL, 0);
 	__bio_clone_fast(&mp_bh->bio, bio);
 
 	mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 28d015c6fffe1e..25e9622d7d80be 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1201,7 +1201,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->io_end_ios);
 	INIT_LIST_HEAD(&log->flushing_ios);
 	INIT_LIST_HEAD(&log->finished_ios);
-	bio_init(&log->flush_bio);
+	bio_init(&log->flush_bio, NULL, 0);
 
 	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
 	if (!log->io_kc)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 70acdd379e4490..5f9e28443c8acb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2004,13 +2004,8 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
-			bio_init(&dev->req);
-			dev->req.bi_io_vec = &dev->vec;
-			dev->req.bi_max_vecs = 1;
-
-			bio_init(&dev->rreq);
-			dev->rreq.bi_io_vec = &dev->rvec;
-			dev->rreq.bi_max_vecs = 1;
+			bio_init(&dev->req, &dev->vec, 1);
+			bio_init(&dev->rreq, &dev->rvec, 1);
 		}
 	}
 	return sh;
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index ef52b1e701447e..c4dc9ea8ade0b1 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -37,9 +37,7 @@ static void nvmet_inline_bio_init(struct nvmet_req *req)
 {
 	struct bio *bio = &req->inline_bio;
 
-	bio_init(bio);
-	bio->bi_max_vecs = NVMET_MAX_INLINE_BIOVEC;
-	bio->bi_io_vec = req->inline_bvec;
+	bio_init(bio, req->inline_bvec, NVMET_MAX_INLINE_BIOVEC);
 }
 
 static void nvmet_execute_rw(struct nvmet_req *req)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b0c790a19db9a3..7022ddc55b1244 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -222,9 +222,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 			return -ENOMEM;
 	}
 
-	bio_init(&bio);
-	bio.bi_max_vecs = nr_pages;
-	bio.bi_io_vec = vecs;
+	bio_init(&bio, vecs, nr_pages);
 	bio.bi_bdev = bdev;
 	bio.bi_iter.bi_sector = pos >> 9;
 	bio.bi_private = current;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a8329cc47decd6..dc8cafeee038bc 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -19,9 +19,7 @@ static int sync_request(struct page *page, struct block_device *bdev, int op)
 	struct bio bio;
 	struct bio_vec bio_vec;
 
-	bio_init(&bio);
-	bio.bi_max_vecs = 1;
-	bio.bi_io_vec = &bio_vec;
+	bio_init(&bio, &bio_vec, 1);
 	bio_vec.bv_page = page;
 	bio_vec.bv_len = PAGE_SIZE;
 	bio_vec.bv_offset = 0;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d367cd37a7f756..70a7244f08a72e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -420,7 +420,8 @@ extern int bio_phys_segments(struct request_queue *, struct bio *);
 extern int submit_bio_wait(struct bio *bio);
 extern void bio_advance(struct bio *, unsigned);
 
-extern void bio_init(struct bio *);
+extern void bio_init(struct bio *bio, struct bio_vec *table,
+		     unsigned short max_vecs);
 extern void bio_reset(struct bio *);
 void bio_chain(struct bio *, struct bio *);
 

From 06efffda51d9785ea765b42ad25bd65405facf12 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 11 Nov 2016 20:05:30 +0800
Subject: [PATCH 101/182] block: drbd: remove impossible failure handling

For a non-cloned bio, bio_add_page() only returns failure when
the io vec table is full, but in that case, bio->bi_vcnt can't
be zero at all.

So remove the impossible failure handling.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_receiver.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index a89538cb3eaabb..c7728dd77230a8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1648,20 +1648,8 @@ int drbd_submit_peer_request(struct drbd_device *device,
 
 	page_chain_for_each(page) {
 		unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
-		if (!bio_add_page(bio, page, len, 0)) {
-			/* A single page must always be possible!
-			 * But in case it fails anyways,
-			 * we deal with it, and complain (below). */
-			if (bio->bi_vcnt == 0) {
-				drbd_err(device,
-					"bio_add_page failed for len=%u, "
-					"bi_vcnt=0 (bi_sector=%llu)\n",
-					len, (uint64_t)bio->bi_iter.bi_sector);
-				err = -ENOSPC;
-				goto fail;
-			}
+		if (!bio_add_page(bio, page, len, 0))
 			goto next_bio;
-		}
 		data_size -= len;
 		sector += len >> 9;
 		--nr_pages;

From 2c73a603cd0722c0594afc9efa0a617204303348 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 11 Nov 2016 20:05:31 +0800
Subject: [PATCH 102/182] block: floppy: use bio_add_page()

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/floppy.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 6a3ff2b2e3ae7b..a391a3cfb3fe6e 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3807,12 +3807,9 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
 	cbdata.drive = drive;
 
 	bio_init(&bio, &bio_vec, 1);
-	bio_vec.bv_page = page;
-	bio_vec.bv_len = size;
-	bio_vec.bv_offset = 0;
-	bio.bi_vcnt = 1;
-	bio.bi_iter.bi_size = size;
 	bio.bi_bdev = bdev;
+	bio_add_page(&bio, page, size, 0);
+
 	bio.bi_iter.bi_sector = 0;
 	bio.bi_flags |= (1 << BIO_QUIET);
 	bio.bi_private = &cbdata;

From 84c8590646d5b35804bac60eb58b145839b5893e Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 11 Nov 2016 20:05:32 +0800
Subject: [PATCH 103/182] target: avoid accessing .bi_vcnt directly

When the bio is full, bio_add_pc_page() will return zero,
so use this information tell when the bio is full.

Also replace access to .bi_vcnt for pr_debug() with bio_segments().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/target/target_core_pscsi.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 9125d9358dea1d..04d7aa7390d0fc 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -935,13 +935,9 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 
 			rc = bio_add_pc_page(pdv->pdv_sd->request_queue,
 					bio, page, bytes, off);
-			if (rc != bytes)
-				goto fail;
-
 			pr_debug("PSCSI: bio->bi_vcnt: %d nr_vecs: %d\n",
-				bio->bi_vcnt, nr_vecs);
-
-			if (bio->bi_vcnt > nr_vecs) {
+				bio_segments(bio), nr_vecs);
+			if (rc != bytes) {
 				pr_debug("PSCSI: Reached bio->bi_vcnt max:"
 					" %d i: %d bio: %p, allocating another"
 					" bio\n", bio->bi_vcnt, i, bio);

From 4113b88a658772e09b3fe475a5a4b7c9a9cdc04b Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 11 Nov 2016 20:05:33 +0800
Subject: [PATCH 104/182] bcache: debug: avoid accessing .bi_io_vec directly

Instead we use standard iterator way to do that.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/debug.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 1c9130ae00736c..06f55056aaae86 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -107,8 +107,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 {
 	char name[BDEVNAME_SIZE];
 	struct bio *check;
-	struct bio_vec bv;
-	struct bvec_iter iter;
+	struct bio_vec bv, cbv;
+	struct bvec_iter iter, citer = { 0 };
 
 	check = bio_clone(bio, GFP_NOIO);
 	if (!check)
@@ -120,9 +120,13 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 
 	submit_bio_wait(check);
 
+	citer.bi_size = UINT_MAX;
 	bio_for_each_segment(bv, bio, iter) {
 		void *p1 = kmap_atomic(bv.bv_page);
-		void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page);
+		void *p2;
+
+		cbv = bio_iter_iovec(check, citer);
+		p2 = page_address(cbv.bv_page);
 
 		cache_set_err_on(memcmp(p1 + bv.bv_offset,
 					p2 + bv.bv_offset,
@@ -133,6 +137,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
 				 (uint64_t) bio->bi_iter.bi_sector);
 
 		kunmap_atomic(p1);
+		bio_advance_iter(check, &citer, bv.bv_len);
 	}
 
 	bio_free_pages(check);

From 739a9975468cc64b346da4d40c0a2e61b1f69af0 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 11 Nov 2016 20:05:37 +0800
Subject: [PATCH 105/182] fs: logfs: convert to bio_add_page() in
 sync_request()

Always bio_add_page() is the standard and preferred way to
do the task.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/logfs/dev_bdev.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index dc8cafeee038bc..e0107579060003 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -20,13 +20,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int op)
 	struct bio_vec bio_vec;
 
 	bio_init(&bio, &bio_vec, 1);
-	bio_vec.bv_page = page;
-	bio_vec.bv_len = PAGE_SIZE;
-	bio_vec.bv_offset = 0;
-	bio.bi_vcnt = 1;
 	bio.bi_bdev = bdev;
+	bio_add_page(&bio, page, PAGE_SIZE, 0);
 	bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
-	bio.bi_iter.bi_size = PAGE_SIZE;
 	bio_set_op_attrs(&bio, op, 0);
 
 	return submit_bio_wait(&bio);

From d4f98a89f9cdc42926fadb49ed6652c5da20fb54 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 11 Nov 2016 20:05:38 +0800
Subject: [PATCH 106/182] fs: logfs: use bio_add_page() in __bdev_writeseg()

Also this patch simplify the code a bit.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/logfs/dev_bdev.c | 51 ++++++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 31 deletions(-)

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index e0107579060003..a2e8511b89d607 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -71,56 +71,45 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct address_space *mapping = super->s_mapping_inode->i_mapping;
-	struct bio *bio;
+	struct bio *bio = NULL;
 	struct page *page;
 	unsigned int max_pages;
-	int i;
+	int i, ret;
 
 	max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
 
-	bio = bio_alloc(GFP_NOFS, max_pages);
-	BUG_ON(!bio);
-
 	for (i = 0; i < nr_pages; i++) {
-		if (i >= max_pages) {
-			/* Block layer cannot split bios :( */
-			bio->bi_vcnt = i;
-			bio->bi_iter.bi_size = i * PAGE_SIZE;
+		if (!bio) {
+			bio = bio_alloc(GFP_NOFS, max_pages);
+			BUG_ON(!bio);
+
 			bio->bi_bdev = super->s_bdev;
 			bio->bi_iter.bi_sector = ofs >> 9;
 			bio->bi_private = sb;
 			bio->bi_end_io = writeseg_end_io;
 			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-			atomic_inc(&super->s_pending_writes);
-			submit_bio(bio);
-
-			ofs += i * PAGE_SIZE;
-			index += i;
-			nr_pages -= i;
-			i = 0;
-
-			bio = bio_alloc(GFP_NOFS, max_pages);
-			BUG_ON(!bio);
 		}
 		page = find_lock_page(mapping, index + i);
 		BUG_ON(!page);
-		bio->bi_io_vec[i].bv_page = page;
-		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
-		bio->bi_io_vec[i].bv_offset = 0;
+		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
 
 		BUG_ON(PageWriteback(page));
 		set_page_writeback(page);
 		unlock_page(page);
+
+		if (!ret) {
+			/* Block layer cannot split bios :( */
+			ofs += bio->bi_iter.bi_size;
+			atomic_inc(&super->s_pending_writes);
+			submit_bio(bio);
+			bio = NULL;
+		}
+	}
+
+	if (bio) {
+		atomic_inc(&super->s_pending_writes);
+		submit_bio(bio);
 	}
-	bio->bi_vcnt = nr_pages;
-	bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
-	bio->bi_bdev = super->s_bdev;
-	bio->bi_iter.bi_sector = ofs >> 9;
-	bio->bi_private = sb;
-	bio->bi_end_io = writeseg_end_io;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-	atomic_inc(&super->s_pending_writes);
-	submit_bio(bio);
 	return 0;
 }
 

From c12484367865187107af131717a6e8ab44588fb6 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 11 Nov 2016 20:05:39 +0800
Subject: [PATCH 107/182] fs: logfs: use bio_add_page() in do_erase()

Also code gets simplified a bit.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/logfs/dev_bdev.c | 44 +++++++++++++++-----------------------------
 1 file changed, 15 insertions(+), 29 deletions(-)

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a2e8511b89d607..b9188be0bf8358 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -153,49 +153,35 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 		size_t nr_pages)
 {
 	struct logfs_super *super = logfs_super(sb);
-	struct bio *bio;
+	struct bio *bio = NULL;
 	unsigned int max_pages;
-	int i;
+	int i, ret;
 
 	max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
 
-	bio = bio_alloc(GFP_NOFS, max_pages);
-	BUG_ON(!bio);
-
 	for (i = 0; i < nr_pages; i++) {
-		if (i >= max_pages) {
-			/* Block layer cannot split bios :( */
-			bio->bi_vcnt = i;
-			bio->bi_iter.bi_size = i * PAGE_SIZE;
+		if (!bio) {
+			bio = bio_alloc(GFP_NOFS, max_pages);
+			BUG_ON(!bio);
+
 			bio->bi_bdev = super->s_bdev;
 			bio->bi_iter.bi_sector = ofs >> 9;
 			bio->bi_private = sb;
 			bio->bi_end_io = erase_end_io;
 			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+		}
+		ret = bio_add_page(bio, super->s_erase_page, PAGE_SIZE, 0);
+		if (!ret) {
+			/* Block layer cannot split bios :( */
+			ofs += bio->bi_iter.bi_size;
 			atomic_inc(&super->s_pending_writes);
 			submit_bio(bio);
-
-			ofs += i * PAGE_SIZE;
-			index += i;
-			nr_pages -= i;
-			i = 0;
-
-			bio = bio_alloc(GFP_NOFS, max_pages);
-			BUG_ON(!bio);
 		}
-		bio->bi_io_vec[i].bv_page = super->s_erase_page;
-		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
-		bio->bi_io_vec[i].bv_offset = 0;
 	}
-	bio->bi_vcnt = nr_pages;
-	bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
-	bio->bi_bdev = super->s_bdev;
-	bio->bi_iter.bi_sector = ofs >> 9;
-	bio->bi_private = sb;
-	bio->bi_end_io = erase_end_io;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-	atomic_inc(&super->s_pending_writes);
-	submit_bio(bio);
+	if (bio) {
+		atomic_inc(&super->s_pending_writes);
+		submit_bio(bio);
+	}
 	return 0;
 }
 

From 05aea81b4bc9e51dabd4559666c1d8d004f3662a Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 11 Nov 2016 20:05:40 +0800
Subject: [PATCH 108/182] fs: logfs: remove unnecesary check

The check on bio->bi_vcnt doesn't make sense in erase_end_io().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/logfs/dev_bdev.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index b9188be0bf8358..9bfa0151d7c904 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -143,7 +143,6 @@ static void erase_end_io(struct bio *bio)
 	struct logfs_super *super = logfs_super(sb); 
 
 	BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ 
-	BUG_ON(bio->bi_vcnt == 0); 
 	bio_put(bio); 
 	if (atomic_dec_and_test(&super->s_pending_writes))
 		wake_up(&wq); 

From e00f4f4d0ff7e13b9115428a245b49108d625f09 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2016 18:03:32 -0500
Subject: [PATCH 109/182] block,blkcg: use __GFP_NOWARN for best-effort
 allocations in blkcg

blkcg allocates some per-cgroup data structures with GFP_NOWAIT and
when that fails falls back to operations which aren't specific to the
cgroup.  Occassional failures are expected under pressure and falling
back to non-cgroup operation is the right thing to do.

Unfortunately, I forgot to add __GFP_NOWARN to these allocations and
these expected failures end up creating a lot of noise.  Add
__GFP_NOWARN.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Marc MERLIN <marc@merlins.org>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c  | 9 +++++----
 block/cfq-iosched.c | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b08ccbb9393a75..8ba0af780e880e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -185,7 +185,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	}
 
 	wb_congested = wb_congested_get_create(&q->backing_dev_info,
-					       blkcg->css.id, GFP_NOWAIT);
+					       blkcg->css.id,
+					       GFP_NOWAIT | __GFP_NOWARN);
 	if (!wb_congested) {
 		ret = -ENOMEM;
 		goto err_put_css;
@@ -193,7 +194,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 
 	/* allocate */
 	if (!new_blkg) {
-		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
+		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
 		if (unlikely(!new_blkg)) {
 			ret = -ENOMEM;
 			goto err_put_congested;
@@ -1022,7 +1023,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	}
 
 	spin_lock_init(&blkcg->lock);
-	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
+	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -1240,7 +1241,7 @@ int blkcg_activate_policy(struct request_queue *q,
 		if (blkg->pd[pol->plid])
 			continue;
 
-		pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+		pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
 		if (!pd)
 			swap(pd, pd_prealloc);
 		if (!pd) {
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e280d08ef6d7ac..9894dc985e0935 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3859,7 +3859,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 			goto out;
 	}
 
-	cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
+	cfqq = kmem_cache_alloc_node(cfq_pool,
+				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
 				     cfqd->queue->node);
 	if (!cfqq) {
 		cfqq = &cfqd->oom_cfqq;

From 9561a7ade0c205bc2ee035a2ac880478dcc1a024 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 22 Nov 2016 14:04:40 -0500
Subject: [PATCH 110/182] nbd: add multi-connection support

NBD can become contended on its single connection.  We have to serialize all
writes and we can only process one read response at a time.  Fix this by
allowing userspace to provide multiple connections to a single nbd device.  This
coupled with block-mq drastically increases performance in multi-process cases.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c      | 382 ++++++++++++++++++++++++---------------
 include/uapi/linux/nbd.h |   9 +-
 2 files changed, 243 insertions(+), 148 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 807d24a2f1de7c..e683e2241cbdae 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -41,26 +41,34 @@
 
 #include <linux/nbd.h>
 
+struct nbd_sock {
+	struct socket *sock;
+	struct mutex tx_lock;
+};
+
 #define NBD_TIMEDOUT			0
 #define NBD_DISCONNECT_REQUESTED	1
+#define NBD_DISCONNECTED		2
+#define NBD_RUNNING			3
 
 struct nbd_device {
 	u32 flags;
 	unsigned long runtime_flags;
-	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
+	struct nbd_sock **socks;
 	int magic;
 
 	struct blk_mq_tag_set tag_set;
 
-	struct mutex tx_lock;
+	struct mutex config_lock;
 	struct gendisk *disk;
+	int num_connections;
+	atomic_t recv_threads;
+	wait_queue_head_t recv_wq;
 	int blksize;
 	loff_t bytesize;
 
-	/* protects initialization and shutdown of the socket */
-	spinlock_t sock_lock;
 	struct task_struct *task_recv;
-	struct task_struct *task_send;
+	struct task_struct *task_setup;
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 	struct dentry *dbg_dir;
@@ -69,7 +77,7 @@ struct nbd_device {
 
 struct nbd_cmd {
 	struct nbd_device *nbd;
-	struct list_head list;
+	struct completion send_complete;
 };
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -159,22 +167,20 @@ static void nbd_end_request(struct nbd_cmd *cmd)
  */
 static void sock_shutdown(struct nbd_device *nbd)
 {
-	struct socket *sock;
-
-	spin_lock(&nbd->sock_lock);
+	int i;
 
-	if (!nbd->sock) {
-		spin_unlock_irq(&nbd->sock_lock);
+	if (nbd->num_connections == 0)
+		return;
+	if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
 		return;
-	}
-
-	sock = nbd->sock;
-	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
-	nbd->sock = NULL;
-	spin_unlock(&nbd->sock_lock);
 
-	kernel_sock_shutdown(sock, SHUT_RDWR);
-	sockfd_put(sock);
+	for (i = 0; i < nbd->num_connections; i++) {
+		struct nbd_sock *nsock = nbd->socks[i];
+		mutex_lock(&nsock->tx_lock);
+		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+		mutex_unlock(&nsock->tx_lock);
+	}
+	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
 }
 
 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
@@ -182,35 +188,31 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct nbd_device *nbd = cmd->nbd;
-	struct socket *sock = NULL;
-
-	spin_lock(&nbd->sock_lock);
 
+	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
 	set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
-
-	if (nbd->sock) {
-		sock = nbd->sock;
-		get_file(sock->file);
-	}
-
-	spin_unlock(&nbd->sock_lock);
-	if (sock) {
-		kernel_sock_shutdown(sock, SHUT_RDWR);
-		sockfd_put(sock);
-	}
-
 	req->errors++;
-	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
+
+	/*
+	 * If our disconnect packet times out then we're already holding the
+	 * config_lock and could deadlock here, so just set an error and return,
+	 * we'll handle shutting everything down later.
+	 */
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		return BLK_EH_HANDLED;
+	mutex_lock(&nbd->config_lock);
+	sock_shutdown(nbd);
+	mutex_unlock(&nbd->config_lock);
 	return BLK_EH_HANDLED;
 }
 
 /*
  *  Send or receive packet.
  */
-static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
-		int msg_flags)
+static int sock_xmit(struct nbd_device *nbd, int index, int send, void *buf,
+		     int size, int msg_flags)
 {
-	struct socket *sock = nbd->sock;
+	struct socket *sock = nbd->socks[index]->sock;
 	int result;
 	struct msghdr msg;
 	struct kvec iov;
@@ -254,19 +256,19 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 	return result;
 }
 
-static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
-		int flags)
+static inline int sock_send_bvec(struct nbd_device *nbd, int index,
+				 struct bio_vec *bvec, int flags)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+	result = sock_xmit(nbd, index, 1, kaddr + bvec->bv_offset,
 			   bvec->bv_len, flags);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* always call with the tx_lock held */
-static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
+static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 {
 	struct request *req = blk_mq_rq_from_pdu(cmd);
 	int result, flags;
@@ -274,10 +276,9 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 	unsigned long size = blk_rq_bytes(req);
 	struct bio *bio;
 	u32 type;
+	u32 tag = blk_mq_unique_tag(req);
 
-	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
-		type = NBD_CMD_DISC;
-	else if (req_op(req) == REQ_OP_DISCARD)
+	if (req_op(req) == REQ_OP_DISCARD)
 		type = NBD_CMD_TRIM;
 	else if (req_op(req) == REQ_OP_FLUSH)
 		type = NBD_CMD_FLUSH;
@@ -289,16 +290,16 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 	memset(&request, 0, sizeof(request));
 	request.magic = htonl(NBD_REQUEST_MAGIC);
 	request.type = htonl(type);
-	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
+	if (type != NBD_CMD_FLUSH) {
 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 		request.len = htonl(size);
 	}
-	memcpy(request.handle, &req->tag, sizeof(req->tag));
+	memcpy(request.handle, &tag, sizeof(tag));
 
 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
 		cmd, nbdcmd_to_ascii(type),
 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
-	result = sock_xmit(nbd, 1, &request, sizeof(request),
+	result = sock_xmit(nbd, index, 1, &request, sizeof(request),
 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
 		dev_err(disk_to_dev(nbd->disk),
@@ -323,7 +324,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 				flags = MSG_MORE;
 			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 				cmd, bvec.bv_len);
-			result = sock_send_bvec(nbd, &bvec, flags);
+			result = sock_send_bvec(nbd, index, &bvec, flags);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk),
 					"Send data failed (result %d)\n",
@@ -344,31 +345,34 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
 	return 0;
 }
 
-static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, int index,
+				 struct bio_vec *bvec)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
-			MSG_WAITALL);
+	result = sock_xmit(nbd, index, 0, kaddr + bvec->bv_offset,
+			   bvec->bv_len, MSG_WAITALL);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* NULL returned = something went wrong, inform userspace */
-static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
+static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 {
 	int result;
 	struct nbd_reply reply;
 	struct nbd_cmd *cmd;
 	struct request *req = NULL;
 	u16 hwq;
-	int tag;
+	u32 tag;
 
 	reply.magic = 0;
-	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
+	result = sock_xmit(nbd, index, 0, &reply, sizeof(reply), MSG_WAITALL);
 	if (result <= 0) {
-		dev_err(disk_to_dev(nbd->disk),
-			"Receive control failed (result %d)\n", result);
+		if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
+		    !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+			dev_err(disk_to_dev(nbd->disk),
+				"Receive control failed (result %d)\n", result);
 		return ERR_PTR(result);
 	}
 
@@ -378,7 +382,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 		return ERR_PTR(-EPROTO);
 	}
 
-	memcpy(&tag, reply.handle, sizeof(int));
+	memcpy(&tag, reply.handle, sizeof(u32));
 
 	hwq = blk_mq_unique_tag_to_hwq(tag);
 	if (hwq < nbd->tag_set.nr_hw_queues)
@@ -390,7 +394,6 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 		return ERR_PTR(-ENOENT);
 	}
 	cmd = blk_mq_rq_to_pdu(req);
-
 	if (ntohl(reply.error)) {
 		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 			ntohl(reply.error));
@@ -404,7 +407,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 		struct bio_vec bvec;
 
 		rq_for_each_segment(bvec, req, iter) {
-			result = sock_recv_bvec(nbd, &bvec);
+			result = sock_recv_bvec(nbd, index, &bvec);
 			if (result <= 0) {
 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 					result);
@@ -414,6 +417,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
 			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
 				cmd, bvec.bv_len);
 		}
+	} else {
+		/* See the comment in nbd_queue_rq. */
+		wait_for_completion(&cmd->send_complete);
 	}
 	return cmd;
 }
@@ -432,25 +438,24 @@ static struct device_attribute pid_attr = {
 	.show = pid_show,
 };
 
-static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
+struct recv_thread_args {
+	struct work_struct work;
+	struct nbd_device *nbd;
+	int index;
+};
+
+static void recv_work(struct work_struct *work)
 {
+	struct recv_thread_args *args = container_of(work,
+						     struct recv_thread_args,
+						     work);
+	struct nbd_device *nbd = args->nbd;
 	struct nbd_cmd *cmd;
-	int ret;
+	int ret = 0;
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
-
-	sk_set_memalloc(nbd->sock->sk);
-
-	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
-	if (ret) {
-		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
-		return ret;
-	}
-
-	nbd_size_update(nbd, bdev);
-
 	while (1) {
-		cmd = nbd_read_stat(nbd);
+		cmd = nbd_read_stat(nbd, args->index);
 		if (IS_ERR(cmd)) {
 			ret = PTR_ERR(cmd);
 			break;
@@ -459,10 +464,14 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
 		nbd_end_request(cmd);
 	}
 
-	nbd_size_clear(nbd, bdev);
-
-	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-	return ret;
+	/*
+	 * We got an error, shut everybody down if this wasn't the result of a
+	 * disconnect request.
+	 */
+	if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+		sock_shutdown(nbd);
+	atomic_dec(&nbd->recv_threads);
+	wake_up(&nbd->recv_wq);
 }
 
 static void nbd_clear_req(struct request *req, void *data, bool reserved)
@@ -480,26 +489,35 @@ static void nbd_clear_que(struct nbd_device *nbd)
 {
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	/*
-	 * Because we have set nbd->sock to NULL under the tx_lock, all
-	 * modifications to the list must have completed by now.
-	 */
-	BUG_ON(nbd->sock);
-
 	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 }
 
 
-static void nbd_handle_cmd(struct nbd_cmd *cmd)
+static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 {
 	struct request *req = blk_mq_rq_from_pdu(cmd);
 	struct nbd_device *nbd = cmd->nbd;
+	struct nbd_sock *nsock;
+
+	if (index >= nbd->num_connections) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted send on invalid socket\n");
+		goto error_out;
+	}
+
+	if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Attempted send on closed socket\n");
+		goto error_out;
+	}
 
-	if (req->cmd_type != REQ_TYPE_FS)
+	if (req->cmd_type != REQ_TYPE_FS &&
+	    req->cmd_type != REQ_TYPE_DRV_PRIV)
 		goto error_out;
 
-	if (rq_data_dir(req) == WRITE &&
+	if (req->cmd_type == REQ_TYPE_FS &&
+	    rq_data_dir(req) == WRITE &&
 	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
 		dev_err(disk_to_dev(nbd->disk),
 			"Write on read-only\n");
@@ -508,23 +526,22 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd)
 
 	req->errors = 0;
 
-	mutex_lock(&nbd->tx_lock);
-	nbd->task_send = current;
-	if (unlikely(!nbd->sock)) {
-		mutex_unlock(&nbd->tx_lock);
+	nsock = nbd->socks[index];
+	mutex_lock(&nsock->tx_lock);
+	if (unlikely(!nsock->sock)) {
+		mutex_unlock(&nsock->tx_lock);
 		dev_err(disk_to_dev(nbd->disk),
 			"Attempted send on closed socket\n");
 		goto error_out;
 	}
 
-	if (nbd_send_cmd(nbd, cmd) != 0) {
+	if (nbd_send_cmd(nbd, cmd, index) != 0) {
 		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
 		req->errors++;
 		nbd_end_request(cmd);
 	}
 
-	nbd->task_send = NULL;
-	mutex_unlock(&nbd->tx_lock);
+	mutex_unlock(&nsock->tx_lock);
 
 	return;
 
@@ -538,39 +555,70 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 
+	/*
+	 * Since we look at the bio's to send the request over the network we
+	 * need to make sure the completion work doesn't mark this request done
+	 * before we are done doing our send.  This keeps us from dereferencing
+	 * freed data if we have particularly fast completions (ie we get the
+	 * completion before we exit sock_xmit on the last bvec) or in the case
+	 * that the server is misbehaving (or there was an error) before we're
+	 * done sending everything over the wire.
+	 */
+	init_completion(&cmd->send_complete);
 	blk_mq_start_request(bd->rq);
-	nbd_handle_cmd(cmd);
+	nbd_handle_cmd(cmd, hctx->queue_num);
+	complete(&cmd->send_complete);
+
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
+static int nbd_add_socket(struct nbd_device *nbd, struct socket *sock)
 {
-	int ret = 0;
-
-	spin_lock_irq(&nbd->sock_lock);
+	struct nbd_sock **socks;
+	struct nbd_sock *nsock;
 
-	if (nbd->sock) {
-		ret = -EBUSY;
-		goto out;
+	if (!nbd->task_setup)
+		nbd->task_setup = current;
+	if (nbd->task_setup != current) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Device being setup by another task");
+		return -EINVAL;
 	}
 
-	nbd->sock = sock;
+	socks = krealloc(nbd->socks, (nbd->num_connections + 1) *
+			 sizeof(struct nbd_sock *), GFP_KERNEL);
+	if (!socks)
+		return -ENOMEM;
+	nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
+	if (!nsock)
+		return -ENOMEM;
 
-out:
-	spin_unlock_irq(&nbd->sock_lock);
+	nbd->socks = socks;
 
-	return ret;
+	mutex_init(&nsock->tx_lock);
+	nsock->sock = sock;
+	socks[nbd->num_connections++] = nsock;
+
+	return 0;
 }
 
 /* Reset all properties of an NBD device */
 static void nbd_reset(struct nbd_device *nbd)
 {
+	int i;
+
+	for (i = 0; i < nbd->num_connections; i++)
+		kfree(nbd->socks[i]);
+	kfree(nbd->socks);
+	nbd->socks = NULL;
 	nbd->runtime_flags = 0;
 	nbd->blksize = 1024;
 	nbd->bytesize = 0;
 	set_capacity(nbd->disk, 0);
 	nbd->flags = 0;
 	nbd->tag_set.timeout = 0;
+	nbd->num_connections = 0;
+	nbd->task_setup = NULL;
 	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 }
 
@@ -596,48 +644,67 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
 		blk_queue_write_cache(nbd->disk->queue, false, false);
 }
 
+static void send_disconnects(struct nbd_device *nbd)
+{
+	struct nbd_request request = {};
+	int i, ret;
+
+	request.magic = htonl(NBD_REQUEST_MAGIC);
+	request.type = htonl(NBD_CMD_DISC);
+
+	for (i = 0; i < nbd->num_connections; i++) {
+		ret = sock_xmit(nbd, i, 1, &request, sizeof(request), 0);
+		if (ret <= 0)
+			dev_err(disk_to_dev(nbd->disk),
+				"Send disconnect failed %d\n", ret);
+	}
+}
+
 static int nbd_dev_dbg_init(struct nbd_device *nbd);
 static void nbd_dev_dbg_close(struct nbd_device *nbd);
 
-/* Must be called with tx_lock held */
-
+/* Must be called with config_lock held */
 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		       unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
 	case NBD_DISCONNECT: {
-		struct request *sreq;
-
 		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
-		if (!nbd->sock)
+		if (!nbd->socks)
 			return -EINVAL;
 
-		sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0);
-		if (!sreq)
-			return -ENOMEM;
-
-		mutex_unlock(&nbd->tx_lock);
+		mutex_unlock(&nbd->config_lock);
 		fsync_bdev(bdev);
-		mutex_lock(&nbd->tx_lock);
-		sreq->cmd_type = REQ_TYPE_DRV_PRIV;
+		mutex_lock(&nbd->config_lock);
 
 		/* Check again after getting mutex back.  */
-		if (!nbd->sock) {
-			blk_mq_free_request(sreq);
+		if (!nbd->socks)
 			return -EINVAL;
-		}
 
-		set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);
-
-		nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq));
-		blk_mq_free_request(sreq);
+		if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
+				      &nbd->runtime_flags))
+			send_disconnects(nbd);
 		return 0;
 	}
- 
+
 	case NBD_CLEAR_SOCK:
 		sock_shutdown(nbd);
 		nbd_clear_que(nbd);
 		kill_bdev(bdev);
+		nbd_bdev_reset(bdev);
+		/*
+		 * We want to give the run thread a chance to wait for everybody
+		 * to clean up and then do it's own cleanup.
+		 */
+		if (!test_bit(NBD_RUNNING, &nbd->runtime_flags)) {
+			int i;
+
+			for (i = 0; i < nbd->num_connections; i++)
+				kfree(nbd->socks[i]);
+			kfree(nbd->socks);
+			nbd->socks = NULL;
+			nbd->num_connections = 0;
+		}
 		return 0;
 
 	case NBD_SET_SOCK: {
@@ -647,7 +714,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		if (!sock)
 			return err;
 
-		err = nbd_set_socket(nbd, sock);
+		err = nbd_add_socket(nbd, sock);
 		if (!err && max_part)
 			bdev->bd_invalidated = 1;
 
@@ -676,26 +743,58 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		return 0;
 
 	case NBD_DO_IT: {
-		int error;
+		struct recv_thread_args *args;
+		int num_connections = nbd->num_connections;
+		int error, i;
 
 		if (nbd->task_recv)
 			return -EBUSY;
-		if (!nbd->sock)
+		if (!nbd->socks)
 			return -EINVAL;
+		if (num_connections > 1 &&
+		    !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
+			dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
+			goto out_err;
+		}
 
-		/* We have to claim the device under the lock */
+		set_bit(NBD_RUNNING, &nbd->runtime_flags);
+		blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
+		args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
+		if (!args)
+			goto out_err;
 		nbd->task_recv = current;
-		mutex_unlock(&nbd->tx_lock);
+		mutex_unlock(&nbd->config_lock);
 
 		nbd_parse_flags(nbd, bdev);
 
+		error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
+		if (error) {
+			dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+			goto out_recv;
+		}
+
+		nbd_size_update(nbd, bdev);
+
 		nbd_dev_dbg_init(nbd);
-		error = nbd_thread_recv(nbd, bdev);
+		for (i = 0; i < num_connections; i++) {
+			sk_set_memalloc(nbd->socks[i]->sock->sk);
+			atomic_inc(&nbd->recv_threads);
+			INIT_WORK(&args[i].work, recv_work);
+			args[i].nbd = nbd;
+			args[i].index = i;
+			queue_work(system_long_wq, &args[i].work);
+		}
+		wait_event_interruptible(nbd->recv_wq,
+					 atomic_read(&nbd->recv_threads) == 0);
+		for (i = 0; i < num_connections; i++)
+			flush_work(&args[i].work);
 		nbd_dev_dbg_close(nbd);
-
-		mutex_lock(&nbd->tx_lock);
+		nbd_size_clear(nbd, bdev);
+		device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+out_recv:
+		mutex_lock(&nbd->config_lock);
 		nbd->task_recv = NULL;
-
+out_err:
 		sock_shutdown(nbd);
 		nbd_clear_que(nbd);
 		kill_bdev(bdev);
@@ -708,7 +807,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 			error = -ETIMEDOUT;
 
 		nbd_reset(nbd);
-
 		return error;
 	}
 
@@ -740,9 +838,9 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 
 	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	mutex_lock(&nbd->tx_lock);
+	mutex_lock(&nbd->config_lock);
 	error = __nbd_ioctl(bdev, nbd, cmd, arg);
-	mutex_unlock(&nbd->tx_lock);
+	mutex_unlock(&nbd->config_lock);
 
 	return error;
 }
@@ -762,8 +860,6 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
 
 	if (nbd->task_recv)
 		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
-	if (nbd->task_send)
-		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));
 
 	return 0;
 }
@@ -887,9 +983,7 @@ static int nbd_init_request(void *data, struct request *rq,
 			    unsigned int numa_node)
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
-
 	cmd->nbd = data;
-	INIT_LIST_HEAD(&cmd->list);
 	return 0;
 }
 
@@ -999,13 +1093,13 @@ static int __init nbd_init(void)
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
 		nbd_dev[i].magic = NBD_MAGIC;
-		spin_lock_init(&nbd_dev[i].sock_lock);
-		mutex_init(&nbd_dev[i].tx_lock);
+		mutex_init(&nbd_dev[i].config_lock);
 		disk->major = NBD_MAJOR;
 		disk->first_minor = i << part_shift;
 		disk->fops = &nbd_fops;
 		disk->private_data = &nbd_dev[i];
 		sprintf(disk->disk_name, "nbd%d", i);
+		init_waitqueue_head(&nbd_dev[i].recv_wq);
 		nbd_reset(&nbd_dev[i]);
 		add_disk(disk);
 	}
diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index e08e413d5f71c9..f063cff178c58e 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -38,11 +38,12 @@ enum {
 };
 
 /* values for flags field */
-#define NBD_FLAG_HAS_FLAGS    (1 << 0) /* nbd-server supports flags */
-#define NBD_FLAG_READ_ONLY    (1 << 1) /* device is read-only */
-#define NBD_FLAG_SEND_FLUSH   (1 << 2) /* can flush writeback cache */
+#define NBD_FLAG_HAS_FLAGS	(1 << 0) /* nbd-server supports flags */
+#define NBD_FLAG_READ_ONLY	(1 << 1) /* device is read-only */
+#define NBD_FLAG_SEND_FLUSH	(1 << 2) /* can flush writeback cache */
 /* there is a gap here to match userspace */
-#define NBD_FLAG_SEND_TRIM    (1 << 5) /* send trim/discard */
+#define NBD_FLAG_SEND_TRIM	(1 << 5) /* send trim/discard */
+#define NBD_FLAG_CAN_MULTI_CONN	(1 << 7)	/* Server supports multiple connections per export. */
 
 /* userspace doesn't need the nbd_device structure */
 

From 63db89eaf0b9bf856c5e079ed5eab5c3f13165c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 22 Nov 2016 13:09:50 -0700
Subject: [PATCH 111/182] nbd: move multi-connection bit to unused value

Bit #7 is already used, move to bit #8 which is the first unused
one.

Fixes: 9561a7ade0c2 ("nbd: add multi-connection support")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/uapi/linux/nbd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index f063cff178c58e..c91c642ea9003e 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -43,7 +43,7 @@ enum {
 #define NBD_FLAG_SEND_FLUSH	(1 << 2) /* can flush writeback cache */
 /* there is a gap here to match userspace */
 #define NBD_FLAG_SEND_TRIM	(1 << 5) /* send trim/discard */
-#define NBD_FLAG_CAN_MULTI_CONN	(1 << 7)	/* Server supports multiple connections per export. */
+#define NBD_FLAG_CAN_MULTI_CONN	(1 << 8)	/* Server supports multiple connections per export. */
 
 /* userspace doesn't need the nbd_device structure */
 

From feffa5cc7b47f38210d4997ceb3fe30881d6c337 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 22 Nov 2016 19:09:45 -0700
Subject: [PATCH 112/182] nbd: fix setting of 'error' in NBD_DO_IT ioctl

Multiple paths don't set it properly, ensure that we do.

Fixes: 9561a7ade0c2 ("nbd: add multi-connection support")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index e683e2241cbdae..dc722a7adf580a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -745,7 +745,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 	case NBD_DO_IT: {
 		struct recv_thread_args *args;
 		int num_connections = nbd->num_connections;
-		int error, i;
+		int error = 0, i;
 
 		if (nbd->task_recv)
 			return -EBUSY;
@@ -754,14 +754,17 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		if (num_connections > 1 &&
 		    !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
 			dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
+			error = -EINVAL;
 			goto out_err;
 		}
 
 		set_bit(NBD_RUNNING, &nbd->runtime_flags);
 		blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
 		args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
-		if (!args)
+		if (!args) {
+			error = -ENOMEM;
 			goto out_err;
+		}
 		nbd->task_recv = current;
 		mutex_unlock(&nbd->config_lock);
 

From 80e091d10e8bf7b801d634ea8870b9e907314424 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 28 Nov 2016 09:22:47 -0700
Subject: [PATCH 113/182] blk-wbt: allow reset of default latency through sysfs

Allow a write of '-1' to reset the default latency target for
a given device. This removes knowledge of the different default
settings for rotational vs non-rotational from user space.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/queue-sysfs.txt |  4 +++-
 block/blk-sysfs.c                   | 22 +++++++++++++++-------
 block/blk-wbt.c                     | 17 +++++++++++++----
 block/blk-wbt.h                     |  6 ++++++
 4 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 14235e72a70274..51642159aedbbc 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -188,7 +188,9 @@ wb_lat_usec (RW)
 If the device is registered for writeback throttling, then this file shows
 the target minimum read latency. If this latency is exceeded in a given
 window of time (see wb_window_usec), then the writeback throttling will start
-scaling back writes.
+scaling back writes. Writing a value of '0' to this file disables the
+feature. Writing a value of '-1' to this file resets the value to the
+default setting.
 
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1855c6770045c0..f0ca569e276b95 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -42,12 +42,12 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
-static ssize_t queue_var_store64(u64 *var, const char *page)
+static ssize_t queue_var_store64(s64 *var, const char *page)
 {
 	int err;
-	u64 v;
+	s64 v;
 
-	err = kstrtou64(page, 10, &v);
+	err = kstrtos64(page, 10, &v);
 	if (err < 0)
 		return err;
 
@@ -421,18 +421,26 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
 static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
 				  size_t count)
 {
+	struct rq_wb *rwb;
 	ssize_t ret;
-	u64 val;
+	s64 val;
 
-	if (!q->rq_wb)
+	rwb = q->rq_wb;
+	if (!rwb)
 		return -EINVAL;
 
 	ret = queue_var_store64(&val, page);
 	if (ret < 0)
 		return ret;
 
-	q->rq_wb->min_lat_nsec = val * 1000ULL;
-	wbt_update_limits(q->rq_wb);
+	if (val == -1)
+		rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+	else if (val >= 0)
+		rwb->min_lat_nsec = val * 1000ULL;
+	else
+		return -EINVAL;
+
+	wbt_update_limits(rwb);
 	return count;
 }
 
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 9f97594e68ce73..92df2f7c5af1a0 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -675,6 +675,18 @@ void wbt_disable(struct rq_wb *rwb)
 }
 EXPORT_SYMBOL_GPL(wbt_disable);
 
+u64 wbt_default_latency_nsec(struct request_queue *q)
+{
+	/*
+	 * We default to 2msec for non-rotational storage, and 75msec
+	 * for rotational storage.
+	 */
+	if (blk_queue_nonrot(q))
+		return 2000000ULL;
+	else
+		return 75000000ULL;
+}
+
 int wbt_init(struct request_queue *q)
 {
 	struct rq_wb *rwb;
@@ -711,10 +723,7 @@ int wbt_init(struct request_queue *q)
 	q->rq_wb = rwb;
 	blk_stat_enable(q);
 
-	if (blk_queue_nonrot(q))
-		rwb->min_lat_nsec = 2000000ULL;
-	else
-		rwb->min_lat_nsec = 75000000ULL;
+	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
 
 	wbt_set_queue_depth(rwb, blk_queue_depth(q));
 	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 44dc2173dc1f38..9dfc88ad7f305d 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -110,6 +110,8 @@ void wbt_disable(struct rq_wb *);
 void wbt_set_queue_depth(struct rq_wb *, unsigned int);
 void wbt_set_write_cache(struct rq_wb *, bool);
 
+u64 wbt_default_latency_nsec(struct request_queue *);
+
 #else
 
 static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
@@ -148,6 +150,10 @@ static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
 static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc)
 {
 }
+static inline u64 wbt_default_latency_nsec(struct request_queue *q)
+{
+	return 0;
+}
 
 #endif /* CONFIG_BLK_WBT */
 

From fa224eed2b5e0f2f9a57281e9dc733c843d590ad Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 28 Nov 2016 09:25:50 -0700
Subject: [PATCH 114/182] blk-wbt: cleanup disable-by-default for CFQ

Make it clear that we are disabling wbt for the specified queued,
if it was enabled by default. This is in preparation for allowing
users to re-enable wbt, and not have it disabled automatically
again.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-wbt.c     | 10 ++++++++--
 block/blk-wbt.h     |  4 ++--
 block/cfq-iosched.c |  9 +++------
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 92df2f7c5af1a0..7c0e618d6e7d97 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -665,15 +665,21 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
 		rwb->wc = write_cache_on;
 }
 
-void wbt_disable(struct rq_wb *rwb)
+ /*
+ * Disable wbt, if enabled by default. Only called from CFQ, if we have
+ * cgroups enabled
+ */
+void wbt_disable_default(struct request_queue *q)
 {
+	struct rq_wb *rwb = q->rq_wb;
+
 	if (rwb) {
 		del_timer_sync(&rwb->window_timer);
 		rwb->win_nsec = rwb->min_lat_nsec = 0;
 		wbt_update_limits(rwb);
 	}
 }
-EXPORT_SYMBOL_GPL(wbt_disable);
+EXPORT_SYMBOL_GPL(wbt_disable_default);
 
 u64 wbt_default_latency_nsec(struct request_queue *q)
 {
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 9dfc88ad7f305d..8f485f8e1baf7a 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -105,7 +105,7 @@ void wbt_exit(struct request_queue *);
 void wbt_update_limits(struct rq_wb *);
 void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
 void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
-void wbt_disable(struct rq_wb *);
+void wbt_disable_default(struct request_queue *);
 
 void wbt_set_queue_depth(struct rq_wb *, unsigned int);
 void wbt_set_write_cache(struct rq_wb *, bool);
@@ -141,7 +141,7 @@ static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
 static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
 {
 }
-static inline void wbt_disable(struct rq_wb *rwb)
+static inline void wbt_disable_default(struct request_queue *q)
 {
 }
 static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9894dc985e0935..c73a6fcaeb9d52 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3780,13 +3780,10 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	/*
 	 * If we have a non-root cgroup, we can depend on that to
 	 * do proper throttling of writes. Turn off wbt for that
-	 * case.
+	 * case, if it was enabled by default.
 	 */
-	if (nonroot_cg) {
-		struct request_queue *q = cfqd->queue;
-
-		wbt_disable(q->rq_wb);
-	}
+	if (nonroot_cg)
+		wbt_disable_default(cfqd->queue);
 
 	/*
 	 * Drop reference to queues.  New queues will be assigned in new

From d62118b6dd99b8f64350206a6ea6996083b28c9a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 28 Nov 2016 09:40:34 -0700
Subject: [PATCH 115/182] blk-wbt: allow wbt to be enabled always through sysfs

Currently there's no way to enable wbt if it's not enabled in the
kernel config by default for a device. Allow a write to the
'wbt_lat_usec' queue sysfs file to enable wbt.

This is useful for both the kernel config case, but also if the
device is CFQ managed and it was turned off by default.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-sysfs.c | 22 ++++++++++++++++------
 block/blk-wbt.c   |  3 ++-
 block/blk-wbt.h   | 11 +++++++++++
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f0ca569e276b95..a9784149176902 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -425,20 +425,30 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
 	ssize_t ret;
 	s64 val;
 
-	rwb = q->rq_wb;
-	if (!rwb)
-		return -EINVAL;
-
 	ret = queue_var_store64(&val, page);
 	if (ret < 0)
 		return ret;
+	if (val < -1)
+		return -EINVAL;
+
+	rwb = q->rq_wb;
+	if (!rwb) {
+		ret = wbt_init(q);
+		if (ret)
+			return ret;
+
+		rwb = q->rq_wb;
+		if (!rwb)
+			return -EINVAL;
+	}
 
 	if (val == -1)
 		rwb->min_lat_nsec = wbt_default_latency_nsec(q);
 	else if (val >= 0)
 		rwb->min_lat_nsec = val * 1000ULL;
-	else
-		return -EINVAL;
+
+	if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
+		rwb->enable_state = WBT_STATE_ON_MANUAL;
 
 	wbt_update_limits(rwb);
 	return count;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 7c0e618d6e7d97..b8647343141f89 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -673,7 +673,7 @@ void wbt_disable_default(struct request_queue *q)
 {
 	struct rq_wb *rwb = q->rq_wb;
 
-	if (rwb) {
+	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
 		del_timer_sync(&rwb->window_timer);
 		rwb->win_nsec = rwb->min_lat_nsec = 0;
 		wbt_update_limits(rwb);
@@ -721,6 +721,7 @@ int wbt_init(struct request_queue *q)
 	rwb->last_comp = rwb->last_issue = jiffies;
 	rwb->queue = q;
 	rwb->win_nsec = RWB_WINDOW_NSEC;
+	rwb->enable_state = WBT_STATE_ON_DEFAULT;
 	wbt_update_limits(rwb);
 
 	/*
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 8f485f8e1baf7a..65f1de519f67eb 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -21,6 +21,15 @@ enum {
 	WBT_NUM_RWQ		= 2,
 };
 
+/*
+ * Enable states. Either off, or on by default (done at init time),
+ * or on through manual setup in sysfs.
+ */
+enum {
+	WBT_STATE_ON_DEFAULT	= 1,
+	WBT_STATE_ON_MANUAL	= 2,
+};
+
 static inline void wbt_clear_state(struct blk_issue_stat *stat)
 {
 	stat->time &= BLK_STAT_TIME_MASK;
@@ -61,6 +70,8 @@ struct rq_wb {
 	int scale_step;
 	bool scaled_max;
 
+	short enable_state;			/* WBT_STATE_* */
+
 	/*
 	 * Number of consecutive periods where we don't have enough
 	 * information to make a firm scale up/down decision.

From 415d3dab964c1a9bc6ceb8941bd4dbe4fbe36a09 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Date: Mon, 28 Nov 2016 15:01:48 -0200
Subject: [PATCH 116/182] blk-mq: Drop explicit timeout sync in hotplug

After commit 287922eb0b18 ("block: defer timeouts to a workqueue"),
deleting the timeout work after freezing the queue shouldn't be
necessary, since the synchronization is already enforced by the
acquisition of a q_usage_counter reference in blk_mq_timeout_work.

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Reviewed-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9d4a1d630d0b8b..bac12caece0664 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2220,16 +2220,9 @@ static void blk_mq_queue_reinit_work(void)
 	 */
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_freeze_queue_start(q);
-	list_for_each_entry(q, &all_q_list, all_q_node) {
+	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_freeze_queue_wait(q);
 
-		/*
-		 * timeout handler can't touch hw queue during the
-		 * reinitialization
-		 */
-		del_timer_sync(&q->timeout);
-	}
-
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_queue_reinit(q, &cpuhp_online_new);
 

From 7498e99fc51ca60b960ef79061e0e7b521feb07e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Mon, 28 Nov 2016 22:38:52 +0100
Subject: [PATCH 117/182] nvme: lightnvm: frees wrong cmd structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When struct nvme_request was introduced, the nvme_nvm_submit_io was
converted to the new interface. The interface moves nvme_nvm_command
data structure into the struct request pdu. On io completion, rq->cmd is
freed, which should have been the dereferenced pdu nvme_request->cmd.

Fixes: d49187e97e94 "nvme: introduce struct nvme_request"
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/lightnvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 442f67774ea9db..de3dc87226bb60 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -477,7 +477,7 @@ static void nvme_nvm_end_io(struct request *rq, int error)
 	rqd->ppa_status = nvme_req(rq)->result.u64;
 	nvm_end_io(rqd, error);
 
-	kfree(rq->cmd);
+	kfree(nvme_req(rq)->cmd);
 	blk_mq_free_request(rq);
 }
 

From 3dc87dd048dc442bab633e85bfb96c893612d765 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <m@bjorling.me>
Date: Mon, 28 Nov 2016 22:38:53 +0100
Subject: [PATCH 118/182] nvme: lightnvm: attach lightnvm sysfs to nvme block
 device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, LBA read and write were not supported in the lightnvm
specification. Now that it supports it, lets use the traditional
NVMe gendisk, and attach the lightnvm sysfs geometry export.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/Makefile    |   2 +-
 drivers/lightnvm/core.c      |  15 +--
 drivers/lightnvm/lightnvm.h  |  35 -------
 drivers/lightnvm/sysfs.c     | 198 -----------------------------------
 drivers/nvme/host/core.c     |  43 ++++----
 drivers/nvme/host/lightnvm.c | 175 ++++++++++++++++++++++++++++---
 drivers/nvme/host/nvme.h     |  25 ++---
 include/linux/lightnvm.h     |   2 -
 8 files changed, 197 insertions(+), 298 deletions(-)
 delete mode 100644 drivers/lightnvm/lightnvm.h
 delete mode 100644 drivers/lightnvm/sysfs.c

diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index 1f6b6521016aaf..a7a0a22cf1a596 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -2,6 +2,6 @@
 # Makefile for Open-Channel SSDs.
 #
 
-obj-$(CONFIG_NVM)		:= core.o sysblk.o sysfs.o
+obj-$(CONFIG_NVM)		:= core.o sysblk.o
 obj-$(CONFIG_NVM_GENNVM) 	+= gennvm.o
 obj-$(CONFIG_NVM_RRPC)		+= rrpc.o
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 1cac0f8bc0dc85..11117404b5f5f0 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -27,8 +27,6 @@
 #include <linux/lightnvm.h>
 #include <linux/sched/sysctl.h>
 
-#include "lightnvm.h"
-
 static LIST_HEAD(nvm_tgt_types);
 static DECLARE_RWSEM(nvm_tgtt_lock);
 static LIST_HEAD(nvm_mgrs);
@@ -657,11 +655,6 @@ static int nvm_init(struct nvm_dev *dev)
 	return ret;
 }
 
-static void nvm_exit(struct nvm_dev *dev)
-{
-	nvm_sysfs_unregister_dev(dev);
-}
-
 struct nvm_dev *nvm_alloc_dev(int node)
 {
 	return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node);
@@ -691,10 +684,6 @@ int nvm_register(struct nvm_dev *dev)
 		}
 	}
 
-	ret = nvm_sysfs_register_dev(dev);
-	if (ret)
-		goto err_ppalist;
-
 	if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
 		ret = nvm_get_sysblock(dev, &dev->sb);
 		if (!ret)
@@ -711,8 +700,6 @@ int nvm_register(struct nvm_dev *dev)
 	up_write(&nvm_lock);
 
 	return 0;
-err_ppalist:
-	dev->ops->destroy_dma_pool(dev->dma_pool);
 err_init:
 	kfree(dev->lun_map);
 	return ret;
@@ -725,7 +712,7 @@ void nvm_unregister(struct nvm_dev *dev)
 	list_del(&dev->devices);
 	up_write(&nvm_lock);
 
-	nvm_exit(dev);
+	nvm_free(dev);
 }
 EXPORT_SYMBOL(nvm_unregister);
 
diff --git a/drivers/lightnvm/lightnvm.h b/drivers/lightnvm/lightnvm.h
deleted file mode 100644
index 305c181509a63a..00000000000000
--- a/drivers/lightnvm/lightnvm.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2016 CNEX Labs. All rights reserved.
- * Initial release: Matias Bjorling <matias@cnexlabs.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING.  If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- */
-
-#ifndef LIGHTNVM_H
-#define LIGHTNVM_H
-
-#include <linux/lightnvm.h>
-
-/* core -> sysfs.c */
-int __must_check nvm_sysfs_register_dev(struct nvm_dev *);
-void nvm_sysfs_unregister_dev(struct nvm_dev *);
-int nvm_sysfs_register(void);
-void nvm_sysfs_unregister(void);
-
-/* sysfs > core */
-void nvm_free(struct nvm_dev *);
-
-#endif
diff --git a/drivers/lightnvm/sysfs.c b/drivers/lightnvm/sysfs.c
deleted file mode 100644
index 0338c27ab95a92..00000000000000
--- a/drivers/lightnvm/sysfs.c
+++ /dev/null
@@ -1,198 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/lightnvm.h>
-#include <linux/miscdevice.h>
-#include <linux/kobject.h>
-#include <linux/blk-mq.h>
-
-#include "lightnvm.h"
-
-static ssize_t nvm_dev_attr_show(struct device *dev,
-				 struct device_attribute *dattr, char *page)
-{
-	struct nvm_dev *ndev = container_of(dev, struct nvm_dev, dev);
-	struct nvm_id *id = &ndev->identity;
-	struct nvm_id_group *grp = &id->groups[0];
-	struct attribute *attr = &dattr->attr;
-
-	if (strcmp(attr->name, "version") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id);
-	} else if (strcmp(attr->name, "vendor_opcode") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt);
-	} else if (strcmp(attr->name, "capabilities") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", id->cap);
-	} else if (strcmp(attr->name, "device_mode") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", id->dom);
-	} else if (strcmp(attr->name, "media_manager") == 0) {
-		if (!ndev->mt)
-			return scnprintf(page, PAGE_SIZE, "%s\n", "none");
-		return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name);
-	} else if (strcmp(attr->name, "ppa_format") == 0) {
-		return scnprintf(page, PAGE_SIZE,
-			"0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
-			id->ppaf.ch_offset, id->ppaf.ch_len,
-			id->ppaf.lun_offset, id->ppaf.lun_len,
-			id->ppaf.pln_offset, id->ppaf.pln_len,
-			id->ppaf.blk_offset, id->ppaf.blk_len,
-			id->ppaf.pg_offset, id->ppaf.pg_len,
-			id->ppaf.sect_offset, id->ppaf.sect_len);
-	} else if (strcmp(attr->name, "media_type") == 0) {	/* u8 */
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype);
-	} else if (strcmp(attr->name, "flash_media_type") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype);
-	} else if (strcmp(attr->name, "num_channels") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch);
-	} else if (strcmp(attr->name, "num_luns") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun);
-	} else if (strcmp(attr->name, "num_planes") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
-	} else if (strcmp(attr->name, "num_blocks") == 0) {	/* u16 */
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk);
-	} else if (strcmp(attr->name, "num_pages") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
-	} else if (strcmp(attr->name, "page_size") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz);
-	} else if (strcmp(attr->name, "hw_sector_size") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs);
-	} else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos);
-	} else if (strcmp(attr->name, "read_typ") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt);
-	} else if (strcmp(attr->name, "read_max") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm);
-	} else if (strcmp(attr->name, "prog_typ") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt);
-	} else if (strcmp(attr->name, "prog_max") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm);
-	} else if (strcmp(attr->name, "erase_typ") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet);
-	} else if (strcmp(attr->name, "erase_max") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem);
-	} else if (strcmp(attr->name, "multiplane_modes") == 0) {
-		return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos);
-	} else if (strcmp(attr->name, "media_capabilities") == 0) {
-		return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap);
-	} else if (strcmp(attr->name, "max_phys_secs") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n",
-				ndev->ops->max_phys_sect);
-	} else {
-		return scnprintf(page,
-				 PAGE_SIZE,
-				 "Unhandled attr(%s) in `nvm_dev_attr_show`\n",
-				 attr->name);
-	}
-}
-
-#define NVM_DEV_ATTR_RO(_name)						\
-	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL)
-
-static NVM_DEV_ATTR_RO(version);
-static NVM_DEV_ATTR_RO(vendor_opcode);
-static NVM_DEV_ATTR_RO(capabilities);
-static NVM_DEV_ATTR_RO(device_mode);
-static NVM_DEV_ATTR_RO(ppa_format);
-static NVM_DEV_ATTR_RO(media_manager);
-
-static NVM_DEV_ATTR_RO(media_type);
-static NVM_DEV_ATTR_RO(flash_media_type);
-static NVM_DEV_ATTR_RO(num_channels);
-static NVM_DEV_ATTR_RO(num_luns);
-static NVM_DEV_ATTR_RO(num_planes);
-static NVM_DEV_ATTR_RO(num_blocks);
-static NVM_DEV_ATTR_RO(num_pages);
-static NVM_DEV_ATTR_RO(page_size);
-static NVM_DEV_ATTR_RO(hw_sector_size);
-static NVM_DEV_ATTR_RO(oob_sector_size);
-static NVM_DEV_ATTR_RO(read_typ);
-static NVM_DEV_ATTR_RO(read_max);
-static NVM_DEV_ATTR_RO(prog_typ);
-static NVM_DEV_ATTR_RO(prog_max);
-static NVM_DEV_ATTR_RO(erase_typ);
-static NVM_DEV_ATTR_RO(erase_max);
-static NVM_DEV_ATTR_RO(multiplane_modes);
-static NVM_DEV_ATTR_RO(media_capabilities);
-static NVM_DEV_ATTR_RO(max_phys_secs);
-
-#define NVM_DEV_ATTR(_name) (dev_attr_##_name##)
-
-static struct attribute *nvm_dev_attrs[] = {
-	&dev_attr_version.attr,
-	&dev_attr_vendor_opcode.attr,
-	&dev_attr_capabilities.attr,
-	&dev_attr_device_mode.attr,
-	&dev_attr_media_manager.attr,
-
-	&dev_attr_ppa_format.attr,
-	&dev_attr_media_type.attr,
-	&dev_attr_flash_media_type.attr,
-	&dev_attr_num_channels.attr,
-	&dev_attr_num_luns.attr,
-	&dev_attr_num_planes.attr,
-	&dev_attr_num_blocks.attr,
-	&dev_attr_num_pages.attr,
-	&dev_attr_page_size.attr,
-	&dev_attr_hw_sector_size.attr,
-	&dev_attr_oob_sector_size.attr,
-	&dev_attr_read_typ.attr,
-	&dev_attr_read_max.attr,
-	&dev_attr_prog_typ.attr,
-	&dev_attr_prog_max.attr,
-	&dev_attr_erase_typ.attr,
-	&dev_attr_erase_max.attr,
-	&dev_attr_multiplane_modes.attr,
-	&dev_attr_media_capabilities.attr,
-	&dev_attr_max_phys_secs.attr,
-	NULL,
-};
-
-static struct attribute_group nvm_dev_attr_group = {
-	.name = "lightnvm",
-	.attrs = nvm_dev_attrs,
-};
-
-static const struct attribute_group *nvm_dev_attr_groups[] = {
-	&nvm_dev_attr_group,
-	NULL,
-};
-
-static void nvm_dev_release(struct device *device)
-{
-	struct nvm_dev *dev = container_of(device, struct nvm_dev, dev);
-	struct request_queue *q = dev->q;
-
-	pr_debug("nvm/sysfs: `nvm_dev_release`\n");
-
-	blk_mq_unregister_dev(device, q);
-
-	nvm_free(dev);
-}
-
-static struct device_type nvm_type = {
-	.name		= "lightnvm",
-	.groups		= nvm_dev_attr_groups,
-	.release	= nvm_dev_release,
-};
-
-int nvm_sysfs_register_dev(struct nvm_dev *dev)
-{
-	int ret;
-
-	if (!dev->parent_dev)
-		return 0;
-
-	dev->dev.parent = dev->parent_dev;
-	dev_set_name(&dev->dev, "%s", dev->name);
-	dev->dev.type = &nvm_type;
-	device_initialize(&dev->dev);
-	ret = device_add(&dev->dev);
-
-	if (!ret)
-		blk_mq_register_dev(&dev->dev, dev->q);
-
-	return ret;
-}
-
-void nvm_sysfs_unregister_dev(struct nvm_dev *dev)
-{
-	if (dev && dev->parent_dev)
-		kobject_put(&dev->dev.kobj);
-}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e54bb10ead9cf9..8c644838252ea8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1673,27 +1673,24 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (nvme_revalidate_ns(ns, &id))
 		goto out_free_queue;
 
-	if (nvme_nvm_ns_supported(ns, id)) {
-		if (nvme_nvm_register(ns, disk_name, node,
-							&nvme_ns_attr_group)) {
-			dev_warn(ctrl->dev, "%s: LightNVM init failure\n",
-								__func__);
-			goto out_free_id;
-		}
-	} else {
-		disk = alloc_disk_node(0, node);
-		if (!disk)
-			goto out_free_id;
+	if (nvme_nvm_ns_supported(ns, id) &&
+				nvme_nvm_register(ns, disk_name, node)) {
+		dev_warn(ctrl->dev, "%s: LightNVM init failure\n", __func__);
+		goto out_free_id;
+	}
 
-		disk->fops = &nvme_fops;
-		disk->private_data = ns;
-		disk->queue = ns->queue;
-		disk->flags = GENHD_FL_EXT_DEVT;
-		memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
-		ns->disk = disk;
+	disk = alloc_disk_node(0, node);
+	if (!disk)
+		goto out_free_id;
 
-		__nvme_revalidate_disk(disk, id);
-	}
+	disk->fops = &nvme_fops;
+	disk->private_data = ns;
+	disk->queue = ns->queue;
+	disk->flags = GENHD_FL_EXT_DEVT;
+	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
+	ns->disk = disk;
+
+	__nvme_revalidate_disk(disk, id);
 
 	mutex_lock(&ctrl->namespaces_mutex);
 	list_add_tail(&ns->list, &ctrl->namespaces);
@@ -1703,14 +1700,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	kfree(id);
 
-	if (ns->ndev)
-		return;
-
 	device_add_disk(ctrl->device, ns->disk);
 	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_attr_group))
 		pr_warn("%s: failed to create sysfs group for identification\n",
 			ns->disk->disk_name);
+	if (ns->ndev && nvme_nvm_register_sysfs(ns))
+		pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
+			ns->disk->disk_name);
 	return;
  out_free_id:
 	kfree(id);
@@ -1732,6 +1729,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 			blk_integrity_unregister(ns->disk);
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_attr_group);
+		if (ns->ndev)
+			nvme_nvm_unregister_sysfs(ns);
 		del_gendisk(ns->disk);
 		blk_mq_abort_requeue_list(ns->queue);
 		blk_cleanup_queue(ns->queue);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index de3dc87226bb60..f23e6fefd26279 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -575,12 +575,10 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
 	.max_phys_sect		= 64,
 };
 
-int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node,
-		      const struct attribute_group *attrs)
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 {
 	struct request_queue *q = ns->queue;
 	struct nvm_dev *dev;
-	int ret;
 
 	dev = nvm_alloc_dev(node);
 	if (!dev)
@@ -589,18 +587,10 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node,
 	dev->q = q;
 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
 	dev->ops = &nvme_nvm_dev_ops;
-	dev->parent_dev = ns->ctrl->device;
 	dev->private_data = ns;
 	ns->ndev = dev;
 
-	ret = nvm_register(dev);
-
-	ns->lba_shift = ilog2(dev->sec_size) - 9;
-
-	if (sysfs_create_group(&dev->dev.kobj, attrs))
-		pr_warn("%s: failed to create sysfs group for identification\n",
-			disk_name);
-	return ret;
+	return nvm_register(dev);
 }
 
 void nvme_nvm_unregister(struct nvme_ns *ns)
@@ -608,6 +598,167 @@ void nvme_nvm_unregister(struct nvme_ns *ns)
 	nvm_unregister(ns->ndev);
 }
 
+static ssize_t nvm_dev_attr_show(struct device *dev,
+				 struct device_attribute *dattr, char *page)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+	struct nvm_dev *ndev = ns->ndev;
+	struct nvm_id *id;
+	struct nvm_id_group *grp;
+	struct attribute *attr;
+
+	if (!ndev)
+		return 0;
+
+	id = &ndev->identity;
+	grp = &id->groups[0];
+	attr = &dattr->attr;
+
+	if (strcmp(attr->name, "version") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id);
+	} else if (strcmp(attr->name, "vendor_opcode") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt);
+	} else if (strcmp(attr->name, "capabilities") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", id->cap);
+	} else if (strcmp(attr->name, "device_mode") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", id->dom);
+	} else if (strcmp(attr->name, "media_manager") == 0) {
+		if (!ndev->mt)
+			return scnprintf(page, PAGE_SIZE, "%s\n", "none");
+		return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name);
+	} else if (strcmp(attr->name, "ppa_format") == 0) {
+		return scnprintf(page, PAGE_SIZE,
+			"0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+			id->ppaf.ch_offset, id->ppaf.ch_len,
+			id->ppaf.lun_offset, id->ppaf.lun_len,
+			id->ppaf.pln_offset, id->ppaf.pln_len,
+			id->ppaf.blk_offset, id->ppaf.blk_len,
+			id->ppaf.pg_offset, id->ppaf.pg_len,
+			id->ppaf.sect_offset, id->ppaf.sect_len);
+	} else if (strcmp(attr->name, "media_type") == 0) {	/* u8 */
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype);
+	} else if (strcmp(attr->name, "flash_media_type") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype);
+	} else if (strcmp(attr->name, "num_channels") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch);
+	} else if (strcmp(attr->name, "num_luns") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun);
+	} else if (strcmp(attr->name, "num_planes") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
+	} else if (strcmp(attr->name, "num_blocks") == 0) {	/* u16 */
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk);
+	} else if (strcmp(attr->name, "num_pages") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
+	} else if (strcmp(attr->name, "page_size") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz);
+	} else if (strcmp(attr->name, "hw_sector_size") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs);
+	} else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos);
+	} else if (strcmp(attr->name, "read_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt);
+	} else if (strcmp(attr->name, "read_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm);
+	} else if (strcmp(attr->name, "prog_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt);
+	} else if (strcmp(attr->name, "prog_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm);
+	} else if (strcmp(attr->name, "erase_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet);
+	} else if (strcmp(attr->name, "erase_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem);
+	} else if (strcmp(attr->name, "multiplane_modes") == 0) {
+		return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos);
+	} else if (strcmp(attr->name, "media_capabilities") == 0) {
+		return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap);
+	} else if (strcmp(attr->name, "max_phys_secs") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n",
+				ndev->ops->max_phys_sect);
+	} else {
+		return scnprintf(page,
+				 PAGE_SIZE,
+				 "Unhandled attr(%s) in `nvm_dev_attr_show`\n",
+				 attr->name);
+	}
+}
+
+#define NVM_DEV_ATTR_RO(_name)						\
+	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL)
+
+static NVM_DEV_ATTR_RO(version);
+static NVM_DEV_ATTR_RO(vendor_opcode);
+static NVM_DEV_ATTR_RO(capabilities);
+static NVM_DEV_ATTR_RO(device_mode);
+static NVM_DEV_ATTR_RO(ppa_format);
+static NVM_DEV_ATTR_RO(media_manager);
+
+static NVM_DEV_ATTR_RO(media_type);
+static NVM_DEV_ATTR_RO(flash_media_type);
+static NVM_DEV_ATTR_RO(num_channels);
+static NVM_DEV_ATTR_RO(num_luns);
+static NVM_DEV_ATTR_RO(num_planes);
+static NVM_DEV_ATTR_RO(num_blocks);
+static NVM_DEV_ATTR_RO(num_pages);
+static NVM_DEV_ATTR_RO(page_size);
+static NVM_DEV_ATTR_RO(hw_sector_size);
+static NVM_DEV_ATTR_RO(oob_sector_size);
+static NVM_DEV_ATTR_RO(read_typ);
+static NVM_DEV_ATTR_RO(read_max);
+static NVM_DEV_ATTR_RO(prog_typ);
+static NVM_DEV_ATTR_RO(prog_max);
+static NVM_DEV_ATTR_RO(erase_typ);
+static NVM_DEV_ATTR_RO(erase_max);
+static NVM_DEV_ATTR_RO(multiplane_modes);
+static NVM_DEV_ATTR_RO(media_capabilities);
+static NVM_DEV_ATTR_RO(max_phys_secs);
+
+static struct attribute *nvm_dev_attrs[] = {
+	&dev_attr_version.attr,
+	&dev_attr_vendor_opcode.attr,
+	&dev_attr_capabilities.attr,
+	&dev_attr_device_mode.attr,
+	&dev_attr_media_manager.attr,
+
+	&dev_attr_ppa_format.attr,
+	&dev_attr_media_type.attr,
+	&dev_attr_flash_media_type.attr,
+	&dev_attr_num_channels.attr,
+	&dev_attr_num_luns.attr,
+	&dev_attr_num_planes.attr,
+	&dev_attr_num_blocks.attr,
+	&dev_attr_num_pages.attr,
+	&dev_attr_page_size.attr,
+	&dev_attr_hw_sector_size.attr,
+	&dev_attr_oob_sector_size.attr,
+	&dev_attr_read_typ.attr,
+	&dev_attr_read_max.attr,
+	&dev_attr_prog_typ.attr,
+	&dev_attr_prog_max.attr,
+	&dev_attr_erase_typ.attr,
+	&dev_attr_erase_max.attr,
+	&dev_attr_multiplane_modes.attr,
+	&dev_attr_media_capabilities.attr,
+	&dev_attr_max_phys_secs.attr,
+	NULL,
+};
+
+static const struct attribute_group nvm_dev_attr_group = {
+	.name		= "lightnvm",
+	.attrs		= nvm_dev_attrs,
+};
+
+int nvme_nvm_register_sysfs(struct nvme_ns *ns)
+{
+	return sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
+					&nvm_dev_attr_group);
+}
+
+void nvme_nvm_unregister_sysfs(struct nvme_ns *ns)
+{
+	sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
+					&nvm_dev_attr_group);
+}
+
 /* move to shared place when used in multiple places. */
 #define PCI_VENDOR_ID_CNEX 0x1d1d
 #define PCI_DEVICE_ID_CNEX_WL 0x2807
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 468fc445bf3512..a3d6ffd874afb0 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -321,36 +321,33 @@ int nvme_sg_get_version_num(int __user *ip);
 
 #ifdef CONFIG_NVM
 int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id);
-int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node,
-		      const struct attribute_group *attrs);
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
 void nvme_nvm_unregister(struct nvme_ns *ns);
-
-static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
-{
-	if (dev->type->devnode)
-		return dev_to_disk(dev)->private_data;
-
-	return (container_of(dev, struct nvm_dev, dev))->private_data;
-}
+int nvme_nvm_register_sysfs(struct nvme_ns *ns);
+void nvme_nvm_unregister_sysfs(struct nvme_ns *ns);
 #else
 static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
-				    int node,
-				    const struct attribute_group *attrs)
+				    int node)
 {
 	return 0;
 }
 
 static inline void nvme_nvm_unregister(struct nvme_ns *ns) {};
-
+static inline int nvme_nvm_register_sysfs(struct nvme_ns *ns)
+{
+	return 0;
+}
+static inline void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) {};
 static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
 {
 	return 0;
 }
+#endif /* CONFIG_NVM */
+
 static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
 {
 	return dev_to_disk(dev)->private_data;
 }
-#endif /* CONFIG_NVM */
 
 int __init nvme_core_init(void);
 void nvme_core_exit(void);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index d190786e4ad812..fb2e601d674e77 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -352,8 +352,6 @@ struct nvm_dev {
 
 	/* Backend device */
 	struct request_queue *q;
-	struct device dev;
-	struct device *parent_dev;
 	char name[DISK_NAME_LEN];
 	void *private_data;
 

From bb3149792e0ed52cf5f457dda4c9bf9c5bda1542 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:38:54 +0100
Subject: [PATCH 119/182] lightnvm: enable to send hint to erase command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Erases might be subject to host hints. An example is multi-plane
programming to erase blocks in parallel. Enable targets to specify this
hint.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c      | 9 ++++++---
 drivers/lightnvm/gennvm.c    | 5 ++---
 drivers/lightnvm/rrpc.c      | 2 +-
 drivers/lightnvm/sysblk.c    | 4 ++--
 drivers/nvme/host/lightnvm.c | 1 +
 include/linux/lightnvm.h     | 7 +++----
 6 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 11117404b5f5f0..8664fe09cc82c9 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -202,9 +202,9 @@ int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_submit_io);
 
-int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk)
+int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, int flags)
 {
-	return dev->mt->erase_blk(dev, blk, 0);
+	return dev->mt->erase_blk(dev, blk, flags);
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
@@ -285,7 +285,8 @@ void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_free_rqd_ppalist);
 
-int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas)
+int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
+								int flags)
 {
 	struct nvm_rq rqd;
 	int ret;
@@ -301,6 +302,8 @@ int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas)
 
 	nvm_generic_to_addr_mode(dev, &rqd);
 
+	rqd.flags = flags;
+
 	ret = dev->ops->erase_block(dev, &rqd);
 
 	nvm_free_rqd_ppalist(dev, &rqd);
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index b74174c6d021b0..730d7361f87018 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -593,12 +593,11 @@ static int gen_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 	return dev->ops->submit_io(dev, rqd);
 }
 
-static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk,
-							unsigned long flags)
+static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, int flags)
 {
 	struct ppa_addr addr = block_to_ppa(dev, blk);
 
-	return nvm_erase_ppa(dev, &addr, 1);
+	return nvm_erase_ppa(dev, &addr, 1, flags);
 }
 
 static int gen_reserve_lun(struct nvm_dev *dev, int lunid)
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 37fcaadbf80ca0..067e890ae2bf65 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -404,7 +404,7 @@ static void rrpc_block_gc(struct work_struct *work)
 	if (rrpc_move_valid_pages(rrpc, rblk))
 		goto put_back;
 
-	if (nvm_erase_blk(dev, rblk->parent))
+	if (nvm_erase_blk(dev, rblk->parent, 0))
 		goto put_back;
 
 	rrpc_put_blk(rrpc, rblk);
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index a75bd28aaca3a6..d229067574159a 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -379,7 +379,7 @@ static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s)
 		ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)];
 		ppa->g.pg = ppa_to_slc(dev, 0);
 
-		ret = nvm_erase_ppa(dev, ppa, 1);
+		ret = nvm_erase_ppa(dev, ppa, 1, 0);
 		if (ret)
 			return ret;
 
@@ -725,7 +725,7 @@ int nvm_dev_factory(struct nvm_dev *dev, int flags)
 	/* continue to erase until list of blks until empty */
 	while ((ppa_cnt =
 			nvm_fact_get_blks(dev, ppas, max_ppas, blk_bitmap)) > 0)
-		nvm_erase_ppa(dev, ppas, ppa_cnt);
+		nvm_erase_ppa(dev, ppas, ppa_cnt, 0);
 
 	/* mark host reserved blocks free */
 	if (flags & NVM_FACTORY_RESET_HOST_BLKS) {
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index f23e6fefd26279..037dff5951d433 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -526,6 +526,7 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
 	c.erase.nsid = cpu_to_le32(ns->ns_id);
 	c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa);
 	c.erase.length = cpu_to_le16(rqd->nr_ppas - 1);
+	c.erase.control = cpu_to_le16(rqd->flags);
 
 	return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0);
 }
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index fb2e601d674e77..d87be02edc39c1 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -470,8 +470,7 @@ typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *,
-								unsigned long);
+typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int);
 typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
 typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int);
 typedef int (nvmm_reserve_lun)(struct nvm_dev *, int);
@@ -537,8 +536,8 @@ extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
 					const struct ppa_addr *, int, int);
 extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
-extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int);
-extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *);
+extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int);
+extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *, int);
 extern void nvm_end_io(struct nvm_rq *, int);
 extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
 								void *, int);

From 8a3c95ab385fb98621455807ae52b4454192f8c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:38:55 +0100
Subject: [PATCH 120/182] lightnvm: do not protect block 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Device blocks should be marked by the device and considered as bad
blocks by the media manager. Thus, do not make assumptions on which
blocks are going to be used by the device. In doing so we might lose
valid blocks from the free list.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 730d7361f87018..a7e17faea2cd6e 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -371,12 +371,6 @@ static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn)
 			block->lun = &lun->vlun;
 			block->id = cur_block_id++;
 
-			/* First block is reserved for device */
-			if (unlikely(lun_iter == 0 && blk_iter == 0)) {
-				lun->vlun.nr_free_blocks--;
-				continue;
-			}
-
 			list_add_tail(&block->list, &lun->free_list);
 		}
 

From a24ba4644b7ae5af3cd2eb6992c237cb4548c45e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:38:56 +0100
Subject: [PATCH 121/182] lightnvm: export set bad block table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bad blocks should be managed by block owners. This would be either
targets for data blocks or sysblk for system blocks.

In order to support this, export two functions: One to mark a block as
an specific type (e.g., bad block) and another to update the bad block
table on the device.

Move bad block management to rrpc.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c   | 27 +++++++++++++++++++++++++++
 drivers/lightnvm/gennvm.c | 24 ------------------------
 drivers/lightnvm/rrpc.c   | 34 +++++++++++++++++++++++++++++++++-
 drivers/lightnvm/sysblk.c | 29 +++++------------------------
 include/linux/lightnvm.h  | 13 ++++++++++++-
 5 files changed, 77 insertions(+), 50 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 8664fe09cc82c9..6527cf6862fa09 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -196,6 +196,33 @@ void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 }
 EXPORT_SYMBOL(nvm_mark_blk);
 
+int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
+								int type)
+{
+	struct nvm_rq rqd;
+	int ret;
+
+	if (nr_ppas > dev->ops->max_phys_sect) {
+		pr_err("nvm: unable to update all sysblocks atomically\n");
+		return -EINVAL;
+	}
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+
+	nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
+	nvm_generic_to_addr_mode(dev, &rqd);
+
+	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
+	nvm_free_rqd_ppalist(dev, &rqd);
+	if (ret) {
+		pr_err("nvm: sysblk failed bb mark\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(nvm_set_bb_tbl);
+
 int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 {
 	return dev->mt->submit_io(dev, rqd);
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index a7e17faea2cd6e..e969e3a801c47f 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -543,34 +543,10 @@ static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 	blk->state = type;
 }
 
-/*
- * mark block bad in gen. It is expected that the target recovers separately
- */
-static void gen_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
-	int bit = -1;
-	int max_secs = dev->ops->max_phys_sect;
-	void *comp_bits = &rqd->ppa_status;
-
-	nvm_addr_to_generic_mode(dev, rqd);
-
-	/* look up blocks and mark them as bad */
-	if (rqd->nr_ppas == 1) {
-		gen_mark_blk(dev, rqd->ppa_addr, NVM_BLK_ST_BAD);
-		return;
-	}
-
-	while ((bit = find_next_bit(comp_bits, max_secs, bit + 1)) < max_secs)
-		gen_mark_blk(dev, rqd->ppa_list[bit], NVM_BLK_ST_BAD);
-}
-
 static void gen_end_io(struct nvm_rq *rqd)
 {
 	struct nvm_tgt_instance *ins = rqd->ins;
 
-	if (rqd->error == NVM_RSP_ERR_FAILWRITE)
-		gen_mark_blk_bad(rqd->dev, rqd);
-
 	ins->tt->end_io(rqd);
 }
 
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 067e890ae2bf65..2b71b7e59dac5a 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -675,6 +675,34 @@ static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
 	queue_work(rrpc->kgc_wq, &gcb->ws_gc);
 }
 
+static void __rrpc_mark_bad_block(struct nvm_dev *dev, struct ppa_addr *ppa)
+{
+		nvm_mark_blk(dev, *ppa, NVM_BLK_ST_BAD);
+		nvm_set_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD);
+}
+
+static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
+{
+	struct nvm_dev *dev = rrpc->dev;
+	void *comp_bits = &rqd->ppa_status;
+	struct ppa_addr ppa, prev_ppa;
+	int nr_ppas = rqd->nr_ppas;
+	int bit;
+
+	if (rqd->nr_ppas == 1)
+		__rrpc_mark_bad_block(dev, &rqd->ppa_addr);
+
+	ppa_set_empty(&prev_ppa);
+	bit = -1;
+	while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
+		ppa = rqd->ppa_list[bit];
+		if (ppa_cmp_blk(ppa, prev_ppa))
+			continue;
+
+		__rrpc_mark_bad_block(dev, &ppa);
+	}
+}
+
 static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
 						sector_t laddr, uint8_t npages)
 {
@@ -701,8 +729,12 @@ static void rrpc_end_io(struct nvm_rq *rqd)
 	uint8_t npages = rqd->nr_ppas;
 	sector_t laddr = rrpc_get_laddr(rqd->bio) - npages;
 
-	if (bio_data_dir(rqd->bio) == WRITE)
+	if (bio_data_dir(rqd->bio) == WRITE) {
+		if (rqd->error == NVM_RSP_ERR_FAILWRITE)
+			rrpc_mark_bad_block(rrpc, rqd);
+
 		rrpc_end_io_write(rrpc, rrqd, laddr, npages);
+	}
 
 	bio_put(rqd->bio);
 
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index d229067574159a..fa644afb25de91 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -267,29 +267,10 @@ static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
 	return found;
 }
 
-static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type)
+static int nvm_sysblk_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s,
+								int type)
 {
-	struct nvm_rq rqd;
-	int ret;
-
-	if (s->nr_ppas > dev->ops->max_phys_sect) {
-		pr_err("nvm: unable to update all sysblocks atomically\n");
-		return -EINVAL;
-	}
-
-	memset(&rqd, 0, sizeof(struct nvm_rq));
-
-	nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas, 1);
-	nvm_generic_to_addr_mode(dev, &rqd);
-
-	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
-	nvm_free_rqd_ppalist(dev, &rqd);
-	if (ret) {
-		pr_err("nvm: sysblk failed bb mark\n");
-		return -EINVAL;
-	}
-
-	return 0;
+	return nvm_set_bb_tbl(dev, s->ppas, s->nr_ppas, type);
 }
 
 static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
@@ -573,7 +554,7 @@ int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
 	if (ret)
 		goto err_mark;
 
-	ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_HOST);
+	ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_HOST);
 	if (ret)
 		goto err_mark;
 
@@ -733,7 +714,7 @@ int nvm_dev_factory(struct nvm_dev *dev, int flags)
 		mutex_lock(&dev->mlock);
 		ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, 0);
 		if (!ret)
-			ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_FREE);
+			ret = nvm_sysblk_set_bb_tbl(dev, &s, NVM_BLK_T_FREE);
 		mutex_unlock(&dev->mlock);
 	}
 err_ppas:
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index d87be02edc39c1..4480d1c6a1a5c3 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -423,6 +423,15 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev,
 	return ppa;
 }
 
+static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
+{
+	if (ppa_empty(ppa1) || ppa_empty(ppa2))
+		return 0;
+
+	return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) &&
+					(ppa1.g.blk == ppa2.g.blk));
+}
+
 static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg)
 {
 	return dev->lptbl[slc_pg];
@@ -528,7 +537,9 @@ extern struct nvm_dev *nvm_alloc_dev(int);
 extern int nvm_register(struct nvm_dev *);
 extern void nvm_unregister(struct nvm_dev *);
 
-void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type);
+extern void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type);
+extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas,
+							int nr_ppas, int type);
 
 extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *);
 extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);

From 402ab9a89d7b5bab08a5534027b39d80085ec19b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:38:57 +0100
Subject: [PATCH 122/182] lightnvm: add ECC error codes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add ECC error codes to enable the appropriate handling in the target.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/lightnvm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 4480d1c6a1a5c3..6b26a3289bced7 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -107,6 +107,8 @@ enum {
 	NVM_RSP_NOT_CHANGEABLE	= 0x1,
 	NVM_RSP_ERR_FAILWRITE	= 0x40ff,
 	NVM_RSP_ERR_EMPTYPAGE	= 0x42ff,
+	NVM_RSP_ERR_FAILECC	= 0x4281,
+	NVM_RSP_WARN_HIGHECC	= 0x4700,
 
 	/* Device opcodes */
 	NVM_OP_HBREAD		= 0x02,

From f0b01b6a610f99fb7683d0f5259bb65649b0fd78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:38:58 +0100
Subject: [PATCH 123/182] lightnvm: rrpc: split bios of size > 256kb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rrpc cannot handle bios of size > 256kb due to NVMe using a 64 bit
bitmap to signal I/O completion. If a larger bio comes, split it
explicitly.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/rrpc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 2b71b7e59dac5a..60ca9d4bd6e9cd 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -943,6 +943,8 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
 	struct nvm_rq *rqd;
 	int err;
 
+	blk_queue_split(q, &bio, q->bio_split);
+
 	if (bio_op(bio) == REQ_OP_DISCARD) {
 		rrpc_discard(rrpc, bio);
 		return BLK_QC_T_NONE;

From 17b25cfc873e6d7e2283cf8d77f2af93192484a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:38:59 +0100
Subject: [PATCH 124/182] lightnvm: remove sysfs configuration interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LightNVM used to be managed and configured through sysfs. Since the
introduction of management ioctls this interface is redundant and
outdated. Get rid of it.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c | 134 ----------------------------------------
 1 file changed, 134 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 6527cf6862fa09..d4433d3b485561 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -780,140 +780,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
 	return dev->mt->create_tgt(dev, create);
 }
 
-#ifdef CONFIG_NVM_DEBUG
-static int nvm_configure_show(const char *val)
-{
-	struct nvm_dev *dev;
-	char opcode, devname[DISK_NAME_LEN];
-	int ret;
-
-	ret = sscanf(val, "%c %32s", &opcode, devname);
-	if (ret != 2) {
-		pr_err("nvm: invalid command. Use \"opcode devicename\".\n");
-		return -EINVAL;
-	}
-
-	down_write(&nvm_lock);
-	dev = nvm_find_nvm_dev(devname);
-	up_write(&nvm_lock);
-	if (!dev) {
-		pr_err("nvm: device not found\n");
-		return -EINVAL;
-	}
-
-	if (!dev->mt)
-		return 0;
-
-	dev->mt->lun_info_print(dev);
-
-	return 0;
-}
-
-static int nvm_configure_remove(const char *val)
-{
-	struct nvm_ioctl_remove remove;
-	struct nvm_dev *dev;
-	char opcode;
-	int ret = 0;
-
-	ret = sscanf(val, "%c %256s", &opcode, remove.tgtname);
-	if (ret != 2) {
-		pr_err("nvm: invalid command. Use \"d targetname\".\n");
-		return -EINVAL;
-	}
-
-	remove.flags = 0;
-
-	list_for_each_entry(dev, &nvm_devices, devices) {
-		ret = dev->mt->remove_tgt(dev, &remove);
-		if (!ret)
-			break;
-	}
-
-	return ret;
-}
-
-static int nvm_configure_create(const char *val)
-{
-	struct nvm_ioctl_create create;
-	char opcode;
-	int lun_begin, lun_end, ret;
-
-	ret = sscanf(val, "%c %256s %256s %48s %u:%u", &opcode, create.dev,
-						create.tgtname, create.tgttype,
-						&lun_begin, &lun_end);
-	if (ret != 6) {
-		pr_err("nvm: invalid command. Use \"opcode device name tgttype lun_begin:lun_end\".\n");
-		return -EINVAL;
-	}
-
-	create.flags = 0;
-	create.conf.type = NVM_CONFIG_TYPE_SIMPLE;
-	create.conf.s.lun_begin = lun_begin;
-	create.conf.s.lun_end = lun_end;
-
-	return __nvm_configure_create(&create);
-}
-
-
-/* Exposes administrative interface through /sys/module/lnvm/configure_by_str */
-static int nvm_configure_by_str_event(const char *val,
-					const struct kernel_param *kp)
-{
-	char opcode;
-	int ret;
-
-	ret = sscanf(val, "%c", &opcode);
-	if (ret != 1) {
-		pr_err("nvm: string must have the format of \"cmd ...\"\n");
-		return -EINVAL;
-	}
-
-	switch (opcode) {
-	case 'a':
-		return nvm_configure_create(val);
-	case 'd':
-		return nvm_configure_remove(val);
-	case 's':
-		return nvm_configure_show(val);
-	default:
-		pr_err("nvm: invalid command\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int nvm_configure_get(char *buf, const struct kernel_param *kp)
-{
-	int sz;
-	struct nvm_dev *dev;
-
-	sz = sprintf(buf, "available devices:\n");
-	down_write(&nvm_lock);
-	list_for_each_entry(dev, &nvm_devices, devices) {
-		if (sz > 4095 - DISK_NAME_LEN - 2)
-			break;
-		sz += sprintf(buf + sz, " %32s\n", dev->name);
-	}
-	up_write(&nvm_lock);
-
-	return sz;
-}
-
-static const struct kernel_param_ops nvm_configure_by_str_event_param_ops = {
-	.set	= nvm_configure_by_str_event,
-	.get	= nvm_configure_get,
-};
-
-#undef MODULE_PARAM_PREFIX
-#define MODULE_PARAM_PREFIX	"lnvm."
-
-module_param_cb(configure_debug, &nvm_configure_by_str_event_param_ops, NULL,
-									0644);
-
-#endif /* CONFIG_NVM_DEBUG */
-
 static long nvm_ioctl_info(struct file *file, void __user *arg)
 {
 	struct nvm_ioctl_info *info;

From 7e4f64a9b3004ce592f21653c3b7781628862232 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:00 +0100
Subject: [PATCH 125/182] lightnvm: cleanup unused target operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cleanup definition leftovers from old gennvm interface

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/lightnvm.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 6b26a3289bced7..e598308882aa51 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -477,9 +477,6 @@ typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
 typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *,
 					      struct nvm_lun *, unsigned long);
 typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *);
-typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *);
-typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *);
-typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int);
 typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
@@ -504,9 +501,6 @@ struct nvmm_type {
 	/* Block administration callbacks */
 	nvmm_get_blk_fn *get_blk;
 	nvmm_put_blk_fn *put_blk;
-	nvmm_open_blk_fn *open_blk;
-	nvmm_close_blk_fn *close_blk;
-	nvmm_flush_blk_fn *flush_blk;
 
 	nvmm_submit_io_fn *submit_io;
 	nvmm_erase_blk_fn *erase_blk;

From 0e5c3246dbb96b6870634e7d51b2490f05c976cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:01 +0100
Subject: [PATCH 126/182] lightnvm: make address conversion functions global
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Targets are assumed to used the same generic ppa format, where the
address is partitioned on ch:lun:block:pg:pl:sec. Thus, make the
function in charge of transforming the ppa address from a linear format
to the generic one available to all targets.

This function will be needed by the media manager in order to do target
mapping translations when targets are divided on different physical
partitions.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/rrpc.c  | 30 ------------------------------
 include/linux/lightnvm.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 60ca9d4bd6e9cd..9253acc8071414 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -136,36 +136,6 @@ static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
 	return blk->id * rrpc->dev->sec_per_blk;
 }
 
-static struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev,
-							struct ppa_addr r)
-{
-	struct ppa_addr l;
-	int secs, pgs, blks, luns;
-	sector_t ppa = r.ppa;
-
-	l.ppa = 0;
-
-	div_u64_rem(ppa, dev->sec_per_pg, &secs);
-	l.g.sec = secs;
-
-	sector_div(ppa, dev->sec_per_pg);
-	div_u64_rem(ppa, dev->pgs_per_blk, &pgs);
-	l.g.pg = pgs;
-
-	sector_div(ppa, dev->pgs_per_blk);
-	div_u64_rem(ppa, dev->blks_per_lun, &blks);
-	l.g.blk = blks;
-
-	sector_div(ppa, dev->blks_per_lun);
-	div_u64_rem(ppa, dev->luns_per_chnl, &luns);
-	l.g.lun = luns;
-
-	sector_div(ppa, dev->luns_per_chnl);
-	l.g.ch = ppa;
-
-	return l;
-}
-
 static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr)
 {
 	struct ppa_addr paddr;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index e598308882aa51..98278a9fcb1fb8 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -361,6 +361,36 @@ struct nvm_dev {
 	spinlock_t lock;
 };
 
+static inline struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev,
+							struct ppa_addr r)
+{
+	struct ppa_addr l;
+	int secs, pgs, blks, luns;
+	sector_t ppa = r.ppa;
+
+	l.ppa = 0;
+
+	div_u64_rem(ppa, dev->sec_per_pg, &secs);
+	l.g.sec = secs;
+
+	sector_div(ppa, dev->sec_per_pg);
+	div_u64_rem(ppa, dev->pgs_per_blk, &pgs);
+	l.g.pg = pgs;
+
+	sector_div(ppa, dev->pgs_per_blk);
+	div_u64_rem(ppa, dev->blks_per_lun, &blks);
+	l.g.blk = blks;
+
+	sector_div(ppa, dev->blks_per_lun);
+	div_u64_rem(ppa, dev->luns_per_chnl, &luns);
+	l.g.lun = luns;
+
+	sector_div(ppa, dev->luns_per_chnl);
+	l.g.ch = ppa;
+
+	return l;
+}
+
 static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
 						struct ppa_addr r)
 {

From eb00352b5213e52419ac7dd8bbd84a1300fe4b5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:02 +0100
Subject: [PATCH 127/182] lightnvm: remove unnecessary variables in rrpc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before vectored I/Os were supported on rrpc, the physical address was
stored as part of the nvm_rqd request. This variable become obsolete
when the ppa_list was introduced. Cleanup this variable.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/rrpc.c | 6 ------
 drivers/lightnvm/rrpc.h | 3 ---
 2 files changed, 9 deletions(-)

diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 9253acc8071414..34d2ebf90ed6ae 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -758,7 +758,6 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
 							unsigned long flags)
 {
-	struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
 	int is_gc = flags & NVM_IOTYPE_GC;
 	sector_t laddr = rrpc_get_laddr(bio);
 	struct rrpc_addr *gp;
@@ -778,7 +777,6 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
 	}
 
 	rqd->opcode = NVM_OP_HBREAD;
-	rrqd->addr = gp;
 
 	return NVM_IO_OK;
 }
@@ -821,7 +819,6 @@ static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
 				struct nvm_rq *rqd, unsigned long flags)
 {
-	struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
 	struct rrpc_addr *p;
 	int is_gc = flags & NVM_IOTYPE_GC;
 	sector_t laddr = rrpc_get_laddr(bio);
@@ -839,7 +836,6 @@ static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
 
 	rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, p->addr);
 	rqd->opcode = NVM_OP_HBWRITE;
-	rrqd->addr = p;
 
 	return NVM_IO_OK;
 }
@@ -1389,7 +1385,6 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
 	INIT_WORK(&rrpc->ws_requeue, rrpc_requeue);
 
 	rrpc->nr_luns = lun_end - lun_begin + 1;
-	rrpc->total_blocks = (unsigned long)dev->blks_per_lun * rrpc->nr_luns;
 	rrpc->nr_sects = (unsigned long long)dev->sec_per_lun * rrpc->nr_luns;
 
 	/* simple round-robin strategy */
@@ -1409,7 +1404,6 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
 	}
 
 	rrpc->poffset = dev->sec_per_lun * lun_begin;
-	rrpc->lun_offset = lun_begin;
 
 	ret = rrpc_core_init(rrpc);
 	if (ret) {
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index 5e87d52cb98396..bed49baa784101 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -48,7 +48,6 @@ struct rrpc_inflight_rq {
 
 struct rrpc_rq {
 	struct rrpc_inflight_rq inflight_rq;
-	struct rrpc_addr *addr;
 	unsigned long flags;
 };
 
@@ -92,14 +91,12 @@ struct rrpc {
 
 	sector_t soffset; /* logical sector offset */
 	u64 poffset; /* physical page offset */
-	int lun_offset;
 
 	int nr_luns;
 	struct rrpc_lun *luns;
 
 	/* calculated values */
 	unsigned long long nr_sects;
-	unsigned long total_blocks;
 
 	/* Write strategy variables. Move these into each for structure for each
 	 * strategy

From 98379a12c54974ee5856dcf81781a5dc845505c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:03 +0100
Subject: [PATCH 128/182] lightnvm: use constant name instead of value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a constant to refer to free blocks. Use it when marking bad
blocks instead of using a constant value

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index e969e3a801c47f..aee5b722ba63bd 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -279,7 +279,7 @@ static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa,
 	lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
 
 	for (i = 0; i < nr_blks; i++) {
-		if (blks[i] == 0)
+		if (blks[i] == NVM_BLK_T_FREE)
 			continue;
 
 		blk = &lun->vlun.blocks[i];

From de93434fcf74d41754a48e45365a5914e00bc0be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:04 +0100
Subject: [PATCH 129/182] lightnvm: remove gen_lun abstraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gen_lun abstraction in the generic media manager was conceived on
the assumption that a single target would instantiated on top of it.
This has complicated target design to implement multi-instances. Remove
this abstraction and move its logic to nvm_lun, which manages physical
lun geometry and operations.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c | 85 +++++++++++++++++++--------------------
 drivers/lightnvm/gennvm.h | 16 +-------
 include/linux/lightnvm.h  | 10 +++++
 3 files changed, 53 insertions(+), 58 deletions(-)

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index aee5b722ba63bd..3572ebbb50cecc 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -223,13 +223,13 @@ static void gen_put_area(struct nvm_dev *dev, sector_t begin)
 static void gen_blocks_free(struct nvm_dev *dev)
 {
 	struct gen_dev *gn = dev->mp;
-	struct gen_lun *lun;
+	struct nvm_lun *lun;
 	int i;
 
 	gen_for_each_lun(gn, lun, i) {
-		if (!lun->vlun.blocks)
+		if (!lun->blocks)
 			break;
-		vfree(lun->vlun.blocks);
+		vfree(lun->blocks);
 	}
 }
 
@@ -242,24 +242,24 @@ static void gen_luns_free(struct nvm_dev *dev)
 
 static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn)
 {
-	struct gen_lun *lun;
+	struct nvm_lun *lun;
 	int i;
 
-	gn->luns = kcalloc(dev->nr_luns, sizeof(struct gen_lun), GFP_KERNEL);
+	gn->luns = kcalloc(dev->nr_luns, sizeof(struct nvm_lun), GFP_KERNEL);
 	if (!gn->luns)
 		return -ENOMEM;
 
 	gen_for_each_lun(gn, lun, i) {
-		spin_lock_init(&lun->vlun.lock);
 		INIT_LIST_HEAD(&lun->free_list);
 		INIT_LIST_HEAD(&lun->used_list);
 		INIT_LIST_HEAD(&lun->bb_list);
 
-		lun->reserved_blocks = 2; /* for GC only */
-		lun->vlun.id = i;
-		lun->vlun.lun_id = i % dev->luns_per_chnl;
-		lun->vlun.chnl_id = i / dev->luns_per_chnl;
-		lun->vlun.nr_free_blocks = dev->blks_per_lun;
+		spin_lock_init(&lun->lock);
+
+		lun->id = i;
+		lun->lun_id = i % dev->luns_per_chnl;
+		lun->chnl_id = i / dev->luns_per_chnl;
+		lun->nr_free_blocks = dev->blks_per_lun;
 	}
 	return 0;
 }
@@ -268,7 +268,7 @@ static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa,
 							u8 *blks, int nr_blks)
 {
 	struct nvm_dev *dev = gn->dev;
-	struct gen_lun *lun;
+	struct nvm_lun *lun;
 	struct nvm_block *blk;
 	int i;
 
@@ -282,9 +282,10 @@ static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa,
 		if (blks[i] == NVM_BLK_T_FREE)
 			continue;
 
-		blk = &lun->vlun.blocks[i];
+		blk = &lun->blocks[i];
 		list_move_tail(&blk->list, &lun->bb_list);
-		lun->vlun.nr_free_blocks--;
+		blk->state = NVM_BLK_ST_BAD;
+		lun->nr_free_blocks--;
 	}
 
 	return 0;
@@ -295,7 +296,7 @@ static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 	struct nvm_dev *dev = private;
 	struct gen_dev *gn = dev->mp;
 	u64 elba = slba + nlb;
-	struct gen_lun *lun;
+	struct nvm_lun *lun;
 	struct nvm_block *blk;
 	u64 i;
 	int lun_id;
@@ -326,7 +327,7 @@ static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 
 		/* Calculate block offset into lun */
 		pba = pba - (dev->sec_per_lun * lun_id);
-		blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)];
+		blk = &lun->blocks[div_u64(pba, dev->sec_per_blk)];
 
 		if (!blk->state) {
 			/* at this point, we don't know anything about the
@@ -335,7 +336,7 @@ static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 			 */
 			list_move_tail(&blk->list, &lun->used_list);
 			blk->state = NVM_BLK_ST_TGT;
-			lun->vlun.nr_free_blocks--;
+			lun->nr_free_blocks--;
 		}
 	}
 
@@ -344,7 +345,7 @@ static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 
 static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn)
 {
-	struct gen_lun *lun;
+	struct nvm_lun *lun;
 	struct nvm_block *block;
 	sector_t lun_iter, blk_iter, cur_block_id = 0;
 	int ret, nr_blks;
@@ -356,19 +357,19 @@ static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn)
 		return -ENOMEM;
 
 	gen_for_each_lun(gn, lun, lun_iter) {
-		lun->vlun.blocks = vzalloc(sizeof(struct nvm_block) *
+		lun->blocks = vzalloc(sizeof(struct nvm_block) *
 							dev->blks_per_lun);
-		if (!lun->vlun.blocks) {
+		if (!lun->blocks) {
 			kfree(blks);
 			return -ENOMEM;
 		}
 
 		for (blk_iter = 0; blk_iter < dev->blks_per_lun; blk_iter++) {
-			block = &lun->vlun.blocks[blk_iter];
+			block = &lun->blocks[blk_iter];
 
 			INIT_LIST_HEAD(&block->list);
 
-			block->lun = &lun->vlun;
+			block->lun = lun;
 			block->id = cur_block_id++;
 
 			list_add_tail(&block->list, &lun->free_list);
@@ -378,8 +379,8 @@ static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn)
 			struct ppa_addr ppa;
 
 			ppa.ppa = 0;
-			ppa.g.ch = lun->vlun.chnl_id;
-			ppa.g.lun = lun->vlun.lun_id;
+			ppa.g.ch = lun->chnl_id;
+			ppa.g.lun = lun->lun_id;
 
 			ret = nvm_get_bb_tbl(dev, ppa, blks);
 			if (ret)
@@ -468,41 +469,39 @@ static void gen_unregister(struct nvm_dev *dev)
 }
 
 static struct nvm_block *gen_get_blk(struct nvm_dev *dev,
-				struct nvm_lun *vlun, unsigned long flags)
+				struct nvm_lun *lun, unsigned long flags)
 {
-	struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun);
 	struct nvm_block *blk = NULL;
 	int is_gc = flags & NVM_IOTYPE_GC;
 
-	spin_lock(&vlun->lock);
+	spin_lock(&lun->lock);
 	if (list_empty(&lun->free_list)) {
 		pr_err_ratelimited("gen: lun %u have no free pages available",
-								lun->vlun.id);
+								lun->id);
 		goto out;
 	}
 
-	if (!is_gc && lun->vlun.nr_free_blocks < lun->reserved_blocks)
+	if (!is_gc && lun->nr_free_blocks < lun->reserved_blocks)
 		goto out;
 
 	blk = list_first_entry(&lun->free_list, struct nvm_block, list);
 
 	list_move_tail(&blk->list, &lun->used_list);
 	blk->state = NVM_BLK_ST_TGT;
-	lun->vlun.nr_free_blocks--;
+	lun->nr_free_blocks--;
 out:
-	spin_unlock(&vlun->lock);
+	spin_unlock(&lun->lock);
 	return blk;
 }
 
 static void gen_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
 {
-	struct nvm_lun *vlun = blk->lun;
-	struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun);
+	struct nvm_lun *lun = blk->lun;
 
-	spin_lock(&vlun->lock);
+	spin_lock(&lun->lock);
 	if (blk->state & NVM_BLK_ST_TGT) {
 		list_move_tail(&blk->list, &lun->free_list);
-		lun->vlun.nr_free_blocks++;
+		lun->nr_free_blocks++;
 		blk->state = NVM_BLK_ST_FREE;
 	} else if (blk->state & NVM_BLK_ST_BAD) {
 		list_move_tail(&blk->list, &lun->bb_list);
@@ -513,13 +512,13 @@ static void gen_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
 							blk->id, blk->state);
 		list_move_tail(&blk->list, &lun->bb_list);
 	}
-	spin_unlock(&vlun->lock);
+	spin_unlock(&lun->lock);
 }
 
 static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 {
 	struct gen_dev *gn = dev->mp;
-	struct gen_lun *lun;
+	struct nvm_lun *lun;
 	struct nvm_block *blk;
 
 	pr_debug("gen: ppa  (ch: %u lun: %u blk: %u pg: %u) -> %u\n",
@@ -537,7 +536,7 @@ static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 	}
 
 	lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
-	blk = &lun->vlun.blocks[ppa.g.blk];
+	blk = &lun->blocks[ppa.g.blk];
 
 	/* will be moved to bb list on put_blk from target */
 	blk->state = type;
@@ -587,23 +586,23 @@ static struct nvm_lun *gen_get_lun(struct nvm_dev *dev, int lunid)
 	if (unlikely(lunid >= dev->nr_luns))
 		return NULL;
 
-	return &gn->luns[lunid].vlun;
+	return &gn->luns[lunid];
 }
 
 static void gen_lun_info_print(struct nvm_dev *dev)
 {
 	struct gen_dev *gn = dev->mp;
-	struct gen_lun *lun;
+	struct nvm_lun *lun;
 	unsigned int i;
 
 
 	gen_for_each_lun(gn, lun, i) {
-		spin_lock(&lun->vlun.lock);
+		spin_lock(&lun->lock);
 
 		pr_info("%s: lun%8u\t%u\n", dev->name, i,
-						lun->vlun.nr_free_blocks);
+						lun->nr_free_blocks);
 
-		spin_unlock(&lun->vlun.lock);
+		spin_unlock(&lun->lock);
 	}
 }
 
diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h
index 8ecfa817d21d14..d167f391fbaef2 100644
--- a/drivers/lightnvm/gennvm.h
+++ b/drivers/lightnvm/gennvm.h
@@ -20,25 +20,11 @@
 
 #include <linux/lightnvm.h>
 
-struct gen_lun {
-	struct nvm_lun vlun;
-
-	int reserved_blocks;
-	/* lun block lists */
-	struct list_head used_list;	/* In-use blocks */
-	struct list_head free_list;	/* Not used blocks i.e. released
-					 * and ready for use
-					 */
-	struct list_head bb_list;	/* Bad blocks. Mutually exclusive with
-					 * free_list and used_list
-					 */
-};
-
 struct gen_dev {
 	struct nvm_dev *dev;
 
 	int nr_luns;
-	struct gen_lun *luns;
+	struct nvm_lun *luns;
 	struct list_head area_list;
 
 	struct mutex lock;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 98278a9fcb1fb8..33940bdc18a9cc 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -275,7 +275,17 @@ struct nvm_lun {
 
 	spinlock_t lock;
 
+	/* lun block lists */
+	struct list_head used_list;	/* In-use blocks */
+	struct list_head free_list;	/* Not used blocks i.e. released
+					 * and ready for use
+					 */
+	struct list_head bb_list;	/* Bad blocks. Mutually exclusive with
+					 * free_list and used_list
+					 */
 	unsigned int nr_free_blocks;	/* Number of unused blocks */
+	int reserved_blocks;
+
 	struct nvm_block *blocks;
 };
 

From 8176117b82e49e043d045f214ba7a892fba6b827 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:05 +0100
Subject: [PATCH 130/182] lightnvm: manage lun partitions internally in mm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LUNs are exclusively owned by targets implementing a block device FTL.
Doing this reservation requires at the moment a 2-way callback gennvm
<-> target. The reason behind this is that LUNs were not assumed to
always be exclusively owned by targets. However, this design decision
goes against I/O determinism QoS (two targets would mix I/O on the same
parallel unit in the device).

This patch makes LUN reservation as part of the target creation on the
media manager. This makes that LUNs are always exclusively owned by the
target instantiated on top of them. LUN stripping and/or sharing should
be implemented on the target itself or the layers on top.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c | 62 +++++++++++++++++++++++++++++++--------
 drivers/lightnvm/rrpc.c   |  7 -----
 include/linux/lightnvm.h  |  6 ++--
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 3572ebbb50cecc..9671e11356be4d 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -35,6 +35,45 @@ static const struct block_device_operations gen_fops = {
 	.owner		= THIS_MODULE,
 };
 
+static int gen_reserve_luns(struct nvm_dev *dev, struct nvm_target *t,
+			    int lun_begin, int lun_end)
+{
+	struct gen_dev *gn = dev->mp;
+	struct nvm_lun *lun;
+	int i;
+
+	for (i = lun_begin; i <= lun_end; i++) {
+		if (test_and_set_bit(i, dev->lun_map)) {
+			pr_err("nvm: lun %d already allocated\n", i);
+			goto err;
+		}
+
+		lun = &gn->luns[i];
+		list_add_tail(&lun->list, &t->lun_list);
+	}
+
+	return 0;
+
+err:
+	while (--i > lun_begin) {
+		lun = &gn->luns[i];
+		clear_bit(i, dev->lun_map);
+		list_del(&lun->list);
+	}
+
+	return -EBUSY;
+}
+
+static void gen_release_luns(struct nvm_dev *dev, struct nvm_target *t)
+{
+	struct nvm_lun *lun, *tmp;
+
+	list_for_each_entry_safe(lun, tmp, &t->lun_list, list) {
+		WARN_ON(!test_and_clear_bit(lun->id, dev->lun_map));
+		list_del(&lun->list);
+	}
+}
+
 static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 {
 	struct gen_dev *gn = dev->mp;
@@ -64,9 +103,14 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	if (!t)
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&t->lun_list);
+
+	if (gen_reserve_luns(dev, t, s->lun_begin, s->lun_end))
+		goto err_t;
+
 	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
 	if (!tqueue)
-		goto err_t;
+		goto err_reserve;
 	blk_queue_make_request(tqueue, tt->make_rq);
 
 	tdisk = alloc_disk(0);
@@ -105,6 +149,8 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	put_disk(tdisk);
 err_queue:
 	blk_cleanup_queue(tqueue);
+err_reserve:
+	gen_release_luns(dev, t);
 err_t:
 	kfree(t);
 	return -ENOMEM;
@@ -122,6 +168,7 @@ static void __gen_remove_target(struct nvm_target *t)
 	if (tt->exit)
 		tt->exit(tdisk->private_data);
 
+	gen_release_luns(t->dev, t);
 	put_disk(tdisk);
 
 	list_del(&t->list);
@@ -253,6 +300,7 @@ static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn)
 		INIT_LIST_HEAD(&lun->free_list);
 		INIT_LIST_HEAD(&lun->used_list);
 		INIT_LIST_HEAD(&lun->bb_list);
+		INIT_LIST_HEAD(&lun->list);
 
 		spin_lock_init(&lun->lock);
 
@@ -569,16 +617,6 @@ static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, int flags)
 	return nvm_erase_ppa(dev, &addr, 1, flags);
 }
 
-static int gen_reserve_lun(struct nvm_dev *dev, int lunid)
-{
-	return test_and_set_bit(lunid, dev->lun_map);
-}
-
-static void gen_release_lun(struct nvm_dev *dev, int lunid)
-{
-	WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
-}
-
 static struct nvm_lun *gen_get_lun(struct nvm_dev *dev, int lunid)
 {
 	struct gen_dev *gn = dev->mp;
@@ -625,8 +663,6 @@ static struct nvmm_type gen = {
 	.mark_blk		= gen_mark_blk,
 
 	.get_lun		= gen_get_lun,
-	.reserve_lun		= gen_reserve_lun,
-	.release_lun		= gen_release_lun,
 	.lun_info_print		= gen_lun_info_print,
 
 	.get_area		= gen_get_area,
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 34d2ebf90ed6ae..88e0d0677b094c 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -1126,7 +1126,6 @@ static void rrpc_core_free(struct rrpc *rrpc)
 
 static void rrpc_luns_free(struct rrpc *rrpc)
 {
-	struct nvm_dev *dev = rrpc->dev;
 	struct nvm_lun *lun;
 	struct rrpc_lun *rlun;
 	int i;
@@ -1139,7 +1138,6 @@ static void rrpc_luns_free(struct rrpc *rrpc)
 		lun = rlun->parent;
 		if (!lun)
 			break;
-		dev->mt->release_lun(dev, lun->id);
 		vfree(rlun->blocks);
 	}
 
@@ -1169,11 +1167,6 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 		int lunid = lun_begin + i;
 		struct nvm_lun *lun;
 
-		if (dev->mt->reserve_lun(dev, lunid)) {
-			pr_err("rrpc: lun %u is already allocated\n", lunid);
-			goto err;
-		}
-
 		lun = dev->mt->get_lun(dev, lunid);
 		if (!lun)
 			goto err;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 33940bdc18a9cc..89c695483d551c 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -210,6 +210,7 @@ struct nvm_id {
 
 struct nvm_target {
 	struct list_head list;
+	struct list_head lun_list;
 	struct nvm_dev *dev;
 	struct nvm_tgt_type *type;
 	struct gendisk *disk;
@@ -273,6 +274,7 @@ struct nvm_lun {
 	int lun_id;
 	int chnl_id;
 
+	struct list_head list;
 	spinlock_t lock;
 
 	/* lun block lists */
@@ -521,8 +523,6 @@ typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int);
 typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
 typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int);
-typedef int (nvmm_reserve_lun)(struct nvm_dev *, int);
-typedef void (nvmm_release_lun)(struct nvm_dev *, int);
 typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *);
 
 typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
@@ -550,8 +550,6 @@ struct nvmm_type {
 
 	/* Configuration management */
 	nvmm_get_lun_fn *get_lun;
-	nvmm_reserve_lun *reserve_lun;
-	nvmm_release_lun *release_lun;
 
 	/* Statistics */
 	nvmm_lun_info_print_fn *lun_info_print;

From 8e79b5cb1d3b8eceaf6862995952dd4de431dd99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:06 +0100
Subject: [PATCH 131/182] lightnvm: move block provisioning to targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to naturally support multi-target instances on an Open-Channel
SSD, targets should own the LUNs they get blocks from and manage
provisioning internally. This is done in several steps.

This patch moves the block provisioning inside of the target and removes
the get/put block interface from the media manager.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c      |  97 ++++++++---------
 drivers/lightnvm/gennvm.c    | 145 ++++++++++++--------------
 drivers/lightnvm/rrpc.c      | 196 +++++++++++++++++++++++------------
 drivers/lightnvm/rrpc.h      |  15 ++-
 drivers/lightnvm/sysblk.c    |  65 ++++++------
 drivers/nvme/host/lightnvm.c |  11 +-
 include/linux/lightnvm.h     | 133 +++++++++++++-----------
 7 files changed, 371 insertions(+), 291 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index d4433d3b485561..493ce034e68769 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -176,20 +176,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
 	return NULL;
 }
 
-struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun,
-							unsigned long flags)
-{
-	return dev->mt->get_blk(dev, lun, flags);
-}
-EXPORT_SYMBOL(nvm_get_blk);
-
-/* Assumes that all valid pages have already been moved on release to bm */
-void nvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
-{
-	return dev->mt->put_blk(dev, blk);
-}
-EXPORT_SYMBOL(nvm_put_blk);
-
 void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 {
 	return dev->mt->mark_blk(dev, ppa, type);
@@ -266,10 +252,11 @@ EXPORT_SYMBOL(nvm_generic_to_addr_mode);
 int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 			const struct ppa_addr *ppas, int nr_ppas, int vblk)
 {
+	struct nvm_geo *geo = &dev->geo;
 	int i, plane_cnt, pl_idx;
 	struct ppa_addr ppa;
 
-	if ((!vblk || dev->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
+	if ((!vblk || geo->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) {
 		rqd->nr_ppas = nr_ppas;
 		rqd->ppa_addr = ppas[0];
 
@@ -287,7 +274,7 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 		for (i = 0; i < nr_ppas; i++)
 			rqd->ppa_list[i] = ppas[i];
 	} else {
-		plane_cnt = dev->plane_mode;
+		plane_cnt = geo->plane_mode;
 		rqd->nr_ppas *= plane_cnt;
 
 		for (i = 0; i < nr_ppas; i++) {
@@ -465,17 +452,18 @@ EXPORT_SYMBOL(nvm_submit_ppa);
  */
 int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
 {
+	struct nvm_geo *geo = &dev->geo;
 	int blk, offset, pl, blktype;
 
-	if (nr_blks != dev->blks_per_lun * dev->plane_mode)
+	if (nr_blks != geo->blks_per_lun * geo->plane_mode)
 		return -EINVAL;
 
-	for (blk = 0; blk < dev->blks_per_lun; blk++) {
-		offset = blk * dev->plane_mode;
+	for (blk = 0; blk < geo->blks_per_lun; blk++) {
+		offset = blk * geo->plane_mode;
 		blktype = blks[offset];
 
 		/* Bad blocks on any planes take precedence over other types */
-		for (pl = 0; pl < dev->plane_mode; pl++) {
+		for (pl = 0; pl < geo->plane_mode; pl++) {
 			if (blks[offset + pl] &
 					(NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) {
 				blktype = blks[offset + pl];
@@ -486,7 +474,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
 		blks[blk] = blktype;
 	}
 
-	return dev->blks_per_lun;
+	return geo->blks_per_lun;
 }
 EXPORT_SYMBOL(nvm_bb_tbl_fold);
 
@@ -500,9 +488,10 @@ EXPORT_SYMBOL(nvm_get_bb_tbl);
 
 static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
 {
+	struct nvm_geo *geo = &dev->geo;
 	int i;
 
-	dev->lps_per_blk = dev->pgs_per_blk;
+	dev->lps_per_blk = geo->pgs_per_blk;
 	dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
 	if (!dev->lptbl)
 		return -ENOMEM;
@@ -548,29 +537,32 @@ static int nvm_core_init(struct nvm_dev *dev)
 {
 	struct nvm_id *id = &dev->identity;
 	struct nvm_id_group *grp = &id->groups[0];
+	struct nvm_geo *geo = &dev->geo;
 	int ret;
 
-	/* device values */
-	dev->nr_chnls = grp->num_ch;
-	dev->luns_per_chnl = grp->num_lun;
-	dev->pgs_per_blk = grp->num_pg;
-	dev->blks_per_lun = grp->num_blk;
-	dev->nr_planes = grp->num_pln;
-	dev->fpg_size = grp->fpg_sz;
-	dev->pfpg_size = grp->fpg_sz * grp->num_pln;
-	dev->sec_size = grp->csecs;
-	dev->oob_size = grp->sos;
-	dev->sec_per_pg = grp->fpg_sz / grp->csecs;
-	dev->mccap = grp->mccap;
-	memcpy(&dev->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
-
-	dev->plane_mode = NVM_PLANE_SINGLE;
-	dev->max_rq_size = dev->ops->max_phys_sect * dev->sec_size;
+	/* Whole device values */
+	geo->nr_chnls = grp->num_ch;
+	geo->luns_per_chnl = grp->num_lun;
+
+	/* Generic device values */
+	geo->pgs_per_blk = grp->num_pg;
+	geo->blks_per_lun = grp->num_blk;
+	geo->nr_planes = grp->num_pln;
+	geo->fpg_size = grp->fpg_sz;
+	geo->pfpg_size = grp->fpg_sz * grp->num_pln;
+	geo->sec_size = grp->csecs;
+	geo->oob_size = grp->sos;
+	geo->sec_per_pg = grp->fpg_sz / grp->csecs;
+	geo->mccap = grp->mccap;
+	memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
+
+	geo->plane_mode = NVM_PLANE_SINGLE;
+	geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
 
 	if (grp->mpos & 0x020202)
-		dev->plane_mode = NVM_PLANE_DOUBLE;
+		geo->plane_mode = NVM_PLANE_DOUBLE;
 	if (grp->mpos & 0x040404)
-		dev->plane_mode = NVM_PLANE_QUAD;
+		geo->plane_mode = NVM_PLANE_QUAD;
 
 	if (grp->mtype != 0) {
 		pr_err("nvm: memory type not supported\n");
@@ -578,13 +570,13 @@ static int nvm_core_init(struct nvm_dev *dev)
 	}
 
 	/* calculated values */
-	dev->sec_per_pl = dev->sec_per_pg * dev->nr_planes;
-	dev->sec_per_blk = dev->sec_per_pl * dev->pgs_per_blk;
-	dev->sec_per_lun = dev->sec_per_blk * dev->blks_per_lun;
-	dev->nr_luns = dev->luns_per_chnl * dev->nr_chnls;
+	geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes;
+	geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk;
+	geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun;
+	geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls;
 
-	dev->total_secs = dev->nr_luns * dev->sec_per_lun;
-	dev->lun_map = kcalloc(BITS_TO_LONGS(dev->nr_luns),
+	dev->total_secs = geo->nr_luns * geo->sec_per_lun;
+	dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns),
 					sizeof(unsigned long), GFP_KERNEL);
 	if (!dev->lun_map)
 		return -ENOMEM;
@@ -611,7 +603,7 @@ static int nvm_core_init(struct nvm_dev *dev)
 	mutex_init(&dev->mlock);
 	spin_lock_init(&dev->lock);
 
-	blk_queue_logical_block_size(dev->q, dev->sec_size);
+	blk_queue_logical_block_size(dev->q, geo->sec_size);
 
 	return 0;
 err_fmtype:
@@ -645,6 +637,7 @@ void nvm_free(struct nvm_dev *dev)
 
 static int nvm_init(struct nvm_dev *dev)
 {
+	struct nvm_geo *geo = &dev->geo;
 	int ret = -EINVAL;
 
 	if (!dev->q || !dev->ops)
@@ -676,9 +669,9 @@ static int nvm_init(struct nvm_dev *dev)
 	}
 
 	pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n",
-			dev->name, dev->sec_per_pg, dev->nr_planes,
-			dev->pgs_per_blk, dev->blks_per_lun, dev->nr_luns,
-			dev->nr_chnls);
+			dev->name, geo->sec_per_pg, geo->nr_planes,
+			geo->pgs_per_blk, geo->blks_per_lun,
+			geo->nr_luns, geo->nr_chnls);
 	return 0;
 err:
 	pr_err("nvm: failed to initialize nvm\n");
@@ -771,9 +764,9 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
 	}
 	s = &create->conf.s;
 
-	if (s->lun_begin > s->lun_end || s->lun_end > dev->nr_luns) {
+	if (s->lun_begin > s->lun_end || s->lun_end > dev->geo.nr_luns) {
 		pr_err("nvm: lun out of bound (%u:%u > %u)\n",
-			s->lun_begin, s->lun_end, dev->nr_luns);
+			s->lun_begin, s->lun_end, dev->geo.nr_luns);
 		return -EINVAL;
 	}
 
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 9671e11356be4d..8791a2aaa9e35e 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -74,6 +74,36 @@ static void gen_release_luns(struct nvm_dev *dev, struct nvm_target *t)
 	}
 }
 
+static void gen_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
+{
+	kfree(tgt_dev);
+}
+
+static struct nvm_tgt_dev *gen_create_tgt_dev(struct nvm_dev *dev,
+					      int lun_begin, int lun_end)
+{
+	struct nvm_tgt_dev *tgt_dev = NULL;
+	int nr_luns = lun_end - lun_begin + 1;
+
+	tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
+	if (!tgt_dev)
+		goto out;
+
+	memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
+	tgt_dev->geo.nr_chnls = (nr_luns / (dev->geo.luns_per_chnl + 1)) + 1;
+	tgt_dev->geo.nr_luns = nr_luns;
+	tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
+	tgt_dev->q = dev->q;
+	tgt_dev->ops = dev->ops;
+	tgt_dev->mt = dev->mt;
+	memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
+
+	tgt_dev->parent = dev;
+
+out:
+	return tgt_dev;
+}
+
 static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 {
 	struct gen_dev *gn = dev->mp;
@@ -82,6 +112,7 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	struct gendisk *tdisk;
 	struct nvm_tgt_type *tt;
 	struct nvm_target *t;
+	struct nvm_tgt_dev *tgt_dev;
 	void *targetdata;
 
 	tt = nvm_find_target_type(create->tgttype, 1);
@@ -108,9 +139,13 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	if (gen_reserve_luns(dev, t, s->lun_begin, s->lun_end))
 		goto err_t;
 
+	tgt_dev = gen_create_tgt_dev(dev, s->lun_begin, s->lun_end);
+	if (!tgt_dev)
+		goto err_reserve;
+
 	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
 	if (!tqueue)
-		goto err_reserve;
+		goto err_dev;
 	blk_queue_make_request(tqueue, tt->make_rq);
 
 	tdisk = alloc_disk(0);
@@ -124,7 +159,7 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	tdisk->fops = &gen_fops;
 	tdisk->queue = tqueue;
 
-	targetdata = tt->init(dev, tdisk, s->lun_begin, s->lun_end);
+	targetdata = tt->init(tgt_dev, tdisk, s->lun_begin, s->lun_end);
 	if (IS_ERR(targetdata))
 		goto err_init;
 
@@ -138,7 +173,7 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 
 	t->type = tt;
 	t->disk = tdisk;
-	t->dev = dev;
+	t->dev = tgt_dev;
 
 	mutex_lock(&gn->lock);
 	list_add_tail(&t->list, &gn->targets);
@@ -149,6 +184,8 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	put_disk(tdisk);
 err_queue:
 	blk_cleanup_queue(tqueue);
+err_dev:
+	kfree(tgt_dev);
 err_reserve:
 	gen_release_luns(dev, t);
 err_t:
@@ -168,7 +205,8 @@ static void __gen_remove_target(struct nvm_target *t)
 	if (tt->exit)
 		tt->exit(tdisk->private_data);
 
-	gen_release_luns(t->dev, t);
+	gen_release_luns(t->dev->parent, t);
+	gen_remove_tgt_dev(t->dev);
 	put_disk(tdisk);
 
 	list_del(&t->list);
@@ -207,10 +245,11 @@ static int gen_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
 
 static int gen_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct gen_dev *gn = dev->mp;
 	struct gen_area *area, *prev, *next;
 	sector_t begin = 0;
-	sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9;
+	sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
 
 	if (len > max_sectors)
 		return -EINVAL;
@@ -289,10 +328,11 @@ static void gen_luns_free(struct nvm_dev *dev)
 
 static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct nvm_lun *lun;
 	int i;
 
-	gn->luns = kcalloc(dev->nr_luns, sizeof(struct nvm_lun), GFP_KERNEL);
+	gn->luns = kcalloc(geo->nr_luns, sizeof(struct nvm_lun), GFP_KERNEL);
 	if (!gn->luns)
 		return -ENOMEM;
 
@@ -305,9 +345,9 @@ static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn)
 		spin_lock_init(&lun->lock);
 
 		lun->id = i;
-		lun->lun_id = i % dev->luns_per_chnl;
-		lun->chnl_id = i / dev->luns_per_chnl;
-		lun->nr_free_blocks = dev->blks_per_lun;
+		lun->lun_id = i % geo->luns_per_chnl;
+		lun->chnl_id = i / geo->luns_per_chnl;
+		lun->nr_free_blocks = geo->blks_per_lun;
 	}
 	return 0;
 }
@@ -324,7 +364,7 @@ static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa,
 	if (nr_blks < 0)
 		return nr_blks;
 
-	lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
+	lun = &gn->luns[(dev->geo.luns_per_chnl * ppa.g.ch) + ppa.g.lun];
 
 	for (i = 0; i < nr_blks; i++) {
 		if (blks[i] == NVM_BLK_T_FREE)
@@ -342,6 +382,7 @@ static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa,
 static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 {
 	struct nvm_dev *dev = private;
+	struct nvm_geo *geo = &dev->geo;
 	struct gen_dev *gn = dev->mp;
 	u64 elba = slba + nlb;
 	struct nvm_lun *lun;
@@ -370,12 +411,12 @@ static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 			continue;
 
 		/* resolve block from physical address */
-		lun_id = div_u64(pba, dev->sec_per_lun);
+		lun_id = div_u64(pba, geo->sec_per_lun);
 		lun = &gn->luns[lun_id];
 
 		/* Calculate block offset into lun */
-		pba = pba - (dev->sec_per_lun * lun_id);
-		blk = &lun->blocks[div_u64(pba, dev->sec_per_blk)];
+		pba = pba - (geo->sec_per_lun * lun_id);
+		blk = &lun->blocks[div_u64(pba, geo->sec_per_blk)];
 
 		if (!blk->state) {
 			/* at this point, we don't know anything about the
@@ -393,26 +434,27 @@ static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
 
 static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct nvm_lun *lun;
 	struct nvm_block *block;
 	sector_t lun_iter, blk_iter, cur_block_id = 0;
 	int ret, nr_blks;
 	u8 *blks;
 
-	nr_blks = dev->blks_per_lun * dev->plane_mode;
+	nr_blks = geo->blks_per_lun * geo->plane_mode;
 	blks = kmalloc(nr_blks, GFP_KERNEL);
 	if (!blks)
 		return -ENOMEM;
 
 	gen_for_each_lun(gn, lun, lun_iter) {
 		lun->blocks = vzalloc(sizeof(struct nvm_block) *
-							dev->blks_per_lun);
+							geo->blks_per_lun);
 		if (!lun->blocks) {
 			kfree(blks);
 			return -ENOMEM;
 		}
 
-		for (blk_iter = 0; blk_iter < dev->blks_per_lun; blk_iter++) {
+		for (blk_iter = 0; blk_iter < geo->blks_per_lun; blk_iter++) {
 			block = &lun->blocks[blk_iter];
 
 			INIT_LIST_HEAD(&block->list);
@@ -474,7 +516,7 @@ static int gen_register(struct nvm_dev *dev)
 		return -ENOMEM;
 
 	gn->dev = dev;
-	gn->nr_luns = dev->nr_luns;
+	gn->nr_luns = dev->geo.nr_luns;
 	INIT_LIST_HEAD(&gn->area_list);
 	mutex_init(&gn->lock);
 	INIT_LIST_HEAD(&gn->targets);
@@ -506,7 +548,7 @@ static void gen_unregister(struct nvm_dev *dev)
 
 	mutex_lock(&gn->lock);
 	list_for_each_entry_safe(t, tmp, &gn->targets, list) {
-		if (t->dev != dev)
+		if (t->dev->parent != dev)
 			continue;
 		__gen_remove_target(t);
 	}
@@ -516,55 +558,9 @@ static void gen_unregister(struct nvm_dev *dev)
 	module_put(THIS_MODULE);
 }
 
-static struct nvm_block *gen_get_blk(struct nvm_dev *dev,
-				struct nvm_lun *lun, unsigned long flags)
-{
-	struct nvm_block *blk = NULL;
-	int is_gc = flags & NVM_IOTYPE_GC;
-
-	spin_lock(&lun->lock);
-	if (list_empty(&lun->free_list)) {
-		pr_err_ratelimited("gen: lun %u have no free pages available",
-								lun->id);
-		goto out;
-	}
-
-	if (!is_gc && lun->nr_free_blocks < lun->reserved_blocks)
-		goto out;
-
-	blk = list_first_entry(&lun->free_list, struct nvm_block, list);
-
-	list_move_tail(&blk->list, &lun->used_list);
-	blk->state = NVM_BLK_ST_TGT;
-	lun->nr_free_blocks--;
-out:
-	spin_unlock(&lun->lock);
-	return blk;
-}
-
-static void gen_put_blk(struct nvm_dev *dev, struct nvm_block *blk)
-{
-	struct nvm_lun *lun = blk->lun;
-
-	spin_lock(&lun->lock);
-	if (blk->state & NVM_BLK_ST_TGT) {
-		list_move_tail(&blk->list, &lun->free_list);
-		lun->nr_free_blocks++;
-		blk->state = NVM_BLK_ST_FREE;
-	} else if (blk->state & NVM_BLK_ST_BAD) {
-		list_move_tail(&blk->list, &lun->bb_list);
-		blk->state = NVM_BLK_ST_BAD;
-	} else {
-		WARN_ON_ONCE(1);
-		pr_err("gen: erroneous block type (%lu -> %u)\n",
-							blk->id, blk->state);
-		list_move_tail(&blk->list, &lun->bb_list);
-	}
-	spin_unlock(&lun->lock);
-}
-
 static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct gen_dev *gn = dev->mp;
 	struct nvm_lun *lun;
 	struct nvm_block *blk;
@@ -572,18 +568,18 @@ static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
 	pr_debug("gen: ppa  (ch: %u lun: %u blk: %u pg: %u) -> %u\n",
 			ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type);
 
-	if (unlikely(ppa.g.ch > dev->nr_chnls ||
-					ppa.g.lun > dev->luns_per_chnl ||
-					ppa.g.blk > dev->blks_per_lun)) {
+	if (unlikely(ppa.g.ch > geo->nr_chnls ||
+					ppa.g.lun > geo->luns_per_chnl ||
+					ppa.g.blk > geo->blks_per_lun)) {
 		WARN_ON_ONCE(1);
 		pr_err("gen: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u",
-				ppa.g.ch, dev->nr_chnls,
-				ppa.g.lun, dev->luns_per_chnl,
-				ppa.g.blk, dev->blks_per_lun);
+				ppa.g.ch, geo->nr_chnls,
+				ppa.g.lun, geo->luns_per_chnl,
+				ppa.g.blk, geo->blks_per_lun);
 		return;
 	}
 
-	lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
+	lun = &gn->luns[(geo->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
 	blk = &lun->blocks[ppa.g.blk];
 
 	/* will be moved to bb list on put_blk from target */
@@ -621,7 +617,7 @@ static struct nvm_lun *gen_get_lun(struct nvm_dev *dev, int lunid)
 {
 	struct gen_dev *gn = dev->mp;
 
-	if (unlikely(lunid >= dev->nr_luns))
+	if (unlikely(lunid >= dev->geo.nr_luns))
 		return NULL;
 
 	return &gn->luns[lunid];
@@ -654,9 +650,6 @@ static struct nvmm_type gen = {
 	.create_tgt		= gen_create_tgt,
 	.remove_tgt		= gen_remove_tgt,
 
-	.get_blk		= gen_get_blk,
-	.put_blk		= gen_put_blk,
-
 	.submit_io		= gen_submit_io,
 	.erase_blk		= gen_erase_blk,
 
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 88e0d0677b094c..5377c7a987aac8 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -28,6 +28,7 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 
 static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_block *rblk = a->rblk;
 	unsigned int pg_offset;
 
@@ -38,7 +39,7 @@ static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
 
 	spin_lock(&rblk->lock);
 
-	div_u64_rem(a->addr, rrpc->dev->sec_per_blk, &pg_offset);
+	div_u64_rem(a->addr, dev->geo.sec_per_blk, &pg_offset);
 	WARN_ON(test_and_set_bit(pg_offset, rblk->invalid_pages));
 	rblk->nr_invalid_pages++;
 
@@ -116,32 +117,36 @@ static void rrpc_discard(struct rrpc *rrpc, struct bio *bio)
 
 static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
-	return (rblk->next_page == rrpc->dev->sec_per_blk);
+	struct nvm_tgt_dev *dev = rrpc->dev;
+
+	return (rblk->next_page == dev->geo.sec_per_blk);
 }
 
 /* Calculate relative addr for the given block, considering instantiated LUNs */
 static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct nvm_block *blk = rblk->parent;
-	int lun_blk = blk->id % (rrpc->dev->blks_per_lun * rrpc->nr_luns);
+	int lun_blk = blk->id % (dev->geo.blks_per_lun * rrpc->nr_luns);
 
-	return lun_blk * rrpc->dev->sec_per_blk;
+	return lun_blk * dev->geo.sec_per_blk;
 }
 
 /* Calculate global addr for the given block */
 static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct nvm_block *blk = rblk->parent;
 
-	return blk->id * rrpc->dev->sec_per_blk;
+	return blk->id * dev->geo.sec_per_blk;
 }
 
-static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_dev *dev, u64 addr)
+static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev, u64 addr)
 {
 	struct ppa_addr paddr;
 
 	paddr.ppa = addr;
-	return linear_to_generic_addr(dev, paddr);
+	return linear_to_generic_addr(&dev->geo, paddr);
 }
 
 /* requires lun->lock taken */
@@ -158,21 +163,52 @@ static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk,
 	*cur_rblk = new_rblk;
 }
 
+static struct nvm_block *__rrpc_get_blk(struct rrpc *rrpc,
+							struct rrpc_lun *rlun)
+{
+	struct nvm_lun *lun = rlun->parent;
+	struct nvm_block *blk = NULL;
+
+	if (list_empty(&lun->free_list))
+		goto out;
+
+	blk = list_first_entry(&lun->free_list, struct nvm_block, list);
+
+	list_move_tail(&blk->list, &lun->used_list);
+	blk->state = NVM_BLK_ST_TGT;
+	lun->nr_free_blocks--;
+
+out:
+	return blk;
+}
+
 static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
 							unsigned long flags)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
+	struct nvm_lun *lun = rlun->parent;
 	struct nvm_block *blk;
 	struct rrpc_block *rblk;
+	int is_gc = flags & NVM_IOTYPE_GC;
+
+	spin_lock(&rlun->lock);
+	if (!is_gc && lun->nr_free_blocks < rlun->reserved_blocks) {
+		pr_err("nvm: rrpc: cannot give block to non GC request\n");
+		spin_unlock(&rlun->lock);
+		return NULL;
+	}
 
-	blk = nvm_get_blk(rrpc->dev, rlun->parent, flags);
+	blk = __rrpc_get_blk(rrpc, rlun);
 	if (!blk) {
-		pr_err("nvm: rrpc: cannot get new block from media manager\n");
+		pr_err("nvm: rrpc: cannot get new block\n");
+		spin_unlock(&rlun->lock);
 		return NULL;
 	}
+	spin_unlock(&rlun->lock);
 
 	rblk = rrpc_get_rblk(rlun, blk->id);
 	blk->priv = rblk;
-	bitmap_zero(rblk->invalid_pages, rrpc->dev->sec_per_blk);
+	bitmap_zero(rblk->invalid_pages, dev->geo.sec_per_blk);
 	rblk->next_page = 0;
 	rblk->nr_invalid_pages = 0;
 	atomic_set(&rblk->data_cmnt_size, 0);
@@ -182,7 +218,25 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
 
 static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
-	nvm_put_blk(rrpc->dev, rblk->parent);
+	struct nvm_block *blk = rblk->parent;
+	struct rrpc_lun *rlun = rblk->rlun;
+	struct nvm_lun *lun = rlun->parent;
+
+	spin_lock(&rlun->lock);
+	if (blk->state & NVM_BLK_ST_TGT) {
+		list_move_tail(&blk->list, &lun->free_list);
+		lun->nr_free_blocks++;
+		blk->state = NVM_BLK_ST_FREE;
+	} else if (blk->state & NVM_BLK_ST_BAD) {
+		list_move_tail(&blk->list, &lun->bb_list);
+		blk->state = NVM_BLK_ST_BAD;
+	} else {
+		WARN_ON_ONCE(1);
+		pr_err("rrpc: erroneous block type (%lu -> %u)\n",
+							blk->id, blk->state);
+		list_move_tail(&blk->list, &lun->bb_list);
+	}
+	spin_unlock(&rlun->lock);
 }
 
 static void rrpc_put_blks(struct rrpc *rrpc)
@@ -250,13 +304,14 @@ static void rrpc_end_sync_bio(struct bio *bio)
  */
 static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
-	struct request_queue *q = rrpc->dev->q;
+	struct nvm_tgt_dev *dev = rrpc->dev;
+	struct request_queue *q = dev->q;
 	struct rrpc_rev_addr *rev;
 	struct nvm_rq *rqd;
 	struct bio *bio;
 	struct page *page;
 	int slot;
-	int nr_sec_per_blk = rrpc->dev->sec_per_blk;
+	int nr_sec_per_blk = dev->geo.sec_per_blk;
 	u64 phys_addr;
 	DECLARE_COMPLETION_ONSTACK(wait);
 
@@ -366,7 +421,7 @@ static void rrpc_block_gc(struct work_struct *work)
 	struct rrpc *rrpc = gcb->rrpc;
 	struct rrpc_block *rblk = gcb->rblk;
 	struct rrpc_lun *rlun = rblk->rlun;
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 
 	mempool_free(gcb, rrpc->gcb_pool);
 	pr_debug("nvm: block '%lu' being reclaimed\n", rblk->parent->id);
@@ -374,7 +429,7 @@ static void rrpc_block_gc(struct work_struct *work)
 	if (rrpc_move_valid_pages(rrpc, rblk))
 		goto put_back;
 
-	if (nvm_erase_blk(dev, rblk->parent, 0))
+	if (nvm_erase_blk(dev->parent, rblk->parent, 0))
 		goto put_back;
 
 	rrpc_put_blk(rrpc, rblk);
@@ -420,11 +475,12 @@ static void rrpc_lun_gc(struct work_struct *work)
 {
 	struct rrpc_lun *rlun = container_of(work, struct rrpc_lun, ws_gc);
 	struct rrpc *rrpc = rlun->rrpc;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct nvm_lun *lun = rlun->parent;
 	struct rrpc_block_gc *gcb;
 	unsigned int nr_blocks_need;
 
-	nr_blocks_need = rrpc->dev->blks_per_lun / GC_LIMIT_INVERSE;
+	nr_blocks_need = dev->geo.blks_per_lun / GC_LIMIT_INVERSE;
 
 	if (nr_blocks_need < rrpc->nr_luns)
 		nr_blocks_need = rrpc->nr_luns;
@@ -645,15 +701,15 @@ static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
 	queue_work(rrpc->kgc_wq, &gcb->ws_gc);
 }
 
-static void __rrpc_mark_bad_block(struct nvm_dev *dev, struct ppa_addr *ppa)
+static void __rrpc_mark_bad_block(struct nvm_tgt_dev *dev, struct ppa_addr *ppa)
 {
-		nvm_mark_blk(dev, *ppa, NVM_BLK_ST_BAD);
-		nvm_set_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD);
+		nvm_mark_blk(dev->parent, *ppa, NVM_BLK_ST_BAD);
+		nvm_set_bb_tbl(dev->parent, ppa, 1, NVM_BLK_T_GRWN_BAD);
 }
 
 static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
 {
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	void *comp_bits = &rqd->ppa_status;
 	struct ppa_addr ppa, prev_ppa;
 	int nr_ppas = rqd->nr_ppas;
@@ -676,6 +732,7 @@ static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
 static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
 						sector_t laddr, uint8_t npages)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_addr *p;
 	struct rrpc_block *rblk;
 	struct nvm_lun *lun;
@@ -687,7 +744,7 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
 		lun = rblk->parent->lun;
 
 		cmnt_size = atomic_inc_return(&rblk->data_cmnt_size);
-		if (unlikely(cmnt_size == rrpc->dev->sec_per_blk))
+		if (unlikely(cmnt_size == dev->geo.sec_per_blk))
 			rrpc_run_gc(rrpc, rblk);
 	}
 }
@@ -695,6 +752,7 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
 static void rrpc_end_io(struct nvm_rq *rqd)
 {
 	struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance);
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
 	uint8_t npages = rqd->nr_ppas;
 	sector_t laddr = rrpc_get_laddr(rqd->bio) - npages;
@@ -714,7 +772,7 @@ static void rrpc_end_io(struct nvm_rq *rqd)
 	rrpc_unlock_rq(rrpc, rqd);
 
 	if (npages > 1)
-		nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list);
+		nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
 
 	mempool_free(rqd, rrpc->rq_pool);
 }
@@ -722,6 +780,7 @@ static void rrpc_end_io(struct nvm_rq *rqd)
 static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 			struct nvm_rq *rqd, unsigned long flags, int npages)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
 	struct rrpc_addr *gp;
 	sector_t laddr = rrpc_get_laddr(bio);
@@ -729,7 +788,7 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 	int i;
 
 	if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
-		nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list);
+		nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
 		return NVM_IO_REQUEUE;
 	}
 
@@ -739,12 +798,11 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 		gp = &rrpc->trans_map[laddr + i];
 
 		if (gp->rblk) {
-			rqd->ppa_list[i] = rrpc_ppa_to_gaddr(rrpc->dev,
-								gp->addr);
+			rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, gp->addr);
 		} else {
 			BUG_ON(is_gc);
 			rrpc_unlock_laddr(rrpc, r);
-			nvm_dev_dma_free(rrpc->dev, rqd->ppa_list,
+			nvm_dev_dma_free(dev->parent, rqd->ppa_list,
 							rqd->dma_ppa_list);
 			return NVM_IO_DONE;
 		}
@@ -784,6 +842,7 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
 static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 			struct nvm_rq *rqd, unsigned long flags, int npages)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
 	struct rrpc_addr *p;
 	sector_t laddr = rrpc_get_laddr(bio);
@@ -791,7 +850,7 @@ static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 	int i;
 
 	if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
-		nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list);
+		nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
 		return NVM_IO_REQUEUE;
 	}
 
@@ -801,14 +860,13 @@ static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 		if (!p) {
 			BUG_ON(is_gc);
 			rrpc_unlock_laddr(rrpc, r);
-			nvm_dev_dma_free(rrpc->dev, rqd->ppa_list,
+			nvm_dev_dma_free(dev->parent, rqd->ppa_list,
 							rqd->dma_ppa_list);
 			rrpc_gc_kick(rrpc);
 			return NVM_IO_REQUEUE;
 		}
 
-		rqd->ppa_list[i] = rrpc_ppa_to_gaddr(rrpc->dev,
-								p->addr);
+		rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, p->addr);
 	}
 
 	rqd->opcode = NVM_OP_HBWRITE;
@@ -843,8 +901,10 @@ static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
 static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio,
 			struct nvm_rq *rqd, unsigned long flags, uint8_t npages)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
+
 	if (npages > 1) {
-		rqd->ppa_list = nvm_dev_dma_alloc(rrpc->dev, GFP_KERNEL,
+		rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
 							&rqd->dma_ppa_list);
 		if (!rqd->ppa_list) {
 			pr_err("rrpc: not able to allocate ppa list\n");
@@ -867,14 +927,15 @@ static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio,
 static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 				struct nvm_rq *rqd, unsigned long flags)
 {
-	int err;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_rq *rrq = nvm_rq_to_pdu(rqd);
 	uint8_t nr_pages = rrpc_get_pages(bio);
 	int bio_size = bio_sectors(bio) << 9;
+	int err;
 
-	if (bio_size < rrpc->dev->sec_size)
+	if (bio_size < dev->geo.sec_size)
 		return NVM_IO_ERR;
-	else if (bio_size > rrpc->dev->max_rq_size)
+	else if (bio_size > dev->geo.max_rq_size)
 		return NVM_IO_ERR;
 
 	err = rrpc_setup_rq(rrpc, bio, rqd, flags, nr_pages);
@@ -887,15 +948,15 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 	rqd->nr_ppas = nr_pages;
 	rrq->flags = flags;
 
-	err = nvm_submit_io(rrpc->dev, rqd);
+	err = nvm_submit_io(dev->parent, rqd);
 	if (err) {
 		pr_err("rrpc: I/O submission failed: %d\n", err);
 		bio_put(bio);
 		if (!(flags & NVM_IOTYPE_GC)) {
 			rrpc_unlock_rq(rrpc, rqd);
 			if (rqd->nr_ppas > 1)
-				nvm_dev_dma_free(rrpc->dev,
-			rqd->ppa_list, rqd->dma_ppa_list);
+				nvm_dev_dma_free(dev->parent,
+					rqd->ppa_list, rqd->dma_ppa_list);
 		}
 		return NVM_IO_ERR;
 	}
@@ -997,17 +1058,11 @@ static void rrpc_map_free(struct rrpc *rrpc)
 static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
 {
 	struct rrpc *rrpc = (struct rrpc *)private;
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_addr *addr = rrpc->trans_map + slba;
 	struct rrpc_rev_addr *raddr = rrpc->rev_trans_map;
-	u64 elba = slba + nlb;
 	u64 i;
 
-	if (unlikely(elba > dev->total_secs)) {
-		pr_err("nvm: L2P data from device is out of bounds!\n");
-		return -EINVAL;
-	}
-
 	for (i = 0; i < nlb; i++) {
 		u64 pba = le64_to_cpu(entries[i]);
 		unsigned int mod;
@@ -1037,7 +1092,7 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
 
 static int rrpc_map_init(struct rrpc *rrpc)
 {
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	sector_t i;
 	int ret;
 
@@ -1062,7 +1117,7 @@ static int rrpc_map_init(struct rrpc *rrpc)
 		return 0;
 
 	/* Bring up the mapping table from device */
-	ret = dev->ops->get_l2p_tbl(dev, rrpc->soffset, rrpc->nr_sects,
+	ret = dev->ops->get_l2p_tbl(dev->parent, rrpc->soffset, rrpc->nr_sects,
 					rrpc_l2p_update, rrpc);
 	if (ret) {
 		pr_err("nvm: rrpc: could not read L2P table.\n");
@@ -1102,7 +1157,7 @@ static int rrpc_core_init(struct rrpc *rrpc)
 	if (!rrpc->page_pool)
 		return -ENOMEM;
 
-	rrpc->gcb_pool = mempool_create_slab_pool(rrpc->dev->nr_luns,
+	rrpc->gcb_pool = mempool_create_slab_pool(rrpc->dev->geo.nr_luns,
 								rrpc_gcb_cache);
 	if (!rrpc->gcb_pool)
 		return -ENOMEM;
@@ -1146,11 +1201,12 @@ static void rrpc_luns_free(struct rrpc *rrpc)
 
 static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 {
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
+	struct nvm_geo *geo = &dev->geo;
 	struct rrpc_lun *rlun;
 	int i, j, ret = -EINVAL;
 
-	if (dev->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
+	if (geo->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
 		pr_err("rrpc: number of pages per block too high.");
 		return -EINVAL;
 	}
@@ -1167,20 +1223,20 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 		int lunid = lun_begin + i;
 		struct nvm_lun *lun;
 
-		lun = dev->mt->get_lun(dev, lunid);
+		lun = dev->mt->get_lun(dev->parent, lunid);
 		if (!lun)
 			goto err;
 
 		rlun = &rrpc->luns[i];
 		rlun->parent = lun;
 		rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
-						rrpc->dev->blks_per_lun);
+							geo->blks_per_lun);
 		if (!rlun->blocks) {
 			ret = -ENOMEM;
 			goto err;
 		}
 
-		for (j = 0; j < rrpc->dev->blks_per_lun; j++) {
+		for (j = 0; j < geo->blks_per_lun; j++) {
 			struct rrpc_block *rblk = &rlun->blocks[j];
 			struct nvm_block *blk = &lun->blocks[j];
 
@@ -1190,6 +1246,8 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 			spin_lock_init(&rblk->lock);
 		}
 
+		rlun->reserved_blocks = 2; /* for GC only */
+
 		rlun->rrpc = rrpc;
 		INIT_LIST_HEAD(&rlun->prio_list);
 		INIT_LIST_HEAD(&rlun->wblk_list);
@@ -1206,27 +1264,27 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 /* returns 0 on success and stores the beginning address in *begin */
 static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
 {
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct nvmm_type *mt = dev->mt;
-	sector_t size = rrpc->nr_sects * dev->sec_size;
+	sector_t size = rrpc->nr_sects * dev->geo.sec_size;
 	int ret;
 
 	size >>= 9;
 
-	ret = mt->get_area(dev, begin, size);
+	ret = mt->get_area(dev->parent, begin, size);
 	if (!ret)
-		*begin >>= (ilog2(dev->sec_size) - 9);
+		*begin >>= (ilog2(dev->geo.sec_size) - 9);
 
 	return ret;
 }
 
 static void rrpc_area_free(struct rrpc *rrpc)
 {
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct nvmm_type *mt = dev->mt;
-	sector_t begin = rrpc->soffset << (ilog2(dev->sec_size) - 9);
+	sector_t begin = rrpc->soffset << (ilog2(dev->geo.sec_size) - 9);
 
-	mt->put_area(dev, begin);
+	mt->put_area(dev->parent, begin);
 }
 
 static void rrpc_free(struct rrpc *rrpc)
@@ -1255,11 +1313,11 @@ static void rrpc_exit(void *private)
 static sector_t rrpc_capacity(void *private)
 {
 	struct rrpc *rrpc = private;
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	sector_t reserved, provisioned;
 
 	/* cur, gc, and two emergency blocks for each lun */
-	reserved = rrpc->nr_luns * dev->sec_per_blk * 4;
+	reserved = rrpc->nr_luns * dev->geo.sec_per_blk * 4;
 	provisioned = rrpc->nr_sects - reserved;
 
 	if (reserved > rrpc->nr_sects) {
@@ -1278,13 +1336,13 @@ static sector_t rrpc_capacity(void *private)
  */
 static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
-	struct nvm_dev *dev = rrpc->dev;
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	int offset;
 	struct rrpc_addr *laddr;
 	u64 bpaddr, paddr, pladdr;
 
 	bpaddr = block_to_rel_addr(rrpc, rblk);
-	for (offset = 0; offset < dev->sec_per_blk; offset++) {
+	for (offset = 0; offset < dev->geo.sec_per_blk; offset++) {
 		paddr = bpaddr + offset;
 
 		pladdr = rrpc->rev_trans_map[paddr].addr;
@@ -1304,6 +1362,7 @@ static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk)
 
 static int rrpc_blocks_init(struct rrpc *rrpc)
 {
+	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_lun *rlun;
 	struct rrpc_block *rblk;
 	int lun_iter, blk_iter;
@@ -1311,7 +1370,7 @@ static int rrpc_blocks_init(struct rrpc *rrpc)
 	for (lun_iter = 0; lun_iter < rrpc->nr_luns; lun_iter++) {
 		rlun = &rrpc->luns[lun_iter];
 
-		for (blk_iter = 0; blk_iter < rrpc->dev->blks_per_lun;
+		for (blk_iter = 0; blk_iter < dev->geo.blks_per_lun;
 								blk_iter++) {
 			rblk = &rlun->blocks[blk_iter];
 			rrpc_block_map_update(rrpc, rblk);
@@ -1350,11 +1409,12 @@ static int rrpc_luns_configure(struct rrpc *rrpc)
 
 static struct nvm_tgt_type tt_rrpc;
 
-static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
+static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 						int lun_begin, int lun_end)
 {
 	struct request_queue *bqueue = dev->q;
 	struct request_queue *tqueue = tdisk->queue;
+	struct nvm_geo *geo = &dev->geo;
 	struct rrpc *rrpc;
 	sector_t soffset;
 	int ret;
@@ -1377,8 +1437,8 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
 	spin_lock_init(&rrpc->bio_lock);
 	INIT_WORK(&rrpc->ws_requeue, rrpc_requeue);
 
-	rrpc->nr_luns = lun_end - lun_begin + 1;
-	rrpc->nr_sects = (unsigned long long)dev->sec_per_lun * rrpc->nr_luns;
+	rrpc->nr_luns = geo->nr_luns;
+	rrpc->nr_sects = (unsigned long long)geo->sec_per_lun * rrpc->nr_luns;
 
 	/* simple round-robin strategy */
 	atomic_set(&rrpc->next_lun, -1);
@@ -1396,7 +1456,7 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk,
 		goto err;
 	}
 
-	rrpc->poffset = dev->sec_per_lun * lun_begin;
+	rrpc->poffset = geo->sec_per_lun * lun_begin;
 
 	ret = rrpc_core_init(rrpc);
 	if (ret) {
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index bed49baa784101..242d4c109eb660 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -52,9 +52,12 @@ struct rrpc_rq {
 };
 
 struct rrpc_block {
+	unsigned long id;
 	struct nvm_block *parent;
 	struct rrpc_lun *rlun;
-	struct list_head prio;
+
+	struct list_head prio;		/* LUN CG list */
+	struct list_head list;		/* LUN free, used, bb list */
 
 #define MAX_INVALID_PAGES_STORAGE 8
 	/* Bitmap for invalid page intries */
@@ -64,6 +67,8 @@ struct rrpc_block {
 	/* number of pages that are invalid, wrt host page size */
 	unsigned int nr_invalid_pages;
 
+	int state;
+
 	spinlock_t lock;
 	atomic_t data_cmnt_size; /* data pages committed to stable storage */
 };
@@ -71,6 +76,7 @@ struct rrpc_block {
 struct rrpc_lun {
 	struct rrpc *rrpc;
 	struct nvm_lun *parent;
+
 	struct rrpc_block *cur, *gc_cur;
 	struct rrpc_block *blocks;	/* Reference to block allocation */
 
@@ -79,6 +85,8 @@ struct rrpc_lun {
 
 	struct work_struct ws_gc;
 
+	int reserved_blocks;
+
 	spinlock_t lock;
 };
 
@@ -86,7 +94,7 @@ struct rrpc {
 	/* instance must be kept in top to resolve rrpc in unprep */
 	struct nvm_tgt_instance instance;
 
-	struct nvm_dev *dev;
+	struct nvm_tgt_dev *dev;
 	struct gendisk *disk;
 
 	sector_t soffset; /* logical sector offset */
@@ -151,7 +159,8 @@ static inline struct rrpc_block *rrpc_get_rblk(struct rrpc_lun *rlun,
 								int blk_id)
 {
 	struct rrpc *rrpc = rlun->rrpc;
-	int lun_blk = blk_id % rrpc->dev->blks_per_lun;
+	struct nvm_tgt_dev *dev = rrpc->dev;
+	int lun_blk = blk_id % dev->geo.blks_per_lun;
 
 	return &rlun->blocks[lun_blk];
 }
diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c
index fa644afb25de91..12002bf4efc222 100644
--- a/drivers/lightnvm/sysblk.c
+++ b/drivers/lightnvm/sysblk.c
@@ -62,7 +62,8 @@ static void nvm_cpu_to_sysblk(struct nvm_system_block *sb,
 
 static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
 {
-	int nr_rows = min_t(int, MAX_SYSBLKS, dev->nr_chnls);
+	struct nvm_geo *geo = &dev->geo;
+	int nr_rows = min_t(int, MAX_SYSBLKS, geo->nr_chnls);
 	int i;
 
 	for (i = 0; i < nr_rows; i++)
@@ -71,7 +72,7 @@ static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
 	/* if possible, place sysblk at first channel, middle channel and last
 	 * channel of the device. If not, create only one or two sys blocks
 	 */
-	switch (dev->nr_chnls) {
+	switch (geo->nr_chnls) {
 	case 2:
 		sysblk_ppas[1].g.ch = 1;
 		/* fall-through */
@@ -80,8 +81,8 @@ static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas)
 		break;
 	default:
 		sysblk_ppas[0].g.ch = 0;
-		sysblk_ppas[1].g.ch = dev->nr_chnls / 2;
-		sysblk_ppas[2].g.ch = dev->nr_chnls - 1;
+		sysblk_ppas[1].g.ch = geo->nr_chnls / 2;
+		sysblk_ppas[2].g.ch = geo->nr_chnls - 1;
 		break;
 	}
 
@@ -162,11 +163,12 @@ static int sysblk_get_host_blks(struct nvm_dev *dev, struct ppa_addr ppa,
 static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
 				struct ppa_addr *ppas, int get_free)
 {
+	struct nvm_geo *geo = &dev->geo;
 	int i, nr_blks, ret = 0;
 	u8 *blks;
 
 	s->nr_ppas = 0;
-	nr_blks = dev->blks_per_lun * dev->plane_mode;
+	nr_blks = geo->blks_per_lun * geo->plane_mode;
 
 	blks = kmalloc(nr_blks, GFP_KERNEL);
 	if (!blks)
@@ -210,13 +212,14 @@ static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s,
 static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
 						struct nvm_system_block *sblk)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct nvm_system_block *cur;
 	int pg, ret, found = 0;
 
 	/* the full buffer for a flash page is allocated. Only the first of it
 	 * contains the system block information
 	 */
-	cur = kmalloc(dev->pfpg_size, GFP_KERNEL);
+	cur = kmalloc(geo->pfpg_size, GFP_KERNEL);
 	if (!cur)
 		return -ENOMEM;
 
@@ -225,7 +228,7 @@ static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa,
 		ppa->g.pg = ppa_to_slc(dev, pg);
 
 		ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE,
-							cur, dev->pfpg_size);
+							cur, geo->pfpg_size);
 		if (ret) {
 			if (ret == NVM_RSP_ERR_EMPTYPAGE) {
 				pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n",
@@ -276,6 +279,7 @@ static int nvm_sysblk_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s,
 static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
 							struct sysblk_scan *s)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct nvm_system_block nvmsb;
 	void *buf;
 	int i, sect, ret = 0;
@@ -283,12 +287,12 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
 
 	nvm_cpu_to_sysblk(&nvmsb, info);
 
-	buf = kzalloc(dev->pfpg_size, GFP_KERNEL);
+	buf = kzalloc(geo->pfpg_size, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 	memcpy(buf, &nvmsb, sizeof(struct nvm_system_block));
 
-	ppas = kcalloc(dev->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL);
+	ppas = kcalloc(geo->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL);
 	if (!ppas) {
 		ret = -ENOMEM;
 		goto err;
@@ -305,15 +309,15 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
 							ppas[0].g.pg);
 
 		/* Expand to all sectors within a flash page */
-		if (dev->sec_per_pg > 1) {
-			for (sect = 1; sect < dev->sec_per_pg; sect++) {
+		if (geo->sec_per_pg > 1) {
+			for (sect = 1; sect < geo->sec_per_pg; sect++) {
 				ppas[sect].ppa = ppas[0].ppa;
 				ppas[sect].g.sec = sect;
 			}
 		}
 
-		ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PWRITE,
-					NVM_IO_SLC_MODE, buf, dev->pfpg_size);
+		ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PWRITE,
+					NVM_IO_SLC_MODE, buf, geo->pfpg_size);
 		if (ret) {
 			pr_err("nvm: sysblk failed program (%u %u %u)\n",
 							ppas[0].g.ch,
@@ -322,8 +326,8 @@ static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info,
 			break;
 		}
 
-		ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PREAD,
-					NVM_IO_SLC_MODE, buf, dev->pfpg_size);
+		ret = nvm_submit_ppa(dev, ppas, geo->sec_per_pg, NVM_OP_PREAD,
+					NVM_IO_SLC_MODE, buf, geo->pfpg_size);
 		if (ret) {
 			pr_err("nvm: sysblk failed read (%u %u %u)\n",
 							ppas[0].g.ch,
@@ -527,6 +531,7 @@ int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new)
 
 int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
 	struct sysblk_scan s;
 	int ret;
@@ -541,7 +546,7 @@ int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info)
 	if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl)
 		return -EINVAL;
 
-	if (!(dev->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) {
+	if (!(geo->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) {
 		pr_err("nvm: memory does not support SLC access\n");
 		return -EINVAL;
 	}
@@ -571,11 +576,11 @@ static int factory_nblks(int nblks)
 	return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
 }
 
-static unsigned int factory_blk_offset(struct nvm_dev *dev, struct ppa_addr ppa)
+static unsigned int factory_blk_offset(struct nvm_geo *geo, struct ppa_addr ppa)
 {
-	int nblks = factory_nblks(dev->blks_per_lun);
+	int nblks = factory_nblks(geo->blks_per_lun);
 
-	return ((ppa.g.ch * dev->luns_per_chnl * nblks) + (ppa.g.lun * nblks)) /
+	return ((ppa.g.ch * geo->luns_per_chnl * nblks) + (ppa.g.lun * nblks)) /
 								BITS_PER_LONG;
 }
 
@@ -589,7 +594,7 @@ static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa,
 	if (nr_blks < 0)
 		return nr_blks;
 
-	lunoff = factory_blk_offset(dev, ppa);
+	lunoff = factory_blk_offset(&dev->geo, ppa);
 
 	/* non-set bits correspond to the block must be erased */
 	for (i = 0; i < nr_blks; i++) {
@@ -618,19 +623,19 @@ static int nvm_factory_blks(struct nvm_dev *dev, struct ppa_addr ppa,
 static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list,
 					int max_ppas, unsigned long *blk_bitmap)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr ppa;
 	int ch, lun, blkid, idx, done = 0, ppa_cnt = 0;
 	unsigned long *offset;
 
 	while (!done) {
 		done = 1;
-		nvm_for_each_lun_ppa(dev, ppa, ch, lun) {
-			idx = factory_blk_offset(dev, ppa);
+		nvm_for_each_lun_ppa(geo, ppa, ch, lun) {
+			idx = factory_blk_offset(geo, ppa);
 			offset = &blk_bitmap[idx];
 
-			blkid = find_first_zero_bit(offset,
-						dev->blks_per_lun);
-			if (blkid >= dev->blks_per_lun)
+			blkid = find_first_zero_bit(offset, geo->blks_per_lun);
+			if (blkid >= geo->blks_per_lun)
 				continue;
 			set_bit(blkid, offset);
 
@@ -655,16 +660,17 @@ static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list,
 static int nvm_fact_select_blks(struct nvm_dev *dev, unsigned long *blk_bitmap,
 								int flags)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr ppa;
 	int ch, lun, nr_blks, ret = 0;
 	u8 *blks;
 
-	nr_blks = dev->blks_per_lun * dev->plane_mode;
+	nr_blks = geo->blks_per_lun * geo->plane_mode;
 	blks = kmalloc(nr_blks, GFP_KERNEL);
 	if (!blks)
 		return -ENOMEM;
 
-	nvm_for_each_lun_ppa(dev, ppa, ch, lun) {
+	nvm_for_each_lun_ppa(geo, ppa, ch, lun) {
 		ret = nvm_get_bb_tbl(dev, ppa, blks);
 		if (ret)
 			pr_err("nvm: failed bb tbl for ch%u lun%u\n",
@@ -682,14 +688,15 @@ static int nvm_fact_select_blks(struct nvm_dev *dev, unsigned long *blk_bitmap,
 
 int nvm_dev_factory(struct nvm_dev *dev, int flags)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr *ppas;
 	int ppa_cnt, ret = -ENOMEM;
-	int max_ppas = dev->ops->max_phys_sect / dev->nr_planes;
+	int max_ppas = dev->ops->max_phys_sect / geo->nr_planes;
 	struct ppa_addr sysblk_ppas[MAX_SYSBLKS];
 	struct sysblk_scan s;
 	unsigned long *blk_bitmap;
 
-	blk_bitmap = kzalloc(factory_nblks(dev->blks_per_lun) * dev->nr_luns,
+	blk_bitmap = kzalloc(factory_nblks(geo->blks_per_lun) * geo->nr_luns,
 								GFP_KERNEL);
 	if (!blk_bitmap)
 		return ret;
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 037dff5951d433..1cdc8124c8c06a 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -352,6 +352,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
 
 	while (nlb) {
 		u32 cmd_nlb = min(nlb_pr_rq, nlb);
+		u64 elba = slba + cmd_nlb;
 
 		c.l2p.slba = cpu_to_le64(cmd_slba);
 		c.l2p.nlb = cpu_to_le32(cmd_nlb);
@@ -365,6 +366,11 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
 			goto out;
 		}
 
+		if (unlikely(elba > nvmdev->total_secs)) {
+			pr_err("nvm: L2P data from device is out of bounds!\n");
+			return -EINVAL;
+		}
+
 		if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
 			ret = -EINTR;
 			goto out;
@@ -383,11 +389,12 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 								u8 *blks)
 {
 	struct request_queue *q = nvmdev->q;
+	struct nvm_geo *geo = &nvmdev->geo;
 	struct nvme_ns *ns = q->queuedata;
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_nvm_command c = {};
 	struct nvme_nvm_bb_tbl *bb_tbl;
-	int nr_blks = nvmdev->blks_per_lun * nvmdev->plane_mode;
+	int nr_blks = geo->blks_per_lun * geo->plane_mode;
 	int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
 	int ret = 0;
 
@@ -428,7 +435,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 		goto out;
 	}
 
-	memcpy(blks, bb_tbl->blk, nvmdev->blks_per_lun * nvmdev->plane_mode);
+	memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode);
 out:
 	kfree(bb_tbl);
 	return ret;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 89c695483d551c..1f1588c2557e2f 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -211,7 +211,7 @@ struct nvm_id {
 struct nvm_target {
 	struct list_head list;
 	struct list_head lun_list;
-	struct nvm_dev *dev;
+	struct nvm_tgt_dev *dev;
 	struct nvm_tgt_type *type;
 	struct gendisk *disk;
 };
@@ -286,7 +286,6 @@ struct nvm_lun {
 					 * free_list and used_list
 					 */
 	unsigned int nr_free_blocks;	/* Number of unused blocks */
-	int reserved_blocks;
 
 	struct nvm_block *blocks;
 };
@@ -315,22 +314,12 @@ struct nvm_sb_info {
 	struct ppa_addr		fs_ppa;
 };
 
-struct nvm_dev {
-	struct nvm_dev_ops *ops;
-
-	struct list_head devices;
-
-	/* Media manager */
-	struct nvmm_type *mt;
-	void *mp;
-
-	/* System blocks */
-	struct nvm_sb_info sb;
-
-	/* Device information */
+/* Device generic information */
+struct nvm_geo {
 	int nr_chnls;
+	int nr_luns;
+	int luns_per_chnl; /* -1 if channels are not symmetric */
 	int nr_planes;
-	int luns_per_chnl;
 	int sec_per_pg; /* only sectors for a single page */
 	int pgs_per_blk;
 	int blks_per_lun;
@@ -350,14 +339,43 @@ struct nvm_dev {
 	int sec_per_pl; /* all sectors across planes */
 	int sec_per_blk;
 	int sec_per_lun;
+};
+
+struct nvm_tgt_dev {
+	/* Device information */
+	struct nvm_geo geo;
+
+	sector_t total_secs;
+
+	struct nvm_id identity;
+	struct request_queue *q;
+
+	struct nvmm_type *mt;
+	struct nvm_dev_ops *ops;
+
+	void *parent;
+};
+
+struct nvm_dev {
+	struct nvm_dev_ops *ops;
+
+	struct list_head devices;
+
+	/* Media manager */
+	struct nvmm_type *mt;
+	void *mp;
+
+	/* System blocks */
+	struct nvm_sb_info sb;
+
+	/* Device information */
+	struct nvm_geo geo;
 
 	/* lower page table */
 	int lps_per_blk;
 	int *lptbl;
 
-	unsigned long total_blocks;
 	unsigned long total_secs;
-	int nr_luns;
 
 	unsigned long *lun_map;
 	void *dma_pool;
@@ -373,7 +391,7 @@ struct nvm_dev {
 	spinlock_t lock;
 };
 
-static inline struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev,
+static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
 							struct ppa_addr r)
 {
 	struct ppa_addr l;
@@ -382,22 +400,22 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev,
 
 	l.ppa = 0;
 
-	div_u64_rem(ppa, dev->sec_per_pg, &secs);
+	div_u64_rem(ppa, geo->sec_per_pg, &secs);
 	l.g.sec = secs;
 
-	sector_div(ppa, dev->sec_per_pg);
-	div_u64_rem(ppa, dev->pgs_per_blk, &pgs);
+	sector_div(ppa, geo->sec_per_pg);
+	div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
 	l.g.pg = pgs;
 
-	sector_div(ppa, dev->pgs_per_blk);
-	div_u64_rem(ppa, dev->blks_per_lun, &blks);
+	sector_div(ppa, geo->pgs_per_blk);
+	div_u64_rem(ppa, geo->blks_per_lun, &blks);
 	l.g.blk = blks;
 
-	sector_div(ppa, dev->blks_per_lun);
-	div_u64_rem(ppa, dev->luns_per_chnl, &luns);
+	sector_div(ppa, geo->blks_per_lun);
+	div_u64_rem(ppa, geo->luns_per_chnl, &luns);
 	l.g.lun = luns;
 
-	sector_div(ppa, dev->luns_per_chnl);
+	sector_div(ppa, geo->luns_per_chnl);
 	l.g.ch = ppa;
 
 	return l;
@@ -406,14 +424,15 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev,
 static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
 						struct ppa_addr r)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr l;
 
-	l.ppa = ((u64)r.g.blk) << dev->ppaf.blk_offset;
-	l.ppa |= ((u64)r.g.pg) << dev->ppaf.pg_offset;
-	l.ppa |= ((u64)r.g.sec) << dev->ppaf.sect_offset;
-	l.ppa |= ((u64)r.g.pl) << dev->ppaf.pln_offset;
-	l.ppa |= ((u64)r.g.lun) << dev->ppaf.lun_offset;
-	l.ppa |= ((u64)r.g.ch) << dev->ppaf.ch_offset;
+	l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset;
+	l.ppa |= ((u64)r.g.pg) << geo->ppaf.pg_offset;
+	l.ppa |= ((u64)r.g.sec) << geo->ppaf.sect_offset;
+	l.ppa |= ((u64)r.g.pl) << geo->ppaf.pln_offset;
+	l.ppa |= ((u64)r.g.lun) << geo->ppaf.lun_offset;
+	l.ppa |= ((u64)r.g.ch) << geo->ppaf.ch_offset;
 
 	return l;
 }
@@ -421,24 +440,25 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev,
 static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev,
 						struct ppa_addr r)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr l;
 
 	l.ppa = 0;
 	/*
 	 * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc.
 	 */
-	l.g.blk = (r.ppa >> dev->ppaf.blk_offset) &
-					(((1 << dev->ppaf.blk_len) - 1));
-	l.g.pg |= (r.ppa >> dev->ppaf.pg_offset) &
-					(((1 << dev->ppaf.pg_len) - 1));
-	l.g.sec |= (r.ppa >> dev->ppaf.sect_offset) &
-					(((1 << dev->ppaf.sect_len) - 1));
-	l.g.pl |= (r.ppa >> dev->ppaf.pln_offset) &
-					(((1 << dev->ppaf.pln_len) - 1));
-	l.g.lun |= (r.ppa >> dev->ppaf.lun_offset) &
-					(((1 << dev->ppaf.lun_len) - 1));
-	l.g.ch |= (r.ppa >> dev->ppaf.ch_offset) &
-					(((1 << dev->ppaf.ch_len) - 1));
+	l.g.blk = (r.ppa >> geo->ppaf.blk_offset) &
+					(((1 << geo->ppaf.blk_len) - 1));
+	l.g.pg |= (r.ppa >> geo->ppaf.pg_offset) &
+					(((1 << geo->ppaf.pg_len) - 1));
+	l.g.sec |= (r.ppa >> geo->ppaf.sect_offset) &
+					(((1 << geo->ppaf.sect_len) - 1));
+	l.g.pl |= (r.ppa >> geo->ppaf.pln_offset) &
+					(((1 << geo->ppaf.pln_len) - 1));
+	l.g.lun |= (r.ppa >> geo->ppaf.lun_offset) &
+					(((1 << geo->ppaf.lun_len) - 1));
+	l.g.ch |= (r.ppa >> geo->ppaf.ch_offset) &
+					(((1 << geo->ppaf.ch_len) - 1));
 
 	return l;
 }
@@ -456,11 +476,12 @@ static inline void ppa_set_empty(struct ppa_addr *ppa_addr)
 static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev,
 							struct nvm_block *blk)
 {
+	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr ppa;
 	struct nvm_lun *lun = blk->lun;
 
 	ppa.ppa = 0;
-	ppa.g.blk = blk->id % dev->blks_per_lun;
+	ppa.g.blk = blk->id % geo->blks_per_lun;
 	ppa.g.lun = lun->lun_id;
 	ppa.g.ch = lun->chnl_id;
 
@@ -483,7 +504,8 @@ static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg)
 
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
-typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int);
+typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, int,
+				int);
 typedef void (nvm_tgt_exit_fn)(void *);
 
 struct nvm_tgt_type {
@@ -516,9 +538,6 @@ typedef void (nvmm_unregister_fn)(struct nvm_dev *);
 
 typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *);
 typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
-typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *,
-					      struct nvm_lun *, unsigned long);
-typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int);
 typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
@@ -538,10 +557,6 @@ struct nvmm_type {
 	nvmm_create_tgt_fn *create_tgt;
 	nvmm_remove_tgt_fn *remove_tgt;
 
-	/* Block administration callbacks */
-	nvmm_get_blk_fn *get_blk;
-	nvmm_put_blk_fn *put_blk;
-
 	nvmm_submit_io_fn *submit_io;
 	nvmm_erase_blk_fn *erase_blk;
 
@@ -563,10 +578,6 @@ struct nvmm_type {
 extern int nvm_register_mgr(struct nvmm_type *);
 extern void nvm_unregister_mgr(struct nvmm_type *);
 
-extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *,
-								unsigned long);
-extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *);
-
 extern struct nvm_dev *nvm_alloc_dev(int);
 extern int nvm_register(struct nvm_dev *);
 extern void nvm_unregister(struct nvm_dev *);
@@ -611,10 +622,10 @@ extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *);
 
 extern int nvm_dev_factory(struct nvm_dev *, int flags);
 
-#define nvm_for_each_lun_ppa(dev, ppa, chid, lunid)			\
-	for ((chid) = 0, (ppa).ppa = 0; (chid) < (dev)->nr_chnls;	\
+#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid)			\
+	for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls;	\
 					(chid)++, (ppa).g.ch = (chid))	\
-		for ((lunid) = 0; (lunid) < (dev)->luns_per_chnl;	\
+		for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl;	\
 					(lunid)++, (ppa).g.lun = (lunid))
 
 #else /* CONFIG_NVM */

From 0ac4072eb10c9627415eb1ca511121156e20012c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:07 +0100
Subject: [PATCH 132/182] lightnvm: remove get_lun operation on gennvm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since LUNs are managed internally on the target, there is no need for
the media manager to implement a get_lun operation.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c | 13 +------------
 drivers/lightnvm/rrpc.c   | 23 +++++++++++------------
 include/linux/lightnvm.h  |  8 ++------
 3 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 8791a2aaa9e35e..3cf5d5947070ac 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -159,7 +159,7 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	tdisk->fops = &gen_fops;
 	tdisk->queue = tqueue;
 
-	targetdata = tt->init(tgt_dev, tdisk, s->lun_begin, s->lun_end);
+	targetdata = tt->init(tgt_dev, tdisk, &t->lun_list);
 	if (IS_ERR(targetdata))
 		goto err_init;
 
@@ -613,16 +613,6 @@ static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, int flags)
 	return nvm_erase_ppa(dev, &addr, 1, flags);
 }
 
-static struct nvm_lun *gen_get_lun(struct nvm_dev *dev, int lunid)
-{
-	struct gen_dev *gn = dev->mp;
-
-	if (unlikely(lunid >= dev->geo.nr_luns))
-		return NULL;
-
-	return &gn->luns[lunid];
-}
-
 static void gen_lun_info_print(struct nvm_dev *dev)
 {
 	struct gen_dev *gn = dev->mp;
@@ -655,7 +645,6 @@ static struct nvmm_type gen = {
 
 	.mark_blk		= gen_mark_blk,
 
-	.get_lun		= gen_get_lun,
 	.lun_info_print		= gen_lun_info_print,
 
 	.get_area		= gen_get_area,
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 5377c7a987aac8..165b9d39649350 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -1199,10 +1199,11 @@ static void rrpc_luns_free(struct rrpc *rrpc)
 	kfree(rrpc->luns);
 }
 
-static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
+static int rrpc_luns_init(struct rrpc *rrpc, struct list_head *lun_list)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct nvm_geo *geo = &dev->geo;
+	struct nvm_lun *lun;
 	struct rrpc_lun *rlun;
 	int i, j, ret = -EINVAL;
 
@@ -1218,16 +1219,11 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 	if (!rrpc->luns)
 		return -ENOMEM;
 
-	/* 1:1 mapping */
-	for (i = 0; i < rrpc->nr_luns; i++) {
-		int lunid = lun_begin + i;
-		struct nvm_lun *lun;
+	i = 0;
 
-		lun = dev->mt->get_lun(dev->parent, lunid);
-		if (!lun)
-			goto err;
-
-		rlun = &rrpc->luns[i];
+	/* 1:1 mapping */
+	list_for_each_entry(lun, lun_list, list) {
+		rlun = &rrpc->luns[i++];
 		rlun->parent = lun;
 		rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
 							geo->blks_per_lun);
@@ -1256,6 +1252,8 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end)
 		spin_lock_init(&rlun->lock);
 	}
 
+	WARN_ON(i != rrpc->nr_luns);
+
 	return 0;
 err:
 	return ret;
@@ -1410,12 +1408,13 @@ static int rrpc_luns_configure(struct rrpc *rrpc)
 static struct nvm_tgt_type tt_rrpc;
 
 static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
-						int lun_begin, int lun_end)
+						struct list_head *lun_list)
 {
 	struct request_queue *bqueue = dev->q;
 	struct request_queue *tqueue = tdisk->queue;
 	struct nvm_geo *geo = &dev->geo;
 	struct rrpc *rrpc;
+	int lun_begin = (list_first_entry(lun_list, struct nvm_lun, list))->id;
 	sector_t soffset;
 	int ret;
 
@@ -1450,7 +1449,7 @@ static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 	}
 	rrpc->soffset = soffset;
 
-	ret = rrpc_luns_init(rrpc, lun_begin, lun_end);
+	ret = rrpc_luns_init(rrpc, lun_list);
 	if (ret) {
 		pr_err("nvm: rrpc: could not initialize luns\n");
 		goto err;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 1f1588c2557e2f..e56c352272499f 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -504,8 +504,8 @@ static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg)
 
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
-typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, int,
-				int);
+typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
+				struct list_head *lun_list);
 typedef void (nvm_tgt_exit_fn)(void *);
 
 struct nvm_tgt_type {
@@ -541,7 +541,6 @@ typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int);
 typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
-typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int);
 typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *);
 
 typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
@@ -563,9 +562,6 @@ struct nvmm_type {
 	/* Bad block mgmt */
 	nvmm_mark_blk_fn *mark_blk;
 
-	/* Configuration management */
-	nvmm_get_lun_fn *get_lun;
-
 	/* Statistics */
 	nvmm_lun_info_print_fn *lun_info_print;
 

From eec44565e9ab13bbf5b48864a68871eabf1115c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:08 +0100
Subject: [PATCH 133/182] lightnvm: remove debug lun statistics from gennvm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since LUNs are managed internally on targets, the media manager has no
access to the free LUN lists. Thus, debug functions that show LUN
information on the device should not be implemented on the media
manager, but rather on the target in itself.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/gennvm.c | 19 -------------------
 include/linux/lightnvm.h  |  5 -----
 2 files changed, 24 deletions(-)

diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 3cf5d5947070ac..dd9afd7ade1655 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -613,23 +613,6 @@ static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, int flags)
 	return nvm_erase_ppa(dev, &addr, 1, flags);
 }
 
-static void gen_lun_info_print(struct nvm_dev *dev)
-{
-	struct gen_dev *gn = dev->mp;
-	struct nvm_lun *lun;
-	unsigned int i;
-
-
-	gen_for_each_lun(gn, lun, i) {
-		spin_lock(&lun->lock);
-
-		pr_info("%s: lun%8u\t%u\n", dev->name, i,
-						lun->nr_free_blocks);
-
-		spin_unlock(&lun->lock);
-	}
-}
-
 static struct nvmm_type gen = {
 	.name			= "gennvm",
 	.version		= {0, 1, 0},
@@ -645,8 +628,6 @@ static struct nvmm_type gen = {
 
 	.mark_blk		= gen_mark_blk,
 
-	.lun_info_print		= gen_lun_info_print,
-
 	.get_area		= gen_get_area,
 	.put_area		= gen_put_area,
 
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index e56c352272499f..ed04fa642371bc 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -541,8 +541,6 @@ typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int);
 typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
-typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *);
-
 typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
 typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t);
 
@@ -562,9 +560,6 @@ struct nvmm_type {
 	/* Bad block mgmt */
 	nvmm_mark_blk_fn *mark_blk;
 
-	/* Statistics */
-	nvmm_lun_info_print_fn *lun_info_print;
-
 	nvmm_get_area_fn *get_area;
 	nvmm_put_area_fn *put_area;
 

From 2a02e627c245bfa987b97707123d7747d7b0e486 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:09 +0100
Subject: [PATCH 134/182] lightnvm: eliminate nvm_block abstraction on mm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to naturally support multi-target instances on an Open-Channel
SSD, targets should own the LUNs they get blocks from and manage
provisioning internally. This is done in several steps.

A part of this transformation is that targets manage their blocks
internally. This patch eliminates the nvm_block abstraction and moves
block management to the target logic. The rrpc target is transformed.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c   |  10 +-
 drivers/lightnvm/gennvm.c | 205 +--------------------------------
 drivers/lightnvm/rrpc.c   | 234 +++++++++++++++++++++++++++-----------
 drivers/lightnvm/rrpc.h   |  23 ++--
 include/linux/lightnvm.h  |  48 +-------
 5 files changed, 191 insertions(+), 329 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 493ce034e68769..691b16ffda88f0 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -176,12 +176,6 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
 	return NULL;
 }
 
-void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
-{
-	return dev->mt->mark_blk(dev, ppa, type);
-}
-EXPORT_SYMBOL(nvm_mark_blk);
-
 int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
 								int type)
 {
@@ -215,9 +209,9 @@ int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_submit_io);
 
-int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, int flags)
+int nvm_erase_blk(struct nvm_dev *dev, struct ppa_addr *p, int flags)
 {
-	return dev->mt->erase_blk(dev, blk, flags);
+	return dev->mt->erase_blk(dev, p, flags);
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index dd9afd7ade1655..19c3924ffa1095 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -306,19 +306,6 @@ static void gen_put_area(struct nvm_dev *dev, sector_t begin)
 	spin_unlock(&dev->lock);
 }
 
-static void gen_blocks_free(struct nvm_dev *dev)
-{
-	struct gen_dev *gn = dev->mp;
-	struct nvm_lun *lun;
-	int i;
-
-	gen_for_each_lun(gn, lun, i) {
-		if (!lun->blocks)
-			break;
-		vfree(lun->blocks);
-	}
-}
-
 static void gen_luns_free(struct nvm_dev *dev)
 {
 	struct gen_dev *gn = dev->mp;
@@ -337,167 +324,17 @@ static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn)
 		return -ENOMEM;
 
 	gen_for_each_lun(gn, lun, i) {
-		INIT_LIST_HEAD(&lun->free_list);
-		INIT_LIST_HEAD(&lun->used_list);
-		INIT_LIST_HEAD(&lun->bb_list);
 		INIT_LIST_HEAD(&lun->list);
 
-		spin_lock_init(&lun->lock);
-
 		lun->id = i;
 		lun->lun_id = i % geo->luns_per_chnl;
 		lun->chnl_id = i / geo->luns_per_chnl;
-		lun->nr_free_blocks = geo->blks_per_lun;
-	}
-	return 0;
-}
-
-static int gen_block_bb(struct gen_dev *gn, struct ppa_addr ppa,
-							u8 *blks, int nr_blks)
-{
-	struct nvm_dev *dev = gn->dev;
-	struct nvm_lun *lun;
-	struct nvm_block *blk;
-	int i;
-
-	nr_blks = nvm_bb_tbl_fold(dev, blks, nr_blks);
-	if (nr_blks < 0)
-		return nr_blks;
-
-	lun = &gn->luns[(dev->geo.luns_per_chnl * ppa.g.ch) + ppa.g.lun];
-
-	for (i = 0; i < nr_blks; i++) {
-		if (blks[i] == NVM_BLK_T_FREE)
-			continue;
-
-		blk = &lun->blocks[i];
-		list_move_tail(&blk->list, &lun->bb_list);
-		blk->state = NVM_BLK_ST_BAD;
-		lun->nr_free_blocks--;
-	}
-
-	return 0;
-}
-
-static int gen_block_map(u64 slba, u32 nlb, __le64 *entries, void *private)
-{
-	struct nvm_dev *dev = private;
-	struct nvm_geo *geo = &dev->geo;
-	struct gen_dev *gn = dev->mp;
-	u64 elba = slba + nlb;
-	struct nvm_lun *lun;
-	struct nvm_block *blk;
-	u64 i;
-	int lun_id;
-
-	if (unlikely(elba > dev->total_secs)) {
-		pr_err("gen: L2P data from device is out of bounds!\n");
-		return -EINVAL;
-	}
-
-	for (i = 0; i < nlb; i++) {
-		u64 pba = le64_to_cpu(entries[i]);
-
-		if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) {
-			pr_err("gen: L2P data entry is out of bounds!\n");
-			return -EINVAL;
-		}
-
-		/* Address zero is a special one. The first page on a disk is
-		 * protected. It often holds internal device boot
-		 * information.
-		 */
-		if (!pba)
-			continue;
-
-		/* resolve block from physical address */
-		lun_id = div_u64(pba, geo->sec_per_lun);
-		lun = &gn->luns[lun_id];
-
-		/* Calculate block offset into lun */
-		pba = pba - (geo->sec_per_lun * lun_id);
-		blk = &lun->blocks[div_u64(pba, geo->sec_per_blk)];
-
-		if (!blk->state) {
-			/* at this point, we don't know anything about the
-			 * block. It's up to the FTL on top to re-etablish the
-			 * block state. The block is assumed to be open.
-			 */
-			list_move_tail(&blk->list, &lun->used_list);
-			blk->state = NVM_BLK_ST_TGT;
-			lun->nr_free_blocks--;
-		}
 	}
-
-	return 0;
-}
-
-static int gen_blocks_init(struct nvm_dev *dev, struct gen_dev *gn)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct nvm_lun *lun;
-	struct nvm_block *block;
-	sector_t lun_iter, blk_iter, cur_block_id = 0;
-	int ret, nr_blks;
-	u8 *blks;
-
-	nr_blks = geo->blks_per_lun * geo->plane_mode;
-	blks = kmalloc(nr_blks, GFP_KERNEL);
-	if (!blks)
-		return -ENOMEM;
-
-	gen_for_each_lun(gn, lun, lun_iter) {
-		lun->blocks = vzalloc(sizeof(struct nvm_block) *
-							geo->blks_per_lun);
-		if (!lun->blocks) {
-			kfree(blks);
-			return -ENOMEM;
-		}
-
-		for (blk_iter = 0; blk_iter < geo->blks_per_lun; blk_iter++) {
-			block = &lun->blocks[blk_iter];
-
-			INIT_LIST_HEAD(&block->list);
-
-			block->lun = lun;
-			block->id = cur_block_id++;
-
-			list_add_tail(&block->list, &lun->free_list);
-		}
-
-		if (dev->ops->get_bb_tbl) {
-			struct ppa_addr ppa;
-
-			ppa.ppa = 0;
-			ppa.g.ch = lun->chnl_id;
-			ppa.g.lun = lun->lun_id;
-
-			ret = nvm_get_bb_tbl(dev, ppa, blks);
-			if (ret)
-				pr_err("gen: could not get BB table\n");
-
-			ret = gen_block_bb(gn, ppa, blks, nr_blks);
-			if (ret)
-				pr_err("gen: BB table map failed\n");
-		}
-	}
-
-	if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) {
-		ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs,
-							gen_block_map, dev);
-		if (ret) {
-			pr_err("gen: could not read L2P table.\n");
-			pr_warn("gen: default block initialization");
-		}
-	}
-
-	kfree(blks);
 	return 0;
 }
 
 static void gen_free(struct nvm_dev *dev)
 {
-	gen_blocks_free(dev);
 	gen_luns_free(dev);
 	kfree(dev->mp);
 	dev->mp = NULL;
@@ -528,12 +365,6 @@ static int gen_register(struct nvm_dev *dev)
 		goto err;
 	}
 
-	ret = gen_blocks_init(dev, gn);
-	if (ret) {
-		pr_err("gen: could not initialize blocks\n");
-		goto err;
-	}
-
 	return 1;
 err:
 	gen_free(dev);
@@ -558,34 +389,6 @@ static void gen_unregister(struct nvm_dev *dev)
 	module_put(THIS_MODULE);
 }
 
-static void gen_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct gen_dev *gn = dev->mp;
-	struct nvm_lun *lun;
-	struct nvm_block *blk;
-
-	pr_debug("gen: ppa  (ch: %u lun: %u blk: %u pg: %u) -> %u\n",
-			ppa.g.ch, ppa.g.lun, ppa.g.blk, ppa.g.pg, type);
-
-	if (unlikely(ppa.g.ch > geo->nr_chnls ||
-					ppa.g.lun > geo->luns_per_chnl ||
-					ppa.g.blk > geo->blks_per_lun)) {
-		WARN_ON_ONCE(1);
-		pr_err("gen: ppa broken (ch: %u > %u lun: %u > %u blk: %u > %u",
-				ppa.g.ch, geo->nr_chnls,
-				ppa.g.lun, geo->luns_per_chnl,
-				ppa.g.blk, geo->blks_per_lun);
-		return;
-	}
-
-	lun = &gn->luns[(geo->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
-	blk = &lun->blocks[ppa.g.blk];
-
-	/* will be moved to bb list on put_blk from target */
-	blk->state = type;
-}
-
 static void gen_end_io(struct nvm_rq *rqd)
 {
 	struct nvm_tgt_instance *ins = rqd->ins;
@@ -606,11 +409,9 @@ static int gen_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 	return dev->ops->submit_io(dev, rqd);
 }
 
-static int gen_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, int flags)
+static int gen_erase_blk(struct nvm_dev *dev, struct ppa_addr *p, int flags)
 {
-	struct ppa_addr addr = block_to_ppa(dev, blk);
-
-	return nvm_erase_ppa(dev, &addr, 1, flags);
+	return nvm_erase_ppa(dev, p, 1, flags);
 }
 
 static struct nvmm_type gen = {
@@ -626,8 +427,6 @@ static struct nvmm_type gen = {
 	.submit_io		= gen_submit_io,
 	.erase_blk		= gen_erase_blk,
 
-	.mark_blk		= gen_mark_blk,
-
 	.get_area		= gen_get_area,
 	.put_area		= gen_put_area,
 
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 165b9d39649350..a352285dffc026 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -126,19 +126,21 @@ static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk)
 static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
-	struct nvm_block *blk = rblk->parent;
-	int lun_blk = blk->id % (dev->geo.blks_per_lun * rrpc->nr_luns);
+	struct rrpc_lun *rlun = rblk->rlun;
+	struct nvm_lun *lun = rlun->parent;
 
-	return lun_blk * dev->geo.sec_per_blk;
+	return lun->id * dev->geo.sec_per_blk;
 }
 
 /* Calculate global addr for the given block */
 static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
-	struct nvm_block *blk = rblk->parent;
+	struct nvm_geo *geo = &dev->geo;
+	struct rrpc_lun *rlun = rblk->rlun;
+	struct nvm_lun *lun = rlun->parent;
 
-	return blk->id * dev->geo.sec_per_blk;
+	return lun->id * geo->sec_per_lun + rblk->id * geo->sec_per_blk;
 }
 
 static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev, u64 addr)
@@ -163,51 +165,46 @@ static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk,
 	*cur_rblk = new_rblk;
 }
 
-static struct nvm_block *__rrpc_get_blk(struct rrpc *rrpc,
+static struct rrpc_block *__rrpc_get_blk(struct rrpc *rrpc,
 							struct rrpc_lun *rlun)
 {
-	struct nvm_lun *lun = rlun->parent;
-	struct nvm_block *blk = NULL;
+	struct rrpc_block *rblk = NULL;
 
-	if (list_empty(&lun->free_list))
+	if (list_empty(&rlun->free_list))
 		goto out;
 
-	blk = list_first_entry(&lun->free_list, struct nvm_block, list);
+	rblk = list_first_entry(&rlun->free_list, struct rrpc_block, list);
 
-	list_move_tail(&blk->list, &lun->used_list);
-	blk->state = NVM_BLK_ST_TGT;
-	lun->nr_free_blocks--;
+	list_move_tail(&rblk->list, &rlun->used_list);
+	rblk->state = NVM_BLK_ST_TGT;
+	rlun->nr_free_blocks--;
 
 out:
-	return blk;
+	return rblk;
 }
 
 static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
 							unsigned long flags)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
-	struct nvm_lun *lun = rlun->parent;
-	struct nvm_block *blk;
 	struct rrpc_block *rblk;
 	int is_gc = flags & NVM_IOTYPE_GC;
 
 	spin_lock(&rlun->lock);
-	if (!is_gc && lun->nr_free_blocks < rlun->reserved_blocks) {
+	if (!is_gc && rlun->nr_free_blocks < rlun->reserved_blocks) {
 		pr_err("nvm: rrpc: cannot give block to non GC request\n");
 		spin_unlock(&rlun->lock);
 		return NULL;
 	}
 
-	blk = __rrpc_get_blk(rrpc, rlun);
-	if (!blk) {
+	rblk = __rrpc_get_blk(rrpc, rlun);
+	if (!rblk) {
 		pr_err("nvm: rrpc: cannot get new block\n");
 		spin_unlock(&rlun->lock);
 		return NULL;
 	}
 	spin_unlock(&rlun->lock);
 
-	rblk = rrpc_get_rblk(rlun, blk->id);
-	blk->priv = rblk;
 	bitmap_zero(rblk->invalid_pages, dev->geo.sec_per_blk);
 	rblk->next_page = 0;
 	rblk->nr_invalid_pages = 0;
@@ -218,23 +215,23 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
 
 static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
-	struct nvm_block *blk = rblk->parent;
 	struct rrpc_lun *rlun = rblk->rlun;
 	struct nvm_lun *lun = rlun->parent;
 
 	spin_lock(&rlun->lock);
-	if (blk->state & NVM_BLK_ST_TGT) {
-		list_move_tail(&blk->list, &lun->free_list);
-		lun->nr_free_blocks++;
-		blk->state = NVM_BLK_ST_FREE;
-	} else if (blk->state & NVM_BLK_ST_BAD) {
-		list_move_tail(&blk->list, &lun->bb_list);
-		blk->state = NVM_BLK_ST_BAD;
+	if (rblk->state & NVM_BLK_ST_TGT) {
+		list_move_tail(&rblk->list, &rlun->free_list);
+		rlun->nr_free_blocks++;
+		rblk->state = NVM_BLK_ST_FREE;
+	} else if (rblk->state & NVM_BLK_ST_BAD) {
+		list_move_tail(&rblk->list, &rlun->bb_list);
+		rblk->state = NVM_BLK_ST_BAD;
 	} else {
 		WARN_ON_ONCE(1);
-		pr_err("rrpc: erroneous block type (%lu -> %u)\n",
-							blk->id, blk->state);
-		list_move_tail(&blk->list, &lun->bb_list);
+		pr_err("rrpc: erroneous type (ch:%d,lun:%d,blk%d-> %u)\n",
+						lun->chnl_id, lun->lun_id,
+						rblk->id, rblk->state);
+		list_move_tail(&rblk->list, &rlun->bb_list);
 	}
 	spin_unlock(&rlun->lock);
 }
@@ -334,7 +331,7 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
 					    nr_sec_per_blk)) < nr_sec_per_blk) {
 
 		/* Lock laddr */
-		phys_addr = rblk->parent->id * nr_sec_per_blk + slot;
+		phys_addr = rrpc_blk_to_ppa(rrpc, rblk) + slot;
 
 try:
 		spin_lock(&rrpc->rev_lock);
@@ -422,14 +419,22 @@ static void rrpc_block_gc(struct work_struct *work)
 	struct rrpc_block *rblk = gcb->rblk;
 	struct rrpc_lun *rlun = rblk->rlun;
 	struct nvm_tgt_dev *dev = rrpc->dev;
+	struct ppa_addr ppa;
 
 	mempool_free(gcb, rrpc->gcb_pool);
-	pr_debug("nvm: block '%lu' being reclaimed\n", rblk->parent->id);
+	pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' being reclaimed\n",
+			rlun->parent->chnl_id, rlun->parent->lun_id,
+			rblk->id);
 
 	if (rrpc_move_valid_pages(rrpc, rblk))
 		goto put_back;
 
-	if (nvm_erase_blk(dev->parent, rblk->parent, 0))
+	ppa.ppa = 0;
+	ppa.g.ch = rlun->parent->chnl_id;
+	ppa.g.lun = rlun->parent->lun_id;
+	ppa.g.blk = rblk->id;
+
+	if (nvm_erase_blk(dev->parent, &ppa, 0))
 		goto put_back;
 
 	rrpc_put_blk(rrpc, rblk);
@@ -445,7 +450,7 @@ static void rrpc_block_gc(struct work_struct *work)
 /* the block with highest number of invalid pages, will be in the beginning
  * of the list
  */
-static struct rrpc_block *rblock_max_invalid(struct rrpc_block *ra,
+static struct rrpc_block *rblk_max_invalid(struct rrpc_block *ra,
 							struct rrpc_block *rb)
 {
 	if (ra->nr_invalid_pages == rb->nr_invalid_pages)
@@ -460,13 +465,13 @@ static struct rrpc_block *rblock_max_invalid(struct rrpc_block *ra,
 static struct rrpc_block *block_prio_find_max(struct rrpc_lun *rlun)
 {
 	struct list_head *prio_list = &rlun->prio_list;
-	struct rrpc_block *rblock, *max;
+	struct rrpc_block *rblk, *max;
 
 	BUG_ON(list_empty(prio_list));
 
 	max = list_first_entry(prio_list, struct rrpc_block, prio);
-	list_for_each_entry(rblock, prio_list, prio)
-		max = rblock_max_invalid(max, rblock);
+	list_for_each_entry(rblk, prio_list, prio)
+		max = rblk_max_invalid(max, rblk);
 
 	return max;
 }
@@ -476,7 +481,6 @@ static void rrpc_lun_gc(struct work_struct *work)
 	struct rrpc_lun *rlun = container_of(work, struct rrpc_lun, ws_gc);
 	struct rrpc *rrpc = rlun->rrpc;
 	struct nvm_tgt_dev *dev = rrpc->dev;
-	struct nvm_lun *lun = rlun->parent;
 	struct rrpc_block_gc *gcb;
 	unsigned int nr_blocks_need;
 
@@ -486,26 +490,28 @@ static void rrpc_lun_gc(struct work_struct *work)
 		nr_blocks_need = rrpc->nr_luns;
 
 	spin_lock(&rlun->lock);
-	while (nr_blocks_need > lun->nr_free_blocks &&
+	while (nr_blocks_need > rlun->nr_free_blocks &&
 					!list_empty(&rlun->prio_list)) {
-		struct rrpc_block *rblock = block_prio_find_max(rlun);
-		struct nvm_block *block = rblock->parent;
+		struct rrpc_block *rblk = block_prio_find_max(rlun);
 
-		if (!rblock->nr_invalid_pages)
+		if (!rblk->nr_invalid_pages)
 			break;
 
 		gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
 		if (!gcb)
 			break;
 
-		list_del_init(&rblock->prio);
+		list_del_init(&rblk->prio);
 
-		BUG_ON(!block_is_full(rrpc, rblock));
+		WARN_ON(!block_is_full(rrpc, rblk));
 
-		pr_debug("rrpc: selected block '%lu' for GC\n", block->id);
+		pr_debug("rrpc: selected block 'ch:%d,lun:%d,blk:%d' for GC\n",
+					rlun->parent->chnl_id,
+					rlun->parent->lun_id,
+					rblk->id);
 
 		gcb->rrpc = rrpc;
-		gcb->rblk = rblock;
+		gcb->rblk = rblk;
 		INIT_WORK(&gcb->ws_gc, rrpc_block_gc);
 
 		queue_work(rrpc->kgc_wq, &gcb->ws_gc);
@@ -530,8 +536,10 @@ static void rrpc_gc_queue(struct work_struct *work)
 	spin_unlock(&rlun->lock);
 
 	mempool_free(gcb, rrpc->gcb_pool);
-	pr_debug("nvm: block '%lu' is full, allow GC (sched)\n",
-							rblk->parent->id);
+	pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' full, allow GC (sched)\n",
+					rlun->parent->chnl_id,
+					rlun->parent->lun_id,
+					rblk->id);
 }
 
 static const struct block_device_operations rrpc_fops = {
@@ -555,8 +563,7 @@ static struct rrpc_lun *rrpc_get_lun_rr(struct rrpc *rrpc, int is_gc)
 	 * estimate.
 	 */
 	rrpc_for_each_lun(rrpc, rlun, i) {
-		if (rlun->parent->nr_free_blocks >
-					max_free->parent->nr_free_blocks)
+		if (rlun->nr_free_blocks > max_free->nr_free_blocks)
 			max_free = rlun;
 	}
 
@@ -613,14 +620,12 @@ static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
 {
 	struct rrpc_lun *rlun;
 	struct rrpc_block *rblk, **cur_rblk;
-	struct nvm_lun *lun;
 	u64 paddr;
 	int gc_force = 0;
 
 	rlun = rrpc_get_lun_rr(rrpc, is_gc);
-	lun = rlun->parent;
 
-	if (!is_gc && lun->nr_free_blocks < rrpc->nr_luns * 4)
+	if (!is_gc && rlun->nr_free_blocks < rrpc->nr_luns * 4)
 		return NULL;
 
 	/*
@@ -701,22 +706,44 @@ static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
 	queue_work(rrpc->kgc_wq, &gcb->ws_gc);
 }
 
-static void __rrpc_mark_bad_block(struct nvm_tgt_dev *dev, struct ppa_addr *ppa)
+static struct rrpc_lun *rrpc_ppa_to_lun(struct rrpc *rrpc, struct ppa_addr p)
 {
-		nvm_mark_blk(dev->parent, *ppa, NVM_BLK_ST_BAD);
-		nvm_set_bb_tbl(dev->parent, ppa, 1, NVM_BLK_T_GRWN_BAD);
+	struct rrpc_lun *rlun = NULL;
+	int i;
+
+	for (i = 0; i < rrpc->nr_luns; i++) {
+		if (rrpc->luns[i].parent->chnl_id == p.g.ch &&
+				rrpc->luns[i].parent->lun_id == p.g.lun) {
+			rlun = &rrpc->luns[i];
+			break;
+		}
+	}
+
+	return rlun;
 }
 
-static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
+static void __rrpc_mark_bad_block(struct rrpc *rrpc, struct ppa_addr ppa)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
+	struct rrpc_lun *rlun;
+	struct rrpc_block *rblk;
+
+	rlun = rrpc_ppa_to_lun(rrpc, ppa);
+	rblk = &rlun->blocks[ppa.g.blk];
+	rblk->state = NVM_BLK_ST_BAD;
+
+	nvm_set_bb_tbl(dev->parent, &ppa, 1, NVM_BLK_T_GRWN_BAD);
+}
+
+static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
+{
 	void *comp_bits = &rqd->ppa_status;
 	struct ppa_addr ppa, prev_ppa;
 	int nr_ppas = rqd->nr_ppas;
 	int bit;
 
 	if (rqd->nr_ppas == 1)
-		__rrpc_mark_bad_block(dev, &rqd->ppa_addr);
+		__rrpc_mark_bad_block(rrpc, rqd->ppa_addr);
 
 	ppa_set_empty(&prev_ppa);
 	bit = -1;
@@ -725,7 +752,7 @@ static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
 		if (ppa_cmp_blk(ppa, prev_ppa))
 			continue;
 
-		__rrpc_mark_bad_block(dev, &ppa);
+		__rrpc_mark_bad_block(rrpc, ppa);
 	}
 }
 
@@ -735,13 +762,11 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_addr *p;
 	struct rrpc_block *rblk;
-	struct nvm_lun *lun;
 	int cmnt_size, i;
 
 	for (i = 0; i < npages; i++) {
 		p = &rrpc->trans_map[laddr + i];
 		rblk = p->rblk;
-		lun = rblk->parent->lun;
 
 		cmnt_size = atomic_inc_return(&rblk->data_cmnt_size);
 		if (unlikely(cmnt_size == dev->geo.sec_per_blk))
@@ -1061,16 +1086,21 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_addr *addr = rrpc->trans_map + slba;
 	struct rrpc_rev_addr *raddr = rrpc->rev_trans_map;
+	struct rrpc_lun *rlun;
+	struct rrpc_block *rblk;
 	u64 i;
 
 	for (i = 0; i < nlb; i++) {
+		struct ppa_addr gaddr;
 		u64 pba = le64_to_cpu(entries[i]);
 		unsigned int mod;
+
 		/* LNVM treats address-spaces as silos, LBA and PBA are
 		 * equally large and zero-indexed.
 		 */
 		if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) {
 			pr_err("nvm: L2P data entry is out of bounds!\n");
+			pr_err("nvm: Maybe loaded an old target L2P\n");
 			return -EINVAL;
 		}
 
@@ -1085,6 +1115,25 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
 
 		addr[i].addr = pba;
 		raddr[mod].addr = slba + i;
+
+		gaddr = rrpc_ppa_to_gaddr(dev, pba);
+		rlun = rrpc_ppa_to_lun(rrpc, gaddr);
+		if (!rlun) {
+			pr_err("rrpc: l2p corruption on lba %llu\n",
+							slba + i);
+			return -EINVAL;
+		}
+
+		rblk = &rlun->blocks[gaddr.g.blk];
+		if (!rblk->state) {
+			/* at this point, we don't know anything about the
+			 * block. It's up to the FTL on top to re-etablish the
+			 * block state. The block is assumed to be open.
+			 */
+			list_move_tail(&rblk->list, &rlun->used_list);
+			rblk->state = NVM_BLK_ST_TGT;
+			rlun->nr_free_blocks--;
+		}
 	}
 
 	return 0;
@@ -1199,6 +1248,51 @@ static void rrpc_luns_free(struct rrpc *rrpc)
 	kfree(rrpc->luns);
 }
 
+static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
+{
+	struct nvm_geo *geo = &dev->geo;
+	struct rrpc_block *rblk;
+	struct ppa_addr ppa;
+	u8 *blks;
+	int nr_blks;
+	int i;
+	int ret;
+
+	nr_blks = geo->blks_per_lun * geo->plane_mode;
+	blks = kmalloc(nr_blks, GFP_KERNEL);
+	if (!blks)
+		return -ENOMEM;
+
+	ppa.ppa = 0;
+	ppa.g.ch = rlun->parent->chnl_id;
+	ppa.g.lun = rlun->parent->lun_id;
+
+	ret = nvm_get_bb_tbl(dev->parent, ppa, blks);
+	if (ret) {
+		pr_err("rrpc: could not get BB table\n");
+		goto out;
+	}
+
+	nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
+	if (nr_blks < 0)
+		return nr_blks;
+
+	rlun->nr_free_blocks = geo->blks_per_lun;
+	for (i = 0; i < nr_blks; i++) {
+		if (blks[i] == NVM_BLK_T_FREE)
+			continue;
+
+		rblk = &rlun->blocks[i];
+		list_move_tail(&rblk->list, &rlun->bb_list);
+		rblk->state = NVM_BLK_ST_BAD;
+		rlun->nr_free_blocks--;
+	}
+
+out:
+	kfree(blks);
+	return ret;
+}
+
 static int rrpc_luns_init(struct rrpc *rrpc, struct list_head *lun_list)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
@@ -1232,16 +1326,26 @@ static int rrpc_luns_init(struct rrpc *rrpc, struct list_head *lun_list)
 			goto err;
 		}
 
+		INIT_LIST_HEAD(&rlun->free_list);
+		INIT_LIST_HEAD(&rlun->used_list);
+		INIT_LIST_HEAD(&rlun->bb_list);
+
 		for (j = 0; j < geo->blks_per_lun; j++) {
 			struct rrpc_block *rblk = &rlun->blocks[j];
-			struct nvm_block *blk = &lun->blocks[j];
 
-			rblk->parent = blk;
+			rblk->id = j;
 			rblk->rlun = rlun;
+			rblk->state = NVM_BLK_T_FREE;
 			INIT_LIST_HEAD(&rblk->prio);
+			INIT_LIST_HEAD(&rblk->list);
 			spin_lock_init(&rblk->lock);
+
+			list_add_tail(&rblk->list, &rlun->free_list);
 		}
 
+		if (rrpc_bb_discovery(dev, rlun))
+			goto err;
+
 		rlun->reserved_blocks = 2; /* for GC only */
 
 		rlun->rrpc = rrpc;
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index 242d4c109eb660..c55cec9a1a2638 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -52,8 +52,7 @@ struct rrpc_rq {
 };
 
 struct rrpc_block {
-	unsigned long id;
-	struct nvm_block *parent;
+	int id;				/* id inside of LUN */
 	struct rrpc_lun *rlun;
 
 	struct list_head prio;		/* LUN CG list */
@@ -83,6 +82,16 @@ struct rrpc_lun {
 	struct list_head prio_list;	/* Blocks that may be GC'ed */
 	struct list_head wblk_list;	/* Queued blocks to be written to */
 
+	/* lun block lists */
+	struct list_head used_list;	/* In-use blocks */
+	struct list_head free_list;	/* Not used blocks i.e. released
+					 * and ready for use
+					 */
+	struct list_head bb_list;	/* Bad blocks. Mutually exclusive with
+					 * free_list and used_list
+					 */
+	unsigned int nr_free_blocks;	/* Number of unused blocks */
+
 	struct work_struct ws_gc;
 
 	int reserved_blocks;
@@ -155,14 +164,14 @@ struct rrpc_rev_addr {
 	u64 addr;
 };
 
-static inline struct rrpc_block *rrpc_get_rblk(struct rrpc_lun *rlun,
-								int blk_id)
+static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
-	struct rrpc *rrpc = rlun->rrpc;
 	struct nvm_tgt_dev *dev = rrpc->dev;
-	int lun_blk = blk_id % dev->geo.blks_per_lun;
+	struct nvm_geo *geo = &dev->geo;
+	struct rrpc_lun *rlun = rblk->rlun;
+	struct nvm_lun *lun = rlun->parent;
 
-	return &rlun->blocks[lun_blk];
+	return (lun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk);
 }
 
 static inline sector_t rrpc_get_laddr(struct bio *bio)
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ed04fa642371bc..cc210cc85c6d0c 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -266,8 +266,6 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata)
 	return rqdata + 1;
 }
 
-struct nvm_block;
-
 struct nvm_lun {
 	int id;
 
@@ -275,19 +273,6 @@ struct nvm_lun {
 	int chnl_id;
 
 	struct list_head list;
-	spinlock_t lock;
-
-	/* lun block lists */
-	struct list_head used_list;	/* In-use blocks */
-	struct list_head free_list;	/* Not used blocks i.e. released
-					 * and ready for use
-					 */
-	struct list_head bb_list;	/* Bad blocks. Mutually exclusive with
-					 * free_list and used_list
-					 */
-	unsigned int nr_free_blocks;	/* Number of unused blocks */
-
-	struct nvm_block *blocks;
 };
 
 enum {
@@ -296,15 +281,6 @@ enum {
 	NVM_BLK_ST_BAD =	0x8,	/* Bad block */
 };
 
-struct nvm_block {
-	struct list_head list;
-	struct nvm_lun *lun;
-	unsigned long id;
-
-	void *priv;
-	int state;
-};
-
 /* system block cpu representation */
 struct nvm_sb_info {
 	unsigned long		seqnr;
@@ -473,21 +449,6 @@ static inline void ppa_set_empty(struct ppa_addr *ppa_addr)
 	ppa_addr->ppa = ADDR_EMPTY;
 }
 
-static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev,
-							struct nvm_block *blk)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct ppa_addr ppa;
-	struct nvm_lun *lun = blk->lun;
-
-	ppa.ppa = 0;
-	ppa.g.blk = blk->id % geo->blks_per_lun;
-	ppa.g.lun = lun->lun_id;
-	ppa.g.ch = lun->chnl_id;
-
-	return ppa;
-}
-
 static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
 {
 	if (ppa_empty(ppa1) || ppa_empty(ppa2))
@@ -539,8 +500,7 @@ typedef void (nvmm_unregister_fn)(struct nvm_dev *);
 typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *);
 typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
 typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int);
-typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int);
+typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct ppa_addr *, int);
 typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
 typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t);
 
@@ -557,9 +517,6 @@ struct nvmm_type {
 	nvmm_submit_io_fn *submit_io;
 	nvmm_erase_blk_fn *erase_blk;
 
-	/* Bad block mgmt */
-	nvmm_mark_blk_fn *mark_blk;
-
 	nvmm_get_area_fn *get_area;
 	nvmm_put_area_fn *put_area;
 
@@ -573,7 +530,6 @@ extern struct nvm_dev *nvm_alloc_dev(int);
 extern int nvm_register(struct nvm_dev *);
 extern void nvm_unregister(struct nvm_dev *);
 
-extern void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type);
 extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas,
 							int nr_ppas, int type);
 
@@ -584,7 +540,7 @@ extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
 					const struct ppa_addr *, int, int);
 extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int);
-extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *, int);
+extern int nvm_erase_blk(struct nvm_dev *, struct ppa_addr *, int);
 extern void nvm_end_io(struct nvm_rq *, int);
 extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
 								void *, int);

From 8e53624d44c1de31b1b0d4f500703669418a4c67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:10 +0100
Subject: [PATCH 135/182] lightnvm: eliminate nvm_lun abstraction in mm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to naturally support multi-target instances on an Open-Channel
SSD, targets should own the LUNs they get blocks from and manage
provisioning internally. This is done in several steps.

Since targets own the LUNs the are instantiated on top of and manage the
free block list internally, there is no need for a LUN abstraction in
the media manager. LUNs are intrinsically managed as in the physical
layout (ch:0,lun:0, ..., ch:0,lun:n, ch:1,lun:0, ch:1,lun:n, ...,
ch:m,lun:0, ch:m,lun:n) and given to the targets based on the target
creation ioctl. This simplifies LUN management and clears the path for a
partition manager to sit directly underneath LightNVM targets.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c      |  14 +-
 drivers/lightnvm/gennvm.c    | 336 ++++++++++++++++++++++++++++-------
 drivers/lightnvm/gennvm.h    |  20 ++-
 drivers/lightnvm/rrpc.c      | 137 +++++++-------
 drivers/lightnvm/rrpc.h      |  32 +++-
 drivers/nvme/host/lightnvm.c |   3 +
 include/linux/lightnvm.h     |  41 +++--
 7 files changed, 419 insertions(+), 164 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 691b16ffda88f0..23d582f8221973 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -203,15 +203,19 @@ int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
 }
 EXPORT_SYMBOL(nvm_set_bb_tbl);
 
-int nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
+int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
-	return dev->mt->submit_io(dev, rqd);
+	struct nvm_dev *dev = tgt_dev->parent;
+
+	return dev->mt->submit_io(tgt_dev, rqd);
 }
 EXPORT_SYMBOL(nvm_submit_io);
 
-int nvm_erase_blk(struct nvm_dev *dev, struct ppa_addr *p, int flags)
+int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags)
 {
-	return dev->mt->erase_blk(dev, p, flags);
+	struct nvm_dev *dev = tgt_dev->parent;
+
+	return dev->mt->erase_blk(tgt_dev, p, flags);
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
@@ -350,7 +354,7 @@ static int __nvm_submit_ppa(struct nvm_dev *dev, struct nvm_rq *rqd, int opcode,
 
 	nvm_generic_to_addr_mode(dev, rqd);
 
-	rqd->dev = dev;
+	rqd->dev = NULL;
 	rqd->opcode = opcode;
 	rqd->flags = flags;
 	rqd->bio = bio;
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 19c3924ffa1095..5d7c8c47bef8c4 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -38,8 +38,6 @@ static const struct block_device_operations gen_fops = {
 static int gen_reserve_luns(struct nvm_dev *dev, struct nvm_target *t,
 			    int lun_begin, int lun_end)
 {
-	struct gen_dev *gn = dev->mp;
-	struct nvm_lun *lun;
 	int i;
 
 	for (i = lun_begin; i <= lun_end; i++) {
@@ -47,35 +45,50 @@ static int gen_reserve_luns(struct nvm_dev *dev, struct nvm_target *t,
 			pr_err("nvm: lun %d already allocated\n", i);
 			goto err;
 		}
-
-		lun = &gn->luns[i];
-		list_add_tail(&lun->list, &t->lun_list);
 	}
 
 	return 0;
 
 err:
-	while (--i > lun_begin) {
-		lun = &gn->luns[i];
+	while (--i > lun_begin)
 		clear_bit(i, dev->lun_map);
-		list_del(&lun->list);
-	}
 
 	return -EBUSY;
 }
 
-static void gen_release_luns(struct nvm_dev *dev, struct nvm_target *t)
+static void gen_release_luns_err(struct nvm_dev *dev, int lun_begin,
+				 int lun_end)
 {
-	struct nvm_lun *lun, *tmp;
+	int i;
 
-	list_for_each_entry_safe(lun, tmp, &t->lun_list, list) {
-		WARN_ON(!test_and_clear_bit(lun->id, dev->lun_map));
-		list_del(&lun->list);
-	}
+	for (i = lun_begin; i <= lun_end; i++)
+		WARN_ON(!test_and_clear_bit(i, dev->lun_map));
 }
 
 static void gen_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
 {
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct gen_dev_map *dev_map = tgt_dev->map;
+	int i, j;
+
+	for (i = 0; i < dev_map->nr_chnls; i++) {
+		struct gen_ch_map *ch_map = &dev_map->chnls[i];
+		int *lun_offs = ch_map->lun_offs;
+		int ch = i + ch_map->ch_off;
+
+		for (j = 0; j < ch_map->nr_luns; j++) {
+			int lun = j + lun_offs[j];
+			int lunid = (ch * dev->geo.luns_per_chnl) + lun;
+
+			WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
+		}
+
+		kfree(ch_map->lun_offs);
+	}
+
+	kfree(dev_map->chnls);
+	kfree(dev_map);
+	kfree(tgt_dev->luns);
 	kfree(tgt_dev);
 }
 
@@ -83,24 +96,103 @@ static struct nvm_tgt_dev *gen_create_tgt_dev(struct nvm_dev *dev,
 					      int lun_begin, int lun_end)
 {
 	struct nvm_tgt_dev *tgt_dev = NULL;
+	struct gen_dev_map *dev_rmap = dev->rmap;
+	struct gen_dev_map *dev_map;
+	struct ppa_addr *luns;
 	int nr_luns = lun_end - lun_begin + 1;
+	int luns_left = nr_luns;
+	int nr_chnls = nr_luns / dev->geo.luns_per_chnl;
+	int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl;
+	int bch = lun_begin / dev->geo.luns_per_chnl;
+	int blun = lun_begin % dev->geo.luns_per_chnl;
+	int lunid = 0;
+	int lun_balanced = 1;
+	int prev_nr_luns;
+	int i, j;
+
+	nr_chnls = nr_luns / dev->geo.luns_per_chnl;
+	nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
+
+	dev_map = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL);
+	if (!dev_map)
+		goto err_dev;
+
+	dev_map->chnls = kcalloc(nr_chnls, sizeof(struct gen_ch_map),
+								GFP_KERNEL);
+	if (!dev_map->chnls)
+		goto err_chnls;
+
+	luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL);
+	if (!luns)
+		goto err_luns;
+
+	prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ?
+					dev->geo.luns_per_chnl : luns_left;
+	for (i = 0; i < nr_chnls; i++) {
+		struct gen_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
+		int *lun_roffs = ch_rmap->lun_offs;
+		struct gen_ch_map *ch_map = &dev_map->chnls[i];
+		int *lun_offs;
+		int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ?
+					dev->geo.luns_per_chnl : luns_left;
+
+		if (lun_balanced && prev_nr_luns != luns_in_chnl)
+			lun_balanced = 0;
+
+		ch_map->ch_off = ch_rmap->ch_off = bch;
+		ch_map->nr_luns = luns_in_chnl;
+
+		lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
+		if (!lun_offs)
+			goto err_ch;
+
+		for (j = 0; j < luns_in_chnl; j++) {
+			luns[lunid].ppa = 0;
+			luns[lunid].g.ch = i;
+			luns[lunid++].g.lun = j;
+
+			lun_offs[j] = blun;
+			lun_roffs[j + blun] = blun;
+		}
+
+		ch_map->lun_offs = lun_offs;
+
+		/* when starting a new channel, lun offset is reset */
+		blun = 0;
+		luns_left -= luns_in_chnl;
+	}
+
+	dev_map->nr_chnls = nr_chnls;
 
 	tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
 	if (!tgt_dev)
-		goto out;
+		goto err_ch;
 
 	memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
-	tgt_dev->geo.nr_chnls = (nr_luns / (dev->geo.luns_per_chnl + 1)) + 1;
+	/* Target device only owns a portion of the physical device */
+	tgt_dev->geo.nr_chnls = nr_chnls;
 	tgt_dev->geo.nr_luns = nr_luns;
+	tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1;
 	tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
 	tgt_dev->q = dev->q;
 	tgt_dev->ops = dev->ops;
 	tgt_dev->mt = dev->mt;
+	tgt_dev->map = dev_map;
+	tgt_dev->luns = luns;
 	memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
 
 	tgt_dev->parent = dev;
 
-out:
+	return tgt_dev;
+err_ch:
+	while (--i > 0)
+		kfree(dev_map->chnls[i].lun_offs);
+	kfree(luns);
+err_luns:
+	kfree(dev_map->chnls);
+err_chnls:
+	kfree(dev_map);
+err_dev:
 	return tgt_dev;
 }
 
@@ -134,14 +226,14 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	if (!t)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&t->lun_list);
-
 	if (gen_reserve_luns(dev, t, s->lun_begin, s->lun_end))
 		goto err_t;
 
 	tgt_dev = gen_create_tgt_dev(dev, s->lun_begin, s->lun_end);
-	if (!tgt_dev)
+	if (!tgt_dev) {
+		pr_err("nvm: could not create target device\n");
 		goto err_reserve;
+	}
 
 	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
 	if (!tqueue)
@@ -159,7 +251,7 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	tdisk->fops = &gen_fops;
 	tdisk->queue = tqueue;
 
-	targetdata = tt->init(tgt_dev, tdisk, &t->lun_list);
+	targetdata = tt->init(tgt_dev, tdisk);
 	if (IS_ERR(targetdata))
 		goto err_init;
 
@@ -187,7 +279,7 @@ static int gen_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 err_dev:
 	kfree(tgt_dev);
 err_reserve:
-	gen_release_luns(dev, t);
+	gen_release_luns_err(dev, s->lun_begin, s->lun_end);
 err_t:
 	kfree(t);
 	return -ENOMEM;
@@ -205,7 +297,6 @@ static void __gen_remove_target(struct nvm_target *t)
 	if (tt->exit)
 		tt->exit(tdisk->private_data);
 
-	gen_release_luns(t->dev->parent, t);
 	gen_remove_tgt_dev(t->dev);
 	put_disk(tdisk);
 
@@ -306,51 +397,54 @@ static void gen_put_area(struct nvm_dev *dev, sector_t begin)
 	spin_unlock(&dev->lock);
 }
 
-static void gen_luns_free(struct nvm_dev *dev)
-{
-	struct gen_dev *gn = dev->mp;
-
-	kfree(gn->luns);
-}
-
-static int gen_luns_init(struct nvm_dev *dev, struct gen_dev *gn)
-{
-	struct nvm_geo *geo = &dev->geo;
-	struct nvm_lun *lun;
-	int i;
-
-	gn->luns = kcalloc(geo->nr_luns, sizeof(struct nvm_lun), GFP_KERNEL);
-	if (!gn->luns)
-		return -ENOMEM;
-
-	gen_for_each_lun(gn, lun, i) {
-		INIT_LIST_HEAD(&lun->list);
-
-		lun->id = i;
-		lun->lun_id = i % geo->luns_per_chnl;
-		lun->chnl_id = i / geo->luns_per_chnl;
-	}
-	return 0;
-}
-
 static void gen_free(struct nvm_dev *dev)
 {
-	gen_luns_free(dev);
 	kfree(dev->mp);
+	kfree(dev->rmap);
 	dev->mp = NULL;
 }
 
 static int gen_register(struct nvm_dev *dev)
 {
 	struct gen_dev *gn;
-	int ret;
+	struct gen_dev_map *dev_rmap;
+	int i, j;
 
 	if (!try_module_get(THIS_MODULE))
 		return -ENODEV;
 
 	gn = kzalloc(sizeof(struct gen_dev), GFP_KERNEL);
 	if (!gn)
-		return -ENOMEM;
+		goto err_gn;
+
+	dev_rmap = kmalloc(sizeof(struct gen_dev_map), GFP_KERNEL);
+	if (!dev_rmap)
+		goto err_rmap;
+
+	dev_rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct gen_ch_map),
+								GFP_KERNEL);
+	if (!dev_rmap->chnls)
+		goto err_chnls;
+
+	for (i = 0; i < dev->geo.nr_chnls; i++) {
+		struct gen_ch_map *ch_rmap;
+		int *lun_roffs;
+		int luns_in_chnl = dev->geo.luns_per_chnl;
+
+		ch_rmap = &dev_rmap->chnls[i];
+
+		ch_rmap->ch_off = -1;
+		ch_rmap->nr_luns = luns_in_chnl;
+
+		lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
+		if (!lun_roffs)
+			goto err_ch;
+
+		for (j = 0; j < luns_in_chnl; j++)
+			lun_roffs[j] = -1;
+
+		ch_rmap->lun_offs = lun_roffs;
+	}
 
 	gn->dev = dev;
 	gn->nr_luns = dev->geo.nr_luns;
@@ -358,18 +452,19 @@ static int gen_register(struct nvm_dev *dev)
 	mutex_init(&gn->lock);
 	INIT_LIST_HEAD(&gn->targets);
 	dev->mp = gn;
-
-	ret = gen_luns_init(dev, gn);
-	if (ret) {
-		pr_err("gen: could not initialize luns\n");
-		goto err;
-	}
+	dev->rmap = dev_rmap;
 
 	return 1;
-err:
+err_ch:
+	while (--i >= 0)
+		kfree(dev_rmap->chnls[i].lun_offs);
+err_chnls:
+	kfree(dev_rmap);
+err_rmap:
 	gen_free(dev);
+err_gn:
 	module_put(THIS_MODULE);
-	return ret;
+	return -ENOMEM;
 }
 
 static void gen_unregister(struct nvm_dev *dev)
@@ -389,29 +484,137 @@ static void gen_unregister(struct nvm_dev *dev)
 	module_put(THIS_MODULE);
 }
 
+enum {
+	TRANS_TGT_TO_DEV =	0x0,
+	TRANS_DEV_TO_TGT =	0x1,
+};
+
+
+static int gen_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
+{
+	struct gen_dev_map *dev_map = tgt_dev->map;
+	struct gen_ch_map *ch_map = &dev_map->chnls[p->g.ch];
+	int lun_off = ch_map->lun_offs[p->g.lun];
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct gen_dev_map *dev_rmap = dev->rmap;
+	struct gen_ch_map *ch_rmap;
+	int lun_roff;
+
+	p->g.ch += ch_map->ch_off;
+	p->g.lun += lun_off;
+
+	ch_rmap = &dev_rmap->chnls[p->g.ch];
+	lun_roff = ch_rmap->lun_offs[p->g.lun];
+
+	if (unlikely(ch_rmap->ch_off < 0 || lun_roff < 0)) {
+		pr_err("nvm: corrupted device partition table\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int gen_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct gen_dev_map *dev_rmap = dev->rmap;
+	struct gen_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch];
+	int lun_roff = ch_rmap->lun_offs[p->g.lun];
+
+	p->g.ch -= ch_rmap->ch_off;
+	p->g.lun -= lun_roff;
+
+	return 0;
+}
+
+static int gen_trans_rq(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
+			int flag)
+{
+	gen_trans_fn *f;
+	int i;
+	int ret = 0;
+
+	f = (flag == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt;
+
+	if (rqd->nr_ppas == 1)
+		return f(tgt_dev, &rqd->ppa_addr);
+
+	for (i = 0; i < rqd->nr_ppas; i++) {
+		ret = f(tgt_dev, &rqd->ppa_list[i]);
+		if (ret)
+			goto out;
+	}
+
+out:
+	return ret;
+}
+
 static void gen_end_io(struct nvm_rq *rqd)
 {
+	struct nvm_tgt_dev *tgt_dev = rqd->dev;
 	struct nvm_tgt_instance *ins = rqd->ins;
 
+	/* Convert address space */
+	if (tgt_dev)
+		gen_trans_rq(tgt_dev, rqd, TRANS_DEV_TO_TGT);
+
 	ins->tt->end_io(rqd);
 }
 
-static int gen_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
+static int gen_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
+	struct nvm_dev *dev = tgt_dev->parent;
+
 	if (!dev->ops->submit_io)
 		return -ENODEV;
 
 	/* Convert address space */
+	gen_trans_rq(tgt_dev, rqd, TRANS_TGT_TO_DEV);
 	nvm_generic_to_addr_mode(dev, rqd);
 
-	rqd->dev = dev;
+	rqd->dev = tgt_dev;
 	rqd->end_io = gen_end_io;
 	return dev->ops->submit_io(dev, rqd);
 }
 
-static int gen_erase_blk(struct nvm_dev *dev, struct ppa_addr *p, int flags)
+static int gen_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p,
+			 int flags)
 {
-	return nvm_erase_ppa(dev, p, 1, flags);
+	/* Convert address space */
+	gen_map_to_dev(tgt_dev, p);
+
+	return nvm_erase_ppa(tgt_dev->parent, p, 1, flags);
+}
+
+static void gen_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
+			       int len)
+{
+	struct nvm_geo *geo = &dev->geo;
+	struct gen_dev_map *dev_rmap = dev->rmap;
+	u64 i;
+
+	for (i = 0; i < len; i++) {
+		struct gen_ch_map *ch_rmap;
+		int *lun_roffs;
+		struct ppa_addr gaddr;
+		u64 pba = le64_to_cpu(entries[i]);
+		int off;
+		u64 diff;
+
+		if (!pba)
+			continue;
+
+		gaddr = linear_to_generic_addr(geo, pba);
+		ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
+		lun_roffs = ch_rmap->lun_offs;
+
+		off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
+
+		diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
+				(lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
+
+		entries[i] -= cpu_to_le64(diff);
+	}
 }
 
 static struct nvmm_type gen = {
@@ -430,6 +633,7 @@ static struct nvmm_type gen = {
 	.get_area		= gen_get_area,
 	.put_area		= gen_put_area,
 
+	.part_to_tgt		= gen_part_to_tgt,
 };
 
 static int __init gen_module_init(void)
diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h
index d167f391fbaef2..6a4b3f368848ea 100644
--- a/drivers/lightnvm/gennvm.h
+++ b/drivers/lightnvm/gennvm.h
@@ -24,19 +24,37 @@ struct gen_dev {
 	struct nvm_dev *dev;
 
 	int nr_luns;
-	struct nvm_lun *luns;
 	struct list_head area_list;
 
 	struct mutex lock;
 	struct list_head targets;
 };
 
+/* Map between virtual and physical channel and lun */
+struct gen_ch_map {
+	int ch_off;
+	int nr_luns;
+	int *lun_offs;
+};
+
+struct gen_dev_map {
+	struct gen_ch_map *chnls;
+	int nr_chnls;
+};
+
 struct gen_area {
 	struct list_head list;
 	sector_t begin;
 	sector_t end;	/* end is excluded */
 };
 
+static inline void *ch_map_to_lun_offs(struct gen_ch_map *ch_map)
+{
+	return ch_map + 1;
+}
+
+typedef int (gen_trans_fn)(struct nvm_tgt_dev *, struct ppa_addr *);
+
 #define gen_for_each_lun(bm, lun, i) \
 		for ((i) = 0, lun = &(bm)->luns[0]; \
 			(i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)])
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index a352285dffc026..75ed12ae0f4712 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -45,7 +45,7 @@ static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
 
 	spin_unlock(&rblk->lock);
 
-	rrpc->rev_trans_map[a->addr - rrpc->poffset].addr = ADDR_EMPTY;
+	rrpc->rev_trans_map[a->addr].addr = ADDR_EMPTY;
 }
 
 static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba,
@@ -127,28 +127,25 @@ static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_lun *rlun = rblk->rlun;
-	struct nvm_lun *lun = rlun->parent;
 
-	return lun->id * dev->geo.sec_per_blk;
+	return rlun->id * dev->geo.sec_per_blk;
 }
 
-/* Calculate global addr for the given block */
-static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
+static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev,
+					 struct rrpc_addr *gp)
 {
-	struct nvm_tgt_dev *dev = rrpc->dev;
-	struct nvm_geo *geo = &dev->geo;
+	struct rrpc_block *rblk = gp->rblk;
 	struct rrpc_lun *rlun = rblk->rlun;
-	struct nvm_lun *lun = rlun->parent;
-
-	return lun->id * geo->sec_per_lun + rblk->id * geo->sec_per_blk;
-}
-
-static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev, u64 addr)
-{
+	u64 addr = gp->addr;
 	struct ppa_addr paddr;
 
 	paddr.ppa = addr;
-	return linear_to_generic_addr(&dev->geo, paddr);
+	paddr = rrpc_linear_to_generic_addr(&dev->geo, paddr);
+	paddr.g.ch = rlun->bppa.g.ch;
+	paddr.g.lun = rlun->bppa.g.lun;
+	paddr.g.blk = rblk->id;
+
+	return paddr;
 }
 
 /* requires lun->lock taken */
@@ -216,7 +213,6 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
 static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
 	struct rrpc_lun *rlun = rblk->rlun;
-	struct nvm_lun *lun = rlun->parent;
 
 	spin_lock(&rlun->lock);
 	if (rblk->state & NVM_BLK_ST_TGT) {
@@ -229,8 +225,8 @@ static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
 	} else {
 		WARN_ON_ONCE(1);
 		pr_err("rrpc: erroneous type (ch:%d,lun:%d,blk%d-> %u)\n",
-						lun->chnl_id, lun->lun_id,
-						rblk->id, rblk->state);
+					rlun->bppa.g.ch, rlun->bppa.g.lun,
+					rblk->id, rblk->state);
 		list_move_tail(&rblk->list, &rlun->bb_list);
 	}
 	spin_unlock(&rlun->lock);
@@ -336,7 +332,7 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
 try:
 		spin_lock(&rrpc->rev_lock);
 		/* Get logical address from physical to logical table */
-		rev = &rrpc->rev_trans_map[phys_addr - rrpc->poffset];
+		rev = &rrpc->rev_trans_map[phys_addr];
 		/* already updated by previous regular write */
 		if (rev->addr == ADDR_EMPTY) {
 			spin_unlock(&rrpc->rev_lock);
@@ -423,18 +419,18 @@ static void rrpc_block_gc(struct work_struct *work)
 
 	mempool_free(gcb, rrpc->gcb_pool);
 	pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' being reclaimed\n",
-			rlun->parent->chnl_id, rlun->parent->lun_id,
+			rlun->bppa.g.ch, rlun->bppa.g.lun,
 			rblk->id);
 
 	if (rrpc_move_valid_pages(rrpc, rblk))
 		goto put_back;
 
 	ppa.ppa = 0;
-	ppa.g.ch = rlun->parent->chnl_id;
-	ppa.g.lun = rlun->parent->lun_id;
+	ppa.g.ch = rlun->bppa.g.ch;
+	ppa.g.lun = rlun->bppa.g.lun;
 	ppa.g.blk = rblk->id;
 
-	if (nvm_erase_blk(dev->parent, &ppa, 0))
+	if (nvm_erase_blk(dev, &ppa, 0))
 		goto put_back;
 
 	rrpc_put_blk(rrpc, rblk);
@@ -506,8 +502,7 @@ static void rrpc_lun_gc(struct work_struct *work)
 		WARN_ON(!block_is_full(rrpc, rblk));
 
 		pr_debug("rrpc: selected block 'ch:%d,lun:%d,blk:%d' for GC\n",
-					rlun->parent->chnl_id,
-					rlun->parent->lun_id,
+					rlun->bppa.g.ch, rlun->bppa.g.lun,
 					rblk->id);
 
 		gcb->rrpc = rrpc;
@@ -537,8 +532,7 @@ static void rrpc_gc_queue(struct work_struct *work)
 
 	mempool_free(gcb, rrpc->gcb_pool);
 	pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' full, allow GC (sched)\n",
-					rlun->parent->chnl_id,
-					rlun->parent->lun_id,
+					rlun->bppa.g.ch, rlun->bppa.g.lun,
 					rblk->id);
 }
 
@@ -586,7 +580,7 @@ static struct rrpc_addr *rrpc_update_map(struct rrpc *rrpc, sector_t laddr,
 	gp->addr = paddr;
 	gp->rblk = rblk;
 
-	rev = &rrpc->rev_trans_map[gp->addr - rrpc->poffset];
+	rev = &rrpc->rev_trans_map[gp->addr];
 	rev->addr = laddr;
 	spin_unlock(&rrpc->rev_lock);
 
@@ -601,7 +595,7 @@ static u64 rrpc_alloc_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
 	if (block_is_full(rrpc, rblk))
 		goto out;
 
-	addr = block_to_addr(rrpc, rblk) + rblk->next_page;
+	addr = rblk->next_page;
 
 	rblk->next_page++;
 out:
@@ -615,18 +609,22 @@ static u64 rrpc_alloc_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
  * Returns rrpc_addr with the physical address and block. Returns NULL if no
  * blocks in the next rlun are available.
  */
-static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
+static struct ppa_addr rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
 								int is_gc)
 {
+	struct nvm_tgt_dev *tgt_dev = rrpc->dev;
 	struct rrpc_lun *rlun;
 	struct rrpc_block *rblk, **cur_rblk;
+	struct rrpc_addr *p;
+	struct ppa_addr ppa;
 	u64 paddr;
 	int gc_force = 0;
 
+	ppa.ppa = ADDR_EMPTY;
 	rlun = rrpc_get_lun_rr(rrpc, is_gc);
 
 	if (!is_gc && rlun->nr_free_blocks < rrpc->nr_luns * 4)
-		return NULL;
+		return ppa;
 
 	/*
 	 * page allocation steps:
@@ -683,10 +681,15 @@ static struct rrpc_addr *rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
 	}
 
 	pr_err("rrpc: failed to allocate new block\n");
-	return NULL;
+	return ppa;
 done:
 	spin_unlock(&rlun->lock);
-	return rrpc_update_map(rrpc, laddr, rblk, paddr);
+	p = rrpc_update_map(rrpc, laddr, rblk, paddr);
+	if (!p)
+		return ppa;
+
+	/* return global address */
+	return rrpc_ppa_to_gaddr(tgt_dev, p);
 }
 
 static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
@@ -712,8 +715,8 @@ static struct rrpc_lun *rrpc_ppa_to_lun(struct rrpc *rrpc, struct ppa_addr p)
 	int i;
 
 	for (i = 0; i < rrpc->nr_luns; i++) {
-		if (rrpc->luns[i].parent->chnl_id == p.g.ch &&
-				rrpc->luns[i].parent->lun_id == p.g.lun) {
+		if (rrpc->luns[i].bppa.g.ch == p.g.ch &&
+				rrpc->luns[i].bppa.g.lun == p.g.lun) {
 			rlun = &rrpc->luns[i];
 			break;
 		}
@@ -823,7 +826,7 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 		gp = &rrpc->trans_map[laddr + i];
 
 		if (gp->rblk) {
-			rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, gp->addr);
+			rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, gp);
 		} else {
 			BUG_ON(is_gc);
 			rrpc_unlock_laddr(rrpc, r);
@@ -852,7 +855,7 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
 	gp = &rrpc->trans_map[laddr];
 
 	if (gp->rblk) {
-		rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, gp->addr);
+		rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, gp);
 	} else {
 		BUG_ON(is_gc);
 		rrpc_unlock_rq(rrpc, rqd);
@@ -869,7 +872,7 @@ static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
-	struct rrpc_addr *p;
+	struct ppa_addr p;
 	sector_t laddr = rrpc_get_laddr(bio);
 	int is_gc = flags & NVM_IOTYPE_GC;
 	int i;
@@ -882,7 +885,7 @@ static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 	for (i = 0; i < npages; i++) {
 		/* We assume that mapping occurs at 4KB granularity */
 		p = rrpc_map_page(rrpc, laddr + i, is_gc);
-		if (!p) {
+		if (p.ppa == ADDR_EMPTY) {
 			BUG_ON(is_gc);
 			rrpc_unlock_laddr(rrpc, r);
 			nvm_dev_dma_free(dev->parent, rqd->ppa_list,
@@ -891,7 +894,7 @@ static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 			return NVM_IO_REQUEUE;
 		}
 
-		rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, p->addr);
+		rqd->ppa_list[i] = p;
 	}
 
 	rqd->opcode = NVM_OP_HBWRITE;
@@ -902,7 +905,7 @@ static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
 static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
 				struct nvm_rq *rqd, unsigned long flags)
 {
-	struct rrpc_addr *p;
+	struct ppa_addr p;
 	int is_gc = flags & NVM_IOTYPE_GC;
 	sector_t laddr = rrpc_get_laddr(bio);
 
@@ -910,14 +913,14 @@ static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
 		return NVM_IO_REQUEUE;
 
 	p = rrpc_map_page(rrpc, laddr, is_gc);
-	if (!p) {
+	if (p.ppa == ADDR_EMPTY) {
 		BUG_ON(is_gc);
 		rrpc_unlock_rq(rrpc, rqd);
 		rrpc_gc_kick(rrpc);
 		return NVM_IO_REQUEUE;
 	}
 
-	rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, p->addr);
+	rqd->ppa_addr = p;
 	rqd->opcode = NVM_OP_HBWRITE;
 
 	return NVM_IO_OK;
@@ -973,7 +976,7 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 	rqd->nr_ppas = nr_pages;
 	rrq->flags = flags;
 
-	err = nvm_submit_io(dev->parent, rqd);
+	err = nvm_submit_io(dev, rqd);
 	if (err) {
 		pr_err("rrpc: I/O submission failed: %d\n", err);
 		bio_put(bio);
@@ -1113,10 +1116,7 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
 
 		div_u64_rem(pba, rrpc->nr_sects, &mod);
 
-		addr[i].addr = pba;
-		raddr[mod].addr = slba + i;
-
-		gaddr = rrpc_ppa_to_gaddr(dev, pba);
+		gaddr = rrpc_recov_addr(dev->parent, pba);
 		rlun = rrpc_ppa_to_lun(rrpc, gaddr);
 		if (!rlun) {
 			pr_err("rrpc: l2p corruption on lba %llu\n",
@@ -1134,6 +1134,10 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
 			rblk->state = NVM_BLK_ST_TGT;
 			rlun->nr_free_blocks--;
 		}
+
+		addr[i].addr = pba;
+		addr[i].rblk = rblk;
+		raddr[mod].addr = slba + i;
 	}
 
 	return 0;
@@ -1230,7 +1234,6 @@ static void rrpc_core_free(struct rrpc *rrpc)
 
 static void rrpc_luns_free(struct rrpc *rrpc)
 {
-	struct nvm_lun *lun;
 	struct rrpc_lun *rlun;
 	int i;
 
@@ -1239,9 +1242,6 @@ static void rrpc_luns_free(struct rrpc *rrpc)
 
 	for (i = 0; i < rrpc->nr_luns; i++) {
 		rlun = &rrpc->luns[i];
-		lun = rlun->parent;
-		if (!lun)
-			break;
 		vfree(rlun->blocks);
 	}
 
@@ -1264,8 +1264,8 @@ static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
 		return -ENOMEM;
 
 	ppa.ppa = 0;
-	ppa.g.ch = rlun->parent->chnl_id;
-	ppa.g.lun = rlun->parent->lun_id;
+	ppa.g.ch = rlun->bppa.g.ch;
+	ppa.g.lun = rlun->bppa.g.lun;
 
 	ret = nvm_get_bb_tbl(dev->parent, ppa, blks);
 	if (ret) {
@@ -1293,11 +1293,17 @@ static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
 	return ret;
 }
 
-static int rrpc_luns_init(struct rrpc *rrpc, struct list_head *lun_list)
+static void rrpc_set_lun_ppa(struct rrpc_lun *rlun, struct ppa_addr ppa)
+{
+	rlun->bppa.ppa = 0;
+	rlun->bppa.g.ch = ppa.g.ch;
+	rlun->bppa.g.lun = ppa.g.lun;
+}
+
+static int rrpc_luns_init(struct rrpc *rrpc, struct ppa_addr *luns)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct nvm_geo *geo = &dev->geo;
-	struct nvm_lun *lun;
 	struct rrpc_lun *rlun;
 	int i, j, ret = -EINVAL;
 
@@ -1313,12 +1319,11 @@ static int rrpc_luns_init(struct rrpc *rrpc, struct list_head *lun_list)
 	if (!rrpc->luns)
 		return -ENOMEM;
 
-	i = 0;
-
 	/* 1:1 mapping */
-	list_for_each_entry(lun, lun_list, list) {
-		rlun = &rrpc->luns[i++];
-		rlun->parent = lun;
+	for (i = 0; i < rrpc->nr_luns; i++) {
+		rlun = &rrpc->luns[i];
+		rlun->id = i;
+		rrpc_set_lun_ppa(rlun, luns[i]);
 		rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
 							geo->blks_per_lun);
 		if (!rlun->blocks) {
@@ -1356,8 +1361,6 @@ static int rrpc_luns_init(struct rrpc *rrpc, struct list_head *lun_list)
 		spin_lock_init(&rlun->lock);
 	}
 
-	WARN_ON(i != rrpc->nr_luns);
-
 	return 0;
 err:
 	return ret;
@@ -1511,14 +1514,12 @@ static int rrpc_luns_configure(struct rrpc *rrpc)
 
 static struct nvm_tgt_type tt_rrpc;
 
-static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
-						struct list_head *lun_list)
+static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk)
 {
 	struct request_queue *bqueue = dev->q;
 	struct request_queue *tqueue = tdisk->queue;
 	struct nvm_geo *geo = &dev->geo;
 	struct rrpc *rrpc;
-	int lun_begin = (list_first_entry(lun_list, struct nvm_lun, list))->id;
 	sector_t soffset;
 	int ret;
 
@@ -1553,14 +1554,12 @@ static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 	}
 	rrpc->soffset = soffset;
 
-	ret = rrpc_luns_init(rrpc, lun_list);
+	ret = rrpc_luns_init(rrpc, dev->luns);
 	if (ret) {
 		pr_err("nvm: rrpc: could not initialize luns\n");
 		goto err;
 	}
 
-	rrpc->poffset = geo->sec_per_lun * lun_begin;
-
 	ret = rrpc_core_init(rrpc);
 	if (ret) {
 		pr_err("nvm: rrpc: could not initialize core\n");
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index c55cec9a1a2638..bc8adba4d63f33 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -74,7 +74,9 @@ struct rrpc_block {
 
 struct rrpc_lun {
 	struct rrpc *rrpc;
-	struct nvm_lun *parent;
+
+	int id;
+	struct ppa_addr bppa;
 
 	struct rrpc_block *cur, *gc_cur;
 	struct rrpc_block *blocks;	/* Reference to block allocation */
@@ -107,7 +109,6 @@ struct rrpc {
 	struct gendisk *disk;
 
 	sector_t soffset; /* logical sector offset */
-	u64 poffset; /* physical page offset */
 
 	int nr_luns;
 	struct rrpc_lun *luns;
@@ -164,14 +165,37 @@ struct rrpc_rev_addr {
 	u64 addr;
 };
 
+static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo,
+							  struct ppa_addr r)
+{
+	struct ppa_addr l;
+	int secs, pgs;
+	sector_t ppa = r.ppa;
+
+	l.ppa = 0;
+
+	div_u64_rem(ppa, geo->sec_per_pg, &secs);
+	l.g.sec = secs;
+
+	sector_div(ppa, geo->sec_per_pg);
+	div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
+	l.g.pg = pgs;
+
+	return l;
+}
+
+static inline struct ppa_addr rrpc_recov_addr(struct nvm_dev *dev, u64 pba)
+{
+	return linear_to_generic_addr(&dev->geo, pba);
+}
+
 static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct rrpc_lun *rlun = rblk->rlun;
-	struct nvm_lun *lun = rlun->parent;
 
-	return (lun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk);
+	return (rlun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk);
 }
 
 static inline sector_t rrpc_get_laddr(struct bio *bio)
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 1cdc8124c8c06a..588d4a34c08349 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -371,6 +371,9 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
 			return -EINVAL;
 		}
 
+		/* Transform physical address to target address space */
+		nvmdev->mt->part_to_tgt(nvmdev, entries, cmd_nlb);
+
 		if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
 			ret = -EINTR;
 			goto out;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index cc210cc85c6d0c..2222853ef96963 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -47,6 +47,7 @@ struct ppa_addr {
 struct nvm_rq;
 struct nvm_id;
 struct nvm_dev;
+struct nvm_tgt_dev;
 
 typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *);
 typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *);
@@ -210,7 +211,6 @@ struct nvm_id {
 
 struct nvm_target {
 	struct list_head list;
-	struct list_head lun_list;
 	struct nvm_tgt_dev *dev;
 	struct nvm_tgt_type *type;
 	struct gendisk *disk;
@@ -231,7 +231,7 @@ typedef void (nvm_end_io_fn)(struct nvm_rq *);
 
 struct nvm_rq {
 	struct nvm_tgt_instance *ins;
-	struct nvm_dev *dev;
+	struct nvm_tgt_dev *dev;
 
 	struct bio *bio;
 
@@ -266,15 +266,6 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata)
 	return rqdata + 1;
 }
 
-struct nvm_lun {
-	int id;
-
-	int lun_id;
-	int chnl_id;
-
-	struct list_head list;
-};
-
 enum {
 	NVM_BLK_ST_FREE =	0x1,	/* Free block */
 	NVM_BLK_ST_TGT =	0x2,	/* Block in use by target */
@@ -321,6 +312,9 @@ struct nvm_tgt_dev {
 	/* Device information */
 	struct nvm_geo geo;
 
+	/* Base ppas for target LUNs */
+	struct ppa_addr *luns;
+
 	sector_t total_secs;
 
 	struct nvm_id identity;
@@ -330,6 +324,7 @@ struct nvm_tgt_dev {
 	struct nvm_dev_ops *ops;
 
 	void *parent;
+	void *map;
 };
 
 struct nvm_dev {
@@ -363,16 +358,18 @@ struct nvm_dev {
 	char name[DISK_NAME_LEN];
 	void *private_data;
 
+	void *rmap;
+
 	struct mutex mlock;
 	spinlock_t lock;
 };
 
 static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo,
-							struct ppa_addr r)
+						     u64 pba)
 {
 	struct ppa_addr l;
 	int secs, pgs, blks, luns;
-	sector_t ppa = r.ppa;
+	sector_t ppa = pba;
 
 	l.ppa = 0;
 
@@ -465,8 +462,7 @@ static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg)
 
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
-typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
-				struct list_head *lun_list);
+typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *);
 typedef void (nvm_tgt_exit_fn)(void *);
 
 struct nvm_tgt_type {
@@ -499,10 +495,11 @@ typedef void (nvmm_unregister_fn)(struct nvm_dev *);
 
 typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *);
 typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *);
-typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct ppa_addr *, int);
+typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *);
+typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int);
 typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
 typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t);
+typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int);
 
 struct nvmm_type {
 	const char *name;
@@ -520,6 +517,8 @@ struct nvmm_type {
 	nvmm_get_area_fn *get_area;
 	nvmm_put_area_fn *put_area;
 
+	nvmm_part_to_tgt_fn *part_to_tgt;
+
 	struct list_head list;
 };
 
@@ -533,14 +532,18 @@ extern void nvm_unregister(struct nvm_dev *);
 extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas,
 							int nr_ppas, int type);
 
-extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *);
+extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
 extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
 extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
 					const struct ppa_addr *, int, int);
 extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int);
-extern int nvm_erase_blk(struct nvm_dev *, struct ppa_addr *, int);
+extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern int nvm_get_l2p_tbl(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *,
+			   void *);
+extern int nvm_get_area(struct nvm_dev *, sector_t *, sector_t);
+extern void nvm_put_area(struct nvm_dev *, sector_t);
 extern void nvm_end_io(struct nvm_rq *, int);
 extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
 								void *, int);

From 959e911b31981b52ed3f3d6e351b107bcb9163ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:11 +0100
Subject: [PATCH 136/182] lightnvm: introduce helpers for generic ops in rrpc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid calling media manager and device-specific operations directly from
rrpc. Create helper functions on lightnvm's core instead.

Signed-off-by: Javier González <javier@cnexlabs.com>

Made it work with null_blk as well.
Signed-off-by: Matias Bjørling <m@bjorling.me>

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c   | 22 ++++++++++++++++++++++
 drivers/lightnvm/gennvm.c |  2 --
 drivers/lightnvm/rrpc.c   | 25 ++++++++++++-------------
 include/linux/lightnvm.h  |  5 +----
 4 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 23d582f8221973..69f261e650bcad 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -219,6 +219,28 @@ int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags)
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
+int nvm_get_l2p_tbl(struct nvm_dev *dev, u64 slba, u32 nlb,
+		    nvm_l2p_update_fn *update_l2p, void *priv)
+{
+	if (!dev->ops->get_l2p_tbl)
+		return 0;
+
+	return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv);
+}
+EXPORT_SYMBOL(nvm_get_l2p_tbl);
+
+int nvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
+{
+	return dev->mt->get_area(dev, lba, len);
+}
+EXPORT_SYMBOL(nvm_get_area);
+
+void nvm_put_area(struct nvm_dev *dev, sector_t lba)
+{
+	dev->mt->put_area(dev, lba);
+}
+EXPORT_SYMBOL(nvm_put_area);
+
 void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd)
 {
 	int i;
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index 5d7c8c47bef8c4..befa8281ab3f22 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -175,8 +175,6 @@ static struct nvm_tgt_dev *gen_create_tgt_dev(struct nvm_dev *dev,
 	tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1;
 	tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
 	tgt_dev->q = dev->q;
-	tgt_dev->ops = dev->ops;
-	tgt_dev->mt = dev->mt;
 	tgt_dev->map = dev_map;
 	tgt_dev->luns = luns;
 	memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 75ed12ae0f4712..e4678b749ba525 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -1166,11 +1166,8 @@ static int rrpc_map_init(struct rrpc *rrpc)
 		r->addr = ADDR_EMPTY;
 	}
 
-	if (!dev->ops->get_l2p_tbl)
-		return 0;
-
 	/* Bring up the mapping table from device */
-	ret = dev->ops->get_l2p_tbl(dev->parent, rrpc->soffset, rrpc->nr_sects,
+	ret = nvm_get_l2p_tbl(dev->parent, rrpc->soffset, rrpc->nr_sects,
 					rrpc_l2p_update, rrpc);
 	if (ret) {
 		pr_err("nvm: rrpc: could not read L2P table.\n");
@@ -1258,6 +1255,9 @@ static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
 	int i;
 	int ret;
 
+	if (!dev->parent->ops->get_bb_tbl)
+		return 0;
+
 	nr_blks = geo->blks_per_lun * geo->plane_mode;
 	blks = kmalloc(nr_blks, GFP_KERNEL);
 	if (!blks)
@@ -1277,7 +1277,6 @@ static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
 	if (nr_blks < 0)
 		return nr_blks;
 
-	rlun->nr_free_blocks = geo->blks_per_lun;
 	for (i = 0; i < nr_blks; i++) {
 		if (blks[i] == NVM_BLK_T_FREE)
 			continue;
@@ -1348,17 +1347,19 @@ static int rrpc_luns_init(struct rrpc *rrpc, struct ppa_addr *luns)
 			list_add_tail(&rblk->list, &rlun->free_list);
 		}
 
-		if (rrpc_bb_discovery(dev, rlun))
-			goto err;
-
+		rlun->rrpc = rrpc;
+		rlun->nr_free_blocks = geo->blks_per_lun;
 		rlun->reserved_blocks = 2; /* for GC only */
 
-		rlun->rrpc = rrpc;
 		INIT_LIST_HEAD(&rlun->prio_list);
 		INIT_LIST_HEAD(&rlun->wblk_list);
 
 		INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
 		spin_lock_init(&rlun->lock);
+
+		if (rrpc_bb_discovery(dev, rlun))
+			goto err;
+
 	}
 
 	return 0;
@@ -1370,13 +1371,12 @@ static int rrpc_luns_init(struct rrpc *rrpc, struct ppa_addr *luns)
 static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
-	struct nvmm_type *mt = dev->mt;
 	sector_t size = rrpc->nr_sects * dev->geo.sec_size;
 	int ret;
 
 	size >>= 9;
 
-	ret = mt->get_area(dev->parent, begin, size);
+	ret = nvm_get_area(dev->parent, begin, size);
 	if (!ret)
 		*begin >>= (ilog2(dev->geo.sec_size) - 9);
 
@@ -1386,10 +1386,9 @@ static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
 static void rrpc_area_free(struct rrpc *rrpc)
 {
 	struct nvm_tgt_dev *dev = rrpc->dev;
-	struct nvmm_type *mt = dev->mt;
 	sector_t begin = rrpc->soffset << (ilog2(dev->geo.sec_size) - 9);
 
-	mt->put_area(dev->parent, begin);
+	nvm_put_area(dev->parent, begin);
 }
 
 static void rrpc_free(struct rrpc *rrpc)
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 2222853ef96963..99cd1e70e451e8 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -320,10 +320,7 @@ struct nvm_tgt_dev {
 	struct nvm_id identity;
 	struct request_queue *q;
 
-	struct nvmm_type *mt;
-	struct nvm_dev_ops *ops;
-
-	void *parent;
+	struct nvm_dev *parent;
 	void *map;
 };
 

From a279006afa3377493c4240395c70430f2a9b0d2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:12 +0100
Subject: [PATCH 137/182] lightnvm: introduce max_phys_sects helper function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Target devices do not have access to the device driver operations.
Introduce a helper function that exposes the max. number of physical
sectors supported by the underlying device.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 8 ++++++++
 include/linux/lightnvm.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 69f261e650bcad..3d8eaac99ba804 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -203,6 +203,14 @@ int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
 }
 EXPORT_SYMBOL(nvm_set_bb_tbl);
 
+int nvm_max_phys_sects(struct nvm_tgt_dev *tgt_dev)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+
+	return dev->ops->max_phys_sect;
+}
+EXPORT_SYMBOL(nvm_max_phys_sects);
+
 int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
 	struct nvm_dev *dev = tgt_dev->parent;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 99cd1e70e451e8..96375317b4795e 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -529,6 +529,7 @@ extern void nvm_unregister(struct nvm_dev *);
 extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas,
 							int nr_ppas, int type);
 
+extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
 extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
 extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
 extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *);

From da2d7cb828ce2714c603827ac5a6e1c98a02e861 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <jg@lightnvm.io>
Date: Mon, 28 Nov 2016 22:39:13 +0100
Subject: [PATCH 138/182] lightnvm: use target nvm on target-specific ops.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On target-specific operations pass on nvm_tgt_dev instead of the generic
nvm device.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c  | 15 ++++++++++-----
 drivers/lightnvm/rrpc.c  | 14 +++++++-------
 drivers/lightnvm/rrpc.h  |  2 +-
 include/linux/lightnvm.h |  6 +++---
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 3d8eaac99ba804..07bf989d2f776a 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -86,8 +86,7 @@ void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags,
 }
 EXPORT_SYMBOL(nvm_dev_dma_alloc);
 
-void nvm_dev_dma_free(struct nvm_dev *dev, void *addr,
-							dma_addr_t dma_handler)
+void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler)
 {
 	dev->ops->dev_dma_free(dev->dma_pool, addr, dma_handler);
 }
@@ -227,9 +226,11 @@ int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p, int flags)
 }
 EXPORT_SYMBOL(nvm_erase_blk);
 
-int nvm_get_l2p_tbl(struct nvm_dev *dev, u64 slba, u32 nlb,
+int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
 		    nvm_l2p_update_fn *update_l2p, void *priv)
 {
+	struct nvm_dev *dev = tgt_dev->parent;
+
 	if (!dev->ops->get_l2p_tbl)
 		return 0;
 
@@ -237,14 +238,18 @@ int nvm_get_l2p_tbl(struct nvm_dev *dev, u64 slba, u32 nlb,
 }
 EXPORT_SYMBOL(nvm_get_l2p_tbl);
 
-int nvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len)
+int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len)
 {
+	struct nvm_dev *dev = tgt_dev->parent;
+
 	return dev->mt->get_area(dev, lba, len);
 }
 EXPORT_SYMBOL(nvm_get_area);
 
-void nvm_put_area(struct nvm_dev *dev, sector_t lba)
+void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t lba)
 {
+	struct nvm_dev *dev = tgt_dev->parent;
+
 	dev->mt->put_area(dev, lba);
 }
 EXPORT_SYMBOL(nvm_put_area);
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index e4678b749ba525..8a27bcc62f23e7 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -983,8 +983,8 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
 		if (!(flags & NVM_IOTYPE_GC)) {
 			rrpc_unlock_rq(rrpc, rqd);
 			if (rqd->nr_ppas > 1)
-				nvm_dev_dma_free(dev->parent,
-					rqd->ppa_list, rqd->dma_ppa_list);
+				nvm_dev_dma_free(dev->parent, rqd->ppa_list,
+							rqd->dma_ppa_list);
 		}
 		return NVM_IO_ERR;
 	}
@@ -1116,7 +1116,7 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
 
 		div_u64_rem(pba, rrpc->nr_sects, &mod);
 
-		gaddr = rrpc_recov_addr(dev->parent, pba);
+		gaddr = rrpc_recov_addr(dev, pba);
 		rlun = rrpc_ppa_to_lun(rrpc, gaddr);
 		if (!rlun) {
 			pr_err("rrpc: l2p corruption on lba %llu\n",
@@ -1167,8 +1167,8 @@ static int rrpc_map_init(struct rrpc *rrpc)
 	}
 
 	/* Bring up the mapping table from device */
-	ret = nvm_get_l2p_tbl(dev->parent, rrpc->soffset, rrpc->nr_sects,
-					rrpc_l2p_update, rrpc);
+	ret = nvm_get_l2p_tbl(dev, rrpc->soffset, rrpc->nr_sects,
+							rrpc_l2p_update, rrpc);
 	if (ret) {
 		pr_err("nvm: rrpc: could not read L2P table.\n");
 		return -EINVAL;
@@ -1376,7 +1376,7 @@ static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
 
 	size >>= 9;
 
-	ret = nvm_get_area(dev->parent, begin, size);
+	ret = nvm_get_area(dev, begin, size);
 	if (!ret)
 		*begin >>= (ilog2(dev->geo.sec_size) - 9);
 
@@ -1388,7 +1388,7 @@ static void rrpc_area_free(struct rrpc *rrpc)
 	struct nvm_tgt_dev *dev = rrpc->dev;
 	sector_t begin = rrpc->soffset << (ilog2(dev->geo.sec_size) - 9);
 
-	nvm_put_area(dev->parent, begin);
+	nvm_put_area(dev, begin);
 }
 
 static void rrpc_free(struct rrpc *rrpc)
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
index bc8adba4d63f33..94e4d73116b294 100644
--- a/drivers/lightnvm/rrpc.h
+++ b/drivers/lightnvm/rrpc.h
@@ -184,7 +184,7 @@ static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo,
 	return l;
 }
 
-static inline struct ppa_addr rrpc_recov_addr(struct nvm_dev *dev, u64 pba)
+static inline struct ppa_addr rrpc_recov_addr(struct nvm_tgt_dev *dev, u64 pba)
 {
 	return linear_to_generic_addr(&dev->geo, pba);
 }
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 96375317b4795e..e76f9c4aa49b26 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -538,10 +538,10 @@ extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
 extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
 extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int);
 extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int);
-extern int nvm_get_l2p_tbl(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *,
+extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
 			   void *);
-extern int nvm_get_area(struct nvm_dev *, sector_t *, sector_t);
-extern void nvm_put_area(struct nvm_dev *, sector_t);
+extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
+extern void nvm_put_area(struct nvm_tgt_dev *, sector_t);
 extern void nvm_end_io(struct nvm_rq *, int);
 extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int,
 								void *, int);

From 333ba053d145d6f9152f6b0311a345b876f0fed1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <javier@cnexlabs.com>
Date: Mon, 28 Nov 2016 22:39:14 +0100
Subject: [PATCH 139/182] lightnvm: transform target get/set bad block
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since targets are given a virtual target device, it is necessary to
translate all communication between targets and the backend device.
Implement the translation layer for get/set bad block table.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c   | 58 +++++++++++++++++++++++++++++++++++++++
 drivers/lightnvm/gennvm.c | 19 +++++++++----
 drivers/lightnvm/rrpc.c   |  4 +--
 include/linux/lightnvm.h  | 15 ++++++++--
 4 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 07bf989d2f776a..7622e3dc5d82e0 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -175,6 +175,26 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name)
 	return NULL;
 }
 
+static void nvm_tgt_generic_to_addr_mode(struct nvm_tgt_dev *tgt_dev,
+					 struct nvm_rq *rqd)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	int i;
+
+	if (rqd->nr_ppas > 1) {
+		for (i = 0; i < rqd->nr_ppas; i++) {
+			rqd->ppa_list[i] = dev->mt->trans_ppa(tgt_dev,
+					rqd->ppa_list[i], TRANS_TGT_TO_DEV);
+			rqd->ppa_list[i] = generic_to_dev_addr(dev,
+							rqd->ppa_list[i]);
+		}
+	} else {
+		rqd->ppa_addr = dev->mt->trans_ppa(tgt_dev, rqd->ppa_addr,
+						TRANS_TGT_TO_DEV);
+		rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr);
+	}
+}
+
 int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
 								int type)
 {
@@ -202,6 +222,34 @@ int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas,
 }
 EXPORT_SYMBOL(nvm_set_bb_tbl);
 
+int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
+		       int nr_ppas, int type)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_rq rqd;
+	int ret;
+
+	if (nr_ppas > dev->ops->max_phys_sect) {
+		pr_err("nvm: unable to update all blocks atomically\n");
+		return -EINVAL;
+	}
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+
+	nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
+	nvm_tgt_generic_to_addr_mode(tgt_dev, &rqd);
+
+	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
+	nvm_free_rqd_ppalist(dev, &rqd);
+	if (ret) {
+		pr_err("nvm: sysblk failed bb mark\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(nvm_set_tgt_bb_tbl);
+
 int nvm_max_phys_sects(struct nvm_tgt_dev *tgt_dev)
 {
 	struct nvm_dev *dev = tgt_dev->parent;
@@ -519,6 +567,16 @@ int nvm_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, u8 *blks)
 }
 EXPORT_SYMBOL(nvm_get_bb_tbl);
 
+int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
+		       u8 *blks)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+
+	ppa = dev->mt->trans_ppa(tgt_dev, ppa, TRANS_TGT_TO_DEV);
+	return nvm_get_bb_tbl(dev, ppa, blks);
+}
+EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
+
 static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
 {
 	struct nvm_geo *geo = &dev->geo;
diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c
index befa8281ab3f22..ca7880082d8064 100644
--- a/drivers/lightnvm/gennvm.c
+++ b/drivers/lightnvm/gennvm.c
@@ -482,12 +482,6 @@ static void gen_unregister(struct nvm_dev *dev)
 	module_put(THIS_MODULE);
 }
 
-enum {
-	TRANS_TGT_TO_DEV =	0x0,
-	TRANS_DEV_TO_TGT =	0x1,
-};
-
-
 static int gen_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
 {
 	struct gen_dev_map *dev_map = tgt_dev->map;
@@ -584,6 +578,18 @@ static int gen_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p,
 	return nvm_erase_ppa(tgt_dev->parent, p, 1, flags);
 }
 
+static struct ppa_addr gen_trans_ppa(struct nvm_tgt_dev *tgt_dev,
+				     struct ppa_addr p, int direction)
+{
+	gen_trans_fn *f;
+	struct ppa_addr ppa = p;
+
+	f = (direction == TRANS_TGT_TO_DEV) ? gen_map_to_dev : gen_map_to_tgt;
+	f(tgt_dev, &ppa);
+
+	return ppa;
+}
+
 static void gen_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
 			       int len)
 {
@@ -631,6 +637,7 @@ static struct nvmm_type gen = {
 	.get_area		= gen_get_area,
 	.put_area		= gen_put_area,
 
+	.trans_ppa		= gen_trans_ppa,
 	.part_to_tgt		= gen_part_to_tgt,
 };
 
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index 8a27bcc62f23e7..9fb7de395915ca 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -735,7 +735,7 @@ static void __rrpc_mark_bad_block(struct rrpc *rrpc, struct ppa_addr ppa)
 	rblk = &rlun->blocks[ppa.g.blk];
 	rblk->state = NVM_BLK_ST_BAD;
 
-	nvm_set_bb_tbl(dev->parent, &ppa, 1, NVM_BLK_T_GRWN_BAD);
+	nvm_set_tgt_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD);
 }
 
 static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
@@ -1267,7 +1267,7 @@ static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
 	ppa.g.ch = rlun->bppa.g.ch;
 	ppa.g.lun = rlun->bppa.g.lun;
 
-	ret = nvm_get_bb_tbl(dev->parent, ppa, blks);
+	ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
 	if (ret) {
 		pr_err("rrpc: could not get BB table\n");
 		goto out;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index e76f9c4aa49b26..7c273bbc5351d5 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -496,8 +496,15 @@ typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *);
 typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int);
 typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t);
 typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t);
+typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *,
+					    struct ppa_addr, int);
 typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int);
 
+enum {
+	TRANS_TGT_TO_DEV =	0x0,
+	TRANS_DEV_TO_TGT =	0x1,
+};
+
 struct nvmm_type {
 	const char *name;
 	unsigned int version[3];
@@ -514,6 +521,7 @@ struct nvmm_type {
 	nvmm_get_area_fn *get_area;
 	nvmm_put_area_fn *put_area;
 
+	nvmm_trans_ppa_fn *trans_ppa;
 	nvmm_part_to_tgt_fn *part_to_tgt;
 
 	struct list_head list;
@@ -526,9 +534,9 @@ extern struct nvm_dev *nvm_alloc_dev(int);
 extern int nvm_register(struct nvm_dev *);
 extern void nvm_unregister(struct nvm_dev *);
 
-extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas,
-							int nr_ppas, int type);
-
+extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int);
+extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
+			      int, int);
 extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
 extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
 extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *);
@@ -549,6 +557,7 @@ extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int,
 							int, void *, int);
 extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int);
 extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *);
+extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *);
 
 /* sysblk.c */
 #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */

From b02d8aaea12463f573c5ef485c638ab12628dc5e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 1 Dec 2016 16:00:25 +0900
Subject: [PATCH 140/182] block: Check partition alignment on zoned block
 devices

Both blkdev_report_zones and blkdev_reset_zones can operate on a partition of
a zoned block device. However, the first and last zones reported for a
partition make sense only if the partition start sector and size are aligned
on the device zone size. The same applies for zone reset. Resetting the first
or the last zone of a partition straddling zones may impact neighboring
partitions. Finally, if a partition start sector is not at the beginning of a
sequential zone, it will be impossible to write to the first sectors of the
partition on a host-managed device.
Avoid all these problems and incoherencies by ignoring partitions that are not
zone aligned.

Note: Even with CONFIG_BLK_DEV_ZONED disabled, bdev_is_zoned() will report the
correct disk zoning type (host-aware, host-managed or none) but
bdev_zone_size() will always return 0 for zoned block devices (i.e. the zone
size is unknown). So test this as a way to ensure that a zoned block device is
being handled as such. As a result, for a host-aware devices, unaligned zone
partitions will be accepted with CONFIG_BLK_DEV_ZONED disabled. That is, the
disk will be treated as a regular block device (as it should). If zoned block
device support is enabled, only aligned partitions will be accepted.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/partition-generic.c | 65 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/block/partition-generic.c b/block/partition-generic.c
index 71d9ed9df8daea..d7beb6bbbf668f 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -430,6 +430,56 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
 	return 0;
 }
 
+static bool part_zone_aligned(struct gendisk *disk,
+			      struct block_device *bdev,
+			      sector_t from, sector_t size)
+{
+	unsigned int zone_size = bdev_zone_size(bdev);
+
+	/*
+	 * If this function is called, then the disk is a zoned block device
+	 * (host-aware or host-managed). This can be detected even if the
+	 * zoned block device support is disabled (CONFIG_BLK_DEV_ZONED not
+	 * set). In this case, however, only host-aware devices will be seen
+	 * as a block device is not created for host-managed devices. Without
+	 * zoned block device support, host-aware drives can still be used as
+	 * regular block devices (no zone operation) and their zone size will
+	 * be reported as 0. Allow this case.
+	 */
+	if (!zone_size)
+		return true;
+
+	/*
+	 * Check partition start and size alignement. If the drive has a
+	 * smaller last runt zone, ignore it and allow the partition to
+	 * use it. Check the zone size too: it should be a power of 2 number
+	 * of sectors.
+	 */
+	if (WARN_ON_ONCE(!is_power_of_2(zone_size))) {
+		u32 rem;
+
+		div_u64_rem(from, zone_size, &rem);
+		if (rem)
+			return false;
+		if ((from + size) < get_capacity(disk)) {
+			div_u64_rem(size, zone_size, &rem);
+			if (rem)
+				return false;
+		}
+
+	} else {
+
+		if (from & (zone_size - 1))
+			return false;
+		if ((from + size) < get_capacity(disk) &&
+		    (size & (zone_size - 1)))
+			return false;
+
+	}
+
+	return true;
+}
+
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
 	struct parsed_partitions *state = NULL;
@@ -529,6 +579,21 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 			}
 		}
 
+		/*
+		 * On a zoned block device, partitions should be aligned on the
+		 * device zone size (i.e. zone boundary crossing not allowed).
+		 * Otherwise, resetting the write pointer of the last zone of
+		 * one partition may impact the following partition.
+		 */
+		if (bdev_is_zoned(bdev) &&
+		    !part_zone_aligned(disk, bdev, from, size)) {
+			printk(KERN_WARNING
+			       "%s: p%d start %llu+%llu is not zone aligned\n",
+			       disk->disk_name, p, (unsigned long long) from,
+			       (unsigned long long) size);
+			continue;
+		}
+
 		part = add_partition(disk, p, from, size,
 				     state->parts[p].flags,
 				     &state->parts[p].info);

From e73c23ff736e1ea371dfa419d7bf8e77ee53044a Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Date: Wed, 30 Nov 2016 12:28:58 -0800
Subject: [PATCH 141/182] block: add async variant of blkdev_issue_zeroout

Similar to __blkdev_issue_discard this variant allows submitting
the final bio asynchronously and chaining multiple ranges
into a single completion.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-lib.c        | 115 +++++++++++++++++++++++++++++------------
 include/linux/blkdev.h |   3 ++
 2 files changed, 84 insertions(+), 34 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 18abda862915ab..bfb28b03765ee3 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -137,24 +137,24 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 EXPORT_SYMBOL(blkdev_issue_discard);
 
 /**
- * blkdev_issue_write_same - queue a write same operation
+ * __blkdev_issue_write_same - generate number of bios with same page
  * @bdev:	target blockdev
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @page:	page containing data to write
+ * @biop:	pointer to anchor bio
  *
  * Description:
- *    Issue a write same request for the sectors in question.
+ *  Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page.
  */
-int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
-			    sector_t nr_sects, gfp_t gfp_mask,
-			    struct page *page)
+static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct page *page,
+		struct bio **biop)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	unsigned int max_write_same_sectors;
-	struct bio *bio = NULL;
-	int ret = 0;
+	struct bio *bio = *biop;
 	sector_t bs_mask;
 
 	if (!q)
@@ -164,6 +164,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	if ((sector | nr_sects) & bs_mask)
 		return -EINVAL;
 
+	if (!bdev_write_same(bdev))
+		return -EOPNOTSUPP;
+
 	/* Ensure that max_write_same_sectors doesn't overflow bi_size */
 	max_write_same_sectors = UINT_MAX >> 9;
 
@@ -185,32 +188,63 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 			bio->bi_iter.bi_size = nr_sects << 9;
 			nr_sects = 0;
 		}
+		cond_resched();
 	}
 
-	if (bio) {
+	*biop = bio;
+	return 0;
+}
+
+/**
+ * blkdev_issue_write_same - queue a write same operation
+ * @bdev:	target blockdev
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to write
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @page:	page containing data
+ *
+ * Description:
+ *    Issue a write same request for the sectors in question.
+ */
+int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+				sector_t nr_sects, gfp_t gfp_mask,
+				struct page *page)
+{
+	struct bio *bio = NULL;
+	struct blk_plug plug;
+	int ret;
+
+	blk_start_plug(&plug);
+	ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page,
+			&bio);
+	if (ret == 0 && bio) {
 		ret = submit_bio_wait(bio);
 		bio_put(bio);
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
 
 /**
- * blkdev_issue_zeroout - generate number of zero filed write bios
+ * __blkdev_issue_zeroout - generate number of zero filed write bios
  * @bdev:	blockdev to issue
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @biop:	pointer to anchor bio
+ * @discard:	discard flag
  *
  * Description:
  *  Generate and issue number of bios with zerofiled pages.
  */
-
-static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-				  sector_t nr_sects, gfp_t gfp_mask)
+int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
+		bool discard)
 {
 	int ret;
-	struct bio *bio = NULL;
+	int bi_size = 0;
+	struct bio *bio = *biop;
 	unsigned int sz;
 	sector_t bs_mask;
 
@@ -218,6 +252,19 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	if ((sector | nr_sects) & bs_mask)
 		return -EINVAL;
 
+	if (discard) {
+		ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
+				BLKDEV_DISCARD_ZERO, biop);
+		if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+			goto out;
+	}
+
+	ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
+			ZERO_PAGE(0), biop);
+	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+		goto out;
+
+	ret = 0;
 	while (nr_sects != 0) {
 		bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES),
 				gfp_mask);
@@ -227,21 +274,20 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 
 		while (nr_sects != 0) {
 			sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
-			ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
-			nr_sects -= ret >> 9;
-			sector += ret >> 9;
-			if (ret < (sz << 9))
+			bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+			nr_sects -= bi_size >> 9;
+			sector += bi_size >> 9;
+			if (bi_size < (sz << 9))
 				break;
 		}
+		cond_resched();
 	}
 
-	if (bio) {
-		ret = submit_bio_wait(bio);
-		bio_put(bio);
-		return ret;
-	}
-	return 0;
+	*biop = bio;
+out:
+	return ret;
 }
+EXPORT_SYMBOL(__blkdev_issue_zeroout);
 
 /**
  * blkdev_issue_zeroout - zero-fill a block range
@@ -263,21 +309,22 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
  *  clearing the block range. Otherwise the zeroing will be performed
  *  using regular WRITE calls.
  */
-
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			 sector_t nr_sects, gfp_t gfp_mask, bool discard)
 {
-	if (discard) {
-		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
-				BLKDEV_DISCARD_ZERO))
-			return 0;
-	}
+	int ret;
+	struct bio *bio = NULL;
+	struct blk_plug plug;
 
-	if (bdev_write_same(bdev) &&
-	    blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
-				    ZERO_PAGE(0)) == 0)
-		return 0;
+	blk_start_plug(&plug);
+	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
+			&bio, discard);
+	if (ret == 0 && bio) {
+		ret = submit_bio_wait(bio);
+		bio_put(bio);
+	}
+	blk_finish_plug(&plug);
 
-	return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
+	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 541fdd8787a5b6..7e9d8a0895beb7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1269,6 +1269,9 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		struct bio **biop);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
+extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
+		bool discard);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, bool discard);
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,

From a6f0788ec2881ac14e97ff7fa6a78a807f87b5ba Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Date: Wed, 30 Nov 2016 12:28:59 -0800
Subject: [PATCH 142/182] block: add support for REQ_OP_WRITE_ZEROES

This adds a new block layer operation to zero out a range of
LBAs. This allows to implement zeroing for devices that don't use
either discard with a predictable zero pattern or WRITE SAME of zeroes.
The prominent example of that is NVMe with the Write Zeroes command,
but in the future, this should also help with improving the way
zeroing discards work. For this operation, suitable entry is exported in
sysfs which indicate the number of maximum bytes allowed in one
write zeroes operation by the device.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/ABI/testing/sysfs-block | 13 ++++++
 block/bio.c                           |  1 +
 block/blk-core.c                      |  4 ++
 block/blk-lib.c                       | 58 ++++++++++++++++++++++++++-
 block/blk-merge.c                     | 17 ++++++--
 block/blk-settings.c                  | 17 ++++++++
 block/blk-sysfs.c                     | 11 +++++
 block/blk-wbt.c                       |  5 ++-
 include/linux/bio.h                   | 25 +++++++-----
 include/linux/blk_types.h             |  2 +
 include/linux/blkdev.h                | 19 +++++++++
 11 files changed, 153 insertions(+), 19 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index ee2d5cd26bfe2d..2da04ce6aeef48 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -235,6 +235,19 @@ Description:
 		write_same_max_bytes is 0, write same is not supported
 		by the device.
 
+What:		/sys/block/<disk>/queue/write_zeroes_max_bytes
+Date:		November 2016
+Contact:	Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
+Description:
+		Devices that support write zeroes operation in which a
+		single request can be issued to zero out the range of
+		contiguous blocks on storage without having any payload
+		in the request. This can be used to optimize writing zeroes
+		to the devices. write_zeroes_max_bytes indicates how many
+		bytes can be written in a single write zeroes command. If
+		write_zeroes_max_bytes is 0, write zeroes is not supported
+		by the device.
+
 What:		/sys/block/<disk>/queue/zoned
 Date:		September 2016
 Contact:	Damien Le Moal <damien.lemoal@hgst.com>
diff --git a/block/bio.c b/block/bio.c
index de257ced69b1df..83db1f37fd0bfb 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -674,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
+	case REQ_OP_WRITE_ZEROES:
 		break;
 	case REQ_OP_WRITE_SAME:
 		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
diff --git a/block/blk-core.c b/block/blk-core.c
index 6c4a425690fc7f..3f2eb8d8018979 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1950,6 +1950,10 @@ generic_make_request_checks(struct bio *bio)
 		if (!bdev_is_zoned(bio->bi_bdev))
 			goto not_supported;
 		break;
+	case REQ_OP_WRITE_ZEROES:
+		if (!bdev_write_zeroes_sectors(bio->bi_bdev))
+			goto not_supported;
+		break;
 	default:
 		break;
 	}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index bfb28b03765ee3..510a6fb1531822 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -226,6 +226,55 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
 
+/**
+ * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
+ * @bdev:	blockdev to issue
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to write
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @biop:	pointer to anchor bio
+ *
+ * Description:
+ *  Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages.
+ */
+static int __blkdev_issue_write_zeroes(struct block_device *bdev,
+		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+		struct bio **biop)
+{
+	struct bio *bio = *biop;
+	unsigned int max_write_zeroes_sectors;
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (!q)
+		return -ENXIO;
+
+	/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
+	max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
+
+	if (max_write_zeroes_sectors == 0)
+		return -EOPNOTSUPP;
+
+	while (nr_sects) {
+		bio = next_bio(bio, 0, gfp_mask);
+		bio->bi_iter.bi_sector = sector;
+		bio->bi_bdev = bdev;
+		bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+
+		if (nr_sects > max_write_zeroes_sectors) {
+			bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
+			nr_sects -= max_write_zeroes_sectors;
+			sector += max_write_zeroes_sectors;
+		} else {
+			bio->bi_iter.bi_size = nr_sects << 9;
+			nr_sects = 0;
+		}
+		cond_resched();
+	}
+
+	*biop = bio;
+	return 0;
+}
+
 /**
  * __blkdev_issue_zeroout - generate number of zero filed write bios
  * @bdev:	blockdev to issue
@@ -259,6 +308,11 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			goto out;
 	}
 
+	ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
+			biop);
+	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+		goto out;
+
 	ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
 			ZERO_PAGE(0), biop);
 	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
@@ -304,8 +358,8 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
  *  the discard request fail, if the discard flag is not set, or if
  *  discard_zeroes_data is not supported, this function will resort to
  *  zeroing the blocks manually, thus provisioning (allocating,
- *  anchoring) them. If the block device supports the WRITE SAME command
- *  blkdev_issue_zeroout() will use it to optimize the process of
+ *  anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
+ *  command(s), blkdev_issue_zeroout() will use it to optimize the process of
  *  clearing the block range. Otherwise the zeroing will be performed
  *  using regular WRITE calls.
  */
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fda6a12fc776b1..cf2848cb91d803 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 	case REQ_OP_SECURE_ERASE:
 		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
 		break;
+	case REQ_OP_WRITE_ZEROES:
+		split = NULL;
+		nsegs = (*bio)->bi_phys_segments;
+		break;
 	case REQ_OP_WRITE_SAME:
 		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
 		break;
@@ -241,11 +245,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	 * This should probably be returning 0, but blk_add_request_payload()
 	 * (Christoph!!!!)
 	 */
-	if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
-		return 1;
-
-	if (bio_op(bio) == REQ_OP_WRITE_SAME)
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE_ZEROES:
 		return 1;
+	default:
+		break;
+	}
 
 	fbio = bio;
 	cluster = blk_queue_cluster(q);
@@ -416,6 +424,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
+	case REQ_OP_WRITE_ZEROES:
 		/*
 		 * This is a hack - drivers should be neither modifying the
 		 * biovec, nor relying on bi_vcnt - but because of
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c7ccabc0ec3ea6..8a2bc124a6840f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_dev_sectors = 0;
 	lim->chunk_sectors = 0;
 	lim->max_write_same_sectors = 0;
+	lim->max_write_zeroes_sectors = 0;
 	lim->max_discard_sectors = 0;
 	lim->max_hw_discard_sectors = 0;
 	lim->discard_granularity = 0;
@@ -132,6 +133,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	lim->max_sectors = UINT_MAX;
 	lim->max_dev_sectors = UINT_MAX;
 	lim->max_write_same_sectors = UINT_MAX;
+	lim->max_write_zeroes_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
 
@@ -299,6 +301,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
 
+/**
+ * blk_queue_max_write_zeroes_sectors - set max sectors for a single
+ *                                      write zeroes
+ * @q:  the request queue for the device
+ * @max_write_zeroes_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+		unsigned int max_write_zeroes_sectors)
+{
+	q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+
 /**
  * blk_queue_max_segments - set max hw segments for a request for this queue
  * @q:  the request queue for the device
@@ -527,6 +542,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_same_sectors = min(t->max_write_same_sectors,
 					b->max_write_same_sectors);
+	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
+					b->max_write_zeroes_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index a9784149176902..706b27bd73a1df 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -211,6 +211,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
 		(unsigned long long)q->limits.max_write_same_sectors << 9);
 }
 
+static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%llu\n",
+		(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
+}
 
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
@@ -611,6 +616,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = {
 	.show = queue_write_same_max_show,
 };
 
+static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
+	.attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
+	.show = queue_write_zeroes_max_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_show_nonrot,
@@ -700,6 +710,7 @@ static struct attribute *default_attrs[] = {
 	&queue_discard_max_hw_entry.attr,
 	&queue_discard_zeroes_data_entry.attr,
 	&queue_write_same_max_entry.attr,
+	&queue_write_zeroes_max_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_zoned_entry.attr,
 	&queue_nomerges_entry.attr,
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index b8647343141f89..d500e43da5d9cc 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
 	const int op = bio_op(bio);
 
 	/*
-	 * If not a WRITE (or a discard), do nothing
+	 * If not a WRITE (or a discard or write zeroes), do nothing
 	 */
-	if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
+	if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
+				op == REQ_OP_WRITE_ZEROES))
 		return false;
 
 	/*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 70a7244f08a72e..b15323934a298e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -76,7 +76,8 @@ static inline bool bio_has_data(struct bio *bio)
 	if (bio &&
 	    bio->bi_iter.bi_size &&
 	    bio_op(bio) != REQ_OP_DISCARD &&
-	    bio_op(bio) != REQ_OP_SECURE_ERASE)
+	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
+	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
 		return true;
 
 	return false;
@@ -86,7 +87,8 @@ static inline bool bio_no_advance_iter(struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
 	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
-	       bio_op(bio) == REQ_OP_WRITE_SAME;
+	       bio_op(bio) == REQ_OP_WRITE_SAME ||
+	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
 static inline bool bio_mergeable(struct bio *bio)
@@ -188,18 +190,19 @@ static inline unsigned bio_segments(struct bio *bio)
 	struct bvec_iter iter;
 
 	/*
-	 * We special case discard/write same, because they interpret bi_size
-	 * differently:
+	 * We special case discard/write same/write zeroes, because they
+	 * interpret bi_size differently:
 	 */
 
-	if (bio_op(bio) == REQ_OP_DISCARD)
-		return 1;
-
-	if (bio_op(bio) == REQ_OP_SECURE_ERASE)
-		return 1;
-
-	if (bio_op(bio) == REQ_OP_WRITE_SAME)
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_SECURE_ERASE:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE_ZEROES:
 		return 1;
+	default:
+		break;
+	}
 
 	bio_for_each_segment(bv, bio, iter)
 		segs++;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f57458a6a93bc3..519ea2c9df612b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -159,6 +159,8 @@ enum req_opf {
 	REQ_OP_ZONE_RESET	= 6,
 	/* write the same sector many times */
 	REQ_OP_WRITE_SAME	= 7,
+	/* write the zero filled sector many times */
+	REQ_OP_WRITE_ZEROES	= 8,
 
 	REQ_OP_LAST,
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7e9d8a0895beb7..ebeef2b79c5ada 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -323,6 +323,7 @@ struct queue_limits {
 	unsigned int		max_discard_sectors;
 	unsigned int		max_hw_discard_sectors;
 	unsigned int		max_write_same_sectors;
+	unsigned int		max_write_zeroes_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
 
@@ -774,6 +775,9 @@ static inline bool rq_mergeable(struct request *rq)
 	if (req_op(rq) == REQ_OP_FLUSH)
 		return false;
 
+	if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+		return false;
+
 	if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
 		return false;
 	if (rq->rq_flags & RQF_NOMERGE_FLAGS)
@@ -1004,6 +1008,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 	if (unlikely(op == REQ_OP_WRITE_SAME))
 		return q->limits.max_write_same_sectors;
 
+	if (unlikely(op == REQ_OP_WRITE_ZEROES))
+		return q->limits.max_write_zeroes_sectors;
+
 	return q->limits.max_sectors;
 }
 
@@ -1107,6 +1114,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
 extern void blk_queue_max_write_same_sectors(struct request_queue *q,
 		unsigned int max_write_same_sectors);
+extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+		unsigned int max_write_same_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
@@ -1475,6 +1484,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev)
 	return 0;
 }
 
+static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return q->limits.max_write_zeroes_sectors;
+
+	return 0;
+}
+
 static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);

From 3b7c33b28a44d4621e365e090bf8bd332ab232a1 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Date: Wed, 30 Nov 2016 12:29:00 -0800
Subject: [PATCH 143/182] nvme.h: add Write Zeroes definitions

Add the command structure, optional command set support (ONCS) bit and
a new error code for the Write Zeroes command.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/nvme.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 18ce9f7cc88160..0df9466a7c38f4 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -237,6 +237,7 @@ enum {
 	NVME_CTRL_ONCS_COMPARE			= 1 << 0,
 	NVME_CTRL_ONCS_WRITE_UNCORRECTABLE	= 1 << 1,
 	NVME_CTRL_ONCS_DSM			= 1 << 2,
+	NVME_CTRL_ONCS_WRITE_ZEROES		= 1 << 3,
 	NVME_CTRL_VWC_PRESENT			= 1 << 0,
 };
 
@@ -543,6 +544,23 @@ struct nvme_dsm_range {
 	__le64			slba;
 };
 
+struct nvme_write_zeroes_cmd {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__u64			rsvd2;
+	__le64			metadata;
+	union nvme_data_ptr	dptr;
+	__le64			slba;
+	__le16			length;
+	__le16			control;
+	__le32			dsmgmt;
+	__le32			reftag;
+	__le16			apptag;
+	__le16			appmask;
+};
+
 /* Admin commands */
 
 enum nvme_admin_opcode {
@@ -839,6 +857,7 @@ struct nvme_command {
 		struct nvme_download_firmware dlfw;
 		struct nvme_format_cmd format;
 		struct nvme_dsm_cmd dsm;
+		struct nvme_write_zeroes_cmd write_zeroes;
 		struct nvme_abort_cmd abort;
 		struct nvme_get_log_page_command get_log_page;
 		struct nvmf_common_command fabrics;
@@ -918,6 +937,7 @@ enum {
 	NVME_SC_BAD_ATTRIBUTES		= 0x180,
 	NVME_SC_INVALID_PI		= 0x181,
 	NVME_SC_READ_ONLY		= 0x182,
+	NVME_SC_ONCS_NOT_SUPPORTED	= 0x183,
 
 	/*
 	 * I/O Command Set Specific - Fabrics commands:

From 6d31e3ba232ea22458b2f36b6d3f2f9f11bf3fa4 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Date: Wed, 30 Nov 2016 12:29:01 -0800
Subject: [PATCH 144/182] nvme: add support for the Write Zeroes command

Allow write zeroes operations (REQ_OP_WRITE_ZEROES) on the block
device, if the device supports optional command bit set for write
zeroes. Add support to setup write zeroes command. Set maximum possible
write zeroes sectors in one write zeroes command according to
nvme write zeroes command definition.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/core.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8c644838252ea8..90c24e88bb388b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -272,6 +272,21 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
+static inline void nvme_setup_write_zeroes(struct nvme_ns *ns,
+		struct request *req, struct nvme_command *cmnd)
+{
+	struct nvme_write_zeroes_cmd *write_zeroes = &cmnd->write_zeroes;
+
+	memset(cmnd, 0, sizeof(*cmnd));
+	write_zeroes->opcode = nvme_cmd_write_zeroes;
+	write_zeroes->nsid = cpu_to_le32(ns->ns_id);
+	write_zeroes->slba =
+		cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+	write_zeroes->length =
+		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+	write_zeroes->control = 0;
+}
+
 static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *cmnd)
 {
@@ -325,6 +340,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 		nvme_setup_flush(ns, cmd);
 	else if (req_op(req) == REQ_OP_DISCARD)
 		ret = nvme_setup_discard(ns, req, cmd);
+	else if (req_op(req) == REQ_OP_WRITE_ZEROES)
+		nvme_setup_write_zeroes(ns, req, cmd);
 	else
 		nvme_setup_rw(ns, req, cmd);
 
@@ -943,6 +960,10 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 
 	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
 		nvme_config_discard(ns);
+	if (ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES)
+		blk_queue_max_write_zeroes_sectors(ns->queue,
+				((u32)(USHRT_MAX + 1) * bs) >> 9);
+
 	blk_mq_unfreeze_queue(disk->queue);
 }
 

From d262920998c891dfd87cf73f823f0ff60e20cdad Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Date: Wed, 30 Nov 2016 12:29:02 -0800
Subject: [PATCH 145/182] nvmet: add support for the Write Zeroes command

Add support for handling write zeroes command on target.
Call into __blkdev_issue_zeroout, which the block layer expands into the
best suitable variant of zeroing the LBAs. Allow write zeroes operation
to deallocate the LBAs when calling __blkdev_issue_zeroout.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/target/admin-cmd.c |  3 ++-
 drivers/nvme/target/io-cmd.c    | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 7ab9c9381b9895..383ea10b97cc81 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -237,7 +237,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
 
 	id->nn = cpu_to_le32(ctrl->subsys->max_nsid);
-	id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM);
+	id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
+			NVME_CTRL_ONCS_WRITE_ZEROES);
 
 	/* XXX: don't report vwc if the underlying device is write through */
 	id->vwc = NVME_CTRL_VWC_PRESENT;
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index c4dc9ea8ade0b1..4195115c7e5493 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -170,6 +170,32 @@ static void nvmet_execute_dsm(struct nvmet_req *req)
 	}
 }
 
+static void nvmet_execute_write_zeroes(struct nvmet_req *req)
+{
+	struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes;
+	struct bio *bio = NULL;
+	u16 status = NVME_SC_SUCCESS;
+	sector_t sector;
+	sector_t nr_sector;
+
+	sector = le64_to_cpu(write_zeroes->slba) <<
+		(req->ns->blksize_shift - 9);
+	nr_sector = (((sector_t)le32_to_cpu(write_zeroes->length)) <<
+		(req->ns->blksize_shift - 9)) + 1;
+
+	if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
+				GFP_KERNEL, &bio, true))
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+
+	if (bio) {
+		bio->bi_private = req;
+		bio->bi_end_io = nvmet_bio_done;
+		submit_bio(bio);
+	} else {
+		nvmet_req_complete(req, status);
+	}
+}
+
 int nvmet_parse_io_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
@@ -207,6 +233,9 @@ int nvmet_parse_io_cmd(struct nvmet_req *req)
 		req->data_len = le32_to_cpu(cmd->dsm.nr + 1) *
 			sizeof(struct nvme_dsm_range);
 		return 0;
+	case nvme_cmd_write_zeroes:
+		req->execute = nvmet_execute_write_zeroes;
+		return 0;
 	default:
 		pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;

From 5b0e34e1949ed580ac330041dccb0334b926554e Mon Sep 17 00:00:00 2001
From: Pan Bian <bianpan2016@163.com>
Date: Thu, 1 Dec 2016 10:10:46 +0800
Subject: [PATCH 146/182] block: mtip32xx: set error code on failure

Fix bug https://bugzilla.kernel.org/show_bug.cgi?id=188531. In function
mtip_block_initialize(), variable rv takes the return value, and its
value should be negative on errors. rv is initialized as 0 and is not
reset when the call to ida_pre_get() fails. So 0 may be returned.
The return value 0 indicates that there is no error, which may be
inconsistent with the execution status. This patch fixes the bug by
explicitly assigning -ENOMEM to rv on the branch that ida_pre_get()
fails.

Signed-off-by: Pan Bian <bianpan2016@163.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index cfe0108a7d30a2..f96ab717534c4c 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3933,8 +3933,10 @@ static int mtip_block_initialize(struct driver_data *dd)
 
 	/* Generate the disk name, implemented same as in sd.c */
 	do {
-		if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL))
+		if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL)) {
+			rv = -ENOMEM;
 			goto ida_get_error;
+		}
 
 		spin_lock(&rssd_index_lock);
 		rv = ida_get_new(&rssd_index_ida, &index);

From af309226db916e2c6e08d3eba3fa5c34225200c4 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabinv@axis.com>
Date: Thu, 1 Dec 2016 09:18:28 +0100
Subject: [PATCH 147/182] block: protect iterate_bdevs() against concurrent
 close

If a block device is closed while iterate_bdevs() is handling it, the
following NULL pointer dereference occurs because bdev->b_disk is NULL
in bdev_get_queue(), which is called from blk_get_backing_dev_info() (in
turn called by the mapping_cap_writeback_dirty() call in
__filemap_fdatawrite_range()):

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000508
 IP: [<ffffffff81314790>] blk_get_backing_dev_info+0x10/0x20
 PGD 9e62067 PUD 9ee8067 PMD 0
 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
 Modules linked in:
 CPU: 1 PID: 2422 Comm: sync Not tainted 4.5.0-rc7+ #400
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
 task: ffff880009f4d700 ti: ffff880009f5c000 task.ti: ffff880009f5c000
 RIP: 0010:[<ffffffff81314790>]  [<ffffffff81314790>] blk_get_backing_dev_info+0x10/0x20
 RSP: 0018:ffff880009f5fe68  EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff88000ec17a38 RCX: ffffffff81a4e940
 RDX: 7fffffffffffffff RSI: 0000000000000000 RDI: ffff88000ec176c0
 RBP: ffff880009f5fe68 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000001 R11: 0000000000000000 R12: ffff88000ec17860
 R13: ffffffff811b25c0 R14: ffff88000ec178e0 R15: ffff88000ec17a38
 FS:  00007faee505d700(0000) GS:ffff88000fb00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
 CR2: 0000000000000508 CR3: 0000000009e8a000 CR4: 00000000000006e0
 Stack:
  ffff880009f5feb8 ffffffff8112e7f5 0000000000000000 7fffffffffffffff
  0000000000000000 0000000000000000 7fffffffffffffff 0000000000000001
  ffff88000ec178e0 ffff88000ec17860 ffff880009f5fec8 ffffffff8112e81f
 Call Trace:
  [<ffffffff8112e7f5>] __filemap_fdatawrite_range+0x85/0x90
  [<ffffffff8112e81f>] filemap_fdatawrite+0x1f/0x30
  [<ffffffff811b25d6>] fdatawrite_one_bdev+0x16/0x20
  [<ffffffff811bc402>] iterate_bdevs+0xf2/0x130
  [<ffffffff811b2763>] sys_sync+0x63/0x90
  [<ffffffff815d4272>] entry_SYSCALL_64_fastpath+0x12/0x76
 Code: 0f 1f 44 00 00 48 8b 87 f0 00 00 00 55 48 89 e5 <48> 8b 80 08 05 00 00 5d
 RIP  [<ffffffff81314790>] blk_get_backing_dev_info+0x10/0x20
  RSP <ffff880009f5fe68>
 CR2: 0000000000000508
 ---[ end trace 2487336ceb3de62d ]---

The crash is easily reproducible by running the following command, if an
msleep(100) is inserted before the call to func() in iterate_devs():

 while :; do head -c1 /dev/nullb0; done > /dev/null & while :; do sync; done

Fix it by holding the bd_mutex across the func() call and only calling
func() if the bdev is opened.

Cc: stable@vger.kernel.org
Fixes: 5c0d6b60a0ba ("vfs: Create function for iterating over block devices")
Reported-and-tested-by: Wei Fang <fangwei1@huawei.com>
Signed-off-by: Rabin Vincent <rabinv@axis.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7022ddc55b1244..95acbd2ebc5d72 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -2207,6 +2207,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
 	spin_lock(&blockdev_superblock->s_inode_list_lock);
 	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
 		struct address_space *mapping = inode->i_mapping;
+		struct block_device *bdev;
 
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
@@ -2227,8 +2228,12 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
 		 */
 		iput(old_inode);
 		old_inode = inode;
+		bdev = I_BDEV(inode);
 
-		func(I_BDEV(inode), arg);
+		mutex_lock(&bdev->bd_mutex);
+		if (bdev->bd_openers)
+			func(bdev, arg);
+		mutex_unlock(&bdev->bd_mutex);
 
 		spin_lock(&blockdev_superblock->s_inode_list_lock);
 	}

From e0c723000966ae285295caaa8cda914dfa177fa4 Mon Sep 17 00:00:00 2001
From: Ritesh Harjani <riteshh@codeaurora.org>
Date: Thu, 1 Dec 2016 08:36:16 -0700
Subject: [PATCH 148/182] block: factor out req_set_nomerge

Factor out common code for setting REQ_NOMERGE flag which is being used
out at certain places and make it a helper instead, req_set_nomerge().

Signed-off-by: Ritesh Harjani <riteshh@codeaurora.org>

Get rid of the inline.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index cf2848cb91d803..1002afdfee99f8 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -501,6 +501,13 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
+static void req_set_nomerge(struct request_queue *q, struct request *req)
+{
+	req->cmd_flags |= REQ_NOMERGE;
+	if (req == q->last_merge)
+		q->last_merge = NULL;
+}
+
 static inline int ll_new_hw_segment(struct request_queue *q,
 				    struct request *req,
 				    struct bio *bio)
@@ -521,9 +528,7 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 	return 1;
 
 no_merge:
-	req->cmd_flags |= REQ_NOMERGE;
-	if (req == q->last_merge)
-		q->last_merge = NULL;
+	req_set_nomerge(q, req);
 	return 0;
 }
 
@@ -537,9 +542,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
-		req->cmd_flags |= REQ_NOMERGE;
-		if (req == q->last_merge)
-			q->last_merge = NULL;
+		req_set_nomerge(q, req);
 		return 0;
 	}
 	if (!bio_flagged(req->biotail, BIO_SEG_VALID))
@@ -561,9 +564,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		return 0;
 	if (blk_rq_sectors(req) + bio_sectors(bio) >
 	    blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
-		req->cmd_flags |= REQ_NOMERGE;
-		if (req == q->last_merge)
-			q->last_merge = NULL;
+		req_set_nomerge(q, req);
 		return 0;
 	}
 	if (!bio_flagged(bio, BIO_SEG_VALID))

From 209200efa3db3d09380ee7c796efa73d900d5f3a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 2 Dec 2016 17:13:09 -0800
Subject: [PATCH 149/182] blk-stat: fix a typo

Signed-off-by: Shaohua Li <shli@fb.com>
Fixes: cf43e6be865a ("block: add scalable completion tracking of requests")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-stat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-stat.c b/block/blk-stat.c
index 688c958367eed5..4d0118568727ec 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -12,7 +12,7 @@
 static void blk_stat_flush_batch(struct blk_rq_stat *stat)
 {
 	const s32 nr_batch = READ_ONCE(stat->nr_batch);
-	const s32 nr_samples = READ_ONCE(stat->nr_batch);
+	const s32 nr_samples = READ_ONCE(stat->nr_samples);
 
 	if (!nr_batch)
 		return;

From ef77b515243b3499d62cf446eda6ca7e0a0b079c Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 2 Dec 2016 16:19:12 -0500
Subject: [PATCH 150/182] nbd: use loff_t for blocksize and nbd_set_size args

If we have large devices (say like the 40t drive I was trying to test with) we
will end up overflowing the int arguments to nbd_set_size and not get the right
size for our device.  Fix this by using loff_t everywhere so I don't have to
think about this again.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index dc722a7adf580a..92f5400edbd362 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -64,7 +64,7 @@ struct nbd_device {
 	int num_connections;
 	atomic_t recv_threads;
 	wait_queue_head_t recv_wq;
-	int blksize;
+	loff_t blksize;
 	loff_t bytesize;
 
 	struct task_struct *task_recv;
@@ -134,7 +134,7 @@ static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
 }
 
 static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
-			int blocksize, int nr_blocks)
+			loff_t blocksize, loff_t nr_blocks)
 {
 	int ret;
 
@@ -143,7 +143,7 @@ static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
 		return ret;
 
 	nbd->blksize = blocksize;
-	nbd->bytesize = (loff_t)blocksize * (loff_t)nr_blocks;
+	nbd->bytesize = blocksize * nr_blocks;
 
 	nbd_size_update(nbd, bdev);
 
@@ -930,7 +930,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
 	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
 	debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
 	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
-	debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
+	debugfs_create_u64("blocksize", 0444, dir, &nbd->blksize);
 	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
 
 	return 0;

From e88f72cb9f54f6d244e55f629fe5e2f34ca6f9ed Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Sat, 3 Dec 2016 12:08:03 -0700
Subject: [PATCH 151/182] nbd: fix 64-bit division

We have this:

ERROR: "__aeabi_ldivmod" [drivers/block/nbd.ko] undefined!
ERROR: "__divdi3" [drivers/block/nbd.ko] undefined!
nbd.c:(.text+0x247c72): undefined reference to `__divdi3'

due to a recent commit, that did 64-bit division. Use the proper
divider function so that 32-bit compiles don't break.

Fixes: ef77b515243b ("nbd: use loff_t for blocksize and nbd_set_size args")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 92f5400edbd362..bc78cbb2d18aff 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -729,7 +729,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 
 	case NBD_SET_SIZE:
 		return nbd_size_set(nbd, bdev, nbd->blksize,
-				    arg / nbd->blksize);
+					div_s64(arg, nbd->blksize));
 
 	case NBD_SET_SIZE_BLOCKS:
 		return nbd_size_set(nbd, bdev, nbd->blksize, arg);

From 58886785db318588f95c8036abb2a47016c1f14c Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Sun, 4 Dec 2016 14:56:39 +0100
Subject: [PATCH 152/182] block: fix unintended fallthrough in
 generic_make_request_checks()

Since commit e73c23ff736e ("block: add async variant of
blkdev_issue_zeroout") messages like the following show up:

  EXT4-fs (dm-1): Delayed block allocation failed for inode 2368848 at
                  logical offset 0 with max blocks 1 with error 95
  EXT4-fs (dm-1): This should not happen!! Data will be lost

Due to the following fallthrough introduced with
commit 2d253440b5af ("block: Define zoned block device operations"),
generic_make_request_checks() would accept a REQ_OP_WRITE_SAME bio only
if the block device supports "write same" *and* is a zoned one:

  switch (bio_op(bio)) {
  [...]
  case REQ_OP_WRITE_SAME:
        if (!bdev_write_same(bio->bi_bdev))
                goto not_supported;
  case REQ_OP_ZONE_REPORT:
  case REQ_OP_ZONE_RESET:
                if (!bdev_is_zoned(bio->bi_bdev))
                        goto not_supported;
                break;
  [...]
  }

Thus, although the bio setup as done by __blkdev_issue_write_same() from
commit e73c23ff736e ("block: add async variant of blkdev_issue_zeroout")
would succeed, its actual submission would not, resulting in the
EOPNOTSUPP == 95.

Fix this by removing the fallthrough which, due to the lack of an explicit
comment, seems to be unintended anyway.

Fixes: e73c23ff736e ("block: add async variant of blkdev_issue_zeroout")
Fixes: 2d253440b5af ("block: Define zoned block device operations")
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3f2eb8d8018979..4b7ec5958055a3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1945,6 +1945,7 @@ generic_make_request_checks(struct bio *bio)
 	case REQ_OP_WRITE_SAME:
 		if (!bdev_write_same(bio->bi_bdev))
 			goto not_supported;
+		break;
 	case REQ_OP_ZONE_REPORT:
 	case REQ_OP_ZONE_RESET:
 		if (!bdev_is_zoned(bio->bi_bdev))

From 6e85eaf3078bcc8552ca32a0938dbf7d2b495af0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 2 Dec 2016 20:00:14 -0700
Subject: [PATCH 153/182] blk-mq: blk_account_io_start() takes a bool

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index bac12caece0664..90db5b490df9a7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1237,7 +1237,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
 	init_request_from_bio(rq, bio);
 
-	blk_account_io_start(rq, 1);
+	blk_account_io_start(rq, true);
 }
 
 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)

From 70d4281c4b7049794e6d66baac7a18ad943cf726 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 18 Oct 2016 12:59:47 -0700
Subject: [PATCH 154/182] nvmet-rdma: Fix REJ status code

nvmet_sq_init() returns a value <= 0. nvmet_rdma_cm_reject() expects
a second argument that is a NVME_RDMA_CM_* constant. Hence this patch.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Sagi Grimberg <sagi@grimbeg.me>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/target/rdma.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index f8d23999e0f2c2..35b97a898ecd71 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1044,8 +1044,10 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
 	}
 
 	ret = nvmet_sq_init(&queue->nvme_sq);
-	if (ret)
+	if (ret) {
+		ret = NVME_RDMA_CM_NO_RSC;
 		goto out_free_queue;
+	}
 
 	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
 	if (ret)
@@ -1114,6 +1116,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
 out_free_queue:
 	kfree(queue);
 out_reject:
+	pr_debug("rejecting connect request with status code %d\n", ret);
 	nvmet_rdma_cm_reject(cm_id, ret);
 	return NULL;
 }

From 76c08bf46a2891c80e854813c636bdce1f7d1ca4 Mon Sep 17 00:00:00 2001
From: Samuel Jones <sjones@kalray.eu>
Date: Tue, 25 Oct 2016 09:22:34 +0200
Subject: [PATCH 155/182] nvme-rdma: force queue size to respect controller
 capability

Queue size needs to respect the Maximum Queue Entries Supported advertised by
the controller in its Capability register.

Signed-off-by: Samuel Jones <sjones@kalray.eu>
Reviewed-by: Christoph Hellwig <hch@lst.de>
[sagig: fixed queue_size adjustment according to
Daniel Verkamp <daniel.verkamp@intel.com> comment]
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/rdma.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ff1b340606b277..d1df4a5eb1e6e2 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1904,6 +1904,14 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 		opts->queue_size = ctrl->ctrl.maxcmd;
 	}
 
+	if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
+		/* warn if sqsize is lower than queue_size */
+		dev_warn(ctrl->ctrl.device,
+			"queue_size %zu > ctrl sqsize %u, clamping down\n",
+			opts->queue_size, ctrl->ctrl.sqsize + 1);
+		opts->queue_size = ctrl->ctrl.sqsize + 1;
+	}
+
 	if (opts->nr_io_queues) {
 		ret = nvme_rdma_create_io_queues(ctrl);
 		if (ret)

From 8eadfcb1b63d7d9cd21dc7a2fd1d92efc5730ddd Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 18 Oct 2016 13:10:46 -0700
Subject: [PATCH 156/182] nvme-fabrics: Fix memory leaks in
 nvmf_parse_options()

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/fabrics.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 68fb26b3bfb9e7..12f19c4665cf45 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -666,10 +666,12 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			if (nqnlen >= NVMF_NQN_SIZE) {
 				pr_err("%s needs to be < %d bytes\n",
 					p, NVMF_NQN_SIZE);
+				kfree(p);
 				ret = -EINVAL;
 				goto out;
 			}
 			opts->host = nvmf_host_add(p);
+			kfree(p);
 			if (!opts->host) {
 				ret = -ENOMEM;
 				goto out;

From f3116d8f1e110a5eeaac94f5d1c490e9e466a223 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 18 Oct 2016 13:11:03 -0700
Subject: [PATCH 157/182] nvme-fabrics: Fix a memory leak in an
 nvmf_create_ctrl() error path

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/fabrics.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 12f19c4665cf45..1a68b46fff5a1a 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -827,8 +827,7 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
 out_unlock:
 	mutex_unlock(&nvmf_transports_mutex);
 out_free_opts:
-	nvmf_host_put(opts->host);
-	kfree(opts);
+	nvmf_free_options(opts);
 	return ERR_PTR(ret);
 }
 

From e4fcf07cca6a3b6c4be00df16f08be894325eaa3 Mon Sep 17 00:00:00 2001
From: Solganik Alexander <sashas@lightbitslabs.com>
Date: Sun, 30 Oct 2016 10:35:15 +0200
Subject: [PATCH 158/182] nvmet: Fix possible infinite loop triggered on hot
 namespace removal

When removing a namespace we delete it from the subsystem namespaces
list with list_del_init which allows us to know if it is enabled or
not.

The problem is that list_del_init initialize the list next and does
not respect the RCU list-traversal we do on the IO path for locating
a namespace. Instead we need to use list_del_rcu which is allowed to
run concurrently with the _rcu list-traversal primitives (keeps list
next intact) and guarantees concurrent nvmet_find_naespace forward
progress.

By changing that, we cannot rely on ns->dev_link for knowing if the
namspace is enabled, so add enabled indicator entry to nvmet_ns for
that.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Solganik Alexander <sashas@lightbitslabs.com>
Cc: <stable@vger.kernel.org> # v4.8+
---
 drivers/nvme/target/configfs.c |  6 +++---
 drivers/nvme/target/core.c     | 14 ++++++++------
 drivers/nvme/target/nvmet.h    |  6 +-----
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index af5e2dc4a3d503..011f88e5663e72 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -271,7 +271,7 @@ static ssize_t nvmet_ns_device_path_store(struct config_item *item,
 
 	mutex_lock(&subsys->lock);
 	ret = -EBUSY;
-	if (nvmet_ns_enabled(ns))
+	if (ns->enabled)
 		goto out_unlock;
 
 	kfree(ns->device_path);
@@ -307,7 +307,7 @@ static ssize_t nvmet_ns_device_nguid_store(struct config_item *item,
 	int ret = 0;
 
 	mutex_lock(&subsys->lock);
-	if (nvmet_ns_enabled(ns)) {
+	if (ns->enabled) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -339,7 +339,7 @@ CONFIGFS_ATTR(nvmet_ns_, device_nguid);
 
 static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
 {
-	return sprintf(page, "%d\n", nvmet_ns_enabled(to_nvmet_ns(item)));
+	return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
 }
 
 static ssize_t nvmet_ns_enable_store(struct config_item *item,
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index c232552be2d807..8b627e2a55c6d6 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -264,7 +264,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 	int ret = 0;
 
 	mutex_lock(&subsys->lock);
-	if (!list_empty(&ns->dev_link))
+	if (ns->enabled)
 		goto out_unlock;
 
 	ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
@@ -309,6 +309,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0);
 
+	ns->enabled = true;
 	ret = 0;
 out_unlock:
 	mutex_unlock(&subsys->lock);
@@ -325,11 +326,11 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	struct nvmet_ctrl *ctrl;
 
 	mutex_lock(&subsys->lock);
-	if (list_empty(&ns->dev_link)) {
-		mutex_unlock(&subsys->lock);
-		return;
-	}
-	list_del_init(&ns->dev_link);
+	if (!ns->enabled)
+		goto out_unlock;
+
+	ns->enabled = false;
+	list_del_rcu(&ns->dev_link);
 	mutex_unlock(&subsys->lock);
 
 	/*
@@ -351,6 +352,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 
 	if (ns->bdev)
 		blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ);
+out_unlock:
 	mutex_unlock(&subsys->lock);
 }
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index f9c76441e8c941..23d5eb1c944f64 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -47,6 +47,7 @@ struct nvmet_ns {
 	loff_t			size;
 	u8			nguid[16];
 
+	bool			enabled;
 	struct nvmet_subsys	*subsys;
 	const char		*device_path;
 
@@ -61,11 +62,6 @@ static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
 	return container_of(to_config_group(item), struct nvmet_ns, group);
 }
 
-static inline bool nvmet_ns_enabled(struct nvmet_ns *ns)
-{
-	return !list_empty_careful(&ns->dev_link);
-}
-
 struct nvmet_cq {
 	u16			qid;
 	u16			size;

From 6bcb5268d46b33649354febc1cb294767220d8ca Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 18 Oct 2016 13:10:03 -0700
Subject: [PATCH 159/182] nvme/scsi: Remove set-but-not-used variables

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/scsi.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c
index c2a0a1c7d05d15..0671fe0d5f2fa8 100644
--- a/drivers/nvme/host/scsi.c
+++ b/drivers/nvme/host/scsi.c
@@ -1280,10 +1280,6 @@ static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10,
 static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list,
 					u16 idx, u16 bd_len, u8 llbaa)
 {
-	u16 bd_num;
-
-	bd_num = bd_len / ((llbaa == 0) ?
-			SHORT_DESC_BLOCK : LONG_DESC_BLOCK);
 	/* Store block descriptor info if a FORMAT UNIT comes later */
 	/* TODO Saving 1st BD info; what to do if multiple BD received? */
 	if (llbaa == 0) {
@@ -1528,7 +1524,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int nvme_sc;
 	struct nvme_id_ns *id_ns;
 	u8 i;
-	u8 flbas, nlbaf;
+	u8 nlbaf;
 	u8 selected_lbaf = 0xFF;
 	u32 cdw10 = 0;
 	struct nvme_command c;
@@ -1539,7 +1535,6 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	if (res)
 		return res;
 
-	flbas = (id_ns->flbas) & 0x0F;
 	nlbaf = id_ns->nlbaf;
 
 	for (i = 0; i < nlbaf; i++) {
@@ -2168,12 +2163,10 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
 static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	u8 immed, pcmod, no_flush, start;
+	u8 immed, no_flush;
 
 	immed = cmd[1] & 0x01;
-	pcmod = cmd[3] & 0x0f;
 	no_flush = cmd[4] & 0x04;
-	start = cmd[4] & 0x01;
 
 	if (immed != 0) {
 		return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,

From 6eb728305414bb9ef3c971a7a2932cc037b7df54 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Tue, 18 Oct 2016 13:10:24 -0700
Subject: [PATCH 160/182] nvme-fabrics: Adjust source code indentation

Adjust indentation such that arguments are aligned.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/fabrics.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 1a68b46fff5a1a..916d1360805964 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -576,7 +576,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			nqnlen = strlen(opts->subsysnqn);
 			if (nqnlen >= NVMF_NQN_SIZE) {
 				pr_err("%s needs to be < %d bytes\n",
-				opts->subsysnqn, NVMF_NQN_SIZE);
+					opts->subsysnqn, NVMF_NQN_SIZE);
 				ret = -EINVAL;
 				goto out;
 			}

From d4a5340edd6405972795a1d006e8e2b6f7664ca1 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 1 Nov 2016 10:41:00 +0200
Subject: [PATCH 161/182] nvme-rdma: remove redundant define

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/rdma.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d1df4a5eb1e6e2..daf9ad340e50d8 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -28,7 +28,6 @@
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
-#include <rdma/ib_cm.h>
 #include <linux/nvme-rdma.h>
 
 #include "nvme.h"

From 675796be47d683d9b5cd3d7587e09a1cc4fe9e54 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Wed, 23 Nov 2016 11:38:47 +0200
Subject: [PATCH 162/182] nvmet-rdma: align to generic ib_event logging helper

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/rdma.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 35b97a898ecd71..6c1c3680c5e111 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1130,7 +1130,8 @@ static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
 		rdma_notify(queue->cm_id, event->event);
 		break;
 	default:
-		pr_err("received unrecognized IB QP event %d\n", event->event);
+		pr_err("received IB QP event: %s (%d)\n",
+		       ib_event_msg(event->event), event->event);
 		break;
 	}
 }

From 27a4beef0b0508621c34aa61725eec2aa39904e0 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Wed, 23 Nov 2016 11:38:48 +0200
Subject: [PATCH 163/182] nvme-rdma: align to generic ib_event logging helper

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index daf9ad340e50d8..f560cd724274da 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -240,7 +240,9 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
 
 static void nvme_rdma_qp_event(struct ib_event *event, void *context)
 {
-	pr_debug("QP event %d\n", event->event);
+	pr_debug("QP event %s (%d)\n",
+		 ib_event_msg(event->event), event->event);
+
 }
 
 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)

From a317178e36b52f5f24ad226a77403eeea5ac05c4 Mon Sep 17 00:00:00 2001
From: James Smart <james.smart@broadcom.com>
Date: Fri, 21 Oct 2016 23:51:54 +0300
Subject: [PATCH 164/182] parser: add u64 number parser

Will be used by the nvme-fabrics FC transport in parsing options

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 include/linux/parser.h |  1 +
 lib/parser.c           | 47 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/include/linux/parser.h b/include/linux/parser.h
index 39d5b7955b23f9..884c1e6eb3fe10 100644
--- a/include/linux/parser.h
+++ b/include/linux/parser.h
@@ -27,6 +27,7 @@ typedef struct {
 
 int match_token(char *, const match_table_t table, substring_t args[]);
 int match_int(substring_t *, int *result);
+int match_u64(substring_t *, u64 *result);
 int match_octal(substring_t *, int *result);
 int match_hex(substring_t *, int *result);
 bool match_wildcard(const char *pattern, const char *str);
diff --git a/lib/parser.c b/lib/parser.c
index b6d11631231b67..3278958b472a99 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -151,6 +151,36 @@ static int match_number(substring_t *s, int *result, int base)
 	return ret;
 }
 
+/**
+ * match_u64int: scan a number in the given base from a substring_t
+ * @s: substring to be scanned
+ * @result: resulting u64 on success
+ * @base: base to use when converting string
+ *
+ * Description: Given a &substring_t and a base, attempts to parse the substring
+ * as a number in that base. On success, sets @result to the integer represented
+ * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+static int match_u64int(substring_t *s, u64 *result, int base)
+{
+	char *buf;
+	int ret;
+	u64 val;
+	size_t len = s->to - s->from;
+
+	buf = kmalloc(len + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	memcpy(buf, s->from, len);
+	buf[len] = '\0';
+
+	ret = kstrtoull(buf, base, &val);
+	if (!ret)
+		*result = val;
+	kfree(buf);
+	return ret;
+}
+
 /**
  * match_int: - scan a decimal representation of an integer from a substring_t
  * @s: substring_t to be scanned
@@ -166,6 +196,23 @@ int match_int(substring_t *s, int *result)
 }
 EXPORT_SYMBOL(match_int);
 
+/**
+ * match_u64: - scan a decimal representation of a u64 from
+ *                  a substring_t
+ * @s: substring_t to be scanned
+ * @result: resulting unsigned long long on success
+ *
+ * Description: Attempts to parse the &substring_t @s as a long decimal
+ * integer. On success, sets @result to the integer represented by the
+ * string and returns 0.
+ * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+int match_u64(substring_t *s, u64 *result)
+{
+	return match_u64int(s, result, 0);
+}
+EXPORT_SYMBOL(match_u64);
+
 /**
  * match_octal: - scan an octal representation of an integer from a substring_t
  * @s: substring_t to be scanned

From 721b3917c4ae222085c6de70c24b73b0e7950b35 Mon Sep 17 00:00:00 2001
From: James Smart <james.smart@broadcom.com>
Date: Fri, 21 Oct 2016 23:33:34 +0300
Subject: [PATCH 165/182] nvme-fabrics: set sqe.command_id in core not
 transports

Currently, core.c sets command_id only on rd/wr commands, leaving it to
the transport to set it again to ensure the request had a command id.

Move location of set in core so applies to all commands.
Remove transport sets.

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c   | 3 ++-
 drivers/nvme/host/pci.c    | 1 -
 drivers/nvme/host/rdma.c   | 1 -
 drivers/nvme/target/loop.c | 1 -
 4 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 90c24e88bb388b..1b48514fbe9913 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -303,7 +303,6 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
 
 	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
-	cmnd->rw.command_id = req->tag;
 	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
 	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
 	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -345,6 +344,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 	else
 		nvme_setup_rw(ns, req, cmd);
 
+	cmd->common.command_id = req->tag;
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d58f8e4e2c06f2..82b9b3f1f21d57 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -611,7 +611,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		goto out;
 
-	cmnd.common.command_id = req->tag;
 	blk_mq_start_request(req);
 
 	spin_lock_irq(&nvmeq->q_lock);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f560cd724274da..b037d0cb2a7e05 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1399,7 +1399,6 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		return ret;
 
-	c->common.command_id = rq->tag;
 	blk_mq_start_request(rq);
 
 	map_len = nvme_map_len(rq);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index be56d0567155a2..57ded6b3ed8a98 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -194,7 +194,6 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		BUG_ON(iod->req.sg_cnt > req->nr_phys_segments);
 	}
 
-	iod->cmd.common.command_id = req->tag;
 	blk_mq_start_request(req);
 
 	schedule_work(&iod->work);

From 885aa4015f70efffff86d81a5e236562dc8dcffe Mon Sep 17 00:00:00 2001
From: James Smart <james.smart@broadcom.com>
Date: Fri, 21 Oct 2016 23:32:51 +0300
Subject: [PATCH 166/182] nvme-fabrics: patch target code in prep for FC
 transport support

- Add FC transport type decoding
- Add FC address family decoding

Signed-off-by: James Smart <james.smart@broadcom.com>
Acked-by: Johannes Thumshirn <jth@kernel.org>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/configfs.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 011f88e5663e72..d0f60c36d5767e 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -37,6 +37,8 @@ static ssize_t nvmet_addr_adrfam_show(struct config_item *item,
 		return sprintf(page, "ipv6\n");
 	case NVMF_ADDR_FAMILY_IB:
 		return sprintf(page, "ib\n");
+	case NVMF_ADDR_FAMILY_FC:
+		return sprintf(page, "fc\n");
 	default:
 		return sprintf(page, "\n");
 	}
@@ -59,6 +61,8 @@ static ssize_t nvmet_addr_adrfam_store(struct config_item *item,
 		port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP6;
 	} else if (sysfs_streq(page, "ib")) {
 		port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IB;
+	} else if (sysfs_streq(page, "fc")) {
+		port->disc_addr.adrfam = NVMF_ADDR_FAMILY_FC;
 	} else {
 		pr_err("Invalid value '%s' for adrfam\n", page);
 		return -EINVAL;
@@ -209,6 +213,8 @@ static ssize_t nvmet_addr_trtype_show(struct config_item *item,
 		return sprintf(page, "rdma\n");
 	case NVMF_TRTYPE_LOOP:
 		return sprintf(page, "loop\n");
+	case NVMF_TRTYPE_FC:
+		return sprintf(page, "fc\n");
 	default:
 		return sprintf(page, "\n");
 	}
@@ -229,6 +235,12 @@ static void nvmet_port_init_tsas_loop(struct nvmet_port *port)
 	memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE);
 }
 
+static void nvmet_port_init_tsas_fc(struct nvmet_port *port)
+{
+	port->disc_addr.trtype = NVMF_TRTYPE_FC;
+	memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE);
+}
+
 static ssize_t nvmet_addr_trtype_store(struct config_item *item,
 		const char *page, size_t count)
 {
@@ -244,6 +256,8 @@ static ssize_t nvmet_addr_trtype_store(struct config_item *item,
 		nvmet_port_init_tsas_rdma(port);
 	} else if (sysfs_streq(page, "loop")) {
 		nvmet_port_init_tsas_loop(port);
+	} else if (sysfs_streq(page, "fc")) {
+		nvmet_port_init_tsas_fc(port);
 	} else {
 		pr_err("Invalid value '%s' for trtype\n", page);
 		return -EINVAL;

From 6ea76f33e9ab99c7888547e1acba2baf8e4b5b17 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 2 Dec 2016 00:28:38 -0800
Subject: [PATCH 167/182] Add type 0x28 NVME type code to scsi fc headers

Signed-off-by: James Smart <james.smart@broadcom.com>
Acked-by: Johannes Thumshirn <jth@kernel.org>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/uapi/scsi/fc/fc_fs.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/scsi/fc/fc_fs.h b/include/uapi/scsi/fc/fc_fs.h
index 50f28b143451a5..dcf314dc2a27ed 100644
--- a/include/uapi/scsi/fc/fc_fs.h
+++ b/include/uapi/scsi/fc/fc_fs.h
@@ -190,6 +190,7 @@ enum fc_fh_type {
 	FC_TYPE_FCP =	0x08,	/* SCSI FCP */
 	FC_TYPE_CT =	0x20,	/* Fibre Channel Services (FC-CT) */
 	FC_TYPE_ILS =	0x22,	/* internal link service */
+	FC_TYPE_NVME =	0x28,	/* FC-NVME */
 };
 
 /*
@@ -203,6 +204,7 @@ enum fc_fh_type {
 	[FC_TYPE_FCP] =		"FCP",			\
 	[FC_TYPE_CT] =		"CT",			\
 	[FC_TYPE_ILS] =		"ILS",			\
+	[FC_TYPE_NVME] =	"NVME",			\
 }
 
 /*

From cba3bdfd2e89edd706e2c40dfad914aca663b6ac Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 2 Dec 2016 00:28:39 -0800
Subject: [PATCH 168/182] nvme-fabrics: Add FC transport error codes to nvme.h

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 0df9466a7c38f4..5ac1f57226f426 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -963,6 +963,19 @@ enum {
 	NVME_SC_ACCESS_DENIED		= 0x286,
 
 	NVME_SC_DNR			= 0x4000,
+
+
+	/*
+	 * FC Transport-specific error status values for NVME commands
+	 *
+	 * Transport-specific status code values must be in the range 0xB0..0xBF
+	 */
+
+	/* Generic FC failure - catchall */
+	NVME_SC_FC_TRANSPORT_ERROR	= 0x00B0,
+
+	/* I/O failure due to FC ABTS'd */
+	NVME_SC_FC_TRANSPORT_ABORTED	= 0x00B1,
 };
 
 struct nvme_completion {

From b1ad1475b447a7668ac8bfad77277c4405941883 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 2 Dec 2016 00:28:40 -0800
Subject: [PATCH 169/182] nvme-fabrics: Add FC transport FC-NVME definitions

- Formats for Cmd, Data, Rsp IUs
- Formats FC-4 LS definitions
- Add to MAINTAINERS file

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 MAINTAINERS             |   6 +
 include/linux/nvme-fc.h | 268 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 274 insertions(+)
 create mode 100644 include/linux/nvme-fc.h

diff --git a/MAINTAINERS b/MAINTAINERS
index bbc2b39e67e957..e630560df72077 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8659,6 +8659,12 @@ L:	linux-nvme@lists.infradead.org
 S:	Supported
 F:	drivers/nvme/target/
 
+NVM EXPRESS FC TRANSPORT DRIVERS
+M:	James Smart <james.smart@broadcom.com>
+L:	linux-nvme@lists.infradead.org
+S:	Supported
+F:	include/linux/nvme-fc.h
+
 NVMEM FRAMEWORK
 M:	Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
 M:	Maxime Ripard <maxime.ripard@free-electrons.com>
diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
new file mode 100644
index 00000000000000..4b45226bd604c5
--- /dev/null
+++ b/include/linux/nvme-fc.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2016 Avago Technologies.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful.
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
+ * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
+ * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
+ * See the GNU General Public License for more details, a copy of which
+ * can be found in the file COPYING included with this package
+ *
+ */
+
+/*
+ * This file contains definitions relative to FC-NVME r1.11 and a few
+ * newer items
+ */
+
+#ifndef _NVME_FC_H
+#define _NVME_FC_H 1
+
+
+#define NVME_CMD_SCSI_ID		0xFD
+#define NVME_CMD_FC_ID			FC_TYPE_NVME
+
+/* FC-NVME Cmd IU Flags */
+#define FCNVME_CMD_FLAGS_DIRMASK	0x03
+#define FCNVME_CMD_FLAGS_WRITE		0x01
+#define FCNVME_CMD_FLAGS_READ		0x02
+
+struct nvme_fc_cmd_iu {
+	__u8			scsi_id;
+	__u8			fc_id;
+	__be16			iu_len;
+	__u8			rsvd4[3];
+	__u8			flags;
+	__be64			connection_id;
+	__be32			csn;
+	__be32			data_len;
+	struct nvme_command	sqe;
+	__be32			rsvd88[2];
+};
+
+#define NVME_FC_SIZEOF_ZEROS_RSP	12
+
+struct nvme_fc_ersp_iu {
+	__u8			rsvd0[2];
+	__be16			iu_len;
+	__be32			rsn;
+	__be32			xfrd_len;
+	__be32			rsvd12;
+	struct nvme_completion	cqe;
+	/* for now - no additional payload */
+};
+
+
+/* FC-NVME r1.03/16-119v0 NVME Link Services */
+enum {
+	FCNVME_LS_RSVD			= 0,
+	FCNVME_LS_RJT			= 1,
+	FCNVME_LS_ACC			= 2,
+	FCNVME_LS_CREATE_ASSOCIATION	= 3,
+	FCNVME_LS_CREATE_CONNECTION	= 4,
+	FCNVME_LS_DISCONNECT		= 5,
+};
+
+/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */
+enum {
+	FCNVME_LSDESC_RSVD		= 0x0,
+	FCNVME_LSDESC_RQST		= 0x1,
+	FCNVME_LSDESC_RJT		= 0x2,
+	FCNVME_LSDESC_CREATE_ASSOC_CMD	= 0x3,
+	FCNVME_LSDESC_CREATE_CONN_CMD	= 0x4,
+	FCNVME_LSDESC_DISCONN_CMD	= 0x5,
+	FCNVME_LSDESC_CONN_ID		= 0x6,
+	FCNVME_LSDESC_ASSOC_ID		= 0x7,
+};
+
+
+/* ********** start of Link Service Descriptors ********** */
+
+
+/*
+ * fills in length of a descriptor. Struture minus descriptor header
+ */
+static inline __be32 fcnvme_lsdesc_len(size_t sz)
+{
+	return cpu_to_be32(sz - (2 * sizeof(u32)));
+}
+
+
+struct fcnvme_ls_rqst_w0 {
+	u8	ls_cmd;			/* FCNVME_LS_xxx */
+	u8	zeros[3];
+};
+
+/* FCNVME_LSDESC_RQST */
+struct fcnvme_lsdesc_rqst {
+	__be32	desc_tag;		/* FCNVME_LSDESC_xxx */
+	__be32	desc_len;
+	struct fcnvme_ls_rqst_w0	w0;
+	__be32	rsvd12;
+};
+
+
+
+
+/* FCNVME_LSDESC_RJT */
+struct fcnvme_lsdesc_rjt {
+	__be32	desc_tag;		/* FCNVME_LSDESC_xxx */
+	__be32	desc_len;
+	u8	rsvd8;
+
+	/*
+	 * Reject reason and explanaction codes are generic
+	 * to ELs's from LS-3.
+	 */
+	u8	reason_code;
+	u8	reason_explanation;
+
+	u8	vendor;
+	__be32	rsvd12;
+};
+
+
+#define FCNVME_ASSOC_HOSTID_LEN		64
+#define FCNVME_ASSOC_HOSTNQN_LEN	256
+#define FCNVME_ASSOC_SUBNQN_LEN		256
+
+/* FCNVME_LSDESC_CREATE_ASSOC_CMD */
+struct fcnvme_lsdesc_cr_assoc_cmd {
+	__be32	desc_tag;		/* FCNVME_LSDESC_xxx */
+	__be32	desc_len;
+	__be16	ersp_ratio;
+	__be16	rsvd10;
+	__be32	rsvd12[9];
+	__be16	cntlid;
+	__be16	sqsize;
+	__be32	rsvd52;
+	u8	hostid[FCNVME_ASSOC_HOSTID_LEN];
+	u8	hostnqn[FCNVME_ASSOC_HOSTNQN_LEN];
+	u8	subnqn[FCNVME_ASSOC_SUBNQN_LEN];
+	u8	rsvd632[384];
+};
+
+/* FCNVME_LSDESC_CREATE_CONN_CMD */
+struct fcnvme_lsdesc_cr_conn_cmd {
+	__be32	desc_tag;		/* FCNVME_LSDESC_xxx */
+	__be32	desc_len;
+	__be16	ersp_ratio;
+	__be16	rsvd10;
+	__be32	rsvd12[9];
+	__be16	qid;
+	__be16	sqsize;
+	__be32  rsvd52;
+};
+
+/* Disconnect Scope Values */
+enum {
+	FCNVME_DISCONN_ASSOCIATION	= 0,
+	FCNVME_DISCONN_CONNECTION	= 1,
+};
+
+/* FCNVME_LSDESC_DISCONN_CMD */
+struct fcnvme_lsdesc_disconn_cmd {
+	__be32	desc_tag;		/* FCNVME_LSDESC_xxx */
+	__be32	desc_len;
+	u8	rsvd8[3];
+	/* note: scope is really a 1 bit field */
+	u8	scope;			/* FCNVME_DISCONN_xxx */
+	__be32	rsvd12;
+	__be64	id;
+};
+
+/* FCNVME_LSDESC_CONN_ID */
+struct fcnvme_lsdesc_conn_id {
+	__be32	desc_tag;		/* FCNVME_LSDESC_xxx */
+	__be32	desc_len;
+	__be64	connection_id;
+};
+
+/* FCNVME_LSDESC_ASSOC_ID */
+struct fcnvme_lsdesc_assoc_id {
+	__be32	desc_tag;		/* FCNVME_LSDESC_xxx */
+	__be32	desc_len;
+	__be64	association_id;
+};
+
+/* r_ctl values */
+enum {
+	FCNVME_RS_RCTL_DATA		= 1,
+	FCNVME_RS_RCTL_XFER_RDY		= 5,
+	FCNVME_RS_RCTL_RSP		= 8,
+};
+
+
+/* ********** start of Link Services ********** */
+
+
+/* FCNVME_LS_RJT */
+struct fcnvme_ls_rjt {
+	struct fcnvme_ls_rqst_w0		w0;
+	__be32					desc_list_len;
+	struct fcnvme_lsdesc_rqst		rqst;
+	struct fcnvme_lsdesc_rjt		rjt;
+};
+
+/* FCNVME_LS_ACC */
+struct fcnvme_ls_acc_hdr {
+	struct fcnvme_ls_rqst_w0		w0;
+	__be32					desc_list_len;
+	struct fcnvme_lsdesc_rqst		rqst;
+	/* Followed by cmd-specific ACC descriptors, see next definitions */
+};
+
+/* FCNVME_LS_CREATE_ASSOCIATION */
+struct fcnvme_ls_cr_assoc_rqst {
+	struct fcnvme_ls_rqst_w0		w0;
+	__be32					desc_list_len;
+	struct fcnvme_lsdesc_cr_assoc_cmd	assoc_cmd;
+};
+
+struct fcnvme_ls_cr_assoc_acc {
+	struct fcnvme_ls_acc_hdr		hdr;
+	struct fcnvme_lsdesc_assoc_id		associd;
+	struct fcnvme_lsdesc_conn_id		connectid;
+};
+
+
+/* FCNVME_LS_CREATE_CONNECTION */
+struct fcnvme_ls_cr_conn_rqst {
+	struct fcnvme_ls_rqst_w0		w0;
+	__be32					desc_list_len;
+	struct fcnvme_lsdesc_assoc_id		associd;
+	struct fcnvme_lsdesc_cr_conn_cmd	connect_cmd;
+};
+
+struct fcnvme_ls_cr_conn_acc {
+	struct fcnvme_ls_acc_hdr		hdr;
+	struct fcnvme_lsdesc_conn_id		connectid;
+};
+
+/* FCNVME_LS_DISCONNECT */
+struct fcnvme_ls_disconnect_rqst {
+	struct fcnvme_ls_rqst_w0		w0;
+	__be32					desc_list_len;
+	struct fcnvme_lsdesc_assoc_id		associd;
+	struct fcnvme_lsdesc_disconn_cmd	discon_cmd;
+};
+
+struct fcnvme_ls_disconnect_acc {
+	struct fcnvme_ls_acc_hdr		hdr;
+};
+
+
+/*
+ * Yet to be defined in FC-NVME:
+ */
+#define NVME_FC_CONNECT_TIMEOUT_SEC	2		/* 2 seconds */
+#define NVME_FC_LS_TIMEOUT_SEC		2		/* 2 seconds */
+#define NVME_FC_TGTOP_TIMEOUT_SEC	2		/* 2 seconds */
+
+
+#endif /* _NVME_FC_H */

From d6d20012e116904065d192be6146040c99c03c3c Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 2 Dec 2016 00:28:41 -0800
Subject: [PATCH 170/182] nvme-fabrics: Add FC transport LLDD api definitions

Host:
 - LLDD registration with the host transport
 - registering host ports (local ports) and target ports seen on
   fabric (remote ports)
 - Data structures and call points for FC-4 LS's and FCP IO requests

Target:
 - LLDD registration with the target transport
 - registering nvme subsystem ports (target ports)
 - Data structures and call points for reception of FC-4 LS's and
   FCP IO requests, and callbacks to perform data and rsp transfers
   for the io.

Add to MAINTAINERS file

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 MAINTAINERS                    |   1 +
 include/linux/nvme-fc-driver.h | 851 +++++++++++++++++++++++++++++++++
 2 files changed, 852 insertions(+)
 create mode 100644 include/linux/nvme-fc-driver.h

diff --git a/MAINTAINERS b/MAINTAINERS
index e630560df72077..539502e7089de4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8664,6 +8664,7 @@ M:	James Smart <james.smart@broadcom.com>
 L:	linux-nvme@lists.infradead.org
 S:	Supported
 F:	include/linux/nvme-fc.h
+F:	include/linux/nvme-fc-driver.h
 
 NVMEM FRAMEWORK
 M:	Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
new file mode 100644
index 00000000000000..f21471f7ee407a
--- /dev/null
+++ b/include/linux/nvme-fc-driver.h
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2016, Avago Technologies
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _NVME_FC_DRIVER_H
+#define _NVME_FC_DRIVER_H 1
+
+
+/*
+ * **********************  LLDD FC-NVME Host API ********************
+ *
+ *  For FC LLDD's that are the NVME Host role.
+ *
+ * ******************************************************************
+ */
+
+
+
+/* FC Port role bitmask - can merge with FC Port Roles in fc transport */
+#define FC_PORT_ROLE_NVME_INITIATOR	0x10
+#define FC_PORT_ROLE_NVME_TARGET	0x11
+#define FC_PORT_ROLE_NVME_DISCOVERY	0x12
+
+
+/**
+ * struct nvme_fc_port_info - port-specific ids and FC connection-specific
+ *                            data element used during NVME Host role
+ *                            registrations
+ *
+ * Static fields describing the port being registered:
+ * @node_name: FC WWNN for the port
+ * @port_name: FC WWPN for the port
+ * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx)
+ *
+ * Initialization values for dynamic port fields:
+ * @port_id:      FC N_Port_ID currently assigned the port. Upper 8 bits must
+ *                be set to 0.
+ */
+struct nvme_fc_port_info {
+	u64			node_name;
+	u64			port_name;
+	u32			port_role;
+	u32			port_id;
+};
+
+
+/**
+ * struct nvmefc_ls_req - Request structure passed from NVME-FC transport
+ *                        to LLDD in order to perform a NVME FC-4 LS
+ *                        request and obtain a response.
+ *
+ * Values set by the NVME-FC layer prior to calling the LLDD ls_req
+ * entrypoint.
+ * @rqstaddr: pointer to request buffer
+ * @rqstdma:  PCI DMA address of request buffer
+ * @rqstlen:  Length, in bytes, of request buffer
+ * @rspaddr:  pointer to response buffer
+ * @rspdma:   PCI DMA address of response buffer
+ * @rsplen:   Length, in bytes, of response buffer
+ * @timeout:  Maximum amount of time, in seconds, to wait for the LS response.
+ *            If timeout exceeded, LLDD to abort LS exchange and complete
+ *            LS request with error status.
+ * @private:  pointer to memory allocated alongside the ls request structure
+ *            that is specifically for the LLDD to use while processing the
+ *            request. The length of the buffer corresponds to the
+ *            lsrqst_priv_sz value specified in the nvme_fc_port_template
+ *            supplied by the LLDD.
+ * @done:     The callback routine the LLDD is to invoke upon completion of
+ *            the LS request. req argument is the pointer to the original LS
+ *            request structure. Status argument must be 0 upon success, a
+ *            negative errno on failure (example: -ENXIO).
+ */
+struct nvmefc_ls_req {
+	void			*rqstaddr;
+	dma_addr_t		rqstdma;
+	u32			rqstlen;
+	void			*rspaddr;
+	dma_addr_t		rspdma;
+	u32			rsplen;
+	u32			timeout;
+
+	void			*private;
+
+	void (*done)(struct nvmefc_ls_req *req, int status);
+
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+
+enum nvmefc_fcp_datadir {
+	NVMEFC_FCP_NODATA,	/* payload_length and sg_cnt will be zero */
+	NVMEFC_FCP_WRITE,
+	NVMEFC_FCP_READ,
+};
+
+
+#define NVME_FC_MAX_SEGMENTS		256
+
+/**
+ * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport
+ *                         to LLDD in order to perform a NVME FCP IO operation.
+ *
+ * Values set by the NVME-FC layer prior to calling the LLDD fcp_io
+ * entrypoint.
+ * @cmdaddr:   pointer to the FCP CMD IU buffer
+ * @rspaddr:   pointer to the FCP RSP IU buffer
+ * @cmddma:    PCI DMA address of the FCP CMD IU buffer
+ * @rspdma:    PCI DMA address of the FCP RSP IU buffer
+ * @cmdlen:    Length, in bytes, of the FCP CMD IU buffer
+ * @rsplen:    Length, in bytes, of the FCP RSP IU buffer
+ * @payload_length: Length of DATA_IN or DATA_OUT payload data to transfer
+ * @sg_table:  scatter/gather structure for payload data
+ * @first_sgl: memory for 1st scatter/gather list segment for payload data
+ * @sg_cnt:    number of elements in the scatter/gather list
+ * @io_dir:    direction of the FCP request (see NVMEFC_FCP_xxx)
+ * @sqid:      The nvme SQID the command is being issued on
+ * @done:      The callback routine the LLDD is to invoke upon completion of
+ *             the FCP operation. req argument is the pointer to the original
+ *             FCP IO operation.
+ * @private:   pointer to memory allocated alongside the FCP operation
+ *             request structure that is specifically for the LLDD to use
+ *             while processing the operation. The length of the buffer
+ *             corresponds to the fcprqst_priv_sz value specified in the
+ *             nvme_fc_port_template supplied by the LLDD.
+ *
+ * Values set by the LLDD indicating completion status of the FCP operation.
+ * Must be set prior to calling the done() callback.
+ * @transferred_length: amount of payload data, in bytes, that were
+ *             transferred. Should equal payload_length on success.
+ * @rcv_rsplen: length, in bytes, of the FCP RSP IU received.
+ * @status:    Completion status of the FCP operation. must be 0 upon success,
+ *             NVME_SC_FC_xxx value upon failure. Note: this is NOT a
+ *             reflection of the NVME CQE completion status. Only the status
+ *             of the FCP operation at the NVME-FC level.
+ */
+struct nvmefc_fcp_req {
+	void			*cmdaddr;
+	void			*rspaddr;
+	dma_addr_t		cmddma;
+	dma_addr_t		rspdma;
+	u16			cmdlen;
+	u16			rsplen;
+
+	u32			payload_length;
+	struct sg_table		sg_table;
+	struct scatterlist	*first_sgl;
+	int			sg_cnt;
+	enum nvmefc_fcp_datadir	io_dir;
+
+	__le16			sqid;
+
+	void (*done)(struct nvmefc_fcp_req *req);
+
+	void			*private;
+
+	u32			transferred_length;
+	u16			rcv_rsplen;
+	u32			status;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+
+/*
+ * Direct copy of fc_port_state enum. For later merging
+ */
+enum nvme_fc_obj_state {
+	FC_OBJSTATE_UNKNOWN,
+	FC_OBJSTATE_NOTPRESENT,
+	FC_OBJSTATE_ONLINE,
+	FC_OBJSTATE_OFFLINE,		/* User has taken Port Offline */
+	FC_OBJSTATE_BLOCKED,
+	FC_OBJSTATE_BYPASSED,
+	FC_OBJSTATE_DIAGNOSTICS,
+	FC_OBJSTATE_LINKDOWN,
+	FC_OBJSTATE_ERROR,
+	FC_OBJSTATE_LOOPBACK,
+	FC_OBJSTATE_DELETED,
+};
+
+
+/**
+ * struct nvme_fc_local_port - structure used between NVME-FC transport and
+ *                 a LLDD to reference a local NVME host port.
+ *                 Allocated/created by the nvme_fc_register_localport()
+ *                 transport interface.
+ *
+ * Fields with static values for the port. Initialized by the
+ * port_info struct supplied to the registration call.
+ * @port_num:  NVME-FC transport host port number
+ * @port_role: NVME roles are supported on the port (see FC_PORT_ROLE_xxx)
+ * @node_name: FC WWNN for the port
+ * @port_name: FC WWPN for the port
+ * @private:   pointer to memory allocated alongside the local port
+ *             structure that is specifically for the LLDD to use.
+ *             The length of the buffer corresponds to the local_priv_sz
+ *             value specified in the nvme_fc_port_template supplied by
+ *             the LLDD.
+ *
+ * Fields with dynamic values. Values may change base on link state. LLDD
+ * may reference fields directly to change them. Initialized by the
+ * port_info struct supplied to the registration call.
+ * @port_id:      FC N_Port_ID currently assigned the port. Upper 8 bits must
+ *                be set to 0.
+ * @port_state:   Operational state of the port.
+ */
+struct nvme_fc_local_port {
+	/* static/read-only fields */
+	u32 port_num;
+	u32 port_role;
+	u64 node_name;
+	u64 port_name;
+
+	void *private;
+
+	/* dynamic fields */
+	u32 port_id;
+	enum nvme_fc_obj_state port_state;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+
+/**
+ * struct nvme_fc_remote_port - structure used between NVME-FC transport and
+ *                 a LLDD to reference a remote NVME subsystem port.
+ *                 Allocated/created by the nvme_fc_register_remoteport()
+ *                 transport interface.
+ *
+ * Fields with static values for the port. Initialized by the
+ * port_info struct supplied to the registration call.
+ * @port_num:  NVME-FC transport remote subsystem port number
+ * @port_role: NVME roles are supported on the port (see FC_PORT_ROLE_xxx)
+ * @node_name: FC WWNN for the port
+ * @port_name: FC WWPN for the port
+ * @localport: pointer to the NVME-FC local host port the subsystem is
+ *             connected to.
+ * @private:   pointer to memory allocated alongside the remote port
+ *             structure that is specifically for the LLDD to use.
+ *             The length of the buffer corresponds to the remote_priv_sz
+ *             value specified in the nvme_fc_port_template supplied by
+ *             the LLDD.
+ *
+ * Fields with dynamic values. Values may change base on link or login
+ * state. LLDD may reference fields directly to change them. Initialized by
+ * the port_info struct supplied to the registration call.
+ * @port_id:      FC N_Port_ID currently assigned the port. Upper 8 bits must
+ *                be set to 0.
+ * @port_state:   Operational state of the remote port. Valid values are
+ *                ONLINE or UNKNOWN.
+ */
+struct nvme_fc_remote_port {
+	/* static fields */
+	u32 port_num;
+	u32 port_role;
+	u64 node_name;
+	u64 port_name;
+
+	struct nvme_fc_local_port *localport;
+
+	void *private;
+
+	/* dynamic fields */
+	u32 port_id;
+	enum nvme_fc_obj_state port_state;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+
+/**
+ * struct nvme_fc_port_template - structure containing static entrypoints and
+ *                 operational parameters for an LLDD that supports NVME host
+ *                 behavior. Passed by reference in port registrations.
+ *                 NVME-FC transport remembers template reference and may
+ *                 access it during runtime operation.
+ *
+ * Host/Initiator Transport Entrypoints/Parameters:
+ *
+ * @localport_delete:  The LLDD initiates deletion of a localport via
+ *       nvme_fc_deregister_localport(). However, the teardown is
+ *       asynchronous. This routine is called upon the completion of the
+ *       teardown to inform the LLDD that the localport has been deleted.
+ *       Entrypoint is Mandatory.
+ *
+ * @remoteport_delete:  The LLDD initiates deletion of a remoteport via
+ *       nvme_fc_deregister_remoteport(). However, the teardown is
+ *       asynchronous. This routine is called upon the completion of the
+ *       teardown to inform the LLDD that the remoteport has been deleted.
+ *       Entrypoint is Mandatory.
+ *
+ * @create_queue:  Upon creating a host<->controller association, queues are
+ *       created such that they can be affinitized to cpus/cores. This
+ *       callback into the LLDD to notify that a controller queue is being
+ *       created.  The LLDD may choose to allocate an associated hw queue
+ *       or map it onto a shared hw queue. Upon return from the call, the
+ *       LLDD specifies a handle that will be given back to it for any
+ *       command that is posted to the controller queue.  The handle can
+ *       be used by the LLDD to map quickly to the proper hw queue for
+ *       command execution.  The mask of cpu's that will map to this queue
+ *       at the block-level is also passed in. The LLDD should use the
+ *       queue id and/or cpu masks to ensure proper affinitization of the
+ *       controller queue to the hw queue.
+ *       Entrypoint is Optional.
+ *
+ * @delete_queue:  This is the inverse of the crete_queue. During
+ *       host<->controller association teardown, this routine is called
+ *       when a controller queue is being terminated. Any association with
+ *       a hw queue should be termined. If there is a unique hw queue, the
+ *       hw queue should be torn down.
+ *       Entrypoint is Optional.
+ *
+ * @poll_queue:  Called to poll for the completion of an io on a blk queue.
+ *       Entrypoint is Optional.
+ *
+ * @ls_req:  Called to issue a FC-NVME FC-4 LS service request.
+ *       The nvme_fc_ls_req structure will fully describe the buffers for
+ *       the request payload and where to place the response payload. The
+ *       LLDD is to allocate an exchange, issue the LS request, obtain the
+ *       LS response, and call the "done" routine specified in the request
+ *       structure (argument to done is the ls request structure itself).
+ *       Entrypoint is Mandatory.
+ *
+ * @fcp_io:  called to issue a FC-NVME I/O request.  The I/O may be for
+ *       an admin queue or an i/o queue.  The nvmefc_fcp_req structure will
+ *       fully describe the io: the buffer containing the FC-NVME CMD IU
+ *       (which contains the SQE), the sg list for the payload if applicable,
+ *       and the buffer to place the FC-NVME RSP IU into.  The LLDD will
+ *       complete the i/o, indicating the amount of data transferred or
+ *       any transport error, and call the "done" routine specified in the
+ *       request structure (argument to done is the fcp request structure
+ *       itself).
+ *       Entrypoint is Mandatory.
+ *
+ * @ls_abort: called to request the LLDD to abort the indicated ls request.
+ *       The call may return before the abort has completed. After aborting
+ *       the request, the LLDD must still call the ls request done routine
+ *       indicating an FC transport Aborted status.
+ *       Entrypoint is Mandatory.
+ *
+ * @fcp_abort: called to request the LLDD to abort the indicated fcp request.
+ *       The call may return before the abort has completed. After aborting
+ *       the request, the LLDD must still call the fcp request done routine
+ *       indicating an FC transport Aborted status.
+ *       Entrypoint is Mandatory.
+ *
+ * @max_hw_queues:  indicates the maximum number of hw queues the LLDD
+ *       supports for cpu affinitization.
+ *       Value is Mandatory. Must be at least 1.
+ *
+ * @max_sgl_segments:  indicates the maximum number of sgl segments supported
+ *       by the LLDD
+ *       Value is Mandatory. Must be at least 1. Recommend at least 256.
+ *
+ * @max_dif_sgl_segments:  indicates the maximum number of sgl segments
+ *       supported by the LLDD for DIF operations.
+ *       Value is Mandatory. Must be at least 1. Recommend at least 256.
+ *
+ * @dma_boundary:  indicates the dma address boundary where dma mappings
+ *       will be split across.
+ *       Value is Mandatory. Typical value is 0xFFFFFFFF to split across
+ *       4Gig address boundarys
+ *
+ * @local_priv_sz: The LLDD sets this field to the amount of additional
+ *       memory that it would like fc nvme layer to allocate on the LLDD's
+ *       behalf whenever a localport is allocated.  The additional memory
+ *       area solely for the of the LLDD and its location is specified by
+ *       the localport->private pointer.
+ *       Value is Mandatory. Allowed to be zero.
+ *
+ * @remote_priv_sz: The LLDD sets this field to the amount of additional
+ *       memory that it would like fc nvme layer to allocate on the LLDD's
+ *       behalf whenever a remoteport is allocated.  The additional memory
+ *       area solely for the of the LLDD and its location is specified by
+ *       the remoteport->private pointer.
+ *       Value is Mandatory. Allowed to be zero.
+ *
+ * @lsrqst_priv_sz: The LLDD sets this field to the amount of additional
+ *       memory that it would like fc nvme layer to allocate on the LLDD's
+ *       behalf whenever a ls request structure is allocated. The additional
+ *       memory area solely for the of the LLDD and its location is
+ *       specified by the ls_request->private pointer.
+ *       Value is Mandatory. Allowed to be zero.
+ *
+ * @fcprqst_priv_sz: The LLDD sets this field to the amount of additional
+ *       memory that it would like fc nvme layer to allocate on the LLDD's
+ *       behalf whenever a fcp request structure is allocated. The additional
+ *       memory area solely for the of the LLDD and its location is
+ *       specified by the fcp_request->private pointer.
+ *       Value is Mandatory. Allowed to be zero.
+ */
+struct nvme_fc_port_template {
+	/* initiator-based functions */
+	void	(*localport_delete)(struct nvme_fc_local_port *);
+	void	(*remoteport_delete)(struct nvme_fc_remote_port *);
+	int	(*create_queue)(struct nvme_fc_local_port *,
+				unsigned int qidx, u16 qsize,
+				void **handle);
+	void	(*delete_queue)(struct nvme_fc_local_port *,
+				unsigned int qidx, void *handle);
+	void	(*poll_queue)(struct nvme_fc_local_port *, void *handle);
+	int	(*ls_req)(struct nvme_fc_local_port *,
+				struct nvme_fc_remote_port *,
+				struct nvmefc_ls_req *);
+	int	(*fcp_io)(struct nvme_fc_local_port *,
+				struct nvme_fc_remote_port *,
+				void *hw_queue_handle,
+				struct nvmefc_fcp_req *);
+	void	(*ls_abort)(struct nvme_fc_local_port *,
+				struct nvme_fc_remote_port *,
+				struct nvmefc_ls_req *);
+	void	(*fcp_abort)(struct nvme_fc_local_port *,
+				struct nvme_fc_remote_port *,
+				void *hw_queue_handle,
+				struct nvmefc_fcp_req *);
+
+	u32	max_hw_queues;
+	u16	max_sgl_segments;
+	u16	max_dif_sgl_segments;
+	u64	dma_boundary;
+
+	/* sizes of additional private data for data structures */
+	u32	local_priv_sz;
+	u32	remote_priv_sz;
+	u32	lsrqst_priv_sz;
+	u32	fcprqst_priv_sz;
+};
+
+
+/*
+ * Initiator/Host functions
+ */
+
+int nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
+			struct nvme_fc_port_template *template,
+			struct device *dev,
+			struct nvme_fc_local_port **lport_p);
+
+int nvme_fc_unregister_localport(struct nvme_fc_local_port *localport);
+
+int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
+			struct nvme_fc_port_info *pinfo,
+			struct nvme_fc_remote_port **rport_p);
+
+int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport);
+
+
+
+/*
+ * ***************  LLDD FC-NVME Target/Subsystem API ***************
+ *
+ *  For FC LLDD's that are the NVME Subsystem role
+ *
+ * ******************************************************************
+ */
+
+/**
+ * struct nvmet_fc_port_info - port-specific ids and FC connection-specific
+ *                             data element used during NVME Subsystem role
+ *                             registrations
+ *
+ * Static fields describing the port being registered:
+ * @node_name: FC WWNN for the port
+ * @port_name: FC WWPN for the port
+ *
+ * Initialization values for dynamic port fields:
+ * @port_id:      FC N_Port_ID currently assigned the port. Upper 8 bits must
+ *                be set to 0.
+ */
+struct nvmet_fc_port_info {
+	u64			node_name;
+	u64			port_name;
+	u32			port_id;
+};
+
+
+/**
+ * struct nvmefc_tgt_ls_req - Structure used between LLDD and NVMET-FC
+ *                            layer to represent the exchange context for
+ *                            a FC-NVME Link Service (LS).
+ *
+ * The structure is allocated by the LLDD whenever a LS Request is received
+ * from the FC link. The address of the structure is passed to the nvmet-fc
+ * layer via the nvmet_fc_rcv_ls_req() call. The address of the structure
+ * will be passed back to the LLDD when the response is to be transmit.
+ * The LLDD is to use the address to map back to the LLDD exchange structure
+ * which maintains information such as the targetport the LS was received
+ * on, the remote FC NVME initiator that sent the LS, and any FC exchange
+ * context.  Upon completion of the LS response transmit, the address of the
+ * structure will be passed back to the LS rsp done() routine, allowing the
+ * nvmet-fc layer to release dma resources. Upon completion of the done()
+ * routine, no further access will be made by the nvmet-fc layer and the
+ * LLDD can de-allocate the structure.
+ *
+ * Field initialization:
+ *   At the time of the nvmet_fc_rcv_ls_req() call, there is no content that
+ *     is valid in the structure.
+ *
+ *   When the structure is used for the LLDD->xmt_ls_rsp() call, the nvmet-fc
+ *     layer will fully set the fields in order to specify the response
+ *     payload buffer and its length as well as the done routine to be called
+ *     upon compeletion of the transmit.  The nvmet-fc layer will also set a
+ *     private pointer for its own use in the done routine.
+ *
+ * Values set by the NVMET-FC layer prior to calling the LLDD xmt_ls_rsp
+ * entrypoint.
+ * @rspbuf:   pointer to the LS response buffer
+ * @rspdma:   PCI DMA address of the LS response buffer
+ * @rsplen:   Length, in bytes, of the LS response buffer
+ * @done:     The callback routine the LLDD is to invoke upon completion of
+ *            transmitting the LS response. req argument is the pointer to
+ *            the original ls request.
+ * @nvmet_fc_private:  pointer to an internal NVMET-FC layer structure used
+ *            as part of the NVMET-FC processing. The LLDD is not to access
+ *            this pointer.
+ */
+struct nvmefc_tgt_ls_req {
+	void		*rspbuf;
+	dma_addr_t	rspdma;
+	u16		rsplen;
+
+	void (*done)(struct nvmefc_tgt_ls_req *req);
+	void *nvmet_fc_private;		/* LLDD is not to access !! */
+};
+
+/* Operations that NVME-FC layer may request the LLDD to perform for FCP */
+enum {
+	NVMET_FCOP_READDATA	= 1,	/* xmt data to initiator */
+	NVMET_FCOP_WRITEDATA	= 2,	/* xmt data from initiator */
+	NVMET_FCOP_READDATA_RSP	= 3,	/* xmt data to initiator and send
+					 * rsp as well
+					 */
+	NVMET_FCOP_RSP		= 4,	/* send rsp frame */
+	NVMET_FCOP_ABORT	= 5,	/* abort exchange via ABTS */
+	NVMET_FCOP_BA_ACC	= 6,	/* send BA_ACC */
+	NVMET_FCOP_BA_RJT	= 7,	/* send BA_RJT */
+};
+
+/**
+ * struct nvmefc_tgt_fcp_req - Structure used between LLDD and NVMET-FC
+ *                            layer to represent the exchange context and
+ *                            the specific FC-NVME IU operation(s) to perform
+ *                            for a FC-NVME FCP IO.
+ *
+ * Structure used between LLDD and nvmet-fc layer to represent the exchange
+ * context for a FC-NVME FCP I/O operation (e.g. a nvme sqe, the sqe-related
+ * memory transfers, and its assocated cqe transfer).
+ *
+ * The structure is allocated by the LLDD whenever a FCP CMD IU is received
+ * from the FC link. The address of the structure is passed to the nvmet-fc
+ * layer via the nvmet_fc_rcv_fcp_req() call. The address of the structure
+ * will be passed back to the LLDD for the data operations and transmit of
+ * the response. The LLDD is to use the address to map back to the LLDD
+ * exchange structure which maintains information such as the targetport
+ * the FCP I/O was received on, the remote FC NVME initiator that sent the
+ * FCP I/O, and any FC exchange context.  Upon completion of the FCP target
+ * operation, the address of the structure will be passed back to the FCP
+ * op done() routine, allowing the nvmet-fc layer to release dma resources.
+ * Upon completion of the done() routine for either RSP or ABORT ops, no
+ * further access will be made by the nvmet-fc layer and the LLDD can
+ * de-allocate the structure.
+ *
+ * Field initialization:
+ *   At the time of the nvmet_fc_rcv_fcp_req() call, there is no content that
+ *     is valid in the structure.
+ *
+ *   When the structure is used for an FCP target operation, the nvmet-fc
+ *     layer will fully set the fields in order to specify the scattergather
+ *     list, the transfer length, as well as the done routine to be called
+ *     upon compeletion of the operation.  The nvmet-fc layer will also set a
+ *     private pointer for its own use in the done routine.
+ *
+ * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !!
+ *
+ * Values set by the NVMET-FC layer prior to calling the LLDD fcp_op
+ * entrypoint.
+ * @op:       Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx)
+ * @hwqid:    Specifies the hw queue index (0..N-1, where N is the
+ *            max_hw_queues value from the LLD's nvmet_fc_target_template)
+ *            that the operation is to use.
+ * @offset:   Indicates the DATA_OUT/DATA_IN payload offset to be tranferred.
+ *            Field is only valid on WRITEDATA, READDATA, or READDATA_RSP ops.
+ * @timeout:  amount of time, in seconds, to wait for a response from the NVME
+ *            host. A value of 0 is an infinite wait.
+ *            Valid only for the following ops:
+ *              WRITEDATA: caps the wait for data reception
+ *              READDATA_RSP & RSP: caps wait for FCP_CONF reception (if used)
+ * @transfer_length: the length, in bytes, of the DATA_OUT or DATA_IN payload
+ *            that is to be transferred.
+ *            Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops.
+ * @ba_rjt:   Contains the BA_RJT payload that is to be transferred.
+ *            Valid only for the NVMET_FCOP_BA_RJT op.
+ * @sg:       Scatter/gather list for the DATA_OUT/DATA_IN payload data.
+ *            Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops.
+ * @sg_cnt:   Number of valid entries in the scatter/gather list.
+ *            Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops.
+ * @rspaddr:  pointer to the FCP RSP IU buffer to be transmit
+ *            Used by RSP and READDATA_RSP ops
+ * @rspdma:   PCI DMA address of the FCP RSP IU buffer
+ *            Used by RSP and READDATA_RSP ops
+ * @rsplen:   Length, in bytes, of the FCP RSP IU buffer
+ *            Used by RSP and READDATA_RSP ops
+ * @done:     The callback routine the LLDD is to invoke upon completion of
+ *            the operation. req argument is the pointer to the original
+ *            FCP subsystem op request.
+ * @nvmet_fc_private:  pointer to an internal NVMET-FC layer structure used
+ *            as part of the NVMET-FC processing. The LLDD is not to
+ *            reference this field.
+ *
+ * Values set by the LLDD indicating completion status of the FCP operation.
+ * Must be set prior to calling the done() callback.
+ * @transferred_length: amount of DATA_OUT payload data received by a
+ *            a WRITEDATA operation. If not a WRITEDATA operation, value must
+ *            be set to 0. Should equal transfer_length on success.
+ * @fcp_error: status of the FCP operation. Must be 0 on success; on failure
+ *            must be a NVME_SC_FC_xxxx value.
+ */
+struct nvmefc_tgt_fcp_req {
+	u8			op;
+	u16			hwqid;
+	u32			offset;
+	u32			timeout;
+	u32			transfer_length;
+	struct fc_ba_rjt	ba_rjt;
+	struct scatterlist	sg[NVME_FC_MAX_SEGMENTS];
+	int			sg_cnt;
+	void			*rspaddr;
+	dma_addr_t		rspdma;
+	u16			rsplen;
+
+	void (*done)(struct nvmefc_tgt_fcp_req *);
+
+	void *nvmet_fc_private;		/* LLDD is not to access !! */
+
+	u32			transferred_length;
+	int			fcp_error;
+};
+
+
+/* Target Features (Bit fields) LLDD supports */
+enum {
+	NVMET_FCTGTFEAT_READDATA_RSP = (1 << 0),
+		/* Bit 0: supports the NVMET_FCPOP_READDATA_RSP op, which
+		 * sends (the last) Read Data sequence followed by the RSP
+		 * sequence in one LLDD operation. Errors during Data
+		 * sequence transmit must not allow RSP sequence to be sent.
+		 */
+	NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED = (1 << 1),
+		/* Bit 1: When 0, the LLDD will deliver FCP CMD
+		 * on the CPU it should be affinitized to. Thus work will
+		 * be scheduled on the cpu received on. When 1, the LLDD
+		 * may not deliver the CMD on the CPU it should be worked
+		 * on. The transport should pick a cpu to schedule the work
+		 * on.
+		 */
+};
+
+
+/**
+ * struct nvmet_fc_target_port - structure used between NVME-FC transport and
+ *                 a LLDD to reference a local NVME subsystem port.
+ *                 Allocated/created by the nvme_fc_register_targetport()
+ *                 transport interface.
+ *
+ * Fields with static values for the port. Initialized by the
+ * port_info struct supplied to the registration call.
+ * @port_num:  NVME-FC transport subsytem port number
+ * @node_name: FC WWNN for the port
+ * @port_name: FC WWPN for the port
+ * @private:   pointer to memory allocated alongside the local port
+ *             structure that is specifically for the LLDD to use.
+ *             The length of the buffer corresponds to the target_priv_sz
+ *             value specified in the nvme_fc_target_template supplied by
+ *             the LLDD.
+ *
+ * Fields with dynamic values. Values may change base on link state. LLDD
+ * may reference fields directly to change them. Initialized by the
+ * port_info struct supplied to the registration call.
+ * @port_id:      FC N_Port_ID currently assigned the port. Upper 8 bits must
+ *                be set to 0.
+ * @port_state:   Operational state of the port.
+ */
+struct nvmet_fc_target_port {
+	/* static/read-only fields */
+	u32 port_num;
+	u64 node_name;
+	u64 port_name;
+
+	void *private;
+
+	/* dynamic fields */
+	u32 port_id;
+	enum nvme_fc_obj_state port_state;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+
+/**
+ * struct nvmet_fc_target_template - structure containing static entrypoints
+ *                 and operational parameters for an LLDD that supports NVME
+ *                 subsystem behavior. Passed by reference in port
+ *                 registrations. NVME-FC transport remembers template
+ *                 reference and may access it during runtime operation.
+ *
+ * Subsystem/Target Transport Entrypoints/Parameters:
+ *
+ * @targetport_delete:  The LLDD initiates deletion of a targetport via
+ *       nvmet_fc_unregister_targetport(). However, the teardown is
+ *       asynchronous. This routine is called upon the completion of the
+ *       teardown to inform the LLDD that the targetport has been deleted.
+ *       Entrypoint is Mandatory.
+ *
+ * @xmt_ls_rsp:  Called to transmit the response to a FC-NVME FC-4 LS service.
+ *       The nvmefc_tgt_ls_req structure is the same LLDD-supplied exchange
+ *       structure specified in the nvmet_fc_rcv_ls_req() call made when
+ *       the LS request was received.  The structure will fully describe
+ *       the buffers for the response payload and the dma address of the
+ *       payload. The LLDD is to transmit the response (or return a non-zero
+ *       errno status), and upon completion of the transmit, call the
+ *       "done" routine specified in the nvmefc_tgt_ls_req structure
+ *       (argument to done is the ls reqwuest structure itself).
+ *       After calling the done routine, the LLDD shall consider the
+ *       LS handling complete and the nvmefc_tgt_ls_req structure may
+ *       be freed/released.
+ *       Entrypoint is Mandatory.
+ *
+ * @fcp_op:  Called to perform a data transfer, transmit a response, or
+ *       abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same
+ *       LLDD-supplied exchange structure specified in the
+ *       nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received.
+ *       The op field in the structure shall indicate the operation for
+ *       the LLDD to perform relative to the io.
+ *         NVMET_FCOP_READDATA operation: the LLDD is to send the
+ *           payload data (described by sglist) to the host in 1 or
+ *           more FC sequences (preferrably 1).  Note: the fc-nvme layer
+ *           may call the READDATA operation multiple times for longer
+ *           payloads.
+ *         NVMET_FCOP_WRITEDATA operation: the LLDD is to receive the
+ *           payload data (described by sglist) from the host via 1 or
+ *           more FC sequences (preferrably 1). The LLDD is to generate
+ *           the XFER_RDY IU(s) corresponding to the data being requested.
+ *           Note: the FC-NVME layer may call the WRITEDATA operation
+ *           multiple times for longer payloads.
+ *         NVMET_FCOP_READDATA_RSP operation: the LLDD is to send the
+ *           payload data (described by sglist) to the host in 1 or
+ *           more FC sequences (preferrably 1). If an error occurs during
+ *           payload data transmission, the LLDD is to set the
+ *           nvmefc_tgt_fcp_req fcp_error and transferred_length field, then
+ *           consider the operation complete. On error, the LLDD is to not
+ *           transmit the FCP_RSP iu. If all payload data is transferred
+ *           successfully, the LLDD is to update the nvmefc_tgt_fcp_req
+ *           transferred_length field and may subsequently transmit the
+ *           FCP_RSP iu payload (described by rspbuf, rspdma, rsplen).
+ *           The LLDD is to await FCP_CONF reception to confirm the RSP
+ *           reception by the host. The LLDD may retramsit the FCP_RSP iu
+ *           if necessary per FC-NVME. Upon reception of FCP_CONF, or upon
+ *           FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req
+ *           fcp_error field and consider the operation complete..
+ *         NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload
+ *           (described by rspbuf, rspdma, rsplen).  The LLDD is to await
+ *           FCP_CONF reception to confirm the RSP reception by the host.
+ *           The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME.
+ *           Upon reception of FCP_CONF, or upon FCP_CONF failure, the
+ *           LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
+ *           consider the operation complete..
+ *         NVMET_FCOP_ABORT: the LLDD is to terminate the exchange
+ *           corresponding to the fcp operation. The LLDD shall send
+ *           ABTS and follow FC exchange abort-multi rules, including
+ *           ABTS retries and possible logout.
+ *       Upon completing the indicated operation, the LLDD is to set the
+ *       status fields for the operation (tranferred_length and fcp_error
+ *       status) in the request, then all the "done" routine
+ *       indicated in the fcp request.  Upon return from the "done"
+ *       routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation
+ *       the fc-nvme layer will not longer reference the fcp request,
+ *       allowing the LLDD to free/release the fcp request.
+ *       Note: when calling the done routine for READDATA or WRITEDATA
+ *       operations, the fc-nvme layer may immediate convert, in the same
+ *       thread and before returning to the LLDD, the fcp operation to
+ *       the next operation for the fcp io and call the LLDDs fcp_op
+ *       call again. If fields in the fcp request are to be accessed post
+ *       the done call, the LLDD should save their values prior to calling
+ *       the done routine, and inspect the save values after the done
+ *       routine.
+ *       Returns 0 on success, -<errno> on failure (Ex: -EIO)
+ *       Entrypoint is Mandatory.
+ *
+ * @max_hw_queues:  indicates the maximum number of hw queues the LLDD
+ *       supports for cpu affinitization.
+ *       Value is Mandatory. Must be at least 1.
+ *
+ * @max_sgl_segments:  indicates the maximum number of sgl segments supported
+ *       by the LLDD
+ *       Value is Mandatory. Must be at least 1. Recommend at least 256.
+ *
+ * @max_dif_sgl_segments:  indicates the maximum number of sgl segments
+ *       supported by the LLDD for DIF operations.
+ *       Value is Mandatory. Must be at least 1. Recommend at least 256.
+ *
+ * @dma_boundary:  indicates the dma address boundary where dma mappings
+ *       will be split across.
+ *       Value is Mandatory. Typical value is 0xFFFFFFFF to split across
+ *       4Gig address boundarys
+ *
+ * @target_features: The LLDD sets bits in this field to correspond to
+ *       optional features that are supported by the LLDD.
+ *       Refer to the NVMET_FCTGTFEAT_xxx values.
+ *       Value is Mandatory. Allowed to be zero.
+ *
+ * @target_priv_sz: The LLDD sets this field to the amount of additional
+ *       memory that it would like fc nvme layer to allocate on the LLDD's
+ *       behalf whenever a targetport is allocated.  The additional memory
+ *       area solely for the of the LLDD and its location is specified by
+ *       the targetport->private pointer.
+ *       Value is Mandatory. Allowed to be zero.
+ */
+struct nvmet_fc_target_template {
+	void (*targetport_delete)(struct nvmet_fc_target_port *tgtport);
+	int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport,
+				struct nvmefc_tgt_ls_req *tls_req);
+	int (*fcp_op)(struct nvmet_fc_target_port *tgtport,
+				struct nvmefc_tgt_fcp_req *);
+
+	u32	max_hw_queues;
+	u16	max_sgl_segments;
+	u16	max_dif_sgl_segments;
+	u64	dma_boundary;
+
+	u32	target_features;
+
+	u32	target_priv_sz;
+};
+
+
+int nvmet_fc_register_targetport(struct nvmet_fc_port_info *portinfo,
+			struct nvmet_fc_target_template *template,
+			struct device *dev,
+			struct nvmet_fc_target_port **tgtport_p);
+
+int nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *tgtport);
+
+int nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *tgtport,
+			struct nvmefc_tgt_ls_req *lsreq,
+			void *lsreqbuf, u32 lsreqbuf_len);
+
+int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport,
+			struct nvmefc_tgt_fcp_req *fcpreq,
+			void *cmdiubuf, u32 cmdiubuf_len);
+
+#endif /* _NVME_FC_DRIVER_H */

From e399441de9115cd472b8ace6c517708273ca7997 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 2 Dec 2016 00:28:42 -0800
Subject: [PATCH 171/182] nvme-fabrics: Add host support for FC transport

Implements the FC-NVME T11 definition of how nvme fabric capsules are
performed on an FC fabric. Utilizes a lower-layer API to FC host adapters
to send/receive FC-4 LS operations and FCP operations that comprise NVME
over FC operation.

The T11 definitions for FC-4 Link Services are implemented which create
NVMeOF connections.  Implements the hooks with blk-mq to then submit admin
and io requests to the different connections.

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 MAINTAINERS                |    1 +
 drivers/nvme/host/Kconfig  |   17 +
 drivers/nvme/host/Makefile |    3 +
 drivers/nvme/host/fc.c     | 2586 ++++++++++++++++++++++++++++++++++++
 4 files changed, 2607 insertions(+)
 create mode 100644 drivers/nvme/host/fc.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 539502e7089de4..00505700b30a31 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8665,6 +8665,7 @@ L:	linux-nvme@lists.infradead.org
 S:	Supported
 F:	include/linux/nvme-fc.h
 F:	include/linux/nvme-fc-driver.h
+F:	drivers/nvme/host/fc.c
 
 NVMEM FRAMEWORK
 M:	Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index f7d37a62f874b7..90745a616df7c9 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -43,3 +43,20 @@ config NVME_RDMA
 	  from https://github.com/linux-nvme/nvme-cli.
 
 	  If unsure, say N.
+
+config NVME_FC
+	tristate "NVM Express over Fabrics FC host driver"
+	depends on BLOCK
+	depends on HAS_DMA
+	select NVME_CORE
+	select NVME_FABRICS
+	select SG_POOL
+	help
+	  This provides support for the NVMe over Fabrics protocol using
+	  the FC transport.  This allows you to use remote block devices
+	  exported using the NVMe protocol set.
+
+	  To configure a NVMe over Fabrics controller use the nvme-cli tool
+	  from https://github.com/linux-nvme/nvme-cli.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 47abcec23514ba..f1a7d945fbb662 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_NVME_CORE)			+= nvme-core.o
 obj-$(CONFIG_BLK_DEV_NVME)		+= nvme.o
 obj-$(CONFIG_NVME_FABRICS)		+= nvme-fabrics.o
 obj-$(CONFIG_NVME_RDMA)			+= nvme-rdma.o
+obj-$(CONFIG_NVME_FC)			+= nvme-fc.o
 
 nvme-core-y				:= core.o
 nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI)	+= scsi.o
@@ -12,3 +13,5 @@ nvme-y					+= pci.o
 nvme-fabrics-y				+= fabrics.o
 
 nvme-rdma-y				+= rdma.o
+
+nvme-fc-y				+= fc.o
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
new file mode 100644
index 00000000000000..771e2e76187222
--- /dev/null
+++ b/drivers/nvme/host/fc.c
@@ -0,0 +1,2586 @@
+/*
+ * Copyright (c) 2016 Avago Technologies.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful.
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
+ * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
+ * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
+ * See the GNU General Public License for more details, a copy of which
+ * can be found in the file COPYING included with this package
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <uapi/scsi/fc/fc_fs.h>
+#include <uapi/scsi/fc/fc_els.h>
+
+#include "nvme.h"
+#include "fabrics.h"
+#include <linux/nvme-fc-driver.h>
+#include <linux/nvme-fc.h>
+
+
+/* *************************** Data Structures/Defines ****************** */
+
+
+/*
+ * We handle AEN commands ourselves and don't even let the
+ * block layer know about them.
+ */
+#define NVME_FC_NR_AEN_COMMANDS	1
+#define NVME_FC_AQ_BLKMQ_DEPTH	\
+	(NVMF_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS)
+#define AEN_CMDID_BASE		(NVME_FC_AQ_BLKMQ_DEPTH + 1)
+
+enum nvme_fc_queue_flags {
+	NVME_FC_Q_CONNECTED = (1 << 0),
+};
+
+#define NVMEFC_QUEUE_DELAY	3		/* ms units */
+
+struct nvme_fc_queue {
+	struct nvme_fc_ctrl	*ctrl;
+	struct device		*dev;
+	struct blk_mq_hw_ctx	*hctx;
+	void			*lldd_handle;
+	int			queue_size;
+	size_t			cmnd_capsule_len;
+	u32			qnum;
+	u32			rqcnt;
+	u32			seqno;
+
+	u64			connection_id;
+	atomic_t		csn;
+
+	unsigned long		flags;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+struct nvmefc_ls_req_op {
+	struct nvmefc_ls_req	ls_req;
+
+	struct nvme_fc_ctrl	*ctrl;
+	struct nvme_fc_queue	*queue;
+	struct request		*rq;
+
+	int			ls_error;
+	struct completion	ls_done;
+	struct list_head	lsreq_list;	/* ctrl->ls_req_list */
+	bool			req_queued;
+};
+
+enum nvme_fcpop_state {
+	FCPOP_STATE_UNINIT	= 0,
+	FCPOP_STATE_IDLE	= 1,
+	FCPOP_STATE_ACTIVE	= 2,
+	FCPOP_STATE_ABORTED	= 3,
+};
+
+struct nvme_fc_fcp_op {
+	struct nvme_request	nreq;		/*
+						 * nvme/host/core.c
+						 * requires this to be
+						 * the 1st element in the
+						 * private structure
+						 * associated with the
+						 * request.
+						 */
+	struct nvmefc_fcp_req	fcp_req;
+
+	struct nvme_fc_ctrl	*ctrl;
+	struct nvme_fc_queue	*queue;
+	struct request		*rq;
+
+	atomic_t		state;
+	u32			rqno;
+	u32			nents;
+
+	struct nvme_fc_cmd_iu	cmd_iu;
+	struct nvme_fc_ersp_iu	rsp_iu;
+};
+
+struct nvme_fc_lport {
+	struct nvme_fc_local_port	localport;
+
+	struct ida			endp_cnt;
+	struct list_head		port_list;	/* nvme_fc_port_list */
+	struct list_head		endp_list;
+	struct device			*dev;	/* physical device for dma */
+	struct nvme_fc_port_template	*ops;
+	struct kref			ref;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+struct nvme_fc_rport {
+	struct nvme_fc_remote_port	remoteport;
+
+	struct list_head		endp_list; /* for lport->endp_list */
+	struct list_head		ctrl_list;
+	spinlock_t			lock;
+	struct kref			ref;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+enum nvme_fcctrl_state {
+	FCCTRL_INIT		= 0,
+	FCCTRL_ACTIVE		= 1,
+};
+
+struct nvme_fc_ctrl {
+	spinlock_t		lock;
+	struct nvme_fc_queue	*queues;
+	u32			queue_count;
+
+	struct device		*dev;
+	struct nvme_fc_lport	*lport;
+	struct nvme_fc_rport	*rport;
+	u32			cnum;
+
+	u64			association_id;
+
+	u64			cap;
+
+	struct list_head	ctrl_list;	/* rport->ctrl_list */
+	struct list_head	ls_req_list;
+
+	struct blk_mq_tag_set	admin_tag_set;
+	struct blk_mq_tag_set	tag_set;
+
+	struct work_struct	delete_work;
+	struct kref		ref;
+	int			state;
+
+	struct nvme_fc_fcp_op	aen_ops[NVME_FC_NR_AEN_COMMANDS];
+
+	struct nvme_ctrl	ctrl;
+};
+
+static inline struct nvme_fc_ctrl *
+to_fc_ctrl(struct nvme_ctrl *ctrl)
+{
+	return container_of(ctrl, struct nvme_fc_ctrl, ctrl);
+}
+
+static inline struct nvme_fc_lport *
+localport_to_lport(struct nvme_fc_local_port *portptr)
+{
+	return container_of(portptr, struct nvme_fc_lport, localport);
+}
+
+static inline struct nvme_fc_rport *
+remoteport_to_rport(struct nvme_fc_remote_port *portptr)
+{
+	return container_of(portptr, struct nvme_fc_rport, remoteport);
+}
+
+static inline struct nvmefc_ls_req_op *
+ls_req_to_lsop(struct nvmefc_ls_req *lsreq)
+{
+	return container_of(lsreq, struct nvmefc_ls_req_op, ls_req);
+}
+
+static inline struct nvme_fc_fcp_op *
+fcp_req_to_fcp_op(struct nvmefc_fcp_req *fcpreq)
+{
+	return container_of(fcpreq, struct nvme_fc_fcp_op, fcp_req);
+}
+
+
+
+/* *************************** Globals **************************** */
+
+
+static DEFINE_SPINLOCK(nvme_fc_lock);
+
+static LIST_HEAD(nvme_fc_lport_list);
+static DEFINE_IDA(nvme_fc_local_port_cnt);
+static DEFINE_IDA(nvme_fc_ctrl_cnt);
+
+static struct workqueue_struct *nvme_fc_wq;
+
+
+
+/* *********************** FC-NVME Port Management ************************ */
+
+static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *);
+static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
+			struct nvme_fc_queue *, unsigned int);
+
+
+/**
+ * nvme_fc_register_localport - transport entry point called by an
+ *                              LLDD to register the existence of a NVME
+ *                              host FC port.
+ * @pinfo:     pointer to information about the port to be registered
+ * @template:  LLDD entrypoints and operational parameters for the port
+ * @dev:       physical hardware device node port corresponds to. Will be
+ *             used for DMA mappings
+ * @lport_p:   pointer to a local port pointer. Upon success, the routine
+ *             will allocate a nvme_fc_local_port structure and place its
+ *             address in the local port pointer. Upon failure, local port
+ *             pointer will be set to 0.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
+			struct nvme_fc_port_template *template,
+			struct device *dev,
+			struct nvme_fc_local_port **portptr)
+{
+	struct nvme_fc_lport *newrec;
+	unsigned long flags;
+	int ret, idx;
+
+	if (!template->localport_delete || !template->remoteport_delete ||
+	    !template->ls_req || !template->fcp_io ||
+	    !template->ls_abort || !template->fcp_abort ||
+	    !template->max_hw_queues || !template->max_sgl_segments ||
+	    !template->max_dif_sgl_segments || !template->dma_boundary) {
+		ret = -EINVAL;
+		goto out_reghost_failed;
+	}
+
+	newrec = kmalloc((sizeof(*newrec) + template->local_priv_sz),
+			 GFP_KERNEL);
+	if (!newrec) {
+		ret = -ENOMEM;
+		goto out_reghost_failed;
+	}
+
+	idx = ida_simple_get(&nvme_fc_local_port_cnt, 0, 0, GFP_KERNEL);
+	if (idx < 0) {
+		ret = -ENOSPC;
+		goto out_fail_kfree;
+	}
+
+	if (!get_device(dev) && dev) {
+		ret = -ENODEV;
+		goto out_ida_put;
+	}
+
+	INIT_LIST_HEAD(&newrec->port_list);
+	INIT_LIST_HEAD(&newrec->endp_list);
+	kref_init(&newrec->ref);
+	newrec->ops = template;
+	newrec->dev = dev;
+	ida_init(&newrec->endp_cnt);
+	newrec->localport.private = &newrec[1];
+	newrec->localport.node_name = pinfo->node_name;
+	newrec->localport.port_name = pinfo->port_name;
+	newrec->localport.port_role = pinfo->port_role;
+	newrec->localport.port_id = pinfo->port_id;
+	newrec->localport.port_state = FC_OBJSTATE_ONLINE;
+	newrec->localport.port_num = idx;
+
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	list_add_tail(&newrec->port_list, &nvme_fc_lport_list);
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	if (dev)
+		dma_set_seg_boundary(dev, template->dma_boundary);
+
+	*portptr = &newrec->localport;
+	return 0;
+
+out_ida_put:
+	ida_simple_remove(&nvme_fc_local_port_cnt, idx);
+out_fail_kfree:
+	kfree(newrec);
+out_reghost_failed:
+	*portptr = NULL;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_fc_register_localport);
+
+static void
+nvme_fc_free_lport(struct kref *ref)
+{
+	struct nvme_fc_lport *lport =
+		container_of(ref, struct nvme_fc_lport, ref);
+	unsigned long flags;
+
+	WARN_ON(lport->localport.port_state != FC_OBJSTATE_DELETED);
+	WARN_ON(!list_empty(&lport->endp_list));
+
+	/* remove from transport list */
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	list_del(&lport->port_list);
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	/* let the LLDD know we've finished tearing it down */
+	lport->ops->localport_delete(&lport->localport);
+
+	ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num);
+	ida_destroy(&lport->endp_cnt);
+
+	put_device(lport->dev);
+
+	kfree(lport);
+}
+
+static void
+nvme_fc_lport_put(struct nvme_fc_lport *lport)
+{
+	kref_put(&lport->ref, nvme_fc_free_lport);
+}
+
+static int
+nvme_fc_lport_get(struct nvme_fc_lport *lport)
+{
+	return kref_get_unless_zero(&lport->ref);
+}
+
+/**
+ * nvme_fc_unregister_localport - transport entry point called by an
+ *                              LLDD to deregister/remove a previously
+ *                              registered a NVME host FC port.
+ * @localport: pointer to the (registered) local port that is to be
+ *             deregistered.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr)
+{
+	struct nvme_fc_lport *lport = localport_to_lport(portptr);
+	unsigned long flags;
+
+	if (!portptr)
+		return -EINVAL;
+
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+
+	if (portptr->port_state != FC_OBJSTATE_ONLINE) {
+		spin_unlock_irqrestore(&nvme_fc_lock, flags);
+		return -EINVAL;
+	}
+	portptr->port_state = FC_OBJSTATE_DELETED;
+
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	nvme_fc_lport_put(lport);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport);
+
+/**
+ * nvme_fc_register_remoteport - transport entry point called by an
+ *                              LLDD to register the existence of a NVME
+ *                              subsystem FC port on its fabric.
+ * @localport: pointer to the (registered) local port that the remote
+ *             subsystem port is connected to.
+ * @pinfo:     pointer to information about the port to be registered
+ * @rport_p:   pointer to a remote port pointer. Upon success, the routine
+ *             will allocate a nvme_fc_remote_port structure and place its
+ *             address in the remote port pointer. Upon failure, remote port
+ *             pointer will be set to 0.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
+				struct nvme_fc_port_info *pinfo,
+				struct nvme_fc_remote_port **portptr)
+{
+	struct nvme_fc_lport *lport = localport_to_lport(localport);
+	struct nvme_fc_rport *newrec;
+	unsigned long flags;
+	int ret, idx;
+
+	newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz),
+			 GFP_KERNEL);
+	if (!newrec) {
+		ret = -ENOMEM;
+		goto out_reghost_failed;
+	}
+
+	if (!nvme_fc_lport_get(lport)) {
+		ret = -ESHUTDOWN;
+		goto out_kfree_rport;
+	}
+
+	idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL);
+	if (idx < 0) {
+		ret = -ENOSPC;
+		goto out_lport_put;
+	}
+
+	INIT_LIST_HEAD(&newrec->endp_list);
+	INIT_LIST_HEAD(&newrec->ctrl_list);
+	kref_init(&newrec->ref);
+	spin_lock_init(&newrec->lock);
+	newrec->remoteport.localport = &lport->localport;
+	newrec->remoteport.private = &newrec[1];
+	newrec->remoteport.port_role = pinfo->port_role;
+	newrec->remoteport.node_name = pinfo->node_name;
+	newrec->remoteport.port_name = pinfo->port_name;
+	newrec->remoteport.port_id = pinfo->port_id;
+	newrec->remoteport.port_state = FC_OBJSTATE_ONLINE;
+	newrec->remoteport.port_num = idx;
+
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	list_add_tail(&newrec->endp_list, &lport->endp_list);
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	*portptr = &newrec->remoteport;
+	return 0;
+
+out_lport_put:
+	nvme_fc_lport_put(lport);
+out_kfree_rport:
+	kfree(newrec);
+out_reghost_failed:
+	*portptr = NULL;
+	return ret;
+
+}
+EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport);
+
+static void
+nvme_fc_free_rport(struct kref *ref)
+{
+	struct nvme_fc_rport *rport =
+		container_of(ref, struct nvme_fc_rport, ref);
+	struct nvme_fc_lport *lport =
+			localport_to_lport(rport->remoteport.localport);
+	unsigned long flags;
+
+	WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED);
+	WARN_ON(!list_empty(&rport->ctrl_list));
+
+	/* remove from lport list */
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	list_del(&rport->endp_list);
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	/* let the LLDD know we've finished tearing it down */
+	lport->ops->remoteport_delete(&rport->remoteport);
+
+	ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num);
+
+	kfree(rport);
+
+	nvme_fc_lport_put(lport);
+}
+
+static void
+nvme_fc_rport_put(struct nvme_fc_rport *rport)
+{
+	kref_put(&rport->ref, nvme_fc_free_rport);
+}
+
+static int
+nvme_fc_rport_get(struct nvme_fc_rport *rport)
+{
+	return kref_get_unless_zero(&rport->ref);
+}
+
+/**
+ * nvme_fc_unregister_remoteport - transport entry point called by an
+ *                              LLDD to deregister/remove a previously
+ *                              registered a NVME subsystem FC port.
+ * @remoteport: pointer to the (registered) remote port that is to be
+ *              deregistered.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
+{
+	struct nvme_fc_rport *rport = remoteport_to_rport(portptr);
+	struct nvme_fc_ctrl *ctrl;
+	unsigned long flags;
+
+	if (!portptr)
+		return -EINVAL;
+
+	spin_lock_irqsave(&rport->lock, flags);
+
+	if (portptr->port_state != FC_OBJSTATE_ONLINE) {
+		spin_unlock_irqrestore(&rport->lock, flags);
+		return -EINVAL;
+	}
+	portptr->port_state = FC_OBJSTATE_DELETED;
+
+	/* tear down all associations to the remote port */
+	list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list)
+		__nvme_fc_del_ctrl(ctrl);
+
+	spin_unlock_irqrestore(&rport->lock, flags);
+
+	nvme_fc_rport_put(rport);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport);
+
+
+/* *********************** FC-NVME DMA Handling **************************** */
+
+/*
+ * The fcloop device passes in a NULL device pointer. Real LLD's will
+ * pass in a valid device pointer. If NULL is passed to the dma mapping
+ * routines, depending on the platform, it may or may not succeed, and
+ * may crash.
+ *
+ * As such:
+ * Wrapper all the dma routines and check the dev pointer.
+ *
+ * If simple mappings (return just a dma address, we'll noop them,
+ * returning a dma address of 0.
+ *
+ * On more complex mappings (dma_map_sg), a pseudo routine fills
+ * in the scatter list, setting all dma addresses to 0.
+ */
+
+static inline dma_addr_t
+fc_dma_map_single(struct device *dev, void *ptr, size_t size,
+		enum dma_data_direction dir)
+{
+	return dev ? dma_map_single(dev, ptr, size, dir) : (dma_addr_t)0L;
+}
+
+static inline int
+fc_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dev ? dma_mapping_error(dev, dma_addr) : 0;
+}
+
+static inline void
+fc_dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
+	enum dma_data_direction dir)
+{
+	if (dev)
+		dma_unmap_single(dev, addr, size, dir);
+}
+
+static inline void
+fc_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir)
+{
+	if (dev)
+		dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static inline void
+fc_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir)
+{
+	if (dev)
+		dma_sync_single_for_device(dev, addr, size, dir);
+}
+
+/* pseudo dma_map_sg call */
+static int
+fc_map_sg(struct scatterlist *sg, int nents)
+{
+	struct scatterlist *s;
+	int i;
+
+	WARN_ON(nents == 0 || sg[0].length == 0);
+
+	for_each_sg(sg, s, nents, i) {
+		s->dma_address = 0L;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+		s->dma_length = s->length;
+#endif
+	}
+	return nents;
+}
+
+static inline int
+fc_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir)
+{
+	return dev ? dma_map_sg(dev, sg, nents, dir) : fc_map_sg(sg, nents);
+}
+
+static inline void
+fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir)
+{
+	if (dev)
+		dma_unmap_sg(dev, sg, nents, dir);
+}
+
+
+/* *********************** FC-NVME LS Handling **************************** */
+
+static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
+static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
+
+
+static void
+__nvme_fc_finish_ls_req(struct nvme_fc_ctrl *ctrl,
+		struct nvmefc_ls_req_op *lsop)
+{
+	struct nvmefc_ls_req *lsreq = &lsop->ls_req;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+
+	if (!lsop->req_queued) {
+		spin_unlock_irqrestore(&ctrl->lock, flags);
+		return;
+	}
+
+	list_del(&lsop->lsreq_list);
+
+	lsop->req_queued = false;
+
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	fc_dma_unmap_single(ctrl->dev, lsreq->rqstdma,
+				  (lsreq->rqstlen + lsreq->rsplen),
+				  DMA_BIDIRECTIONAL);
+
+	nvme_fc_ctrl_put(ctrl);
+}
+
+static int
+__nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl,
+		struct nvmefc_ls_req_op *lsop,
+		void (*done)(struct nvmefc_ls_req *req, int status))
+{
+	struct nvmefc_ls_req *lsreq = &lsop->ls_req;
+	unsigned long flags;
+	int ret;
+
+	if (!nvme_fc_ctrl_get(ctrl))
+		return -ESHUTDOWN;
+
+	lsreq->done = done;
+	lsop->ctrl = ctrl;
+	lsop->req_queued = false;
+	INIT_LIST_HEAD(&lsop->lsreq_list);
+	init_completion(&lsop->ls_done);
+
+	lsreq->rqstdma = fc_dma_map_single(ctrl->dev, lsreq->rqstaddr,
+				  lsreq->rqstlen + lsreq->rsplen,
+				  DMA_BIDIRECTIONAL);
+	if (fc_dma_mapping_error(ctrl->dev, lsreq->rqstdma)) {
+		nvme_fc_ctrl_put(ctrl);
+		dev_err(ctrl->dev,
+			"els request command failed EFAULT.\n");
+		return -EFAULT;
+	}
+	lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen;
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+
+	list_add_tail(&lsop->lsreq_list, &ctrl->ls_req_list);
+
+	lsop->req_queued = true;
+
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	ret = ctrl->lport->ops->ls_req(&ctrl->lport->localport,
+					&ctrl->rport->remoteport, lsreq);
+	if (ret)
+		lsop->ls_error = ret;
+
+	return ret;
+}
+
+static void
+nvme_fc_send_ls_req_done(struct nvmefc_ls_req *lsreq, int status)
+{
+	struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq);
+
+	lsop->ls_error = status;
+	complete(&lsop->ls_done);
+}
+
+static int
+nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
+{
+	struct nvmefc_ls_req *lsreq = &lsop->ls_req;
+	struct fcnvme_ls_rjt *rjt = lsreq->rspaddr;
+	int ret;
+
+	ret = __nvme_fc_send_ls_req(ctrl, lsop, nvme_fc_send_ls_req_done);
+
+	if (!ret)
+		/*
+		 * No timeout/not interruptible as we need the struct
+		 * to exist until the lldd calls us back. Thus mandate
+		 * wait until driver calls back. lldd responsible for
+		 * the timeout action
+		 */
+		wait_for_completion(&lsop->ls_done);
+
+	__nvme_fc_finish_ls_req(ctrl, lsop);
+
+	if (ret) {
+		dev_err(ctrl->dev,
+			"ls request command failed (%d).\n", ret);
+		return ret;
+	}
+
+	/* ACC or RJT payload ? */
+	if (rjt->w0.ls_cmd == FCNVME_LS_RJT)
+		return -ENXIO;
+
+	return 0;
+}
+
+static void
+nvme_fc_send_ls_req_async(struct nvme_fc_ctrl *ctrl,
+		struct nvmefc_ls_req_op *lsop,
+		void (*done)(struct nvmefc_ls_req *req, int status))
+{
+	int ret;
+
+	ret = __nvme_fc_send_ls_req(ctrl, lsop, done);
+
+	/* don't wait for completion */
+
+	if (ret)
+		done(&lsop->ls_req, ret);
+}
+
+/* Validation Error indexes into the string table below */
+enum {
+	VERR_NO_ERROR		= 0,
+	VERR_LSACC		= 1,
+	VERR_LSDESC_RQST	= 2,
+	VERR_LSDESC_RQST_LEN	= 3,
+	VERR_ASSOC_ID		= 4,
+	VERR_ASSOC_ID_LEN	= 5,
+	VERR_CONN_ID		= 6,
+	VERR_CONN_ID_LEN	= 7,
+	VERR_CR_ASSOC		= 8,
+	VERR_CR_ASSOC_ACC_LEN	= 9,
+	VERR_CR_CONN		= 10,
+	VERR_CR_CONN_ACC_LEN	= 11,
+	VERR_DISCONN		= 12,
+	VERR_DISCONN_ACC_LEN	= 13,
+};
+
+static char *validation_errors[] = {
+	"OK",
+	"Not LS_ACC",
+	"Not LSDESC_RQST",
+	"Bad LSDESC_RQST Length",
+	"Not Association ID",
+	"Bad Association ID Length",
+	"Not Connection ID",
+	"Bad Connection ID Length",
+	"Not CR_ASSOC Rqst",
+	"Bad CR_ASSOC ACC Length",
+	"Not CR_CONN Rqst",
+	"Bad CR_CONN ACC Length",
+	"Not Disconnect Rqst",
+	"Bad Disconnect ACC Length",
+};
+
+static int
+nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
+	struct nvme_fc_queue *queue, u16 qsize, u16 ersp_ratio)
+{
+	struct nvmefc_ls_req_op *lsop;
+	struct nvmefc_ls_req *lsreq;
+	struct fcnvme_ls_cr_assoc_rqst *assoc_rqst;
+	struct fcnvme_ls_cr_assoc_acc *assoc_acc;
+	int ret, fcret = 0;
+
+	lsop = kzalloc((sizeof(*lsop) +
+			 ctrl->lport->ops->lsrqst_priv_sz +
+			 sizeof(*assoc_rqst) + sizeof(*assoc_acc)), GFP_KERNEL);
+	if (!lsop) {
+		ret = -ENOMEM;
+		goto out_no_memory;
+	}
+	lsreq = &lsop->ls_req;
+
+	lsreq->private = (void *)&lsop[1];
+	assoc_rqst = (struct fcnvme_ls_cr_assoc_rqst *)
+			(lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
+	assoc_acc = (struct fcnvme_ls_cr_assoc_acc *)&assoc_rqst[1];
+
+	assoc_rqst->w0.ls_cmd = FCNVME_LS_CREATE_ASSOCIATION;
+	assoc_rqst->desc_list_len =
+			cpu_to_be32(sizeof(struct fcnvme_lsdesc_cr_assoc_cmd));
+
+	assoc_rqst->assoc_cmd.desc_tag =
+			cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD);
+	assoc_rqst->assoc_cmd.desc_len =
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_cr_assoc_cmd));
+
+	assoc_rqst->assoc_cmd.ersp_ratio = cpu_to_be16(ersp_ratio);
+	assoc_rqst->assoc_cmd.sqsize = cpu_to_be16(qsize);
+	/* Linux supports only Dynamic controllers */
+	assoc_rqst->assoc_cmd.cntlid = cpu_to_be16(0xffff);
+	memcpy(&assoc_rqst->assoc_cmd.hostid, &ctrl->ctrl.opts->host->id,
+		min_t(size_t, FCNVME_ASSOC_HOSTID_LEN, sizeof(uuid_be)));
+	strncpy(assoc_rqst->assoc_cmd.hostnqn, ctrl->ctrl.opts->host->nqn,
+		min(FCNVME_ASSOC_HOSTNQN_LEN, NVMF_NQN_SIZE));
+	strncpy(assoc_rqst->assoc_cmd.subnqn, ctrl->ctrl.opts->subsysnqn,
+		min(FCNVME_ASSOC_SUBNQN_LEN, NVMF_NQN_SIZE));
+
+	lsop->queue = queue;
+	lsreq->rqstaddr = assoc_rqst;
+	lsreq->rqstlen = sizeof(*assoc_rqst);
+	lsreq->rspaddr = assoc_acc;
+	lsreq->rsplen = sizeof(*assoc_acc);
+	lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+
+	ret = nvme_fc_send_ls_req(ctrl, lsop);
+	if (ret)
+		goto out_free_buffer;
+
+	/* process connect LS completion */
+
+	/* validate the ACC response */
+	if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
+		fcret = VERR_LSACC;
+	if (assoc_acc->hdr.desc_list_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_ls_cr_assoc_acc)))
+		fcret = VERR_CR_ASSOC_ACC_LEN;
+	if (assoc_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+		fcret = VERR_LSDESC_RQST;
+	else if (assoc_acc->hdr.rqst.desc_len !=
+			fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
+		fcret = VERR_LSDESC_RQST_LEN;
+	else if (assoc_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_CREATE_ASSOCIATION)
+		fcret = VERR_CR_ASSOC;
+	else if (assoc_acc->associd.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_ASSOC_ID))
+		fcret = VERR_ASSOC_ID;
+	else if (assoc_acc->associd.desc_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_assoc_id)))
+		fcret = VERR_ASSOC_ID_LEN;
+	else if (assoc_acc->connectid.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_CONN_ID))
+		fcret = VERR_CONN_ID;
+	else if (assoc_acc->connectid.desc_len !=
+			fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_conn_id)))
+		fcret = VERR_CONN_ID_LEN;
+
+	if (fcret) {
+		ret = -EBADF;
+		dev_err(ctrl->dev,
+			"q %d connect failed: %s\n",
+			queue->qnum, validation_errors[fcret]);
+	} else {
+		ctrl->association_id =
+			be64_to_cpu(assoc_acc->associd.association_id);
+		queue->connection_id =
+			be64_to_cpu(assoc_acc->connectid.connection_id);
+		set_bit(NVME_FC_Q_CONNECTED, &queue->flags);
+	}
+
+out_free_buffer:
+	kfree(lsop);
+out_no_memory:
+	if (ret)
+		dev_err(ctrl->dev,
+			"queue %d connect admin queue failed (%d).\n",
+			queue->qnum, ret);
+	return ret;
+}
+
+static int
+nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
+			u16 qsize, u16 ersp_ratio)
+{
+	struct nvmefc_ls_req_op *lsop;
+	struct nvmefc_ls_req *lsreq;
+	struct fcnvme_ls_cr_conn_rqst *conn_rqst;
+	struct fcnvme_ls_cr_conn_acc *conn_acc;
+	int ret, fcret = 0;
+
+	lsop = kzalloc((sizeof(*lsop) +
+			 ctrl->lport->ops->lsrqst_priv_sz +
+			 sizeof(*conn_rqst) + sizeof(*conn_acc)), GFP_KERNEL);
+	if (!lsop) {
+		ret = -ENOMEM;
+		goto out_no_memory;
+	}
+	lsreq = &lsop->ls_req;
+
+	lsreq->private = (void *)&lsop[1];
+	conn_rqst = (struct fcnvme_ls_cr_conn_rqst *)
+			(lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
+	conn_acc = (struct fcnvme_ls_cr_conn_acc *)&conn_rqst[1];
+
+	conn_rqst->w0.ls_cmd = FCNVME_LS_CREATE_CONNECTION;
+	conn_rqst->desc_list_len = cpu_to_be32(
+				sizeof(struct fcnvme_lsdesc_assoc_id) +
+				sizeof(struct fcnvme_lsdesc_cr_conn_cmd));
+
+	conn_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID);
+	conn_rqst->associd.desc_len =
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_assoc_id));
+	conn_rqst->associd.association_id = cpu_to_be64(ctrl->association_id);
+	conn_rqst->connect_cmd.desc_tag =
+			cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD);
+	conn_rqst->connect_cmd.desc_len =
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_cr_conn_cmd));
+	conn_rqst->connect_cmd.ersp_ratio = cpu_to_be16(ersp_ratio);
+	conn_rqst->connect_cmd.qid  = cpu_to_be16(queue->qnum);
+	conn_rqst->connect_cmd.sqsize = cpu_to_be16(qsize);
+
+	lsop->queue = queue;
+	lsreq->rqstaddr = conn_rqst;
+	lsreq->rqstlen = sizeof(*conn_rqst);
+	lsreq->rspaddr = conn_acc;
+	lsreq->rsplen = sizeof(*conn_acc);
+	lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+
+	ret = nvme_fc_send_ls_req(ctrl, lsop);
+	if (ret)
+		goto out_free_buffer;
+
+	/* process connect LS completion */
+
+	/* validate the ACC response */
+	if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
+		fcret = VERR_LSACC;
+	if (conn_acc->hdr.desc_list_len !=
+			fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)))
+		fcret = VERR_CR_CONN_ACC_LEN;
+	if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+		fcret = VERR_LSDESC_RQST;
+	else if (conn_acc->hdr.rqst.desc_len !=
+			fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
+		fcret = VERR_LSDESC_RQST_LEN;
+	else if (conn_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_CREATE_CONNECTION)
+		fcret = VERR_CR_CONN;
+	else if (conn_acc->connectid.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_CONN_ID))
+		fcret = VERR_CONN_ID;
+	else if (conn_acc->connectid.desc_len !=
+			fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_conn_id)))
+		fcret = VERR_CONN_ID_LEN;
+
+	if (fcret) {
+		ret = -EBADF;
+		dev_err(ctrl->dev,
+			"q %d connect failed: %s\n",
+			queue->qnum, validation_errors[fcret]);
+	} else {
+		queue->connection_id =
+			be64_to_cpu(conn_acc->connectid.connection_id);
+		set_bit(NVME_FC_Q_CONNECTED, &queue->flags);
+	}
+
+out_free_buffer:
+	kfree(lsop);
+out_no_memory:
+	if (ret)
+		dev_err(ctrl->dev,
+			"queue %d connect command failed (%d).\n",
+			queue->qnum, ret);
+	return ret;
+}
+
+static void
+nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
+{
+	struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq);
+	struct nvme_fc_ctrl *ctrl = lsop->ctrl;
+
+	__nvme_fc_finish_ls_req(ctrl, lsop);
+
+	if (status)
+		dev_err(ctrl->dev,
+			"disconnect assoc ls request command failed (%d).\n",
+			status);
+
+	/* fc-nvme iniator doesn't care about success or failure of cmd */
+
+	kfree(lsop);
+}
+
+/*
+ * This routine sends a FC-NVME LS to disconnect (aka terminate)
+ * the FC-NVME Association.  Terminating the association also
+ * terminates the FC-NVME connections (per queue, both admin and io
+ * queues) that are part of the association. E.g. things are torn
+ * down, and the related FC-NVME Association ID and Connection IDs
+ * become invalid.
+ *
+ * The behavior of the fc-nvme initiator is such that it's
+ * understanding of the association and connections will implicitly
+ * be torn down. The action is implicit as it may be due to a loss of
+ * connectivity with the fc-nvme target, so you may never get a
+ * response even if you tried.  As such, the action of this routine
+ * is to asynchronously send the LS, ignore any results of the LS, and
+ * continue on with terminating the association. If the fc-nvme target
+ * is present and receives the LS, it too can tear down.
+ */
+static void
+nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
+{
+	struct fcnvme_ls_disconnect_rqst *discon_rqst;
+	struct fcnvme_ls_disconnect_acc *discon_acc;
+	struct nvmefc_ls_req_op *lsop;
+	struct nvmefc_ls_req *lsreq;
+
+	lsop = kzalloc((sizeof(*lsop) +
+			 ctrl->lport->ops->lsrqst_priv_sz +
+			 sizeof(*discon_rqst) + sizeof(*discon_acc)),
+			GFP_KERNEL);
+	if (!lsop)
+		/* couldn't sent it... too bad */
+		return;
+
+	lsreq = &lsop->ls_req;
+
+	lsreq->private = (void *)&lsop[1];
+	discon_rqst = (struct fcnvme_ls_disconnect_rqst *)
+			(lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
+	discon_acc = (struct fcnvme_ls_disconnect_acc *)&discon_rqst[1];
+
+	discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT;
+	discon_rqst->desc_list_len = cpu_to_be32(
+				sizeof(struct fcnvme_lsdesc_assoc_id) +
+				sizeof(struct fcnvme_lsdesc_disconn_cmd));
+
+	discon_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID);
+	discon_rqst->associd.desc_len =
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_assoc_id));
+
+	discon_rqst->associd.association_id = cpu_to_be64(ctrl->association_id);
+
+	discon_rqst->discon_cmd.desc_tag = cpu_to_be32(
+						FCNVME_LSDESC_DISCONN_CMD);
+	discon_rqst->discon_cmd.desc_len =
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_disconn_cmd));
+	discon_rqst->discon_cmd.scope = FCNVME_DISCONN_ASSOCIATION;
+	discon_rqst->discon_cmd.id = cpu_to_be64(ctrl->association_id);
+
+	lsreq->rqstaddr = discon_rqst;
+	lsreq->rqstlen = sizeof(*discon_rqst);
+	lsreq->rspaddr = discon_acc;
+	lsreq->rsplen = sizeof(*discon_acc);
+	lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+
+	nvme_fc_send_ls_req_async(ctrl, lsop, nvme_fc_disconnect_assoc_done);
+
+	/* only meaningful part to terminating the association */
+	ctrl->association_id = 0;
+}
+
+
+/* *********************** NVME Ctrl Routines **************************** */
+
+
+static int
+nvme_fc_reinit_request(void *data, struct request *rq)
+{
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
+
+	memset(cmdiu, 0, sizeof(*cmdiu));
+	cmdiu->scsi_id = NVME_CMD_SCSI_ID;
+	cmdiu->fc_id = NVME_CMD_FC_ID;
+	cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32));
+	memset(&op->rsp_iu, 0, sizeof(op->rsp_iu));
+
+	return 0;
+}
+
+static void
+__nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl,
+		struct nvme_fc_fcp_op *op)
+{
+	fc_dma_unmap_single(ctrl->lport->dev, op->fcp_req.rspdma,
+				sizeof(op->rsp_iu), DMA_FROM_DEVICE);
+	fc_dma_unmap_single(ctrl->lport->dev, op->fcp_req.cmddma,
+				sizeof(op->cmd_iu), DMA_TO_DEVICE);
+
+	atomic_set(&op->state, FCPOP_STATE_UNINIT);
+}
+
+static void
+nvme_fc_exit_request(void *data, struct request *rq,
+				unsigned int hctx_idx, unsigned int rq_idx)
+{
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+
+	return __nvme_fc_exit_request(data, op);
+}
+
+static void
+nvme_fc_exit_aen_ops(struct nvme_fc_ctrl *ctrl)
+{
+	struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
+	int i;
+
+	for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+		if (atomic_read(&aen_op->state) == FCPOP_STATE_UNINIT)
+			continue;
+		__nvme_fc_exit_request(ctrl, aen_op);
+		nvme_fc_ctrl_put(ctrl);
+	}
+}
+
+void
+nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
+{
+	struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req);
+	struct request *rq = op->rq;
+	struct nvmefc_fcp_req *freq = &op->fcp_req;
+	struct nvme_fc_ctrl *ctrl = op->ctrl;
+	struct nvme_fc_queue *queue = op->queue;
+	struct nvme_completion *cqe = &op->rsp_iu.cqe;
+	u16 status;
+
+	/*
+	 * WARNING:
+	 * The current linux implementation of a nvme controller
+	 * allocates a single tag set for all io queues and sizes
+	 * the io queues to fully hold all possible tags. Thus, the
+	 * implementation does not reference or care about the sqhd
+	 * value as it never needs to use the sqhd/sqtail pointers
+	 * for submission pacing.
+	 *
+	 * This affects the FC-NVME implementation in two ways:
+	 * 1) As the value doesn't matter, we don't need to waste
+	 *    cycles extracting it from ERSPs and stamping it in the
+	 *    cases where the transport fabricates CQEs on successful
+	 *    completions.
+	 * 2) The FC-NVME implementation requires that delivery of
+	 *    ERSP completions are to go back to the nvme layer in order
+	 *    relative to the rsn, such that the sqhd value will always
+	 *    be "in order" for the nvme layer. As the nvme layer in
+	 *    linux doesn't care about sqhd, there's no need to return
+	 *    them in order.
+	 *
+	 * Additionally:
+	 * As the core nvme layer in linux currently does not look at
+	 * every field in the cqe - in cases where the FC transport must
+	 * fabricate a CQE, the following fields will not be set as they
+	 * are not referenced:
+	 *      cqe.sqid,  cqe.sqhd,  cqe.command_id
+	 */
+
+	fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
+				sizeof(op->rsp_iu), DMA_FROM_DEVICE);
+
+	if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
+		status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+	else
+		status = freq->status;
+
+	/*
+	 * For the linux implementation, if we have an unsuccesful
+	 * status, they blk-mq layer can typically be called with the
+	 * non-zero status and the content of the cqe isn't important.
+	 */
+	if (status)
+		goto done;
+
+	/*
+	 * command completed successfully relative to the wire
+	 * protocol. However, validate anything received and
+	 * extract the status and result from the cqe (create it
+	 * where necessary).
+	 */
+
+	switch (freq->rcv_rsplen) {
+
+	case 0:
+	case NVME_FC_SIZEOF_ZEROS_RSP:
+		/*
+		 * No response payload or 12 bytes of payload (which
+		 * should all be zeros) are considered successful and
+		 * no payload in the CQE by the transport.
+		 */
+		if (freq->transferred_length !=
+			be32_to_cpu(op->cmd_iu.data_len)) {
+			status = -EIO;
+			goto done;
+		}
+		op->nreq.result.u64 = 0;
+		break;
+
+	case sizeof(struct nvme_fc_ersp_iu):
+		/*
+		 * The ERSP IU contains a full completion with CQE.
+		 * Validate ERSP IU and look at cqe.
+		 */
+		if (unlikely(be16_to_cpu(op->rsp_iu.iu_len) !=
+					(freq->rcv_rsplen / 4) ||
+			     be32_to_cpu(op->rsp_iu.xfrd_len) !=
+					freq->transferred_length ||
+			     op->rqno != le16_to_cpu(cqe->command_id))) {
+			status = -EIO;
+			goto done;
+		}
+		op->nreq.result = cqe->result;
+		status = le16_to_cpu(cqe->status) >> 1;
+		break;
+
+	default:
+		status = -EIO;
+		goto done;
+	}
+
+done:
+	if (!queue->qnum && op->rqno >= AEN_CMDID_BASE) {
+		nvme_complete_async_event(&queue->ctrl->ctrl, status,
+					&op->nreq.result);
+		nvme_fc_ctrl_put(ctrl);
+		return;
+	}
+
+	blk_mq_complete_request(rq, status);
+}
+
+static int
+__nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
+		struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op,
+		struct request *rq, u32 rqno)
+{
+	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
+	int ret = 0;
+
+	memset(op, 0, sizeof(*op));
+	op->fcp_req.cmdaddr = &op->cmd_iu;
+	op->fcp_req.cmdlen = sizeof(op->cmd_iu);
+	op->fcp_req.rspaddr = &op->rsp_iu;
+	op->fcp_req.rsplen = sizeof(op->rsp_iu);
+	op->fcp_req.done = nvme_fc_fcpio_done;
+	op->fcp_req.first_sgl = (struct scatterlist *)&op[1];
+	op->fcp_req.private = &op->fcp_req.first_sgl[SG_CHUNK_SIZE];
+	op->ctrl = ctrl;
+	op->queue = queue;
+	op->rq = rq;
+	op->rqno = rqno;
+
+	cmdiu->scsi_id = NVME_CMD_SCSI_ID;
+	cmdiu->fc_id = NVME_CMD_FC_ID;
+	cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32));
+
+	op->fcp_req.cmddma = fc_dma_map_single(ctrl->lport->dev,
+				&op->cmd_iu, sizeof(op->cmd_iu), DMA_TO_DEVICE);
+	if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) {
+		dev_err(ctrl->dev,
+			"FCP Op failed - cmdiu dma mapping failed.\n");
+		ret = EFAULT;
+		goto out_on_error;
+	}
+
+	op->fcp_req.rspdma = fc_dma_map_single(ctrl->lport->dev,
+				&op->rsp_iu, sizeof(op->rsp_iu),
+				DMA_FROM_DEVICE);
+	if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) {
+		dev_err(ctrl->dev,
+			"FCP Op failed - rspiu dma mapping failed.\n");
+		ret = EFAULT;
+	}
+
+	atomic_set(&op->state, FCPOP_STATE_IDLE);
+out_on_error:
+	return ret;
+}
+
+static int
+nvme_fc_init_request(void *data, struct request *rq,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
+{
+	struct nvme_fc_ctrl *ctrl = data;
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_queue *queue = &ctrl->queues[hctx_idx+1];
+
+	return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
+}
+
+static int
+nvme_fc_init_admin_request(void *data, struct request *rq,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
+{
+	struct nvme_fc_ctrl *ctrl = data;
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_queue *queue = &ctrl->queues[0];
+
+	return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
+}
+
+static int
+nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
+{
+	struct nvme_fc_fcp_op *aen_op;
+	struct nvme_fc_cmd_iu *cmdiu;
+	struct nvme_command *sqe;
+	int i, ret;
+
+	aen_op = ctrl->aen_ops;
+	for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+		cmdiu = &aen_op->cmd_iu;
+		sqe = &cmdiu->sqe;
+		ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0],
+				aen_op, (struct request *)NULL,
+				(AEN_CMDID_BASE + i));
+		if (ret)
+			return ret;
+
+		memset(sqe, 0, sizeof(*sqe));
+		sqe->common.opcode = nvme_admin_async_event;
+		sqe->common.command_id = AEN_CMDID_BASE + i;
+	}
+	return 0;
+}
+
+
+static inline void
+__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl,
+		unsigned int qidx)
+{
+	struct nvme_fc_queue *queue = &ctrl->queues[qidx];
+
+	hctx->driver_data = queue;
+	queue->hctx = hctx;
+}
+
+static int
+nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+		unsigned int hctx_idx)
+{
+	struct nvme_fc_ctrl *ctrl = data;
+
+	__nvme_fc_init_hctx(hctx, ctrl, hctx_idx + 1);
+
+	return 0;
+}
+
+static int
+nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+		unsigned int hctx_idx)
+{
+	struct nvme_fc_ctrl *ctrl = data;
+
+	__nvme_fc_init_hctx(hctx, ctrl, hctx_idx);
+
+	return 0;
+}
+
+static void
+nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size)
+{
+	struct nvme_fc_queue *queue;
+
+	queue = &ctrl->queues[idx];
+	memset(queue, 0, sizeof(*queue));
+	queue->ctrl = ctrl;
+	queue->qnum = idx;
+	atomic_set(&queue->csn, 1);
+	queue->dev = ctrl->dev;
+
+	if (idx > 0)
+		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+	else
+		queue->cmnd_capsule_len = sizeof(struct nvme_command);
+
+	queue->queue_size = queue_size;
+
+	/*
+	 * Considered whether we should allocate buffers for all SQEs
+	 * and CQEs and dma map them - mapping their respective entries
+	 * into the request structures (kernel vm addr and dma address)
+	 * thus the driver could use the buffers/mappings directly.
+	 * It only makes sense if the LLDD would use them for its
+	 * messaging api. It's very unlikely most adapter api's would use
+	 * a native NVME sqe/cqe. More reasonable if FC-NVME IU payload
+	 * structures were used instead.
+	 */
+}
+
+/*
+ * This routine terminates a queue at the transport level.
+ * The transport has already ensured that all outstanding ios on
+ * the queue have been terminated.
+ * The transport will send a Disconnect LS request to terminate
+ * the queue's connection. Termination of the admin queue will also
+ * terminate the association at the target.
+ */
+static void
+nvme_fc_free_queue(struct nvme_fc_queue *queue)
+{
+	if (!test_and_clear_bit(NVME_FC_Q_CONNECTED, &queue->flags))
+		return;
+
+	/*
+	 * Current implementation never disconnects a single queue.
+	 * It always terminates a whole association. So there is never
+	 * a disconnect(queue) LS sent to the target.
+	 */
+
+	queue->connection_id = 0;
+	clear_bit(NVME_FC_Q_CONNECTED, &queue->flags);
+}
+
+static void
+__nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *ctrl,
+	struct nvme_fc_queue *queue, unsigned int qidx)
+{
+	if (ctrl->lport->ops->delete_queue)
+		ctrl->lport->ops->delete_queue(&ctrl->lport->localport, qidx,
+				queue->lldd_handle);
+	queue->lldd_handle = NULL;
+}
+
+static void
+nvme_fc_destroy_admin_queue(struct nvme_fc_ctrl *ctrl)
+{
+	__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
+	blk_cleanup_queue(ctrl->ctrl.admin_q);
+	blk_mq_free_tag_set(&ctrl->admin_tag_set);
+	nvme_fc_free_queue(&ctrl->queues[0]);
+}
+
+static void
+nvme_fc_free_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 1; i < ctrl->queue_count; i++)
+		nvme_fc_free_queue(&ctrl->queues[i]);
+}
+
+static int
+__nvme_fc_create_hw_queue(struct nvme_fc_ctrl *ctrl,
+	struct nvme_fc_queue *queue, unsigned int qidx, u16 qsize)
+{
+	int ret = 0;
+
+	queue->lldd_handle = NULL;
+	if (ctrl->lport->ops->create_queue)
+		ret = ctrl->lport->ops->create_queue(&ctrl->lport->localport,
+				qidx, qsize, &queue->lldd_handle);
+
+	return ret;
+}
+
+static void
+nvme_fc_delete_hw_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+	struct nvme_fc_queue *queue = &ctrl->queues[ctrl->queue_count - 1];
+	int i;
+
+	for (i = ctrl->queue_count - 1; i >= 1; i--, queue--)
+		__nvme_fc_delete_hw_queue(ctrl, queue, i);
+}
+
+static int
+nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
+{
+	struct nvme_fc_queue *queue = &ctrl->queues[1];
+	int i, j, ret;
+
+	for (i = 1; i < ctrl->queue_count; i++, queue++) {
+		ret = __nvme_fc_create_hw_queue(ctrl, queue, i, qsize);
+		if (ret) {
+			for (j = i-1; j >= 0; j--)
+				__nvme_fc_delete_hw_queue(ctrl,
+						&ctrl->queues[j], j);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int
+nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
+{
+	int i, ret = 0;
+
+	for (i = 1; i < ctrl->queue_count; i++) {
+		ret = nvme_fc_connect_queue(ctrl, &ctrl->queues[i], qsize,
+					(qsize / 5));
+		if (ret)
+			break;
+		ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static void
+nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 1; i < ctrl->queue_count; i++)
+		nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize);
+}
+
+static void
+nvme_fc_ctrl_free(struct kref *ref)
+{
+	struct nvme_fc_ctrl *ctrl =
+		container_of(ref, struct nvme_fc_ctrl, ref);
+	unsigned long flags;
+
+	if (ctrl->state != FCCTRL_INIT) {
+		/* remove from rport list */
+		spin_lock_irqsave(&ctrl->rport->lock, flags);
+		list_del(&ctrl->ctrl_list);
+		spin_unlock_irqrestore(&ctrl->rport->lock, flags);
+	}
+
+	put_device(ctrl->dev);
+	nvme_fc_rport_put(ctrl->rport);
+
+	kfree(ctrl->queues);
+	ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
+	nvmf_free_options(ctrl->ctrl.opts);
+	kfree(ctrl);
+}
+
+static void
+nvme_fc_ctrl_put(struct nvme_fc_ctrl *ctrl)
+{
+	kref_put(&ctrl->ref, nvme_fc_ctrl_free);
+}
+
+static int
+nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl)
+{
+	return kref_get_unless_zero(&ctrl->ref);
+}
+
+/*
+ * All accesses from nvme core layer done - can now free the
+ * controller. Called after last nvme_put_ctrl() call
+ */
+static void
+nvme_fc_free_nvme_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+
+	WARN_ON(nctrl != &ctrl->ctrl);
+
+	/*
+	 * Tear down the association, which will generate link
+	 * traffic to terminate connections
+	 */
+
+	if (ctrl->state != FCCTRL_INIT) {
+		/* send a Disconnect(association) LS to fc-nvme target */
+		nvme_fc_xmt_disconnect_assoc(ctrl);
+
+		if (ctrl->ctrl.tagset) {
+			blk_cleanup_queue(ctrl->ctrl.connect_q);
+			blk_mq_free_tag_set(&ctrl->tag_set);
+			nvme_fc_delete_hw_io_queues(ctrl);
+			nvme_fc_free_io_queues(ctrl);
+		}
+
+		nvme_fc_exit_aen_ops(ctrl);
+
+		nvme_fc_destroy_admin_queue(ctrl);
+	}
+
+	nvme_fc_ctrl_put(ctrl);
+}
+
+
+static int
+__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
+{
+	int state;
+
+	state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
+	if (state != FCPOP_STATE_ACTIVE) {
+		atomic_set(&op->state, state);
+		return -ECANCELED; /* fail */
+	}
+
+	ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
+					&ctrl->rport->remoteport,
+					op->queue->lldd_handle,
+					&op->fcp_req);
+
+	return 0;
+}
+
+enum blk_eh_timer_return
+nvme_fc_timeout(struct request *rq, bool reserved)
+{
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_ctrl *ctrl = op->ctrl;
+	int ret;
+
+	if (reserved)
+		return BLK_EH_RESET_TIMER;
+
+	ret = __nvme_fc_abort_op(ctrl, op);
+	if (ret)
+		/* io wasn't active to abort consider it done */
+		return BLK_EH_HANDLED;
+
+	/*
+	 * TODO: force a controller reset
+	 *   when that happens, queues will be torn down and outstanding
+	 *   ios will be terminated, and the above abort, on a single io
+	 *   will no longer be needed.
+	 */
+
+	return BLK_EH_HANDLED;
+}
+
+static int
+nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
+		struct nvme_fc_fcp_op *op)
+{
+	struct nvmefc_fcp_req *freq = &op->fcp_req;
+	u32 map_len = nvme_map_len(rq);
+	enum dma_data_direction dir;
+	int ret;
+
+	freq->sg_cnt = 0;
+
+	if (!map_len)
+		return 0;
+
+	freq->sg_table.sgl = freq->first_sgl;
+	ret = sg_alloc_table_chained(&freq->sg_table, rq->nr_phys_segments,
+			freq->sg_table.sgl);
+	if (ret)
+		return -ENOMEM;
+
+	op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
+	WARN_ON(op->nents > rq->nr_phys_segments);
+	dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+	freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
+				op->nents, dir);
+	if (unlikely(freq->sg_cnt <= 0)) {
+		sg_free_table_chained(&freq->sg_table, true);
+		freq->sg_cnt = 0;
+		return -EFAULT;
+	}
+
+	/*
+	 * TODO: blk_integrity_rq(rq)  for DIF
+	 */
+	return 0;
+}
+
+static void
+nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
+		struct nvme_fc_fcp_op *op)
+{
+	struct nvmefc_fcp_req *freq = &op->fcp_req;
+
+	if (!freq->sg_cnt)
+		return;
+
+	fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
+				((rq_data_dir(rq) == WRITE) ?
+					DMA_TO_DEVICE : DMA_FROM_DEVICE));
+
+	nvme_cleanup_cmd(rq);
+
+	sg_free_table_chained(&freq->sg_table, true);
+
+	freq->sg_cnt = 0;
+}
+
+/*
+ * In FC, the queue is a logical thing. At transport connect, the target
+ * creates its "queue" and returns a handle that is to be given to the
+ * target whenever it posts something to the corresponding SQ.  When an
+ * SQE is sent on a SQ, FC effectively considers the SQE, or rather the
+ * command contained within the SQE, an io, and assigns a FC exchange
+ * to it. The SQE and the associated SQ handle are sent in the initial
+ * CMD IU sents on the exchange. All transfers relative to the io occur
+ * as part of the exchange.  The CQE is the last thing for the io,
+ * which is transferred (explicitly or implicitly) with the RSP IU
+ * sent on the exchange. After the CQE is received, the FC exchange is
+ * terminaed and the Exchange may be used on a different io.
+ *
+ * The transport to LLDD api has the transport making a request for a
+ * new fcp io request to the LLDD. The LLDD then allocates a FC exchange
+ * resource and transfers the command. The LLDD will then process all
+ * steps to complete the io. Upon completion, the transport done routine
+ * is called.
+ *
+ * So - while the operation is outstanding to the LLDD, there is a link
+ * level FC exchange resource that is also outstanding. This must be
+ * considered in all cleanup operations.
+ */
+static int
+nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
+	struct nvme_fc_fcp_op *op, u32 data_len,
+	enum nvmefc_fcp_datadir	io_dir)
+{
+	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
+	struct nvme_command *sqe = &cmdiu->sqe;
+	u32 csn;
+	int ret;
+
+	if (!nvme_fc_ctrl_get(ctrl))
+		return BLK_MQ_RQ_QUEUE_ERROR;
+
+	/* format the FC-NVME CMD IU and fcp_req */
+	cmdiu->connection_id = cpu_to_be64(queue->connection_id);
+	csn = atomic_inc_return(&queue->csn);
+	cmdiu->csn = cpu_to_be32(csn);
+	cmdiu->data_len = cpu_to_be32(data_len);
+	switch (io_dir) {
+	case NVMEFC_FCP_WRITE:
+		cmdiu->flags = FCNVME_CMD_FLAGS_WRITE;
+		break;
+	case NVMEFC_FCP_READ:
+		cmdiu->flags = FCNVME_CMD_FLAGS_READ;
+		break;
+	case NVMEFC_FCP_NODATA:
+		cmdiu->flags = 0;
+		break;
+	}
+	op->fcp_req.payload_length = data_len;
+	op->fcp_req.io_dir = io_dir;
+	op->fcp_req.transferred_length = 0;
+	op->fcp_req.rcv_rsplen = 0;
+	op->fcp_req.status = 0;
+	op->fcp_req.sqid = cpu_to_le16(queue->qnum);
+
+	/*
+	 * validate per fabric rules, set fields mandated by fabric spec
+	 * as well as those by FC-NVME spec.
+	 */
+	WARN_ON_ONCE(sqe->common.metadata);
+	WARN_ON_ONCE(sqe->common.dptr.prp1);
+	WARN_ON_ONCE(sqe->common.dptr.prp2);
+	sqe->common.flags |= NVME_CMD_SGL_METABUF;
+
+	/*
+	 * format SQE DPTR field per FC-NVME rules
+	 *    type=data block descr; subtype=offset;
+	 *    offset is currently 0.
+	 */
+	sqe->rw.dptr.sgl.type = NVME_SGL_FMT_OFFSET;
+	sqe->rw.dptr.sgl.length = cpu_to_le32(data_len);
+	sqe->rw.dptr.sgl.addr = 0;
+
+	/* odd that we set the command_id - should come from nvme-fabrics */
+	WARN_ON_ONCE(sqe->common.command_id != cpu_to_le16(op->rqno));
+
+	if (op->rq) {				/* skipped on aens */
+		ret = nvme_fc_map_data(ctrl, op->rq, op);
+		if (ret < 0) {
+			dev_err(queue->ctrl->ctrl.device,
+			     "Failed to map data (%d)\n", ret);
+			nvme_cleanup_cmd(op->rq);
+			nvme_fc_ctrl_put(ctrl);
+			return (ret == -ENOMEM || ret == -EAGAIN) ?
+				BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
+		}
+	}
+
+	fc_dma_sync_single_for_device(ctrl->lport->dev, op->fcp_req.cmddma,
+				  sizeof(op->cmd_iu), DMA_TO_DEVICE);
+
+	atomic_set(&op->state, FCPOP_STATE_ACTIVE);
+
+	if (op->rq)
+		blk_mq_start_request(op->rq);
+
+	ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
+					&ctrl->rport->remoteport,
+					queue->lldd_handle, &op->fcp_req);
+
+	if (ret) {
+		dev_err(ctrl->dev,
+			"Send nvme command failed - lldd returned %d.\n", ret);
+
+		if (op->rq) {			/* normal request */
+			nvme_fc_unmap_data(ctrl, op->rq, op);
+			nvme_cleanup_cmd(op->rq);
+		}
+		/* else - aen. no cleanup needed */
+
+		nvme_fc_ctrl_put(ctrl);
+
+		if (ret != -EBUSY)
+			return BLK_MQ_RQ_QUEUE_ERROR;
+
+		if (op->rq) {
+			blk_mq_stop_hw_queues(op->rq->q);
+			blk_mq_delay_queue(queue->hctx, NVMEFC_QUEUE_DELAY);
+		}
+		return BLK_MQ_RQ_QUEUE_BUSY;
+	}
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static int
+nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
+			const struct blk_mq_queue_data *bd)
+{
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_fc_queue *queue = hctx->driver_data;
+	struct nvme_fc_ctrl *ctrl = queue->ctrl;
+	struct request *rq = bd->rq;
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
+	struct nvme_command *sqe = &cmdiu->sqe;
+	enum nvmefc_fcp_datadir	io_dir;
+	u32 data_len;
+	int ret;
+
+	ret = nvme_setup_cmd(ns, rq, sqe);
+	if (ret)
+		return ret;
+
+	data_len = nvme_map_len(rq);
+	if (data_len)
+		io_dir = ((rq_data_dir(rq) == WRITE) ?
+					NVMEFC_FCP_WRITE : NVMEFC_FCP_READ);
+	else
+		io_dir = NVMEFC_FCP_NODATA;
+
+	return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir);
+}
+
+static struct blk_mq_tags *
+nvme_fc_tagset(struct nvme_fc_queue *queue)
+{
+	if (queue->qnum == 0)
+		return queue->ctrl->admin_tag_set.tags[queue->qnum];
+
+	return queue->ctrl->tag_set.tags[queue->qnum - 1];
+}
+
+static int
+nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+
+{
+	struct nvme_fc_queue *queue = hctx->driver_data;
+	struct nvme_fc_ctrl *ctrl = queue->ctrl;
+	struct request *req;
+	struct nvme_fc_fcp_op *op;
+
+	req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
+	if (!req) {
+		dev_err(queue->ctrl->ctrl.device,
+			 "tag 0x%x on QNum %#x not found\n",
+			tag, queue->qnum);
+		return 0;
+	}
+
+	op = blk_mq_rq_to_pdu(req);
+
+	if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) &&
+		 (ctrl->lport->ops->poll_queue))
+		ctrl->lport->ops->poll_queue(&ctrl->lport->localport,
+						 queue->lldd_handle);
+
+	return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE));
+}
+
+static void
+nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
+{
+	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg);
+	struct nvme_fc_fcp_op *aen_op;
+	int ret;
+
+	if (aer_idx > NVME_FC_NR_AEN_COMMANDS)
+		return;
+
+	aen_op = &ctrl->aen_ops[aer_idx];
+
+	ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0,
+					NVMEFC_FCP_NODATA);
+	if (ret)
+		dev_err(ctrl->ctrl.device,
+			"failed async event work [%d]\n", aer_idx);
+}
+
+static void
+nvme_fc_complete_rq(struct request *rq)
+{
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_ctrl *ctrl = op->ctrl;
+	int error = 0, state;
+
+	state = atomic_xchg(&op->state, FCPOP_STATE_IDLE);
+
+	nvme_cleanup_cmd(rq);
+
+	nvme_fc_unmap_data(ctrl, rq, op);
+
+	if (unlikely(rq->errors)) {
+		if (nvme_req_needs_retry(rq, rq->errors)) {
+			nvme_requeue_req(rq);
+			return;
+		}
+
+		if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
+			error = rq->errors;
+		else
+			error = nvme_error_status(rq->errors);
+	}
+
+	nvme_fc_ctrl_put(ctrl);
+
+	blk_mq_end_request(rq, error);
+}
+
+static struct blk_mq_ops nvme_fc_mq_ops = {
+	.queue_rq	= nvme_fc_queue_rq,
+	.complete	= nvme_fc_complete_rq,
+	.init_request	= nvme_fc_init_request,
+	.exit_request	= nvme_fc_exit_request,
+	.reinit_request	= nvme_fc_reinit_request,
+	.init_hctx	= nvme_fc_init_hctx,
+	.poll		= nvme_fc_poll,
+	.timeout	= nvme_fc_timeout,
+};
+
+static struct blk_mq_ops nvme_fc_admin_mq_ops = {
+	.queue_rq	= nvme_fc_queue_rq,
+	.complete	= nvme_fc_complete_rq,
+	.init_request	= nvme_fc_init_admin_request,
+	.exit_request	= nvme_fc_exit_request,
+	.reinit_request	= nvme_fc_reinit_request,
+	.init_hctx	= nvme_fc_init_admin_hctx,
+	.timeout	= nvme_fc_timeout,
+};
+
+static int
+nvme_fc_configure_admin_queue(struct nvme_fc_ctrl *ctrl)
+{
+	u32 segs;
+	int error;
+
+	nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH);
+
+	error = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
+				NVME_FC_AQ_BLKMQ_DEPTH,
+				(NVME_FC_AQ_BLKMQ_DEPTH / 4));
+	if (error)
+		return error;
+
+	memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
+	ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
+	ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH;
+	ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
+	ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+	ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
+					(SG_CHUNK_SIZE *
+						sizeof(struct scatterlist)) +
+					ctrl->lport->ops->fcprqst_priv_sz;
+	ctrl->admin_tag_set.driver_data = ctrl;
+	ctrl->admin_tag_set.nr_hw_queues = 1;
+	ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
+
+	error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
+	if (error)
+		goto out_free_queue;
+
+	ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+	if (IS_ERR(ctrl->ctrl.admin_q)) {
+		error = PTR_ERR(ctrl->ctrl.admin_q);
+		goto out_free_tagset;
+	}
+
+	error = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
+				NVME_FC_AQ_BLKMQ_DEPTH);
+	if (error)
+		goto out_cleanup_queue;
+
+	error = nvmf_connect_admin_queue(&ctrl->ctrl);
+	if (error)
+		goto out_delete_hw_queue;
+
+	error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
+	if (error) {
+		dev_err(ctrl->ctrl.device,
+			"prop_get NVME_REG_CAP failed\n");
+		goto out_delete_hw_queue;
+	}
+
+	ctrl->ctrl.sqsize =
+		min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
+
+	error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
+	if (error)
+		goto out_delete_hw_queue;
+
+	segs = min_t(u32, NVME_FC_MAX_SEGMENTS,
+			ctrl->lport->ops->max_sgl_segments);
+	ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9);
+
+	error = nvme_init_identify(&ctrl->ctrl);
+	if (error)
+		goto out_delete_hw_queue;
+
+	nvme_start_keep_alive(&ctrl->ctrl);
+
+	return 0;
+
+out_delete_hw_queue:
+	__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
+out_cleanup_queue:
+	blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_free_tagset:
+	blk_mq_free_tag_set(&ctrl->admin_tag_set);
+out_free_queue:
+	nvme_fc_free_queue(&ctrl->queues[0]);
+	return error;
+}
+
+/*
+ * This routine is used by the transport when it needs to find active
+ * io on a queue that is to be terminated. The transport uses
+ * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke
+ * this routine to kill them on a 1 by 1 basis.
+ *
+ * As FC allocates FC exchange for each io, the transport must contact
+ * the LLDD to terminate the exchange, thus releasing the FC exchange.
+ * After terminating the exchange the LLDD will call the transport's
+ * normal io done path for the request, but it will have an aborted
+ * status. The done path will return the io request back to the block
+ * layer with an error status.
+ */
+static void
+nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
+{
+	struct nvme_ctrl *nctrl = data;
+	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
+int status;
+
+	if (!blk_mq_request_started(req))
+		return;
+
+	/* this performs an ABTS-LS on the FC exchange for the io */
+	status = __nvme_fc_abort_op(ctrl, op);
+	/*
+	 * if __nvme_fc_abort_op failed: io wasn't active to abort
+	 * consider it done. Assume completion path already completing
+	 * in parallel
+	 */
+	if (status)
+		/* io wasn't active to abort consider it done */
+		/* assume completion path already completing in parallel */
+		return;
+}
+
+
+/*
+ * This routine stops operation of the controller. Admin and IO queues
+ * are stopped, outstanding ios on them terminated, and the nvme ctrl
+ * is shutdown.
+ */
+static void
+nvme_fc_shutdown_ctrl(struct nvme_fc_ctrl *ctrl)
+{
+	/*
+	 * If io queues are present, stop them and terminate all outstanding
+	 * ios on them. As FC allocates FC exchange for each io, the
+	 * transport must contact the LLDD to terminate the exchange,
+	 * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr()
+	 * to tell us what io's are busy and invoke a transport routine
+	 * to kill them with the LLDD.  After terminating the exchange
+	 * the LLDD will call the transport's normal io done path, but it
+	 * will have an aborted status. The done path will return the
+	 * io requests back to the block layer as part of normal completions
+	 * (but with error status).
+	 */
+	if (ctrl->queue_count > 1) {
+		nvme_stop_queues(&ctrl->ctrl);
+		blk_mq_tagset_busy_iter(&ctrl->tag_set,
+				nvme_fc_terminate_exchange, &ctrl->ctrl);
+	}
+
+	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
+		nvme_shutdown_ctrl(&ctrl->ctrl);
+
+	/*
+	 * now clean up the admin queue. Same thing as above.
+	 * use blk_mq_tagset_busy_itr() and the transport routine to
+	 * terminate the exchanges.
+	 */
+	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
+				nvme_fc_terminate_exchange, &ctrl->ctrl);
+}
+
+/*
+ * Called to teardown an association.
+ * May be called with association fully in place or partially in place.
+ */
+static void
+__nvme_fc_remove_ctrl(struct nvme_fc_ctrl *ctrl)
+{
+	nvme_stop_keep_alive(&ctrl->ctrl);
+
+	/* stop and terminate ios on admin and io queues */
+	nvme_fc_shutdown_ctrl(ctrl);
+
+	/*
+	 * tear down the controller
+	 * This will result in the last reference on the nvme ctrl to
+	 * expire, calling the transport nvme_fc_free_nvme_ctrl() callback.
+	 * From there, the transport will tear down it's logical queues and
+	 * association.
+	 */
+	nvme_uninit_ctrl(&ctrl->ctrl);
+
+	nvme_put_ctrl(&ctrl->ctrl);
+}
+
+static void
+nvme_fc_del_ctrl_work(struct work_struct *work)
+{
+	struct nvme_fc_ctrl *ctrl =
+			container_of(work, struct nvme_fc_ctrl, delete_work);
+
+	__nvme_fc_remove_ctrl(ctrl);
+}
+
+static int
+__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
+{
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
+		return -EBUSY;
+
+	if (!queue_work(nvme_fc_wq, &ctrl->delete_work))
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Request from nvme core layer to delete the controller
+ */
+static int
+nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+	struct nvme_fc_rport *rport = ctrl->rport;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&rport->lock, flags);
+	ret = __nvme_fc_del_ctrl(ctrl);
+	spin_unlock_irqrestore(&rport->lock, flags);
+	if (ret)
+		return ret;
+
+	flush_work(&ctrl->delete_work);
+
+	return 0;
+}
+
+static int
+nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl)
+{
+	return -EIO;
+}
+
+static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
+	.name			= "fc",
+	.module			= THIS_MODULE,
+	.is_fabrics		= true,
+	.reg_read32		= nvmf_reg_read32,
+	.reg_read64		= nvmf_reg_read64,
+	.reg_write32		= nvmf_reg_write32,
+	.reset_ctrl		= nvme_fc_reset_nvme_ctrl,
+	.free_ctrl		= nvme_fc_free_nvme_ctrl,
+	.submit_async_event	= nvme_fc_submit_async_event,
+	.delete_ctrl		= nvme_fc_del_nvme_ctrl,
+	.get_subsysnqn		= nvmf_get_subsysnqn,
+	.get_address		= nvmf_get_address,
+};
+
+static int
+nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+	int ret;
+
+	ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
+	if (ret) {
+		dev_info(ctrl->ctrl.device,
+			"set_queue_count failed: %d\n", ret);
+		return ret;
+	}
+
+	ctrl->queue_count = opts->nr_io_queues + 1;
+	if (!opts->nr_io_queues)
+		return 0;
+
+	dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
+			opts->nr_io_queues);
+
+	nvme_fc_init_io_queues(ctrl);
+
+	memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
+	ctrl->tag_set.ops = &nvme_fc_mq_ops;
+	ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
+	ctrl->tag_set.reserved_tags = 1; /* fabric connect */
+	ctrl->tag_set.numa_node = NUMA_NO_NODE;
+	ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
+					(SG_CHUNK_SIZE *
+						sizeof(struct scatterlist)) +
+					ctrl->lport->ops->fcprqst_priv_sz;
+	ctrl->tag_set.driver_data = ctrl;
+	ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
+	ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
+
+	ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
+	if (ret)
+		return ret;
+
+	ctrl->ctrl.tagset = &ctrl->tag_set;
+
+	ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
+	if (IS_ERR(ctrl->ctrl.connect_q)) {
+		ret = PTR_ERR(ctrl->ctrl.connect_q);
+		goto out_free_tag_set;
+	}
+
+	ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+	if (ret)
+		goto out_cleanup_blk_queue;
+
+	ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+	if (ret)
+		goto out_delete_hw_queues;
+
+	return 0;
+
+out_delete_hw_queues:
+	nvme_fc_delete_hw_io_queues(ctrl);
+out_cleanup_blk_queue:
+	nvme_stop_keep_alive(&ctrl->ctrl);
+	blk_cleanup_queue(ctrl->ctrl.connect_q);
+out_free_tag_set:
+	blk_mq_free_tag_set(&ctrl->tag_set);
+	nvme_fc_free_io_queues(ctrl);
+
+	/* force put free routine to ignore io queues */
+	ctrl->ctrl.tagset = NULL;
+
+	return ret;
+}
+
+
+static struct nvme_ctrl *
+__nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+	struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
+{
+	struct nvme_fc_ctrl *ctrl;
+	unsigned long flags;
+	int ret, idx;
+	bool changed;
+
+	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl) {
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+
+	idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL);
+	if (idx < 0) {
+		ret = -ENOSPC;
+		goto out_free_ctrl;
+	}
+
+	ctrl->ctrl.opts = opts;
+	INIT_LIST_HEAD(&ctrl->ctrl_list);
+	INIT_LIST_HEAD(&ctrl->ls_req_list);
+	ctrl->lport = lport;
+	ctrl->rport = rport;
+	ctrl->dev = lport->dev;
+	ctrl->state = FCCTRL_INIT;
+	ctrl->cnum = idx;
+
+	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
+	if (ret)
+		goto out_free_ida;
+
+	get_device(ctrl->dev);
+	kref_init(&ctrl->ref);
+
+	INIT_WORK(&ctrl->delete_work, nvme_fc_del_ctrl_work);
+	spin_lock_init(&ctrl->lock);
+
+	/* io queue count */
+	ctrl->queue_count = min_t(unsigned int,
+				opts->nr_io_queues,
+				lport->ops->max_hw_queues);
+	opts->nr_io_queues = ctrl->queue_count;	/* so opts has valid value */
+	ctrl->queue_count++;	/* +1 for admin queue */
+
+	ctrl->ctrl.sqsize = opts->queue_size - 1;
+	ctrl->ctrl.kato = opts->kato;
+
+	ret = -ENOMEM;
+	ctrl->queues = kcalloc(ctrl->queue_count, sizeof(struct nvme_fc_queue),
+				GFP_KERNEL);
+	if (!ctrl->queues)
+		goto out_uninit_ctrl;
+
+	ret = nvme_fc_configure_admin_queue(ctrl);
+	if (ret)
+		goto out_uninit_ctrl;
+
+	/* sanity checks */
+
+	/* FC-NVME supports 64-byte SQE only */
+	if (ctrl->ctrl.ioccsz != 4) {
+		dev_err(ctrl->ctrl.device, "ioccsz %d is not supported!\n",
+				ctrl->ctrl.ioccsz);
+		goto out_remove_admin_queue;
+	}
+	/* FC-NVME supports 16-byte CQE only */
+	if (ctrl->ctrl.iorcsz != 1) {
+		dev_err(ctrl->ctrl.device, "iorcsz %d is not supported!\n",
+				ctrl->ctrl.iorcsz);
+		goto out_remove_admin_queue;
+	}
+	/* FC-NVME does not have other data in the capsule */
+	if (ctrl->ctrl.icdoff) {
+		dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
+				ctrl->ctrl.icdoff);
+		goto out_remove_admin_queue;
+	}
+
+	/* FC-NVME supports normal SGL Data Block Descriptors */
+
+	if (opts->queue_size > ctrl->ctrl.maxcmd) {
+		/* warn if maxcmd is lower than queue_size */
+		dev_warn(ctrl->ctrl.device,
+			"queue_size %zu > ctrl maxcmd %u, reducing "
+			"to queue_size\n",
+			opts->queue_size, ctrl->ctrl.maxcmd);
+		opts->queue_size = ctrl->ctrl.maxcmd;
+	}
+
+	ret = nvme_fc_init_aen_ops(ctrl);
+	if (ret)
+		goto out_exit_aen_ops;
+
+	if (ctrl->queue_count > 1) {
+		ret = nvme_fc_create_io_queues(ctrl);
+		if (ret)
+			goto out_exit_aen_ops;
+	}
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+	ctrl->state = FCCTRL_ACTIVE;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
+	WARN_ON_ONCE(!changed);
+
+	dev_info(ctrl->ctrl.device,
+		"NVME-FC{%d}: new ctrl: NQN \"%s\" (%p)\n",
+		ctrl->cnum, ctrl->ctrl.opts->subsysnqn, &ctrl);
+
+	kref_get(&ctrl->ctrl.kref);
+
+	spin_lock_irqsave(&rport->lock, flags);
+	list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
+	spin_unlock_irqrestore(&rport->lock, flags);
+
+	if (opts->nr_io_queues) {
+		nvme_queue_scan(&ctrl->ctrl);
+		nvme_queue_async_events(&ctrl->ctrl);
+	}
+
+	return &ctrl->ctrl;
+
+out_exit_aen_ops:
+	nvme_fc_exit_aen_ops(ctrl);
+out_remove_admin_queue:
+	/* send a Disconnect(association) LS to fc-nvme target */
+	nvme_fc_xmt_disconnect_assoc(ctrl);
+	nvme_stop_keep_alive(&ctrl->ctrl);
+	nvme_fc_destroy_admin_queue(ctrl);
+out_uninit_ctrl:
+	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
+	if (ret > 0)
+		ret = -EIO;
+	/* exit via here will follow ctlr ref point callbacks to free */
+	return ERR_PTR(ret);
+
+out_free_ida:
+	ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum);
+out_free_ctrl:
+	kfree(ctrl);
+out_fail:
+	nvme_fc_rport_put(rport);
+	/* exit via here doesn't follow ctlr ref points */
+	return ERR_PTR(ret);
+}
+
+enum {
+	FCT_TRADDR_ERR		= 0,
+	FCT_TRADDR_WWNN		= 1 << 0,
+	FCT_TRADDR_WWPN		= 1 << 1,
+};
+
+struct nvmet_fc_traddr {
+	u64	nn;
+	u64	pn;
+};
+
+static const match_table_t traddr_opt_tokens = {
+	{ FCT_TRADDR_WWNN,	"nn-%s"		},
+	{ FCT_TRADDR_WWPN,	"pn-%s"		},
+	{ FCT_TRADDR_ERR,	NULL		}
+};
+
+static int
+nvme_fc_parse_address(struct nvmet_fc_traddr *traddr, char *buf)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *options, *o, *p;
+	int token, ret = 0;
+	u64 token64;
+
+	options = o = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+
+	while ((p = strsep(&o, ":\n")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, traddr_opt_tokens, args);
+		switch (token) {
+		case FCT_TRADDR_WWNN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			traddr->nn = token64;
+			break;
+		case FCT_TRADDR_WWPN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			traddr->pn = token64;
+			break;
+		default:
+			pr_warn("unknown traddr token or missing value '%s'\n",
+					p);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+out:
+	kfree(options);
+	return ret;
+}
+
+static struct nvme_ctrl *
+nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
+{
+	struct nvme_fc_lport *lport;
+	struct nvme_fc_rport *rport;
+	struct nvmet_fc_traddr laddr = { 0L, 0L };
+	struct nvmet_fc_traddr raddr = { 0L, 0L };
+	unsigned long flags;
+	int ret;
+
+	ret = nvme_fc_parse_address(&raddr, opts->traddr);
+	if (ret || !raddr.nn || !raddr.pn)
+		return ERR_PTR(-EINVAL);
+
+	ret = nvme_fc_parse_address(&laddr, opts->host_traddr);
+	if (ret || !laddr.nn || !laddr.pn)
+		return ERR_PTR(-EINVAL);
+
+	/* find the host and remote ports to connect together */
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
+		if (lport->localport.node_name != laddr.nn ||
+		    lport->localport.port_name != laddr.pn)
+			continue;
+
+		list_for_each_entry(rport, &lport->endp_list, endp_list) {
+			if (rport->remoteport.node_name != raddr.nn ||
+			    rport->remoteport.port_name != raddr.pn)
+				continue;
+
+			/* if fail to get reference fall through. Will error */
+			if (!nvme_fc_rport_get(rport))
+				break;
+
+			spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+			return __nvme_fc_create_ctrl(dev, opts, lport, rport);
+		}
+	}
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	return ERR_PTR(-ENOENT);
+}
+
+
+static struct nvmf_transport_ops nvme_fc_transport = {
+	.name		= "fc",
+	.required_opts	= NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
+	.allowed_opts	= NVMF_OPT_RECONNECT_DELAY,
+	.create_ctrl	= nvme_fc_create_ctrl,
+};
+
+static int __init nvme_fc_init_module(void)
+{
+	nvme_fc_wq = create_workqueue("nvme_fc_wq");
+	if (!nvme_fc_wq)
+		return -ENOMEM;
+
+	nvmf_register_transport(&nvme_fc_transport);
+	return 0;
+}
+
+static void __exit nvme_fc_exit_module(void)
+{
+	/* sanity check - all lports should be removed */
+	if (!list_empty(&nvme_fc_lport_list))
+		pr_warn("%s: localport list not empty\n", __func__);
+
+	nvmf_unregister_transport(&nvme_fc_transport);
+
+	destroy_workqueue(nvme_fc_wq);
+
+	ida_destroy(&nvme_fc_local_port_cnt);
+	ida_destroy(&nvme_fc_ctrl_cnt);
+}
+
+module_init(nvme_fc_init_module);
+module_exit(nvme_fc_exit_module);
+
+MODULE_LICENSE("GPL v2");

From c53432030d86429dc9fe5adc3d68cb9d1343b0b2 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 2 Dec 2016 00:28:43 -0800
Subject: [PATCH 172/182] nvme-fabrics: Add target support for FC transport

Implements the FC-NVME T11 definition of how nvme fabric capsules are
performed on an FC fabric. Utilizes a lower-layer API to FC host adapters
to send/receive FC-4 LS operations and perform the FCP transactions
necessary to perform and FCP IO request for NVME.

The T11 definitions for FC-4 Link Services are implemented which create
NVMeOF connections.  Implements the hooks with nvmet layer to pass NVME
commands to it for processing and posting of data/response base to the
host via the different connections.

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 MAINTAINERS                  |    1 +
 drivers/nvme/target/Kconfig  |   11 +
 drivers/nvme/target/Makefile |    2 +
 drivers/nvme/target/fc.c     | 2288 ++++++++++++++++++++++++++++++++++
 4 files changed, 2302 insertions(+)
 create mode 100644 drivers/nvme/target/fc.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 00505700b30a31..1c2c9f96dd5639 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8666,6 +8666,7 @@ S:	Supported
 F:	include/linux/nvme-fc.h
 F:	include/linux/nvme-fc-driver.h
 F:	drivers/nvme/host/fc.c
+F:	drivers/nvme/target/fc.c
 
 NVMEM FRAMEWORK
 M:	Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 3a5b9d0576cb0f..746a63e4d54aaa 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -34,3 +34,14 @@ config NVME_TARGET_RDMA
 	  devices over RDMA.
 
 	  If unsure, say N.
+
+config NVME_TARGET_FC
+	tristate "NVMe over Fabrics FC target driver"
+	depends on NVME_TARGET
+	depends on HAS_DMA
+	help
+	  This enables the NVMe FC target support, which allows exporting NVMe
+	  devices over FC.
+
+	  If unsure, say N.
+
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index b7a06232c9da79..80b128b9ac9f00 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -2,8 +2,10 @@
 obj-$(CONFIG_NVME_TARGET)		+= nvmet.o
 obj-$(CONFIG_NVME_TARGET_LOOP)		+= nvme-loop.o
 obj-$(CONFIG_NVME_TARGET_RDMA)		+= nvmet-rdma.o
+obj-$(CONFIG_NVME_TARGET_FC)		+= nvmet-fc.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \
 			discovery.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
+nvmet-fc-y	+= fc.o
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
new file mode 100644
index 00000000000000..173e842f19c975
--- /dev/null
+++ b/drivers/nvme/target/fc.c
@@ -0,0 +1,2288 @@
+/*
+ * Copyright (c) 2016 Avago Technologies.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful.
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
+ * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
+ * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
+ * See the GNU General Public License for more details, a copy of which
+ * can be found in the file COPYING included with this package
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/parser.h>
+#include <linux/random.h>
+#include <uapi/scsi/fc/fc_fs.h>
+#include <uapi/scsi/fc/fc_els.h>
+
+#include "nvmet.h"
+#include <linux/nvme-fc-driver.h>
+#include <linux/nvme-fc.h>
+
+
+/* *************************** Data Structures/Defines ****************** */
+
+
+#define NVMET_LS_CTX_COUNT		4
+
+/* for this implementation, assume small single frame rqst/rsp */
+#define NVME_FC_MAX_LS_BUFFER_SIZE		2048
+
+struct nvmet_fc_tgtport;
+struct nvmet_fc_tgt_assoc;
+
+struct nvmet_fc_ls_iod {
+	struct nvmefc_tgt_ls_req	*lsreq;
+	struct nvmefc_tgt_fcp_req	*fcpreq;	/* only if RS */
+
+	struct list_head		ls_list;	/* tgtport->ls_list */
+
+	struct nvmet_fc_tgtport		*tgtport;
+	struct nvmet_fc_tgt_assoc	*assoc;
+
+	u8				*rqstbuf;
+	u8				*rspbuf;
+	u16				rqstdatalen;
+	dma_addr_t			rspdma;
+
+	struct scatterlist		sg[2];
+
+	struct work_struct		work;
+} __aligned(sizeof(unsigned long long));
+
+#define NVMET_FC_MAX_KB_PER_XFR		256
+
+enum nvmet_fcp_datadir {
+	NVMET_FCP_NODATA,
+	NVMET_FCP_WRITE,
+	NVMET_FCP_READ,
+	NVMET_FCP_ABORTED,
+};
+
+struct nvmet_fc_fcp_iod {
+	struct nvmefc_tgt_fcp_req	*fcpreq;
+
+	struct nvme_fc_cmd_iu		cmdiubuf;
+	struct nvme_fc_ersp_iu		rspiubuf;
+	dma_addr_t			rspdma;
+	struct scatterlist		*data_sg;
+	struct scatterlist		*next_sg;
+	int				data_sg_cnt;
+	u32				next_sg_offset;
+	u32				total_length;
+	u32				offset;
+	enum nvmet_fcp_datadir		io_dir;
+	bool				active;
+	bool				abort;
+	spinlock_t			flock;
+
+	struct nvmet_req		req;
+	struct work_struct		work;
+
+	struct nvmet_fc_tgtport		*tgtport;
+	struct nvmet_fc_tgt_queue	*queue;
+
+	struct list_head		fcp_list;	/* tgtport->fcp_list */
+};
+
+struct nvmet_fc_tgtport {
+
+	struct nvmet_fc_target_port	fc_target_port;
+
+	struct list_head		tgt_list; /* nvmet_fc_target_list */
+	struct device			*dev;	/* dev for dma mapping */
+	struct nvmet_fc_target_template	*ops;
+
+	struct nvmet_fc_ls_iod		*iod;
+	spinlock_t			lock;
+	struct list_head		ls_list;
+	struct list_head		ls_busylist;
+	struct list_head		assoc_list;
+	struct ida			assoc_cnt;
+	struct nvmet_port		*port;
+	struct kref			ref;
+};
+
+struct nvmet_fc_tgt_queue {
+	bool				ninetypercent;
+	u16				qid;
+	u16				sqsize;
+	u16				ersp_ratio;
+	u16				sqhd;
+	int				cpu;
+	atomic_t			connected;
+	atomic_t			sqtail;
+	atomic_t			zrspcnt;
+	atomic_t			rsn;
+	spinlock_t			qlock;
+	struct nvmet_port		*port;
+	struct nvmet_cq			nvme_cq;
+	struct nvmet_sq			nvme_sq;
+	struct nvmet_fc_tgt_assoc	*assoc;
+	struct nvmet_fc_fcp_iod		*fod;		/* array of fcp_iods */
+	struct list_head		fod_list;
+	struct workqueue_struct		*work_q;
+	struct kref			ref;
+} __aligned(sizeof(unsigned long long));
+
+struct nvmet_fc_tgt_assoc {
+	u64				association_id;
+	u32				a_id;
+	struct nvmet_fc_tgtport		*tgtport;
+	struct list_head		a_list;
+	struct nvmet_fc_tgt_queue	*queues[NVMET_NR_QUEUES];
+	struct kref			ref;
+};
+
+
+static inline int
+nvmet_fc_iodnum(struct nvmet_fc_ls_iod *iodptr)
+{
+	return (iodptr - iodptr->tgtport->iod);
+}
+
+static inline int
+nvmet_fc_fodnum(struct nvmet_fc_fcp_iod *fodptr)
+{
+	return (fodptr - fodptr->queue->fod);
+}
+
+
+/*
+ * Association and Connection IDs:
+ *
+ * Association ID will have random number in upper 6 bytes and zero
+ *   in lower 2 bytes
+ *
+ * Connection IDs will be Association ID with QID or'd in lower 2 bytes
+ *
+ * note: Association ID = Connection ID for queue 0
+ */
+#define BYTES_FOR_QID			sizeof(u16)
+#define BYTES_FOR_QID_SHIFT		(BYTES_FOR_QID * 8)
+#define NVMET_FC_QUEUEID_MASK		((u64)((1 << BYTES_FOR_QID_SHIFT) - 1))
+
+static inline u64
+nvmet_fc_makeconnid(struct nvmet_fc_tgt_assoc *assoc, u16 qid)
+{
+	return (assoc->association_id | qid);
+}
+
+static inline u64
+nvmet_fc_getassociationid(u64 connectionid)
+{
+	return connectionid & ~NVMET_FC_QUEUEID_MASK;
+}
+
+static inline u16
+nvmet_fc_getqueueid(u64 connectionid)
+{
+	return (u16)(connectionid & NVMET_FC_QUEUEID_MASK);
+}
+
+static inline struct nvmet_fc_tgtport *
+targetport_to_tgtport(struct nvmet_fc_target_port *targetport)
+{
+	return container_of(targetport, struct nvmet_fc_tgtport,
+				 fc_target_port);
+}
+
+static inline struct nvmet_fc_fcp_iod *
+nvmet_req_to_fod(struct nvmet_req *nvme_req)
+{
+	return container_of(nvme_req, struct nvmet_fc_fcp_iod, req);
+}
+
+
+/* *************************** Globals **************************** */
+
+
+static DEFINE_SPINLOCK(nvmet_fc_tgtlock);
+
+static LIST_HEAD(nvmet_fc_target_list);
+static DEFINE_IDA(nvmet_fc_tgtport_cnt);
+
+
+static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work);
+static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work);
+static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc);
+static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
+static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue);
+static int nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue);
+static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport);
+static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport);
+
+
+/* *********************** FC-NVME DMA Handling **************************** */
+
+/*
+ * The fcloop device passes in a NULL device pointer. Real LLD's will
+ * pass in a valid device pointer. If NULL is passed to the dma mapping
+ * routines, depending on the platform, it may or may not succeed, and
+ * may crash.
+ *
+ * As such:
+ * Wrapper all the dma routines and check the dev pointer.
+ *
+ * If simple mappings (return just a dma address, we'll noop them,
+ * returning a dma address of 0.
+ *
+ * On more complex mappings (dma_map_sg), a pseudo routine fills
+ * in the scatter list, setting all dma addresses to 0.
+ */
+
+static inline dma_addr_t
+fc_dma_map_single(struct device *dev, void *ptr, size_t size,
+		enum dma_data_direction dir)
+{
+	return dev ? dma_map_single(dev, ptr, size, dir) : (dma_addr_t)0L;
+}
+
+static inline int
+fc_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dev ? dma_mapping_error(dev, dma_addr) : 0;
+}
+
+static inline void
+fc_dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
+	enum dma_data_direction dir)
+{
+	if (dev)
+		dma_unmap_single(dev, addr, size, dir);
+}
+
+static inline void
+fc_dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir)
+{
+	if (dev)
+		dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static inline void
+fc_dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir)
+{
+	if (dev)
+		dma_sync_single_for_device(dev, addr, size, dir);
+}
+
+/* pseudo dma_map_sg call */
+static int
+fc_map_sg(struct scatterlist *sg, int nents)
+{
+	struct scatterlist *s;
+	int i;
+
+	WARN_ON(nents == 0 || sg[0].length == 0);
+
+	for_each_sg(sg, s, nents, i) {
+		s->dma_address = 0L;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+		s->dma_length = s->length;
+#endif
+	}
+	return nents;
+}
+
+static inline int
+fc_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir)
+{
+	return dev ? dma_map_sg(dev, sg, nents, dir) : fc_map_sg(sg, nents);
+}
+
+static inline void
+fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir)
+{
+	if (dev)
+		dma_unmap_sg(dev, sg, nents, dir);
+}
+
+
+/* *********************** FC-NVME Port Management ************************ */
+
+
+static int
+nvmet_fc_alloc_ls_iodlist(struct nvmet_fc_tgtport *tgtport)
+{
+	struct nvmet_fc_ls_iod *iod;
+	int i;
+
+	iod = kcalloc(NVMET_LS_CTX_COUNT, sizeof(struct nvmet_fc_ls_iod),
+			GFP_KERNEL);
+	if (!iod)
+		return -ENOMEM;
+
+	tgtport->iod = iod;
+
+	for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++) {
+		INIT_WORK(&iod->work, nvmet_fc_handle_ls_rqst_work);
+		iod->tgtport = tgtport;
+		list_add_tail(&iod->ls_list, &tgtport->ls_list);
+
+		iod->rqstbuf = kcalloc(2, NVME_FC_MAX_LS_BUFFER_SIZE,
+			GFP_KERNEL);
+		if (!iod->rqstbuf)
+			goto out_fail;
+
+		iod->rspbuf = iod->rqstbuf + NVME_FC_MAX_LS_BUFFER_SIZE;
+
+		iod->rspdma = fc_dma_map_single(tgtport->dev, iod->rspbuf,
+						NVME_FC_MAX_LS_BUFFER_SIZE,
+						DMA_TO_DEVICE);
+		if (fc_dma_mapping_error(tgtport->dev, iod->rspdma))
+			goto out_fail;
+	}
+
+	return 0;
+
+out_fail:
+	kfree(iod->rqstbuf);
+	list_del(&iod->ls_list);
+	for (iod--, i--; i >= 0; iod--, i--) {
+		fc_dma_unmap_single(tgtport->dev, iod->rspdma,
+				NVME_FC_MAX_LS_BUFFER_SIZE, DMA_TO_DEVICE);
+		kfree(iod->rqstbuf);
+		list_del(&iod->ls_list);
+	}
+
+	kfree(iod);
+
+	return -EFAULT;
+}
+
+static void
+nvmet_fc_free_ls_iodlist(struct nvmet_fc_tgtport *tgtport)
+{
+	struct nvmet_fc_ls_iod *iod = tgtport->iod;
+	int i;
+
+	for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++) {
+		fc_dma_unmap_single(tgtport->dev,
+				iod->rspdma, NVME_FC_MAX_LS_BUFFER_SIZE,
+				DMA_TO_DEVICE);
+		kfree(iod->rqstbuf);
+		list_del(&iod->ls_list);
+	}
+	kfree(tgtport->iod);
+}
+
+static struct nvmet_fc_ls_iod *
+nvmet_fc_alloc_ls_iod(struct nvmet_fc_tgtport *tgtport)
+{
+	static struct nvmet_fc_ls_iod *iod;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tgtport->lock, flags);
+	iod = list_first_entry_or_null(&tgtport->ls_list,
+					struct nvmet_fc_ls_iod, ls_list);
+	if (iod)
+		list_move_tail(&iod->ls_list, &tgtport->ls_busylist);
+	spin_unlock_irqrestore(&tgtport->lock, flags);
+	return iod;
+}
+
+
+static void
+nvmet_fc_free_ls_iod(struct nvmet_fc_tgtport *tgtport,
+			struct nvmet_fc_ls_iod *iod)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tgtport->lock, flags);
+	list_move(&iod->ls_list, &tgtport->ls_list);
+	spin_unlock_irqrestore(&tgtport->lock, flags);
+}
+
+static void
+nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
+				struct nvmet_fc_tgt_queue *queue)
+{
+	struct nvmet_fc_fcp_iod *fod = queue->fod;
+	int i;
+
+	for (i = 0; i < queue->sqsize; fod++, i++) {
+		INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work);
+		fod->tgtport = tgtport;
+		fod->queue = queue;
+		fod->active = false;
+		list_add_tail(&fod->fcp_list, &queue->fod_list);
+		spin_lock_init(&fod->flock);
+
+		fod->rspdma = fc_dma_map_single(tgtport->dev, &fod->rspiubuf,
+					sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+		if (fc_dma_mapping_error(tgtport->dev, fod->rspdma)) {
+			list_del(&fod->fcp_list);
+			for (fod--, i--; i >= 0; fod--, i--) {
+				fc_dma_unmap_single(tgtport->dev, fod->rspdma,
+						sizeof(fod->rspiubuf),
+						DMA_TO_DEVICE);
+				fod->rspdma = 0L;
+				list_del(&fod->fcp_list);
+			}
+
+			return;
+		}
+	}
+}
+
+static void
+nvmet_fc_destroy_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
+				struct nvmet_fc_tgt_queue *queue)
+{
+	struct nvmet_fc_fcp_iod *fod = queue->fod;
+	int i;
+
+	for (i = 0; i < queue->sqsize; fod++, i++) {
+		if (fod->rspdma)
+			fc_dma_unmap_single(tgtport->dev, fod->rspdma,
+				sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+	}
+}
+
+static struct nvmet_fc_fcp_iod *
+nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue)
+{
+	static struct nvmet_fc_fcp_iod *fod;
+	unsigned long flags;
+
+	spin_lock_irqsave(&queue->qlock, flags);
+	fod = list_first_entry_or_null(&queue->fod_list,
+					struct nvmet_fc_fcp_iod, fcp_list);
+	if (fod) {
+		list_del(&fod->fcp_list);
+		fod->active = true;
+		fod->abort = false;
+		/*
+		 * no queue reference is taken, as it was taken by the
+		 * queue lookup just prior to the allocation. The iod
+		 * will "inherit" that reference.
+		 */
+	}
+	spin_unlock_irqrestore(&queue->qlock, flags);
+	return fod;
+}
+
+
+static void
+nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
+			struct nvmet_fc_fcp_iod *fod)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&queue->qlock, flags);
+	list_add_tail(&fod->fcp_list, &fod->queue->fod_list);
+	fod->active = false;
+	spin_unlock_irqrestore(&queue->qlock, flags);
+
+	/*
+	 * release the reference taken at queue lookup and fod allocation
+	 */
+	nvmet_fc_tgt_q_put(queue);
+}
+
+static int
+nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid)
+{
+	int cpu, idx, cnt;
+
+	if (!(tgtport->ops->target_features &
+			NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED) ||
+	    tgtport->ops->max_hw_queues == 1)
+		return WORK_CPU_UNBOUND;
+
+	/* Simple cpu selection based on qid modulo active cpu count */
+	idx = !qid ? 0 : (qid - 1) % num_active_cpus();
+
+	/* find the n'th active cpu */
+	for (cpu = 0, cnt = 0; ; ) {
+		if (cpu_active(cpu)) {
+			if (cnt == idx)
+				break;
+			cnt++;
+		}
+		cpu = (cpu + 1) % num_possible_cpus();
+	}
+
+	return cpu;
+}
+
+static struct nvmet_fc_tgt_queue *
+nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
+			u16 qid, u16 sqsize)
+{
+	struct nvmet_fc_tgt_queue *queue;
+	unsigned long flags;
+	int ret;
+
+	if (qid >= NVMET_NR_QUEUES)
+		return NULL;
+
+	queue = kzalloc((sizeof(*queue) +
+				(sizeof(struct nvmet_fc_fcp_iod) * sqsize)),
+				GFP_KERNEL);
+	if (!queue)
+		return NULL;
+
+	if (!nvmet_fc_tgt_a_get(assoc))
+		goto out_free_queue;
+
+	queue->work_q = alloc_workqueue("ntfc%d.%d.%d", 0, 0,
+				assoc->tgtport->fc_target_port.port_num,
+				assoc->a_id, qid);
+	if (!queue->work_q)
+		goto out_a_put;
+
+	queue->fod = (struct nvmet_fc_fcp_iod *)&queue[1];
+	queue->qid = qid;
+	queue->sqsize = sqsize;
+	queue->assoc = assoc;
+	queue->port = assoc->tgtport->port;
+	queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid);
+	INIT_LIST_HEAD(&queue->fod_list);
+	atomic_set(&queue->connected, 0);
+	atomic_set(&queue->sqtail, 0);
+	atomic_set(&queue->rsn, 1);
+	atomic_set(&queue->zrspcnt, 0);
+	spin_lock_init(&queue->qlock);
+	kref_init(&queue->ref);
+
+	nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue);
+
+	ret = nvmet_sq_init(&queue->nvme_sq);
+	if (ret)
+		goto out_fail_iodlist;
+
+	WARN_ON(assoc->queues[qid]);
+	spin_lock_irqsave(&assoc->tgtport->lock, flags);
+	assoc->queues[qid] = queue;
+	spin_unlock_irqrestore(&assoc->tgtport->lock, flags);
+
+	return queue;
+
+out_fail_iodlist:
+	nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue);
+	destroy_workqueue(queue->work_q);
+out_a_put:
+	nvmet_fc_tgt_a_put(assoc);
+out_free_queue:
+	kfree(queue);
+	return NULL;
+}
+
+
+static void
+nvmet_fc_tgt_queue_free(struct kref *ref)
+{
+	struct nvmet_fc_tgt_queue *queue =
+		container_of(ref, struct nvmet_fc_tgt_queue, ref);
+	unsigned long flags;
+
+	spin_lock_irqsave(&queue->assoc->tgtport->lock, flags);
+	queue->assoc->queues[queue->qid] = NULL;
+	spin_unlock_irqrestore(&queue->assoc->tgtport->lock, flags);
+
+	nvmet_fc_destroy_fcp_iodlist(queue->assoc->tgtport, queue);
+
+	nvmet_fc_tgt_a_put(queue->assoc);
+
+	destroy_workqueue(queue->work_q);
+
+	kfree(queue);
+}
+
+static void
+nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue)
+{
+	kref_put(&queue->ref, nvmet_fc_tgt_queue_free);
+}
+
+static int
+nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue)
+{
+	return kref_get_unless_zero(&queue->ref);
+}
+
+
+static void
+nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
+				struct nvmefc_tgt_fcp_req *fcpreq)
+{
+	int ret;
+
+	fcpreq->op = NVMET_FCOP_ABORT;
+	fcpreq->offset = 0;
+	fcpreq->timeout = 0;
+	fcpreq->transfer_length = 0;
+	fcpreq->transferred_length = 0;
+	fcpreq->fcp_error = 0;
+	fcpreq->sg_cnt = 0;
+
+	ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fcpreq);
+	if (ret)
+		/* should never reach here !! */
+		WARN_ON(1);
+}
+
+
+static void
+nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
+{
+	struct nvmet_fc_fcp_iod *fod = queue->fod;
+	unsigned long flags;
+	int i;
+	bool disconnect;
+
+	disconnect = atomic_xchg(&queue->connected, 0);
+
+	spin_lock_irqsave(&queue->qlock, flags);
+	/* about outstanding io's */
+	for (i = 0; i < queue->sqsize; fod++, i++) {
+		if (fod->active) {
+			spin_lock(&fod->flock);
+			fod->abort = true;
+			spin_unlock(&fod->flock);
+		}
+	}
+	spin_unlock_irqrestore(&queue->qlock, flags);
+
+	flush_workqueue(queue->work_q);
+
+	if (disconnect)
+		nvmet_sq_destroy(&queue->nvme_sq);
+
+	nvmet_fc_tgt_q_put(queue);
+}
+
+static struct nvmet_fc_tgt_queue *
+nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport,
+				u64 connection_id)
+{
+	struct nvmet_fc_tgt_assoc *assoc;
+	struct nvmet_fc_tgt_queue *queue;
+	u64 association_id = nvmet_fc_getassociationid(connection_id);
+	u16 qid = nvmet_fc_getqueueid(connection_id);
+	unsigned long flags;
+
+	spin_lock_irqsave(&tgtport->lock, flags);
+	list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
+		if (association_id == assoc->association_id) {
+			queue = assoc->queues[qid];
+			if (queue &&
+			    (!atomic_read(&queue->connected) ||
+			     !nvmet_fc_tgt_q_get(queue)))
+				queue = NULL;
+			spin_unlock_irqrestore(&tgtport->lock, flags);
+			return queue;
+		}
+	}
+	spin_unlock_irqrestore(&tgtport->lock, flags);
+	return NULL;
+}
+
+static struct nvmet_fc_tgt_assoc *
+nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport)
+{
+	struct nvmet_fc_tgt_assoc *assoc, *tmpassoc;
+	unsigned long flags;
+	u64 ran;
+	int idx;
+	bool needrandom = true;
+
+	assoc = kzalloc(sizeof(*assoc), GFP_KERNEL);
+	if (!assoc)
+		return NULL;
+
+	idx = ida_simple_get(&tgtport->assoc_cnt, 0, 0, GFP_KERNEL);
+	if (idx < 0)
+		goto out_free_assoc;
+
+	if (!nvmet_fc_tgtport_get(tgtport))
+		goto out_ida_put;
+
+	assoc->tgtport = tgtport;
+	assoc->a_id = idx;
+	INIT_LIST_HEAD(&assoc->a_list);
+	kref_init(&assoc->ref);
+
+	while (needrandom) {
+		get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID);
+		ran = ran << BYTES_FOR_QID_SHIFT;
+
+		spin_lock_irqsave(&tgtport->lock, flags);
+		needrandom = false;
+		list_for_each_entry(tmpassoc, &tgtport->assoc_list, a_list)
+			if (ran == tmpassoc->association_id) {
+				needrandom = true;
+				break;
+			}
+		if (!needrandom) {
+			assoc->association_id = ran;
+			list_add_tail(&assoc->a_list, &tgtport->assoc_list);
+		}
+		spin_unlock_irqrestore(&tgtport->lock, flags);
+	}
+
+	return assoc;
+
+out_ida_put:
+	ida_simple_remove(&tgtport->assoc_cnt, idx);
+out_free_assoc:
+	kfree(assoc);
+	return NULL;
+}
+
+static void
+nvmet_fc_target_assoc_free(struct kref *ref)
+{
+	struct nvmet_fc_tgt_assoc *assoc =
+		container_of(ref, struct nvmet_fc_tgt_assoc, ref);
+	struct nvmet_fc_tgtport *tgtport = assoc->tgtport;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tgtport->lock, flags);
+	list_del(&assoc->a_list);
+	spin_unlock_irqrestore(&tgtport->lock, flags);
+	ida_simple_remove(&tgtport->assoc_cnt, assoc->a_id);
+	kfree(assoc);
+	nvmet_fc_tgtport_put(tgtport);
+}
+
+static void
+nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc)
+{
+	kref_put(&assoc->ref, nvmet_fc_target_assoc_free);
+}
+
+static int
+nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc)
+{
+	return kref_get_unless_zero(&assoc->ref);
+}
+
+static void
+nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc)
+{
+	struct nvmet_fc_tgtport *tgtport = assoc->tgtport;
+	struct nvmet_fc_tgt_queue *queue;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&tgtport->lock, flags);
+	for (i = NVMET_NR_QUEUES - 1; i >= 0; i--) {
+		queue = assoc->queues[i];
+		if (queue) {
+			if (!nvmet_fc_tgt_q_get(queue))
+				continue;
+			spin_unlock_irqrestore(&tgtport->lock, flags);
+			nvmet_fc_delete_target_queue(queue);
+			nvmet_fc_tgt_q_put(queue);
+			spin_lock_irqsave(&tgtport->lock, flags);
+		}
+	}
+	spin_unlock_irqrestore(&tgtport->lock, flags);
+
+	nvmet_fc_tgt_a_put(assoc);
+}
+
+static struct nvmet_fc_tgt_assoc *
+nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport,
+				u64 association_id)
+{
+	struct nvmet_fc_tgt_assoc *assoc;
+	struct nvmet_fc_tgt_assoc *ret = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tgtport->lock, flags);
+	list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
+		if (association_id == assoc->association_id) {
+			ret = assoc;
+			nvmet_fc_tgt_a_get(assoc);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&tgtport->lock, flags);
+
+	return ret;
+}
+
+
+/**
+ * nvme_fc_register_targetport - transport entry point called by an
+ *                              LLDD to register the existence of a local
+ *                              NVME subystem FC port.
+ * @pinfo:     pointer to information about the port to be registered
+ * @template:  LLDD entrypoints and operational parameters for the port
+ * @dev:       physical hardware device node port corresponds to. Will be
+ *             used for DMA mappings
+ * @portptr:   pointer to a local port pointer. Upon success, the routine
+ *             will allocate a nvme_fc_local_port structure and place its
+ *             address in the local port pointer. Upon failure, local port
+ *             pointer will be set to NULL.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo,
+			struct nvmet_fc_target_template *template,
+			struct device *dev,
+			struct nvmet_fc_target_port **portptr)
+{
+	struct nvmet_fc_tgtport *newrec;
+	unsigned long flags;
+	int ret, idx;
+
+	if (!template->xmt_ls_rsp || !template->fcp_op ||
+	    !template->targetport_delete ||
+	    !template->max_hw_queues || !template->max_sgl_segments ||
+	    !template->max_dif_sgl_segments || !template->dma_boundary) {
+		ret = -EINVAL;
+		goto out_regtgt_failed;
+	}
+
+	newrec = kzalloc((sizeof(*newrec) + template->target_priv_sz),
+			 GFP_KERNEL);
+	if (!newrec) {
+		ret = -ENOMEM;
+		goto out_regtgt_failed;
+	}
+
+	idx = ida_simple_get(&nvmet_fc_tgtport_cnt, 0, 0, GFP_KERNEL);
+	if (idx < 0) {
+		ret = -ENOSPC;
+		goto out_fail_kfree;
+	}
+
+	if (!get_device(dev) && dev) {
+		ret = -ENODEV;
+		goto out_ida_put;
+	}
+
+	newrec->fc_target_port.node_name = pinfo->node_name;
+	newrec->fc_target_port.port_name = pinfo->port_name;
+	newrec->fc_target_port.private = &newrec[1];
+	newrec->fc_target_port.port_id = pinfo->port_id;
+	newrec->fc_target_port.port_num = idx;
+	INIT_LIST_HEAD(&newrec->tgt_list);
+	newrec->dev = dev;
+	newrec->ops = template;
+	spin_lock_init(&newrec->lock);
+	INIT_LIST_HEAD(&newrec->ls_list);
+	INIT_LIST_HEAD(&newrec->ls_busylist);
+	INIT_LIST_HEAD(&newrec->assoc_list);
+	kref_init(&newrec->ref);
+	ida_init(&newrec->assoc_cnt);
+
+	ret = nvmet_fc_alloc_ls_iodlist(newrec);
+	if (ret) {
+		ret = -ENOMEM;
+		goto out_free_newrec;
+	}
+
+	spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+	list_add_tail(&newrec->tgt_list, &nvmet_fc_target_list);
+	spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
+
+	*portptr = &newrec->fc_target_port;
+	return 0;
+
+out_free_newrec:
+	put_device(dev);
+out_ida_put:
+	ida_simple_remove(&nvmet_fc_tgtport_cnt, idx);
+out_fail_kfree:
+	kfree(newrec);
+out_regtgt_failed:
+	*portptr = NULL;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvmet_fc_register_targetport);
+
+
+static void
+nvmet_fc_free_tgtport(struct kref *ref)
+{
+	struct nvmet_fc_tgtport *tgtport =
+		container_of(ref, struct nvmet_fc_tgtport, ref);
+	struct device *dev = tgtport->dev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+	list_del(&tgtport->tgt_list);
+	spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
+
+	nvmet_fc_free_ls_iodlist(tgtport);
+
+	/* let the LLDD know we've finished tearing it down */
+	tgtport->ops->targetport_delete(&tgtport->fc_target_port);
+
+	ida_simple_remove(&nvmet_fc_tgtport_cnt,
+			tgtport->fc_target_port.port_num);
+
+	ida_destroy(&tgtport->assoc_cnt);
+
+	kfree(tgtport);
+
+	put_device(dev);
+}
+
+static void
+nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport)
+{
+	kref_put(&tgtport->ref, nvmet_fc_free_tgtport);
+}
+
+static int
+nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport)
+{
+	return kref_get_unless_zero(&tgtport->ref);
+}
+
+static void
+__nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport)
+{
+	struct nvmet_fc_tgt_assoc *assoc, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tgtport->lock, flags);
+	list_for_each_entry_safe(assoc, next,
+				&tgtport->assoc_list, a_list) {
+		if (!nvmet_fc_tgt_a_get(assoc))
+			continue;
+		spin_unlock_irqrestore(&tgtport->lock, flags);
+		nvmet_fc_delete_target_assoc(assoc);
+		nvmet_fc_tgt_a_put(assoc);
+		spin_lock_irqsave(&tgtport->lock, flags);
+	}
+	spin_unlock_irqrestore(&tgtport->lock, flags);
+}
+
+/*
+ * nvmet layer has called to terminate an association
+ */
+static void
+nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
+{
+	struct nvmet_fc_tgtport *tgtport, *next;
+	struct nvmet_fc_tgt_assoc *assoc;
+	struct nvmet_fc_tgt_queue *queue;
+	unsigned long flags;
+	bool found_ctrl = false;
+
+	/* this is a bit ugly, but don't want to make locks layered */
+	spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+	list_for_each_entry_safe(tgtport, next, &nvmet_fc_target_list,
+			tgt_list) {
+		if (!nvmet_fc_tgtport_get(tgtport))
+			continue;
+		spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
+
+		spin_lock_irqsave(&tgtport->lock, flags);
+		list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
+			queue = assoc->queues[0];
+			if (queue && queue->nvme_sq.ctrl == ctrl) {
+				if (nvmet_fc_tgt_a_get(assoc))
+					found_ctrl = true;
+				break;
+			}
+		}
+		spin_unlock_irqrestore(&tgtport->lock, flags);
+
+		nvmet_fc_tgtport_put(tgtport);
+
+		if (found_ctrl) {
+			nvmet_fc_delete_target_assoc(assoc);
+			nvmet_fc_tgt_a_put(assoc);
+			return;
+		}
+
+		spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+	}
+	spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
+}
+
+/**
+ * nvme_fc_unregister_targetport - transport entry point called by an
+ *                              LLDD to deregister/remove a previously
+ *                              registered a local NVME subsystem FC port.
+ * @tgtport: pointer to the (registered) target port that is to be
+ *           deregistered.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port)
+{
+	struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port);
+
+	/* terminate any outstanding associations */
+	__nvmet_fc_free_assocs(tgtport);
+
+	nvmet_fc_tgtport_put(tgtport);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvmet_fc_unregister_targetport);
+
+
+/* *********************** FC-NVME LS Handling **************************** */
+
+
+static void
+nvmet_fc_format_rsp_hdr(void *buf, u8 ls_cmd, u32 desc_len, u8 rqst_ls_cmd)
+{
+	struct fcnvme_ls_acc_hdr *acc = buf;
+
+	acc->w0.ls_cmd = ls_cmd;
+	acc->desc_list_len = desc_len;
+	acc->rqst.desc_tag = cpu_to_be32(FCNVME_LSDESC_RQST);
+	acc->rqst.desc_len =
+			fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst));
+	acc->rqst.w0.ls_cmd = rqst_ls_cmd;
+}
+
+static int
+nvmet_fc_format_rjt(void *buf, u16 buflen, u8 ls_cmd,
+			u8 reason, u8 explanation, u8 vendor)
+{
+	struct fcnvme_ls_rjt *rjt = buf;
+
+	nvmet_fc_format_rsp_hdr(buf, FCNVME_LSDESC_RQST,
+			fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_rjt)),
+			ls_cmd);
+	rjt->rjt.desc_tag = cpu_to_be32(FCNVME_LSDESC_RJT);
+	rjt->rjt.desc_len = fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rjt));
+	rjt->rjt.reason_code = reason;
+	rjt->rjt.reason_explanation = explanation;
+	rjt->rjt.vendor = vendor;
+
+	return sizeof(struct fcnvme_ls_rjt);
+}
+
+/* Validation Error indexes into the string table below */
+enum {
+	VERR_NO_ERROR		= 0,
+	VERR_CR_ASSOC_LEN	= 1,
+	VERR_CR_ASSOC_RQST_LEN	= 2,
+	VERR_CR_ASSOC_CMD	= 3,
+	VERR_CR_ASSOC_CMD_LEN	= 4,
+	VERR_ERSP_RATIO		= 5,
+	VERR_ASSOC_ALLOC_FAIL	= 6,
+	VERR_QUEUE_ALLOC_FAIL	= 7,
+	VERR_CR_CONN_LEN	= 8,
+	VERR_CR_CONN_RQST_LEN	= 9,
+	VERR_ASSOC_ID		= 10,
+	VERR_ASSOC_ID_LEN	= 11,
+	VERR_NO_ASSOC		= 12,
+	VERR_CONN_ID		= 13,
+	VERR_CONN_ID_LEN	= 14,
+	VERR_NO_CONN		= 15,
+	VERR_CR_CONN_CMD	= 16,
+	VERR_CR_CONN_CMD_LEN	= 17,
+	VERR_DISCONN_LEN	= 18,
+	VERR_DISCONN_RQST_LEN	= 19,
+	VERR_DISCONN_CMD	= 20,
+	VERR_DISCONN_CMD_LEN	= 21,
+	VERR_DISCONN_SCOPE	= 22,
+	VERR_RS_LEN		= 23,
+	VERR_RS_RQST_LEN	= 24,
+	VERR_RS_CMD		= 25,
+	VERR_RS_CMD_LEN		= 26,
+	VERR_RS_RCTL		= 27,
+	VERR_RS_RO		= 28,
+};
+
+static char *validation_errors[] = {
+	"OK",
+	"Bad CR_ASSOC Length",
+	"Bad CR_ASSOC Rqst Length",
+	"Not CR_ASSOC Cmd",
+	"Bad CR_ASSOC Cmd Length",
+	"Bad Ersp Ratio",
+	"Association Allocation Failed",
+	"Queue Allocation Failed",
+	"Bad CR_CONN Length",
+	"Bad CR_CONN Rqst Length",
+	"Not Association ID",
+	"Bad Association ID Length",
+	"No Association",
+	"Not Connection ID",
+	"Bad Connection ID Length",
+	"No Connection",
+	"Not CR_CONN Cmd",
+	"Bad CR_CONN Cmd Length",
+	"Bad DISCONN Length",
+	"Bad DISCONN Rqst Length",
+	"Not DISCONN Cmd",
+	"Bad DISCONN Cmd Length",
+	"Bad Disconnect Scope",
+	"Bad RS Length",
+	"Bad RS Rqst Length",
+	"Not RS Cmd",
+	"Bad RS Cmd Length",
+	"Bad RS R_CTL",
+	"Bad RS Relative Offset",
+};
+
+static void
+nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport,
+			struct nvmet_fc_ls_iod *iod)
+{
+	struct fcnvme_ls_cr_assoc_rqst *rqst =
+				(struct fcnvme_ls_cr_assoc_rqst *)iod->rqstbuf;
+	struct fcnvme_ls_cr_assoc_acc *acc =
+				(struct fcnvme_ls_cr_assoc_acc *)iod->rspbuf;
+	struct nvmet_fc_tgt_queue *queue;
+	int ret = 0;
+
+	memset(acc, 0, sizeof(*acc));
+
+	if (iod->rqstdatalen < sizeof(struct fcnvme_ls_cr_assoc_rqst))
+		ret = VERR_CR_ASSOC_LEN;
+	else if (rqst->desc_list_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_ls_cr_assoc_rqst)))
+		ret = VERR_CR_ASSOC_RQST_LEN;
+	else if (rqst->assoc_cmd.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD))
+		ret = VERR_CR_ASSOC_CMD;
+	else if (rqst->assoc_cmd.desc_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_cr_assoc_cmd)))
+		ret = VERR_CR_ASSOC_CMD_LEN;
+	else if (!rqst->assoc_cmd.ersp_ratio ||
+		 (be16_to_cpu(rqst->assoc_cmd.ersp_ratio) >=
+				be16_to_cpu(rqst->assoc_cmd.sqsize)))
+		ret = VERR_ERSP_RATIO;
+
+	else {
+		/* new association w/ admin queue */
+		iod->assoc = nvmet_fc_alloc_target_assoc(tgtport);
+		if (!iod->assoc)
+			ret = VERR_ASSOC_ALLOC_FAIL;
+		else {
+			queue = nvmet_fc_alloc_target_queue(iod->assoc, 0,
+					be16_to_cpu(rqst->assoc_cmd.sqsize));
+			if (!queue)
+				ret = VERR_QUEUE_ALLOC_FAIL;
+		}
+	}
+
+	if (ret) {
+		dev_err(tgtport->dev,
+			"Create Association LS failed: %s\n",
+			validation_errors[ret]);
+		iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
+				NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
+				ELS_RJT_LOGIC,
+				ELS_EXPL_NONE, 0);
+		return;
+	}
+
+	queue->ersp_ratio = be16_to_cpu(rqst->assoc_cmd.ersp_ratio);
+	atomic_set(&queue->connected, 1);
+	queue->sqhd = 0;	/* best place to init value */
+
+	/* format a response */
+
+	iod->lsreq->rsplen = sizeof(*acc);
+
+	nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC,
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_ls_cr_assoc_acc)),
+			FCNVME_LS_CREATE_ASSOCIATION);
+	acc->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID);
+	acc->associd.desc_len =
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_assoc_id));
+	acc->associd.association_id =
+			cpu_to_be64(nvmet_fc_makeconnid(iod->assoc, 0));
+	acc->connectid.desc_tag = cpu_to_be32(FCNVME_LSDESC_CONN_ID);
+	acc->connectid.desc_len =
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_conn_id));
+	acc->connectid.connection_id = acc->associd.association_id;
+}
+
+static void
+nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport,
+			struct nvmet_fc_ls_iod *iod)
+{
+	struct fcnvme_ls_cr_conn_rqst *rqst =
+				(struct fcnvme_ls_cr_conn_rqst *)iod->rqstbuf;
+	struct fcnvme_ls_cr_conn_acc *acc =
+				(struct fcnvme_ls_cr_conn_acc *)iod->rspbuf;
+	struct nvmet_fc_tgt_queue *queue;
+	int ret = 0;
+
+	memset(acc, 0, sizeof(*acc));
+
+	if (iod->rqstdatalen < sizeof(struct fcnvme_ls_cr_conn_rqst))
+		ret = VERR_CR_CONN_LEN;
+	else if (rqst->desc_list_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_ls_cr_conn_rqst)))
+		ret = VERR_CR_CONN_RQST_LEN;
+	else if (rqst->associd.desc_tag != cpu_to_be32(FCNVME_LSDESC_ASSOC_ID))
+		ret = VERR_ASSOC_ID;
+	else if (rqst->associd.desc_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_assoc_id)))
+		ret = VERR_ASSOC_ID_LEN;
+	else if (rqst->connect_cmd.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD))
+		ret = VERR_CR_CONN_CMD;
+	else if (rqst->connect_cmd.desc_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_cr_conn_cmd)))
+		ret = VERR_CR_CONN_CMD_LEN;
+	else if (!rqst->connect_cmd.ersp_ratio ||
+		 (be16_to_cpu(rqst->connect_cmd.ersp_ratio) >=
+				be16_to_cpu(rqst->connect_cmd.sqsize)))
+		ret = VERR_ERSP_RATIO;
+
+	else {
+		/* new io queue */
+		iod->assoc = nvmet_fc_find_target_assoc(tgtport,
+				be64_to_cpu(rqst->associd.association_id));
+		if (!iod->assoc)
+			ret = VERR_NO_ASSOC;
+		else {
+			queue = nvmet_fc_alloc_target_queue(iod->assoc,
+					be16_to_cpu(rqst->connect_cmd.qid),
+					be16_to_cpu(rqst->connect_cmd.sqsize));
+			if (!queue)
+				ret = VERR_QUEUE_ALLOC_FAIL;
+
+			/* release get taken in nvmet_fc_find_target_assoc */
+			nvmet_fc_tgt_a_put(iod->assoc);
+		}
+	}
+
+	if (ret) {
+		dev_err(tgtport->dev,
+			"Create Connection LS failed: %s\n",
+			validation_errors[ret]);
+		iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
+				NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
+				(ret == VERR_NO_ASSOC) ?
+						ELS_RJT_PROT : ELS_RJT_LOGIC,
+				ELS_EXPL_NONE, 0);
+		return;
+	}
+
+	queue->ersp_ratio = be16_to_cpu(rqst->connect_cmd.ersp_ratio);
+	atomic_set(&queue->connected, 1);
+	queue->sqhd = 0;	/* best place to init value */
+
+	/* format a response */
+
+	iod->lsreq->rsplen = sizeof(*acc);
+
+	nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC,
+			fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)),
+			FCNVME_LS_CREATE_CONNECTION);
+	acc->connectid.desc_tag = cpu_to_be32(FCNVME_LSDESC_CONN_ID);
+	acc->connectid.desc_len =
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_conn_id));
+	acc->connectid.connection_id =
+			cpu_to_be64(nvmet_fc_makeconnid(iod->assoc,
+				be16_to_cpu(rqst->connect_cmd.qid)));
+}
+
+static void
+nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
+			struct nvmet_fc_ls_iod *iod)
+{
+	struct fcnvme_ls_disconnect_rqst *rqst =
+			(struct fcnvme_ls_disconnect_rqst *)iod->rqstbuf;
+	struct fcnvme_ls_disconnect_acc *acc =
+			(struct fcnvme_ls_disconnect_acc *)iod->rspbuf;
+	struct nvmet_fc_tgt_queue *queue;
+	struct nvmet_fc_tgt_assoc *assoc;
+	int ret = 0;
+	bool del_assoc = false;
+
+	memset(acc, 0, sizeof(*acc));
+
+	if (iod->rqstdatalen < sizeof(struct fcnvme_ls_disconnect_rqst))
+		ret = VERR_DISCONN_LEN;
+	else if (rqst->desc_list_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_ls_disconnect_rqst)))
+		ret = VERR_DISCONN_RQST_LEN;
+	else if (rqst->associd.desc_tag != cpu_to_be32(FCNVME_LSDESC_ASSOC_ID))
+		ret = VERR_ASSOC_ID;
+	else if (rqst->associd.desc_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_assoc_id)))
+		ret = VERR_ASSOC_ID_LEN;
+	else if (rqst->discon_cmd.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_DISCONN_CMD))
+		ret = VERR_DISCONN_CMD;
+	else if (rqst->discon_cmd.desc_len !=
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_lsdesc_disconn_cmd)))
+		ret = VERR_DISCONN_CMD_LEN;
+	else if ((rqst->discon_cmd.scope != FCNVME_DISCONN_ASSOCIATION) &&
+			(rqst->discon_cmd.scope != FCNVME_DISCONN_CONNECTION))
+		ret = VERR_DISCONN_SCOPE;
+	else {
+		/* match an active association */
+		assoc = nvmet_fc_find_target_assoc(tgtport,
+				be64_to_cpu(rqst->associd.association_id));
+		iod->assoc = assoc;
+		if (!assoc)
+			ret = VERR_NO_ASSOC;
+	}
+
+	if (ret) {
+		dev_err(tgtport->dev,
+			"Disconnect LS failed: %s\n",
+			validation_errors[ret]);
+		iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
+				NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
+				(ret == 8) ? ELS_RJT_PROT : ELS_RJT_LOGIC,
+				ELS_EXPL_NONE, 0);
+		return;
+	}
+
+	/* format a response */
+
+	iod->lsreq->rsplen = sizeof(*acc);
+
+	nvmet_fc_format_rsp_hdr(acc, FCNVME_LS_ACC,
+			fcnvme_lsdesc_len(
+				sizeof(struct fcnvme_ls_disconnect_acc)),
+			FCNVME_LS_DISCONNECT);
+
+
+	if (rqst->discon_cmd.scope == FCNVME_DISCONN_CONNECTION) {
+		queue = nvmet_fc_find_target_queue(tgtport,
+					be64_to_cpu(rqst->discon_cmd.id));
+		if (queue) {
+			int qid = queue->qid;
+
+			nvmet_fc_delete_target_queue(queue);
+
+			/* release the get taken by find_target_queue */
+			nvmet_fc_tgt_q_put(queue);
+
+			/* tear association down if io queue terminated */
+			if (!qid)
+				del_assoc = true;
+		}
+	}
+
+	/* release get taken in nvmet_fc_find_target_assoc */
+	nvmet_fc_tgt_a_put(iod->assoc);
+
+	if (del_assoc)
+		nvmet_fc_delete_target_assoc(iod->assoc);
+}
+
+
+/* *********************** NVME Ctrl Routines **************************** */
+
+
+static void nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req);
+
+static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops;
+
+static void
+nvmet_fc_xmt_ls_rsp_done(struct nvmefc_tgt_ls_req *lsreq)
+{
+	struct nvmet_fc_ls_iod *iod = lsreq->nvmet_fc_private;
+	struct nvmet_fc_tgtport *tgtport = iod->tgtport;
+
+	fc_dma_sync_single_for_cpu(tgtport->dev, iod->rspdma,
+				NVME_FC_MAX_LS_BUFFER_SIZE, DMA_TO_DEVICE);
+	nvmet_fc_free_ls_iod(tgtport, iod);
+	nvmet_fc_tgtport_put(tgtport);
+}
+
+static void
+nvmet_fc_xmt_ls_rsp(struct nvmet_fc_tgtport *tgtport,
+				struct nvmet_fc_ls_iod *iod)
+{
+	int ret;
+
+	fc_dma_sync_single_for_device(tgtport->dev, iod->rspdma,
+				  NVME_FC_MAX_LS_BUFFER_SIZE, DMA_TO_DEVICE);
+
+	ret = tgtport->ops->xmt_ls_rsp(&tgtport->fc_target_port, iod->lsreq);
+	if (ret)
+		nvmet_fc_xmt_ls_rsp_done(iod->lsreq);
+}
+
+/*
+ * Actual processing routine for received FC-NVME LS Requests from the LLD
+ */
+static void
+nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport,
+			struct nvmet_fc_ls_iod *iod)
+{
+	struct fcnvme_ls_rqst_w0 *w0 =
+			(struct fcnvme_ls_rqst_w0 *)iod->rqstbuf;
+
+	iod->lsreq->nvmet_fc_private = iod;
+	iod->lsreq->rspbuf = iod->rspbuf;
+	iod->lsreq->rspdma = iod->rspdma;
+	iod->lsreq->done = nvmet_fc_xmt_ls_rsp_done;
+	/* Be preventative. handlers will later set to valid length */
+	iod->lsreq->rsplen = 0;
+
+	iod->assoc = NULL;
+
+	/*
+	 * handlers:
+	 *   parse request input, execute the request, and format the
+	 *   LS response
+	 */
+	switch (w0->ls_cmd) {
+	case FCNVME_LS_CREATE_ASSOCIATION:
+		/* Creates Association and initial Admin Queue/Connection */
+		nvmet_fc_ls_create_association(tgtport, iod);
+		break;
+	case FCNVME_LS_CREATE_CONNECTION:
+		/* Creates an IO Queue/Connection */
+		nvmet_fc_ls_create_connection(tgtport, iod);
+		break;
+	case FCNVME_LS_DISCONNECT:
+		/* Terminate a Queue/Connection or the Association */
+		nvmet_fc_ls_disconnect(tgtport, iod);
+		break;
+	default:
+		iod->lsreq->rsplen = nvmet_fc_format_rjt(iod->rspbuf,
+				NVME_FC_MAX_LS_BUFFER_SIZE, w0->ls_cmd,
+				ELS_RJT_INVAL, ELS_EXPL_NONE, 0);
+	}
+
+	nvmet_fc_xmt_ls_rsp(tgtport, iod);
+}
+
+/*
+ * Actual processing routine for received FC-NVME LS Requests from the LLD
+ */
+static void
+nvmet_fc_handle_ls_rqst_work(struct work_struct *work)
+{
+	struct nvmet_fc_ls_iod *iod =
+		container_of(work, struct nvmet_fc_ls_iod, work);
+	struct nvmet_fc_tgtport *tgtport = iod->tgtport;
+
+	nvmet_fc_handle_ls_rqst(tgtport, iod);
+}
+
+
+/**
+ * nvmet_fc_rcv_ls_req - transport entry point called by an LLDD
+ *                       upon the reception of a NVME LS request.
+ *
+ * The nvmet-fc layer will copy payload to an internal structure for
+ * processing.  As such, upon completion of the routine, the LLDD may
+ * immediately free/reuse the LS request buffer passed in the call.
+ *
+ * If this routine returns error, the LLDD should abort the exchange.
+ *
+ * @tgtport:    pointer to the (registered) target port the LS was
+ *              received on.
+ * @lsreq:      pointer to a lsreq request structure to be used to reference
+ *              the exchange corresponding to the LS.
+ * @lsreqbuf:   pointer to the buffer containing the LS Request
+ * @lsreqbuf_len: length, in bytes, of the received LS request
+ */
+int
+nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *target_port,
+			struct nvmefc_tgt_ls_req *lsreq,
+			void *lsreqbuf, u32 lsreqbuf_len)
+{
+	struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port);
+	struct nvmet_fc_ls_iod *iod;
+
+	if (lsreqbuf_len > NVME_FC_MAX_LS_BUFFER_SIZE)
+		return -E2BIG;
+
+	if (!nvmet_fc_tgtport_get(tgtport))
+		return -ESHUTDOWN;
+
+	iod = nvmet_fc_alloc_ls_iod(tgtport);
+	if (!iod) {
+		nvmet_fc_tgtport_put(tgtport);
+		return -ENOENT;
+	}
+
+	iod->lsreq = lsreq;
+	iod->fcpreq = NULL;
+	memcpy(iod->rqstbuf, lsreqbuf, lsreqbuf_len);
+	iod->rqstdatalen = lsreqbuf_len;
+
+	schedule_work(&iod->work);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvmet_fc_rcv_ls_req);
+
+
+/*
+ * **********************
+ * Start of FCP handling
+ * **********************
+ */
+
+static int
+nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
+{
+	struct scatterlist *sg;
+	struct page *page;
+	unsigned int nent;
+	u32 page_len, length;
+	int i = 0;
+
+	length = fod->total_length;
+	nent = DIV_ROUND_UP(length, PAGE_SIZE);
+	sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
+	if (!sg)
+		goto out;
+
+	sg_init_table(sg, nent);
+
+	while (length) {
+		page_len = min_t(u32, length, PAGE_SIZE);
+
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			goto out_free_pages;
+
+		sg_set_page(&sg[i], page, page_len, 0);
+		length -= page_len;
+		i++;
+	}
+
+	fod->data_sg = sg;
+	fod->data_sg_cnt = nent;
+	fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent,
+				((fod->io_dir == NVMET_FCP_WRITE) ?
+					DMA_FROM_DEVICE : DMA_TO_DEVICE));
+				/* note: write from initiator perspective */
+
+	return 0;
+
+out_free_pages:
+	while (i > 0) {
+		i--;
+		__free_page(sg_page(&sg[i]));
+	}
+	kfree(sg);
+	fod->data_sg = NULL;
+	fod->data_sg_cnt = 0;
+out:
+	return NVME_SC_INTERNAL;
+}
+
+static void
+nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
+{
+	struct scatterlist *sg;
+	int count;
+
+	if (!fod->data_sg || !fod->data_sg_cnt)
+		return;
+
+	fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt,
+				((fod->io_dir == NVMET_FCP_WRITE) ?
+					DMA_FROM_DEVICE : DMA_TO_DEVICE));
+	for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count)
+		__free_page(sg_page(sg));
+	kfree(fod->data_sg);
+}
+
+
+static bool
+queue_90percent_full(struct nvmet_fc_tgt_queue *q, u32 sqhd)
+{
+	u32 sqtail, used;
+
+	/* egad, this is ugly. And sqtail is just a best guess */
+	sqtail = atomic_read(&q->sqtail) % q->sqsize;
+
+	used = (sqtail < sqhd) ? (sqtail + q->sqsize - sqhd) : (sqtail - sqhd);
+	return ((used * 10) >= (((u32)(q->sqsize - 1) * 9)));
+}
+
+/*
+ * Prep RSP payload.
+ * May be a NVMET_FCOP_RSP or NVMET_FCOP_READDATA_RSP op
+ */
+static void
+nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
+				struct nvmet_fc_fcp_iod *fod)
+{
+	struct nvme_fc_ersp_iu *ersp = &fod->rspiubuf;
+	struct nvme_common_command *sqe = &fod->cmdiubuf.sqe.common;
+	struct nvme_completion *cqe = &ersp->cqe;
+	u32 *cqewd = (u32 *)cqe;
+	bool send_ersp = false;
+	u32 rsn, rspcnt, xfr_length;
+
+	if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP)
+		xfr_length = fod->total_length;
+	else
+		xfr_length = fod->offset;
+
+	/*
+	 * check to see if we can send a 0's rsp.
+	 *   Note: to send a 0's response, the NVME-FC host transport will
+	 *   recreate the CQE. The host transport knows: sq id, SQHD (last
+	 *   seen in an ersp), and command_id. Thus it will create a
+	 *   zero-filled CQE with those known fields filled in. Transport
+	 *   must send an ersp for any condition where the cqe won't match
+	 *   this.
+	 *
+	 * Here are the FC-NVME mandated cases where we must send an ersp:
+	 *  every N responses, where N=ersp_ratio
+	 *  force fabric commands to send ersp's (not in FC-NVME but good
+	 *    practice)
+	 *  normal cmds: any time status is non-zero, or status is zero
+	 *     but words 0 or 1 are non-zero.
+	 *  the SQ is 90% or more full
+	 *  the cmd is a fused command
+	 *  transferred data length not equal to cmd iu length
+	 */
+	rspcnt = atomic_inc_return(&fod->queue->zrspcnt);
+	if (!(rspcnt % fod->queue->ersp_ratio) ||
+	    sqe->opcode == nvme_fabrics_command ||
+	    xfr_length != fod->total_length ||
+	    (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] ||
+	    (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) ||
+	    queue_90percent_full(fod->queue, cqe->sq_head))
+		send_ersp = true;
+
+	/* re-set the fields */
+	fod->fcpreq->rspaddr = ersp;
+	fod->fcpreq->rspdma = fod->rspdma;
+
+	if (!send_ersp) {
+		memset(ersp, 0, NVME_FC_SIZEOF_ZEROS_RSP);
+		fod->fcpreq->rsplen = NVME_FC_SIZEOF_ZEROS_RSP;
+	} else {
+		ersp->iu_len = cpu_to_be16(sizeof(*ersp)/sizeof(u32));
+		rsn = atomic_inc_return(&fod->queue->rsn);
+		ersp->rsn = cpu_to_be32(rsn);
+		ersp->xfrd_len = cpu_to_be32(xfr_length);
+		fod->fcpreq->rsplen = sizeof(*ersp);
+	}
+
+	fc_dma_sync_single_for_device(tgtport->dev, fod->rspdma,
+				  sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+}
+
+static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq);
+
+static void
+nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
+				struct nvmet_fc_fcp_iod *fod)
+{
+	int ret;
+
+	fod->fcpreq->op = NVMET_FCOP_RSP;
+	fod->fcpreq->timeout = 0;
+
+	nvmet_fc_prep_fcp_rsp(tgtport, fod);
+
+	ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq);
+	if (ret)
+		nvmet_fc_abort_op(tgtport, fod->fcpreq);
+}
+
+static void
+nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
+				struct nvmet_fc_fcp_iod *fod, u8 op)
+{
+	struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+	struct scatterlist *sg, *datasg;
+	u32 tlen, sg_off;
+	int ret;
+
+	fcpreq->op = op;
+	fcpreq->offset = fod->offset;
+	fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC;
+	tlen = min_t(u32, (NVMET_FC_MAX_KB_PER_XFR * 1024),
+			(fod->total_length - fod->offset));
+	tlen = min_t(u32, tlen, NVME_FC_MAX_SEGMENTS * PAGE_SIZE);
+	tlen = min_t(u32, tlen, fod->tgtport->ops->max_sgl_segments
+					* PAGE_SIZE);
+	fcpreq->transfer_length = tlen;
+	fcpreq->transferred_length = 0;
+	fcpreq->fcp_error = 0;
+	fcpreq->rsplen = 0;
+
+	fcpreq->sg_cnt = 0;
+
+	datasg = fod->next_sg;
+	sg_off = fod->next_sg_offset;
+
+	for (sg = fcpreq->sg ; tlen; sg++) {
+		*sg = *datasg;
+		if (sg_off) {
+			sg->offset += sg_off;
+			sg->length -= sg_off;
+			sg->dma_address += sg_off;
+			sg_off = 0;
+		}
+		if (tlen < sg->length) {
+			sg->length = tlen;
+			fod->next_sg = datasg;
+			fod->next_sg_offset += tlen;
+		} else if (tlen == sg->length) {
+			fod->next_sg_offset = 0;
+			fod->next_sg = sg_next(datasg);
+		} else {
+			fod->next_sg_offset = 0;
+			datasg = sg_next(datasg);
+		}
+		tlen -= sg->length;
+		fcpreq->sg_cnt++;
+	}
+
+	/*
+	 * If the last READDATA request: check if LLDD supports
+	 * combined xfr with response.
+	 */
+	if ((op == NVMET_FCOP_READDATA) &&
+	    ((fod->offset + fcpreq->transfer_length) == fod->total_length) &&
+	    (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) {
+		fcpreq->op = NVMET_FCOP_READDATA_RSP;
+		nvmet_fc_prep_fcp_rsp(tgtport, fod);
+	}
+
+	ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq);
+	if (ret) {
+		/*
+		 * should be ok to set w/o lock as its in the thread of
+		 * execution (not an async timer routine) and doesn't
+		 * contend with any clearing action
+		 */
+		fod->abort = true;
+
+		if (op == NVMET_FCOP_WRITEDATA)
+			nvmet_req_complete(&fod->req,
+					NVME_SC_FC_TRANSPORT_ERROR);
+		else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
+			fcpreq->fcp_error = ret;
+			fcpreq->transferred_length = 0;
+			nvmet_fc_xmt_fcp_op_done(fod->fcpreq);
+		}
+	}
+}
+
+static void
+nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+{
+	struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+	struct nvmet_fc_tgtport *tgtport = fod->tgtport;
+	unsigned long flags;
+	bool abort;
+
+	spin_lock_irqsave(&fod->flock, flags);
+	abort = fod->abort;
+	spin_unlock_irqrestore(&fod->flock, flags);
+
+	/* if in the middle of an io and we need to tear down */
+	if (abort && fcpreq->op != NVMET_FCOP_ABORT) {
+		/* data no longer needed */
+		nvmet_fc_free_tgt_pgs(fod);
+
+		if (fcpreq->fcp_error || abort)
+			nvmet_req_complete(&fod->req, fcpreq->fcp_error);
+
+		return;
+	}
+
+	switch (fcpreq->op) {
+
+	case NVMET_FCOP_WRITEDATA:
+		if (abort || fcpreq->fcp_error ||
+		    fcpreq->transferred_length != fcpreq->transfer_length) {
+			nvmet_req_complete(&fod->req,
+					NVME_SC_FC_TRANSPORT_ERROR);
+			return;
+		}
+
+		fod->offset += fcpreq->transferred_length;
+		if (fod->offset != fod->total_length) {
+			/* transfer the next chunk */
+			nvmet_fc_transfer_fcp_data(tgtport, fod,
+						NVMET_FCOP_WRITEDATA);
+			return;
+		}
+
+		/* data transfer complete, resume with nvmet layer */
+
+		fod->req.execute(&fod->req);
+
+		break;
+
+	case NVMET_FCOP_READDATA:
+	case NVMET_FCOP_READDATA_RSP:
+		if (abort || fcpreq->fcp_error ||
+		    fcpreq->transferred_length != fcpreq->transfer_length) {
+			/* data no longer needed */
+			nvmet_fc_free_tgt_pgs(fod);
+
+			nvmet_fc_abort_op(tgtport, fod->fcpreq);
+			return;
+		}
+
+		/* success */
+
+		if (fcpreq->op == NVMET_FCOP_READDATA_RSP) {
+			/* data no longer needed */
+			nvmet_fc_free_tgt_pgs(fod);
+			fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
+					sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+			nvmet_fc_free_fcp_iod(fod->queue, fod);
+			return;
+		}
+
+		fod->offset += fcpreq->transferred_length;
+		if (fod->offset != fod->total_length) {
+			/* transfer the next chunk */
+			nvmet_fc_transfer_fcp_data(tgtport, fod,
+						NVMET_FCOP_READDATA);
+			return;
+		}
+
+		/* data transfer complete, send response */
+
+		/* data no longer needed */
+		nvmet_fc_free_tgt_pgs(fod);
+
+		nvmet_fc_xmt_fcp_rsp(tgtport, fod);
+
+		break;
+
+	case NVMET_FCOP_RSP:
+	case NVMET_FCOP_ABORT:
+		fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
+				sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+		nvmet_fc_free_fcp_iod(fod->queue, fod);
+		break;
+
+	default:
+		nvmet_fc_free_tgt_pgs(fod);
+		nvmet_fc_abort_op(tgtport, fod->fcpreq);
+		break;
+	}
+}
+
+/*
+ * actual completion handler after execution by the nvmet layer
+ */
+static void
+__nvmet_fc_fcp_nvme_cmd_done(struct nvmet_fc_tgtport *tgtport,
+			struct nvmet_fc_fcp_iod *fod, int status)
+{
+	struct nvme_common_command *sqe = &fod->cmdiubuf.sqe.common;
+	struct nvme_completion *cqe = &fod->rspiubuf.cqe;
+	unsigned long flags;
+	bool abort;
+
+	spin_lock_irqsave(&fod->flock, flags);
+	abort = fod->abort;
+	spin_unlock_irqrestore(&fod->flock, flags);
+
+	/* if we have a CQE, snoop the last sq_head value */
+	if (!status)
+		fod->queue->sqhd = cqe->sq_head;
+
+	if (abort) {
+		/* data no longer needed */
+		nvmet_fc_free_tgt_pgs(fod);
+
+		nvmet_fc_abort_op(tgtport, fod->fcpreq);
+		return;
+	}
+
+	/* if an error handling the cmd post initial parsing */
+	if (status) {
+		/* fudge up a failed CQE status for our transport error */
+		memset(cqe, 0, sizeof(*cqe));
+		cqe->sq_head = fod->queue->sqhd;	/* echo last cqe sqhd */
+		cqe->sq_id = cpu_to_le16(fod->queue->qid);
+		cqe->command_id = sqe->command_id;
+		cqe->status = cpu_to_le16(status);
+	} else {
+
+		/*
+		 * try to push the data even if the SQE status is non-zero.
+		 * There may be a status where data still was intended to
+		 * be moved
+		 */
+		if ((fod->io_dir == NVMET_FCP_READ) && (fod->data_sg_cnt)) {
+			/* push the data over before sending rsp */
+			nvmet_fc_transfer_fcp_data(tgtport, fod,
+						NVMET_FCOP_READDATA);
+			return;
+		}
+
+		/* writes & no data - fall thru */
+	}
+
+	/* data no longer needed */
+	nvmet_fc_free_tgt_pgs(fod);
+
+	nvmet_fc_xmt_fcp_rsp(tgtport, fod);
+}
+
+
+static void
+nvmet_fc_fcp_nvme_cmd_done(struct nvmet_req *nvme_req)
+{
+	struct nvmet_fc_fcp_iod *fod = nvmet_req_to_fod(nvme_req);
+	struct nvmet_fc_tgtport *tgtport = fod->tgtport;
+
+	__nvmet_fc_fcp_nvme_cmd_done(tgtport, fod, 0);
+}
+
+
+/*
+ * Actual processing routine for received FC-NVME LS Requests from the LLD
+ */
+void
+nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
+			struct nvmet_fc_fcp_iod *fod)
+{
+	struct nvme_fc_cmd_iu *cmdiu = &fod->cmdiubuf;
+	int ret;
+
+	/*
+	 * Fused commands are currently not supported in the linux
+	 * implementation.
+	 *
+	 * As such, the implementation of the FC transport does not
+	 * look at the fused commands and order delivery to the upper
+	 * layer until we have both based on csn.
+	 */
+
+	fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done;
+
+	fod->total_length = be32_to_cpu(cmdiu->data_len);
+	if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) {
+		fod->io_dir = NVMET_FCP_WRITE;
+		if (!nvme_is_write(&cmdiu->sqe))
+			goto transport_error;
+	} else if (cmdiu->flags & FCNVME_CMD_FLAGS_READ) {
+		fod->io_dir = NVMET_FCP_READ;
+		if (nvme_is_write(&cmdiu->sqe))
+			goto transport_error;
+	} else {
+		fod->io_dir = NVMET_FCP_NODATA;
+		if (fod->total_length)
+			goto transport_error;
+	}
+
+	fod->req.cmd = &fod->cmdiubuf.sqe;
+	fod->req.rsp = &fod->rspiubuf.cqe;
+	fod->req.port = fod->queue->port;
+
+	/* ensure nvmet handlers will set cmd handler callback */
+	fod->req.execute = NULL;
+
+	/* clear any response payload */
+	memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf));
+
+	ret = nvmet_req_init(&fod->req,
+				&fod->queue->nvme_cq,
+				&fod->queue->nvme_sq,
+				&nvmet_fc_tgt_fcp_ops);
+	if (!ret) {	/* bad SQE content */
+		nvmet_fc_abort_op(tgtport, fod->fcpreq);
+		return;
+	}
+
+	/* keep a running counter of tail position */
+	atomic_inc(&fod->queue->sqtail);
+
+	fod->data_sg = NULL;
+	fod->data_sg_cnt = 0;
+	if (fod->total_length) {
+		ret = nvmet_fc_alloc_tgt_pgs(fod);
+		if (ret) {
+			nvmet_req_complete(&fod->req, ret);
+			return;
+		}
+	}
+	fod->req.sg = fod->data_sg;
+	fod->req.sg_cnt = fod->data_sg_cnt;
+	fod->offset = 0;
+	fod->next_sg = fod->data_sg;
+	fod->next_sg_offset = 0;
+
+	if (fod->io_dir == NVMET_FCP_WRITE) {
+		/* pull the data over before invoking nvmet layer */
+		nvmet_fc_transfer_fcp_data(tgtport, fod, NVMET_FCOP_WRITEDATA);
+		return;
+	}
+
+	/*
+	 * Reads or no data:
+	 *
+	 * can invoke the nvmet_layer now. If read data, cmd completion will
+	 * push the data
+	 */
+
+	fod->req.execute(&fod->req);
+
+	return;
+
+transport_error:
+	nvmet_fc_abort_op(tgtport, fod->fcpreq);
+}
+
+/*
+ * Actual processing routine for received FC-NVME LS Requests from the LLD
+ */
+static void
+nvmet_fc_handle_fcp_rqst_work(struct work_struct *work)
+{
+	struct nvmet_fc_fcp_iod *fod =
+		container_of(work, struct nvmet_fc_fcp_iod, work);
+	struct nvmet_fc_tgtport *tgtport = fod->tgtport;
+
+	nvmet_fc_handle_fcp_rqst(tgtport, fod);
+}
+
+/**
+ * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD
+ *                       upon the reception of a NVME FCP CMD IU.
+ *
+ * Pass a FC-NVME FCP CMD IU received from the FC link to the nvmet-fc
+ * layer for processing.
+ *
+ * The nvmet-fc layer will copy cmd payload to an internal structure for
+ * processing.  As such, upon completion of the routine, the LLDD may
+ * immediately free/reuse the CMD IU buffer passed in the call.
+ *
+ * If this routine returns error, the lldd should abort the exchange.
+ *
+ * @target_port: pointer to the (registered) target port the FCP CMD IU
+ *              was receive on.
+ * @fcpreq:     pointer to a fcpreq request structure to be used to reference
+ *              the exchange corresponding to the FCP Exchange.
+ * @cmdiubuf:   pointer to the buffer containing the FCP CMD IU
+ * @cmdiubuf_len: length, in bytes, of the received FCP CMD IU
+ */
+int
+nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
+			struct nvmefc_tgt_fcp_req *fcpreq,
+			void *cmdiubuf, u32 cmdiubuf_len)
+{
+	struct nvmet_fc_tgtport *tgtport = targetport_to_tgtport(target_port);
+	struct nvme_fc_cmd_iu *cmdiu = cmdiubuf;
+	struct nvmet_fc_tgt_queue *queue;
+	struct nvmet_fc_fcp_iod *fod;
+
+	/* validate iu, so the connection id can be used to find the queue */
+	if ((cmdiubuf_len != sizeof(*cmdiu)) ||
+			(cmdiu->scsi_id != NVME_CMD_SCSI_ID) ||
+			(cmdiu->fc_id != NVME_CMD_FC_ID) ||
+			(be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4)))
+		return -EIO;
+
+
+	queue = nvmet_fc_find_target_queue(tgtport,
+				be64_to_cpu(cmdiu->connection_id));
+	if (!queue)
+		return -ENOTCONN;
+
+	/*
+	 * note: reference taken by find_target_queue
+	 * After successful fod allocation, the fod will inherit the
+	 * ownership of that reference and will remove the reference
+	 * when the fod is freed.
+	 */
+
+	fod = nvmet_fc_alloc_fcp_iod(queue);
+	if (!fod) {
+		/* release the queue lookup reference */
+		nvmet_fc_tgt_q_put(queue);
+		return -ENOENT;
+	}
+
+	fcpreq->nvmet_fc_private = fod;
+	fod->fcpreq = fcpreq;
+	/*
+	 * put all admin cmds on hw queue id 0. All io commands go to
+	 * the respective hw queue based on a modulo basis
+	 */
+	fcpreq->hwqid = queue->qid ?
+			((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0;
+	memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len);
+
+	queue_work_on(queue->cpu, queue->work_q, &fod->work);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req);
+
+enum {
+	FCT_TRADDR_ERR		= 0,
+	FCT_TRADDR_WWNN		= 1 << 0,
+	FCT_TRADDR_WWPN		= 1 << 1,
+};
+
+struct nvmet_fc_traddr {
+	u64	nn;
+	u64	pn;
+};
+
+static const match_table_t traddr_opt_tokens = {
+	{ FCT_TRADDR_WWNN,	"nn-%s"		},
+	{ FCT_TRADDR_WWPN,	"pn-%s"		},
+	{ FCT_TRADDR_ERR,	NULL		}
+};
+
+static int
+nvmet_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *options, *o, *p;
+	int token, ret = 0;
+	u64 token64;
+
+	options = o = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+
+	while ((p = strsep(&o, ",\n")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, traddr_opt_tokens, args);
+		switch (token) {
+		case FCT_TRADDR_WWNN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			traddr->nn = token64;
+			break;
+		case FCT_TRADDR_WWPN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			traddr->pn = token64;
+			break;
+		default:
+			pr_warn("unknown traddr token or missing value '%s'\n",
+					p);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+out:
+	kfree(options);
+	return ret;
+}
+
+static int
+nvmet_fc_add_port(struct nvmet_port *port)
+{
+	struct nvmet_fc_tgtport *tgtport;
+	struct nvmet_fc_traddr traddr = { 0L, 0L };
+	unsigned long flags;
+	int ret;
+
+	/* validate the address info */
+	if ((port->disc_addr.trtype != NVMF_TRTYPE_FC) ||
+	    (port->disc_addr.adrfam != NVMF_ADDR_FAMILY_FC))
+		return -EINVAL;
+
+	/* map the traddr address info to a target port */
+
+	ret = nvmet_fc_parse_traddr(&traddr, port->disc_addr.traddr);
+	if (ret)
+		return ret;
+
+	ret = -ENXIO;
+	spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+	list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
+		if ((tgtport->fc_target_port.node_name == traddr.nn) &&
+		    (tgtport->fc_target_port.port_name == traddr.pn)) {
+			/* a FC port can only be 1 nvmet port id */
+			if (!tgtport->port) {
+				tgtport->port = port;
+				port->priv = tgtport;
+				ret = 0;
+			} else
+				ret = -EALREADY;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
+	return ret;
+}
+
+static void
+nvmet_fc_remove_port(struct nvmet_port *port)
+{
+	struct nvmet_fc_tgtport *tgtport = port->priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+	if (tgtport->port == port) {
+		nvmet_fc_tgtport_put(tgtport);
+		tgtport->port = NULL;
+	}
+	spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
+}
+
+static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {
+	.owner			= THIS_MODULE,
+	.type			= NVMF_TRTYPE_FC,
+	.msdbd			= 1,
+	.add_port		= nvmet_fc_add_port,
+	.remove_port		= nvmet_fc_remove_port,
+	.queue_response		= nvmet_fc_fcp_nvme_cmd_done,
+	.delete_ctrl		= nvmet_fc_delete_ctrl,
+};
+
+static int __init nvmet_fc_init_module(void)
+{
+	return nvmet_register_transport(&nvmet_fc_tgt_fcp_ops);
+}
+
+static void __exit nvmet_fc_exit_module(void)
+{
+	/* sanity check - all lports should be removed */
+	if (!list_empty(&nvmet_fc_target_list))
+		pr_warn("%s: targetport list not empty\n", __func__);
+
+	nvmet_unregister_transport(&nvmet_fc_tgt_fcp_ops);
+
+	ida_destroy(&nvmet_fc_tgtport_cnt);
+}
+
+module_init(nvmet_fc_init_module);
+module_exit(nvmet_fc_exit_module);
+
+MODULE_LICENSE("GPL v2");

From 475d0fe795516a9b9f286a851c3972fd8831c643 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 2 Dec 2016 00:28:44 -0800
Subject: [PATCH 173/182] nvme-fabrics: Add FC LLDD loopback driver to test
 FC-NVME

Add FC LLDD loopback driver to test FC host and target transport within
nvme-fabrics

To aid in the development and testing of the lower-level api of the FC
transport, this loopback driver has been created to act as if it were a
FC hba driver supporting both the host interfaces as well as the target
interfaces with the nvme FC transport.

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Jay Freyensee <james_p_freyensee@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 MAINTAINERS                  |    1 +
 drivers/nvme/target/Kconfig  |   13 +
 drivers/nvme/target/Makefile |    2 +
 drivers/nvme/target/fcloop.c | 1148 ++++++++++++++++++++++++++++++++++
 4 files changed, 1164 insertions(+)
 create mode 100644 drivers/nvme/target/fcloop.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1c2c9f96dd5639..0bbc0b0baf82ab 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8667,6 +8667,7 @@ F:	include/linux/nvme-fc.h
 F:	include/linux/nvme-fc-driver.h
 F:	drivers/nvme/host/fc.c
 F:	drivers/nvme/target/fc.c
+F:	drivers/nvme/target/fcloop.c
 
 NVMEM FRAMEWORK
 M:	Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 746a63e4d54aaa..03e4ab65fe777a 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -45,3 +45,16 @@ config NVME_TARGET_FC
 
 	  If unsure, say N.
 
+config NVME_TARGET_FCLOOP
+	tristate "NVMe over Fabrics FC Transport Loopback Test driver"
+	depends on NVME_TARGET
+	select NVME_CORE
+	select NVME_FABRICS
+	select SG_POOL
+	depends on NVME_FC
+	depends on NVME_TARGET_FC
+	help
+	  This enables the NVMe FC loopback test support, which can be useful
+	  to test NVMe-FC transport interfaces.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 80b128b9ac9f00..fecc14f535b23e 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -3,9 +3,11 @@ obj-$(CONFIG_NVME_TARGET)		+= nvmet.o
 obj-$(CONFIG_NVME_TARGET_LOOP)		+= nvme-loop.o
 obj-$(CONFIG_NVME_TARGET_RDMA)		+= nvmet-rdma.o
 obj-$(CONFIG_NVME_TARGET_FC)		+= nvmet-fc.o
+obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \
 			discovery.o
 nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
+nvme-fcloop-y	+= fcloop.o
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
new file mode 100644
index 00000000000000..bcb8ebeb01c5d2
--- /dev/null
+++ b/drivers/nvme/target/fcloop.c
@@ -0,0 +1,1148 @@
+/*
+ * Copyright (c) 2016 Avago Technologies.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful.
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
+ * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
+ * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
+ * See the GNU General Public License for more details, a copy of which
+ * can be found in the file COPYING included with this package
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <uapi/scsi/fc/fc_fs.h>
+
+#include "../host/nvme.h"
+#include "../target/nvmet.h"
+#include <linux/nvme-fc-driver.h>
+#include <linux/nvme-fc.h>
+
+
+enum {
+	NVMF_OPT_ERR		= 0,
+	NVMF_OPT_WWNN		= 1 << 0,
+	NVMF_OPT_WWPN		= 1 << 1,
+	NVMF_OPT_ROLES		= 1 << 2,
+	NVMF_OPT_FCADDR		= 1 << 3,
+	NVMF_OPT_LPWWNN		= 1 << 4,
+	NVMF_OPT_LPWWPN		= 1 << 5,
+};
+
+struct fcloop_ctrl_options {
+	int			mask;
+	u64			wwnn;
+	u64			wwpn;
+	u32			roles;
+	u32			fcaddr;
+	u64			lpwwnn;
+	u64			lpwwpn;
+};
+
+static const match_table_t opt_tokens = {
+	{ NVMF_OPT_WWNN,	"wwnn=%s"	},
+	{ NVMF_OPT_WWPN,	"wwpn=%s"	},
+	{ NVMF_OPT_ROLES,	"roles=%d"	},
+	{ NVMF_OPT_FCADDR,	"fcaddr=%x"	},
+	{ NVMF_OPT_LPWWNN,	"lpwwnn=%s"	},
+	{ NVMF_OPT_LPWWPN,	"lpwwpn=%s"	},
+	{ NVMF_OPT_ERR,		NULL		}
+};
+
+static int
+fcloop_parse_options(struct fcloop_ctrl_options *opts,
+		const char *buf)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *options, *o, *p;
+	int token, ret = 0;
+	u64 token64;
+
+	options = o = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+
+	while ((p = strsep(&o, ",\n")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, opt_tokens, args);
+		opts->mask |= token;
+		switch (token) {
+		case NVMF_OPT_WWNN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out_free_options;
+			}
+			opts->wwnn = token64;
+			break;
+		case NVMF_OPT_WWPN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out_free_options;
+			}
+			opts->wwpn = token64;
+			break;
+		case NVMF_OPT_ROLES:
+			if (match_int(args, &token)) {
+				ret = -EINVAL;
+				goto out_free_options;
+			}
+			opts->roles = token;
+			break;
+		case NVMF_OPT_FCADDR:
+			if (match_hex(args, &token)) {
+				ret = -EINVAL;
+				goto out_free_options;
+			}
+			opts->fcaddr = token;
+			break;
+		case NVMF_OPT_LPWWNN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out_free_options;
+			}
+			opts->lpwwnn = token64;
+			break;
+		case NVMF_OPT_LPWWPN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out_free_options;
+			}
+			opts->lpwwpn = token64;
+			break;
+		default:
+			pr_warn("unknown parameter or missing value '%s'\n", p);
+			ret = -EINVAL;
+			goto out_free_options;
+		}
+	}
+
+out_free_options:
+	kfree(options);
+	return ret;
+}
+
+
+static int
+fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname,
+		const char *buf)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *options, *o, *p;
+	int token, ret = 0;
+	u64 token64;
+
+	*nname = -1;
+	*pname = -1;
+
+	options = o = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+
+	while ((p = strsep(&o, ",\n")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, opt_tokens, args);
+		switch (token) {
+		case NVMF_OPT_WWNN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out_free_options;
+			}
+			*nname = token64;
+			break;
+		case NVMF_OPT_WWPN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out_free_options;
+			}
+			*pname = token64;
+			break;
+		default:
+			pr_warn("unknown parameter or missing value '%s'\n", p);
+			ret = -EINVAL;
+			goto out_free_options;
+		}
+	}
+
+out_free_options:
+	kfree(options);
+
+	if (!ret) {
+		if (*nname == -1)
+			return -EINVAL;
+		if (*pname == -1)
+			return -EINVAL;
+	}
+
+	return ret;
+}
+
+
+#define LPORT_OPTS	(NVMF_OPT_WWNN | NVMF_OPT_WWPN)
+
+#define RPORT_OPTS	(NVMF_OPT_WWNN | NVMF_OPT_WWPN |  \
+			 NVMF_OPT_LPWWNN | NVMF_OPT_LPWWPN)
+
+#define TGTPORT_OPTS	(NVMF_OPT_WWNN | NVMF_OPT_WWPN)
+
+#define ALL_OPTS	(NVMF_OPT_WWNN | NVMF_OPT_WWPN | NVMF_OPT_ROLES | \
+			 NVMF_OPT_FCADDR | NVMF_OPT_LPWWNN | NVMF_OPT_LPWWPN)
+
+
+static DEFINE_SPINLOCK(fcloop_lock);
+static LIST_HEAD(fcloop_lports);
+static LIST_HEAD(fcloop_nports);
+
+struct fcloop_lport {
+	struct nvme_fc_local_port *localport;
+	struct list_head lport_list;
+	struct completion unreg_done;
+};
+
+struct fcloop_rport {
+	struct nvme_fc_remote_port *remoteport;
+	struct nvmet_fc_target_port *targetport;
+	struct fcloop_nport *nport;
+	struct fcloop_lport *lport;
+};
+
+struct fcloop_tport {
+	struct nvmet_fc_target_port *targetport;
+	struct nvme_fc_remote_port *remoteport;
+	struct fcloop_nport *nport;
+	struct fcloop_lport *lport;
+};
+
+struct fcloop_nport {
+	struct fcloop_rport *rport;
+	struct fcloop_tport *tport;
+	struct fcloop_lport *lport;
+	struct list_head nport_list;
+	struct kref ref;
+	struct completion rport_unreg_done;
+	struct completion tport_unreg_done;
+	u64 node_name;
+	u64 port_name;
+	u32 port_role;
+	u32 port_id;
+};
+
+struct fcloop_lsreq {
+	struct fcloop_tport		*tport;
+	struct nvmefc_ls_req		*lsreq;
+	struct work_struct		work;
+	struct nvmefc_tgt_ls_req	tgt_ls_req;
+	int				status;
+};
+
+struct fcloop_fcpreq {
+	struct fcloop_tport		*tport;
+	struct nvmefc_fcp_req		*fcpreq;
+	u16				status;
+	struct work_struct		work;
+	struct nvmefc_tgt_fcp_req	tgt_fcp_req;
+};
+
+
+static inline struct fcloop_lsreq *
+tgt_ls_req_to_lsreq(struct nvmefc_tgt_ls_req *tgt_lsreq)
+{
+	return container_of(tgt_lsreq, struct fcloop_lsreq, tgt_ls_req);
+}
+
+static inline struct fcloop_fcpreq *
+tgt_fcp_req_to_fcpreq(struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+	return container_of(tgt_fcpreq, struct fcloop_fcpreq, tgt_fcp_req);
+}
+
+
+static int
+fcloop_create_queue(struct nvme_fc_local_port *localport,
+			unsigned int qidx, u16 qsize,
+			void **handle)
+{
+	*handle = localport;
+	return 0;
+}
+
+static void
+fcloop_delete_queue(struct nvme_fc_local_port *localport,
+			unsigned int idx, void *handle)
+{
+}
+
+
+/*
+ * Transmit of LS RSP done (e.g. buffers all set). call back up
+ * initiator "done" flows.
+ */
+static void
+fcloop_tgt_lsrqst_done_work(struct work_struct *work)
+{
+	struct fcloop_lsreq *tls_req =
+		container_of(work, struct fcloop_lsreq, work);
+	struct fcloop_tport *tport = tls_req->tport;
+	struct nvmefc_ls_req *lsreq = tls_req->lsreq;
+
+	if (tport->remoteport)
+		lsreq->done(lsreq, tls_req->status);
+}
+
+static int
+fcloop_ls_req(struct nvme_fc_local_port *localport,
+			struct nvme_fc_remote_port *remoteport,
+			struct nvmefc_ls_req *lsreq)
+{
+	struct fcloop_lsreq *tls_req = lsreq->private;
+	struct fcloop_rport *rport = remoteport->private;
+	int ret = 0;
+
+	tls_req->lsreq = lsreq;
+	INIT_WORK(&tls_req->work, fcloop_tgt_lsrqst_done_work);
+
+	if (!rport->targetport) {
+		tls_req->status = -ECONNREFUSED;
+		schedule_work(&tls_req->work);
+		return ret;
+	}
+
+	tls_req->status = 0;
+	tls_req->tport = rport->targetport->private;
+	ret = nvmet_fc_rcv_ls_req(rport->targetport, &tls_req->tgt_ls_req,
+				 lsreq->rqstaddr, lsreq->rqstlen);
+
+	return ret;
+}
+
+static int
+fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
+			struct nvmefc_tgt_ls_req *tgt_lsreq)
+{
+	struct fcloop_lsreq *tls_req = tgt_ls_req_to_lsreq(tgt_lsreq);
+	struct nvmefc_ls_req *lsreq = tls_req->lsreq;
+
+	memcpy(lsreq->rspaddr, tgt_lsreq->rspbuf,
+		((lsreq->rsplen < tgt_lsreq->rsplen) ?
+				lsreq->rsplen : tgt_lsreq->rsplen));
+	tgt_lsreq->done(tgt_lsreq);
+
+	schedule_work(&tls_req->work);
+
+	return 0;
+}
+
+/*
+ * FCP IO operation done. call back up initiator "done" flows.
+ */
+static void
+fcloop_tgt_fcprqst_done_work(struct work_struct *work)
+{
+	struct fcloop_fcpreq *tfcp_req =
+		container_of(work, struct fcloop_fcpreq, work);
+	struct fcloop_tport *tport = tfcp_req->tport;
+	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+
+	if (tport->remoteport) {
+		fcpreq->status = tfcp_req->status;
+		fcpreq->done(fcpreq);
+	}
+}
+
+
+static int
+fcloop_fcp_req(struct nvme_fc_local_port *localport,
+			struct nvme_fc_remote_port *remoteport,
+			void *hw_queue_handle,
+			struct nvmefc_fcp_req *fcpreq)
+{
+	struct fcloop_fcpreq *tfcp_req = fcpreq->private;
+	struct fcloop_rport *rport = remoteport->private;
+	int ret = 0;
+
+	INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+
+	if (!rport->targetport) {
+		tfcp_req->status = NVME_SC_FC_TRANSPORT_ERROR;
+		schedule_work(&tfcp_req->work);
+		return ret;
+	}
+
+	tfcp_req->fcpreq = fcpreq;
+	tfcp_req->tport = rport->targetport->private;
+
+	ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
+				 fcpreq->cmdaddr, fcpreq->cmdlen);
+
+	return ret;
+}
+
+static void
+fcloop_fcp_copy_data(u8 op, struct scatterlist *data_sg,
+			struct scatterlist *io_sg, u32 offset, u32 length)
+{
+	void *data_p, *io_p;
+	u32 data_len, io_len, tlen;
+
+	io_p = sg_virt(io_sg);
+	io_len = io_sg->length;
+
+	for ( ; offset; ) {
+		tlen = min_t(u32, offset, io_len);
+		offset -= tlen;
+		io_len -= tlen;
+		if (!io_len) {
+			io_sg = sg_next(io_sg);
+			io_p = sg_virt(io_sg);
+			io_len = io_sg->length;
+		} else
+			io_p += tlen;
+	}
+
+	data_p = sg_virt(data_sg);
+	data_len = data_sg->length;
+
+	for ( ; length; ) {
+		tlen = min_t(u32, io_len, data_len);
+		tlen = min_t(u32, tlen, length);
+
+		if (op == NVMET_FCOP_WRITEDATA)
+			memcpy(data_p, io_p, tlen);
+		else
+			memcpy(io_p, data_p, tlen);
+
+		length -= tlen;
+
+		io_len -= tlen;
+		if ((!io_len) && (length)) {
+			io_sg = sg_next(io_sg);
+			io_p = sg_virt(io_sg);
+			io_len = io_sg->length;
+		} else
+			io_p += tlen;
+
+		data_len -= tlen;
+		if ((!data_len) && (length)) {
+			data_sg = sg_next(data_sg);
+			data_p = sg_virt(data_sg);
+			data_len = data_sg->length;
+		} else
+			data_p += tlen;
+	}
+}
+
+static int
+fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
+			struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+	struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+	u32 rsplen = 0, xfrlen = 0;
+	int fcp_err = 0;
+	u8 op = tgt_fcpreq->op;
+
+	switch (op) {
+	case NVMET_FCOP_WRITEDATA:
+		xfrlen = tgt_fcpreq->transfer_length;
+		fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
+					tgt_fcpreq->offset, xfrlen);
+		fcpreq->transferred_length += xfrlen;
+		break;
+
+	case NVMET_FCOP_READDATA:
+	case NVMET_FCOP_READDATA_RSP:
+		xfrlen = tgt_fcpreq->transfer_length;
+		fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
+					tgt_fcpreq->offset, xfrlen);
+		fcpreq->transferred_length += xfrlen;
+		if (op == NVMET_FCOP_READDATA)
+			break;
+
+		/* Fall-Thru to RSP handling */
+
+	case NVMET_FCOP_RSP:
+		rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
+				fcpreq->rsplen : tgt_fcpreq->rsplen);
+		memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
+		if (rsplen < tgt_fcpreq->rsplen)
+			fcp_err = -E2BIG;
+		fcpreq->rcv_rsplen = rsplen;
+		fcpreq->status = 0;
+		tfcp_req->status = 0;
+		break;
+
+	case NVMET_FCOP_ABORT:
+		tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
+		break;
+
+	default:
+		fcp_err = -EINVAL;
+		break;
+	}
+
+	tgt_fcpreq->transferred_length = xfrlen;
+	tgt_fcpreq->fcp_error = fcp_err;
+	tgt_fcpreq->done(tgt_fcpreq);
+
+	if ((!fcp_err) && (op == NVMET_FCOP_RSP ||
+			op == NVMET_FCOP_READDATA_RSP ||
+			op == NVMET_FCOP_ABORT))
+		schedule_work(&tfcp_req->work);
+
+	return 0;
+}
+
+static void
+fcloop_ls_abort(struct nvme_fc_local_port *localport,
+			struct nvme_fc_remote_port *remoteport,
+				struct nvmefc_ls_req *lsreq)
+{
+}
+
+static void
+fcloop_fcp_abort(struct nvme_fc_local_port *localport,
+			struct nvme_fc_remote_port *remoteport,
+			void *hw_queue_handle,
+			struct nvmefc_fcp_req *fcpreq)
+{
+}
+
+static void
+fcloop_localport_delete(struct nvme_fc_local_port *localport)
+{
+	struct fcloop_lport *lport = localport->private;
+
+	/* release any threads waiting for the unreg to complete */
+	complete(&lport->unreg_done);
+}
+
+static void
+fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport)
+{
+	struct fcloop_rport *rport = remoteport->private;
+
+	/* release any threads waiting for the unreg to complete */
+	complete(&rport->nport->rport_unreg_done);
+}
+
+static void
+fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
+{
+	struct fcloop_tport *tport = targetport->private;
+
+	/* release any threads waiting for the unreg to complete */
+	complete(&tport->nport->tport_unreg_done);
+}
+
+#define	FCLOOP_HW_QUEUES		4
+#define	FCLOOP_SGL_SEGS			256
+#define FCLOOP_DMABOUND_4G		0xFFFFFFFF
+
+struct nvme_fc_port_template fctemplate = {
+	.localport_delete	= fcloop_localport_delete,
+	.remoteport_delete	= fcloop_remoteport_delete,
+	.create_queue		= fcloop_create_queue,
+	.delete_queue		= fcloop_delete_queue,
+	.ls_req			= fcloop_ls_req,
+	.fcp_io			= fcloop_fcp_req,
+	.ls_abort		= fcloop_ls_abort,
+	.fcp_abort		= fcloop_fcp_abort,
+	.max_hw_queues		= FCLOOP_HW_QUEUES,
+	.max_sgl_segments	= FCLOOP_SGL_SEGS,
+	.max_dif_sgl_segments	= FCLOOP_SGL_SEGS,
+	.dma_boundary		= FCLOOP_DMABOUND_4G,
+	/* sizes of additional private data for data structures */
+	.local_priv_sz		= sizeof(struct fcloop_lport),
+	.remote_priv_sz		= sizeof(struct fcloop_rport),
+	.lsrqst_priv_sz		= sizeof(struct fcloop_lsreq),
+	.fcprqst_priv_sz	= sizeof(struct fcloop_fcpreq),
+};
+
+struct nvmet_fc_target_template tgttemplate = {
+	.targetport_delete	= fcloop_targetport_delete,
+	.xmt_ls_rsp		= fcloop_xmt_ls_rsp,
+	.fcp_op			= fcloop_fcp_op,
+	.max_hw_queues		= FCLOOP_HW_QUEUES,
+	.max_sgl_segments	= FCLOOP_SGL_SEGS,
+	.max_dif_sgl_segments	= FCLOOP_SGL_SEGS,
+	.dma_boundary		= FCLOOP_DMABOUND_4G,
+	/* optional features */
+	.target_features	= NVMET_FCTGTFEAT_READDATA_RSP |
+				  NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED,
+	/* sizes of additional private data for data structures */
+	.target_priv_sz		= sizeof(struct fcloop_tport),
+};
+
+static ssize_t
+fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct nvme_fc_port_info pinfo;
+	struct fcloop_ctrl_options *opts;
+	struct nvme_fc_local_port *localport;
+	struct fcloop_lport *lport;
+	int ret;
+
+	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+	if (!opts)
+		return -ENOMEM;
+
+	ret = fcloop_parse_options(opts, buf);
+	if (ret)
+		goto out_free_opts;
+
+	/* everything there ? */
+	if ((opts->mask & LPORT_OPTS) != LPORT_OPTS) {
+		ret = -EINVAL;
+		goto out_free_opts;
+	}
+
+	pinfo.node_name = opts->wwnn;
+	pinfo.port_name = opts->wwpn;
+	pinfo.port_role = opts->roles;
+	pinfo.port_id = opts->fcaddr;
+
+	ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
+	if (!ret) {
+		unsigned long flags;
+
+		/* success */
+		lport = localport->private;
+		lport->localport = localport;
+		INIT_LIST_HEAD(&lport->lport_list);
+
+		spin_lock_irqsave(&fcloop_lock, flags);
+		list_add_tail(&lport->lport_list, &fcloop_lports);
+		spin_unlock_irqrestore(&fcloop_lock, flags);
+
+		/* mark all of the input buffer consumed */
+		ret = count;
+	}
+
+out_free_opts:
+	kfree(opts);
+	return ret ? ret : count;
+}
+
+
+static void
+__unlink_local_port(struct fcloop_lport *lport)
+{
+	list_del(&lport->lport_list);
+}
+
+static int
+__wait_localport_unreg(struct fcloop_lport *lport)
+{
+	int ret;
+
+	init_completion(&lport->unreg_done);
+
+	ret = nvme_fc_unregister_localport(lport->localport);
+
+	wait_for_completion(&lport->unreg_done);
+
+	return ret;
+}
+
+
+static ssize_t
+fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct fcloop_lport *tlport, *lport = NULL;
+	u64 nodename, portname;
+	unsigned long flags;
+	int ret;
+
+	ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&fcloop_lock, flags);
+
+	list_for_each_entry(tlport, &fcloop_lports, lport_list) {
+		if (tlport->localport->node_name == nodename &&
+		    tlport->localport->port_name == portname) {
+			lport = tlport;
+			__unlink_local_port(lport);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&fcloop_lock, flags);
+
+	if (!lport)
+		return -ENOENT;
+
+	ret = __wait_localport_unreg(lport);
+
+	return ret ? ret : count;
+}
+
+static void
+fcloop_nport_free(struct kref *ref)
+{
+	struct fcloop_nport *nport =
+		container_of(ref, struct fcloop_nport, ref);
+	unsigned long flags;
+
+	spin_lock_irqsave(&fcloop_lock, flags);
+	list_del(&nport->nport_list);
+	spin_unlock_irqrestore(&fcloop_lock, flags);
+
+	kfree(nport);
+}
+
+static void
+fcloop_nport_put(struct fcloop_nport *nport)
+{
+	kref_put(&nport->ref, fcloop_nport_free);
+}
+
+static int
+fcloop_nport_get(struct fcloop_nport *nport)
+{
+	return kref_get_unless_zero(&nport->ref);
+}
+
+static struct fcloop_nport *
+fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
+{
+	struct fcloop_nport *newnport, *nport = NULL;
+	struct fcloop_lport *tmplport, *lport = NULL;
+	struct fcloop_ctrl_options *opts;
+	unsigned long flags;
+	u32 opts_mask = (remoteport) ? RPORT_OPTS : TGTPORT_OPTS;
+	int ret;
+
+	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+	if (!opts)
+		return NULL;
+
+	ret = fcloop_parse_options(opts, buf);
+	if (ret)
+		goto out_free_opts;
+
+	/* everything there ? */
+	if ((opts->mask & opts_mask) != opts_mask) {
+		ret = -EINVAL;
+		goto out_free_opts;
+	}
+
+	newnport = kzalloc(sizeof(*newnport), GFP_KERNEL);
+	if (!newnport)
+		goto out_free_opts;
+
+	INIT_LIST_HEAD(&newnport->nport_list);
+	newnport->node_name = opts->wwnn;
+	newnport->port_name = opts->wwpn;
+	if (opts->mask & NVMF_OPT_ROLES)
+		newnport->port_role = opts->roles;
+	if (opts->mask & NVMF_OPT_FCADDR)
+		newnport->port_id = opts->fcaddr;
+	kref_init(&newnport->ref);
+
+	spin_lock_irqsave(&fcloop_lock, flags);
+
+	list_for_each_entry(tmplport, &fcloop_lports, lport_list) {
+		if (tmplport->localport->node_name == opts->wwnn &&
+		    tmplport->localport->port_name == opts->wwpn)
+			goto out_invalid_opts;
+
+		if (tmplport->localport->node_name == opts->lpwwnn &&
+		    tmplport->localport->port_name == opts->lpwwpn)
+			lport = tmplport;
+	}
+
+	if (remoteport) {
+		if (!lport)
+			goto out_invalid_opts;
+		newnport->lport = lport;
+	}
+
+	list_for_each_entry(nport, &fcloop_nports, nport_list) {
+		if (nport->node_name == opts->wwnn &&
+		    nport->port_name == opts->wwpn) {
+			if ((remoteport && nport->rport) ||
+			    (!remoteport && nport->tport)) {
+				nport = NULL;
+				goto out_invalid_opts;
+			}
+
+			fcloop_nport_get(nport);
+
+			spin_unlock_irqrestore(&fcloop_lock, flags);
+
+			if (remoteport)
+				nport->lport = lport;
+			if (opts->mask & NVMF_OPT_ROLES)
+				nport->port_role = opts->roles;
+			if (opts->mask & NVMF_OPT_FCADDR)
+				nport->port_id = opts->fcaddr;
+			goto out_free_newnport;
+		}
+	}
+
+	list_add_tail(&newnport->nport_list, &fcloop_nports);
+
+	spin_unlock_irqrestore(&fcloop_lock, flags);
+
+	kfree(opts);
+	return newnport;
+
+out_invalid_opts:
+	spin_unlock_irqrestore(&fcloop_lock, flags);
+out_free_newnport:
+	kfree(newnport);
+out_free_opts:
+	kfree(opts);
+	return nport;
+}
+
+static ssize_t
+fcloop_create_remote_port(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct nvme_fc_remote_port *remoteport;
+	struct fcloop_nport *nport;
+	struct fcloop_rport *rport;
+	struct nvme_fc_port_info pinfo;
+	int ret;
+
+	nport = fcloop_alloc_nport(buf, count, true);
+	if (!nport)
+		return -EIO;
+
+	pinfo.node_name = nport->node_name;
+	pinfo.port_name = nport->port_name;
+	pinfo.port_role = nport->port_role;
+	pinfo.port_id = nport->port_id;
+
+	ret = nvme_fc_register_remoteport(nport->lport->localport,
+						&pinfo, &remoteport);
+	if (ret || !remoteport) {
+		fcloop_nport_put(nport);
+		return ret;
+	}
+
+	/* success */
+	rport = remoteport->private;
+	rport->remoteport = remoteport;
+	rport->targetport = (nport->tport) ?  nport->tport->targetport : NULL;
+	if (nport->tport) {
+		nport->tport->remoteport = remoteport;
+		nport->tport->lport = nport->lport;
+	}
+	rport->nport = nport;
+	rport->lport = nport->lport;
+	nport->rport = rport;
+
+	return ret ? ret : count;
+}
+
+
+static struct fcloop_rport *
+__unlink_remote_port(struct fcloop_nport *nport)
+{
+	struct fcloop_rport *rport = nport->rport;
+
+	if (rport && nport->tport)
+		nport->tport->remoteport = NULL;
+	nport->rport = NULL;
+
+	return rport;
+}
+
+static int
+__wait_remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport)
+{
+	int ret;
+
+	if (!rport)
+		return -EALREADY;
+
+	init_completion(&nport->rport_unreg_done);
+
+	ret = nvme_fc_unregister_remoteport(rport->remoteport);
+	if (ret)
+		return ret;
+
+	wait_for_completion(&nport->rport_unreg_done);
+
+	fcloop_nport_put(nport);
+
+	return ret;
+}
+
+static ssize_t
+fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct fcloop_nport *nport = NULL, *tmpport;
+	static struct fcloop_rport *rport;
+	u64 nodename, portname;
+	unsigned long flags;
+	int ret;
+
+	ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&fcloop_lock, flags);
+
+	list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
+		if (tmpport->node_name == nodename &&
+		    tmpport->port_name == portname && tmpport->rport) {
+			nport = tmpport;
+			rport = __unlink_remote_port(nport);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&fcloop_lock, flags);
+
+	if (!nport)
+		return -ENOENT;
+
+	ret = __wait_remoteport_unreg(nport, rport);
+
+	return ret ? ret : count;
+}
+
+static ssize_t
+fcloop_create_target_port(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct nvmet_fc_target_port *targetport;
+	struct fcloop_nport *nport;
+	struct fcloop_tport *tport;
+	struct nvmet_fc_port_info tinfo;
+	int ret;
+
+	nport = fcloop_alloc_nport(buf, count, false);
+	if (!nport)
+		return -EIO;
+
+	tinfo.node_name = nport->node_name;
+	tinfo.port_name = nport->port_name;
+	tinfo.port_id = nport->port_id;
+
+	ret = nvmet_fc_register_targetport(&tinfo, &tgttemplate, NULL,
+						&targetport);
+	if (ret) {
+		fcloop_nport_put(nport);
+		return ret;
+	}
+
+	/* success */
+	tport = targetport->private;
+	tport->targetport = targetport;
+	tport->remoteport = (nport->rport) ?  nport->rport->remoteport : NULL;
+	if (nport->rport)
+		nport->rport->targetport = targetport;
+	tport->nport = nport;
+	tport->lport = nport->lport;
+	nport->tport = tport;
+
+	return ret ? ret : count;
+}
+
+
+static struct fcloop_tport *
+__unlink_target_port(struct fcloop_nport *nport)
+{
+	struct fcloop_tport *tport = nport->tport;
+
+	if (tport && nport->rport)
+		nport->rport->targetport = NULL;
+	nport->tport = NULL;
+
+	return tport;
+}
+
+static int
+__wait_targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport)
+{
+	int ret;
+
+	if (!tport)
+		return -EALREADY;
+
+	init_completion(&nport->tport_unreg_done);
+
+	ret = nvmet_fc_unregister_targetport(tport->targetport);
+	if (ret)
+		return ret;
+
+	wait_for_completion(&nport->tport_unreg_done);
+
+	fcloop_nport_put(nport);
+
+	return ret;
+}
+
+static ssize_t
+fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct fcloop_nport *nport = NULL, *tmpport;
+	struct fcloop_tport *tport;
+	u64 nodename, portname;
+	unsigned long flags;
+	int ret;
+
+	ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&fcloop_lock, flags);
+
+	list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
+		if (tmpport->node_name == nodename &&
+		    tmpport->port_name == portname && tmpport->tport) {
+			nport = tmpport;
+			tport = __unlink_target_port(nport);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&fcloop_lock, flags);
+
+	if (!nport)
+		return -ENOENT;
+
+	ret = __wait_targetport_unreg(nport, tport);
+
+	return ret ? ret : count;
+}
+
+
+static DEVICE_ATTR(add_local_port, 0200, NULL, fcloop_create_local_port);
+static DEVICE_ATTR(del_local_port, 0200, NULL, fcloop_delete_local_port);
+static DEVICE_ATTR(add_remote_port, 0200, NULL, fcloop_create_remote_port);
+static DEVICE_ATTR(del_remote_port, 0200, NULL, fcloop_delete_remote_port);
+static DEVICE_ATTR(add_target_port, 0200, NULL, fcloop_create_target_port);
+static DEVICE_ATTR(del_target_port, 0200, NULL, fcloop_delete_target_port);
+
+static struct attribute *fcloop_dev_attrs[] = {
+	&dev_attr_add_local_port.attr,
+	&dev_attr_del_local_port.attr,
+	&dev_attr_add_remote_port.attr,
+	&dev_attr_del_remote_port.attr,
+	&dev_attr_add_target_port.attr,
+	&dev_attr_del_target_port.attr,
+	NULL
+};
+
+static struct attribute_group fclopp_dev_attrs_group = {
+	.attrs		= fcloop_dev_attrs,
+};
+
+static const struct attribute_group *fcloop_dev_attr_groups[] = {
+	&fclopp_dev_attrs_group,
+	NULL,
+};
+
+static struct class *fcloop_class;
+static struct device *fcloop_device;
+
+
+static int __init fcloop_init(void)
+{
+	int ret;
+
+	fcloop_class = class_create(THIS_MODULE, "fcloop");
+	if (IS_ERR(fcloop_class)) {
+		pr_err("couldn't register class fcloop\n");
+		ret = PTR_ERR(fcloop_class);
+		return ret;
+	}
+
+	fcloop_device = device_create_with_groups(
+				fcloop_class, NULL, MKDEV(0, 0), NULL,
+				fcloop_dev_attr_groups, "ctl");
+	if (IS_ERR(fcloop_device)) {
+		pr_err("couldn't create ctl device!\n");
+		ret = PTR_ERR(fcloop_device);
+		goto out_destroy_class;
+	}
+
+	get_device(fcloop_device);
+
+	return 0;
+
+out_destroy_class:
+	class_destroy(fcloop_class);
+	return ret;
+}
+
+static void __exit fcloop_exit(void)
+{
+	struct fcloop_lport *lport;
+	struct fcloop_nport *nport;
+	struct fcloop_tport *tport;
+	struct fcloop_rport *rport;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&fcloop_lock, flags);
+
+	for (;;) {
+		nport = list_first_entry_or_null(&fcloop_nports,
+						typeof(*nport), nport_list);
+		if (!nport)
+			break;
+
+		tport = __unlink_target_port(nport);
+		rport = __unlink_remote_port(nport);
+
+		spin_unlock_irqrestore(&fcloop_lock, flags);
+
+		ret = __wait_targetport_unreg(nport, tport);
+		if (ret)
+			pr_warn("%s: Failed deleting target port\n", __func__);
+
+		ret = __wait_remoteport_unreg(nport, rport);
+		if (ret)
+			pr_warn("%s: Failed deleting remote port\n", __func__);
+
+		spin_lock_irqsave(&fcloop_lock, flags);
+	}
+
+	for (;;) {
+		lport = list_first_entry_or_null(&fcloop_lports,
+						typeof(*lport), lport_list);
+		if (!lport)
+			break;
+
+		__unlink_local_port(lport);
+
+		spin_unlock_irqrestore(&fcloop_lock, flags);
+
+		ret = __wait_localport_unreg(lport);
+		if (ret)
+			pr_warn("%s: Failed deleting local port\n", __func__);
+
+		spin_lock_irqsave(&fcloop_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&fcloop_lock, flags);
+
+	put_device(fcloop_device);
+
+	device_destroy(fcloop_class, MKDEV(0, 0));
+	class_destroy(fcloop_class);
+}
+
+module_init(fcloop_init);
+module_exit(fcloop_exit);
+
+MODULE_LICENSE("GPL v2");

From 20032ec38d1614f29c5a8e6cfc5b5326685fa99b Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 8 Dec 2016 09:52:35 -0500
Subject: [PATCH 174/182] nbd: reset the setup task for NBD_CLEAR_SOCK

If an app exits before running NBD_DO_IT but after adding sockets we can
end up not being allowed to do a new nbd device.  Fix this by making
NBD_CLEAR_SOCK reset the setup_task.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index bc78cbb2d18aff..84ea5c2251e25e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -704,6 +704,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 			kfree(nbd->socks);
 			nbd->socks = NULL;
 			nbd->num_connections = 0;
+			nbd->task_setup = NULL;
 		}
 		return 0;
 

From a897b6664e9565c980f76af1c123cb0597d79822 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 5 Dec 2016 16:20:29 -0500
Subject: [PATCH 175/182] nbd: use dev_err_ratelimited in io path

While doing stress tests we noticed that we'd get a lot of dmesg spam if
we suddenly disconnected the nbd device out of band.  Rate limit the
messages in the io path in order to deal with this.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 84ea5c2251e25e..99c84468f15437 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -219,7 +219,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, void *buf,
 	unsigned long pflags = current->flags;
 
 	if (unlikely(!sock)) {
-		dev_err(disk_to_dev(nbd->disk),
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
 			"Attempted %s on closed socket in sock_xmit\n",
 			(send ? "send" : "recv"));
 		return -EINVAL;
@@ -302,7 +302,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 	result = sock_xmit(nbd, index, 1, &request, sizeof(request),
 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
-		dev_err(disk_to_dev(nbd->disk),
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
 			"Send control failed (result %d)\n", result);
 		return -EIO;
 	}
@@ -501,14 +501,14 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 	struct nbd_sock *nsock;
 
 	if (index >= nbd->num_connections) {
-		dev_err(disk_to_dev(nbd->disk),
-			"Attempted send on invalid socket\n");
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Attempted send on invalid socket\n");
 		goto error_out;
 	}
 
 	if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
-		dev_err(disk_to_dev(nbd->disk),
-			"Attempted send on closed socket\n");
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Attempted send on closed socket\n");
 		goto error_out;
 	}
 
@@ -519,8 +519,8 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 	if (req->cmd_type == REQ_TYPE_FS &&
 	    rq_data_dir(req) == WRITE &&
 	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
-		dev_err(disk_to_dev(nbd->disk),
-			"Write on read-only\n");
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Write on read-only\n");
 		goto error_out;
 	}
 
@@ -530,13 +530,14 @@ static void nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 	mutex_lock(&nsock->tx_lock);
 	if (unlikely(!nsock->sock)) {
 		mutex_unlock(&nsock->tx_lock);
-		dev_err(disk_to_dev(nbd->disk),
-			"Attempted send on closed socket\n");
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Attempted send on closed socket\n");
 		goto error_out;
 	}
 
 	if (nbd_send_cmd(nbd, cmd, index) != 0) {
-		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Request send failed\n");
 		req->errors++;
 		nbd_end_request(cmd);
 	}

From be07e14f96e3121483339a64d917fddb3b86ba98 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Dec 2016 14:19:06 +0100
Subject: [PATCH 176/182] blk-wbt: don't throttle discard or write zeroes

Both of these are metadata only commands that are not issued by the
writeback code and not directly relevant to the writeback bandwith.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-wbt.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index d500e43da5d9cc..6e82769f4042c2 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -575,10 +575,9 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
 	const int op = bio_op(bio);
 
 	/*
-	 * If not a WRITE (or a discard or write zeroes), do nothing
+	 * If not a WRITE, do nothing
 	 */
-	if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
-				op == REQ_OP_WRITE_ZEROES))
+	if (op != REQ_OP_WRITE)
 		return false;
 
 	/*

From f9d03f96b988002027d4b28ea1b7a24729a4c9b5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 8 Dec 2016 15:20:32 -0700
Subject: [PATCH 177/182] block: improve handling of the magic discard payload

Instead of allocating a single unused biovec for discard requests, send
them down without any payload.  Instead we allow the driver to add a
"special" payload using a biovec embedded into struct request (unioned
over other fields never used while in the driver), and overloading
the number of segments for this case.

This has a couple of advantages:

 - we don't have to allocate the bio_vec
 - the amount of special casing for discard requests in the block
   layer is significantly reduced
 - using this same scheme for other request types is trivial,
   which will be important for implementing the new WRITE_ZEROES
   op on devices where it actually requires a payload (e.g. SCSI)
 - we can get rid of playing games with the request length, as
   we'll never touch it and completions will work just fine
 - it will allow us to support ranged discard operations in the
   future by merging non-contiguous discard bios into a single
   request
 - last but not least it removes a lot of code

This patch is the common base for my WIP series for ranges discards and to
remove discard_zeroes_data in favor of always using REQ_OP_WRITE_ZEROES,
so it would be good to get it in quickly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c                | 10 +------
 block/blk-core.c           | 34 ++----------------------
 block/blk-lib.c            |  2 +-
 block/blk-merge.c          | 53 ++++++++++++--------------------------
 drivers/nvme/host/core.c   | 17 +++---------
 drivers/nvme/host/nvme.h   |  6 +++--
 drivers/nvme/host/pci.c    | 27 +++++++++----------
 drivers/nvme/host/rdma.c   | 13 ++++------
 drivers/nvme/target/loop.c |  4 +--
 drivers/scsi/scsi_lib.c    |  6 ++---
 drivers/scsi/sd.c          | 24 +++++++----------
 include/linux/bio.h        |  3 ++-
 include/linux/blkdev.h     | 15 ++++++++---
 13 files changed, 76 insertions(+), 138 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 83db1f37fd0bfb..2b375020fc49ba 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1840,15 +1840,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
 	BUG_ON(sectors <= 0);
 	BUG_ON(sectors >= bio_sectors(bio));
 
-	/*
-	 * Discards need a mutable bio_vec to accommodate the payload
-	 * required by the DSM TRIM and UNMAP commands.
-	 */
-	if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
-		split = bio_clone_bioset(bio, gfp, bs);
-	else
-		split = bio_clone_fast(bio, gfp, bs);
-
+	split = bio_clone_fast(bio, gfp, bs);
 	if (!split)
 		return NULL;
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 4b7ec5958055a3..bd642a43b98b13 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1475,38 +1475,6 @@ void blk_put_request(struct request *req)
 }
 EXPORT_SYMBOL(blk_put_request);
 
-/**
- * blk_add_request_payload - add a payload to a request
- * @rq: request to update
- * @page: page backing the payload
- * @offset: offset in page
- * @len: length of the payload.
- *
- * This allows to later add a payload to an already submitted request by
- * a block driver.  The driver needs to take care of freeing the payload
- * itself.
- *
- * Note that this is a quite horrible hack and nothing but handling of
- * discard requests should ever use it.
- */
-void blk_add_request_payload(struct request *rq, struct page *page,
-		int offset, unsigned int len)
-{
-	struct bio *bio = rq->bio;
-
-	bio->bi_io_vec->bv_page = page;
-	bio->bi_io_vec->bv_offset = offset;
-	bio->bi_io_vec->bv_len = len;
-
-	bio->bi_iter.bi_size = len;
-	bio->bi_vcnt = 1;
-	bio->bi_phys_segments = 1;
-
-	rq->__data_len = rq->resid_len = len;
-	rq->nr_phys_segments = 1;
-}
-EXPORT_SYMBOL_GPL(blk_add_request_payload);
-
 bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 			    struct bio *bio)
 {
@@ -2642,6 +2610,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		return false;
 	}
 
+	WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD);
+
 	req->__data_len -= total_bytes;
 
 	/* update sector only for requests with clear definition of sector */
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 510a6fb1531822..ed89c8f4b2a049 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -80,7 +80,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			req_sects = end_sect - sector;
 		}
 
-		bio = next_bio(bio, 1, gfp_mask);
+		bio = next_bio(bio, 0, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev = bdev;
 		bio_set_op_attrs(bio, op, 0);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1002afdfee99f8..182398cb152490 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -241,18 +241,13 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	if (!bio)
 		return 0;
 
-	/*
-	 * This should probably be returning 0, but blk_add_request_payload()
-	 * (Christoph!!!!)
-	 */
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_SAME:
 	case REQ_OP_WRITE_ZEROES:
+		return 0;
+	case REQ_OP_WRITE_SAME:
 		return 1;
-	default:
-		break;
 	}
 
 	fbio = bio;
@@ -410,39 +405,21 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 	*bvprv = *bvec;
 }
 
+static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv,
+		struct scatterlist *sglist, struct scatterlist **sg)
+{
+	*sg = sglist;
+	sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
+	return 1;
+}
+
 static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
 			     struct scatterlist *sglist,
 			     struct scatterlist **sg)
 {
 	struct bio_vec bvec, bvprv = { NULL };
 	struct bvec_iter iter;
-	int nsegs, cluster;
-
-	nsegs = 0;
-	cluster = blk_queue_cluster(q);
-
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		/*
-		 * This is a hack - drivers should be neither modifying the
-		 * biovec, nor relying on bi_vcnt - but because of
-		 * blk_add_request_payload(), a discard bio may or may not have
-		 * a payload we need to set up here (thank you Christoph) and
-		 * bi_vcnt is really the only way of telling if we need to.
-		 */
-		if (!bio->bi_vcnt)
-			return 0;
-		/* Fall through */
-	case REQ_OP_WRITE_SAME:
-		*sg = sglist;
-		bvec = bio_iovec(bio);
-		sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
-		return 1;
-	default:
-		break;
-	}
+	int cluster = blk_queue_cluster(q), nsegs = 0;
 
 	for_each_bio(bio)
 		bio_for_each_segment(bvec, bio, iter)
@@ -462,7 +439,11 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	struct scatterlist *sg = NULL;
 	int nsegs = 0;
 
-	if (rq->bio)
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg);
+	else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
+		nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg);
+	else if (rq->bio)
 		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
 
 	if (unlikely(rq->rq_flags & RQF_COPY_USER) &&
@@ -495,7 +476,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	 * Something must have been wrong if the figured number of
 	 * segment is bigger than number of req's physical segments
 	 */
-	WARN_ON(nsegs > rq->nr_phys_segments);
+	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
 
 	return nsegs;
 }
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1b48514fbe9913..3b1d6478dcfb48 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -239,8 +239,6 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 		struct nvme_command *cmnd)
 {
 	struct nvme_dsm_range *range;
-	struct page *page;
-	int offset;
 	unsigned int nr_bytes = blk_rq_bytes(req);
 
 	range = kmalloc(sizeof(*range), GFP_ATOMIC);
@@ -257,17 +255,10 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	cmnd->dsm.nr = 0;
 	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	req->completion_data = range;
-	page = virt_to_page(range);
-	offset = offset_in_page(range);
-	blk_add_request_payload(req, page, offset, sizeof(*range));
-
-	/*
-	 * we set __data_len back to the size of the area to be discarded
-	 * on disk. This allows us to report completion on the full amount
-	 * of blocks described by the request.
-	 */
-	req->__data_len = nr_bytes;
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range);
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
 
 	return BLK_MQ_RQ_QUEUE_OK;
 }
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a3d6ffd874afb0..bd5321441d1271 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -236,8 +236,10 @@ static inline unsigned nvme_map_len(struct request *rq)
 
 static inline void nvme_cleanup_cmd(struct request *req)
 {
-	if (req_op(req) == REQ_OP_DISCARD)
-		kfree(req->completion_data);
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		kfree(page_address(req->special_vec.bv_page) +
+		      req->special_vec.bv_offset);
+	}
 }
 
 static inline int nvme_error_status(u16 status)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 82b9b3f1f21d57..717d6ea47ee474 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -302,14 +302,14 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
 static __le64 **iod_list(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	return (__le64 **)(iod->sg + req->nr_phys_segments);
+	return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req));
 }
 
 static int nvme_init_iod(struct request *rq, unsigned size,
 		struct nvme_dev *dev)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
-	int nseg = rq->nr_phys_segments;
+	int nseg = blk_rq_nr_phys_segments(rq);
 
 	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
 		iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
@@ -339,8 +339,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
 	__le64 **list = iod_list(req);
 	dma_addr_t prp_dma = iod->first_dma;
 
-	nvme_cleanup_cmd(req);
-
 	if (iod->npages == 0)
 		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
 	for (i = 0; i < iod->npages; i++) {
@@ -510,7 +508,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req,
 			DMA_TO_DEVICE : DMA_FROM_DEVICE;
 	int ret = BLK_MQ_RQ_QUEUE_ERROR;
 
-	sg_init_table(iod->sg, req->nr_phys_segments);
+	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
 	iod->nents = blk_rq_map_sg(q, req, iod->sg);
 	if (!iod->nents)
 		goto out;
@@ -566,6 +564,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 		}
 	}
 
+	nvme_cleanup_cmd(req);
 	nvme_free_iod(dev, req);
 }
 
@@ -596,20 +595,20 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		}
 	}
 
-	map_len = nvme_map_len(req);
-	ret = nvme_init_iod(req, map_len, dev);
+	ret = nvme_setup_cmd(ns, req, &cmnd);
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		return ret;
 
-	ret = nvme_setup_cmd(ns, req, &cmnd);
+	map_len = nvme_map_len(req);
+	ret = nvme_init_iod(req, map_len, dev);
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
-		goto out;
+		goto out_free_cmd;
 
-	if (req->nr_phys_segments)
+	if (blk_rq_nr_phys_segments(req))
 		ret = nvme_map_data(dev, req, map_len, &cmnd);
 
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
-		goto out;
+		goto out_cleanup_iod;
 
 	blk_mq_start_request(req);
 
@@ -620,14 +619,16 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		else
 			ret = BLK_MQ_RQ_QUEUE_ERROR;
 		spin_unlock_irq(&nvmeq->q_lock);
-		goto out;
+		goto out_cleanup_iod;
 	}
 	__nvme_submit_cmd(nvmeq, &cmnd);
 	nvme_process_cq(nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
 	return BLK_MQ_RQ_QUEUE_OK;
-out:
+out_cleanup_iod:
 	nvme_free_iod(dev, req);
+out_free_cmd:
+	nvme_cleanup_cmd(req);
 	return ret;
 }
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b037d0cb2a7e05..251101bf982f5f 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -952,8 +952,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_rdma_device *dev = queue->device;
 	struct ib_device *ibdev = dev->dev;
-	int nents, count;
-	int ret;
+	int count, ret;
 
 	req->num_sge = 1;
 	req->inline_data = false;
@@ -965,16 +964,14 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 		return nvme_rdma_set_sg_null(c);
 
 	req->sg_table.sgl = req->first_sgl;
-	ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments,
-				req->sg_table.sgl);
+	ret = sg_alloc_table_chained(&req->sg_table,
+			blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
 	if (ret)
 		return -ENOMEM;
 
-	nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
-	BUG_ON(nents > rq->nr_phys_segments);
-	req->nents = nents;
+	req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
 
-	count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents,
+	count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
 		    rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 	if (unlikely(count <= 0)) {
 		sg_free_table_chained(&req->sg_table, true);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 57ded6b3ed8a98..9aaa70071ae555 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -185,13 +185,13 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (blk_rq_bytes(req)) {
 		iod->sg_table.sgl = iod->first_sgl;
 		ret = sg_alloc_table_chained(&iod->sg_table,
-			req->nr_phys_segments, iod->sg_table.sgl);
+				blk_rq_nr_phys_segments(req),
+				iod->sg_table.sgl);
 		if (ret)
 			return BLK_MQ_RQ_QUEUE_BUSY;
 
 		iod->req.sg = iod->sg_table.sgl;
 		iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);
-		BUG_ON(iod->req.sg_cnt > req->nr_phys_segments);
 	}
 
 	blk_mq_start_request(req);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 47a5c8783b894c..9a8ccff1121fb0 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1007,8 +1007,8 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
 	/*
 	 * If sg table allocation fails, requeue request later.
 	 */
-	if (unlikely(sg_alloc_table_chained(&sdb->table, req->nr_phys_segments,
-					sdb->table.sgl)))
+	if (unlikely(sg_alloc_table_chained(&sdb->table,
+			blk_rq_nr_phys_segments(req), sdb->table.sgl)))
 		return BLKPREP_DEFER;
 
 	/* 
@@ -1040,7 +1040,7 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 	bool is_mq = (rq->mq_ctx != NULL);
 	int error;
 
-	BUG_ON(!rq->nr_phys_segments);
+	BUG_ON(!blk_rq_nr_phys_segments(rq));
 
 	error = scsi_init_sgtable(rq, &cmd->sdb);
 	if (error)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 65738b0aad367d..079c2d9759fb8a 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -716,7 +716,6 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
 	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
 	sector_t sector = blk_rq_pos(rq);
 	unsigned int nr_sectors = blk_rq_sectors(rq);
-	unsigned int nr_bytes = blk_rq_bytes(rq);
 	unsigned int len;
 	int ret;
 	char *buf;
@@ -772,24 +771,19 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
 		goto out;
 	}
 
-	rq->completion_data = page;
 	rq->timeout = SD_TIMEOUT;
 
 	cmd->transfersize = len;
 	cmd->allowed = SD_MAX_RETRIES;
 
-	/*
-	 * Initially __data_len is set to the amount of data that needs to be
-	 * transferred to the target. This amount depends on whether WRITE SAME
-	 * or UNMAP is being used. After the scatterlist has been mapped by
-	 * scsi_init_io() we set __data_len to the size of the area to be
-	 * discarded on disk. This allows us to report completion on the full
-	 * amount of blocks described by the request.
-	 */
-	blk_add_request_payload(rq, page, 0, len);
-	ret = scsi_init_io(cmd);
-	rq->__data_len = nr_bytes;
+	rq->special_vec.bv_page = page;
+	rq->special_vec.bv_offset = 0;
+	rq->special_vec.bv_len = len;
+
+	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+	rq->resid_len = len;
 
+	ret = scsi_init_io(cmd);
 out:
 	if (ret != BLKPREP_OK)
 		__free_page(page);
@@ -1182,8 +1176,8 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt)
 {
 	struct request *rq = SCpnt->request;
 
-	if (req_op(rq) == REQ_OP_DISCARD)
-		__free_page(rq->completion_data);
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		__free_page(rq->special_vec.bv_page);
 
 	if (SCpnt->cmnd != rq->cmd) {
 		mempool_free(SCpnt->cmnd, sd_cdb_pool);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b15323934a298e..7cf8a6c70a3f71 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -197,8 +197,9 @@ static inline unsigned bio_segments(struct bio *bio)
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_SAME:
 	case REQ_OP_WRITE_ZEROES:
+		return 0;
+	case REQ_OP_WRITE_SAME:
 		return 1;
 	default:
 		break;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ebeef2b79c5ada..c5393766909d63 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -120,10 +120,13 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_HASHED		((__force req_flags_t)(1 << 16))
 /* IO stats tracking on */
 #define RQF_STATS		((__force req_flags_t)(1 << 17))
+/* Look at ->special_vec for the actual data payload instead of the
+   bio chain. */
+#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ)
+	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
 
 #define BLK_MAX_CDB	16
 
@@ -175,6 +178,7 @@ struct request {
 	 */
 	union {
 		struct rb_node rb_node;	/* sort/lookup */
+		struct bio_vec special_vec;
 		void *completion_data;
 	};
 
@@ -909,8 +913,6 @@ extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern void blk_rq_set_block_pc(struct request *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
-extern void blk_add_request_payload(struct request *rq, struct page *page,
-		int offset, unsigned int len);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 			     struct bio_set *bs, gfp_t gfp_mask,
@@ -1153,6 +1155,13 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
+static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
+{
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		return 1;
+	return rq->nr_phys_segments;
+}
+
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);

From ae911c5e796d51cb2d1ed3a55e73b9cc88d176cf Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Thu, 8 Dec 2016 13:19:30 -0700
Subject: [PATCH 178/182] blk-mq: add blk_mq_start_stopped_hw_queue()

We have a variant for all hardware queues, but not one for a single
hardware queue.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-mq.c         | 19 ++++++++++++-------
 include/linux/blk-mq.h |  1 +
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90db5b490df9a7..c993476b630f95 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1064,18 +1064,23 @@ void blk_mq_start_hw_queues(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queues);
 
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+{
+	if (!blk_mq_hctx_stopped(hctx))
+		return;
+
+	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+	blk_mq_run_hw_queue(hctx, async);
+}
+EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
+
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (!blk_mq_hctx_stopped(hctx))
-			continue;
-
-		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-		blk_mq_run_hw_queue(hctx, async);
-	}
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_start_stopped_hw_queue(hctx, async);
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 35a0af5ede6d1e..87e404aae267e2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -231,6 +231,7 @@ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_hw_queues(struct request_queue *q);
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);

From f04c3df3efeca0d226aeff7ef595e12a0b807ab2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 7 Dec 2016 08:41:17 -0700
Subject: [PATCH 179/182] blk-mq: abstract out blk_mq_dispatch_rq_list() helper

Takes a list of requests, and dispatches it. Moves any residual
requests to the dispatch list.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-mq.c | 85 ++++++++++++++++++++++++++++----------------------
 block/blk-mq.h |  1 +
 2 files changed, 48 insertions(+), 38 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c993476b630f95..d79fdc11b1ee7d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -821,41 +821,13 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
 	struct request *rq;
-	LIST_HEAD(rq_list);
 	LIST_HEAD(driver_list);
 	struct list_head *dptr;
-	int queued;
-
-	if (unlikely(blk_mq_hctx_stopped(hctx)))
-		return;
-
-	hctx->run++;
-
-	/*
-	 * Touch any software queue that has pending entries.
-	 */
-	flush_busy_ctxs(hctx, &rq_list);
-
-	/*
-	 * If we have previous entries on our dispatch list, grab them
-	 * and stuff them at the front for more fair dispatch.
-	 */
-	if (!list_empty_careful(&hctx->dispatch)) {
-		spin_lock(&hctx->lock);
-		if (!list_empty(&hctx->dispatch))
-			list_splice_init(&hctx->dispatch, &rq_list);
-		spin_unlock(&hctx->lock);
-	}
+	int queued, ret = BLK_MQ_RQ_QUEUE_OK;
 
 	/*
 	 * Start off with dptr being NULL, so we start the first request
@@ -867,16 +839,15 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 	 * Now process all the entries, sending them to the driver.
 	 */
 	queued = 0;
-	while (!list_empty(&rq_list)) {
+	while (!list_empty(list)) {
 		struct blk_mq_queue_data bd;
-		int ret;
 
-		rq = list_first_entry(&rq_list, struct request, queuelist);
+		rq = list_first_entry(list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
 		bd.list = dptr;
-		bd.last = list_empty(&rq_list);
+		bd.last = list_empty(list);
 
 		ret = q->mq_ops->queue_rq(hctx, &bd);
 		switch (ret) {
@@ -884,7 +855,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 			queued++;
 			break;
 		case BLK_MQ_RQ_QUEUE_BUSY:
-			list_add(&rq->queuelist, &rq_list);
+			list_add(&rq->queuelist, list);
 			__blk_mq_requeue_request(rq);
 			break;
 		default:
@@ -902,7 +873,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 		 * We've done the first request. If we have more than 1
 		 * left in the list, set dptr to defer issue.
 		 */
-		if (!dptr && rq_list.next != rq_list.prev)
+		if (!dptr && list->next != list->prev)
 			dptr = &driver_list;
 	}
 
@@ -912,10 +883,11 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 	 * Any items that need requeuing? Stuff them into hctx->dispatch,
 	 * that is where we will continue on next queue run.
 	 */
-	if (!list_empty(&rq_list)) {
+	if (!list_empty(list)) {
 		spin_lock(&hctx->lock);
-		list_splice(&rq_list, &hctx->dispatch);
+		list_splice(list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
+
 		/*
 		 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
 		 * it's possible the queue is stopped and restarted again
@@ -927,6 +899,43 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 		 **/
 		blk_mq_run_hw_queue(hctx, true);
 	}
+
+	return ret != BLK_MQ_RQ_QUEUE_BUSY;
+}
+
+/*
+ * Run this hardware queue, pulling any software queues mapped to it in.
+ * Note that this function currently has various problems around ordering
+ * of IO. In particular, we'd like FIFO behaviour on handling existing
+ * items on the hctx->dispatch list. Ignore that for now.
+ */
+static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+{
+	LIST_HEAD(rq_list);
+	LIST_HEAD(driver_list);
+
+	if (unlikely(blk_mq_hctx_stopped(hctx)))
+		return;
+
+	hctx->run++;
+
+	/*
+	 * Touch any software queue that has pending entries.
+	 */
+	flush_busy_ctxs(hctx, &rq_list);
+
+	/*
+	 * If we have previous entries on our dispatch list, grab them
+	 * and stuff them at the front for more fair dispatch.
+	 */
+	if (!list_empty_careful(&hctx->dispatch)) {
+		spin_lock(&hctx->lock);
+		if (!list_empty(&hctx->dispatch))
+			list_splice_init(&hctx->dispatch, &rq_list);
+		spin_unlock(&hctx->lock);
+	}
+
+	blk_mq_dispatch_rq_list(hctx, &rq_list);
 }
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b444370ae05ba0..3a54dd32a6fc33 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,6 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
 
 /*
  * CPU hotplug helpers

From 70b3ea056f3074be6d9256c312b64c0d90a4a683 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 7 Dec 2016 08:43:31 -0700
Subject: [PATCH 180/182] elevator: make the rqhash helpers exported

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 block/elevator.c         | 8 ++++----
 include/linux/elevator.h | 5 +++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index a18a5db274e4a0..40f0c04e5ad3d0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -248,13 +248,13 @@ static inline void __elv_rqhash_del(struct request *rq)
 	rq->rq_flags &= ~RQF_HASHED;
 }
 
-static void elv_rqhash_del(struct request_queue *q, struct request *rq)
+void elv_rqhash_del(struct request_queue *q, struct request *rq)
 {
 	if (ELV_ON_HASH(rq))
 		__elv_rqhash_del(rq);
 }
 
-static void elv_rqhash_add(struct request_queue *q, struct request *rq)
+void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
@@ -263,13 +263,13 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 	rq->rq_flags |= RQF_HASHED;
 }
 
-static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 {
 	__elv_rqhash_del(rq);
 	elv_rqhash_add(q, rq);
 }
 
-static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 {
 	struct elevator_queue *e = q->elevator;
 	struct hlist_node *next;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index f219c9aed360fe..b276e9ef0e0b4d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -108,6 +108,11 @@ struct elevator_type
 
 #define ELV_HASH_BITS 6
 
+void elv_rqhash_del(struct request_queue *q, struct request *rq);
+void elv_rqhash_add(struct request_queue *q, struct request *rq);
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
+
 /*
  * each queue has an elevator_queue associated with it
  */

From c8e52ba5e2d6df5acaaeedda20d74f4ec3adcc82 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 7 Dec 2016 15:53:18 -0700
Subject: [PATCH 181/182] blk-flush: run the queue when inserting blk-mq flush

Currently we pass in to run the queue async, but don't flag the
queue to be run. We don't need to run it async here, but we should
run it. So fixup the parameters.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
---
 block/blk-flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 1bdbb3d3e5f579..27a42dab5a3664 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -426,7 +426,7 @@ void blk_insert_flush(struct request *rq)
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
 		if (q->mq_ops) {
-			blk_mq_insert_request(rq, false, false, true);
+			blk_mq_insert_request(rq, false, true, false);
 		} else
 			list_add_tail(&rq->queuelist, &q->queue_head);
 		return;

From 7cd54aa8438947602cf68eda1db327822b9b8e6b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 9 Dec 2016 13:08:35 -0700
Subject: [PATCH 182/182] blk-stat: fix a few cases of missing batch flushing

Everytime we need to read ->nr_samples, we should have flushed
the batch first. The non-mq read path also needs to flush the
batch.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-stat.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/block/blk-stat.c b/block/blk-stat.c
index 4d0118568727ec..9b43efb8933fb9 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -64,6 +64,9 @@ static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
 
 		queue_for_each_hw_ctx(q, hctx, i) {
 			hctx_for_each_ctx(hctx, ctx, j) {
+				blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
+				blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+
 				if (!ctx->stat[BLK_STAT_READ].nr_samples &&
 				    !ctx->stat[BLK_STAT_WRITE].nr_samples)
 					continue;
@@ -111,6 +114,8 @@ void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
 	if (q->mq_ops)
 		blk_mq_stat_get(q, dst);
 	else {
+		blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]);
+		blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]);
 		memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ],
 				sizeof(struct blk_rq_stat));
 		memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE],
@@ -128,6 +133,9 @@ void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
 		uint64_t newest = 0;
 
 		hctx_for_each_ctx(hctx, ctx, i) {
+			blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
+			blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+
 			if (!ctx->stat[BLK_STAT_READ].nr_samples &&
 			    !ctx->stat[BLK_STAT_WRITE].nr_samples)
 				continue;