From f441108fa08c466d986a7dca776f59dabab58456 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:17 -0700 Subject: [PATCH 001/190] block: Remove a superfluous cast from blkdev_report_zones() No cast is necessary when assigning a non-void pointer to a void pointer. Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 51000914e23f96..c461cf63f1f40d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev, /* Get header in the first page */ ofst = 0; if (!nr_rep) { - hdr = (struct blk_zone_report_hdr *) addr; + hdr = addr; nr_rep = hdr->nr_zones; ofst = sizeof(struct blk_zone_report_hdr); } From b3e7e7d2d668de0102264302a4d10dd9d4438a42 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:18 -0700 Subject: [PATCH 002/190] include/uapi/linux/blkzoned.h: Remove a superfluous __packed directive Using the __packed directive for a structure that does not need it is wrong because it makes gcc generate suboptimal code on some architectures. Hence remove the __packed directive from the blk_zone_report structure definition. See also http://digitalvampire.org/blog/index.php/2006/07/31/why-you-shouldnt-use-__attribute__packed/. Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index e3c70fe6bf0fbb..ff5a5db8906a7c 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -117,7 +117,7 @@ struct blk_zone_report { __u32 nr_zones; __u8 reserved[4]; struct blk_zone zones[0]; -} __packed; +}; /** * struct blk_zone_range - BLKRESETZONE ioctl request From 6b1d83d274486615cc341e410467a98fd9c27c0a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:19 -0700 Subject: [PATCH 003/190] block: Remove bdev_nr_zones() Remove this function since it has no callers. This function was introduced in commit 6cc77e9cb080 ("block: introduce zoned block devices zone write locking"). Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Christoph Hellwig Cc: Matias Bjorling Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 79226ca8f80f2d..49a400afb146d1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1639,15 +1639,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) return 0; } -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return blk_queue_nr_zones(q); - return 0; -} - static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; From 7c8542b7982264226cf94102950343185869b584 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:20 -0700 Subject: [PATCH 004/190] block: Inline blk_queue_nr_zones() Since the implementation of blk_queue_nr_zones() is trivial and since it only has a single caller, inline this function. 
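As a quick orientation, a minimal before/after sketch of what the inlining amounts to, distilled from the hunks below rather than an extra change to apply:

    /* before: trivial wrapper in include/linux/blkdev.h */
    static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
    {
            return q->nr_zones;
    }

    /* its single caller in block/blk-mq-debugfs.c */
    for (i = 0; i < blk_queue_nr_zones(q); i++)

    /* after: the debugfs code reads the field directly */
    for (i = 0; i < q->nr_zones; i++)

The next patch in the series then hides these zoned-device fields behind CONFIG_BLK_DEV_ZONED.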
Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- include/linux/blkdev.h | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1c4532e9293800..26e1f8e425a813 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -214,7 +214,7 @@ static int queue_zone_wlock_show(void *data, struct seq_file *m) if (!q->seq_zones_wlock) return 0; - for (i = 0; i < blk_queue_nr_zones(q); i++) + for (i = 0; i < q->nr_zones; i++) if (test_bit(i, q->seq_zones_wlock)) seq_printf(m, "%u\n", i); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49a400afb146d1..905daa7c647e39 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -800,11 +800,6 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -static inline unsigned int blk_queue_nr_zones(struct request_queue *q) -{ - return q->nr_zones; -} - static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { From 6a5ac9846508ad7d1d23881d9d5add35f2e6ae71 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:21 -0700 Subject: [PATCH 005/190] block: Make struct request_queue smaller for CONFIG_BLK_DEV_ZONED=n Exclude zoned block device members from struct request_queue for CONFIG_BLK_DEV_ZONED == n. Avoid breaking the build by only building the code that uses these struct request_queue members if CONFIG_BLK_DEV_ZONED != n. Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Kconfig | 4 ++++ block/Makefile | 1 + block/blk-mq-debugfs-zoned.c | 24 ++++++++++++++++++++++++ block/blk-mq-debugfs.c | 15 --------------- block/blk-mq-debugfs.h | 9 +++++++++ include/linux/blkdev.h | 6 ++++++ 6 files changed, 44 insertions(+), 15 deletions(-) create mode 100644 block/blk-mq-debugfs-zoned.c diff --git a/block/Kconfig b/block/Kconfig index eb50fd4977c2f5..dfe7bc770fc9ab 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -177,6 +177,10 @@ config BLK_DEBUG_FS Unless you are building a kernel for a tiny system, you should say Y here. +config BLK_DEBUG_FS_ZONED + bool + default BLK_DEBUG_FS && BLK_DEV_ZONED + config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" ---help--- diff --git a/block/Makefile b/block/Makefile index 6a56303b992529..a8f94cdb75c390 100644 --- a/block/Makefile +++ b/block/Makefile @@ -34,4 +34,5 @@ obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c new file mode 100644 index 00000000000000..fb2c82c351e4fd --- /dev/null +++ b/block/blk-mq-debugfs-zoned.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. 
+ */ + +#include +#include "blk-mq-debugfs.h" + +int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + unsigned int i; + + if (!q->seq_zones_wlock) + return 0; + + for (i = 0; i < q->nr_zones; i++) + if (test_bit(i, q->seq_zones_wlock)) + seq_printf(m, "%u\n", i); + + return 0; +} diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 26e1f8e425a813..7efe268e44472a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, return count; } -static int queue_zone_wlock_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - unsigned int i; - - if (!q->seq_zones_wlock) - return 0; - - for (i = 0; i < q->nr_zones; i++) - if (test_bit(i, q->seq_zones_wlock)) - seq_printf(m, "%u\n", i); - - return 0; -} - static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index b9d366e57097da..a9160be12be05a 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc } #endif +#ifdef CONFIG_BLK_DEBUG_FS_ZONED +int queue_zone_wlock_show(void *data, struct seq_file *m); +#else +static inline int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + return 0; +} +#endif + #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 905daa7c647e39..ca5a8b046894ab 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -592,6 +592,7 @@ struct request_queue { struct queue_limits limits; +#ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information for request dispatch control. * nr_zones is the total number of zones of the device. This is always @@ -612,6 +613,7 @@ struct request_queue { unsigned int nr_zones; unsigned long *seq_zones_bitmap; unsigned long *seq_zones_wlock; +#endif /* CONFIG_BLK_DEV_ZONED */ /* * sg stuff @@ -800,6 +802,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? 
q->limits.chunk_sectors : 0; } +#ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { @@ -815,6 +818,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, return false; return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); } +#endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) { @@ -1065,6 +1069,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } +#ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) { return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); @@ -1074,6 +1079,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq) { return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); } +#endif /* CONFIG_BLK_DEV_ZONED */ /* * Some commands like WRITE SAME have a payload or data transfer size which From 0471559c2fbd2c19d183fc0f51ce88aefa0a13c8 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jun 2018 21:55:34 +0200 Subject: [PATCH 006/190] block, bfq: add/remove entity weights correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To keep I/O throughput high as often as possible, BFQ performs I/O-dispatch plugging (aka device idling) only when beneficial exactly for throughput, or when needed for service guarantees (low latency, fairness). An important case where the latter condition holds is when the scenario is 'asymmetric' in terms of weights: i.e., when some bfq_queue or whole group of queues has a higher weight, and thus has to receive more service, than other queues or groups. Without dispatch plugging, lower-weight queues/groups may unjustly steal bandwidth to higher-weight queues/groups. To detect asymmetric scenarios, BFQ checks some sufficient conditions. One of these conditions is that active groups have different weights. BFQ controls this condition by maintaining a special set of unique weights of active groups (group_weights_tree). To this purpose, in the function bfq_active_insert/bfq_active_extract BFQ adds/removes the weight of a group to/from this set. Unfortunately, the function bfq_active_extract may happen to be invoked also for a group that is still active (to preserve the correct update of the next queue to serve, see comments in function bfq_no_longer_next_in_service() for details). In this case, removing the weight of the group makes the set group_weights_tree inconsistent. Service-guarantee violations follow. This commit addresses this issue by moving group_weights_tree insertions from their previous location (in bfq_active_insert) into the function __bfq_activate_entity, and by moving group_weights_tree extractions from bfq_active_extract to when the entity that represents a group remains throughly idle, i.e., with no request either enqueued or dispatched. 
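The crux of the fix, shown as an illustrative excerpt of the new bfq_weights_tree_remove() added below (not an additional hunk): a parent group's weight is only pulled out of group_weights_tree once its scheduler data shows the group is no longer active in any sense.

    for_each_entity(entity) {
            struct bfq_sched_data *sd = entity->my_sched_data;

            if (sd->next_in_service || sd->in_service_entity)
                    break;  /* still active: its weight must stay in the tree */

            __bfq_weights_tree_remove(bfqd, entity,
                                      &bfqd->group_weights_tree);
    }

Insertion is moved symmetrically into __bfq_activate_entity(), so the set of tracked weights again matches exactly the set of active groups.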
Tested-by: Holger Hoffstätte Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 45 +++++++++++++++++++++++++++++++++++++++++---- block/bfq-iosched.h | 7 +++++-- block/bfq-wf2q.c | 24 +++++++++++++----------- 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 495b9ddb3355c4..3f32e88c7e9b37 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -742,8 +742,9 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */ -void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, - struct rb_root *root) +void __bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) { if (!entity->weight_counter) return; @@ -759,6 +760,43 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, entity->weight_counter = NULL; } +/* + * Invoke __bfq_weights_tree_remove on bfqq and all its inactive + * parent entities. + */ +void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = bfqq->entity.parent; + + __bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->my_sched_data; + + if (sd->next_in_service || sd->in_service_entity) { + /* + * entity is still active, because either + * next_in_service or in_service_entity is not + * NULL (see the comments on the definition of + * next_in_service for details on why + * in_service_entity must be checked too). + * + * As a consequence, the weight of entity is + * not to be removed. In addition, if entity + * is active, then its parent entities are + * active as well, and thus their weights are + * not to be removed either. In the end, this + * loop must stop here. + */ + break; + } + __bfq_weights_tree_remove(bfqd, entity, + &bfqd->group_weights_tree); + } +} + /* * Return expired entry, or NULL to just start from scratch in rbtree. 
*/ @@ -4582,8 +4620,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) */ bfqq->budget_timeout = jiffies; - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); + bfq_weights_tree_remove(bfqd, bfqq); } now_ns = ktime_get_ns(); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 0f712e03b035a7..a8a2e5aca4d48f 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, struct rb_root *root); -void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, - struct rb_root *root); +void __bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); +void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq); void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4498c43245e2d4..58cf38fcee058f 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); #ifdef CONFIG_BFQ_GROUP_IOSCHED - else /* bfq_group */ - bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); - if (bfqg != bfqd->root_group) bfqg->active_entities++; #endif @@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, if (bfqq) list_del(&bfqq->bfqq_list); #ifdef CONFIG_BFQ_GROUP_IOSCHED - else /* bfq_group */ - bfq_weights_tree_remove(bfqd, entity, - &bfqd->group_weights_tree); - if (bfqg != bfqd->root_group) bfqg->active_entities--; #endif @@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (prev_weight != new_weight) { root = bfqq ? &bfqd->queue_weights_tree : &bfqd->group_weights_tree; - bfq_weights_tree_remove(bfqd, entity, root); + __bfq_weights_tree_remove(bfqd, entity, root); } entity->weight = new_weight; /* @@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * one of its children receives a new request. * * Basically, this function updates the timestamps of entity and - * inserts entity into its active tree, ater possibly extracting it + * inserts entity into its active tree, after possibly extracting it * from its idle tree. 
*/ static void __bfq_activate_entity(struct bfq_entity *entity, @@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity, entity->on_st = true; } +#ifdef BFQ_GROUP_IOSCHED_ENABLED + if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_weights_tree_add(bfqg->bfqd, entity, + &bfqd->group_weights_tree); + } +#endif + bfq_update_fin_time_enqueue(entity, st, backshifted); } @@ -1664,8 +1667,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->busy_queues--; if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); + bfq_weights_tree_remove(bfqd, bfqq); if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; From 4420b095cc474759f6fbdb6351648c7ff9833a54 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jun 2018 21:55:35 +0200 Subject: [PATCH 007/190] block, bfq: do not expire a queue that will deserve dispatch plugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some bfq_queues, BFQ plugs I/O dispatching when the queue becomes idle, and keeps the plug until a new request of the queue arrives, or a timeout fires. BFQ does so either to boost throughput or to preserve service guarantees for the queue. More precisely, for such a queue, plugging starts when the queue happens to have either no request enqueued, or no request in flight, that is, no request already dispatched but not yet completed. On the opposite end, BFQ may happen to expire a queue with no request enqueued, without doing any plugging, if the queue still has some request in flight. Unfortunately, such a premature expiration causes the queue to lose its chance to enjoy dispatch plugging a moment later, i.e., when its in-flight requests finally get completed. This breaks service guarantees for the queue. This commit prevents BFQ from expiring an empty queue if the latter still has in-flight requests. Tested-by: Holger Hoffstätte Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3f32e88c7e9b37..4fd4f1996498db 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3597,8 +3597,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + /* + * Do not expire bfqq for budget timeout if bfqq may be about + * to enjoy device idling. The reason why, in this case, we + * prevent bfqq from expiring is the same as in the comments + * on the case where bfq_bfqq_must_idle() returns true, in + * bfq_completed_request(). + */ if (bfq_may_expire_for_budg_timeout(bfqq) && - !bfq_bfqq_wait_request(bfqq) && !bfq_bfqq_must_idle(bfqq)) goto expire; @@ -4674,8 +4680,32 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) * or if we want to idle in case it has no pending requests. */ if (bfqd->in_service_queue == bfqq) { - if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); + if (bfq_bfqq_must_idle(bfqq)) { + if (bfqq->dispatched == 0) + bfq_arm_slice_timer(bfqd); + /* + * If we get here, we do not expire bfqq, even + * if bfqq was in budget timeout or had no + * more requests (as controlled in the next + * conditional instructions). 
The reason for + * not expiring bfqq is as follows. + * + * Here bfqq->dispatched > 0 holds, but + * bfq_bfqq_must_idle() returned true. This + * implies that, even if no request arrives + * for bfqq before bfqq->dispatched reaches 0, + * bfqq will, however, not be expired on the + * completion event that causes bfqq->dispatch + * to reach zero. In contrast, on this event, + * bfqq will start enjoying device idling + * (I/O-dispatch plugging). + * + * But, if we expired bfqq here, bfqq would + * not have the chance to enjoy device idling + * when bfqq->dispatched finally reaches + * zero. This would expose bfqq to violation + * of its reserved service guarantees. + */ return; } else if (bfq_may_expire_for_budg_timeout(bfqq)) bfq_bfqq_expire(bfqd, bfqq, false, From 9fae8dd59ff3d9c19570cbddf12e87d7bb66c8a2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jun 2018 21:55:36 +0200 Subject: [PATCH 008/190] block, bfq: fix service being wrongly set to zero in case of preemption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If - a bfq_queue Q preempts another queue, because one request of Q arrives in time, - but, after this preemption, Q is not the queue that is set in service, then Q->entity.service is set to 0 when Q is eventually set in service. But Q should have continued receiving service with its old budget (which is why preemption has occurred) and its old service. This commit addresses this issue by resetting service on queue real expiration. Tested-by: Holger Hoffstätte Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 34 ++++++++++++++++++++++++++++------ block/bfq-wf2q.c | 6 ------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4fd4f1996498db..d579cc8e0db6fb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1382,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, * remain unchanged after such an expiration, and the * following statement therefore assigns to * entity->budget the remaining budget on such an - * expiration. For clarity, entity->service is not - * updated on expiration in any case, and, in normal - * operation, is reset only when bfqq is selected for - * service (see bfq_get_next_queue). + * expiration. */ entity->budget = min_t(unsigned long, bfq_bfqq_budget_left(bfqq), bfqq->max_budget); + /* + * At this point, we have used entity->service to get + * the budget left (needed for updating + * entity->budget). Thus we finally can, and have to, + * reset entity->service. The latter must be reset + * because bfqq would otherwise be charged again for + * the service it has received during its previous + * service slot(s). + */ + entity->service = 0; + return true; } + /* + * We can finally complete expiration, by setting service to 0. 
+ */ + entity->service = 0; entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(bfqq->next_rq, bfqq)); bfq_clear_bfqq_non_blocking_wait_rq(bfqq); @@ -3271,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, ref = bfqq->ref; __bfq_bfqq_expire(bfqd, bfqq); + if (ref == 1) /* bfqq is gone, no more actions on it */ + return; + /* mark bfqq as waiting a request only if a bic still points to it */ - if (ref > 1 && !bfq_bfqq_busy(bfqq) && + if (!bfq_bfqq_busy(bfqq) && reason != BFQQE_BUDGET_TIMEOUT && - reason != BFQQE_BUDGET_EXHAUSTED) + reason != BFQQE_BUDGET_EXHAUSTED) { bfq_mark_bfqq_non_blocking_wait_rq(bfqq); + /* + * Not setting service to 0, because, if the next rq + * arrives in time, the queue will go on receiving + * service with this same budget (as if it never expired) + */ + } else + entity->service = 0; } /* diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 58cf38fcee058f..dbc07b4560598e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1544,12 +1544,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) entity = sd->next_in_service; sd->in_service_entity = entity; - /* - * Reset the accumulator of the amount of service that - * the entity is about to receive. - */ - entity->service = 0; - /* * If entity is no longer a candidate for next * service, then it must be extracted from its active From 277a4a9b56cde0f3d53ea8abc0e43ff636820007 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jun 2018 21:55:37 +0200 Subject: [PATCH 009/190] block, bfq: give a better name to bfq_bfqq_may_idle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The actual goal of the function bfq_bfqq_may_idle is to tell whether it is better to perform device idling (more precisely: I/O-dispatch plugging) for the input bfq_queue, either to boost throughput or to preserve service guarantees. This commit improves the name of the function accordingly. Tested-by: Holger Hoffstätte Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d579cc8e0db6fb..41d9036b182249 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) * The following function returns true if every queue must receive the * same share of the throughput (this condition is used when deciding * whether idling may be disabled, see the comments in the function - * bfq_bfqq_may_idle()). + * bfq_better_to_idle()). * * Such a scenario occurs when: * 1) all active queues have the same weight, @@ -3355,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) * issues taken into account are not trivial. We discuss these issues * individually while introducing the variables. */ -static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) +static bool bfq_better_to_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; bool rot_without_queueing = @@ -3588,19 +3588,19 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) } /* - * If the in-service queue is empty but the function bfq_bfqq_may_idle + * If the in-service queue is empty but the function bfq_better_to_idle * returns true, then: * 1) the queue must remain in service and cannot be expired, and * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. 
- * See the comments on the function bfq_bfqq_may_idle for the reasons + * See the comments on the function bfq_better_to_idle for the reasons * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself + * and preserve service guarantees when bfq_better_to_idle itself * returns true. */ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); } /* @@ -3686,7 +3686,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * may idle after their completion, then keep it anyway. */ if (bfq_bfqq_wait_request(bfqq) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { bfqq = NULL; goto keep_queue; } @@ -4734,7 +4734,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) BFQQE_BUDGET_TIMEOUT); else if (RB_EMPTY_ROOT(&bfqq->sort_list) && (bfqq->dispatched == 0 || - !bfq_bfqq_may_idle(bfqq))) + !bfq_better_to_idle(bfqq))) bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_NO_MORE_REQUESTS); } From 8ab6bb9ee8d04ba56b9eb19cc7e4f56d0a43ad1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:45 +0800 Subject: [PATCH 010/190] blk-mq: cleanup blk_mq_get_driver_tag() We never pass 'wait' as true to blk_mq_get_driver_tag(), and hence we never change '**hctx' as well. The last use of these went away with the flush cleanup, commit 0c2a6fe4dc3e. So cleanup the usage and remove the two extra parameters. Cc: Bart Van Assche Cc: Christoph Hellwig Tested-by: Andrew Jones Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++------------ block/blk-mq.h | 3 +-- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 95919268564b16..ae8a6b2c7c224f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -964,17 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } -bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, - bool wait) +bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_alloc_data data = { .q = rq->q, .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), - .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + .flags = BLK_MQ_REQ_NOWAIT, }; - might_sleep_if(wait); - if (rq->tag != -1) goto done; @@ -991,8 +988,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, } done: - if (hctx) - *hctx = data.hctx; return rq->tag != -1; } @@ -1034,7 +1029,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ - return blk_mq_get_driver_tag(rq, hctx, false); + return blk_mq_get_driver_tag(rq); } wait = &this_hctx->dispatch_wait; @@ -1055,7 +1050,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, * allocation failure and adding the hardware queue to the wait * queue. 
*/ - ret = blk_mq_get_driver_tag(rq, hctx, false); + ret = blk_mq_get_driver_tag(rq); if (!ret) { spin_unlock(&this_hctx->lock); return false; @@ -1105,7 +1100,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) break; - if (!blk_mq_get_driver_tag(rq, NULL, false)) { + if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The @@ -1137,7 +1132,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bd.last = true; else { nxt = list_first_entry(list, struct request, queuelist); - bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); + bd.last = !blk_mq_get_driver_tag(nxt); } ret = q->mq_ops->queue_rq(hctx, &bd); @@ -1700,7 +1695,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_dispatch_budget(hctx)) goto insert; - if (!blk_mq_get_driver_tag(rq, NULL, false)) { + if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); goto insert; } diff --git a/block/blk-mq.h b/block/blk-mq.h index 89231e439b2f60..23659f41bf2c9e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, - bool wait); +bool blk_mq_get_driver_tag(struct request *rq); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); From 2278d69f030f6cb7fdacba6281a46fb9d637d2aa Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:46 +0800 Subject: [PATCH 011/190] blk-mq: don't pass **hctx to blk_mq_mark_tag_wait() 'hctx' won't be changed at all, so not necessary to pass '**hctx' to blk_mq_mark_tag_wait(). Cc: Christoph Hellwig Cc: Bart Van Assche Tested-by: Andrew Jones Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ae8a6b2c7c224f..9eee896a35925e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1009,17 +1009,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, * restart. For both cases, take care to check the condition again after * marking us as waiting. 
*/ -static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, +static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *this_hctx = *hctx; struct sbq_wait_state *ws; wait_queue_entry_t *wait; bool ret; - if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* * It's possible that a tag was freed in the window between the @@ -1032,17 +1031,17 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, return blk_mq_get_driver_tag(rq); } - wait = &this_hctx->dispatch_wait; + wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; - spin_lock(&this_hctx->lock); + spin_lock(&hctx->lock); if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->lock); return false; } - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); add_wait_queue(&ws->wait, wait); /* @@ -1052,7 +1051,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, */ ret = blk_mq_get_driver_tag(rq); if (!ret) { - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->lock); return false; } @@ -1063,7 +1062,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, spin_lock_irq(&ws->wait.lock); list_del_init(&wait->entry); spin_unlock_irq(&ws->wait.lock); - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->lock); return true; } @@ -1108,7 +1107,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * before we add this entry back on the dispatch list, * we'll re-run it below. */ - if (!blk_mq_mark_tag_wait(&hctx, rq)) { + if (!blk_mq_mark_tag_wait(hctx, rq)) { blk_mq_put_dispatch_budget(hctx); /* * For non-shared tags, the RESTART check From 5815839b3ca16bb1d45939270871169f6803a121 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:47 +0800 Subject: [PATCH 012/190] blk-mq: introduce new lock for protecting hctx->dispatch_wait Now hctx->lock is only acquired when adding hctx->dispatch_wait to one wait queue, but not held when removing it from the wait queue. IO hang can be observed easily if SCHED RESTART is disabled, that means now RESTART exits just for fixing the issue in blk_mq_mark_tag_wait(). This patch fixes the issue by introducing hctx->dispatch_wait_lock and holding it for removing hctx->dispatch_wait in blk_mq_dispatch_wake(), since we need to avoid acquiring hctx->lock in irq context. 
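A sketch of the resulting lock ordering, distilled from the hunks below (illustrative only): the tag bitmap's wait-queue lock is taken first and IRQ-safe, the new fine-grained dispatch_wait_lock second, so the wake callback, which runs with the wait-queue lock already held from IRQ context, only needs dispatch_wait_lock to remove the entry.

    /* blk_mq_mark_tag_wait(): arming the wait entry */
    spin_lock_irq(&wq->lock);
    spin_lock(&hctx->dispatch_wait_lock);
    /* ... add the entry to wq, retry blk_mq_get_driver_tag() ... */
    spin_unlock(&hctx->dispatch_wait_lock);
    spin_unlock_irq(&wq->lock);

    /* blk_mq_dispatch_wake(): invoked under wq->lock */
    spin_lock(&hctx->dispatch_wait_lock);
    list_del_init(&wait->entry);
    spin_unlock(&hctx->dispatch_wait_lock);

hctx->lock is no longer involved in this path at all.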
Fixes: eb619fdb2d4cb8b3d3419 ("blk-mq: fix issue with shared tag queue re-running") Cc: Christoph Hellwig Cc: Omar Sandoval Cc: Bart Van Assche Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 26 +++++++++++++++++--------- include/linux/blk-mq.h | 1 + 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9eee896a35925e..df84281f6af613 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,7 +998,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); + spin_lock(&hctx->dispatch_wait_lock); list_del_init(&wait->entry); + spin_unlock(&hctx->dispatch_wait_lock); + blk_mq_run_hw_queue(hctx, true); return 1; } @@ -1012,7 +1015,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbq_wait_state *ws; + struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; @@ -1035,14 +1038,18 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, if (!list_empty_careful(&wait->entry)) return false; - spin_lock(&hctx->lock); + wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; + + spin_lock_irq(&wq->lock); + spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { - spin_unlock(&hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return false; } - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); - add_wait_queue(&ws->wait, wait); + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + __add_wait_queue(wq, wait); /* * It's possible that a tag was freed in the window between the @@ -1051,7 +1058,8 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, */ ret = blk_mq_get_driver_tag(rq); if (!ret) { - spin_unlock(&hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return false; } @@ -1059,10 +1067,9 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ - spin_lock_irq(&ws->wait.lock); list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); - spin_unlock(&hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return true; } @@ -2142,6 +2149,7 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e3147eb74222b8..ea690254dab7d6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -39,6 +39,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; + spinlock_t dispatch_wait_lock; wait_queue_entry_t dispatch_wait; atomic_t wait_index; From 97889f9ac24f8d2fc8e703ea7f80c162bab10d4d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:48 +0800 Subject: [PATCH 013/190] blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set() We have to remove synchronize_rcu() from blk_queue_cleanup(), otherwise long delay can be caused during lun probe. For removing it, we have to avoid to iterate the set->tag_list in IO path, eg, blk_mq_sched_restart(). This patch reverts 5b79413946d (Revert "blk-mq: don't handle TAG_SHARED in restart"). 
Given we have fixed enough IO hang issue, and there isn't any reason to restart all queues in one tags any more, see the following reasons: 1) blk-mq core can deal with shared-tags case well via blk_mq_get_driver_tag(), which can wake up queues waiting for driver tag. 2) SCSI is a bit special because it may return BLK_STS_RESOURCE if queue, target or host is ready, but SCSI built-in restart can cover all these well, see scsi_end_request(), queue will be rerun after any request initiated from this host/target is completed. In my test on scsi_debug(8 luns), this patch may improve IOPS by 20% ~ 30% when running I/O on these 8 luns concurrently. Fixes: 705cda97ee3a ("blk-mq: Make it safe to use RCU to iterate over blk_mq_tag_set.tag_list") Cc: Omar Sandoval Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Martin K. Petersen Cc: linux-scsi@vger.kernel.org Reported-by: Andrew Jones Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 85 +++--------------------------------------- block/blk-mq.c | 10 +---- include/linux/blkdev.h | 2 - 3 files changed, 7 insertions(+), 90 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 56c493c6cd903e..4e027f6108aef9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; - - if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_inc(&q->shared_hctx_restart); - } else - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; - - if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); - } else - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + return; + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - return blk_mq_run_hw_queue(hctx, true); + blk_mq_run_hw_queue(hctx, true); } /* @@ -380,68 +367,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return false; } -/** - * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list - * @pos: loop cursor. - * @skip: the list element that will not be examined. Iteration starts at - * @skip->next. - * @head: head of the list to examine. This list must have at least one - * element, namely @skip. - * @member: name of the list_head structure within typeof(*pos). - */ -#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ - for ((pos) = (skip); \ - (pos = (pos)->member.next != (head) ? list_entry_rcu( \ - (pos)->member.next, typeof(*pos), member) : \ - list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ - (pos) != (skip); ) - -/* - * Called after a driver tag has been freed to check whether a hctx needs to - * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware - * queues in a round-robin fashion if the tag set of @hctx is shared with other - * hardware queues. 
- */ -void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) -{ - struct blk_mq_tags *const tags = hctx->tags; - struct blk_mq_tag_set *const set = hctx->queue->tag_set; - struct request_queue *const queue = hctx->queue, *q; - struct blk_mq_hw_ctx *hctx2; - unsigned int i, j; - - if (set->flags & BLK_MQ_F_TAG_SHARED) { - /* - * If this is 0, then we know that no hardware queues - * have RESTART marked. We're done. - */ - if (!atomic_read(&queue->shared_hctx_restart)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu_rr(q, queue, &set->tag_list, - tag_set_list) { - queue_for_each_hw_ctx(q, hctx2, i) - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - goto done; - } - j = hctx->queue_num + 1; - for (i = 0; i < queue->nr_hw_queues; i++, j++) { - if (j == queue->nr_hw_queues) - j = 0; - hctx2 = queue->queue_hw_ctx[j]; - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - break; - } -done: - rcu_read_unlock(); - } else { - blk_mq_sched_restart_hctx(hctx); - } -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async) { diff --git a/block/blk-mq.c b/block/blk-mq.c index df84281f6af613..7c6ff13171efcc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2335,15 +2335,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_inc(&q->shared_hctx_restart); + if (shared) hctx->flags |= BLK_MQ_F_TAG_SHARED; - } else { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); + else hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } } } @@ -2374,7 +2369,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) blk_mq_update_tag_set_depth(set, false); } mutex_unlock(&set->tag_list_lock); - synchronize_rcu(); INIT_LIST_HEAD(&q->tag_set_list); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ca5a8b046894ab..9d05646d50596e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -442,8 +442,6 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ - atomic_t shared_hctx_restart; - struct blk_queue_stats *stats; struct rq_wb *rq_wb; From 1311326cf4755c7ffefd20f576144ecf46d9906b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:49 +0800 Subject: [PATCH 014/190] blk-mq: avoid to synchronize rcu inside blk_cleanup_queue() SCSI probing may synchronously create and destroy a lot of request_queues for non-existent devices. Any synchronize_rcu() in queue creation or destroy path may introduce long latency during booting, see detailed description in comment of blk_register_queue(). This patch removes one synchronize_rcu() inside blk_cleanup_queue() for this case, commit c2856ae2f315d75(blk-mq: quiesce queue before freeing queue) needs synchronize_rcu() for implementing blk_mq_quiesce_queue(), but when queue isn't initialized, it isn't necessary to do that since only pass-through requests are involved, no original issue in scsi_execute() at all. Without this patch and previous one, it may take more 20+ seconds for virtio-scsi to complete disk probe. With the two patches, the time becomes less than 100ms. Fixes: c2856ae2f315d75 ("blk-mq: quiesce queue before freeing queue") Reported-by: Andrew Jones Cc: Omar Sandoval Cc: Bart Van Assche Cc: linux-scsi@vger.kernel.org Cc: "Martin K. 
Petersen" Cc: Christoph Hellwig Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f84a9b7b6f5aa1..947e7a4abd8c92 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -762,9 +762,13 @@ void blk_cleanup_queue(struct request_queue *q) * make sure all in-progress dispatch are completed because * blk_freeze_queue() can only complete all requests, and * dispatch may still be in-progress since we dispatch requests - * from more than one contexts + * from more than one contexts. + * + * No need to quiesce queue if it isn't initialized yet since + * blk_freeze_queue() should be enough for cases of passthrough + * request. */ - if (q->mq_ops) + if (q->mq_ops && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ From d05d199883b09cd34937ebb045adbed9098e9780 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 Jun 2018 15:51:00 -0700 Subject: [PATCH 015/190] drbd: Do not redefine __must_hold() Since __must_hold() is defined in , do not redefine it in DRBD. Compile-tested only. Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Philipp Reisner Cc: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index bc4ed2ed40a246..e35a234b0a8f2a 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -55,12 +55,10 @@ # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) # define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) -# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) #else # define __protected_by(x) # define __protected_read_by(x) # define __protected_write_by(x) -# define __must_hold(x) #endif /* shared module parameters, defined in drbd_main.c */ From 1954e9a998d59d08520d7d4bebeafb8f66ba0d0f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 27 Jun 2018 13:09:05 -0700 Subject: [PATCH 016/190] block: Document how blk_update_request() handles RQF_SPECIAL_PAYLOAD requests The payload of struct request is stored in the request.bio chain if the RQF_SPECIAL_PAYLOAD flag is not set and in request.special_vec if RQF_SPECIAL_PAYLOAD has been set. However, blk_update_request() iterates over req->bio whether or not RQF_SPECIAL_PAYLOAD has been set. Additionally, the RQF_SPECIAL_PAYLOAD flag is ignored by blk_rq_bytes() which means that the value returned by that function is incorrect if the RQF_SPECIAL_PAYLOAD flag has been set. It is not clear to me whether this is an oversight or whether this happened on purpose. Anyway, document that it is known that both functions ignore RQF_SPECIAL_PAYLOAD. See also commit f9d03f96b988 ("block: improve handling of the magic discard payload"). Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 947e7a4abd8c92..2ff8e131a892e8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3056,6 +3056,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios); * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. 
* + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both + * blk_rq_bytes() and in blk_update_request(). + * * Return: * %false - this request doesn't have any more data * %true - this request has more data From e1a413245a564683697a3d02ec197b72cf009b89 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2018 09:56:08 +0800 Subject: [PATCH 017/190] Blktrace: bail out early if block debugfs is not configured Since @blk_debugfs_root couldn't be configured dynamically, we can save a few memory allocation if it's not there. Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 987d9a9ae2839a..b951aa1fac6177 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -494,6 +494,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; + if (!blk_debugfs_root) + return -ENOENT; + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -518,9 +521,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - if (!blk_debugfs_root) - goto err; - dir = debugfs_lookup(buts->name, blk_debugfs_root); if (!dir) bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); From 43ada78781246cb36036f26158a645c17550ac54 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2018 09:56:56 +0800 Subject: [PATCH 018/190] Block: blk-throttle: set low_valid immediately once one cgroup has io.low configured Once one cgroup has io.low configured, @low_valid becomes true and other cgroups won't switch it back whatsoever. Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 82282e6fdcf82c..63bb261811dd0a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) struct throtl_grp *tg = blkg_to_tg(blkg); if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || - tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) + tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { low_valid = true; + break; + } } rcu_read_unlock(); From b64a71a0130dd2a88b0fc36a3c0a4882f47813e8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Jul 2018 08:42:34 +0100 Subject: [PATCH 019/190] block/floppy: remove redundant variable dflags Variable dflags is being assigned but is never used hence it is redundant and can be removed. 
Cleans up clang warning: warning: variable 'dflags' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 8871b5044d9e4b..48f622728ce6a1 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1461,7 +1461,6 @@ static void setup_rw_floppy(void) int i; int r; int flags; - int dflags; unsigned long ready_date; void (*function)(void); @@ -1485,8 +1484,6 @@ static void setup_rw_floppy(void) if (fd_wait_for_completion(ready_date, function)) return; } - dflags = DRS->flags; - if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE)) setup_DMA(); From f4354a94e2097fe87a14d47ff502754bb547029a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Jul 2018 08:47:06 +0100 Subject: [PATCH 020/190] loop: remove redundant pointer inode Pointer inode is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'inode' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4cb1d1be3cfbc9..bae472646e4a55 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -690,7 +690,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { struct file *file, *old_file; - struct inode *inode; int error; error = -ENXIO; @@ -711,7 +710,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (error) goto out_putf; - inode = file->f_mapping->host; old_file = lo->lo_backing_file; error = -EINVAL; From e84422cdf3caa9cf35e625076dc62977f0023992 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Jul 2018 08:13:59 +0100 Subject: [PATCH 021/190] partitions/ldm: remove redundant pointer dgrp Pointer dgrp is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'dgrp' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- block/partitions/ldm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 0417937dfe9964..16766f267559ca 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) { char buf[64]; int r_objid, r_name, r_id1, r_id2, len; - struct vblk_dgrp *dgrp; BUG_ON (!buffer || !vb); @@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) if (len != get_unaligned_be32(buffer + 0x14)) return false; - dgrp = &vb->vblk.dgrp; - ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); return true; } From 5efac89c849849ad3a959224eb711f9c311e5bde Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Jul 2018 09:14:19 +0100 Subject: [PATCH 022/190] paride: remove redundant variable n Variable n is being assigned but is never used hence it is redundant and can be removed. Also put spacing between variables in declaration to clean up checkpatch warnings. 
Cleans up clang warning: warning: variable 'n' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- drivers/block/paride/bpck.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c index 4f27e7392e38ad..f5f63ca2889d01 100644 --- a/drivers/block/paride/bpck.c +++ b/drivers/block/paride/bpck.c @@ -347,7 +347,7 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose ) static void bpck_read_eeprom ( PIA *pi, char * buf ) -{ int i,j,k,n,p,v,f, om, od; +{ int i, j, k, p, v, f, om, od; bpck_force_spp(pi); @@ -356,7 +356,6 @@ static void bpck_read_eeprom ( PIA *pi, char * buf ) bpck_connect(pi); - n = 0; WR(4,0); for (i=0;i<64;i++) { WR(6,8); From 0da73d00ca111a6175825a00d94dbeae185f6d7e Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 2 Jul 2018 23:46:43 +0900 Subject: [PATCH 023/190] blk-mq: code clean-up by adding an API to clear set->mq_map set->mq_map is now currently cleared if something goes wrong when establishing a queue map in blk-mq-pci.c. It's also cleared before updating a queue map in blk_mq_update_queue_map(). This patch provides an API to clear set->mq_map to make it clear. Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- block/blk-mq-pci.c | 5 +++-- block/blk-mq.c | 4 +--- block/blk-mq.h | 8 ++++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index e233996bb76f2f..db644ec624f501 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -17,6 +17,8 @@ #include #include +#include "blk-mq.h" + /** * blk_mq_pci_map_queues - provide a default queue mapping for PCI device * @set: tagset to provide the mapping for @@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, fallback: WARN_ON_ONCE(set->nr_hw_queues > 1); - for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + blk_mq_clear_mq_map(set); return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7c6ff13171efcc..3cc074ae5c5957 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2683,7 +2683,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { if (set->ops->map_queues) { - int cpu; /* * transport .map_queues is usually done in the following * way: @@ -2698,8 +2697,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) * killing stale mapping since one CPU may not be mapped * to any hw queue. */ - for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + blk_mq_clear_mq_map(set); return set->ops->map_queues(set); } else diff --git a/block/blk-mq.h b/block/blk-mq.h index 23659f41bf2c9e..bc2b24735ed413 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -202,4 +202,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(hctx, rq); } +static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) +{ + int cpu; + + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; +} + #endif From c018c84fdb453ae057f3bcc87a1f1f730d41628b Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 30 Jun 2018 22:12:41 +0900 Subject: [PATCH 024/190] blk-mq: fix typo in a function comment Fix typo in a function blk_mq_alloc_tag_set() comment. if if it too large -> if it's too large. 
Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3cc074ae5c5957..acf31ad733bfb1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2707,7 +2707,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the - * requested depth down, if if it too large. In that case, the set + * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) From 3f0cedc7e9a0b32e79c79d2aac0c96d2b870ae55 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 2 Jul 2018 17:35:58 +0800 Subject: [PATCH 025/190] blk-mq: use list_splice_tail_init() to insert requests list_splice_tail_init() is much more faster than inserting each request one by one, given all requets in 'list' belong to same sw queue and ctx->lock is required to insert requests. Cc: Laurence Oberman Cc: Omar Sandoval Cc: Bart Van Assche Tested-by: Kashyap Desai Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index acf31ad733bfb1..795ba859b16b67 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1545,19 +1545,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list) { + struct request *rq; + /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ - spin_lock(&ctx->lock); - while (!list_empty(list)) { - struct request *rq; - - rq = list_first_entry(list, struct request, queuelist); + list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - list_del_init(&rq->queuelist); - __blk_mq_insert_req_list(hctx, rq, false); + trace_block_rq_insert(hctx->queue, rq); } + + spin_lock(&ctx->lock); + list_splice_tail_init(list, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } From b04f50ab8a74129b3041a2836c33c916be3c6667 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 2 Jul 2018 17:35:59 +0800 Subject: [PATCH 026/190] blk-mq: only attempt to merge bio if there is rq in sw queue Only attempt to merge bio iff the ctx->rq_list isn't empty, because: 1) for high-performance SSD, most of times dispatch may succeed, then there may be nothing left in ctx->rq_list, so don't try to merge over sw queue if it is empty, then we can save one acquiring of ctx->lock 2) we can't expect good merge performance on per-cpu sw queue, and missing one merge on sw queue won't be a big deal since tasks can be scheduled from one CPU to another. 
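As a rough illustration of the two ideas above (splicing a whole batch into the sw queue under one lock acquisition, and peeking at emptiness before paying for the lock on the merge path), here is a minimal user-space C sketch. It is not kernel code: the struct names and the toy "merge" rule are invented, and a plain singly linked queue with a tail pointer stands in for the kernel's list_head machinery.

	/* Illustrative user-space analogue, not kernel code. */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct req { int id; struct req *next; };

	struct swq {
		pthread_mutex_t lock;		/* stands in for ctx->lock */
		struct req *head, *tail;	/* stands in for ctx->rq_list */
		unsigned long nr;
	};

	/* One lock round-trip for the whole batch, like list_splice_tail_init(). */
	static void swq_insert_batch(struct swq *q, struct req *first,
				     struct req *last, unsigned long n)
	{
		pthread_mutex_lock(&q->lock);
		if (q->tail)
			q->tail->next = first;
		else
			q->head = first;
		q->tail = last;
		q->nr += n;
		pthread_mutex_unlock(&q->lock);
	}

	/* Cheap unlocked peek first, like list_empty_careful(&ctx->rq_list). */
	static bool swq_maybe_merge(struct swq *q, int bio_id)
	{
		bool merged = false;

		if (!q->head)			/* empty: skip the lock entirely */
			return false;

		pthread_mutex_lock(&q->lock);
		if (q->tail && q->tail->id + 1 == bio_id) {	/* toy "back merge" */
			q->tail->id = bio_id;
			merged = true;
		}
		pthread_mutex_unlock(&q->lock);
		return merged;
	}

	int main(void)
	{
		struct swq q = { .lock = PTHREAD_MUTEX_INITIALIZER };
		struct req r[3] = { { .id = 1 }, { .id = 2 }, { .id = 3 } };

		r[0].next = &r[1];
		r[1].next = &r[2];
		swq_insert_batch(&q, &r[0], &r[2], 3);
		printf("queued %lu, merge(4): %d, merge(10): %d\n",
		       q.nr, swq_maybe_merge(&q, 4), swq_maybe_merge(&q, 10));
		return 0;
	}

The point of both patches is the same: keep the per-request work outside the lock, and only take the lock when there is definitely something to do with it.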
Cc: Laurence Oberman Cc: Omar Sandoval Cc: Bart Van Assche Tested-by: Kashyap Desai Reported-by: Kashyap Desai Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4e027f6108aef9..f3b4b5ceb4d121 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -326,7 +326,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) return e->type->ops.mq.bio_merge(hctx, bio); } - if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) { + if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + !list_empty_careful(&ctx->rq_list)) { /* default per sw-queue merge */ spin_lock(&ctx->lock); ret = blk_mq_attempt_merge(q, ctx, bio); From d769a992966b0a188096fda24fc08fc769ec6547 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 2 Jul 2018 12:49:02 -0500 Subject: [PATCH 027/190] drbd: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Warning level 2 was used in this case: -Wimplicit-fallthrough=2 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index be9450f5ad1c51..a36a30795c4360 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2790,6 +2790,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet then we would do something smarter here than reading the block... */ peer_req->flags |= EE_RS_THIN_REQ; + /* fall through */ case P_RS_DATA_REQUEST: peer_req->w.cb = w_e_end_rsdata_req; fault_type = DRBD_FAULT_RS_RD; @@ -2968,6 +2969,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold /* Else fall through to one of the other strategies... */ drbd_warn(device, "Discard younger/older primary did not find a decision\n" "Using discard-least-changes instead\n"); + /* fall through */ case ASB_DISCARD_ZERO_CHG: if (ch_peer == 0 && ch_self == 0) { rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) @@ -2979,6 +2981,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold } if (after_sb_0p == ASB_DISCARD_ZERO_CHG) break; + /* else: fall through */ case ASB_DISCARD_LEAST_CHG: if (ch_self < ch_peer) rv = -1; From d893ff86034f7107f89d8b740c2b5902a21a49db Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 2 Jul 2018 12:52:06 -0500 Subject: [PATCH 028/190] block/loop: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Jens Axboe --- drivers/block/loop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bae472646e4a55..ea9debf59b225c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1609,6 +1609,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, case LOOP_GET_STATUS64: case LOOP_SET_STATUS64: arg = (unsigned long) compat_ptr(arg); + /* fall through */ case LOOP_SET_FD: case LOOP_CHANGE_FD: case LOOP_SET_BLOCK_SIZE: From 6e768717304bdbe8d2897ca8298f6b58863fdc41 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 3 Jul 2018 09:03:16 -0600 Subject: [PATCH 029/190] blk-mq: dequeue request one by one from sw queue if hctx is busy It won't be efficient to dequeue request one by one from sw queue, but we have to do that when queue is busy for better merge performance. This patch takes the Exponential Weighted Moving Average(EWMA) to figure out if queue is busy, then only dequeue request one by one from sw queue when queue is busy. Fixes: b347689ffbca ("blk-mq-sched: improve dispatching from sw queue") Cc: Kashyap Desai Cc: Laurence Oberman Cc: Omar Sandoval Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Hannes Reinecke Reported-by: Kashyap Desai Tested-by: Kashyap Desai Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 9 +++++++++ block/blk-mq-sched.c | 11 ++--------- block/blk-mq.c | 33 ++++++++++++++++++++++++++++++++- include/linux/blk-mq.h | 3 ++- 4 files changed, 45 insertions(+), 11 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7efe268e44472a..cb1e6cf7ac48f4 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -622,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m) return 0; } +static int hctx_dispatch_busy_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + seq_printf(m, "%u\n", hctx->dispatch_busy); + return 0; +} + static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) __acquires(&ctx->lock) { @@ -783,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"queued", 0600, hctx_queued_show, hctx_queued_write}, {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, + {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {}, }; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index f3b4b5ceb4d121..fdc129e64cc4e8 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -206,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) } } else if (has_sched_dispatch) { blk_mq_do_dispatch_sched(hctx); - } else if (q->mq_ops->get_budget) { - /* - * If we need to get budget before queuing request, we - * dequeue request one by one from sw queue for avoiding - * to mess up I/O merge when dispatch runs out of resource. - * - * TODO: get more budgets, and dequeue more requests in - * one time. 
- */ + } else if (hctx->dispatch_busy) { + /* dequeue request one by one from sw queue if queue is busy */ blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); diff --git a/block/blk-mq.c b/block/blk-mq.c index 795ba859b16b67..850fdd02c38576 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1074,6 +1074,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, return true; } +#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 +#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 +/* + * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): + * - EWMA is one simple way to compute running average value + * - weight(7/8 and 1/8) is applied so that it can decrease exponentially + * - take 4 as factor for avoiding to get too small(0) result, and this + * factor doesn't matter because EWMA decreases exponentially + */ +static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) +{ + unsigned int ewma; + + if (hctx->queue->elevator) + return; + + ewma = hctx->dispatch_busy; + + if (!ewma && !busy) + return; + + ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; + if (busy) + ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; + ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; + + hctx->dispatch_busy = ewma; +} + #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ /* @@ -1210,8 +1239,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, else if (needs_restart && (ret == BLK_STS_RESOURCE)) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); + blk_mq_update_dispatch_busy(hctx, true); return false; - } + } else + blk_mq_update_dispatch_busy(hctx, false); /* * If the host/device is unable to accept more work, inform the diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ea690254dab7d6..d710e92874ccbd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -35,9 +35,10 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; struct blk_mq_ctx *dispatch_from; + unsigned int dispatch_busy; - struct blk_mq_ctx **ctxs; unsigned int nr_ctx; + struct blk_mq_ctx **ctxs; spinlock_t dispatch_wait_lock; wait_queue_entry_t dispatch_wait; From 08e18eab0c579ad84399c1899c11899734854eb2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:50 -0400 Subject: [PATCH 030/190] block: add bi_blkg to the bio for cgroups Currently io.low uses a bi_cg_private to stash its private data for the blkg, however other blkcg policies may want to use this as well. Since we can get the private data out of the blkg, move this to bi_blkg in the bio and make it generic, then we can use bio_associate_blkg() to attach the blkg to the bio. Theoretically we could simply replace the bi_css with this since we can get to all the same information from the blkg, however you have to lookup the blkg, so for example wbc_init_bio() would have to lookup and possibly allocate the blkg for the css it was trying to attach to the bio. This could be problematic and result in us either not attaching the css at all to the bio, or falling back to the root blkcg if we are unable to allocate the corresponding blkg. So for now do this, and in the future if possible we could just replace the bi_css with bi_blkg and update the helpers to do the correct translation. 
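A minimal user-space sketch of the reference rule described above: associating a group with a bio takes one counted reference, tearing the bio down drops it, and a second association is refused. All names here are invented stand-ins; this only models the semantics, not the kernel implementation.

	/* Illustrative analogue of the associate/disassociate ownership rule. */
	#include <errno.h>
	#include <stdatomic.h>
	#include <stdio.h>

	struct grp { atomic_int refcnt; };
	struct fake_bio { struct grp *grp; };

	static void grp_get(struct grp *g) { atomic_fetch_add(&g->refcnt, 1); }
	static void grp_put(struct grp *g) { atomic_fetch_sub(&g->refcnt, 1); }

	/* Mirrors the "-EBUSY if already set, otherwise take a reference" rule. */
	static int bio_associate_grp(struct fake_bio *bio, struct grp *g)
	{
		if (bio->grp)
			return -EBUSY;
		grp_get(g);
		bio->grp = g;
		return 0;
	}

	/* Mirrors the put-and-clear done when the bio is torn down. */
	static void bio_disassociate(struct fake_bio *bio)
	{
		if (bio->grp) {
			grp_put(bio->grp);
			bio->grp = NULL;
		}
	}

	int main(void)
	{
		struct grp g = { .refcnt = 1 };		/* creator's reference */
		struct fake_bio bio = { 0 };

		bio_associate_grp(&bio, &g);
		printf("after associate: refcnt=%d\n", atomic_load(&g.refcnt));
		printf("second associate: %d\n", bio_associate_grp(&bio, &g));
		bio_disassociate(&bio);
		printf("after teardown: refcnt=%d\n", atomic_load(&g.refcnt));
		return 0;
	}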
Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/bio.c | 23 +++++++++++++++++++++++ block/blk-throttle.c | 21 +++++++-------------- include/linux/bio.h | 1 + include/linux/blk_types.h | 2 +- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/block/bio.c b/block/bio.c index 67eff5eddc4919..04457153857454 100644 --- a/block/bio.c +++ b/block/bio.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "blk.h" @@ -2036,6 +2037,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) } EXPORT_SYMBOL_GPL(bio_associate_blkcg); +/** + * bio_associate_blkg - associate a bio with the specified blkg + * @bio: target bio + * @blkg: the blkg to associate + * + * Associate @bio with the blkg specified by @blkg. This is the queue specific + * blkcg information associated with the @bio, a reference will be taken on the + * @blkg and will be freed when the bio is freed. + */ +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) +{ + if (unlikely(bio->bi_blkg)) + return -EBUSY; + blkg_get(blkg); + bio->bi_blkg = blkg; + return 0; +} + /** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio @@ -2050,6 +2069,10 @@ void bio_disassociate_task(struct bio *bio) css_put(bio->bi_css); bio->bi_css = NULL; } + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } } /** diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 63bb261811dd0a..caaabbe8a7a531 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2134,12 +2134,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (bio->bi_css) { - if (bio->bi_cg_private) - blkg_put(tg_to_blkg(bio->bi_cg_private)); - bio->bi_cg_private = tg; - blkg_get(tg_to_blkg(tg)); - } + if (bio->bi_css) + bio_associate_blkg(bio, tg_to_blkg(tg)); bio_issue_init(&bio->bi_issue, bio_sectors(bio)); #endif } @@ -2287,6 +2283,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns) void blk_throtl_bio_endio(struct bio *bio) { + struct blkcg_gq *blkg; struct throtl_grp *tg; u64 finish_time_ns; unsigned long finish_time; @@ -2294,20 +2291,18 @@ void blk_throtl_bio_endio(struct bio *bio) unsigned long lat; int rw = bio_data_dir(bio); - tg = bio->bi_cg_private; - if (!tg) + blkg = bio->bi_blkg; + if (!blkg) return; - bio->bi_cg_private = NULL; + tg = blkg_to_tg(blkg); finish_time_ns = ktime_get_ns(); tg->last_finish_time = finish_time_ns >> 10; start_time = bio_issue_time(&bio->bi_issue) >> 10; finish_time = __bio_issue_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) { - blkg_put(tg_to_blkg(tg)); + if (!start_time || finish_time <= start_time) return; - } lat = finish_time - start_time; /* this is only for bio based driver */ @@ -2336,8 +2331,6 @@ void blk_throtl_bio_endio(struct bio *bio) tg->bio_cnt /= 2; tg->bad_bio_cnt /= 2; } - - blkg_put(tg_to_blkg(tg)); } #endif diff --git a/include/linux/bio.h b/include/linux/bio.h index f08f5fe7bd0820..a279ba384da9d7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -555,6 +555,7 @@ do { \ #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ diff --git 
a/include/linux/blk_types.h b/include/linux/blk_types.h index 3c4f390aea4bc2..3364d42ebe0807 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -179,8 +179,8 @@ struct bio { */ struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; + struct blkcg_gq *bi_blkg; #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - void *bi_cg_private; struct bio_issue bi_issue; #endif #endif From c7c98fd37653955d3a17dd4f1fa67aba070096a9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:51 -0400 Subject: [PATCH 031/190] block: introduce bio_issue_as_root_blkg Instead of forcing all file systems to get the right context on their bio's, simply check for REQ_META to see if we need to issue as the root blkg. We don't want to force all bio's to have the root blkg associated with them if REQ_META is set, as some controllers (blk-iolatency) need to know who the originating cgroup is so it can backcharge them for the work they are doing. This helper will make sure that the controllers do the proper thing wrt the IO priority and backcharging. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6c666fd7de3cc1..69aa71dc0c0427 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -238,6 +238,22 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return css_to_blkcg(task_css(current, io_cgrp_id)); } +/** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg + * @return: true if this bio needs to be submitted with the root blkg context. + * + * In order to avoid priority inversions we sometimes need to issue a bio as if + * it were attached to the root blkg, and then backcharge to the actual owning + * blkg. The idea is we do bio_blkcg() to look up the actual context for the + * bio and attach the appropriate blkg to the bio. Then we call this helper and + * if it is true run with the root blkg for that queue and then do any + * backcharging to the originating cgroup once the io is complete. + */ +static inline bool bio_issue_as_root_blkg(struct bio *bio) +{ + return (bio->bi_opf & REQ_META); +} + /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest From 903d23f0a354f226fa78f1c1c34b60aaf992e812 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:52 -0400 Subject: [PATCH 032/190] blk-cgroup: allow controllers to output their own stats blk-iolatency has a few stats that it would like to print out, and instead of adding a bunch of crap to the generic code just provide a helper so that controllers can add stuff to the stat line if they want to. Hide it behind a boot option since it changes the output of io.stat from normal, and these stats are only interesting to developers. 
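The interesting part of this change is the buffer bookkeeping: each contributor appends to the stat line with a bounded printf, advances an offset, and the line is only emitted if something was actually written. Below is a self-contained user-space sketch of that pattern, with a hypothetical per-policy callback and a simplified scnprintf(); nothing beyond the C library is assumed.

	#include <stdarg.h>
	#include <stdbool.h>
	#include <stdio.h>

	/* Minimal scnprintf(): returns bytes actually written, never past the end. */
	static size_t scnprintf_demo(char *buf, size_t size, const char *fmt, ...)
	{
		va_list ap;
		int n;

		if (!size)
			return 0;
		va_start(ap, fmt);
		n = vsnprintf(buf, size, fmt, ap);
		va_end(ap);
		if (n < 0)
			return 0;
		return (size_t)n < size ? (size_t)n : size - 1;
	}

	/* Stand-in for a policy's pd_stat_fn: append extra fields, return count. */
	static size_t demo_pd_stat(char *buf, size_t size)
	{
		return scnprintf_demo(buf, size, " depth=%u avg_lat=%llu", 32, 1250ULL);
	}

	int main(void)
	{
		char line[256];
		size_t size = sizeof(line), off = 0, written;
		bool has_stats = false;

		off += scnprintf_demo(line + off, size - off, "%s ", "8:0");	/* dname */
		off += scnprintf_demo(line + off, size - off,
				      "rbytes=%llu wbytes=%llu", 4096ULL, 8192ULL);
		has_stats = true;

		written = demo_pd_stat(line + off, size - off);	/* per-policy extras */
		if (written)
			has_stats = true;
		off += written;

		if (has_stats)
			printf("%s\n", line);	/* seq_commit() analogue */
		return 0;
	}

A policy's callback only ever sees the remaining slice of the buffer and reports how much of it was used, which is what lets the core decide whether the whole line is worth committing.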
Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 47 +++++++++++++++++++++++++++++++++++--- include/linux/blk-cgroup.h | 3 +++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index eb85cb87c40f46..7dc6f05cc44b2f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -50,6 +50,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ +static bool blkcg_debug_stats = false; + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -954,13 +956,25 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { const char *dname; + char *buf; struct blkg_rwstat rwstat; u64 rbytes, wbytes, rios, wios; + size_t size = seq_get_buf(sf, &buf), off = 0; + int i; + bool has_stats = false; dname = blkg_dev_name(blkg); if (!dname) continue; + /* + * Hooray string manipulation, count is the size written NOT + * INCLUDING THE \0, so size is now count+1 less than what we + * had before, but we want to start writing the next bit from + * the \0 so we only add count to buf. + */ + off += scnprintf(buf+off, size-off, "%s ", dname); + spin_lock_irq(blkg->q->queue_lock); rwstat = blkg_rwstat_recursive_sum(blkg, NULL, @@ -975,9 +989,33 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) spin_unlock_irq(blkg->q->queue_lock); - if (rbytes || wbytes || rios || wios) - seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", - dname, rbytes, wbytes, rios, wios); + if (rbytes || wbytes || rios || wios) { + has_stats = true; + off += scnprintf(buf+off, size-off, + "rbytes=%llu wbytes=%llu rios=%llu wios=%llu", + rbytes, wbytes, rios, wios); + } + + if (!blkcg_debug_stats) + goto next; + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + size_t written; + + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + + written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); + if (written) + has_stats = true; + off += written; + } +next: + if (has_stats) { + off += scnprintf(buf+off, size-off, "\n"); + seq_commit(sf, off); + } } rcu_read_unlock(); @@ -1547,3 +1585,6 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); + +module_param(blkcg_debug_stats, bool, 0644); +MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 69aa71dc0c0427..b41292726c0f32 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,6 +148,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf, + size_t size); struct blkcg_policy { int plid; @@ -167,6 +169,7 @@ struct blkcg_policy { blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; From 0d1e0c7cd5909d6c6aa0957179318e13fcca971a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:53 -0400 Subject: 
[PATCH 033/190] blk: introduce REQ_SWAP Just like REQ_META, it's important to know the IO coming down is swap in order to guard against potential IO priority inversion issues with cgroups. Add REQ_SWAP and use it for all swap IO, and add it to our bio_issue_as_root_blkg helper. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- include/linux/blk_types.h | 3 ++- mm/page_io.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b41292726c0f32..a8f9ba8f33a48c 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -254,7 +254,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { - return (bio->bi_opf & REQ_META); + return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3364d42ebe0807..0ffc34c5cc83a9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,7 +329,7 @@ enum req_flag_bits { /* for driver use */ __REQ_DRV, - + __REQ_SWAP, /* swapping request. */ __REQ_NR_BITS, /* stops here */ }; @@ -351,6 +351,7 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) #define REQ_DRV (1ULL << __REQ_DRV) +#define REQ_SWAP (1ULL << __REQ_SWAP) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) diff --git a/mm/page_io.c b/mm/page_io.c index b41cf96445856e..a552cb37e220d7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -338,7 +338,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = -ENOMEM; goto out; } - bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); From 0d3bd88d54f513723602b361dccfc71639f50779 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Jul 2018 11:14:54 -0400 Subject: [PATCH 034/190] swap,blkcg: issue swap io with the appropriate context For backcharging we need to know who the page belongs to when swapping it out. We don't worry about things that do ->rw_page (zram etc) at the moment, we're only worried about pages that actually go to a block device. Signed-off-by: Tejun Heo Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Acked-by: Andrew Morton Signed-off-by: Jens Axboe --- block/bio.c | 24 ++++++++++++++++++++++++ include/linux/bio.h | 7 +++++++ mm/page_io.c | 1 + 3 files changed, 32 insertions(+) diff --git a/block/bio.c b/block/bio.c index 04457153857454..5f84f5c3887b79 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2015,6 +2015,30 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP +#ifdef CONFIG_MEMCG +/** + * bio_associate_blkcg_from_page - associate a bio with the page's blkcg + * @bio: target bio + * @page: the page to lookup the blkcg from + * + * Associate @bio with the blkcg from @page's owning memcg. This works like + * every other associate function wrt references. 
+ */ +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) +{ + struct cgroup_subsys_state *blkcg_css; + + if (unlikely(bio->bi_css)) + return -EBUSY; + if (!page->mem_cgroup) + return 0; + blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, + &io_cgrp_subsys); + bio->bi_css = blkcg_css; + return 0; +} +#endif /* CONFIG_MEMCG */ + /** * bio_associate_blkcg - associate a bio with the specified blkcg * @bio: target bio diff --git a/include/linux/bio.h b/include/linux/bio.h index a279ba384da9d7..a00dfff51aa589 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -553,6 +553,13 @@ do { \ #define bio_dev(bio) \ disk_devt((bio)->bi_disk) +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); +#else +static inline int bio_associate_blkcg_from_page(struct bio *bio, + struct page *page) { return 0; } +#endif + #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); diff --git a/mm/page_io.c b/mm/page_io.c index a552cb37e220d7..aafd19ec1db466 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -339,6 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); + bio_associate_blkcg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); From d09d8df3a29403693d9d20cc34ed101f2c558e2b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:55 -0400 Subject: [PATCH 035/190] blkcg: add generic throttling mechanism Since IO can be issued from literally anywhere it's almost impossible to do throttling without having some sort of adverse effect somewhere else in the system because of locking or other dependencies. The best way to solve this is to do the throttling when we know we aren't holding any other kernel resources. Do this by tracking throttling in a per-blkg basis, and if we require throttling flag the task that it needs to check before it returns to user space and possibly sleep there. This is to address the case where a process is doing work that is generating IO that can't be throttled, whether that is directly with a lot of REQ_META IO, or indirectly by allocating so much memory that it is swamping the disk with REQ_SWAP. We can't use task_add_work as we don't want to induce a memory allocation in the IO path, so simply saving the request queue in the task and flagging it to do the notify_resume thing achieves the same result without the overhead of a memory allocation. 
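A much simplified, single-threaded model of the flow described above, with every name invented: the controller only charges delay to a group and flags the task, and the sleep happens at the next return to user space, capped (the posted code caps a single sleep at 250ms) so a large backlog cannot stall a task indefinitely. This collapses several kernel pieces (notify_resume, blkg lookup, refcounting, delay scaling) into one toy path.

	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>

	#define NSEC_PER_SEC	1000000000ULL
	#define DELAY_CAP_NSEC	(250ULL * 1000 * 1000)	/* cap one sleep at 250ms */

	struct group {
		struct group *parent;
		uint64_t delay_nsec;		/* accumulated throttling debt */
	};

	struct task {
		struct group *throttle_grp;	/* set instead of sleeping in the IO path */
	};

	/* Controller side: charge delay and mark the task for a later check. */
	static void schedule_throttle(struct task *t, struct group *g, uint64_t delta)
	{
		g->delay_nsec += delta;
		t->throttle_grp = g;		/* analogue of set_notify_resume() */
	}

	/* Resume side: walk up the hierarchy, sleep for the largest pending delay. */
	static void maybe_throttle_current(struct task *t)
	{
		struct group *g = t->throttle_grp;
		uint64_t delay = 0;

		if (!g)
			return;
		t->throttle_grp = NULL;

		for (; g; g = g->parent)
			if (g->delay_nsec > delay)
				delay = g->delay_nsec;

		if (!delay)
			return;
		if (delay > DELAY_CAP_NSEC)
			delay = DELAY_CAP_NSEC;

		struct timespec ts = { .tv_sec = delay / NSEC_PER_SEC,
				       .tv_nsec = delay % NSEC_PER_SEC };
		printf("sleeping %llu ns before returning to user space\n",
		       (unsigned long long)delay);
		nanosleep(&ts, NULL);
	}

	int main(void)
	{
		struct group root = { 0 }, child = { .parent = &root };
		struct task t = { 0 };

		schedule_throttle(&t, &child, 10 * NSEC_PER_SEC);  /* huge backlog */
		maybe_throttle_current(&t);	/* sleeps only the capped amount */
		return 0;
	}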
Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 220 ++++++++++++++++++++++++++++++++++++ include/linux/blk-cgroup.h | 99 ++++++++++++++++ include/linux/cgroup-defs.h | 3 + include/linux/sched.h | 8 ++ include/linux/tracehook.h | 2 + 5 files changed, 332 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7dc6f05cc44b2f..d3310ec96c2abc 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "blk.h" #define MAX_KEY_LEN 100 @@ -999,6 +1000,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) if (!blkcg_debug_stats) goto next; + if (atomic_read(&blkg->use_delay)) { + has_stats = true; + off += scnprintf(buf+off, size-off, + " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + (unsigned long long)atomic64_read(&blkg->delay_nsec)); + } + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; size_t written; @@ -1326,6 +1335,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&blkcg_pol_mutex); } +static void blkcg_exit(struct task_struct *tsk) +{ + if (tsk->throttle_queue) + blk_put_queue(tsk->throttle_queue); + tsk->throttle_queue = NULL; +} + struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, @@ -1335,6 +1351,7 @@ struct cgroup_subsys io_cgrp_subsys = { .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", + .exit = blkcg_exit, #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1586,5 +1603,208 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); +/* + * Scale the accumulated delay based on how long it has been since we updated + * the delay. We only call this when we are adding delay, in case it's been a + * while since we added delay, and when we are checking to see if we need to + * delay a task, to account for any delays that may have occurred. + */ +static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) +{ + u64 old = atomic64_read(&blkg->delay_start); + + /* + * We only want to scale down every second. The idea here is that we + * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain + * time window. We only want to throttle tasks for recent delay that + * has occurred, in 1 second time windows since that's the maximum + * things can be throttled. We save the current delay window in + * blkg->last_delay so we know what amount is still left to be charged + * to the blkg from this point onward. blkg->last_use keeps track of + * the use_delay counter. The idea is if we're unthrottling the blkg we + * are ok with whatever is happening now, and we can take away more of + * the accumulated delay as we've already throttled enough that + * everybody is happy with their IO latencies. + */ + if (time_before64(old + NSEC_PER_SEC, now) && + atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { + u64 cur = atomic64_read(&blkg->delay_nsec); + u64 sub = min_t(u64, blkg->last_delay, now - old); + int cur_use = atomic_read(&blkg->use_delay); + + /* + * We've been unthrottled, subtract a larger chunk of our + * accumulated delay. + */ + if (cur_use < blkg->last_use) + sub = max_t(u64, sub, blkg->last_delay >> 1); + + /* + * This shouldn't happen, but handle it anyway. 
Our delay_nsec + * should only ever be growing except here where we subtract out + * min(last_delay, 1 second), but lord knows bugs happen and I'd + * rather not end up with negative numbers. + */ + if (unlikely(cur < sub)) { + atomic64_set(&blkg->delay_nsec, 0); + blkg->last_delay = 0; + } else { + atomic64_sub(sub, &blkg->delay_nsec); + blkg->last_delay = cur - sub; + } + blkg->last_use = cur_use; + } +} + +/* + * This is called when we want to actually walk up the hierarchy and check to + * see if we need to throttle, and then actually throttle if there is some + * accumulated delay. This should only be called upon return to user space so + * we're not holding some lock that would induce a priority inversion. + */ +static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) +{ + u64 now = ktime_to_ns(ktime_get()); + u64 exp; + u64 delay_nsec = 0; + int tok; + + while (blkg->parent) { + if (atomic_read(&blkg->use_delay)) { + blkcg_scale_delay(blkg, now); + delay_nsec = max_t(u64, delay_nsec, + atomic64_read(&blkg->delay_nsec)); + } + blkg = blkg->parent; + } + + if (!delay_nsec) + return; + + /* + * Let's not sleep for all eternity if we've amassed a huge delay. + * Swapping or metadata IO can accumulate 10's of seconds worth of + * delay, and we want userspace to be able to do _something_ so cap the + * delays at 1 second. If there's 10's of seconds worth of delay then + * the tasks will be delayed for 1 second for every syscall. + */ + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); + + /* + * TODO: the use_memdelay flag is going to be for the upcoming psi stuff + * that hasn't landed upstream yet. Once that stuff is in place we need + * to do a psi_memstall_enter/leave if memdelay is set. + */ + + exp = ktime_add_ns(now, delay_nsec); + tok = io_schedule_prepare(); + do { + __set_current_state(TASK_KILLABLE); + if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) + break; + } while (!fatal_signal_pending(current)); + io_schedule_finish(tok); +} + +/** + * blkcg_maybe_throttle_current - throttle the current task if it has been marked + * + * This is only called if we've been marked with set_notify_resume(). Obviously + * we can be set_notify_resume() for reasons other than blkcg throttling, so we + * check to see if current->throttle_queue is set and if not this doesn't do + * anything. This should only ever be called by the resume code, it's not meant + * to be called by people willy-nilly as it will actually do the work to + * throttle the task if it is setup for throttling. 
+ */ +void blkcg_maybe_throttle_current(void) +{ + struct request_queue *q = current->throttle_queue; + struct cgroup_subsys_state *css; + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool use_memdelay = current->use_memdelay; + + if (!q) + return; + + current->throttle_queue = NULL; + current->use_memdelay = false; + + rcu_read_lock(); + css = kthread_blkcg(); + if (css) + blkcg = css_to_blkcg(css); + else + blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); + + if (!blkcg) + goto out; + blkg = blkg_lookup(blkcg, q); + if (!blkg) + goto out; + blkg = blkg_try_get(blkg); + if (!blkg) + goto out; + rcu_read_unlock(); + blk_put_queue(q); + + blkcg_maybe_throttle_blkg(blkg, use_memdelay); + blkg_put(blkg); + return; +out: + rcu_read_unlock(); + blk_put_queue(q); +} +EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current); + +/** + * blkcg_schedule_throttle - this task needs to check for throttling + * @q - the request queue IO was submitted on + * @use_memdelay - do we charge this to memory delay for PSI + * + * This is called by the IO controller when we know there's delay accumulated + * for the blkg for this task. We do not pass the blkg because there are places + * we call this that may not have that information, the swapping code for + * instance will only have a request_queue at that point. This set's the + * notify_resume for the task to check and see if it requires throttling before + * returning to user space. + * + * We will only schedule once per syscall. You can call this over and over + * again and it will only do the check once upon return to user space, and only + * throttle once. If the task needs to be throttled again it'll need to be + * re-set at the next time we see the task. + */ +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +{ + if (unlikely(current->flags & PF_KTHREAD)) + return; + + if (!blk_get_queue(q)) + return; + + if (current->throttle_queue) + blk_put_queue(current->throttle_queue); + current->throttle_queue = q; + if (use_memdelay) + current->use_memdelay = use_memdelay; + set_notify_resume(current); +} +EXPORT_SYMBOL_GPL(blkcg_schedule_throttle); + +/** + * blkcg_add_delay - add delay to this blkg + * @now - the current time in nanoseconds + * @delta - how many nanoseconds of delay to add + * + * Charge @delta to the blkg's current delay accumulation. This is used to + * throttle tasks if an IO controller thinks we need more throttling. 
+ */ +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) +{ + blkcg_scale_delay(blkg, now); + atomic64_add(delta, &blkg->delay_nsec); +} +EXPORT_SYMBOL_GPL(blkcg_add_delay); + module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index a8f9ba8f33a48c..de57de4831d532 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -136,6 +136,12 @@ struct blkcg_gq { struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; + + atomic_t use_delay; + atomic64_t delay_nsec; + atomic64_t delay_start; + u64 last_delay; + int last_use; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); @@ -241,6 +247,26 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return css_to_blkcg(task_css(current, io_cgrp_id)); } +static inline bool blk_cgroup_congested(void) +{ + struct cgroup_subsys_state *css; + bool ret = false; + + rcu_read_lock(); + css = kthread_blkcg(); + if (!css) + css = task_css(current, io_cgrp_id); + while (css) { + if (atomic_read(&css->cgroup->congestion_count)) { + ret = true; + break; + } + css = css->parent; + } + rcu_read_unlock(); + return ret; +} + /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. @@ -374,6 +400,21 @@ static inline void blkg_get(struct blkcg_gq *blkg) atomic_inc(&blkg->refcnt); } +/** + * blkg_try_get - try and get a blkg reference + * @blkg: blkg to get + * + * This is for use when doing an RCU lookup of the blkg. We may be in the midst + * of freeing this blkg, so we can only use it if the refcnt is not zero. + */ +static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) +{ + if (atomic_inc_not_zero(&blkg->refcnt)) + return blkg; + return NULL; +} + + void __blkg_release_rcu(struct rcu_head *rcu); /** @@ -734,6 +775,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, return !throtl; } +static inline void blkcg_use_delay(struct blkcg_gq *blkg) +{ + if (atomic_add_return(1, &blkg->use_delay) == 1) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); +} + +static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + if (old == 0) + return 0; + + /* + * We do this song and dance because we can race with somebody else + * adding or removing delay. If we just did an atomic_dec we'd end up + * negative and we'd already be in trouble. We need to subtract 1 and + * then check to see if we were the last delay so we can drop the + * congestion count on the cgroup. + */ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); + if (cur == old) + break; + old = cur; + } + + if (old == 0) + return 0; + if (old == 1) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + return 1; +} + +static inline void blkcg_clear_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + if (!old) + return; + /* We only want 1 person clearing the congestion count for this blkg. 
*/ + while (old) { + int cur = atomic_cmpxchg(&blkg->use_delay, old, 0); + if (cur == old) { + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + break; + } + old = cur; + } +} + +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_maybe_throttle_current(void); #else /* CONFIG_BLK_CGROUP */ struct blkcg { @@ -753,8 +847,13 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) +static inline void blkcg_maybe_throttle_current(void) { } +static inline bool blk_cgroup_congested(void) { return false; } + #ifdef CONFIG_BLOCK +static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } + static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c0e68f903011cb..ff20b677fb9f2d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -438,6 +438,9 @@ struct cgroup { /* used to store eBPF programs */ struct cgroup_bpf bpf; + /* If there is block congestion on this cgroup. */ + atomic_t congestion_count; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 43731fe51c972a..c2e993de67ecf0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -734,6 +734,10 @@ struct task_struct { /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; #endif +#ifdef CONFIG_BLK_CGROUP + /* to be used once the psi infrastructure lands upstream. */ + unsigned use_memdelay:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ @@ -1151,6 +1155,10 @@ struct task_struct { unsigned int memcg_nr_pages_over_high; #endif +#ifdef CONFIG_BLK_CGROUP + struct request_queue *throttle_queue; +#endif + #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 4a8841963c2ee7..05589a3e37f479 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -51,6 +51,7 @@ #include #include #include +#include struct linux_binprm; /* @@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) task_work_run(); mem_cgroup_handle_over_high(); + blkcg_maybe_throttle_current(); } #endif /* */ From 2cf855837b89d92996cf264713f3bed2bf9b0b4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Jul 2018 11:14:56 -0400 Subject: [PATCH 036/190] memcontrol: schedule throttling if we are congested Memory allocations can induce swapping via kswapd or direct reclaim. If we are having IO done for us by kswapd and don't actually go into direct reclaim we may never get scheduled for throttling. So instead check to see if our cgroup is congested, and if so schedule the throttling. Before we return to user space the throttling stuff will only throttle if we actually required it. 
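The cheap test this hook relies on is a lockless walk up the cgroup hierarchy looking for a nonzero congestion counter, as added by blk_cgroup_congested() in the previous patch. A user-space sketch of that walk, with invented names, so the charge path never needs a lock just to ask the question:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct cgrp {
		struct cgrp *parent;
		atomic_int congestion_count;
	};

	static void set_congested(struct cgrp *c)   { atomic_fetch_add(&c->congestion_count, 1); }
	static void clear_congested(struct cgrp *c) { atomic_fetch_sub(&c->congestion_count, 1); }

	/* Analogue of blk_cgroup_congested(): any congested ancestor means
	 * "schedule throttling before returning to user space". */
	static bool group_congested(struct cgrp *c)
	{
		for (; c; c = c->parent)
			if (atomic_load(&c->congestion_count))
				return true;
		return false;
	}

	int main(void)
	{
		struct cgrp root = { 0 };
		struct cgrp mid  = { .parent = &root };
		struct cgrp leaf = { .parent = &mid };

		printf("before: %d\n", group_congested(&leaf));
		set_congested(&mid);		/* an ancestor picked up delay */
		printf("ancestor congested: %d\n", group_congested(&leaf));
		clear_congested(&mid);
		printf("after clear: %d\n", group_congested(&leaf));
		return 0;
	}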
Signed-off-by: Tejun Heo Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Acked-by: Andrew Morton Signed-off-by: Jens Axboe --- include/linux/memcontrol.h | 13 +++++++++++++ include/linux/swap.h | 11 ++++++++++- mm/huge_memory.c | 6 +++--- mm/memcontrol.c | 13 +++++++++++++ mm/memory.c | 11 ++++++----- mm/shmem.c | 10 +++++----- mm/swapfile.c | 31 +++++++++++++++++++++++++++++++ 7 files changed, 81 insertions(+), 14 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c6fb116e92588..680d3395fc8382 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -317,6 +317,9 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, bool compound); +int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare, bool compound); void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, @@ -789,6 +792,16 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, return 0; } +static inline int mem_cgroup_try_charge_delay(struct page *page, + struct mm_struct *mm, + gfp_t gfp_mask, + struct mem_cgroup **memcgp, + bool compound) +{ + *memcgp = NULL; + return 0; +} + static inline void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare, bool compound) diff --git a/include/linux/swap.h b/include/linux/swap.h index c063443d86381e..1a8bd05a335ed7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -629,7 +629,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) return memcg->swappiness; } - #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) { @@ -637,6 +636,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif +#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, + gfp_t gfp_mask); +#else +static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, + int node, gfp_t gfp_mask) +{ +} +#endif + #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1cd7c1a57a1443..b87d5b151db2bf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_try_charge(pages[i], vma->vm_mm, + mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { if (pages[i]) put_page(pages[i]); @@ -1312,7 +1312,7 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, + if 
(unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e6f0d5ef320aa6..64bd28d3538805 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5593,6 +5593,19 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, return ret; } +int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) +{ + struct mem_cgroup *memcg; + int ret; + + ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); + memcg = *memcgp; + mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); + return ret; +} + /** * mem_cgroup_commit_charge - commit a page charge * @page: page to charge diff --git a/mm/memory.c b/mm/memory.c index 7206a634270be3..dfe80c574282de 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2503,7 +2503,7 @@ static int wp_page_copy(struct vm_fault *vmf) cow_user_page(new_page, old_page, vmf->address, vma); } - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); @@ -3003,8 +3003,8 @@ int do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3165,7 +3165,8 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, + false)) goto oom_free_page; /* @@ -3661,7 +3662,7 @@ static int do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, &vmf->memcg, false)) { put_page(vmf->cow_page); return VM_FAULT_OOM; diff --git a/mm/shmem.c b/mm/shmem.c index 2cab8440305531..6206ca3510cf5f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1239,8 +1239,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. 
*/ - error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, - false); + error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, + &memcg, false); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -1712,7 +1712,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, goto failed; } - error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1818,7 +1818,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, + error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, PageTransHuge(page)); if (error) goto unacct; @@ -2291,7 +2291,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, __SetPageSwapBacked(page); __SetPageUptodate(page); - ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); + ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); if (ret) goto out_release; diff --git a/mm/swapfile.c b/mm/swapfile.c index 2cc2972eedaf10..db4ec8ae1c8c49 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3731,6 +3731,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, + gfp_t gfp_mask) +{ + struct swap_info_struct *si, *next; + if (!(gfp_mask & __GFP_IO) || !memcg) + return; + + if (!blk_cgroup_congested()) + return; + + /* + * We've already scheduled a throttle, avoid taking the global swap + * lock. + */ + if (current->throttle_queue) + return; + + spin_lock(&swap_avail_lock); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], + avail_lists[node]) { + if (si->bdev) { + blkcg_schedule_throttle(bdev_get_queue(si->bdev), + true); + break; + } + } + spin_unlock(&swap_avail_lock); +} +#endif + static int __init swapfile_init(void) { int nid; From 2ecbf456352d0699f51b4c6d70ea5bf29766579c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:57 -0400 Subject: [PATCH 037/190] blk-stat: export helpers for modifying blk_rq_stat We need to use blk_rq_stat in the blkcg qos stuff, so export some of these helpers so they can be used by other things. 
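A self-contained sketch of what these helpers maintain: per-CPU buckets collect min, max, a batch sum and a sample count with no division on the hot path, and the sum step folds each bucket into an aggregate mean. Field names follow the hunks below; the exact mean-folding arithmetic in the kernel uses div64 helpers and is only paraphrased here.

	#include <stdint.h>
	#include <stdio.h>

	struct rq_stat {
		uint64_t min, max, batch, mean, nr_samples;
	};

	static void stat_init(struct rq_stat *s)
	{
		s->min = UINT64_MAX;
		s->max = s->batch = s->mean = s->nr_samples = 0;
	}

	/* Hot path: one sample, no division. */
	static void stat_add(struct rq_stat *s, uint64_t value)
	{
		if (value < s->min) s->min = value;
		if (value > s->max) s->max = value;
		s->batch += value;
		s->nr_samples++;
	}

	/* Fold one (e.g. per-CPU) bucket into the aggregate. */
	static void stat_sum(struct rq_stat *dst, const struct rq_stat *src)
	{
		if (!src->nr_samples)
			return;
		if (src->min < dst->min) dst->min = src->min;
		if (src->max > dst->max) dst->max = src->max;
		dst->mean = (src->batch + dst->mean * dst->nr_samples) /
			    (dst->nr_samples + src->nr_samples);
		dst->nr_samples += src->nr_samples;
	}

	int main(void)
	{
		struct rq_stat cpu0, cpu1, total;

		stat_init(&cpu0); stat_init(&cpu1); stat_init(&total);
		stat_add(&cpu0, 100); stat_add(&cpu0, 300);
		stat_add(&cpu1, 200);
		stat_sum(&total, &cpu0);
		stat_sum(&total, &cpu1);
		printf("min=%llu max=%llu mean=%llu n=%llu\n",
		       (unsigned long long)total.min, (unsigned long long)total.max,
		       (unsigned long long)total.mean,
		       (unsigned long long)total.nr_samples);
		return 0;
	}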
Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-stat.c | 16 ++++++++-------- block/blk-stat.h | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/block/blk-stat.c b/block/blk-stat.c index 175c143ac5b911..7587b1c3caaf52 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -17,7 +17,7 @@ struct blk_queue_stats { bool enable_accounting; }; -static void blk_stat_init(struct blk_rq_stat *stat) +void blk_rq_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; @@ -25,7 +25,7 @@ static void blk_stat_init(struct blk_rq_stat *stat) } /* src is a per-cpu stat, mean isn't initialized */ -static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { if (!src->nr_samples) return; @@ -39,7 +39,7 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) dst->nr_samples += src->nr_samples; } -static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) +void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); @@ -69,7 +69,7 @@ void blk_stat_add(struct request *rq, u64 now) continue; stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; - __blk_stat_add(stat, value); + blk_rq_stat_add(stat, value); put_cpu_ptr(cb->cpu_stat); } rcu_read_unlock(); @@ -82,15 +82,15 @@ static void blk_stat_timer_fn(struct timer_list *t) int cpu; for (bucket = 0; bucket < cb->buckets; bucket++) - blk_stat_init(&cb->stat[bucket]); + blk_rq_stat_init(&cb->stat[bucket]); for_each_online_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) { - blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); - blk_stat_init(&cpu_stat[bucket]); + blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); + blk_rq_stat_init(&cpu_stat[bucket]); } } @@ -143,7 +143,7 @@ void blk_stat_add_callback(struct request_queue *q, cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) - blk_stat_init(&cpu_stat[bucket]); + blk_rq_stat_init(&cpu_stat[bucket]); } spin_lock(&q->stats->lock); diff --git a/block/blk-stat.h b/block/blk-stat.h index 78399cdde9c91a..f4a1568e81a415 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -159,4 +159,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb, mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); } +void blk_rq_stat_add(struct blk_rq_stat *, u64); +void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); +void blk_rq_stat_init(struct blk_rq_stat *); + #endif From a79050434b45959f397042080fd1d70ffa9bd9df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 09:32:35 -0600 Subject: [PATCH 038/190] blk-rq-qos: refactor out common elements of blk-wbt blkcg-qos is going to do essentially what wbt does, only on a cgroup basis. Break out the common code that will be shared between blkcg-qos and wbt into blk-rq-qos.* so they can both utilize the same infrastructure. 
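A user-space sketch of the dispatch pattern this refactor introduces, with invented names: each queue carries a singly linked chain of QoS policies, each policy supplies an ops table, and every hook walks the chain, so wbt and the upcoming cgroup policy can coexist without knowing about each other. As in rq_qos_add() below, registration pushes to the head of the chain, so later registrations see requests first.

	#include <stdio.h>

	struct qos;

	struct qos_ops {
		void (*issue)(struct qos *q);	/* real table also has throttle/done/... */
	};

	struct qos {
		const struct qos_ops *ops;
		const char *name;
		struct qos *next;
	};

	struct queue { struct qos *qos_list; };

	/* Registration pushes to the head, like rq_qos_add(). */
	static void qos_add(struct queue *q, struct qos *pol)
	{
		pol->next = q->qos_list;
		q->qos_list = pol;
	}

	/* Every hook walks the whole chain, like rq_qos_issue()/rq_qos_done(). */
	static void queue_issue(struct queue *q)
	{
		for (struct qos *p = q->qos_list; p; p = p->next)
			if (p->ops->issue)
				p->ops->issue(p);
	}

	static void wbt_issue(struct qos *p)  { printf("%s: issue\n", p->name); }
	static void cgrp_issue(struct qos *p) { printf("%s: issue\n", p->name); }

	static const struct qos_ops wbt_ops  = { .issue = wbt_issue };
	static const struct qos_ops cgrp_ops = { .issue = cgrp_issue };

	int main(void)
	{
		struct queue q = { 0 };
		struct qos wbt  = { .ops = &wbt_ops,  .name = "wbt" };
		struct qos cgrp = { .ops = &cgrp_ops, .name = "cgroup-qos" };

		qos_add(&q, &wbt);
		qos_add(&q, &cgrp);	/* runs before wbt on every hook */
		queue_issue(&q);
		return 0;
	}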
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-core.c | 12 +- block/blk-mq.c | 12 +- block/blk-rq-qos.c | 178 ++++++++++++++++++++++ block/blk-rq-qos.h | 106 ++++++++++++++ block/blk-settings.c | 4 +- block/blk-sysfs.c | 22 ++- block/blk-wbt.c | 326 ++++++++++++++++++----------------------- block/blk-wbt.h | 63 +++----- include/linux/blkdev.h | 4 +- 10 files changed, 478 insertions(+), 251 deletions(-) create mode 100644 block/blk-rq-qos.c create mode 100644 block/blk-rq-qos.h diff --git a/block/Makefile b/block/Makefile index a8f94cdb75c390..57d0f47ab05f94 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o partition-generic.o ioprio.o \ - badblocks.o partitions/ + badblocks.o partitions/ blk-rq-qos.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o diff --git a/block/blk-core.c b/block/blk-core.c index 2ff8e131a892e8..b33a73bcf2d05a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1645,7 +1645,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, rq); + rq_qos_requeue(q, rq); if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); @@ -1752,7 +1752,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); - wbt_done(q->rq_wb, req); + rq_qos_done(q, req); /* * Request may not have originated from ll_rw_blk. if not, @@ -2044,7 +2044,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); + wb_acct = rq_qos_throttle(q, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. 
@@ -2054,7 +2054,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); - __wbt_done(q->rq_wb, wb_acct); + rq_qos_cleanup(q, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; else @@ -2983,7 +2983,7 @@ void blk_start_request(struct request *req) req->throtl_size = blk_rq_sectors(req); #endif req->rq_flags |= RQF_STATS; - wbt_issue(req->q->rq_wb, req); + rq_qos_issue(req->q, req); } BUG_ON(blk_rq_is_complete(req)); @@ -3207,7 +3207,7 @@ void blk_finish_request(struct request *req, blk_status_t error) blk_account_io_done(req, now); if (req->end_io) { - wbt_done(req->q->rq_wb, req); + rq_qos_done(q, req); req->end_io(req, error); } else { if (blk_bidi_rq(req)) diff --git a/block/blk-mq.c b/block/blk-mq.c index 850fdd02c38576..ea2a226457fa5c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -504,7 +504,7 @@ void blk_mq_free_request(struct request *rq) if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); - wbt_done(q->rq_wb, rq); + rq_qos_done(q, rq); if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); @@ -527,7 +527,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_account_io_done(rq, now); if (rq->end_io) { - wbt_done(rq->q->rq_wb, rq); + rq_qos_done(rq->q, rq); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -641,7 +641,7 @@ void blk_mq_start_request(struct request *rq) rq->throtl_size = blk_rq_sectors(rq); #endif rq->rq_flags |= RQF_STATS; - wbt_issue(q->rq_wb, rq); + rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); @@ -667,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, rq); + rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); @@ -1806,13 +1806,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - wb_acct = wbt_wait(q->rq_wb, bio, NULL); + wb_acct = rq_qos_throttle(q, bio, NULL); trace_block_getrq(q, bio, bio->bi_opf); rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { - __wbt_done(q->rq_wb, wb_acct); + rq_qos_cleanup(q, wb_acct); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); return BLK_QC_T_NONE; diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c new file mode 100644 index 00000000000000..d2f2af8aa10c33 --- /dev/null +++ b/block/blk-rq-qos.c @@ -0,0 +1,178 @@ +#include "blk-rq-qos.h" + +#include "blk-wbt.h" + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. 
+ */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit) +{ + return atomic_inc_below(&rq_wait->inflight, limit); +} + +void rq_qos_cleanup(struct request_queue *q, enum wbt_flags wb_acct) +{ + struct rq_qos *rqos; + + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->cleanup) + rqos->ops->cleanup(rqos, wb_acct); + } +} + +void rq_qos_done(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->done) + rqos->ops->done(rqos, rq); + } +} + +void rq_qos_issue(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->issue) + rqos->ops->issue(rqos, rq); + } +} + +void rq_qos_requeue(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->requeue) + rqos->ops->requeue(rqos, rq); + } +} + +enum wbt_flags rq_qos_throttle(struct request_queue *q, struct bio *bio, + spinlock_t *lock) +{ + struct rq_qos *rqos; + enum wbt_flags flags = 0; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->throttle) + flags |= rqos->ops->throttle(rqos, bio, lock); + } + return flags; +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +bool rq_depth_calc_max_depth(struct rq_depth *rqd) +{ + unsigned int depth; + bool ret = false; + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rqd->queue_depth == 1) { + if (rqd->scale_step > 0) + rqd->max_depth = 1; + else { + rqd->max_depth = 2; + ret = true; + } + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, rqd->default_depth, + rqd->queue_depth); + if (rqd->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rqd->scale_step)); + else if (rqd->scale_step < 0) { + unsigned int maxd = 3 * rqd->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rqd->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + rqd->max_depth = depth; + } + + return ret; +} + +void rq_depth_scale_up(struct rq_depth *rqd) +{ + /* + * Hit max in previous round, stop here + */ + if (rqd->scaled_max) + return; + + rqd->scale_step--; + + rqd->scaled_max = rq_depth_calc_max_depth(rqd); +} + +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. + */ +void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. 
+ */ + if (rqd->max_depth == 1) + return; + + if (rqd->scale_step < 0 && hard_throttle) + rqd->scale_step = 0; + else + rqd->scale_step++; + + rqd->scaled_max = false; + rq_depth_calc_max_depth(rqd); +} + +void rq_qos_exit(struct request_queue *q) +{ + while (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + q->rq_qos = rqos->next; + rqos->ops->exit(rqos); + } +} diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h new file mode 100644 index 00000000000000..f9a39bd6ece309 --- /dev/null +++ b/block/blk-rq-qos.h @@ -0,0 +1,106 @@ +#ifndef RQ_QOS_H +#define RQ_QOS_H + +#include +#include +#include +#include +#include + +enum rq_qos_id { + RQ_QOS_WBT, + RQ_QOS_CGROUP, +}; + +struct rq_wait { + wait_queue_head_t wait; + atomic_t inflight; +}; + +struct rq_qos { + struct rq_qos_ops *ops; + struct request_queue *q; + enum rq_qos_id id; + struct rq_qos *next; +}; + +struct rq_qos_ops { + enum wbt_flags (*throttle)(struct rq_qos *, struct bio *, + spinlock_t *); + void (*issue)(struct rq_qos *, struct request *); + void (*requeue)(struct rq_qos *, struct request *); + void (*done)(struct rq_qos *, struct request *); + void (*cleanup)(struct rq_qos *, enum wbt_flags); + void (*exit)(struct rq_qos *); +}; + +struct rq_depth { + unsigned int max_depth; + + int scale_step; + bool scaled_max; + + unsigned int queue_depth; + unsigned int default_depth; +}; + +static inline struct rq_qos *rq_qos_id(struct request_queue *q, + enum rq_qos_id id) +{ + struct rq_qos *rqos; + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->id == id) + break; + } + return rqos; +} + +static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) +{ + return rq_qos_id(q, RQ_QOS_WBT); +} + +static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) +{ + return rq_qos_id(q, RQ_QOS_CGROUP); +} + +static inline void rq_wait_init(struct rq_wait *rq_wait) +{ + atomic_set(&rq_wait->inflight, 0); + init_waitqueue_head(&rq_wait->wait); +} + +static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) +{ + rqos->next = q->rq_qos; + q->rq_qos = rqos; +} + +static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) +{ + struct rq_qos *cur, *prev = NULL; + for (cur = q->rq_qos; cur; cur = cur->next) { + if (cur == rqos) { + if (prev) + prev->next = rqos->next; + else + q->rq_qos = cur; + break; + } + prev = cur; + } +} + +bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit); +void rq_depth_scale_up(struct rq_depth *rqd); +void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); +bool rq_depth_calc_max_depth(struct rq_depth *rqd); + +void rq_qos_cleanup(struct request_queue *, enum wbt_flags); +void rq_qos_done(struct request_queue *, struct request *); +void rq_qos_issue(struct request_queue *, struct request *); +void rq_qos_requeue(struct request_queue *, struct request *); +enum wbt_flags rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); +void rq_qos_exit(struct request_queue *); +#endif diff --git a/block/blk-settings.c b/block/blk-settings.c index d1de71124656a9..053de87d1fda6d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; - wbt_set_queue_depth(q->rq_wb, depth); + wbt_set_queue_depth(q, depth); } EXPORT_SYMBOL(blk_set_queue_depth); @@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) queue_flag_clear(QUEUE_FLAG_FUA, q); 
spin_unlock_irq(q->queue_lock); - wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 94987b1f69e15a..49c29a5d06bb98 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) { - if (!q->rq_wb) + if (!wbt_rq_qos(q)) return -EINVAL; - return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); } static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, size_t count) { - struct rq_wb *rwb; + struct rq_qos *rqos; ssize_t ret; s64 val; @@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, if (val < -1) return -EINVAL; - rwb = q->rq_wb; - if (!rwb) { + rqos = wbt_rq_qos(q); + if (!rqos) { ret = wbt_init(q); if (ret) return ret; } - rwb = q->rq_wb; if (val == -1) - rwb->min_lat_nsec = wbt_default_latency_nsec(q); + val = wbt_default_latency_nsec(q); else if (val >= 0) - rwb->min_lat_nsec = val * 1000ULL; + val *= 1000ULL; - if (rwb->enable_state == WBT_STATE_ON_DEFAULT) - rwb->enable_state = WBT_STATE_ON_MANUAL; + wbt_set_min_lat(q, val); - wbt_update_limits(rwb); + wbt_update_limits(q); return count; } @@ -964,7 +962,7 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - wbt_exit(q); + rq_qos_exit(q); mutex_lock(&q->sysfs_lock); if (q->request_fn || (q->mq_ops && q->elevator)) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 4f89b28fa6524d..6fe20fb823e4d3 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,6 +25,7 @@ #include #include "blk-wbt.h" +#include "blk-rq-qos.h" #define CREATE_TRACE_POINTS #include @@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb) return rwb && rwb->wb_normal != 0; } -/* - * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, - * false if 'v' + 1 would be bigger than 'below'. - */ -static bool atomic_inc_below(atomic_t *v, int below) -{ - int cur = atomic_read(v); - - for (;;) { - int old; - - if (cur >= below) - return false; - old = atomic_cmpxchg(v, cur, cur + 1); - if (old == cur) - break; - cur = old; - } - - return true; -} - static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) { if (rwb_enabled(rwb)) { @@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; + struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb) } } -void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) +static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) { + struct rq_wb *rwb = RQWB(rqos); struct rq_wait *rqw; int inflight, limit; @@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed. 
*/ -void wbt_done(struct rq_wb *rwb, struct request *rq) +static void wbt_done(struct rq_qos *rqos, struct request *rq) { - if (!rwb) - return; + struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { if (rwb->sync_cookie == rq) { @@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq) wb_timestamp(rwb, &rwb->last_comp); } else { WARN_ON_ONCE(rq == rwb->sync_cookie); - __wbt_done(rwb, wbt_flags(rq)); + __wbt_done(rqos, wbt_flags(rq)); } wbt_clear_state(rq); } -/* - * Return true, if we can't increase the depth further by scaling - */ -static bool calc_wb_limits(struct rq_wb *rwb) -{ - unsigned int depth; - bool ret = false; - - if (!rwb->min_lat_nsec) { - rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; - return false; - } - - /* - * For QD=1 devices, this is a special case. It's important for those - * to have one request ready when one completes, so force a depth of - * 2 for those devices. On the backend, it'll be a depth of 1 anyway, - * since the device can't have more than that in flight. If we're - * scaling down, then keep a setting of 1/1/1. - */ - if (rwb->queue_depth == 1) { - if (rwb->scale_step > 0) - rwb->wb_max = rwb->wb_normal = 1; - else { - rwb->wb_max = rwb->wb_normal = 2; - ret = true; - } - rwb->wb_background = 1; - } else { - /* - * scale_step == 0 is our default state. If we have suffered - * latency spikes, step will be > 0, and we shrink the - * allowed write depths. If step is < 0, we're only doing - * writes, and we allow a temporarily higher depth to - * increase performance. - */ - depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); - if (rwb->scale_step > 0) - depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); - else if (rwb->scale_step < 0) { - unsigned int maxd = 3 * rwb->queue_depth / 4; - - depth = 1 + ((depth - 1) << -rwb->scale_step); - if (depth > maxd) { - depth = maxd; - ret = true; - } - } - - /* - * Set our max/normal/bg queue depths based on how far - * we have scaled down (->scale_step). 
- */ - rwb->wb_max = depth; - rwb->wb_normal = (rwb->wb_max + 1) / 2; - rwb->wb_background = (rwb->wb_max + 3) / 4; - } - - return ret; -} - static inline bool stat_sample_valid(struct blk_rq_stat *stat) { /* @@ -307,7 +225,8 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; /* @@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) return LAT_EXCEEDED; } - if (rwb->scale_step) + if (rqd->scale_step) trace_wbt_stat(bdi, stat); return LAT_OK; @@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct rq_depth *rqd = &rwb->rq_depth; - trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, - rwb->wb_background, rwb->wb_normal, rwb->wb_max); + trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rqd->max_depth); } -static void scale_up(struct rq_wb *rwb) +static void calc_wb_limits(struct rq_wb *rwb) { - /* - * Hit max in previous round, stop here - */ - if (rwb->scaled_max) - return; + if (rwb->min_lat_nsec == 0) { + rwb->wb_normal = rwb->wb_background = 0; + } else if (rwb->rq_depth.max_depth <= 2) { + rwb->wb_normal = rwb->rq_depth.max_depth; + rwb->wb_background = 1; + } else { + rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; + rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; + } +} - rwb->scale_step--; +static void scale_up(struct rq_wb *rwb) +{ + rq_depth_scale_up(&rwb->rq_depth); + calc_wb_limits(rwb); rwb->unknown_cnt = 0; - - rwb->scaled_max = calc_wb_limits(rwb); - - rwb_wake_all(rwb); - - rwb_trace_step(rwb, "step up"); + rwb_trace_step(rwb, "scale up"); } -/* - * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we - * had a latency violation. - */ static void scale_down(struct rq_wb *rwb, bool hard_throttle) { - /* - * Stop scaling down when we've hit the limit. This also prevents - * ->scale_step from going to crazy values, if the device can't - * keep up. - */ - if (rwb->wb_max == 1) - return; - - if (rwb->scale_step < 0 && hard_throttle) - rwb->scale_step = 0; - else - rwb->scale_step++; - - rwb->scaled_max = false; - rwb->unknown_cnt = 0; + rq_depth_scale_down(&rwb->rq_depth, hard_throttle); calc_wb_limits(rwb); - rwb_trace_step(rwb, "step down"); + rwb->unknown_cnt = 0; + rwb_wake_all(rwb); + rwb_trace_step(rwb, "scale down"); } static void rwb_arm_timer(struct rq_wb *rwb) { - if (rwb->scale_step > 0) { + struct rq_depth *rqd = &rwb->rq_depth; + + if (rqd->scale_step > 0) { /* * We should speed this up, using some variant of a fast * integer inverse square root calculation. Since we only do @@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb) * though. 
*/ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, - int_sqrt((rwb->scale_step + 1) << 8)); + int_sqrt((rqd->scale_step + 1) << 8)); } else { /* * For step < 0, we don't want to increase/decrease the @@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb) static void wb_timer_fn(struct blk_stat_callback *cb) { struct rq_wb *rwb = cb->data; + struct rq_depth *rqd = &rwb->rq_depth; unsigned int inflight = wbt_inflight(rwb); int status; status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, + trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, inflight); /* @@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) * currently don't have a valid read/write sample. For that * case, slowly return to center state (step == 0). */ - if (rwb->scale_step > 0) + if (rqd->scale_step > 0) scale_up(rwb); - else if (rwb->scale_step < 0) + else if (rqd->scale_step < 0) scale_down(rwb, false); break; default: @@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb) /* * Re-arm timer, if we have IO in flight */ - if (rwb->scale_step || inflight) + if (rqd->scale_step || inflight) rwb_arm_timer(rwb); } -void wbt_update_limits(struct rq_wb *rwb) +static void __wbt_update_limits(struct rq_wb *rwb) { - rwb->scale_step = 0; - rwb->scaled_max = false; + struct rq_depth *rqd = &rwb->rq_depth; + + rqd->scale_step = 0; + rqd->scaled_max = false; + + rq_depth_calc_max_depth(rqd); calc_wb_limits(rwb); rwb_wake_all(rwb); } +void wbt_update_limits(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return; + __wbt_update_limits(RQWB(rqos)); +} + +u64 wbt_get_min_lat(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return 0; + return RQWB(rqos)->min_lat_nsec; +} + +void wbt_set_min_lat(struct request_queue *q, u64 val) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return; + RQWB(rqos)->min_lat_nsec = val; + RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + __wbt_update_limits(RQWB(rqos)); +} + + static bool close_io(struct rq_wb *rwb) { const unsigned long now = jiffies; @@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) * IO for a bit. */ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) - limit = rwb->wb_max; + limit = rwb->rq_depth.max_depth; else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, @@ -554,7 +495,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, rqw->wait.head.next != &wait->entry) return false; - return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); + return rq_wait_inc_below(rqw, get_limit(rwb, rw)); } /* @@ -614,8 +555,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. 
*/ -enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +static enum wbt_flags wbt_wait(struct rq_qos *rqos, struct bio *bio, + spinlock_t *lock) { + struct rq_wb *rwb = RQWB(rqos); enum wbt_flags ret = 0; if (!rwb_enabled(rwb)) @@ -643,8 +586,10 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) return ret | WBT_TRACKED; } -void wbt_issue(struct rq_wb *rwb, struct request *rq) +void wbt_issue(struct rq_qos *rqos, struct request *rq) { + struct rq_wb *rwb = RQWB(rqos); + if (!rwb_enabled(rwb)) return; @@ -661,8 +606,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq) } } -void wbt_requeue(struct rq_wb *rwb, struct request *rq) +void wbt_requeue(struct rq_qos *rqos, struct request *rq) { + struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; if (rq == rwb->sync_cookie) { @@ -671,39 +617,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq) } } -void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) { - if (rwb) { - rwb->queue_depth = depth; - wbt_update_limits(rwb); + struct rq_qos *rqos = wbt_rq_qos(q); + if (rqos) { + RQWB(rqos)->rq_depth.queue_depth = depth; + __wbt_update_limits(RQWB(rqos)); } } -void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) -{ - if (rwb) - rwb->wc = write_cache_on; -} - -/* - * Disable wbt, if enabled by default. - */ -void wbt_disable_default(struct request_queue *q) +void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) { - struct rq_wb *rwb = q->rq_wb; - - if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) - wbt_exit(q); + struct rq_qos *rqos = wbt_rq_qos(q); + if (rqos) + RQWB(rqos)->wc = write_cache_on; } -EXPORT_SYMBOL_GPL(wbt_disable_default); /* * Enable wbt if defaults are configured that way */ void wbt_enable_default(struct request_queue *q) { + struct rq_qos *rqos = wbt_rq_qos(q); /* Throttling already enabled? */ - if (q->rq_wb) + if (rqos) return; /* Queue not registered? Maybe shutting down... */ @@ -741,6 +678,41 @@ static int wbt_data_dir(const struct request *rq) return -1; } +static void wbt_exit(struct rq_qos *rqos) +{ + struct rq_wb *rwb = RQWB(rqos); + struct request_queue *q = rqos->q; + + blk_stat_remove_callback(q, rwb->cb); + blk_stat_free_callback(rwb->cb); + kfree(rwb); +} + +/* + * Disable wbt, if enabled by default. 
+ */ +void wbt_disable_default(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_wb *rwb; + if (!rqos) + return; + rwb = RQWB(rqos); + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) + rwb->wb_normal = 0; +} +EXPORT_SYMBOL_GPL(wbt_disable_default); + + +static struct rq_qos_ops wbt_rqos_ops = { + .throttle = wbt_wait, + .issue = wbt_issue, + .requeue = wbt_requeue, + .done = wbt_done, + .cleanup = __wbt_done, + .exit = wbt_exit, +}; + int wbt_init(struct request_queue *q) { struct rq_wb *rwb; @@ -756,39 +728,29 @@ int wbt_init(struct request_queue *q) return -ENOMEM; } - for (i = 0; i < WBT_NUM_RWQ; i++) { - atomic_set(&rwb->rq_wait[i].inflight, 0); - init_waitqueue_head(&rwb->rq_wait[i].wait); - } + for (i = 0; i < WBT_NUM_RWQ; i++) + rq_wait_init(&rwb->rq_wait[i]); + rwb->rqos.id = RQ_QOS_WBT; + rwb->rqos.ops = &wbt_rqos_ops; + rwb->rqos.q = q; rwb->last_comp = rwb->last_issue = jiffies; - rwb->queue = q; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - wbt_update_limits(rwb); + rwb->wc = 1; + rwb->rq_depth.default_depth = RWB_DEF_DEPTH; + __wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ - q->rq_wb = rwb; + rq_qos_add(q, &rwb->rqos); blk_stat_add_callback(q, rwb->cb); rwb->min_lat_nsec = wbt_default_latency_nsec(q); - wbt_set_queue_depth(rwb, blk_queue_depth(q)); - wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + wbt_set_queue_depth(q, blk_queue_depth(q)); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); return 0; } - -void wbt_exit(struct request_queue *q) -{ - struct rq_wb *rwb = q->rq_wb; - - if (rwb) { - blk_stat_remove_callback(q, rwb->cb); - blk_stat_free_callback(rwb->cb); - q->rq_wb = NULL; - kfree(rwb); - } -} diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 300df531d0a667..53b20a58c0a212 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -9,6 +9,7 @@ #include #include "blk-stat.h" +#include "blk-rq-qos.h" enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ @@ -35,20 +36,12 @@ enum { WBT_STATE_ON_MANUAL = 2, }; -struct rq_wait { - wait_queue_head_t wait; - atomic_t inflight; -}; - struct rq_wb { /* * Settings that govern how we throttle */ unsigned int wb_background; /* background writeback */ unsigned int wb_normal; /* normal writeback */ - unsigned int wb_max; /* max throughput writeback */ - int scale_step; - bool scaled_max; short enable_state; /* WBT_STATE_* */ @@ -67,15 +60,20 @@ struct rq_wb { void *sync_cookie; unsigned int wc; - unsigned int queue_depth; unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; - struct request_queue *queue; + struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; }; +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + static inline unsigned int wbt_inflight(struct rq_wb *rwb) { unsigned int i, ret = 0; @@ -86,6 +84,7 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) return ret; } + #ifdef CONFIG_BLK_WBT static inline void wbt_track(struct request *rq, enum wbt_flags flags) @@ -93,19 +92,16 @@ static inline void wbt_track(struct request *rq, enum wbt_flags flags) rq->wbt_flags |= flags; } -void __wbt_done(struct rq_wb *, enum wbt_flags); -void wbt_done(struct rq_wb *, struct request *); -enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); int wbt_init(struct request_queue *); -void wbt_exit(struct 
request_queue *); -void wbt_update_limits(struct rq_wb *); -void wbt_requeue(struct rq_wb *, struct request *); -void wbt_issue(struct rq_wb *, struct request *); +void wbt_update_limits(struct request_queue *); void wbt_disable_default(struct request_queue *); void wbt_enable_default(struct request_queue *); -void wbt_set_queue_depth(struct rq_wb *, unsigned int); -void wbt_set_write_cache(struct rq_wb *, bool); +u64 wbt_get_min_lat(struct request_queue *q); +void wbt_set_min_lat(struct request_queue *q, u64 val); + +void wbt_set_queue_depth(struct request_queue *, unsigned int); +void wbt_set_write_cache(struct request_queue *, bool); u64 wbt_default_latency_nsec(struct request_queue *); @@ -114,43 +110,30 @@ u64 wbt_default_latency_nsec(struct request_queue *); static inline void wbt_track(struct request *rq, enum wbt_flags flags) { } -static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) -{ -} -static inline void wbt_done(struct rq_wb *rwb, struct request *rq) -{ -} -static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, - spinlock_t *lock) -{ - return 0; -} static inline int wbt_init(struct request_queue *q) { return -EINVAL; } -static inline void wbt_exit(struct request_queue *q) -{ -} -static inline void wbt_update_limits(struct rq_wb *rwb) +static inline void wbt_update_limits(struct request_queue *q) { } -static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq) +static inline void wbt_disable_default(struct request_queue *q) { } -static inline void wbt_issue(struct rq_wb *rwb, struct request *rq) +static inline void wbt_enable_default(struct request_queue *q) { } -static inline void wbt_disable_default(struct request_queue *q) +static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) { } -static inline void wbt_enable_default(struct request_queue *q) +static inline void wbt_set_write_cache(struct request_queue *q, bool wc) { } -static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +static inline u64 wbt_get_min_lat(struct request_queue *q) { + return 0; } -static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) +static inline void wbt_set_min_lat(struct request_queue *q, u64 val) { } static inline u64 wbt_default_latency_nsec(struct request_queue *q) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9d05646d50596e..137759862f07cb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -42,7 +42,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; -struct rq_wb; +struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; @@ -443,7 +443,7 @@ struct request_queue { int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ struct blk_queue_stats *stats; - struct rq_wb *rq_wb; + struct rq_qos *rq_qos; /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg From c1c80384c8f47021a01a0cc42894a06bed2b801b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:59 -0400 Subject: [PATCH 039/190] block: remove external dependency on wbt_flags We don't really need to save this stuff in the core block code, we can just pass the bio back into the helpers later on to derive the same flags and update the rq->wbt_flags appropriately. 
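Concretely, the blk-mq hunk below changes the submission path from threading an opaque cookie through the caller:

	wb_acct = rq_qos_throttle(q, bio, NULL);
	...
	wbt_track(rq, wb_acct);

to passing only the bio, with each hook re-deriving the flags from it on demand:

	rq_qos_throttle(q, bio, NULL);
	...
	rq_qos_track(q, rq, bio);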
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-core.c | 9 ++++---- block/blk-mq.c | 9 ++++---- block/blk-rq-qos.c | 24 +++++++++++++-------- block/blk-rq-qos.h | 11 +++++----- block/blk-wbt.c | 52 ++++++++++++++++++++++++++++++++++------------ block/blk-wbt.h | 5 ----- 6 files changed, 68 insertions(+), 42 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index b33a73bcf2d05a..687d7732f23a04 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -42,7 +42,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" -#include "blk-wbt.h" +#include "blk-rq-qos.h" #ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; @@ -1986,7 +1986,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; struct request *req, *free; unsigned int request_count = 0; - unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -2044,7 +2043,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - wb_acct = rq_qos_throttle(q, bio, q->queue_lock); + rq_qos_throttle(q, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. @@ -2054,7 +2053,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); - rq_qos_cleanup(q, wb_acct); + rq_qos_cleanup(q, bio); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; else @@ -2063,7 +2062,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) goto out_unlock; } - wbt_track(req, wb_acct); + rq_qos_track(q, req, bio); /* * After dropping the lock and possibly sleeping here, our request diff --git a/block/blk-mq.c b/block/blk-mq.c index ea2a226457fa5c..73a43b81b17dcc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -34,8 +34,8 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-stat.h" -#include "blk-wbt.h" #include "blk-mq-sched.h" +#include "blk-rq-qos.h" static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); @@ -1790,7 +1790,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; - unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1806,19 +1805,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - wb_acct = rq_qos_throttle(q, bio, NULL); + rq_qos_throttle(q, bio, NULL); trace_block_getrq(q, bio, bio->bi_opf); rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { - rq_qos_cleanup(q, wb_acct); + rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); return BLK_QC_T_NONE; } - wbt_track(rq, wb_acct); + rq_qos_track(q, rq, bio); cookie = request_to_qc_t(data.hctx, rq); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d2f2af8aa10c33..b7b02e04f64f65 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -1,7 +1,5 @@ #include "blk-rq-qos.h" -#include "blk-wbt.h" - /* * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. 
@@ -29,13 +27,13 @@ bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit) return atomic_inc_below(&rq_wait->inflight, limit); } -void rq_qos_cleanup(struct request_queue *q, enum wbt_flags wb_acct) +void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { struct rq_qos *rqos; for (rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->ops->cleanup) - rqos->ops->cleanup(rqos, wb_acct); + rqos->ops->cleanup(rqos, bio); } } @@ -69,17 +67,25 @@ void rq_qos_requeue(struct request_queue *q, struct request *rq) } } -enum wbt_flags rq_qos_throttle(struct request_queue *q, struct bio *bio, - spinlock_t *lock) +void rq_qos_throttle(struct request_queue *q, struct bio *bio, + spinlock_t *lock) { struct rq_qos *rqos; - enum wbt_flags flags = 0; for(rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->ops->throttle) - flags |= rqos->ops->throttle(rqos, bio, lock); + rqos->ops->throttle(rqos, bio, lock); + } +} + +void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->track) + rqos->ops->track(rqos, rq, bio); } - return flags; } /* diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index f9a39bd6ece309..a6d13b8ce0dcb0 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -25,12 +25,12 @@ struct rq_qos { }; struct rq_qos_ops { - enum wbt_flags (*throttle)(struct rq_qos *, struct bio *, - spinlock_t *); + void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); + void (*track)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); - void (*cleanup)(struct rq_qos *, enum wbt_flags); + void (*cleanup)(struct rq_qos *, struct bio *); void (*exit)(struct rq_qos *); }; @@ -97,10 +97,11 @@ void rq_depth_scale_up(struct rq_depth *rqd); void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); -void rq_qos_cleanup(struct request_queue *, enum wbt_flags); +void rq_qos_cleanup(struct request_queue *, struct bio *); void rq_qos_done(struct request_queue *, struct request *); void rq_qos_issue(struct request_queue *, struct request *); void rq_qos_requeue(struct request_queue *, struct request *); -enum wbt_flags rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); +void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); +void rq_qos_track(struct request_queue *q, struct request *, struct bio *); void rq_qos_exit(struct request_queue *); #endif diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 6fe20fb823e4d3..461a9af11efe00 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -549,41 +549,66 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) } } +static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) +{ + enum wbt_flags flags = 0; + + if (bio_op(bio) == REQ_OP_READ) { + flags = WBT_READ; + } else if (wbt_should_throttle(rwb, bio)) { + if (current_is_kswapd()) + flags |= WBT_KSWAPD; + if (bio_op(bio) == REQ_OP_DISCARD) + flags |= WBT_DISCARD; + flags |= WBT_TRACKED; + } + return flags; +} + +static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); + __wbt_done(rqos, flags); +} + /* * Returns true if the IO request should be accounted, false if not. 
* May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. */ -static enum wbt_flags wbt_wait(struct rq_qos *rqos, struct bio *bio, - spinlock_t *lock) +static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) { struct rq_wb *rwb = RQWB(rqos); - enum wbt_flags ret = 0; + enum wbt_flags flags; if (!rwb_enabled(rwb)) - return 0; + return; - if (bio_op(bio) == REQ_OP_READ) - ret = WBT_READ; + flags = bio_to_wbt_flags(rwb, bio); if (!wbt_should_throttle(rwb, bio)) { - if (ret & WBT_READ) + if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); - return ret; + return; } if (current_is_kswapd()) - ret |= WBT_KSWAPD; + flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) - ret |= WBT_DISCARD; + flags |= WBT_DISCARD; - __wbt_wait(rwb, ret, bio->bi_opf, lock); + __wbt_wait(rwb, flags, bio->bi_opf, lock); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); +} - return ret | WBT_TRACKED; +static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); } void wbt_issue(struct rq_qos *rqos, struct request *rq) @@ -707,9 +732,10 @@ EXPORT_SYMBOL_GPL(wbt_disable_default); static struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, + .track = wbt_track, .requeue = wbt_requeue, .done = wbt_done, - .cleanup = __wbt_done, + .cleanup = wbt_cleanup, .exit = wbt_exit, }; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 53b20a58c0a212..f47218d5b3b208 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -87,11 +87,6 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) #ifdef CONFIG_BLK_WBT -static inline void wbt_track(struct request *rq, enum wbt_flags flags) -{ - rq->wbt_flags |= flags; -} - int wbt_init(struct request_queue *); void wbt_update_limits(struct request_queue *); void wbt_disable_default(struct request_queue *); From 67b42d0bf7a8fd1ec0cf1acdc9550e688d7c8578 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:15:00 -0400 Subject: [PATCH 040/190] rq-qos: introduce dio_bio callback wbt cares only about request completion time, but controllers may need information that is on the bio itself, so add a done_bio callback for rq-qos so things like blk-iolatency can use it to have the bio when it completes. 
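As a rough usage sketch (not part of the diffs below; the example_* names are hypothetical), a policy consumes the new hook by filling in .done_bio in its rq_qos_ops and chaining itself onto the queue with rq_qos_add(); bio_endio(), patched below, then calls back with the completed bio:

	static void example_done_bio(struct rq_qos *rqos, struct bio *bio)
	{
		/* per-bio completion work, e.g. latency accounting */
	}

	static struct rq_qos_ops example_rqos_ops = {
		.done_bio	= example_done_bio,
		/* a real policy must also set .exit, which rq_qos_exit() calls unconditionally */
	};

	/* at init time: rqos->ops = &example_rqos_ops; rq_qos_add(q, rqos); */

blk-iolatency, added in the following patch, is the first in-tree user.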
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/bio.c | 4 ++++ block/blk-rq-qos.c | 10 ++++++++++ block/blk-rq-qos.h | 2 ++ 3 files changed, 16 insertions(+) diff --git a/block/bio.c b/block/bio.c index 5f84f5c3887b79..f3536bfc82989f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -32,6 +32,7 @@ #include #include "blk.h" +#include "blk-rq-qos.h" /* * Test patch to inline a certain number of bi_io_vec's inside the bio @@ -1808,6 +1809,9 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; + if (bio->bi_disk) + rq_qos_done_bio(bio->bi_disk->queue, bio); + /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index b7b02e04f64f65..5134b24482f618 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -88,6 +88,16 @@ void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) } } +void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->done_bio) + rqos->ops->done_bio(rqos, bio); + } +} + /* * Return true, if we can't increase the depth further by scaling */ diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index a6d13b8ce0dcb0..d5e2f68fe41e00 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -30,6 +30,7 @@ struct rq_qos_ops { void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); + void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); void (*exit)(struct rq_qos *); }; @@ -101,6 +102,7 @@ void rq_qos_cleanup(struct request_queue *, struct bio *); void rq_qos_done(struct request_queue *, struct request *); void rq_qos_issue(struct request_queue *, struct request *); void rq_qos_requeue(struct request_queue *, struct request *); +void rq_qos_done_bio(struct request_queue *q, struct bio *bio); void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); void rq_qos_track(struct request_queue *q, struct request *, struct bio *); void rq_qos_exit(struct request_queue *); From d70675121546c35feaceebf7ed9caed8716640f3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:15:01 -0400 Subject: [PATCH 041/190] block: introduce blk-iolatency io controller Current IO controllers for the block layer are less than ideal for our use case. The io.max controller is great at hard limiting, but it is not work conserving. This patch introduces io.latency. You provide a latency target for your group and we monitor the io in short windows to make sure we are not exceeding those latency targets. This makes use of the rq-qos infrastructure and works much like the wbt stuff. There are a few differences from wbt - It's bio based, so the latency covers the whole block layer in addition to the actual io. - We will throttle all IO types that comes in here if we need to. - We use the mean latency over the 100ms window. This is because writes can be particularly fast, which could give us a false sense of the impact of other workloads on our protected workload. - By default there's no throttling, we set the queue_depth to INT_MAX so that we can have as many outstanding bio's as we're allowed to. Only at throttle time do we pay attention to the actual queue depth. 
- We backcharge cgroups for root cg issued IO and induce artificial delays in order to deal with cases like metadata only or swap heavy workloads. In testing this has worked out relatively well. Protected workloads will throttle noisy workloads down to 1 io at time if they are doing normal IO on their own, or induce up to a 1 second delay per syscall if they are doing a lot of root issued IO (metadata/swap IO). Our testing has revolved mostly around our production web servers where we have hhvm (the web server application) in a protected group and everything else in another group. We see slightly higher requests per second (RPS) on the test tier vs the control tier, and much more stable RPS across all machines in the test tier vs the control tier. Another test we run is a slow memory allocator in the unprotected group. Before this would eventually push us into swap and cause the whole box to die and not recover at all. With these patches we see slight RPS drops (usually 10-15%) before the memory consumer is properly killed and things recover within seconds. Signed-off-by: Josef Bacik Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 1 + block/blk-cgroup.c | 8 + block/blk-iolatency.c | 930 ++++++++++++++++++++++++++++++++++++++ block/blk.h | 6 + include/linux/blk_types.h | 2 - 6 files changed, 957 insertions(+), 2 deletions(-) create mode 100644 block/blk-iolatency.c diff --git a/block/Kconfig b/block/Kconfig index dfe7bc770fc9ab..1f2469a0123ceb 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -149,6 +149,18 @@ config BLK_WBT dynamically on an algorithm loosely based on CoDel, factoring in the realtime performance of the disk. +config BLK_CGROUP_IOLATENCY + bool "Enable support for latency based cgroup IO protection" + depends on BLK_CGROUP=y + default n + ---help--- + Enabling this option enables the .latency interface for IO throttling. + The IO controller will attempt to maintain average IO latencies below + the configured latency target, throttling anybody with a higher latency + target than the victimized group. + + Note, this is an experimental interface and could be changed someday. 
+ config BLK_WBT_SQ bool "Single queue writeback throttling" default n diff --git a/block/Makefile b/block/Makefile index 57d0f47ab05f94..572b33f32c07cf 100644 --- a/block/Makefile +++ b/block/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o +obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d3310ec96c2abc..7e2c19ce1a08ee 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1238,6 +1238,14 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); + ret = blk_iolatency_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + return ret; + } + ret = blk_throtl_init(q); if (ret) { spin_lock_irq(q->queue_lock); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c new file mode 100644 index 00000000000000..a35a1f58033765 --- /dev/null +++ b/block/blk-iolatency.c @@ -0,0 +1,930 @@ +/* + * Block rq-qos base io controller + * + * This works similar to wbt with a few exceptions + * + * - It's bio based, so the latency covers the whole block layer in addition to + * the actual io. + * - We will throttle all IO that comes in here if we need to. + * - We use the mean latency over the 100ms window. This is because writes can + * be particularly fast, which could give us a false sense of the impact of + * other workloads on our protected workload. + * - By default there's no throttling, we set the queue_depth to INT_MAX so that + * we can have as many outstanding bio's as we're allowed to. Only at + * throttle time do we pay attention to the actual queue depth. + * + * The hierarchy works like the cpu controller does, we track the latency at + * every configured node, and each configured node has it's own independent + * queue depth. This means that we only care about our latency targets at the + * peer level. Some group at the bottom of the hierarchy isn't going to affect + * a group at the end of some other path if we're only configred at leaf level. + * + * Consider the following + * + * root blkg + * / \ + * fast (target=5ms) slow (target=10ms) + * / \ / \ + * a b normal(15ms) unloved + * + * "a" and "b" have no target, but their combined io under "fast" cannot exceed + * an average latency of 5ms. If it does then we will throttle the "slow" + * group. In the case of "normal", if it exceeds its 15ms target, we will + * throttle "unloved", but nobody else. + * + * In this example "fast", "slow", and "normal" will be the only groups actually + * accounting their io latencies. We have to walk up the heirarchy to the root + * on every submit and complete so we can do the appropriate stat recording and + * adjust the queue depth of ourselves if needed. + * + * There are 2 ways we throttle IO. + * + * 1) Queue depth throttling. As we throttle down we will adjust the maximum + * number of IO's we're allowed to have in flight. This starts at (u64)-1 down + * to 1. If the group is only ever submitting IO for itself then this is the + * only way we throttle. + * + * 2) Induced delay throttling. This is for the case that a group is generating + * IO that has to be issued by the root cg to avoid priority inversion. So think + * REQ_META or REQ_SWAP. 
If we are already at qd == 1 and we're getting a lot + * of work done for us on behalf of the root cg and are being asked to scale + * down more then we induce a latency at userspace return. We accumulate the + * total amount of time we need to be punished by doing + * + * total_time += min_lat_nsec - actual_io_completion + * + * and then at throttle time will do + * + * throttle_time = min(total_time, NSEC_PER_SEC) + * + * This induced delay will throttle back the activity that is generating the + * root cg issued io's, wethere that's some metadata intensive operation or the + * group is using so much memory that it is pushing us into swap. + * + * Copyright (C) 2018 Josef Bacik + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk-rq-qos.h" +#include "blk-stat.h" + +#define DEFAULT_SCALE_COOKIE 1000000U + +static struct blkcg_policy blkcg_policy_iolatency; +struct iolatency_grp; + +struct blk_iolatency { + struct rq_qos rqos; + struct timer_list timer; + atomic_t enabled; +}; + +static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) +{ + return container_of(rqos, struct blk_iolatency, rqos); +} + +static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat) +{ + return atomic_read(&blkiolat->enabled) > 0; +} + +struct child_latency_info { + spinlock_t lock; + + /* Last time we adjusted the scale of everybody. */ + u64 last_scale_event; + + /* The latency that we missed. */ + u64 scale_lat; + + /* Total io's from all of our children for the last summation. */ + u64 nr_samples; + + /* The guy who actually changed the latency numbers. */ + struct iolatency_grp *scale_grp; + + /* Cookie to tell if we need to scale up or down. */ + atomic_t scale_cookie; +}; + +struct iolatency_grp { + struct blkg_policy_data pd; + struct blk_rq_stat __percpu *stats; + struct blk_iolatency *blkiolat; + struct rq_depth rq_depth; + struct rq_wait rq_wait; + atomic64_t window_start; + atomic_t scale_cookie; + u64 min_lat_nsec; + u64 cur_win_nsec; + + /* total running average of our io latency. */ + u64 total_lat_avg; + u64 total_lat_nr; + + /* Our current number of IO's for the last summation. */ + u64 nr_samples; + + struct child_latency_info child_lat; +}; + +static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct iolatency_grp, pd) : NULL; +} + +static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg) +{ + return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency)); +} + +static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) +{ + return pd_to_blkg(&iolat->pd); +} + +static inline bool iolatency_may_queue(struct iolatency_grp *iolat, + wait_queue_entry_t *wait, + bool first_block) +{ + struct rq_wait *rqw = &iolat->rq_wait; + + if (first_block && waitqueue_active(&rqw->wait) && + rqw->wait.head.next != &wait->entry) + return false; + return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); +} + +static void __blkcg_iolatency_throttle(struct rq_qos *rqos, + struct iolatency_grp *iolat, + spinlock_t *lock, bool issue_as_root, + bool use_memdelay) + __releases(lock) + __acquires(lock) +{ + struct rq_wait *rqw = &iolat->rq_wait; + unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); + DEFINE_WAIT(wait); + bool first_block = true; + + if (use_delay) + blkcg_schedule_throttle(rqos->q, use_memdelay); + + /* + * To avoid priority inversions we want to just take a slot if we are + * issuing as root. 
If we're being killed off there's no point in + * delaying things, we may have been killed by OOM so throttling may + * make recovery take even longer, so just let the IO's through so the + * task can go away. + */ + if (issue_as_root || fatal_signal_pending(current)) { + atomic_inc(&rqw->inflight); + return; + } + + if (iolatency_may_queue(iolat, &wait, first_block)) + return; + + do { + prepare_to_wait_exclusive(&rqw->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (iolatency_may_queue(iolat, &wait, first_block)) + break; + first_block = false; + + if (lock) { + spin_unlock_irq(lock); + io_schedule(); + spin_lock_irq(lock); + } else { + io_schedule(); + } + } while (1); + + finish_wait(&rqw->wait, &wait); +} + +#define SCALE_DOWN_FACTOR 2 +#define SCALE_UP_FACTOR 4 + +static inline unsigned long scale_amount(unsigned long qd, bool up) +{ + return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL); +} + +/* + * We scale the qd down faster than we scale up, so we need to use this helper + * to adjust the scale_cookie accordingly so we don't prematurely get + * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much. + * + * Each group has their own local copy of the last scale cookie they saw, so if + * the global scale cookie goes up or down they know which way they need to go + * based on their last knowledge of it. + */ +static void scale_cookie_change(struct blk_iolatency *blkiolat, + struct child_latency_info *lat_info, + bool up) +{ + unsigned long qd = blk_queue_depth(blkiolat->rqos.q); + unsigned long scale = scale_amount(qd, up); + unsigned long old = atomic_read(&lat_info->scale_cookie); + unsigned long max_scale = qd << 1; + unsigned long diff = 0; + + if (old < DEFAULT_SCALE_COOKIE) + diff = DEFAULT_SCALE_COOKIE - old; + + if (up) { + if (scale + old > DEFAULT_SCALE_COOKIE) + atomic_set(&lat_info->scale_cookie, + DEFAULT_SCALE_COOKIE); + else if (diff > qd) + atomic_inc(&lat_info->scale_cookie); + else + atomic_add(scale, &lat_info->scale_cookie); + } else { + /* + * We don't want to dig a hole so deep that it takes us hours to + * dig out of it. Just enough that we don't throttle/unthrottle + * with jagged workloads but can still unthrottle once pressure + * has sufficiently dissipated. + */ + if (diff > qd) { + if (diff < max_scale) + atomic_dec(&lat_info->scale_cookie); + } else { + atomic_sub(scale, &lat_info->scale_cookie); + } + } +} + +/* + * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the + * queue depth at a time so we don't get wild swings and hopefully dial in to + * fairer distribution of the overall queue depth. + */ +static void scale_change(struct iolatency_grp *iolat, bool up) +{ + unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q); + unsigned long scale = scale_amount(qd, up); + unsigned long old = iolat->rq_depth.max_depth; + bool changed = false; + + if (old > qd) + old = qd; + + if (up) { + if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat))) + return; + + if (old < qd) { + changed = true; + old += scale; + old = min(old, qd); + iolat->rq_depth.max_depth = old; + wake_up_all(&iolat->rq_wait.wait); + } + } else if (old > 1) { + old >>= 1; + changed = true; + iolat->rq_depth.max_depth = max(old, 1UL); + } +} + +/* Check our parent and see if the scale cookie has changed. 
*/ +static void check_scale_change(struct iolatency_grp *iolat) +{ + struct iolatency_grp *parent; + struct child_latency_info *lat_info; + unsigned int cur_cookie; + unsigned int our_cookie = atomic_read(&iolat->scale_cookie); + u64 scale_lat; + unsigned int old; + int direction = 0; + + if (lat_to_blkg(iolat)->parent == NULL) + return; + + parent = blkg_to_lat(lat_to_blkg(iolat)->parent); + if (!parent) + return; + + lat_info = &parent->child_lat; + cur_cookie = atomic_read(&lat_info->scale_cookie); + scale_lat = READ_ONCE(lat_info->scale_lat); + + if (cur_cookie < our_cookie) + direction = -1; + else if (cur_cookie > our_cookie) + direction = 1; + else + return; + + old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie); + + /* Somebody beat us to the punch, just bail. */ + if (old != our_cookie) + return; + + if (direction < 0 && iolat->min_lat_nsec) { + u64 samples_thresh; + + if (!scale_lat || iolat->min_lat_nsec <= scale_lat) + return; + + /* + * Sometimes high priority groups are their own worst enemy, so + * instead of taking it out on some poor other group that did 5% + * or less of the IO's for the last summation just skip this + * scale down event. + */ + samples_thresh = lat_info->nr_samples * 5; + samples_thresh = div64_u64(samples_thresh, 100); + if (iolat->nr_samples <= samples_thresh) + return; + } + + /* We're as low as we can go. */ + if (iolat->rq_depth.max_depth == 1 && direction < 0) { + blkcg_use_delay(lat_to_blkg(iolat)); + return; + } + + /* We're back to the default cookie, unthrottle all the things. */ + if (cur_cookie == DEFAULT_SCALE_COOKIE) { + blkcg_clear_delay(lat_to_blkg(iolat)); + iolat->rq_depth.max_depth = INT_MAX; + wake_up_all(&iolat->rq_wait.wait); + return; + } + + scale_change(iolat, direction > 0); +} + +static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, + spinlock_t *lock) +{ + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + struct blkcg *blkcg; + struct blkcg_gq *blkg; + struct request_queue *q = rqos->q; + bool issue_as_root = bio_issue_as_root_blkg(bio); + + if (!blk_iolatency_enabled(blkiolat)) + return; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + bio_associate_blkcg(bio, &blkcg->css); + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + if (!lock) + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + if (!lock) + spin_unlock_irq(q->queue_lock); + } + if (!blkg) + goto out; + + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio_associate_blkg(bio, blkg); +out: + rcu_read_unlock(); + while (blkg && blkg->parent) { + struct iolatency_grp *iolat = blkg_to_lat(blkg); + if (!iolat) { + blkg = blkg->parent; + continue; + } + + check_scale_change(iolat); + __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); + blkg = blkg->parent; + } + if (!timer_pending(&blkiolat->timer)) + mod_timer(&blkiolat->timer, jiffies + HZ); +} + +static void iolatency_record_time(struct iolatency_grp *iolat, + struct bio_issue *issue, u64 now, + bool issue_as_root) +{ + struct blk_rq_stat *rq_stat; + u64 start = bio_issue_time(issue); + u64 req_time; + + if (now <= start) + return; + + req_time = now - start; + + /* + * We don't want to count issue_as_root bio's in the cgroups latency + * statistics as it could skew the numbers downwards. 
+ */ + if (unlikely(issue_as_root && iolat->rq_depth.max_depth != (u64)-1)) { + u64 sub = iolat->min_lat_nsec; + if (req_time < sub) + blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); + return; + } + + rq_stat = get_cpu_ptr(iolat->stats); + blk_rq_stat_add(rq_stat, req_time); + put_cpu_ptr(rq_stat); +} + +#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) +#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5 + +static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) +{ + struct blkcg_gq *blkg = lat_to_blkg(iolat); + struct iolatency_grp *parent; + struct child_latency_info *lat_info; + struct blk_rq_stat stat; + unsigned long flags; + int cpu; + + blk_rq_stat_init(&stat); + preempt_disable(); + for_each_online_cpu(cpu) { + struct blk_rq_stat *s; + s = per_cpu_ptr(iolat->stats, cpu); + blk_rq_stat_sum(&stat, s); + blk_rq_stat_init(s); + } + preempt_enable(); + + /* + * Our average exceeded our window, scale up our window so we are more + * accurate, but not more than the global timer. + */ + if (stat.mean > iolat->cur_win_nsec) { + iolat->cur_win_nsec <<= 1; + iolat->cur_win_nsec = + max_t(u64, iolat->cur_win_nsec, NSEC_PER_SEC); + } + + parent = blkg_to_lat(blkg->parent); + if (!parent) + return; + + lat_info = &parent->child_lat; + + iolat->total_lat_avg = + div64_u64((iolat->total_lat_avg * iolat->total_lat_nr) + + stat.mean, iolat->total_lat_nr + 1); + + iolat->total_lat_nr++; + + /* Everything is ok and we don't need to adjust the scale. */ + if (stat.mean <= iolat->min_lat_nsec && + atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) + return; + + /* Somebody beat us to the punch, just bail. */ + spin_lock_irqsave(&lat_info->lock, flags); + lat_info->nr_samples -= iolat->nr_samples; + lat_info->nr_samples += stat.nr_samples; + iolat->nr_samples = stat.nr_samples; + + if ((lat_info->last_scale_event >= now || + now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) && + lat_info->scale_lat <= iolat->min_lat_nsec) + goto out; + + if (stat.mean <= iolat->min_lat_nsec && + stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) { + if (lat_info->scale_grp == iolat) { + lat_info->last_scale_event = now; + scale_cookie_change(iolat->blkiolat, lat_info, true); + } + } else if (stat.mean > iolat->min_lat_nsec) { + lat_info->last_scale_event = now; + if (!lat_info->scale_grp || + lat_info->scale_lat > iolat->min_lat_nsec) { + WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec); + lat_info->scale_grp = iolat; + } + scale_cookie_change(iolat->blkiolat, lat_info, false); + } +out: + spin_unlock_irqrestore(&lat_info->lock, flags); +} + +static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg; + struct rq_wait *rqw; + struct iolatency_grp *iolat; + u64 window_start; + u64 now = ktime_to_ns(ktime_get()); + bool issue_as_root = bio_issue_as_root_blkg(bio); + bool enabled = false; + + blkg = bio->bi_blkg; + if (!blkg) + return; + + iolat = blkg_to_lat(bio->bi_blkg); + if (!iolat) + return; + + enabled = blk_iolatency_enabled(iolat->blkiolat); + while (blkg && blkg->parent) { + iolat = blkg_to_lat(blkg); + if (!iolat) { + blkg = blkg->parent; + continue; + } + rqw = &iolat->rq_wait; + + atomic_dec(&rqw->inflight); + if (!enabled || iolat->min_lat_nsec == 0) + goto next; + iolatency_record_time(iolat, &bio->bi_issue, now, + issue_as_root); + window_start = atomic64_read(&iolat->window_start); + if (now > window_start && + (now - window_start) >= iolat->cur_win_nsec) { + if (atomic64_cmpxchg(&iolat->window_start, + 
window_start, now) == window_start) + iolatency_check_latencies(iolat, now); + } +next: + wake_up(&rqw->wait); + blkg = blkg->parent; + } +} + +static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg; + + blkg = bio->bi_blkg; + while (blkg && blkg->parent) { + struct rq_wait *rqw; + struct iolatency_grp *iolat; + + iolat = blkg_to_lat(blkg); + if (!iolat) + goto next; + + rqw = &iolat->rq_wait; + atomic_dec(&rqw->inflight); + wake_up(&rqw->wait); +next: + blkg = blkg->parent; + } +} + +static void blkcg_iolatency_exit(struct rq_qos *rqos) +{ + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + + del_timer_sync(&blkiolat->timer); + blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); + kfree(blkiolat); +} + +static struct rq_qos_ops blkcg_iolatency_ops = { + .throttle = blkcg_iolatency_throttle, + .cleanup = blkcg_iolatency_cleanup, + .done_bio = blkcg_iolatency_done_bio, + .exit = blkcg_iolatency_exit, +}; + +static void blkiolatency_timer_fn(struct timer_list *t) +{ + struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); + struct blkcg_gq *blkg; + struct cgroup_subsys_state *pos_css; + u64 now = ktime_to_ns(ktime_get()); + + rcu_read_lock(); + blkg_for_each_descendant_pre(blkg, pos_css, + blkiolat->rqos.q->root_blkg) { + struct iolatency_grp *iolat; + struct child_latency_info *lat_info; + unsigned long flags; + u64 cookie; + + /* + * We could be exiting, don't access the pd unless we have a + * ref on the blkg. + */ + if (!blkg_try_get(blkg)) + continue; + + iolat = blkg_to_lat(blkg); + if (!iolat) + continue; + + lat_info = &iolat->child_lat; + cookie = atomic_read(&lat_info->scale_cookie); + + if (cookie >= DEFAULT_SCALE_COOKIE) + goto next; + + spin_lock_irqsave(&lat_info->lock, flags); + if (lat_info->last_scale_event >= now) + goto next_lock; + + /* + * We scaled down but don't have a scale_grp, scale up and carry + * on. + */ + if (lat_info->scale_grp == NULL) { + scale_cookie_change(iolat->blkiolat, lat_info, true); + goto next_lock; + } + + /* + * It's been 5 seconds since our last scale event, clear the + * scale grp in case the group that needed the scale down isn't + * doing any IO currently. 
+ */ + if (now - lat_info->last_scale_event >= + ((u64)NSEC_PER_SEC * 5)) + lat_info->scale_grp = NULL; +next_lock: + spin_unlock_irqrestore(&lat_info->lock, flags); +next: + blkg_put(blkg); + } + rcu_read_unlock(); +} + +int blk_iolatency_init(struct request_queue *q) +{ + struct blk_iolatency *blkiolat; + struct rq_qos *rqos; + int ret; + + blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL); + if (!blkiolat) + return -ENOMEM; + + rqos = &blkiolat->rqos; + rqos->id = RQ_QOS_CGROUP; + rqos->ops = &blkcg_iolatency_ops; + rqos->q = q; + + rq_qos_add(q, rqos); + + ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); + if (ret) { + rq_qos_del(q, rqos); + kfree(blkiolat); + return ret; + } + + timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); + + return 0; +} + +static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) +{ + struct iolatency_grp *iolat = blkg_to_lat(blkg); + struct blk_iolatency *blkiolat = iolat->blkiolat; + u64 oldval = iolat->min_lat_nsec; + + iolat->min_lat_nsec = val; + iolat->cur_win_nsec = max_t(u64, val << 4, 100 * NSEC_PER_MSEC); + iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, NSEC_PER_SEC); + + if (!oldval && val) + atomic_inc(&blkiolat->enabled); + if (oldval && !val) + atomic_dec(&blkiolat->enabled); +} + +static void iolatency_clear_scaling(struct blkcg_gq *blkg) +{ + if (blkg->parent) { + struct iolatency_grp *iolat = blkg_to_lat(blkg->parent); + struct child_latency_info *lat_info; + if (!iolat) + return; + + lat_info = &iolat->child_lat; + spin_lock(&lat_info->lock); + atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE); + lat_info->last_scale_event = 0; + lat_info->scale_grp = NULL; + lat_info->scale_lat = 0; + spin_unlock(&lat_info->lock); + } +} + +static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkcg_gq *blkg; + struct blk_iolatency *blkiolat; + struct blkg_conf_ctx ctx; + struct iolatency_grp *iolat; + char *p, *tok; + u64 lat_val = 0; + u64 oldval; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); + if (ret) + return ret; + + iolat = blkg_to_lat(ctx.blkg); + blkiolat = iolat->blkiolat; + p = ctx.body; + + ret = -EINVAL; + while ((tok = strsep(&p, " "))) { + char key[16]; + char val[21]; /* 18446744073709551616 */ + + if (sscanf(tok, "%15[^=]=%20s", key, val) != 2) + goto out; + + if (!strcmp(key, "target")) { + u64 v; + + if (!strcmp(val, "max")) + lat_val = 0; + else if (sscanf(val, "%llu", &v) == 1) + lat_val = v * NSEC_PER_USEC; + else + goto out; + } else { + goto out; + } + } + + /* Walk up the tree to see if our new val is lower than it should be. 
*/ + blkg = ctx.blkg; + oldval = iolat->min_lat_nsec; + + iolatency_set_min_lat_nsec(blkg, lat_val); + if (oldval != iolat->min_lat_nsec) { + iolatency_clear_scaling(blkg); + } + + ret = 0; +out: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static u64 iolatency_prfill_limit(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + const char *dname = blkg_dev_name(pd->blkg); + + if (!dname || !iolat->min_lat_nsec) + return 0; + seq_printf(sf, "%s target=%llu\n", + dname, + (unsigned long long)iolat->min_lat_nsec / NSEC_PER_USEC); + return 0; +} + +static int iolatency_print_limit(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + iolatency_prfill_limit, + &blkcg_policy_iolatency, seq_cft(sf)->private, false); + return 0; +} + +static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, + size_t size) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + unsigned long long avg_lat = div64_u64(iolat->total_lat_avg, NSEC_PER_USEC); + + if (iolat->rq_depth.max_depth == (u64)-1) + return scnprintf(buf, size, " depth=max avg_lat=%llu", + avg_lat); + + return scnprintf(buf, size, " depth=%u avg_lat=%llu", + iolat->rq_depth.max_depth, avg_lat); +} + + +static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node) +{ + struct iolatency_grp *iolat; + + iolat = kzalloc_node(sizeof(*iolat), gfp, node); + if (!iolat) + return NULL; + iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat), + __alignof__(struct blk_rq_stat), gfp); + if (!iolat->stats) { + kfree(iolat); + return NULL; + } + return &iolat->pd; +} + +static void iolatency_pd_init(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + struct blkcg_gq *blkg = lat_to_blkg(iolat); + struct rq_qos *rqos = blkcg_rq_qos(blkg->q); + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + for_each_possible_cpu(cpu) { + struct blk_rq_stat *stat; + stat = per_cpu_ptr(iolat->stats, cpu); + blk_rq_stat_init(stat); + } + + rq_wait_init(&iolat->rq_wait); + spin_lock_init(&iolat->child_lat.lock); + iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q); + iolat->rq_depth.max_depth = INT_MAX; + iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; + iolat->blkiolat = blkiolat; + iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; + atomic64_set(&iolat->window_start, now); + + /* + * We init things in list order, so the pd for the parent may not be + * init'ed yet for whatever reason. 
+ */ + if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) { + struct iolatency_grp *parent = blkg_to_lat(blkg->parent); + atomic_set(&iolat->scale_cookie, + atomic_read(&parent->child_lat.scale_cookie)); + } else { + atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE); + } + + atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE); +} + +static void iolatency_pd_offline(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + struct blkcg_gq *blkg = lat_to_blkg(iolat); + + iolatency_set_min_lat_nsec(blkg, 0); + iolatency_clear_scaling(blkg); +} + +static void iolatency_pd_free(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + free_percpu(iolat->stats); + kfree(iolat); +} + +static struct cftype iolatency_files[] = { + { + .name = "latency", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = iolatency_print_limit, + .write = iolatency_set_limit, + }, + {} +}; + +static struct blkcg_policy blkcg_policy_iolatency = { + .dfl_cftypes = iolatency_files, + .pd_alloc_fn = iolatency_pd_alloc, + .pd_init_fn = iolatency_pd_init, + .pd_offline_fn = iolatency_pd_offline, + .pd_free_fn = iolatency_pd_free, + .pd_stat_fn = iolatency_pd_stat, +}; + +static int __init iolatency_init(void) +{ + return blkcg_policy_register(&blkcg_policy_iolatency); +} + +static void __exit iolatency_exit(void) +{ + return blkcg_policy_unregister(&blkcg_policy_iolatency); +} + +module_init(iolatency_init); +module_exit(iolatency_exit); diff --git a/block/blk.h b/block/blk.h index 8d23aea96ce9e7..69b14cd2bb225f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -412,4 +412,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) extern void blk_drain_queue(struct request_queue *q); +#ifdef CONFIG_BLK_CGROUP_IOLATENCY +extern int blk_iolatency_init(struct request_queue *q); +#else +static inline int blk_iolatency_init(struct request_queue *q) { return 0; } +#endif + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0ffc34c5cc83a9..e13449a379a172 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -180,9 +180,7 @@ struct bio { struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; struct blkcg_gq *bi_blkg; -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW struct bio_issue bi_issue; -#endif #endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) From b351f0c76c3eb94c9ccfb68d0b23899a35e47f27 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:15:02 -0400 Subject: [PATCH 042/190] Documentation: add a doc for blk-iolatency A basic documentation to describe the interface, statistics, and behavior of io.latency. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 79 +++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8a2c52d5c53b7a..569ce27b85e53f 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -51,6 +51,9 @@ v1 is available under Documentation/cgroup-v1/. 5-3. IO 5-3-1. IO Interface Files 5-3-2. Writeback + 5-3-3. IO Latency + 5-3-3-1. How IO Latency Throttling Works + 5-3-3-2. IO Latency Interface Files 5-4. PID 5-4-1. PID Interface Files 5-5. Device @@ -1446,6 +1449,82 @@ writeback as follows. vm.dirty[_background]_ratio. +IO Latency +~~~~~~~~~~ + +This is a cgroup v2 controller for IO workload protection. 
You provide a group +with a latency target, and if the average latency exceeds that target the +controller will throttle any peers that have a lower latency target than the +protected workload. + +The limits are only applied at the peer level in the hierarchy. This means that +in the diagram below, only groups A, B, and C will influence each other, and +groups D and F will influence each other. Group G will influence nobody. + + [root] + / | \ + A B C + / \ | + D F G + + +So the ideal way to configure this is to set io.latency in groups A, B, and C. +Generally you do not want to set a value lower than the latency your device +supports. Experiment to find the value that works best for your workload. +Start at higher than the expected latency for your device and watch the +total_lat_avg value in io.stat for your workload group to get an idea of the +latency you see during normal operation. Use this value as a basis for your +real setting, setting at 10-15% higher than the value in io.stat. +Experimentation is key here because total_lat_avg is a running total, so is the +"statistics" portion of "lies, damned lies, and statistics." + +How IO Latency Throttling Works +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +io.latency is work conserving; so as long as everybody is meeting their latency +target the controller doesn't do anything. Once a group starts missing its +target it begins throttling any peer group that has a higher target than itself. +This throttling takes 2 forms: + +- Queue depth throttling. This is the number of outstanding IO's a group is + allowed to have. We will clamp down relatively quickly, starting at no limit + and going all the way down to 1 IO at a time. + +- Artificial delay induction. There are certain types of IO that cannot be + throttled without possibly adversely affecting higher priority groups. This + includes swapping and metadata IO. These types of IO are allowed to occur + normally, however they are "charged" to the originating group. If the + originating group is being throttled you will see the use_delay and delay + fields in io.stat increase. The delay value is how many microseconds that are + being added to any process that runs in this group. Because this number can + grow quite large if there is a lot of swapping or metadata IO occurring we + limit the individual delay events to 1 second at a time. + +Once the victimized group starts meeting its latency target again it will start +unthrottling any peer groups that were throttled previously. If the victimized +group simply stops doing IO the global counter will unthrottle appropriately. + +IO Latency Interface Files +~~~~~~~~~~~~~~~~~~~~~~~~~~ + + io.latency + This takes a similar format as the other controllers. + + "MAJOR:MINOR target= Date: Tue, 3 Jul 2018 11:15:03 -0400 Subject: [PATCH 043/190] mm: skip readahead if the cgroup is congested We noticed in testing we'd get pretty bad latency stalls under heavy pressure because read ahead would try to do its thing while the cgroup was under severe pressure. If we're under this much pressure we want to do as little IO as possible so we can still make progress on real work if we're a throttled cgroup, so just skip readahead if our group is under pressure. 
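As a rough illustration of the pattern this applies (not the readahead hunk itself, which follows below), best-effort IO paths simply bail out when the issuing task's blkcg is already being throttled. prefetch_blocks() is a made-up helper used only for the sketch; blk_cgroup_congested() is the real interface this patch relies on:

	#include <linux/fs.h>
	#include <linux/blk-cgroup.h>

	/*
	 * Hypothetical speculative read path: drop the extra IO entirely when
	 * the current cgroup is congested, since it would only add to the
	 * pressure the group is already being charged for.
	 */
	static void prefetch_blocks(struct address_space *mapping,
				    pgoff_t index, unsigned long nr)
	{
		if (blk_cgroup_congested())
			return;		/* best-effort IO, safe to skip */

		/* ... issue the readahead-style IO as before ... */
	}

Required reads are untouched; only IO the caller can afford to lose is skipped, which is why readahead is the natural first user.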
Signed-off-by: Josef Bacik Acked-by: Tejun Heo Acked-by: Andrew Morton Signed-off-by: Jens Axboe --- mm/readahead.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/readahead.c b/mm/readahead.c index e273f0de337699..9f62b71511001a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" @@ -505,6 +506,9 @@ void page_cache_sync_readahead(struct address_space *mapping, if (!ra->ra_pages) return; + if (blk_cgroup_congested()) + return; + /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { force_page_cache_readahead(mapping, filp, offset, req_size); @@ -555,6 +559,9 @@ page_cache_async_readahead(struct address_space *mapping, if (inode_read_congested(mapping->host)) return; + if (blk_cgroup_congested()) + return; + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); } From 00a8cdb84fcb64c7f9f1061298ff676a96dfaf41 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 6 Jul 2018 03:07:13 +0800 Subject: [PATCH 044/190] null_blk: remove NULLB_DEV_FL_CONFIGURED on turning off nullb device Currently mbps knob could only be set once before switching power knob to on, after power knob has been set at least once, there is no way to set mbps knob again due to -EBUSY. As nullb is mainly used for testing, in order to make it flexible, this removes the flag NULLB_DEV_FL_CONFIGURED so that mbps knob can be reset when power knob is off, e.g. echo 0 > /config/nullb/a/power echo 40 > /config/nullb/a/mbps echo 1 > /config/nullb/a/power So does other knobs under /config/nullb/a. Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 042c778e5a4e0b..8abfb1059909b5 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -390,6 +390,7 @@ static ssize_t nullb_device_power_store(struct config_item *item, null_del_dev(dev->nullb); mutex_unlock(&lock); clear_bit(NULLB_DEV_FL_UP, &dev->flags); + clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); } return count; From c137969bd4c5b2a67803d10518e3d60d3b13d0cb Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 3 Jul 2018 10:14:46 -0700 Subject: [PATCH 045/190] block, mm: remove unnecessary __GFP_HIGH flag The flag GFP_ATOMIC already contains __GFP_HIGH. There is no need to explicitly or __GFP_HIGH again. So, just remove unnecessary __GFP_HIGH. Signed-off-by: Shakeel Butt Signed-off-by: Jens Axboe --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f23311e4b201fe..01580f88fcb39f 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -278,7 +278,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); - INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); + INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); From b88aef36b87c9787a4db724923ec4f57dfd513f3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 3 Jul 2018 13:34:22 -0400 Subject: [PATCH 046/190] block: fix infinite loop if the device loses discard capability If __blkdev_issue_discard is in progress and a device mapper device is reloaded with a table that doesn't support discard, q->limits.max_discard_sectors is set to zero. This results in infinite loop in __blkdev_issue_discard. 
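To see why, the splitting loop in __blkdev_issue_discard() boils down to the simplified sketch below (bio allocation, chaining and the UINT_MAX clamp elided). Once max_discard_sectors reads back as zero, req_sects is always zero, nr_sects never shrinks, and the loop spins forever:

	while (nr_sects) {
		/* becomes 0 once the new table has no discard support */
		sector_t req_sects = min_t(sector_t, nr_sects,
					   q->limits.max_discard_sectors);

		/* ... allocate a bio covering req_sects sectors ... */

		nr_sects -= req_sects;	/* no progress when req_sects == 0 */
		sector += req_sects;
	}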
This patch checks if max_discard_sectors is zero and aborts with -EOPNOTSUPP. Signed-off-by: Mikulas Patocka Tested-by: Zdenek Kabelac Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-lib.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/blk-lib.c b/block/blk-lib.c index 8faa70f26fcd15..d1b9dd03da256f 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -68,6 +68,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, */ req_sects = min_t(sector_t, nr_sects, q->limits.max_discard_sectors); + if (!req_sects) + goto fail; if (req_sects > UINT_MAX >> 9) req_sects = UINT_MAX >> 9; @@ -105,6 +107,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, *biop = bio; return 0; + +fail: + if (bio) { + submit_bio_wait(bio); + bio_put(bio); + } + *biop = NULL; + return -EOPNOTSUPP; } EXPORT_SYMBOL(__blkdev_issue_discard); From e9a83853302b339e63dea4072f6210e5a88ab4bb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 6 Jul 2018 10:49:35 +0200 Subject: [PATCH 047/190] block: Add default switch case to blk_pm_allow_request() to kill warning With gcc 4.9.0 and 7.3.0: block/blk-core.c: In function 'blk_pm_allow_request': block/blk-core.c:2747:2: warning: enumeration value 'RPM_ACTIVE' not handled in switch [-Wswitch] switch (rq->q->rpm_status) { ^ Convert the return statement below the switch() block into a default case to fix this. Fixes: e4f36b249b4d4e75 ("block: fix peeking requests during PM") Signed-off-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 687d7732f23a04..c4b57d8806fe15 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2753,9 +2753,9 @@ static bool blk_pm_allow_request(struct request *rq) return rq->rq_flags & RQF_PM; case RPM_SUSPENDED: return false; + default: + return true; } - - return true; } #else static bool blk_pm_allow_request(struct request *rq) From 6dad38d38f20c0c8a84b5ae4f23c62b2c8758ec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 Jul 2018 19:38:38 +0200 Subject: [PATCH 048/190] null_blk: move shared definitions to header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split the null_blk device driver, such that it can prepare for zoned block interface support. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 76 +------------------------------------- drivers/block/null_blk.h | 80 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 75 deletions(-) create mode 100644 drivers/block/null_blk.h diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8abfb1059909b5..47dd0849914585 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -7,14 +7,8 @@ #include #include #include -#include #include -#include -#include -#include -#include -#include -#include +#include "null_blk.h" #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) @@ -35,28 +29,6 @@ static inline u64 mb_per_tick(int mbps) return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); } -struct nullb_cmd { - struct list_head list; - struct llist_node ll_list; - struct __call_single_data csd; - struct request *rq; - struct bio *bio; - unsigned int tag; - blk_status_t error; - struct nullb_queue *nq; - struct hrtimer timer; -}; - -struct nullb_queue { - unsigned long *tag_map; - wait_queue_head_t wait; - unsigned int queue_depth; - struct nullb_device *dev; - unsigned int requeue_selection; - - struct nullb_cmd *cmds; -}; - /* * Status flags for nullb_device. * @@ -92,52 +64,6 @@ struct nullb_page { #define NULLB_PAGE_LOCK (MAP_SZ - 1) #define NULLB_PAGE_FREE (MAP_SZ - 2) -struct nullb_device { - struct nullb *nullb; - struct config_item item; - struct radix_tree_root data; /* data stored in the disk */ - struct radix_tree_root cache; /* disk cache data */ - unsigned long flags; /* device flags */ - unsigned int curr_cache; - struct badblocks badblocks; - - unsigned long size; /* device size in MB */ - unsigned long completion_nsec; /* time in ns to complete a request */ - unsigned long cache_size; /* disk cache size in MB */ - unsigned int submit_queues; /* number of submission queues */ - unsigned int home_node; /* home node for the device */ - unsigned int queue_mode; /* block interface */ - unsigned int blocksize; /* block size */ - unsigned int irqmode; /* IRQ completion handler */ - unsigned int hw_queue_depth; /* queue depth */ - unsigned int index; /* index of the disk, only valid with a disk */ - unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ - bool blocking; /* blocking blk-mq device */ - bool use_per_node_hctx; /* use per-node allocation for hardware context */ - bool power; /* power on/off the device */ - bool memory_backed; /* if data is stored in memory */ - bool discard; /* if support discard */ -}; - -struct nullb { - struct nullb_device *dev; - struct list_head list; - unsigned int index; - struct request_queue *q; - struct gendisk *disk; - struct blk_mq_tag_set *tag_set; - struct blk_mq_tag_set __tag_set; - unsigned int queue_depth; - atomic_long_t cur_bytes; - struct hrtimer bw_timer; - unsigned long cache_flush_pos; - spinlock_t lock; - - struct nullb_queue *queues; - unsigned int nr_queues; - char disk_name[DISK_NAME_LEN]; -}; - static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h new file mode 100644 index 00000000000000..d82c5501806d5a --- /dev/null +++ b/drivers/block/null_blk.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BLK_NULL_BLK_H +#define __BLK_NULL_BLK_H + +#include +#include +#include +#include +#include +#include +#include + +struct nullb_cmd { + struct list_head list; + struct llist_node ll_list; + struct 
__call_single_data csd; + struct request *rq; + struct bio *bio; + unsigned int tag; + blk_status_t error; + struct nullb_queue *nq; + struct hrtimer timer; +}; + +struct nullb_queue { + unsigned long *tag_map; + wait_queue_head_t wait; + unsigned int queue_depth; + struct nullb_device *dev; + unsigned int requeue_selection; + + struct nullb_cmd *cmds; +}; + +struct nullb_device { + struct nullb *nullb; + struct config_item item; + struct radix_tree_root data; /* data stored in the disk */ + struct radix_tree_root cache; /* disk cache data */ + unsigned long flags; /* device flags */ + unsigned int curr_cache; + struct badblocks badblocks; + + unsigned long size; /* device size in MB */ + unsigned long completion_nsec; /* time in ns to complete a request */ + unsigned long cache_size; /* disk cache size in MB */ + unsigned int submit_queues; /* number of submission queues */ + unsigned int home_node; /* home node for the device */ + unsigned int queue_mode; /* block interface */ + unsigned int blocksize; /* block size */ + unsigned int irqmode; /* IRQ completion handler */ + unsigned int hw_queue_depth; /* queue depth */ + unsigned int index; /* index of the disk, only valid with a disk */ + unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ + bool blocking; /* blocking blk-mq device */ + bool use_per_node_hctx; /* use per-node allocation for hardware context */ + bool power; /* power on/off the device */ + bool memory_backed; /* if data is stored in memory */ + bool discard; /* if support discard */ +}; + +struct nullb { + struct nullb_device *dev; + struct list_head list; + unsigned int index; + struct request_queue *q; + struct gendisk *disk; + struct blk_mq_tag_set *tag_set; + struct blk_mq_tag_set __tag_set; + unsigned int queue_depth; + atomic_long_t cur_bytes; + struct hrtimer bw_timer; + unsigned long cache_flush_pos; + spinlock_t lock; + + struct nullb_queue *queues; + unsigned int nr_queues; + char disk_name[DISK_NAME_LEN]; +}; +#endif /* __NULL_BLK_H */ From ca4b2a011948fae4e4d31490107db4926385a983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 6 Jul 2018 19:38:39 +0200 Subject: [PATCH 049/190] null_blk: add zone support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds support for exposing a null_blk device through the zone device interface. The interface is managed with the parameters zoned and zone_size. If zoned is set, the null_blk instance registers as a zoned block device. The zone_size parameter defines how big each zone will be. Signed-off-by: Matias Bjørling Signed-off-by: Bart Van Assche Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- Documentation/block/null_blk.txt | 7 ++ drivers/block/Makefile | 5 +- drivers/block/null_blk.c | 48 +++++++++- drivers/block/null_blk.h | 28 ++++++ drivers/block/null_blk_zoned.c | 149 +++++++++++++++++++++++++++++++ 5 files changed, 234 insertions(+), 3 deletions(-) create mode 100644 drivers/block/null_blk_zoned.c diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 07f147381f3270..ea2dafe49ae8f5 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0 0: Tag set is not shared. 1: Tag set shared between devices for blk-mq. Only makes sense with nr_devices > 1, otherwise there's no tag set to share. + +zoned=[0/1]: Default: 0 + 0: Block device is exposed as a random-access block device. 
+ 1: Block device is exposed as a host-managed zoned block device. + +zone_size=[MB]: Default: 256 + Per zone size when exposed as a zoned block device. Must be a power of two. diff --git a/drivers/block/Makefile b/drivers/block/Makefile index dc061158b4036e..a0d88aa0c05d60 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ -obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o obj-$(CONFIG_ZRAM) += zram/ +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk_mod.o +null_blk_mod-objs := null_blk.o +null_blk_mod-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o + skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 47dd0849914585..86cafa6d3b4177 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -180,6 +180,14 @@ static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); +static bool g_zoned; +module_param_named(zoned, g_zoned, bool, S_IRUGO); +MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); + +static unsigned long g_zone_size = 256; +module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); +MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -283,6 +291,8 @@ NULLB_DEVICE_ATTR(memory_backed, bool); NULLB_DEVICE_ATTR(discard, bool); NULLB_DEVICE_ATTR(mbps, uint); NULLB_DEVICE_ATTR(cache_size, ulong); +NULLB_DEVICE_ATTR(zoned, bool); +NULLB_DEVICE_ATTR(zone_size, ulong); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -395,6 +405,8 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_mbps, &nullb_device_attr_cache_size, &nullb_device_attr_badblocks, + &nullb_device_attr_zoned, + &nullb_device_attr_zone_size, NULL, }; @@ -447,7 +459,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n"); + return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -506,6 +518,8 @@ static struct nullb_device *null_alloc_dev(void) dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; dev->use_per_node_hctx = g_use_per_node_hctx; + dev->zoned = g_zoned; + dev->zone_size = g_zone_size; return dev; } @@ -514,6 +528,7 @@ static void null_free_dev(struct nullb_device *dev) if (!dev) return; + null_zone_exit(dev); badblocks_exit(&dev->badblocks); kfree(dev); } @@ -1146,6 +1161,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) struct nullb *nullb = dev->nullb; int err = 0; + if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) { + cmd->error = null_zone_report(nullb, cmd); + goto out; + } + if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { struct request *rq = cmd->rq; @@ -1210,6 +1230,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) } } cmd->error = errno_to_blk_status(err); + + if (!cmd->error && dev->zoned) { + if 
(req_op(cmd->rq) == REQ_OP_WRITE) + null_zone_write(cmd); + else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET) + null_zone_reset(cmd); + } out: /* Complete IO by inline, softirq or timer */ switch (dev->irqmode) { @@ -1737,6 +1764,15 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_flush_queueable(nullb->q, true); } + if (dev->zoned) { + rv = null_zone_init(dev); + if (rv) + goto out_cleanup_blk_queue; + + blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects); + nullb->q->limits.zoned = BLK_ZONED_HM; + } + nullb->q->queuedata = nullb; blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); @@ -1755,13 +1791,16 @@ static int null_add_dev(struct nullb_device *dev) rv = null_gendisk_register(nullb); if (rv) - goto out_cleanup_blk_queue; + goto out_cleanup_zone; mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); return 0; +out_cleanup_zone: + if (dev->zoned) + null_zone_exit(dev); out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: @@ -1788,6 +1827,11 @@ static int __init null_init(void) g_bs = PAGE_SIZE; } + if (!is_power_of_2(g_zone_size)) { + pr_err("null_blk: zone_size must be power-of-two\n"); + return -EINVAL; + } + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("null_blk: submit_queues param is set to %u.\n", diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index d82c5501806d5a..d81781f22dba09 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -41,9 +41,14 @@ struct nullb_device { unsigned int curr_cache; struct badblocks badblocks; + unsigned int nr_zones; + struct blk_zone *zones; + sector_t zone_size_sects; + unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to complete a request */ unsigned long cache_size; /* disk cache size in MB */ + unsigned long zone_size; /* zone size in MB if device is zoned */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ @@ -57,6 +62,7 @@ struct nullb_device { bool power; /* power on/off the device */ bool memory_backed; /* if data is stored in memory */ bool discard; /* if support discard */ + bool zoned; /* if device is zoned */ }; struct nullb { @@ -77,4 +83,26 @@ struct nullb { unsigned int nr_queues; char disk_name[DISK_NAME_LEN]; }; + +#ifdef CONFIG_BLK_DEV_ZONED +int null_zone_init(struct nullb_device *dev); +void null_zone_exit(struct nullb_device *dev); +blk_status_t null_zone_report(struct nullb *nullb, + struct nullb_cmd *cmd); +void null_zone_write(struct nullb_cmd *cmd); +void null_zone_reset(struct nullb_cmd *cmd); +#else +static inline int null_zone_init(struct nullb_device *dev) +{ + return -EINVAL; +} +static inline void null_zone_exit(struct nullb_device *dev) {} +static inline blk_status_t null_zone_report(struct nullb *nullb, + struct nullb_cmd *cmd) +{ + return BLK_STS_NOTSUPP; +} +static inline void null_zone_write(struct nullb_cmd *cmd) {} +static inline void null_zone_reset(struct nullb_cmd *cmd) {} +#endif /* CONFIG_BLK_DEV_ZONED */ #endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c new file mode 100644 index 00000000000000..a979ca00d7be43 --- /dev/null +++ b/drivers/block/null_blk_zoned.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "null_blk.h" + +/* zone_size in MBs to sectors. 
*/ +#define ZONE_SIZE_SHIFT 11 + +static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) +{ + return sect >> ilog2(dev->zone_size_sects); +} + +int null_zone_init(struct nullb_device *dev) +{ + sector_t dev_size = (sector_t)dev->size * 1024 * 1024; + sector_t sector = 0; + unsigned int i; + + if (!is_power_of_2(dev->zone_size)) { + pr_err("null_blk: zone_size must be power-of-two\n"); + return -EINVAL; + } + + dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; + dev->nr_zones = dev_size >> + (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), + GFP_KERNEL | __GFP_ZERO); + if (!dev->zones) + return -ENOMEM; + + for (i = 0; i < dev->nr_zones; i++) { + struct blk_zone *zone = &dev->zones[i]; + + zone->start = zone->wp = sector; + zone->len = dev->zone_size_sects; + zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone->cond = BLK_ZONE_COND_EMPTY; + + sector += dev->zone_size_sects; + } + + return 0; +} + +void null_zone_exit(struct nullb_device *dev) +{ + kvfree(dev->zones); +} + +static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq, + unsigned int zno, unsigned int nr_zones) +{ + struct blk_zone_report_hdr *hdr = NULL; + struct bio_vec bvec; + struct bvec_iter iter; + void *addr; + unsigned int zones_to_cpy; + + bio_for_each_segment(bvec, rq->bio, iter) { + addr = kmap_atomic(bvec.bv_page); + + zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone); + + if (!hdr) { + hdr = (struct blk_zone_report_hdr *)addr; + hdr->nr_zones = nr_zones; + zones_to_cpy--; + addr += sizeof(struct blk_zone_report_hdr); + } + + zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones); + + memcpy(addr, &dev->zones[zno], + zones_to_cpy * sizeof(struct blk_zone)); + + kunmap_atomic(addr); + + nr_zones -= zones_to_cpy; + zno += zones_to_cpy; + + if (!nr_zones) + break; + } +} + +blk_status_t null_zone_report(struct nullb *nullb, + struct nullb_cmd *cmd) +{ + struct nullb_device *dev = nullb->dev; + struct request *rq = cmd->rq; + unsigned int zno = null_zone_no(dev, blk_rq_pos(rq)); + unsigned int nr_zones = dev->nr_zones - zno; + unsigned int max_zones = (blk_rq_bytes(rq) / + sizeof(struct blk_zone)) - 1; + + nr_zones = min_t(unsigned int, nr_zones, max_zones); + + null_zone_fill_rq(nullb->dev, rq, zno, nr_zones); + + return BLK_STS_OK; +} + +void null_zone_write(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct request *rq = cmd->rq; + sector_t sector = blk_rq_pos(rq); + unsigned int rq_sectors = blk_rq_sectors(rq); + unsigned int zno = null_zone_no(dev, sector); + struct blk_zone *zone = &dev->zones[zno]; + + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* Cannot write to a full zone */ + cmd->error = BLK_STS_IOERR; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_IMP_OPEN: + /* Writes must be at the write pointer position */ + if (blk_rq_pos(rq) != zone->wp) { + cmd->error = BLK_STS_IOERR; + break; + } + + if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + zone->wp += rq_sectors; + if (zone->wp == zone->start + zone->len) + zone->cond = BLK_ZONE_COND_FULL; + break; + default: + /* Invalid zone condition */ + cmd->error = BLK_STS_IOERR; + break; + } +} + +void null_zone_reset(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct request *rq = cmd->rq; + unsigned int zno = null_zone_no(dev, blk_rq_pos(rq)); + struct blk_zone *zone = &dev->zones[zno]; + + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = 
zone->start; +} From 3993e501bf853cce85c5114a704b86b8f486790c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Jul 2018 20:49:19 -0700 Subject: [PATCH 050/190] block/DAC960.c: fix defined but not used build warnings Fix build warnings in DAC960.c when CONFIG_PROC_FS is not enabled by marking the unused functions as __maybe_unused. ../drivers/block/DAC960.c:6429:12: warning: 'dac960_proc_show' defined but not used [-Wunused-function] ../drivers/block/DAC960.c:6449:12: warning: 'dac960_initial_status_proc_show' defined but not used [-Wunused-function] ../drivers/block/DAC960.c:6456:12: warning: 'dac960_current_status_proc_show' defined but not used [-Wunused-function] Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/DAC960.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index f6518067aa7d06..f99e5c883368a7 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -21,6 +21,7 @@ #define DAC960_DriverDate "21 Aug 2007" +#include #include #include #include @@ -6426,7 +6427,7 @@ static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller, return true; } -static int dac960_proc_show(struct seq_file *m, void *v) +static int __maybe_unused dac960_proc_show(struct seq_file *m, void *v) { unsigned char *StatusMessage = "OK\n"; int ControllerNumber; @@ -6446,14 +6447,16 @@ static int dac960_proc_show(struct seq_file *m, void *v) return 0; } -static int dac960_initial_status_proc_show(struct seq_file *m, void *v) +static int __maybe_unused dac960_initial_status_proc_show(struct seq_file *m, + void *v) { DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private; seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer); return 0; } -static int dac960_current_status_proc_show(struct seq_file *m, void *v) +static int __maybe_unused dac960_current_status_proc_show(struct seq_file *m, + void *v) { DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private; unsigned char *StatusMessage = From 88b7210c81096c018f5836aff4749743c0d34623 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jul 2018 17:21:34 +0200 Subject: [PATCH 051/190] block: iolatency: avoid 64-bit division On 32-bit architectures, dividing a 64-bit number needs to use the do_div() function or something like it to avoid a link failure: block/blk-iolatency.o: In function `iolatency_prfill_limit': blk-iolatency.c:(.text+0x8cc): undefined reference to `__aeabi_uldivmod' Using div_u64() gives us the best output and avoids the need for an explicit cast. 
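For reference, a plain '/' on a u64 dividend is lowered by gcc to a libgcc helper (__aeabi_uldivmod on ARM, __udivdi3 on most other 32-bit targets) that the kernel does not link against, which is why 64-bit divisions go through the helpers in linux/math64.h. A minimal sketch of the idiom, using a made-up wrapper name:

	#include <linux/math64.h>

	static u64 nsec_to_usec(u64 nsec)
	{
		/* fine on 32- and 64-bit: 64-bit dividend, 32-bit divisor */
		return div_u64(nsec, NSEC_PER_USEC);
		/* 'return nsec / NSEC_PER_USEC;' only links on 64-bit */
	}

div64_u64() is the variant for a divisor that is itself 64 bits wide, which is what the avg_lat calculation elsewhere in this file already uses.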
Fixes: d70675121546 ("block: introduce blk-iolatency io controller") Reviewed-by: Josef Bacik Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index a35a1f58033765..56ddb2c68752c0 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -798,8 +798,7 @@ static u64 iolatency_prfill_limit(struct seq_file *sf, if (!dname || !iolat->min_lat_nsec) return 0; seq_printf(sf, "%s target=%llu\n", - dname, - (unsigned long long)iolat->min_lat_nsec / NSEC_PER_USEC); + dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC)); return 0; } From a284390b397ae2f95de799bc591efbfe54bf0907 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Jul 2018 10:34:42 -0400 Subject: [PATCH 052/190] blk-iolatency: fix max_depth comparisons max_depth used to be a u64, but I changed it to a unsigned int but didn't convert my comparisons over everywhere. Fix by using UINT_MAX everywhere instead of (u64)-1. Reported-by: Dan Carpenter Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 56ddb2c68752c0..b59e5451680b58 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -9,8 +9,8 @@ * - We use the mean latency over the 100ms window. This is because writes can * be particularly fast, which could give us a false sense of the impact of * other workloads on our protected workload. - * - By default there's no throttling, we set the queue_depth to INT_MAX so that - * we can have as many outstanding bio's as we're allowed to. Only at + * - By default there's no throttling, we set the queue_depth to UINT_MAX so + * that we can have as many outstanding bio's as we're allowed to. Only at * throttle time do we pay attention to the actual queue depth. * * The hierarchy works like the cpu controller does, we track the latency at @@ -361,7 +361,7 @@ static void check_scale_change(struct iolatency_grp *iolat) /* We're back to the default cookie, unthrottle all the things. */ if (cur_cookie == DEFAULT_SCALE_COOKIE) { blkcg_clear_delay(lat_to_blkg(iolat)); - iolat->rq_depth.max_depth = INT_MAX; + iolat->rq_depth.max_depth = UINT_MAX; wake_up_all(&iolat->rq_wait.wait); return; } @@ -434,7 +434,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat, * We don't want to count issue_as_root bio's in the cgroups latency * statistics as it could skew the numbers downwards. 
*/ - if (unlikely(issue_as_root && iolat->rq_depth.max_depth != (u64)-1)) { + if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) { u64 sub = iolat->min_lat_nsec; if (req_time < sub) blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); @@ -816,7 +816,7 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat = div64_u64(iolat->total_lat_avg, NSEC_PER_USEC); - if (iolat->rq_depth.max_depth == (u64)-1) + if (iolat->rq_depth.max_depth == UINT_MAX) return scnprintf(buf, size, " depth=max avg_lat=%llu", avg_lat); @@ -859,7 +859,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) rq_wait_init(&iolat->rq_wait); spin_lock_init(&iolat->child_lat.lock); iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q); - iolat->rq_depth.max_depth = INT_MAX; + iolat->rq_depth.max_depth = UINT_MAX; iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; iolat->blkiolat = blkiolat; iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; From 28519c891c5ad569203636b3b65d36bcb4333d4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Jul 2018 10:09:59 +0200 Subject: [PATCH 053/190] bsg: remove read/write support The code poses a security risk due to user memory access in ->release and had an API that can't be used reliably. As far as we know it was never used for real, but if that turns out wrong we'll have to revert this commit and come up with a band aid. Jann Horn did look software archives for users of this interface, and the only users found were example code in sg3_utils, and optional support in an optional module of the tgt user space iscsi target, which looks like a proof of concept extension of the /dev/sg read/write support. Tony Battersby chimes in that the code is basically unsafe to use in general: The read/write interface on /dev/bsg is impossible to use safely because the list of completed commands is per-device (bd->done_list) rather than per-fd like it is with /dev/sg. So if program A and program B are both using the write/read interface on the same bsg device, then their command responses will get mixed up, and program A will read() some command results from program B and vice versa. So no, I don't use read/write on /dev/bsg. From a security standpoint, it should definitely be fixed or removed. 
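For completeness, the path that stays is the SG_IO ioctl carrying a struct sg_io_v4, which is per-call and therefore has none of the shared completion-list problem described above. A rough userspace sketch (a TEST UNIT READY used purely as an illustration, error handling trimmed):

	#include <fcntl.h>
	#include <string.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <scsi/sg.h>		/* SG_IO */
	#include <linux/bsg.h>		/* struct sg_io_v4 */

	int bsg_test_unit_ready(const char *dev)
	{
		unsigned char cdb[6] = { 0 };	/* TEST UNIT READY */
		unsigned char sense[32];
		struct sg_io_v4 hdr;
		int fd, ret;

		fd = open(dev, O_RDWR);
		if (fd < 0)
			return -1;

		memset(&hdr, 0, sizeof(hdr));
		hdr.guard = 'Q';
		hdr.protocol = BSG_PROTOCOL_SCSI;
		hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD;
		hdr.request = (uintptr_t)cdb;
		hdr.request_len = sizeof(cdb);
		hdr.response = (uintptr_t)sense;
		hdr.max_response_len = sizeof(sense);
		hdr.timeout = 10000;		/* milliseconds */

		ret = ioctl(fd, SG_IO, &hdr);	/* synchronous, per-call state */
		close(fd);
		return ret;
	}

Because every command's status and sense come back through the caller's own sg_io_v4, two programs sharing the same /dev/bsg node can no longer read each other's completions.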
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bsg.c | 460 +--------------------------------------------------- 1 file changed, 6 insertions(+), 454 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index 66602c48995643..0d2e9bf6208b90 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -13,11 +13,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -38,21 +36,10 @@ struct bsg_device { struct request_queue *queue; spinlock_t lock; - struct list_head busy_list; - struct list_head done_list; struct hlist_node dev_list; atomic_t ref_count; - int queued_cmds; - int done_cmds; - wait_queue_head_t wq_done; - wait_queue_head_t wq_free; char name[20]; int max_queue; - unsigned long flags; -}; - -enum { - BSG_F_BLOCK = 1, }; #define BSG_DEFAULT_CMDS 64 @@ -67,64 +54,6 @@ static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE]; static struct class *bsg_class; static int bsg_major; -static struct kmem_cache *bsg_cmd_cachep; - -/* - * our internal command type - */ -struct bsg_command { - struct bsg_device *bd; - struct list_head list; - struct request *rq; - struct bio *bio; - struct bio *bidi_bio; - int err; - struct sg_io_v4 hdr; -}; - -static void bsg_free_command(struct bsg_command *bc) -{ - struct bsg_device *bd = bc->bd; - unsigned long flags; - - kmem_cache_free(bsg_cmd_cachep, bc); - - spin_lock_irqsave(&bd->lock, flags); - bd->queued_cmds--; - spin_unlock_irqrestore(&bd->lock, flags); - - wake_up(&bd->wq_free); -} - -static struct bsg_command *bsg_alloc_command(struct bsg_device *bd) -{ - struct bsg_command *bc = ERR_PTR(-EINVAL); - - spin_lock_irq(&bd->lock); - - if (bd->queued_cmds >= bd->max_queue) - goto out; - - bd->queued_cmds++; - spin_unlock_irq(&bd->lock); - - bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL); - if (unlikely(!bc)) { - spin_lock_irq(&bd->lock); - bd->queued_cmds--; - bc = ERR_PTR(-ENOMEM); - goto out; - } - - bc->bd = bd; - INIT_LIST_HEAD(&bc->list); - bsg_dbg(bd, "returning free cmd %p\n", bc); - return bc; -out: - spin_unlock_irq(&bd->lock); - return bc; -} - static inline struct hlist_head *bsg_dev_idx_hash(int index) { return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; @@ -287,101 +216,6 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode) return ERR_PTR(ret); } -/* - * async completion call-back from the block layer, when scsi/ide/whatever - * calls end_that_request_last() on a request - */ -static void bsg_rq_end_io(struct request *rq, blk_status_t status) -{ - struct bsg_command *bc = rq->end_io_data; - struct bsg_device *bd = bc->bd; - unsigned long flags; - - bsg_dbg(bd, "finished rq %p bc %p, bio %p\n", - rq, bc, bc->bio); - - bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); - - spin_lock_irqsave(&bd->lock, flags); - list_move_tail(&bc->list, &bd->done_list); - bd->done_cmds++; - spin_unlock_irqrestore(&bd->lock, flags); - - wake_up(&bd->wq_done); -} - -/* - * do final setup of a 'bc' and submit the matching 'rq' to the block - * layer for io - */ -static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, - struct bsg_command *bc, struct request *rq) -{ - int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL)); - - /* - * add bc command to busy queue and submit rq for io - */ - bc->rq = rq; - bc->bio = rq->bio; - if (rq->next_rq) - bc->bidi_bio = rq->next_rq->bio; - bc->hdr.duration = jiffies; - spin_lock_irq(&bd->lock); - list_add_tail(&bc->list, &bd->busy_list); - spin_unlock_irq(&bd->lock); - - bsg_dbg(bd, 
"queueing rq %p, bc %p\n", rq, bc); - - rq->end_io_data = bc; - blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); -} - -static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd) -{ - struct bsg_command *bc = NULL; - - spin_lock_irq(&bd->lock); - if (bd->done_cmds) { - bc = list_first_entry(&bd->done_list, struct bsg_command, list); - list_del(&bc->list); - bd->done_cmds--; - } - spin_unlock_irq(&bd->lock); - - return bc; -} - -/* - * Get a finished command from the done list - */ -static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) -{ - struct bsg_command *bc; - int ret; - - do { - bc = bsg_next_done_cmd(bd); - if (bc) - break; - - if (!test_bit(BSG_F_BLOCK, &bd->flags)) { - bc = ERR_PTR(-EAGAIN); - break; - } - - ret = wait_event_interruptible(bd->wq_done, bd->done_cmds); - if (ret) { - bc = ERR_PTR(-ERESTARTSYS); - break; - } - } while (1); - - bsg_dbg(bd, "returning done %p\n", bc); - - return bc; -} - static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct bio *bio, struct bio *bidi_bio) { @@ -400,234 +234,6 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, return ret; } -static bool bsg_complete(struct bsg_device *bd) -{ - bool ret = false; - bool spin; - - do { - spin_lock_irq(&bd->lock); - - BUG_ON(bd->done_cmds > bd->queued_cmds); - - /* - * All commands consumed. - */ - if (bd->done_cmds == bd->queued_cmds) - ret = true; - - spin = !test_bit(BSG_F_BLOCK, &bd->flags); - - spin_unlock_irq(&bd->lock); - } while (!ret && spin); - - return ret; -} - -static int bsg_complete_all_commands(struct bsg_device *bd) -{ - struct bsg_command *bc; - int ret, tret; - - bsg_dbg(bd, "entered\n"); - - /* - * wait for all commands to complete - */ - io_wait_event(bd->wq_done, bsg_complete(bd)); - - /* - * discard done commands - */ - ret = 0; - do { - spin_lock_irq(&bd->lock); - if (!bd->queued_cmds) { - spin_unlock_irq(&bd->lock); - break; - } - spin_unlock_irq(&bd->lock); - - bc = bsg_get_done_cmd(bd); - if (IS_ERR(bc)) - break; - - tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio, - bc->bidi_bio); - if (!ret) - ret = tret; - - bsg_free_command(bc); - } while (1); - - return ret; -} - -static int -__bsg_read(char __user *buf, size_t count, struct bsg_device *bd, - const struct iovec *iov, ssize_t *bytes_read) -{ - struct bsg_command *bc; - int nr_commands, ret; - - if (count % sizeof(struct sg_io_v4)) - return -EINVAL; - - ret = 0; - nr_commands = count / sizeof(struct sg_io_v4); - while (nr_commands) { - bc = bsg_get_done_cmd(bd); - if (IS_ERR(bc)) { - ret = PTR_ERR(bc); - break; - } - - /* - * this is the only case where we need to copy data back - * after completing the request. so do that here, - * bsg_complete_work() cannot do that for us - */ - ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio, - bc->bidi_bio); - - if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr))) - ret = -EFAULT; - - bsg_free_command(bc); - - if (ret) - break; - - buf += sizeof(struct sg_io_v4); - *bytes_read += sizeof(struct sg_io_v4); - nr_commands--; - } - - return ret; -} - -static inline void bsg_set_block(struct bsg_device *bd, struct file *file) -{ - if (file->f_flags & O_NONBLOCK) - clear_bit(BSG_F_BLOCK, &bd->flags); - else - set_bit(BSG_F_BLOCK, &bd->flags); -} - -/* - * Check if the error is a "real" error that we should return. 
- */ -static inline int err_block_err(int ret) -{ - if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN) - return 1; - - return 0; -} - -static ssize_t -bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct bsg_device *bd = file->private_data; - int ret; - ssize_t bytes_read; - - bsg_dbg(bd, "read %zd bytes\n", count); - - bsg_set_block(bd, file); - - bytes_read = 0; - ret = __bsg_read(buf, count, bd, NULL, &bytes_read); - *ppos = bytes_read; - - if (!bytes_read || err_block_err(ret)) - bytes_read = ret; - - return bytes_read; -} - -static int __bsg_write(struct bsg_device *bd, const char __user *buf, - size_t count, ssize_t *bytes_written, fmode_t mode) -{ - struct bsg_command *bc; - struct request *rq; - int ret, nr_commands; - - if (count % sizeof(struct sg_io_v4)) - return -EINVAL; - - nr_commands = count / sizeof(struct sg_io_v4); - rq = NULL; - bc = NULL; - ret = 0; - while (nr_commands) { - struct request_queue *q = bd->queue; - - bc = bsg_alloc_command(bd); - if (IS_ERR(bc)) { - ret = PTR_ERR(bc); - bc = NULL; - break; - } - - if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) { - ret = -EFAULT; - break; - } - - /* - * get a request, fill in the blanks, and add to request queue - */ - rq = bsg_map_hdr(bd->queue, &bc->hdr, mode); - if (IS_ERR(rq)) { - ret = PTR_ERR(rq); - rq = NULL; - break; - } - - bsg_add_command(bd, q, bc, rq); - bc = NULL; - rq = NULL; - nr_commands--; - buf += sizeof(struct sg_io_v4); - *bytes_written += sizeof(struct sg_io_v4); - } - - if (bc) - bsg_free_command(bc); - - return ret; -} - -static ssize_t -bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) -{ - struct bsg_device *bd = file->private_data; - ssize_t bytes_written; - int ret; - - bsg_dbg(bd, "write %zd bytes\n", count); - - if (unlikely(uaccess_kernel())) - return -EINVAL; - - bsg_set_block(bd, file); - - bytes_written = 0; - ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); - - *ppos = bytes_written; - - /* - * return bytes written on non-fatal errors - */ - if (!bytes_written || err_block_err(ret)) - bytes_written = ret; - - bsg_dbg(bd, "returning %zd\n", bytes_written); - return bytes_written; -} - static struct bsg_device *bsg_alloc_device(void) { struct bsg_device *bd; @@ -637,29 +243,20 @@ static struct bsg_device *bsg_alloc_device(void) return NULL; spin_lock_init(&bd->lock); - bd->max_queue = BSG_DEFAULT_CMDS; - - INIT_LIST_HEAD(&bd->busy_list); - INIT_LIST_HEAD(&bd->done_list); INIT_HLIST_NODE(&bd->dev_list); - - init_waitqueue_head(&bd->wq_free); - init_waitqueue_head(&bd->wq_done); return bd; } static int bsg_put_device(struct bsg_device *bd) { - int ret = 0, do_free; struct request_queue *q = bd->queue; mutex_lock(&bsg_mutex); - do_free = atomic_dec_and_test(&bd->ref_count); - if (!do_free) { + if (!atomic_dec_and_test(&bd->ref_count)) { mutex_unlock(&bsg_mutex); - goto out; + return 0; } hlist_del(&bd->dev_list); @@ -670,20 +267,9 @@ static int bsg_put_device(struct bsg_device *bd) /* * close can always block */ - set_bit(BSG_F_BLOCK, &bd->flags); - - /* - * correct error detection baddies here again. 
it's the responsibility - * of the app to properly reap commands before close() if it wants - * fool-proof error detection - */ - ret = bsg_complete_all_commands(bd); - kfree(bd); -out: - if (do_free) - blk_put_queue(q); - return ret; + blk_put_queue(q); + return 0; } static struct bsg_device *bsg_add_device(struct inode *inode, @@ -706,8 +292,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode, bd->queue = rq; - bsg_set_block(bd, file); - atomic_set(&bd->ref_count, 1); hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); @@ -781,24 +365,6 @@ static int bsg_release(struct inode *inode, struct file *file) return bsg_put_device(bd); } -static __poll_t bsg_poll(struct file *file, poll_table *wait) -{ - struct bsg_device *bd = file->private_data; - __poll_t mask = 0; - - poll_wait(file, &bd->wq_done, wait); - poll_wait(file, &bd->wq_free, wait); - - spin_lock_irq(&bd->lock); - if (!list_empty(&bd->done_list)) - mask |= EPOLLIN | EPOLLRDNORM; - if (bd->queued_cmds < bd->max_queue) - mask |= EPOLLOUT; - spin_unlock_irq(&bd->lock); - - return mask; -} - static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct bsg_device *bd = file->private_data; @@ -872,9 +438,6 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } static const struct file_operations bsg_fops = { - .read = bsg_read, - .write = bsg_write, - .poll = bsg_poll, .open = bsg_open, .release = bsg_release, .unlocked_ioctl = bsg_ioctl, @@ -979,21 +542,12 @@ static int __init bsg_init(void) int ret, i; dev_t devid; - bsg_cmd_cachep = kmem_cache_create("bsg_cmd", - sizeof(struct bsg_command), 0, 0, NULL); - if (!bsg_cmd_cachep) { - printk(KERN_ERR "bsg: failed creating slab cache\n"); - return -ENOMEM; - } - for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++) INIT_HLIST_HEAD(&bsg_device_list[i]); bsg_class = class_create(THIS_MODULE, "bsg"); - if (IS_ERR(bsg_class)) { - ret = PTR_ERR(bsg_class); - goto destroy_kmemcache; - } + if (IS_ERR(bsg_class)) + return PTR_ERR(bsg_class); bsg_class->devnode = bsg_devnode; ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); @@ -1014,8 +568,6 @@ static int __init bsg_init(void) unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS); destroy_bsg_class: class_destroy(bsg_class); -destroy_kmemcache: - kmem_cache_destroy(bsg_cmd_cachep); return ret; } From ea870bb2ae6cbc1a5ba1f3ec8c8fca921a51880b Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 12 Jul 2018 22:29:16 +0200 Subject: [PATCH 054/190] block: skd: Use %pad printk format for dma_addr_t values Use the existing %pad printk format to print dma_addr_t values. 
This avoids the following warnings when compiling on the parisc64 platform: drivers/block/skd_main.c: In function 'skd_preop_sg_list': drivers/block/skd_main.c:660:4: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 6 has type 'dma_addr_t {aka unsigned int}' [-Wformat=] Reviewed-by: Bart Van Assche Signed-off-by: Helge Deller Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index bc7aea6d7b7cca..87b9e7fbf0621a 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -657,8 +657,8 @@ static bool skd_preop_sg_list(struct skd_device *skdev, if (unlikely(skdev->dbg_level > 1)) { dev_dbg(&skdev->pdev->dev, - "skreq=%x sksg_list=%p sksg_dma=%llx\n", - skreq->id, skreq->sksg_list, skreq->sksg_dma_address); + "skreq=%x sksg_list=%p sksg_dma=%pad\n", + skreq->id, skreq->sksg_list, &skreq->sksg_dma_address); for (i = 0; i < n_sg; i++) { struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; @@ -1190,8 +1190,8 @@ static void skd_send_fitmsg(struct skd_device *skdev, { u64 qcmd; - dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n", - skmsg->mb_dma_address, skd_in_flight(skdev)); + dev_dbg(&skdev->pdev->dev, "dma address %pad, busy=%d\n", + &skmsg->mb_dma_address, skd_in_flight(skdev)); dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf); qcmd = skmsg->mb_dma_address; @@ -1250,9 +1250,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, } dev_dbg(&skdev->pdev->dev, - "skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n", + "skspcl=%p id=%04x sksg_list=%p sksg_dma=%pad\n", skspcl, skspcl->req.id, skspcl->req.sksg_list, - skspcl->req.sksg_dma_address); + &skspcl->req.sksg_dma_address); for (i = 0; i < skspcl->req.n_sg; i++) { struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[i]; @@ -2685,8 +2685,8 @@ static int skd_cons_skmsg(struct skd_device *skdev) WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) & (FIT_QCMD_ALIGN - 1), - "not aligned: msg_buf %p mb_dma_address %#llx\n", - skmsg->msg_buf, skmsg->mb_dma_address); + "not aligned: msg_buf %p mb_dma_address %pad\n", + skmsg->msg_buf, &skmsg->mb_dma_address); memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); } From 05814a10370b3252fe2b0898b6adac3cdd531096 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Fri, 13 Jul 2018 17:07:26 +0300 Subject: [PATCH 055/190] block: remove blkdev_entry_to_request() macro Remove blkdev_entry_to_request() macro, which remained unused through the observable history, also note that it repeats list_entry_rq() macro verbatim. 
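For reference, the macro that stays behind, list_entry_rq(), provides the same container_of()-style lookup; roughly, as defined in blkdev.h at the time:

	#define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)

	/* e.g. walking a legacy request list: */
	struct request *rq = list_entry_rq(q->queue_head.next);

so any hypothetical user of the removed macro could have switched over without a functional change.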
Signed-off-by: Vladimir Zapolskiy Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 137759862f07cb..1939ed95f9361a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1436,8 +1436,6 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) - static inline unsigned long queue_segment_boundary(struct request_queue *q) { return q->limits.seg_boundary_mask; From ffc03fb7a52a88f87910d2b2418fd4e3069698e1 Mon Sep 17 00:00:00 2001 From: Marcin Dziegielewski Date: Fri, 13 Jul 2018 10:48:36 +0200 Subject: [PATCH 056/190] lightnvm: pblk: handle case when mw_cunits equals to 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices can expose mw_cunits equal to 0, it can cause the creation of too small write buffer and cause performance to drop on write workloads. Additionally, write buffer size must cover write data requirements, such as WS_MIN and MW_CUNITS - it must be greater than or equal to the larger one multiplied by the number of PUs. However, for performance reasons, use the WS_OPT value to calculation instead of WS_MIN. Because the place where buffer size is calculated was changed, this patch also removes pgs_in_buffer filed in pblk structure. Signed-off-by: Marcin Dziegielewski Signed-off-by: Igor Konopko Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 9 +++++---- drivers/lightnvm/pblk.h | 3 --- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index b57f764d6a1670..ef8d8dea7b6ba5 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -179,11 +179,14 @@ static int pblk_rwb_init(struct pblk *pblk) struct pblk_rb_entry *entries; unsigned long nr_entries, buffer_size; unsigned int power_size, power_seg_sz; + int pgs_in_buffer; - if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer)) + pgs_in_buffer = max(geo->mw_cunits, geo->ws_opt) * geo->all_luns; + + if (write_buffer_size && (write_buffer_size > pgs_in_buffer)) buffer_size = write_buffer_size; else - buffer_size = pblk->pgs_in_buffer; + buffer_size = pgs_in_buffer; nr_entries = pblk_rb_calculate_size(buffer_size); @@ -366,8 +369,6 @@ static int pblk_core_init(struct pblk *pblk) atomic64_set(&pblk->nr_flush, 0); pblk->nr_flush_rst = 0; - pblk->pgs_in_buffer = geo->mw_cunits * geo->all_luns; - pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE); max_write_ppas = pblk->min_write_pgs * geo->all_luns; pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 34cc1d64a9d42a..9d1a0e86e0825c 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -608,9 +608,6 @@ struct pblk { int min_write_pgs; /* Minimum amount of pages required by controller */ int max_write_pgs; /* Maximum amount of pages supported by controller */ - int pgs_in_buffer; /* Number of pages that need to be held in buffer to - * guarantee successful reads. 
- */ sector_t capacity; /* Device capacity when bad blocks are subtracted */ From 880eda544097a525b669df84533f439fb031684b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 13 Jul 2018 10:48:37 +0200 Subject: [PATCH 057/190] lightnvm: move NVM_DEBUG to pblk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no users of CONFIG_NVM_DEBUG in the LightNVM subsystem. All users are in pblk. Rename NVM_DEBUG to NVM_PBLK_DEBUG and enable only for pblk. Also fix up the CONFIG_NVM_PBLK entry to follow the code style for Kconfig files. Signed-off-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 30 ++++++++++++++++-------------- drivers/lightnvm/pblk-cache.c | 4 ++-- drivers/lightnvm/pblk-core.c | 26 +++++++++++++------------- drivers/lightnvm/pblk-gc.c | 2 +- drivers/lightnvm/pblk-init.c | 8 ++++---- drivers/lightnvm/pblk-rb.c | 16 ++++++++-------- drivers/lightnvm/pblk-read.c | 28 ++++++++++++++-------------- drivers/lightnvm/pblk-sysfs.c | 8 ++++---- drivers/lightnvm/pblk-write.c | 14 +++++++------- drivers/lightnvm/pblk.h | 6 +++--- 10 files changed, 72 insertions(+), 70 deletions(-) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 9c03f35d9df113..439bf90d084dde 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -17,23 +17,25 @@ menuconfig NVM if NVM -config NVM_DEBUG - bool "Open-Channel SSD debugging support" - default n - ---help--- - Exposes a debug management interface to create/remove targets at: +config NVM_PBLK + tristate "Physical Block Device Open-Channel SSD target" + help + Allows an open-channel SSD to be exposed as a block device to the + host. The target assumes the device exposes raw flash and must be + explicitly managed by the host. - /sys/module/lnvm/parameters/configure_debug + Please note the disk format is considered EXPERIMENTAL for now. - It is required to create/remove targets without IOCTLs. +if NVM_PBLK -config NVM_PBLK - tristate "Physical Block Device Open-Channel SSD target" - ---help--- - Allows an open-channel SSD to be exposed as a block device to the - host. The target assumes the device exposes raw flash and must be - explicitly managed by the host. +config NVM_PBLK_DEBUG + bool "PBlk Debug Support" + default n + help + Enables debug support for pblk. This includes extra checks, more + vocal error messages, and extra tracking fields in the pblk sysfs + entries. - Please note the disk format is considered EXPERIMENTAL for now. 
+endif # NVM_PBLK_DEBUG endif # NVM diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index b1c6d7eb611571..77d811962818ad 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -67,7 +67,7 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) atomic64_add(nr_entries, &pblk->user_wa); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(nr_entries, &pblk->inflight_writes); atomic_long_add(nr_entries, &pblk->req_writes); #endif @@ -123,7 +123,7 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) atomic64_add(valid_entries, &pblk->gc_wa); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(valid_entries, &pblk->inflight_writes); atomic_long_add(valid_entries, &pblk->recov_gc_writes); #endif diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index ed9cc977c8b32f..66ab1036f2fb00 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -194,7 +194,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) u64 paddr; int line_id; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Callers must ensure that the ppa points to a device address */ BUG_ON(pblk_addr_in_cache(ppa)); BUG_ON(pblk_ppa_empty(ppa)); @@ -430,7 +430,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio) void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) { atomic_long_inc(&pblk->write_failed); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG pblk_print_failed_rqd(pblk, rqd, rqd->error); #endif } @@ -454,7 +454,7 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) default: pr_err("pblk: unknown read error:%d\n", rqd->error); } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG pblk_print_failed_rqd(pblk, rqd, rqd->error); #endif } @@ -470,7 +470,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) atomic_inc(&pblk->inflight_io); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG if (pblk_check_io(pblk, rqd)) return NVM_IO_ERR; #endif @@ -484,7 +484,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) atomic_inc(&pblk->inflight_io); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG if (pblk_check_io(pblk, rqd)) return NVM_IO_ERR; #endif @@ -1726,7 +1726,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) struct list_head *move_list; int i; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), "pblk: corrupt closed line %d\n", line->id); #endif @@ -1856,7 +1856,7 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, * Only send one inflight I/O per LUN. 
Since we map at a page * granurality, all ppas in the I/O will map to the same LUN */ -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG int i; for (i = 1; i < nr_ppas; i++) @@ -1901,7 +1901,7 @@ void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) struct pblk_lun *rlun; int pos = pblk_ppa_to_pos(geo, ppa_list[0]); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG int i; for (i = 1; i < nr_ppas; i++) @@ -1951,7 +1951,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) { -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Callers must ensure that the ppa points to a cache address */ BUG_ON(!pblk_addr_in_cache(ppa)); BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); @@ -1966,7 +1966,7 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, struct ppa_addr ppa_l2p, ppa_gc; int ret = 1; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Callers must ensure that the ppa points to a cache address */ BUG_ON(!pblk_addr_in_cache(ppa_new)); BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); @@ -2003,14 +2003,14 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, { struct ppa_addr ppa_l2p; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Callers must ensure that the ppa points to a device address */ BUG_ON(pblk_addr_in_cache(ppa_mapped)); #endif /* Invalidate and discard padded entries */ if (lba == ADDR_EMPTY) { atomic64_inc(&pblk->pad_wa); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->padded_wb); #endif if (!pblk_ppa_empty(ppa_mapped)) @@ -2036,7 +2036,7 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, goto out; } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); #endif diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 080469d90b408a..40d2dcb4f2bdf0 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -522,7 +522,7 @@ static int pblk_gc_reader_ts(void *data) io_schedule(); } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG pr_info("pblk: flushing gc pipeline, %d lines left\n", atomic_read(&gc->pipeline_gc)); #endif diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index ef8d8dea7b6ba5..9ea30102f61c6d 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -91,7 +91,7 @@ static size_t pblk_trans_map_size(struct pblk *pblk) return entry_size * pblk->rl.nr_secs; } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG static u32 pblk_l2p_crc(struct pblk *pblk) { size_t map_size; @@ -122,7 +122,7 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) } } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); #endif @@ -1166,7 +1166,7 @@ static void pblk_exit(void *private, bool graceful) pblk_gc_exit(pblk, graceful); pblk_tear_down(pblk, graceful); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); #endif @@ -1217,7 +1217,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_set(&pblk->inflight_writes, 0); atomic_long_set(&pblk->padded_writes, 0); 
atomic_long_set(&pblk->padded_wb, 0); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 55e9442a99e2bf..529def80966bc1 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -111,7 +111,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base, } while (iter > 0); up_write(&pblk_rb_lock); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_set(&rb->inflight_flush_point, 0); #endif @@ -308,7 +308,7 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, entry = &rb->entries[ring_pos]; flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Caller must guarantee that the entry is free */ BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); #endif @@ -332,7 +332,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, entry = &rb->entries[ring_pos]; flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Caller must guarantee that the entry is free */ BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); #endif @@ -362,7 +362,7 @@ static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, return 0; } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_inc(&rb->inflight_flush_point); #endif @@ -588,7 +588,7 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, atomic64_add(pad, &pblk->pad_wa); } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(pad, &pblk->padded_writes); #endif @@ -613,7 +613,7 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, int ret = 1; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Caller must ensure that the access will not cause an overflow */ BUG_ON(pos >= rb->nr_entries); #endif @@ -820,7 +820,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) rb->subm, rb->sync, rb->l2p_update, -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_read(&rb->inflight_flush_point), #else 0, @@ -838,7 +838,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) rb->subm, rb->sync, rb->l2p_update, -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_read(&rb->inflight_flush_point), #else 0, diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 18694694e5f040..6e93c489ce57ce 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -28,7 +28,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, sector_t lba, struct ppa_addr ppa, int bio_iter, bool advanced_bio) { -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Callers must ensure that the ppa points to a cache address */ BUG_ON(pblk_ppa_empty(ppa)); BUG_ON(!pblk_addr_in_cache(ppa)); @@ -79,7 +79,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, WARN_ON(test_and_set_bit(i, read_bitmap)); meta_list[i].lba = cpu_to_le64(lba); advanced_bio = true; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->cache_reads); #endif } else { @@ -97,7 +97,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, else rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(nr_secs, &pblk->inflight_reads); #endif } @@ -117,7 +117,7 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, continue; if (lba != blba + i) { -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG struct ppa_addr *p; p = (nr_lbas == 1) ? 
&rqd->ppa_list[i] : &rqd->ppa_addr; @@ -149,7 +149,7 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, meta_lba = le64_to_cpu(meta_lba_list[j].lba); if (lba != meta_lba) { -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG struct ppa_addr *p; int nr_ppas = rqd->nr_ppas; @@ -185,7 +185,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) static void pblk_end_user_read(struct bio *bio) { -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); #endif bio_endio(bio); @@ -212,7 +212,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, if (put_line) pblk_read_put_rqd_kref(pblk, rqd); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); #endif @@ -285,7 +285,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, if (rqd->error) { atomic_long_inc(&pblk->read_failed); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG pblk_print_failed_rqd(pblk, rqd, rqd->error); #endif } @@ -359,7 +359,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->inflight_reads); #endif @@ -382,7 +382,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, WARN_ON(test_and_set_bit(0, read_bitmap)); meta_list[0].lba = cpu_to_le64(lba); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->cache_reads); #endif } else { @@ -514,7 +514,7 @@ static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(valid_secs, &pblk->inflight_reads); #endif @@ -548,7 +548,7 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, rqd->ppa_addr = ppa_l2p; valid_secs = 1; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_inc(&pblk->inflight_reads); #endif @@ -619,12 +619,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) if (rqd.error) { atomic_long_inc(&pblk->read_failed_gc); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG pblk_print_failed_rqd(pblk, &rqd, rqd.error); #endif } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index 88a0a7c407aa09..b0e5e93a9d5f3a 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -421,7 +421,7 @@ static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page) return sz; } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) { return snprintf(page, PAGE_SIZE, @@ -598,7 +598,7 @@ static struct attribute sys_padding_dist = { .mode = 0644, }; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG static struct attribute sys_stats_debug_attr = { .name = "stats", .mode = 0444, @@ -619,7 +619,7 @@ static struct attribute *pblk_attrs[] = { &sys_write_amp_mileage, &sys_write_amp_trip, &sys_padding_dist, -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG &sys_stats_debug_attr, #endif NULL, @@ -654,7 +654,7 @@ static 
ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, return pblk_sysfs_get_write_amp_trip(pblk, buf); else if (strcmp(attr->name, "padding_dist") == 0) return pblk_sysfs_get_padding_dist(pblk, buf); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG else if (strcmp(attr->name, "stats") == 0) return pblk_sysfs_stats_debug(pblk, buf); #endif diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index f353e52941f59a..5f44df999aed71 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, /* Release flags on context. Protect from writes */ smp_store_release(&w_ctx->flags, flags); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_dec(&rwb->inflight_flush_point); #endif } @@ -51,7 +51,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, c_ctx->nr_padded); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); #endif @@ -78,7 +78,7 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, unsigned long flags; unsigned long pos; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); #endif @@ -196,7 +196,7 @@ static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) list_add_tail(&r_ctx->list, &pblk->resubmit_list); spin_unlock(&pblk->resubmit_lock); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); #endif } @@ -258,7 +258,7 @@ static void pblk_end_io_write(struct nvm_rq *rqd) pblk_end_w_fail(pblk, rqd); return; } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG else WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); #endif @@ -356,7 +356,7 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG if ((!secs_to_sync && secs_to_flush) || (secs_to_sync < 0) || (secs_to_sync > secs_avail && !secs_to_flush)) { @@ -640,7 +640,7 @@ static int pblk_submit_write(struct pblk *pblk) if (pblk_submit_io_set(pblk, rqd)) goto fail_free_bio; -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_long_add(secs_to_sync, &pblk->sub_writes); #endif diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9d1a0e86e0825c..c072955d72c2fd 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -193,7 +193,7 @@ struct pblk_rb { spinlock_t w_lock; /* Write lock */ spinlock_t s_lock; /* Sync lock */ -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ #endif }; @@ -636,7 +636,7 @@ struct pblk { u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ atomic64_t nr_flush; /* Number of flush/fua I/O */ -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG /* Non-persistent debug counters, 4kb sector I/Os */ atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ @@ -1279,7 +1279,7 @@ static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) return !(nr_secs % pblk->min_write_pgs); } -#ifdef CONFIG_NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG static inline void print_ppa(struct nvm_geo *geo, struct ppa_addr *p, char *msg, int error) { From 
99b8dad1b6e52721904220322a947f7b75056303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 13 Jul 2018 10:48:38 +0200 Subject: [PATCH 058/190] lightnvm: pblk: enable line minor version detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovering a line, an extra check was added when debugging was active, such that minor version where also checked. Unfortunately, this used the ifdef NVM_DEBUG, which is not correct. Instead use the proper DEBUG def, and now that it compiles, also fix the variable. Signed-off-by: Matias Bjørling Fixes: d0ab0b1ab991f ("lightnvm: pblk: check data lines version on recovery") Reviewed-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 3a5069183859e8..d83466b3821b4d 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -742,9 +742,10 @@ static int pblk_recov_check_line_version(struct pblk *pblk, return 1; } -#ifdef NVM_DEBUG +#ifdef CONFIG_NVM_PBLK_DEBUG if (header->version_minor > EMETA_VERSION_MINOR) - pr_info("pblk: newer line minor version found: %d\n", line_v); + pr_info("pblk: newer line minor version found: %d\n", + header->version_minor); #endif return 0; From 242e461fb628bb63763e0bb2788d52ea054f8721 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 13 Jul 2018 10:48:39 +0200 Subject: [PATCH 059/190] lightnvm: Remove redundant rq->__data_len initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since both blk_old_get_request() and blk_mq_alloc_request() initialize rq->__data_len to zero, it is not necessary to initialize that member in nvme_nvm_alloc_request(). Hence remove the rq->__data_len initialization from nvme_nvm_alloc_request(). Signed-off-by: Bart Van Assche Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 41279da799ed8b..a76db8820f1c99 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -662,12 +662,10 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (rqd->bio) { + if (rqd->bio) blk_init_request_from_bio(rq, rqd->bio); - } else { + else rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); - rq->__data_len = 0; - } return rq; } From 921aebfac0871e8212913039d5241c0b3527eddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 13 Jul 2018 10:48:40 +0200 Subject: [PATCH 060/190] lightnvm: pblk: fix read_bitmap for 32bit archs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If using pblk on a 32bit architecture, and there is a need to perform a partial read, the partial read bitmap will only have allocated 32 entries, where as 64 are needed. Make sure that the read_bitmap is initialized to 64bits on 32bit architectures as well. 
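As a short illustration of why DECLARE_BITMAP() is the right tool here (definition from include/linux/types.h; the size arithmetic assumes NVM_MAX_VLBA == 64, matching the 64 entries mentioned above):

	#define DECLARE_BITMAP(name, bits) \
		unsigned long name[BITS_TO_LONGS(bits)]

	/*
	 * DECLARE_BITMAP(read_bitmap, 64) yields:
	 *   - one unsigned long on a 64-bit build  (1 x 64 bits)
	 *   - two unsigned longs on a 32-bit build (2 x 32 bits)
	 * whereas a plain "unsigned long read_bitmap" only ever provides
	 * BITS_PER_LONG bits, i.e. 32 on a 32-bit architecture.
	 */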
Signed-off-by: Matias Bjørling Reviewed-by: Igor Konopko Reviewed-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 6e93c489ce57ce..bcfc6ea86e9d64 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -401,7 +401,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) struct pblk_g_ctx *r_ctx; struct nvm_rq *rqd; unsigned int bio_init_idx; - unsigned long read_bitmap; /* Max 64 ppas per request */ + DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA); int ret = NVM_IO_ERR; /* logic error: lba out-of-bounds. Ignore read request */ @@ -413,7 +413,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0); - bitmap_zero(&read_bitmap, nr_secs); + bitmap_zero(read_bitmap, nr_secs); rqd = pblk_alloc_rqd(pblk, PBLK_READ); @@ -444,19 +444,19 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; - pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap); + pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap); } else { - pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap); + pblk_read_rq(pblk, rqd, bio, blba, read_bitmap); } - if (bitmap_full(&read_bitmap, nr_secs)) { + if (bitmap_full(read_bitmap, nr_secs)) { atomic_inc(&pblk->inflight_io); __pblk_end_io_read(pblk, rqd, false); return NVM_IO_DONE; } /* All sectors are to be read from the device */ - if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { + if (bitmap_empty(read_bitmap, rqd->nr_ppas)) { struct bio *int_bio = NULL; /* Clone read bio to deal with read errors internally */ @@ -480,7 +480,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* The read bio request could be partially filled by the write buffer, * but there are some holes that need to be read from the drive. */ - return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap); + return pblk_partial_read(pblk, rqd, bio, bio_init_idx, read_bitmap); fail_rqd_free: pblk_free_rqd(pblk, rqd, PBLK_READ); From 59a8f43b6341b6964a9956640bb0f21b083ccd66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 13 Jul 2018 10:48:41 +0200 Subject: [PATCH 061/190] lightnvm: limit get chunk meta request size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For devices that does not specify a limit on its transfer size, the get_chk_meta command may send down a single I/O retrieving the full chunk metadata table. Resulting in large 2-4MB I/O requests. Instead, split up the I/Os to a maximum of 256KB and issue them separately to reduce memory requirements. 
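In outline this follows the usual bounded-transfer pattern; a simplified, hypothetical sketch (the names and the issue() callback are placeholders, not driver API):

	/* Split one large log-page read into requests of at most 'cap' bytes. */
	static int read_in_chunks(size_t total, size_t dev_limit, size_t cap,
				  int (*issue)(size_t offset, size_t len))
	{
		size_t offset = 0;
		size_t max_len = min(dev_limit, cap);	/* cap is 256 * 1024 here */

		while (total) {
			size_t len = min(total, max_len);
			int ret = issue(offset, len);

			if (ret)
				return ret;
			offset += len;
			total -= len;
		}
		return 0;
	}

The device limit itself comes from ctrl->max_hw_sectors << 9, as the diff below shows.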
Signed-off-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index a76db8820f1c99..d9e4cccd5b66c0 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -583,7 +583,13 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, struct ppa_addr ppa; size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); size_t log_pos, offset, len; - int ret, i; + int ret, i, max_len; + + /* + * limit requests to maximum 256K to avoid issuing arbitrary large + * requests when the device does not specific a maximum transfer size. + */ + max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024); /* Normalize lba address space to obtain log offset */ ppa.ppa = slba; @@ -596,7 +602,7 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, offset = log_pos * sizeof(struct nvme_nvm_chk_meta); while (left) { - len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9); + len = min_t(unsigned int, left, max_len); ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK, dev_meta, len, offset); From 4e495a46b1039252f4af0c883e2cb31cc5f44145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 13 Jul 2018 10:48:42 +0200 Subject: [PATCH 062/190] lightnvm: pblk: expose generic disk name on pr_* msgs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error messages in pblk does not say which pblk instance that a message occurred from. Update each error message to reflect the instance it belongs to, and also prefix it with pblk, so we know the message comes from the pblk module. Signed-off-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 51 ++++++++++----------- drivers/lightnvm/pblk-gc.c | 32 +++++++------- drivers/lightnvm/pblk-init.c | 76 ++++++++++++++++---------------- drivers/lightnvm/pblk-rb.c | 8 ++-- drivers/lightnvm/pblk-read.c | 25 ++++++----- drivers/lightnvm/pblk-recovery.c | 44 +++++++++--------- drivers/lightnvm/pblk-sysfs.c | 5 +-- drivers/lightnvm/pblk-write.c | 21 ++++----- drivers/lightnvm/pblk.h | 29 ++++++++---- 9 files changed, 153 insertions(+), 138 deletions(-) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 66ab1036f2fb00..b829460fe827bf 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -35,7 +35,7 @@ static void pblk_line_mark_bb(struct work_struct *work) line = &pblk->lines[pblk_ppa_to_line(*ppa)]; pos = pblk_ppa_to_pos(&dev->geo, *ppa); - pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", + pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n", line->id, pos); } @@ -51,12 +51,12 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, struct ppa_addr *ppa; int pos = pblk_ppa_to_pos(geo, ppa_addr); - pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); + pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos); atomic_long_inc(&pblk->erase_failed); atomic_dec(&line->blk_in_line); if (test_and_set_bit(pos, line->blk_bitmap)) - pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", + pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n", line->id, pos); /* Not necessary to mark bad blocks on 2.0 spec. 
*/ @@ -274,7 +274,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) pool = &pblk->e_rq_pool; break; default: - pr_err("pblk: trying to free unknown rqd type\n"); + pblk_err(pblk, "trying to free unknown rqd type\n"); return; } @@ -310,7 +310,7 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); if (ret != PBLK_EXPOSED_PAGE_SIZE) { - pr_err("pblk: could not add page to bio\n"); + pblk_err(pblk, "could not add page to bio\n"); mempool_free(page, &pblk->page_bio_pool); goto err; } @@ -410,7 +410,7 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) line->state = PBLK_LINESTATE_CORRUPT; line->gc_group = PBLK_LINEGC_NONE; move_list = &l_mg->corrupt_list; - pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", + pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", line->id, vsc, line->sec_in_line, lm->high_thrs, lm->mid_thrs); @@ -452,7 +452,7 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) atomic_long_inc(&pblk->read_failed); break; default: - pr_err("pblk: unknown read error:%d\n", rqd->error); + pblk_err(pblk, "unknown read error:%d\n", rqd->error); } #ifdef CONFIG_NVM_PBLK_DEBUG pblk_print_failed_rqd(pblk, rqd, rqd->error); @@ -517,7 +517,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, for (i = 0; i < nr_secs; i++) { page = vmalloc_to_page(kaddr); if (!page) { - pr_err("pblk: could not map vmalloc bio\n"); + pblk_err(pblk, "could not map vmalloc bio\n"); bio_put(bio); bio = ERR_PTR(-ENOMEM); goto out; @@ -525,7 +525,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { - pr_err("pblk: could not add page to bio\n"); + pblk_err(pblk, "could not add page to bio\n"); bio_put(bio); bio = ERR_PTR(-ENOMEM); goto out; @@ -711,7 +711,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, while (test_bit(pos, line->blk_bitmap)) { paddr += min; if (pblk_boundary_paddr_checks(pblk, paddr)) { - pr_err("pblk: corrupt emeta line:%d\n", + pblk_err(pblk, "corrupt emeta line:%d\n", line->id); bio_put(bio); ret = -EINTR; @@ -723,7 +723,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, } if (pblk_boundary_paddr_checks(pblk, paddr + min)) { - pr_err("pblk: corrupt emeta line:%d\n", + pblk_err(pblk, "corrupt emeta line:%d\n", line->id); bio_put(bio); ret = -EINTR; @@ -738,7 +738,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { - pr_err("pblk: emeta I/O submission failed: %d\n", ret); + pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_rqd_dma; } @@ -843,7 +843,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, */ ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { - pr_err("pblk: smeta I/O submission failed: %d\n", ret); + pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_ppa_list; } @@ -905,7 +905,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - pr_err("pblk: could not sync erase line:%d,blk:%d\n", + pblk_err(pblk, "could not sync erase line:%d,blk:%d\n", pblk_ppa_to_line(ppa), pblk_ppa_to_pos(geo, ppa)); @@ -945,7 +945,7 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) ret 
= pblk_blk_erase_sync(pblk, ppa); if (ret) { - pr_err("pblk: failed to erase line %d\n", line->id); + pblk_err(pblk, "failed to erase line %d\n", line->id); return ret; } } while (1); @@ -1012,7 +1012,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, list_add_tail(&line->list, &l_mg->bad_list); spin_unlock(&l_mg->free_lock); - pr_debug("pblk: line %d is bad\n", line->id); + pblk_debug(pblk, "line %d is bad\n", line->id); return 0; } @@ -1122,7 +1122,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, line->cur_sec = off + lm->smeta_sec; if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { - pr_debug("pblk: line smeta I/O failed. Retry\n"); + pblk_debug(pblk, "line smeta I/O failed. Retry\n"); return 0; } @@ -1154,7 +1154,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, spin_unlock(&line->lock); list_add_tail(&line->list, &l_mg->bad_list); - pr_err("pblk: unexpected line %d is bad\n", line->id); + pblk_err(pblk, "unexpected line %d is bad\n", line->id); return 0; } @@ -1299,7 +1299,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk) retry: if (list_empty(&l_mg->free_list)) { - pr_err("pblk: no free lines\n"); + pblk_err(pblk, "no free lines\n"); return NULL; } @@ -1315,7 +1315,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk) list_add_tail(&line->list, &l_mg->bad_list); - pr_debug("pblk: line %d is bad\n", line->id); + pblk_debug(pblk, "line %d is bad\n", line->id); goto retry; } @@ -1329,7 +1329,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk) list_add(&line->list, &l_mg->corrupt_list); goto retry; default: - pr_err("pblk: failed to prepare line %d\n", line->id); + pblk_err(pblk, "failed to prepare line %d\n", line->id); list_add(&line->list, &l_mg->free_list); l_mg->nr_free_lines++; return NULL; @@ -1477,7 +1477,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk) ret = pblk_submit_meta_io(pblk, line); if (ret) { - pr_err("pblk: sync meta line %d failed (%d)\n", + pblk_err(pblk, "sync meta line %d failed (%d)\n", line->id, ret); return; } @@ -1507,7 +1507,7 @@ void __pblk_pipeline_flush(struct pblk *pblk) ret = pblk_recov_pad(pblk); if (ret) { - pr_err("pblk: could not close data on teardown(%d)\n", ret); + pblk_err(pblk, "could not close data on teardown(%d)\n", ret); return; } @@ -1687,7 +1687,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; - pr_err("pblk: could not async erase line:%d,blk:%d\n", + pblk_err(pblk, "could not async erase line:%d,blk:%d\n", pblk_ppa_to_line(ppa), pblk_ppa_to_pos(geo, ppa)); } @@ -1866,7 +1866,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); if (ret == -ETIME || ret == -EINTR) - pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret); + pblk_err(pblk, "taking lun semaphore timed out: err %d\n", + -ret); } void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 40d2dcb4f2bdf0..157c2567c9e850 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -90,7 +90,7 @@ static void pblk_gc_line_ws(struct work_struct *work) gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); if (!gc_rq->data) { - pr_err("pblk: could not GC line:%d (%d/%d)\n", + pblk_err(pblk, "could not GC line:%d (%d/%d)\n", line->id, *line->vsc, 
gc_rq->nr_secs); goto out; } @@ -98,7 +98,7 @@ static void pblk_gc_line_ws(struct work_struct *work) /* Read from GC victim block */ ret = pblk_submit_read_gc(pblk, gc_rq); if (ret) { - pr_err("pblk: failed GC read in line:%d (err:%d)\n", + pblk_err(pblk, "failed GC read in line:%d (err:%d)\n", line->id, ret); goto out; } @@ -146,7 +146,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk, ret = pblk_line_read_emeta(pblk, line, emeta_buf); if (ret) { - pr_err("pblk: line %d read emeta failed (%d)\n", + pblk_err(pblk, "line %d read emeta failed (%d)\n", line->id, ret); pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); return NULL; @@ -160,7 +160,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk, ret = pblk_recov_check_emeta(pblk, emeta_buf); if (ret) { - pr_err("pblk: inconsistent emeta (line %d)\n", + pblk_err(pblk, "inconsistent emeta (line %d)\n", line->id); pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); return NULL; @@ -201,7 +201,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) } else { lba_list = get_lba_list_from_emeta(pblk, line); if (!lba_list) { - pr_err("pblk: could not interpret emeta (line %d)\n", + pblk_err(pblk, "could not interpret emeta (line %d)\n", line->id); goto fail_free_invalid_bitmap; } @@ -213,7 +213,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) spin_unlock(&line->lock); if (sec_left < 0) { - pr_err("pblk: corrupted GC line (%d)\n", line->id); + pblk_err(pblk, "corrupted GC line (%d)\n", line->id); goto fail_free_lba_list; } @@ -289,7 +289,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) kref_put(&line->ref, pblk_line_put); atomic_dec(&gc->read_inflight_gc); - pr_err("pblk: Failed to GC line %d\n", line->id); + pblk_err(pblk, "failed to GC line %d\n", line->id); } static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) @@ -297,7 +297,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) struct pblk_gc *gc = &pblk->gc; struct pblk_line_ws *line_ws; - pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); + pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id); line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); if (!line_ws) @@ -351,7 +351,7 @@ static int pblk_gc_read(struct pblk *pblk) pblk_gc_kick(pblk); if (pblk_gc_line(pblk, line)) - pr_err("pblk: failed to GC line %d\n", line->id); + pblk_err(pblk, "failed to GC line %d\n", line->id); return 0; } @@ -523,7 +523,7 @@ static int pblk_gc_reader_ts(void *data) } #ifdef CONFIG_NVM_PBLK_DEBUG - pr_info("pblk: flushing gc pipeline, %d lines left\n", + pblk_info(pblk, "flushing gc pipeline, %d lines left\n", atomic_read(&gc->pipeline_gc)); #endif @@ -540,7 +540,7 @@ static int pblk_gc_reader_ts(void *data) static void pblk_gc_start(struct pblk *pblk) { pblk->gc.gc_active = 1; - pr_debug("pblk: gc start\n"); + pblk_debug(pblk, "gc start\n"); } void pblk_gc_should_start(struct pblk *pblk) @@ -605,14 +605,14 @@ int pblk_gc_init(struct pblk *pblk) gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); if (IS_ERR(gc->gc_ts)) { - pr_err("pblk: could not allocate GC main kthread\n"); + pblk_err(pblk, "could not allocate GC main kthread\n"); return PTR_ERR(gc->gc_ts); } gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, "pblk-gc-writer-ts"); if (IS_ERR(gc->gc_writer_ts)) { - pr_err("pblk: could not allocate GC writer kthread\n"); + pblk_err(pblk, "could not allocate GC writer kthread\n"); ret = PTR_ERR(gc->gc_writer_ts); goto fail_free_main_kthread; } @@ -620,7 +620,7 @@ int 
pblk_gc_init(struct pblk *pblk) gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, "pblk-gc-reader-ts"); if (IS_ERR(gc->gc_reader_ts)) { - pr_err("pblk: could not allocate GC reader kthread\n"); + pblk_err(pblk, "could not allocate GC reader kthread\n"); ret = PTR_ERR(gc->gc_reader_ts); goto fail_free_writer_kthread; } @@ -641,7 +641,7 @@ int pblk_gc_init(struct pblk *pblk) gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); if (!gc->gc_line_reader_wq) { - pr_err("pblk: could not allocate GC line reader workqueue\n"); + pblk_err(pblk, "could not allocate GC line reader workqueue\n"); ret = -ENOMEM; goto fail_free_reader_kthread; } @@ -650,7 +650,7 @@ int pblk_gc_init(struct pblk *pblk) gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!gc->gc_reader_wq) { - pr_err("pblk: could not allocate GC reader workqueue\n"); + pblk_err(pblk, "could not allocate GC reader workqueue\n"); ret = -ENOMEM; goto fail_free_reader_line_wq; } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 9ea30102f61c6d..d023ea6116bc3f 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -117,13 +117,13 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) } else { line = pblk_recov_l2p(pblk); if (IS_ERR(line)) { - pr_err("pblk: could not recover l2p table\n"); + pblk_err(pblk, "could not recover l2p table\n"); return -EFAULT; } } #ifdef CONFIG_NVM_PBLK_DEBUG - pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); + pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); #endif /* Free full lines directly as GC has not been started yet */ @@ -166,7 +166,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) static void pblk_rwb_free(struct pblk *pblk) { if (pblk_rb_tear_down_check(&pblk->rwb)) - pr_err("pblk: write buffer error on tear down\n"); + pblk_err(pblk, "write buffer error on tear down\n"); pblk_rb_data_free(&pblk->rwb); vfree(pblk_rb_entries_ref(&pblk->rwb)); @@ -203,7 +203,8 @@ static int pblk_rwb_init(struct pblk *pblk) /* Minimum pages needed within a lun */ #define ADDR_POOL_SIZE 64 -static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst) +static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, + struct nvm_addrf_12 *dst) { struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; int power_len; @@ -211,14 +212,14 @@ static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst) /* Re-calculate channel and lun format to adapt to configuration */ power_len = get_count_order(geo->num_ch); if (1 << power_len != geo->num_ch) { - pr_err("pblk: supports only power-of-two channel config.\n"); + pblk_err(pblk, "supports only power-of-two channel config.\n"); return -EINVAL; } dst->ch_len = power_len; power_len = get_count_order(geo->num_lun); if (1 << power_len != geo->num_lun) { - pr_err("pblk: supports only power-of-two LUN config.\n"); + pblk_err(pblk, "supports only power-of-two LUN config.\n"); return -EINVAL; } dst->lun_len = power_len; @@ -285,18 +286,19 @@ static int pblk_set_addrf(struct pblk *pblk) case NVM_OCSSD_SPEC_12: div_u64_rem(geo->clba, pblk->min_write_pgs, &mod); if (mod) { - pr_err("pblk: bad configuration of sectors/pages\n"); + pblk_err(pblk, "bad configuration of sectors/pages\n"); return -EINVAL; } - pblk->addrf_len = pblk_set_addrf_12(geo, (void *)&pblk->addrf); + pblk->addrf_len = pblk_set_addrf_12(pblk, geo, + (void *)&pblk->addrf); break; 
case NVM_OCSSD_SPEC_20: pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf, - &pblk->uaddrf); + &pblk->uaddrf); break; default: - pr_err("pblk: OCSSD revision not supported (%d)\n", + pblk_err(pblk, "OCSSD revision not supported (%d)\n", geo->version); return -EINVAL; } @@ -375,7 +377,7 @@ static int pblk_core_init(struct pblk *pblk) pblk_set_sec_per_write(pblk, pblk->min_write_pgs); if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { - pr_err("pblk: vector list too big(%u > %u)\n", + pblk_err(pblk, "vector list too big(%u > %u)\n", pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); return -EINVAL; } @@ -608,7 +610,7 @@ static int pblk_luns_init(struct pblk *pblk) /* TODO: Implement unbalanced LUN support */ if (geo->num_lun < 0) { - pr_err("pblk: unbalanced LUN config.\n"); + pblk_err(pblk, "unbalanced LUN config.\n"); return -EINVAL; } @@ -1027,7 +1029,7 @@ static int pblk_line_meta_init(struct pblk *pblk) lm->emeta_sec[0], geo->clba); if (lm->min_blk_line > lm->blk_per_line) { - pr_err("pblk: config. not supported. Min. LUN in line:%d\n", + pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n", lm->blk_per_line); return -EINVAL; } @@ -1079,7 +1081,7 @@ static int pblk_lines_init(struct pblk *pblk) } if (!nr_free_chks) { - pr_err("pblk: too many bad blocks prevent for sane instance\n"); + pblk_err(pblk, "too many bad blocks prevent for sane instance\n"); return -EINTR; } @@ -1109,7 +1111,7 @@ static int pblk_writer_init(struct pblk *pblk) int err = PTR_ERR(pblk->writer_ts); if (err != -EINTR) - pr_err("pblk: could not allocate writer kthread (%d)\n", + pblk_err(pblk, "could not allocate writer kthread (%d)\n", err); return err; } @@ -1155,7 +1157,7 @@ static void pblk_tear_down(struct pblk *pblk, bool graceful) pblk_rb_sync_l2p(&pblk->rwb); pblk_rl_free(&pblk->rl); - pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful); + pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful); } static void pblk_exit(void *private, bool graceful) @@ -1167,7 +1169,7 @@ static void pblk_exit(void *private, bool graceful) pblk_tear_down(pblk, graceful); #ifdef CONFIG_NVM_PBLK_DEBUG - pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); + pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); #endif pblk_free(pblk); @@ -1190,29 +1192,30 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, struct pblk *pblk; int ret; - /* pblk supports 1.2 and 2.0 versions */ + pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); + if (!pblk) + return ERR_PTR(-ENOMEM); + + pblk->dev = dev; + pblk->disk = tdisk; + pblk->state = PBLK_STATE_RUNNING; + pblk->gc.gc_enabled = 0; + if (!(geo->version == NVM_OCSSD_SPEC_12 || geo->version == NVM_OCSSD_SPEC_20)) { - pr_err("pblk: OCSSD version not supported (%u)\n", + pblk_err(pblk, "OCSSD version not supported (%u)\n", geo->version); + kfree(pblk); return ERR_PTR(-EINVAL); } if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) { - pr_err("pblk: host-side L2P table not supported. (%x)\n", + pblk_err(pblk, "host-side L2P table not supported. 
(%x)\n", geo->dom); + kfree(pblk); return ERR_PTR(-EINVAL); } - pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); - if (!pblk) - return ERR_PTR(-ENOMEM); - - pblk->dev = dev; - pblk->disk = tdisk; - pblk->state = PBLK_STATE_RUNNING; - pblk->gc.gc_enabled = 0; - spin_lock_init(&pblk->resubmit_lock); spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); @@ -1242,38 +1245,38 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, ret = pblk_core_init(pblk); if (ret) { - pr_err("pblk: could not initialize core\n"); + pblk_err(pblk, "could not initialize core\n"); goto fail; } ret = pblk_lines_init(pblk); if (ret) { - pr_err("pblk: could not initialize lines\n"); + pblk_err(pblk, "could not initialize lines\n"); goto fail_free_core; } ret = pblk_rwb_init(pblk); if (ret) { - pr_err("pblk: could not initialize write buffer\n"); + pblk_err(pblk, "could not initialize write buffer\n"); goto fail_free_lines; } ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); if (ret) { - pr_err("pblk: could not initialize maps\n"); + pblk_err(pblk, "could not initialize maps\n"); goto fail_free_rwb; } ret = pblk_writer_init(pblk); if (ret) { if (ret != -EINTR) - pr_err("pblk: could not initialize write thread\n"); + pblk_err(pblk, "could not initialize write thread\n"); goto fail_free_l2p; } ret = pblk_gc_init(pblk); if (ret) { - pr_err("pblk: could not initialize gc\n"); + pblk_err(pblk, "could not initialize gc\n"); goto fail_stop_writer; } @@ -1288,8 +1291,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); - pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n", - tdisk->disk_name, + pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n", geo->all_luns, pblk->l_mg.nr_lines, (unsigned long long)pblk->rl.nr_secs, pblk->rwb.nr_entries); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 529def80966bc1..f6eec0212dfcf0 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -547,7 +547,7 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, page = virt_to_page(entry->data); if (!page) { - pr_err("pblk: could not allocate write bio page\n"); + pblk_err(pblk, "could not allocate write bio page\n"); flags &= ~PBLK_WRITTEN_DATA; flags |= PBLK_SUBMITTED_ENTRY; /* Release flags on context. Protect from writes */ @@ -557,7 +557,7 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != rb->seg_size) { - pr_err("pblk: could not add page to write bio\n"); + pblk_err(pblk, "could not add page to write bio\n"); flags &= ~PBLK_WRITTEN_DATA; flags |= PBLK_SUBMITTED_ENTRY; /* Release flags on context. Protect from writes */ @@ -576,14 +576,14 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, if (pad) { if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { - pr_err("pblk: could not pad page in write bio\n"); + pblk_err(pblk, "could not pad page in write bio\n"); return NVM_IO_ERR; } if (pad < pblk->min_write_pgs) atomic64_inc(&pblk->pad_dist[pad - 1]); else - pr_warn("pblk: padding more than min. sectors\n"); + pblk_warn(pblk, "padding more than min. 
sectors\n"); atomic64_add(pad, &pblk->pad_wa); } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index bcfc6ea86e9d64..9c9362b2086191 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -121,9 +121,9 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, struct ppa_addr *p; p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr; - print_ppa(&pblk->dev->geo, p, "seq", i); + print_ppa(pblk, p, "seq", i); #endif - pr_err("pblk: corrupted read LBA (%llu/%llu)\n", + pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", lba, (u64)blba + i); WARN_ON(1); } @@ -154,9 +154,9 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, int nr_ppas = rqd->nr_ppas; p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr; - print_ppa(&pblk->dev->geo, p, "seq", j); + print_ppa(pblk, p, "seq", j); #endif - pr_err("pblk: corrupted read LBA (%llu/%llu)\n", + pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", lba, meta_lba); WARN_ON(1); } @@ -256,7 +256,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, goto fail_add_pages; if (nr_holes != new_bio->bi_vcnt) { - pr_err("pblk: malformed bio\n"); + pblk_err(pblk, "malformed bio\n"); goto fail; } @@ -279,7 +279,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, ret = pblk_submit_io_sync(pblk, rqd); if (ret) { bio_put(rqd->bio); - pr_err("pblk: sync read IO submission failed\n"); + pblk_err(pblk, "sync read IO submission failed\n"); goto fail; } @@ -346,7 +346,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, /* Free allocated pages in new bio */ pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt); fail_add_pages: - pr_err("pblk: failed to perform partial read\n"); + pblk_err(pblk, "failed to perform partial read\n"); __pblk_end_io_read(pblk, rqd, false); return NVM_IO_ERR; } @@ -436,7 +436,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &rqd->dma_meta_list); if (!rqd->meta_list) { - pr_err("pblk: not able to allocate ppa list\n"); + pblk_err(pblk, "not able to allocate ppa list\n"); goto fail_rqd_free; } @@ -462,14 +462,14 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* Clone read bio to deal with read errors internally */ int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); if (!int_bio) { - pr_err("pblk: could not clone read bio\n"); + pblk_err(pblk, "could not clone read bio\n"); goto fail_end_io; } rqd->bio = int_bio; if (pblk_submit_io(pblk, rqd)) { - pr_err("pblk: read IO submission failed\n"); + pblk_err(pblk, "read IO submission failed\n"); ret = NVM_IO_ERR; goto fail_end_io; } @@ -595,7 +595,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { - pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); + pblk_err(pblk, "could not allocate GC bio (%lu)\n", + PTR_ERR(bio)); goto err_free_dma; } @@ -609,7 +610,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) if (pblk_submit_io_sync(pblk, &rqd)) { ret = -EIO; - pr_err("pblk: GC read request failed\n"); + pblk_err(pblk, "GC read request failed\n"); goto err_free_bio; } diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index d83466b3821b4d..e232e47e13532e 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -77,7 +77,7 @@ 
static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) } if (nr_valid_lbas != nr_lbas) - pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n", + pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n", line->id, nr_valid_lbas, nr_lbas); line->left_msecs = 0; @@ -184,7 +184,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, /* If read fails, more padding is needed */ ret = pblk_submit_io_sync(pblk, rqd); if (ret) { - pr_err("pblk: I/O submission failed: %d\n", ret); + pblk_err(pblk, "I/O submission failed: %d\n", ret); return ret; } @@ -194,7 +194,7 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, * we cannot recover from here. Need FTL log. */ if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { - pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); + pblk_err(pblk, "L2P recovery failed (%d)\n", rqd->error); return -EINTR; } @@ -273,7 +273,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, next_pad_rq: rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); if (rq_ppas < pblk->min_write_pgs) { - pr_err("pblk: corrupted pad line %d\n", line->id); + pblk_err(pblk, "corrupted pad line %d\n", line->id); goto fail_free_pad; } @@ -342,7 +342,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, ret = pblk_submit_io(pblk, rqd); if (ret) { - pr_err("pblk: I/O submission failed: %d\n", ret); + pblk_err(pblk, "I/O submission failed: %d\n", ret); pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); goto fail_free_bio; } @@ -356,12 +356,12 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, if (!wait_for_completion_io_timeout(&pad_rq->wait, msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: pad write timed out\n"); + pblk_err(pblk, "pad write timed out\n"); ret = -ETIME; } if (!pblk_line_is_full(line)) - pr_err("pblk: corrupted padded line: %d\n", line->id); + pblk_err(pblk, "corrupted padded line: %d\n", line->id); vfree(data); free_rq: @@ -461,7 +461,7 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, ret = pblk_submit_io_sync(pblk, rqd); if (ret) { - pr_err("pblk: I/O submission failed: %d\n", ret); + pblk_err(pblk, "I/O submission failed: %d\n", ret); return ret; } @@ -501,11 +501,11 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, ret = pblk_recov_pad_oob(pblk, line, pad_secs); if (ret) - pr_err("pblk: OOB padding failed (err:%d)\n", ret); + pblk_err(pblk, "OOB padding failed (err:%d)\n", ret); ret = pblk_recov_read_oob(pblk, line, p, r_ptr); if (ret) - pr_err("pblk: OOB read failed (err:%d)\n", ret); + pblk_err(pblk, "OOB read failed (err:%d)\n", ret); left_ppas = 0; } @@ -592,7 +592,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, ret = pblk_submit_io_sync(pblk, rqd); if (ret) { - pr_err("pblk: I/O submission failed: %d\n", ret); + pblk_err(pblk, "I/O submission failed: %d\n", ret); bio_put(bio); return ret; } @@ -671,14 +671,14 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) ret = pblk_recov_scan_oob(pblk, line, p, &done); if (ret) { - pr_err("pblk: could not recover L2P from OOB\n"); + pblk_err(pblk, "could not recover L2P from OOB\n"); goto out; } if (!done) { ret = pblk_recov_scan_all_oob(pblk, line, p); if (ret) { - pr_err("pblk: could not recover L2P from OOB\n"); + pblk_err(pblk, "could not recover L2P from OOB\n"); goto out; } } @@ -737,14 +737,14 @@ static int 
pblk_recov_check_line_version(struct pblk *pblk, struct line_header *header = &emeta->header; if (header->version_major != EMETA_VERSION_MAJOR) { - pr_err("pblk: line major version mismatch: %d, expected: %d\n", - header->version_major, EMETA_VERSION_MAJOR); + pblk_err(pblk, "line major version mismatch: %d, expected: %d\n", + header->version_major, EMETA_VERSION_MAJOR); return 1; } #ifdef CONFIG_NVM_PBLK_DEBUG if (header->version_minor > EMETA_VERSION_MINOR) - pr_info("pblk: newer line minor version found: %d\n", + pblk_info(pblk, "newer line minor version found: %d\n", header->version_minor); #endif @@ -852,7 +852,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) continue; if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { - pr_err("pblk: found incompatible line version %u\n", + pblk_err(pblk, "found incompatible line version %u\n", smeta_buf->header.version_major); return ERR_PTR(-EINVAL); } @@ -864,7 +864,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) } if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) { - pr_debug("pblk: ignore line %u due to uuid mismatch\n", + pblk_debug(pblk, "ignore line %u due to uuid mismatch\n", i); continue; } @@ -888,7 +888,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) pblk_recov_line_add_ordered(&recov_list, line); found_lines++; - pr_debug("pblk: recovering data line %d, seq:%llu\n", + pblk_debug(pblk, "recovering data line %d, seq:%llu\n", line->id, smeta_buf->seq_nr); } @@ -948,7 +948,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) line->emeta = NULL; } else { if (open_lines > 1) - pr_err("pblk: failed to recover L2P\n"); + pblk_err(pblk, "failed to recover L2P\n"); open_lines++; line->meta_line = meta_line; @@ -977,7 +977,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) out: if (found_lines != recovered_lines) - pr_err("pblk: failed to recover all found lines %d/%d\n", + pblk_err(pblk, "failed to recover all found lines %d/%d\n", found_lines, recovered_lines); return data_line; @@ -1000,7 +1000,7 @@ int pblk_recov_pad(struct pblk *pblk) ret = pblk_recov_pad_oob(pblk, line, left_msecs); if (ret) { - pr_err("pblk: Tear down padding failed (%d)\n", ret); + pblk_err(pblk, "tear down padding failed (%d)\n", ret); return ret; } diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index b0e5e93a9d5f3a..9fc3dfa168b4bb 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -268,7 +268,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) spin_unlock(&l_mg->free_lock); if (nr_free_lines != free_line_cnt) - pr_err("pblk: corrupted free line list:%d/%d\n", + pblk_err(pblk, "corrupted free line list:%d/%d\n", nr_free_lines, free_line_cnt); sz = snprintf(page, PAGE_SIZE - sz, @@ -697,8 +697,7 @@ int pblk_sysfs_init(struct gendisk *tdisk) kobject_get(&parent_dev->kobj), "%s", "pblk"); if (ret) { - pr_err("pblk: could not register %s/pblk\n", - tdisk->disk_name); + pblk_err(pblk, "could not register\n"); return ret; } diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 5f44df999aed71..ee774a86cf1e6e 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -238,7 +238,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); if (!recovery) { - pr_err("pblk: could not allocate recovery work\n"); + pblk_err(pblk, "could not allocate recovery work\n"); return; } @@ -279,7 +279,7 @@ static void pblk_end_io_write_meta(struct 
nvm_rq *rqd) if (rqd->error) { pblk_log_write_err(pblk, rqd); - pr_err("pblk: metadata I/O failed. Line %d\n", line->id); + pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id); line->w_err_gc->has_write_err = 1; } @@ -360,7 +360,7 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, if ((!secs_to_sync && secs_to_flush) || (secs_to_sync < 0) || (secs_to_sync > secs_avail && !secs_to_flush)) { - pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n", + pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n", secs_avail, secs_to_sync, secs_to_flush); } #endif @@ -397,7 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, l_mg->emeta_alloc_type, GFP_KERNEL); if (IS_ERR(bio)) { - pr_err("pblk: failed to map emeta io"); + pblk_err(pblk, "failed to map emeta io"); ret = PTR_ERR(bio); goto fail_free_rqd; } @@ -428,7 +428,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) ret = pblk_submit_io(pblk, rqd); if (ret) { - pr_err("pblk: emeta I/O submission failed: %d\n", ret); + pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); goto fail_rollback; } @@ -518,7 +518,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) /* Assign lbas to ppas and populate request structure */ err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); if (err) { - pr_err("pblk: could not setup write request: %d\n", err); + pblk_err(pblk, "could not setup write request: %d\n", err); return NVM_IO_ERR; } @@ -527,7 +527,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) /* Submit data write for current data line */ err = pblk_submit_io(pblk, rqd); if (err) { - pr_err("pblk: data I/O submission failed: %d\n", err); + pblk_err(pblk, "data I/O submission failed: %d\n", err); return NVM_IO_ERR; } @@ -549,7 +549,8 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) /* Submit metadata write for previous data line */ err = pblk_submit_meta_io(pblk, meta_line); if (err) { - pr_err("pblk: metadata I/O submission failed: %d", err); + pblk_err(pblk, "metadata I/O submission failed: %d", + err); return NVM_IO_ERR; } } @@ -614,7 +615,7 @@ static int pblk_submit_write(struct pblk *pblk) secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); if (secs_to_sync > pblk->max_write_pgs) { - pr_err("pblk: bad buffer sync calculation\n"); + pblk_err(pblk, "bad buffer sync calculation\n"); return 1; } @@ -633,7 +634,7 @@ static int pblk_submit_write(struct pblk *pblk) if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, secs_avail)) { - pr_err("pblk: corrupted write bio\n"); + pblk_err(pblk, "corrupted write bio\n"); goto fail_put_bio; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index c072955d72c2fd..5c6904eb855761 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -703,6 +703,15 @@ struct pblk_line_ws { #define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) #define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) +#define pblk_err(pblk, fmt, ...) \ + pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) +#define pblk_info(pblk, fmt, ...) \ + pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) +#define pblk_warn(pblk, fmt, ...) \ + pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) +#define pblk_debug(pblk, fmt, ...) 
\ + pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) + /* * pblk ring buffer operations */ @@ -1280,19 +1289,21 @@ static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) } #ifdef CONFIG_NVM_PBLK_DEBUG -static inline void print_ppa(struct nvm_geo *geo, struct ppa_addr *p, +static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p, char *msg, int error) { + struct nvm_geo *geo = &pblk->dev->geo; + if (p->c.is_cached) { - pr_err("ppa: (%s: %x) cache line: %llu\n", + pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n", msg, error, (u64)p->c.line); } else if (geo->version == NVM_OCSSD_SPEC_12) { - pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", + pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", msg, error, p->g.ch, p->g.lun, p->g.blk, p->g.pg, p->g.pl, p->g.sec); } else { - pr_err("ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", + pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", msg, error, p->m.grp, p->m.pu, p->m.chk, p->m.sec); } @@ -1304,16 +1315,16 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, int bit = -1; if (rqd->nr_ppas == 1) { - print_ppa(&pblk->dev->geo, &rqd->ppa_addr, "rqd", error); + print_ppa(pblk, &rqd->ppa_addr, "rqd", error); return; } while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, bit + 1)) < rqd->nr_ppas) { - print_ppa(&pblk->dev->geo, &rqd->ppa_list[bit], "rqd", error); + print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error); } - pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); + pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status); } static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, @@ -1344,7 +1355,7 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, continue; } - print_ppa(geo, ppa, "boundary", i); + print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i); return 1; } @@ -1374,7 +1385,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) spin_lock(&line->lock); if (line->state != PBLK_LINESTATE_OPEN) { - pr_err("pblk: bad ppa: line:%d,state:%d\n", + pblk_err(pblk, "bad ppa: line:%d,state:%d\n", line->id, line->state); WARN_ON(1); spin_unlock(&line->lock); From 884b031b288bae15397dd07b084a41ffb44f99e4 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 13 Jul 2018 10:48:43 +0200 Subject: [PATCH 063/190] lightnvm: pblk: mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b829460fe827bf..00984b486fea72 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -264,6 +264,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) switch (type) { case PBLK_WRITE: kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); + /* fall through */ case PBLK_WRITE_INT: pool = &pblk->w_rq_pool; break; From 11f6ad699a32f3be1232741e4bfa34abf6677cb8 Mon Sep 17 00:00:00 2001 From: Heiner Litz Date: Fri, 13 Jul 2018 10:48:44 +0200 Subject: [PATCH 064/190] lightnvm: pblk: add asynchronous partial read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the read path, partial reads are currently performed synchronously which affects performance for workloads that generate many partial reads. This patch adds an asynchronous partial read path as well as the required partial read ctx. Signed-off-by: Heiner Litz Reviewed-by: Igor Konopko Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 183 +++++++++++++++++++++++------------ drivers/lightnvm/pblk.h | 10 ++ 2 files changed, 130 insertions(+), 63 deletions(-) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 9c9362b2086191..26d414ae25b685 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -231,74 +231,36 @@ static void pblk_end_io_read(struct nvm_rq *rqd) __pblk_end_io_read(pblk, rqd, true); } -static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, - struct bio *orig_bio, unsigned int bio_init_idx, - unsigned long *read_bitmap) +static void pblk_end_partial_read(struct nvm_rq *rqd) { - struct pblk_sec_meta *meta_list = rqd->meta_list; - struct bio *new_bio; + struct pblk *pblk = rqd->private; + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct pblk_pr_ctx *pr_ctx = r_ctx->private; + struct bio *new_bio = rqd->bio; + struct bio *bio = pr_ctx->orig_bio; struct bio_vec src_bv, dst_bv; - void *ppa_ptr = NULL; - void *src_p, *dst_p; - dma_addr_t dma_ppa_list = 0; - __le64 *lba_list_mem, *lba_list_media; - int nr_secs = rqd->nr_ppas; + struct pblk_sec_meta *meta_list = rqd->meta_list; + int bio_init_idx = pr_ctx->bio_init_idx; + unsigned long *read_bitmap = pr_ctx->bitmap; + int nr_secs = pr_ctx->orig_nr_secs; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); - int i, ret, hole; - - /* Re-use allocated memory for intermediate lbas */ - lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); - lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); - - new_bio = bio_alloc(GFP_KERNEL, nr_holes); - - if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) - goto fail_add_pages; - - if (nr_holes != new_bio->bi_vcnt) { - pblk_err(pblk, "malformed bio\n"); - goto fail; - } - - for (i = 0; i < nr_secs; i++) - lba_list_mem[i] = meta_list[i].lba; - - new_bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(new_bio, REQ_OP_READ, 0); - - rqd->bio = new_bio; - rqd->nr_ppas = nr_holes; - rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); - - if (unlikely(nr_holes == 1)) { - ppa_ptr = rqd->ppa_list; - dma_ppa_list = rqd->dma_ppa_list; - rqd->ppa_addr = rqd->ppa_list[0]; - } - - ret = pblk_submit_io_sync(pblk, rqd); - if (ret) { - bio_put(rqd->bio); - pblk_err(pblk, "sync read IO submission 
failed\n"); - goto fail; - } - - if (rqd->error) { - atomic_long_inc(&pblk->read_failed); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, rqd->error); -#endif - } + __le64 *lba_list_mem, *lba_list_media; + void *src_p, *dst_p; + int hole, i; if (unlikely(nr_holes == 1)) { struct ppa_addr ppa; ppa = rqd->ppa_addr; - rqd->ppa_list = ppa_ptr; - rqd->dma_ppa_list = dma_ppa_list; + rqd->ppa_list = pr_ctx->ppa_ptr; + rqd->dma_ppa_list = pr_ctx->dma_ppa_list; rqd->ppa_list[0] = ppa; } + /* Re-use allocated memory for intermediate lbas */ + lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); + lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); + for (i = 0; i < nr_secs; i++) { lba_list_media[i] = meta_list[i].lba; meta_list[i].lba = lba_list_mem[i]; @@ -316,7 +278,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, meta_list[hole].lba = lba_list_media[i]; src_bv = new_bio->bi_io_vec[i++]; - dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole]; + dst_bv = bio->bi_io_vec[bio_init_idx + hole]; src_p = kmap_atomic(src_bv.bv_page); dst_p = kmap_atomic(dst_bv.bv_page); @@ -334,19 +296,107 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, } while (hole < nr_secs); bio_put(new_bio); + kfree(pr_ctx); /* restore original request */ rqd->bio = NULL; rqd->nr_ppas = nr_secs; + bio_endio(bio); __pblk_end_io_read(pblk, rqd, false); - return NVM_IO_DONE; +} -fail: - /* Free allocated pages in new bio */ +static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int bio_init_idx, + unsigned long *read_bitmap, + int nr_holes) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); + struct pblk_pr_ctx *pr_ctx; + struct bio *new_bio, *bio = r_ctx->private; + __le64 *lba_list_mem; + int nr_secs = rqd->nr_ppas; + int i; + + /* Re-use allocated memory for intermediate lbas */ + lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); + + new_bio = bio_alloc(GFP_KERNEL, nr_holes); + + if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) + goto fail_bio_put; + + if (nr_holes != new_bio->bi_vcnt) { + WARN_ONCE(1, "pblk: malformed bio\n"); + goto fail_free_pages; + } + + pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL); + if (!pr_ctx) + goto fail_free_pages; + + for (i = 0; i < nr_secs; i++) + lba_list_mem[i] = meta_list[i].lba; + + new_bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(new_bio, REQ_OP_READ, 0); + + rqd->bio = new_bio; + rqd->nr_ppas = nr_holes; + rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); + + pr_ctx->ppa_ptr = NULL; + pr_ctx->orig_bio = bio; + bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA); + pr_ctx->bio_init_idx = bio_init_idx; + pr_ctx->orig_nr_secs = nr_secs; + r_ctx->private = pr_ctx; + + if (unlikely(nr_holes == 1)) { + pr_ctx->ppa_ptr = rqd->ppa_list; + pr_ctx->dma_ppa_list = rqd->dma_ppa_list; + rqd->ppa_addr = rqd->ppa_list[0]; + } + return 0; + +fail_free_pages: pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt); -fail_add_pages: +fail_bio_put: + bio_put(new_bio); + + return -ENOMEM; +} + +static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, + unsigned int bio_init_idx, + unsigned long *read_bitmap, int nr_secs) +{ + int nr_holes; + int ret; + + nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); + + if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap, + nr_holes)) + return NVM_IO_ERR; + + rqd->end_io = pblk_end_partial_read; + + ret = 
pblk_submit_io(pblk, rqd); + if (ret) { + bio_put(rqd->bio); + pblk_err(pblk, "partial read IO submission failed\n"); + goto err; + } + + return NVM_IO_OK; + +err: pblk_err(pblk, "failed to perform partial read\n"); + + /* Free allocated pages in new bio */ + pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt); __pblk_end_io_read(pblk, rqd, false); return NVM_IO_ERR; } @@ -480,8 +530,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* The read bio request could be partially filled by the write buffer, * but there are some holes that need to be read from the drive. */ - return pblk_partial_read(pblk, rqd, bio, bio_init_idx, read_bitmap); + ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap, + nr_secs); + if (ret) + goto fail_meta_free; + + return NVM_IO_OK; +fail_meta_free: + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); fail_rqd_free: pblk_free_rqd(pblk, rqd, PBLK_READ); return ret; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 5c6904eb855761..4760af7b64994f 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -119,6 +119,16 @@ struct pblk_g_ctx { u64 lba; }; +/* partial read context */ +struct pblk_pr_ctx { + struct bio *orig_bio; + DECLARE_BITMAP(bitmap, NVM_MAX_VLBA); + unsigned int orig_nr_secs; + unsigned int bio_init_idx; + void *ppa_ptr; + dma_addr_t dma_ppa_list; +}; + /* Pad context */ struct pblk_pad_rq { struct pblk *pblk; From f6352103d2e0ad2d2066725eb19bfdfb8763239b Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Jul 2018 10:48:45 +0200 Subject: [PATCH 065/190] lightnvm: pblk: assume that chunks are closed on 1.2 devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can't know if a block is closed or not on 1.2 devices, so assume closed state to make sure that blocks are erased before writing. Fixes: 32ef9412c114 ("lightnvm: pblk: implement get log report chunk") Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index d023ea6116bc3f..537e98f2b24a2d 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -719,10 +719,11 @@ static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line, /* * In 1.2 spec. chunk state is not persisted by the device. Thus - * some of the values are reset each time pblk is instantiated. + * some of the values are reset each time pblk is instantiated, + * so we have to assume that the block is closed. */ if (lun_bb_meta[line->id] == NVM_BLK_T_FREE) - chunk->state = NVM_CHK_ST_FREE; + chunk->state = NVM_CHK_ST_CLOSED; else chunk->state = NVM_CHK_ST_OFFLINE; From d607eefa3b55c7b8cd9902e9209fc1235f88ccd9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Jul 2018 12:12:22 -0400 Subject: [PATCH 066/190] blk-iolatency: don't change the latency window Early versions of these patches had us waiting for seconds at a time during submission, so we had to adjust the timing window we monitored for latency. Now we don't do things like that so this is unnecessary code. 
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index b59e5451680b58..893f888eebb1a1 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -468,16 +468,6 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) } preempt_enable(); - /* - * Our average exceeded our window, scale up our window so we are more - * accurate, but not more than the global timer. - */ - if (stat.mean > iolat->cur_win_nsec) { - iolat->cur_win_nsec <<= 1; - iolat->cur_win_nsec = - max_t(u64, iolat->cur_win_nsec, NSEC_PER_SEC); - } - parent = blkg_to_lat(blkg->parent); if (!parent) return; From 71e9690b59e7349156025a514c29c29ef55b0175 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Jul 2018 12:12:23 -0400 Subject: [PATCH 067/190] blk-iolatency: truncate our current time In our longer tests we noticed that some boxes would degrade to the point of uselessness. This is because we truncate the current time when saving it in our bio, but I was using the raw current time to subtract from. So once the box had been up a certain amount of time it would appear as if our I/Os were taking several years to complete. Fix this by truncating the current time so it matches the issue time. Verified this worked by running with this patch for a week on our test tier. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 893f888eebb1a1..bb59b2929e0d96 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -425,6 +425,12 @@ static void iolatency_record_time(struct iolatency_grp *iolat, u64 start = bio_issue_time(issue); u64 req_time; + /* + * Have to do this so we are truncated to the correct time that our + * issue is truncated to. + */ + now = __bio_issue_time(now); + if (now <= start) return; From 6ce3dd6eec114930cf2035a8bcb1e80477ed79a8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 10 Jul 2018 09:03:31 +0800 Subject: [PATCH 068/190] blk-mq: issue directly if hw queue isn't busy in case of 'none' With the 'none' io scheduler, when the hw queue isn't busy, it isn't necessary to enqueue a request to the sw queue and dequeue it again, because the request can be submitted to the hw queue right away at no extra cost; meanwhile there shouldn't be many requests sitting in the sw queue, so we don't need to worry about the effect on IO merging. There are still some single hw queue SCSI HBAs (HPSA, megaraid_sas, ...) which may be connected to high-performance devices, so 'none' is often required for obtaining good performance. This patch improves IOPS and decreases CPU utilization on megaraid_sas, per Kashyap's test. 
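The fast path itself is small. As a rough sketch of what the blk-mq-sched.c hunk below boils down to (illustration only: insert_or_issue() is a made-up name, the real code receives ctx from its caller, and the requeue handling inside the direct-issue helper is omitted here):

/* With the 'none' scheduler and an idle hw queue, skip the sw queue. */
static void insert_or_issue(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
			    struct list_head *list, bool run_queue_async)
{
	struct elevator_queue *e = hctx->queue->elevator;

	if (!e && !hctx->dispatch_busy && !run_queue_async) {
		/* Hand the requests straight to the driver via ->queue_rq(). */
		blk_mq_try_issue_list_directly(hctx, list);
		if (list_empty(list))
			return;
	}

	/* Whatever the driver did not take goes the usual sw-queue route. */
	blk_mq_insert_requests(hctx, ctx, list);
	blk_mq_run_hw_queue(hctx, run_queue_async);
}
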
Cc: Kashyap Desai Cc: Laurence Oberman Cc: Omar Sandoval Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Hannes Reinecke Reported-by: Kashyap Desai Tested-by: Kashyap Desai Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 13 ++++++++++++- block/blk-mq.c | 23 ++++++++++++++++++++++- block/blk-mq.h | 2 ++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index fdc129e64cc4e8..cf9c66c6d35a86 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -405,8 +405,19 @@ void blk_mq_sched_insert_requests(struct request_queue *q, if (e && e->type->ops.mq.insert_requests) e->type->ops.mq.insert_requests(hctx, list, false); - else + else { + /* + * try to issue requests directly if the hw queue isn't + * busy in case of 'none' scheduler, and this way may save + * us one extra enqueue & dequeue to sw queue. + */ + if (!hctx->dispatch_busy && !e && !run_queue_async) { + blk_mq_try_issue_list_directly(hctx, list); + if (list_empty(list)) + return; + } blk_mq_insert_requests(hctx, ctx, list); + } blk_mq_run_hw_queue(hctx, run_queue_async); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 73a43b81b17dcc..21f3eda9843137 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1691,13 +1691,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: + blk_mq_update_dispatch_busy(hctx, false); *cookie = new_cookie; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: + blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: + blk_mq_update_dispatch_busy(hctx, false); *cookie = BLK_QC_T_NONE; break; } @@ -1780,6 +1783,23 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq) return ret; } +void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list) +{ + while (!list_empty(list)) { + blk_status_t ret; + struct request *rq = list_first_entry(list, struct request, + queuelist); + + list_del_init(&rq->queuelist); + ret = blk_mq_request_issue_directly(rq); + if (ret != BLK_STS_OK) { + list_add(&rq->queuelist, list); + break; + } + } +} + static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); @@ -1880,7 +1900,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } - } else if (q->nr_hw_queues > 1 && is_sync) { + } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && + !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_try_issue_directly(data.hctx, rq, &cookie); diff --git a/block/blk-mq.h b/block/blk-mq.h index bc2b24735ed413..9497b47e2526c6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -64,6 +64,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, /* Used by blk_insert_cloned_request() to issue request directly */ blk_status_t blk_mq_request_issue_directly(struct request *rq); +void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list); /* * CPU -> queue mappings From ada94973f15f175283fd3b8f9bfcf9de6f2cc818 Mon Sep 17 00:00:00 2001 From: RAGHU Halharvi Date: Tue, 17 Jul 2018 22:32:12 +0530 Subject: [PATCH 069/190] pktcdvd: remove assignment in if condition * Remove checkpatch errors caused due to assignment operation in if condition Signed-off-by: RAGHU Halharvi Signed-off-by: Jens Axboe 
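The conversion is mechanical and behavior-preserving; every hunk below follows the same before/after pattern. The first one, in pkt_set_speed(), is representative:

	/* before: the assignment is buried inside the if condition */
	if ((ret = pkt_generic_packet(pd, &cgc)))
		pkt_dump_sense(pd, &cgc);

	/* after: assign first, then test the result on its own line */
	ret = pkt_generic_packet(pd, &cgc);
	if (ret)
		pkt_dump_sense(pd, &cgc);
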
--- drivers/block/pktcdvd.c | 69 +++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index b3f83cd96f335b..a4b4d524c3af7d 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -798,7 +798,8 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, cgc.cmd[4] = (write_speed >> 8) & 0xff; cgc.cmd[5] = write_speed & 0xff; - if ((ret = pkt_generic_packet(pd, &cgc))) + ret = pkt_generic_packet(pd, &cgc); + if (ret) pkt_dump_sense(pd, &cgc); return ret; @@ -1562,7 +1563,8 @@ static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) cgc.cmd[8] = cgc.buflen = 2; cgc.quiet = 1; - if ((ret = pkt_generic_packet(pd, &cgc))) + ret = pkt_generic_packet(pd, &cgc); + if (ret) return ret; /* not all drives have the same disc_info length, so requeue @@ -1591,7 +1593,8 @@ static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, cgc.cmd[8] = 8; cgc.quiet = 1; - if ((ret = pkt_generic_packet(pd, &cgc))) + ret = pkt_generic_packet(pd, &cgc); + if (ret) return ret; cgc.buflen = be16_to_cpu(ti->track_information_length) + @@ -1612,17 +1615,20 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, __u32 last_track; int ret = -1; - if ((ret = pkt_get_disc_info(pd, &di))) + ret = pkt_get_disc_info(pd, &di); + if (ret) return ret; last_track = (di.last_track_msb << 8) | di.last_track_lsb; - if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) + ret = pkt_get_track_info(pd, last_track, 1, &ti); + if (ret) return ret; /* if this track is blank, try the previous. */ if (ti.blank) { last_track--; - if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) + ret = pkt_get_track_info(pd, last_track, 1, &ti); + if (ret) return ret; } @@ -1657,7 +1663,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) memset(buffer, 0, sizeof(buffer)); init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); cgc.sense = &sense; - if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { + ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); + if (ret) { pkt_dump_sense(pd, &cgc); return ret; } @@ -1672,7 +1679,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) */ init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); cgc.sense = &sense; - if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { + ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); + if (ret) { pkt_dump_sense(pd, &cgc); return ret; } @@ -1714,7 +1722,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) wp->packet_size = cpu_to_be32(pd->settings.size >> 2); cgc.buflen = cgc.cmd[8] = size; - if ((ret = pkt_mode_select(pd, &cgc))) { + ret = pkt_mode_select(pd, &cgc); + if (ret) { pkt_dump_sense(pd, &cgc); return ret; } @@ -1819,7 +1828,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) memset(&di, 0, sizeof(disc_information)); memset(&ti, 0, sizeof(track_information)); - if ((ret = pkt_get_disc_info(pd, &di))) { + ret = pkt_get_disc_info(pd, &di); + if (ret) { pkt_err(pd, "failed get_disc\n"); return ret; } @@ -1830,7 +1840,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) pd->type = di.erasable ? 
PACKET_CDRW : PACKET_CDR; track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ - if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { + ret = pkt_get_track_info(pd, track, 1, &ti); + if (ret) { pkt_err(pd, "failed get_track\n"); return ret; } @@ -1918,7 +1929,8 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, */ cgc.quiet = 1; - if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0))) + ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0); + if (ret) return ret; buf[pd->mode_offset + 10] |= (!!set << 2); @@ -2093,7 +2105,8 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) cgc.timeout = 60*HZ; cgc.cmd[0] = GPCMD_SEND_OPC; cgc.cmd[1] = 1; - if ((ret = pkt_generic_packet(pd, &cgc))) + ret = pkt_generic_packet(pd, &cgc); + if (ret) pkt_dump_sense(pd, &cgc); return ret; } @@ -2103,19 +2116,22 @@ static int pkt_open_write(struct pktcdvd_device *pd) int ret; unsigned int write_speed, media_write_speed, read_speed; - if ((ret = pkt_probe_settings(pd))) { + ret = pkt_probe_settings(pd); + if (ret) { pkt_dbg(2, pd, "failed probe\n"); return ret; } - if ((ret = pkt_set_write_settings(pd))) { + ret = pkt_set_write_settings(pd); + if (ret) { pkt_dbg(1, pd, "failed saving write settings\n"); return -EIO; } pkt_write_caching(pd, USE_WCACHING); - if ((ret = pkt_get_max_speed(pd, &write_speed))) + ret = pkt_get_max_speed(pd, &write_speed); + if (ret) write_speed = 16 * 177; switch (pd->mmc3_profile) { case 0x13: /* DVD-RW */ @@ -2124,7 +2140,8 @@ static int pkt_open_write(struct pktcdvd_device *pd) pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); break; default: - if ((ret = pkt_media_speed(pd, &media_write_speed))) + ret = pkt_media_speed(pd, &media_write_speed); + if (ret) media_write_speed = 16; write_speed = min(write_speed, media_write_speed * 177); pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); @@ -2132,14 +2149,16 @@ static int pkt_open_write(struct pktcdvd_device *pd) } read_speed = write_speed; - if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { + ret = pkt_set_speed(pd, write_speed, read_speed); + if (ret) { pkt_dbg(1, pd, "couldn't set write speed\n"); return -EIO; } pd->write_speed = write_speed; pd->read_speed = read_speed; - if ((ret = pkt_perform_opc(pd))) { + ret = pkt_perform_opc(pd); + if (ret) { pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); } @@ -2161,10 +2180,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * so bdget() can't fail. 
*/ bdget(pd->bdev->bd_dev); - if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd))) + ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd); + if (ret) goto out; - if ((ret = pkt_get_last_written(pd, &lba))) { + ret = pkt_get_last_written(pd, &lba); + if (ret) { pkt_err(pd, "pkt_get_last_written failed\n"); goto out_putdev; } @@ -2175,7 +2196,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) q = bdev_get_queue(pd->bdev); if (write) { - if ((ret = pkt_open_write(pd))) + ret = pkt_open_write(pd); + if (ret) goto out_putdev; /* * Some CDRW drives can not handle writes larger than one packet, @@ -2190,7 +2212,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) clear_bit(PACKET_WRITABLE, &pd->flags); } - if ((ret = pkt_set_segment_merging(pd, q))) + ret = pkt_set_segment_merging(pd, q); + if (ret) goto out_putdev; if (write) { From 3f289dcb4b265416a57ca79cf4a324060bb09060 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Jul 2018 04:47:36 -0700 Subject: [PATCH 070/190] block: make bdev_ops->rw_page() take a REQ_OP instead of bool c11f0c0b5bb9 ("block/mm: make bdev_ops->rw_page() take a bool for read/write") replaced @op with boolean @is_write, which limited the amount of information going into ->rw_page() and more importantly page_endio(), which removed the need to expose block internals to mm. Unfortunately, we want to track discards separately and @is_write isn't enough information. This patch updates bdev_ops->rw_page() to take REQ_OP instead but leaves page_endio() to take bool @is_write. This allows the block part of operations to have enough information while not leaking it to mm. Signed-off-by: Tejun Heo Cc: Mike Christie Cc: Minchan Kim Cc: Dan Williams Signed-off-by: Jens Axboe --- drivers/block/brd.c | 14 +++++++------- drivers/block/zram/zram_drv.c | 16 ++++++++-------- drivers/nvdimm/btt.c | 12 ++++++------ drivers/nvdimm/pmem.c | 13 ++++++------- fs/block_dev.c | 6 ++++-- fs/mpage.c | 4 ++-- include/linux/blkdev.h | 2 +- 7 files changed, 34 insertions(+), 33 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index bb976598ee4340..df8103dd40ac2d 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -254,20 +254,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd, * Process a single bvec of a bio. 
*/ static int brd_do_bvec(struct brd_device *brd, struct page *page, - unsigned int len, unsigned int off, bool is_write, + unsigned int len, unsigned int off, unsigned int op, sector_t sector) { void *mem; int err = 0; - if (is_write) { + if (op_is_write(op)) { err = copy_to_brd_setup(brd, sector, len); if (err) goto out; } mem = kmap_atomic(page); - if (!is_write) { + if (!op_is_write(op)) { copy_from_brd(mem + off, brd, sector, len); flush_dcache_page(page); } else { @@ -296,7 +296,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) int err; err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, - op_is_write(bio_op(bio)), sector); + bio_op(bio), sector); if (err) goto io_error; sector += len >> SECTOR_SHIFT; @@ -310,15 +310,15 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) } static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, bool is_write) + struct page *page, unsigned int op) { struct brd_device *brd = bdev->bd_disk->private_data; int err; if (PageTransHuge(page)) return -ENOTSUPP; - err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector); - page_endio(page, is_write, err); + err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector); + page_endio(page, op_is_write(op), err); return err; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7436b2d27fa385..78c29044684a67 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1274,17 +1274,17 @@ static void zram_bio_discard(struct zram *zram, u32 index, * Returns 1 if IO request was successfully submitted. */ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, bool is_write, struct bio *bio) + int offset, unsigned int op, struct bio *bio) { unsigned long start_time = jiffies; - int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ; + int rw_acct = op_is_write(op) ? 
REQ_OP_WRITE : REQ_OP_READ; struct request_queue *q = zram->disk->queue; int ret; generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT, &zram->disk->part0); - if (!is_write) { + if (!op_is_write(op)) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset, bio); flush_dcache_page(bvec->bv_page); @@ -1300,7 +1300,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, zram_slot_unlock(zram, index); if (unlikely(ret < 0)) { - if (!is_write) + if (!op_is_write(op)) atomic64_inc(&zram->stats.failed_reads); else atomic64_inc(&zram->stats.failed_writes); @@ -1338,7 +1338,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, unwritten); if (zram_bvec_rw(zram, &bv, index, offset, - op_is_write(bio_op(bio)), bio) < 0) + bio_op(bio), bio) < 0) goto out; bv.bv_offset += bv.bv_len; @@ -1390,7 +1390,7 @@ static void zram_slot_free_notify(struct block_device *bdev, } static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, bool is_write) + struct page *page, unsigned int op) { int offset, ret; u32 index; @@ -1414,7 +1414,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; - ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL); + ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); out: /* * If I/O fails, just return error(ie, non-zero) without @@ -1429,7 +1429,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, switch (ret) { case 0: - page_endio(page, is_write, 0); + page_endio(page, op_is_write(op), 0); break; case 1: ret = 0; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 85de8053aa3445..0360c015f6580b 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1423,11 +1423,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, struct page *page, unsigned int len, unsigned int off, - bool is_write, sector_t sector) + unsigned int op, sector_t sector) { int ret; - if (!is_write) { + if (!op_is_write(op)) { ret = btt_read_pg(btt, bip, page, off, sector, len); flush_dcache_page(page); } else { @@ -1464,7 +1464,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) } err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, - op_is_write(bio_op(bio)), iter.bi_sector); + bio_op(bio), iter.bi_sector); if (err) { dev_err(&btt->nd_btt->dev, "io error in %s sector %lld, len %d,\n", @@ -1483,16 +1483,16 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) } static int btt_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, bool is_write) + struct page *page, unsigned int op) { struct btt *btt = bdev->bd_disk->private_data; int rc; unsigned int len; len = hpage_nr_pages(page) * PAGE_SIZE; - rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector); + rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector); if (rc == 0) - page_endio(page, is_write, 0); + page_endio(page, op_is_write(op), 0); return rc; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 8b1fd7f1a224ee..dd17acd8fe6810 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -120,7 +120,7 @@ static blk_status_t read_pmem(struct page *page, unsigned int off, } static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, - unsigned int len, unsigned int 
off, bool is_write, + unsigned int len, unsigned int off, unsigned int op, sector_t sector) { blk_status_t rc = BLK_STS_OK; @@ -131,7 +131,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) bad_pmem = true; - if (!is_write) { + if (!op_is_write(op)) { if (unlikely(bad_pmem)) rc = BLK_STS_IOERR; else { @@ -180,8 +180,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) do_acct = nd_iostat_start(bio, &start); bio_for_each_segment(bvec, bio, iter) { rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, - bvec.bv_offset, op_is_write(bio_op(bio)), - iter.bi_sector); + bvec.bv_offset, bio_op(bio), iter.bi_sector); if (rc) { bio->bi_status = rc; break; @@ -198,13 +197,13 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) } static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, bool is_write) + struct page *page, unsigned int op) { struct pmem_device *pmem = bdev->bd_queue->queuedata; blk_status_t rc; rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, - 0, is_write, sector); + 0, op, sector); /* * The ->rw_page interface is subtle and tricky. The core @@ -213,7 +212,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, * caused by double completion. */ if (rc == 0) - page_endio(page, is_write, 0); + page_endio(page, op_is_write(op), 0); return blk_status_to_errno(rc); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 0dd87aaeb39a7d..496fb51a1e1a10 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -665,7 +665,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_READ); blk_queue_exit(bdev->bd_queue); return result; } @@ -703,7 +704,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, return result; set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_WRITE); if (result) { end_page_writeback(page); } else { diff --git a/fs/mpage.c b/fs/mpage.c index b7e7f570733ad0..b73638db9866cd 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -51,8 +51,8 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, op_is_write(bio_op(bio)), - blk_status_to_errno(bio->bi_status)); + page_endio(page, bio_op(bio), + blk_status_to_errno(bio->bi_status)); } bio_put(bio); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1939ed95f9361a..331a6cb8805f00 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1943,7 +1943,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req, struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, bool); + int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, From 59767fbd49d794b4499d30b314df6c0d4aca584b Mon Sep 17 00:00:00 2001 From: Michael 
Callahan Date: Wed, 18 Jul 2018 04:47:37 -0700 Subject: [PATCH 071/190] block: Add part_stat_read_accum to read across field entries. Add a part_stat_read_accum macro to genhd.h to read and sum across field entries, for example to sum up the number of read and write sectors completed. In addition to being a reasonable cleanup by itself, this will make it easier to add new stat fields in the future. tj: Refreshed on top of v4.17. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 3 +-- drivers/block/drbd/drbd_worker.c | 4 +--- drivers/md/md.c | 3 +-- include/linux/genhd.h | 4 ++++ 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a36a30795c4360..75f6b47169e65a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2674,8 +2674,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]) - + curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5e793dd7adfbd0..b8f77e83d45629 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1690,9 +1690,7 @@ void drbd_rs_controller_reset(struct drbd_device *device) atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; - device->rs_last_events = - (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]); + device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. diff --git a/drivers/md/md.c b/drivers/md/md.c index 994aed2f9dfff4..dabe36723d6019 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8046,8 +8046,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]) - + curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6cb8a57896682a..19f36fa1099509 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -353,6 +353,10 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ +#define part_stat_read_accum(part, field) \ + (part_stat_read(part, field[0]) + \ + part_stat_read(part, field[1])) + #define part_stat_add(cpu, part, field, addnd) do { \ __part_stat_add((cpu), (part), field, addnd); \ if ((part)->partno) \ From dbae2c551377b6533a00c11fc7ede370100ab404 Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:38 -0700 Subject: [PATCH 072/190] block: Define and use STAT_READ and STAT_WRITE Add defines for STAT_READ and STAT_WRITE for indexing the partition stat entries. 
This clarifies some fs/ code which has hardcoded 1 for STAT_WRITE and will make it easier to extend the stats with additional fields. tj: Refreshed on top of v4.17. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Cc: "Theodore Ts'o" Cc: Jaegeuk Kim Signed-off-by: Jens Axboe --- block/genhd.c | 16 ++++++++-------- block/partition-generic.c | 16 ++++++++-------- fs/ext4/super.c | 5 +++-- fs/ext4/sysfs.c | 6 ++++-- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 3 ++- include/linux/blk_types.h | 7 +++++++ include/linux/genhd.h | 13 +++++++------ 8 files changed, 40 insertions(+), 28 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index f1543a45e73bfa..0711a800d0d4ca 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1337,14 +1337,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%u %lu %lu %lu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[READ]), - part_stat_read(hd, merges[READ]), - part_stat_read(hd, sectors[READ]), - jiffies_to_msecs(part_stat_read(hd, ticks[READ])), - part_stat_read(hd, ios[WRITE]), - part_stat_read(hd, merges[WRITE]), - part_stat_read(hd, sectors[WRITE]), - jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), + part_stat_read(hd, ios[STAT_READ]), + part_stat_read(hd, merges[STAT_READ]), + part_stat_read(hd, sectors[STAT_READ]), + jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])), + part_stat_read(hd, ios[STAT_WRITE]), + part_stat_read(hd, merges[STAT_WRITE]), + part_stat_read(hd, sectors[STAT_WRITE]), + jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])), inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) diff --git a/block/partition-generic.c b/block/partition-generic.c index 3dcfd4ec0e1115..0ddb06722162ca 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -132,14 +132,14 @@ ssize_t part_stat_show(struct device *dev, "%8lu %8lu %8llu %8u " "%8u %8u %8u" "\n", - part_stat_read(p, ios[READ]), - part_stat_read(p, merges[READ]), - (unsigned long long)part_stat_read(p, sectors[READ]), - jiffies_to_msecs(part_stat_read(p, ticks[READ])), - part_stat_read(p, ios[WRITE]), - part_stat_read(p, merges[WRITE]), - (unsigned long long)part_stat_read(p, sectors[WRITE]), - jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), + part_stat_read(p, ios[STAT_READ]), + part_stat_read(p, merges[STAT_READ]), + (unsigned long long)part_stat_read(p, sectors[STAT_READ]), + jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])), + part_stat_read(p, ios[STAT_WRITE]), + part_stat_read(p, merges[STAT_WRITE]), + (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), + jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])), inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ba2396a7bd04b0..4b8aef98955238 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3514,7 +3514,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) sbi->s_sectors_written_start = - part_stat_read(sb->s_bdev->bd_part, sectors[1]); + part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]); /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); @@ -4824,7 +4824,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (sb->s_bdev->bd_part) es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + - 
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + ((part_stat_read(sb->s_bdev->bd_part, + sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1)); else es->s_kbytes_written = diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index f34da0bb8f1744..2be9ad79001728 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -56,7 +56,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) if (!sb->s_bdev->bd_part) return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + (part_stat_read(sb->s_bdev->bd_part, + sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } @@ -68,7 +69,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + ((part_stat_read(sb->s_bdev->bd_part, + sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d8b1de831439a..6799c3fc44e30d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1304,7 +1304,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info. */ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3995e926ba3a36..17bcff789c082a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2882,7 +2882,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* For write statistics */ if (sb->s_bdev->bd_part) sbi->sectors_written_start = - (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]); + (u64)part_stat_read(sb->s_bdev->bd_part, + sectors[STAT_WRITE]); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e13449a379a172..d2b44de56bc1f1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -357,6 +357,13 @@ enum req_flag_bits { #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) +enum stat_group { + STAT_READ, + STAT_WRITE, + + NR_STAT_GROUPS +}; + #define bio_op(bio) \ ((bio)->bi_opf & REQ_OP_MASK) #define req_op(req) \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 19f36fa1099509..a75445446974a7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK @@ -82,10 +83,10 @@ struct partition { } __attribute__((packed)); struct disk_stats { - unsigned long sectors[2]; /* READs and WRITEs */ - unsigned long ios[2]; - unsigned long merges[2]; - unsigned long ticks[2]; + unsigned long sectors[NR_STAT_GROUPS]; + unsigned long ios[NR_STAT_GROUPS]; + unsigned long merges[NR_STAT_GROUPS]; + unsigned long ticks[NR_STAT_GROUPS]; unsigned long io_ticks; unsigned long time_in_queue; }; @@ -354,8 +355,8 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ #define part_stat_read_accum(part, field) \ - (part_stat_read(part, field[0]) + \ - part_stat_read(part, field[1])) + (part_stat_read(part, field[STAT_READ]) + \ + part_stat_read(part, 
field[STAT_WRITE])) #define part_stat_add(cpu, part, field, addnd) do { \ __part_stat_add((cpu), (part), field, addnd); \ From ddcf35d397976421a4ec1d0d00fbcc027a8cb034 Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:39 -0700 Subject: [PATCH 073/190] block: Add and use op_stat_group() for indexing disk_stat fields. Add and use a new op_stat_group() function for indexing partition stat fields rather than indexing them by rq_data_dir() or bio_data_dir(). This function works similarly to op_is_sync() in that it takes the request::cmd_flags or bio::bi_opf flags and determines which stats should et updated. In addition, the second parameter to generic_start_io_acct() and generic_end_io_acct() is now a REQ_OP rather than simply a read or write bit and it uses op_stat_group() on the parameter to determine the stat group. Note that the partition in_flight counts are not part of the per-cpu statistics and as such are not indexed via this function. It's now indexed by op_is_write(). tj: Refreshed on top of v4.17. Updated to pass around REQ_OP. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Cc: Minchan Kim Cc: Dan Williams Cc: Joshua Morris Cc: Philipp Reisner Cc: Matias Bjorling Cc: Kent Overstreet Cc: Alasdair Kergon Signed-off-by: Jens Axboe --- block/bio.c | 16 +++++++++------- block/blk-core.c | 12 ++++++------ drivers/block/drbd/drbd_req.c | 4 ++-- drivers/block/rsxx/dev.c | 6 +++--- drivers/block/zram/zram_drv.c | 5 ++--- drivers/lightnvm/pblk-cache.c | 5 +++-- drivers/lightnvm/pblk-read.c | 5 +++-- drivers/md/bcache/request.c | 13 +++++-------- drivers/md/dm.c | 6 ++++-- drivers/md/md.c | 5 +++-- drivers/nvdimm/nd.h | 7 +++---- include/linux/bio.h | 4 ++-- include/linux/blk_types.h | 5 +++++ 13 files changed, 50 insertions(+), 43 deletions(-) diff --git a/block/bio.c b/block/bio.c index f3536bfc82989f..8ecc95615941f4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1728,29 +1728,31 @@ void bio_check_pages_dirty(struct bio *bio) } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); -void generic_start_io_acct(struct request_queue *q, int rw, +void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part) { + const int sgrp = op_stat_group(op); int cpu = part_stat_lock(); part_round_stats(q, cpu, part); - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, sectors[rw], sectors); - part_inc_in_flight(q, part, rw); + part_stat_inc(cpu, part, ios[sgrp]); + part_stat_add(cpu, part, sectors[sgrp], sectors); + part_inc_in_flight(q, part, op_is_write(op)); part_stat_unlock(); } EXPORT_SYMBOL(generic_start_io_acct); -void generic_end_io_acct(struct request_queue *q, int rw, +void generic_end_io_acct(struct request_queue *q, int req_op, struct hd_struct *part, unsigned long start_time) { unsigned long duration = jiffies - start_time; + const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - part_stat_add(cpu, part, ticks[rw], duration); + part_stat_add(cpu, part, ticks[sgrp], duration); part_round_stats(q, cpu, part); - part_dec_in_flight(q, part, rw); + part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); } diff --git a/block/blk-core.c b/block/blk-core.c index c4b57d8806fe15..03a4ea93a5f365 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2702,13 +2702,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes); void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { - const int rw = rq_data_dir(req); + const int sgrp = op_stat_group(req_op(req)); struct 
hd_struct *part; int cpu; cpu = part_stat_lock(); part = req->part; - part_stat_add(cpu, part, sectors[rw], bytes >> 9); + part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -2722,7 +2722,7 @@ void blk_account_io_done(struct request *req, u64 now) */ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { unsigned long duration; - const int rw = rq_data_dir(req); + const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; @@ -2730,10 +2730,10 @@ void blk_account_io_done(struct request *req, u64 now) cpu = part_stat_lock(); part = req->part; - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); + part_stat_inc(cpu, part, ios[sgrp]); + part_stat_add(cpu, part, ticks[sgrp], duration); part_round_stats(req->q, cpu, part); - part_dec_in_flight(req->q, part, rw); + part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); part_stat_unlock(); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d146fedc38bb26..19cac36e97371f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -38,7 +38,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request { struct request_queue *q = device->rq_queue; - generic_start_io_acct(q, bio_data_dir(req->master_bio), + generic_start_io_acct(q, bio_op(req->master_bio), req->i.size >> 9, &device->vdisk->part0); } @@ -47,7 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r { struct request_queue *q = device->rq_queue; - generic_end_io_acct(q, bio_data_dir(req->master_bio), + generic_end_io_acct(q, bio_op(req->master_bio), &device->vdisk->part0, req->start_jif); } diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index dddb3f2490b675..1a92f9e6593746 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = { static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) { - generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio), + generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio), &card->gendisk->part0); } @@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card, struct bio *bio, unsigned long start_time) { - generic_end_io_acct(card->queue, bio_data_dir(bio), - &card->gendisk->part0, start_time); + generic_end_io_acct(card->queue, bio_op(bio), + &card->gendisk->part0, start_time); } static void bio_dma_done_cb(struct rsxx_cardinfo *card, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 78c29044684a67..2907a8156aafb2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1277,11 +1277,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, unsigned int op, struct bio *bio) { unsigned long start_time = jiffies; - int rw_acct = op_is_write(op) ? 
REQ_OP_WRITE : REQ_OP_READ; struct request_queue *q = zram->disk->queue; int ret; - generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT, + generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT, &zram->disk->part0); if (!op_is_write(op)) { @@ -1293,7 +1292,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset, bio); } - generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); + generic_end_io_acct(q, op, &zram->disk->part0, start_time); zram_slot_lock(zram, index); zram_accessed(zram, index); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 77d811962818ad..f565a56b898ab0 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -27,7 +27,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) int nr_entries = pblk_get_secs(bio); int i, ret; - generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0); + generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio), + &pblk->disk->part0); /* Update the write buffer head (mem) with the entries that we can * write. The write in itself cannot fail, so there is no need to @@ -75,7 +76,7 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags) pblk_rl_inserted(&pblk->rl, nr_entries); out: - generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time); + generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time); pblk_write_should_kick(pblk); return ret; } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 26d414ae25b685..5a46d7f9302fa7 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -199,7 +199,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, struct bio *int_bio = rqd->bio; unsigned long start_time = r_ctx->start_time; - generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time); + generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time); if (rqd->error) pblk_log_read_err(pblk, rqd); @@ -461,7 +461,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) return NVM_IO_ERR; } - generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0); + generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio), + &pblk->disk->part0); bitmap_zero(read_bitmap, nr_secs); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index ae67f5fa80475d..97707b0c54ce05 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - generic_end_io_acct(s->d->disk->queue, - bio_data_dir(s->orig_bio), + generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -1062,8 +1061,7 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io = ddip->bi_end_io; bio->bi_private = ddip->bi_private; - generic_end_io_acct(ddip->d->disk->queue, - bio_data_dir(bio), + generic_end_io_acct(ddip->d->disk->queue, bio_op(bio), &ddip->d->disk->part0, ddip->start_time); if (bio->bi_status) { @@ -1120,7 +1118,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, } atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector 
+= dc->sb.data_offset; @@ -1229,7 +1227,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, struct search *s; struct closure *cl; struct bcache_device *d = bio->bi_disk->private_data; - int rw = bio_data_dir(bio); if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; @@ -1237,7 +1234,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, return BLK_QC_T_NONE; } - generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); cl = &s->cl; @@ -1254,7 +1251,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, flash_dev_nodata, bcache_wq); return BLK_QC_T_NONE; - } else if (rw) { + } else if (bio_data_dir(bio)) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b0dd7027848b7d..20f7e4ef534227 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io) io->start_time = jiffies; - generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0); + generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), + &dm_disk(md)->part0); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); @@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io) int pending; int rw = bio_data_dir(bio); - generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); + generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, + io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), diff --git a/drivers/md/md.c b/drivers/md/md.c index dabe36723d6019..f6e58dbca0d44b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -335,6 +335,7 @@ EXPORT_SYMBOL(md_handle_request); static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) { const int rw = bio_data_dir(bio); + const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = q->queuedata; unsigned int sectors; int cpu; @@ -363,8 +364,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) md_handle_request(mddev, bio); cpu = part_stat_lock(); - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); part_stat_unlock(); return BLK_QC_T_NONE; diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 32e0364b48b9d7..6ee7fd7e4bbdc6 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -396,16 +396,15 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) return false; *start = jiffies; - generic_start_io_acct(disk->queue, bio_data_dir(bio), - bio_sectors(bio), &disk->part0); + generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio), + &disk->part0); return true; } static inline void nd_iostat_end(struct bio *bio, unsigned long start) { struct gendisk *disk = bio->bi_disk; - generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0, - start); + generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start); } static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) diff --git a/include/linux/bio.h b/include/linux/bio.h index a00dfff51aa589..ab221c517f4ecd 100644 
--- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -496,9 +496,9 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(struct request_queue *q, int rw, +void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part); -void generic_end_io_acct(struct request_queue *q, int rw, +void generic_end_io_acct(struct request_queue *q, int op, struct hd_struct *part, unsigned long start_time); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d2b44de56bc1f1..2960a96d833c9e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -401,6 +401,11 @@ static inline bool op_is_sync(unsigned int op) (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } +static inline int op_stat_group(unsigned int op) +{ + return op_is_write(op); +} + typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U #define BLK_QC_T_SHIFT 16 From bdca3c87fb7ad1cc61d231d37eb0d8f90d001e0c Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:40 -0700 Subject: [PATCH 074/190] block: Track DISCARD statistics and output them in stat and diskstat Add tracking of REQ_OP_DISCARD ios to the partition statistics and append them to the various stat files in /sys as well as /proc/diskstats. These are tracked with the same four stats as reads and writes: Number of discard ios completed. Number of discard ios merged Number of discard sectors completed Milliseconds spent on discard requests This is done via adding a new STAT_DISCARD define to genhd.h and then using it to index that stat field for discard requests. tj: Refreshed on top of v4.17 and other previous updates. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Cc: Andy Newell Signed-off-by: Jens Axboe --- Documentation/ABI/testing/procfs-diskstats | 10 ++++++++ Documentation/block/stat.txt | 28 ++++++++++++---------- Documentation/iostats.txt | 15 ++++++++++++ block/genhd.c | 13 +++++++--- block/partition-generic.c | 9 +++++-- include/linux/blk_types.h | 8 +++++++ include/linux/genhd.h | 3 ++- 7 files changed, 68 insertions(+), 18 deletions(-) diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats index f91a973a37feac..abac31d216deed 100644 --- a/Documentation/ABI/testing/procfs-diskstats +++ b/Documentation/ABI/testing/procfs-diskstats @@ -5,6 +5,7 @@ Description: The /proc/diskstats file displays the I/O statistics of block devices. 
Each line contains the following 14 fields: + 1 - major number 2 - minor mumber 3 - device name @@ -19,4 +20,13 @@ Description: 12 - I/Os currently in progress 13 - time spent doing I/Os (ms) 14 - weighted time spent doing I/Os (ms) + + Kernel 4.18+ appends four more fields for discard + tracking putting the total at 18: + + 15 - discards completed successfully + 16 - discards merged + 17 - sectors discarded + 18 - time spent discarding + For more details refer to Documentation/iostats.txt diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt index 0dbc946de2eab8..0aace9cc536c9d 100644 --- a/Documentation/block/stat.txt +++ b/Documentation/block/stat.txt @@ -31,28 +31,32 @@ write ticks milliseconds total wait time for write requests in_flight requests number of I/Os currently in flight io_ticks milliseconds total time this block device has been active time_in_queue milliseconds total wait time for all requests +discard I/Os requests number of discard I/Os processed +discard merges requests number of discard I/Os merged with in-queue I/O +discard sectors sectors number of sectors discarded +discard ticks milliseconds total wait time for discard requests -read I/Os, write I/Os -===================== +read I/Os, write I/Os, discard I/0s +=================================== These values increment when an I/O request completes. -read merges, write merges -========================= +read merges, write merges, discard merges +========================================= These values increment when an I/O request is merged with an already-queued I/O request. -read sectors, write sectors -=========================== +read sectors, write sectors, discard_sectors +============================================ -These values count the number of sectors read from or written to this -block device. The "sectors" in question are the standard UNIX 512-byte -sectors, not any device- or filesystem-specific block size. The -counters are incremented when the I/O completes. +These values count the number of sectors read from, written to, or +discarded from this block device. The "sectors" in question are the +standard UNIX 512-byte sectors, not any device- or filesystem-specific +block size. The counters are incremented when the I/O completes. -read ticks, write ticks -======================= +read ticks, write ticks, discard ticks +====================================== These values count the number of milliseconds that I/O requests have waited on this block device. If there are multiple I/O requests waiting, diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt index 04d394a2e06cef..49df45f90e8adc 100644 --- a/Documentation/iostats.txt +++ b/Documentation/iostats.txt @@ -31,6 +31,9 @@ Here are examples of these different formats:: 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 3 1 hda1 35486 38030 38030 38030 + 4.18+ diskstats: + 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 + On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. @@ -101,6 +104,18 @@ Field 11 -- weighted # of milliseconds spent doing I/Os last update of this field. This can provide an easy measure of both I/O completion time and the backlog that may be accumulating. +Field 12 -- # of discards completed + This is the total number of discards completed successfully. 
+ +Field 13 -- # of discards merged + See the description of field 2 + +Field 14 -- # of sectors discarded + This is the total number of sectors discarded successfully. + +Field 15 -- # of milliseconds spent discarding + This is the total number of milliseconds spent by all discards (as + measured from __make_request() to end_that_request_last()). To avoid introducing performance bottlenecks, no locks are held while modifying these counters. This implies that minor inaccuracies may be diff --git a/block/genhd.c b/block/genhd.c index 0711a800d0d4ca..8cc719a37b32f0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1333,8 +1333,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_round_stats(gp->queue, cpu, hd); part_stat_unlock(); part_in_flight(gp->queue, hd, inflight); - seq_printf(seqf, "%4d %7d %s %lu %lu %lu " - "%u %lu %lu %lu %u %u %u %u\n", + seq_printf(seqf, "%4d %7d %s " + "%lu %lu %lu %u " + "%lu %lu %lu %u " + "%u %u %u " + "%lu %lu %lu %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), part_stat_read(hd, ios[STAT_READ]), @@ -1347,7 +1350,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])), inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), - jiffies_to_msecs(part_stat_read(hd, time_in_queue)) + jiffies_to_msecs(part_stat_read(hd, time_in_queue)), + part_stat_read(hd, ios[STAT_DISCARD]), + part_stat_read(hd, merges[STAT_DISCARD]), + part_stat_read(hd, sectors[STAT_DISCARD]), + jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD])) ); } disk_part_iter_exit(&piter); diff --git a/block/partition-generic.c b/block/partition-generic.c index 0ddb06722162ca..5a8975a1201c6b 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -130,7 +130,8 @@ ssize_t part_stat_show(struct device *dev, return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " - "%8u %8u %8u" + "%8u %8u %8u " + "%8lu %8lu %8llu %8u" "\n", part_stat_read(p, ios[STAT_READ]), part_stat_read(p, merges[STAT_READ]), @@ -142,7 +143,11 @@ ssize_t part_stat_show(struct device *dev, jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])), inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue))); + jiffies_to_msecs(part_stat_read(p, time_in_queue)), + part_stat_read(p, ios[STAT_DISCARD]), + part_stat_read(p, merges[STAT_DISCARD]), + (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), + jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD]))); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2960a96d833c9e..f6dfb30737d8d3 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -360,6 +360,7 @@ enum req_flag_bits { enum stat_group { STAT_READ, STAT_WRITE, + STAT_DISCARD, NR_STAT_GROUPS }; @@ -401,8 +402,15 @@ static inline bool op_is_sync(unsigned int op) (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } +static inline bool op_is_discard(unsigned int op) +{ + return (op & REQ_OP_MASK) == REQ_OP_DISCARD; +} + static inline int op_stat_group(unsigned int op) { + if (op_is_discard(op)) + return STAT_DISCARD; return op_is_write(op); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a75445446974a7..57864422a2c881 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -356,7 +356,8 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_read_accum(part, 
field) \ (part_stat_read(part, field[STAT_READ]) + \ - part_stat_read(part, field[STAT_WRITE])) + part_stat_read(part, field[STAT_WRITE]) + \ + part_stat_read(part, field[STAT_DISCARD])) #define part_stat_add(cpu, part, field, addnd) do { \ __part_stat_add((cpu), (part), field, addnd); \ From 636620b66d5d4012c4a9c86206013964d3986c4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 18 Jul 2018 04:47:41 -0700 Subject: [PATCH 075/190] blkcg: Track DISCARD statistics and output them in cgroup io.stat Add tracking of REQ_OP_DISCARD ios to the per-cgroup io.stat. Two fields, dbytes and dios, to respectively count the total bytes and number of discards are added. Signed-off-by: Tejun Heo Cc: Andy Newell Cc: Michael Callahan Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 10 ++++++---- block/blk-cgroup.c | 13 +++++++++---- include/linux/blk-cgroup.h | 5 ++++- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 569ce27b85e53f..3afe10fa82bc69 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1317,17 +1317,19 @@ IO Interface Files Lines are keyed by $MAJ:$MIN device numbers and not ordered. The following nested keys are defined. - ====== =================== + ====== ===================== rbytes Bytes read wbytes Bytes written rios Number of read IOs wios Number of write IOs - ====== =================== + dbytes Bytes discarded + dios Number of discard IOs + ====== ===================== An example read output follows: - 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 - 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 + 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0 + 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021 io.weight A read-write flat-keyed file which exists on non-root cgroups. 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7e2c19ce1a08ee..1942357d7165d2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -567,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, [BLKG_RWSTAT_WRITE] = "Write", [BLKG_RWSTAT_SYNC] = "Sync", [BLKG_RWSTAT_ASYNC] = "Async", + [BLKG_RWSTAT_DISCARD] = "Discard", }; const char *dname = blkg_dev_name(pd->blkg); u64 v; @@ -580,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -959,7 +961,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) const char *dname; char *buf; struct blkg_rwstat rwstat; - u64 rbytes, wbytes, rios, wios; + u64 rbytes, wbytes, rios, wios, dbytes, dios; size_t size = seq_get_buf(sf, &buf), off = 0; int i; bool has_stats = false; @@ -982,19 +984,22 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) offsetof(struct blkcg_gq, stat_bytes)); rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); rwstat = blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_ios)); rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); spin_unlock_irq(blkg->q->queue_lock); if (rbytes || wbytes || rios || wios) { has_stats = true; off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu", - rbytes, wbytes, rios, wios); + "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); } if (!blkcg_debug_stats) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index de57de4831d532..3bed5e02a8732f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -35,6 +35,7 @@ enum blkg_rwstat_type { BLKG_RWSTAT_WRITE, BLKG_RWSTAT_SYNC, BLKG_RWSTAT_ASYNC, + BLKG_RWSTAT_DISCARD, BLKG_RWSTAT_NR, BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, @@ -649,7 +650,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, { struct percpu_counter *cnt; - if (op_is_write(op)) + if (op_is_discard(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; + else if (op_is_write(op)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; From 22f17952c7873a427e6e4280d723c0f686d75fb9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Jul 2018 21:42:13 -0400 Subject: [PATCH 076/190] blk-rq-qos: make depth comparisons unsigned With the change to use UINT_MAX I broke the depth check as any value of inflight (ie 0) would be less than (int)UINT_MAX. Fix this by changing everything to unsigned int to match the depth. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 8 ++++---- block/blk-rq-qos.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 5134b24482f618..0005dfd568dd5b 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -4,12 +4,12 @@ * Increment 'v', if 'v' is below 'below'. 
Returns true if we succeeded, * false if 'v' + 1 would be bigger than 'below'. */ -static bool atomic_inc_below(atomic_t *v, int below) +static bool atomic_inc_below(atomic_t *v, unsigned int below) { - int cur = atomic_read(v); + unsigned int cur = atomic_read(v); for (;;) { - int old; + unsigned int old; if (cur >= below) return false; @@ -22,7 +22,7 @@ static bool atomic_inc_below(atomic_t *v, int below) return true; } -bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit) +bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) { return atomic_inc_below(&rq_wait->inflight, limit); } diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index d5e2f68fe41e00..32b02efbfa66dd 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -93,7 +93,7 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) } } -bool rq_wait_inc_below(struct rq_wait *rq_wait, int limit); +bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); void rq_depth_scale_up(struct rq_depth *rqd); void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); From 8824f62246bef288173a6624a363352f0d4d3b09 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 22 Jul 2018 14:10:15 +0800 Subject: [PATCH 077/190] blk-mq: fail the request in case issue failure Inside blk_mq_try_issue_list_directly(), if the request is issued as failed, we shouldn't try to do it again, otherwise the warning in blk_mq_start_request() will be triggered. This change is aligned to behaviour of other ways of request issue & dispatch. Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in case of 'none'") Cc: Kashyap Desai Cc: Laurence Oberman Cc: Omar Sandoval Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Hannes Reinecke Cc: Kashyap Desai Cc: kernel test robot Cc: LKP Reported-by: kernel test robot Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 21f3eda9843137..e13bdc2707ce25 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1794,8 +1794,12 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq); if (ret != BLK_STS_OK) { - list_add(&rq->queuelist, list); - break; + if (ret == BLK_STS_RESOURCE || + ret == BLK_STS_DEV_RESOURCE) { + list_add(&rq->queuelist, list); + break; + } + blk_mq_end_request(rq, ret); } } } From 40c6f9c28ef03f2f2c3ee58c2447a6e6b9a713f2 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Fri, 15 Jun 2018 12:39:27 -0600 Subject: [PATCH 078/190] nvme.h: resync with nvme-cli Added some feature ids present in nvme-cli but not kernel. 
Signed-off-by: Revanth Rajashekar Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2950ce957656d2..80dfedcf0bf722 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -749,6 +749,11 @@ enum { NVME_FEAT_HOST_MEM_BUF = 0x0d, NVME_FEAT_TIMESTAMP = 0x0e, NVME_FEAT_KATO = 0x0f, + NVME_FEAT_HCTM = 0x10, + NVME_FEAT_NOPSC = 0x11, + NVME_FEAT_RRL = 0x12, + NVME_FEAT_PLM_CONFIG = 0x13, + NVME_FEAT_PLM_WINDOW = 0x14, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, From 230f1f9e04e2abee34b1478b3bcc2d947b7cc2a0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 12 Jun 2018 16:28:24 -0700 Subject: [PATCH 079/190] nvme: move init of keep_alive work item to controller initialization Currently, the code initializes the keep alive work item whenever nvme_start_keep_alive() is called. However, this routine is called several times while reconnecting, etc. Although keep alive is expected to be disabled and not scheduled when start is called, re-initializing the work item while it is scheduled or completing can have very bad side effects. There's no need for re-initialization. Move the keep_alive work item and cmd struct initialization to controller init. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 46df030b2c3f74..e541fe268bcfbc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -848,9 +848,6 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); - memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); - ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } @@ -3484,6 +3481,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); + INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); + ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) goto out; From 0866bf0c3778661e65f68a5c93df8e0a1e9e43cc Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 11 Jun 2018 13:40:07 -0400 Subject: [PATCH 080/190] nvmet: add commands supported and effects log page This patch adds support for the Commands Supported and Effects log page (Log Identifier 05h) for NVMeOF. This also makes it easier to find which commands are supported, e.g.: subnqn : testnqn1 Admin Command Set ACS2 [Get Log Page ] 00000001 ACS6 [Identify ] 00000001 ACS8 [Abort ] 00000001 ACS9 [Set Features ] 00000001 ACS10 [Get Features ] 00000001 ACS12 [Asynchronous Event Request ] 00000001 ACS24 [Keep Alive ] 00000001 NVM Command Set IOCS0 [Flush ] 00000001 IOCS1 [Write ] 00000001 IOCS2 [Read ] 00000001 IOCS8 [Write Zeroes ] 00000001 IOCS9 [Dataset Management ] 00000001 This particular functionality can be used from the host side to examine which NVMeOF ctrl commands are supported.
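For reference only (this note and the sketch below are not part of the patch): the log page added here can be fetched from a connected host through the NVMe admin passthrough ioctl; the nvme-cli output quoted above is rendered from the same 4096-byte structure. The controller node /dev/nvme0, the all-namespaces NSID and the raw hex dump are assumptions of this sketch.

  /* Fetch the Commands Supported and Effects log (Log Identifier 05h)
   * and print the non-zero ACS/IOCS dwords.  Values are little-endian
   * on the wire; this dump does not byte-swap them.
   */
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <unistd.h>
  #include <linux/nvme_ioctl.h>

  int main(void)
  {
      uint32_t effects[4096 / 4];   /* 256 ACS + 256 IOCS dwords + reserved */
      struct nvme_admin_cmd cmd;
      int i, fd = open("/dev/nvme0", O_RDONLY);   /* assumed controller node */

      if (fd < 0)
          return 1;
      memset(&cmd, 0, sizeof(cmd));
      cmd.opcode = 0x02;                          /* Get Log Page */
      cmd.nsid = 0xffffffff;
      cmd.addr = (uintptr_t)effects;
      cmd.data_len = sizeof(effects);
      /* cdw10: NUMDL (number of dwords - 1) in bits 31:16, LID 05h in 7:0 */
      cmd.cdw10 = ((sizeof(effects) / 4 - 1) << 16) | 0x05;
      if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) == 0) {
          for (i = 0; i < 256; i++)
              if (effects[i])
                  printf("ACS%d %08x\n", i, (unsigned)effects[i]);
          for (i = 0; i < 256; i++)
              if (effects[256 + i])
                  printf("IOCS%d %08x\n", i, (unsigned)effects[256 + i]);
      }
      close(fd);
      return 0;
  }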
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 38803576d5e122..e2c6f8b3938886 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -128,6 +128,36 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) nvmet_req_complete(req, status); } +static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) +{ + u16 status = NVME_SC_INTERNAL; + struct nvme_effects_log *log; + + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) + goto out; + + log->acs[nvme_admin_get_log_page] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_identify] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_abort_cmd] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_set_features] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_get_features] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_async_event] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_keep_alive] = cpu_to_le32(1 << 0); + + log->iocs[nvme_cmd_read] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_write] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_flush] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0); + + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); + + kfree(log); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -208,7 +238,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* first slot is read-only, only one slot supported */ id->frmw = (1 << 0) | (1 << 1); - id->lpa = (1 << 0) | (1 << 2); + id->lpa = (1 << 0) | (1 << 1) | (1 << 2); id->elpe = NVMET_ERROR_LOG_SLOTS - 1; id->npss = 0; @@ -586,6 +616,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) case NVME_LOG_CHANGED_NS: req->execute = nvmet_execute_get_log_changed_ns; return 0; + case NVME_LOG_CMD_EFFECTS: + req->execute = nvmet_execute_get_log_cmd_effects_ns; + return 0; } break; case nvme_admin_identify: From 55eb942eda2ccbbbea61db4c1a774ba22b618046 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 20 Jun 2018 00:01:41 -0400 Subject: [PATCH 081/190] nvmet: add buffered I/O support for file backed ns Add a new "buffered_io" attribute, which disables direct I/O and thus enables page cache based caching when set. The attribute can only be changed when the namespace is disabled, as the file has to be reopened for the change to take effect. The possibly blocking reads and writes are deferred to a newly introduced global workqueue.
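As a usage note (not part of the patch), buffered_io can only be flipped on a disabled namespace, so the sequence is: disable, write the attribute, re-enable. A minimal user-space sketch follows; the configfs mount point, the testnqn1 subsystem name and namespace 1 are assumed, and the same sequence is normally done with three echo commands.

  /* Switch an existing nvmet namespace to buffered (page cache) I/O.
   * The namespace is disabled first so the backing file is reopened
   * without O_DIRECT, as described above.  Paths are illustrative.
   */
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  static int write_attr(const char *path, const char *val)
  {
      int fd = open(path, O_WRONLY);
      ssize_t n;

      if (fd < 0)
          return -1;
      n = write(fd, val, strlen(val));
      close(fd);
      return n < 0 ? -1 : 0;
  }

  int main(void)
  {
      const char *ns = "/sys/kernel/config/nvmet/subsystems/testnqn1/namespaces/1";
      char path[128];

      snprintf(path, sizeof(path), "%s/enable", ns);
      if (write_attr(path, "0"))            /* disable: closes the backing file */
          return 1;
      snprintf(path, sizeof(path), "%s/buffered_io", ns);
      if (write_attr(path, "1"))            /* use the page cache from now on */
          return 1;
      snprintf(path, sizeof(path), "%s/enable", ns);
      return write_attr(path, "1") ? 1 : 0; /* reopen without O_DIRECT */
  }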
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 29 +++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 9 +++++++++ drivers/nvme/target/io-cmd-file.c | 31 ++++++++++++++++++++++++++----- drivers/nvme/target/nvmet.h | 3 +++ 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index d3f3b3ec4d1afa..fee56b3a23bc7d 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -407,11 +407,40 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, CONFIGFS_ATTR(nvmet_ns_, enable); +static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io); +} + +static ssize_t nvmet_ns_buffered_io_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool val; + + if (strtobool(page, &val)) + return -EINVAL; + + mutex_lock(&ns->subsys->lock); + if (ns->enabled) { + pr_err("disable ns before setting buffered_io value.\n"); + mutex_unlock(&ns->subsys->lock); + return -EINVAL; + } + + ns->buffered_io = val; + mutex_unlock(&ns->subsys->lock); + return count; +} + +CONFIGFS_ATTR(nvmet_ns_, buffered_io); + static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_device_path, &nvmet_ns_attr_device_nguid, &nvmet_ns_attr_device_uuid, &nvmet_ns_attr_enable, + &nvmet_ns_attr_buffered_io, NULL, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 74d4b785d2daac..96eafbd419e736 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -18,6 +18,7 @@ #include "nvmet.h" +struct workqueue_struct *buffered_io_wq; static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; static DEFINE_IDA(cntlid_ida); @@ -437,6 +438,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; uuid_gen(&ns->uuid); + ns->buffered_io = false; return ns; } @@ -1109,6 +1111,12 @@ static int __init nvmet_init(void) { int error; + buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", + WQ_MEM_RECLAIM, 0); + if (!buffered_io_wq) { + error = -ENOMEM; + goto out; + } error = nvmet_init_discovery(); if (error) goto out; @@ -1129,6 +1137,7 @@ static void __exit nvmet_exit(void) nvmet_exit_configfs(); nvmet_exit_discovery(); ida_destroy(&cntlid_ida); + destroy_workqueue(buffered_io_wq); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 8c42b3a8c420ab..57c660e3245d76 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -16,6 +16,8 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) { if (ns->file) { + if (ns->buffered_io) + flush_workqueue(buffered_io_wq); mempool_destroy(ns->bvec_pool); ns->bvec_pool = NULL; kmem_cache_destroy(ns->bvec_cache); @@ -27,11 +29,14 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) int nvmet_file_ns_enable(struct nvmet_ns *ns) { - int ret; + int flags = O_RDWR | O_LARGEFILE; struct kstat stat; + int ret; + + if (!ns->buffered_io) + flags |= O_DIRECT; - ns->file = filp_open(ns->device_path, - O_RDWR | O_LARGEFILE | O_DIRECT, 0); + ns->file = filp_open(ns->device_path, flags, 0); if (IS_ERR(ns->file)) { pr_err("failed to open file %s: (%ld)\n", ns->device_path, PTR_ERR(ns->file)); @@ -100,7 +105,7 @@ 
static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_pos = pos; iocb->ki_filp = req->ns->file; - iocb->ki_flags = IOCB_DIRECT | ki_flags; + iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); ret = call_iter(iocb, &iter); @@ -189,6 +194,19 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); } +static void nvmet_file_buffered_io_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); + + nvmet_file_execute_rw(req); +} + +static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req) +{ + INIT_WORK(&req->f.work, nvmet_file_buffered_io_work); + queue_work(buffered_io_wq, &req->f.work); +} + static void nvmet_file_flush_work(struct work_struct *w) { struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); @@ -280,7 +298,10 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) switch (cmd->common.opcode) { case nvme_cmd_read: case nvme_cmd_write: - req->execute = nvmet_file_execute_rw; + if (req->ns->buffered_io) + req->execute = nvmet_file_execute_rw_buffered_io; + else + req->execute = nvmet_file_execute_rw; req->data_len = nvmet_rw_len(req); return 0; case nvme_cmd_flush: diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 480dfe10fad943..5efb98ec95df8d 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -65,6 +65,7 @@ struct nvmet_ns { u8 nguid[16]; uuid_t uuid; + bool buffered_io; bool enabled; struct nvmet_subsys *subsys; const char *device_path; @@ -269,6 +270,8 @@ struct nvmet_req { const struct nvmet_fabrics_ops *ops; }; +extern struct workqueue_struct *buffered_io_wq; + static inline void nvmet_set_status(struct nvmet_req *req, u16 status) { req->rsp->status = cpu_to_le16(status << 1); From 64a741c1eaa83e34a8846c7196feb8e45785bebc Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 20 Jun 2018 07:15:05 -0700 Subject: [PATCH 082/190] nvme-rdma: support up to 4 segments of inline data Allow up to 4 segments of inline data for NVMF WRITE operations. This reduces latency for small WRITEs by removing the need for the target to issue a READ WR for IB, or a REG_MR + READ WR chain for iWarp. Also cap the inline segments used based on the limitations of the device. 
Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Steve Wise Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 518c5b09038c1e..363f73fe549c34 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -40,13 +40,14 @@ #define NVME_RDMA_MAX_SEGMENTS 256 -#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 +#define NVME_RDMA_MAX_INLINE_SEGMENTS 4 struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; struct kref ref; struct list_head entry; + unsigned int num_inline_segments; }; struct nvme_rdma_qe { @@ -117,6 +118,7 @@ struct nvme_rdma_ctrl { struct sockaddr_storage src_addr; struct nvme_ctrl ctrl; + bool use_inline_data; }; static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) @@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) /* +1 for drain */ init_attr.cap.max_recv_wr = queue->queue_size + 1; init_attr.cap.max_recv_sge = 1; - init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; + init_attr.cap.max_send_sge = 1 + dev->num_inline_segments; init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; init_attr.send_cq = queue->ib_cq; @@ -374,6 +376,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id) goto out_free_pd; } + ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS, + ndev->dev->attrs.max_sge - 1); list_add(&ndev->entry, &device_list); out_unlock: mutex_unlock(&device_list_mutex); @@ -925,6 +929,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ret) goto requeue; + if (ctrl->ctrl.sgls & (1 << 20)) + ctrl->use_inline_data = true; + if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) @@ -1090,19 +1097,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c) } static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, - struct nvme_rdma_request *req, struct nvme_command *c) + struct nvme_rdma_request *req, struct nvme_command *c, + int count) { struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + struct scatterlist *sgl = req->sg_table.sgl; + struct ib_sge *sge = &req->sge[1]; + u32 len = 0; + int i; - req->sge[1].addr = sg_dma_address(req->sg_table.sgl); - req->sge[1].length = sg_dma_len(req->sg_table.sgl); - req->sge[1].lkey = queue->device->pd->local_dma_lkey; + for (i = 0; i < count; i++, sgl++, sge++) { + sge->addr = sg_dma_address(sgl); + sge->length = sg_dma_len(sgl); + sge->lkey = queue->device->pd->local_dma_lkey; + len += sge->length; + } sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); - sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); + sg->length = cpu_to_le32(len); sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; - req->num_sge++; + req->num_sge += count; return 0; } @@ -1195,15 +1210,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, goto out_free_table; } - if (count == 1) { + if (count <= dev->num_inline_segments) { if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && + queue->ctrl->use_inline_data && blk_rq_payload_bytes(rq) <= nvme_rdma_inline_data_size(queue)) { - ret = nvme_rdma_map_sg_inline(queue, req, c); + ret = nvme_rdma_map_sg_inline(queue, req, c, count); goto out; } - if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { ret = nvme_rdma_map_sg_single(queue, req, 
c); goto out; } From 0d5ee2b2ab4f6776c361bc975c2323bc8b5cf349 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 20 Jun 2018 07:15:10 -0700 Subject: [PATCH 083/190] nvmet-rdma: support max(16KB, PAGE_SIZE) inline data The patch enables inline data sizes using up to 4 recv sges, and capping the size at 16KB or at least 1 page size. So on a 4K page system, up to 16KB is supported, and for a 64K page system 1 page of 64KB is supported. We avoid > 0 order page allocations for the inline buffers by using multiple recv sges, one for each page. If the device cannot support the configured inline data size due to lack of enough recv sges, then log a warning and reduce the inline size. Add a new configfs port attribute, called param_inline_data_size, to allow configuring the size of inline data for a given nvmf port. The maximum size allowed is still enforced by nvmet-rdma with NVMET_RDMA_MAX_INLINE_DATA_SIZE, which is now max(16KB, PAGE_SIZE). And the default size, if not specified via configfs, is still PAGE_SIZE. This preserves the existing behavior, but allows larger inline sizes for small page systems. If the configured inline data size exceeds NVMET_RDMA_MAX_INLINE_DATA_SIZE, a warning is logged and the size is reduced. If param_inline_data_size is set to 0, then inline data is disabled for that nvmf port. Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Steve Wise Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 4 +- drivers/nvme/target/configfs.c | 31 ++++++ drivers/nvme/target/core.c | 4 + drivers/nvme/target/discovery.c | 2 +- drivers/nvme/target/nvmet.h | 2 +- drivers/nvme/target/rdma.c | 169 ++++++++++++++++++++++++-------- 6 files changed, 169 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index e2c6f8b3938886..837bbdbfaa4bb2 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -268,14 +268,14 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ if (ctrl->ops->has_keyed_sgls) id->sgls |= cpu_to_le32(1 << 2); - if (ctrl->ops->sqe_inline_size) + if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); strcpy(id->subnqn, ctrl->subsys->subsysnqn); /* Max command capsule size is sqe + single page of in-capsule data */ id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + - ctrl->ops->sqe_inline_size) / 16); + req->port->inline_data_size) / 16); /* Max response capsule size is cqe */ id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index fee56b3a23bc7d..3ba5ea5c4376a1 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -218,6 +218,35 @@ static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, CONFIGFS_ATTR(nvmet_, addr_trsvcid); +static ssize_t nvmet_param_inline_data_size_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size); +} + +static ssize_t nvmet_param_inline_data_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int ret; + + if (port->enabled) { + pr_err("Cannot modify inline_data_size while port enabled\n"); + pr_err("Disable the port before modifying\n"); + return -EACCES; + } + ret = kstrtoint(page, 0, &port->inline_data_size); + if (ret) { + 
pr_err("Invalid value '%s' for inline_data_size\n", page); + return -EINVAL; + } + return count; +} + +CONFIGFS_ATTR(nvmet_, param_inline_data_size); + static ssize_t nvmet_addr_trtype_show(struct config_item *item, char *page) { @@ -903,6 +932,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = { &nvmet_attr_addr_traddr, &nvmet_attr_addr_trsvcid, &nvmet_attr_addr_trtype, + &nvmet_attr_param_inline_data_size, NULL, }; @@ -932,6 +962,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, INIT_LIST_HEAD(&port->entry); INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); + port->inline_data_size = -1; /* < 0 == let the transport choose */ port->disc_addr.portid = cpu_to_le16(portid); config_group_init_type_name(&port->group, name, &nvmet_port_type); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 96eafbd419e736..ddd85715a00ab4 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -242,6 +242,10 @@ int nvmet_enable_port(struct nvmet_port *port) return ret; } + /* If the transport didn't set inline_data_size, then disable it. */ + if (port->inline_data_size < 0) + port->inline_data_size = 0; + port->enabled = true; return 0; } diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 08656b849bd6ef..eae29f493a0748 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -171,7 +171,7 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ if (ctrl->ops->has_keyed_sgls) id->sgls |= cpu_to_le32(1 << 2); - if (ctrl->ops->sqe_inline_size) + if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); strcpy(id->subnqn, ctrl->subsys->subsysnqn); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 5efb98ec95df8d..68899385540260 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -117,6 +117,7 @@ struct nvmet_port { struct list_head referrals; void *priv; bool enabled; + int inline_data_size; }; static inline struct nvmet_port *to_nvmet_port(struct config_item *item) @@ -226,7 +227,6 @@ struct nvmet_req; struct nvmet_fabrics_ops { struct module *owner; unsigned int type; - unsigned int sqe_inline_size; unsigned int msdbd; bool has_keyed_sgls : 1; void (*queue_response)(struct nvmet_req *req); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 52e0c5d579a7aa..2106ae2ec17738 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -33,16 +33,17 @@ #include "nvmet.h" /* - * We allow up to a page of inline data to go with the SQE + * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data */ -#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE +#define NVMET_RDMA_MAX_INLINE_SGE 4 +#define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE) struct nvmet_rdma_cmd { - struct ib_sge sge[2]; + struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1]; struct ib_cqe cqe; struct ib_recv_wr wr; - struct scatterlist inline_sg; - struct page *inline_page; + struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE]; struct nvme_command *nvme_cmd; struct nvmet_rdma_queue *queue; }; @@ -116,6 +117,8 @@ struct nvmet_rdma_device { size_t srq_size; struct kref ref; struct list_head entry; + int inline_data_size; + int inline_page_count; }; static bool nvmet_rdma_use_srq; @@ -138,6 +141,11 @@ static void nvmet_rdma_queue_disconnect(struct 
nvmet_rdma_queue *queue); static const struct nvmet_fabrics_ops nvmet_rdma_ops; +static int num_pages(int len) +{ + return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT); +} + /* XXX: really should move to a generic header sooner or later.. */ static inline u32 get_unaligned_le24(const u8 *p) { @@ -184,6 +192,71 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); } +static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) +{ + struct scatterlist *sg; + struct ib_sge *sge; + int i; + + if (!ndev->inline_data_size) + return; + + sg = c->inline_sg; + sge = &c->sge[1]; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } +} + +static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c) +{ + struct scatterlist *sg; + struct ib_sge *sge; + struct page *pg; + int len; + int i; + + if (!ndev->inline_data_size) + return 0; + + sg = c->inline_sg; + sg_init_table(sg, ndev->inline_page_count); + sge = &c->sge[1]; + len = ndev->inline_data_size; + + for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) + goto out_err; + sg_assign_page(sg, pg); + sge->addr = ib_dma_map_page(ndev->device, + pg, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, sge->addr)) + goto out_err; + sge->length = min_t(int, len, PAGE_SIZE); + sge->lkey = ndev->pd->local_dma_lkey; + len -= sge->length; + } + + return 0; +out_err: + for (; i >= 0; i--, sg--, sge--) { + if (sge->length) + ib_dma_unmap_page(ndev->device, sge->addr, + sge->length, DMA_FROM_DEVICE); + if (sg_page(sg)) + __free_page(sg_page(sg)); + } + return -ENOMEM; +} + static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { @@ -200,33 +273,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, c->sge[0].length = sizeof(*c->nvme_cmd); c->sge[0].lkey = ndev->pd->local_dma_lkey; - if (!admin) { - c->inline_page = alloc_pages(GFP_KERNEL, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - if (!c->inline_page) - goto out_unmap_cmd; - c->sge[1].addr = ib_dma_map_page(ndev->device, - c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) - goto out_free_inline_page; - c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; - c->sge[1].lkey = ndev->pd->local_dma_lkey; - } + if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c)) + goto out_unmap_cmd; c->cqe.done = nvmet_rdma_recv_done; c->wr.wr_cqe = &c->cqe; c->wr.sg_list = c->sge; - c->wr.num_sge = admin ? 1 : 2; + c->wr.num_sge = admin ? 
1 : ndev->inline_page_count + 1; return 0; -out_free_inline_page: - if (!admin) { - __free_pages(c->inline_page, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - } out_unmap_cmd: ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); @@ -240,12 +297,8 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *c, bool admin) { - if (!admin) { - ib_dma_unmap_page(ndev->device, c->sge[1].addr, - NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); - __free_pages(c->inline_page, - get_order(NVMET_RDMA_INLINE_DATA_SIZE)); - } + if (!admin) + nvmet_rdma_free_inline_pages(ndev, c); ib_dma_unmap_single(ndev->device, c->sge[0].addr, sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); kfree(c->nvme_cmd); @@ -429,7 +482,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); } - if (rsp->req.sg != &rsp->cmd->inline_sg) + if (rsp->req.sg != rsp->cmd->inline_sg) sgl_free(rsp->req.sg); if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) @@ -529,10 +582,25 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, u64 off) { - sg_init_table(&rsp->cmd->inline_sg, 1); - sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); - rsp->req.sg = &rsp->cmd->inline_sg; - rsp->req.sg_cnt = 1; + int sg_count = num_pages(len); + struct scatterlist *sg; + int i; + + sg = rsp->cmd->inline_sg; + for (i = 0; i < sg_count; i++, sg++) { + if (i < sg_count - 1) + sg_unmark_end(sg); + else + sg_mark_end(sg); + sg->offset = off; + sg->length = min_t(int, len, PAGE_SIZE - off); + len -= sg->length; + if (!i) + off = 0; + } + + rsp->req.sg = rsp->cmd->inline_sg; + rsp->req.sg_cnt = sg_count; } static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) @@ -544,7 +612,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) if (!nvme_is_write(rsp->req.cmd)) return NVME_SC_INVALID_FIELD | NVME_SC_DNR; - if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; } @@ -743,7 +811,7 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) srq_size = 4095; /* XXX: tune */ srq_attr.attr.max_wr = srq_size; - srq_attr.attr.max_sge = 2; + srq_attr.attr.max_sge = 1 + ndev->inline_page_count; srq_attr.attr.srq_limit = 0; srq_attr.srq_type = IB_SRQT_BASIC; srq = ib_create_srq(ndev->pd, &srq_attr); @@ -793,7 +861,10 @@ static void nvmet_rdma_free_dev(struct kref *ref) static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { + struct nvmet_port *port = cm_id->context; struct nvmet_rdma_device *ndev; + int inline_page_count; + int inline_sge_count; int ret; mutex_lock(&device_list_mutex); @@ -807,6 +878,18 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) if (!ndev) goto out_err; + inline_page_count = num_pages(port->inline_data_size); + inline_sge_count = max(cm_id->device->attrs.max_sge_rd, + cm_id->device->attrs.max_sge) - 1; + if (inline_page_count > inline_sge_count) { + pr_warn("inline_data_size %d cannot be supported by device %s. 
Reducing to %lu.\n", + port->inline_data_size, cm_id->device->name, + inline_sge_count * PAGE_SIZE); + port->inline_data_size = inline_sge_count * PAGE_SIZE; + inline_page_count = inline_sge_count; + } + ndev->inline_data_size = port->inline_data_size; + ndev->inline_page_count = inline_page_count; ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -881,7 +964,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) } else { /* +1 for drain */ qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; - qp_attr.cap.max_recv_sge = 2; + qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count; } ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); @@ -1379,6 +1462,15 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) return -EINVAL; } + if (port->inline_data_size < 0) { + port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + port->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, port->disc_addr.trsvcid, &addr); if (ret) { @@ -1456,7 +1548,6 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, - .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, .msdbd = 1, .has_keyed_sgls = 1, .add_port = nvmet_rdma_add_port, From 2fc464e2162c2b2f7faf7404fa9c35d1cf70aa00 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 27 Jun 2018 14:58:02 +0300 Subject: [PATCH 084/190] nvmet-rdma: add unlikely check in the fast path ib_post_send operation should succeed unless something unusual happened to the ib device. Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 2106ae2ec17738..4ca09456bbbb1b 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -546,7 +546,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) rsp->send_sge.addr, rsp->send_sge.length, DMA_TO_DEVICE); - if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + if (unlikely(ib_post_send(cm_id->qp, first_wr, &bad_wr))) { pr_err("sending cmd response failed\n"); nvmet_rdma_release_rsp(rsp); } From 202093848cac2da7d92ae666b51b7109bbab633c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 1 Jul 2018 12:20:24 +0300 Subject: [PATCH 085/190] nvmet-rdma: add an error flow for post_recv failures Posting receive buffer operation can fail, thus we should make sure to have an error flow during initialization phase. While we're here, add a debug print in case of a failure. 
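For illustration only (this sketch is not part of the patch; post_recv(), free_cmds() and init_recv_ring() below are made-up stand-ins for ib_post_recv()/ib_post_srq_recv() and the driver's command teardown helpers), the shape of the new initialization error flow is roughly:

#include <stdio.h>

/* Pretend the sixth post fails, e.g. with -ENOMEM. */
static int post_recv(int i)
{
	return (i == 5) ? -12 : 0;
}

static void free_cmds(int nr)
{
	printf("freeing all %d receive commands\n", nr);
}

static int init_recv_ring(int nr)
{
	int i, ret = 0;

	for (i = 0; i < nr; i++) {
		ret = post_recv(i);
		if (ret) {
			fprintf(stderr, "post_recv %d failed: %d\n", i, ret);
			goto out_free_cmds;	/* unwind instead of silently continuing */
		}
	}
	return 0;

out_free_cmds:
	free_cmds(nr);
	return ret;
}

int main(void)
{
	return init_recv_ring(8) ? 1 : 0;
}

The point is simply that a failed post is propagated to the caller and everything already allocated is torn down, rather than being ignored.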
Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 4ca09456bbbb1b..e7f43d1e17797f 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -436,14 +436,21 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, struct nvmet_rdma_cmd *cmd) { struct ib_recv_wr *bad_wr; + int ret; ib_dma_sync_single_for_device(ndev->device, cmd->sge[0].addr, cmd->sge[0].length, DMA_FROM_DEVICE); if (ndev->srq) - return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); - return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); + ret = ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); + else + ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); + + if (unlikely(ret)) + pr_err("post_recv cmd failed\n"); + + return ret; } static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) @@ -833,11 +840,16 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) ndev->srq = srq; ndev->srq_size = srq_size; - for (i = 0; i < srq_size; i++) - nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + for (i = 0; i < srq_size; i++) { + ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + if (ret) + goto out_free_cmds; + } return 0; +out_free_cmds: + nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); out_destroy_srq: ib_destroy_srq(srq); return ret; @@ -982,13 +994,17 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) if (!ndev->srq) { for (i = 0; i < queue->recv_queue_size; i++) { queue->cmds[i].queue = queue; - nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + if (ret) + goto err_destroy_qp; } } out: return ret; +err_destroy_qp: + rdma_destroy_qp(queue->cm_id); err_destroy_cq: ib_free_cq(queue->cq); goto out; From 59e29ce66bc52ebd6d0cb450f13079c7e913430d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 29 Jun 2018 16:50:00 -0600 Subject: [PATCH 086/190] nvme: cache struct nvme_ctrl reference to struct nvme_request We will need to reference the controller in the setup and completion time for tracing and future traffic based keep alive support. Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 1 + drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 2 ++ drivers/nvme/host/rdma.c | 1 + drivers/nvme/target/loop.c | 1 + 5 files changed, 6 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 41d45a1b5c628c..9cc33752539a67 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1737,6 +1737,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; + nvme_req(rq)->ctrl = &ctrl->ctrl; return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 0c4a33df3b2f3b..f2249387b60d70 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -102,6 +102,7 @@ struct nvme_request { u8 retries; u8 flags; u16 status; + struct nvme_ctrl *ctrl; }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ba943f211687c6..8dcae11bbf3ab5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -418,6 +418,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, BUG_ON(!nvmeq); iod->nvmeq = nvmeq; + + nvme_req(req)->ctrl = &dev->ctrl; return 0; } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 363f73fe549c34..bb6e26fa833166 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -288,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, struct ib_device *ibdev = dev->dev; int ret; + nvme_req(rq)->ctrl = &ctrl->ctrl; ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (ret) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index d8d91f04bd7eed..af7fbf4132b018 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -227,6 +227,7 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, { struct nvme_loop_ctrl *ctrl = set->driver_data; + nvme_req(req)->ctrl = &ctrl->ctrl; return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), (set == &ctrl->tag_set) ? hctx_idx + 1 : 0); } From 5d87eb94d9ba13e5e2d5ceb56ac6fe0948259ffa Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 Jun 2018 16:50:01 -0600 Subject: [PATCH 087/190] nvme: use hw qid in trace events We can not match a command to its completion based on the command id alone. We need the submitting queue identifier to pair with the completion, so this patch adds that to the trace buffer. This patch is also collapsing the admin and IO submission traces into a single one so we don't need to duplicate this and creating unnecessary code branches: we know if the command is an admin vs IO based on the qid. And since we're here, the patch fixes code formatting in the area. 
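As a rough, stand-alone illustration of the qid-based selection (parse_admin(), parse_nvm() and parse_cmd() are toy stand-ins for nvme_trace_parse_admin_cmd(), nvme_trace_parse_nvm_cmd() and the parse_nvme_cmd() macro; in the real code the qid comes from the new nvme_req_qid() helper):

#include <stdio.h>

static const char *parse_admin(unsigned char op)
{
	return op == 0x06 ? "identify" : "other-admin";
}

static const char *parse_nvm(unsigned char op)
{
	return op == 0x02 ? "read" : "other-io";
}

/* qid == 0 is the admin queue; any other value is an I/O hardware queue. */
static const char *parse_cmd(int qid, unsigned char op)
{
	return qid ? parse_nvm(op) : parse_admin(op);
}

int main(void)
{
	printf("qid=0 -> %s\n", parse_cmd(0, 0x06));	/* admin queue */
	printf("qid=3 -> %s\n", parse_cmd(3, 0x02));	/* I/O queue 3 */
	return 0;
}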
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn [hch: move the qid helper to nvme.h and made it an inline function] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 +- drivers/nvme/host/nvme.h | 7 +++ drivers/nvme/host/trace.h | 112 +++++++++++++++----------------------- 3 files changed, 53 insertions(+), 71 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e541fe268bcfbc..e77e6418a21cb3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -652,10 +652,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } cmd->common.command_id = req->tag; - if (ns) - trace_nvme_setup_nvm_cmd(req->q->id, cmd); - else - trace_nvme_setup_admin_cmd(cmd); + trace_nvme_setup_cmd(req, cmd); return ret; } EXPORT_SYMBOL_GPL(nvme_setup_cmd); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f2249387b60d70..4ad0c8ad2a2747 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -120,6 +120,13 @@ static inline struct nvme_request *nvme_req(struct request *req) return blk_mq_rq_to_pdu(req); } +static inline u16 nvme_req_qid(struct request *req) +{ + if (!req->rq_disk) + return 0; + return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; +} + /* The below value is the specific amount of delay needed before checking * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 01390f0e167105..e6362677447d63 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -50,13 +50,8 @@ nvme_admin_opcode_name(nvme_admin_security_recv), \ nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) -const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, - u8 *cdw10); -#define __parse_nvme_admin_cmd(opcode, cdw10) \ - nvme_trace_parse_admin_cmd(p, opcode, cdw10) - #define nvme_opcode_name(opcode) { opcode, #opcode } -#define show_opcode_name(val) \ +#define show_nvm_opcode_name(val) \ __print_symbolic(val, \ nvme_opcode_name(nvme_cmd_flush), \ nvme_opcode_name(nvme_cmd_write), \ @@ -70,83 +65,66 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, nvme_opcode_name(nvme_cmd_resv_acquire), \ nvme_opcode_name(nvme_cmd_resv_release)) -const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, - u8 *cdw10); -#define __parse_nvme_cmd(opcode, cdw10) \ - nvme_trace_parse_nvm_cmd(p, opcode, cdw10) +#define show_opcode_name(qid, opcode) \ + (qid ? 
show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode)) -TRACE_EVENT(nvme_setup_admin_cmd, - TP_PROTO(struct nvme_command *cmd), - TP_ARGS(cmd), - TP_STRUCT__entry( - __field(u8, opcode) - __field(u8, flags) - __field(u16, cid) - __field(u64, metadata) - __array(u8, cdw10, 24) - ), - TP_fast_assign( - __entry->opcode = cmd->common.opcode; - __entry->flags = cmd->common.flags; - __entry->cid = cmd->common.command_id; - __entry->metadata = le64_to_cpu(cmd->common.metadata); - memcpy(__entry->cdw10, cmd->common.cdw10, - sizeof(__entry->cdw10)); - ), - TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->cid, __entry->flags, __entry->metadata, - show_admin_opcode_name(__entry->opcode), - __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10)) -); +const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, + u8 *cdw10); +#define parse_nvme_cmd(qid, opcode, cdw10) \ + (qid ? \ + nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ + nvme_trace_parse_admin_cmd(p, opcode, cdw10)) -TRACE_EVENT(nvme_setup_nvm_cmd, - TP_PROTO(int qid, struct nvme_command *cmd), - TP_ARGS(qid, cmd), +TRACE_EVENT(nvme_setup_cmd, + TP_PROTO(struct request *req, struct nvme_command *cmd), + TP_ARGS(req, cmd), TP_STRUCT__entry( - __field(int, qid) - __field(u8, opcode) - __field(u8, flags) - __field(u16, cid) - __field(u32, nsid) - __field(u64, metadata) - __array(u8, cdw10, 24) + __field(int, qid) + __field(u8, opcode) + __field(u8, flags) + __field(u16, cid) + __field(u32, nsid) + __field(u64, metadata) + __array(u8, cdw10, 24) ), TP_fast_assign( - __entry->qid = qid; - __entry->opcode = cmd->common.opcode; - __entry->flags = cmd->common.flags; - __entry->cid = cmd->common.command_id; - __entry->nsid = le32_to_cpu(cmd->common.nsid); - __entry->metadata = le64_to_cpu(cmd->common.metadata); - memcpy(__entry->cdw10, cmd->common.cdw10, - sizeof(__entry->cdw10)); + __entry->qid = nvme_req_qid(req); + __entry->opcode = cmd->common.opcode; + __entry->flags = cmd->common.flags; + __entry->cid = cmd->common.command_id; + __entry->nsid = le32_to_cpu(cmd->common.nsid); + __entry->metadata = le64_to_cpu(cmd->common.metadata); + memcpy(__entry->cdw10, cmd->common.cdw10, + sizeof(__entry->cdw10)); ), - TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->qid, __entry->nsid, __entry->cid, + TP_printk("qid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->qid, __entry->cid, __entry->nsid, __entry->flags, __entry->metadata, - show_opcode_name(__entry->opcode), - __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) + show_opcode_name(__entry->qid, __entry->opcode), + parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) ); TRACE_EVENT(nvme_complete_rq, TP_PROTO(struct request *req), TP_ARGS(req), TP_STRUCT__entry( - __field(int, qid) - __field(int, cid) - __field(u64, result) - __field(u8, retries) - __field(u8, flags) - __field(u16, status) + __field(int, qid) + __field(int, cid) + __field(u64, result) + __field(u8, retries) + __field(u8, flags) + __field(u16, status) ), TP_fast_assign( - __entry->qid = req->q->id; - __entry->cid = req->tag; - __entry->result = le64_to_cpu(nvme_req(req)->result.u64); - __entry->retries = nvme_req(req)->retries; - __entry->flags = nvme_req(req)->flags; - __entry->status = nvme_req(req)->status; + __entry->qid = nvme_req_qid(req); + __entry->cid = req->tag; + __entry->result = le64_to_cpu(nvme_req(req)->result.u64); + 
__entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; ), TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", __entry->qid, __entry->cid, __entry->result, From b80a55e246a1b817cb254d79d077f364a2419578 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 Jul 2018 09:15:03 -0600 Subject: [PATCH 088/190] nvme: add controller name to trace events This appends the controller instance to the nvme trace buffer to distinguish which controller is dispatching and completing a command. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index e6362677447d63..35b8c72478d533 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -82,6 +82,7 @@ TRACE_EVENT(nvme_setup_cmd, TP_PROTO(struct request *req, struct nvme_command *cmd), TP_ARGS(req, cmd), TP_STRUCT__entry( + __field(int, ctrl_id) __field(int, qid) __field(u8, opcode) __field(u8, flags) @@ -91,6 +92,7 @@ TRACE_EVENT(nvme_setup_cmd, __array(u8, cdw10, 24) ), TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; __entry->qid = nvme_req_qid(req); __entry->opcode = cmd->common.opcode; __entry->flags = cmd->common.flags; @@ -100,9 +102,9 @@ TRACE_EVENT(nvme_setup_cmd, memcpy(__entry->cdw10, cmd->common.cdw10, sizeof(__entry->cdw10)); ), - TP_printk("qid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->qid, __entry->cid, __entry->nsid, - __entry->flags, __entry->metadata, + TP_printk("nvme%d: qid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->ctrl_id, __entry->qid, __entry->cid, + __entry->nsid, __entry->flags, __entry->metadata, show_opcode_name(__entry->qid, __entry->opcode), parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) ); @@ -111,6 +113,7 @@ TRACE_EVENT(nvme_complete_rq, TP_PROTO(struct request *req), TP_ARGS(req), TP_STRUCT__entry( + __field(int, ctrl_id) __field(int, qid) __field(int, cid) __field(u64, result) @@ -119,6 +122,7 @@ TRACE_EVENT(nvme_complete_rq, __field(u16, status) ), TP_fast_assign( + __entry->ctrl_id = nvme_req(req)->ctrl->instance; __entry->qid = nvme_req_qid(req); __entry->cid = req->tag; __entry->result = le64_to_cpu(nvme_req(req)->result.u64); @@ -126,9 +130,10 @@ TRACE_EVENT(nvme_complete_rq, __entry->flags = nvme_req(req)->flags; __entry->status = nvme_req(req)->status; ), - TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", - __entry->qid, __entry->cid, __entry->result, - __entry->retries, __entry->flags, __entry->status) + TP_printk("nvme%d: qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->ctrl_id, __entry->qid, __entry->cid, + __entry->result, __entry->retries, __entry->flags, + __entry->status) ); From 6268953e8977a23ca7512a2921e82a5d9252ec01 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 Jun 2018 16:50:03 -0600 Subject: [PATCH 089/190] nvme: add disk name to trace events This will print the disk name to the nvme event trace for io requests so a user can better distinguish traffic to different disks. This can be used to create disk based filters. 
For example, to see only nvme0n2 traffic: echo "disk == \"nvme0n2\"" > /sys/kernel/debug/tracing/events/nvme/filter Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn [hch: turned __assign_disk_name into an inline function] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 11 +++++++++++ drivers/nvme/host/trace.h | 33 ++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 41944bbef8353e..25b0e310f4a813 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -128,3 +128,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, return nvme_trace_common(p, cdw10); } } + +const char *nvme_trace_disk_name(struct trace_seq *p, char *name) +{ + const char *ret = trace_seq_buffer_ptr(p); + + if (*name) + trace_seq_printf(p, "disk=%s, ", name); + trace_seq_putc(p, 0); + + return ret; +} diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 35b8c72478d533..a490790d669136 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -78,10 +78,25 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \ nvme_trace_parse_admin_cmd(p, opcode, cdw10)) +const char *nvme_trace_disk_name(struct trace_seq *p, char *name); +#define __print_disk_name(name) \ + nvme_trace_disk_name(p, name) + +#ifndef TRACE_HEADER_MULTI_READ +static inline void __assign_disk_name(char *name, struct gendisk *disk) +{ + if (disk) + memcpy(name, disk->disk_name, DISK_NAME_LEN); + else + memset(name, 0, DISK_NAME_LEN); +} +#endif + TRACE_EVENT(nvme_setup_cmd, TP_PROTO(struct request *req, struct nvme_command *cmd), TP_ARGS(req, cmd), TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) __field(int, ctrl_id) __field(int, qid) __field(u8, opcode) @@ -99,12 +114,14 @@ TRACE_EVENT(nvme_setup_cmd, __entry->cid = cmd->common.command_id; __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = le64_to_cpu(cmd->common.metadata); + __assign_disk_name(__entry->disk, req->rq_disk); memcpy(__entry->cdw10, cmd->common.cdw10, sizeof(__entry->cdw10)); ), - TP_printk("nvme%d: qid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", - __entry->ctrl_id, __entry->qid, __entry->cid, - __entry->nsid, __entry->flags, __entry->metadata, + TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, __entry->nsid, + __entry->flags, __entry->metadata, show_opcode_name(__entry->qid, __entry->opcode), parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10)) ); @@ -113,6 +130,7 @@ TRACE_EVENT(nvme_complete_rq, TP_PROTO(struct request *req), TP_ARGS(req), TP_STRUCT__entry( + __array(char, disk, DISK_NAME_LEN) __field(int, ctrl_id) __field(int, qid) __field(int, cid) @@ -129,11 +147,12 @@ TRACE_EVENT(nvme_complete_rq, __entry->retries = nvme_req(req)->retries; __entry->flags = nvme_req(req)->flags; __entry->status = nvme_req(req)->status; + __assign_disk_name(__entry->disk, req->rq_disk); ), - TP_printk("nvme%d: qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", - __entry->ctrl_id, __entry->qid, __entry->cid, - __entry->result, __entry->retries, __entry->flags, - __entry->status) + TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", + __entry->ctrl_id, __print_disk_name(__entry->disk), + __entry->qid, __entry->cid, 
__entry->result, + __entry->retries, __entry->flags, __entry->status) ); From 249090f9016b7d68a18fc4c79c42accca18d6961 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 5 Jul 2018 08:14:00 -0500 Subject: [PATCH 090/190] nvme-rdma: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index bb6e26fa833166..2d4a51a80e8fb1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1591,6 +1591,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: nvme_rdma_destroy_queue_ib(queue); + /* fall through */ case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); From 90140624e8face94207003ac9a9d2a329b309d68 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 9 Jul 2018 12:49:05 +0300 Subject: [PATCH 091/190] nvme-rdma: unquiesce queues when deleting the controller If the controller is going away, we need to unquiesce the IO queues so that all pending request can fail gracefully before moving forward with controller deletion. Do that before we destroy the IO queues so blk_cleanup_queue won't block in freeze. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2d4a51a80e8fb1..2b683b8d47637c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1759,6 +1759,8 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_stop_io_queues(ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); + if (shutdown) + nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, shutdown); } From c66e2998c8ca4d5da85d4915612dca29e054ad21 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 9 Jul 2018 12:49:06 +0300 Subject: [PATCH 092/190] nvme-rdma: centralize controller setup sequence Centralize controller sequence to a single routine that correctly cleans up after failures instead of having multiple apperances in several flows (create, reset, reconnect). One thing that we also gain here are the sanity/boundary checks also when connecting back to a dynamic controller. 
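The overall shape of the centralized setup routine, as a simplified stand-alone sketch (the configure_*/sanity_check_*/destroy_* helpers below are hypothetical stand-ins for the nvme_rdma_* functions, not the actual driver code):

#include <stdio.h>

static int configure_admin_queue(void) { return 0; }
static int sanity_check_ctrl(void)     { return 0; }  /* icdoff, keyed SGLs, queue sizes */
static int configure_io_queues(void)   { return 0; }
static int mark_ctrl_live(void)        { return -22; } /* pretend the state change fails */
static void destroy_io_queues(void)    { printf("teardown: io queues\n"); }
static void destroy_admin_queue(void)  { printf("teardown: admin queue\n"); }

static int setup_ctrl(void)
{
	int ret;

	ret = configure_admin_queue();
	if (ret)
		return ret;

	ret = sanity_check_ctrl();
	if (ret)
		goto out_destroy_admin;

	ret = configure_io_queues();
	if (ret)
		goto out_destroy_admin;

	ret = mark_ctrl_live();
	if (ret)
		goto out_destroy_io;

	return 0;

out_destroy_io:
	destroy_io_queues();
out_destroy_admin:
	destroy_admin_queue();
	return ret;
}

int main(void)
{
	return setup_ctrl() ? 1 : 0;
}

Every failure path now falls through the same reverse-order cleanup ladder, which is what lets create, reset and reconnect share one routine.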
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 130 ++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 77 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2b683b8d47637c..c22125c5661bdc 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -917,24 +917,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) } } -static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) { - struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), - struct nvme_rdma_ctrl, reconnect_work); + int ret = -EINVAL; bool changed; - int ret; - - ++ctrl->ctrl.nr_reconnects; - ret = nvme_rdma_configure_admin_queue(ctrl, false); + ret = nvme_rdma_configure_admin_queue(ctrl, new); if (ret) - goto requeue; + return ret; + + if (ctrl->ctrl.icdoff) { + dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + goto destroy_admin; + } + + if (!(ctrl->ctrl.sgls & (1 << 2))) { + dev_err(ctrl->ctrl.device, + "Mandatory keyed sgls are not supported!\n"); + goto destroy_admin; + } + + if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) { + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl sqsize %u, clamping down\n", + ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); + } + + if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { + dev_warn(ctrl->ctrl.device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); + ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; + } if (ctrl->ctrl.sgls & (1 << 20)) ctrl->use_inline_data = true; if (ctrl->ctrl.queue_count > 1) { - ret = nvme_rdma_configure_io_queues(ctrl, false); + ret = nvme_rdma_configure_io_queues(ctrl, new); if (ret) goto destroy_admin; } @@ -943,10 +963,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (!changed) { /* state change failure is ok if we're in DELETING state */ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); - return; + ret = -EINVAL; + goto destroy_io; } nvme_start_ctrl(&ctrl->ctrl); + return 0; + +destroy_io: + if (ctrl->ctrl.queue_count > 1) + nvme_rdma_destroy_io_queues(ctrl, new); +destroy_admin: + nvme_rdma_stop_queue(&ctrl->queues[0]); + nvme_rdma_destroy_admin_queue(ctrl, new); + return ret; +} + +static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_rdma_ctrl, reconnect_work); + + ++ctrl->ctrl.nr_reconnects; + + if (nvme_rdma_setup_ctrl(ctrl, false)) + goto requeue; dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", ctrl->ctrl.nr_reconnects); @@ -955,9 +996,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; -destroy_admin: - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", ctrl->ctrl.nr_reconnects); @@ -1786,8 +1824,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); - int ret; - bool changed; nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); @@ -1798,25 +1834,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) return; } - ret = nvme_rdma_configure_admin_queue(ctrl, false); - if (ret) + if (nvme_rdma_setup_ctrl(ctrl, false)) goto out_fail; - if 
(ctrl->ctrl.queue_count > 1) { - ret = nvme_rdma_configure_io_queues(ctrl, false); - if (ret) - goto out_fail; - } - - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - if (!changed) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); - return; - } - - nvme_start_ctrl(&ctrl->ctrl); - return; out_fail: @@ -1979,49 +1999,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); WARN_ON_ONCE(!changed); - ret = nvme_rdma_configure_admin_queue(ctrl, true); + ret = nvme_rdma_setup_ctrl(ctrl, true); if (ret) goto out_uninit_ctrl; - /* sanity check icdoff */ - if (ctrl->ctrl.icdoff) { - dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); - ret = -EINVAL; - goto out_remove_admin_queue; - } - - /* sanity check keyed sgls */ - if (!(ctrl->ctrl.sgls & (1 << 2))) { - dev_err(ctrl->ctrl.device, - "Mandatory keyed sgls are not supported!\n"); - ret = -EINVAL; - goto out_remove_admin_queue; - } - - /* only warn if argument is too large here, will clamp later */ - if (opts->queue_size > ctrl->ctrl.sqsize + 1) { - dev_warn(ctrl->ctrl.device, - "queue_size %zu > ctrl sqsize %u, clamping down\n", - opts->queue_size, ctrl->ctrl.sqsize + 1); - } - - /* warn if maxcmd is lower than sqsize+1 */ - if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { - dev_warn(ctrl->ctrl.device, - "sqsize %u > ctrl maxcmd %u, clamping down\n", - ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); - ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; - } - - if (opts->nr_io_queues) { - ret = nvme_rdma_configure_io_queues(ctrl, true); - if (ret) - goto out_remove_admin_queue; - } - - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); @@ -2031,13 +2012,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); - nvme_start_ctrl(&ctrl->ctrl); - return &ctrl->ctrl; -out_remove_admin_queue: - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_rdma_destroy_admin_queue(ctrl, true); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); From 75862c72323e222656792370e2f240bc4029ff96 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 9 Jul 2018 12:49:07 +0300 Subject: [PATCH 093/190] nvme-rdma: centralize admin/io queue teardown sequence We follow the same queue teardown sequence in delete, reset and error recovery. Centralize the logic. This patch does not change any functionality. 
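As a rough sketch of the shared ordering (the printfs stand in for the blk-mq and nvme-rdma helpers named below; this is an illustration, not the driver code):

#include <stdio.h>
#include <stdbool.h>

static void teardown_io_queues(bool remove)
{
	printf("quiesce I/O queues\n");
	printf("stop RDMA I/O queues\n");
	printf("cancel in-flight requests\n");
	if (remove)
		printf("unquiesce I/O queues so pending requests fail fast\n");
	printf("destroy I/O queues\n");
}

static void teardown_admin_queue(bool remove)
{
	printf("quiesce admin queue\n");
	printf("stop RDMA admin queue\n");
	printf("cancel in-flight admin requests\n");
	printf("unquiesce admin queue\n");
	printf("destroy admin queue%s\n", remove ? " and tag set" : "");
}

int main(void)
{
	teardown_io_queues(true);
	teardown_admin_queue(true);
	return 0;
}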
Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 66 ++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index c22125c5661bdc..13a6064e47942e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -873,6 +873,31 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) return ret; } +static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, + &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_destroy_admin_queue(ctrl, remove); +} + +static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, + bool remove) +{ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, + &ctrl->ctrl); + if (remove) + nvme_start_queues(&ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, remove); + } +} + static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); @@ -1008,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl, err_work); nvme_stop_keep_alive(&ctrl->ctrl); - - if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); - nvme_rdma_stop_io_queues(ctrl); - blk_mq_tagset_busy_iter(&ctrl->tag_set, - nvme_cancel_request, &ctrl->ctrl); - nvme_rdma_destroy_io_queues(ctrl, false); - } - - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, - nvme_cancel_request, &ctrl->ctrl); - nvme_rdma_destroy_admin_queue(ctrl, false); - - /* - * queues are not a live anymore, so restart the queues to fail fast - * new IO - */ - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); + nvme_rdma_teardown_admin_queue(ctrl, false); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we're in DELETING state */ @@ -1792,27 +1799,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { - if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); - nvme_rdma_stop_io_queues(ctrl); - blk_mq_tagset_busy_iter(&ctrl->tag_set, - nvme_cancel_request, &ctrl->ctrl); - if (shutdown) - nvme_start_queues(&ctrl->ctrl); - nvme_rdma_destroy_io_queues(ctrl, shutdown); - } - + nvme_rdma_teardown_io_queues(ctrl, shutdown); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); - - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, - nvme_cancel_request, &ctrl->ctrl); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_destroy_admin_queue(ctrl, shutdown); + nvme_rdma_teardown_admin_queue(ctrl, shutdown); } static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) From 1b72b71faccee986e2128a271125177dfe91f7b7 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Jul 2018 12:43:16 +0300 Subject: [PATCH 094/190] nvmet: fix file discard return status If nvmet_copy_from_sgl failed, we falsly return successful completion 
status. Fixes: d5eff33ee6f8 ("nvmet: add simple file backed ns support") Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-file.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 57c660e3245d76..dad8d44bf90e85 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -227,22 +227,24 @@ static void nvmet_file_execute_discard(struct nvmet_req *req) { int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; struct nvme_dsm_range range; - loff_t offset; - loff_t len; - int i, ret; + loff_t offset, len; + u16 ret; + int i; for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { - if (nvmet_copy_from_sgl(req, i * sizeof(range), &range, - sizeof(range))) + ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range)); + if (ret) break; offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; len = le32_to_cpu(range.nlb) << req->ns->blksize_shift; - ret = vfs_fallocate(req->ns->file, mode, offset, len); - if (ret) + if (vfs_fallocate(req->ns->file, mode, offset, len)) { + ret = NVME_SC_INTERNAL | NVME_SC_DNR; break; + } } - nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + nvmet_req_complete(req, ret); } static void nvmet_file_dsm_work(struct work_struct *w) From 9c891c139894ce2ec0ca585a00e0bec5e6b4ccab Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Jul 2018 16:13:04 +0300 Subject: [PATCH 095/190] nvmet: check fileio lba range access boundaries Fail out-of-bounds with a proper status code. Fixes: d5eff33ee6f8 ("nvmet: add simple file backed ns support") Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-file.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index dad8d44bf90e85..c2d0d08b59c8e1 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -145,6 +145,12 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) return; } + pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; + if (unlikely(pos + req->data_len > req->ns->size)) { + nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); + return; + } + if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_KERNEL); @@ -160,8 +166,6 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) is_sync = true; } - pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; - memset(&req->f.iocb, 0, sizeof(struct kiocb)); for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) { nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter); @@ -236,8 +240,14 @@ static void nvmet_file_execute_discard(struct nvmet_req *req) sizeof(range)); if (ret) break; + offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; len = le32_to_cpu(range.nlb) << req->ns->blksize_shift; + if (offset + len > req->ns->size) { + ret = NVME_SC_LBA_RANGE | NVME_SC_DNR; + break; + } + if (vfs_fallocate(req->ns->file, mode, offset, len)) { ret = NVME_SC_INTERNAL | NVME_SC_DNR; break; @@ -283,6 +293,11 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w) len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << req->ns->blksize_shift); + if (unlikely(offset + len > req->ns->size)) { + nvmet_req_complete(req, 
NVME_SC_LBA_RANGE | NVME_SC_DNR); + return; + } + ret = vfs_fallocate(req->ns->file, mode, offset, len); nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); } From 1b0d274523df5ef1caedc834da055ff721e4d4f0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 17 Jul 2018 17:17:36 +0300 Subject: [PATCH 096/190] nvmet: don't use uuid_le type Don't use sizeof(uuid_le) where none of the parameters is type of uuid_le. Since both arguments are u8 [16], use size of destination there. Moreover, uuid_le is a deprecated type, and nvmet is using uuid_t already. Signed-off-by: Andy Shevchenko Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 837bbdbfaa4bb2..16a9b24270f967 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -338,7 +338,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) */ id->nmic = (1 << 0); - memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le)); + memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid)); id->lbaf[0].ds = ns->blksize_shift; From 76f17d8ba1cbc3d2786955b2f15e071da93527cd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 23 Jul 2018 14:18:33 -0700 Subject: [PATCH 097/190] block: Rename the null_blk_mod kernel module back into null_blk Commit ca4b2a011948 ("null_blk: add zone support") breaks several blktests scripts because it renamed the null_blk kernel module into null_blk_mod. Hence rename null_blk_mod back into null_blk. Fixes: ca4b2a011948 ("null_blk: add zone support") Signed-off-by: Bart Van Assche Cc: Matias Bjorling Cc: Christoph Hellwig Cc: Ming Lei Cc: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/Makefile | 6 +++--- drivers/block/{null_blk.c => null_blk_main.c} | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename drivers/block/{null_blk.c => null_blk_main.c} (100%) diff --git a/drivers/block/Makefile b/drivers/block/Makefile index a0d88aa0c05d60..8566b188368b36 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -38,9 +38,9 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_ZRAM) += zram/ -obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk_mod.o -null_blk_mod-objs := null_blk.o -null_blk_mod-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o +null_blk-objs := null_blk_main.o +null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk_main.c similarity index 100% rename from drivers/block/null_blk.c rename to drivers/block/null_blk_main.c From 24d5493f207ce0ce38df80ce86c907417e04594a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 14:04:12 +0200 Subject: [PATCH 098/190] block: simplify bio_check_pages_dirty bio_check_pages_dirty currently violates the invariant that bv_page of a bio_vec inside bi_vcnt shouldn't be zero, and that is going to become really annoying with multipage biovecs. Fortunately there isn't any particularly good reason for it - once we decide to defer freeing the bio to a workqueue, holding onto a few additional pages isn't really an issue anymore. So just check if there is a clean page that needs dirtying in the first pass, and do a second pass to free them if there was none, while the cache is still hot.
Also use the chance to micro-optimize bio_dirty_fn a bit by not saving irq state - we know we are called from a workqueue. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 56 ++++++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8ecc95615941f4..504b4227809913 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1649,19 +1649,15 @@ static void bio_release_pages(struct bio *bio) struct bio_vec *bvec; int i; - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page) - put_page(page); - } + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); } /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on - * the BIO and the offending pages and re-dirty the pages in process context. + * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will run one put_page() against each page and will run one @@ -1679,52 +1675,42 @@ static struct bio *bio_dirty_list; */ static void bio_dirty_fn(struct work_struct *work) { - unsigned long flags; - struct bio *bio; + struct bio *bio, *next; - spin_lock_irqsave(&bio_dirty_lock, flags); - bio = bio_dirty_list; + spin_lock_irq(&bio_dirty_lock); + next = bio_dirty_list; bio_dirty_list = NULL; - spin_unlock_irqrestore(&bio_dirty_lock, flags); + spin_unlock_irq(&bio_dirty_lock); - while (bio) { - struct bio *next = bio->bi_private; + while ((bio = next) != NULL) { + next = bio->bi_private; bio_set_pages_dirty(bio); bio_release_pages(bio); bio_put(bio); - bio = next; } } void bio_check_pages_dirty(struct bio *bio) { struct bio_vec *bvec; - int nr_clean_pages = 0; + unsigned long flags; int i; bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (PageDirty(page) || PageCompound(page)) { - put_page(page); - bvec->bv_page = NULL; - } else { - nr_clean_pages++; - } + if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) + goto defer; } - if (nr_clean_pages) { - unsigned long flags; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio->bi_private = bio_dirty_list; - bio_dirty_list = bio; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } else { - bio_put(bio); - } + bio_release_pages(bio); + bio_put(bio); + return; +defer: + spin_lock_irqsave(&bio_dirty_lock, flags); + bio->bi_private = bio_dirty_list; + bio_dirty_list = bio; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); From 3bb5098310317ca62304bd21af6ccea57d799b06 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 14:04:13 +0200 Subject: [PATCH 099/190] block: bio_set_pages_dirty can't see NULL bv_page in a valid bio_vec So don't bother handling it. 
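A minimal stand-alone model of the new two-pass logic (dirty[], release() and defer() are invented stand-ins for the page dirty flags, bio_release_pages() and the deferral to bio_dirty_fn; this is an illustration, not the block layer code):

#include <stdbool.h>
#include <stdio.h>

static void release(int n)  { printf("release %d pages, free bio\n", n); }
static void defer(void)     { printf("defer whole bio to workqueue for re-dirtying\n"); }

static void check_pages_dirty(const bool *dirty, int n)
{
	int i;

	/* first pass: is there any clean page that must be re-dirtied? */
	for (i = 0; i < n; i++) {
		if (!dirty[i]) {
			defer();	/* keep every page until the worker runs */
			return;
		}
	}

	/* all pages still dirty: free everything right away, cache-hot */
	release(n);
}

int main(void)
{
	bool all_dirty[3]  = { true, true, true };
	bool some_clean[3] = { true, false, true };

	check_pages_dirty(all_dirty, 3);
	check_pages_dirty(some_clean, 3);
	return 0;
}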
Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index 504b4227809913..07d3ffd95989bc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1636,10 +1636,8 @@ void bio_set_pages_dirty(struct bio *bio) int i; bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page && !PageCompound(page)) - set_page_dirty_lock(page); + if (!PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); From c8b27acc775990bbd01f067ee7616f7abf2c98a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 09:52:30 +0200 Subject: [PATCH 100/190] bcache: don't clone bio in bch_data_verify We immediately overwrite the biovec array, so instead just allocate a new bio and copy over the disk, sector and size. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Acked-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/debug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index d030ce3025a6a6..04d1467119500e 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) struct bio_vec bv, cbv; struct bvec_iter iter, citer = { 0 }; - check = bio_clone_kmalloc(bio, GFP_NOIO); + check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); if (!check) return; + check->bi_disk = bio->bi_disk; check->bi_opf = REQ_OP_READ; + check->bi_iter.bi_sector = bio->bi_iter.bi_sector; + check->bi_iter.bi_size = bio->bi_iter.bi_size; + bch_bio_map(check, NULL); if (bch_bio_alloc_pages(check, GFP_NOIO)) goto out_put; From 076ff2f0b877df4ace6604480f9b1278e61719b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 09:52:31 +0200 Subject: [PATCH 101/190] exofs: use bio_clone_fast in _write_mirror The mirroring code never changes the bio data or biovecs. This means we can reuse the biovec allocation easily instead of duplicating it. Signed-off-by: Christoph Hellwig Acked-by: Boaz Harrosh Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- fs/exofs/ore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 1b8b44637e7065..5331a15a61f198 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -873,8 +873,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct bio *bio; if (per_dev != master_dev) { - bio = bio_clone_kmalloc(master_dev->bio, - GFP_KERNEL); + bio = bio_clone_fast(master_dev->bio, + GFP_KERNEL, NULL); if (unlikely(!bio)) { ORE_DBGMSG( "Failed to allocate BIO size=%u\n", From 071f52fbce6161706d070ceada5accb81630bf02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 09:52:32 +0200 Subject: [PATCH 102/190] block: remove bio_clone_kmalloc Unused now.
Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index ab221c517f4ecd..b861baa59454ca 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -443,12 +443,6 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); } -static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) -{ - return bio_clone_bioset(bio, gfp_mask, NULL); - -} - extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); From 3ed122e68bb2dcab36b48109431341736e37ce85 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 09:52:33 +0200 Subject: [PATCH 103/190] md: remove a bogus comment The function name mentioned doesn't exist, and the code next to it doesn't match the description either. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/md/md.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f6e58dbca0d44b..cb4eb5faa519f3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -204,10 +204,6 @@ static int start_readonly; */ static bool create_on_open = true; -/* bio_clone_mddev - * like bio_clone_bioset, but with a local bio set - */ - struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev) { From c55183c9aaa00d2bbb578169a480e31aff3d397c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Jul 2018 09:52:34 +0200 Subject: [PATCH 104/190] block: unexport bio_clone_bioset Now only used by the bounce code, so move it there and mark the function static. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 77 --------------------------------------------- block/bounce.c | 69 +++++++++++++++++++++++++++++++++++++++- include/linux/bio.h | 1 - 3 files changed, 68 insertions(+), 79 deletions(-) diff --git a/block/bio.c b/block/bio.c index 07d3ffd95989bc..b832151cd0bf42 100644 --- a/block/bio.c +++ b/block/bio.c @@ -646,83 +646,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); -/** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) -{ - struct bvec_iter iter; - struct bio_vec bv; - struct bio *bio; - - /* - * Pre immutable biovecs, __bio_clone() used to just do a memcpy from - * bio_src->bi_io_vec to bio->bi_io_vec. - * - * We can't do that anymore, because: - * - * - The point of cloning the biovec is to produce a bio with a biovec - * the caller can modify: bi_idx and bi_bvec_done should be 0. - * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if - * we tried to clone the whole thing bio_alloc_bioset() would fail. - * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. - * - * - Lastly, bi_vcnt should not be looked at or relied upon by code - * that does not own the bio - reason being drivers don't use it for - * iterating over the biovec anymore, so expecting it to be kept up - * to date (i.e. 
for clones that share the parent biovec) is just - * asking for trouble and would force extra work on - * __bio_clone_fast() anyways. - */ - - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); - if (!bio) - return NULL; - bio->bi_disk = bio_src->bi_disk; - bio->bi_opf = bio_src->bi_opf; - bio->bi_write_hint = bio_src->bi_write_hint; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - break; - case REQ_OP_WRITE_SAME: - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - break; - default: - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - break; - } - - if (bio_integrity(bio_src)) { - int ret; - - ret = bio_integrity_clone(bio, bio_src, gfp_mask); - if (ret < 0) { - bio_put(bio); - return NULL; - } - } - - bio_clone_blkcg_association(bio, bio_src); - - return bio; -} -EXPORT_SYMBOL(bio_clone_bioset); - /** * bio_add_pc_page - attempt to add page to bio * @q: the target queue diff --git a/block/bounce.c b/block/bounce.c index fd31347b783616..bc63b3a2d18cad 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio) __bounce_end_io_read(bio, &isa_page_pool); } +static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + /* + * Pre immutable biovecs, __bio_clone() used to just do a memcpy from + * bio_src->bi_io_vec to bio->bi_io_vec. + * + * We can't do that anymore, because: + * + * - The point of cloning the biovec is to produce a bio with a biovec + * the caller can modify: bi_idx and bi_bvec_done should be 0. + * + * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * we tried to clone the whole thing bio_alloc_bioset() would fail. + * But the clone should succeed as long as the number of biovecs we + * actually need to allocate is fewer than BIO_MAX_PAGES. + * + * - Lastly, bi_vcnt should not be looked at or relied upon by code + * that does not own the bio - reason being drivers don't use it for + * iterating over the biovec anymore, so expecting it to be kept up + * to date (i.e. for clones that share the parent biovec) is just + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. 
+ */ + + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); + if (!bio) + return NULL; + bio->bi_disk = bio_src->bi_disk; + bio->bi_opf = bio_src->bi_opf; + bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + break; + case REQ_OP_WRITE_SAME: + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + break; + default: + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + break; + } + + if (bio_integrity(bio_src)) { + int ret; + + ret = bio_integrity_clone(bio, bio_src, gfp_mask); + if (ret < 0) { + bio_put(bio); + return NULL; + } + } + + bio_clone_blkcg_association(bio, bio_src); + + return bio; +} + static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, mempool_t *pool) { @@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, generic_make_request(*bio_orig); *bio_orig = bio; } - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL : + bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : &bounce_bio_set); bio_for_each_segment_all(to, bio, i) { diff --git a/include/linux/bio.h b/include/linux/bio.h index b861baa59454ca..51371740d2a8f0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -429,7 +429,6 @@ extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); -extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); extern struct bio_set fs_bio_set; From 42c9cdfe1e11e083dceb0f0c4977b758cf7403b9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 20 Jul 2018 14:57:38 -0400 Subject: [PATCH 105/190] block: allow max_discard_segments to be stacked Set max_discard_segments to USHRT_MAX in blk_set_stacking_limits() so that blk_stack_limits() can stack up this limit for stacked devices. before: $ cat /sys/block/nvme0n1/queue/max_discard_segments 256 $ cat /sys/block/dm-0/queue/max_discard_segments 1 after: $ cat /sys/block/nvme0n1/queue/max_discard_segments 256 $ cat /sys/block/dm-0/queue/max_discard_segments 256 Fixes: 1e739730c5b9e ("block: optionally merge discontiguous discard bios into a single request") Reviewed-by: Christoph Hellwig Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 053de87d1fda6d..ffd459969689df 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -128,7 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) /* Inherit limits from component devices */ lim->max_segments = USHRT_MAX; - lim->max_discard_segments = 1; + lim->max_discard_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; From d3df0ac09654e9db82a882031ccae010f1b7575b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 25 Jul 2018 09:42:07 +0200 Subject: [PATCH 106/190] xen/blkfront: remove unused macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove some macros not used anywhere. 
Acked-by: Roger Pau Monné Signed-off-by: Juergen Gross Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index b5cedccb5d7db1..94300dbe358b36 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -251,14 +251,9 @@ static DEFINE_SPINLOCK(minor_lock); #define GRANTS_PER_INDIRECT_FRAME \ (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment)) -#define PSEGS_PER_INDIRECT_FRAME \ - (GRANTS_INDIRECT_FRAME / GRANTS_PSEGS) - #define INDIRECT_GREFS(_grants) \ DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME) -#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) - static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); static void blkfront_gather_backend_features(struct blkfront_info *info); static int negotiate_mq(struct blkfront_info *info); From 359f642700f2ff05d9c94cd9216c97af7b8e9553 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Wed, 25 Jul 2018 10:22:58 -0400 Subject: [PATCH 107/190] block: move bio_integrity_{intervals,bytes} into blkdev.h This allows bio_integrity_bytes() to be called from drivers instead of open coding it. Acked-by: Martin K. Petersen Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- block/bio-integrity.c | 22 ---------------------- include/linux/blkdev.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index add7c7c853352b..67b5fb861a5100 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -159,28 +159,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_integrity_add_page); -/** - * bio_integrity_intervals - Return number of integrity intervals for a bio - * @bi: blk_integrity profile for device - * @sectors: Size of the bio in 512-byte sectors - * - * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the data integrity - * interval size of the storage device. Convert the block layer sectors - * to the appropriate number of integrity intervals. - */ -static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, - unsigned int sectors) -{ - return sectors >> (bi->interval_exp - 9); -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return bio_integrity_intervals(bi, sectors) * bi->tuple_size; -} - /** * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 331a6cb8805f00..050d599f5ea972 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1865,6 +1865,28 @@ static inline bool integrity_req_gap_front_merge(struct request *req, bip_next->bip_vec[0].bv_offset); } +/** + * bio_integrity_intervals - Return number of integrity intervals for a bio + * @bi: blk_integrity profile for device + * @sectors: Size of the bio in 512-byte sectors + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the data integrity + * interval size of the storage device. Convert the block layer sectors + * to the appropriate number of integrity intervals. 
+ */ +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return sectors >> (bi->interval_exp - 9); +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return bio_integrity_intervals(bi, sectors) * bi->tuple_size; +} + #else /* CONFIG_BLK_DEV_INTEGRITY */ struct bio; @@ -1938,6 +1960,18 @@ static inline bool integrity_req_gap_front_merge(struct request *req, return false; } +static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} + +static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, + unsigned int sectors) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ struct block_device_operations { From cdcdcaae8450a975e7d07e1bfec21f9b8c016d0c Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Thu, 26 Jul 2018 15:52:54 -0400 Subject: [PATCH 108/190] scsi: virtio_scsi: fix pi_bytes{out,in} on 4 KiB block size devices When the underlying device is a 4 KiB logical block size device with a protection interval exponent of 0, i.e. 4096 bytes data + 8 bytes PI, the driver miscalculates the pi_bytes{out,in} by a factor of 8x (64 bytes). This leads to errors on all reads and writes on 4 KiB logical block size devices when CONFIG_BLK_DEV_INTEGRITY is enabled and the VIRTIO_SCSI_F_T10_PI feature bit has been negotiated. Fixes: e6dc783a38ec0 ("virtio-scsi: Enable DIF/DIX modes in SCSI host LLD") Acked-by: Martin K. Petersen Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- drivers/scsi/virtio_scsi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 6dc8891ccb745a..1c72db94270e89 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -513,12 +513,12 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev, if (sc->sc_data_direction == DMA_TO_DEVICE) cmd_pi->pi_bytesout = cpu_to_virtio32(vdev, - blk_rq_sectors(rq) * - bi->tuple_size); + bio_integrity_bytes(bi, + blk_rq_sectors(rq))); else if (sc->sc_data_direction == DMA_FROM_DEVICE) cmd_pi->pi_bytesin = cpu_to_virtio32(vdev, - blk_rq_sectors(rq) * - bi->tuple_size); + bio_integrity_bytes(bi, + blk_rq_sectors(rq))); } #endif From dc30b96ab6d569060741572cf30517d3179429a8 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Fri, 27 Jul 2018 09:09:53 -0600 Subject: [PATCH 109/190] readahead: stricter check for bdi io_pages ondemand_readahead() checks bdi->io_pages to cap the maximum pages that need to be processed. This works until the readit section. If we would do an async only readahead (async size = sync size) and target is at beginning of window we expand the pages by another get_next_ra_size() pages. Btrace for large reads shows that kernel always issues a doubled size read at the beginning of processing. Add an additional check for io_pages in the lower part of the func. The fix helps devices that hard limit bio pages and rely on proper handling of max_hw_read_sectors (e.g. older FusionIO cards). For that reason it could qualify for stable. 
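The clamp added in the lower part of ondemand_readahead() is easy to model on its own. The sketch below is a standalone approximation of that decision; get_next_ra_size() is replaced by a simple doubling stand-in, so treat it as an illustration rather than the mm code:

/* Standalone approximation of the readahead window clamp (not mm code). */
#include <stdio.h>

struct ra_state {
        unsigned long size;             /* readahead window, in pages */
        unsigned long async_size;       /* async part of the window */
};

/* stand-in for get_next_ra_size(); the real policy lives in mm/readahead.c */
static unsigned long next_ra_size(const struct ra_state *ra, unsigned long max)
{
        unsigned long next = 2 * ra->size;

        return next < max ? next : max;
}

static void expand_window(struct ra_state *ra, unsigned long max_pages)
{
        unsigned long add_pages = next_ra_size(ra, max_pages);

        if (ra->size + add_pages <= max_pages) {
                ra->async_size = add_pages;
                ra->size += add_pages;
        } else {
                /* honour the io_pages-derived cap instead of doubling past it */
                ra->size = max_pages;
                ra->async_size = max_pages >> 1;
        }
}

int main(void)
{
        struct ra_state ra = { .size = 96, .async_size = 96 };

        expand_window(&ra, 128);        /* io_pages-derived cap of 128 pages */
        printf("size=%lu async_size=%lu\n", ra.size, ra.async_size);
        return 0;
}

With a 128-page cap the window settles at size=128, async_size=64 instead of the doubled 224-page read the old code would have issued.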
Fixes: 9491ae4a ("mm: don't cap request size based on read-ahead setting") Cc: stable@vger.kernel.org Signed-off-by: Markus Stockhausen stockhausen@collogia.de Signed-off-by: Jens Axboe --- mm/readahead.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 9f62b71511001a..a59ea70527b9bc 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -386,6 +386,7 @@ ondemand_readahead(struct address_space *mapping, { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; + unsigned long add_pages; pgoff_t prev_offset; /* @@ -475,10 +476,17 @@ ondemand_readahead(struct address_space *mapping, * Will this read hit the readahead marker made by itself? * If so, trigger the readahead marker hit now, and merge * the resulted next readahead window into the current one. + * Take care of maximum IO pages as above. */ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max_pages); - ra->size += ra->async_size; + add_pages = get_next_ra_size(ra, max_pages); + if (ra->size + add_pages <= max_pages) { + ra->async_size = add_pages; + ra->size += add_pages; + } else { + ra->size = max_pages; + ra->async_size = max_pages >> 1; + } } return ra_submit(ra, mapping, filp); From 99a27d59bd7b2ce1a82a4e826e8e7881f4d4954d Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Thu, 26 Jul 2018 12:17:33 +0800 Subject: [PATCH 110/190] bcache: simplify the calculation of the total amount of flash dirty data Currently we calculate the total amount of flash only devices dirty data by adding the dirty data of each flash only device under registering locker. It is very inefficient. In this patch, we add a member flash_dev_dirty_sectors in struct cache_set to record the total amount of flash only devices dirty data in real time, so we didn't need to calculate the total amount of dirty data any more. 
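The idea reduces to one shared atomic counter: add to it wherever flash-only dirty sectors are accounted, subtract when a flash-only device is freed, and read it directly instead of walking every device under the registration lock. A standalone sketch of that bookkeeping follows (names echo the patch, but this is not the bcache code):

/* Standalone sketch of the running dirty-sector counter (not bcache code). */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long flash_dev_dirty_sectors;

/* write path: account dirty sectors as they are added */
static void dirty_add(int flash_only, long nr_sectors)
{
        if (flash_only)
                atomic_fetch_add(&flash_dev_dirty_sectors, nr_sectors);
}

/* teardown path: drop the device's remaining dirty sectors */
static void flash_dev_free(long dirty_sectors)
{
        atomic_fetch_sub(&flash_dev_dirty_sectors, dirty_sectors);
}

/* O(1) read; no walk over every device under the registration lock */
static long flash_dirty_total(void)
{
        return atomic_load(&flash_dev_dirty_sectors);
}

int main(void)
{
        dirty_add(1, 4096);     /* flash-only device: counted */
        dirty_add(0, 512);      /* ordinary cached device: not counted */
        printf("dirty: %ld\n", flash_dirty_total());    /* 4096 */
        flash_dev_free(4096);
        printf("dirty: %ld\n", flash_dirty_total());    /* 0 */
        return 0;
}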
Signed-off-by: Tang Junhui Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 ++ drivers/md/bcache/writeback.c | 5 ++++- drivers/md/bcache/writeback.h | 19 ------------------- 4 files changed, 7 insertions(+), 20 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d6bf294f3907da..3226d38bf8594e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -525,6 +525,7 @@ struct cache_set { unsigned devices_max_used; struct list_head cached_devs; uint64_t cached_dev_sectors; + atomic_long_t flash_dev_dirty_sectors; struct closure caching; struct closure sb_write; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fa4058e4320289..cea2a42ea2762a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1311,6 +1311,8 @@ static void flash_dev_free(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); mutex_lock(&bch_register_lock); + atomic_long_sub(bcache_dev_sectors_dirty(d), + &d->c->flash_dev_dirty_sectors); bcache_device_free(d); mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ad45ebe1a74b46..0d2a05074a8169 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc) * flash-only devices */ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - - bcache_flash_devs_sectors_dirty(c); + atomic_long_read(&c->flash_dev_dirty_sectors); /* * Unfortunately there is no control of global dirty data. If the @@ -476,6 +476,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, if (!d) return; + if (UUID_FLASH_ONLY(&c->uuids[inode])) + atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); + stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 610fb01de629c9..3745d7004c478c 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) -{ - uint64_t i, ret = 0; - - mutex_lock(&bch_register_lock); - - for (i = 0; i < c->devices_max_used; i++) { - struct bcache_device *d = c->devices[i]; - - if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) - continue; - ret += bcache_dev_sectors_dirty(d); - } - - mutex_unlock(&bch_register_lock); - - return ret; -} - static inline unsigned offset_to_stripe(struct bcache_device *d, uint64_t offset) { From 5c25c4fc74af40657606dd01df27cc5eb9efb26c Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Thu, 26 Jul 2018 12:17:34 +0800 Subject: [PATCH 111/190] bcache: finish incremental GC In GC thread, we record the latest GC key in gc_done, which is expected to be used for incremental GC, but in currently code, we didn't realize it. When GC runs, front side IO would be blocked until the GC over, it would be a long time if there is a lot of btree nodes. This patch realizes incremental GC, the main ideal is that, when there are front side I/Os, after GC some nodes (100), we stop GC, release locker of the btree node, and go to process the front side I/Os for some times (100 ms), then go back to GC again. 
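The batch-and-yield control flow described above can be shown in isolation. The following is a simplified standalone model of that loop (process a bounded batch, return -EAGAIN when front-side I/O is waiting, sleep, resume); it is not the btree-walking code itself:

/* Simplified standalone model of the incremental-GC loop (not bcache code). */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#define MIN_GC_NODES    100
#define GC_SLEEP_MS     100

static int search_inflight;     /* stands in for c->search_inflight */

/* GC nodes until finished, or until a batch boundary with I/O waiting */
static int gc_pass(unsigned int *done, unsigned int total)
{
        unsigned int start = *done;

        while (*done < total) {
                (*done)++;      /* "garbage collect one btree node" */
                if (search_inflight && *done - start >= MIN_GC_NODES)
                        return -EAGAIN; /* yield to front-side I/O */
        }
        return 0;
}

int main(void)
{
        unsigned int done = 0, total = 350;
        int ret;

        search_inflight = 1;    /* pretend reads/writes are in flight */
        do {
                ret = gc_pass(&done, total);
                if (ret == -EAGAIN) {
                        printf("yield after %u nodes\n", done);
                        usleep(GC_SLEEP_MS * 1000);     /* let I/O run */
                }
        } while (ret == -EAGAIN);
        printf("gc finished, %u nodes\n", done);
        return 0;
}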
By this patch, when we doing GC, I/Os are not blocked all the time, and there is no obvious I/Os zero jump problem any more. Patch v2: Rename some variables and macros name as Coly suggested. Signed-off-by: Tang Junhui Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 5 +++++ drivers/md/bcache/btree.c | 14 +++++++++++++- drivers/md/bcache/request.c | 3 +++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 3226d38bf8594e..872ef4d677115f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -474,6 +474,7 @@ struct cache { struct gc_stat { size_t nodes; + size_t nodes_pre; size_t key_bytes; size_t nkeys; @@ -603,6 +604,10 @@ struct cache_set { * rescale; when it hits 0 we rescale all the bucket priorities. */ atomic_t rescale; + /* + * used for GC, identify if any front side I/Os is inflight + */ + atomic_t search_inflight; /* * When we invalidate buckets, we use both the priority and the amount * of good data to determine which buckets to reuse first - to weight diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 547c9eedc2f4fa..b4407ba12667d2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -90,6 +90,8 @@ #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 +#define MIN_GC_NODES 100 +#define GC_SLEEP_MS 100 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) @@ -1585,6 +1587,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); r->b = NULL; + if (atomic_read(&b->c->search_inflight) && + gc->nodes >= gc->nodes_pre + MIN_GC_NODES) { + gc->nodes_pre = gc->nodes; + ret = -EAGAIN; + break; + } + if (need_resched()) { ret = -EAGAIN; break; @@ -1753,7 +1762,10 @@ static void bch_btree_gc(struct cache_set *c) closure_sync(&writes); cond_resched(); - if (ret && ret != -EAGAIN) + if (ret == -EAGAIN) + schedule_timeout_interruptible(msecs_to_jiffies + (GC_SLEEP_MS)); + else if (ret) pr_warn("gc failed!"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 97707b0c54ce05..43af905920f545 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -701,6 +701,8 @@ static void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); + atomic_dec(&s->d->c->search_inflight); + if (s->iop.bio) bio_put(s->iop.bio); @@ -718,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio, closure_init(&s->cl, NULL); do_bio_hook(s, bio, request_endio); + atomic_inc(&d->c->search_inflight); s->orig_bio = bio; s->cache_miss = NULL; From 7f4a59de28137aae4316a58f501b599ac3b87395 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Thu, 26 Jul 2018 12:17:35 +0800 Subject: [PATCH 112/190] bcache: calculate the number of incremental GC nodes according to the total of btree nodes This patch base on "[PATCH] bcache: finish incremental GC". Since incremental GC would stop 100ms when front side I/O comes, so when there are many btree nodes, if GC only processes constant (100) nodes each time, GC would last a long time, and the front I/Os would run out of the buckets (since no new bucket can be allocated during GC), and I/Os be blocked again. So GC should not process constant nodes, but varied nodes according to the number of btree nodes. 
In this patch, GC is divided into constant (100) times, so when there are many btree nodes, GC can process more nodes each time, otherwise GC will process less nodes each time (but no less than MIN_GC_NODES). Signed-off-by: Tang Junhui Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index b4407ba12667d2..475008fbbaab6a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -90,6 +90,7 @@ #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 +#define MAX_GC_TIMES 100 #define MIN_GC_NODES 100 #define GC_SLEEP_MS 100 @@ -1522,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b) return ret; } +static size_t btree_gc_min_nodes(struct cache_set *c) +{ + size_t min_nodes; + + /* + * Since incremental GC would stop 100ms when front + * side I/O comes, so when there are many btree nodes, + * if GC only processes constant (100) nodes each time, + * GC would last a long time, and the front side I/Os + * would run out of the buckets (since no new bucket + * can be allocated during GC), and be blocked again. + * So GC should not process constant nodes, but varied + * nodes according to the number of btree nodes, which + * realized by dividing GC into constant(100) times, + * so when there are many btree nodes, GC can process + * more nodes each time, otherwise, GC will process less + * nodes each time (but no less than MIN_GC_NODES) + */ + min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; + if (min_nodes < MIN_GC_NODES) + min_nodes = MIN_GC_NODES; + + return min_nodes; +} + + static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { @@ -1588,7 +1615,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, r->b = NULL; if (atomic_read(&b->c->search_inflight) && - gc->nodes >= gc->nodes_pre + MIN_GC_NODES) { + gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { gc->nodes_pre = gc->nodes; ret = -EAGAIN; break; @@ -1846,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) do { k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); - if (k) + if (k) { btree_node_prefetch(b, k); + /* + * initiallize c->gc_stats.nodes + * for incremental GC + */ + b->c->gc_stats.nodes++; + } if (p) ret = btree(check_recurse, p, b, op); From 94f71c16062e86069fb87dfa9b6683e2f1c21232 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Thu, 26 Jul 2018 12:17:36 +0800 Subject: [PATCH 113/190] bcache: fix I/O significant decline while backend devices registering I attached several backend devices in the same cache set, and produced lots of dirty data by running small rand I/O writes in a long time, then I continue run I/O in the others cached devices, and stopped a cached device, after a mean while, I register the stopped device again, I see the running I/O in the others cached devices dropped significantly, sometimes even jumps to zero. In currently code, bcache would traverse each keys and btree node to count the dirty data under read locker, and the writes threads can not get the btree write locker, and when there is a lot of keys and btree node in the registering device, it would last several seconds, so the write I/Os in others cached device are blocked and declined significantly. 
In this patch, when a device registering to a ache set, which exist others cached devices with running I/Os, we get the amount of dirty data of the device in an incremental way, and do not block other cached devices all the time. Patch v2: Rename some variables and macros name as Coly suggested. Signed-off-by: Tang Junhui Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 0d2a05074a8169..912e969fedbacb 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -676,10 +676,14 @@ static int bch_writeback_thread(void *arg) } /* Init */ +#define INIT_KEYS_EACH_TIME 500000 +#define INIT_KEYS_SLEEP_MS 100 struct sectors_dirty_init { struct btree_op op; unsigned inode; + size_t count; + struct bkey start; }; static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, @@ -694,18 +698,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + op->count++; + if (atomic_read(&b->c->search_inflight) && + !(op->count % INIT_KEYS_EACH_TIME)) { + bkey_copy_key(&op->start, k); + return -EAGAIN; + } + return MAP_CONTINUE; } void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; + int ret; bch_btree_op_init(&op.op, -1); op.inode = d->id; - - bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), - sectors_dirty_init_fn, 0); + op.count = 0; + op.start = KEY(op.inode, 0, 0); + + do { + ret = bch_btree_map_keys(&op.op, d->c, &op.start, + sectors_dirty_init_fn, 0); + if (ret == -EAGAIN) + schedule_timeout_interruptible( + msecs_to_jiffies(INIT_KEYS_SLEEP_MS)); + else if (ret < 0) { + pr_warn("sectors dirty init failed, ret=%d!", ret); + break; + } + } while (ret == -EAGAIN); } void bch_cached_dev_writeback_init(struct cached_dev *dc) From a56489d4b3c914eb30b724ff25debc2e59c7950e Mon Sep 17 00:00:00 2001 From: Florian Schmaus Date: Thu, 26 Jul 2018 12:17:37 +0800 Subject: [PATCH 114/190] bcache: do not assign in if condition register_bcache() Fixes an error condition reported by checkpatch.pl which is caused by assigning a variable in an if condition. Signed-off-by: Florian Schmaus Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index cea2a42ea2762a..093b3789ce05b1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2165,8 +2165,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; - if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || - !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) + path = kstrndup(buffer, size, GFP_KERNEL); + if (!path) + goto err; + + sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); + if (!sb) goto err; err = "failed to open device"; From 6268dc2c4703aabfb0b35681be709acf4c2826c6 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 26 Jul 2018 12:17:38 +0800 Subject: [PATCH 115/190] bcache: free heap cache_set->flush_btree in bch_journal_free Free the cache_set->flush_bree heap memory on journal free. 
Signed-off-by: Wang Sheng-Hui Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 18f1b523962042..10748c626a1dd8 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c) free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); free_fifo(&c->journal.pin); + free_heap(&c->flush_btree); } int bch_journal_alloc(struct cache_set *c) From 16c1fdf4cfd6c0091e59b93ec2cb7e99973f8244 Mon Sep 17 00:00:00 2001 From: Florian Schmaus Date: Thu, 26 Jul 2018 12:17:39 +0800 Subject: [PATCH 116/190] bcache: do not assign in if condition in bcache_init() Fixes an error condition reported by checkpatch.pl which is caused by assigning a variable in an if condition. Signed-off-by: Florian Schmaus Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 093b3789ce05b1..a852018964adbe 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2330,9 +2330,15 @@ static int __init bcache_init(void) return bcache_major; } - if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || - !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || - bch_request_init() || + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + if (!bcache_wq) + goto err; + + bcache_kobj = kobject_create_and_add("bcache", fs_kobj); + if (!bcache_kobj) + goto err; + + if (bch_request_init() || bch_debug_init(bcache_kobj) || closure_debug_init() || sysfs_create_files(bcache_kobj, files)) goto err; From 9b4e9f5abb94b671f998ffc0efebd1582852fee3 Mon Sep 17 00:00:00 2001 From: Florian Schmaus Date: Thu, 26 Jul 2018 12:17:40 +0800 Subject: [PATCH 117/190] bcache: do not assign in if condition in bcache_device_init() Fixes an error condition reported by checkpatch.pl which is caused by assigning a variable in an if condition. 
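The style issue behind these checkpatch cleanups is the same in each case. A toy example (not bcache code) of the discouraged form and its rewrite:

/* Toy illustration of the assignment-in-if cleanup (not bcache code). */
#include <stdio.h>
#include <stdlib.h>

/* discouraged: two allocations buried inside one condition */
static int setup_old(char **a, char **b)
{
        if (!(*a = malloc(16)) || !(*b = malloc(16)))
                return -1;      /* shared error path, harder to extend */
        return 0;
}

/* preferred: one statement per allocation, explicit unwinding */
static int setup_new(char **a, char **b)
{
        *a = malloc(16);
        if (!*a)
                return -1;

        *b = malloc(16);
        if (!*b) {
                free(*a);
                return -1;
        }
        return 0;
}

int main(void)
{
        char *a = NULL, *b = NULL;

        if (setup_new(&a, &b) == 0) {
                printf("allocated\n");
                free(a);
                free(b);
        }
        (void)setup_old;        /* kept only for comparison */
        return 0;
}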
Signed-off-by: Florian Schmaus Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a852018964adbe..40fe26fef00fee 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -796,11 +796,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, return idx; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || - !(d->disk = alloc_disk(BCACHE_MINORS))) { - ida_simple_remove(&bcache_device_idx, idx); - return -ENOMEM; - } + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + goto err; + + d->disk = alloc_disk(BCACHE_MINORS); + if (!d->disk) + goto err; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -834,6 +835,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, blk_queue_write_cache(q, true, true); return 0; + +err: + ida_simple_remove(&bcache_device_idx, idx); + return -ENOMEM; + } /* Cached device */ From 75cbb3f1d840429e6aa67b351332f38b29e6292c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Jul 2018 12:17:41 +0800 Subject: [PATCH 118/190] bcache: stop using the deprecated get_seconds() The get_seconds function is deprecated now since it returns a 32-bit value that will eventually overflow, and we are replacing it throughout the kernel with ktime_get_seconds() or ktime_get_real_seconds() that return a time64_t. bcache uses get_seconds() to read the current system time and store it in the superblock as well as in uuid_entry structures that are user visible. Unfortunately, the two structures in are still limited to 32 bits, so this won't fix any real problems but will still overflow in year 2106. Let's at least document that properly, in case we get an updated format in the future it can be fixed. We still have a long time before the overflow and checking the tools at https://github.com/koverstreet/bcache-tools reveals no access to any of them. 
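Because the on-disk fields remain 32 bits wide, the change amounts to computing the time in 64 bits and truncating on store. A standalone illustration of what that truncation implies (time() stands in for ktime_get_real_seconds() here):

/* Userspace illustration of the 32-bit on-disk timestamp truncation. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        /* models the time64_t value from ktime_get_real_seconds() */
        int64_t now = (int64_t)time(NULL);

        /* what the superblock and uuid_entry fields actually store */
        uint32_t on_disk = (uint32_t)now;

        printf("now (64-bit)    = %" PRId64 "\n", now);
        printf("stored (32-bit) = %" PRIu32 "\n", on_disk);
        /* 4294967295 seconds after the epoch lands in February 2106 */
        printf("field overflows = %" PRIu32 "\n", UINT32_MAX);
        return 0;
}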
Signed-off-by: Arnd Bergmann Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 12 ++++++------ include/uapi/linux/bcache.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 40fe26fef00fee..e0a92104ca2318 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, goto err; } - sb->last_mount = get_seconds(); + sb->last_mount = (u32)ktime_get_real_seconds(); err = NULL; get_page(bh->b_page); @@ -701,7 +701,7 @@ static void bcache_device_detach(struct bcache_device *d) SET_UUID_FLASH_ONLY(u, 0); memcpy(u->uuid, invalid_uuid, 16); - u->invalidated = cpu_to_le32(get_seconds()); + u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); bch_uuid_write(d->c); } @@ -1033,7 +1033,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint8_t *set_uuid) { - uint32_t rtime = cpu_to_le32(get_seconds()); + uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); struct uuid_entry *u; struct cached_dev *exist_dc, *t; @@ -1076,7 +1076,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { memcpy(u->uuid, invalid_uuid, 16); - u->invalidated = cpu_to_le32(get_seconds()); + u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds()); u = NULL; } @@ -1398,7 +1398,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) get_random_bytes(u->uuid, 16); memset(u->label, 0, 32); - u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); + u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds()); SET_UUID_FLASH_ONLY(u, 1); u->sectors = size >> 9; @@ -1902,7 +1902,7 @@ static void run_cache_set(struct cache_set *c) goto err; closure_sync(&cl); - c->sb.last_mount = get_seconds(); + c->sb.last_mount = (u32)ktime_get_real_seconds(); bcache_write_super(c); list_for_each_entry_safe(dc, t, &uncached_devices, list) diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 821f71a2e48fa6..8d19e02d752a51 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -195,7 +195,7 @@ struct cache_sb { }; }; - __u32 last_mount; /* time_t */ + __u32 last_mount; /* time overflow in y2106 */ __u16 first_bucket; union { @@ -318,7 +318,7 @@ struct uuid_entry { struct { __u8 uuid[16]; __u8 label[32]; - __u32 first_reg; + __u32 first_reg; /* time overflow in y2106 */ __u32 last_reg; __u32 invalidated; From 14cb2c8a6c5dae57ee3e2da10fa3db2b9087e39e Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 25 Jul 2018 22:46:28 -0300 Subject: [PATCH 119/190] partitions/aix: fix usage of uninitialized lv_info and lvname structures The if-block that sets a successful return value in aix_partition() uses 'lvip[].pps_per_lv' and 'n[].name' potentially uninitialized. For example, if 'numlvs' is zero or alloc_lvn() fails, neither is initialized, but are used anyway if alloc_pvd() succeeds after it. So, make the alloc_pvd() call conditional on their initialization. This has been hit when attaching an apparently corrupted/stressed AIX LUN, misleading the kernel to pr_warn() invalid data and hang. [...] partition (null) (11 pp's found) is not contiguous [...] partition (null) (2 pp's found) is not contiguous [...] partition (null) (3 pp's found) is not contiguous [...] 
partition (null) (64 pp's found) is not contiguous Fixes: 6ceea22bbbc8 ("partitions: add aix lvm partition support files") Signed-off-by: Mauricio Faria de Oliveira Signed-off-by: Jens Axboe --- block/partitions/aix.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/partitions/aix.c b/block/partitions/aix.c index 007f95eea0e1a9..850cbd1860d4ca 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -178,7 +178,7 @@ int aix_partition(struct parsed_partitions *state) u32 vgda_sector = 0; u32 vgda_len = 0; int numlvs = 0; - struct pvd *pvd; + struct pvd *pvd = NULL; struct lv_info { unsigned short pps_per_lv; unsigned short pps_found; @@ -232,10 +232,11 @@ int aix_partition(struct parsed_partitions *state) if (lvip[i].pps_per_lv) foundlvs += 1; } + /* pvd loops depend on n[].name and lvip[].pps_per_lv */ + pvd = alloc_pvd(state, vgda_sector + 17); } put_dev_sector(sect); } - pvd = alloc_pvd(state, vgda_sector + 17); if (pvd) { int numpps = be16_to_cpu(pvd->pp_count); int psn_part1 = be32_to_cpu(pvd->psn_part1); From d43fdae7bac2def8c4314b5a49822cb7f08a45f1 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Wed, 25 Jul 2018 22:46:29 -0300 Subject: [PATCH 120/190] partitions/aix: append null character to print data from disk Even if properly initialized, the lvname array (i.e., strings) is read from disk, and might contain corrupt data (e.g., lack the null terminating character for strings). So, make sure the partition name string used in pr_warn() has the null terminating character. Fixes: 6ceea22bbbc8 ("partitions: add aix lvm partition support files") Suggested-by: Daniel J. Axtens Signed-off-by: Mauricio Faria de Oliveira Signed-off-by: Jens Axboe --- block/partitions/aix.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/partitions/aix.c b/block/partitions/aix.c index 850cbd1860d4ca..903f3ed175d026 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -283,10 +283,14 @@ int aix_partition(struct parsed_partitions *state) next_lp_ix += 1; } for (i = 0; i < state->limit; i += 1) - if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) { + char tmp[sizeof(n[i].name) + 1]; // null char + + snprintf(tmp, sizeof(tmp), "%s", n[i].name); pr_warn("partition %s (%u pp's found) is " "not contiguous\n", - n[i].name, lvip[i].pps_found); + tmp, lvip[i].pps_found); + } kfree(pvd); } kfree(n); From 9b89bc3857a6c0dfda18ddae2a42c114ecc32753 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 12 May 2018 18:18:12 +0200 Subject: [PATCH 121/190] nvme.h: add support for the log specific field NVMe 1.3 added a new log specific field to the get log page CQ defintion, add it to our get_log_page SQ structure. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 80dfedcf0bf722..39f05b0b8c7f77 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -885,7 +885,7 @@ struct nvme_get_log_page_command { __u64 rsvd2[2]; union nvme_data_ptr dptr; __u8 lid; - __u8 rsvd10; + __u8 lsp; /* upper 4 bits reserved */ __le16 numdl; __le16 numdu; __u16 rsvd11; From 1a37621658fe06b10cf8bac02c32304d2a1c888c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 May 2018 18:53:57 +0200 Subject: [PATCH 122/190] nvme.h: add ANA definitions Add various defintions from NVMe 1.3 TP 4004. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- include/linux/nvme.h | 50 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 39f05b0b8c7f77..64c9175723de70 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -242,7 +242,12 @@ struct nvme_id_ctrl { __le32 sanicap; __le32 hmminds; __le16 hmmaxd; - __u8 rsvd338[174]; + __u8 rsvd338[4]; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __u8 rsvd352[160]; __u8 sqes; __u8 cqes; __le16 maxcmd; @@ -258,7 +263,8 @@ struct nvme_id_ctrl { __le16 acwu; __u8 rsvd534[2]; __le32 sgls; - __u8 rsvd540[228]; + __le32 mnan; + __u8 rsvd544[224]; char subnqn[256]; __u8 rsvd1024[768]; __le32 ioccsz; @@ -312,7 +318,9 @@ struct nvme_id_ns { __le16 nabspf; __le16 noiob; __u8 nvmcap[16]; - __u8 rsvd64[40]; + __u8 rsvd64[28]; + __le32 anagrpid; + __u8 rsvd96[8]; __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; @@ -425,6 +433,32 @@ struct nvme_effects_log { __u8 resv[2048]; }; +enum nvme_ana_state { + NVME_ANA_OPTIMIZED = 0x01, + NVME_ANA_NONOPTIMIZED = 0x02, + NVME_ANA_INACCESSIBLE = 0x03, + NVME_ANA_PERSISTENT_LOSS = 0x04, + NVME_ANA_CHANGE = 0x0f, +}; + +struct nvme_ana_group_desc { + __le32 grpid; + __le32 nnsids; + __le64 chgcnt; + __u8 state; + __u8 rsvd17[7]; + __le32 nsids[]; +}; + +/* flag for the log specific field of the ANA log */ +#define NVME_ANA_LOG_RGO (1 << 0) + +struct nvme_ana_rsp_hdr { + __le64 chgcnt; + __le16 ngrps; + __le16 rsvd10[3]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -444,11 +478,13 @@ enum { enum { NVME_AER_NOTICE_NS_CHANGED = 0x00, NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, + NVME_AER_NOTICE_ANA = 0x03, }; enum { NVME_AEN_CFG_NS_ATTR = 1 << 8, NVME_AEN_CFG_FW_ACT = 1 << 9, + NVME_AEN_CFG_ANA_CHANGE = 1 << 11, }; struct nvme_lba_range_type { @@ -763,6 +799,7 @@ enum { NVME_LOG_FW_SLOT = 0x03, NVME_LOG_CHANGED_NS = 0x04, NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_ANA = 0x0c, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -1185,6 +1222,13 @@ enum { NVME_SC_ACCESS_DENIED = 0x286, NVME_SC_UNWRITTEN_BLOCK = 0x287, + /* + * Path-related Errors: + */ + NVME_SC_ANA_PERSISTENT_LOSS = 0x301, + NVME_SC_ANA_INACCESSIBLE = 0x302, + NVME_SC_ANA_TRANSITION = 0x303, + NVME_SC_DNR = 0x4000, }; From 0e98719b0e4b48b61965e1d1cba037c2005d01d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Jun 2018 14:39:00 +0200 Subject: [PATCH 123/190] nvme: simplify the API for getting log pages Merge nvme_get_log and nvme_get_log_ext into a single helper, which takes a plain nsid 
instead of the nvme_ns pointer. Also add support for the log specific field while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 32 +++++++++++--------------------- drivers/nvme/host/lightnvm.c | 5 +++-- drivers/nvme/host/nvme.h | 4 ++-- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e77e6418a21cb3..4552167f8b2be8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2249,21 +2249,16 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) return ret; } -int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 log_page, void *log, - size_t size, u64 offset) +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, + void *log, size_t size, u64 offset) { struct nvme_command c = { }; unsigned long dwlen = size / 4 - 1; c.get_log_page.opcode = nvme_admin_get_log_page; - - if (ns) - c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id); - else - c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL); - + c.get_log_page.nsid = cpu_to_le32(nsid); c.get_log_page.lid = log_page; + c.get_log_page.lsp = lsp; c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); @@ -2272,12 +2267,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, - size_t size) -{ - return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0); -} - static int nvme_get_effects_log(struct nvme_ctrl *ctrl) { int ret; @@ -2288,8 +2277,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) if (!ctrl->effects) return 0; - ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, - sizeof(*ctrl->effects)); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, + ctrl->effects, sizeof(*ctrl->effects), 0); if (ret) { kfree(ctrl->effects); ctrl->effects = NULL; @@ -3208,7 +3197,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. 
*/ - error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, + log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -3325,9 +3315,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) - dev_warn(ctrl->device, - "Get FW SLOT INFO log error\n"); + if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log, + sizeof(*log), 0)) + dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index d9e4cccd5b66c0..7e4cf4eb9d6604 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -604,8 +604,9 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, while (left) { len = min_t(unsigned int, left, max_len); - ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK, - dev_meta, len, offset); + ret = nvme_get_log(ctrl, ns->head->ns_id, + NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, + offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4ad0c8ad2a2747..f463eaf56db5e7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -443,8 +443,8 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); -int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 log_page, void *log, size_t size, u64 offset); +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, + void *log, size_t size, u64 offset); extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; From 8decf5d5b9f3f72b802a017b0b035f7db0592acf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Jun 2018 08:43:00 +0200 Subject: [PATCH 124/190] nvme: remove nvme_req_needs_failover Now that we just call out to blk_path_error there isn't really any good reason to not merge it into the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/multipath.c | 7 ------- drivers/nvme/host/nvme.h | 6 ------ 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4552167f8b2be8..456d37a02ea35f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -236,7 +236,8 @@ void nvme_complete_rq(struct request *req) trace_nvme_complete_rq(req); if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if (nvme_req_needs_failover(req, status)) { + if ((req->cmd_flags & REQ_NVME_MPATH) && + blk_path_error(status)) { nvme_failover_req(req); return; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 1ffd3e8b13a18d..348aa405b641c9 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -56,13 +56,6 @@ void nvme_failover_req(struct request *req) kblockd_schedule_work(&ns->head->requeue_work); } -bool nvme_req_needs_failover(struct request *req, blk_status_t error) -{ - if (!(req->cmd_flags & REQ_NVME_MPATH)) - return false; - return blk_path_error(error); -} - void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f463eaf56db5e7..07452adef11069 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -453,7 +453,6 @@ extern const struct block_device_operations nvme_ns_head_ops; void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); void nvme_failover_req(struct request *req); -bool nvme_req_needs_failover(struct request *req, blk_status_t error); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns_head *head); @@ -490,11 +489,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, static inline void nvme_failover_req(struct request *req) { } -static inline bool nvme_req_needs_failover(struct request *req, - blk_status_t error) -{ - return false; -} static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { } From 0d0b660f214dc4905db7b6bc998bad0c16dfb1ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 May 2018 08:48:54 +0200 Subject: [PATCH 125/190] nvme: add ANA support Add support for Asynchronous Namespace Access as specified in NVMe 1.3 TP 4004. With ANA each namespace attached to a controller belongs to an ANA group that describes the characteristics of accessing the namespaces through this controller. In the optimized and non-optimized states namespaces can be accessed regularly, although in a multi-pathing environment we should always prefer to access a namespace through a controller where an optimized relationship exists. Namespaces in Inaccessible, Permanent-Loss or Change state for a given controller should not be accessed. The states are updated through reading the ANA log page, which is read once during controller initialization, whenever the ANA change notice AEN is received, or when one of the ANA specific status codes that signal a state change is received on a command. The ANA state is kept in the nvme_ns structure, which makes the checks in the fast path very simple. Updating the ANA state when reading the log page is also very simple, the only downside is that finding the initial ANA state when scanning for namespaces is a bit cumbersome. 
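Before the diff, the path-selection rule implied by the ANA states can be seen in isolation: optimized paths are preferred, non-optimized paths are kept as a fallback, and paths in the remaining states are not used for I/O. A standalone model follows (path names and layout are illustrative only; this is not the driver code):

/* Standalone model of ANA-aware path selection (not the nvme driver code). */
#include <stdio.h>

enum ana_state {
        ANA_OPTIMIZED           = 0x01,
        ANA_NONOPTIMIZED        = 0x02,
        ANA_INACCESSIBLE        = 0x03,
        ANA_PERSISTENT_LOSS     = 0x04,
        ANA_CHANGE              = 0x0f,
};

struct path {
        const char *name;
        int ctrl_live;          /* controller state is LIVE */
        enum ana_state state;
};

/* prefer an optimized path; remember a non-optimized one as fallback */
static const struct path *find_path(const struct path *p, int n)
{
        const struct path *fallback = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (!p[i].ctrl_live)
                        continue;
                switch (p[i].state) {
                case ANA_OPTIMIZED:
                        return &p[i];
                case ANA_NONOPTIMIZED:
                        fallback = &p[i];
                        break;
                default:
                        /* inaccessible, persistent loss, change: skip */
                        break;
                }
        }
        return fallback;
}

int main(void)
{
        const struct path paths[] = {
                { "path0", 1, ANA_INACCESSIBLE },
                { "path1", 1, ANA_NONOPTIMIZED },
                { "path2", 1, ANA_OPTIMIZED },
        };
        const struct path *best = find_path(paths, 3);

        printf("chose %s\n", best ? best->name : "none");       /* path2 */
        return 0;
}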
The gendisk for a ns_head is only registered once a live path for it exists. Without that the kernel would hang during partition scanning. Includes fixes and improvements from Hannes Reinecke. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 42 ++++- drivers/nvme/host/multipath.c | 342 ++++++++++++++++++++++++++++++++-- drivers/nvme/host/nvme.h | 51 ++++- 3 files changed, 408 insertions(+), 27 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 456d37a02ea35f..e62592c949ab54 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1035,18 +1035,18 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) EXPORT_SYMBOL_GPL(nvme_set_queue_count); #define NVME_AEN_SUPPORTED \ - (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT) + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE) static void nvme_enable_aen(struct nvme_ctrl *ctrl) { - u32 result; + u32 supported = ctrl->oaes & NVME_AEN_SUPPORTED, result; int status; - status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, - ctrl->oaes & NVME_AEN_SUPPORTED, NULL, 0, &result); + status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported, NULL, + 0, &result); if (status) dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", - ctrl->oaes & NVME_AEN_SUPPORTED); + supported); } static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) @@ -2370,6 +2370,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) nvme_set_queue_limits(ctrl, ctrl->admin_q); ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); + ctrl->max_namespaces = le32_to_cpu(id->mnan); if (id->rtd3e) { /* us -> s */ @@ -2429,8 +2430,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); } + ret = nvme_mpath_init(ctrl, id); kfree(id); + if (ret < 0) + return ret; + if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) @@ -2649,6 +2654,10 @@ static struct attribute *nvme_ns_id_attrs[] = { &dev_attr_nguid.attr, &dev_attr_eui.attr, &dev_attr_nsid.attr, +#ifdef CONFIG_NVME_MULTIPATH + &dev_attr_ana_grpid.attr, + &dev_attr_ana_state.attr, +#endif NULL, }; @@ -2671,6 +2680,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) return 0; } +#ifdef CONFIG_NVME_MULTIPATH + if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { + if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ + return 0; + if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) + return 0; + } +#endif return a->mode; } @@ -3044,8 +3061,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_get_ctrl(ctrl); - kfree(id); - device_add_disk(ctrl->device, ns->disk); if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_id_attr_group)) @@ -3055,8 +3070,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); - nvme_mpath_add_disk(ns->head); + nvme_mpath_add_disk(ns, id); nvme_fault_inject_init(ns); + kfree(id); + return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -3364,6 +3381,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) case NVME_AER_NOTICE_FW_ACT_STARTING: queue_work(nvme_wq, &ctrl->fw_act_work); break; +#ifdef 
CONFIG_NVME_MULTIPATH + case NVME_AER_NOTICE_ANA: + if (!ctrl->ana_log_buf) + break; + queue_work(nvme_wq, &ctrl->ana_work); + break; +#endif default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -3396,6 +3420,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { + nvme_mpath_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); @@ -3433,6 +3458,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); kfree(ctrl->effects); + nvme_mpath_uninit(ctrl); if (subsys) { mutex_lock(&subsys->lock); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 348aa405b641c9..c643872f8dac08 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Christoph Hellwig. + * Copyright (c) 2017-2018 Christoph Hellwig. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -20,6 +20,11 @@ module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return multipath && (ctrl->subsys->cmic & (1 << 3)); +} + /* * If multipathing is enabled we need to always use the subsystem instance * number for numbering our devices to avoid conflicts between subsystems that @@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; + u16 status = nvme_req(req)->status; unsigned long flags; spin_lock_irqsave(&ns->head->requeue_lock, flags); @@ -52,7 +58,34 @@ void nvme_failover_req(struct request *req) spin_unlock_irqrestore(&ns->head->requeue_lock, flags); blk_mq_end_request(req, 0); - nvme_reset_ctrl(ns->ctrl); + switch (status & 0x7ff) { + case NVME_SC_ANA_TRANSITION: + case NVME_SC_ANA_INACCESSIBLE: + case NVME_SC_ANA_PERSISTENT_LOSS: + /* + * If we got back an ANA error we know the controller is alive, + * but not ready to serve this namespaces. The spec suggests + * we should update our general state here, but due to the fact + * that the admin and I/O queues are not serialized that is + * fundamentally racy. So instead just clear the current path, + * mark the the path as pending and kick of a re-read of the ANA + * log page ASAP. + */ + nvme_mpath_clear_current_path(ns); + if (ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); + } + break; + default: + /* + * Reset the controller for any non-ANA error as we don't know + * what caused the error. 
+ */ + nvme_reset_ctrl(ns->ctrl); + break; + } + kblockd_schedule_work(&ns->head->requeue_work); } @@ -68,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) up_read(&ctrl->namespaces_rwsem); } +static const char *nvme_ana_state_names[] = { + [0] = "invalid state", + [NVME_ANA_OPTIMIZED] = "optimized", + [NVME_ANA_NONOPTIMIZED] = "non-optimized", + [NVME_ANA_INACCESSIBLE] = "inaccessible", + [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", + [NVME_ANA_CHANGE] = "change", +}; + static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) { - struct nvme_ns *ns; + struct nvme_ns *ns, *fallback = NULL; list_for_each_entry_rcu(ns, &head->list, siblings) { - if (ns->ctrl->state == NVME_CTRL_LIVE) { + if (ns->ctrl->state != NVME_CTRL_LIVE || + test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + continue; + switch (ns->ana_state) { + case NVME_ANA_OPTIMIZED: rcu_assign_pointer(head->current_path, ns); return ns; + case NVME_ANA_NONOPTIMIZED: + fallback = ns; + break; + default: + break; } } - return NULL; + if (fallback) + rcu_assign_pointer(head->current_path, fallback); + return fallback; +} + +static inline bool nvme_path_is_optimized(struct nvme_ns *ns) +{ + return ns->ctrl->state == NVME_CTRL_LIVE && + ns->ana_state == NVME_ANA_OPTIMIZED; } inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) { struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); - if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) + if (unlikely(!ns || !nvme_path_is_optimized(ns))) ns = __nvme_find_path(head); return ns; } @@ -135,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) srcu_idx = srcu_read_lock(&head->srcu); ns = srcu_dereference(head->current_path, &head->srcu); - if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) + if (likely(ns && nvme_path_is_optimized(ns))) found = ns->queue->poll_fn(q, qc); srcu_read_unlock(&head->srcu, srcu_idx); return found; @@ -169,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) struct request_queue *q; bool vwc = false; + mutex_init(&head->lock); bio_list_init(&head->requeue_list); spin_lock_init(&head->requeue_lock); INIT_WORK(&head->requeue_work, nvme_requeue_work); @@ -213,29 +273,232 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) return -ENOMEM; } -void nvme_mpath_add_disk(struct nvme_ns_head *head) +static void nvme_mpath_set_live(struct nvme_ns *ns) { + struct nvme_ns_head *head = ns->head; + + lockdep_assert_held(&ns->head->lock); + if (!head->disk) return; - mutex_lock(&head->subsys->lock); if (!(head->disk->flags & GENHD_FL_UP)) { device_add_disk(&head->subsys->dev, head->disk); if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, &nvme_ns_id_attr_group)) - pr_warn("%s: failed to create sysfs group for identification\n", - head->disk->disk_name); + dev_warn(&head->subsys->dev, + "failed to create id group.\n"); + } + + kblockd_schedule_work(&ns->head->requeue_work); +} + +static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, + int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, + void *)) +{ + void *base = ctrl->ana_log_buf; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); + int error, i; + + lockdep_assert_held(&ctrl->ana_lock); + + for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { + struct nvme_ana_group_desc *desc = base + offset; + u32 nr_nsids = le32_to_cpu(desc->nnsids); + size_t nsid_buf_size = nr_nsids * sizeof(__le32); + + if (WARN_ON_ONCE(desc->grpid == 0)) + return -EINVAL; 
+ if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) + return -EINVAL; + if (WARN_ON_ONCE(desc->state == 0)) + return -EINVAL; + if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) + return -EINVAL; + + offset += sizeof(*desc); + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) + return -EINVAL; + + error = cb(ctrl, desc, data); + if (error) + return error; + + offset += nsid_buf_size; + if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) + return -EINVAL; + } + + return 0; +} + +static inline bool nvme_state_is_live(enum nvme_ana_state state) +{ + return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; +} + +static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, + struct nvme_ns *ns) +{ + enum nvme_ana_state old; + + mutex_lock(&ns->head->lock); + old = ns->ana_state; + ns->ana_grpid = le32_to_cpu(desc->grpid); + ns->ana_state = desc->state; + clear_bit(NVME_NS_ANA_PENDING, &ns->flags); + + if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old)) + nvme_mpath_set_live(ns); + mutex_unlock(&ns->head->lock); +} + +static int nvme_update_ana_state(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) +{ + u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; + unsigned *nr_change_groups = data; + struct nvme_ns *ns; + + dev_info(ctrl->device, "ANA group %d: %s.\n", + le32_to_cpu(desc->grpid), + nvme_ana_state_names[desc->state]); + + if (desc->state == NVME_ANA_CHANGE) + (*nr_change_groups)++; + + if (!nr_nsids) + return 0; + + down_write(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->ns_id != le32_to_cpu(desc->nsids[n])) + continue; + nvme_update_ns_ana_state(desc, ns); + if (++n == nr_nsids) + break; + } + up_write(&ctrl->namespaces_rwsem); + WARN_ON_ONCE(n < nr_nsids); + return 0; +} + +static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only) +{ + u32 nr_change_groups = 0; + int error; + + mutex_lock(&ctrl->ana_lock); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, + groups_only ? NVME_ANA_LOG_RGO : 0, + ctrl->ana_log_buf, ctrl->ana_log_size, 0); + if (error) { + dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); + goto out_unlock; + } + + error = nvme_parse_ana_log(ctrl, &nr_change_groups, + nvme_update_ana_state); + if (error) + goto out_unlock; + + /* + * In theory we should have an ANATT timer per group as they might enter + * the change state at different times. But that is a lot of overhead + * just to protect against a target that keeps entering new changes + * states while never finishing previous ones. But we'll still + * eventually time out once all groups are in change state, so this + * isn't a big deal. + * + * We also double the ANATT value to provide some slack for transports + * or AEN processing overhead. 
+ */ + if (nr_change_groups) + mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); + else + del_timer_sync(&ctrl->anatt_timer); +out_unlock: + mutex_unlock(&ctrl->ana_lock); + return error; +} + +static void nvme_ana_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); + + nvme_read_ana_log(ctrl, false); +} + +static void nvme_anatt_timeout(struct timer_list *t) +{ + struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); + + dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); + nvme_reset_ctrl(ctrl); +} + +void nvme_mpath_stop(struct nvme_ctrl *ctrl) +{ + if (!nvme_ctrl_use_ana(ctrl)) + return; + del_timer_sync(&ctrl->anatt_timer); + cancel_work_sync(&ctrl->ana_work); +} + +static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); +} +DEVICE_ATTR_RO(ana_grpid); + +static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); +} +DEVICE_ATTR_RO(ana_state); + +static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl, + struct nvme_ana_group_desc *desc, void *data) +{ + struct nvme_ns *ns = data; + + if (ns->ana_grpid == le32_to_cpu(desc->grpid)) { + nvme_update_ns_ana_state(desc, ns); + return -ENXIO; /* just break out of the loop */ + } + + return 0; +} + +void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) +{ + if (nvme_ctrl_use_ana(ns->ctrl)) { + mutex_lock(&ns->ctrl->ana_lock); + ns->ana_grpid = le32_to_cpu(id->anagrpid); + nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state); + mutex_unlock(&ns->ctrl->ana_lock); + } else { + mutex_lock(&ns->head->lock); + ns->ana_state = NVME_ANA_OPTIMIZED; + nvme_mpath_set_live(ns); + mutex_unlock(&ns->head->lock); } - mutex_unlock(&head->subsys->lock); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - sysfs_remove_group(&disk_to_dev(head->disk)->kobj, - &nvme_ns_id_attr_group); - del_gendisk(head->disk); + if (head->disk->flags & GENHD_FL_UP) { + sysfs_remove_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group); + del_gendisk(head->disk); + } blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); @@ -243,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) blk_cleanup_queue(head->disk->queue); put_disk(head->disk); } + +int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + int error; + + if (!nvme_ctrl_use_ana(ctrl)) + return 0; + + ctrl->anacap = id->anacap; + ctrl->anatt = id->anatt; + ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); + ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); + + mutex_init(&ctrl->ana_lock); + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); + ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); + if (!(ctrl->anacap & (1 << 6))) + ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); + + if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { + dev_err(ctrl->device, + "ANA log page size (%zd) larger than MDTS (%d).\n", + ctrl->ana_log_size, + ctrl->max_hw_sectors << SECTOR_SHIFT); + dev_err(ctrl->device, "disabling ANA support.\n"); + return 0; + } + + INIT_WORK(&ctrl->ana_work, nvme_ana_work); + ctrl->ana_log_buf = 
kmalloc(ctrl->ana_log_size, GFP_KERNEL); + if (!ctrl->ana_log_buf) + goto out; + + error = nvme_read_ana_log(ctrl, true); + if (error) + goto out_free_ana_log_buf; + return 0; +out_free_ana_log_buf: + kfree(ctrl->ana_log_buf); +out: + return -ENOMEM; +} + +void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +{ + kfree(ctrl->ana_log_buf); +} + diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 07452adef11069..8b356f1d941cca 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -183,6 +183,7 @@ struct nvme_ctrl { u16 oacs; u16 nssa; u16 nr_streams; + u32 max_namespaces; atomic_t abort_limit; u8 vwc; u32 vs; @@ -205,6 +206,19 @@ struct nvme_ctrl { struct work_struct fw_act_work; unsigned long events; +#ifdef CONFIG_NVME_MULTIPATH + /* asymmetric namespace access: */ + u8 anacap; + u8 anatt; + u32 anagrpmax; + u32 nanagrpid; + struct mutex ana_lock; + struct nvme_ana_rsp_hdr *ana_log_buf; + size_t ana_log_size; + struct timer_list anatt_timer; + struct work_struct ana_work; +#endif + /* Power saving configuration */ u64 ps_max_latency_us; bool apst_enabled; @@ -269,6 +283,7 @@ struct nvme_ns_head { struct bio_list requeue_list; spinlock_t requeue_lock; struct work_struct requeue_work; + struct mutex lock; #endif struct list_head list; struct srcu_struct srcu; @@ -295,6 +310,10 @@ struct nvme_ns { struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH + enum nvme_ana_state ana_state; + u32 ana_grpid; +#endif struct list_head siblings; struct nvm_dev *ndev; struct kref kref; @@ -307,8 +326,9 @@ struct nvme_ns { bool ext; u8 pi_type; unsigned long flags; -#define NVME_NS_REMOVING 0 -#define NVME_NS_DEAD 1 +#define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 +#define NVME_NS_ANA_PENDING 2 u16 noiob; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS @@ -450,13 +470,17 @@ extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH +bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); -void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); void nvme_mpath_remove_disk(struct nvme_ns_head *head); +int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); +void nvme_mpath_uninit(struct nvme_ctrl *ctrl); +void nvme_mpath_stop(struct nvme_ctrl *ctrl); static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { @@ -475,7 +499,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); } +extern struct device_attribute dev_attr_ana_grpid; +extern struct device_attribute dev_attr_ana_state; + #else +static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return false; +} /* * Without the multipath code enabled, multiple controller per subsystems are * visible as devices and thus we cannot use the subsystem instance. 
@@ -497,7 +528,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, { return 0; } -static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) +static inline void nvme_mpath_add_disk(struct nvme_ns *ns, + struct nvme_id_ns *id) { } static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) @@ -509,6 +541,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { } +static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) +{ + return 0; +} +static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) +{ +} +static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM From 4ee43280488b0f6cbd74702725a32f47d03d690b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jun 2018 15:09:50 +0200 Subject: [PATCH 126/190] nvmet: keep a port pointer in nvmet_ctrl This will be needed for the ANA AEN code. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/target/core.c | 2 ++ drivers/nvme/target/nvmet.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ddd85715a00ab4..cbcd19f52121ab 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -876,6 +876,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, nvmet_init_cap(ctrl); + ctrl->port = req->port; + INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); INIT_LIST_HEAD(&ctrl->async_events); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 68899385540260..de12dcbfd3f3e5 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -140,6 +140,8 @@ struct nvmet_ctrl { u16 cntlid; u32 kato; + struct nvmet_port *port; + u32 aen_enabled; unsigned long aen_masked; struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; From 793c7cfce02ce88b7bd67d43834c052d16c096e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 May 2018 19:00:13 +0200 Subject: [PATCH 127/190] nvmet: track and limit the number of namespaces per subsystem TP 4004 introduces a new 'Maximum Number of Allocated Namespaces' field in the Identify controller data to help the host size resources. Put an upper limit on the supported namespaces to be able to support this value as supporting 32-bits worth of namespaces would lead to very large buffers. The limit is completely arbitrary at this point. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/target/admin-cmd.c | 1 + drivers/nvme/target/core.c | 8 +++++++- drivers/nvme/target/nvmet.h | 8 ++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 16a9b24270f967..55f2bf4b5d0745 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -252,6 +252,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); id->nn = cpu_to_le32(ctrl->subsys->max_nsid); + id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index cbcd19f52121ab..42e8565015d5d7 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -337,9 +337,13 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns) int nvmet_ns_enable(struct nvmet_ns *ns) { struct nvmet_subsys *subsys = ns->subsys; - int ret = 0; + int ret; mutex_lock(&subsys->lock); + ret = -EMFILE; + if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES) + goto out_unlock; + ret = 0; if (ns->enabled) goto out_unlock; @@ -374,6 +378,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns) list_add_tail_rcu(&ns->dev_link, &old->dev_link); } + subsys->nr_namespaces++; nvmet_ns_changed(subsys, ns->nsid); ns->enabled = true; @@ -414,6 +419,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns) percpu_ref_exit(&ns->ref); mutex_lock(&subsys->lock); + subsys->nr_namespaces--; nvmet_ns_changed(subsys, ns->nsid); nvmet_ns_dev_disable(ns); out_unlock: diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index de12dcbfd3f3e5..701017f7f3df3c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -170,6 +170,7 @@ struct nvmet_subsys { struct kref ref; struct list_head namespaces; + unsigned int nr_namespaces; unsigned int max_nsid; struct list_head ctrls; @@ -362,6 +363,13 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd); #define NVMET_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD NVMET_QUEUE_SIZE + +/* + * Nice round number that makes a list of nsids fit into a page. + * Should become tunable at some point in the future. + */ +#define NVMET_MAX_NAMESPACES 1024 + #define NVMET_KAS 10 #define NVMET_DISC_KATO 120 From 72efd25dcf4f6310e9e6fa85620aa443b27c23fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Jul 2018 07:35:20 -0700 Subject: [PATCH 128/190] nvmet: add minimal ANA support Add support for Asynchronous Namespace Access as specified in NVMe 1.3 TP 4004. Just add a default ANA group 1 that is optimized on all ports. This is (and will remain) the default assignment for any namespace not epxlicitly assigned to another ANA group. The ANA state can be manually changed through the configfs interface, including the change state. Includes fixes and improvements from Hannes Reinecke. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/target/admin-cmd.c | 87 +++++++++++++++++++++++++++++++-- drivers/nvme/target/configfs.c | 10 ++++ drivers/nvme/target/core.c | 34 +++++++++++++ drivers/nvme/target/nvmet.h | 15 ++++++ 4 files changed, 142 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 55f2bf4b5d0745..b98d38c4e579a6 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -182,6 +182,69 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) nvmet_req_complete(req, status); } +static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, + struct nvme_ana_group_desc *desc) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + u32 count = 0; + + if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) { + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) + if (ns->anagrpid == grpid) + desc->nsids[count++] = cpu_to_le32(ns->nsid); + rcu_read_unlock(); + } + + desc->grpid = cpu_to_le32(grpid); + desc->nnsids = cpu_to_le32(count); + desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt); + desc->state = req->port->ana_state[grpid]; + memset(desc->rsvd17, 0, sizeof(desc->rsvd17)); + return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32); +} + +static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) +{ + struct nvme_ana_rsp_hdr hdr = { 0, }; + struct nvme_ana_group_desc *desc; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); /* start beyond hdr */ + size_t len; + u32 grpid; + u16 ngrps = 0; + u16 status; + + status = NVME_SC_INTERNAL; + desc = kmalloc(sizeof(struct nvme_ana_group_desc) + + NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL); + if (!desc) + goto out; + + down_read(&nvmet_ana_sem); + for (grpid = 1; grpid <= NVMET_MAX_ANAGRPS; grpid++) { + if (!nvmet_ana_group_enabled[grpid]) + continue; + len = nvmet_format_ana_group(req, grpid, desc); + status = nvmet_copy_to_sgl(req, offset, desc, len); + if (status) + break; + offset += len; + ngrps++; + } + + hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); + hdr.ngrps = cpu_to_le16(ngrps); + up_read(&nvmet_ana_sem); + + kfree(desc); + + /* copy the header last once we know the number of groups */ + status = nvmet_copy_to_sgl(req, 0, &hdr, sizeof(hdr)); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_identify_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -213,8 +276,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) * the safest is to leave it as zeroes. */ - /* we support multiple ports and multiples hosts: */ - id->cmic = (1 << 0) | (1 << 1); + /* we support multiple ports, multiples hosts and ANA: */ + id->cmic = (1 << 0) | (1 << 1) | (1 << 3); /* no limit on data transfer sizes for now */ id->mdts = 0; @@ -282,6 +345,11 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->msdbd = ctrl->ops->msdbd; + id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4); + id->anatt = 10; /* random value */ + id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS); + id->nanagrpid = cpu_to_le32(NVMET_MAX_ANAGRPS); + /* * Meh, we don't really support any power state. Fake up the same * values that qemu does. @@ -323,8 +391,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) * nuse = ncap = nsze isn't always true, but we have no way to find * that out from the underlying device. 
*/ - id->ncap = id->nuse = id->nsze = - cpu_to_le64(ns->size >> ns->blksize_shift); + id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift); + switch (req->port->ana_state[ns->anagrpid]) { + case NVME_ANA_INACCESSIBLE: + case NVME_ANA_PERSISTENT_LOSS: + break; + default: + id->nuse = id->nsze; + break; + } /* * We just provide a single LBA format that matches what the @@ -338,6 +413,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) * controllers, but also with any other user of the block device. */ id->nmic = (1 << 0); + id->anagrpid = cpu_to_le32(ns->anagrpid); memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid)); @@ -620,6 +696,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) case NVME_LOG_CMD_EFFECTS: req->execute = nvmet_execute_get_log_cmd_effects_ns; return 0; + case NVME_LOG_ANA: + req->execute = nvmet_execute_get_log_page_ana; + return 0; } break; case nvme_admin_identify: diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 3ba5ea5c4376a1..b3c62b41b2ae94 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -923,6 +923,7 @@ static void nvmet_port_release(struct config_item *item) { struct nvmet_port *port = to_nvmet_port(item); + kfree(port->ana_state); kfree(port); } @@ -959,6 +960,15 @@ static struct config_group *nvmet_ports_make(struct config_group *group, if (!port) return ERR_PTR(-ENOMEM); + port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1, + sizeof(*port->ana_state), GFP_KERNEL); + if (!port->ana_state) { + kfree(port); + return ERR_PTR(-ENOMEM); + } + + port->ana_state[NVMET_DEFAULT_ANA_GRPID] = NVME_ANA_OPTIMIZED; + INIT_LIST_HEAD(&port->entry); INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 42e8565015d5d7..43a755f7baa5a7 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -40,6 +40,10 @@ static DEFINE_IDA(cntlid_ida); */ DECLARE_RWSEM(nvmet_config_sem); +u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; +u64 nvmet_ana_chgcnt; +DECLARE_RWSEM(nvmet_ana_sem); + static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, const char *subsysnqn); @@ -430,6 +434,10 @@ void nvmet_ns_free(struct nvmet_ns *ns) { nvmet_ns_disable(ns); + down_write(&nvmet_ana_sem); + nvmet_ana_group_enabled[ns->anagrpid]--; + up_write(&nvmet_ana_sem); + kfree(ns->device_path); kfree(ns); } @@ -447,6 +455,12 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; + + down_write(&nvmet_ana_sem); + ns->anagrpid = NVMET_DEFAULT_ANA_GRPID; + nvmet_ana_group_enabled[ns->anagrpid]++; + up_write(&nvmet_ana_sem); + uuid_gen(&ns->uuid); ns->buffered_io = false; @@ -554,6 +568,20 @@ int nvmet_sq_init(struct nvmet_sq *sq) } EXPORT_SYMBOL_GPL(nvmet_sq_init); +static inline u16 nvmet_check_ana_state(struct nvmet_port *port, + struct nvmet_ns *ns) +{ + enum nvme_ana_state state = port->ana_state[ns->anagrpid]; + + if (unlikely(state == NVME_ANA_INACCESSIBLE)) + return NVME_SC_ANA_INACCESSIBLE; + if (unlikely(state == NVME_ANA_PERSISTENT_LOSS)) + return NVME_SC_ANA_PERSISTENT_LOSS; + if (unlikely(state == NVME_ANA_CHANGE)) + return NVME_SC_ANA_TRANSITION; + return 0; +} + static u16 nvmet_parse_io_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -566,6 +594,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); if (unlikely(!req->ns)) return 
NVME_SC_INVALID_NS | NVME_SC_DNR; + ret = nvmet_check_ana_state(req->port, req->ns); + if (unlikely(ret)) + return ret; if (req->ns->file) return nvmet_file_parse_io_cmd(req); @@ -1123,12 +1154,15 @@ static int __init nvmet_init(void) { int error; + nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1; + buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", WQ_MEM_RECLAIM, 0); if (!buffered_io_wq) { error = -ENOMEM; goto out; } + error = nvmet_init_discovery(); if (error) goto out; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 701017f7f3df3c..f7d622fc1aa723 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -64,6 +64,7 @@ struct nvmet_ns { loff_t size; u8 nguid[16]; uuid_t uuid; + u32 anagrpid; bool buffered_io; bool enabled; @@ -115,6 +116,7 @@ struct nvmet_port { struct list_head subsystems; struct config_group referrals_group; struct list_head referrals; + enum nvme_ana_state *ana_state; void *priv; bool enabled; int inline_data_size; @@ -370,6 +372,15 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd); */ #define NVMET_MAX_NAMESPACES 1024 +/* + * 0 is not a valid ANA group ID, so we start numbering at 1. + * + * ANA Group 1 exists without manual intervention, has namespaces assigned to it + * by default, and is available in an optimized state through all ports. + */ +#define NVMET_MAX_ANAGRPS 1 +#define NVMET_DEFAULT_ANA_GRPID 1 + #define NVMET_KAS 10 #define NVMET_DISC_KATO 120 @@ -383,6 +394,10 @@ extern struct nvmet_subsys *nvmet_disc_subsys; extern u64 nvmet_genctr; extern struct rw_semaphore nvmet_config_sem; +extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; +extern u64 nvmet_ana_chgcnt; +extern struct rw_semaphore nvmet_ana_sem; + bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, const char *hostnqn); From 62ac0d32f74ea511d5813be728dc589d03f866a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Jun 2018 08:59:25 +0200 Subject: [PATCH 129/190] nvmet: support configuring ANA groups Allow creating non-default ANA groups (group ID > 1). Groups are created either by assigning the group ID to a namespace, or by creating a configfs group object under a specific port. All namespaces assigned to a group that doesn't have a configfs object for a given port are marked as inaccessible. Allow changing the ANA state on a per-port basis by creating an ana_groups directory under each port, and another directory with an ana_state file in it. The default ANA group 1 directory is created automatically for each port. For all changes in ANA configuration the ANA change AEN is sent. We only keep a global changecount instead of additional per-group changecounts to keep the implementation as simple as possible. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/target/admin-cmd.c | 1 + drivers/nvme/target/configfs.c | 182 +++++++++++++++++++++++++++++++- drivers/nvme/target/core.c | 27 +++++ drivers/nvme/target/nvmet.h | 30 +++++- 4 files changed, 236 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index b98d38c4e579a6..d1de639786ee75 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -235,6 +235,7 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); hdr.ngrps = cpu_to_le16(ngrps); + clear_bit(NVME_AEN_CFG_ANA_CHANGE, &req->sq->ctrl->aen_masked); up_read(&nvmet_ana_sem); kfree(desc); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index b3c62b41b2ae94..51f5a8c092b418 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -411,6 +411,39 @@ static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, CONFIGFS_ATTR(nvmet_ns_, device_nguid); +static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid); +} + +static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + u32 oldgrpid, newgrpid; + int ret; + + ret = kstrtou32(page, 0, &newgrpid); + if (ret) + return ret; + + if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS) + return -EINVAL; + + down_write(&nvmet_ana_sem); + oldgrpid = ns->anagrpid; + nvmet_ana_group_enabled[newgrpid]++; + ns->anagrpid = newgrpid; + nvmet_ana_group_enabled[oldgrpid]--; + nvmet_ana_chgcnt++; + up_write(&nvmet_ana_sem); + + nvmet_send_ana_event(ns->subsys, NULL); + return count; +} + +CONFIGFS_ATTR(nvmet_ns_, ana_grpid); + static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) { return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled); @@ -468,6 +501,7 @@ static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_device_path, &nvmet_ns_attr_device_nguid, &nvmet_ns_attr_device_uuid, + &nvmet_ns_attr_ana_grpid, &nvmet_ns_attr_enable, &nvmet_ns_attr_buffered_io, NULL, @@ -916,6 +950,134 @@ static const struct config_item_type nvmet_referrals_type = { .ct_group_ops = &nvmet_referral_group_ops, }; +static struct { + enum nvme_ana_state state; + const char *name; +} nvmet_ana_state_names[] = { + { NVME_ANA_OPTIMIZED, "optimized" }, + { NVME_ANA_NONOPTIMIZED, "non-optimized" }, + { NVME_ANA_INACCESSIBLE, "inaccessible" }, + { NVME_ANA_PERSISTENT_LOSS, "persistent-loss" }, + { NVME_ANA_CHANGE, "change" }, +}; + +static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item, + char *page) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + enum nvme_ana_state state = grp->port->ana_state[grp->grpid]; + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) { + if (state != nvmet_ana_state_names[i].state) + continue; + return sprintf(page, "%s\n", nvmet_ana_state_names[i].name); + } + + return sprintf(page, "\n"); +} + +static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) { + if (sysfs_streq(page, nvmet_ana_state_names[i].name)) + goto found; + } + + pr_err("Invalid value '%s' for ana_state\n", page); + return -EINVAL; + +found: + 
down_write(&nvmet_ana_sem); + grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[i].state; + nvmet_ana_chgcnt++; + up_write(&nvmet_ana_sem); + + nvmet_port_send_ana_event(grp->port); + return count; +} + +CONFIGFS_ATTR(nvmet_ana_group_, ana_state); + +static struct configfs_attribute *nvmet_ana_group_attrs[] = { + &nvmet_ana_group_attr_ana_state, + NULL, +}; + +static void nvmet_ana_group_release(struct config_item *item) +{ + struct nvmet_ana_group *grp = to_ana_group(item); + + if (grp == &grp->port->ana_default_group) + return; + + down_write(&nvmet_ana_sem); + grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE; + nvmet_ana_group_enabled[grp->grpid]--; + up_write(&nvmet_ana_sem); + + nvmet_port_send_ana_event(grp->port); + kfree(grp); +} + +static struct configfs_item_operations nvmet_ana_group_item_ops = { + .release = nvmet_ana_group_release, +}; + +static const struct config_item_type nvmet_ana_group_type = { + .ct_item_ops = &nvmet_ana_group_item_ops, + .ct_attrs = nvmet_ana_group_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ana_groups_make_group( + struct config_group *group, const char *name) +{ + struct nvmet_port *port = ana_groups_to_port(&group->cg_item); + struct nvmet_ana_group *grp; + u32 grpid; + int ret; + + ret = kstrtou32(name, 0, &grpid); + if (ret) + goto out; + + ret = -EINVAL; + if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS) + goto out; + + ret = -ENOMEM; + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) + goto out; + grp->port = port; + grp->grpid = grpid; + + down_write(&nvmet_ana_sem); + nvmet_ana_group_enabled[grpid]++; + up_write(&nvmet_ana_sem); + + nvmet_port_send_ana_event(grp->port); + + config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type); + return &grp->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_ana_groups_group_ops = { + .make_group = nvmet_ana_groups_make_group, +}; + +static const struct config_item_type nvmet_ana_groups_type = { + .ct_group_ops = &nvmet_ana_groups_group_ops, + .ct_owner = THIS_MODULE, +}; + /* * Ports definitions. 
*/ @@ -952,6 +1114,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, { struct nvmet_port *port; u16 portid; + u32 i; if (kstrtou16(name, 0, &portid)) return ERR_PTR(-EINVAL); @@ -967,7 +1130,12 @@ static struct config_group *nvmet_ports_make(struct config_group *group, return ERR_PTR(-ENOMEM); } - port->ana_state[NVMET_DEFAULT_ANA_GRPID] = NVME_ANA_OPTIMIZED; + for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) { + if (i == NVMET_DEFAULT_ANA_GRPID) + port->ana_state[1] = NVME_ANA_OPTIMIZED; + else + port->ana_state[i] = NVME_ANA_INACCESSIBLE; + } INIT_LIST_HEAD(&port->entry); INIT_LIST_HEAD(&port->subsystems); @@ -985,6 +1153,18 @@ static struct config_group *nvmet_ports_make(struct config_group *group, "referrals", &nvmet_referrals_type); configfs_add_default_group(&port->referrals_group, &port->group); + config_group_init_type_name(&port->ana_groups_group, + "ana_groups", &nvmet_ana_groups_type); + configfs_add_default_group(&port->ana_groups_group, &port->group); + + port->ana_default_group.port = port; + port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID; + config_group_init_type_name(&port->ana_default_group.group, + __stringify(NVMET_DEFAULT_ANA_GRPID), + &nvmet_ana_group_type); + configfs_add_default_group(&port->ana_default_group.group, + &port->ana_groups_group); + return &port->group; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 43a755f7baa5a7..3ceb7a03bb2ae7 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -194,6 +194,33 @@ static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) } } +void nvmet_send_ana_event(struct nvmet_subsys *subsys, + struct nvmet_port *port) +{ + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (port && ctrl->port != port) + continue; + if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE)) + continue; + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_NOTICE_ANA, NVME_LOG_ANA); + } + mutex_unlock(&subsys->lock); +} + +void nvmet_port_send_ana_event(struct nvmet_port *port) +{ + struct nvmet_subsys_link *p; + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) + nvmet_send_ana_event(p->subsys, port); + up_read(&nvmet_config_sem); +} + int nvmet_register_transport(const struct nvmet_fabrics_ops *ops) { int ret = 0; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index f7d622fc1aa723..22941045f46ecb 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -30,12 +30,11 @@ #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 - /* * Supported optional AENs: */ #define NVMET_AEN_CFG_OPTIONAL \ - NVME_AEN_CFG_NS_ATTR + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE) /* * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): @@ -99,6 +98,18 @@ struct nvmet_sq { struct completion confirm_done; }; +struct nvmet_ana_group { + struct config_group group; + struct nvmet_port *port; + u32 grpid; +}; + +static inline struct nvmet_ana_group *to_ana_group(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ana_group, + group); +} + /** * struct nvmet_port - Common structure to keep port * information for the target. 
@@ -116,6 +127,8 @@ struct nvmet_port { struct list_head subsystems; struct config_group referrals_group; struct list_head referrals; + struct config_group ana_groups_group; + struct nvmet_ana_group ana_default_group; enum nvme_ana_state *ana_state; void *priv; bool enabled; @@ -128,6 +141,13 @@ static inline struct nvmet_port *to_nvmet_port(struct config_item *item) group); } +static inline struct nvmet_port *ana_groups_to_port( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + ana_groups_group); +} + struct nvmet_ctrl { struct nvmet_subsys *subsys; struct nvmet_cq **cqs; @@ -345,6 +365,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns); struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); void nvmet_ns_free(struct nvmet_ns *ns); +void nvmet_send_ana_event(struct nvmet_subsys *subsys, + struct nvmet_port *port); +void nvmet_port_send_ana_event(struct nvmet_port *port); + int nvmet_register_transport(const struct nvmet_fabrics_ops *ops); void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops); @@ -378,7 +402,7 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd); * ANA Group 1 exists without manual intervention, has namespaces assigned to it * by default, and is available in an optimized state through all ports. */ -#define NVMET_MAX_ANAGRPS 1 +#define NVMET_MAX_ANAGRPS 128 #define NVMET_DEFAULT_ANA_GRPID 1 #define NVMET_KAS 10 From b369b30cf510fe94d8884837039362e2ec223cec Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 26 Jul 2018 14:00:41 -0700 Subject: [PATCH 130/190] nvmet: use Retain Async Event bit to clear AEN In the current implementation, we clear the AEN bit when we get the "get log page" command if given log page is associated with AEN. This patch allows optionally retaining the AEN for the ctrl under consideration when Retain Asynchronous Event (RAE) bit is set as a part of "get log page" command. This allows the host to read the Log page and optionally retaining the AEN associated with this log page when using userspace tools like nvme-cli. Signed-off-by: Chaitanya Kulkarni [hch: also use the new helper in the just merged ANA code] Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index d1de639786ee75..f517bc562d264c 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -19,6 +19,19 @@ #include #include "nvmet.h" +/* + * This helper allows us to clear the AEN based on the RAE bit, + * Please use this helper when processing the log pages which are + * associated with the AEN. 
+ */ +static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit) +{ + int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; + + if (!rae) + clear_bit(aen_bit, &req->sq->ctrl->aen_masked); +} + u32 nvmet_get_log_page_len(struct nvme_command *cmd) { u32 len = le16_to_cpu(cmd->get_log_page.numdu); @@ -176,7 +189,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) if (!status) status = nvmet_zero_sgl(req, len, req->data_len - len); ctrl->nr_changed_ns = 0; - clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked); + nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR); mutex_unlock(&ctrl->lock); out: nvmet_req_complete(req, status); @@ -235,7 +248,7 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); hdr.ngrps = cpu_to_le16(ngrps); - clear_bit(NVME_AEN_CFG_ANA_CHANGE, &req->sq->ctrl->aen_masked); + nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE); up_read(&nvmet_ana_sem); kfree(desc); From 55690c07b44a82cc3359ce0c233f4ba7d80ba145 Mon Sep 17 00:00:00 2001 From: Jinbum Park Date: Sat, 28 Jul 2018 13:20:44 +0900 Subject: [PATCH 131/190] pktcdvd: Fix possible Spectre-v1 for pkt_devs User controls @dev_minor which to be used as index of pkt_devs. So, It can be exploited via Spectre-like attack. (speculative execution) This kind of attack leaks address of pkt_devs, [1] It leads an attacker to bypass security mechanism such as KASLR. So sanitize @dev_minor before using it to prevent attack. [1] https://github.com/jinb-park/linux-exploit/ tree/master/exploit-remaining-spectre-gadget/leak_pkt_devs.c Signed-off-by: Jinbum Park Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index a4b4d524c3af7d..9bb7721c26fc2f 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -67,7 +67,7 @@ #include #include #include - +#include #include #define DRIVER_NAME "pktcdvd" @@ -2254,6 +2254,8 @@ static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) { if (dev_minor >= MAX_WRITERS) return NULL; + + dev_minor = array_index_nospec(dev_minor, MAX_WRITERS); return pkt_devs[dev_minor]; } From c454edc21b12dd7d416de6c81555e87aaec9685c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Jul 2018 10:10:01 -0400 Subject: [PATCH 132/190] block: don't account for split bio's size in cgroup stats We need to check in blkcg_bio_issue_check if the bio is flagged as QUEUE_ENTERED, because if it is then we've already accounted for the size of the IO in the cgroup stats. We can still however account for the extra IO since it'll be another request. Reported-by: Tejun Heo Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3bed5e02a8732f..f7b910768306e4 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -769,8 +769,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); + /* + * If the bio is flagged with BIO_QUEUE_ENTERED it means this + * is a split bio and we would have already accounted for the + * size of the bio. 
+ */ + if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } From ddd0bc756983dc4d19000a4fe021b4c7f9d59aab Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 30 Jul 2018 00:15:31 +0300 Subject: [PATCH 133/190] block: move ref_tag calculation func to the block layer Currently this function is implemented in the scsi layer, but it's actual place should be the block layer since T10-PI is a general data integrity feature that is used in the nvme protocol as well. Suggested-by: Christoph Hellwig Cc: Martin K. Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- drivers/infiniband/ulp/iser/iser_memory.c | 2 +- drivers/nvme/host/core.c | 3 +-- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 2 +- drivers/scsi/sd_dif.c | 4 ++-- include/linux/t10-pi.h | 10 ++++++++++ include/scsi/scsi_cmnd.h | 7 +------ 6 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index ca844a926e6add..130bf163f06600 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -311,7 +311,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, { domain->sig_type = IB_SIG_TYPE_T10_DIF; domain->sig.dif.pi_interval = scsi_prot_interval(sc); - domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc); + domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request); /* * At the moment we hard code those, but in the future * we will take them from sc. diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e77e6418a21cb3..16c8b86fe95d38 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -611,8 +611,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; - cmnd->rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; } } diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index b8d131a455d01c..dd738ae5c75bcd 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -4568,7 +4568,7 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd, MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG | MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD; mpi_request->CDB.EEDP32.PrimaryReferenceTag = - cpu_to_be32(scsi_prot_ref_tag(scmd)); + cpu_to_be32(t10_pi_ref_tag(scmd->request)); break; case SCSI_PROT_DIF_TYPE3: diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 9035380c0ddabd..d8de43d359ac6d 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -124,7 +124,7 @@ void sd_dif_prepare(struct scsi_cmnd *scmd) if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION) return; - phys = scsi_prot_ref_tag(scmd); + phys = t10_pi_ref_tag(scmd->request); __rq_for_each_bio(bio, scmd->request) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -176,7 +176,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) return; intervals = good_bytes / scsi_prot_interval(scmd); - phys = scsi_prot_ref_tag(scmd); + phys = t10_pi_ref_tag(scmd->request); __rq_for_each_bio(bio, scmd->request) { struct bio_integrity_payload *bip = bio_integrity(bio); diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index c6aa8a3c42ed9a..c40511f4e63d65 100644 --- a/include/linux/t10-pi.h +++ 
b/include/linux/t10-pi.h @@ -37,6 +37,16 @@ struct t10_pi_tuple { #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff) #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff) +static inline u32 t10_pi_ref_tag(struct request *rq) +{ +#ifdef CONFIG_BLK_DEV_INTEGRITY + return blk_rq_pos(rq) >> + (rq->q->integrity.interval_exp - 9) & 0xffffffff; +#else + return -1U; +#endif +} + extern const struct blk_integrity_profile t10_pi_type1_crc; extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index aaf1e971c6a368..cae229b5395c8d 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -313,12 +314,6 @@ static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) return scmd->device->sector_size; } -static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) -{ - return blk_rq_pos(scmd->request) >> - (ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff; -} - static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; From 10c41ddd61323b27b447bc8e18296ac6c06107ad Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 30 Jul 2018 00:15:32 +0300 Subject: [PATCH 134/190] block: move dif_prepare/dif_complete functions to block layer Currently these functions are implemented in the scsi layer, but their actual place should be the block layer since T10-PI is a general data integrity feature that is used in the nvme protocol as well. Also, use the tuple size from the integrity profile since it may vary between integrity types. Suggested-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- block/t10-pi.c | 110 +++++++++++++++++++++++++++++++++++++++ drivers/scsi/sd.c | 8 +-- drivers/scsi/sd.h | 9 ---- drivers/scsi/sd_dif.c | 113 ----------------------------------------- include/linux/t10-pi.h | 3 ++ 5 files changed, 118 insertions(+), 125 deletions(-) diff --git a/block/t10-pi.c b/block/t10-pi.c index a98db384048fa0..62aed77d0bb935 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = { .verify_fn = t10_pi_type3_verify_ip, }; EXPORT_SYMBOL(t10_pi_type3_ip); + +/** + * t10_pi_prepare - prepare PI prior submitting request to device + * @rq: request with PI that should be prepared + * @protection_type: PI type (Type 1/Type 2/Type 3) + * + * For Type 1/Type 2, the virtual start sector is the one that was + * originally submitted by the block layer for the ref_tag usage. Due to + * partitioning, MD/DM cloning, etc. the actual physical start sector is + * likely to be different. Remap protection information to match the + * physical LBA. + * + * Type 3 does not have a reference tag so no remapping is required. + */ +void t10_pi_prepare(struct request *rq, u8 protection_type) +{ + const int tuple_sz = rq->q->integrity.tuple_size; + u32 ref_tag = t10_pi_ref_tag(rq); + struct bio *bio; + + if (protection_type == T10_PI_TYPE3_PROTECTION) + return; + + __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); + u32 virt = bip_get_seed(bip) & 0xffffffff; + struct bio_vec iv; + struct bvec_iter iter; + + /* Already remapped? 
*/ + if (bip->bip_flags & BIP_MAPPED_INTEGRITY) + break; + + bip_for_each_vec(iv, bip, iter) { + void *p, *pmap; + unsigned int j; + + pmap = kmap_atomic(iv.bv_page); + p = pmap + iv.bv_offset; + for (j = 0; j < iv.bv_len; j += tuple_sz) { + struct t10_pi_tuple *pi = p; + + if (be32_to_cpu(pi->ref_tag) == virt) + pi->ref_tag = cpu_to_be32(ref_tag); + virt++; + ref_tag++; + p += tuple_sz; + } + + kunmap_atomic(pmap); + } + + bip->bip_flags |= BIP_MAPPED_INTEGRITY; + } +} +EXPORT_SYMBOL(t10_pi_prepare); + +/** + * t10_pi_complete - prepare PI prior returning request to the block layer + * @rq: request with PI that should be prepared + * @protection_type: PI type (Type 1/Type 2/Type 3) + * @intervals: total elements to prepare + * + * For Type 1/Type 2, the virtual start sector is the one that was + * originally submitted by the block layer for the ref_tag usage. Due to + * partitioning, MD/DM cloning, etc. the actual physical start sector is + * likely to be different. Since the physical start sector was submitted + * to the device, we should remap it back to virtual values expected by the + * block layer. + * + * Type 3 does not have a reference tag so no remapping is required. + */ +void t10_pi_complete(struct request *rq, u8 protection_type, + unsigned int intervals) +{ + const int tuple_sz = rq->q->integrity.tuple_size; + u32 ref_tag = t10_pi_ref_tag(rq); + struct bio *bio; + + if (protection_type == T10_PI_TYPE3_PROTECTION) + return; + + __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); + u32 virt = bip_get_seed(bip) & 0xffffffff; + struct bio_vec iv; + struct bvec_iter iter; + + bip_for_each_vec(iv, bip, iter) { + void *p, *pmap; + unsigned int j; + + pmap = kmap_atomic(iv.bv_page); + p = pmap + iv.bv_offset; + for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { + struct t10_pi_tuple *pi = p; + + if (be32_to_cpu(pi->ref_tag) == ref_tag) + pi->ref_tag = cpu_to_be32(virt); + virt++; + ref_tag++; + intervals--; + p += tuple_sz; + } + + kunmap_atomic(pmap); + } + } +} +EXPORT_SYMBOL(t10_pi_complete); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 9421d987773051..bbebdc3769b06a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1119,7 +1119,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) SCpnt->cmnd[0] = WRITE_6; if (blk_integrity_rq(rq)) - sd_dif_prepare(SCpnt); + t10_pi_prepare(SCpnt->request, sdkp->protection_type); } else if (rq_data_dir(rq) == READ) { SCpnt->cmnd[0] = READ_6; @@ -2047,8 +2047,10 @@ static int sd_done(struct scsi_cmnd *SCpnt) "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); - if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt)) - sd_dif_complete(SCpnt, good_bytes); + if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) && + good_bytes) + t10_pi_complete(SCpnt->request, sdkp->protection_type, + good_bytes / scsi_prot_interval(SCpnt)); return good_bytes; } diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 392c7d078ae37e..a7d4f50b67d433 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -254,21 +254,12 @@ static inline unsigned int sd_prot_flag_mask(unsigned int prot_op) #ifdef CONFIG_BLK_DEV_INTEGRITY extern void sd_dif_config_host(struct scsi_disk *); -extern void sd_dif_prepare(struct scsi_cmnd *scmd); -extern void sd_dif_complete(struct scsi_cmnd *, unsigned int); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline void sd_dif_config_host(struct scsi_disk *disk) { } -static inline int sd_dif_prepare(struct scsi_cmnd *scmd) 
-{ - return 0; -} -static inline void sd_dif_complete(struct scsi_cmnd *cmd, unsigned int a) -{ -} #endif /* CONFIG_BLK_DEV_INTEGRITY */ diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index d8de43d359ac6d..db72c82486e379 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -95,116 +95,3 @@ void sd_dif_config_host(struct scsi_disk *sdkp) blk_integrity_register(disk, &bi); } -/* - * The virtual start sector is the one that was originally submitted - * by the block layer. Due to partitioning, MD/DM cloning, etc. the - * actual physical start sector is likely to be different. Remap - * protection information to match the physical LBA. - * - * From a protocol perspective there's a slight difference between - * Type 1 and 2. The latter uses 32-byte CDBs exclusively, and the - * reference tag is seeded in the CDB. This gives us the potential to - * avoid virt->phys remapping during write. However, at read time we - * don't know whether the virt sector is the same as when we wrote it - * (we could be reading from real disk as opposed to MD/DM device. So - * we always remap Type 2 making it identical to Type 1. - * - * Type 3 does not have a reference tag so no remapping is required. - */ -void sd_dif_prepare(struct scsi_cmnd *scmd) -{ - const int tuple_sz = sizeof(struct t10_pi_tuple); - struct bio *bio; - struct scsi_disk *sdkp; - struct t10_pi_tuple *pi; - u32 phys, virt; - - sdkp = scsi_disk(scmd->request->rq_disk); - - if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION) - return; - - phys = t10_pi_ref_tag(scmd->request); - - __rq_for_each_bio(bio, scmd->request) { - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_vec iv; - struct bvec_iter iter; - unsigned int j; - - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; - - virt = bip_get_seed(bip) & 0xffffffff; - - bip_for_each_vec(iv, bip, iter) { - pi = kmap_atomic(iv.bv_page) + iv.bv_offset; - - for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { - - if (be32_to_cpu(pi->ref_tag) == virt) - pi->ref_tag = cpu_to_be32(phys); - - virt++; - phys++; - } - - kunmap_atomic(pi); - } - - bip->bip_flags |= BIP_MAPPED_INTEGRITY; - } -} - -/* - * Remap physical sector values in the reference tag to the virtual - * values expected by the block layer. 
- */ -void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) -{ - const int tuple_sz = sizeof(struct t10_pi_tuple); - struct scsi_disk *sdkp; - struct bio *bio; - struct t10_pi_tuple *pi; - unsigned int j, intervals; - u32 phys, virt; - - sdkp = scsi_disk(scmd->request->rq_disk); - - if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION || good_bytes == 0) - return; - - intervals = good_bytes / scsi_prot_interval(scmd); - phys = t10_pi_ref_tag(scmd->request); - - __rq_for_each_bio(bio, scmd->request) { - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_vec iv; - struct bvec_iter iter; - - virt = bip_get_seed(bip) & 0xffffffff; - - bip_for_each_vec(iv, bip, iter) { - pi = kmap_atomic(iv.bv_page) + iv.bv_offset; - - for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) { - - if (intervals == 0) { - kunmap_atomic(pi); - return; - } - - if (be32_to_cpu(pi->ref_tag) == phys) - pi->ref_tag = cpu_to_be32(virt); - - virt++; - phys++; - intervals--; - } - - kunmap_atomic(pi); - } - } -} - diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index c40511f4e63d65..5a427c289f58bd 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -51,5 +51,8 @@ extern const struct blk_integrity_profile t10_pi_type1_crc; extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; +extern void t10_pi_prepare(struct request *rq, u8 protection_type); +extern void t10_pi_complete(struct request *rq, u8 protection_type, + unsigned int intervals); #endif From f7f1fc363aab4601786d373569c1ae802ea593d0 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 30 Jul 2018 00:15:33 +0300 Subject: [PATCH 135/190] nvme: use blk API to remap ref tags for IOs with metadata Also moved the logic of the remapping to the nvme core driver instead of implementing it in the nvme pci driver. This way all the other nvme transport drivers will benefit from it (in case they'll implement metadata support). Suggested-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Acked-by: Keith Busch Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 18 ++++++++++ drivers/nvme/host/nvme.h | 9 +---- drivers/nvme/host/pci.c | 75 +--------------------------------------- 3 files changed, 20 insertions(+), 82 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 16c8b86fe95d38..8f3b1ad1ee14fa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -601,6 +601,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) return BLK_STS_NOTSUPP; control |= NVME_RW_PRINFO_PRACT; + } else if (req_op(req) == REQ_OP_WRITE) { + t10_pi_prepare(req, ns->pi_type); } switch (ns->pi_type) { @@ -621,6 +623,22 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, return 0; } +void nvme_cleanup_cmd(struct request *req) +{ + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + nvme_req(req)->status == 0) { + struct nvme_ns *ns = req->rq_disk->private_data; + + t10_pi_complete(req, ns->pi_type, + blk_rq_bytes(req) >> ns->lba_shift); + } + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + kfree(page_address(req->special_vec.bv_page) + + req->special_vec.bv_offset); + } +} +EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); + blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4ad0c8ad2a2747..cf970f9543a6b3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -364,14 +364,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } -static inline void nvme_cleanup_cmd(struct request *req) -{ - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - kfree(page_address(req->special_vec.bv_page) + - req->special_vec.bv_offset); - } -} - static inline void nvme_end_request(struct request *req, __le16 status, union nvme_result result) { @@ -428,6 +420,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); +void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8dcae11bbf3ab5..0848e7143311b1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -537,73 +537,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) mempool_free(iod->sg, dev->iod_mempool); } -#ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) -{ - if (be32_to_cpu(pi->ref_tag) == v) - pi->ref_tag = cpu_to_be32(p); -} - -static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) -{ - if (be32_to_cpu(pi->ref_tag) == p) - pi->ref_tag = cpu_to_be32(v); -} - -/** - * nvme_dif_remap - remaps ref tags to bip seed and physical lba - * - * The virtual start sector is the one that was originally submitted by the - * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical - * start sector may be different. Remap protection information to match the - * physical LBA on writes, and back to the original seed on reads. - * - * Type 0 and 3 do not have a ref tag, so no remapping required. 
- */ -static void nvme_dif_remap(struct request *req, - void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) -{ - struct nvme_ns *ns = req->rq_disk->private_data; - struct bio_integrity_payload *bip; - struct t10_pi_tuple *pi; - void *p, *pmap; - u32 i, nlb, ts, phys, virt; - - if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) - return; - - bip = bio_integrity(req->bio); - if (!bip) - return; - - pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - - p = pmap; - virt = bip_get_seed(bip); - phys = nvme_block_nr(ns, blk_rq_pos(req)); - nlb = (blk_rq_bytes(req) >> ns->lba_shift); - ts = ns->disk->queue->integrity.tuple_size; - - for (i = 0; i < nlb; i++, virt++, phys++) { - pi = (struct t10_pi_tuple *)p; - dif_swap(phys, virt, pi); - p += ts; - } - kunmap_atomic(pmap); -} -#else /* CONFIG_BLK_DEV_INTEGRITY */ -static void nvme_dif_remap(struct request *req, - void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) -{ -} -static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) -{ -} -static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) -{ -} -#endif - static void nvme_print_sgl(struct scatterlist *sgl, int nents) { int i; @@ -829,9 +762,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) goto out_unmap; - if (req_op(req) == REQ_OP_WRITE) - nvme_dif_remap(req, nvme_dif_prep); - if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) goto out_unmap; } @@ -854,11 +784,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) if (iod->nents) { dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); - if (blk_integrity_rq(req)) { - if (req_op(req) == REQ_OP_READ) - nvme_dif_remap(req, nvme_dif_complete); + if (blk_integrity_rq(req)) dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); - } } nvme_cleanup_cmd(req); From 54648cf1ec2d7f4b6a71767799c45676a138ca24 Mon Sep 17 00:00:00 2001 From: xiao jin Date: Mon, 30 Jul 2018 14:11:12 +0800 Subject: [PATCH 136/190] block: blk_init_allocated_queue() set q->fq as NULL in the fail case We find the memory use-after-free issue in __blk_drain_queue() on the kernel 4.14. After read the latest kernel 4.18-rc6 we think it has the same problem. Memory is allocated for q->fq in the blk_init_allocated_queue(). If the elevator init function called with error return, it will run into the fail case to free the q->fq. Then the __blk_drain_queue() uses the same memory after the free of the q->fq, it will lead to the unpredictable event. The patch is to set q->fq as NULL in the fail case of blk_init_allocated_queue(). Fixes: commit 7c94e1c157a2 ("block: introduce blk_flush_queue to drive flush machinery") Cc: Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Signed-off-by: xiao jin Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-core.c b/block/blk-core.c index 03a4ea93a5f365..23cd1b7770e706 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1184,6 +1184,7 @@ int blk_init_allocated_queue(struct request_queue *q) q->exit_rq_fn(q, q->fq->flush_rq); out_free_flush_queue: blk_free_flush_queue(q->fq); + q->fq = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_init_allocated_queue); From 08fcf813281ebcf72c69487c1501ad91b7121cdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 31 Jul 2018 09:10:26 -0600 Subject: [PATCH 137/190] t10-pi: provide empty t10_pi_complete() for !CONFIG_BLK_DEV_INTEGRITY Fixes a link failure whtn BLK_DEV_INTEGRITY isn't defined. 
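This follows the usual kernel idiom of pairing the real declarations with static inline no-op stubs under the opposite config, so callers never need their own #ifdefs and the calls simply compile away (and link cleanly) when the feature is disabled. A minimal sketch of that idiom, using a hypothetical CONFIG_FOO and foo_prepare() rather than the actual t10-pi symbols, which appear in the hunk below:

#ifdef CONFIG_FOO
extern void foo_prepare(struct request *rq);	/* real implementation lives in foo.c */
#else
static inline void foo_prepare(struct request *rq)
{
	/* no-op stub: callers still compile and link when CONFIG_FOO=n */
}
#endif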
Fixes: 10c41ddd6132 ("block: move dif_prepare/dif_complete functions to block layer") Signed-off-by: Jens Axboe --- include/linux/t10-pi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 5a427c289f58bd..b9626aa7e90c67 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -51,8 +51,19 @@ extern const struct blk_integrity_profile t10_pi_type1_crc; extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; + +#ifdef CONFIG_BLK_DEV_INTEGRITY extern void t10_pi_prepare(struct request *rq, u8 protection_type); extern void t10_pi_complete(struct request *rq, u8 protection_type, unsigned int intervals); +#else +static inline void t10_pi_complete(struct request *rq, u8 protection_type, + unsigned int intervals) +{ +} +static inline void t10_pi_prepare(struct request *rq, u8 protection_type) +{ +} +#endif #endif From 4725549192c9633b6a3740bf23770cb758bee4a0 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 1 Aug 2018 00:13:14 +0800 Subject: [PATCH 138/190] block/bsg-lib: use PTR_ERR_OR_ZERO to simplify the flow path Simplify the code by using the PTR_ERR_OR_ZERO, instead of the open code. It is better. Reviewed-by: Johannes Thumshirn Signed-off-by: zhong jiang Signed-off-by: Jens Axboe --- block/bsg-lib.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 9419def8c01755..f3501cdaf1a654 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -48,9 +48,8 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, job->request_len = hdr->request_len; job->request = memdup_user(uptr64(hdr->request), hdr->request_len); - if (IS_ERR(job->request)) - return PTR_ERR(job->request); - return 0; + + return PTR_ERR_OR_ZERO(job->request); } static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) From 52a1199ccd426ad583ade4eb678b3b5846c58f43 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 Jul 2018 12:39:02 -0400 Subject: [PATCH 139/190] blk-iolatency: fix blkg leak in timer_fn At this point we have a ref on the blkg, we need to drop it if we don't have a iolat. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index bb59b2929e0d96..b0dc4fc64b3ec2 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -627,7 +627,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) iolat = blkg_to_lat(blkg); if (!iolat) - continue; + goto next; lat_info = &iolat->child_lat; cookie = atomic_read(&lat_info->scale_cookie); From cc7ecc258562b065ef638c4b45e20953d57fb564 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 Jul 2018 12:39:03 -0400 Subject: [PATCH 140/190] blk-cgroup: hold the queue ref during throttling The blkg lifetime is protected by the queue lifetime, so we need to put the queue _after_ we're done using the blkg. 
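The rule stated above — the reference that pins an object's lifetime may be dropped only after the last use of that object — is easy to illustrate outside the kernel. The structures and helpers below are invented and only mirror the blkg/queue relationship in shape; this is a sketch, not kernel code:

#include <assert.h>
#include <stdlib.h>

struct queue { int refs; };
struct group { int refs; struct queue *q; };    /* lifetime pinned by q */

static void queue_put(struct queue *q)
{
        if (--q->refs == 0)
                free(q);
}

static void group_put(struct group *g)
{
        /* releasing the group may still dereference its queue */
        assert(g->q->refs > 0);
        if (--g->refs == 0)
                free(g);
}

int main(void)
{
        struct queue *q = calloc(1, sizeof(*q));
        struct group *g = calloc(1, sizeof(*g));

        if (!q || !g)
                return 1;
        q->refs = 1;
        g->refs = 1;
        g->q = q;

        group_put(g);   /* finish with the pinned object first ...   */
        queue_put(q);   /* ... then drop the reference pinning it    */
        return 0;
}

Reversing the two final calls frees the queue while group_put() could still dereference it, which is exactly the class of bug the reordering above avoids.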
Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1942357d7165d2..694595b29b8fd2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1759,10 +1759,10 @@ void blkcg_maybe_throttle_current(void) if (!blkg) goto out; rcu_read_unlock(); - blk_put_queue(q); blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); + blk_put_queue(q); return; out: rcu_read_unlock(); From 2c323017e381c55c5ce2a603b8305bb18c1162cc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 Jul 2018 12:39:04 -0400 Subject: [PATCH 141/190] blk-cgroup: clear the throttle queue on fork We were hitting a panic in production where we put too many times on the request queue. This is because we'd get the throttle_queue of the parent if we fork()'ed while we needed to be throttled, but we didn't have a reference on it. Instead just clear these flags on fork so the child doesn't pay for the sins of its father. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925ca0..694ae0e56866d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -843,6 +843,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->fail_nth = 0; #endif +#ifdef CONFIG_BLK_CGROUP + tsk->throttle_queue = NULL; + tsk->use_memdelay = 0; +#endif + return tsk; free_stack: From c480bcf97b186a67ea6f0f6cab70ba430bcd5613 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Wed, 1 Aug 2018 23:15:41 -0700 Subject: [PATCH 142/190] block: make iolatency avg_lat exponentially decay Currently, avg_lat is calculated by accumulating the mean of every window in a long running cumulative average. As time goes on, the metric becomes less and less useful due to the accumulated history. This patch reuses the same calculation done in load averages to make the avg_lat metric more lively. Unlike load averages, the avg only advances when a window elapses (due to an io). Idle periods extend the most recent window. Bucketing is used to limit the history of avg_lat by binding it to the window size. So, the window range for 1/exp (decay rate) is [1 min, 2.5 min) when windows elapse immediately. The current sample window size is exposed in the debug info to enable calculation of the window range. Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Josef Bacik Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 21 +++++---- block/blk-iolatency.c | 60 ++++++++++++++++++------- 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 3afe10fa82bc69..1746131bc9cb31 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1474,11 +1474,9 @@ So the ideal way to configure this is to set io.latency in groups A, B, and C. Generally you do not want to set a value lower than the latency your device supports. Experiment to find the value that works best for your workload. Start at higher than the expected latency for your device and watch the -total_lat_avg value in io.stat for your workload group to get an idea of the -latency you see during normal operation. Use this value as a basis for your -real setting, setting at 10-15% higher than the value in io.stat. 
-Experimentation is key here because total_lat_avg is a running total, so is the -"statistics" portion of "lies, damned lies, and statistics." +avg_lat value in io.stat for your workload group to get an idea of the +latency you see during normal operation. Use the avg_lat value as a basis for +your real setting, setting at 10-15% higher than the value in io.stat. How IO Latency Throttling Works ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1522,10 +1520,15 @@ IO Latency Interface Files This is the current queue depth for the group. avg_lat - The running average IO latency for this group in microseconds. - Running average is generally flawed, but will give an - administrator a general idea of the overall latency they can - expect for their workload on the given disk. + This is an exponential moving average with a decay rate of 1/exp + bound by the sampling interval. The decay rate interval can be + calculated by multiplying the win value in io.stat by the + corresponding number of samples based on the win value. + + win + The sampling window size in milliseconds. This is the minimum + duration of time between evaluation events. Windows only elapse + with IO activity. Idle periods extend the most recent window. PID --- diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index b0dc4fc64b3ec2..19923f8a029ddf 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include "blk-rq-qos.h" @@ -126,8 +127,7 @@ struct iolatency_grp { u64 cur_win_nsec; /* total running average of our io latency. */ - u64 total_lat_avg; - u64 total_lat_nr; + u64 lat_avg; /* Our current number of IO's for the last summation. */ u64 nr_samples; @@ -135,6 +135,28 @@ struct iolatency_grp { struct child_latency_info child_lat; }; +#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC) +#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC +/* + * These are the constants used to fake the fixed-point moving average + * calculation just like load average. The call to CALC_LOAD folds + * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling + * window size is bucketed to try to approximately calculate average + * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows + * elapse immediately. Note, windows only elapse with IO activity. Idle + * periods extend the most recent window. + */ +#define BLKIOLATENCY_NR_EXP_FACTORS 5 +#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \ + (BLKIOLATENCY_NR_EXP_FACTORS - 1)) +static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = { + 2045, // exp(1/600) - 600 samples + 2039, // exp(1/240) - 240 samples + 2031, // exp(1/120) - 120 samples + 2023, // exp(1/80) - 80 samples + 2014, // exp(1/60) - 60 samples +}; + static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct iolatency_grp, pd) : NULL; @@ -462,7 +484,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) struct child_latency_info *lat_info; struct blk_rq_stat stat; unsigned long flags; - int cpu; + int cpu, exp_idx; blk_rq_stat_init(&stat); preempt_disable(); @@ -480,11 +502,17 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) lat_info = &parent->child_lat; - iolat->total_lat_avg = - div64_u64((iolat->total_lat_avg * iolat->total_lat_nr) + - stat.mean, iolat->total_lat_nr + 1); - - iolat->total_lat_nr++; + /* + * CALC_LOAD takes in a number stored in fixed point representation. 
+ * Because we are using this for IO time in ns, the values stored + * are significantly larger than the FIXED_1 denominator (2048). + * Therefore, rounding errors in the calculation are negligible and + * can be ignored. + */ + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, + div64_u64(iolat->cur_win_nsec, + BLKIOLATENCY_EXP_BUCKET_SIZE)); + CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean); /* Everything is ok and we don't need to adjust the scale. */ if (stat.mean <= iolat->min_lat_nsec && @@ -700,8 +728,9 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) u64 oldval = iolat->min_lat_nsec; iolat->min_lat_nsec = val; - iolat->cur_win_nsec = max_t(u64, val << 4, 100 * NSEC_PER_MSEC); - iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, NSEC_PER_SEC); + iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE); + iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, + BLKIOLATENCY_MAX_WIN_SIZE); if (!oldval && val) atomic_inc(&blkiolat->enabled); @@ -810,14 +839,15 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) { struct iolatency_grp *iolat = pd_to_lat(pd); - unsigned long long avg_lat = div64_u64(iolat->total_lat_avg, NSEC_PER_USEC); + unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); + unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " depth=max avg_lat=%llu", - avg_lat); + return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); - return scnprintf(buf, size, " depth=%u avg_lat=%llu", - iolat->rq_depth.max_depth, avg_lat); + return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); } From 99972f171bba19243999310154b7442198f0ab30 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 1 Aug 2018 17:30:20 -0500 Subject: [PATCH 143/190] aoe: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 114722 ("Missing break in switch") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 096882e54095b4..136dc507d0206d 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1137,6 +1137,7 @@ noskb: if (buf) break; } bvcpy(skb, f->buf->bio, f->iter, n); + /* fall through */ case ATA_CMD_PIO_WRITE: case ATA_CMD_PIO_WRITE_EXT: spin_lock_irq(&d->lock); From b233f127042dba991229e3882c6217c80492f6ef Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 30 Jul 2018 20:02:19 +0800 Subject: [PATCH 144/190] block: really disable runtime-pm for blk-mq Runtime PM isn't ready for blk-mq yet, and commit 765e40b675a9 ("block: disable runtime-pm for blk-mq") tried to disable it. Unfortunately, it can't take effect in that way since user space still can switch it on via 'echo auto > /sys/block/sdN/device/power/control'. This patch disables runtime-pm for blk-mq really by pm_runtime_disable() and fixes all kinds of PM related kernel crash. 
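Returning briefly to the iolatency avg_lat change above: the decay is the classic load-average calculation, an exponentially weighted moving average kept in fixed point, where 2048 stands for 1.0 and the per-window weight is exp(-1/N) scaled by 2048. A stand-alone sketch with invented nanosecond samples; the real code folds stat.mean in through the kernel's CALC_LOAD macro and picks the factor by bucketing the window size:

#include <stdio.h>
#include <stdint.h>

#define FIXED_1 2048u                   /* 1.0 in fixed point (1 << 11) */
#define EXP_60  2014u                   /* ~exp(-1/60) * FIXED_1        */

/* avg = (avg * weight + sample * (FIXED_1 - weight)) / FIXED_1 */
static uint64_t ewma(uint64_t avg, uint64_t weight, uint64_t sample)
{
        avg *= weight;
        avg += sample * (FIXED_1 - weight);
        return avg / FIXED_1;           /* the kernel shifts by FSHIFT */
}

int main(void)
{
        /* per-window mean latencies in nanoseconds, invented for the demo */
        uint64_t samples[] = { 500000, 480000, 2000000, 510000, 490000 };
        uint64_t avg_lat = 0;

        for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                avg_lat = ewma(avg_lat, EXP_60, samples[i]);
                printf("window %zu: mean=%llu avg_lat=%llu\n", i,
                       (unsigned long long)samples[i],
                       (unsigned long long)avg_lat);
        }
        return 0;
}

Because only elapsed windows advance the average, the single 2 ms outlier nudges avg_lat upward and then decays away rather than permanently skewing a cumulative mean, and since the samples are nanosecond-scale values the fixed-point rounding error stays negligible, as the comment in the patch notes.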
Cc: Tomas Janousek Cc: Przemek Socha Cc: Alan Stern Cc: Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Tested-by: Patrick Steinhardt Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 23cd1b7770e706..f9ad73d8573c85 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3770,9 +3770,11 @@ EXPORT_SYMBOL(blk_finish_plug); */ void blk_pm_runtime_init(struct request_queue *q, struct device *dev) { - /* not support for RQF_PM and ->rpm_status in blk-mq yet */ - if (q->mq_ops) + /* Don't enable runtime PM for blk-mq until it is ready */ + if (q->mq_ops) { + pm_runtime_disable(dev); return; + } q->dev = dev; q->rpm_status = RPM_ACTIVE; From 75d6e175fc511e95ae3eb8f708680133bc211ed3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 2 Aug 2018 18:23:26 +0800 Subject: [PATCH 145/190] blk-mq: fix updating tags depth The passed 'nr' from userspace represents the total depth, meantime inside 'struct blk_mq_tags', 'nr_tags' stores the total tag depth, and 'nr_reserved_tags' stores the reserved part. There are two issues in blk_mq_tag_update_depth() now: 1) for growing tags, we should have used the passed 'nr', and keep the number of reserved tags not changed. 2) the passed 'nr' should have been used for checking against 'tags->nr_tags', instead of number of the normal part. This patch fixes the above two cases, and avoids kernel crash caused by wrong resizing sbitmap queue. Cc: "Ewan D. Milne" Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Omar Sandoval Tested by: Marco Patalano Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 09b2ee6694fb16..c43b3398d7b417 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -399,8 +399,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth <= tags->nr_reserved_tags) return -EINVAL; - tdepth -= tags->nr_reserved_tags; - /* * If we are allowed to grow beyond the original size, allocate * a new set of tags before freeing the old one. @@ -420,7 +418,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > 16 * BLKDEV_MAX_RQ) return -EINVAL; - new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0); + new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, + tags->nr_reserved_tags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); @@ -437,7 +436,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ - sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + sbitmap_queue_resize(&tags->bitmap_tags, + tdepth - tags->nr_reserved_tags); } return 0; From 54f8a7ae7c210382a3037887a9831462741ae0db Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 31 Jul 2018 12:51:46 -0700 Subject: [PATCH 146/190] ide-cd: Drop unused sense buffers This drops unused sense buffers from: cdrom_eject() cdrom_read_capacity() cdrom_read_tocentry() ide_cd_lockdoor() ide_cd_read_toc() Acked-by: David S. 
Miller Reviewed-by: Christoph Hellwig Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/ide/ide-cd.c | 36 +++++++++++++++--------------------- drivers/ide/ide-cd.h | 2 +- drivers/ide/ide-cd_ioctl.c | 34 ++++++++++++---------------------- 3 files changed, 28 insertions(+), 44 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 5f178384876fb7..a37dd381d307d6 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -890,8 +890,7 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense) } static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, - unsigned long *sectors_per_frame, - struct request_sense *sense) + unsigned long *sectors_per_frame) { struct { __be32 lba; @@ -908,7 +907,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, memset(cmd, 0, BLK_MAX_CDB); cmd[0] = GPCMD_READ_CDVD_CAPACITY; - stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, sense, 0, + stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, NULL, 0, RQF_QUIET); if (stat) return stat; @@ -944,8 +943,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, } static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag, - int format, char *buf, int buflen, - struct request_sense *sense) + int format, char *buf, int buflen) { unsigned char cmd[BLK_MAX_CDB]; @@ -962,11 +960,11 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag, if (msf_flag) cmd[1] = 2; - return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, RQF_QUIET); + return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, NULL, 0, RQF_QUIET); } /* Try to read the entire TOC for the disk into our internal buffer. */ -int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) +int ide_cd_read_toc(ide_drive_t *drive) { int stat, ntracks, i; struct cdrom_info *info = drive->driver_data; @@ -996,14 +994,13 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) * Check to see if the existing data is still valid. If it is, * just return. 
*/ - (void) cdrom_check_status(drive, sense); + (void) cdrom_check_status(drive, NULL); if (drive->atapi_flags & IDE_AFLAG_TOC_VALID) return 0; /* try to get the total cdrom capacity and sector size */ - stat = cdrom_read_capacity(drive, &toc->capacity, §ors_per_frame, - sense); + stat = cdrom_read_capacity(drive, &toc->capacity, §ors_per_frame); if (stat) toc->capacity = 0x1fffff; @@ -1016,7 +1013,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) /* first read just the header, so we know how long the TOC is */ stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr, - sizeof(struct atapi_toc_header), sense); + sizeof(struct atapi_toc_header)); if (stat) return stat; @@ -1036,7 +1033,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) (char *)&toc->hdr, sizeof(struct atapi_toc_header) + (ntracks + 1) * - sizeof(struct atapi_toc_entry), sense); + sizeof(struct atapi_toc_entry)); if (stat && toc->hdr.first_track > 1) { /* @@ -1056,8 +1053,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) (char *)&toc->hdr, sizeof(struct atapi_toc_header) + (ntracks + 1) * - sizeof(struct atapi_toc_entry), - sense); + sizeof(struct atapi_toc_entry)); if (stat) return stat; @@ -1094,7 +1090,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) if (toc->hdr.first_track != CDROM_LEADOUT) { /* read the multisession information */ stat = cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp, - sizeof(ms_tmp), sense); + sizeof(ms_tmp)); if (stat) return stat; @@ -1108,7 +1104,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) { /* re-read multisession information using MSF format */ stat = cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp, - sizeof(ms_tmp), sense); + sizeof(ms_tmp)); if (stat) return stat; @@ -1412,7 +1408,7 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive) { unsigned long capacity, sectors_per_frame; - if (cdrom_read_capacity(drive, &capacity, §ors_per_frame, NULL)) + if (cdrom_read_capacity(drive, &capacity, §ors_per_frame)) return 0; return capacity * sectors_per_frame; @@ -1710,9 +1706,8 @@ static unsigned int idecd_check_events(struct gendisk *disk, static int idecd_revalidate_disk(struct gendisk *disk) { struct cdrom_info *info = ide_drv_g(disk, cdrom_info); - struct request_sense sense; - ide_cd_read_toc(info->drive, &sense); + ide_cd_read_toc(info->drive); return 0; } @@ -1736,7 +1731,6 @@ static int ide_cd_probe(ide_drive_t *drive) { struct cdrom_info *info; struct gendisk *g; - struct request_sense sense; ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x", drive->driver_req, drive->media); @@ -1785,7 +1779,7 @@ static int ide_cd_probe(ide_drive_t *drive) goto failed; } - ide_cd_read_toc(drive, &sense); + ide_cd_read_toc(drive); g->fops = &idecd_ops; g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; device_add_disk(&drive->gendev, g); diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h index 04f0f310a85661..fc162fbb6629c0 100644 --- a/drivers/ide/ide-cd.h +++ b/drivers/ide/ide-cd.h @@ -99,7 +99,7 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *); /* ide-cd.c functions used by ide-cd_ioctl.c */ int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *, unsigned *, struct request_sense *, int, req_flags_t); -int ide_cd_read_toc(ide_drive_t *, struct request_sense *); +int ide_cd_read_toc(ide_drive_t *); int 
ide_cdrom_get_capabilities(ide_drive_t *, u8 *); void ide_cdrom_update_speed(ide_drive_t *, u8 *); int cdrom_check_status(ide_drive_t *, struct request_sense *); diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index b1322400887ba7..14540544413c41 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -105,8 +105,7 @@ unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi, /* Eject the disk if EJECTFLAG is 0. If EJECTFLAG is 1, try to reload the disk. */ static -int cdrom_eject(ide_drive_t *drive, int ejectflag, - struct request_sense *sense) +int cdrom_eject(ide_drive_t *drive, int ejectflag) { struct cdrom_info *cd = drive->driver_data; struct cdrom_device_info *cdi = &cd->devinfo; @@ -129,20 +128,16 @@ int cdrom_eject(ide_drive_t *drive, int ejectflag, cmd[0] = GPCMD_START_STOP_UNIT; cmd[4] = loej | (ejectflag != 0); - return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, 0); + return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0); } /* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */ static -int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, - struct request_sense *sense) +int ide_cd_lockdoor(ide_drive_t *drive, int lockflag) { - struct request_sense my_sense; + struct request_sense my_sense, *sense = &my_sense; int stat; - if (sense == NULL) - sense = &my_sense; - /* If the drive cannot lock the door, just pretend. */ if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) { stat = 0; @@ -186,23 +181,22 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position) { ide_drive_t *drive = cdi->handle; - struct request_sense sense; if (position) { - int stat = ide_cd_lockdoor(drive, 0, &sense); + int stat = ide_cd_lockdoor(drive, 0); if (stat) return stat; } - return cdrom_eject(drive, !position, &sense); + return cdrom_eject(drive, !position); } int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock) { ide_drive_t *drive = cdi->handle; - return ide_cd_lockdoor(drive, lock, NULL); + return ide_cd_lockdoor(drive, lock); } /* @@ -213,7 +207,6 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed) { ide_drive_t *drive = cdi->handle; struct cdrom_info *cd = drive->driver_data; - struct request_sense sense; u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE]; int stat; unsigned char cmd[BLK_MAX_CDB]; @@ -236,7 +229,7 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed) cmd[5] = speed & 0xff; } - stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0); + stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0); if (!ide_cdrom_get_capabilities(drive, buf)) { ide_cdrom_update_speed(drive, buf); @@ -252,11 +245,10 @@ int ide_cdrom_get_last_session(struct cdrom_device_info *cdi, struct atapi_toc *toc; ide_drive_t *drive = cdi->handle; struct cdrom_info *info = drive->driver_data; - struct request_sense sense; int ret; if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) { - ret = ide_cd_read_toc(drive, &sense); + ret = ide_cd_read_toc(drive); if (ret) return ret; } @@ -300,7 +292,6 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) { ide_drive_t *drive = cdi->handle; struct cdrom_info *cd = drive->driver_data; - struct request_sense sense; struct request *rq; int ret; @@ -315,7 +306,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) * lock it again. 
*/ if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED) - (void)ide_cd_lockdoor(drive, 1, &sense); + (void)ide_cd_lockdoor(drive, 1); return ret; } @@ -355,7 +346,6 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg) struct atapi_toc_entry *first_toc, *last_toc; unsigned long lba_start, lba_end; int stat; - struct request_sense sense; unsigned char cmd[BLK_MAX_CDB]; stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc); @@ -380,7 +370,7 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg) lba_to_msf(lba_start, &cmd[3], &cmd[4], &cmd[5]); lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]); - return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0); + return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0); } static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg) @@ -391,7 +381,7 @@ static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg) int stat; /* Make sure our saved TOC is valid. */ - stat = ide_cd_read_toc(drive, NULL); + stat = ide_cd_read_toc(drive); if (stat) return stat; From 1fd89e4ddcec0c8cae15ddd19980cb944b2baf5d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 31 Jul 2018 12:51:47 -0700 Subject: [PATCH 147/190] scsi: cxlflash: Drop unused sense buffers This removes the unused sense buffer in read_cap16() and write_same16(). Reviewed-by: Christoph Hellwig Acked-by: Matthew R. Ochs Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/scsi/cxlflash/superpipe.c | 8 ++------ drivers/scsi/cxlflash/vlun.c | 7 ++----- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index e489d89cbb45dd..379890c4500b0c 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -339,7 +339,6 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli) struct scsi_sense_hdr sshdr; u8 *cmd_buf = NULL; u8 *scsi_cmd = NULL; - u8 *sense_buf = NULL; int rc = 0; int result = 0; int retry_cnt = 0; @@ -348,8 +347,7 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli) retry: cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); - sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); - if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) { + if (unlikely(!cmd_buf || !scsi_cmd)) { rc = -ENOMEM; goto out; } @@ -364,7 +362,7 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli) /* Drop the ioctl read semahpore across lengthy call */ up_read(&cfg->ioctl_rwsem); result = scsi_execute(sdev, scsi_cmd, DMA_FROM_DEVICE, cmd_buf, - CMD_BUFSIZE, sense_buf, &sshdr, to, CMD_RETRIES, + CMD_BUFSIZE, NULL, &sshdr, to, CMD_RETRIES, 0, 0, NULL); down_read(&cfg->ioctl_rwsem); rc = check_state(cfg); @@ -395,7 +393,6 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli) if (retry_cnt++ < 1) { kfree(cmd_buf); kfree(scsi_cmd); - kfree(sense_buf); goto retry; } } @@ -426,7 +423,6 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli) out: kfree(cmd_buf); kfree(scsi_cmd); - kfree(sense_buf); dev_dbg(dev, "%s: maxlba=%lld blklen=%d rc=%d\n", __func__, gli->max_lba, gli->blk_len, rc); diff --git a/drivers/scsi/cxlflash/vlun.c b/drivers/scsi/cxlflash/vlun.c index 66e445a17d6c2a..2c904bf16b650e 100644 --- a/drivers/scsi/cxlflash/vlun.c +++ b/drivers/scsi/cxlflash/vlun.c @@ -426,7 +426,6 @@ static int write_same16(struct scsi_device *sdev, { u8 *cmd_buf = NULL; u8 *scsi_cmd = NULL; - u8 *sense_buf = NULL; int rc = 0; int result = 
0; u64 offset = lba; @@ -440,8 +439,7 @@ static int write_same16(struct scsi_device *sdev, cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL); scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL); - sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); - if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) { + if (unlikely(!cmd_buf || !scsi_cmd)) { rc = -ENOMEM; goto out; } @@ -457,7 +455,7 @@ static int write_same16(struct scsi_device *sdev, /* Drop the ioctl read semahpore across lengthy call */ up_read(&cfg->ioctl_rwsem); result = scsi_execute(sdev, scsi_cmd, DMA_TO_DEVICE, cmd_buf, - CMD_BUFSIZE, sense_buf, NULL, to, + CMD_BUFSIZE, NULL, NULL, to, CMD_RETRIES, 0, 0, NULL); down_read(&cfg->ioctl_rwsem); rc = check_state(cfg); @@ -482,7 +480,6 @@ static int write_same16(struct scsi_device *sdev, out: kfree(cmd_buf); kfree(scsi_cmd); - kfree(sense_buf); dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc); return rc; } From ad80f9703a3de62c48c012af62899b754af087f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Jul 2018 12:51:48 -0700 Subject: [PATCH 148/190] scsi: build scsi_common.o for all scsi passthrough request users Split scsi_common.o out of SCSI so that non-SCSI users can pull it in easily for future sense buffer helper usage. Signed-off-by: Christoph Hellwig Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/Makefile | 2 +- drivers/scsi/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index 24cd470146578d..a6abd7a856c63d 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -76,7 +76,7 @@ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ obj-$(CONFIG_IDE) += ide/ -obj-$(CONFIG_SCSI) += scsi/ +obj-y += scsi/ obj-y += nvme/ obj-$(CONFIG_ATA) += ata/ obj-$(CONFIG_TARGET_CORE) += target/ diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index 80aca24563534b..768953881c9e98 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -21,6 +21,7 @@ CFLAGS_gdth.o = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ -DGDTH_STATISTICS obj-$(CONFIG_PCMCIA) += pcmcia/ obj-$(CONFIG_SCSI) += scsi_mod.o +obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_common.o obj-$(CONFIG_RAID_ATTRS) += raid_class.o @@ -156,7 +157,6 @@ obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o scsi_mod-y += scsi.o hosts.o scsi_ioctl.o \ scsicam.o scsi_error.o scsi_lib.o -scsi_mod-y += scsi_common.o scsi_mod-$(CONFIG_SCSI_CONSTANTS) += constants.o scsi_mod-$(CONFIG_SCSI_DMA) += scsi_lib_dma.o scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o From 8a39a0478355e9dfdd2f35038d07c4ebe3192441 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Jul 2018 12:51:49 -0700 Subject: [PATCH 149/190] target: don't depend on SCSI The core target code only needs code from scsi_common.c, which is now separately selectable. Signed-off-by: Christoph Hellwig Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/target/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index 4c44d7bed01a6c..cb6f32ce7de8f2 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig @@ -1,10 +1,10 @@ menuconfig TARGET_CORE tristate "Generic Target Core Mod (TCM) and ConfigFS Infrastructure" - depends on SCSI && BLOCK + depends on BLOCK select CONFIGFS_FS select CRC_T10DIF - select BLK_SCSI_REQUEST # only for scsi_command_size_tbl.. 
+ select BLK_SCSI_REQUEST select SGL_ALLOC default n help @@ -29,6 +29,7 @@ config TCM_FILEIO config TCM_PSCSI tristate "TCM/pSCSI Subsystem Plugin for Linux/SCSI" + depends on SCSI help Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered passthrough access to Linux/SCSI device From e7d0748dd71695b94f3a35c8bdc05226a7f3d919 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Aug 2018 15:22:13 -0600 Subject: [PATCH 150/190] block: Switch struct packet_command to use struct scsi_sense_hdr There is a lot of needless struct request_sense usage in the CDROM code. These can all be struct scsi_sense_hdr instead, to avoid any confusion over their respective structure sizes. This patch is a lot of noise changing "sense" to "sshdr", but the final code is more readable to distinguish between "sense" meaning "struct request_sense" and "sshdr" meaning "struct scsi_sense_hdr". Reviewed-by: Christoph Hellwig Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 36 ++++++++++++++++++------------------ drivers/cdrom/cdrom.c | 22 +++++++++++----------- drivers/ide/ide-cd.c | 11 ++++++----- drivers/ide/ide-cd.h | 4 ++-- drivers/ide/ide-cd_ioctl.c | 30 +++++++++++++++--------------- drivers/scsi/sr_ioctl.c | 22 +++++++++------------- include/linux/cdrom.h | 3 ++- 7 files changed, 63 insertions(+), 65 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 9bb7721c26fc2f..e285413d4a7510 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -748,13 +748,13 @@ static const char *sense_key_string(__u8 index) static void pkt_dump_sense(struct pktcdvd_device *pd, struct packet_command *cgc) { - struct request_sense *sense = cgc->sense; + struct scsi_sense_hdr *sshdr = cgc->sshdr; - if (sense) + if (sshdr) pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", CDROM_PACKET_SIZE, cgc->cmd, - sense->sense_key, sense->asc, sense->ascq, - sense_key_string(sense->sense_key)); + sshdr->sense_key, sshdr->asc, sshdr->ascq, + sense_key_string(sshdr->sense_key)); else pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); } @@ -787,11 +787,11 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, unsigned write_speed, unsigned read_speed) { struct packet_command cgc; - struct request_sense sense; + struct scsi_sense_hdr sshdr; int ret; init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.sense = &sense; + cgc.sshdr = &sshdr; cgc.cmd[0] = GPCMD_SET_SPEED; cgc.cmd[2] = (read_speed >> 8) & 0xff; cgc.cmd[3] = read_speed & 0xff; @@ -1651,7 +1651,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) { struct packet_command cgc; - struct request_sense sense; + struct scsi_sense_hdr sshdr; write_param_page *wp; char buffer[128]; int ret, size; @@ -1662,7 +1662,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) memset(buffer, 0, sizeof(buffer)); init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); - cgc.sense = &sense; + cgc.sshdr = &sshdr; ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); if (ret) { pkt_dump_sense(pd, &cgc); @@ -1678,7 +1678,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) * now get it all */ init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); - cgc.sense = &sense; + cgc.sshdr = &sshdr; ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); if (ret) { pkt_dump_sense(pd, &cgc); @@ -1916,12 +1916,12 @@ 
static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, int set) { struct packet_command cgc; - struct request_sense sense; + struct scsi_sense_hdr sshdr; unsigned char buf[64]; int ret; init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.sense = &sense; + cgc.sshdr = &sshdr; cgc.buflen = pd->mode_offset + 12; /* @@ -1962,14 +1962,14 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, unsigned *write_speed) { struct packet_command cgc; - struct request_sense sense; + struct scsi_sense_hdr sshdr; unsigned char buf[256+18]; unsigned char *cap_buf; int ret, offset; cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); - cgc.sense = &sense; + cgc.sshdr = &sshdr; ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); if (ret) { @@ -2023,13 +2023,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, unsigned *speed) { struct packet_command cgc; - struct request_sense sense; + struct scsi_sense_hdr sshdr; unsigned char buf[64]; unsigned int size, st, sp; int ret; init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); - cgc.sense = &sense; + cgc.sshdr = &sshdr; cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; cgc.cmd[1] = 2; cgc.cmd[2] = 4; /* READ ATIP */ @@ -2044,7 +2044,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, size = sizeof(buf); init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); - cgc.sense = &sense; + cgc.sshdr = &sshdr; cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; cgc.cmd[1] = 2; cgc.cmd[2] = 4; @@ -2095,13 +2095,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) { struct packet_command cgc; - struct request_sense sense; + struct scsi_sense_hdr sshdr; int ret; pkt_dbg(2, pd, "Performing OPC\n"); init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.sense = &sense; + cgc.sshdr = &sshdr; cgc.timeout = 60*HZ; cgc.cmd[0] = GPCMD_SEND_OPC; cgc.cmd[1] = 1; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index a78b8e7085e9bc..86619472d91647 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -345,10 +345,10 @@ static LIST_HEAD(cdrom_list); int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, struct packet_command *cgc) { - if (cgc->sense) { - cgc->sense->sense_key = 0x05; - cgc->sense->asc = 0x20; - cgc->sense->ascq = 0x00; + if (cgc->sshdr) { + cgc->sshdr->sense_key = 0x05; + cgc->sshdr->asc = 0x20; + cgc->sshdr->ascq = 0x00; } cgc->stat = -EIO; @@ -2943,7 +2943,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, struct packet_command *cgc, int cmd) { - struct request_sense sense; + struct scsi_sense_hdr sshdr; struct cdrom_msf msf; int blocksize = 0, format = 0, lba; int ret; @@ -2971,13 +2971,13 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, if (cgc->buffer == NULL) return -ENOMEM; - memset(&sense, 0, sizeof(sense)); - cgc->sense = &sense; + memset(&sshdr, 0, sizeof(sshdr)); + cgc->sshdr = &sshdr; cgc->data_direction = CGC_DATA_READ; ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize); - if (ret && sense.sense_key == 0x05 && - sense.asc == 0x20 && - sense.ascq == 0x00) { + if (ret && sshdr.sense_key == 0x05 && + sshdr.asc == 0x20 && + sshdr.ascq == 0x00) { /* * SCSI-II devices are not required to support * READ_CD, so let's try switching block size @@ -2986,7 +2986,7 @@ static noinline int 
mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, ret = cdrom_switch_blocksize(cdi, blocksize); if (ret) goto out; - cgc->sense = NULL; + cgc->sshdr = NULL; ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1); ret |= cdrom_switch_blocksize(cdi, blocksize); } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index a37dd381d307d6..a24cdff018657b 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -419,7 +419,7 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd) int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, int write, void *buffer, unsigned *bufflen, - struct request_sense *sense, int timeout, + struct scsi_sense_hdr *sshdr, int timeout, req_flags_t rq_flags) { struct cdrom_info *info = drive->driver_data; @@ -456,8 +456,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, if (buffer) *bufflen = scsi_req(rq)->resid_len; - if (sense) - memcpy(sense, scsi_req(rq)->sense, sizeof(*sense)); + if (sshdr) + scsi_normalize_sense(scsi_req(rq)->sense, + scsi_req(rq)->sense_len, sshdr); /* * FIXME: we should probably abort/retry or something in case of @@ -864,7 +865,7 @@ static void msf_from_bcd(struct atapi_msf *msf) msf->frame = bcd2bin(msf->frame); } -int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense) +int cdrom_check_status(ide_drive_t *drive, struct scsi_sense_hdr *sshdr) { struct cdrom_info *info = drive->driver_data; struct cdrom_device_info *cdi; @@ -886,7 +887,7 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense) */ cmd[7] = cdi->sanyo_slot % 3; - return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET); + return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sshdr, 0, RQF_QUIET); } static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h index fc162fbb6629c0..a69dc7f61c4d57 100644 --- a/drivers/ide/ide-cd.h +++ b/drivers/ide/ide-cd.h @@ -98,11 +98,11 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *); /* ide-cd.c functions used by ide-cd_ioctl.c */ int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *, - unsigned *, struct request_sense *, int, req_flags_t); + unsigned *, struct scsi_sense_hdr *, int, req_flags_t); int ide_cd_read_toc(ide_drive_t *); int ide_cdrom_get_capabilities(ide_drive_t *, u8 *); void ide_cdrom_update_speed(ide_drive_t *, u8 *); -int cdrom_check_status(ide_drive_t *, struct request_sense *); +int cdrom_check_status(ide_drive_t *, struct scsi_sense_hdr *); /* ide-cd_ioctl.c */ int ide_cdrom_open_real(struct cdrom_device_info *, int); diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 14540544413c41..4a6e1a413eadb4 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -43,14 +43,14 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr) { ide_drive_t *drive = cdi->handle; struct media_event_desc med; - struct request_sense sense; + struct scsi_sense_hdr sshdr; int stat; if (slot_nr != CDSL_CURRENT) return -EINVAL; - stat = cdrom_check_status(drive, &sense); - if (!stat || sense.sense_key == UNIT_ATTENTION) + stat = cdrom_check_status(drive, &sshdr); + if (!stat || sshdr.sense_key == UNIT_ATTENTION) return CDS_DISC_OK; if (!cdrom_get_media_event(cdi, &med)) { @@ -62,8 +62,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr) return CDS_NO_DISC; } - if (sense.sense_key == NOT_READY && sense.asc == 0x04 - && sense.ascq == 0x04) 
+ if (sshdr.sense_key == NOT_READY && sshdr.asc == 0x04 + && sshdr.ascq == 0x04) return CDS_DISC_OK; /* @@ -71,8 +71,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr) * just return TRAY_OPEN since ATAPI doesn't provide * any other way to detect this... */ - if (sense.sense_key == NOT_READY) { - if (sense.asc == 0x3a && sense.ascq == 1) + if (sshdr.sense_key == NOT_READY) { + if (sshdr.asc == 0x3a && sshdr.ascq == 1) return CDS_NO_DISC; else return CDS_TRAY_OPEN; @@ -135,7 +135,7 @@ int cdrom_eject(ide_drive_t *drive, int ejectflag) static int ide_cd_lockdoor(ide_drive_t *drive, int lockflag) { - struct request_sense my_sense, *sense = &my_sense; + struct scsi_sense_hdr sshdr; int stat; /* If the drive cannot lock the door, just pretend. */ @@ -150,14 +150,14 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag) cmd[4] = lockflag ? 1 : 0; stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, - sense, 0, 0); + &sshdr, 0, 0); } /* If we got an illegal field error, the drive probably cannot lock the door. */ if (stat != 0 && - sense->sense_key == ILLEGAL_REQUEST && - (sense->asc == 0x24 || sense->asc == 0x20)) { + sshdr.sense_key == ILLEGAL_REQUEST && + (sshdr.asc == 0x24 || sshdr.asc == 0x20)) { printk(KERN_ERR "%s: door locking not supported\n", drive->name); drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; @@ -165,7 +165,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag) } /* no medium, that's alright. */ - if (stat != 0 && sense->sense_key == NOT_READY && sense->asc == 0x3a) + if (stat != 0 && sshdr.sense_key == NOT_READY && sshdr.asc == 0x3a) stat = 0; if (stat == 0) { @@ -451,8 +451,8 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi, layer. the packet must be complete, as we do not touch it at all. */ - if (cgc->sense) - memset(cgc->sense, 0, sizeof(struct request_sense)); + if (cgc->sshdr) + memset(cgc->sshdr, 0, sizeof(*cgc->sshdr)); if (cgc->quiet) flags |= RQF_QUIET; @@ -460,7 +460,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi, cgc->stat = ide_cd_queue_pc(drive, cgc->cmd, cgc->data_direction == CGC_DATA_WRITE, cgc->buffer, &len, - cgc->sense, cgc->timeout, flags); + cgc->sshdr, cgc->timeout, flags); if (!cgc->stat) cgc->buflen -= len; return cgc->stat; diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index 35fab1e18adc34..ffcf902da3901c 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -186,14 +186,13 @@ static int sr_play_trkind(struct cdrom_device_info *cdi, int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) { struct scsi_device *SDev; - struct scsi_sense_hdr sshdr; + struct scsi_sense_hdr local_sshdr, *sshdr = &local_sshdr; int result, err = 0, retries = 0; - unsigned char sense_buffer[SCSI_SENSE_BUFFERSIZE], *senseptr = NULL; SDev = cd->device; - if (cgc->sense) - senseptr = sense_buffer; + if (cgc->sshdr) + sshdr = cgc->sshdr; retry: if (!scsi_block_when_processing_errors(SDev)) { @@ -202,15 +201,12 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) } result = scsi_execute(SDev, cgc->cmd, cgc->data_direction, - cgc->buffer, cgc->buflen, senseptr, &sshdr, + cgc->buffer, cgc->buflen, NULL, sshdr, cgc->timeout, IOCTL_RETRIES, 0, 0, NULL); - if (cgc->sense) - memcpy(cgc->sense, sense_buffer, sizeof(*cgc->sense)); - /* Minimal error checking. Ignore cases we know about, and report the rest. 
*/ if (driver_byte(result) != 0) { - switch (sshdr.sense_key) { + switch (sshdr->sense_key) { case UNIT_ATTENTION: SDev->changed = 1; if (!cgc->quiet) @@ -221,8 +217,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) err = -ENOMEDIUM; break; case NOT_READY: /* This happens if there is no disc in drive */ - if (sshdr.asc == 0x04 && - sshdr.ascq == 0x01) { + if (sshdr->asc == 0x04 && + sshdr->ascq == 0x01) { /* sense: Logical unit is in process of becoming ready */ if (!cgc->quiet) sr_printk(KERN_INFO, cd, @@ -245,8 +241,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc) break; case ILLEGAL_REQUEST: err = -EIO; - if (sshdr.asc == 0x20 && - sshdr.ascq == 0x00) + if (sshdr->asc == 0x20 && + sshdr->ascq == 0x00) /* sense: Invalid command operation code */ err = -EDRIVE_CANT_DO_THIS; break; diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index e75dfd1f1dec6e..528271c6001827 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -13,6 +13,7 @@ #include /* not really needed, later.. */ #include +#include #include struct packet_command @@ -21,7 +22,7 @@ struct packet_command unsigned char *buffer; unsigned int buflen; int stat; - struct request_sense *sense; + struct scsi_sense_hdr *sshdr; unsigned char data_direction; int quiet; int timeout; From 7a6873be1b11517a0c55ab4534e4de54b3e228a3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 31 Jul 2018 12:51:51 -0700 Subject: [PATCH 151/190] ide-cd: Remove redundant sense buffer This is already able to process the sense buffer, so remove the redundant parsing during the failure path. This also fixes any possible stale values since the prior code did not check the sense length. Acked-by: David S. Miller Reviewed-by: Christoph Hellwig Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/ide/ide-cd.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index a24cdff018657b..44a7a255ef74a5 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -423,6 +423,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, req_flags_t rq_flags) { struct cdrom_info *info = drive->driver_data; + struct scsi_sense_hdr local_sshdr; int retries = 10; bool failed; @@ -430,6 +431,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, "rq_flags: 0x%x", cmd[0], write, timeout, rq_flags); + if (!sshdr) + sshdr = &local_sshdr; + /* start of retry loop */ do { struct request *rq; @@ -456,9 +460,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, if (buffer) *bufflen = scsi_req(rq)->resid_len; - if (sshdr) - scsi_normalize_sense(scsi_req(rq)->sense, - scsi_req(rq)->sense_len, sshdr); + scsi_normalize_sense(scsi_req(rq)->sense, + scsi_req(rq)->sense_len, sshdr); /* * FIXME: we should probably abort/retry or something in case of @@ -470,12 +473,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, * The request failed. Retry if it was due to a unit * attention status (usually means media was changed). */ - struct request_sense *reqbuf = scsi_req(rq)->sense; - - if (reqbuf->sense_key == UNIT_ATTENTION) + if (sshdr->sense_key == UNIT_ATTENTION) cdrom_saw_media_change(drive); - else if (reqbuf->sense_key == NOT_READY && - reqbuf->asc == 4 && reqbuf->ascq != 4) { + else if (sshdr->sense_key == NOT_READY && + sshdr->asc == 4 && sshdr->ascq != 4) { /* * The drive is in the process of loading * a disk. 
Retry, but wait a little to give From 4e178c17cac07d58df7d31ef6fe10036cfa3883d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 31 Jul 2018 12:51:52 -0700 Subject: [PATCH 152/190] cdrom: Use struct scsi_sense_hdr internally This removes more casts of struct request_sense and uses the standard struct scsi_sense_hdr instead. This also fixes any possible stale values since the prior code did not check the sense length. Reviewed-by: Christoph Hellwig Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 2 +- drivers/cdrom/cdrom.c | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index ad9b687a236a96..d4913516823f14 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -74,12 +74,12 @@ config AMIGA_Z2RAM config CDROM tristate + select BLK_SCSI_REQUEST config GDROM tristate "SEGA Dreamcast GD-ROM drive" depends on SH_DREAMCAST select CDROM - select BLK_SCSI_REQUEST # only for the generic cdrom code help A standard SEGA Dreamcast comes with a modified CD ROM drive called a "GD-ROM" by SEGA to signify it is capable of reading special disks diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 86619472d91647..113fc6edb2b037 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -282,6 +282,7 @@ #include #include #include +#include #include /* used to tell the module to turn on full debugging messages */ @@ -2222,9 +2223,12 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, blk_execute_rq(q, cdi->disk, rq, 0); if (scsi_req(rq)->result) { - struct request_sense *s = req->sense; + struct scsi_sense_hdr sshdr; + ret = -EIO; - cdi->last_sense = s->sense_key; + scsi_normalize_sense(req->sense, req->sense_len, + &sshdr); + cdi->last_sense = sshdr.sense_key; } if (blk_rq_unmap_user(bio)) From 429296cc51c4cf145b240a78c8d68545e4d67e4c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 31 Jul 2018 12:51:53 -0700 Subject: [PATCH 153/190] libata-scsi: Move sense buffers onto stack To support future compile-time sizeof() checks that will be able to validate the length of sense buffers, this removes the only dynamically allocated sense buffers in the tree by putting the 96 byte sense buffers on the stack. 
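Swapping a per-call kzalloc()/kfree() pair for a small fixed-size automatic buffer is a mechanical change when the size is a compile-time constant, and it also makes sizeof() meaningful at the call site, which a later patch in this series relies on. A hedged userspace sketch of the shape of the change; only the 96-byte size mirrors SCSI_SENSE_BUFFERSIZE, the function names are invented:

#include <stdlib.h>
#include <string.h>

#define SENSE_BUFFERSIZE 96

static int issue_command_heap(void)
{
        unsigned char *sensebuf = calloc(1, SENSE_BUFFERSIZE);

        if (!sensebuf)
                return -1;              /* extra failure path to handle */
        /* ... issue the command, inspect sensebuf ... */
        free(sensebuf);                 /* extra cleanup on every exit path */
        return 0;
}

static int issue_command_stack(void)
{
        unsigned char sensebuf[SENSE_BUFFERSIZE];

        memset(sensebuf, 0, sizeof(sensebuf));
        /* ... issue the command, inspect sensebuf ...
         * sizeof(sensebuf) now reports the true 96 bytes, so a wrapper
         * can verify the size at compile time. */
        return 0;
}

int main(void)
{
        if (issue_command_heap())
                return 1;
        return issue_command_stack();
}

The stack variant loses both the allocation-failure path and the cleanup path, which accounts for most of the diff below.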
Reviewed-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/ata/libata-scsi.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 6a91d04351d9b6..d46863e9e30090 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -597,8 +597,9 @@ static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev, int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) { int rc = 0; + u8 sensebuf[SCSI_SENSE_BUFFERSIZE]; u8 scsi_cmd[MAX_COMMAND_SIZE]; - u8 args[4], *argbuf = NULL, *sensebuf = NULL; + u8 args[4], *argbuf = NULL; int argsize = 0; enum dma_data_direction data_dir; struct scsi_sense_hdr sshdr; @@ -610,10 +611,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) if (copy_from_user(args, arg, sizeof(args))) return -EFAULT; - sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO); - if (!sensebuf) - return -ENOMEM; - + memset(sensebuf, 0, sizeof(sensebuf)); memset(scsi_cmd, 0, sizeof(scsi_cmd)); if (args[3]) { @@ -685,7 +683,6 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) && copy_to_user(arg + sizeof(args), argbuf, argsize)) rc = -EFAULT; error: - kfree(sensebuf); kfree(argbuf); return rc; } @@ -704,8 +701,9 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) { int rc = 0; + u8 sensebuf[SCSI_SENSE_BUFFERSIZE]; u8 scsi_cmd[MAX_COMMAND_SIZE]; - u8 args[7], *sensebuf = NULL; + u8 args[7]; struct scsi_sense_hdr sshdr; int cmd_result; @@ -715,10 +713,7 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) if (copy_from_user(args, arg, sizeof(args))) return -EFAULT; - sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO); - if (!sensebuf) - return -ENOMEM; - + memset(sensebuf, 0, sizeof(sensebuf)); memset(scsi_cmd, 0, sizeof(scsi_cmd)); scsi_cmd[0] = ATA_16; scsi_cmd[1] = (3 << 1); /* Non-data */ @@ -769,7 +764,6 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) } error: - kfree(sensebuf); return rc; } From 704f83928c8e7da6e06144569efb15dec73278e8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 31 Jul 2018 12:51:54 -0700 Subject: [PATCH 154/190] scsi: Check sense buffer size at build time To avoid introducing problems like those fixed in commit f7068114d45e ("sr: pass down correctly sized SCSI sense buffer"), this creates a macro wrapper for scsi_execute() that verifies the size of the sense buffer similar to what was done for command string sizes in commit 3756f6401c30 ("exec: avoid gcc-8 warning for get_task_comm"). Another solution could be to add a length argument to scsi_execute(), but this function already takes a lot of arguments and Jens was not fond of that approach. Additionally, this moves the SCSI_SENSE_BUFFERSIZE definition into scsi_device.h, and removes a redundant include for scsi_device.h from scsi_cmnd.h. 
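The build-time check introduced next works because sizeof() applied to a true array yields the array size, while applied to a pointer it yields only the pointer width, so an under-sized array or a bare pointer fails at compile time. A userspace sketch of the same trick; MY_EXECUTE(), my_execute() and the constant are invented, and the real macro wraps __scsi_execute() with BUILD_BUG_ON() and additionally accepts a NULL sense argument:

#include <stdio.h>

#define SENSE_BUFFERSIZE 96

static int my_execute(unsigned char *sense)
{
        if (sense)
                sense[0] = 0x70;        /* pretend sense data was returned */
        return 0;
}

/* GCC/Clang statement expression, in the same style as the kernel macro:
 * refuse to compile unless the argument is an array of exactly
 * SENSE_BUFFERSIZE bytes. */
#define MY_EXECUTE(sense)                                               \
({                                                                      \
        _Static_assert(sizeof(sense) == SENSE_BUFFERSIZE,               \
                       "sense buffer has the wrong size");              \
        my_execute(sense);                                              \
})

int main(void)
{
        unsigned char good[SENSE_BUFFERSIZE];
        /* unsigned char small[64];   -- MY_EXECUTE(small) fails to build */
        /* unsigned char *ptr = good; -- sizeof(ptr) is 8 here, rejected  */

        return MY_EXECUTE(good);
}

Uncommenting either rejected case stops the build with the static-assert message, which is the failure mode the macro is after: catching a wrongly sized sense buffer before it ever reaches a device.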
Reviewed-by: Christoph Hellwig Signed-off-by: Kees Cook Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 6 +++--- include/scsi/scsi_cmnd.h | 6 ++---- include/scsi/scsi_device.h | 14 +++++++++++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 41e9ac9fc13851..9cb9a166fa0cad 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -238,7 +238,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) /** - * scsi_execute - insert request and wait for the result + * __scsi_execute - insert request and wait for the result * @sdev: scsi device * @cmd: scsi command * @data_direction: data direction @@ -255,7 +255,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) * Returns the scsi_cmnd result field if a command was executed, or a negative * Linux error code if we didn't get that far. */ -int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, +int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, unsigned char *sense, struct scsi_sense_hdr *sshdr, int timeout, int retries, u64 flags, req_flags_t rq_flags, @@ -309,7 +309,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, return ret; } -EXPORT_SYMBOL(scsi_execute); +EXPORT_SYMBOL(__scsi_execute); /* * Function: scsi_init_cmd_errh() diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index cae229b5395c8d..c891ada3c5c25c 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -15,8 +15,6 @@ struct Scsi_Host; struct scsi_driver; -#include - /* * MAX_COMMAND_SIZE is: * The longest fixed-length SCSI CDB as per the SCSI standard. @@ -121,11 +119,11 @@ struct scsi_cmnd { struct request *request; /* The command we are working on */ -#define SCSI_SENSE_BUFFERSIZE 96 unsigned char *sense_buffer; /* obtained by REQUEST SENSE when * CHECK CONDITION is received on original - * command (auto-sense) */ + * command (auto-sense). Length must be + * SCSI_SENSE_BUFFERSIZE bytes. */ /* Low-level done function - can be used by low-level driver to point * to completion function. Not used by mid/upper level code. */ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 4c36af6edd79b6..202f4d6a434212 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -17,6 +17,8 @@ struct scsi_sense_hdr; typedef __u64 __bitwise blist_flags_t; +#define SCSI_SENSE_BUFFERSIZE 96 + struct scsi_mode_data { __u32 length; __u16 block_descriptor_length; @@ -426,11 +428,21 @@ extern const char *scsi_device_state_name(enum scsi_device_state); extern int scsi_is_sdev_device(const struct device *); extern int scsi_is_target_device(const struct device *); extern void scsi_sanitize_inquiry_string(unsigned char *s, int len); -extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, +extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, unsigned char *sense, struct scsi_sense_hdr *sshdr, int timeout, int retries, u64 flags, req_flags_t rq_flags, int *resid); +/* Make sure any sense buffer is the correct size. 
*/ +#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense, \ + sshdr, timeout, retries, flags, rq_flags, resid) \ +({ \ + BUILD_BUG_ON((sense) != NULL && \ + sizeof(sense) != SCSI_SENSE_BUFFERSIZE); \ + __scsi_execute(sdev, cmd, data_direction, buffer, bufflen, \ + sense, sshdr, timeout, retries, flags, rq_flags, \ + resid); \ +}) static inline int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, From f10fe9d85dc0802b54519c917716e6f0092b4ce7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sun, 5 Aug 2018 08:25:19 -0600 Subject: [PATCH 155/190] lightnvm: remove minor version check for 2.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A minor version number increase should not break backwards compatibility. Fixes: 3cb98f84d368b ("lightnvm: add minor version to generic geometry") Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 7e4cf4eb9d6604..6fe5923c95d4aa 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -414,12 +414,6 @@ static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id, /* Set compacted version for upper layers */ geo->version = NVM_OCSSD_SPEC_20; - if (!(geo->major_ver_id == 2 && geo->minor_ver_id == 0)) { - pr_err("nvm: OCSSD version not supported (v%d.%d)\n", - geo->major_ver_id, geo->minor_ver_id); - return -EINVAL; - } - geo->num_ch = le16_to_cpu(id->num_grp); geo->num_lun = le16_to_cpu(id->num_pu); geo->all_luns = geo->num_ch * geo->num_lun; From f87c30c96cd9b5baa28bc63900f16e04e8c7cbb2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 6 Aug 2018 08:14:55 -0600 Subject: [PATCH 156/190] xen-blkfront: use true and false for boolean values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return statements in functions returning bool should use true or false instead of an integer value. This code was detected with the help of Coccinelle. Acked-by: Roger Pau Monné Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 94300dbe358b36..8986adab9bf585 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1436,7 +1436,7 @@ static bool blkif_completion(unsigned long *id, /* Wait the second response if not yet here. */ if (s2->status == REQ_WAITING) - return 0; + return false; bret->status = blkif_get_final_status(s->status, s2->status); @@ -1537,7 +1537,7 @@ static bool blkif_completion(unsigned long *id, } } - return 1; + return true; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) From 8f220c418d070a097f7d292cf6b37f88d67845ad Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 7 Aug 2018 12:43:42 +0200 Subject: [PATCH 157/190] nvme: fixup crash on failed discovery When the initial discovery fails the subsystem hasn't been setup yet in nvme_mpath_stop, and we can't dereference ctrl->subsys. 
Fixes: 0d0b660f ("nvme: add ANA support") Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index c643872f8dac08..5a9562881d4ef8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath, inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) { - return multipath && (ctrl->subsys->cmic & (1 << 3)); + return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3)); } /* From e33e5c85763e8ac1899ec382c1ebc4603d8c52ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Aug 2018 09:42:22 +0200 Subject: [PATCH 158/190] target/loop: depend on SCSI The target loopback driver is a low-level driver for the SCSI subsystem, and as such needs to depend on it. Fixes: 8a39a047 ("target: don't depend on SCSI") Signed-off-by: Christoph Hellwig Reported-by: Randy Dunlap Signed-off-by: Jens Axboe --- drivers/target/loopback/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/target/loopback/Kconfig b/drivers/target/loopback/Kconfig index abe8ecbcdf0648..158ee9d522f753 100644 --- a/drivers/target/loopback/Kconfig +++ b/drivers/target/loopback/Kconfig @@ -1,5 +1,6 @@ config LOOPBACK_TARGET tristate "TCM Virtual SAS target and Linux/SCSI LDD fabric loopback module" + depends on SCSI help Say Y here to enable the TCM Virtual SAS target and Linux/SCSI LLD fabric loopback module. From 2887e41b910bb14fd847cf01ab7a5993db989d88 Mon Sep 17 00:00:00 2001 From: Anchal Agarwal Date: Tue, 7 Aug 2018 14:40:49 -0600 Subject: [PATCH 159/190] blk-wbt: Avoid lock contention and thundering herd issue in wbt_wait I am currently running a large bare metal instance (i3.metal) on EC2 with 72 cores, 512GB of RAM and NVME drives, with a 4.18 kernel. I have a workload that simulates a database workload and I am running into lockup issues when writeback throttling is enabled,with the hung task detector also kicking in. Crash dumps show that most CPUs (up to 50 of them) are all trying to get the wbt wait queue lock while trying to add themselves to it in __wbt_wait (see stack traces below). [ 0.948118] CPU: 45 PID: 0 Comm: swapper/45 Not tainted 4.14.51-62.38.amzn1.x86_64 #1 [ 0.948119] Hardware name: Amazon EC2 i3.metal/Not Specified, BIOS 1.0 10/16/2017 [ 0.948120] task: ffff883f7878c000 task.stack: ffffc9000c69c000 [ 0.948124] RIP: 0010:native_queued_spin_lock_slowpath+0xf8/0x1a0 [ 0.948125] RSP: 0018:ffff883f7fcc3dc8 EFLAGS: 00000046 [ 0.948126] RAX: 0000000000000000 RBX: ffff887f7709ca68 RCX: ffff883f7fce2a00 [ 0.948128] RDX: 000000000000001c RSI: 0000000000740001 RDI: ffff887f7709ca68 [ 0.948129] RBP: 0000000000000002 R08: 0000000000b80000 R09: 0000000000000000 [ 0.948130] R10: ffff883f7fcc3d78 R11: 000000000de27121 R12: 0000000000000002 [ 0.948131] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 [ 0.948132] FS: 0000000000000000(0000) GS:ffff883f7fcc0000(0000) knlGS:0000000000000000 [ 0.948134] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.948135] CR2: 000000c424c77000 CR3: 0000000002010005 CR4: 00000000003606e0 [ 0.948136] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 0.948137] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 0.948138] Call Trace: [ 0.948139] [ 0.948142] do_raw_spin_lock+0xad/0xc0 [ 0.948145] _raw_spin_lock_irqsave+0x44/0x4b [ 0.948149] ? 
__wake_up_common_lock+0x53/0x90 [ 0.948150] __wake_up_common_lock+0x53/0x90 [ 0.948155] wbt_done+0x7b/0xa0 [ 0.948158] blk_mq_free_request+0xb7/0x110 [ 0.948161] __blk_mq_complete_request+0xcb/0x140 [ 0.948166] nvme_process_cq+0xce/0x1a0 [nvme] [ 0.948169] nvme_irq+0x23/0x50 [nvme] [ 0.948173] __handle_irq_event_percpu+0x46/0x300 [ 0.948176] handle_irq_event_percpu+0x20/0x50 [ 0.948179] handle_irq_event+0x34/0x60 [ 0.948181] handle_edge_irq+0x77/0x190 [ 0.948185] handle_irq+0xaf/0x120 [ 0.948188] do_IRQ+0x53/0x110 [ 0.948191] common_interrupt+0x87/0x87 [ 0.948192] .... [ 0.311136] CPU: 4 PID: 9737 Comm: run_linux_amd64 Not tainted 4.14.51-62.38.amzn1.x86_64 #1 [ 0.311137] Hardware name: Amazon EC2 i3.metal/Not Specified, BIOS 1.0 10/16/2017 [ 0.311138] task: ffff883f6e6a8000 task.stack: ffffc9000f1ec000 [ 0.311141] RIP: 0010:native_queued_spin_lock_slowpath+0xf5/0x1a0 [ 0.311142] RSP: 0018:ffffc9000f1efa28 EFLAGS: 00000046 [ 0.311144] RAX: 0000000000000000 RBX: ffff887f7709ca68 RCX: ffff883f7f722a00 [ 0.311145] RDX: 0000000000000035 RSI: 0000000000d80001 RDI: ffff887f7709ca68 [ 0.311146] RBP: 0000000000000202 R08: 0000000000140000 R09: 0000000000000000 [ 0.311147] R10: ffffc9000f1ef9d8 R11: 000000001a249fa0 R12: ffff887f7709ca68 [ 0.311148] R13: ffffc9000f1efad0 R14: 0000000000000000 R15: ffff887f7709ca00 [ 0.311149] FS: 000000c423f30090(0000) GS:ffff883f7f700000(0000) knlGS:0000000000000000 [ 0.311150] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.311151] CR2: 00007feefcea4000 CR3: 0000007f7016e001 CR4: 00000000003606e0 [ 0.311152] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 0.311153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 0.311154] Call Trace: [ 0.311157] do_raw_spin_lock+0xad/0xc0 [ 0.311160] _raw_spin_lock_irqsave+0x44/0x4b [ 0.311162] ? prepare_to_wait_exclusive+0x28/0xb0 [ 0.311164] prepare_to_wait_exclusive+0x28/0xb0 [ 0.311167] wbt_wait+0x127/0x330 [ 0.311169] ? finish_wait+0x80/0x80 [ 0.311172] ? generic_make_request+0xda/0x3b0 [ 0.311174] blk_mq_make_request+0xd6/0x7b0 [ 0.311176] ? blk_queue_enter+0x24/0x260 [ 0.311178] ? generic_make_request+0xda/0x3b0 [ 0.311181] generic_make_request+0x10c/0x3b0 [ 0.311183] ? submit_bio+0x5c/0x110 [ 0.311185] submit_bio+0x5c/0x110 [ 0.311197] ? __ext4_journal_stop+0x36/0xa0 [ext4] [ 0.311210] ext4_io_submit+0x48/0x60 [ext4] [ 0.311222] ext4_writepages+0x810/0x11f0 [ext4] [ 0.311229] ? do_writepages+0x3c/0xd0 [ 0.311239] ? ext4_mark_inode_dirty+0x260/0x260 [ext4] [ 0.311240] do_writepages+0x3c/0xd0 [ 0.311243] ? _raw_spin_unlock+0x24/0x30 [ 0.311245] ? wbc_attach_and_unlock_inode+0x165/0x280 [ 0.311248] ? __filemap_fdatawrite_range+0xa3/0xe0 [ 0.311250] __filemap_fdatawrite_range+0xa3/0xe0 [ 0.311253] file_write_and_wait_range+0x34/0x90 [ 0.311264] ext4_sync_file+0x151/0x500 [ext4] [ 0.311267] do_fsync+0x38/0x60 [ 0.311270] SyS_fsync+0xc/0x10 [ 0.311272] do_syscall_64+0x6f/0x170 [ 0.311274] entry_SYSCALL_64_after_hwframe+0x42/0xb7 In the original patch, wbt_done is waking up all the exclusive processes in the wait queue, which can cause a thundering herd if there is a large number of writer threads in the queue. 
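For reference before the analysis below: the conventional exclusive wait loop used throughout the kernel, and which the fix adopts, has roughly the following shape. This is a sketch only; wq and condition are placeholders, not the actual blk-wbt names.

	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&wq, &wait);	/* stay on the queue until done */
	do {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (condition)			/* e.g. inflight count below the limit */
			break;
		io_schedule();
	} while (1);
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&wq, &wait);		/* leave the queue only at the end */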
The original intention of the code seems to be to wake up one thread only however, it uses wake_up_all() in __wbt_done(), and then uses the following check in __wbt_wait to have only one thread actually get out of the wait loop: if (waitqueue_active(&rqw->wait) && rqw->wait.head.next != &wait->entry) return false; The problem with this is that the wait entry in wbt_wait is define with DEFINE_WAIT, which uses the autoremove wakeup function. That means that the above check is invalid - the wait entry will have been removed from the queue already by the time we hit the check in the loop. Secondly, auto-removing the wait entries also means that the wait queue essentially gets reordered "randomly" (e.g. threads re-add themselves in the order they got to run after being woken up). Additionally, new requests entering wbt_wait might overtake requests that were queued earlier, because the wait queue will be (temporarily) empty after the wake_up_all, so the waitqueue_active check will not stop them. This can cause certain threads to starve under high load. The fix is to leave the woken up requests in the queue and remove them in finish_wait() once the current thread breaks out of the wait loop in __wbt_wait. This will ensure new requests always end up at the back of the queue, and they won't overtake requests that are already in the wait queue. With that change, the loop in wbt_wait is also in line with many other wait loops in the kernel. Waking up just one thread drastically reduces lock contention, as does moving the wait queue add/remove out of the loop. A significant drop in lockdep's lock contention numbers is seen when running the test application on the patched kernel. Signed-off-by: Anchal Agarwal Signed-off-by: Frank van der Linden Signed-off-by: Jens Axboe --- block/blk-wbt.c | 55 +++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 461a9af11efe00..1d94a20374fcbf 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -166,7 +166,7 @@ static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) int diff = limit - inflight; if (!inflight || diff >= rwb->wb_background / 2) - wake_up_all(&rqw->wait); + wake_up(&rqw->wait); } } @@ -474,30 +474,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) return limit; } -static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, - wait_queue_entry_t *wait, unsigned long rw) -{ - /* - * inc it here even if disabled, since we'll dec it at completion. - * this only happens if the task was sleeping in __wbt_wait(), - * and someone turned it off at the same time. - */ - if (!rwb_enabled(rwb)) { - atomic_inc(&rqw->inflight); - return true; - } - - /* - * If the waitqueue is already active and we are not the next - * in line to be woken up, wait for our turn. - */ - if (waitqueue_active(&rqw->wait) && - rqw->wait.head.next != &wait->entry) - return false; - - return rq_wait_inc_below(rqw, get_limit(rwb, rw)); -} - /* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. @@ -508,16 +484,32 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, __acquires(lock) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); - DEFINE_WAIT(wait); + DECLARE_WAITQUEUE(wait, current); + + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __wbt_wait(), + * and someone turned it off at the same time. 
+ */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rqw->inflight); + return; + } - if (may_queue(rwb, rqw, &wait, rw)) + if (!waitqueue_active(&rqw->wait) + && rq_wait_inc_below(rqw, get_limit(rwb, rw))) return; + add_wait_queue_exclusive(&rqw->wait, &wait); do { - prepare_to_wait_exclusive(&rqw->wait, &wait, - TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!rwb_enabled(rwb)) { + atomic_inc(&rqw->inflight); + break; + } - if (may_queue(rwb, rqw, &wait, rw)) + if (rq_wait_inc_below(rqw, get_limit(rwb, rw))) break; if (lock) { @@ -528,7 +520,8 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, io_schedule(); } while (1); - finish_wait(&rqw->wait, &wait); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&rqw->wait, &wait); } static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) From 9b4f43460dd0ee461f5dd65ee1efa152f2e52559 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 7 Aug 2018 16:17:28 -0700 Subject: [PATCH 160/190] cfq: Annotate fall-through in a switch statement This patch avoids that gcc complains about fall-through when building with W=1. Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 82b6c27b324551..ec6acdd58b7a65 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3666,6 +3666,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + /* fall through */ case IOPRIO_CLASS_NONE: /* * no prio set, inherit CPU scheduling settings From f7ecb1b109da1006a08d5675debe60990e824432 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 7 Aug 2018 16:17:29 -0700 Subject: [PATCH 161/190] cfq: Suppress compiler warnings about comparisons This patch does not change any functionality but avoids that gcc reports the following warnings when building with W=1: block/cfq-iosched.c: In function ?cfq_back_seek_max_store?: block/cfq-iosched.c:4741:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/cfq-iosched.c:4756:1: note: in expansion of macro ?STORE_FUNCTION? STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); ^~~~~~~~~~~~~~ block/cfq-iosched.c: In function ?cfq_slice_idle_store?: block/cfq-iosched.c:4741:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/cfq-iosched.c:4759:1: note: in expansion of macro ?STORE_FUNCTION? STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); ^~~~~~~~~~~~~~ block/cfq-iosched.c: In function ?cfq_group_idle_store?: block/cfq-iosched.c:4741:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/cfq-iosched.c:4760:1: note: in expansion of macro ?STORE_FUNCTION? STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); ^~~~~~~~~~~~~~ block/cfq-iosched.c: In function ?cfq_low_latency_store?: block/cfq-iosched.c:4741:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/cfq-iosched.c:4765:1: note: in expansion of macro ?STORE_FUNCTION? 
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); ^~~~~~~~~~~~~~ block/cfq-iosched.c: In function ?cfq_slice_idle_us_store?: block/cfq-iosched.c:4775:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/cfq-iosched.c:4782:1: note: in expansion of macro ?USEC_STORE_FUNCTION? USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX); ^~~~~~~~~~~~~~~~~~~ block/cfq-iosched.c: In function ?cfq_group_idle_us_store?: block/cfq-iosched.c:4775:13: warning: comparison of unsigned expression < 0 is always false [-Wtype-limits] if (__data < (MIN)) \ ^ block/cfq-iosched.c:4783:1: note: in expansion of macro ?USEC_STORE_FUNCTION? USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX); ^~~~~~~~~~~~~~~~~~~ Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ec6acdd58b7a65..2eb87444b15727 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4736,12 +4736,13 @@ USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency); static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct cfq_data *cfqd = e->elevator_data; \ - unsigned int __data; \ + unsigned int __data, __min = (MIN), __max = (MAX); \ + \ cfq_var_store(&__data, (page)); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ + if (__data < __min) \ + __data = __min; \ + else if (__data > __max) \ + __data = __max; \ if (__CONV) \ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ @@ -4770,12 +4771,13 @@ STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct cfq_data *cfqd = e->elevator_data; \ - unsigned int __data; \ + unsigned int __data, __min = (MIN), __max = (MAX); \ + \ cfq_var_store(&__data, (page)); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ + if (__data < __min) \ + __data = __min; \ + else if (__data > __max) \ + __data = __max; \ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ return count; \ } From 8b92d0e3d400390660a26ef7f475524700fb86cf Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 8 Aug 2018 08:35:29 +0200 Subject: [PATCH 162/190] nvme.h: fixup ANA group descriptor format ANA Phase 3 draft had the 'reserved' field in the group descriptor format set to '23:17' (so that the first namespace identifier started at byte 24), but that got move with the approved TP to '31:17' (so that the first namespace identifier started at byte 32). Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 64c9175723de70..a661861e9d56f1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -446,7 +446,7 @@ struct nvme_ana_group_desc { __le32 nnsids; __le64 chgcnt; __u8 state; - __u8 rsvd17[7]; + __u8 rsvd17[15]; __le32 nsids[]; }; From 93045d5942da60801e71764597d448cf37a798c1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 7 Aug 2018 23:01:05 -0700 Subject: [PATCH 163/190] nvme.h: add support for ns write protect definitions Add various definitions from NVMe 1.3 TP 4005. 
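As a quick, self-contained illustration of the write protect state values this patch defines (the enum mirrors the new definitions; wp_state_name() and the printed strings are illustrative, not kernel code):

	#include <stdio.h>

	enum {
		NVME_NS_NO_WRITE_PROTECT = 0,
		NVME_NS_WRITE_PROTECT,
		NVME_NS_WRITE_PROTECT_POWER_CYCLE,
		NVME_NS_WRITE_PROTECT_PERMANENT,
	};

	static const char *wp_state_name(unsigned int state)
	{
		switch (state) {
		case NVME_NS_NO_WRITE_PROTECT:		return "no write protect";
		case NVME_NS_WRITE_PROTECT:		return "write protect";
		case NVME_NS_WRITE_PROTECT_POWER_CYCLE:	return "write protect until power cycle";
		case NVME_NS_WRITE_PROTECT_PERMANENT:	return "permanent write protect";
		default:				return "reserved";
		}
	}

	int main(void)
	{
		printf("%s\n", wp_state_name(NVME_NS_WRITE_PROTECT));
		return 0;
	}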
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a661861e9d56f1..68e91ef5494c11 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -259,7 +259,7 @@ struct nvme_id_ctrl { __le16 awun; __le16 awupf; __u8 nvscc; - __u8 rsvd531; + __u8 nwpc; __le16 acwu; __u8 rsvd534[2]; __le32 sgls; @@ -320,7 +320,9 @@ struct nvme_id_ns { __u8 nvmcap[16]; __u8 rsvd64[28]; __le32 anagrpid; - __u8 rsvd96[8]; + __u8 rsvd96[3]; + __u8 nsattr; + __u8 rsvd100[4]; __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; @@ -794,6 +796,7 @@ enum { NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, NVME_FEAT_RESV_PERSIST = 0x83, + NVME_FEAT_WRITE_PROTECT = 0x84, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, @@ -807,6 +810,14 @@ enum { NVME_FWACT_ACTV = (2 << 3), }; +/* NVMe Namespace Write Protect State */ +enum { + NVME_NS_NO_WRITE_PROTECT = 0, + NVME_NS_WRITE_PROTECT, + NVME_NS_WRITE_PROTECT_POWER_CYCLE, + NVME_NS_WRITE_PROTECT_PERMANENT, +}; + #define NVME_MAX_CHANGED_NAMESPACES 1024 struct nvme_identify { @@ -1153,6 +1164,8 @@ enum { NVME_SC_SGL_INVALID_OFFSET = 0x16, NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + NVME_SC_NS_WRITE_PROTECTED = 0x20, + NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, From 1293477f4f324f9cf23a36f6cc0adc6801f1baac Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 7 Aug 2018 23:01:06 -0700 Subject: [PATCH 164/190] nvme: set gendisk read only based on nsattr NVMe 1.3 TP 4005 introduces new filed (NSATTR). This field indicates whether given namespace is write protected or not. This patch sets the gendisk associated with the namespace to read only based on the identify namespace nsattr field. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 603fe59756fbfd..dd8ec1dd921909 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1484,6 +1484,12 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity(disk, capacity); nvme_config_discard(ns); + + if (id->nsattr & (1 << 0)) + set_disk_ro(disk, true); + else + set_disk_ro(disk, false); + blk_mq_unfreeze_queue(disk->queue); } From dedf0be544614b6d9d395e78d72cc8c30d03e440 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 7 Aug 2018 23:01:07 -0700 Subject: [PATCH 165/190] nvmet: add ns write protect support This patch implements the Namespace Write Protect feature described in "NVMe TP 4005a Namespace Write Protect". In this version, we implement No Write Protect and Write Protect states for target ns which can be toggled by set-features commands from the host side. For write-protect state transition, we need to flush the ns specified as a part of command so we also add helpers for carrying out synchronous flush operations. 
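The I/O gating added by this patch (reads and flushes still pass on a write-protected namespace, everything else is rejected) boils down to the following standalone sketch; io_allowed() is a made-up helper, and the opcode values follow the NVMe command set (0x00 flush, 0x01 write, 0x02 read):

	#include <stdbool.h>
	#include <stdio.h>

	enum { OPC_FLUSH = 0x00, OPC_WRITE = 0x01, OPC_READ = 0x02 };

	static bool io_allowed(bool ns_readonly, unsigned char opcode)
	{
		if (!ns_readonly)
			return true;
		/* write-protected namespace: allow only reads and flushes */
		return opcode == OPC_READ || opcode == OPC_FLUSH;
	}

	int main(void)
	{
		printf("write on read-only ns: %d\n", io_allowed(true, OPC_WRITE));	/* 0 */
		printf("read  on read-only ns: %d\n", io_allowed(true, OPC_READ));	/* 1 */
		return 0;
	}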
Signed-off-by: Chaitanya Kulkarni [hch: fixed an incorrect endianess conversion, minor cleanups] Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 76 +++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 20 +++++++- drivers/nvme/target/io-cmd-bdev.c | 7 +++ drivers/nvme/target/io-cmd-file.c | 12 +++-- drivers/nvme/target/nvmet.h | 4 ++ 5 files changed, 114 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index f517bc562d264c..a21caea1e0806a 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -372,6 +372,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->psd[0].entry_lat = cpu_to_le32(0x10); id->psd[0].exit_lat = cpu_to_le32(0x4); + id->nwpc = 1 << 0; /* write protect and no write protect */ + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); kfree(id); @@ -433,6 +435,8 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) id->lbaf[0].ds = ns->blksize_shift; + if (ns->readonly) + id->nsattr |= (1 << 0); nvmet_put_namespace(ns); done: status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); @@ -545,6 +549,52 @@ static void nvmet_execute_abort(struct nvmet_req *req) nvmet_req_complete(req, 0); } +static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req) +{ + u16 status; + + if (req->ns->file) + status = nvmet_file_flush(req); + else + status = nvmet_bdev_flush(req); + + if (status) + pr_err("write protect flush failed nsid: %u\n", req->ns->nsid); + return status; +} + +static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) +{ + u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]); + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE; + + req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid); + if (unlikely(!req->ns)) + return status; + + mutex_lock(&subsys->lock); + switch (write_protect) { + case NVME_NS_WRITE_PROTECT: + req->ns->readonly = true; + status = nvmet_write_protect_flush_sync(req); + if (status) + req->ns->readonly = false; + break; + case NVME_NS_NO_WRITE_PROTECT: + req->ns->readonly = false; + status = 0; + break; + default: + break; + } + + if (!status) + nvmet_ns_changed(subsys, req->ns->nsid); + mutex_unlock(&subsys->lock); + return status; +} + static void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; @@ -575,6 +625,9 @@ static void nvmet_execute_set_features(struct nvmet_req *req) case NVME_FEAT_HOST_ID: status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; break; + case NVME_FEAT_WRITE_PROTECT: + status = nvmet_set_feat_write_protect(req); + break; default: status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; @@ -583,6 +636,26 @@ static void nvmet_execute_set_features(struct nvmet_req *req) nvmet_req_complete(req, status); } +static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 result; + + req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid); + if (!req->ns) + return NVME_SC_INVALID_NS | NVME_SC_DNR; + + mutex_lock(&subsys->lock); + if (req->ns->readonly == true) + result = NVME_NS_WRITE_PROTECT; + else + result = NVME_NS_NO_WRITE_PROTECT; + nvmet_set_result(req, result); + mutex_unlock(&subsys->lock); + + return 0; +} + static void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; @@ -634,6 +707,9 @@ static void nvmet_execute_get_features(struct 
nvmet_req *req) status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid, sizeof(req->sq->ctrl->hostid)); break; + case NVME_FEAT_WRITE_PROTECT: + status = nvmet_get_feat_write_protect(req); + break; default: status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 3ceb7a03bb2ae7..14b4c4916a8e5f 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -180,7 +180,7 @@ static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid) mutex_unlock(&ctrl->lock); } -static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) +void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) { struct nvmet_ctrl *ctrl; @@ -609,6 +609,21 @@ static inline u16 nvmet_check_ana_state(struct nvmet_port *port, return 0; } +static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req) +{ + if (unlikely(req->ns->readonly)) { + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_flush: + break; + default: + return NVME_SC_NS_WRITE_PROTECTED; + } + } + + return 0; +} + static u16 nvmet_parse_io_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -622,6 +637,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (unlikely(!req->ns)) return NVME_SC_INVALID_NS | NVME_SC_DNR; ret = nvmet_check_ana_state(req->port, req->ns); + if (unlikely(ret)) + return ret; + ret = nvmet_io_cmd_check_access(req); if (unlikely(ret)) return ret; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index e0b0f7df70c2e0..7bc9f624043296 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -124,6 +124,13 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) submit_bio(bio); } +u16 nvmet_bdev_flush(struct nvmet_req *req) +{ + if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL, NULL)) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns, struct nvme_dsm_range *range, struct bio **bio) { diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index c2d0d08b59c8e1..81a9dc5290a874 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -211,14 +211,18 @@ static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req) queue_work(buffered_io_wq, &req->f.work); } +u16 nvmet_file_flush(struct nvmet_req *req) +{ + if (vfs_fsync(req->ns->file, 1) < 0) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + static void nvmet_file_flush_work(struct work_struct *w) { struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); - int ret; - - ret = vfs_fsync(req->ns->file, 1); - nvmet_req_complete(req, ret < 0 ? 
NVME_SC_INTERNAL | NVME_SC_DNR : 0); + nvmet_req_complete(req, nvmet_file_flush(req)); } static void nvmet_file_execute_flush(struct nvmet_req *req) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 22941045f46ecb..ec9af4ee03b603 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -58,6 +58,7 @@ struct nvmet_ns { struct percpu_ref ref; struct block_device *bdev; struct file *file; + bool readonly; u32 nsid; u32 blksize_shift; loff_t size; @@ -429,6 +430,9 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns); int nvmet_file_ns_enable(struct nvmet_ns *ns); void nvmet_bdev_ns_disable(struct nvmet_ns *ns); void nvmet_file_ns_disable(struct nvmet_ns *ns); +u16 nvmet_bdev_flush(struct nvmet_req *req); +u16 nvmet_file_flush(struct nvmet_req *req); +void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid); static inline u32 nvmet_rw_len(struct nvmet_req *req) { From 66414e80245e1e73222f67ee711951c7f4bdedab Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Tue, 7 Aug 2018 23:42:39 +0300 Subject: [PATCH 166/190] nvme-fabrics: fix ctrl_loss_tmo < 0 to reconnect forever When the user supplies a ctrl_loss_tmo < 0, we warn them that this will cause the fabrics layer to attempt reconnection forever. However, in reality the fabrics layer never attempts to reconnect because the condition to test whether we should reconnect is backwards in this case. Signed-off-by: Tal Shorer Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 903eb4545e2699..3c6cd0f81ba684 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) { - if (ctrl->opts->max_reconnects != -1 && + if (ctrl->opts->max_reconnects == -1 || ctrl->nr_reconnects < ctrl->opts->max_reconnects) return true; From d5fcc4e46e5168e6a6ce1e350ad0714a9d880c8e Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 8 Aug 2018 22:58:33 +0800 Subject: [PATCH 167/190] drivers/block/mtip32xx: remove the null check for debugfs_remove_recursive debugfs_remove_recursive has taken null pointer into account. So it is safe to drop the null check before calling the function. Signed-off-by: zhong jiang Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c73626decb46e9..db253cd5b32af2 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2575,8 +2575,7 @@ static int mtip_hw_debugfs_init(struct driver_data *dd) static void mtip_hw_debugfs_exit(struct driver_data *dd) { - if (dd->dfs_node) - debugfs_remove_recursive(dd->dfs_node); + debugfs_remove_recursive(dd->dfs_node); } /* From 69daf897d75b31ff90031bb0a49a8d65cedfe3ca Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 8 Aug 2018 23:00:35 +0800 Subject: [PATCH 168/190] drivers/block/aoe/aoedev: NULL check is not needed for mempool_destroy mempool_destroy has taken the null pointer into account. So it is safe to remove the null check. 
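The cleanups in this and the neighbouring patches all rely on the same convention, sketched here with made-up names: the destroy helper itself treats NULL as a no-op, so call sites do not need a guard.

	#include <stdlib.h>

	struct pool { void *data; };

	/* NULL-tolerant teardown, in the same spirit as mempool_destroy(),
	 * kmem_cache_destroy() and debugfs_remove_recursive(). */
	static void pool_destroy(struct pool *p)
	{
		if (!p)
			return;
		free(p->data);
		free(p);
	}

	int main(void)
	{
		struct pool *none = NULL;

		pool_destroy(none);	/* safe without an "if (none)" check */
		return 0;
	}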
Signed-off-by: zhong jiang Signed-off-by: Jens Axboe --- drivers/block/aoe/aoedev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 697f735b07a435..41060e9cedf20c 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -284,8 +284,8 @@ freedev(struct aoedev *d) e = t + d->ntargets; for (; t < e && *t; t++) freetgt(d, *t); - if (d->bufpool) - mempool_destroy(d->bufpool); + + mempool_destroy(d->bufpool); skbpoolfree(d); minor_free(d->sysminor); From a12fc00b237c25519cc861b56dd4ca41bbec4ed3 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 8 Aug 2018 23:22:47 +0800 Subject: [PATCH 169/190] drivers/block/drbd: remove the null check for kmem_cache_destroy kmem_cache_destroy has taken null pointer into account. So it is safe to drop the null check before calling the function. Signed-off-by: zhong jiang Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a80809bd305715..ef8212a4b73ef5 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2103,14 +2103,10 @@ static void drbd_destroy_mempools(void) mempool_exit(&drbd_md_io_page_pool); mempool_exit(&drbd_ee_mempool); mempool_exit(&drbd_request_mempool); - if (drbd_ee_cache) - kmem_cache_destroy(drbd_ee_cache); - if (drbd_request_cache) - kmem_cache_destroy(drbd_request_cache); - if (drbd_bm_ext_cache) - kmem_cache_destroy(drbd_bm_ext_cache); - if (drbd_al_ext_cache) - kmem_cache_destroy(drbd_al_ext_cache); + kmem_cache_destroy(drbd_ee_cache); + kmem_cache_destroy(drbd_request_cache); + kmem_cache_destroy(drbd_bm_ext_cache); + kmem_cache_destroy(drbd_al_ext_cache); drbd_ee_cache = NULL; drbd_request_cache = NULL; From 78ac2107176baa0daf65b0fb8e561d2ed14c83ca Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 9 Aug 2018 15:48:42 +0800 Subject: [PATCH 170/190] bcache: do not check return value of debugfs_create_dir() Greg KH suggests that normal code should not care about debugfs. Therefore no matter successful or failed of debugfs_create_dir() execution, it is unncessary to check its return value. There are two functions called debugfs_create_dir() and check the return value, which are bch_debug_init() and closure_debug_init(). This patch changes these two functions from int to void type, and ignore return values of debugfs_create_dir(). This patch does not fix exact bug, just makes things work as they should. 
Signed-off-by: Coly Li Suggested-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org Cc: Kai Krakow Cc: Kent Overstreet Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/closure.c | 13 +++++++++---- drivers/md/bcache/closure.h | 4 ++-- drivers/md/bcache/debug.c | 11 ++++++----- drivers/md/bcache/super.c | 4 +++- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 872ef4d677115f..0a3e82b0876d88 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1001,7 +1001,7 @@ void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); void bch_debug_exit(void); -int bch_debug_init(struct kobject *); +void bch_debug_init(struct kobject *kobj); void bch_request_exit(void); int bch_request_init(void); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 0e14969182c6e2..618253683d409e 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -199,11 +199,16 @@ static const struct file_operations debug_ops = { .release = single_release }; -int __init closure_debug_init(void) +void __init closure_debug_init(void) { - closure_debug = debugfs_create_file("closures", - 0400, bcache_debug, NULL, &debug_ops); - return IS_ERR_OR_NULL(closure_debug); + if (!IS_ERR_OR_NULL(bcache_debug)) + /* + * it is unnecessary to check return value of + * debugfs_create_file(), we should not care + * about this. + */ + closure_debug = debugfs_create_file( + "closures", 0400, bcache_debug, NULL, &debug_ops); } #endif diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 71427eb5fdaeb1..7c2c5bc7c88b12 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl) #ifdef CONFIG_BCACHE_CLOSURES_DEBUG -int closure_debug_init(void); +void closure_debug_init(void); void closure_debug_create(struct closure *cl); void closure_debug_destroy(struct closure *cl); #else -static inline int closure_debug_init(void) { return 0; } +static inline void closure_debug_init(void) {} static inline void closure_debug_create(struct closure *cl) {} static inline void closure_debug_destroy(struct closure *cl) {} diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 04d1467119500e..12034c07257b87 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -252,11 +252,12 @@ void bch_debug_exit(void) debugfs_remove_recursive(bcache_debug); } -int __init bch_debug_init(struct kobject *kobj) +void __init bch_debug_init(struct kobject *kobj) { - if (!IS_ENABLED(CONFIG_DEBUG_FS)) - return 0; - + /* + * it is unnecessary to check return value of + * debugfs_create_file(), we should not care + * about this. 
+ */ bcache_debug = debugfs_create_dir("bcache", NULL); - return IS_ERR_OR_NULL(bcache_debug); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e0a92104ca2318..c7ffa6ef3f8276 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2345,10 +2345,12 @@ static int __init bcache_init(void) goto err; if (bch_request_init() || - bch_debug_init(bcache_kobj) || closure_debug_init() || sysfs_create_files(bcache_kobj, files)) goto err; + bch_debug_init(bcache_kobj); + closure_debug_init(); + return 0; err: bcache_exit(); From b4cb6efc1af7da2fa1e9ff0eaf90e2be02cfdf5f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 9 Aug 2018 15:48:43 +0800 Subject: [PATCH 171/190] bcache: display rate debug parameters to 0 when writeback is not running When writeback is not running, writeback rate should be 0, other value is misleading. And the following dyanmic writeback rate debug parameters should be 0 too, rate, proportional, integral, change otherwise they are misleading when writeback is not running. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 225b15aa034054..3e9d3459a224c9 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -149,6 +149,7 @@ SHOW(__bch_cached_dev) struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + int wb = dc->writeback_running; #define var(stat) (dc->stat) @@ -170,7 +171,7 @@ SHOW(__bch_cached_dev) var_printf(writeback_running, "%i"); var_print(writeback_delay); var_print(writeback_percent); - sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); + sysfs_hprint(writeback_rate, wb ? dc->writeback_rate.rate << 9 : 0); sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); sysfs_printf(io_error_limit, "%i", dc->error_limit); sysfs_printf(io_disable, "%i", dc->io_disable); @@ -188,15 +189,20 @@ SHOW(__bch_cached_dev) char change[20]; s64 next_io; - bch_hprint(rate, dc->writeback_rate.rate << 9); - bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); - bch_hprint(target, dc->writeback_rate_target << 9); - bch_hprint(proportional,dc->writeback_rate_proportional << 9); - bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); - bch_hprint(change, dc->writeback_rate_change << 9); - - next_io = div64_s64(dc->writeback_rate.next - local_clock(), - NSEC_PER_MSEC); + /* + * Except for dirty and target, other values should + * be 0 if writeback is not running. + */ + bch_hprint(rate, wb ? dc->writeback_rate.rate << 9 : 0); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); + bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional, + wb ? dc->writeback_rate_proportional << 9 : 0); + bch_hprint(integral, + wb ? dc->writeback_rate_integral_scaled << 9 : 0); + bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0); + next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(), + NSEC_PER_MSEC) : 0; return sprintf(buf, "rate:\t\t%s/sec\n" From c2e8dcf7fa43ed236cb341c33aa3a75687557c43 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 9 Aug 2018 15:48:44 +0800 Subject: [PATCH 172/190] bcache: avoid unncessary cache prefetch bch_btree_node_get() In bch_btree_node_get() the read-in btree node will be partially prefetched into L1 cache for following bset iteration (if there is). 
But if the btree node read is failed, the perfetch operations will waste L1 cache space. This patch checkes whether read operation and only does cache prefetch when read I/O succeeded. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 475008fbbaab6a..c19f7716df8800 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1011,6 +1011,13 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, BUG_ON(b->level != level); } + if (btree_node_io_error(b)) { + rw_unlock(write, b); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); + b->parent = parent; b->accessed = 1; @@ -1022,13 +1029,6 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, for (; i <= b->keys.nsets; i++) prefetch(b->keys.set[i].data); - if (btree_node_io_error(b)) { - rw_unlock(write, b); - return ERR_PTR(-EIO); - } - - BUG_ON(!b->written); - return b; } From e57fd746848597e9c701751e26416d4b2e3b27ea Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 9 Aug 2018 15:48:45 +0800 Subject: [PATCH 173/190] bcache: add a comment in super.c This patch adds a line of code comment in super.c:register_bdev(), to make code to be more comprehensible. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c7ffa6ef3f8276..1e85cbb4c159f5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1291,6 +1291,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, pr_info("registered backing device %s", dc->backing_dev_name); list_add(&dc->list, &uncached_devices); + /* attach to a matched cache set if it exists */ list_for_each_entry(c, &bch_cache_sets, list) bch_cached_dev_attach(dc, c, NULL); From cb329dec11822f84f9d7309766a1f3a35d3d182a Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 9 Aug 2018 15:48:46 +0800 Subject: [PATCH 174/190] bcache: fix mistaken code comments in bcache.h This patch updates the code comment in struct cache with correct array names, to make the code to be more comprehensible. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 0a3e82b0876d88..b393b3fd06b695 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -423,9 +423,9 @@ struct cache { /* * When allocating new buckets, prio_write() gets first dibs - since we * may not be allocate at all without writing priorities and gens. - * prio_buckets[] contains the last buckets we wrote priorities to (so - * gc can mark them as metadata), prio_next[] contains the buckets - * allocated for the next prio write. + * prio_last_buckets[] contains the last buckets we wrote priorities to + * (so gc can mark them as metadata), prio_buckets[] contains the + * buckets allocated for the next prio write. */ uint64_t *prio_buckets; uint64_t *prio_last_buckets; From 0cba2e71111e4bd156e67086de3484c7b907a4fc Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 9 Aug 2018 15:48:47 +0800 Subject: [PATCH 175/190] bcache: fix mistaken comments in request.c This patch updates code comment in bch_keylist_realloc() by fixing incorrected function names, to make the code to be more comprehennsible. 
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 43af905920f545..914d501ad1e07d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -107,7 +107,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s, /* * The journalling code doesn't handle the case where the keys to insert * is bigger than an empty write: If we just return -ENOMEM here, - * bio_insert() and bio_invalidate() will insert the keys created so far + * bch_data_insert_keys() will insert the keys created so far * and finish the rest when the keylist is empty. */ if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) From b467a6ac0b4bf57ec8c2329212e8a8a0231a2ef2 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 9 Aug 2018 15:48:48 +0800 Subject: [PATCH 176/190] bcache: add code comments for bset.c This patch tries to add code comments in bset.c, to make some tricky code and designment to be more comprehensible. Most information of this patch comes from the discussion between Kent and I, he offers very informative details. If there is any mistake of the idea behind the code, no doubt that's from me misrepresentation. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 63 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f3403b45bc28a8..596c93b44e9b37 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -366,6 +366,10 @@ EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ +/* + * return array index next to j when does in-order traverse + * of a binary tree which is stored in a linear array + */ static unsigned inorder_next(unsigned j, unsigned size) { if (j * 2 + 1 < size) { @@ -379,6 +383,10 @@ static unsigned inorder_next(unsigned j, unsigned size) return j; } +/* + * return array index previous to j when does in-order traverse + * of a binary tree which is stored in a linear array + */ static unsigned inorder_prev(unsigned j, unsigned size) { if (j * 2 < size) { @@ -421,6 +429,10 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) return j; } +/* + * Return the cacheline index in bset_tree->data, where j is index + * from a linear array which stores the auxiliar binary tree + */ static unsigned to_inorder(unsigned j, struct bset_tree *t) { return __to_inorder(j, t->size, t->extra); @@ -441,6 +453,10 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) return j; } +/* + * Return an index from a linear array which stores the auxiliar binary + * tree, j is the cacheline index of t->data. + */ static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) { return __inorder_to_tree(j, t->size, t->extra); @@ -546,6 +562,20 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) return low; } +/* + * Calculate mantissa value for struct bkey_float. + * If most significant bit of f->exponent is not set, then + * - f->exponent >> 6 is 0 + * - p[0] points to bkey->low + * - p[-1] borrows bits from KEY_INODE() of bkey->high + * if most isgnificant bits of f->exponent is set, then + * - f->exponent >> 6 is 1 + * - p[0] points to bits from KEY_INODE() of bkey->high + * - p[-1] points to other bits from KEY_INODE() of + * bkey->high too. 
+ * See make_bfloat() to check when most significant bit of f->exponent + * is set or not. + */ static inline unsigned bfloat_mantissa(const struct bkey *k, struct bkey_float *f) { @@ -570,6 +600,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j) BUG_ON(m < l || m > r); BUG_ON(bkey_next(p) != m); + /* + * If l and r have different KEY_INODE values (different backing + * device), f->exponent records how many least significant bits + * are different in KEY_INODE values and sets most significant + * bits to 1 (by +64). + * If l and r have same KEY_INODE value, f->exponent records + * how many different bits in least significant bits of bkey->low. + * See bfloat_mantiss() how the most significant bit of + * f->exponent is used to calculate bfloat mantissa value. + */ if (KEY_INODE(l) != KEY_INODE(r)) f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; else @@ -633,6 +673,15 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) } EXPORT_SYMBOL(bch_bset_init_next); +/* + * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to + * accelerate bkey search in a btree node (pointed by bset_tree->data in + * memory). After search in the auxiliar tree by calling bset_search_tree(), + * a struct bset_search_iter is returned which indicates range [l, r] from + * bset_tree->data where the searching bkey might be inside. Then a followed + * linear comparison does the exact search, see __bch_bset_search() for how + * the auxiliary tree is used. + */ void bch_bset_build_written_tree(struct btree_keys *b) { struct bset_tree *t = bset_tree_last(b); @@ -898,6 +947,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, unsigned inorder, j, n = 1; do { + /* + * A bit trick here. + * If p < t->size, (int)(p - t->size) is a minus value and + * the most significant bit is set, right shifting 31 bits + * gets 1. If p >= t->size, the most significant bit is + * not set, right shifting 31 bits gets 0. + * So the following 2 lines equals to + * if (p >= t->size) + * p = 0; + * but a branch instruction is avoided. + */ unsigned p = n << 4; p &= ((int) (p - t->size)) >> 31; @@ -907,6 +967,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, f = &t->tree[j]; /* + * Similar bit trick, use subtract operation to avoid a branch + * instruction. + * * n = (f->mantissa > bfloat_mantissa()) * ? j * 2 * : j * 2 + 1; From ea8c5356d39048bc94bae068228f51ddbecc6b89 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 9 Aug 2018 15:48:49 +0800 Subject: [PATCH 177/190] bcache: set max writeback rate when I/O request is idle Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle") allows the writeback rate to be faster if there is no I/O request on a bcache device. It works well if there is only one bcache device attached to the cache set. If there are many bcache devices attached to a cache set, it may introduce performance regression because multiple faster writeback threads of the idle bcache devices will compete the btree level locks with the bcache device who have I/O requests coming. This patch fixes the above issue by only permitting fast writebac when all bcache devices attached on the cache set are idle. And if one of the bcache devices has new I/O request coming, minimized all writeback throughput immediately and let PI controller __update_writeback_rate() to decide the upcoming writeback rate for each bcache device. 
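A userspace sketch of the idle heuristic just described (names and output are illustrative; the kernel code uses c->idle_counter and c->attached_dev_nr): every per-device rate-update tick bumps a shared idle counter, any incoming I/O resets it, and the maximum writeback rate is only allowed once the counter reaches about six ticks per attached device.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_int idle_counter;
	static atomic_int attached_dev_nr = 2;	/* pretend two backing devices */

	static void on_io_request(void)
	{
		atomic_store(&idle_counter, 0);	/* any I/O cancels "idle" */
	}

	static bool allow_max_writeback_rate(void)
	{
		/* called once per device per writeback_rate_update period */
		return atomic_fetch_add(&idle_counter, 1) + 1 >=
		       atomic_load(&attached_dev_nr) * 6;
	}

	int main(void)
	{
		for (int i = 0; i < 12; i++)
			printf("tick %2d -> max rate? %d\n", i, allow_max_writeback_rate());
		on_io_request();
		printf("after new I/O -> max rate? %d\n", allow_max_writeback_rate());
		return 0;
	}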
Also when all bcache devices are idle, limited wrieback rate to a small number is wast of thoughput, especially when backing devices are slower non-rotation devices (e.g. SATA SSD). This patch sets a max writeback rate for each backing device if the whole cache set is idle. A faster writeback rate in idle time means new I/Os may have more available space for dirty data, and people may observe a better write performance then. Please note bcache may change its cache mode in run time, and this patch still works if the cache mode is switched from writeback mode and there is still dirty data on cache. Fixes: Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle") Cc: stable@vger.kernel.org #4.16+ Signed-off-by: Coly Li Tested-by: Kai Krakow Tested-by: Stefan Priebe Cc: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 10 ++-- drivers/md/bcache/request.c | 59 ++++++++++++++++++++++- drivers/md/bcache/super.c | 4 ++ drivers/md/bcache/sysfs.c | 15 ++++-- drivers/md/bcache/util.c | 2 +- drivers/md/bcache/util.h | 2 +- drivers/md/bcache/writeback.c | 91 +++++++++++++++++++++++------------ 7 files changed, 138 insertions(+), 45 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b393b3fd06b695..05f82ff6f016eb 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -328,13 +328,6 @@ struct cached_dev { */ atomic_t has_dirty; - /* - * Set to zero by things that touch the backing volume-- except - * writeback. Incremented by writeback. Used to determine when to - * accelerate idle writeback. - */ - atomic_t backing_idle; - struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; @@ -515,6 +508,8 @@ struct cache_set { struct cache_accounting accounting; unsigned long flags; + atomic_t idle_counter; + atomic_t at_max_writeback_rate; struct cache_sb sb; @@ -524,6 +519,7 @@ struct cache_set { struct bcache_device **devices; unsigned devices_max_used; + atomic_t attached_dev_nr; struct list_head cached_devs; uint64_t cached_dev_sectors; atomic_long_t flash_dev_dirty_sectors; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 914d501ad1e07d..7dbe8b6316a003 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1103,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) generic_make_request(bio); } +static void quit_max_writeback_rate(struct cache_set *c, + struct cached_dev *this_dc) +{ + int i; + struct bcache_device *d; + struct cached_dev *dc; + + /* + * mutex bch_register_lock may compete with other parallel requesters, + * or attach/detach operations on other backing device. Waiting to + * the mutex lock may increase I/O request latency for seconds or more. + * To avoid such situation, if mutext_trylock() failed, only writeback + * rate of current cached device is set to 1, and __update_write_back() + * will decide writeback rate of other cached devices (remember now + * c->idle_counter is 0 already). + */ + if (mutex_trylock(&bch_register_lock)) { + for (i = 0; i < c->devices_max_used; i++) { + if (!c->devices[i]) + continue; + + if (UUID_FLASH_ONLY(&c->uuids[i])) + continue; + + d = c->devices[i]; + dc = container_of(d, struct cached_dev, disk); + /* + * set writeback rate to default minimum value, + * then let update_writeback_rate() to decide the + * upcoming rate. 
+ */ + atomic_long_set(&dc->writeback_rate.rate, 1); + } + mutex_unlock(&bch_register_lock); + } else + atomic_long_set(&this_dc->writeback_rate.rate, 1); +} + /* Cached devices - read & write stuff */ static blk_qc_t cached_dev_make_request(struct request_queue *q, @@ -1120,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, return BLK_QC_T_NONE; } - atomic_set(&dc->backing_idle, 0); - generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0); + if (likely(d->c)) { + if (atomic_read(&d->c->idle_counter)) + atomic_set(&d->c->idle_counter, 0); + /* + * If at_max_writeback_rate of cache set is true and new I/O + * comes, quit max writeback rate of all cached devices + * attached to this cache set, and set at_max_writeback_rate + * to false. + */ + if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) { + atomic_set(&d->c->at_max_writeback_rate, 0); + quit_max_writeback_rate(d->c, dc); + } + } + + generic_start_io_acct(q, + bio_op(bio), + bio_sectors(bio), + &d->disk->part0); bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1e85cbb4c159f5..55a37641aa95a5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -696,6 +696,8 @@ static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); + atomic_dec(&d->c->attached_dev_nr); + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { struct uuid_entry *u = d->c->uuids + d->id; @@ -1144,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, bch_cached_dev_run(dc); bcache_device_link(&dc->disk, c, "bdev"); + atomic_inc(&c->attached_dev_nr); /* Allow the writeback thread to proceed */ up_write(&dc->writeback_lock); @@ -1696,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); c->devices_max_used = 0; + atomic_set(&c->attached_dev_nr, 0); c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 3e9d3459a224c9..6e88142514fb2f 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -171,7 +171,8 @@ SHOW(__bch_cached_dev) var_printf(writeback_running, "%i"); var_print(writeback_delay); var_print(writeback_percent); - sysfs_hprint(writeback_rate, wb ? dc->writeback_rate.rate << 9 : 0); + sysfs_hprint(writeback_rate, + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); sysfs_printf(io_error_limit, "%i", dc->error_limit); sysfs_printf(io_disable, "%i", dc->io_disable); @@ -193,7 +194,9 @@ SHOW(__bch_cached_dev) * Except for dirty and target, other values should * be 0 if writeback is not running. */ - bch_hprint(rate, wb ? dc->writeback_rate.rate << 9 : 0); + bch_hprint(rate, + wb ? 
atomic_long_read(&dc->writeback_rate.rate) << 9 + : 0); bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); bch_hprint(proportional, @@ -261,8 +264,12 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, INT_MAX); + if (attr == &sysfs_writeback_rate) { + int v; + + sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX); + atomic_long_set(&dc->writeback_rate.rate, v); + } sysfs_strtoul_clamp(writeback_rate_update_seconds, dc->writeback_rate_update_seconds, diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index fc479b026d6d86..b15256bcf0e756 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); - d->next += div_u64(done * NSEC_PER_SEC, d->rate); + d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate)); /* Bound the time. Don't let us fall further than 2 seconds behind * (this prevents unnecessary backlog that would make it impossible diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cced87f8eb278f..f7b0133c9d2f1a 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -442,7 +442,7 @@ struct bch_ratelimit { * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - uint32_t rate; + atomic_long_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 912e969fedbacb..481d4cf38ac0e9 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc) dc->writeback_rate_proportional = proportional_scaled; dc->writeback_rate_integral_scaled = integral_scaled; - dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; - dc->writeback_rate.rate = new_rate; + dc->writeback_rate_change = new_rate - + atomic_long_read(&dc->writeback_rate.rate); + atomic_long_set(&dc->writeback_rate.rate, new_rate); dc->writeback_rate_target = target; } +static bool set_at_max_writeback_rate(struct cache_set *c, + struct cached_dev *dc) +{ + /* + * Idle_counter is increased every time update_writeback_rate() is + * called. If all backing devices attached to the same cache set have + * identical dc->writeback_rate_update_seconds values, it is about 6 + * rounds of update_writeback_rate() on each backing device before + * c->at_max_writeback_rate is set to 1, and then the max writeback rate is set + * to each dc->writeback_rate.rate. + * In order to avoid the extra locking cost of counting the exact number + * of dirty cached devices, c->attached_dev_nr is used to calculate the idle + * threshold. It might be bigger if not all cached devices are in write- + * back mode, but it still works well with limited extra rounds of + * update_writeback_rate(). 
+ */ + if (atomic_inc_return(&c->idle_counter) < + atomic_read(&c->attached_dev_nr) * 6) + return false; + + if (atomic_read(&c->at_max_writeback_rate) != 1) + atomic_set(&c->at_max_writeback_rate, 1); + + atomic_long_set(&dc->writeback_rate.rate, INT_MAX); + + /* keep writeback_rate_target as existing value */ + dc->writeback_rate_proportional = 0; + dc->writeback_rate_integral_scaled = 0; + dc->writeback_rate_change = 0; + + /* + * Check c->idle_counter and c->at_max_writeback_rate again in case + * new I/O arrives before set_at_max_writeback_rate() returns. + * Then the writeback rate is set to 1, and its new value should be + * decided via __update_writeback_rate(). + */ + if ((atomic_read(&c->idle_counter) < + atomic_read(&c->attached_dev_nr) * 6) || + !atomic_read(&c->at_max_writeback_rate)) + return false; + + return true; +} + static void update_writeback_rate(struct work_struct *work) { struct cached_dev *dc = container_of(to_delayed_work(work), @@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work) return; } - down_read(&dc->writeback_lock); - - if (atomic_read(&dc->has_dirty) && - dc->writeback_percent) - __update_writeback_rate(dc); + if (atomic_read(&dc->has_dirty) && dc->writeback_percent) { + /* + * If the whole cache set is idle, set_at_max_writeback_rate() + * will set the writeback rate to a max number. Then it is + * unnecessary to update the writeback rate for an idle cache set + * that is already at the maximum writeback rate. + */ + if (!set_at_max_writeback_rate(c, dc)) { + down_read(&dc->writeback_lock); + __update_writeback_rate(dc); + up_read(&dc->writeback_lock); + } + } - up_read(&dc->writeback_lock); /* * CACHE_SET_IO_DISABLE might be set via sysfs interface, @@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc) delay = writeback_delay(dc, size); - /* If the control system would wait for at least half a - * second, and there's been no reqs hitting the backing disk - * for awhile: use an alternate mode where we have at most - * one contiguous set of writebacks in flight at a time. If - * someone wants to do IO it will be quick, as it will only - * have to contend with one operation in flight, and we'll - * be round-tripping data to the backing disk as quickly as - * it can accept it. - */ - if (delay >= HZ / 2) { - /* 3 means at least 1.5 seconds, up to 7.5 if we - * have slowed way down. - */ - if (atomic_inc_return(&dc->backing_idle) >= 3) { - /* Wait for current I/Os to finish */ - closure_sync(&cl); - /* And immediately launch a new set. */ - delay = 0; - } - } - while (!kthread_should_stop() && !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && delay) { @@ -741,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_running = true; dc->writeback_percent = 10; dc->writeback_delay = 30; - dc->writeback_rate.rate = 1024; + atomic_long_set(&dc->writeback_rate.rate, 1024); dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; From e921efeb07048a6f5daa540e5f04f19ec9360da2 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 9 Aug 2018 15:48:50 +0800 Subject: [PATCH 178/190] bcache: make the pr_err statement used for ENOENT only in sysfs_attach section The pr_err statement in the code for the sysfs_attach section would run for various error codes, which may be confusing. 
E.g., run the command twice: echo 796b5c05-b03c-4bc7-9cbd-a8df5e8be891 > \ /sys/block/bcache0/bcache/attach [the backing dev got attached on the first run] echo 796b5c05-b03c-4bc7-9cbd-a8df5e8be891 > \ /sys/block/bcache0/bcache/attach In dmesg, after the command has run twice, we get: bcache: bch_cached_dev_attach() Can't attach sda6: already attached bcache: __cached_dev_store() Can't attach 796b5c05-b03c-4bc7-9cbd-\ a8df5e8be891 : cache set not found The first statement in the message was right, but the second was confusing. bch_cached_dev_attach has various pr_ statements for various error codes, except ENOENT. After the change, rerun the above command twice: echo 796b5c05-b03c-4bc7-9cbd-a8df5e8be891 > \ /sys/block/bcache0/bcache/attach echo 796b5c05-b03c-4bc7-9cbd-a8df5e8be891 > \ /sys/block/bcache0/bcache/attach In dmesg we only get: bcache: bch_cached_dev_attach() Can't attach sda6: already attached No confusing "cache set not found" message anymore. And for a nonexistent SET-UUID: echo 796b5c05-b03c-4bc7-9cbd-a8df5e8be898 > \ /sys/block/bcache0/bcache/attach In dmesg we get: bcache: __cached_dev_store() Can't attach 796b5c05-b03c-4bc7-9cbd-\ a8df5e8be898 : cache set not found Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 6e88142514fb2f..22f8565d2bf1a1 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -351,8 +351,8 @@ STORE(__cached_dev) if (!v) return size; } - - pr_err("Can't attach %s: cache set not found", buf); + if (v == -ENOENT) + pr_err("Can't attach %s: cache set not found", buf); return v; } From cbb751c060fe61140e3f23dc7cd95190bba4c89e Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 9 Aug 2018 15:48:51 +0800 Subject: [PATCH 179/190] bcache: trivial - remove trailing backslash in macro BTREE_FLAG Remove the trailing backslash in macro BTREE_FLAG in btree.h Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d211e2c25b6bce..68e9d926134da9 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b) \ { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ \ static inline void set_btree_node_ ## flag(struct btree *b) \ -{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } enum btree_flags { BTREE_NODE_io_error, From d6c02a9beb67f13d5f14f23e72fa9981e8b84477 Mon Sep 17 00:00:00 2001 From: Greg Edwards Date: Wed, 8 Aug 2018 13:27:53 -0600 Subject: [PATCH 180/190] block: bvec_nr_vecs() returns value for wrong slab In commit ed996a52c868 ("block: simplify and cleanup bvec pool handling"), the value of the slab index is incremented by one in bvec_alloc() after the allocation is done to indicate that an index value of 0 does not need to be freed later. bvec_nr_vecs() was not updated accordingly, and thus returns the wrong value. Decrement idx before performing the lookup. 
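To make the off-by-one concrete, here is a minimal standalone sketch of the pattern involved (the demo_* names are invented for illustration and are not kernel code): the allocation side hands out the slab index biased by one so that 0 can mean "nothing to free", so any table lookup keyed on that value has to subtract one again, which is what the bvec_slabs[--idx] change in the diff below does.

#include <assert.h>
#include <stdio.h>

struct demo_slab { unsigned int nr_vecs; };

/* Table indexed by the real slab index. */
static const struct demo_slab demo_slabs[] = {
	{ 1 }, { 4 }, { 16 }, { 64 }, { 128 }, { 256 },
};

/* Allocation side: hands out the index biased by one; 0 means "no slab". */
static unsigned short demo_alloc_idx(unsigned int real_idx)
{
	return (unsigned short)(real_idx + 1);
}

/* Lookup side: must undo the bias before indexing the table. */
static unsigned int demo_nr_vecs(unsigned short idx)
{
	return demo_slabs[--idx].nr_vecs;
}

int main(void)
{
	unsigned short idx = demo_alloc_idx(2);	/* slab 2 holds 16 vecs */

	/* Without the decrement this would wrongly report 64. */
	assert(demo_nr_vecs(idx) == 16);
	printf("nr_vecs = %u\n", demo_nr_vecs(idx));
	return 0;
}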
Fixes: ed996a52c868 ("block: simplify and cleanup bvec pool handling") Signed-off-by: Greg Edwards Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index b832151cd0bf42..04969b392c72d6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -158,7 +158,7 @@ static void bio_put_slab(struct bio_set *bs) unsigned int bvec_nr_vecs(unsigned short idx) { - return bvec_slabs[idx].nr_vecs; + return bvec_slabs[--idx].nr_vecs; } void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) From d263ed9926823c462f99a7679e18f0c9e5b8550d Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Thu, 9 Aug 2018 08:34:17 -0600 Subject: [PATCH 181/190] blk-mq: count the hctx as active before allocating tag Currently, we count the hctx as active after allocating a driver tag successfully. If a previously inactive hctx tries to get a tag for the first time, it may fail and need to wait. However, due to the stale tag ->active_queues, the other shared-tag users are still able to occupy all driver tags while someone is waiting for a tag. Consequently, even if the previously inactive hctx is woken up, it still may not be able to get a tag and could be starved. To fix it, count the hctx as active before trying to allocate a driver tag; then, while it is waiting for a tag, the other shared-tag users will reserve budget for it. Reviewed-by: Ming Lei Signed-off-by: Jianchao Wang Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 3 +++ block/blk-mq.c | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c43b3398d7b417..c0c4e63583ae03 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -23,6 +23,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) /* * If a previously inactive queue goes active, bump the active user count. + * We need to do this before trying to allocate a driver tag; then, even if + * we fail to get a tag the first time, the other shared-tag users can + * reserve budget for it. 
*/ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq.c b/block/blk-mq.c index e13bdc2707ce25..5efd789910e25c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -285,7 +285,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->tag = -1; rq->internal_tag = tag; } else { - if (blk_mq_tag_busy(data->hctx)) { + if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } @@ -367,6 +367,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, if (!op_is_flush(op) && e->type->ops.mq.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.mq.limit_depth(op, data); + } else { + blk_mq_tag_busy(data->hctx); } tag = blk_mq_get_tag(data); @@ -971,6 +973,7 @@ bool blk_mq_get_driver_tag(struct request *rq) .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), .flags = BLK_MQ_REQ_NOWAIT, }; + bool shared; if (rq->tag != -1) goto done; @@ -978,9 +981,10 @@ bool blk_mq_get_driver_tag(struct request *rq) if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) data.flags |= BLK_MQ_REQ_RESERVED; + shared = blk_mq_tag_busy(data.hctx); rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { - if (blk_mq_tag_busy(data.hctx)) { + if (shared) { rq->rq_flags |= RQF_MQ_INFLIGHT; atomic_inc(&data.hctx->nr_active); } From b1f4267cc5448d20ae0c515a74141e74365e78a3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Aug 2018 07:47:28 -0700 Subject: [PATCH 182/190] block: Remove two superfluous #include directives Commit 12f5b9314545 ("blk-mq: Remove generation seqeunce") removed the only seqcount_t and u64_stats_sync instances from but did not remove the corresponding #include directives. Since these include directives are no longer needed, remove them. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Keith Busch Cc: Ming Lei Cc: Jianchao Wang Cc: Hannes Reinecke , Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 050d599f5ea972..d6869e0e2b64c2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,8 +27,6 @@ #include #include #include -#include -#include struct module; struct scsi_ioctl_command; From 6bad9b210a228d2fe0e0efe26d9b115348529cee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Aug 2018 07:53:36 -0700 Subject: [PATCH 183/190] blkcg: Introduce blkg_root_lookup() This new function will be used in a later patch to verify whether a queue has been dissociated from the cgroup controller before being released. Signed-off-by: Bart Van Assche Cc: Tejun Heo Cc: Christoph Hellwig Cc: Ming Lei Cc: Omar Sandoval Cc: Johannes Thumshirn Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Joseph Qi Cc: Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index f7b910768306e4..1361cfc9b878b8 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -341,6 +341,23 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, return __blkg_lookup(blkcg, q, false); } +/** + * blkg_lookup - look up blkg for the specified request queue + * @q: request_queue of interest + * + * Lookup blkg for @q at the root level. See also blkg_lookup(). 
+ */ +static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) +{ + struct blkcg_gq *blkg; + + rcu_read_lock(); + blkg = blkg_lookup(&blkcg_root, q); + rcu_read_unlock(); + + return blkg; +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -864,6 +881,7 @@ static inline bool blk_cgroup_congested(void) { return false; } static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) { return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } static inline void blkcg_exit_queue(struct request_queue *q) { } From 4cf6324b17e96b7b7ab4021c6929500934d46750 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Aug 2018 07:53:37 -0700 Subject: [PATCH 184/190] block: Introduce blk_exit_queue() This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Cc: Christoph Hellwig Cc: Ming Lei Cc: Omar Sandoval Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Joseph Qi Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 54 +++++++++++++++++++++++++++--------------------- block/blk.h | 1 + 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f9ad73d8573c85..49af34bf2119cd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -715,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_set_queue_dying); +/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ +void blk_exit_queue(struct request_queue *q) +{ + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + q->elevator = NULL; + } + + /* + * Remove all references to @q from the block cgroup controller before + * restoring @q->queue_lock to avoid that restoring this pointer causes + * e.g. blkcg_print_blkgs() to crash. + */ + blkcg_exit_queue(q); + + /* + * Since the cgroup code may dereference the @q->backing_dev_info + * pointer, only decrease its reference count after having removed the + * association with the block cgroup controller. + */ + bdi_put(q->backing_dev_info); +} + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -784,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q) */ WARN_ON_ONCE(q->kobj.state_in_sysfs); - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q, q->elevator); - q->elevator = NULL; - } - - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. 
- */ - bdi_put(q->backing_dev_info); + blk_exit_queue(q); if (q->mq_ops) blk_mq_free_queue(q); diff --git a/block/blk.h b/block/blk.h index 69b14cd2bb225f..d4d67e94892042 100644 --- a/block/blk.h +++ b/block/blk.h @@ -130,6 +130,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); void blk_exit_rl(struct request_queue *q, struct request_list *rl); +void blk_exit_queue(struct request_queue *q); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); From 24ecc3585348b616993a3c4d6dc2c6b8007e358c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Aug 2018 07:53:38 -0700 Subject: [PATCH 185/190] block: Ensure that a request queue is dissociated from the cgroup controller Several block drivers call alloc_disk() followed by put_disk() if something fails before device_add_disk() is called without calling blk_cleanup_queue(). Make sure that also for this scenario a request queue is dissociated from the cgroup controller. This patch avoids that loading the parport_pc, paride and pf drivers triggers the following kernel crash: BUG: KASAN: null-ptr-deref in pi_init+0x42e/0x580 [paride] Read of size 4 at addr 0000000000000008 by task modprobe/744 Call Trace: dump_stack+0x9a/0xeb kasan_report+0x139/0x350 pi_init+0x42e/0x580 [paride] pf_init+0x2bb/0x1000 [pf] do_one_initcall+0x8e/0x405 do_init_module+0xd9/0x2f2 load_module+0x3ab4/0x4700 SYSC_finit_module+0x176/0x1a0 do_syscall_64+0xee/0x2b0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Reported-by: Alexandru Moise <00moses.alexander00@gmail.com> Fixes: a063057d7c73 ("block: Fix a race between request queue removal and the block cgroup controller") # v4.17 Signed-off-by: Bart Van Assche Tested-by: Alexandru Moise <00moses.alexander00@gmail.com> Reviewed-by: Johannes Thumshirn Cc: Tejun Heo Cc: Christoph Hellwig Cc: Ming Lei Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Joseph Qi Cc: Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 49c29a5d06bb98..380bc284ced17f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -802,6 +802,21 @@ static void __blk_release_queue(struct work_struct *work) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); + if (!blk_queue_dead(q)) { + /* + * Last reference was dropped without having called + * blk_cleanup_queue(). + */ + WARN_ONCE(blk_queue_init_done(q), + "request queue %p has been registered but blk_cleanup_queue() has not been called for that queue\n", + q); + blk_exit_queue(q); + } + + WARN(blkg_root_lookup(q), + "request queue %p is being released but it has not yet been removed from the blkcg controller\n", + q); + blk_free_queue_stats(q->stats); blk_exit_rl(q, &q->root_rl); From 0a1c749dee4c52465d5580d77e0f8aaa9215c357 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 9 Aug 2018 10:54:46 -0500 Subject: [PATCH 186/190] block: paride: pd: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 1056543 ("Missing break in switch") Addresses-Coverity-ID: 1056544 ("Missing break in switch") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Jens Axboe --- drivers/block/paride/pd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 8961b190e2566c..7cf947586fe46b 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -426,6 +426,7 @@ static void run_fsm(void) pd_claimed = 1; if (!pi_schedule_claimed(pi_current, run_fsm)) return; + /* fall through */ case 1: pd_claimed = 2; pi_current->proto->connect(pi_current); @@ -445,6 +446,7 @@ static void run_fsm(void) spin_unlock_irqrestore(&pd_lock, saved_flags); if (stop) return; + /* fall through */ case Hold: schedule_fsm(); return; From 991f61fe7e1db3f74b72a3a5cbe6b012804eb0ee Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 10 Aug 2018 01:47:02 +0800 Subject: [PATCH 187/190] Blk-throttle: reduce tail io latency when iops limit is enforced When an application's iops has exceeded its cgroup's iops limit, surely it is throttled and kernel will set a timer for dispatching, thus IO latency includes the delay. However, the dispatch delay which is calculated by the limit and the elapsed jiffies is suboptimal. As the dispatch delay is only calculated once the application's iops is (iops limit + 1), it doesn't need to wait any longer than the remaining time of the current slice. The difference can be proved by the following fio job and cgroup iops setting, ----- $ echo 4 > /mnt/config/nullb/disk1/mbps # limit nullb's bandwidth to 4MB/s for testing. $ echo "253:1 riops=100 rbps=max" > /sys/fs/cgroup/unified/cg1/io.max $ cat r2.job [global] name=fio-rand-read filename=/dev/nullb1 rw=randread bs=4k direct=1 numjobs=1 time_based=1 runtime=60 group_reporting=1 [file1] size=4G ioengine=libaio iodepth=1 rate_iops=50000 norandommap=1 thinktime=4ms ----- wo patch: file1: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=1 fio-3.7-66-gedfc Starting 1 process read: IOPS=99, BW=400KiB/s (410kB/s)(23.4MiB/60001msec) slat (usec): min=10, max=336, avg=27.71, stdev=17.82 clat (usec): min=2, max=28887, avg=5929.81, stdev=7374.29 lat (usec): min=24, max=28901, avg=5958.73, stdev=7366.22 clat percentiles (usec): | 1.00th=[ 4], 5.00th=[ 4], 10.00th=[ 4], 20.00th=[ 4], | 30.00th=[ 4], 40.00th=[ 4], 50.00th=[ 6], 60.00th=[11731], | 70.00th=[11863], 80.00th=[11994], 90.00th=[12911], 95.00th=[22676], | 99.00th=[23725], 99.50th=[23987], 99.90th=[23987], 99.95th=[25035], | 99.99th=[28967] w/ patch: file1: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=1 fio-3.7-66-gedfc Starting 1 process read: IOPS=100, BW=400KiB/s (410kB/s)(23.4MiB/60005msec) slat (usec): min=10, max=155, avg=23.24, stdev=16.79 clat (usec): min=2, max=12393, avg=5961.58, stdev=5959.25 lat (usec): min=23, max=12412, avg=5985.91, stdev=5951.92 clat percentiles (usec): | 1.00th=[ 3], 5.00th=[ 3], 10.00th=[ 4], 20.00th=[ 4], | 30.00th=[ 4], 40.00th=[ 5], 50.00th=[ 47], 60.00th=[11863], | 70.00th=[11994], 80.00th=[11994], 90.00th=[11994], 95.00th=[11994], | 99.00th=[11994], 99.50th=[11994], 99.90th=[12125], 99.95th=[12125], | 99.99th=[12387] Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index caaabbe8a7a531..a3eede00d3020b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -922,12 +922,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, } /* Calc approx time to dispatch */ - 
jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1; - - if (jiffy_wait > jiffy_elapsed) - jiffy_wait = jiffy_wait - jiffy_elapsed; - else - jiffy_wait = 1; + jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; if (wait) *wait = jiffy_wait; From 61884de08f8368b9aa289ab8dc953e0ce4c755b1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Aug 2018 14:22:41 -0600 Subject: [PATCH 188/190] null_blk: add lock drop/acquire annotation sparse complains: drivers/block/null_blk_main.c:816:24: sparse: context imbalance in 'null_insert_page' - unexpected unlock Fix it by adding the necessary annotations to the function. Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 86cafa6d3b4177..6127e3ff7b4b34 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -804,7 +804,9 @@ static struct nullb_page *null_lookup_page(struct nullb *nullb, } static struct nullb_page *null_insert_page(struct nullb *nullb, - sector_t sector, bool ignore_cache) + sector_t sector, bool ignore_cache) + __releases(&nullb->lock) + __acquires(&nullb->lock) { u64 idx; struct nullb_page *t_page; From 46451874c7c9afaa4e014aea0f0970f886444e0d Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 10 Aug 2018 23:45:50 +0800 Subject: [PATCH 189/190] bcache: fix error setting writeback_rate through sysfs interface Commit ea8c5356d390 ("bcache: set max writeback rate when I/O request is idle") changes struct bch_ratelimit member rate from uint32_t to atomic_long_t and uses atomic_long_set() in drivers/md/bcache/sysfs.c to set new writeback rate, after the input is converted from memory buf to long int by sysfs_strtoul_clamp(). The above change has a problem because there is an implicit return inside sysfs_strtoul_clamp() so the following atomic_long_set() won't be called. This error is detected by 0day system with following snipped smatch warnings: drivers/md/bcache/sysfs.c:271 __cached_dev_store() error: uninitialized symbol 'v'. 270 sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX); ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @271 atomic_long_set(&dc->writeback_rate.rate, v); This patch fixes the above error by using strtoul_safe_clamp() to convert the input buffer into a long int type result. 
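To see why the statement after the conversion macro never ran, consider this simplified sketch of the pitfall (the demo macro below is hypothetical and far simpler than the real sysfs_strtoul_clamp(), but it has the same hidden-return shape): a statement-like helper macro that returns from the enclosing store function makes any code written after it unreachable on the path the macro handles.

#include <stdio.h>
#include <stdlib.h>

/*
 * Hypothetical stand-in for a sysfs-style store helper: on an attribute
 * match it parses the value and returns from the *calling* function.
 */
#define demo_strtoul_and_return(match, buf, var)		\
do {								\
	if (match) {						\
		var = strtoul(buf, NULL, 10);			\
		return 0;	/* hidden early return */	\
	}							\
} while (0)

static long rate;

static int store_rate(int attr_matches, const char *buf)
{
	long v = 0;

	demo_strtoul_and_return(attr_matches, buf, v);
	/* Never reached when attr_matches is true: the bug pattern. */
	rate = v;
	return -1;
}

int main(void)
{
	store_rate(1, "1024");
	printf("rate = %ld (still 0, the assignment after the macro was skipped)\n", rate);
	return 0;
}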
Fixes: ea8c5356d390 ("bcache: set max writeback rate when I/O request is idle") Cc: Kai Krakow Cc: Stefan Priebe Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 22f8565d2bf1a1..81d3520b0702d1 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -265,10 +265,17 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); if (attr == &sysfs_writeback_rate) { - int v; + ssize_t ret; + long int v = atomic_long_read(&dc->writeback_rate.rate); + + ret = strtoul_safe_clamp(buf, v, 1, INT_MAX); - sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX); - atomic_long_set(&dc->writeback_rate.rate, v); + if (!ret) { + atomic_long_set(&dc->writeback_rate.rate, v); + ret = size; + } + + return ret; } sysfs_strtoul_clamp(writeback_rate_update_seconds, From b86d865cb1cae1e61527ea0b8977078bbf694328 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 10 Aug 2018 13:28:07 -0700 Subject: [PATCH 190/190] blkcg: Make blkg_root_lookup() work for queues in bypass mode For legacy queues the only call of blkg_root_lookup() happens after bypass mode has been enabled. Since blkg_lookup() returns NULL for queues in bypass mode, modify the blkg_root_lookup() such that it no longer depends on bypass mode. Rename the function into blk_queue_root_blkg() as suggested by Tejun. Suggested-by: Tejun Heo Fixes: 6bad9b210a22 ("blkcg: Introduce blkg_root_lookup()") Signed-off-by: Bart Van Assche Cc: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 +- include/linux/blk-cgroup.h | 15 +++++---------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 380bc284ced17f..bb109bb0a05537 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -813,7 +813,7 @@ static void __blk_release_queue(struct work_struct *work) blk_exit_queue(q); } - WARN(blkg_root_lookup(q), + WARN(blk_queue_root_blkg(q), "request queue %p is being released but it has not yet been removed from the blkcg controller\n", q); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1361cfc9b878b8..34aec30e06c734 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -342,20 +342,14 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, } /** - * blkg_lookup - look up blkg for the specified request queue + * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair * @q: request_queue of interest * * Lookup blkg for @q at the root level. See also blkg_lookup(). 
*/ -static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) { - struct blkcg_gq *blkg; - - rcu_read_lock(); - blkg = blkg_lookup(&blkcg_root, q); - rcu_read_unlock(); - - return blkg; + return q->root_blkg; } /** @@ -881,7 +875,8 @@ static inline bool blk_cgroup_congested(void) { return false; } static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline struct blkcg_gq *blkg_root_lookup(struct request_queue *q) { return NULL; } +static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) +{ return NULL; } static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } static inline void blkcg_exit_queue(struct request_queue *q) { }