From 98ec286e016a028678cc943b1a1f87668ad447c8 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 21 Jan 2010 19:33:14 +0100 Subject: [PATCH 1/6] drbd: fix max_segment_size initialization blk_queue_make_request() internally calls blk_set_default_limits(), so calling blk_queue_max_segment_size() before is useless. Ergo: move the call to blk_queue_max_segment_size() down a few lines. Impact: If, after a fresh modprobe, you first connect a Diskless drbd, then attach, this could result in a DRBD Protocol Error at first. The next connection attempt would then succeeded. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e898ad9eb1c3e1..ab871e00ffc5b9 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2973,7 +2973,6 @@ struct drbd_conf *drbd_new_device(unsigned int minor) goto out_no_q; mdev->rq_queue = q; q->queuedata = mdev; - blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); disk = alloc_disk(1); if (!disk) @@ -2997,6 +2996,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor) q->backing_dev_info.congested_data = mdev; blk_queue_make_request(q, drbd_make_request_26); + blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); blk_queue_merge_bvec(q, drbd_merge_bvec); q->queue_lock = &mdev->req_lock; /* needed since we use */ From d3db7b485ad7c467a61279d6a8ef51a3c83352df Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 23 Jan 2010 15:45:22 +0300 Subject: [PATCH 2/6] drbd: null dereference bug epoch is always NULL here. Signed-off-by: Dan Carpenter Signed-off-by: Philipp Reisner --- drivers/block/drbd/drbd_receiver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index f22a5283128a29..d065c646b35a06 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1224,7 +1224,7 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); if (!epoch) { dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); - issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); + issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); drbd_wait_ee_list_empty(mdev, &mdev->active_ee); if (issue_flush) { rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); From 1d6165851cd8e3f919d446cd6da35dee44e8837e Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Jan 2010 22:44:36 +0300 Subject: [PATCH 3/6] block: fix bio_add_page for non trivial merge_bvec_fn case We have to properly decrease bi_size in order to merge_bvec_fn return right result. Otherwise this result in false merge rejects for two absolutely valid bio_vecs. This may cause significant performance penalty for example fs_block_size == 1k and block device is raid0 with small chunk_size = 8k. Then it is impossible to merge 7-th fs-block in to bio which already has 6 fs-blocks. Cc: Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- fs/bio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bio.c b/fs/bio.c index 12429c9553eb95..88094afc29ea38 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -542,13 +542,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { + unsigned int prev_bv_len = prev->bv_len; prev->bv_len += len; if (q->merge_bvec_fn) { struct bvec_merge_data bvm = { + /* prev_bvec is already charged in + bi_size, discharge it in order to + simulate merging updated prev_bvec + as new bvec. */ .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, + .bi_size = bio->bi_size - prev_bv_len, .bi_rw = bio->bi_rw, }; From 9e9432c267e4047db98b9d4fba95099c6effcef9 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Sat, 30 Jan 2010 20:28:19 +0100 Subject: [PATCH 4/6] block: fix bugs in bio-integrity mempool usage Fix two bugs in the bio integrity code: use_bip_pool() always returns 0 because it checks against the wrong limit, causing the mempool to be used only when regular allocation fails. When the mempool is used as a fallback we don't free the data properly. Signed-Off-By: Chuck Ebbert Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 49a34e7f7306d4..a16f29e888cd83 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -61,7 +61,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr) static inline int use_bip_pool(unsigned int idx) { - if (idx == BIOVEC_NR_POOLS) + if (idx == BIOVEC_MAX_IDX) return 1; return 0; @@ -95,6 +95,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, /* Use mempool if lower order alloc failed or max vecs were requested */ if (bip == NULL) { + idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); if (unlikely(bip == NULL)) { From bcf4dd43424cdfd8195f3955300a579fe58e9911 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 1 Feb 2010 09:58:54 +0100 Subject: [PATCH 5/6] blk-cgroup: Fix potential deadlock in blk-cgroup I triggered a lockdep warning as following. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.33-rc2 #1 ------------------------------------------------------- test_io_control/7357 is trying to acquire lock: (blkio_list_lock){+.+...}, at: [] blkiocg_weight_write+0x82/0x9e but task is already holding lock: (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_weight_write+0x3b/0x9e which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&(&blkcg->lock)->rlock){......}: [] validate_chain+0x8bc/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock_irqsave+0x27/0x5a [] blkiocg_add_blkio_group+0x1a/0x6d [] cfq_get_queue+0x225/0x3de [] cfq_set_request+0x217/0x42d [] elv_set_request+0x17/0x26 [] get_request+0x203/0x2c5 [] get_request_wait+0x18/0x10e [] __make_request+0x2ba/0x375 [] generic_make_request+0x28d/0x30f [] submit_bio+0x8a/0x8f [] submit_bh+0xf0/0x10f [] ll_rw_block+0xc0/0xf9 [] ext3_find_entry+0x319/0x544 [ext3] [] ext3_lookup+0x2c/0xb9 [ext3] [] do_lookup+0xd3/0x172 [] link_path_walk+0x5fb/0x95c [] path_walk+0x3c/0x81 [] do_path_lookup+0x21/0x8a [] do_filp_open+0xf0/0x978 [] open_exec+0x1b/0xb7 [] do_execve+0xbb/0x266 [] sys_execve+0x24/0x4a [] ptregs_execve+0x12/0x18 -> #1 (&(&q->__queue_lock)->rlock){..-.-.}: [] validate_chain+0x8bc/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock_irqsave+0x27/0x5a [] cfq_unlink_blkio_group+0x17/0x41 [] blkiocg_destroy+0x72/0xc7 [] cgroup_diput+0x4a/0xb2 [] dentry_iput+0x93/0xb7 [] d_kill+0x1c/0x36 [] dput+0xf5/0xfe [] do_rmdir+0x95/0xbe [] sys_rmdir+0x10/0x12 [] sysenter_do_call+0x12/0x32 -> #0 (blkio_list_lock){+.+...}: [] validate_chain+0x61c/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock+0x1e/0x4e [] blkiocg_weight_write+0x82/0x9e [] cgroup_file_write+0xc6/0x1c0 [] vfs_write+0x8c/0x116 [] sys_write+0x3b/0x60 [] sysenter_do_call+0x12/0x32 other info that might help us debug this: 1 lock held by test_io_control/7357: #0: (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_weight_write+0x3b/0x9e stack backtrace: Pid: 7357, comm: test_io_control Not tainted 2.6.33-rc2 #1 Call Trace: [] print_circular_bug+0x91/0x9d [] validate_chain+0x61c/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] ? blkiocg_weight_write+0x82/0x9e [] _raw_spin_lock+0x1e/0x4e [] ? blkiocg_weight_write+0x82/0x9e [] blkiocg_weight_write+0x82/0x9e [] cgroup_file_write+0xc6/0x1c0 [] ? trace_hardirqs_off+0xb/0xd [] ? cpu_clock+0x2e/0x44 [] ? security_file_permission+0xf/0x11 [] ? rw_verify_area+0x8a/0xad [] ? cgroup_file_write+0x0/0x1c0 [] vfs_write+0x8c/0x116 [] sys_write+0x3b/0x60 [] sysenter_do_call+0x12/0x32 To prevent deadlock, we should take locks as following sequence: blkio_list_lock -> queue_lock -> blkcg_lock. The following patch should fix this bug. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1fa2654db0a62c..e7dbbaf5fb3ee5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -147,16 +147,16 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) return -EINVAL; blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - spin_lock(&blkio_list_lock); list_for_each_entry(blkiop, &blkio_list, list) blkiop->ops.blkio_update_group_weight_fn(blkg, blkcg->weight); - spin_unlock(&blkio_list_lock); } spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); return 0; } From 1efe8fe1c2240acc476bed77740883df63373862 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 2 Feb 2010 20:45:46 +0100 Subject: [PATCH 6/6] cfq-iosched: Do not idle on async queues Few weeks back, Shaohua Li had posted similar patch. I am reposting it with more test results. This patch does two things. - Do not idle on async queues. - It also changes the write queue depth CFQ drives (cfq_may_dispatch()). Currently, we seem to driving queue depth of 1 always for WRITES. This is true even if there is only one write queue in the system and all the logic of infinite queue depth in case of single busy queue as well as slowly increasing queue depth based on last delayed sync request does not seem to be kicking in at all. This patch will allow deeper WRITE queue depths (subjected to the other WRITE queue depth contstraints like cfq_quantum and last delayed sync request). Shaohua Li had reported getting more out of his SSD. For me, I have got one Lun exported from an HP EVA and when pure buffered writes are on, I can get more out of the system. Following are test results of pure buffered writes (with end_fsync=1) with vanilla and patched kernel. These results are average of 3 sets of run with increasing number of threads. AVERAGE[bufwfs][vanilla] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- bufwfs 3 1 0 0 95349 474141 bufwfs 3 2 0 0 100282 806926 bufwfs 3 4 0 0 109989 2.7301e+06 bufwfs 3 8 0 0 116642 3762231 bufwfs 3 16 0 0 118230 6902970 AVERAGE[bufwfs] [patched kernel] ------- bufwfs 3 1 0 0 270722 404352 bufwfs 3 2 0 0 206770 1.06552e+06 bufwfs 3 4 0 0 195277 1.62283e+06 bufwfs 3 8 0 0 260960 2.62979e+06 bufwfs 3 16 0 0 299260 1.70731e+06 I also ran buffered writes along with some sequential reads and some buffered reads going on in the system on a SATA disk because the potential risk could be that we should not be driving queue depth higher in presence of sync IO going to keep the max clat low. With some random and sequential reads going on in the system on one SATA disk I did not see any significant increase in max clat. So it looks like other WRITE queue depth control logic is doing its job. Here are the results. AVERAGE[brr, bsr, bufw together] [vanilla] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- brr 3 1 850 546345 0 0 bsr 3 1 14650 729543 0 0 bufw 3 1 0 0 23908 8274517 brr 3 2 981.333 579395 0 0 bsr 3 2 14149.7 1175689 0 0 bufw 3 2 0 0 21921 1.28108e+07 brr 3 4 898.333 1.75527e+06 0 0 bsr 3 4 12230.7 1.40072e+06 0 0 bufw 3 4 0 0 19722.3 2.4901e+07 brr 3 8 900 3160594 0 0 bsr 3 8 9282.33 1.91314e+06 0 0 bufw 3 8 0 0 18789.3 23890622 AVERAGE[brr, bsr, bufw mixed] [patched kernel] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- brr 3 1 837 417973 0 0 bsr 3 1 14357.7 591275 0 0 bufw 3 1 0 0 24869.7 8910662 brr 3 2 1038.33 543434 0 0 bsr 3 2 13351.3 1205858 0 0 bufw 3 2 0 0 18626.3 13280370 brr 3 4 913 1.86861e+06 0 0 bsr 3 4 12652.3 1430974 0 0 bufw 3 4 0 0 15343.3 2.81305e+07 brr 3 8 890 2.92695e+06 0 0 bsr 3 8 9635.33 1.90244e+06 0 0 bufw 3 8 0 0 17200.3 24424392 So looks like it might make sense to include this patch. Thanks Vivek Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee130f14d1fc92..17b768d0d42f74 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1803,7 +1803,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - return service_tree->count == 1; + return service_tree->count == 1 && cfq_cfqq_sync(cfqq); } static void cfq_arm_slice_timer(struct cfq_data *cfqd)