Merge tag 'md-3.7' of git://neil.brown.name/md

Pull md updates from NeilBrown:
 - "discard" support, some dm-raid improvements and other assorted bits
   and pieces.

* tag 'md-3.7' of git://neil.brown.name/md: (29 commits)
  md: refine reporting of resync/reshape delays.
  md/raid5: be careful not to resize_stripes too big.
  md: make sure manual changes to recovery checkpoint are saved.
  md/raid10: use correct limit variable
  md: writing to sync_action should clear the read-auto state.
  Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races
  md/raid5: make sure to_read and to_write never go negative.
  md: When RAID5 is dirty, force reconstruct-write instead of read-modify-write.
  md/raid5: protect debug message against NULL dereference.
  md/raid5: add some missing locking in handle_failed_stripe.
  MD: raid5 avoid unnecessary zero page for trim
  MD: raid5 trim support
  md/bitmap:Don't use IS_ERR to judge alloc_page().
  md/raid1: Don't release reference to device while handling read error.
  raid: replace list_for_each_continue_rcu with new interface
  add further __init annotations to crypto/xor.c
  DM RAID: Fix for "sync" directive ineffectiveness
  DM RAID: Fix comparison of index and quantity for "rebuild" parameter
  DM RAID: Add rebuild capability for RAID10
  DM RAID: Move 'rebuild' checking code to its own function
  ...
micronews · Oct 13, 2012 · 9db9088 · 9db9088
2 parents 4d7127d + 72f36d5
commit 9db9088
Show file tree

Hide file tree

Showing 13 changed files with 578 additions and 129 deletions.
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
@@ -132,3 +132,12 @@ Here we can see the RAID type is raid4, there are 5 devices - all of
 which are 'A'live, and the array is 2/490221568 complete with recovery.
 Faulty or missing devices are marked 'D'.  Devices that are out-of-sync
 are marked 'a'.
+
+
+Version History
+---------------
+1.0.0	Initial version.  Support for RAID 4/5/6
+1.1.0	Added support for RAID 1
+1.2.0	Handle creation of arrays that contain failed devices.
+1.3.0	Added support for RAID 10
+1.3.1	Allow device replacement/rebuild for RAID 10
diff --git a/crypto/xor.c b/crypto/xor.c
@@ -56,11 +56,11 @@ xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
 EXPORT_SYMBOL(xor_blocks);
 
 /* Set of all registered templates.  */
-static struct xor_block_template *template_list;
+static struct xor_block_template *__initdata template_list;
 
 #define BENCH_SIZE (PAGE_SIZE)
 
-static void
+static void __init
 do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 {
 	int speed;

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
@@ -163,20 +163,17 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	 * As devices are only added or removed when raid_disk is < 0 and
 	 * nr_pending is 0 and In_sync is clear, the entries we return will
 	 * still be in the same position on the list when we re-enter
-	 * list_for_each_continue_rcu.
+	 * list_for_each_entry_continue_rcu.
 	 */
-	struct list_head *pos;
 	rcu_read_lock();
 	if (rdev == NULL)
 		/* start at the beginning */
-		pos = &mddev->disks;
+		rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
 	else {
 		/* release the previous rdev and start from there. */
 		rdev_dec_pending(rdev, mddev);
-		pos = &rdev->same_set;
 	}
-	list_for_each_continue_rcu(pos, &mddev->disks) {
-		rdev = list_entry(pos, struct md_rdev, same_set);
+	list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* this is a usable devices */
@@ -473,14 +470,10 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 {
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
-	int err = -EINVAL;
 
 	bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
-	if (IS_ERR(bitmap->storage.sb_page)) {
-		err = PTR_ERR(bitmap->storage.sb_page);
-		bitmap->storage.sb_page = NULL;
-		return err;
-	}
+	if (bitmap->storage.sb_page == NULL)
+		return -ENOMEM;
 	bitmap->storage.sb_page->index = 0;
 
 	sb = kmap_atomic(bitmap->storage.sb_page);

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
@@ -337,6 +337,84 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 	return 0;
 }
 
+/*
+ * validate_rebuild_devices
+ * @rs
+ *
+ * Determine if the devices specified for rebuild can result in a valid
+ * usable array that is capable of rebuilding the given devices.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_rebuild_devices(struct raid_set *rs)
+{
+	unsigned i, rebuild_cnt = 0;
+	unsigned rebuilds_per_group, copies, d;
+
+	if (!(rs->print_flags & DMPF_REBUILD))
+		return 0;
+
+	for (i = 0; i < rs->md.raid_disks; i++)
+		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+			rebuild_cnt++;
+
+	switch (rs->raid_type->level) {
+	case 1:
+		if (rebuild_cnt >= rs->md.raid_disks)
+			goto too_many;
+		break;
+	case 4:
+	case 5:
+	case 6:
+		if (rebuild_cnt > rs->raid_type->parity_devs)
+			goto too_many;
+		break;
+	case 10:
+		copies = raid10_md_layout_to_copies(rs->md.layout);
+		if (rebuild_cnt < copies)
+			break;
+
+		/*
+		 * It is possible to have a higher rebuild count for RAID10,
+		 * as long as the failed devices occur in different mirror
+		 * groups (i.e. different stripes).
+		 *
+		 * Right now, we only allow for "near" copies.  When other
+		 * formats are added, we will have to check those too.
+		 *
+		 * When checking "near" format, make sure no adjacent devices
+		 * have failed beyond what can be handled.  In addition to the
+		 * simple case where the number of devices is a multiple of the
+		 * number of copies, we must also handle cases where the number
+		 * of devices is not a multiple of the number of copies.
+		 * E.g.    dev1 dev2 dev3 dev4 dev5
+		 *          A    A    B    B    C
+		 *          C    D    D    E    E
+		 */
+		rebuilds_per_group = 0;
+		for (i = 0; i < rs->md.raid_disks * copies; i++) {
+			d = i % rs->md.raid_disks;
+			if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
+			    (++rebuilds_per_group >= copies))
+				goto too_many;
+			if (!((i + 1) % copies))
+				rebuilds_per_group = 0;
+		}
+		break;
+	default:
+		DMERR("The rebuild parameter is not supported for %s",
+		      rs->raid_type->name);
+		rs->ti->error = "Rebuild not supported for this RAID type";
+		return -EINVAL;
+	}
+
+	return 0;
+
+too_many:
+	rs->ti->error = "Too many rebuild devices specified";
+	return -EINVAL;
+}
+
 /*
  * Possible arguments are...
  *	<chunk_size> [optional_args]
@@ -365,7 +443,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 {
 	char *raid10_format = "near";
 	unsigned raid10_copies = 2;
-	unsigned i, rebuild_cnt = 0;
+	unsigned i;
 	unsigned long value, region_size = 0;
 	sector_t sectors_per_dev = rs->ti->len;
 	sector_t max_io_len;
@@ -461,31 +539,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 
 		/* Parameters that take a numeric value are checked here */
 		if (!strcasecmp(key, "rebuild")) {
-			rebuild_cnt++;
-
-			switch (rs->raid_type->level) {
-			case 1:
-				if (rebuild_cnt >= rs->md.raid_disks) {
-					rs->ti->error = "Too many rebuild devices specified";
-					return -EINVAL;
-				}
-				break;
-			case 4:
-			case 5:
-			case 6:
-				if (rebuild_cnt > rs->raid_type->parity_devs) {
-					rs->ti->error = "Too many rebuild devices specified for given RAID type";
-					return -EINVAL;
-				}
-				break;
-			case 10:
-			default:
-				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
-				rs->ti->error = "Rebuild not supported for this RAID type";
-				return -EINVAL;
-			}
-
-			if (value > rs->md.raid_disks) {
+			if (value >= rs->md.raid_disks) {
 				rs->ti->error = "Invalid rebuild index given";
 				return -EINVAL;
 			}
@@ -608,6 +662,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	}
 	rs->md.dev_sectors = sectors_per_dev;
 
+	if (validate_rebuild_devices(rs))
+		return -EINVAL;
+
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.external = 1;
@@ -960,6 +1017,19 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 
 	freshest = NULL;
 	rdev_for_each_safe(rdev, tmp, mddev) {
+		/*
+		 * Skipping super_load due to DMPF_SYNC will cause
+		 * the array to undergo initialization again as
+		 * though it were new.  This is the intended effect
+		 * of the "sync" directive.
+		 *
+		 * When reshaping capability is added, we must ensure
+		 * that the "sync" directive is disallowed during the
+		 * reshape.
+		 */
+		if (rs->print_flags & DMPF_SYNC)
+			continue;
+
 		if (!rdev->meta_bdev)
 			continue;
 
@@ -1360,7 +1430,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 3, 0},
+	.version = {1, 3, 1},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
@@ -138,6 +138,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	struct linear_conf *conf;
 	struct md_rdev *rdev;
 	int i, cnt;
+	bool discard_supported = false;
 
 	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
 			GFP_KERNEL);
@@ -171,13 +172,20 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 		conf->array_sectors += rdev->sectors;
 		cnt++;
 
+		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+			discard_supported = true;
 	}
 	if (cnt != raid_disks) {
 		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
 		       mdname(mddev));
 		goto out;
 	}
 
+	if (!discard_supported)
+		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	else
+		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
 	/*
 	 * Here we calculate the device offsets.
 	 */
@@ -244,7 +252,9 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
 	if (!newconf)
 		return -ENOMEM;
 
-	oldconf = rcu_dereference(mddev->private);
+	oldconf = rcu_dereference_protected(mddev->private,
+					    lockdep_is_held(
+						    &mddev->reconfig_mutex));
 	mddev->raid_disks++;
 	rcu_assign_pointer(mddev->private, newconf);
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
@@ -256,7 +266,10 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
 
 static int linear_stop (struct mddev *mddev)
 {
-	struct linear_conf *conf = mddev->private;
+	struct linear_conf *conf =
+		rcu_dereference_protected(mddev->private,
+					  lockdep_is_held(
+						  &mddev->reconfig_mutex));
 
 	/*
 	 * We do not require rcu protection here since
@@ -326,6 +339,14 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 	bio->bi_sector = bio->bi_sector - start_sector
 		+ tmp_dev->rdev->data_offset;
 	rcu_read_unlock();
+
+	if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+		/* Just ignore it */
+		bio_endio(bio, 0);
+		return;
+	}
+
 	generic_make_request(bio);
 }