Skip to content

Commit

Permalink
DM RAID: Add ability to restore transiently failed devices on resume
Browse files Browse the repository at this point in the history
DM RAID: Add ability to restore transiently failed devices on resume

This patch adds code to the resume function that checks each device
in the RAID array.  If a device is marked as failed but its superblock
can still be read, an attempt is made to reintegrate it into the
array.  This lets the user refresh the array with a simple suspend
and resume, rather than having to load a completely new table,
allocate and initialize all the structures, and throw away the old
instantiation.

Signed-off-by: Jonathan Brassow <[email protected]>
Signed-off-by: NeilBrown <[email protected]>
  • Loading branch information
jbrassow authored and neilbrown committed Jun 13, 2013
1 parent 25e33ed commit 9092c02
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 8 deletions.
1 change: 1 addition & 0 deletions Documentation/device-mapper/dm-raid.txt
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,4 @@ Version History
1.4.2 Add RAID10 "far" and "offset" algorithm support.
1.5.0 Add message interface to allow manipulation of the sync_action.
New status (STATUSTYPE_INFO) fields: sync_action and mismatch_cnt.
1.5.1 Add ability to restore transiently failed devices on resume.
44 changes: 43 additions & 1 deletion drivers/md/dm-raid.c
Original file line number Diff line number Diff line change
Expand Up @@ -1574,12 +1574,54 @@ static void raid_postsuspend(struct dm_target *ti)

static void raid_resume(struct dm_target *ti)
{
int i;
uint64_t failed_devices, cleared_failed_devices = 0;
unsigned long flags;
struct dm_raid_superblock *sb;
struct raid_set *rs = ti->private;
struct md_rdev *r;

set_bit(MD_CHANGE_DEVS, &rs->md.flags);
if (!rs->bitmap_loaded) {
bitmap_load(&rs->md);
rs->bitmap_loaded = 1;
} else {
/*
* A secondary resume while the device is active.
* Take this opportunity to check whether any failed
* devices are reachable again.
*/
for (i = 0; i < rs->md.raid_disks; i++) {
r = &rs->dev[i].rdev;
if (test_bit(Faulty, &r->flags) && r->sb_page &&
sync_page_io(r, 0, r->sb_size,
r->sb_page, READ, 1)) {
DMINFO("Faulty device #%d has readable super"
"block. Attempting to revive it.", i);
r->raid_disk = i;
r->saved_raid_disk = i;
flags = r->flags;
clear_bit(Faulty, &r->flags);
clear_bit(WriteErrorSeen, &r->flags);
clear_bit(In_sync, &r->flags);
if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
r->raid_disk = -1;
r->saved_raid_disk = -1;
r->flags = flags;
} else {
r->recovery_offset = 0;
cleared_failed_devices |= 1 << i;
}
}
}
if (cleared_failed_devices) {
rdev_for_each(r, &rs->md) {
sb = page_address(r->sb_page);
failed_devices = le64_to_cpu(sb->failed_devices);
failed_devices &= ~cleared_failed_devices;
sb->failed_devices = cpu_to_le64(failed_devices);
}
}
}

clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
Expand All @@ -1588,7 +1630,7 @@ static void raid_resume(struct dm_target *ti)

static struct target_type raid_target = {
.name = "raid",
.version = {1, 5, 0},
.version = {1, 5, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
Expand Down
7 changes: 4 additions & 3 deletions drivers/md/raid1.c
Original file line number Diff line number Diff line change
Expand Up @@ -1519,8 +1519,9 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
p = conf->mirrors+mirror;
if (!p->rdev) {

disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);

p->head_position = 0;
rdev->raid_disk = mirror;
Expand Down Expand Up @@ -1559,7 +1560,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev);
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
print_conf(conf);
return err;
Expand Down
10 changes: 6 additions & 4 deletions drivers/md/raid10.c
Original file line number Diff line number Diff line change
Expand Up @@ -1819,15 +1819,17 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev);
break;
}

disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);

p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
Expand Down

0 comments on commit 9092c02

Please sign in to comment.