Skip to content

Commit

Permalink
md-cluster: Use a small window for raid10 resync
Browse files Browse the repository at this point in the history
Suspending the entire device for resync could take
too long. Resync in small chunks.

cluster's resync window is maintained in r10conf as
cluster_sync_low and cluster_sync_high, and processed
in raid10's sync_request(). If the current resync is
outside the cluster resync window:

1. Set the cluster_sync_low to curr_resync_completed.
2. Set cluster_sync_high to cluster_sync_low + stripe
   size.
3. Send a message to all nodes so they may add it in
   their suspension list.

Note:
We only support the "near" raid10 layout so far; resyncing a far or
offset raid10 array could run into trouble. So raid10_run checks
the layout of a clustered raid10 array and refuses to run if the
layout is not supported.

With the "near" layout we process one stripe at a time
progressing monotonically through the address space.
So we can have a sliding window of whole-stripes which
moves through the array suspending IO on other nodes,
and both resync which uses array addresses and recovery
which uses device addresses can stay within this window.

Signed-off-by: Guoqing Jiang <[email protected]>
Signed-off-by: Shaohua Li <[email protected]>
  • Loading branch information
GuoqingJiang authored and shligit committed Nov 2, 2017
1 parent cb8a7a7 commit 8db8791
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 1 deletion.
113 changes: 112 additions & 1 deletion drivers/md/raid10.c
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
kfree(r10_bio);
}

/* size of one resync block in 512-byte sectors (bytes >> 9) */
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
/*
 * Minimum size of the clustered resync window, in bytes:
 * 16 * RESYNC_WINDOW, i.e. 16 MiB with the RESYNC_WINDOW value above.
 */
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
/* the same window expressed in 512-byte sectors (bytes >> 9) */
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

/*
* When performing a resync, we need to read and compare, so
Expand Down Expand Up @@ -2840,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
return r10bio;
}

/*
 * Derive cluster_sync_high from the current cluster_sync_low.
 *
 * Other nodes are asked to add the range
 * [cluster_sync_low, cluster_sync_high] to their suspend list, so the
 * upper bound is placed one window above the lower bound.
 */
static void raid10_set_cluster_sync_high(struct r10conf *conf)
{
	int chunks, extra_chunk;
	sector_t window_size;

	/*
	 * A "stripe" here means one pass across all member devices, which
	 * holds raid_disks / near_copies chunks. Sizing the window by that
	 * quotient rather than by raid_disks keeps it from growing linearly
	 * with raid_disks when near_copies is close to raid_disks, which
	 * would suspend a needlessly large IO range. When raid_disks is
	 * not a multiple of near_copies, one more chunk is added so the
	 * whole "stripe" is still covered.
	 */
	chunks = conf->geo.raid_disks / conf->geo.near_copies;
	extra_chunk = (conf->geo.raid_disks % conf->geo.near_copies) ? 1 : 0;
	window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;

	/*
	 * Never use a window smaller than CLUSTER_RESYNC_WINDOW_SECTORS;
	 * the lower bound is intended to align with raid1's resync window.
	 */
	if (window_size < CLUSTER_RESYNC_WINDOW_SECTORS)
		window_size = CLUSTER_RESYNC_WINDOW_SECTORS;

	conf->cluster_sync_high = conf->cluster_sync_low + window_size;
}

/*
* perform a "sync" on one "block"
*
Expand Down Expand Up @@ -2912,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;

/* If we aborted, we need to abort the
* sync on the 'current' bitmap chunks (there can
* be several when recovering multiple devices).
Expand Down Expand Up @@ -3266,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */
int count = 0;

bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
/*
* curr_resync_completed may not have been updated in time, and
* cluster_sync_low is set based on it. So check against
* "sector_nr + 2 * RESYNC_SECTORS" for safety, which ensures
* that curr_resync_completed is updated in
* bitmap_cond_end_sync.
*/
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS >
conf->cluster_sync_high));

if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
Expand Down Expand Up @@ -3400,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
} while (++page_idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors;

if (mddev_is_clustered(mddev) &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* It is resync not recovery */
if (conf->cluster_sync_high < sector_nr + nr_sectors) {
conf->cluster_sync_low = mddev->curr_resync_completed;
raid10_set_cluster_sync_high(conf);
/* Send resync message */
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
} else if (mddev_is_clustered(mddev)) {
/* This is recovery not resync */
sector_t sect_va1, sect_va2;
bool broadcast_msg = false;

for (i = 0; i < conf->geo.raid_disks; i++) {
/*
* sector_nr is a device address for recovery, so we
* need to translate it to an array address before
* comparing it with cluster_sync_high.
*/
sect_va1 = raid10_find_virt(conf, sector_nr, i);

if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
broadcast_msg = true;
/*
* curr_resync_completed is handled the same way as
* sector_nr, so translate it too.
*/
sect_va2 = raid10_find_virt(conf,
mddev->curr_resync_completed, i);

if (conf->cluster_sync_low == 0 ||
conf->cluster_sync_low > sect_va2)
conf->cluster_sync_low = sect_va2;
}
}
if (broadcast_msg) {
raid10_set_cluster_sync_high(conf);
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
}

while (biolist) {
bio = biolist;
biolist = biolist->bi_next;
Expand Down Expand Up @@ -3659,6 +3758,18 @@ static int raid10_run(struct mddev *mddev)
if (!conf)
goto out;

if (mddev_is_clustered(conf->mddev)) {
int fc, fo;

fc = (mddev->layout >> 8) & 255;
fo = mddev->layout & (1<<16);
if (fc > 1 || fo > 0) {
pr_err("only near layout is supported by clustered"
" raid10\n");
goto out;
}
}

mddev->thread = conf->thread;
conf->thread = NULL;

Expand Down
6 changes: 6 additions & 0 deletions drivers/md/raid10.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ struct r10conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;

/*
* Keep track of cluster resync window to send to other nodes.
*/
sector_t cluster_sync_low;
sector_t cluster_sync_high;
};

/*
Expand Down

0 comments on commit 8db8791

Please sign in to comment.