Skip to content

Commit

Permalink
MFV r336950: 9290 device removal reduces redundancy of mirrors
Browse files Browse the repository at this point in the history
Mirrors are supposed to provide redundancy in the face of whole-disk failure
and silent damage (e.g. some data on disk is not right, but ZFS hasn't
detected the whole device as being broken). However, the current device
removal implementation bypasses some of the mirror's redundancy.

illumos/illumos-gate@3a4b1be

Reviewed by: George Wilson <[email protected]>
Reviewed by: Prashanth Sreenivasa <[email protected]>
Reviewed by: Sara Hartse <[email protected]>
Reviewed by: Serapheim Dimitropoulos <[email protected]>
Reviewed by: Brian Behlendorf <[email protected]>
Reviewed by: Tim Chase <[email protected]>
Approved by: Richard Lowe <[email protected]>
Author: Matthew Ahrens <[email protected]>
  • Loading branch information
amotin committed Jul 31, 2018
2 parents 9cd6f16 + d7479e2 commit 194000f
Show file tree
Hide file tree
Showing 14 changed files with 898 additions and 230 deletions.
18 changes: 11 additions & 7 deletions cddl/contrib/opensolaris/cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -3033,7 +3033,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_t *vd = svr->svr_vdev;
vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
Expand All @@ -3049,13 +3049,17 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
svr->svr_allocd_segs, SM_ALLOC));

/*
* Clear everything past what has been synced,
* because we have not allocated mappings for it yet.
* Clear everything past what has been synced unless
* it's past the spacemap, because we have not allocated
* mappings for it yet.
*/
range_tree_clear(svr->svr_allocd_segs,
vdev_indirect_mapping_max_offset(vim),
msp->ms_sm->sm_start + msp->ms_sm->sm_size -
vdev_indirect_mapping_max_offset(vim));
uint64_t vim_max_offset =
vdev_indirect_mapping_max_offset(vim);
uint64_t sm_end = msp->ms_sm->sm_start +
msp->ms_sm->sm_size;
if (sm_end > vim_max_offset)
range_tree_clear(svr->svr_allocd_segs,
vim_max_offset, sm_end - vim_max_offset);
}

zcb->zcb_removing_size +=
Expand Down
58 changes: 55 additions & 3 deletions cddl/contrib/opensolaris/cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static kmutex_t ztest_checkpoint_lock;
static boolean_t ztest_device_removal_active = B_FALSE;

/*
* The ztest_name_lock protects the pool and dataset namespace used by
Expand Down Expand Up @@ -2882,7 +2883,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
* value. Don't bother trying to attach while we are in the middle
* of removal.
*/
if (spa->spa_vdev_removal != NULL) {
if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_ALL, FTAG);
mutex_exit(&ztest_vdev_lock);
return;
Expand Down Expand Up @@ -3057,16 +3058,49 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id)
spa_t *spa = ztest_spa;
vdev_t *vd;
uint64_t guid;
int error;

mutex_enter(&ztest_vdev_lock);

if (ztest_device_removal_active) {
mutex_exit(&ztest_vdev_lock);
return;
}

/*
* Remove a random top-level vdev and wait for removal to finish.
*/
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
guid = vd->vdev_guid;
spa_config_exit(spa, SCL_VDEV, FTAG);

(void) spa_vdev_remove(spa, guid, B_FALSE);
error = spa_vdev_remove(spa, guid, B_FALSE);
if (error == 0) {
ztest_device_removal_active = B_TRUE;
mutex_exit(&ztest_vdev_lock);

while (spa->spa_vdev_removal != NULL)
txg_wait_synced(spa_get_dsl(spa), 0);
} else {
mutex_exit(&ztest_vdev_lock);
return;
}

/*
* The pool needs to be scrubbed after completing device removal.
* Failure to do so may result in checksum errors due to the
* strategy employed by ztest_fault_inject() when selecting which
* offset are redundant and can be damaged.
*/
error = spa_scan(spa, POOL_SCAN_SCRUB);
if (error == 0) {
while (dsl_scan_scrubbing(spa_get_dsl(spa)))
txg_wait_synced(spa_get_dsl(spa), 0);
}

mutex_enter(&ztest_vdev_lock);
ztest_device_removal_active = B_FALSE;
mutex_exit(&ztest_vdev_lock);
}

Expand Down Expand Up @@ -3205,7 +3239,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
* that the metaslab_class space increased (because it decreases
* when the device removal completes).
*/
if (spa->spa_vdev_removal != NULL) {
if (ztest_device_removal_active) {
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
mutex_exit(&ztest_checkpoint_lock);
Expand Down Expand Up @@ -4986,6 +5020,18 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
boolean_t islog = B_FALSE;

mutex_enter(&ztest_vdev_lock);

/*
* Device removal is in progress, fault injection must be disabled
* until it completes and the pool is scrubbed. The fault injection
* strategy for damaging blocks does not take in to account evacuated
* blocks which may have already been damaged.
*/
if (ztest_device_removal_active) {
mutex_exit(&ztest_vdev_lock);
return;
}

maxfaults = MAXFAULTS();
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
mirror_save = zs->zs_mirrors;
Expand Down Expand Up @@ -5331,6 +5377,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;

/*
* Scrub in progress by device removal.
*/
if (ztest_device_removal_active)
return;

(void) spa_scan(spa, POOL_SCAN_SCRUB);
(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
(void) spa_scan(spa, POOL_SCAN_SCRUB);
Expand Down
2 changes: 1 addition & 1 deletion cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -2836,7 +2836,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,

case EBUSY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, "
"or pool has removing/removed vdevs"),
"or device removal is in progress"),
new_disk);
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
Expand Down
10 changes: 10 additions & 0 deletions sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -2959,6 +2959,16 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
{
vdev_t *vd;

if (vd->vdev_ops == &vdev_indirect_ops) {
/*
* The indirect vdev can point to multiple
* vdevs. For simplicity, always create
* the resilver zio_t. zio_vdev_io_start()
* will bypass the child resilver i/o's if
* they are on vdevs that don't have DTL's.
*/
return (B_TRUE);
}
if (DVA_GET_GANG(dva)) {
/*
* Gang members may be spread across multiple
Expand Down
2 changes: 1 addition & 1 deletion sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
Original file line number Diff line number Diff line change
Expand Up @@ -3596,7 +3596,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
return;

if (spa->spa_vdev_removal != NULL &&
spa->spa_vdev_removal->svr_vdev == vd &&
spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
vdev_is_concrete(vd)) {
/*
* Note: we check if the vdev is concrete because when
Expand Down
7 changes: 2 additions & 5 deletions sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
Original file line number Diff line number Diff line change
Expand Up @@ -5837,8 +5837,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
if (spa->spa_vdev_removal != NULL &&
tvd->vdev_ashift !=
spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/* Fail if top level vdev is raidz */
Expand Down Expand Up @@ -5954,10 +5953,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
return (spa_vdev_exit(spa, NULL, txg, error));
}

if (spa->spa_vdev_removal != NULL ||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
}

if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
Expand Down
5 changes: 4 additions & 1 deletion sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1878,9 +1878,12 @@ spa_update_dspace(spa_t *spa)
* allocated twice (on the old device and the new
* device).
*/
vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
vdev_t *vd =
vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
spa->spa_dspace -= spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
spa_config_exit(spa, SCL_VDEV, FTAG);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ extern "C" {
#endif

typedef struct spa_vdev_removal {
vdev_t *svr_vdev;
uint64_t svr_vdev_id;
uint64_t svr_max_offset_to_sync[TXG_SIZE];
/* Thread performing a vdev removal. */
kthread_t *svr_thread;
Expand Down
2 changes: 1 addition & 1 deletion sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
zio_done_func_t *done, void *priv);

extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
struct abd *data, uint64_t size, int type, zio_priority_t priority,
struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *priv);

extern void zio_vdev_io_bypass(zio_t *zio);
Expand Down
26 changes: 26 additions & 0 deletions sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_stat.vs_space = 0;
svd->vdev_stat.vs_dspace = 0;

/*
* State which may be set on a top-level vdev that's in the
* process of being removed.
*/
ASSERT0(tvd->vdev_indirect_config.vic_births_object);
ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
ASSERT0(tvd->vdev_removing);
tvd->vdev_removing = svd->vdev_removing;
tvd->vdev_indirect_config = svd->vdev_indirect_config;
tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
tvd->vdev_indirect_births = svd->vdev_indirect_births;
range_tree_swap(&svd->vdev_obsolete_segments,
&tvd->vdev_obsolete_segments);
tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
svd->vdev_indirect_config.vic_mapping_object = 0;
svd->vdev_indirect_config.vic_births_object = 0;
svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
svd->vdev_indirect_mapping = NULL;
svd->vdev_indirect_births = NULL;
svd->vdev_obsolete_sm = NULL;
svd->vdev_removing = 0;

for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
Expand Down
Loading

0 comments on commit 194000f

Please sign in to comment.