forked from openzfs/zfs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Resilver restarts unnecessarily when it encounters errors
When a resilver finishes, vdev_dtl_reassess is called to hopefully excise DTL_MISSING (amongst other things). If there are errors during the resilver, they are tracked in DTL_SCRUB, as spelled out in the block comment in vdev.c. DTL_SCRUB is in-core only, so it can only be used if the pool was online for the whole resilver. This state is tracked with the spa_scrub_started flag, which only gets set when the scan is initialized. Unfortunately, this flag gets cleared right before vdev_dtl_reassess gets called, so if there are any errors during the scan, DTL_MISSING will never get excised and the resilver will just continually restart. This fix simply defers clearing that flag until after the call to vdev_dtl_reassess. In addition, if a pool is imported and already has scn_errors > 0, this change will restart the resilver immediately instead of doing the rest of the scan and then restarting it from the beginning. On the other hand, if scn_errors == 0 at import, then no errors have been encountered so far, so the spa_scrub_started flag can be safely set. A test has been added to verify that resilver does not restart when relevant DTLs are available. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Paul Zuchowski <[email protected]> Signed-off-by: John Poduska <[email protected]> Closes openzfs#10291
- Loading branch information
Showing
6 changed files
with
149 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
102 changes: 102 additions & 0 deletions
102
tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
#!/bin/ksh -p | ||
|
||
# | ||
# CDDL HEADER START | ||
# | ||
# This file and its contents are supplied under the terms of the | ||
# Common Development and Distribution License ("CDDL"), version 1.0. | ||
# You may only use this file in accordance with the terms of version | ||
# 1.0 of the CDDL. | ||
# | ||
# A full copy of the text of the CDDL should have accompanied this | ||
# source. A copy of the CDDL is also available via the Internet at | ||
# http://www.illumos.org/license/CDDL. | ||
# | ||
# CDDL HEADER END | ||
# | ||
|
||
# | ||
# Copyright (c) 2020, Datto Inc. All rights reserved. | ||
# | ||
|
||
. $STF_SUITE/include/libtest.shlib | ||
. $STF_SUITE/tests/functional/resilver/resilver.cfg | ||
|
||
# | ||
# DESCRIPTION: | ||
# Testing resilver completes when scan errors are encountered, but relevant | ||
# DTL's have not been lost. | ||
# | ||
# STRATEGY: | ||
# 1. Create a pool (1k recordsize) | ||
# 2. Create a 32m file (32k records) | ||
# 3. Inject an error halfway through the file | ||
# 4. Start a resilver, ensure the error is triggered and that the resilver | ||
# does not restart after finishing | ||
# | ||
# NB: use legacy scanning to ensure scan of specific block causes error | ||
# | ||
|
||
#
# Exit handler: undo everything this test changed.
#
# Globals read: TESTPOOL, VDEV_FILES, SPARE_VDEV_FILE, ORIG_SCAN_LEGACY
#
# Clears all zinject handlers, destroys the test pool, removes the backing
# vdev files and restores the SCAN_LEGACY tunable to the value saved before
# the test modified it.
#
function cleanup
{
	log_must zinject -c all
	destroy_pool "$TESTPOOL"
	# Quote the paths so names containing whitespace or glob characters
	# are removed as-is instead of being word-split by the shell.
	rm -f "${VDEV_FILES[@]}" "$SPARE_VDEV_FILE"
	log_must set_tunable32 SCAN_LEGACY "$ORIG_SCAN_LEGACY"
}
|
||
log_assert "Check for resilver restarts caused by scan errors"

# Save the current scan mode before changing it; the cleanup handler
# restores SCAN_LEGACY from this value.
ORIG_SCAN_LEGACY=$(get_tunable SCAN_LEGACY)

log_onexit cleanup

# use legacy scan to ensure injected error will be triggered
log_must set_tunable32 SCAN_LEGACY 1

# create the pool and a 32M file; with recordsize=1k that is 32k blocks
log_must truncate -s "$VDEV_FILE_SIZE" "${VDEV_FILES[0]}" "$SPARE_VDEV_FILE"
log_must zpool create -f -O recordsize=1k "$TESTPOOL" "${VDEV_FILES[0]}"
log_must dd if=/dev/urandom of="/$TESTPOOL/file" bs=1M count=32 > /dev/null 2>&1

# determine objset/object: the dataset ID as printed by zdb and the inode
# number of the file
objset=$(zdb -d "$TESTPOOL/" | sed -ne 's/.*ID \([0-9]*\).*/\1/p')
object=$(ls -i "/$TESTPOOL/file" | awk '{print $1}')

# An empty value would be formatted as 0 by printf below and silently
# inject the fault on the wrong bookmark, so stop here with a clear message.
[[ -n "$objset" && -n "$object" ]] || \
    log_fail "Unable to determine objset/object of /$TESTPOOL/file"

# inject event to cause error during resilver; 0x3fff is block 16383,
# i.e. halfway through the 32k blocks of the file
log_must zinject -b "$(printf "%x:%x:0:3fff" "$objset" "$object")" "$TESTPOOL"

# clear events and start resilver
log_must zpool events -c
log_must zpool attach "$TESTPOOL" "${VDEV_FILES[0]}" "$SPARE_VDEV_FILE"
|
||
# Poll for up to 60 iterations (one second apart) until the original vdev
# reports a non-zero error count, proving the injected fault was hit.
log_note "waiting for read errors to start showing up"
for iter in {0..59}
do
	zpool sync "$TESTPOOL"
	# Third column (presumably the READ error counter) of the first
	# status line that mentions the original vdev. Only the first match
	# is used so that err is always a single token.
	err=$(zpool status "$TESTPOOL" |
	    awk -v dev="${VDEV_FILES[0]}" 'index($0, dev) && !seen++ { print $3 }')
	# No matching line means no errors seen yet; an empty value would
	# otherwise make the arithmetic tests below a syntax error and skip
	# the log_fail.
	err=${err:-0}
	(( err > 0 )) && break
	sleep 1
done

(( err == 0 )) && log_fail "Unable to induce errors in resilver"
|
||
# Poll the event log once per second, for at most 60 attempts, until a
# resilver_finish event has been posted.
log_note "waiting for resilver to finish"
for (( tries = 0; tries < 60; tries++ ))
do
	finish=$(zpool events | grep -c "sysevent.fs.zfs.resilver_finish")
	if (( finish > 0 )); then
		break
	fi
	sleep 1
done

# Still no finish event after the last attempt: give up.
if (( finish == 0 )); then
	log_fail "resilver took too long to finish"
fi
|
||
# Let two more syncs go by so that a restart, if ZFS were going to issue
# one, has had the chance to post its event.
for pass in 1 2
do
	log_must zpool sync $TESTPOOL
done

# Exactly one resilver_start event is expected: the one triggered by the
# attach. Any other count is treated as an unnecessary restart.
start=$(zpool events | grep -c "sysevent.fs.zfs.resilver_start")
if (( start != 1 )); then
	log_fail "resilver restarted unnecessarily"
fi

log_pass "Resilver did not restart unnecessarily from scan errors"