Skip to content

Commit

Permalink
MFV r336991, r337001:
Browse files Browse the repository at this point in the history
9102 zfs should be able to initialize storage devices

The first access to a disk block can incur a performance penalty on some
platforms (e.g. AWS's EBS, VMware VMDKs). Therefore it is recommended that
volumes be "thick provisioned", where supported by the platform (VMware).
Thick provisioning is time consuming and often is ignored. If the thick
provision step is omitted, customers will see suboptimal performance until
we have written to all parts of the LUN. ZFS should be able to initialize
any unused storage to remove any first-write penalty that exists.

illumos/illumos-gate@094e47e

Reviewed by: John Wren Kennedy <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Pavel Zakharov <[email protected]>
Reviewed by: Prakash Surya <[email protected]>
Approved by: Richard Lowe <[email protected]>
Author:     George Wilson <[email protected]>
  • Loading branch information
amotin committed Jul 31, 2018
2 parents bc9ba24 + 5343ce6 commit 0021e1c
Show file tree
Hide file tree
Showing 32 changed files with 1,757 additions and 28 deletions.
31 changes: 31 additions & 0 deletions cddl/contrib/opensolaris/cmd/zpool/zpool.8
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@
.Ar pool | id
.Op Ar newpool
.Nm
.Cm initialize
.Op Fl cs
.Ar pool
.Op Ar device Ns ...
.Nm
.Cm iostat
.Op Fl T Cm d Ns | Ns Cm u
.Op Fl v
Expand Down Expand Up @@ -1437,6 +1442,32 @@ to fully rewind.
.El
.It Xo
.Nm
.Cm initialize
.Op Fl cs
.Ar pool
.Op Ar device Ns ...
.Xc
Begins initializing by writing to all unallocated regions on the specified
devices, or all eligible devices in the pool if no individual devices are
specified.
Only leaf data or log devices may be initialized.
.Bl -tag -width Ds
.It Fl c, -cancel
Cancel initializing on the specified devices, or all eligible devices if none
are specified.
If one or more target devices are invalid or are not currently being
initialized, the command will fail and no cancellation will occur on any device.
.It Fl s -suspend
Suspend initializing on the specified devices, or all eligible devices if none
are specified.
If one or more target devices are invalid or are not currently being
initialized, the command will fail and no suspension will occur on any device.
Initializing can then be resumed by running
.Nm zpool Cm initialize
with no flags on the relevant target devices.
.El
.It Xo
.Nm
.Cm iostat
.Op Fl T Cm d Ns | Ns Cm u
.Op Fl v
Expand Down
155 changes: 155 additions & 0 deletions cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ static int zpool_do_detach(int, char **);
static int zpool_do_replace(int, char **);
static int zpool_do_split(int, char **);

static int zpool_do_initialize(int, char **);
static int zpool_do_scrub(int, char **);

static int zpool_do_import(int, char **);
Expand Down Expand Up @@ -136,6 +137,7 @@ typedef enum {
HELP_ONLINE,
HELP_REPLACE,
HELP_REMOVE,
HELP_INITIALIZE,
HELP_SCRUB,
HELP_STATUS,
HELP_UPGRADE,
Expand Down Expand Up @@ -187,6 +189,7 @@ static zpool_command_t command_table[] = {
{ "replace", zpool_do_replace, HELP_REPLACE },
{ "split", zpool_do_split, HELP_SPLIT },
{ NULL },
{ "initialize", zpool_do_initialize, HELP_INITIALIZE },
{ "scrub", zpool_do_scrub, HELP_SCRUB },
{ NULL },
{ "import", zpool_do_import, HELP_IMPORT },
Expand Down Expand Up @@ -261,6 +264,8 @@ get_usage(zpool_help_t idx)
return (gettext("\tremove [-nps] <pool> <device> ...\n"));
case HELP_REOPEN:
return (gettext("\treopen <pool>\n"));
case HELP_INITIALIZE:
return (gettext("\tinitialize [-cs] <pool> [<device> ...]\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-s | -p] <pool> ...\n"));
case HELP_STATUS:
Expand Down Expand Up @@ -1650,6 +1655,43 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
"resilvering" : "repairing");
}

if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE ||
vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) &&
!vs->vs_scan_removing) {
char zbuf[1024];
char tbuf[256];
struct tm zaction_ts;

time_t t = vs->vs_initialize_action_time;
int initialize_pct = 100;
if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) {
initialize_pct = (vs->vs_initialize_bytes_done * 100 /
(vs->vs_initialize_bytes_est + 1));
}

(void) localtime_r(&t, &zaction_ts);
(void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);

switch (vs->vs_initialize_state) {
case VDEV_INITIALIZE_SUSPENDED:
(void) snprintf(zbuf, sizeof (zbuf),
", suspended, started at %s", tbuf);
break;
case VDEV_INITIALIZE_ACTIVE:
(void) snprintf(zbuf, sizeof (zbuf),
", started at %s", tbuf);
break;
case VDEV_INITIALIZE_COMPLETE:
(void) snprintf(zbuf, sizeof (zbuf),
", completed at %s", tbuf);
break;
}

(void) printf(gettext(" (%d%% initialized%s)"),
initialize_pct, zbuf);
}

(void) printf("\n");

for (c = 0; c < children; c++) {
Expand Down Expand Up @@ -4238,6 +4280,119 @@ zpool_do_scrub(int argc, char **argv)
return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
}

static void
zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
{
uint_t children = 0;
nvlist_t **child;
uint_t i;

(void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&child, &children);

if (children == 0) {
char *path = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE);
fnvlist_add_boolean(res, path);
free(path);
return;
}

for (i = 0; i < children; i++) {
zpool_collect_leaves(zhp, child[i], res);
}
}

/*
* zpool initialize [-cs] <pool> [<vdev> ...]
* Initialize all unused blocks in the specified vdevs, or all vdevs in the pool
* if none specified.
*
* -c Cancel. Ends active initializing.
* -s Suspend. Initializing can then be restarted with no flags.
*/
int
zpool_do_initialize(int argc, char **argv)
{
int c;
char *poolname;
zpool_handle_t *zhp;
nvlist_t *vdevs;
int err = 0;

struct option long_options[] = {
{"cancel", no_argument, NULL, 'c'},
{"suspend", no_argument, NULL, 's'},
{0, 0, 0, 0}
};

pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO;
while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) {
switch (c) {
case 'c':
if (cmd_type != POOL_INITIALIZE_DO) {
(void) fprintf(stderr, gettext("-c cannot be "
"combined with other options\n"));
usage(B_FALSE);
}
cmd_type = POOL_INITIALIZE_CANCEL;
break;
case 's':
if (cmd_type != POOL_INITIALIZE_DO) {
(void) fprintf(stderr, gettext("-s cannot be "
"combined with other options\n"));
usage(B_FALSE);
}
cmd_type = POOL_INITIALIZE_SUSPEND;
break;
case '?':
if (optopt != 0) {
(void) fprintf(stderr,
gettext("invalid option '%c'\n"), optopt);
} else {
(void) fprintf(stderr,
gettext("invalid option '%s'\n"),
argv[optind - 1]);
}
usage(B_FALSE);
}
}

argc -= optind;
argv += optind;

if (argc < 1) {
(void) fprintf(stderr, gettext("missing pool name argument\n"));
usage(B_FALSE);
return (-1);
}

poolname = argv[0];
zhp = zpool_open(g_zfs, poolname);
if (zhp == NULL)
return (-1);

vdevs = fnvlist_alloc();
if (argc == 1) {
/* no individual leaf vdevs specified, so add them all */
nvlist_t *config = zpool_get_config(zhp, NULL);
nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
ZPOOL_CONFIG_VDEV_TREE);
zpool_collect_leaves(zhp, nvroot, vdevs);
} else {
int i;
for (i = 1; i < argc; i++) {
fnvlist_add_boolean(vdevs, argv[i]);
}
}

err = zpool_initialize(zhp, cmd_type, vdevs);

fnvlist_free(vdevs);
zpool_close(zhp);

return (err);
}

typedef struct status_cbdata {
int cb_count;
boolean_t cb_allpools;
Expand Down
96 changes: 95 additions & 1 deletion cddl/contrib/opensolaris/cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
Expand Down Expand Up @@ -348,6 +349,7 @@ ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_remap_blocks;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;

uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
Expand Down Expand Up @@ -391,7 +393,8 @@ ztest_info_t ztest_info[] = {
&ztest_opts.zo_vdevtime },
{ ztest_device_removal, 1, &zopt_sometimes },
{ ztest_remap_blocks, 1, &zopt_sometimes },
{ ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }
{ ztest_spa_checkpoint_create_discard, 1, &zopt_rarely },
{ ztest_initialize, 1, &zopt_sometimes }
};

#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
Expand Down Expand Up @@ -5471,6 +5474,97 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
rw_exit(&ztest_name_lock);
}

static vdev_t *
ztest_random_concrete_vdev_leaf(vdev_t *vd)
{
if (vd == NULL)
return (NULL);

if (vd->vdev_children == 0)
return (vd);

vdev_t *eligible[vd->vdev_children];
int eligible_idx = 0, i;
for (i = 0; i < vd->vdev_children; i++) {
vdev_t *cvd = vd->vdev_child[i];
if (cvd->vdev_top->vdev_removing)
continue;
if (cvd->vdev_children > 0 ||
(vdev_is_concrete(cvd) && !cvd->vdev_detached)) {
eligible[eligible_idx++] = cvd;
}
}
VERIFY(eligible_idx > 0);

uint64_t child_no = ztest_random(eligible_idx);
return (ztest_random_concrete_vdev_leaf(eligible[child_no]));
}

/* ARGSUSED */
void
ztest_initialize(ztest_ds_t *zd, uint64_t id)
{
spa_t *spa = ztest_spa;
int error = 0;

mutex_enter(&ztest_vdev_lock);

spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

/* Random leaf vdev */
vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
if (rand_vd == NULL) {
spa_config_exit(spa, SCL_VDEV, FTAG);
mutex_exit(&ztest_vdev_lock);
return;
}

/*
* The random vdev we've selected may change as soon as we
* drop the spa_config_lock. We create local copies of things
* we're interested in.
*/
uint64_t guid = rand_vd->vdev_guid;
char *path = strdup(rand_vd->vdev_path);
boolean_t active = rand_vd->vdev_initialize_thread != NULL;

zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid);
spa_config_exit(spa, SCL_VDEV, FTAG);

uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS);
error = spa_vdev_initialize(spa, guid, cmd);
switch (cmd) {
case POOL_INITIALIZE_CANCEL:
if (ztest_opts.zo_verbose >= 4) {
(void) printf("Cancel initialize %s", path);
if (!active)
(void) printf(" failed (no initialize active)");
(void) printf("\n");
}
break;
case POOL_INITIALIZE_DO:
if (ztest_opts.zo_verbose >= 4) {
(void) printf("Start initialize %s", path);
if (active && error == 0)
(void) printf(" failed (already active)");
else if (error != 0)
(void) printf(" failed (error %d)", error);
(void) printf("\n");
}
break;
case POOL_INITIALIZE_SUSPEND:
if (ztest_opts.zo_verbose >= 4) {
(void) printf("Suspend initialize %s", path);
if (!active)
(void) printf(" failed (no initialize active)");
(void) printf("\n");
}
break;
}
free(path);
mutex_exit(&ztest_vdev_lock);
}

/*
* Verify pool integrity by running zdb.
*/
Expand Down
5 changes: 5 additions & 0 deletions cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ typedef enum zfs_error {
EZFS_NO_CHECKPOINT, /* pool has no checkpoint */
EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
EZFS_VDEV_TOO_BIG, /* a device is too big to be used */
EZFS_TOOMANY, /* argument list too long */
EZFS_INITIALIZING, /* currently initializing */
EZFS_NO_INITIALIZE, /* no active initialize */
EZFS_UNKNOWN
} zfs_error_t;

Expand Down Expand Up @@ -262,6 +265,8 @@ typedef struct splitflags {
* Functions to manipulate pool and vdev state
*/
extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
nvlist_t *);
extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
extern int zpool_reguid(zpool_handle_t *);
extern int zpool_reopen(zpool_handle_t *);
Expand Down
Loading

0 comments on commit 0021e1c

Please sign in to comment.