Skip to content

Commit

Permalink
Add explicit prefetches to bpobj_iterate().
Browse files Browse the repository at this point in the history
To simplify error handling, bpobj_iterate_blkptrs() iterates through
the list of block pointers backwards.  Unfortunately, the speculative
prefetcher is currently unable to detect such patterns, which makes
each block read there synchronous and very slow on HDD pools.

According to my tests, the added explicit prefetch reduces the time
needed to asynchronously delete 8 snapshots of 4 million blocks each
from 20 seconds to less than one, which should free the sync thread for
other useful work, such as async writes, scrub, etc.

While there, plug a memory leak in the bpobj_open() error path and
harmonize some variable names.

Reviewed-by: Allan Jude <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by:	Alexander Motin <[email protected]>
Sponsored by:	iXsystems, Inc.
Closes openzfs#15071
  • Loading branch information
amotin authored Jul 21, 2023
1 parent 6fd87e1 commit 28430b5
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 13 deletions.
2 changes: 1 addition & 1 deletion include/sys/bpobj.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ typedef struct bpobj {
kmutex_t bpo_lock;
objset_t *bpo_os;
uint64_t bpo_object;
int bpo_epb;
uint32_t bpo_epb;
uint8_t bpo_havecomp;
uint8_t bpo_havesubobj;
uint8_t bpo_havefreed;
Expand Down
49 changes: 37 additions & 12 deletions module/zfs/bpobj.c
Original file line number Diff line number Diff line change
Expand Up @@ -284,17 +284,34 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
dmu_buf_t *dbuf = NULL;
bpobj_t *bpo = bpi->bpi_bpo;

for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
sizeof (blkptr_t);
uint64_t ps = start * sizeof (blkptr_t);
uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
ps);
if (pe > pb) {
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
ZIO_PRIORITY_ASYNC_READ);
}
for (; i >= start; i--) {
uint64_t offset = i * sizeof (blkptr_t);
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
offset, FTAG, &dbuf, 0);
offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
if (err)
break;
pe = pb;
pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
dbuf->db_offset - dmu_prefetch_max : 0, ps);
if (pe > pb) {
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
}
}

ASSERT3U(offset, >=, dbuf->db_offset);
Expand Down Expand Up @@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
uint64_t offset = i * sizeof (uint64_t);

uint64_t obj_from_sublist;
uint64_t subobj;
err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
offset, sizeof (uint64_t), &obj_from_sublist,
DMU_READ_PREFETCH);
offset, sizeof (uint64_t), &subobj,
DMU_READ_NO_PREFETCH);
if (err)
break;
bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
KM_SLEEP);

err = bpobj_open(sublist, bpo->bpo_os,
obj_from_sublist);
if (err)
bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
KM_SLEEP);
err = bpobj_open(subbpo, bpo->bpo_os, subobj);
if (err) {
kmem_free(subbpo, sizeof (bpobj_t));
break;
}

if (subbpo->bpo_havesubobj &&
subbpo->bpo_phys->bpo_subobjs != 0) {
dmu_prefetch(subbpo->bpo_os,
subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
ZIO_PRIORITY_ASYNC_READ);
}

list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
mutex_enter(&sublist->bpo_lock);
list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
mutex_enter(&subbpo->bpo_lock);
bpi->bpi_unprocessed_subobjs--;
}
}
Expand Down

0 comments on commit 28430b5

Please sign in to comment.