Skip to content

Commit

Permalink
btrfs: zoned: mark block groups to copy for device-replace
Browse files Browse the repository at this point in the history
This is patch 1/4 to support device-replace on zoned filesystems.

We have two types of IOs during the device replace process. One is an IO
to "copy" (by the scrub functions) all the device extents from the source
device to the destination device. The other one is an IO to "clone" (by
handle_ops_on_dev_replace()) new incoming write IOs from users to the
source device into the target device.

Cloning incoming IOs can break the sequential write rule on the target
device. When a write is mapped in the middle of a block group, the IO is
directed to the middle of a target device zone, which breaks the
sequential write requirement.

However, the cloning function cannot be disabled since incoming IOs
targeting already copied device extents must be cloned so that the IO is
executed on the target device.

We cannot use dev_replace->cursor_{left,right} to determine whether a bio
is going to a not yet copied region. Since we have a time gap between
finishing btrfs_scrub_dev() and rewriting the mapping tree in
btrfs_dev_replace_finishing(), we can have a newly allocated device extent
which is never cloned nor copied.

So the point is to copy only already existing device extents. This patch
introduces mark_block_group_to_copy() to mark existing block groups as a
target of copying. Then, handle_ops_on_dev_replace() and dev-replace can
check the flag to do their job.

Also, btrfs_finish_block_group_to_copy() will check if the copied stripe
is the last stripe in the block group. With the last stripe copied,
the to_copy flag is finally disabled. Afterwards we can safely clone
incoming IOs on this block group.

Reviewed-by: Josef Bacik <[email protected]>
Signed-off-by: Naohiro Aota <[email protected]>
Signed-off-by: David Sterba <[email protected]>
  • Loading branch information
naota authored and kdave committed Feb 9, 2021
1 parent 4eef29e commit 78ce9fc
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 0 deletions.
1 change: 1 addition & 0 deletions fs/btrfs/block-group.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ struct btrfs_block_group {
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
unsigned int to_copy:1;

int disk_cache_state;

Expand Down
184 changes: 184 additions & 0 deletions fs/btrfs/dev-replace.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
#include "block-group.h"

/*
* Device replace overview
Expand Down Expand Up @@ -459,6 +460,185 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
}

static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_device *src_dev)
{
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_block_group *cache;
struct btrfs_trans_handle *trans;
int ret = 0;
u64 chunk_offset;

/* Do not use "to_copy" on non zoned filesystem for now */
if (!btrfs_is_zoned(fs_info))
return 0;

mutex_lock(&fs_info->chunk_mutex);

/* Ensure we don't have pending new block group */
spin_lock(&fs_info->trans_lock);
while (fs_info->running_transaction &&
!list_empty(&fs_info->running_transaction->dev_update_list)) {
spin_unlock(&fs_info->trans_lock);
mutex_unlock(&fs_info->chunk_mutex);
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
mutex_lock(&fs_info->chunk_mutex);
if (ret == -ENOENT) {
spin_lock(&fs_info->trans_lock);
continue;
} else {
goto unlock;
}
}

ret = btrfs_commit_transaction(trans);
mutex_lock(&fs_info->chunk_mutex);
if (ret)
goto unlock;

spin_lock(&fs_info->trans_lock);
}
spin_unlock(&fs_info->trans_lock);

path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto unlock;
}

path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;

key.objectid = src_dev->devid;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto free_path;
if (ret > 0) {
if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto free_path;
if (ret > 0) {
ret = 0;
goto free_path;
}
} else {
ret = 0;
}
}

while (1) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];

btrfs_item_key_to_cpu(leaf, &found_key, slot);

if (found_key.objectid != src_dev->devid)
break;

if (found_key.type != BTRFS_DEV_EXTENT_KEY)
break;

if (found_key.offset < key.offset)
break;

dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);

chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);

cache = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!cache)
goto skip;

spin_lock(&cache->lock);
cache->to_copy = 1;
spin_unlock(&cache->lock);

btrfs_put_block_group(cache);

skip:
ret = btrfs_next_item(root, path);
if (ret != 0) {
if (ret > 0)
ret = 0;
break;
}
}

free_path:
btrfs_free_path(path);
unlock:
mutex_unlock(&fs_info->chunk_mutex);

return ret;
}

/*
 * Decide whether the to_copy state of @cache can be dropped after handling
 * the device extent at @physical on @srcdev, and drop it if so.
 *
 * Returns true on non-zoned filesystems, when @cache has been removed, or
 * when the to_copy flag has been cleared by this call. Returns false when
 * the chunk has more than one stripe on @srcdev and the stripe matching
 * @physical is not considered the last of them; to_copy stays set then.
 */
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
				      struct btrfs_block_group *cache,
				      u64 physical)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	const u64 bg_start = cache->start;
	struct extent_map *em;
	struct map_lookup *map;
	int src_stripes = 0;
	int matched_idx = 0;
	bool removed;
	int i;

	/* Do not use "to_copy" on non zoned filesystem for now */
	if (!btrfs_is_zoned(fs_info))
		return true;

	spin_lock(&cache->lock);
	removed = cache->removed;
	spin_unlock(&cache->lock);
	if (removed)
		return true;

	em = btrfs_get_chunk_map(fs_info, bg_start, 1);
	ASSERT(!IS_ERR(em));
	map = em->map_lookup;

	/*
	 * Count the stripes that live on the source device and remember the
	 * index of the stripe that starts at @physical.
	 *
	 * NOTE(review): matched_idx is an index into the whole stripe array,
	 * while src_stripes only counts stripes on @srcdev; the comparison
	 * below mixes the two - confirm this is intended for all profiles.
	 */
	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev == srcdev) {
			src_stripes++;
			if (map->stripes[i].physical == physical)
				matched_idx = i;
		}
	}

	free_extent_map(em);

	/*
	 * Has more stripes on this device. Keep this block group
	 * readonly until we finish all the stripes.
	 */
	if (src_stripes > 1 && matched_idx < src_stripes - 1)
		return false;

	/* Last stripe on this device */
	spin_lock(&cache->lock);
	cache->to_copy = 0;
	spin_unlock(&cache->lock);

	return true;
}

static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
Expand Down Expand Up @@ -500,6 +680,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
return ret;

ret = mark_block_group_to_copy(fs_info, src_device);
if (ret)
return ret;

down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
Expand Down
3 changes: 3 additions & 0 deletions fs/btrfs/dev-replace.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
/*
 * Clear the to_copy flag of @cache once the stripe at @physical is the last
 * one on @srcdev. Returns false if more stripes on @srcdev remain, true
 * otherwise (also true on non-zoned filesystems and for removed block
 * groups).
 */
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical);

#endif
16 changes: 16 additions & 0 deletions fs/btrfs/scrub.c
Original file line number Diff line number Diff line change
Expand Up @@ -3561,6 +3561,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;

if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
spin_lock(&cache->lock);
if (!cache->to_copy) {
spin_unlock(&cache->lock);
ro_set = 0;
goto done;
}
spin_unlock(&cache->lock);
}

/*
* Make sure that while we are scrubbing the corresponding block
* group doesn't get its logical address and its device extents
Expand Down Expand Up @@ -3692,6 +3702,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,

scrub_pause_off(fs_info);

if (sctx->is_dev_replace &&
!btrfs_finish_block_group_to_copy(dev_replace->srcdev,
cache, found_key.offset))
ro_set = 0;

done:
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
Expand Down

0 comments on commit 78ce9fc

Please sign in to comment.