Skip to content

Commit

Permalink
Btrfs: fix broken free space cache after the system crashed
Browse files Browse the repository at this point in the history
When we mounted the filesystem after the crash, we got the following
message:
  BTRFS error (device xxx): block group xxxx has wrong amount of free space
  BTRFS error (device xxx): failed to load free space cache for block group xxx

This is because we didn't update the metadata of the allocated space (in the
extent tree) until the file data was written to disk. During this window, there
was no information about the allocated space in either the extent tree or the
free space cache. If we wrote out the free space cache during this window (at
transaction commit), that space was lost. In fact, only the free space that is
used to store file data had this problem; the rest didn't, because its
metadata is updated in the same transaction context.

There are several ways to fix the above problem:
- track the allocated space, and write it out when we write out the free
  space cache
- account the size of the allocated space that is used to store the file
  data; if the size is not zero, don't write out the free space cache.

The first one is complex and may hurt performance.
This patch chooses the second method: we use a per-block-group variable to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.

Signed-off-by: Miao Xie <[email protected]>
Signed-off-by: Chris Mason <[email protected]>
  • Loading branch information
Miao Xie authored and masoncl committed Jun 19, 2014
1 parent 5349d6c commit e570fd2
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 44 deletions.
13 changes: 11 additions & 2 deletions fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -1259,11 +1259,19 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 sectorsize;
u64 cache_generation;

/*
* It is only used for the delayed data space allocation, because
* only the data space allocation and the related metadata update
* can span across transactions.
*/
struct rw_semaphore data_rwsem;

/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;

Expand Down Expand Up @@ -3316,7 +3324,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data);
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
Expand All @@ -3330,7 +3338,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
u64 owner, u64 offset, int no_quota);

int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
int delalloc);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
Expand Down
143 changes: 112 additions & 31 deletions fs/btrfs/extent-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve);
u64 num_bytes, int reserve,
int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
Expand Down Expand Up @@ -3260,7 +3261,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,

spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
!btrfs_test_opt(root, SPACE_CACHE)) {
!btrfs_test_opt(root, SPACE_CACHE) ||
block_group->delalloc_bytes) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
Expand Down Expand Up @@ -5613,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @reserve: One of the reservation enums
* @delalloc: The blocks are allocated for the delalloc write
*
* This is called by the allocator when it reserves space, or by somebody who is
* freeing space that was never actually used on disk. For example if you
Expand All @@ -5631,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* succeeds.
*/
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve)
u64 num_bytes, int reserve, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
Expand All @@ -5650,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}

if (delalloc)
cache->delalloc_bytes += num_bytes;
}
} else {
if (cache->ro)
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;

if (delalloc)
cache->delalloc_bytes -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
Expand Down Expand Up @@ -6206,7 +6215,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
Expand Down Expand Up @@ -6365,6 +6374,70 @@ enum btrfs_loop_type {
LOOP_NO_EMPTY_SIZE = 3,
};

/*
 * Take @cache's data_rwsem for reading when the caller is allocating for a
 * delalloc write (@delalloc is non-zero). For any other allocation this is
 * a no-op.
 */
static inline void
btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	if (!delalloc)
		return;

	down_read(&cache->data_rwsem);
}

/*
 * Call btrfs_get_block_group() on @cache and, when the caller is allocating
 * for a delalloc write (@delalloc is non-zero), additionally take @cache's
 * data_rwsem for reading.
 */
static inline void
btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	btrfs_get_block_group(cache);

	if (!delalloc)
		return;

	down_read(&cache->data_rwsem);
}

/*
 * Lock @cluster and the block group it currently points at.
 *
 * @block_group: the block group the caller is already working on
 * @cluster:     the free cluster to lock
 * @delalloc:    non-zero when allocating for a delalloc write
 *
 * Always returns with cluster->refill_lock held. The return value is:
 *  - NULL if the cluster has no block group;
 *  - @block_group itself if the cluster points at it (nothing extra is
 *    taken in that case);
 *  - otherwise the cluster's block group, after btrfs_get_block_group() has
 *    been called on it and, if @delalloc is set, with its data_rwsem held
 *    for reading.
 *
 * The rwsem may sleep, so it cannot be waited for under the refill spinlock.
 * When the trylock fails we drop the spinlock, block on the rwsem, retake the
 * spinlock and then re-check that the cluster still points at the same block
 * group; if it moved, everything taken on the stale group is released and we
 * start over.
 *
 * NOTE(review): the caller in find_free_extent() may already hold another
 * block group's data_rwsem for reading when it gets here - confirm whether
 * lockdep needs a nesting annotation for the blocking down_read() below.
 */
static struct btrfs_block_group_cache *
btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
{
	struct btrfs_block_group_cache *cluster_bg = NULL;

	spin_lock(&cluster->refill_lock);
	for (;;) {
		cluster_bg = cluster->block_group;

		/* No block group, or the one the caller already owns. */
		if (!cluster_bg || cluster_bg == block_group)
			return cluster_bg;

		btrfs_get_block_group(cluster_bg);

		/* Fast path: no rwsem needed, or it was free right away. */
		if (!delalloc || down_read_trylock(&cluster_bg->data_rwsem))
			return cluster_bg;

		/* Slow path: must sleep on the rwsem outside the spinlock. */
		spin_unlock(&cluster->refill_lock);
		down_read(&cluster_bg->data_rwsem);
		spin_lock(&cluster->refill_lock);

		/* Still the cluster's block group? Then we are done. */
		if (cluster_bg == cluster->block_group)
			return cluster_bg;

		/* The cluster moved while we slept; undo and retry. */
		up_read(&cluster_bg->data_rwsem);
		btrfs_put_block_group(cluster_bg);
	}
}

/*
 * Undo btrfs_grab_block_group(): release the read side of @cache's
 * data_rwsem if it was taken for a delalloc allocation (@delalloc is
 * non-zero), then call btrfs_put_block_group() on @cache.
 *
 * The rwsem is released before the put so that @cache is not touched after
 * the put.
 */
static inline void
btrfs_release_block_group(struct btrfs_block_group_cache *cache,
			  int delalloc)
{
	if (delalloc != 0)
		up_read(&cache->data_rwsem);

	btrfs_put_block_group(cache);
}

/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
Expand All @@ -6379,7 +6452,7 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
u64 flags)
u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
Expand Down Expand Up @@ -6467,6 +6540,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
up_read(&space_info->groups_sem);
} else {
index = get_block_group_index(block_group);
btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
} else if (block_group) {
Expand All @@ -6481,7 +6555,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 offset;
int cached;

btrfs_get_block_group(block_group);
btrfs_grab_block_group(block_group, delalloc);
search_start = block_group->key.objectid;

/*
Expand Down Expand Up @@ -6529,16 +6603,16 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
* the refill lock keeps out other
* people trying to start a new cluster
*/
spin_lock(&last_ptr->refill_lock);
used_block_group = last_ptr->block_group;
if (used_block_group != block_group &&
(!used_block_group ||
used_block_group->ro ||
!block_group_bits(used_block_group, flags)))
used_block_group = btrfs_lock_cluster(block_group,
last_ptr,
delalloc);
if (!used_block_group)
goto refill_cluster;

if (used_block_group != block_group)
btrfs_get_block_group(used_block_group);
if (used_block_group != block_group &&
(used_block_group->ro ||
!block_group_bits(used_block_group, flags)))
goto release_cluster;

offset = btrfs_alloc_from_cluster(used_block_group,
last_ptr,
Expand All @@ -6552,16 +6626,15 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
used_block_group,
search_start, num_bytes);
if (used_block_group != block_group) {
btrfs_put_block_group(block_group);
btrfs_release_block_group(block_group,
delalloc);
block_group = used_block_group;
}
goto checks;
}

WARN_ON(last_ptr->block_group != used_block_group);
if (used_block_group != block_group)
btrfs_put_block_group(used_block_group);
refill_cluster:
release_cluster:
/* If we are on LOOP_NO_EMPTY_SIZE, we can't
* set up a new clusters, so lets just skip it
* and let the allocator find whatever block
Expand All @@ -6578,8 +6651,10 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
* succeeding in the unclustered
* allocation. */
if (loop >= LOOP_NO_EMPTY_SIZE &&
last_ptr->block_group != block_group) {
used_block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
btrfs_release_block_group(used_block_group,
delalloc);
goto unclustered_alloc;
}

Expand All @@ -6589,6 +6664,10 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);

if (used_block_group != block_group)
btrfs_release_block_group(used_block_group,
delalloc);
refill_cluster:
if (loop >= LOOP_NO_EMPTY_SIZE) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
Expand Down Expand Up @@ -6696,7 +6775,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
BUG_ON(offset > search_start);

ret = btrfs_update_reserved_bytes(block_group, num_bytes,
alloc_type);
alloc_type, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
Expand All @@ -6708,13 +6787,13 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,

trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
btrfs_put_block_group(block_group);
btrfs_release_block_group(block_group, delalloc);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
btrfs_put_block_group(block_group);
btrfs_release_block_group(block_group, delalloc);
}
up_read(&space_info->groups_sem);

Expand Down Expand Up @@ -6827,7 +6906,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int btrfs_reserve_extent(struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data)
struct btrfs_key *ins, int is_data, int delalloc)
{
bool final_tried = false;
u64 flags;
Expand All @@ -6837,7 +6916,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
flags);
flags, delalloc);

if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
Expand All @@ -6862,7 +6941,8 @@ int btrfs_reserve_extent(struct btrfs_root *root,
}

static int __btrfs_free_reserved_extent(struct btrfs_root *root,
u64 start, u64 len, int pin)
u64 start, u64 len,
int pin, int delalloc)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
Expand All @@ -6881,7 +6961,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
pin_down_extent(root, cache, start, len, 1);
else {
btrfs_add_free_space(cache, start, len);
btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
}
btrfs_put_block_group(cache);

Expand All @@ -6891,15 +6971,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
}

int btrfs_free_reserved_extent(struct btrfs_root *root,
u64 start, u64 len)
u64 start, u64 len, int delalloc)
{
return __btrfs_free_reserved_extent(root, start, len, 0);
return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
}

int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len)
{
return __btrfs_free_reserved_extent(root, start, len, 1);
return __btrfs_free_reserved_extent(root, start, len, 1, 0);
}

static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
Expand Down Expand Up @@ -7114,7 +7194,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
return -EINVAL;

ret = btrfs_update_reserved_bytes(block_group, ins->offset,
RESERVE_ALLOC_NO_ACCOUNT);
RESERVE_ALLOC_NO_ACCOUNT, 0);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
Expand Down Expand Up @@ -7256,7 +7336,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return ERR_CAST(block_rsv);

ret = btrfs_reserve_extent(root, blocksize, blocksize,
empty_size, hint, &ins, 0);
empty_size, hint, &ins, 0, 0);
if (ret) {
unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
Expand Down Expand Up @@ -8659,6 +8739,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
start);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->new_bg_list);
Expand Down
Loading

0 comments on commit e570fd2

Please sign in to comment.