Skip to content

Commit

Permalink
Merge tag 'xfs-rmap-for-linus-4.8-rc1' of git://git.kernel.org/pub/sc…
Browse files Browse the repository at this point in the history
…m/linux/kernel/git/dgc/linux-xfs

Pull more xfs updates from Dave Chinner:
 "This is the second part of the XFS updates for this merge cycle, and
  contains the new reverse block mapping feature for XFS.

  Reverse mapping allows us to track the owner of a specific block on
  disk precisely.  It is implemented as a set of btrees (one per
  allocation group) that track the owners of allocated extents.
  Effectively it is a "used space tree" that is updated when we allocate
  or free extents.  i.e. it is coherent with the free space btrees we
  already maintain and never overlaps with them.

  This reverse mapping infrastructure is the building block of several
  upcoming features - reflink, copy-on-write data, dedupe, online
  metadata and data scrubbing, highly accurate bad sector/data loss
  reporting to users, and significantly improved reconstruction of
  damaged and corrupted filesystems.  There's a lot of new stuff coming
  along in the next couple of cycles,a nd it all builds in the rmap
  infrastructure.

  As such, it's a huge chunk of new code with new on-disk format
  features and internal infrastructure.  It warns at mount time as an
  experimental feature and that it may eat data (as we do with all new
  on-disk features until they stabilise).  We have not released
  userspace suport for it yet - userspace support currently requires
  download from Darrick's xfsprogs repo and build from source, so the
  access to this feature is really developer/tester only at this point.
  Initial userspace support will be released at the same time kernel
  with this code in it is released.

  The new rmap enabled code regresses 3 xfstests - all are ENOSPC
  related corner cases, one of which Darrick posted a fix for a few
  hours ago.  The other two are fixed by infrastructure that is part of
  the upcoming reflink patchset.  This new ENOSPC infrastructure
  requires a on-disk format tweak required to keep mount times in
  check - we need to keep an on-disk count of allocated rmapbt blocks so
  we don't have to scan the entire btrees at mount time to count them.

  This is currently being tested and will be part of the fixes sent in
  the next week or two so users will not be exposed to this change"

* tag 'xfs-rmap-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (52 commits)
  xfs: move (and rename) the deferred bmap-free tracepoints
  xfs: collapse single use static functions
  xfs: remove unnecessary parentheses from log redo item recovery functions
  xfs: remove the extents array from the rmap update done log item
  xfs: in btree_lshift, only allocate temporary cursor when needed
  xfs: remove unnecesary lshift/rshift key initialization
  xfs: remove the get*keys and update_keys btree ops pointers
  xfs: enable the rmap btree functionality
  xfs: don't update rmapbt when fixing agfl
  xfs: disable XFS_IOC_SWAPEXT when rmap btree is enabled
  xfs: add rmap btree block detection to log recovery
  xfs: add rmap btree geometry feature flag
  xfs: propagate bmap updates to rmapbt
  xfs: enable the xfs_defer mechanism to process rmaps to update
  xfs: log rmap intent items
  xfs: create rmap update intent log items
  xfs: add rmap btree insert and delete helpers
  xfs: convert unwritten status of reverse mappings
  xfs: remove an extent from the rmap btree
  xfs: add an extent to the rmap btree
  ...
  • Loading branch information
torvalds committed Aug 6, 2016
2 parents 835c92d + 3481b68 commit 0cbbc42
Show file tree
Hide file tree
Showing 64 changed files with 6,267 additions and 915 deletions.
5 changes: 5 additions & 0 deletions fs/xfs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_btree.o \
xfs_da_btree.o \
xfs_da_format.o \
xfs_defer.o \
xfs_dir2.o \
xfs_dir2_block.o \
xfs_dir2_data.o \
Expand All @@ -51,6 +52,8 @@ xfs-y += $(addprefix libxfs/, \
xfs_inode_fork.o \
xfs_inode_buf.o \
xfs_log_rlimit.o \
xfs_rmap.o \
xfs_rmap_btree.o \
xfs_sb.o \
xfs_symlink_remote.o \
xfs_trans_resv.o \
Expand Down Expand Up @@ -100,11 +103,13 @@ xfs-y += xfs_log.o \
xfs_extfree_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
xfs_rmap_item.o \
xfs_log_recover.o \
xfs_trans_ail.o \
xfs_trans_buf.o \
xfs_trans_extfree.o \
xfs_trans_inode.o \
xfs_trans_rmap.o \

# optional features
xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
Expand Down
149 changes: 136 additions & 13 deletions fs/xfs/libxfs/xfs_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
Expand All @@ -49,6 +51,81 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);

xfs_extlen_t
xfs_prealloc_blocks(
struct xfs_mount *mp)
{
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
return XFS_RMAP_BLOCK(mp) + 1;
if (xfs_sb_version_hasfinobt(&mp->m_sb))
return XFS_FIBT_BLOCK(mp) + 1;
return XFS_IBT_BLOCK(mp) + 1;
}

/*
* In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
* AGF buffer (PV 947395), we place constraints on the relationship among
* actual allocations for data blocks, freelist blocks, and potential file data
* bmap btree blocks. However, these restrictions may result in no actual space
* allocated for a delayed extent, for example, a data block in a certain AG is
* allocated but there is no additional block for the additional bmap btree
* block due to a split of the bmap btree of the file. The result of this may
* lead to an infinite loop when the file gets flushed to disk and all delayed
* extents need to be actually allocated. To get around this, we explicitly set
* aside a few blocks which will not be reserved in delayed allocation.
*
* When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist
* and 4 more to handle a potential split of the file's bmap btree.
*
* When rmap is enabled, we must also be able to handle two rmap btree inserts
* to record both the file data extent and a new bmbt block. The bmbt block
* might not be in the same AG as the file data extent. In the worst case
* the bmap btree splits multiple levels and all the new blocks come from
* different AGs, so set aside enough to handle rmap btree splits in all AGs.
*/
unsigned int
xfs_alloc_set_aside(
struct xfs_mount *mp)
{
unsigned int blocks;

blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels;
return blocks;
}

/*
* When deciding how much space to allocate out of an AG, we limit the
* allocation maximum size to the size the AG. However, we cannot use all the
* blocks in the AG - some are permanently used by metadata. These
* blocks are generally:
* - the AG superblock, AGF, AGI and AGFL
* - the AGF (bno and cnt) and AGI btree root blocks, and optionally
* the AGI free inode and rmap btree root blocks.
* - blocks on the AGFL according to xfs_alloc_set_aside() limits
* - the rmapbt root block
*
* The AG headers are sector sized, so the amount of space they take up is
* dependent on filesystem geometry. The others are all single blocks.
*/
unsigned int
xfs_alloc_ag_max_usable(
struct xfs_mount *mp)
{
unsigned int blocks;

blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
blocks += XFS_ALLOC_AGFL_RESERVE;
blocks += 3; /* AGF, AGI btree root blocks */
if (xfs_sb_version_hasfinobt(&mp->m_sb))
blocks++; /* finobt root block */
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
blocks++; /* rmap root block */

return mp->m_sb.sb_agblocks - blocks;
}

/*
* Lookup the record equal to [bno, len] in the btree given by cur.
*/
Expand Down Expand Up @@ -636,6 +713,14 @@ xfs_alloc_ag_vextent(
ASSERT(!args->wasfromfl || !args->isfl);
ASSERT(args->agbno % args->alignment == 0);

/* if not file data, insert new block into the reverse map btree */
if (args->oinfo.oi_owner != XFS_RMAP_OWN_UNKNOWN) {
error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
args->agbno, args->len, &args->oinfo);
if (error)
return error;
}

if (!args->wasfromfl) {
error = xfs_alloc_update_counters(args->tp, args->pag,
args->agbp,
Expand Down Expand Up @@ -1577,14 +1662,15 @@ xfs_alloc_ag_vextent_small(
/*
* Free the extent starting at agno/bno for length.
*/
STATIC int /* error */
STATIC int
xfs_free_ag_extent(
xfs_trans_t *tp, /* transaction pointer */
xfs_buf_t *agbp, /* buffer for a.g. freelist header */
xfs_agnumber_t agno, /* allocation group number */
xfs_agblock_t bno, /* starting block number */
xfs_extlen_t len, /* length of extent */
int isfl) /* set if is freelist blocks - no sb acctg */
xfs_trans_t *tp,
xfs_buf_t *agbp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len,
struct xfs_owner_info *oinfo,
int isfl)
{
xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
Expand All @@ -1601,12 +1687,19 @@ xfs_free_ag_extent(
xfs_extlen_t nlen; /* new length of freespace */
xfs_perag_t *pag; /* per allocation group data */

bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;

if (oinfo->oi_owner != XFS_RMAP_OWN_UNKNOWN) {
error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
if (error)
goto error0;
}

/*
* Allocate and initialize a cursor for the by-block btree.
*/
bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
cnt_cur = NULL;
/*
* Look for a neighboring block on the left (lower block numbers)
* that is contiguous with this space.
Expand Down Expand Up @@ -1875,6 +1968,11 @@ xfs_alloc_min_freelist(
/* space needed by-size freespace btree */
min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
mp->m_ag_maxlevels);
/* space needed reverse mapping used space btree */
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
min_free += min_t(unsigned int,
pag->pagf_levels[XFS_BTNUM_RMAPi] + 1,
mp->m_rmap_maxlevels);

return min_free;
}
Expand Down Expand Up @@ -1992,21 +2090,34 @@ xfs_alloc_fix_freelist(
* anything other than extra overhead when we need to put more blocks
* back on the free list? Maybe we should only do this when space is
* getting low or the AGFL is more than half full?
*
* The NOSHRINK flag prevents the AGFL from being shrunk if it's too
* big; the NORMAP flag prevents AGFL expand/shrink operations from
* updating the rmapbt. Both flags are used in xfs_repair while we're
* rebuilding the rmapbt, and neither are used by the kernel. They're
* both required to ensure that rmaps are correctly recorded for the
* regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
* repair/rmap.c in xfsprogs for details.
*/
while (pag->pagf_flcount > need) {
memset(&targs, 0, sizeof(targs));
if (flags & XFS_ALLOC_FLAG_NORMAP)
xfs_rmap_skip_owner_update(&targs.oinfo);
else
xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
struct xfs_buf *bp;

error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
&targs.oinfo, 1);
if (error)
goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
xfs_trans_binval(tp, bp);
}

memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
targs.agbp = agbp;
Expand Down Expand Up @@ -2271,6 +2382,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
return false;

if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)
return false;

/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
Expand Down Expand Up @@ -2402,6 +2517,8 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
pag->pagf_levels[XFS_BTNUM_CNTi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
pag->pagf_levels[XFS_BTNUM_RMAPi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
pag->pagb_tree = RB_ROOT;
Expand Down Expand Up @@ -2691,7 +2808,8 @@ int /* error */
xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len) /* length of extent */
xfs_extlen_t len, /* length of extent */
struct xfs_owner_info *oinfo) /* extent owner */
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
Expand All @@ -2701,6 +2819,11 @@ xfs_free_extent(

ASSERT(len != 0);

if (XFS_TEST_ERROR(false, mp,
XFS_ERRTAG_FREE_EXTENT,
XFS_RANDOM_FREE_EXTENT))
return -EIO;

error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
if (error)
return error;
Expand All @@ -2712,7 +2835,7 @@ xfs_free_extent(
agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
err);

error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0);
if (error)
goto err;

Expand Down
52 changes: 14 additions & 38 deletions fs/xfs/libxfs/xfs_alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,41 +54,8 @@ typedef unsigned int xfs_alloctype_t;
*/
#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/

/*
* In order to avoid ENOSPC-related deadlock caused by
* out-of-order locking of AGF buffer (PV 947395), we place
* constraints on the relationship among actual allocations for
* data blocks, freelist blocks, and potential file data bmap
* btree blocks. However, these restrictions may result in no
* actual space allocated for a delayed extent, for example, a data
* block in a certain AG is allocated but there is no additional
* block for the additional bmap btree block due to a split of the
* bmap btree of the file. The result of this may lead to an
* infinite loop in xfssyncd when the file gets flushed to disk and
* all delayed extents need to be actually allocated. To get around
* this, we explicitly set aside a few blocks which will not be
* reserved in delayed allocation. Considering the minimum number of
* needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
* btree requires 1 fsb, so we set the number of set-aside blocks
* to 4 + 4*agcount.
*/
#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4))

/*
* When deciding how much space to allocate out of an AG, we limit the
* allocation maximum size to the size the AG. However, we cannot use all the
* blocks in the AG - some are permanently used by metadata. These
* blocks are generally:
* - the AG superblock, AGF, AGI and AGFL
* - the AGF (bno and cnt) and AGI btree root blocks
* - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
*
* The AG headers are sector sized, so the amount of space they take up is
* dependent on filesystem geometry. The others are all single blocks.
*/
#define XFS_ALLOC_AG_MAX_USABLE(mp) \
((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */
#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */


/*
Expand Down Expand Up @@ -123,6 +90,7 @@ typedef struct xfs_alloc_arg {
char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
struct xfs_owner_info oinfo; /* owner of blocks being allocated */
} xfs_alloc_arg_t;

/*
Expand All @@ -132,6 +100,11 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */

/* freespace limit calculations */
#define XFS_ALLOC_AGFL_RESERVE 4
unsigned int xfs_alloc_set_aside(struct xfs_mount *mp);
unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp);

xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag, xfs_extlen_t need);
unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
Expand Down Expand Up @@ -208,9 +181,10 @@ xfs_alloc_vextent(
*/
int /* error */
xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len); /* length of extent */
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
struct xfs_owner_info *oinfo);/* extent owner */

int /* error */
xfs_alloc_lookup_ge(
Expand All @@ -232,4 +206,6 @@ int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **agbp);

xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);

#endif /* __XFS_ALLOC_H__ */
12 changes: 0 additions & 12 deletions fs/xfs/libxfs/xfs_alloc_btree.c
Original file line number Diff line number Diff line change
Expand Up @@ -211,17 +211,6 @@ xfs_allocbt_init_key_from_rec(
key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
}

STATIC void
xfs_allocbt_init_rec_from_key(
union xfs_btree_key *key,
union xfs_btree_rec *rec)
{
ASSERT(key->alloc.ar_startblock != 0);

rec->alloc.ar_startblock = key->alloc.ar_startblock;
rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
}

STATIC void
xfs_allocbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
Expand Down Expand Up @@ -406,7 +395,6 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.get_minrecs = xfs_allocbt_get_minrecs,
.get_maxrecs = xfs_allocbt_get_maxrecs,
.init_key_from_rec = xfs_allocbt_init_key_from_rec,
.init_rec_from_key = xfs_allocbt_init_rec_from_key,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_allocbt_key_diff,
Expand Down
Loading

0 comments on commit 0cbbc42

Please sign in to comment.