Skip to content

Commit

Permalink
Merge tag 'large-extent-counters-v9' of https://github.com/chandanr/l…
Browse files Browse the repository at this point in the history
…inux into xfs-5.19-for-next

xfs: Large extent counters

The commit xfs: fix inode fork extent count overflow
(3f8a4f1) mentions that 10 billion
data fork extents should be possible to create. However the
corresponding on-disk field has a signed 32-bit type. Hence this
patchset extends the per-inode data fork extent counter to 64 bits
(out of which 48 bits are used to store the extent count).

Also, XFS has an attribute fork extent counter which is 16 bits
wide. A workload that,
1. Creates 1 million 255-byte sized xattrs,
2. Deletes 50% of these xattrs in an alternating manner,
3. Tries to insert 400,000 new 255-byte sized xattrs
   causes the xattr extent counter to overflow.

Dave tells me that there are instances where a single file has more
than 100 million hardlinks. With parent pointers being stored in
xattrs, we will overflow the signed 16-bits wide attribute extent
counter when large number of hardlinks are created. Hence this
patchset extends the on-disk field to 32-bits.

The following changes are made to accomplish this,
1. A 64-bit inode field is carved out of existing di_pad and
   di_flushiter fields to hold the 64-bit data fork extent counter.
2. The existing 32-bit inode data fork extent counter will be used to
   hold the attribute fork extent counter.
3. A new incompat superblock flag to prevent older kernels from mounting
   the filesystem.

Signed-off-by: Chandan Babu R <[email protected]>
Signed-off-by: Dave Chinner <[email protected]>
  • Loading branch information
dchinner committed Apr 21, 2022
2 parents 463260d + 973ac0e commit a44a027
Show file tree
Hide file tree
Showing 37 changed files with 607 additions and 278 deletions.
2 changes: 1 addition & 1 deletion fs/xfs/libxfs/xfs_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -2511,7 +2511,7 @@ __xfs_free_extent_later(

ASSERT(bno != NULLFSBLOCK);
ASSERT(len > 0);
ASSERT(len <= MAXEXTLEN);
ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
ASSERT(!isnullstartblock(bno));
agno = XFS_FSB_TO_AGNO(mp, bno);
agbno = XFS_FSB_TO_AGBNO(mp, bno);
Expand Down
3 changes: 3 additions & 0 deletions fs/xfs/libxfs/xfs_attr.c
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,9 @@ xfs_attr_set(
if (args->value || xfs_inode_hasattr(dp)) {
error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error == -EFBIG)
error = xfs_iext_count_upgrade(args->trans, dp,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
}
Expand Down
109 changes: 45 additions & 64 deletions fs/xfs/libxfs/xfs_bmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels(
xfs_mount_t *mp, /* file system mount structure */
int whichfork) /* data or attr fork */
{
uint64_t maxblocks; /* max blocks at this level */
xfs_extnum_t maxleafents; /* max leaf entries possible */
int level; /* btree level */
uint maxblocks; /* max blocks at this level */
uint maxleafents; /* max leaf entries possible */
int maxrootrecs; /* max records in root block */
int minleafrecs; /* min records in leaf block */
int minnoderecs; /* min records in node block */
int sz; /* root block size */

/*
* The maximum number of extents in a file, hence the maximum number of
* leaf entries, is controlled by the size of the on-disk extent count,
* either a signed 32-bit number for the data fork, or a signed 16-bit
* number for the attr fork.
* The maximum number of extents in a fork, hence the maximum number of
* leaf entries, is controlled by the size of the on-disk extent count.
*
* Note that we can no longer assume that if we are in ATTR1 that the
* fork offset of all the inodes will be
Expand All @@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels(
* ATTR2 we have to assume the worst case scenario of a minimum size
* available.
*/
if (whichfork == XFS_DATA_FORK) {
maxleafents = MAXEXTNUM;
maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
whichfork);
if (whichfork == XFS_DATA_FORK)
sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
} else {
maxleafents = MAXAEXTNUM;
else
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
}

maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
maxblocks = howmany_64(maxleafents, minleafrecs);
for (level = 1; maxblocks > 1; level++) {
if (maxblocks <= maxrootrecs)
maxblocks = 1;
else
maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
maxblocks = howmany_64(maxblocks, minnoderecs);
}
mp->m_bm_maxlevels[whichfork] = level;
ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
Expand Down Expand Up @@ -468,7 +466,7 @@ xfs_bmap_check_leaf_extents(
if (bp_release)
xfs_trans_brelse(NULL, bp);
error_norelse:
xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
__func__, i);
xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
Expand Down Expand Up @@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;

/*
Expand All @@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
<= MAXEXTLEN))
<= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;

error = 0;
Expand Down Expand Up @@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;

/*
Expand All @@ -2018,13 +2016,13 @@ xfs_bmap_add_extent_unwritten_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
<= MAXEXTLEN))
<= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;

/*
Expand Down Expand Up @@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay(
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;

if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
right.br_blockcount <= MAXEXTLEN)))
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;

/*
Expand Down Expand Up @@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real(
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_startblock + left.br_blockcount == new->br_startblock &&
left.br_state == new->br_state &&
left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;

if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_startblock + new->br_blockcount == right.br_startblock &&
new->br_state == right.br_state &&
new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
left.br_blockcount + new->br_blockcount +
right.br_blockcount <= MAXEXTLEN))
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;

error = 0;
Expand Down Expand Up @@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align(

/*
* For large extent hint sizes, the aligned extent might be larger than
* MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
* the length back under MAXEXTLEN. The outer allocation loops handle
* short allocation just fine, so it is safe to do this. We only want to
* do it when we are forced to, though, because it means more allocation
* operations are required.
* XFS_BMBT_MAX_EXTLEN. In that case, reduce the size by an extsz so
* that it pulls the length back under XFS_BMBT_MAX_EXTLEN. The outer
* allocation loops handle short allocation just fine, so it is safe to
* do this. We only want to do it when we are forced to, though, because
* it means more allocation operations are required.
*/
while (align_alen > MAXEXTLEN)
while (align_alen > XFS_MAX_BMBT_EXTLEN)
align_alen -= extsz;
ASSERT(align_alen <= MAXEXTLEN);
ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);

/*
* If the previous block overlaps with this proposed allocation
Expand Down Expand Up @@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align(
return -EINVAL;
} else {
ASSERT(orig_off >= align_off);
/* see MAXEXTLEN handling above */
/* see XFS_BMBT_MAX_EXTLEN handling above */
ASSERT(orig_end <= align_off + align_alen ||
align_alen + extsz > MAXEXTLEN);
align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
}

#ifdef DEBUG
Expand Down Expand Up @@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
Expand Down Expand Up @@ -4104,7 +4102,7 @@ xfs_bmapi_allocate(
if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
bma->prev.br_startoff = NULLFILEOFF;
} else {
bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
if (!bma->eof)
bma->length = XFS_FILBLKS_MIN(bma->length,
bma->got.br_startoff - bma->offset);
Expand Down Expand Up @@ -4424,8 +4422,8 @@ xfs_bmapi_write(
* xfs_extlen_t and therefore 32 bits. Hence we have to
* check for 32-bit overflows and handle them here.
*/
if (len > (xfs_filblks_t)MAXEXTLEN)
bma.length = MAXEXTLEN;
if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
bma.length = XFS_MAX_BMBT_EXTLEN;
else
bma.length = len;

Expand Down Expand Up @@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc(
return error;

xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);

error = xfs_iext_count_may_overflow(ip, whichfork,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip,
XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;

xfs_trans_ijoin(tp, ip, 0);

if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
bma.got.br_startoff > offset_fsb) {
/*
Expand All @@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc(
bma.ip = ip;
bma.wasdel = true;
bma.offset = bma.got.br_startoff;
bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
XFS_MAX_BMBT_EXTLEN);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);

/*
Expand Down Expand Up @@ -4641,7 +4642,7 @@ xfs_bmapi_remap(

ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(len > 0);
ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
Expand Down Expand Up @@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real(
* Deleting the middle of the extent.
*/

/*
* For directories, -ENOSPC is returned since a directory entry
* remove operation must not fail due to low extent count
* availability. -ENOSPC will be handled by higher layers of XFS
* by letting the corresponding empty Data/Free blocks to linger
* until a future remove operation. Dabtree blocks would be
* swapped with the last block in the leaf space and then the
* new last block will be unmapped.
*
* The above logic also applies to the source directory entry of
* a rename operation.
*/
error = xfs_iext_count_may_overflow(ip, whichfork, 1);
if (error) {
ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
whichfork == XFS_DATA_FORK);
error = -ENOSPC;
goto done;
}

old = got;

got.br_blockcount = del->br_startoff - got.br_startoff;
Expand Down Expand Up @@ -5641,7 +5622,7 @@ xfs_bmse_can_merge(
if ((left->br_startoff + left->br_blockcount != startoff) ||
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
(left->br_state != got->br_state) ||
(left->br_blockcount + got->br_blockcount > MAXEXTLEN))
(left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
return false;

return true;
Expand Down
9 changes: 7 additions & 2 deletions fs/xfs/libxfs/xfs_bmap_btree.c
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
return xfs_bmbt_block_maxrecs(blocklen, leaf);
}

/* Compute the max possible height for block mapping btrees. */
/*
* Calculate the maximum possible height of the btree that the on-disk format
* supports. This is used for sizing structures large enough to support every
* possible configuration of a filesystem that might get mounted.
*/
unsigned int
xfs_bmbt_maxlevels_ondisk(void)
{
Expand All @@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;

/* One extra level for the inode root. */
return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
return xfs_btree_compute_maxlevels(minrecs,
XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
}

/*
Expand Down
1 change: 1 addition & 0 deletions fs/xfs/libxfs/xfs_da_btree.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ struct xfs_da_geometry {
unsigned int free_hdr_size; /* dir2 free header size */
unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
xfs_extnum_t max_extents; /* Max. extents in corresponding fork */

xfs_dir2_data_aoff_t data_first_offset;
size_t data_entry_offset;
Expand Down
1 change: 1 addition & 0 deletions fs/xfs/libxfs/xfs_da_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
* Directory address space divided into sections,
* spaces separated by 32GB.
*/
#define XFS_DIR2_MAX_SPACES 3
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
Expand Down
8 changes: 8 additions & 0 deletions fs/xfs/libxfs/xfs_dir2.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ xfs_da_mount(
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
mp->m_sb.sb_blocklog;
dageo->magicpct = (dageo->blksize * 37) / 100;

/* set up attribute geometry - single fsb only */
Expand All @@ -161,6 +163,12 @@ xfs_da_mount(
dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);

if (xfs_has_large_extent_counts(mp))
dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
else
dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;

dageo->magicpct = (dageo->blksize * 37) / 100;
return 0;
}
Expand Down
Loading

0 comments on commit a44a027

Please sign in to comment.