Skip to content

Commit

Permalink
xfs: Replace per-ag array with a radix tree
Browse files Browse the repository at this point in the history
The use of an array for the per-ag structures requires reallocation
of the array when growing the filesystem. This requires locking
access to the array to avoid use after free situations, and the
locking is difficult to get right. To avoid needing to reallocate an
array, change the per-ag structures to an allocated object per ag
and index them using a tree structure.

The AGs are always densely indexed (hence the use of an array), but
the number supported is 2^32 and lookups tend to be random and hence
indexing needs to scale. A simple choice is a radix tree - it works
well with this sort of index.  This change also removes another
large contiguous allocation from the mount/growfs path in XFS.

The growing process now needs to change to only initialise the new
AGs required for the extra space, and as such only needs to
exclusively lock the tree for inserts. The rest of the code only
needs to lock the tree while doing lookups, and hence this will
remove all the deadlocks that currently occur on the m_perag_lock as
it is now an innermost lock. The lock is also changed to a spinlock
from a read/write lock as the hold time is now extremely short.

To complete the picture, the per-ag structures will need to be
reference counted to ensure that we don't free/modify them while
they are still in use.  This will be done in subsequent patch.

Signed-off-by: Dave Chinner <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Alex Elder <[email protected]>
  • Loading branch information
dchinner authored and Alex Elder committed Jan 15, 2010
1 parent 44b56e0 commit 1c1c6eb
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 90 deletions.
8 changes: 0 additions & 8 deletions fs/xfs/xfs_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -2276,7 +2276,6 @@ xfs_alloc_vextent(
* These three force us into a single a.g.
*/
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
down_read(&mp->m_peraglock);
args->pag = xfs_perag_get(mp, args->agno);
args->minleft = 0;
error = xfs_alloc_fix_freelist(args, 0);
Expand All @@ -2286,14 +2285,12 @@ xfs_alloc_vextent(
goto error0;
}
if (!args->agbp) {
up_read(&mp->m_peraglock);
trace_xfs_alloc_vextent_noagbp(args);
break;
}
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
if ((error = xfs_alloc_ag_vextent(args)))
goto error0;
up_read(&mp->m_peraglock);
break;
case XFS_ALLOCTYPE_START_BNO:
/*
Expand Down Expand Up @@ -2345,7 +2342,6 @@ xfs_alloc_vextent(
* Loop over allocation groups twice; first time with
* trylock set, second time without.
*/
down_read(&mp->m_peraglock);
for (;;) {
args->pag = xfs_perag_get(mp, args->agno);
if (no_min) args->minleft = 0;
Expand Down Expand Up @@ -2408,7 +2404,6 @@ xfs_alloc_vextent(
}
xfs_perag_put(args->pag);
}
up_read(&mp->m_peraglock);
if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
if (args->agno == sagno)
mp->m_agfrotor = (mp->m_agfrotor + 1) %
Expand Down Expand Up @@ -2438,7 +2433,6 @@ xfs_alloc_vextent(
return 0;
error0:
xfs_perag_put(args->pag);
up_read(&mp->m_peraglock);
return error;
}

Expand All @@ -2463,7 +2457,6 @@ xfs_free_extent(
args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
ASSERT(args.agno < args.mp->m_sb.sb_agcount);
args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
down_read(&args.mp->m_peraglock);
args.pag = xfs_perag_get(args.mp, args.agno);
if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
goto error0;
Expand All @@ -2475,7 +2468,6 @@ xfs_free_extent(
error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
error0:
xfs_perag_put(args.pag);
up_read(&args.mp->m_peraglock);
return error;
}

Expand Down
7 changes: 1 addition & 6 deletions fs/xfs/xfs_bmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -2629,14 +2629,12 @@ xfs_bmap_btalloc(
if (startag == NULLAGNUMBER)
startag = ag = 0;
notinit = 0;
down_read(&mp->m_peraglock);
pag = xfs_perag_get(mp, ag);
while (blen < ap->alen) {
if (!pag->pagf_init &&
(error = xfs_alloc_pagf_init(mp, args.tp,
ag, XFS_ALLOC_FLAG_TRYLOCK))) {
xfs_perag_put(pag);
up_read(&mp->m_peraglock);
return error;
}
/*
Expand Down Expand Up @@ -2669,10 +2667,8 @@ xfs_bmap_btalloc(

error = xfs_filestream_new_ag(ap, &ag);
xfs_perag_put(pag);
if (error) {
up_read(&mp->m_peraglock);
if (error)
return error;
}

/* loop again to set 'blen'*/
startag = NULLAGNUMBER;
Expand All @@ -2688,7 +2684,6 @@ xfs_bmap_btalloc(
pag = xfs_perag_get(mp, ag);
}
xfs_perag_put(pag);
up_read(&mp->m_peraglock);
/*
* Since the above loop did a BUF_TRYLOCK, it is
* possible that there is space for this request.
Expand Down
13 changes: 4 additions & 9 deletions fs/xfs/xfs_filestream.c
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,7 @@ _xfs_filestream_pick_ag(

/*
* Set the allocation group number for a file or a directory, updating inode
* references and per-AG references as appropriate. Must be called with the
* m_peraglock held in read mode.
* references and per-AG references as appropriate.
*/
static int
_xfs_filestream_update_ag(
Expand Down Expand Up @@ -456,10 +455,10 @@ xfs_filestream_unmount(
}

/*
* If the mount point's m_perag array is going to be reallocated, all
* If the mount point's m_perag tree is going to be modified, all
* outstanding cache entries must be flushed to avoid accessing reference count
* addresses that have been freed. The call to xfs_filestream_flush() must be
* made inside the block that holds the m_peraglock in write mode to do the
* made inside the block that holds the m_perag_lock in write mode to do the
* reallocation.
*/
void
Expand Down Expand Up @@ -531,7 +530,6 @@ xfs_filestream_associate(

mp = pip->i_mount;
cache = mp->m_filestream;
down_read(&mp->m_peraglock);

/*
* We have a problem, Houston.
Expand All @@ -548,10 +546,8 @@ xfs_filestream_associate(
*
* So, if we can't get the iolock without sleeping then just give up
*/
if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) {
up_read(&mp->m_peraglock);
if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
return 1;
}

/* If the parent directory is already in the cache, use its AG. */
item = xfs_mru_cache_lookup(cache, pip->i_ino);
Expand Down Expand Up @@ -606,7 +602,6 @@ xfs_filestream_associate(

exit:
xfs_iunlock(pip, XFS_IOLOCK_EXCL);
up_read(&mp->m_peraglock);
return -err;
}

Expand Down
42 changes: 21 additions & 21 deletions fs/xfs/xfs_fsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,27 +167,14 @@ xfs_growfs_data_private(
}
new = nb - mp->m_sb.sb_dblocks;
oagcount = mp->m_sb.sb_agcount;
if (nagcount > oagcount) {
void *new_perag, *old_perag;

xfs_filestream_flush(mp);

new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
KM_MAYFAIL);
if (!new_perag)
return XFS_ERROR(ENOMEM);

down_write(&mp->m_peraglock);
memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
old_perag = mp->m_perag;
mp->m_perag = new_perag;

mp->m_flags |= XFS_MOUNT_32BITINODES;
nagimax = xfs_initialize_perag(mp, nagcount);
up_write(&mp->m_peraglock);

kmem_free(old_perag);
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
error = xfs_initialize_perag(mp, nagcount, &nagimax);
if (error)
return error;
}

tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
tp->t_flags |= XFS_TRANS_RESERVE;
if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
Expand All @@ -196,6 +183,11 @@ xfs_growfs_data_private(
return error;
}

/*
* Write new AG headers to disk. Non-transactional, but written
* synchronously so they are completed prior to the growfs transaction
* being logged.
*/
nfree = 0;
for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
/*
Expand Down Expand Up @@ -359,6 +351,12 @@ xfs_growfs_data_private(
goto error0;
}
}

/*
* Update changed superblock fields transactionally. These are not
* seen by the rest of the world until the transaction commit applies
* them atomically to the superblock.
*/
if (nagcount > oagcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
if (nb > mp->m_sb.sb_dblocks)
Expand All @@ -369,9 +367,9 @@ xfs_growfs_data_private(
if (dpct)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
error = xfs_trans_commit(tp, 0);
if (error) {
if (error)
return error;
}

/* New allocation groups fully initialized, so update mount struct */
if (nagimax)
mp->m_maxagi = nagimax;
Expand All @@ -381,6 +379,8 @@ xfs_growfs_data_private(
mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
} else
mp->m_maxicount = 0;

/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
error = xfs_read_buf(mp, mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
Expand Down
25 changes: 2 additions & 23 deletions fs/xfs/xfs_ialloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -383,11 +383,9 @@ xfs_ialloc_ag_alloc(
newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
down_read(&args.mp->m_peraglock);
pag = xfs_perag_get(args.mp, agno);
pag->pagi_freecount += newlen;
xfs_perag_put(pag);
up_read(&args.mp->m_peraglock);
agi->agi_newino = cpu_to_be32(newino);

/*
Expand Down Expand Up @@ -489,7 +487,6 @@ xfs_ialloc_ag_select(
*/
agno = pagno;
flags = XFS_ALLOC_FLAG_TRYLOCK;
down_read(&mp->m_peraglock);
for (;;) {
pag = xfs_perag_get(mp, agno);
if (!pag->pagi_init) {
Expand Down Expand Up @@ -531,7 +528,6 @@ xfs_ialloc_ag_select(
goto nextag;
}
xfs_perag_put(pag);
up_read(&mp->m_peraglock);
return agbp;
}
}
Expand All @@ -544,18 +540,14 @@ xfs_ialloc_ag_select(
* No point in iterating over the rest, if we're shutting
* down.
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
up_read(&mp->m_peraglock);
if (XFS_FORCED_SHUTDOWN(mp))
return NULL;
}
agno++;
if (agno >= agcount)
agno = 0;
if (agno == pagno) {
if (flags == 0) {
up_read(&mp->m_peraglock);
if (flags == 0)
return NULL;
}
flags = 0;
}
}
Expand Down Expand Up @@ -777,16 +769,13 @@ xfs_dialloc(
*inop = NULLFSINO;
return noroom ? ENOSPC : 0;
}
down_read(&mp->m_peraglock);
pag = xfs_perag_get(mp, tagno);
if (pag->pagi_inodeok == 0) {
xfs_perag_put(pag);
up_read(&mp->m_peraglock);
goto nextag;
}
error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
xfs_perag_put(pag);
up_read(&mp->m_peraglock);
if (error)
goto nextag;
agi = XFS_BUF_TO_AGI(agbp);
Expand Down Expand Up @@ -1015,9 +1004,7 @@ xfs_dialloc(
goto error0;
be32_add_cpu(&agi->agi_freecount, -1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
down_read(&mp->m_peraglock);
pag->pagi_freecount--;
up_read(&mp->m_peraglock);

error = xfs_check_agi_freecount(cur, agi);
if (error)
Expand Down Expand Up @@ -1100,9 +1087,7 @@ xfs_difree(
/*
* Get the allocation group header.
*/
down_read(&mp->m_peraglock);
error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
up_read(&mp->m_peraglock);
if (error) {
cmn_err(CE_WARN,
"xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
Expand Down Expand Up @@ -1169,11 +1154,9 @@ xfs_difree(
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
down_read(&mp->m_peraglock);
pag = xfs_perag_get(mp, agno);
pag->pagi_freecount -= ilen - 1;
xfs_perag_put(pag);
up_read(&mp->m_peraglock);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));

Expand Down Expand Up @@ -1202,11 +1185,9 @@ xfs_difree(
*/
be32_add_cpu(&agi->agi_freecount, 1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
down_read(&mp->m_peraglock);
pag = xfs_perag_get(mp, agno);
pag->pagi_freecount++;
xfs_perag_put(pag);
up_read(&mp->m_peraglock);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}

Expand Down Expand Up @@ -1328,9 +1309,7 @@ xfs_imap(
xfs_buf_t *agbp; /* agi buffer */
int i; /* temp state */

down_read(&mp->m_peraglock);
error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
up_read(&mp->m_peraglock);
if (error) {
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
"xfs_ialloc_read_agi() returned "
Expand Down
4 changes: 0 additions & 4 deletions fs/xfs/xfs_itable.c
Original file line number Diff line number Diff line change
Expand Up @@ -420,9 +420,7 @@ xfs_bulkstat(
while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
cond_resched();
bp = NULL;
down_read(&mp->m_peraglock);
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
up_read(&mp->m_peraglock);
if (error) {
/*
* Skip this allocation group and go to the next one.
Expand Down Expand Up @@ -849,9 +847,7 @@ xfs_inumbers(
agbp = NULL;
while (left > 0 && agno < mp->m_sb.sb_agcount) {
if (agbp == NULL) {
down_read(&mp->m_peraglock);
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
up_read(&mp->m_peraglock);
if (error) {
/*
* If we can't read the AGI of this ag,
Expand Down
Loading

0 comments on commit 1c1c6eb

Please sign in to comment.