From 801cc4e17a34c32e1527827292fac9cec5d3393b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 15 Mar 2016 11:42:44 +1100 Subject: [PATCH 1/6] xfs: debug mode forced buffered write failure Add a DEBUG mode-only sysfs knob to enable forced buffered write failure. An additional side effect of this mode is brute force killing of delayed allocation blocks in the range of the write. The latter is the prime motiviation behind this patch, as userspace test infrastructure requires a reliable mechanism to create and split delalloc extents without causing extent conversion. Certain fallocate operations (i.e., zero range) were used for this in the past, but the implementations have changed such that delalloc extents are flushed and converted to real blocks, rendering the test useless. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 9 +++++- fs/xfs/xfs_mount.h | 25 +++++++++++++++ fs/xfs/xfs_sysfs.c | 78 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 101 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb0514a..eed0bfc4443c29 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1752,6 +1752,7 @@ xfs_vm_write_failed( loff_t from = pos & (PAGE_CACHE_SIZE - 1); loff_t to = from + len; struct buffer_head *bh, *head; + struct xfs_mount *mp = XFS_I(inode)->i_mount; /* * The request pos offset might be 32 or 64 bit, this is all fine @@ -1786,7 +1787,8 @@ xfs_vm_write_failed( if (!buffer_delay(bh)) continue; - if (!buffer_new(bh) && block_offset < i_size_read(inode)) + if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && + block_offset < i_size_read(inode)) continue; xfs_vm_kill_delalloc_range(inode, block_offset, @@ -1824,6 +1826,7 @@ xfs_vm_write_begin( pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; int status; + struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; ASSERT(len <= PAGE_CACHE_SIZE); @@ -1832,6 +1835,8 @@ xfs_vm_write_begin( return -ENOMEM; status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (xfs_mp_fail_writes(mp)) + status = -EIO; if (unlikely(status)) { struct inode *inode = mapping->host; size_t isize = i_size_read(inode); @@ -1844,6 +1849,8 @@ xfs_vm_write_begin( * allocated in this write, not blocks that were previously * written successfully. */ + if (xfs_mp_fail_writes(mp)) + isize = 0; if (pos + len > isize) { ssize_t start = max_t(ssize_t, pos, isize); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b57098481c10a2..546cf46a97ce60 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -147,6 +147,17 @@ typedef struct xfs_mount { * to various other kinds of pain inflicted on the pNFS server. */ __uint32_t m_generation; + +#ifdef DEBUG + /* + * DEBUG mode instrumentation to test and/or trigger delayed allocation + * block killing in the event of failed writes. When enabled, all + * buffered writes are forced to fail. All delalloc blocks in the range + * of the write (including pre-existing delalloc blocks!) are tossed as + * part of the write failure error handling sequence. + */ + bool m_fail_writes; +#endif } xfs_mount_t; /* @@ -264,6 +275,20 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } +#ifdef DEBUG +static inline bool +xfs_mp_fail_writes(struct xfs_mount *mp) +{ + return mp->m_fail_writes; +} +#else +static inline bool +xfs_mp_fail_writes(struct xfs_mount *mp) +{ + return 0; +} +#endif + /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 641d625eb334c3..6ced4f1434948d 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -18,10 +18,13 @@ #include "xfs.h" #include "xfs_sysfs.h" +#include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_stats.h" +#include "xfs_mount.h" struct xfs_sysfs_attr { struct attribute attr; @@ -45,16 +48,6 @@ to_attr(struct attribute *attr) #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr -/* - * xfs_mount kobject. This currently has no attributes and thus no need for show - * and store helpers. The mp kobject serves as the per-mount parent object that - * is identified by the fsname under sysfs. - */ - -struct kobj_type xfs_mp_ktype = { - .release = xfs_sysfs_release, -}; - STATIC ssize_t xfs_sysfs_object_show( struct kobject *kobject, @@ -83,6 +76,71 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; +/* + * xfs_mount kobject. The mp kobject also serves as the per-mount parent object + * that is identified by the fsname under sysfs. + */ + +static inline struct xfs_mount * +to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + + return container_of(kobj, struct xfs_mount, m_kobj); +} + +#ifdef DEBUG + +STATIC ssize_t +fail_writes_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = to_mp(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val == 1) + mp->m_fail_writes = true; + else if (val == 0) + mp->m_fail_writes = false; + else + return -EINVAL; + + return count; +} + +STATIC ssize_t +fail_writes_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_mount *mp = to_mp(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0); +} +XFS_SYSFS_ATTR_RW(fail_writes); + +#endif /* DEBUG */ + +static struct attribute *xfs_mp_attrs[] = { +#ifdef DEBUG + ATTR_LIST(fail_writes), +#endif + NULL, +}; + +struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_mp_attrs, +}; + #ifdef DEBUG /* debug */ From b2706a05bad36c0a826493c6ba84c8a9caf8a3ae Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 15 Mar 2016 11:42:46 +1100 Subject: [PATCH 2/6] xfs: update freeblocks counter after extent deletion xfs_bunmapi() currently updates the fdblocks counter, unreserves quota, etc. before the extent is deleted by xfs_bmap_del_extent(). The function has problems dividing up the indirect reserved blocks for scenarios where a single delalloc extent is split in two. Particularly, there aren't always enough blocks reserved for multiple extents in a single extent reservation. The solution to this problem is to allow the extent removal code to steal from the deleted extent to meet indirect reservation requirements. Move the block of code in xfs_bmapi() that updates the fdblocks counter to after the call to xfs_bmap_del_extent() to allow the codepath to update the extent record before the free blocks are accounted. Also, reshuffle the code slightly so the delalloc accounting occurs near the xfs_bmap_del_extent() call to provide context for the comments. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 58 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ef00156f4f9616..b48abc300c36e7 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5296,31 +5296,7 @@ xfs_bunmapi( goto nodelete; } } - if (wasdel) { - ASSERT(startblockval(del.br_startblock) > 0); - /* Update realtime/data freespace, unreserve quota */ - if (isrt) { - xfs_filblks_t rtexts; - rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); - do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_frextents(mp, (int64_t)rtexts); - (void)xfs_trans_reserve_quota_nblks(NULL, - ip, -((long)del.br_blockcount), 0, - XFS_QMOPT_RES_RTBLKS); - } else { - xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, - false); - (void)xfs_trans_reserve_quota_nblks(NULL, - ip, -((long)del.br_blockcount), 0, - XFS_QMOPT_RES_REGBLKS); - } - ip->i_delayed_blks -= del.br_blockcount; - if (cur) - cur->bc_private.b.flags |= - XFS_BTCUR_BPRV_WASDEL; - } else if (cur) - cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; /* * If it's the case where the directory code is running * with no block reservation, and the deleted block is in @@ -5342,11 +5318,45 @@ xfs_bunmapi( error = -ENOSPC; goto error0; } + + /* + * Unreserve quota and update realtime free space, if + * appropriate. If delayed allocation, update the inode delalloc + * counter now and wait to update the sb counters as + * xfs_bmap_del_extent() might need to borrow some blocks. + */ + if (wasdel) { + ASSERT(startblockval(del.br_startblock) > 0); + if (isrt) { + xfs_filblks_t rtexts; + + rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_frextents(mp, (int64_t)rtexts); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_RTBLKS); + } else { + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_REGBLKS); + } + ip->i_delayed_blks -= del.br_blockcount; + if (cur) + cur->bc_private.b.flags |= + XFS_BTCUR_BPRV_WASDEL; + } else if (cur) + cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) goto error0; + + if (!isrt && wasdel) + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); + bno = del.br_startoff - 1; nodelete: /* From a9bd24ac2becf69e896d88bf8b1b7b0f18c2157b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 15 Mar 2016 11:42:46 +1100 Subject: [PATCH 3/6] xfs: refactor delalloc indlen reservation split into helper The delayed allocation indirect reservation splitting code is not sufficient in some cases where a delalloc extent is split in two. In preparation for enhancements to this code, refactor the current indlen distribution algorithm into a new helper function. [dchinner: rename temp, temp2 variables] Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 73 +++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b48abc300c36e7..6de613b558a96e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4720,6 +4720,47 @@ xfs_bmapi_write( return error; } +/* + * When a delalloc extent is split (e.g., due to a hole punch), the original + * indlen reservation must be shared across the two new extents that are left + * behind. + * + * Given the original reservation and the worst case indlen for the two new + * extents (as calculated by xfs_bmap_worst_indlen()), split the original + * reservation fairly across the two new extents. + */ +static void +xfs_bmap_split_indlen( + xfs_filblks_t ores, /* original res. */ + xfs_filblks_t *indlen1, /* ext1 worst indlen */ + xfs_filblks_t *indlen2) /* ext2 worst indlen */ +{ + xfs_filblks_t len1 = *indlen1; + xfs_filblks_t len2 = *indlen2; + xfs_filblks_t nres = len1 + len2; /* new total res. */ + + /* + * The only blocks available are those reserved for the original extent. + * Therefore, we have to skim blocks off each of the new reservations so + * long as the new total reservation is greater than the original. + */ + while (nres > ores) { + if (len1) { + len1--; + nres--; + } + if (nres == ores) + break; + if (len2) { + len2--; + nres--; + } + } + + *indlen1 = len1; + *indlen2 = len2; +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). @@ -4985,27 +5026,21 @@ xfs_bmap_del_extent( XFS_IFORK_NEXTENTS(ip, whichfork) + 1); } else { ASSERT(whichfork == XFS_DATA_FORK); - temp = xfs_bmap_worst_indlen(ip, temp); + + /* + * Distribute the original indlen reservation across the + * two new extents. + */ + temp = xfs_bmap_worst_indlen(ip, got.br_blockcount); + temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount); + xfs_bmap_split_indlen(da_old, &temp, &temp2); + da_new = temp + temp2; + + /* + * Set the reservation for each extent. + */ xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); new.br_startblock = nullstartblock((int)temp2); - da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } } trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); xfs_iext_insert(ip, *idx + 1, 1, &new, state); From d34999c97ae87cd56514b8cbc6269651efe274fe Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 15 Mar 2016 11:42:47 +1100 Subject: [PATCH 4/6] xfs: borrow indirect blocks from freed extent when available xfs_bmap_del_extent() handles extent removal from the in-core and on-disk extent lists. When removing a delalloc range, it updates the indirect block reservation appropriately based on the removal. It currently enforces that the new indirect block reservation is less than or equal to the original. This is normally the case in all situations except for in certain cases when the removed range creates a hole in a single delalloc extent, thus splitting a single delalloc extent in two. It is possible with small enough extents to split an indlen==1 extent into two such slightly smaller extents. This leaves one extent with 0 indirect blocks and leads to assert failures in other areas (e.g., xfs_bunmapi() if the extent happens to be removed). Update the indlen distribution code to steal blocks from the deleted extent, if necessary, to satisfy the worst case total indirect reservation for the new extents. This is safe as the caller does not update the fdblocks counters until the extent is removed. Blocks stolen in this manner simply remain accounted as allocated, having ownership transferred from the data extent to an indirect reservation. As a precaution, fall back to the original reservation algorithm if the new indlen requirement is not met and warn if we end up with extents without any reservation at all to detect this more easily in the future. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 46 +++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6de613b558a96e..2a43d5c7fde576 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4727,22 +4727,39 @@ xfs_bmapi_write( * * Given the original reservation and the worst case indlen for the two new * extents (as calculated by xfs_bmap_worst_indlen()), split the original - * reservation fairly across the two new extents. + * reservation fairly across the two new extents. If necessary, steal available + * blocks from a deleted extent to make up a reservation deficiency (e.g., if + * ores == 1). The number of stolen blocks is returned. The availability and + * subsequent accounting of stolen blocks is the responsibility of the caller. */ -static void +static xfs_filblks_t xfs_bmap_split_indlen( xfs_filblks_t ores, /* original res. */ xfs_filblks_t *indlen1, /* ext1 worst indlen */ - xfs_filblks_t *indlen2) /* ext2 worst indlen */ + xfs_filblks_t *indlen2, /* ext2 worst indlen */ + xfs_filblks_t avail) /* stealable blocks */ { xfs_filblks_t len1 = *indlen1; xfs_filblks_t len2 = *indlen2; xfs_filblks_t nres = len1 + len2; /* new total res. */ + xfs_filblks_t stolen = 0; + + /* + * Steal as many blocks as we can to try and satisfy the worst case + * indlen for both new extents. + */ + while (nres > ores && avail) { + nres--; + avail--; + stolen++; + } /* - * The only blocks available are those reserved for the original extent. - * Therefore, we have to skim blocks off each of the new reservations so - * long as the new total reservation is greater than the original. + * The only blocks available are those reserved for the original + * extent and what we can steal from the extent being removed. + * If this still isn't enough to satisfy the combined + * requirements for the two new extents, skim blocks off of each + * of the new reservations until they match what is available. */ while (nres > ores) { if (len1) { @@ -4759,6 +4776,8 @@ xfs_bmap_split_indlen( *indlen1 = len1; *indlen2 = len2; + + return stolen; } /* @@ -5025,20 +5044,27 @@ xfs_bmap_del_extent( XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); } else { + xfs_filblks_t stolen; ASSERT(whichfork == XFS_DATA_FORK); /* * Distribute the original indlen reservation across the - * two new extents. + * two new extents. Steal blocks from the deleted extent + * if necessary. Stealing blocks simply fudges the + * fdblocks accounting in xfs_bunmapi(). */ temp = xfs_bmap_worst_indlen(ip, got.br_blockcount); temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount); - xfs_bmap_split_indlen(da_old, &temp, &temp2); - da_new = temp + temp2; + stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2, + del->br_blockcount); + da_new = temp + temp2 - stolen; + del->br_blockcount -= stolen; /* - * Set the reservation for each extent. + * Set the reservation for each extent. Warn if either + * is zero as this can lead to delalloc problems. */ + WARN_ON_ONCE(!temp || !temp2); xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); new.br_startblock = nullstartblock((int)temp2); } From cc07eed8336d6452214d13e0cba770a0f5296a7f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 15 Mar 2016 11:42:47 +1100 Subject: [PATCH 5/6] xfs: ensure committed is initialized in xfs_trans_roll __xfs_trans_roll() can return without setting the *committed argument; this was a problem for xfs_bmap_finish(): int committed;/* xact committed or not */ ... error = __xfs_trans_roll(tp, ip, &committed); if (error) { ... if (committed) { and we tested an uninitialized "committed" variable on the error path. No caller is preserving "committed" state across calls to __xfs_trans_roll(), so just initialize committed inside the function to avoid future errors like this. Reported-by: Dan Carpenter Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_trans.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 748b16aff45a1c..20c53666cb4b32 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1028,6 +1028,8 @@ __xfs_trans_roll( struct xfs_trans_res tres; int error; + *committed = 0; + /* * Ensure that the inode is always logged. */ @@ -1082,6 +1084,6 @@ xfs_trans_roll( struct xfs_trans **tpp, struct xfs_inode *dp) { - int committed = 0; + int committed; return __xfs_trans_roll(tpp, dp, &committed); } From 355cced45286ed7e710058174066628ff9ad9fa4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2016 11:44:18 +1100 Subject: [PATCH 6/6] xfs: always set rvalp in xfs_dir2_node_trim_free xfs_dir2_node_trim_free can return with setting the rvalp argument pointer. Initialize it to 0 at the beginning of the function and only update it to 1 if we succeeded trimming a freespace block. Reported-by: Dan Carpenter Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 63ee03db796ca9..75a557432d0f87 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -2235,6 +2235,9 @@ xfs_dir2_node_trim_free( dp = args->dp; tp = args->trans; + + *rvalp = 0; + /* * Read the freespace block. */ @@ -2255,7 +2258,6 @@ xfs_dir2_node_trim_free( */ if (freehdr.nused > 0) { xfs_trans_brelse(tp, bp); - *rvalp = 0; return 0; } /*