diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 86de8a1ad91c23..6178153d33200e 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -495,3 +495,45 @@ the class and error context. For example, the default values for "metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults to "fail immediately" behaviour. This is done because ENODEV is a fatal, unrecoverable error no matter how many times the metadata IO is retried. + +Workqueue Concurrency +===================== + +XFS uses kernel workqueues to parallelize metadata update processes. This +enables it to take advantage of storage hardware that can service many IO +operations simultaneously. This interface exposes internal implementation +details of XFS, and as such is explicitly not part of any userspace API/ABI +guarantee the kernel may give userspace. These are undocumented features of +the generic workqueue implementation XFS uses for concurrency, and they are +provided here purely for diagnostic and tuning purposes and may change at any +time in the future. + +The control knobs for a filesystem's workqueues are organized by task at hand +and the short name of the data device. They all can be found in: + + /sys/bus/workqueue/devices/${task}!${device} + +================ =========== + Task Description +================ =========== + xfs_iwalk-$pid Inode scans of the entire filesystem. Currently limited to + mount time quotacheck. + xfs-blockgc Background garbage collection of disk space that have been + speculatively allocated beyond EOF or for staging copy on + write operations. +================ =========== + +For example, the knobs for the quotacheck workqueue for /dev/nvme0n1 would be +found in /sys/bus/workqueue/devices/xfs_iwalk-1111!nvme0n1/. + +The interesting knobs for XFS workqueues are as follows: + +============ =========== + Knob Description +============ =========== + max_active Maximum number of background threads that can be started to + run the work. + cpumask CPUs upon which the threads are allowed to run. + nice Relative priority of scheduling the threads. These are the + same nice levels that can be applied to userspace processes. +============ =========== diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7cb9f064ac6452..0c623d3c1036df 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2474,6 +2474,47 @@ xfs_defer_agfl_block( xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); } +#ifdef DEBUG +/* + * Check if an AGF has a free extent record whose length is equal to + * args->minlen. + */ +STATIC int +xfs_exact_minlen_extent_available( + struct xfs_alloc_arg *args, + struct xfs_buf *agbp, + int *stat) +{ + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error = 0; + + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, + args->agno, XFS_BTNUM_CNT); + error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); + if (error) + goto out; + + if (*stat == 0) { + error = -EFSCORRUPTED; + goto out; + } + + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); + if (error) + goto out; + + if (*stat == 1 && flen != args->minlen) + *stat = 0; + +out: + xfs_btree_del_cursor(cnt_cur, error); + + return error; +} +#endif + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. @@ -2545,6 +2586,15 @@ xfs_alloc_fix_freelist( if (!xfs_alloc_space_available(args, need, flags)) goto out_agbp_relse; +#ifdef DEBUG + if (args->alloc_minlen_only) { + int stat; + + error = xfs_exact_minlen_extent_available(args, agbp, &stat); + if (error || !stat) + goto out_agbp_relse; + } +#endif /* * Make the freelist shorter if it's too long. * diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6c22b12176b8b6..a4427c5775c252 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -75,6 +75,9 @@ typedef struct xfs_alloc_arg { char wasfromfl; /* set if allocation is from freelist */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ enum xfs_ag_resv_type resv; /* block reservation to use */ +#ifdef DEBUG + bool alloc_minlen_only; /* allocate exact minlen extent */ +#endif } xfs_alloc_arg_t; /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fd8e6418a0d312..472b3039eabbf1 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -396,6 +396,7 @@ xfs_attr_set( struct xfs_trans_res tres; bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); int error, local; + int rmt_blks = 0; unsigned int total; if (XFS_FORCED_SHUTDOWN(dp->i_mount)) @@ -442,34 +443,33 @@ xfs_attr_set( tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; total = args->total; + + if (!local) + rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); } else { XFS_STATS_INC(mp, xs_attr_remove); tres = M_RES(mp)->tr_attrrm; total = XFS_ATTRRM_SPACE_RES(mp); + rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc(mp, &tres, total, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &args->trans); + error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; - xfs_ilock(dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(args->trans, dp, 0); - if (args->value) { - unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS; - - if (rsvd) - quota_flags |= XFS_QMOPT_FORCE_RES; - error = xfs_trans_reserve_quota_nblks(args->trans, dp, - args->total, 0, quota_flags); + if (args->value || xfs_inode_hasattr(dp)) { + error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, + XFS_IEXT_ATTR_MANIP_CNT(rmt_blks)); if (error) goto out_trans_cancel; + } + if (args->value) { error = xfs_has_attr(args); if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) goto out_trans_cancel; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index bc446418e2276f..e0905ad171f0a5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1079,21 +1079,13 @@ xfs_bmap_add_attrfork( blks = XFS_ADDAFORK_SPACE_RES(mp); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0, + rsvd, &tp); if (error) return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) - goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; - xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) @@ -3463,34 +3455,16 @@ xfs_bmap_btalloc_accounting( args->len); } -STATIC int -xfs_bmap_btalloc( - struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +static int +xfs_bmap_compute_alignments( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args) { - xfs_mount_t *mp; /* mount point structure */ - xfs_alloctype_t atype = 0; /* type for allocation routines */ - xfs_extlen_t align = 0; /* minimum allocation alignment */ - xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t ag; - xfs_alloc_arg_t args; - xfs_fileoff_t orig_offset; - xfs_extlen_t orig_length; - xfs_extlen_t blen; - xfs_extlen_t nextminlen = 0; - int nullfb; /* true if ap->firstblock isn't set */ - int isaligned; - int tryagain; - int error; - int stripe_align; - - ASSERT(ap->length); - orig_offset = ap->offset; - orig_length = ap->length; - - mp = ap->ip->i_mount; + struct xfs_mount *mp = args->mp; + xfs_extlen_t align = 0; /* minimum allocation alignment */ + int stripe_align = 0; /* stripe alignment for allocation is determined by mount parameters */ - stripe_align = 0; if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) stripe_align = mp->m_swidth; else if (mp->m_dalign) @@ -3501,13 +3475,171 @@ xfs_bmap_btalloc( else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); if (align) { - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 0, ap->eof, 0, ap->conv, - &ap->offset, &ap->length); - ASSERT(!error); + if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, + ap->eof, 0, ap->conv, &ap->offset, + &ap->length)) + ASSERT(0); ASSERT(ap->length); } + /* apply extent size hints if obtained earlier */ + if (align) { + args->prod = align; + div_u64_rem(ap->offset, args->prod, &args->mod); + if (args->mod) + args->mod = args->prod - args->mod; + } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { + args->prod = 1; + args->mod = 0; + } else { + args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; + div_u64_rem(ap->offset, args->prod, &args->mod); + if (args->mod) + args->mod = args->prod - args->mod; + } + + return stripe_align; +} + +static void +xfs_bmap_process_allocated_extent( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_fileoff_t orig_offset, + xfs_extlen_t orig_length) +{ + int nullfb; + + nullfb = ap->tp->t_firstblock == NULLFSBLOCK; + + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(nullfb || + XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <= + XFS_FSB_TO_AGNO(args->mp, args->fsbno)); + + ap->blkno = args->fsbno; + if (nullfb) + ap->tp->t_firstblock = args->fsbno; + ap->length = args->len; + /* + * If the extent size hint is active, we tried to round the + * caller's allocation request offset down to extsz and the + * length up to another extsz boundary. If we found a free + * extent we mapped it in starting at this new offset. If the + * newly mapped space isn't long enough to cover any of the + * range of offsets that was originally requested, move the + * mapping up so that we can fill as much of the caller's + * original request as possible. Free space is apparently + * very fragmented so we're unlikely to be able to satisfy the + * hints anyway. + */ + if (ap->length <= orig_length) + ap->offset = orig_offset; + else if (ap->offset + ap->length < orig_offset + orig_length) + ap->offset = orig_offset + orig_length - ap->length; + xfs_bmap_btalloc_accounting(ap, args); +} + +#ifdef DEBUG +static int +xfs_bmap_exact_minlen_extent_alloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; + int error; + + ASSERT(ap->length); + + if (ap->minlen != 1) { + ap->blkno = NULLFSBLOCK; + ap->length = 0; + return 0; + } + + orig_offset = ap->offset; + orig_length = ap->length; + + args.alloc_minlen_only = 1; + + xfs_bmap_compute_alignments(ap, &args); + + if (ap->tp->t_firstblock == NULLFSBLOCK) { + /* + * Unlike the longest extent available in an AG, we don't track + * the length of an AG's shortest extent. + * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and + * hence we can afford to start traversing from the 0th AG since + * we need not be concerned about a drop in performance in + * "debug only" code paths. + */ + ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); + } else { + ap->blkno = ap->tp->t_firstblock; + } + + args.fsbno = ap->blkno; + args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = args.minlen = args.maxlen = ap->minlen; + + args.alignment = 1; + args.minalignslop = 0; + + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.resv = XFS_AG_RESV_NONE; + args.datatype = ap->datatype; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + if (args.fsbno != NULLFSBLOCK) { + xfs_bmap_process_allocated_extent(ap, &args, orig_offset, + orig_length); + } else { + ap->blkno = NULLFSBLOCK; + ap->length = 0; + } + + return 0; +} +#else + +#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED) + +#endif + +STATIC int +xfs_bmap_btalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; + xfs_alloctype_t atype = 0; + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_agnumber_t ag; + xfs_fileoff_t orig_offset; + xfs_extlen_t orig_length; + xfs_extlen_t blen; + xfs_extlen_t nextminlen = 0; + int nullfb; /* true if ap->firstblock isn't set */ + int isaligned; + int tryagain; + int error; + int stripe_align; + + ASSERT(ap->length); + orig_offset = ap->offset; + orig_length = ap->length; + + stripe_align = xfs_bmap_compute_alignments(ap, &args); nullfb = ap->tp->t_firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, @@ -3538,9 +3670,6 @@ xfs_bmap_btalloc( * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; - memset(&args, 0, sizeof(args)); - args.tp = ap->tp; - args.mp = mp; args.fsbno = ap->blkno; args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; @@ -3571,21 +3700,7 @@ xfs_bmap_btalloc( args.total = ap->total; args.minlen = ap->minlen; } - /* apply extent size hints if obtained earlier */ - if (align) { - args.prod = align; - div_u64_rem(ap->offset, args.prod, &args.mod); - if (args.mod) - args.mod = args.prod - args.mod; - } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) { - args.prod = 1; - args.mod = 0; - } else { - args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog; - div_u64_rem(ap->offset, args.prod, &args.mod); - if (args.mod) - args.mod = args.prod - args.mod; - } + /* * If we are not low on available data blocks, and the underlying * logical volume manager is a stripe, and the file offset is zero then @@ -3687,37 +3802,10 @@ xfs_bmap_btalloc( return error; ap->tp->t_flags |= XFS_TRANS_LOWMODE; } + if (args.fsbno != NULLFSBLOCK) { - /* - * check the allocation happened at the same or higher AG than - * the first block that was allocated. - */ - ASSERT(ap->tp->t_firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock) <= - XFS_FSB_TO_AGNO(mp, args.fsbno)); - - ap->blkno = args.fsbno; - if (ap->tp->t_firstblock == NULLFSBLOCK) - ap->tp->t_firstblock = args.fsbno; - ASSERT(nullfb || fb_agno <= args.agno); - ap->length = args.len; - /* - * If the extent size hint is active, we tried to round the - * caller's allocation request offset down to extsz and the - * length up to another extsz boundary. If we found a free - * extent we mapped it in starting at this new offset. If the - * newly mapped space isn't long enough to cover any of the - * range of offsets that was originally requested, move the - * mapping up so that we can fill as much of the caller's - * original request as possible. Free space is apparently - * very fragmented so we're unlikely to be able to satisfy the - * hints anyway. - */ - if (ap->length <= orig_length) - ap->offset = orig_offset; - else if (ap->offset + ap->length < orig_offset + orig_length) - ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap, &args); + xfs_bmap_process_allocated_extent(ap, &args, orig_offset, + orig_length); } else { ap->blkno = NULLFSBLOCK; ap->length = 0; @@ -4001,8 +4089,7 @@ xfs_bmapi_reserve_delalloc( * blocks. This number gets adjusted later. We return if we haven't * allocated blocks already inside this loop. */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_quota_reserve_blkres(ip, alen); if (error) return error; @@ -4048,8 +4135,7 @@ xfs_bmapi_reserve_delalloc( xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) - xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, - XFS_QMOPT_RES_REGBLKS); + xfs_quota_unreserve_blkres(ip, alen); return error; } @@ -4083,6 +4169,10 @@ xfs_bmap_alloc_userdata( return xfs_bmap_rtalloc(bma); } + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + return xfs_bmap_exact_minlen_extent_alloc(bma); + return xfs_bmap_btalloc(bma); } @@ -4119,10 +4209,15 @@ xfs_bmapi_allocate( else bma->minlen = 1; - if (bma->flags & XFS_BMAPI_METADATA) - error = xfs_bmap_btalloc(bma); - else + if (bma->flags & XFS_BMAPI_METADATA) { + if (unlikely(XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + error = xfs_bmap_exact_minlen_extent_alloc(bma); + else + error = xfs_bmap_btalloc(bma); + } else { error = xfs_bmap_alloc_userdata(bma); + } if (error || bma->blkno == NULLFSBLOCK) return error; @@ -4527,6 +4622,12 @@ xfs_bmapi_convert_delalloc( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); + + error = xfs_iext_count_may_overflow(ip, whichfork, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_trans_cancel; + xfs_trans_ijoin(tp, ip, 0); if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || @@ -4826,9 +4927,8 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, - -((long)del->br_blockcount), 0, - isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ASSERT(!isrt); + error = xfs_quota_unreserve_blkres(ip, del->br_blockcount); if (error) return error; ip->i_delayed_blks -= del->br_blockcount; @@ -5145,6 +5245,27 @@ xfs_bmap_del_extent_real( /* * Deleting the middle of the extent. */ + + /* + * For directories, -ENOSPC is returned since a directory entry + * remove operation must not fail due to low extent count + * availability. -ENOSPC will be handled by higher layers of XFS + * by letting the corresponding empty Data/Free blocks to linger + * until a future remove operation. Dabtree blocks would be + * swapped with the last block in the leaf space and then the + * new last block will be unmapped. + * + * The above logic also applies to the source directory entry of + * a rename operation. + */ + error = xfs_iext_count_may_overflow(ip, whichfork, 1); + if (error) { + ASSERT(S_ISDIR(VFS_I(ip)->i_mode) && + whichfork == XFS_DATA_FORK); + error = -ENOSPC; + goto done; + } + old = got; got.br_blockcount = del->br_startoff - got.br_startoff; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c4d7a9241dc324..b56ff451adcee8 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -353,20 +353,17 @@ xfs_btree_free_block( */ void xfs_btree_del_cursor( - xfs_btree_cur_t *cur, /* btree cursor */ - int error) /* del because of error */ + struct xfs_btree_cur *cur, /* btree cursor */ + int error) /* del because of error */ { - int i; /* btree level */ + int i; /* btree level */ /* - * Clear the buffer pointers, and release the buffers. - * If we're doing this in the face of an error, we - * need to make sure to inspect all of the entries - * in the bc_bufs array for buffers to be unlocked. - * This is because some of the btree code works from - * level n down to 0, and if we get an error along - * the way we won't have initialized all the entries - * down to 0. + * Clear the buffer pointers and release the buffers. If we're doing + * this because of an error, inspect all of the entries in the bc_bufs + * array for buffers to be unlocked. This is because some of the btree + * code works from level n down to 0, and if we get an error along the + * way we won't have initialized all the entries down to 0. */ for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_bufs[i]) @@ -374,17 +371,11 @@ xfs_btree_del_cursor( else if (!error) break; } - /* - * Can't free a bmap cursor without having dealt with the - * allocated indirect blocks' accounting. - */ - ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_ino.allocated == 0); - /* - * Free the cursor. - */ + + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || + XFS_FORCED_SHUTDOWN(cur->bc_mp)); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) - kmem_free((void *)cur->bc_ops); + kmem_free(cur->bc_ops); kmem_cache_free(xfs_btree_cur_zone, cur); } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index e55378640b0561..d03e6098ded9f9 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -47,8 +47,6 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); -extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp, - xfs_ino_t inum); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 2463b5d7344724..8c4f76bba88be1 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -1018,7 +1018,7 @@ xfs_dir2_sf_removename( /* * Check whether the sf dir replace operation need more blocks. */ -bool +static bool xfs_dir2_sf_replace_needblock( struct xfs_inode *dp, xfs_ino_t inum) diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 53b305dea38156..6ca9084b6934ef 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -56,7 +56,9 @@ #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 #define XFS_ERRTAG_IUNLINK_FALLBACK 34 #define XFS_ERRTAG_BUF_IOERROR 35 -#define XFS_ERRTAG_MAX 36 +#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36 +#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37 +#define XFS_ERRTAG_MAX 38 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -97,5 +99,7 @@ #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 #define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT +#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 +#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 2a2e3cfd94f0cc..6fad140d4c8e57 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -250,6 +250,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ #define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ #define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */ +#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 7575de5cecb1f6..e080d7e0764351 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -23,6 +23,8 @@ #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" #include "xfs_attr_leaf.h" +#include "xfs_types.h" +#include "xfs_errortag.h" kmem_zone_t *xfs_ifork_zone; @@ -728,3 +730,28 @@ xfs_ifork_verify_local_attr( return 0; } + +int +xfs_iext_count_may_overflow( + struct xfs_inode *ip, + int whichfork, + int nr_to_add) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + uint64_t max_exts; + uint64_t nr_exts; + + if (whichfork == XFS_COW_FORK) + return 0; + + max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM; + + if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS)) + max_exts = 10; + + nr_exts = ifp->if_nextents + nr_to_add; + if (nr_exts < ifp->if_nextents || nr_exts > max_exts) + return -EFBIG; + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index a4953e95c4f3f6..9e2137cd73724b 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -34,6 +34,67 @@ struct xfs_ifork { #define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ #define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +/* + * Worst-case increase in the fork extent count when we're adding a single + * extent to a fork and there's no possibility of splitting an existing mapping. + */ +#define XFS_IEXT_ADD_NOSPLIT_CNT (1) + +/* + * Punching out an extent from the middle of an existing extent can cause the + * extent count to increase by 1. + * i.e. | Old extent | Hole | Old extent | + */ +#define XFS_IEXT_PUNCH_HOLE_CNT (1) + +/* + * Directory entry addition can cause the following, + * 1. Data block can be added/removed. + * A new extent can cause extent count to increase by 1. + * 2. Free disk block can be added/removed. + * Same behaviour as described above for Data block. + * 3. Dabtree blocks. + * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new + * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH. + */ +#define XFS_IEXT_DIR_MANIP_CNT(mp) \ + ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount) + +/* + * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to + * be added. One extra extent for dabtree in case a local attr is + * large enough to cause a double split. It can also cause extent + * count to increase proportional to the size of a remote xattr's + * value. + */ +#define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \ + (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks)) + +/* + * A write to a sub-interval of an existing unwritten extent causes the original + * extent to be split into 3 extents + * i.e. | Unwritten | Real | Unwritten | + * Hence extent count can increase by 2. + */ +#define XFS_IEXT_WRITE_UNWRITTEN_CNT (2) + + +/* + * Moving an extent to data fork can cause a sub-interval of an existing extent + * to be unmapped. This will increase extent count by 1. Mapping in the new + * extent can increase the extent count by 1 again i.e. + * | Old extent | New extent | Old extent | + * Hence number of extents increases by 2. + */ +#define XFS_IEXT_REFLINK_END_COW_CNT (2) + +/* + * Removing an initial range of source/donor file's extent and adding a new + * extent (from donor/source file) in its place will cause extent count to + * increase by 1. + */ +#define XFS_IEXT_SWAP_RMAP_CNT (1) + /* * Fork handling. */ @@ -172,5 +233,7 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip); int xfs_ifork_verify_local_data(struct xfs_inode *ip); int xfs_ifork_verify_local_attr(struct xfs_inode *ip); +int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, + int nr_to_add); #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index bbda117e5d856b..60e6d255e5e2c8 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1138,6 +1138,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; if (xfs_sb_version_hasbigtime(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; + if (xfs_sb_version_hasinobtcounts(sbp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; if (xfs_sb_version_hassector(sbp)) geo->logsectsize = sbp->sb_logsectsize; else diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8ea6d4aa3f55b8..53456f3de881e2 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -888,7 +888,7 @@ xchk_stop_reaping( struct xfs_scrub *sc) { sc->flags |= XCHK_REAPING_DISABLED; - xfs_stop_block_reaping(sc->mp); + xfs_blockgc_stop(sc->mp); } /* Restart background reaping of resources. */ @@ -896,6 +896,6 @@ void xchk_start_reaping( struct xfs_scrub *sc) { - xfs_start_block_reaping(sc->mp); + xfs_blockgc_start(sc->mp); sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 93e4d8ae6e92b6..2344757ede63c0 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -471,6 +471,7 @@ xfs_bui_item_recover( xfs_exntst_t state; unsigned int bui_type; int whichfork; + int iext_delta; int error = 0; if (!xfs_bui_validate(mp, buip)) { @@ -508,6 +509,15 @@ xfs_bui_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + if (bui_type == XFS_BMAP_MAP) + iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; + else + iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; + + error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + if (error) + goto err_cancel; + count = bmap->me_len; error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 7371a7f7c65294..e7d68318e6a55c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -727,11 +727,9 @@ xfs_alloc_file_space( xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; int nimaps; - int quota_flag; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; - uint qblocks, resblks, resrtextents; int error; trace_xfs_alloc_file_space(ip); @@ -761,6 +759,7 @@ xfs_alloc_file_space( */ while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; + unsigned int dblocks, rblocks, resblks; /* * Determine space reservations for data/realtime. @@ -790,45 +789,31 @@ xfs_alloc_file_space( */ resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = qblocks = resblks; - resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resblks; } else { - resrtextents = 0; - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); - quota_flag = XFS_QMOPT_RES_REGBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + rblocks = 0; } /* * Allocate and setup the transaction. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, - resrtextents, 0, &tp); - - /* - * Check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - break; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, - 0, quota_flag); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + dblocks, rblocks, false, &tp); if (error) - goto error1; + break; - xfs_trans_ijoin(tp, ip, 0); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto error; error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, 0, imapp, &nimaps); if (error) - goto error0; + goto error; /* * Complete the transaction @@ -851,10 +836,7 @@ xfs_alloc_file_space( return error; -error0: /* unlock inode, unreserve quota blocks, cancel trans */ - xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ +error: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -872,20 +854,16 @@ xfs_unmap_extent( uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); int error; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - if (error) { - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, + false, &tp); + if (error) return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, - ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_PUNCH_HOLE_CNT); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, 0); - error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done); if (error) goto out_trans_cancel; @@ -1163,6 +1141,11 @@ xfs_insert_file_space( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_PUNCH_HOLE_CNT); + if (error) + goto out_trans_cancel; + /* * The extent shifting code works on extent granularity. So, if stop_fsb * is not the starting block of extent, we need to split the extent at @@ -1384,6 +1367,22 @@ xfs_swap_extent_rmap( irec.br_blockcount); trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); + if (xfs_bmap_is_real_extent(&uirec)) { + error = xfs_iext_count_may_overflow(ip, + XFS_DATA_FORK, + XFS_IEXT_SWAP_RMAP_CNT); + if (error) + goto out; + } + + if (xfs_bmap_is_real_extent(&irec)) { + error = xfs_iext_count_may_overflow(tip, + XFS_DATA_FORK, + XFS_IEXT_SWAP_RMAP_CNT); + if (error) + goto out; + } + /* Remove the mapping from the donor file. */ xfs_bmap_unmap_extent(tp, tip, &uirec); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f8400bbd64730b..f6e5235df7c904 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -43,7 +43,7 @@ static kmem_zone_t *xfs_buf_zone; * pag_buf_lock * lru_lock * - * xfs_buftarg_wait_rele + * xfs_buftarg_drain_rele * lru_lock * b_lock (trylock due to inversion) * @@ -88,7 +88,7 @@ xfs_buf_vmap_len( * because the corresponding decrement is deferred to buffer release. Buffers * can undergo I/O multiple times in a hold-release cycle and per buffer I/O * tracking adds unnecessary overhead. This is used for sychronization purposes - * with unmount (see xfs_wait_buftarg()), so all we really need is a count of + * with unmount (see xfs_buftarg_drain()), so all we really need is a count of * in-flight buffers. * * Buffers that are never released (e.g., superblock, iclog buffers) must set @@ -1786,7 +1786,7 @@ __xfs_buf_mark_corrupt( * while freeing all the buffers only held by the LRU. */ static enum lru_status -xfs_buftarg_wait_rele( +xfs_buftarg_drain_rele( struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, @@ -1798,7 +1798,7 @@ xfs_buftarg_wait_rele( if (atomic_read(&bp->b_hold) > 1) { /* need to wait, so skip it this pass */ - trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + trace_xfs_buf_drain_buftarg(bp, _RET_IP_); return LRU_SKIP; } if (!spin_trylock(&bp->b_lock)) @@ -1815,14 +1815,13 @@ xfs_buftarg_wait_rele( return LRU_REMOVED; } +/* + * Wait for outstanding I/O on the buftarg to complete. + */ void -xfs_wait_buftarg( +xfs_buftarg_wait( struct xfs_buftarg *btp) { - LIST_HEAD(dispose); - int loop = 0; - bool write_fail = false; - /* * First wait on the buftarg I/O count for all in-flight buffers to be * released. This is critical as new buffers do not make the LRU until @@ -1838,10 +1837,21 @@ xfs_wait_buftarg( while (percpu_counter_sum(&btp->bt_io_count)) delay(100); flush_workqueue(btp->bt_mount->m_buf_workqueue); +} + +void +xfs_buftarg_drain( + struct xfs_buftarg *btp) +{ + LIST_HEAD(dispose); + int loop = 0; + bool write_fail = false; + + xfs_buftarg_wait(btp); /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { - list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, &dispose, LONG_MAX); while (!list_empty(&dispose)) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5d91a31298a4bd..459ca34f26f588 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -152,7 +152,7 @@ struct xfs_buf { struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ struct xfs_mount *b_mount; - xfs_buftarg_t *b_target; /* buffer target (device) */ + struct xfs_buftarg *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; struct completion b_iowait; /* queue for I/O waiters */ @@ -344,11 +344,12 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) /* * Handling of buftargs. */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *, struct dax_device *); +extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *, struct dax_device *); extern void xfs_free_buftarg(struct xfs_buftarg *); -extern void xfs_wait_buftarg(xfs_buftarg_t *); -extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); +extern void xfs_buftarg_wait(struct xfs_buftarg *); +extern void xfs_buftarg_drain(struct xfs_buftarg *); +extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 1d95ed387d66d8..bd8379b98374f8 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -314,8 +314,14 @@ xfs_dquot_disk_alloc( return -ESRCH; } - /* Create the block mapping. */ xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); + + error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + return error; + + /* Create the block mapping. */ error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, &nmaps); @@ -500,6 +506,42 @@ xfs_dquot_alloc( return dqp; } +/* Check the ondisk dquot's id and type match what the incore dquot expects. */ +static bool +xfs_dquot_check_type( + struct xfs_dquot *dqp, + struct xfs_disk_dquot *ddqp) +{ + uint8_t ddqp_type; + uint8_t dqp_type; + + ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK; + dqp_type = xfs_dquot_type(dqp); + + if (be32_to_cpu(ddqp->d_id) != dqp->q_id) + return false; + + /* + * V5 filesystems always expect an exact type match. V4 filesystems + * expect an exact match for user dquots and for non-root group and + * project dquots. + */ + if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) || + dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0) + return ddqp_type == dqp_type; + + /* + * V4 filesystems support either group or project quotas, but not both + * at the same time. The non-user quota file can be switched between + * group and project quota uses depending on the mount options, which + * means that we can encounter the other type when we try to load quota + * defaults. Quotacheck will soon reset the the entire quota file + * (including the root dquot) anyway, but don't log scary corruption + * reports to dmesg. + */ + return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ; +} + /* Copy the in-core quota fields in from the on-disk buffer. */ STATIC int xfs_dquot_from_disk( @@ -512,8 +554,7 @@ xfs_dquot_from_disk( * Ensure that we got the type and ID we were looking for. * Everything else was checked by the dquot buffer verifier. */ - if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) || - be32_to_cpu(ddqp->d_id) != dqp->q_id) { + if (!xfs_dquot_check_type(dqp, ddqp)) { xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, quota %u", __this_address, dqp->q_id); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 7f6e2089947302..185b4915b7bffe 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -54,6 +54,8 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_FORCE_SUMMARY_RECALC, XFS_RANDOM_IUNLINK_FALLBACK, XFS_RANDOM_BUF_IOERROR, + XFS_RANDOM_REDUCE_MAX_IEXTENTS, + XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, }; struct xfs_errortag_attr { @@ -164,6 +166,8 @@ XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); +XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); +XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -202,6 +206,8 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(bad_summary), XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), XFS_ERRORTAG_ATTR_LIST(buf_ioerror), + XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), + XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), NULL, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 39695b59dfcc92..dc91973c0b4f8c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -118,6 +118,54 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } +static xfs_lsn_t +xfs_fsync_lsn( + struct xfs_inode *ip, + bool datasync) +{ + if (!xfs_ipincount(ip)) + return 0; + if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + return 0; + return ip->i_itemp->ili_last_lsn; +} + +/* + * All metadata updates are logged, which means that we just have to flush the + * log up to the latest LSN that touched the inode. + * + * If we have concurrent fsync/fdatasync() calls, we need them to all block on + * the log force before we clear the ili_fsync_fields field. This ensures that + * we don't get a racing sync operation that does not wait for the metadata to + * hit the journal before returning. If we race with clearing ili_fsync_fields, + * then all that will happen is the log force will do nothing as the lsn will + * already be on disk. We can't race with setting ili_fsync_fields because that + * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock + * shared until after the ili_fsync_fields is cleared. + */ +static int +xfs_fsync_flush_log( + struct xfs_inode *ip, + bool datasync, + int *log_flushed) +{ + int error = 0; + xfs_lsn_t lsn; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + lsn = xfs_fsync_lsn(ip, datasync); + if (lsn) { + error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, + log_flushed); + + spin_lock(&ip->i_itemp->ili_lock); + ip->i_itemp->ili_fsync_fields = 0; + spin_unlock(&ip->i_itemp->ili_lock); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + STATIC int xfs_file_fsync( struct file *file, @@ -125,13 +173,10 @@ xfs_file_fsync( loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_inode *ip = XFS_I(file->f_mapping->host); struct xfs_mount *mp = ip->i_mount; int error = 0; int log_flushed = 0; - xfs_lsn_t lsn = 0; trace_xfs_file_fsync(ip); @@ -156,32 +201,13 @@ xfs_file_fsync( xfs_blkdev_issue_flush(mp->m_ddev_targp); /* - * All metadata updates are logged, which means that we just have to - * flush the log up to the latest LSN that touched the inode. If we have - * concurrent fsync/fdatasync() calls, we need them to all block on the - * log force before we clear the ili_fsync_fields field. This ensures - * that we don't get a racing sync operation that does not wait for the - * metadata to hit the journal before returning. If we race with - * clearing the ili_fsync_fields, then all that will happen is the log - * force will do nothing as the lsn will already be on disk. We can't - * race with setting ili_fsync_fields because that is done under - * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared - * until after the ili_fsync_fields is cleared. + * Any inode that has dirty modifications in the log is pinned. The + * racy check here for a pinned inode while not catch modifications + * that happen concurrently to the fsync call, but fsync semantics + * only require to sync previously completed I/O. */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - if (!datasync || - (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - lsn = iip->ili_last_lsn; - } - - if (lsn) { - error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); - spin_lock(&iip->ili_lock); - iip->ili_fsync_fields = 0; - spin_unlock(&iip->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + error = xfs_fsync_flush_log(ip, datasync, &log_flushed); /* * If we only have a single device, and the log force about was @@ -408,12 +434,6 @@ xfs_file_write_checks( } else spin_unlock(&ip->i_flags_lock); - /* - * Updating the timestamps will grab the ilock again from - * xfs_fs_dirty_inode, so we have to call it after dropping the - * lock above. Eventually we should look into a way to avoid - * the pointless lock roundtrip. - */ return file_modified(file); } @@ -693,7 +713,7 @@ xfs_file_buffered_write( struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - int enospc = 0; + bool cleared_space = false; int iolock; if (iocb->ki_flags & IOCB_NOWAIT) @@ -723,27 +743,23 @@ xfs_file_buffered_write( * metadata space. This reduces the chances that the eofblocks scan * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this * also behaves as a filter to prevent too many eofblocks scans from - * running at the same time. + * running at the same time. Use a synchronous scan to increase the + * effectiveness of the scan. */ - if (ret == -EDQUOT && !enospc) { + if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - enospc = xfs_inode_free_quota_eofblocks(ip); - if (enospc) - goto write_retry; - enospc = xfs_inode_free_quota_cowblocks(ip); - if (enospc) - goto write_retry; - iolock = 0; - } else if (ret == -ENOSPC && !enospc) { + xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC); + cleared_space = true; + goto write_retry; + } else if (ret == -ENOSPC && !cleared_space) { struct xfs_eofblocks eofb = {0}; - enospc = 1; + cleared_space = true; xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; - xfs_icache_free_eofblocks(ip->i_mount, &eofb); - xfs_icache_free_cowblocks(ip->i_mount, &eofb); + xfs_blockgc_free_space(ip->i_mount, &eofb); goto write_retry; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 959ce91a375566..a2a40703922772 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -25,17 +25,17 @@ */ static int xfs_growfs_data_private( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_growfs_data_t *in) /* growfs data input struct */ + struct xfs_mount *mp, /* mount point for filesystem */ + struct xfs_growfs_data *in) /* growfs data input struct */ { struct xfs_buf *bp; int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; - xfs_rfsblock_t nb, nb_mod; - xfs_rfsblock_t new; + xfs_rfsblock_t nb, nb_div, nb_mod; + xfs_rfsblock_t delta; xfs_agnumber_t oagcount; - xfs_trans_t *tp; + struct xfs_trans *tp; struct aghdr_init_data id = {}; nb = in->newblocks; @@ -50,16 +50,16 @@ xfs_growfs_data_private( return error; xfs_buf_relse(bp); - new = nb; /* use new as a temporary here */ - nb_mod = do_div(new, mp->m_sb.sb_agblocks); - nagcount = new + (nb_mod != 0); + nb_div = nb; + nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); + nagcount = nb_div + (nb_mod != 0); if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; if (nb < mp->m_sb.sb_dblocks) return -EINVAL; } - new = nb - mp->m_sb.sb_dblocks; + delta = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ @@ -89,7 +89,7 @@ xfs_growfs_data_private( INIT_LIST_HEAD(&id.buffer_list); for (id.agno = nagcount - 1; id.agno >= oagcount; - id.agno--, new -= id.agsize) { + id.agno--, delta -= id.agsize) { if (id.agno == nagcount - 1) id.agsize = nb - @@ -110,8 +110,8 @@ xfs_growfs_data_private( xfs_trans_agblocks_delta(tp, id.nfree); /* If there are new blocks in the old last AG, extend it. */ - if (new) { - error = xfs_ag_extend_space(mp, tp, &id, new); + if (delta) { + error = xfs_ag_extend_space(mp, tp, &id, delta); if (error) goto out_trans_cancel; } @@ -143,7 +143,7 @@ xfs_growfs_data_private( * If we expanded the last AG, free the per-AG reservation * so we can reinitialize it with the new size. */ - if (new) { + if (delta) { struct xfs_perag *pag; pag = xfs_perag_get(mp, id.agno); @@ -170,8 +170,8 @@ xfs_growfs_data_private( static int xfs_growfs_log_private( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_growfs_log_t *in) /* growfs log input struct */ + struct xfs_mount *mp, /* mount point for filesystem */ + struct xfs_growfs_log *in) /* growfs log input struct */ { xfs_extlen_t nb; @@ -268,7 +268,7 @@ xfs_growfs_data( int xfs_growfs_log( xfs_mount_t *mp, - xfs_growfs_log_t *in) + struct xfs_growfs_log *in) { int error; diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 92869f6ec8d3e4..2cffe51a31e8b2 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -6,8 +6,8 @@ #ifndef __XFS_FSOPS_H__ #define __XFS_FSOPS_H__ -extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); -extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); +extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); +extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, xfs_fsop_resblks_t *outval); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index fa55ab8b8d80ef..f62fa652c2fd24 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -8,8 +8,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second) with the exception of eofb_timer and cowb_timer, which - * are measured in seconds. + * 100ths of a second) with the exception of blockgc_timer, which is measured + * in seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -28,8 +28,7 @@ xfs_param_t xfs_params = { .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, - .eofb_timer = { 1, 300, 3600*24}, - .cowb_timer = { 1, 1800, 3600*24}, + .blockgc_timer = { 1, 300, 3600*24}, }; struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index deb99300d171c8..1d7720a0c068b7 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -915,69 +915,6 @@ xfs_inode_walk( return last_error; } -/* - * Background scanning to trim post-EOF preallocated space. This is queued - * based on the 'speculative_prealloc_lifetime' tunable (5m by default). - */ -void -xfs_queue_eofblocks( - struct xfs_mount *mp) -{ - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_eofblocks_work, - msecs_to_jiffies(xfs_eofb_secs * 1000)); - rcu_read_unlock(); -} - -void -xfs_eofblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_eofblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_icache_free_eofblocks(mp, NULL); - sb_end_write(mp->m_super); - - xfs_queue_eofblocks(mp); -} - -/* - * Background scanning to trim preallocated CoW space. This is queued - * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). - * (We'll just piggyback on the post-EOF prealloc space workqueue.) - */ -void -xfs_queue_cowblocks( - struct xfs_mount *mp) -{ - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) - queue_delayed_work(mp->m_eofblocks_workqueue, - &mp->m_cowblocks_work, - msecs_to_jiffies(xfs_cowb_secs * 1000)); - rcu_read_unlock(); -} - -void -xfs_cowblocks_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_cowblocks_work); - - if (!sb_start_write_trylock(mp->m_super)) - return; - xfs_icache_free_cowblocks(mp, NULL); - sb_end_write(mp->m_super); - - xfs_queue_cowblocks(mp); -} - /* * Grab the inode for reclaim exclusively. * @@ -1346,14 +1283,17 @@ xfs_reclaim_worker( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - void *args) + void *args, + unsigned int *lockflags) { struct xfs_eofblocks *eofb = args; bool wait; - int ret; wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) + return 0; + if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ trace_xfs_inode_free_eofblocks_invalid(ip); @@ -1380,130 +1320,68 @@ xfs_inode_free_eofblocks( return -EAGAIN; return 0; } + *lockflags |= XFS_IOLOCK_EXCL; - ret = xfs_free_eofblocks(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - - return ret; -} - -int -xfs_icache_free_eofblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb) -{ - return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, - XFS_ICI_EOFBLOCKS_TAG); + return xfs_free_eofblocks(ip); } /* - * Run eofblocks scans on the quotas applicable to the inode. For inodes with - * multiple quotas, we don't know exactly which quota caused an allocation - * failure. We make a best effort by including each quota under low free space - * conditions (less than 1% free space) in the scan. + * Background scanning to trim preallocated space. This is queued based on the + * 'speculative_prealloc_lifetime' tunable (5m by default). */ -static int -__xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip, - int (*execute)(struct xfs_mount *mp, - struct xfs_eofblocks *eofb)) -{ - int scan = 0; - struct xfs_eofblocks eofb = {0}; - struct xfs_dquot *dq; - - /* - * Run a sync scan to increase effectiveness and use the union filter to - * cover all applicable quotas in a single scan. - */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; - - if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_uid = VFS_I(ip)->i_uid; - eofb.eof_flags |= XFS_EOF_FLAGS_UID; - scan = 1; - } - } - - if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { - dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP); - if (dq && xfs_dquot_lowsp(dq)) { - eofb.eof_gid = VFS_I(ip)->i_gid; - eofb.eof_flags |= XFS_EOF_FLAGS_GID; - scan = 1; - } - } - - if (scan) - execute(ip->i_mount, &eofb); - - return scan; -} - -int -xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip) -{ - return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); -} - -static inline unsigned long -xfs_iflag_for_tag( - int tag) +static inline void +xfs_blockgc_queue( + struct xfs_perag *pag) { - switch (tag) { - case XFS_ICI_EOFBLOCKS_TAG: - return XFS_IEOFBLOCKS; - case XFS_ICI_COWBLOCKS_TAG: - return XFS_ICOWBLOCKS; - default: - ASSERT(0); - return 0; - } + rcu_read_lock(); + if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) + queue_delayed_work(pag->pag_mount->m_blockgc_workqueue, + &pag->pag_blockgc_work, + msecs_to_jiffies(xfs_blockgc_secs * 1000)); + rcu_read_unlock(); } static void -__xfs_inode_set_blocks_tag( - xfs_inode_t *ip, - void (*execute)(struct xfs_mount *mp), - void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, - int error, unsigned long caller_ip), - int tag) +xfs_blockgc_set_iflag( + struct xfs_inode *ip, + unsigned long iflag) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - int tagged; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); /* * Don't bother locking the AG and looking up in the radix trees * if we already know that we have the tag set. */ - if (ip->i_flags & xfs_iflag_for_tag(tag)) + if (ip->i_flags & iflag) return; spin_lock(&ip->i_flags_lock); - ip->i_flags |= xfs_iflag_for_tag(tag); + ip->i_flags |= iflag; spin_unlock(&ip->i_flags_lock); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - tagged = radix_tree_tagged(&pag->pag_ici_root, tag); + tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG); radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); if (!tagged) { - /* propagate the eofblocks tag up into the perag radix tree */ + /* propagate the blockgc tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_set(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - tag); + XFS_ICI_BLOCKGC_TAG); spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ - execute(ip->i_mount); + xfs_blockgc_queue(pag); - set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, + _RET_IP_); } spin_unlock(&pag->pag_ici_lock); @@ -1515,38 +1393,43 @@ xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_eofblocks_tag(ip); - return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, - trace_xfs_perag_set_eofblocks, - XFS_ICI_EOFBLOCKS_TAG); + return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS); } static void -__xfs_inode_clear_blocks_tag( - xfs_inode_t *ip, - void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, - int error, unsigned long caller_ip), - int tag) +xfs_blockgc_clear_iflag( + struct xfs_inode *ip, + unsigned long iflag) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + bool clear_tag; + + ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~xfs_iflag_for_tag(tag); + ip->i_flags &= ~iflag; + clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0; spin_unlock(&ip->i_flags_lock); + if (!clear_tag) + return; + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); - if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { - /* clear the eofblocks tag from the perag radix tree */ + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) { + /* clear the blockgc tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_clear(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - tag); + XFS_ICI_BLOCKGC_TAG); spin_unlock(&ip->i_mount->m_perag_lock); - clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1, + _RET_IP_); } spin_unlock(&pag->pag_ici_lock); @@ -1558,8 +1441,7 @@ xfs_inode_clear_eofblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_eofblocks_tag(ip); - return __xfs_inode_clear_blocks_tag(ip, - trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); + return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS); } /* @@ -1609,20 +1491,42 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - void *args) + void *args, + unsigned int *lockflags) { struct xfs_eofblocks *eofb = args; + bool wait; int ret = 0; + wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + + if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) + return 0; + if (!xfs_prep_free_cowblocks(ip)) return 0; if (!xfs_inode_matches_eofb(ip, eofb)) return 0; - /* Free the CoW blocks */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. + */ + if (!(*lockflags & XFS_IOLOCK_EXCL) && + !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (wait) + return -EAGAIN; + return 0; + } + *lockflags |= XFS_IOLOCK_EXCL; + + if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { + if (wait) + return -EAGAIN; + return 0; + } + *lockflags |= XFS_MMAPLOCK_EXCL; /* * Check again, nobody else should be able to dirty blocks or change @@ -1630,37 +1534,15 @@ xfs_inode_free_cowblocks( */ if (xfs_prep_free_cowblocks(ip)) ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); - - xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; } -int -xfs_icache_free_cowblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb) -{ - return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, - XFS_ICI_COWBLOCKS_TAG); -} - -int -xfs_inode_free_quota_cowblocks( - struct xfs_inode *ip) -{ - return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); -} - void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_set_cowblocks_tag(ip); - return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, - trace_xfs_perag_set_cowblocks, - XFS_ICI_COWBLOCKS_TAG); + return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS); } void @@ -1668,24 +1550,158 @@ xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { trace_xfs_inode_clear_cowblocks_tag(ip); - return __xfs_inode_clear_blocks_tag(ip, - trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); + return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); } +#define for_each_perag_tag(mp, next_agno, pag, tag) \ + for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + (pag) != NULL; \ + (next_agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) + + /* Disable post-EOF and CoW block auto-reclamation. */ void -xfs_stop_block_reaping( +xfs_blockgc_stop( struct xfs_mount *mp) { - cancel_delayed_work_sync(&mp->m_eofblocks_work); - cancel_delayed_work_sync(&mp->m_cowblocks_work); + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + cancel_delayed_work_sync(&pag->pag_blockgc_work); } /* Enable post-EOF and CoW block auto-reclamation. */ void -xfs_start_block_reaping( +xfs_blockgc_start( struct xfs_mount *mp) { - xfs_queue_eofblocks(mp); - xfs_queue_cowblocks(mp); + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + xfs_blockgc_queue(pag); +} + +/* Scan one incore inode for block preallocations that we can remove. */ +static int +xfs_blockgc_scan_inode( + struct xfs_inode *ip, + void *args) +{ + unsigned int lockflags = 0; + int error; + + error = xfs_inode_free_eofblocks(ip, args, &lockflags); + if (error) + goto unlock; + + error = xfs_inode_free_cowblocks(ip, args, &lockflags); +unlock: + if (lockflags) + xfs_iunlock(ip, lockflags); + return error; +} + +/* Background worker that trims preallocated space. */ +void +xfs_blockgc_worker( + struct work_struct *work) +{ + struct xfs_perag *pag = container_of(to_delayed_work(work), + struct xfs_perag, pag_blockgc_work); + struct xfs_mount *mp = pag->pag_mount; + int error; + + if (!sb_start_write_trylock(mp->m_super)) + return; + error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, + XFS_ICI_BLOCKGC_TAG); + if (error) + xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", + pag->pag_agno, error); + sb_end_write(mp->m_super); + xfs_blockgc_queue(pag); +} + +/* + * Try to free space in the filesystem by purging eofblocks and cowblocks. + */ +int +xfs_blockgc_free_space( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); + + return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb, + XFS_ICI_BLOCKGC_TAG); +} + +/* + * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which + * quota caused an allocation failure, so we make a best effort by including + * each quota under low free space conditions (less than 1% free space) in the + * scan. + * + * Callers must not hold any inode's ILOCK. If requesting a synchronous scan + * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or + * MMAPLOCK. + */ +int +xfs_blockgc_free_dquots( + struct xfs_mount *mp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + unsigned int eof_flags) +{ + struct xfs_eofblocks eofb = {0}; + bool do_work = false; + + if (!udqp && !gdqp && !pdqp) + return 0; + + /* + * Run a scan to free blocks using the union filter to cover all + * applicable quotas in a single scan. + */ + eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags; + + if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { + eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + do_work = true; + } + + if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { + eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + do_work = true; + } + + if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { + eofb.eof_prid = pdqp->q_id; + eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + do_work = true; + } + + if (!do_work) + return 0; + + return xfs_blockgc_free_space(mp, &eofb); +} + +/* Run cow/eofblocks scans on the quotas attached to the inode. */ +int +xfs_blockgc_free_quota( + struct xfs_inode *ip, + unsigned int eof_flags) +{ + return xfs_blockgc_free_dquots(ip->i_mount, + xfs_inode_dquot(ip, XFS_DQTYPE_USER), + xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), + xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 3a4c8b382cd0fe..d1fddb1524200b 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -23,8 +23,8 @@ struct xfs_eofblocks { #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup in xfs_inode_walk */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ -#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ -#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ +/* Inode has speculative preallocations (posteof or cow) to clean. */ +#define XFS_ICI_BLOCKGC_TAG 1 /* * Flags for xfs_iget() @@ -54,19 +54,19 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, + unsigned int eof_flags); +int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags); +int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); + void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); -int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); -void xfs_eofblocks_worker(struct work_struct *); -void xfs_queue_eofblocks(struct xfs_mount *); void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); -int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); -int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); -void xfs_cowblocks_worker(struct work_struct *); -void xfs_queue_cowblocks(struct xfs_mount *); + +void xfs_blockgc_worker(struct work_struct *work); int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, int (*execute)(struct xfs_inode *ip, void *args), @@ -75,7 +75,7 @@ int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); -void xfs_stop_block_reaping(struct xfs_mount *mp); -void xfs_start_block_reaping(struct xfs_mount *mp); +void xfs_blockgc_stop(struct xfs_mount *mp); +void xfs_blockgc_start(struct xfs_mount *mp); #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b7352bc4c81527..636ac13b1df2a4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -775,6 +775,7 @@ xfs_init_new_inode( prid_t prid, struct xfs_inode **ipp) { + struct inode *dir = pip ? VFS_I(pip) : NULL; struct xfs_mount *mp = tp->t_mountp; struct xfs_inode *ip; unsigned int flags; @@ -804,18 +805,17 @@ xfs_init_new_inode( ASSERT(ip != NULL); inode = VFS_I(ip); - inode->i_mode = mode; set_nlink(inode, nlink); - inode->i_uid = current_fsuid(); inode->i_rdev = rdev; ip->i_d.di_projid = prid; - if (pip && XFS_INHERIT_GID(pip)) { - inode->i_gid = VFS_I(pip)->i_gid; - if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) - inode->i_mode |= S_ISGID; + if (dir && !(dir->i_mode & S_ISGID) && + (mp->m_flags & XFS_MOUNT_GRPID)) { + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + inode->i_mode = mode; } else { - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); } /* @@ -1022,23 +1022,22 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, + resblks, &tp); } if (error) - goto out_release_inode; + goto out_release_dquots; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; - /* - * Reserve disk quota and the inode. - */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, + XFS_IEXT_DIR_MANIP_CNT(mp)); if (error) goto out_trans_cancel; @@ -1116,7 +1115,7 @@ xfs_create( xfs_finish_inode_setup(ip); xfs_irele(ip); } - + out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -1160,14 +1159,10 @@ xfs_create_tmpfile( resblks = XFS_IALLOC_SPACE_RES(mp); tres = &M_RES(mp)->tr_create_tmpfile; - error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); - if (error) - goto out_release_inode; - - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, + &tp); if (error) - goto out_trans_cancel; + goto out_release_dquots; error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) @@ -1210,7 +1205,7 @@ xfs_create_tmpfile( xfs_finish_inode_setup(ip); xfs_irele(ip); } - + out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -1258,6 +1253,11 @@ xfs_link( xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, + XFS_IEXT_DIR_MANIP_CNT(mp)); + if (error) + goto error_return; + /* * If we are using project inheritance, we only allow hard link * creation in our tree when the project IDs are the same; else @@ -3017,7 +3017,7 @@ xfs_rename( struct xfs_trans *tp; struct xfs_inode *wip = NULL; /* whiteout inode */ struct xfs_inode *inodes[__XFS_SORT_INODES]; - struct xfs_buf *agibp; + int i; int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); @@ -3106,6 +3106,35 @@ xfs_rename( /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. + * + * Extent count overflow check: + * + * From the perspective of src_dp, a rename operation is essentially a + * directory entry remove operation. Hence the only place where we check + * for extent count overflow for src_dp is in + * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns + * -ENOSPC when it detects a possible extent count overflow and in + * response, the higher layers of directory handling code do the + * following: + * 1. Data/Free blocks: XFS lets these blocks linger until a + * future remove operation removes them. + * 2. Dabtree blocks: XFS swaps the blocks with the last block in the + * Leaf space and unmaps the last block. + * + * For target_dp, there are two cases depending on whether the + * destination directory entry exists or not. + * + * When destination directory entry does not exist (i.e. target_ip == + * NULL), extent count overflow check is performed only when transaction + * has a non-zero sized space reservation associated with it. With a + * zero-sized space reservation, XFS allows a rename operation to + * continue only when the directory has sufficient free space in its + * data/leaf/free space blocks to hold the new entry. + * + * When destination directory entry exists (i.e. target_ip != NULL), all + * we need to do is change the inode number associated with the already + * existing entry. Hence there is no need to perform an extent count + * overflow check. */ if (target_ip == NULL) { /* @@ -3116,6 +3145,12 @@ xfs_rename( error = xfs_dir_canenter(tp, target_dp, target_name); if (error) goto out_trans_cancel; + } else { + error = xfs_iext_count_may_overflow(target_dp, + XFS_DATA_FORK, + XFS_IEXT_DIR_MANIP_CNT(mp)); + if (error) + goto out_trans_cancel; } } else { /* @@ -3130,6 +3165,30 @@ xfs_rename( } } + /* + * Lock the AGI buffers we need to handle bumping the nlink of the + * whiteout inode off the unlinked list and to handle dropping the + * nlink of the target inode. Per locking order rules, do this in + * increasing AG order and before directory block allocation tries to + * grab AGFs because we grab AGIs before AGFs. + * + * The (vfs) caller must ensure that if src is a directory then + * target_ip is either null or an empty directory. + */ + for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { + if (inodes[i] == wip || + (inodes[i] == target_ip && + (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { + struct xfs_buf *bp; + xfs_agnumber_t agno; + + agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); + error = xfs_read_agi(mp, tp, agno, &bp); + if (error) + goto out_trans_cancel; + } + } + /* * Directory entry creation below may acquire the AGF. Remove * the whiteout from the unlinked list first to preserve correct @@ -3182,22 +3241,6 @@ xfs_rename( * In case there is already an entry with the same * name at the destination directory, remove it first. */ - - /* - * Check whether the replace operation will need to allocate - * blocks. This happens when the shortform directory lacks - * space and we have to convert it to a block format directory. - * When more blocks are necessary, we must lock the AGI first - * to preserve locking order (AGI -> AGF). - */ - if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) { - error = xfs_read_agi(mp, tp, - XFS_INO_TO_AGNO(mp, target_ip->i_ino), - &agibp); - if (error) - goto out_trans_cancel; - } - error = xfs_dir_replace(tp, target_dp, target_name, src_ip->i_ino, spaceres); if (error) @@ -3273,9 +3316,16 @@ xfs_rename( if (wip) { error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, spaceres); - } else + } else { + /* + * NOTE: We don't need to check for extent count overflow here + * because the dir remove name code will leave the dir block in + * place if the extent count would overflow. + */ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, spaceres); + } + if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3fbd98f61ea5c0..248083ea0276c4 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1275,24 +1275,23 @@ xfs_ioctl_setattr_prepare_dax( */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip) + struct xfs_inode *ip, + struct xfs_dquot *pdqp) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = -EROFS; if (mp->m_flags & XFS_MOUNT_RDONLY) - goto out_unlock; + goto out_error; error = -EIO; if (XFS_FORCED_SHUTDOWN(mp)) - goto out_unlock; + goto out_error; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, + capable(CAP_FOWNER), &tp); if (error) - goto out_unlock; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + goto out_error; /* * CAP_FOWNER overrides the following restrictions: @@ -1312,7 +1311,7 @@ xfs_ioctl_setattr_get_trans( out_cancel: xfs_trans_cancel(tp); -out_unlock: +out_error: return ERR_PTR(error); } @@ -1436,13 +1435,13 @@ xfs_ioctl_setattr( struct xfs_trans *tp; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; - int code; + int error; trace_xfs_ioctl_setattr(ip); - code = xfs_ioctl_setattr_check_projid(ip, fa); - if (code) - return code; + error = xfs_ioctl_setattr_check_projid(ip, fa); + if (error) + return error; /* * If disk quotas is on, we make sure that the dquots do exist on disk, @@ -1453,44 +1452,36 @@ xfs_ioctl_setattr( * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp)) { - code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, + error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, VFS_I(ip)->i_gid, fa->fsx_projid, XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp); - if (code) - return code; + if (error) + return error; } xfs_ioctl_setattr_prepare_dax(ip, fa); - tp = xfs_ioctl_setattr_get_trans(ip); + tp = xfs_ioctl_setattr_get_trans(ip, pdqp); if (IS_ERR(tp)) { - code = PTR_ERR(tp); + error = PTR_ERR(tp); goto error_free_dquots; } - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && - ip->i_d.di_projid != fa->fsx_projid) { - code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp, - capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); - if (code) /* out of quota */ - goto error_trans_cancel; - } - xfs_fill_fsxattr(ip, false, &old_fa); - code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); - if (code) + error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); + if (error) goto error_trans_cancel; - code = xfs_ioctl_setattr_check_extsize(ip, fa); - if (code) + error = xfs_ioctl_setattr_check_extsize(ip, fa); + if (error) goto error_trans_cancel; - code = xfs_ioctl_setattr_check_cowextsize(ip, fa); - if (code) + error = xfs_ioctl_setattr_check_cowextsize(ip, fa); + if (error) goto error_trans_cancel; - code = xfs_ioctl_setattr_xflags(tp, ip, fa); - if (code) + error = xfs_ioctl_setattr_xflags(tp, ip, fa); + if (error) goto error_trans_cancel; /* @@ -1530,7 +1521,7 @@ xfs_ioctl_setattr( else ip->i_d.di_cowextsize = 0; - code = xfs_trans_commit(tp); + error = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. @@ -1538,13 +1529,13 @@ xfs_ioctl_setattr( xfs_qm_dqrele(olddquot); xfs_qm_dqrele(pdqp); - return code; + return error; error_trans_cancel: xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(pdqp); - return code; + return error; } STATIC int @@ -1608,7 +1599,7 @@ xfs_ioc_setxflags( xfs_ioctl_setattr_prepare_dax(ip, &fa); - tp = xfs_ioctl_setattr_get_trans(ip); + tp = xfs_ioctl_setattr_get_trans(ip, NULL); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; @@ -2260,7 +2251,7 @@ xfs_file_ioctl( } case XFS_IOC_FSGROWFSDATA: { - xfs_growfs_data_t in; + struct xfs_growfs_data in; if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; @@ -2274,7 +2265,7 @@ xfs_file_ioctl( } case XFS_IOC_FSGROWFSLOG: { - xfs_growfs_log_t in; + struct xfs_growfs_log in; if (copy_from_user(&in, arg, sizeof(in))) return -EFAULT; @@ -2348,8 +2339,10 @@ xfs_file_ioctl( if (error) return error; + trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_); + sb_start_write(mp->m_super); - error = xfs_icache_free_eofblocks(mp, &keofb); + error = xfs_blockgc_free_space(mp, &keofb); sb_end_write(mp->m_super); return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ef76f775fabf11..e17ab7f42928a5 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -194,25 +194,21 @@ xfs_iomap_write_direct( struct xfs_trans *tp; xfs_filblks_t resaligned; int nimaps; - int quota_flag; - uint qblocks, resblks; - unsigned int resrtextents = 0; + unsigned int dblocks, rblocks; + bool force = false; int error; int bmapi_flags = XFS_BMAPI_PREALLOC; - uint tflags = 0; ASSERT(count_fsb > 0); resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, xfs_get_extsz_hint(ip)); if (unlikely(XFS_IS_REALTIME_INODE(ip))) { - resrtextents = qblocks = resaligned; - resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; } else { - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - quota_flag = XFS_QMOPT_RES_REGBLKS; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; } error = xfs_qm_dqattach(ip); @@ -235,23 +231,21 @@ xfs_iomap_write_direct( if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { - tflags |= XFS_TRANS_RESERVE; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + force = true; + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, - tflags, &tp); + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, force, &tp); if (error) return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, 0); - /* * From this point onwards we overwrite the imap pointer that the * caller gave to us. @@ -260,7 +254,7 @@ xfs_iomap_write_direct( error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0, imap, &nimaps); if (error) - goto out_res_cancel; + goto out_trans_cancel; /* * Complete the transaction @@ -284,8 +278,6 @@ xfs_iomap_write_direct( xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; -out_res_cancel: - xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; @@ -548,16 +540,13 @@ xfs_iomap_write_unwritten( * here as we might be asked to write out the same inode that we * complete here and might deadlock on the iolock. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, + 0, true, &tp); if (error) return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; @@ -855,7 +844,8 @@ xfs_direct_write_iomap_begin( return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); out_unlock: - xfs_iunlock(ip, lockmode); + if (lockmode) + xfs_iunlock(ip, lockmode); return error; } @@ -883,6 +873,9 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + /* we can't use delayed allocations when using extent size hints */ if (xfs_get_extsz_hint(ip)) return xfs_direct_write_iomap_begin(inode, offset, count, diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 67c8dc9de8aa47..00369502fe2522 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -700,13 +700,11 @@ xfs_setattr_nonsize( return error; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL, + capable(CAP_FOWNER), &tp); if (error) goto out_dqrele; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * Change file ownership. Must be the owner or privileged. */ @@ -722,21 +720,6 @@ xfs_setattr_nonsize( gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - /* - * Do a quota reservation only if uid/gid is actually - * going to change. - */ - if (XFS_IS_QUOTA_RUNNING(mp) && - ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || - (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { - ASSERT(tp); - error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, - NULL, capable(CAP_FOWNER) ? - XFS_QMOPT_FORCE_RES : 0); - if (error) /* out of quota */ - goto out_cancel; - } - /* * CAP_FSETID overrides the following restrictions: * @@ -786,8 +769,6 @@ xfs_setattr_nonsize( xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* * Release any dquot(s) the inode had kept before chown. */ @@ -814,9 +795,6 @@ xfs_setattr_nonsize( return 0; -out_cancel: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -846,7 +824,7 @@ xfs_setattr_size( ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(inode->i_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0); oldsize = inode->i_size; newsize = iattr->ia_size; diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index eae3aff9bc9767..c4a340f1f1e1c4 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -618,15 +618,12 @@ xfs_iwalk_threaded( { struct xfs_pwork_ctl pctl; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); - unsigned int nr_threads; int error; ASSERT(agno < mp->m_sb.sb_agcount); ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); - nr_threads = xfs_pwork_guess_datadev_parallelism(mp); - error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", - nr_threads); + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk"); if (error) return error; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 5b7a1e2015595f..af6be9b9ccdf81 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -98,8 +98,7 @@ typedef __u32 xfs_nlink_t; #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val -#define xfs_eofb_secs xfs_params.eofb_timer.val -#define xfs_cowb_secs xfs_params.cowb_timer.val +#define xfs_blockgc_secs xfs_params.blockgc_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_set_flags_nested(sp, f) \ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fa2d05e65ff1fc..06041834daa313 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -91,6 +91,9 @@ STATIC int xlog_iclogs_empty( struct xlog *log); +static int +xfs_log_cover(struct xfs_mount *); + static void xlog_grant_sub_space( struct xlog *log, @@ -347,6 +350,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) tic->t_res_num++; } +bool +xfs_log_writable( + struct xfs_mount *mp) +{ + /* + * Never write to the log on norecovery mounts, if the block device is + * read-only, or if the filesystem is shutdown. Read-only mounts still + * allow internal writes for log recovery and unmount purposes, so don't + * restrict that case here. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return false; + if (xfs_readonly_buftarg(mp->m_log->l_targ)) + return false; + if (XFS_FORCED_SHUTDOWN(mp)) + return false; + return true; +} + /* * Replenish the byte reservation required by moving the grant write head. */ @@ -741,7 +763,7 @@ xfs_log_mount_finish( xfs_log_force(mp, XFS_LOG_SYNC); xfs_ail_push_all_sync(mp->m_ail); } - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buftarg_drain(mp->m_ddev_targp); if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; @@ -886,15 +908,8 @@ xfs_log_unmount_write( { struct xlog *log = mp->m_log; - /* - * Don't write out unmount record on norecovery mounts or ro devices. - * Or, if we are doing a forced umount (typically because of IO errors). - */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY || - xfs_readonly_buftarg(log->l_targ)) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + if (!xfs_log_writable(mp)) return; - } xfs_log_force(mp, XFS_LOG_SYNC); @@ -924,10 +939,9 @@ xfs_log_unmount_write( * To do this, we first need to shut down the background log work so it is not * trying to cover the log as we clean up. We then need to unpin all objects in * the log so we can then flush them out. Once they have completed their IO and - * run the callbacks removing themselves from the AIL, we can write the unmount - * record. + * run the callbacks removing themselves from the AIL, we can cover the log. */ -void +int xfs_log_quiesce( struct xfs_mount *mp) { @@ -936,16 +950,24 @@ xfs_log_quiesce( /* * The superblock buffer is uncached and while xfs_ail_push_all_sync() - * will push it, xfs_wait_buftarg() will not wait for it. Further, + * will push it, xfs_buftarg_wait() will not wait for it. Further, * xfs_buf_iowait() cannot be used because it was pushed with the * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for * the IO to complete. */ xfs_ail_push_all_sync(mp->m_ail); - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buftarg_wait(mp->m_ddev_targp); xfs_buf_lock(mp->m_sb_bp); xfs_buf_unlock(mp->m_sb_bp); + return xfs_log_cover(mp); +} + +void +xfs_log_clean( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); xfs_log_unmount_write(mp); } @@ -960,7 +982,9 @@ void xfs_log_unmount( struct xfs_mount *mp) { - xfs_log_quiesce(mp); + xfs_log_clean(mp); + + xfs_buftarg_drain(mp->m_ddev_targp); xfs_trans_ail_destroy(mp); @@ -1037,17 +1061,15 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -static int -xfs_log_need_covered(xfs_mount_t *mp) +static bool +xfs_log_need_covered( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - int needed = 0; - - if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) - return 0; + struct xlog *log = mp->m_log; + bool needed = false; if (!xlog_cil_empty(log)) - return 0; + return false; spin_lock(&log->l_icloglock); switch (log->l_covered_state) { @@ -1062,20 +1084,74 @@ xfs_log_need_covered(xfs_mount_t *mp) if (!xlog_iclogs_empty(log)) break; - needed = 1; + needed = true; if (log->l_covered_state == XLOG_STATE_COVER_NEED) log->l_covered_state = XLOG_STATE_COVER_DONE; else log->l_covered_state = XLOG_STATE_COVER_DONE2; break; default: - needed = 1; + needed = true; break; } spin_unlock(&log->l_icloglock); return needed; } +/* + * Explicitly cover the log. This is similar to background log covering but + * intended for usage in quiesce codepaths. The caller is responsible to ensure + * the log is idle and suitable for covering. The CIL, iclog buffers and AIL + * must all be empty. + */ +static int +xfs_log_cover( + struct xfs_mount *mp) +{ + int error = 0; + bool need_covered; + + ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) && + !xfs_ail_min_lsn(mp->m_log->l_ailp)) || + XFS_FORCED_SHUTDOWN(mp)); + + if (!xfs_log_writable(mp)) + return 0; + + /* + * xfs_log_need_covered() is not idempotent because it progresses the + * state machine if the log requires covering. Therefore, we must call + * this function once and use the result until we've issued an sb sync. + * Do so first to make that abundantly clear. + * + * Fall into the covering sequence if the log needs covering or the + * mount has lazy superblock accounting to sync to disk. The sb sync + * used for covering accumulates the in-core counters, so covering + * handles this for us. + */ + need_covered = xfs_log_need_covered(mp); + if (!need_covered && !xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; + + /* + * To cover the log, commit the superblock twice (at most) in + * independent checkpoints. The first serves as a reference for the + * tail pointer. The sync transaction and AIL push empties the AIL and + * updates the in-core tail to the LSN of the first checkpoint. The + * second commit updates the on-disk tail with the in-core LSN, + * covering the log. Push the AIL one more time to leave it empty, as + * we found it. + */ + do { + error = xfs_sync_sb(mp, true); + if (error) + break; + xfs_ail_push_all_sync(mp->m_ail); + } while (xfs_log_need_covered(mp)); + + return error; +} + /* * We may be holding the log iclog lock upon entering this routine. */ @@ -1259,7 +1335,7 @@ xfs_log_worker( struct xfs_mount *mp = log->l_mp; /* dgc: errors ignored - not fatal and nowhere to report them */ - if (xfs_log_need_covered(mp)) { + if (xfs_fs_writable(mp, SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) { /* * Dump a transaction into the log that contains no real change. * This is needed to stamp the current tail LSN into the log @@ -1416,8 +1492,9 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, - mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | + WQ_HIGHPRI), + 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -2538,12 +2615,15 @@ xlog_covered_state( int iclogs_changed) { /* - * We usually go to NEED. But we go to NEED2 if the changed indicates we - * are done writing the dummy record. If we are done with the second - * dummy recored (DONE2), then we go to IDLE. + * We go to NEED for any non-covering writes. We go to NEED2 if we just + * wrote the first covering record (DONE). We go to IDLE if we just + * wrote the second covering record (DONE2) and remain in IDLE until a + * non-covering write occurs. */ switch (prev_state) { case XLOG_STATE_COVER_IDLE: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_IDLE; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: break; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 58c3fcbec94a2a..044e02cb892190 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -127,6 +127,7 @@ int xfs_log_reserve(struct xfs_mount *mp, int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); +bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); @@ -137,7 +138,8 @@ void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); -void xfs_log_quiesce(struct xfs_mount *mp); +int xfs_log_quiesce(struct xfs_mount *mp); +void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xfs_log_in_recovery(struct xfs_mount *); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7110507a2b6bc2..52370d0a3f4343 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -126,6 +126,7 @@ __xfs_free_perag( { struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -146,6 +147,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -201,6 +203,7 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; spin_lock_init(&pag->pag_ici_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); error = xfs_buf_hash_init(pag); @@ -946,7 +949,7 @@ xfs_mountfs( */ if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == XFS_MOUNT_RDONLY) { - xfs_quiesce_attr(mp); + xfs_log_clean(mp); } /* @@ -1023,8 +1026,8 @@ xfs_mountfs( xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) - xfs_wait_buftarg(mp->m_logdev_targp); - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buftarg_drain(mp->m_logdev_targp); + xfs_buftarg_drain(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); out_free_dir: @@ -1054,7 +1057,7 @@ xfs_unmountfs( uint64_t resblks; int error; - xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); @@ -1124,12 +1127,6 @@ xfs_unmountfs( xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "Unable to update superblock counters. " - "Freespace may not be correct on next mount."); - - xfs_log_unmount(mp); xfs_da_unmount(mp); xfs_uuid_unmount(mp); @@ -1164,32 +1161,6 @@ xfs_fs_writable( return true; } -/* - * xfs_log_sbcount - * - * Sync the superblock counters to disk. - * - * Note this code can be called during the process of freezing, so we use the - * transaction allocator that does not block when the transaction subsystem is - * in its frozen state. - */ -int -xfs_log_sbcount(xfs_mount_t *mp) -{ - /* allow this to proceed during the freeze sequence... */ - if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) - return 0; - - /* - * we don't need to do this if we are updating the superblock - * counters on every modification. - */ - if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) - return 0; - - return xfs_sync_sb(mp, true); -} - /* * Deltas for the block count can vary from 1 to very large, but lock contention * only occurs on frequent small block count updates such as in the delayed diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index dfa429b77ee285..659ad95fe3e0ba 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -93,7 +93,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_blockgc_workqueue; struct workqueue_struct *m_sync_workqueue; int m_bsize; /* fs logical block size */ @@ -177,10 +177,6 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ - struct delayed_work m_eofblocks_work; /* background eof blocks - trimming */ - struct delayed_work m_cowblocks_work; /* background cow blocks - trimming */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; @@ -369,6 +365,9 @@ typedef struct xfs_perag { /* Blocks reserved for the reverse mapping btree. */ struct xfs_ag_resv pag_rmapbt_resv; + /* background prealloc block trimming */ + struct delayed_work pag_blockgc_work; + /* reference count */ uint8_t pagf_refcount_level; @@ -399,7 +398,6 @@ int xfs_buf_hash_init(xfs_perag_t *pag); void xfs_buf_hash_destroy(xfs_perag_t *pag); extern void xfs_uuid_table_free(void); -extern int xfs_log_sbcount(xfs_mount_t *); extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index a06661dac5be90..34c3b16f834f94 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -294,7 +294,7 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c index b03333f1c84ad0..c283b801cc5d54 100644 --- a/fs/xfs/xfs_pwork.c +++ b/fs/xfs/xfs_pwork.c @@ -61,16 +61,18 @@ xfs_pwork_init( struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, xfs_pwork_work_fn work_fn, - const char *tag, - unsigned int nr_threads) + const char *tag) { + unsigned int nr_threads = 0; + #ifdef DEBUG if (xfs_globals.pwork_threads >= 0) nr_threads = xfs_globals.pwork_threads; #endif trace_xfs_pwork_init(mp, nr_threads, current->pid); - pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag, + pctl->wq = alloc_workqueue("%s-%d", + WQ_UNBOUND | WQ_SYSFS | WQ_FREEZABLE, nr_threads, tag, current->pid); if (!pctl->wq) return -ENOMEM; @@ -117,20 +119,3 @@ xfs_pwork_poll( atomic_read(&pctl->nr_work) == 0, HZ) == 0) touch_softlockup_watchdog(); } - -/* - * Return the amount of parallelism that the data device can handle, or 0 for - * no limit. - */ -unsigned int -xfs_pwork_guess_datadev_parallelism( - struct xfs_mount *mp) -{ - struct xfs_buftarg *btp = mp->m_ddev_targp; - - /* - * For now we'll go with the most conservative setting possible, - * which is two threads for an SSD and 1 thread everywhere else. - */ - return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1; -} diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h index 8133124cf3bb79..c0ef81fc85ddb9 100644 --- a/fs/xfs/xfs_pwork.h +++ b/fs/xfs/xfs_pwork.h @@ -51,11 +51,9 @@ xfs_pwork_want_abort( } int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, - xfs_pwork_work_fn work_fn, const char *tag, - unsigned int nr_threads); + xfs_pwork_work_fn work_fn, const char *tag); void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); void xfs_pwork_poll(struct xfs_pwork_ctl *pctl); -unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); #endif /* __XFS_PWORK_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index c134eb4aeaa85b..742d1413e2d024 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1786,105 +1786,35 @@ xfs_qm_vop_chown( xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* - * Take an extra reference, because the inode is going to keep - * this dquot pointer even after the trans_commit. + * Back when we made quota reservations for the chown, we reserved the + * ondisk blocks + delalloc blocks with the new dquot. Now that we've + * switched the dquots, decrease the new dquot's block reservation + * (having already bumped up the real counter) so that we don't have + * any reservation to give back when we commit. */ - *IO_olddq = xfs_qm_dqhold(newdq); + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS, + -ip->i_delayed_blks); - return prevdq; -} - -/* - * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). - */ -int -xfs_qm_vop_chown_reserve( - struct xfs_trans *tp, - struct xfs_inode *ip, - struct xfs_dquot *udqp, - struct xfs_dquot *gdqp, - struct xfs_dquot *pdqp, - uint flags) -{ - struct xfs_mount *mp = ip->i_mount; - uint64_t delblks; - unsigned int blkflags; - struct xfs_dquot *udq_unres = NULL; - struct xfs_dquot *gdq_unres = NULL; - struct xfs_dquot *pdq_unres = NULL; - struct xfs_dquot *udq_delblks = NULL; - struct xfs_dquot *gdq_delblks = NULL; - struct xfs_dquot *pdq_delblks = NULL; - int error; - - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - delblks = ip->i_delayed_blks; - blkflags = XFS_IS_REALTIME_INODE(ip) ? - XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; - - if (XFS_IS_UQUOTA_ON(mp) && udqp && - i_uid_read(VFS_I(ip)) != udqp->q_id) { - udq_delblks = udqp; - /* - * If there are delayed allocation blocks, then we have to - * unreserve those from the old dquot, and add them to the - * new dquot. - */ - if (delblks) { - ASSERT(ip->i_udquot); - udq_unres = ip->i_udquot; - } - } - if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - i_gid_read(VFS_I(ip)) != gdqp->q_id) { - gdq_delblks = gdqp; - if (delblks) { - ASSERT(ip->i_gdquot); - gdq_unres = ip->i_gdquot; - } - } - - if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && - ip->i_d.di_projid != pdqp->q_id) { - pdq_delblks = pdqp; - if (delblks) { - ASSERT(ip->i_pdquot); - pdq_unres = ip->i_pdquot; - } - } - - error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, - udq_delblks, gdq_delblks, pdq_delblks, - ip->i_d.di_nblocks, 1, flags | blkflags); - if (error) - return error; + /* + * Give the incore reservation for delalloc blocks back to the old + * dquot. We don't normally handle delalloc quota reservations + * transactionally, so just lock the dquot and subtract from the + * reservation. Dirty the transaction because it's too late to turn + * back now. + */ + tp->t_flags |= XFS_TRANS_DIRTY; + xfs_dqlock(prevdq); + ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); + prevdq->q_blk.reserved -= ip->i_delayed_blks; + xfs_dqunlock(prevdq); /* - * Do the delayed blks reservations/unreservations now. Since, these - * are done without the help of a transaction, if a reservation fails - * its previous reservations won't be automatically undone by trans - * code. So, we have to do it manually here. + * Take an extra reference, because the inode is going to keep + * this dquot pointer even after the trans_commit. */ - if (delblks) { - /* - * Do the reservations first. Unreservation can't fail. - */ - ASSERT(udq_delblks || gdq_delblks || pdq_delblks); - ASSERT(udq_unres || gdq_unres || pdq_unres); - error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - udq_delblks, gdq_delblks, pdq_delblks, - (xfs_qcnt_t)delblks, 0, flags | blkflags); - if (error) - return error; - xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - udq_unres, gdq_unres, pdq_unres, - -((xfs_qcnt_t)delblks), 0, blkflags); - } + *IO_olddq = xfs_qm_dqhold(newdq); - return 0; + return prevdq; } int diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 5a62398940d022..d00d0130254521 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -81,11 +81,14 @@ extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); -extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, - struct xfs_inode *, int64_t, long, uint); +int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip, + int64_t dblocks, int64_t rblocks, bool force); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); +int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, + struct xfs_dquot *udqp, struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, int64_t dblocks); extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, @@ -95,9 +98,6 @@ extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); -extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, - struct xfs_dquot *, struct xfs_dquot *, - struct xfs_dquot *, uint); extern int xfs_qm_dqattach(struct xfs_inode *); extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc); extern void xfs_qm_dqdetach(struct xfs_inode *); @@ -108,6 +108,11 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *); extern void xfs_qm_unmount(struct xfs_mount *); extern void xfs_qm_unmount_quotas(struct xfs_mount *); +static inline int +xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); +} #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -121,11 +126,12 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, } #define xfs_trans_dup_dqinfo(tp, tp2) #define xfs_trans_free_dqinfo(tp) -#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) +#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0) #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, - struct xfs_inode *ip, int64_t nblks, long ninos, uint flags) + struct xfs_inode *ip, int64_t dblocks, int64_t rblocks, + bool force) { return 0; } @@ -136,26 +142,39 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, { return 0; } + +static inline int +xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return 0; +} + +static inline int +xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks) +{ + return 0; +} + #define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) -#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) #define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) -#define xfs_qm_dqrele(d) -#define xfs_qm_statvfs(ip, s) +#define xfs_qm_dqrele(d) do { (d) = (d); } while(0) +#define xfs_qm_statvfs(ip, s) do { } while(0) #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */ -#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ - xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) -#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ - xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ - f | XFS_QMOPT_RES_REGBLKS) +static inline int +xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks) +{ + return xfs_quota_reserve_blkres(ip, -blocks); +} extern int xfs_mount_reset_sbqflags(struct xfs_mount *); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 6fa05fb78189bb..725c7d8e44381b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -376,16 +376,14 @@ xfs_reflink_allocate_cow( resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); xfs_iunlock(ip, *lockmode); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - *lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, *lockmode); + *lockmode = 0; + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, + false, &tp); if (error) return error; - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out_trans_cancel; + *lockmode = XFS_ILOCK_EXCL; /* * Check for an overlapping extent again now that we dropped the ilock. @@ -398,20 +396,13 @@ xfs_reflink_allocate_cow( goto convert; } - error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(tp, ip, 0); - /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, &nimaps); if (error) - goto out_unreserve; + goto out_trans_cancel; xfs_inode_set_cowblocks_tag(ip); error = xfs_trans_commit(tp); @@ -436,9 +427,6 @@ xfs_reflink_allocate_cow( trace_xfs_reflink_convert_cow(ip, cmap); return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); -out_unreserve: - xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, - XFS_QMOPT_RES_REGBLKS); out_trans_cancel: xfs_trans_cancel(tp); return error; @@ -508,9 +496,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &icur, &got, &del); /* Remove the quota reservation */ - error = xfs_trans_reserve_quota_nblks(NULL, ip, - -(long)del.br_blockcount, 0, - XFS_QMOPT_RES_REGBLKS); + error = xfs_quota_unreserve_blkres(ip, + del.br_blockcount); if (error) break; } else { @@ -628,6 +615,11 @@ xfs_reflink_end_cow_extent( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + goto out_cancel; + /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. In that @@ -997,22 +989,47 @@ xfs_reflink_remap_extent( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_off_t newlen; - int64_t qres, qdelta; + int64_t qdelta = 0; unsigned int resblks; + bool quota_reserved = true; bool smap_real; bool dmap_written = xfs_bmap_is_written_extent(dmap); + int iext_delta = 0; int nimaps; int error; - /* Start a rolling transaction to switch the mappings */ + /* + * Start a rolling transaction to switch the mappings. + * + * Adding a written extent to the extent map can cause a bmbt split, + * and removing a mapped extent from the extent can cause a bmbt split. + * The two operations cannot both cause a split since they operate on + * the same index in the bmap btree, so we only need a reservation for + * one bmbt split if either thing is happening. However, we haven't + * locked the inode yet, so we reserve assuming this is the case. + * + * The first allocation call tries to reserve enough space to handle + * mapping dmap into a sparse part of the file plus the bmbt split. We + * haven't locked the inode or read the existing mapping yet, so we do + * not know for sure that we need the space. This should succeed most + * of the time. + * + * If the first attempt fails, try again but reserving only enough + * space to handle a bmbt split. This is the hard minimum requirement, + * and we revisit quota reservations later when we know more about what + * we're remapping. + */ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + resblks + dmap->br_blockcount, 0, false, &tp); + if (error == -EDQUOT || error == -ENOSPC) { + quota_reserved = false; + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, + resblks, 0, false, &tp); + } if (error) goto out; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * Read what's currently mapped in the destination file into smap. * If smap isn't a hole, we will have to remove it before we can add @@ -1060,15 +1077,9 @@ xfs_reflink_remap_extent( } /* - * Compute quota reservation if we think the quota block counter for + * Increase quota reservation if we think the quota block counter for * this file could increase. * - * Adding a written extent to the extent map can cause a bmbt split, - * and removing a mapped extent from the extent can cause a bmbt split. - * The two operations cannot both cause a split since they operate on - * the same index in the bmap btree, so we only need a reservation for - * one bmbt split if either thing is happening. - * * If we are mapping a written extent into the file, we need to have * enough quota block count reservation to handle the blocks in that * extent. We log only the delta to the quota block counts, so if the @@ -1081,19 +1092,29 @@ xfs_reflink_remap_extent( * count. This is suboptimal, but the VFS flushed the dest range * before we started. That should have removed all the delalloc * reservations, but we code defensively. + * + * xfs_trans_alloc_inode above already tried to grab an even larger + * quota reservation, and kicked off a blockgc scan if it couldn't. + * If we can't get a potentially smaller quota reservation now, we're + * done. */ - qres = qdelta = 0; - if (smap_real || dmap_written) - qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - if (!smap_real && dmap_written) - qres += dmap->br_blockcount; - if (qres > 0) { - error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, - XFS_QMOPT_RES_REGBLKS); + if (!quota_reserved && !smap_real && dmap_written) { + error = xfs_trans_reserve_quota_nblks(tp, ip, + dmap->br_blockcount, 0, false); if (error) goto out_cancel; } + if (smap_real) + ++iext_delta; + + if (dmap_written) + ++iext_delta; + + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); + if (error) + goto out_cancel; + if (smap_real) { /* * If the extent we're unmapping is backed by storage (written diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b4999fb01ff7aa..161b0e8992ba82 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -804,6 +804,11 @@ xfs_growfs_rt_alloc( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, + XFS_IEXT_ADD_NOSPLIT_CNT); + if (error) + goto out_trans_cancel; + /* * Allocate blocks to the bitmap file. */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 813be879a5e51d..21b1d034aca3fb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -35,6 +35,7 @@ #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" +#include "xfs_pwork.h" #include #include @@ -495,40 +496,44 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), 0, mp->m_super->s_id); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; - mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id); - if (!mp->m_eofblocks_workqueue) + mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s", + WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, + 0, mp->m_super->s_id); + if (!mp->m_blockgc_workqueue) goto out_destroy_reclaim; - mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, - mp->m_super->s_id); + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", + XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_eofb; return 0; out_destroy_eofb: - destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_blockgc_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: @@ -546,7 +551,7 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_blockgc_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); @@ -867,39 +872,6 @@ xfs_restore_resvblks(struct xfs_mount *mp) xfs_reserve_blocks(mp, &resblks, NULL); } -/* - * Trigger writeback of all the dirty metadata in the file system. - * - * This ensures that the metadata is written to their location on disk rather - * than just existing in transactions in the log. This means after a quiesce - * there is no log replay required to write the inodes to disk - this is the - * primary difference between a sync and a quiesce. - * - * We cancel log work early here to ensure all transactions the log worker may - * run have finished before we clean up and log the superblock and write an - * unmount record. The unfreeze process is responsible for restarting the log - * worker correctly. - */ -void -xfs_quiesce_attr( - struct xfs_mount *mp) -{ - int error = 0; - - cancel_delayed_work_sync(&mp->m_log->l_work); - - /* force the log to unpin objects from the now complete transactions */ - xfs_log_force(mp, XFS_LOG_SYNC); - - - /* Push the superblock and write an unmount record */ - error = xfs_log_sbcount(mp); - if (error) - xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " - "Frozen image may not be consistent."); - xfs_log_quiesce(mp); -} - /* * Second stage of a freeze. The data is already frozen so we only * need to take care of the metadata. Once that's done sync the superblock @@ -920,10 +892,9 @@ xfs_fs_freeze( * set a GFP_NOFS context here to avoid recursion deadlocks. */ flags = memalloc_nofs_save(); - xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp); xfs_save_resvblks(mp); - xfs_quiesce_attr(mp); - ret = xfs_sync_sb(mp, true); + ret = xfs_log_quiesce(mp); memalloc_nofs_restore(flags); return ret; } @@ -936,7 +907,7 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_start_block_reaping(mp); + xfs_blockgc_start(mp); return 0; } @@ -1720,7 +1691,7 @@ xfs_remount_rw( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } - xfs_start_block_reaping(mp); + xfs_blockgc_start(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1740,10 +1711,10 @@ xfs_remount_ro( * Cancel background eofb scanning so it cannot race with the final * log force+buftarg wait and deadlock the remount. */ - xfs_stop_block_reaping(mp); + xfs_blockgc_stop(mp); /* Get rid of any leftover CoW reservations... */ - error = xfs_icache_free_cowblocks(mp, NULL); + error = xfs_blockgc_free_space(mp, NULL); if (error) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; @@ -1765,7 +1736,7 @@ xfs_remount_ro( */ xfs_save_resvblks(mp); - xfs_quiesce_attr(mp); + xfs_log_clean(mp); mp->m_flags |= XFS_MOUNT_RDONLY; return 0; @@ -1872,8 +1843,6 @@ static int xfs_init_fs_context( mutex_init(&mp->m_growlock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); - INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); - INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; /* * We don't create the finobt per-ag space reservation until after log @@ -2119,11 +2088,12 @@ xfs_init_workqueues(void) * max_active value for this workqueue. */ xfs_alloc_wq = alloc_workqueue("xfsalloc", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0); if (!xfs_alloc_wq) return -ENOMEM; - xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0); + xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND), + 0); if (!xfs_discard_wq) goto out_free_alloc_wq; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index b552cf6d337984..1ca484b8357f7e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -75,6 +75,12 @@ extern void xfs_qm_exit(void); XFS_ASSERT_FATAL_STRING \ XFS_DBG_STRING /* DBG must be last */ +#ifdef DEBUG +# define XFS_WQFLAGS(wqflags) (WQ_SYSFS | (wqflags)) +#else +# define XFS_WQFLAGS(wqflags) (wqflags) +#endif + struct xfs_inode; struct xfs_mount; struct xfs_buftarg; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 1f43fd7f3209ff..8565663b16cdb5 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -197,9 +197,10 @@ xfs_symlink( fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); + error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp, + pdqp, resblks, &tp); if (error) - goto out_release_inode; + goto out_release_dquots; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -212,11 +213,8 @@ xfs_symlink( goto out_trans_cancel; } - /* - * Reserve disk quota : blocks and inode. - */ - error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, - pdqp, resblks, 1, 0); + error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK, + XFS_IEXT_DIR_MANIP_CNT(mp)); if (error) goto out_trans_cancel; @@ -300,6 +298,7 @@ xfs_symlink( } ASSERT(pathlen == 0); } + i_size_write(VFS_I(ip), ip->i_d.di_size); /* * Create the directory entry for the symlink. @@ -342,7 +341,7 @@ xfs_symlink( xfs_finish_inode_setup(ip); xfs_irele(ip); } - +out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index fac9de7ee6d00a..145e06c4774481 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -194,21 +194,12 @@ static struct ctl_table xfs_table[] = { }, { .procname = "speculative_prealloc_lifetime", - .data = &xfs_params.eofb_timer.val, + .data = &xfs_params.blockgc_timer.val, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xfs_params.eofb_timer.min, - .extra2 = &xfs_params.eofb_timer.max, - }, - { - .procname = "speculative_cow_prealloc_lifetime", - .data = &xfs_params.cowb_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &xfs_params.cowb_timer.min, - .extra2 = &xfs_params.cowb_timer.max, + .extra1 = &xfs_params.blockgc_timer.min, + .extra2 = &xfs_params.blockgc_timer.max, }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 8abf4640f1d552..7692e76ead33cc 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -35,8 +35,7 @@ typedef struct xfs_param { xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ - xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ - xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ + xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */ } xfs_param_t; /* diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 120398a37c2a68..9b8d703dc9fd9c 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -29,6 +29,7 @@ #include "xfs_filestream.h" #include "xfs_fsmap.h" #include "xfs_btree_staging.h" +#include "xfs_icache.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0cfd65cd67c190..e74bbb648f83db 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -37,6 +37,7 @@ struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; +struct xfs_eofblocks; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -154,10 +155,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), @@ -358,7 +357,7 @@ DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); -DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); +DEFINE_BUF_EVENT(xfs_buf_drain_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); /* not really buffer traces, but the buf provides useful information */ @@ -3888,6 +3887,47 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \ DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); +DECLARE_EVENT_CLASS(xfs_eofblocks_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, + unsigned long caller_ip), + TP_ARGS(mp, eofb, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__u32, flags) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(prid_t, prid) + __field(__u64, min_file_size) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->flags = eofb ? eofb->eof_flags : 0; + __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns, + eofb->eof_uid) : 0; + __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns, + eofb->eof_gid) : 0; + __entry->prid = eofb ? eofb->eof_prid : 0; + __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags, + __entry->uid, + __entry->gid, + __entry->prid, + __entry->min_file_size, + (char *)__entry->caller_ip) +); +#define DEFINE_EOFBLOCKS_EVENT(name) \ +DEFINE_EVENT(xfs_eofblocks_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \ + unsigned long caller_ip), \ + TP_ARGS(mp, eofb, caller_ip)) +DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); +DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index e72730f85af1b7..44f72c09c20349 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -20,6 +20,10 @@ #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_icache.h" kmem_zone_t *xfs_trans_zone; @@ -285,6 +289,17 @@ xfs_trans_alloc( tp->t_firstblock = NULLFSBLOCK; error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error == -ENOSPC) { + /* + * We weren't able to reserve enough space for the transaction. + * Flush the other speculative space allocations to free space. + * Do not perform a synchronous scan because callers can hold + * other locks. + */ + error = xfs_blockgc_free_space(mp, NULL); + if (!error) + error = xfs_trans_reserve(tp, resp, blocks, rtextents); + } if (error) { xfs_trans_cancel(tp); return error; @@ -1024,3 +1039,183 @@ xfs_trans_roll( tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; return xfs_trans_reserve(*tpp, &tres, 0, 0); } + +/* + * Allocate an transaction, lock and join the inode to it, and reserve quota. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The caller is responsible for + * releasing ILOCK_EXCL if a new transaction is returned. + */ +int +xfs_trans_alloc_inode( + struct xfs_inode *ip, + struct xfs_trans_res *resv, + unsigned int dblocks, + unsigned int rblocks, + bool force, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + bool retried = false; + int error; + +retry: + error = xfs_trans_alloc(mp, resv, dblocks, + rblocks / mp->m_sb.sb_rextsize, + force ? XFS_TRANS_RESERVE : 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_blockgc_free_quota(ip, 0); + retried = true; + goto retry; + } + if (error) + goto out_cancel; + + *tpp = tp; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Allocate an transaction in preparation for inode creation by reserving quota + * against the given dquots. Callers are not required to hold any inode locks. + */ +int +xfs_trans_alloc_icreate( + struct xfs_mount *mp, + struct xfs_trans_res *resv, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + unsigned int dblocks, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + bool retried = false; + int error; + +retry: + error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp); + if (error) + return error; + + error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); + retried = true; + goto retry; + } + if (error) { + xfs_trans_cancel(tp); + return error; + } + + *tpp = tp; + return 0; +} + +/* + * Allocate an transaction, lock and join the inode to it, and reserve quota + * in preparation for inode attribute changes that include uid, gid, or prid + * changes. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The ILOCK will be dropped when the + * transaction is committed or cancelled. + */ +int +xfs_trans_alloc_ichange( + struct xfs_inode *ip, + struct xfs_dquot *new_udqp, + struct xfs_dquot *new_gdqp, + struct xfs_dquot *new_pdqp, + bool force, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *udqp; + struct xfs_dquot *gdqp; + struct xfs_dquot *pdqp; + bool retried = false; + int error; + +retry: + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + /* + * For each quota type, skip quota reservations if the inode's dquots + * now match the ones that came from the caller, or the caller didn't + * pass one in. The inode's dquots can change if we drop the ILOCK to + * perform a blockgc scan, so we must preserve the caller's arguments. + */ + udqp = (new_udqp != ip->i_udquot) ? new_udqp : NULL; + gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL; + pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL; + if (udqp || gdqp || pdqp) { + unsigned int qflags = XFS_QMOPT_RES_REGBLKS; + + if (force) + qflags |= XFS_QMOPT_FORCE_RES; + + /* + * Reserve enough quota to handle blocks on disk and reserved + * for a delayed allocation. We'll actually transfer the + * delalloc reservation between dquots at chown time, even + * though that part is only semi-transactional. + */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, + pdqp, ip->i_d.di_nblocks + ip->i_delayed_blks, + 1, qflags); + if ((error == -EDQUOT || error == -ENOSPC) && !retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0); + retried = true; + goto retry; + } + if (error) + goto out_cancel; + } + + *tpp = tp; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 084658946cc89f..8b03fbfe9a1bda 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -268,4 +268,17 @@ xfs_trans_item_relog( return lip->li_ops->iop_relog(lip, tp); } +struct xfs_dquot; + +int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, + unsigned int dblocks, unsigned int rblocks, bool force, + struct xfs_trans **tpp); +int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, + struct xfs_dquot *udqp, struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, unsigned int dblocks, + struct xfs_trans **tpp); +int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, + struct xfs_trans **tpp); + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 28b8ac7019199a..48e09ea30ee539 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -16,6 +16,7 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_trace.h" +#include "xfs_error.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -691,9 +692,11 @@ xfs_trans_dqresv( nblks); xfs_trans_mod_dquot(tp, dqp, XFS_TRANS_DQ_RES_INOS, ninos); } - ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count); - ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count); - ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count); + + if (XFS_IS_CORRUPT(mp, dqp->q_blk.reserved < dqp->q_blk.count) || + XFS_IS_CORRUPT(mp, dqp->q_rtb.reserved < dqp->q_rtb.count) || + XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count)) + goto error_corrupt; xfs_dqunlock(dqp); return 0; @@ -703,6 +706,10 @@ xfs_trans_dqresv( if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ) return -ENOSPC; return -EDQUOT; +error_corrupt: + xfs_dqunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; } @@ -780,28 +787,60 @@ int xfs_trans_reserve_quota_nblks( struct xfs_trans *tp, struct xfs_inode *ip, - int64_t nblks, - long ninos, - uint flags) + int64_t dblocks, + int64_t rblocks, + bool force) { struct xfs_mount *mp = ip->i_mount; + unsigned int qflags = 0; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS || - (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS); - /* - * Reserve nblks against these dquots, with trans as the mediator. - */ - return xfs_trans_reserve_quota_bydquots(tp, mp, - ip->i_udquot, ip->i_gdquot, - ip->i_pdquot, - nblks, ninos, flags); + if (force) + qflags |= XFS_QMOPT_FORCE_RES; + + /* Reserve data device quota against the inode's dquots. */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, dblocks, 0, + XFS_QMOPT_RES_REGBLKS | qflags); + if (error) + return error; + + /* Do the same but for realtime blocks. */ + error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, rblocks, 0, + XFS_QMOPT_RES_RTBLKS | qflags); + if (error) { + xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, -dblocks, 0, + XFS_QMOPT_RES_REGBLKS); + return error; + } + + return 0; +} + +/* Change the quota reservations for an inode creation activity. */ +int +xfs_trans_reserve_quota_icreate( + struct xfs_trans *tp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + int64_t dblocks) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp, + dblocks, 1, XFS_QMOPT_RES_REGBLKS); } /*