From 3acdfd280fe7d807237f2cb7a09d6f8f7f1b484f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 24 Jul 2017 06:22:15 -0400 Subject: [PATCH 01/10] errseq: rename __errseq_set to errseq_set Nothing calls this wrapper anymore, so just remove it and rename the old function to get rid of the double underscore prefix. Signed-off-by: Jeff Layton --- include/linux/errseq.h | 9 +-------- lib/errseq.c | 17 +++++++---------- mm/filemap.c | 2 +- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/include/linux/errseq.h b/include/linux/errseq.h index 9e0d444ac88df1..784f0860527be4 100644 --- a/include/linux/errseq.h +++ b/include/linux/errseq.h @@ -5,14 +5,7 @@ typedef u32 errseq_t; -errseq_t __errseq_set(errseq_t *eseq, int err); -static inline void errseq_set(errseq_t *eseq, int err) -{ - /* Optimize for the common case of no error */ - if (unlikely(err)) - __errseq_set(eseq, err); -} - +errseq_t errseq_set(errseq_t *eseq, int err); errseq_t errseq_sample(errseq_t *eseq); int errseq_check(errseq_t *eseq, errseq_t since); int errseq_check_and_advance(errseq_t *eseq, errseq_t *since); diff --git a/lib/errseq.c b/lib/errseq.c index 841fa24e6e00e4..7b900c2a277ad3 100644 --- a/lib/errseq.c +++ b/lib/errseq.c @@ -41,23 +41,20 @@ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** - * __errseq_set - set a errseq_t for later reporting + * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set - * @err: error to set + * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in *eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * - * Most callers will want to use the errseq_set inline wrapper to efficiently - * handle the common case where err is 0. - * - * We do return an errseq_t here, primarily for debugging purposes. The return - * value should not be used as a previously sampled value in later calls as it - * will not have the SEEN flag set. + * We do return the latest value here, primarily for debugging purposes. The + * return value should not be used as a previously sampled value in later calls + * as it will not have the SEEN flag set. */ -errseq_t __errseq_set(errseq_t *eseq, int err) +errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; @@ -107,7 +104,7 @@ errseq_t __errseq_set(errseq_t *eseq, int err) } return cur; } -EXPORT_SYMBOL(__errseq_set); +EXPORT_SYMBOL(errseq_set); /** * errseq_sample - grab current errseq_t value diff --git a/mm/filemap.c b/mm/filemap.c index a49702445ce05b..e1cca770688fe6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -589,7 +589,7 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { - errseq_t eseq = __errseq_set(&mapping->wb_err, err); + errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } From 80aafd50b6a4fa6b6bba4b451b553d5d221f59ff Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 24 Jul 2017 06:22:16 -0400 Subject: [PATCH 02/10] Documentation: add some docs for errseq_t ...and fix up a few comments in the code. Signed-off-by: Jeff Layton --- Documentation/errseq.rst | 149 +++++++++++++++++++++++++++++++++++++++ include/linux/errseq.h | 5 +- include/linux/fs.h | 2 - 3 files changed, 152 insertions(+), 4 deletions(-) create mode 100644 Documentation/errseq.rst diff --git a/Documentation/errseq.rst b/Documentation/errseq.rst new file mode 100644 index 00000000000000..4c29bd5afbc570 --- /dev/null +++ b/Documentation/errseq.rst @@ -0,0 +1,149 @@ +The errseq_t datatype +===================== +An errseq_t is a way of recording errors in one place, and allowing any +number of "subscribers" to tell whether it has changed since a previous +point where it was sampled. + +The initial use case for this is tracking errors for file +synchronization syscalls (fsync, fdatasync, msync and sync_file_range), +but it may be usable in other situations. + +It's implemented as an unsigned 32-bit value. The low order bits are +designated to hold an error code (between 1 and MAX_ERRNO). The upper bits +are used as a counter. This is done with atomics instead of locking so that +these functions can be called from any context. + +Note that there is a risk of collisions if new errors are being recorded +frequently, since we have so few bits to use as a counter. + +To mitigate this, the bit between the error value and counter is used as +a flag to tell whether the value has been sampled since a new value was +recorded. That allows us to avoid bumping the counter if no one has +sampled it since the last time an error was recorded. + +Thus we end up with a value that looks something like this:: + + bit: 31..13 12 11..0 + +-----------------+----+----------------+ + | counter | SF | errno | + +-----------------+----+----------------+ + +The general idea is for "watchers" to sample an errseq_t value and keep +it as a running cursor. That value can later be used to tell whether +any new errors have occurred since that sampling was done, and atomically +record the state at the time that it was checked. This allows us to +record errors in one place, and then have a number of "watchers" that +can tell whether the value has changed since they last checked it. + +A new errseq_t should always be zeroed out. An errseq_t value of all zeroes +is the special (but common) case where there has never been an error. An all +zero value thus serves as the "epoch" if one wishes to know whether there +has ever been an error set since it was first initialized. + +API usage +========= +Let me tell you a story about a worker drone. Now, he's a good worker +overall, but the company is a little...management heavy. He has to +report to 77 supervisors today, and tomorrow the "big boss" is coming in +from out of town and he's sure to test the poor fellow too. + +They're all handing him work to do -- so much he can't keep track of who +handed him what, but that's not really a big problem. The supervisors +just want to know when he's finished all of the work they've handed him so +far and whether he made any mistakes since they last asked. + +He might have made the mistake on work they didn't actually hand him, +but he can't keep track of things at that level of detail, all he can +remember is the most recent mistake that he made. + +Here's our worker_drone representation:: + + struct worker_drone { + errseq_t wd_err; /* for recording errors */ + }; + +Every day, the worker_drone starts out with a blank slate:: + + struct worker_drone wd; + + wd.wd_err = (errseq_t)0; + +The supervisors come in and get an initial read for the day. They +don't care about anything that happened before their watch begins:: + + struct supervisor { + errseq_t s_wd_err; /* private "cursor" for wd_err */ + spinlock_t s_wd_err_lock; /* protects s_wd_err */ + } + + struct supervisor su; + + su.s_wd_err = errseq_sample(&wd.wd_err); + spin_lock_init(&su.s_wd_err_lock); + +Now they start handing him tasks to do. Every few minutes they ask him to +finish up all of the work they've handed him so far. Then they ask him +whether he made any mistakes on any of it:: + + spin_lock(&su.su_wd_err_lock); + err = errseq_check_and_advance(&wd.wd_err, &su.s_wd_err); + spin_unlock(&su.su_wd_err_lock); + +Up to this point, that just keeps returning 0. + +Now, the owners of this company are quite miserly and have given him +substandard equipment with which to do his job. Occasionally it +glitches and he makes a mistake. He sighs a heavy sigh, and marks it +down:: + + errseq_set(&wd.wd_err, -EIO); + +...and then gets back to work. The supervisors eventually poll again +and they each get the error when they next check. Subsequent calls will +return 0, until another error is recorded, at which point it's reported +to each of them once. + +Note that the supervisors can't tell how many mistakes he made, only +whether one was made since they last checked, and the latest value +recorded. + +Occasionally the big boss comes in for a spot check and asks the worker +to do a one-off job for him. He's not really watching the worker +full-time like the supervisors, but he does need to know whether a +mistake occurred while his job was processing. + +He can just sample the current errseq_t in the worker, and then use that +to tell whether an error has occurred later:: + + errseq_t since = errseq_sample(&wd.wd_err); + /* submit some work and wait for it to complete */ + err = errseq_check(&wd.wd_err, since); + +Since he's just going to discard "since" after that point, he doesn't +need to advance it here. He also doesn't need any locking since it's +not usable by anyone else. + +Serializing errseq_t cursor updates +=================================== +Note that the errseq_t API does not protect the errseq_t cursor during a +check_and_advance_operation. Only the canonical error code is handled +atomically. In a situation where more than one task might be using the +same errseq_t cursor at the same time, it's important to serialize +updates to that cursor. + +If that's not done, then it's possible for the cursor to go backward +in which case the same error could be reported more than once. + +Because of this, it's often advantageous to first do an errseq_check to +see if anything has changed, and only later do an +errseq_check_and_advance after taking the lock. e.g.:: + + if (errseq_check(&wd.wd_err, READ_ONCE(su.s_wd_err)) { + /* su.s_wd_err is protected by s_wd_err_lock */ + spin_lock(&su.s_wd_err_lock); + err = errseq_check_and_advance(&wd.wd_err, &su.s_wd_err); + spin_unlock(&su.s_wd_err_lock); + } + +That avoids the spinlock in the common case where nothing has changed +since the last time it was checked. diff --git a/include/linux/errseq.h b/include/linux/errseq.h index 784f0860527be4..f746bd8fe4d087 100644 --- a/include/linux/errseq.h +++ b/include/linux/errseq.h @@ -1,8 +1,9 @@ +/* + * See Documentation/errseq.rst and lib/errseq.c + */ #ifndef _LINUX_ERRSEQ_H #define _LINUX_ERRSEQ_H -/* See lib/errseq.c for more info */ - typedef u32 errseq_t; errseq_t errseq_set(errseq_t *eseq, int err); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b5d6816542b7f..21e7df1ad613df 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2571,8 +2571,6 @@ extern int __must_check file_write_and_wait_range(struct file *file, * When a writeback error occurs, most filesystems will want to call * filemap_set_wb_err to record the error in the mapping so that it will be * automatically reported whenever fsync is called on the file. - * - * FIXME: mention FS_* flag here? */ static inline void filemap_set_wb_err(struct address_space *mapping, int err) { From 9326c9b20dd813248280cca1e1d1c05e939dae15 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 26 Jul 2017 10:21:11 -0400 Subject: [PATCH 03/10] mm: consolidate dax / non-dax checks for writeback We have this complex conditional copied to several places. Turn it into a helper function. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton --- mm/filemap.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index e1cca770688fe6..72e46e6f0d9a0c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -522,12 +522,17 @@ int filemap_fdatawait(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawait); +static bool mapping_needs_writeback(struct address_space *mapping) +{ + return (!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional); +} + int filemap_write_and_wait(struct address_space *mapping) { int err = 0; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be @@ -566,8 +571,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -656,8 +660,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) int err = 0, err2; struct address_space *mapping = file->f_mapping; - if ((!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional)) { + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ From 7e51fe1dd180e525c9cb9dc613c524c83c130867 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 22 Jul 2017 09:27:43 -0400 Subject: [PATCH 04/10] fuse: convert to errseq_t based error tracking for fsync Change to file_write_and_wait_range and file_check_and_advance_wb_err Signed-off-by: Jeff Layton --- fs/fuse/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ee4fdc3da9ec3..e2ffc499d106c3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -457,7 +457,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) goto out; @@ -465,10 +465,10 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, /* * Due to implementation of fuse writeback - * filemap_write_and_wait_range() does not catch errors. + * file_write_and_wait_range() does not catch errors. * We have to do this directly after fuse_sync_writes() */ - err = filemap_check_errors(file->f_mapping); + err = file_check_and_advance_wb_err(file); if (err) goto out; From a823e4589e68996436d16db4ede9a43b646332f9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Jul 2017 07:24:43 -0400 Subject: [PATCH 05/10] mm: add file_fdatawait_range and file_write_and_wait Necessary now for gfs2_fsync and sync_file_range, but there will eventually be other callers. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton --- include/linux/fs.h | 11 ++++++++++- mm/filemap.c | 23 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 21e7df1ad613df..af592ca3d50965 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2544,6 +2544,8 @@ extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, + loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); @@ -2552,12 +2554,19 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping, extern int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); extern int filemap_check_errors(struct address_space *mapping); - extern void __filemap_set_wb_err(struct address_space *mapping, int err); + +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, + loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); +static inline int file_write_and_wait(struct file *file) +{ + return file_write_and_wait_range(file, 0, LLONG_MAX); +} + /** * filemap_set_wb_err - set a writeback error on an address_space * @mapping: mapping in which to set writeback error diff --git a/mm/filemap.c b/mm/filemap.c index 72e46e6f0d9a0c..394bb5e96f87eb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -475,6 +475,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, } EXPORT_SYMBOL(filemap_fdatawait_range); +/** + * file_fdatawait_range - wait for writeback to complete + * @file: file pointing to address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the address space that file + * refers to, in the given range and wait for all of them. Check error + * status of the address space vs. the file->f_wb_err cursor and return it. + * + * Since the error status of the file is advanced by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. + */ +int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) +{ + struct address_space *mapping = file->f_mapping; + + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return file_check_and_advance_wb_err(file); +} +EXPORT_SYMBOL(file_fdatawait_range); + /** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for From 6454568d961bc5de316014548838398174a128c4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 24 Jul 2017 06:22:14 -0400 Subject: [PATCH 06/10] fs: convert sync_file_range to use errseq_t based error-tracking sync_file_range doesn't call down into the filesystem directly at all. It only kicks off writeback of pagecache pages and optionally waits on the result. Convert sync_file_range to use errseq_t based error tracking, under the assumption that most users will prefer this behavior when errors occur. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton --- fs/sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 2a54c1f2203595..27d6b8bbcb6a51 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -342,7 +342,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = filemap_fdatawait_range(mapping, offset, endbyte); + ret = file_fdatawait_range(f.file, offset, endbyte); if (ret < 0) goto out_put; } @@ -355,7 +355,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, } if (flags & SYNC_FILE_RANGE_WAIT_AFTER) - ret = filemap_fdatawait_range(mapping, offset, endbyte); + ret = file_fdatawait_range(f.file, offset, endbyte); out_put: fdput(f); From d07a6ac7b6f878c1078b75181cdae060daac5820 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 7 Jul 2017 15:20:53 -0400 Subject: [PATCH 07/10] gfs2: convert to errseq_t based writeback error reporting for fsync Also, fix a place where a writeback error might get dropped in the gfs2_is_jdata case. Signed-off-by: Jeff Layton --- fs/gfs2/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c2062a108d1960..c53ac6efd04c5e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -668,12 +668,14 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; if (gfs2_is_jdata(ip)) - filemap_write_and_wait(mapping); + ret = file_write_and_wait(file); + if (ret) + return ret; gfs2_ail_flush(ip->i_gl, 1); } if (mapping->nrpages) - ret = filemap_fdatawait_range(mapping, start, end); + ret = file_fdatawait_range(file, start, end); return ret ? ret : ret1; } From 3b49c9a1e984b524142afc7536041d8c66877113 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 7 Jul 2017 15:20:52 -0400 Subject: [PATCH 08/10] fs: convert a pile of fsync routines to errseq_t based reporting This patch converts most of the in-kernel filesystems that do writeback out of the pagecache to report errors using the errseq_t-based infrastructure that was recently added. This allows them to report errors once for each open file description. Most filesystems have a fairly straightforward fsync operation. They call filemap_write_and_wait_range to write back all of the data and wait on it, and then (sometimes) sync out the metadata. For those filesystems this is a straightforward conversion from calling filemap_write_and_wait_range in their fsync operation to calling file_write_and_wait_range. Acked-by: Jan Kara Acked-by: Dave Kleikamp Signed-off-by: Jeff Layton --- arch/powerpc/platforms/cell/spufs/file.c | 2 +- drivers/staging/lustre/lustre/llite/file.c | 2 +- drivers/video/fbdev/core/fb_defio.c | 2 +- fs/9p/vfs_file.c | 4 ++-- fs/affs/file.c | 2 +- fs/afs/write.c | 2 +- fs/cifs/file.c | 4 ++-- fs/exofs/file.c | 2 +- fs/f2fs/file.c | 2 +- fs/hfs/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/file.c | 2 +- fs/jffs2/file.c | 2 +- fs/jfs/file.c | 2 +- fs/ncpfs/file.c | 2 +- fs/ntfs/dir.c | 2 +- fs/ntfs/file.c | 2 +- fs/ocfs2/file.c | 2 +- fs/reiserfs/dir.c | 2 +- fs/reiserfs/file.c | 2 +- fs/ubifs/file.c | 2 +- 22 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index ae2f740a82f149..5ffcdeb1eb1768 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1749,7 +1749,7 @@ static int spufs_mfc_flush(struct file *file, fl_owner_t id) static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file_inode(file); - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = file_write_and_wait_range(file, start, end); if (!err) { inode_lock(inode); err = spufs_mfc_flush(file, NULL); diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index ab1c85c1ed3896..f7d07735ac6676 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -2364,7 +2364,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) PFID(ll_inode2fid(inode)), inode); ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); inode_lock(inode); /* catch async errors that were recorded back when async writeback diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 37f69c061210ed..487d5e336e1b6c 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -69,7 +69,7 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy { struct fb_info *info = file->private_data; struct inode *inode = file_inode(file); - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = file_write_and_wait_range(file, start, end); if (err) return err; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3de3b4a89d89a1..4802d75b3cf798 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -445,7 +445,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, struct p9_wstat wstat; int retval; - retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; @@ -468,7 +468,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int retval; - retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; diff --git a/fs/affs/file.c b/fs/affs/file.c index 196ee7f6fdc40e..00331810f69084 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -954,7 +954,7 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct inode *inode = filp->f_mapping->host; int ret, err; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; diff --git a/fs/afs/write.c b/fs/afs/write.c index 2d2fccd5044bcd..106e43db11153f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -714,7 +714,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) vnode->fid.vid, vnode->fid.vnode, file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; inode_lock(inode); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bc09df6b473ae3..0786f19d288f8e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2329,7 +2329,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, struct inode *inode = file_inode(file); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; inode_lock(inode); @@ -2371,7 +2371,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct inode *inode = file->f_mapping->host; - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; inode_lock(inode); diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 28645f0640f735..a94594ea2aa346 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -48,7 +48,7 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a0e6d2c65a9ec0..40fb3d4bb9c256 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -206,7 +206,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(inode, FI_NEED_IPU); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); clear_inode_flag(inode, FI_NEED_IPU); if (ret) { diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index bfbba799430f15..2538b49cc349e9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, struct super_block * sb; int ret, err; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; inode_lock(inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index e8638d52819519..4f26b687713052 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -283,7 +283,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); int error = 0, error2; - error = filemap_write_and_wait_range(inode->i_mapping, start, end); + error = file_write_and_wait_range(file, start, end); if (error) return error; inode_lock(inode); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e61261a7417e57..c148e7f4f45197 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -374,7 +374,7 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, struct inode *inode = file->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b3be1b5a62e213..f26138425b16ed 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -24,7 +24,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(file->f_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; return sync_blockdev(inode->i_sb->s_bdev); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index c12476e309c67c..bd0428bebe9b77 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -35,7 +35,7 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 739492c7a3fd9b..36665fd37095ed 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -34,7 +34,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; int rc = 0; - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 76965e7722643e..a06c07619ee660 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -23,7 +23,7 @@ static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - return filemap_write_and_wait_range(file->f_mapping, start, end); + return file_write_and_wait_range(file, start, end); } /* diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 0ee19ecc982d4e..1a24be9e8405d9 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1506,7 +1506,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - err = filemap_write_and_wait_range(vi->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; inode_lock(vi); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c4f68c338735b9..331910fa84422c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1989,7 +1989,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - err = filemap_write_and_wait_range(vi->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; inode_lock(vi); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bfeb647459d95e..66e59d3163ea6d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -196,7 +196,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 45aa05e2232f9a..5b50689d853964 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -34,7 +34,7 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int err; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index b396eb09f28886..843aadcc123c2a 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -154,7 +154,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, int err; int barrier_done; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8cad0b19b404cc..f90a466ea5db30 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1337,7 +1337,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ return 0; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; inode_lock(inode); From ffb959bbdf923b4f89a08a04aba2501b1b16d164 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 31 Jul 2017 10:29:38 -0400 Subject: [PATCH 09/10] mm: remove optimizations based on i_size in mapping writeback waits Marcelo added this i_size based optimization with a patch in 2004 (commitid is from the linux-history tree): commit 765dad09b4ac101a32d87af2bb793c3060497d3c Author: Marcelo Tosatti Date: Tue Sep 7 17:51:17 2004 -0700 small wait_on_page_writeback_range() optimization filemap_fdatawait() calls wait_on_page_writeback_range() with -1 as "end" parameter. This is not needed since we know the EOF from the inode. Use that instead. There may be races here, particularly with clustered or network filesystems. It also seems like a bit of a layering violation since we're operating on an address_space here, not an inode. Finally, it's also questionable whether this optimization really helps on workloads that we care about. Should we be optimizing for writeback vs. truncate races in a codepath where we expect to wait anyway? It doesn't seem worth the risk. Remove this optimization from the filemap_fdatawait codepaths. This means that filemap_fdatawait becomes a trivial wrapper around filemap_fdatawait_range. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton --- include/linux/fs.h | 7 ++++++- mm/filemap.c | 30 +----------------------------- 2 files changed, 7 insertions(+), 30 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index af592ca3d50965..909210bd63665a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2538,10 +2538,15 @@ extern int invalidate_inode_pages2_range(struct address_space *mapping, extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); -extern int filemap_fdatawait(struct address_space *); extern int filemap_fdatawait_keep_errors(struct address_space *mapping); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); + +static inline int filemap_fdatawait(struct address_space *mapping) +{ + return filemap_fdatawait_range(mapping, 0, LLONG_MAX); +} + extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, diff --git a/mm/filemap.c b/mm/filemap.c index 394bb5e96f87eb..85dfe3bee324a6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -512,39 +512,11 @@ EXPORT_SYMBOL(file_fdatawait_range); */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - __filemap_fdatawait_range(mapping, 0, i_size - 1); + __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); -/** - * filemap_fdatawait - wait for all under-writeback pages to complete - * @mapping: address space structure to wait for - * - * Walk the list of under-writeback pages of the given address space - * and wait for all of them. Check error status of the address space - * and return it. - * - * Since the error status of the address space is cleared by this function, - * callers are responsible for checking the return value and handling and/or - * reporting the error. - */ -int filemap_fdatawait(struct address_space *mapping) -{ - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - return filemap_fdatawait_range(mapping, 0, i_size - 1); -} -EXPORT_SYMBOL(filemap_fdatawait); - static bool mapping_needs_writeback(struct address_space *mapping) { return (!dax_mapping(mapping) && mapping->nrpages) || From 6d4b51241394664fffbf68ea86c96d2699344583 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 2 Aug 2017 10:14:21 -0400 Subject: [PATCH 10/10] ecryptfs: convert to file_write_and_wait in ->fsync This change is mainly for documentation/completeness, as ecryptfs never calls mapping_set_error, and so will never return a previous writeback error. Signed-off-by: Jeff Layton --- fs/ecryptfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index ca4e83750214ad..c74ed3ca3372f8 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -328,7 +328,7 @@ ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int rc; - rc = filemap_write_and_wait(file->f_mapping); + rc = file_write_and_wait(file); if (rc) return rc;