Skip to content

Commit

Permalink
Merge tag 'pull-work.iov_iter-base' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Browse files (browse the repository at this point in the history)

Pull vfs iov_iter updates from Al Viro:
 "Part 1 - isolated cleanups and optimizations.

  One of the goals is to reduce the overhead of using ->read_iter() and
  ->write_iter() instead of ->read()/->write().

  new_sync_{read,write}() has a surprising amount of overhead, in
  particular inside iocb_flags(). That's why the beginning of the
  series is in this pile; it's not directly iov_iter-related, but
  it's a part of the same work..."

* tag 'pull-work.iov_iter-base' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
  first_iovec_segment(): just return address
  iov_iter: massage calling conventions for first_{iovec,bvec}_segment()
  iov_iter: first_{iovec,bvec}_segment() - simplify a bit
  iov_iter: lift dealing with maxpages out of first_{iovec,bvec}_segment()
  iov_iter_get_pages{,_alloc}(): cap the maxsize with MAX_RW_COUNT
  iov_iter_bvec_advance(): don't bother with bvec_iter
  copy_page_{to,from}_iter(): switch iovec variants to generic
  keep iocb_flags() result cached in struct file
  iocb: delay evaluation of IS_SYNC(...) until we want to check IOCB_DSYNC
  struct file: use anonymous union member for rcuhead and llist
  btrfs: use IOMAP_DIO_NOSYNC
  teach iomap_dio_rw() to suppress dsync
  No need of likely/unlikely on calls of check_copy_size()
  • Loading branch information
torvalds committed Aug 3, 2022
2 parents 200e340 + dd45ab9 commit 5264406
Show file tree
Hide file tree
Showing 20 changed files with 113 additions and 296 deletions.
2 changes: 1 addition & 1 deletion arch/powerpc/include/asm/uaccess.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned long size)
static inline unsigned long __must_check
copy_mc_to_user(void __user *to, const void *from, unsigned long n)
{
if (likely(check_copy_size(from, n, true))) {
if (check_copy_size(from, n, true)) {
if (access_ok(to, n)) {
allow_write_to_user(to, n);
n = copy_mc_generic((void *)to, from, n);
Expand Down
4 changes: 2 additions & 2 deletions arch/s390/include/asm/uaccess.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ _copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned
static __always_inline unsigned long __must_check
copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key)
{
if (likely(check_copy_size(to, n, false)))
if (check_copy_size(to, n, false))
n = _copy_from_user_key(to, from, n, key);
return n;
}
Expand All @@ -50,7 +50,7 @@ _copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned l
static __always_inline unsigned long __must_check
copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key)
{
if (likely(check_copy_size(from, n, true)))
if (check_copy_size(from, n, true))
n = _copy_to_user_key(to, from, n, key);
return n;
}
Expand Down
2 changes: 1 addition & 1 deletion block/fops.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;

/* avoid the need for an I/O completion work item */
if (iocb->ki_flags & IOCB_DSYNC)
if (iocb_is_dsync(iocb))
opf |= REQ_FUA;
return opf;
}
Expand Down
2 changes: 1 addition & 1 deletion drivers/nvme/target/io-cmd-file.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,

iocb->ki_pos = pos;
iocb->ki_filp = req->ns->file;
iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
iocb->ki_flags = ki_flags | iocb->ki_filp->f_iocb_flags;

return call_iter(iocb, &iter);
}
Expand Down
2 changes: 1 addition & 1 deletion fs/aio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1475,7 +1475,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
req->ki_flags = iocb_flags(req->ki_filp);
req->ki_flags = req->ki_filp->f_iocb_flags;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
Expand Down
19 changes: 1 addition & 18 deletions fs/btrfs/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -1848,7 +1848,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,

static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Expand Down Expand Up @@ -1901,15 +1900,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
goto buffered;
}

/*
* We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
* calls generic_write_sync() (through iomap_dio_complete()), because
* that results in calling fsync (btrfs_sync_file()) which will try to
* lock the inode in exclusive/write mode.
*/
if (is_sync_write)
iocb->ki_flags &= ~IOCB_DSYNC;

/*
* The iov_iter can be mapped to the same file range we are writing to.
* If that's the case, then we will deadlock in the iomap code, because
Expand Down Expand Up @@ -1964,13 +1954,6 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)

btrfs_inode_unlock(inode, ilock_flags);

/*
* Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
* the fsync (call generic_write_sync()).
*/
if (is_sync_write)
iocb->ki_flags |= IOCB_DSYNC;

/* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
goto out;
Expand Down Expand Up @@ -2038,7 +2021,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
struct file *file = iocb->ki_filp;
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written, num_sync;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
const bool sync = iocb_is_dsync(iocb);

/*
* If the fs flips readonly due to some impossible error, although we
Expand Down
3 changes: 2 additions & 1 deletion fs/btrfs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -8165,7 +8165,8 @@ ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_befo
struct btrfs_dio_data data;

return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, &data, done_before);
IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC,
&data, done_before);
}

static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
Expand Down
2 changes: 1 addition & 1 deletion fs/direct-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -1216,7 +1216,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
*/
if (dio->is_async && iov_iter_rw(iter) == WRITE) {
retval = 0;
if (iocb->ki_flags & IOCB_DSYNC)
if (iocb_is_dsync(iocb))
retval = dio_set_defer_completion(dio);
else if (!dio->inode->i_sb->s_dio_done_wq) {
/*
Expand Down
1 change: 1 addition & 0 deletions fs/fcntl.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
}
spin_lock(&filp->f_lock);
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
filp->f_iocb_flags = iocb_flags(filp);
spin_unlock(&filp->f_lock);

out:
Expand Down
17 changes: 9 additions & 8 deletions fs/file_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp;

static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
struct file *f = container_of(head, struct file, f_rcuhead);

put_cred(f->f_cred);
kmem_cache_free(filp_cachep, f);
Expand All @@ -56,7 +56,7 @@ static inline void file_free(struct file *f)
security_file_free(f);
if (!(f->f_mode & FMODE_NOACCOUNT))
percpu_counter_dec(&nr_files);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
call_rcu(&f->f_rcuhead, file_free_rcu);
}

/*
Expand Down Expand Up @@ -142,7 +142,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
f->f_cred = get_cred(cred);
error = security_file_alloc(f);
if (unlikely(error)) {
file_free_rcu(&f->f_u.fu_rcuhead);
file_free_rcu(&f->f_rcuhead);
return ERR_PTR(error);
}

Expand Down Expand Up @@ -243,6 +243,7 @@ static struct file *alloc_file(const struct path *path, int flags,
if ((file->f_mode & FMODE_WRITE) &&
likely(fop->write || fop->write_iter))
file->f_mode |= FMODE_CAN_WRITE;
file->f_iocb_flags = iocb_flags(file);
file->f_mode |= FMODE_OPENED;
file->f_op = fop;
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
Expand Down Expand Up @@ -343,13 +344,13 @@ static void delayed_fput(struct work_struct *unused)
struct llist_node *node = llist_del_all(&delayed_fput_list);
struct file *f, *t;

llist_for_each_entry_safe(f, t, node, f_u.fu_llist)
llist_for_each_entry_safe(f, t, node, f_llist)
__fput(f);
}

static void ____fput(struct callback_head *work)
{
__fput(container_of(work, struct file, f_u.fu_rcuhead));
__fput(container_of(work, struct file, f_rcuhead));
}

/*
Expand All @@ -376,8 +377,8 @@ void fput(struct file *file)
struct task_struct *task = current;

if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
init_task_work(&file->f_u.fu_rcuhead, ____fput);
if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
init_task_work(&file->f_rcuhead, ____fput);
if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
return;
/*
* After this task has run exit_task_work(),
Expand All @@ -386,7 +387,7 @@ void fput(struct file *file)
*/
}

if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
if (llist_add(&file->f_llist, &delayed_fput_list))
schedule_delayed_work(&delayed_fput_work, 1);
}
}
Expand Down
2 changes: 1 addition & 1 deletion fs/fuse/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -1042,7 +1042,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb)
{
unsigned int flags = iocb->ki_filp->f_flags;

if (iocb->ki_flags & IOCB_DSYNC)
if (iocb_is_dsync(iocb))
flags |= O_DSYNC;
if (iocb->ki_flags & IOCB_SYNC)
flags |= O_SYNC;
Expand Down
19 changes: 10 additions & 9 deletions fs/iomap/direct-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -548,17 +548,18 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}

/* for data sync or sync, we need sync completion processing */
if (iocb->ki_flags & IOCB_DSYNC)
if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) {
dio->flags |= IOMAP_DIO_NEED_SYNC;

/*
* For datasync only writes, we optimistically try using FUA for
* this IO. Any non-FUA write that occurs will clear this flag,
* hence we know before completion whether a cache flush is
* necessary.
*/
if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
dio->flags |= IOMAP_DIO_WRITE_FUA;
/*
* For datasync only writes, we optimistically try
* using FUA for this IO. Any non-FUA write that
* occurs will clear this flag, hence we know before
* completion whether a cache flush is necessary.
*/
if (!(iocb->ki_flags & IOCB_SYNC))
dio->flags |= IOMAP_DIO_WRITE_FUA;
}
}

if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
Expand Down
1 change: 1 addition & 0 deletions fs/open.c
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,7 @@ static int do_dentry_open(struct file *f,
f->f_mode |= FMODE_CAN_ODIRECT;

f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
f->f_iocb_flags = iocb_flags(f);

file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

Expand Down
2 changes: 1 addition & 1 deletion fs/zonefs/super.c
Original file line number Diff line number Diff line change
Expand Up @@ -779,7 +779,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
bio->bi_iter.bi_sector = zi->i_zsector;
bio->bi_ioprio = iocb->ki_ioprio;
if (iocb->ki_flags & IOCB_DSYNC)
if (iocb_is_dsync(iocb))
bio->bi_opf |= REQ_FUA;

ret = bio_iov_iter_get_pages(bio, from);
Expand Down
21 changes: 13 additions & 8 deletions include/linux/fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -943,9 +943,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)

struct file {
union {
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct llist_node f_llist;
struct rcu_head f_rcuhead;
unsigned int f_iocb_flags;
};
struct path f_path;
struct inode *f_inode; /* cached value */
const struct file_operations *f_op;
Expand Down Expand Up @@ -2328,13 +2329,11 @@ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
!vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode));
}

static inline int iocb_flags(struct file *file);

static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
.ki_filp = filp,
.ki_flags = iocb_flags(filp),
.ki_flags = filp->f_iocb_flags,
.ki_ioprio = get_current_ioprio(),
};
}
Expand Down Expand Up @@ -2850,14 +2849,20 @@ extern int vfs_fsync(struct file *file, int datasync);
extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
unsigned int flags);

/*
 * Report whether the write described by @iocb has to be treated as a
 * data-sync write.
 *
 * Returns true when IOCB_DSYNC is set in iocb->ki_flags, or, failing
 * that, when IS_SYNC() holds for the inode hosting the file's mapping.
 * The IS_SYNC() test is only evaluated when the flag bit is not set.
 */
static inline bool iocb_is_dsync(const struct kiocb *iocb)
{
	/* Explicit per-request/per-file flag: no need to look at the inode. */
	if (iocb->ki_flags & IOCB_DSYNC)
		return true;

	/* Otherwise the inode itself may demand synchronous behaviour. */
	return IS_SYNC(iocb->ki_filp->f_mapping->host);
}

/*
* Sync the bytes written if this was a synchronous write. Expect ki_pos
* to already be updated for the write, and will return either the amount
* of bytes passed in, or an error if syncing the file failed.
*/
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
if (iocb->ki_flags & IOCB_DSYNC) {
if (iocb_is_dsync(iocb)) {
int ret = vfs_fsync_range(iocb->ki_filp,
iocb->ki_pos - count, iocb->ki_pos - 1,
(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
Expand Down Expand Up @@ -3380,7 +3385,7 @@ static inline int iocb_flags(struct file *file)
res |= IOCB_APPEND;
if (file->f_flags & O_DIRECT)
res |= IOCB_DIRECT;
if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
if (file->f_flags & O_DSYNC)
res |= IOCB_DSYNC;
if (file->f_flags & __O_SYNC)
res |= IOCB_SYNC;
Expand Down
6 changes: 6 additions & 0 deletions include/linux/iomap.h
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,12 @@ struct iomap_dio_ops {
*/
#define IOMAP_DIO_PARTIAL (1 << 2)

/*
* The caller will sync the write if needed; do not sync it within
* iomap_dio_rw. Overrides IOMAP_DIO_FORCE_WAIT.
*/
#define IOMAP_DIO_NOSYNC (1 << 3)

ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before);
Expand Down
4 changes: 2 additions & 2 deletions include/linux/uaccess.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,15 @@ _copy_to_user(void __user *, const void *, unsigned long);
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (likely(check_copy_size(to, n, false)))
if (check_copy_size(to, n, false))
n = _copy_from_user(to, from, n);
return n;
}

static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (likely(check_copy_size(from, n, true)))
if (check_copy_size(from, n, true))
n = _copy_to_user(to, from, n);
return n;
}
Expand Down
15 changes: 6 additions & 9 deletions include/linux/uio.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,19 +156,17 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
if (unlikely(!check_copy_size(addr, bytes, true)))
return 0;
else
if (check_copy_size(addr, bytes, true))
return _copy_to_iter(addr, bytes, i);
return 0;
}

static __always_inline __must_check
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
if (unlikely(!check_copy_size(addr, bytes, false)))
return 0;
else
if (check_copy_size(addr, bytes, false))
return _copy_from_iter(addr, bytes, i);
return 0;
}

static __always_inline __must_check
Expand All @@ -184,10 +182,9 @@ bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
static __always_inline __must_check
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
if (unlikely(!check_copy_size(addr, bytes, false)))
return 0;
else
if (check_copy_size(addr, bytes, false))
return _copy_from_iter_nocache(addr, bytes, i);
return 0;
}

static __always_inline __must_check
Expand Down
2 changes: 1 addition & 1 deletion io_uring/rw.c
Original file line number Diff line number Diff line change
Expand Up @@ -661,7 +661,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (!io_req_ffs_set(req))
req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;

kiocb->ki_flags = iocb_flags(file);
kiocb->ki_flags = file->f_iocb_flags;
ret = kiocb_set_rw_flags(kiocb, rw->flags);
if (unlikely(ret))
return ret;
Expand Down
Loading

0 comments on commit 5264406

Please sign in to comment.