From 20aac6c60981f5bfacd66661d090d907bf1482f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Jul 2022 17:26:29 -0400 Subject: [PATCH 01/10] __follow_mount_rcu(): verify that mount_lock remains unchanged Validate mount_lock seqcount as soon as we cross into mount in RCU mode. Sure, ->mnt_root is pinned and will remain so until we do rcu_read_unlock() anyway, and we will eventually fail to unlazy if the mount_lock had been touched, but we might run into a hard error (e.g. -ENOENT) before trying to unlazy. And it's possible to end up with RCU pathwalk racing with rename() and umount() in a way that would fail with -ENOENT while non-RCU pathwalk would've succeeded with any timings. Once upon a time we hadn't needed that, but analysis had been subtle, brittle and went out of window as soon as RENAME_EXCHANGE had been added. It's narrow, hard to hit and won't get you anything other than stray -ENOENT that could be arranged in much easier way with the same priveleges, but it's a bug all the same. Cc: stable@kernel.org X-sky-is-falling: unlikely Fixes: da1ce0670c14 "vfs: add cross-rename" Signed-off-by: Al Viro --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 1f28d3f463c3b0..4dbf55b37ec63c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1505,6 +1505,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * becoming unpinned. */ flags = dentry->d_flags; + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) From 82ef069805a352bacb22fd4322b746edf809603c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Jul 2022 11:23:58 -0400 Subject: [PATCH 02/10] namei: get rid of pointless unlikely(read_seqcount_retry(...)) read_seqcount_retry() et.al. are inlined and there's enough annotations for compiler to figure out that those are unlikely to return non-zero. Signed-off-by: Al Viro --- fs/namei.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 4dbf55b37ec63c..e4a58d5975ca3f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -818,7 +818,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) + if (read_seqcount_retry(&dentry->d_seq, seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is @@ -962,7 +962,7 @@ static int nd_jump_root(struct nameidata *nd) d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; - if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); @@ -1635,7 +1635,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, * the dentry name information from lookup. */ *inode = d_backing_inode(dentry); - if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) + if (read_seqcount_retry(&dentry->d_seq, seq)) return ERR_PTR(-ECHILD); /* @@ -1645,7 +1645,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, * The memory barrier in read_seqcount_begin of child is * enough, we can use __read_seqcount_retry here. */ - if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) + if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); *seqp = seq; @@ -1891,7 +1891,7 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } @@ -1899,13 +1899,13 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, parent = old->d_parent; *inodep = parent->d_inode; *seqp = read_seqcount_begin(&parent->d_seq); - if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); @@ -1985,9 +1985,9 @@ static const char *handle_dots(struct nameidata *nd, int type) * some fallback). */ smp_rmb(); - if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); - if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } From 51c6546c30ea6efe9aa819015bd61ffefc910944 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Jul 2022 11:20:51 -0400 Subject: [PATCH 03/10] follow_dotdot{,_rcu}(): change calling conventions Instead of returning NULL when we are in root, just make it return the current position (and set *seqp and *inodep accordingly). That collapses the calls of step_into() in handle_dots() Signed-off-by: Al Viro --- fs/namei.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index e4a58d5975ca3f..9c50facb97699d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1909,7 +1909,9 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); - return NULL; + *seqp = nd->seq; + *inodep = nd->path.dentry->d_inode; + return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd, @@ -1945,8 +1947,9 @@ static struct dentry *follow_dotdot(struct nameidata *nd, in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); - dget(nd->path.dentry); - return NULL; + *seqp = 0; + *inodep = nd->path.dentry->d_inode; + return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) @@ -1968,12 +1971,7 @@ static const char *handle_dots(struct nameidata *nd, int type) parent = follow_dotdot(nd, &inode, &seq); if (IS_ERR(parent)) return ERR_CAST(parent); - if (unlikely(!parent)) - error = step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq); - else - error = step_into(nd, WALK_NOFOLLOW, - parent, inode, seq); + error = step_into(nd, WALK_NOFOLLOW, parent, inode, seq); if (unlikely(error)) return error; From 7e4745a09426b3fe63e9fbea3190e0f8500820a4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Jul 2022 12:22:46 -0400 Subject: [PATCH 04/10] switch try_to_unlazy_next() to __legitimize_mnt() The tricky case (__legitimize_mnt() failing after having grabbed a reference) can be trivially dealt with by leaving nd->path.mnt non-NULL, for terminate_walk() to drop it. legitimize_mnt() becomes static after that. Signed-off-by: Al Viro --- fs/mount.h | 1 - fs/namei.c | 9 +++++++-- fs/namespace.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 0b6e08cf8afb07..130c07c2f8d258 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -100,7 +100,6 @@ static inline int is_mounted(struct vfsmount *mnt) extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); -extern bool legitimize_mnt(struct vfsmount *, unsigned); static inline bool __path_is_mountpoint(const struct path *path) { diff --git a/fs/namei.c b/fs/namei.c index 9c50facb97699d..e864d5b9eeac19 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -799,13 +799,18 @@ static bool try_to_unlazy(struct nameidata *nd) */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq) { + int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out2; - if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) - goto out2; + res = __legitimize_mnt(nd->path.mnt, nd->m_seq); + if (unlikely(res)) { + if (res > 0) + goto out2; + goto out1; + } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; diff --git a/fs/namespace.c b/fs/namespace.c index e6a7e769d25dd1..68789f896f0819 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -648,7 +648,7 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) } /* call under rcu_read_lock */ -bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) From 6e180327153071281dbbf6a16759e49862debdca Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 6 Jul 2022 12:40:31 -0400 Subject: [PATCH 05/10] namei: move clearing LOOKUP_RCU towards rcu_read_unlock() try_to_unlazy()/try_to_unlazy_next() drop LOOKUP_RCU in the very beginning and do rcu_read_unlock() only at the very end. However, nothing done in between even looks at the flag in question; might as well clear it at the same time we unlock. Note that try_to_unlazy_next() used to call legitimize_mnt(), which might drop/regain rcu_read_lock() in some cases. This is no longer true, so we really have rcu_read_lock() held all along until the end. Signed-off-by: Al Viro --- fs/namei.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index e864d5b9eeac19..4b7a2147c2070a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -665,6 +665,12 @@ static void drop_links(struct nameidata *nd) } } +static void leave_rcu(struct nameidata *nd) +{ + nd->flags &= ~LOOKUP_RCU; + rcu_read_unlock(); +} + static void terminate_walk(struct nameidata *nd) { drop_links(nd); @@ -678,8 +684,7 @@ static void terminate_walk(struct nameidata *nd) nd->state &= ~ND_ROOT_GRABBED; } } else { - nd->flags &= ~LOOKUP_RCU; - rcu_read_unlock(); + leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; @@ -765,14 +770,13 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; - rcu_read_unlock(); + leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; @@ -780,7 +784,7 @@ static bool try_to_unlazy(struct nameidata *nd) nd->path.mnt = NULL; nd->path.dentry = NULL; out: - rcu_read_unlock(); + leave_rcu(nd); return false; } @@ -802,7 +806,6 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - nd->flags &= ~LOOKUP_RCU; if (unlikely(!legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); @@ -831,7 +834,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!legitimize_root(nd))) goto out_dput; - rcu_read_unlock(); + leave_rcu(nd); return true; out2: @@ -839,10 +842,10 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi out1: nd->path.dentry = NULL; out: - rcu_read_unlock(); + leave_rcu(nd); return false; out_dput: - rcu_read_unlock(); + leave_rcu(nd); dput(dentry); return false; } From 03fa86e9f79d8b9a6aa28965829a4a8646139a0a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Jul 2022 18:12:39 -0400 Subject: [PATCH 06/10] namei: stash the sampled ->d_seq into nameidata New field: nd->next_seq. Set to 0 outside of RCU mode, holds the sampled value for the next dentry to be considered. Used instead of an arseload of local variables, arguments, etc. step_into() has lost seq argument; nd->next_seq is used, so dentry passed to it must be the one ->next_seq is about. There are two requirements for RCU pathwalk: 1) it should not give a hard failure (other than -ECHILD) unless non-RCU pathwalk might fail that way given suitable timings. 2) it should not succeed unless non-RCU pathwalk might succeed with the same end location given suitable timings. The use of seq numbers is the way we achieve that. Invariant we want to maintain is: if RCU pathwalk can reach the state with given nd->path, nd->inode and nd->seq after having traversed some part of pathname, it must be possible for non-RCU pathwalk to reach the same nd->path and nd->inode after having traversed the same part of pathname, and observe the nd->path.dentry->d_seq equal to what RCU pathwalk has in nd->seq For transition from parent to child, we sample child's ->d_seq and verify that parent's ->d_seq remains unchanged. Anything that disrupts parent-child relationship would've bumped ->d_seq on both. For transitions from child to parent we sample parent's ->d_seq and verify that child's ->d_seq has not changed. Same reasoning as for the previous case applies. For transition from mountpoint to root of mounted we sample the ->d_seq of root and verify that nobody has touched mount_lock since the beginning of pathwalk. That guarantees that mount we'd found had been there all along, with these mountpoint and root of the mounted. It would be possible for a non-RCU pathwalk to reach the previous state, find the same mount and observe its root at the moment we'd sampled ->d_seq of that For transitions from root of mounted to mountpoint we sample ->d_seq of mountpoint and verify that mount_lock had not been touched since the beginning of pathwalk. The same reasoning as in the previous case applies. Signed-off-by: Al Viro --- fs/namei.c | 98 ++++++++++++++++++++++++++---------------------------- 1 file changed, 48 insertions(+), 50 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 4b7a2147c2070a..8dd7874816cc05 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -567,7 +567,7 @@ struct nameidata { struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; - unsigned seq, m_seq, r_seq; + unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; @@ -668,6 +668,7 @@ static void drop_links(struct nameidata *nd) static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; + nd->seq = nd->next_seq = 0; rcu_read_unlock(); } @@ -792,7 +793,6 @@ static bool try_to_unlazy(struct nameidata *nd) * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into - * @seq: seq number to check @dentry against * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already @@ -801,7 +801,7 @@ static bool try_to_unlazy(struct nameidata *nd) * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ -static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); @@ -826,7 +826,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; - if (read_seqcount_retry(&dentry->d_seq, seq)) + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is @@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(follow_down); * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode, unsigned *seqp) + struct inode **inode) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; @@ -1504,7 +1504,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; - *seqp = read_seqcount_begin(&dentry->d_seq); + nd->next_seq = read_seqcount_begin(&dentry->d_seq); *inode = dentry->d_inode; /* * We don't need to re-check ->d_seq after this @@ -1513,6 +1513,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * becoming unpinned. */ flags = dentry->d_flags; + // makes sure that non-RCU pathwalk could reach + // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; @@ -1525,8 +1527,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct inode **inode, - unsigned int *seqp) + struct path *path, struct inode **inode) { bool jumped; int ret; @@ -1534,16 +1535,17 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { - unsigned int seq = *seqp; + unsigned int seq = nd->next_seq; if (unlikely(!*inode)) return -ENOENT; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + if (likely(__follow_mount_rcu(nd, path, inode))) return 0; - if (!try_to_unlazy_next(nd, dentry, seq)) - return -ECHILD; - // *path might've been clobbered by __follow_mount_rcu() + // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; + nd->next_seq = seq; + if (!try_to_unlazy_next(nd, dentry)) + return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { @@ -1558,7 +1560,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, mntput(path->mnt); } else { *inode = d_backing_inode(path->dentry); - *seqp = 0; /* out of RCU mode, so the value doesn't matter */ } return ret; } @@ -1618,8 +1619,7 @@ static struct dentry *__lookup_hash(const struct qstr *name, } static struct dentry *lookup_fast(struct nameidata *nd, - struct inode **inode, - unsigned *seqp) + struct inode **inode) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; @@ -1630,8 +1630,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { - unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq); + dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); @@ -1643,7 +1642,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, * the dentry name information from lookup. */ *inode = d_backing_inode(dentry); - if (read_seqcount_retry(&dentry->d_seq, seq)) + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); /* @@ -1656,11 +1655,10 @@ static struct dentry *lookup_fast(struct nameidata *nd, if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); - *seqp = seq; status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; - if (!try_to_unlazy_next(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ @@ -1741,7 +1739,7 @@ static inline int may_lookup(struct user_namespace *mnt_userns, return inode_permission(mnt_userns, nd->inode, MAY_EXEC); } -static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) +static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; @@ -1756,7 +1754,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it - bool grabbed_link = legitimize_path(nd, link, seq); + bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; @@ -1770,11 +1768,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, - struct inode *inode, unsigned seq, int flags) + struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link, seq); + int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) @@ -1784,7 +1782,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); - last->seq = seq; + last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); @@ -1846,12 +1844,14 @@ static const char *pick_link(struct nameidata *nd, struct path *link, * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. + * + * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, - struct dentry *dentry, struct inode *inode, unsigned seq) + struct dentry *dentry, struct inode *inode) { struct path path; - int err = handle_mounts(nd, dentry, &path, &inode, &seq); + int err = handle_mounts(nd, dentry, &path, &inode); if (err < 0) return ERR_PTR(err); @@ -1866,23 +1866,22 @@ static const char *step_into(struct nameidata *nd, int flags, } nd->path = path; nd->inode = inode; - nd->seq = seq; + nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, seq)) + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } - return pick_link(nd, &path, inode, seq, flags); + return pick_link(nd, &path, inode, flags); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) + struct inode **inodep) { struct dentry *parent, *old; @@ -1899,6 +1898,7 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; + // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ @@ -1906,7 +1906,8 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, old = nd->path.dentry; parent = old->d_parent; *inodep = parent->d_inode; - *seqp = read_seqcount_begin(&parent->d_seq); + nd->next_seq = read_seqcount_begin(&parent->d_seq); + // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) @@ -1917,14 +1918,13 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); - *seqp = nd->seq; + nd->next_seq = nd->seq; *inodep = nd->path.dentry->d_inode; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd, - struct inode **inodep, - unsigned *seqp) + struct inode **inodep) { struct dentry *parent; @@ -1948,14 +1948,12 @@ static struct dentry *follow_dotdot(struct nameidata *nd, dput(parent); return ERR_PTR(-ENOENT); } - *seqp = 0; *inodep = parent->d_inode; return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); - *seqp = 0; *inodep = nd->path.dentry->d_inode; return dget(nd->path.dentry); } @@ -1966,7 +1964,6 @@ static const char *handle_dots(struct nameidata *nd, int type) const char *error = NULL; struct dentry *parent; struct inode *inode; - unsigned seq; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); @@ -1974,12 +1971,12 @@ static const char *handle_dots(struct nameidata *nd, int type) return error; } if (nd->flags & LOOKUP_RCU) - parent = follow_dotdot_rcu(nd, &inode, &seq); + parent = follow_dotdot_rcu(nd, &inode); else - parent = follow_dotdot(nd, &inode, &seq); + parent = follow_dotdot(nd, &inode); if (IS_ERR(parent)) return ERR_CAST(parent); - error = step_into(nd, WALK_NOFOLLOW, parent, inode, seq); + error = step_into(nd, WALK_NOFOLLOW, parent, inode); if (unlikely(error)) return error; @@ -2004,7 +2001,6 @@ static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; struct inode *inode; - unsigned seq; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and @@ -2015,7 +2011,7 @@ static const char *walk_component(struct nameidata *nd, int flags) put_link(nd); return handle_dots(nd, nd->last_type); } - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd, &inode); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { @@ -2025,7 +2021,7 @@ static const char *walk_component(struct nameidata *nd, int flags) } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - return step_into(nd, flags, dentry, inode, seq); + return step_into(nd, flags, dentry, inode); } /* @@ -2380,6 +2376,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); + else + nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; @@ -2481,8 +2479,9 @@ static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); + nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq)); + nd->path.dentry, nd->inode)); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -3401,7 +3400,6 @@ static const char *open_last_lookups(struct nameidata *nd, struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; - unsigned seq; struct inode *inode; struct dentry *dentry; const char *res; @@ -3418,7 +3416,7 @@ static const char *open_last_lookups(struct nameidata *nd, if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd, &inode, &seq); + dentry = lookup_fast(nd, &inode); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) @@ -3472,7 +3470,7 @@ static const char *open_last_lookups(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - res = step_into(nd, WALK_TRAILING, dentry, inode, seq); + res = step_into(nd, WALK_TRAILING, dentry, inode); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; From a4f5b52167a80edec74093fe6fef291a0318f4ba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Jul 2022 22:07:32 -0400 Subject: [PATCH 07/10] step_into(): lose inode argument make handle_mounts() always fetch it. This is just the first step - the callers of step_into() will stop trying to calculate the sucker, etc. The passed value should be equal to dentry->d_inode in all cases; in RCU mode - fetched after we'd sampled ->d_seq. Might as well fetch it here. We do need to validate ->d_seq, which duplicates the check currently done in lookup_fast(); that duplication will go away shortly. After that change handle_mounts() always ignores the initial value of *inode and always sets it on success. Signed-off-by: Al Viro --- fs/namei.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 8dd7874816cc05..60e17152374a52 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1536,6 +1536,9 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; + *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, seq)) + return -ECHILD; if (unlikely(!*inode)) return -ENOENT; if (likely(__follow_mount_rcu(nd, path, inode))) @@ -1848,9 +1851,10 @@ static const char *pick_link(struct nameidata *nd, struct path *link, * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, - struct dentry *dentry, struct inode *inode) + struct dentry *dentry) { struct path path; + struct inode *inode; int err = handle_mounts(nd, dentry, &path, &inode); if (err < 0) @@ -1976,7 +1980,7 @@ static const char *handle_dots(struct nameidata *nd, int type) parent = follow_dotdot(nd, &inode); if (IS_ERR(parent)) return ERR_CAST(parent); - error = step_into(nd, WALK_NOFOLLOW, parent, inode); + error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; @@ -2021,7 +2025,7 @@ static const char *walk_component(struct nameidata *nd, int flags) } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - return step_into(nd, flags, dentry, inode); + return step_into(nd, flags, dentry); } /* @@ -2480,8 +2484,7 @@ static int handle_lookup_down(struct nameidata *nd) if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; - return PTR_ERR(step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode)); + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -3470,7 +3473,7 @@ static const char *open_last_lookups(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - res = step_into(nd, WALK_TRAILING, dentry, inode); + res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; From b16c001de0f66bc633aefe770a8b0a75c8c39a3b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Jul 2022 22:18:11 -0400 Subject: [PATCH 08/10] follow_dotdot{,_rcu}(): don't bother with inode step_into() will fetch it, TYVM. Signed-off-by: Al Viro --- fs/namei.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 60e17152374a52..d631d797ea0945 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1884,8 +1884,7 @@ static const char *step_into(struct nameidata *nd, int flags, return pick_link(nd, &path, inode, flags); } -static struct dentry *follow_dotdot_rcu(struct nameidata *nd, - struct inode **inodep) +static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; @@ -1909,7 +1908,6 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, } old = nd->path.dentry; parent = old->d_parent; - *inodep = parent->d_inode; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) @@ -1923,12 +1921,10 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; - *inodep = nd->path.dentry->d_inode; return nd->path.dentry; } -static struct dentry *follow_dotdot(struct nameidata *nd, - struct inode **inodep) +static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; @@ -1952,13 +1948,11 @@ static struct dentry *follow_dotdot(struct nameidata *nd, dput(parent); return ERR_PTR(-ENOENT); } - *inodep = parent->d_inode; return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); - *inodep = nd->path.dentry->d_inode; return dget(nd->path.dentry); } @@ -1967,7 +1961,6 @@ static const char *handle_dots(struct nameidata *nd, int type) if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; - struct inode *inode; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); @@ -1975,9 +1968,9 @@ static const char *handle_dots(struct nameidata *nd, int type) return error; } if (nd->flags & LOOKUP_RCU) - parent = follow_dotdot_rcu(nd, &inode); + parent = follow_dotdot_rcu(nd); else - parent = follow_dotdot(nd, &inode); + parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); From 4cb640248041dab1c718a6140d758dad5a84b8ec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Jul 2022 22:20:20 -0400 Subject: [PATCH 09/10] lookup_fast(): don't bother with inode Note that validation of ->d_seq after ->d_inode fetch is gone, along with fetching of ->d_inode itself. Signed-off-by: Al Viro --- fs/namei.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d631d797ea0945..3e30cec54ced7f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1621,8 +1621,7 @@ static struct dentry *__lookup_hash(const struct qstr *name, return dentry; } -static struct dentry *lookup_fast(struct nameidata *nd, - struct inode **inode) +static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; @@ -1640,22 +1639,11 @@ static struct dentry *lookup_fast(struct nameidata *nd, return NULL; } - /* - * This sequence count validates that the inode matches - * the dentry name information from lookup. - */ - *inode = d_backing_inode(dentry); - if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) - return ERR_PTR(-ECHILD); - /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. - * - * The memory barrier in read_seqcount_begin of child is - * enough, we can use __read_seqcount_retry here. */ - if (__read_seqcount_retry(&parent->d_seq, nd->seq)) + if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(dentry, nd->flags); @@ -1997,7 +1985,6 @@ static const char *handle_dots(struct nameidata *nd, int type) static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; - struct inode *inode; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and @@ -2008,7 +1995,7 @@ static const char *walk_component(struct nameidata *nd, int flags) put_link(nd); return handle_dots(nd, nd->last_type); } - dentry = lookup_fast(nd, &inode); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { @@ -3396,7 +3383,6 @@ static const char *open_last_lookups(struct nameidata *nd, struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; - struct inode *inode; struct dentry *dentry; const char *res; @@ -3412,7 +3398,7 @@ static const char *open_last_lookups(struct nameidata *nd, if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd, &inode); + dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) From 3bd8bc897161730042051cd5f9c6ed1e94cb5453 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Jul 2022 22:35:56 -0400 Subject: [PATCH 10/10] step_into(): move fetching ->d_inode past handle_mounts() ... and lose messing with it in __follow_mount_rcu() Signed-off-by: Al Viro --- fs/namei.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 3e30cec54ced7f..ed3ffd9b22a369 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1474,8 +1474,7 @@ EXPORT_SYMBOL(follow_down); * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ -static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) +static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; @@ -1505,13 +1504,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); - *inode = dentry->d_inode; - /* - * We don't need to re-check ->d_seq after this - * ->d_inode read - there will be an RCU delay - * between mount hash removal and ->mnt_root - * becoming unpinned. - */ flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. @@ -1527,7 +1519,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct inode **inode) + struct path *path) { bool jumped; int ret; @@ -1536,12 +1528,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; - *inode = dentry->d_inode; - if (read_seqcount_retry(&dentry->d_seq, seq)) - return -ECHILD; - if (unlikely(!*inode)) - return -ENOENT; - if (likely(__follow_mount_rcu(nd, path, inode))) + if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; @@ -1561,8 +1548,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); - } else { - *inode = d_backing_inode(path->dentry); } return ret; } @@ -1843,15 +1828,21 @@ static const char *step_into(struct nameidata *nd, int flags, { struct path path; struct inode *inode; - int err = handle_mounts(nd, dentry, &path, &inode); + int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); + inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - if (!(nd->flags & LOOKUP_RCU)) { + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt);