All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] fs: inline step_into() and walk_component()
@ 2025-11-19 18:40 Mateusz Guzik
  2025-11-19 18:49 ` Mateusz Guzik
  2025-11-19 19:42 ` Al Viro
  0 siblings, 2 replies; 7+ messages in thread
From: Mateusz Guzik @ 2025-11-19 18:40 UTC (permalink / raw)
  To: brauner; +Cc: viro, jack, linux-kernel, linux-fsdevel, Mateusz Guzik

The primary consumer is link_path_walk(), calling walk_component() every
time which in turn calls step_into().

Inlining these saves overhead of 2 function calls per path component,
along with allowing the compiler to do better job optimizing them in place.

step_into() had absolutely atrocious assembly to facilitate the
slowpath. In order to lessen the burden at the callsite all the hard
work is moved into step_into_slowpath(). This also elides some of the
branches as for example LOOKUP_RCU is only checked once.

The inline-able step_into() variant deals with the common case of
traversing a cached non-mounted on directory while in RCU mode.

Since symlink handling is already denoted as unlikely(), I took the
opportunity to also shorten step_into_slowpath() by moving parts of it
into pick_link() which further shortens assembly.

Benchmarked as follows on Sapphire Rapids:
1. the "before" was a kernel with not-yet-merged optimizations (notably
   elision of calls to security_inode_permissin() and marking ext4
   inodes as not having acls)
2. "after" is the same + this patch
3. benchmark consists of issuing 205 calls to access(2) in a loop with
   pathnames lifted out of gcc and the linker building real code, most
   of which have several path components and 118 of which fail with
   -ENOENT. Some of those do symlink traversal.

In terms of ops/s:
before:	21619
after:	22536 (+4%)

profile before:
  20.25%  [kernel]                  [k] __d_lookup_rcu
  10.54%  [kernel]                  [k] link_path_walk
  10.22%  [kernel]                  [k] entry_SYSCALL_64
   6.50%  libc.so.6                 [.] __GI___access
   6.35%  [kernel]                  [k] strncpy_from_user
   4.87%  [kernel]                  [k] step_into
   3.68%  [kernel]                  [k] kmem_cache_alloc_noprof
   2.88%  [kernel]                  [k] walk_component
   2.86%  [kernel]                  [k] kmem_cache_free
   2.14%  [kernel]                  [k] set_root
   2.08%  [kernel]                  [k] lookup_fast

after:
  23.38%  [kernel]                  [k] __d_lookup_rcu
  11.27%  [kernel]                  [k] entry_SYSCALL_64
  10.89%  [kernel]                  [k] link_path_walk
   7.00%  libc.so.6                 [.] __GI___access
   6.88%  [kernel]                  [k] strncpy_from_user
   3.50%  [kernel]                  [k] kmem_cache_alloc_noprof
   2.01%  [kernel]                  [k] kmem_cache_free
   2.00%  [kernel]                  [k] set_root
   1.99%  [kernel]                  [k] lookup_fast
   1.81%  [kernel]                  [k] do_syscall_64
   1.69%  [kernel]                  [k] entry_SYSCALL_64_safe_stack

While walk_component() and step_into() of course disappear from the
profile, the link_path_walk() barely gets more overhead despite the
inlining thanks to the fast path added and while completing more walks
per second.

I don't know why overhead grew a lot on __d_lookup_rcu().

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

perhaps this could be 2-3 patches instead to do things incrementally
the other patches not listed in the commit message are:
1. mntput_not_expire slowpath (Christian took it, not present in fs-next yet)
2. nd->depth predicts.
3. lookup_slow noinline

all of those sent

you may notice step_into is quite high on the profile:
   4.87%  [kernel]                  [k] step_into

here is the routine prior to the patch:
    call   ffffffff81374630 <__fentry__>
    push   %rbp
    mov    %rsp,%rbp
    push   %r15
    push   %r14
    mov    %esi,%r14d
    push   %r13
    push   %r12
    push   %rbx
    mov    %rdi,%rbx
    sub    $0x28,%rsp
    mov    (%rdi),%rax
    mov    0x38(%rdi),%r8d
    mov    %gs:0x2e10acd(%rip),%r12        # ffffffff84551008 <__stack_chk_guard>

    mov    %r12,0x20(%rsp)
    mov    %rdx,%r12
    mov    %rdx,0x18(%rsp)
    mov    %rax,0x10(%rsp)

This is setup before it even gets to do anything which of course has
to be undone later. Also note the stackguard check. I'm not pasting
everything you can disasm yourself to check the entire monstrosity.

In contrast this is entirety of step_into() fast path with the patch + noinline:
         call   ffffffff81374630 <__fentry__>
         testb  $0x1,0x39(%rdi)
         je     ffffffff81740cbe <step_into+0x4e>
         mov    (%rdx),%eax
         test   $0x38000,%eax
         jne    ffffffff81740cbe <step_into+0x4e>
         and    $0x380000,%eax
         mov    0x30(%rdx),%rcx
         cmp    $0x300000,%eax
         je     ffffffff81740cbe <step_into+0x4e>
         mov    (%rdi),%r8
         mov    0x44(%rdi),%esi
         mov    0x4(%rdx),%eax
         cmp    %eax,%esi
         jne    ffffffff81740cc3 <step_into+0x53>
         test   %rcx,%rcx
         je     ffffffff81740ccf <step_into+0x5f>
         mov    0x44(%rdi),%eax
         mov    %r8,(%rdi)
         mov    %rdx,0x8(%rdi)
         mov    %eax,0x40(%rdi)
         xor    %eax,%eax
         mov    %rcx,0x30(%rdi)
         jmp    ffffffff8230b1f0 <__pi___x86_return_thunk>

This possibly can be shortened but I have not tried yet.

 fs/namei.c | 67 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 17 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1d1f864ad6ad..e00b9ce21536 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1668,17 +1668,17 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 	bool jumped;
 	int ret;
 
-	path->mnt = nd->path.mnt;
-	path->dentry = dentry;
 	if (nd->flags & LOOKUP_RCU) {
 		unsigned int seq = nd->next_seq;
+		if (likely(!(dentry->d_flags & DCACHE_MANAGED_DENTRY)))
+			return 0;
 		if (likely(__follow_mount_rcu(nd, path)))
 			return 0;
 		// *path and nd->next_seq might've been clobbered
 		path->mnt = nd->path.mnt;
 		path->dentry = dentry;
 		nd->next_seq = seq;
-		if (!try_to_unlazy_next(nd, dentry))
+		if (unlikely(!try_to_unlazy_next(nd, dentry)))
 			return -ECHILD;
 	}
 	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
@@ -1941,12 +1941,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link)
 
 enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
 
-static const char *pick_link(struct nameidata *nd, struct path *link,
+static noinline const char *pick_link(struct nameidata *nd, struct path *link,
 		     struct inode *inode, int flags)
 {
 	struct saved *last;
 	const char *res;
-	int error = reserve_stack(nd, link);
+	int error;
+
+	if (nd->flags & LOOKUP_RCU) {
+		/* make sure that d_is_symlink above matches inode */
+		if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
+			return ERR_PTR(-ECHILD);
+	} else {
+		if (link->mnt == nd->path.mnt)
+			mntget(link->mnt);
+	}
+
+	error = reserve_stack(nd, link);
 
 	if (unlikely(error)) {
 		if (!(nd->flags & LOOKUP_RCU))
@@ -2021,14 +2032,17 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
  *
  * NOTE: dentry must be what nd->next_seq had been sampled from.
  */
-static const char *step_into(struct nameidata *nd, int flags,
+static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
 		     struct dentry *dentry)
 {
 	struct path path;
 	struct inode *inode;
-	int err = handle_mounts(nd, dentry, &path);
+	int err;
 
-	if (err < 0)
+	path.mnt = nd->path.mnt;
+	path.dentry = dentry;
+	err = handle_mounts(nd, dentry, &path);
+	if (unlikely(err < 0))
 		return ERR_PTR(err);
 	inode = path.dentry->d_inode;
 	if (likely(!d_is_symlink(path.dentry)) ||
@@ -2050,17 +2064,36 @@ static const char *step_into(struct nameidata *nd, int flags,
 		nd->seq = nd->next_seq;
 		return NULL;
 	}
-	if (nd->flags & LOOKUP_RCU) {
-		/* make sure that d_is_symlink above matches inode */
-		if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
-			return ERR_PTR(-ECHILD);
-	} else {
-		if (path.mnt == nd->path.mnt)
-			mntget(path.mnt);
-	}
 	return pick_link(nd, &path, inode, flags);
 }
 
+static __always_inline const char *step_into(struct nameidata *nd, int flags,
+		     struct dentry *dentry)
+{
+	struct path path;
+	struct inode *inode;
+
+	path.mnt = nd->path.mnt;
+	path.dentry = dentry;
+	if (!(nd->flags & LOOKUP_RCU))
+		goto slowpath;
+	if (unlikely(dentry->d_flags & DCACHE_MANAGED_DENTRY))
+		goto slowpath;
+	inode = path.dentry->d_inode;
+	if (unlikely(d_is_symlink(path.dentry)))
+		goto slowpath;
+	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
+		return ERR_PTR(-ECHILD);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOENT);
+	nd->path = path;
+	nd->inode = inode;
+	nd->seq = nd->next_seq;
+	return NULL;
+slowpath:
+	return step_into_slowpath(nd, flags, dentry);
+}
+
 static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
 {
 	struct dentry *parent, *old;
@@ -2171,7 +2204,7 @@ static const char *handle_dots(struct nameidata *nd, int type)
 	return NULL;
 }
 
-static const char *walk_component(struct nameidata *nd, int flags)
+static __always_inline const char *walk_component(struct nameidata *nd, int flags)
 {
 	struct dentry *dentry;
 	/*
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-11-19 20:41 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-19 18:40 [PATCH] fs: inline step_into() and walk_component() Mateusz Guzik
2025-11-19 18:49 ` Mateusz Guzik
2025-11-19 19:42 ` Al Viro
2025-11-19 19:49   ` Mateusz Guzik
2025-11-19 20:23     ` Al Viro
2025-11-19 20:40       ` Mateusz Guzik
2025-11-19 20:41         ` Mateusz Guzik

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.