All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeff Layton <jlayton@kernel.org>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	 Christian Brauner <brauner@kernel.org>, Jan Kara <jack@suse.cz>,
	 Andrew Morton <akpm@linux-foundation.org>
Cc: Mateusz Guzik <mjguzik@gmail.com>,
	Josef Bacik <josef@toxicpanda.com>,
	 linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	 Jeff Layton <jlayton@kernel.org>
Subject: [PATCH v2] fs: try an opportunistic lookup for O_CREAT opens too
Date: Tue, 06 Aug 2024 10:32:17 -0400	[thread overview]
Message-ID: <20240806-openfast-v2-1-42da45981811@kernel.org> (raw)

Today, when opening a file we'll typically do a fast lookup, but if
O_CREAT is set, the kernel always takes the exclusive inode lock. I
assume this was done with the expectation that O_CREAT means that we
always expect to do the create, but that's often not the case. Many
programs set O_CREAT even in scenarios where the file already exists.

This patch rearranges the pathwalk-for-open code to also attempt a
fast_lookup in certain O_CREAT cases. If a positive dentry is found, the
inode_lock can be avoided altogether, and if auditing isn't enabled, it
can stay in rcuwalk mode for the last step_into.

One notable exception that is hopefully temporary: if we're doing an
rcuwalk and auditing is enabled, skip the lookup_fast. Legitimizing the
dentry in that case is more expensive than taking the i_rwsem for now.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
Here's a revised patch that does a fast_lookup in the O_CREAT codepath
too. The main difference here is that if a positive dentry is found and
audit_dummy_context is true, then we keep the walk lazy for the last
component, which avoids having to take any locks on the parent (just
like with non-O_CREAT opens).

The testcase below runs in about 18s on v6.10 (on an 80 CPU machine).
With this patch, it runs in about 1s:

 #define _GNU_SOURCE 1
 #include <stdio.h>
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <sys/wait.h>

 #define PROCS           70
 #define LOOPS           500000

static int openloop(int tnum)
{
	char *file;
	int i, ret;

       	ret = asprintf(&file, "./testfile%d", tnum);
	if (ret < 0) {
		printf("asprintf failed for proc %d", tnum);
		return 1;
	}

	for (i = 0; i < LOOPS; ++i) {
		int fd = open(file, O_RDWR|O_CREAT, 0644);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		close(fd);
	}
	unlink(file);
	free(file);
	return 0;
}

int main(int argc, char **argv) {
	pid_t kids[PROCS];
	int i, ret = 0;

	for (i = 0; i < PROCS; ++i) {
		kids[i] = fork();
		if (kids[i] > 0)
			return openloop(i);
		if (kids[i] < 0)
			perror("fork");
	}

	for (i = 0; i < PROCS; ++i) {
		int ret2;

		if (kids[i] > 0) {
			wait(&ret2);
			if (ret2 != 0)
				ret = ret2;
		}
	}
	return ret;
}
---
Changes in v2:
- drop the lockref patch since Mateusz is working on a better approach
- add trailing_slashes helper function
- add a lookup_fast_for_open helper function
- make lookup_fast_for_open skip the lookup if auditing is enabled
- if we find a positive dentry and auditing is disabled, don't unlazy
- Link to v1: https://lore.kernel.org/r/20240802-openfast-v1-0-a1cff2a33063@kernel.org
---
 fs/namei.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 9 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1e05a0f3f04d..2d716fb114c9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3518,6 +3518,47 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	return ERR_PTR(error);
 }
 
+static inline bool trailing_slashes(struct nameidata *nd)
+{
+	return (bool)nd->last.name[nd->last.len];
+}
+
+static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
+{
+	struct dentry *dentry;
+
+	if (open_flag & O_CREAT) {
+		/* Don't bother on an O_EXCL create */
+		if (open_flag & O_EXCL)
+			return NULL;
+
+		/*
+		 * FIXME: If auditing is enabled, then we'll have to unlazy to
+		 * use the dentry. For now, don't do this, since it shifts
+		 * contention from parent's i_rwsem to its d_lockref spinlock.
+		 * Reconsider this once dentry refcounting handles heavy
+		 * contention better.
+		 */
+		if ((nd->flags & LOOKUP_RCU) && !audit_dummy_context())
+			return NULL;
+	}
+
+	if (trailing_slashes(nd))
+		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+	dentry = lookup_fast(nd);
+
+	if (open_flag & O_CREAT) {
+		/* Discard negative dentries. Need inode_lock to do the create */
+		if (dentry && !dentry->d_inode) {
+			if (!(nd->flags & LOOKUP_RCU))
+				dput(dentry);
+			dentry = NULL;
+		}
+	}
+	return dentry;
+}
+
 static const char *open_last_lookups(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
@@ -3535,28 +3576,31 @@ static const char *open_last_lookups(struct nameidata *nd,
 		return handle_dots(nd, nd->last_type);
 	}
 
+	/* We _can_ be in RCU mode here */
+	dentry = lookup_fast_for_open(nd, open_flag);
+	if (IS_ERR(dentry))
+		return ERR_CAST(dentry);
+
 	if (!(open_flag & O_CREAT)) {
-		if (nd->last.name[nd->last.len])
-			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-		/* we _can_ be in RCU mode here */
-		dentry = lookup_fast(nd);
-		if (IS_ERR(dentry))
-			return ERR_CAST(dentry);
 		if (likely(dentry))
 			goto finish_lookup;
 
 		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
 			return ERR_PTR(-ECHILD);
 	} else {
-		/* create side of things */
 		if (nd->flags & LOOKUP_RCU) {
+			/* can stay in rcuwalk if not auditing */
+			if (dentry && audit_dummy_context())
+				goto check_slashes;
 			if (!try_to_unlazy(nd))
 				return ERR_PTR(-ECHILD);
 		}
 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
-		/* trailing slashes? */
-		if (unlikely(nd->last.name[nd->last.len]))
+check_slashes:
+		if (trailing_slashes(nd))
 			return ERR_PTR(-EISDIR);
+		if (dentry)
+			goto finish_lookup;
 	}
 
 	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {

---
base-commit: 0c3836482481200ead7b416ca80c68a29cfdaabd
change-id: 20240723-openfast-ac49a7b6ade2

Best regards,
-- 
Jeff Layton <jlayton@kernel.org>


             reply	other threads:[~2024-08-06 14:32 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-08-06 14:32 Jeff Layton [this message]
2024-08-06 15:25 ` [PATCH v2] fs: try an opportunistic lookup for O_CREAT opens too Mateusz Guzik
2024-08-06 16:17   ` Jeff Layton
2024-08-06 16:42     ` Mateusz Guzik
2024-08-06 19:11   ` Andi Kleen
2024-08-06 19:22     ` Mateusz Guzik
2024-08-06 20:42       ` Jeff Layton
2024-08-06 19:26     ` Jeff Layton
2024-08-06 20:03       ` Mateusz Guzik
2024-08-06 20:47         ` Andi Kleen
2024-08-15 15:07           ` Mateusz Guzik
2024-08-06 19:51 ` Jeff Layton
2024-08-14  2:18   ` Al Viro
2024-08-14  2:40     ` Al Viro
2024-08-14 11:48       ` Jeff Layton
2024-08-14 12:40         ` Christian Brauner
2024-08-14 15:44           ` Al Viro
2024-08-16  8:34             ` Christian Brauner
2024-08-14 15:42         ` Al Viro
2024-08-14 16:46           ` Jeff Layton
2024-08-07 14:26 ` Christian Brauner
2024-08-07 14:36   ` Jeff Layton
2024-08-08 10:36     ` Christian Brauner
2024-08-08 10:54       ` Jeff Layton
2024-08-08 11:18         ` Christian Brauner
2024-08-08 17:11       ` Jan Kara
2024-08-08 21:12         ` Paul Moore
2024-08-08 23:43           ` Jeff Layton
2024-08-09  0:28             ` Paul Moore
2024-08-09  0:33               ` Jeff Layton
2024-08-09  1:22                 ` Paul Moore
2024-08-09 14:21                   ` Jeff Layton
2024-08-11 21:52                     ` Paul Moore

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240806-openfast-v2-1-42da45981811@kernel.org \
    --to=jlayton@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=brauner@kernel.org \
    --cc=jack@suse.cz \
    --cc=josef@toxicpanda.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mjguzik@gmail.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.