All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] speeding up the stat() family of system calls...
@ 2013-12-21 20:27 Linus Torvalds
  2013-12-21 22:54 ` John Stoffel
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Linus Torvalds @ 2013-12-21 20:27 UTC (permalink / raw)
  To: Peter Anvin, Ingo Molnar, Thomas Gleixner, Al Viro
  Cc: the arch/x86 maintainers, linux-fsdevel,
	Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 1774 bytes --]

Here's both x86 people and filesystem people involved, because this
hacky RFC patch touches both.

NOTE NOTE NOTE! I've modified "cp_new_stat()" in place, in a way that
is x86-64 specific. So the attached patch *only* works on x86-64, and
will very actively break on anything else. That's intentional, because
that way it's more obvious how the code changes, but a real patch
would create a *new* cp_new_stat() for x86-64, and conditionalize the
existing generic "cp_new_stat()" on not already having an
architecture-optimized one.

Basically, under some filesystem loads, "stat()" and friends are the
most common ops (think tree traversal, but also things like git
verifying the index). And our "cp_new_stat()" function (which is the
common interface, ignoring 32-bit legacy stuff) is generic, but
actually pretty disgusting. It copies things to a temporary 'struct
stat' buffer on the kernel stack, and then uses copy_to_user() to copy
it to user space. The double copy is quite noticeable in profiles, and
it generates a big stack frame too.

By doing an architecture-specific cp_new_stat() function, we can
improve on that.

HOWEVER. On x86, doing an efficient field-at-a-time copy also requires
us to use put_user_try() and put_user_catch() in order to not have
tons of clac/stac instructions for the extended permission testing.
And the implementation of that was actually fairly non-optimal, so to
actually get the code I wanted, I had to change how that all worked
too, using "asm_volatile_goto()".

Thus both x86 and FS people on the list.

Comments? This would obviously be a 3.14 issue, I'm not suggesting
we'd do this now. I just want to lay the ground-work..

It's tested in the sense that "it works for me", and profiles look nice, but..

               Linus

[-- Attachment #2: vfs-stat-improvement --]
[-- Type: application/octet-stream, Size: 6155 bytes --]

 arch/x86/include/asm/uaccess.h | 60 +++++++++++++++++++++----------------
 fs/stat.c                      | 67 +++++++++++++++++++++++++-----------------
 2 files changed, 74 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8ec57c07b125..b6f6816b9bba 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -197,13 +197,12 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
 		     : "A" (x), "r" (addr), "i" (errret), "0" (err))
 
 #define __put_user_asm_ex_u64(x, addr)					\
-	asm volatile(ASM_STAC "\n"					\
+	asm_volatile_goto(						\
 		     "1:	movl %%eax,0(%1)\n"			\
 		     "2:	movl %%edx,4(%1)\n"			\
-		     "3: " ASM_CLAC "\n"				\
-		     _ASM_EXTABLE_EX(1b, 2b)				\
-		     _ASM_EXTABLE_EX(2b, 3b)				\
-		     : : "A" (x), "r" (addr))
+		     _ASM_EXTABLE_EX(1b, %l[put_user_fail])		\
+		     _ASM_EXTABLE_EX(2b, %l[put_user_fail])		\
+		     : : "A" (x), "r" (addr) : : put_user_fail)
 
 #define __put_user_x8(x, ptr, __ret_pu)				\
 	asm volatile("call __put_user_8" : "=a" (__ret_pu)	\
@@ -424,23 +423,9 @@ struct __large_struct { unsigned long buf[100]; };
 		     : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
 
 #define __put_user_asm_ex(x, addr, itype, rtype, ltype)			\
-	asm volatile("1:	mov"itype" %"rtype"0,%1\n"		\
-		     "2:\n"						\
-		     _ASM_EXTABLE_EX(1b, 2b)				\
-		     : : ltype(x), "m" (__m(addr)))
-
-/*
- * uaccess_try and catch
- */
-#define uaccess_try	do {						\
-	current_thread_info()->uaccess_err = 0;				\
-	stac();								\
-	barrier();
-
-#define uaccess_catch(err)						\
-	clac();								\
-	(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);	\
-} while (0)
+	asm_volatile_goto("1:	mov"itype" %"rtype"0,%1\n"		\
+		     _ASM_EXTABLE_EX(1b, %l[put_user_fail])		\
+		     : : ltype(x), "m" (__m(addr)) : : put_user_fail)
 
 /**
  * __get_user: - Get a simple variable from user space, with less checking.
@@ -499,8 +484,18 @@ struct __large_struct { unsigned long buf[100]; };
  *	get_user_ex(...);
  * } get_user_catch(err)
  */
-#define get_user_try		uaccess_try
-#define get_user_catch(err)	uaccess_catch(err)
+/*
+ * uaccess_try and catch
+ */
+#define get_user_try	do {						\
+	current_thread_info()->uaccess_err = 0;				\
+	stac();								\
+	barrier();
+
+#define get_user_catch(err)						\
+	clac();								\
+	(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);	\
+} while (0)
 
 #define get_user_ex(x, ptr)	do {					\
 	unsigned long __gue_val;					\
@@ -508,8 +503,21 @@ struct __large_struct { unsigned long buf[100]; };
 	(x) = (__force __typeof__(*(ptr)))__gue_val;			\
 } while (0)
 
-#define put_user_try		uaccess_try
-#define put_user_catch(err)	uaccess_catch(err)
+/*
+ * uaccess_try and catch
+ */
+#define put_user_try	do {		\
+	stac();				\
+	barrier();
+
+#define put_user_catch(err)		\
+	if (0) {			\
+put_user_fail:				\
+		(err) = -EFAULT;	\
+	}				\
+	clac();				\
+} while (0)
+
 
 #define put_user_ex(x, ptr)						\
 	__put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
diff --git a/fs/stat.c b/fs/stat.c
index ae0c3cef9927..5bfa3dae5999 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -228,39 +228,52 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
 
 static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 {
-	struct stat tmp;
+	int err;
+	typeof(statbuf->st_ino) st_ino;
+	typeof(statbuf->st_nlink) st_nlink;
+
+	if (!access_ok(VERIFY_WRITE, statbuf, sizeof(*statbuf)))
+		return -EFAULT;
 
 	if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
 		return -EOVERFLOW;
-#if BITS_PER_LONG == 32
-	if (stat->size > MAX_NON_LFS)
-		return -EOVERFLOW;
-#endif
 
-	INIT_STRUCT_STAT_PADDING(tmp);
-	tmp.st_dev = encode_dev(stat->dev);
-	tmp.st_ino = stat->ino;
-	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+	st_ino = stat->ino;
+	if (sizeof(st_ino) < sizeof(stat->ino) && st_ino != stat->ino)
 		return -EOVERFLOW;
-	tmp.st_mode = stat->mode;
-	tmp.st_nlink = stat->nlink;
-	if (tmp.st_nlink != stat->nlink)
+	st_nlink = stat->nlink;
+	if (st_nlink != stat->nlink)
 		return -EOVERFLOW;
-	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
-	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
-	tmp.st_rdev = encode_dev(stat->rdev);
-	tmp.st_size = stat->size;
-	tmp.st_atime = stat->atime.tv_sec;
-	tmp.st_mtime = stat->mtime.tv_sec;
-	tmp.st_ctime = stat->ctime.tv_sec;
-#ifdef STAT_HAVE_NSEC
-	tmp.st_atime_nsec = stat->atime.tv_nsec;
-	tmp.st_mtime_nsec = stat->mtime.tv_nsec;
-	tmp.st_ctime_nsec = stat->ctime.tv_nsec;
-#endif
-	tmp.st_blocks = stat->blocks;
-	tmp.st_blksize = stat->blksize;
-	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
+
+	err = 0;
+	put_user_try {
+		typeof(statbuf->st_uid) st_uid;
+		typeof(statbuf->st_gid) st_gid;
+
+		put_user_ex(encode_dev(stat->dev), &statbuf->st_dev);
+		put_user_ex(st_ino, &statbuf->st_ino);
+		put_user_ex(st_nlink, &statbuf->st_nlink);
+		put_user_ex(stat->mode, &statbuf->st_mode);
+		SET_UID(st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+		put_user_ex(st_uid, &statbuf->st_uid);
+		SET_GID(st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+		put_user_ex(st_gid, &statbuf->st_gid);
+		put_user_ex(0, &statbuf->__pad0);
+		put_user_ex(encode_dev(stat->rdev), &statbuf->st_rdev);
+		put_user_ex(stat->size, &statbuf->st_size);
+		put_user_ex(stat->blksize, &statbuf->st_blksize);
+		put_user_ex(stat->blocks, &statbuf->st_blocks);
+		put_user_ex(stat->atime.tv_sec, &statbuf->st_atime);
+		put_user_ex(stat->atime.tv_nsec, &statbuf->st_atime_nsec);
+		put_user_ex(stat->mtime.tv_sec, &statbuf->st_mtime);
+		put_user_ex(stat->mtime.tv_nsec, &statbuf->st_mtime_nsec);
+		put_user_ex(stat->ctime.tv_sec, &statbuf->st_ctime);
+		put_user_ex(stat->ctime.tv_nsec, &statbuf->st_ctime_nsec);
+		put_user_ex(0, &statbuf->__unused[0]);
+		put_user_ex(0, &statbuf->__unused[1]);
+		put_user_ex(0, &statbuf->__unused[2]);
+	} put_user_catch(err);
+	return err;
 }
 
 SYSCALL_DEFINE2(newstat, const char __user *, filename,

^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2014-01-12 17:46 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-12-21 20:27 [RFC] speeding up the stat() family of system calls Linus Torvalds
2013-12-21 22:54 ` John Stoffel
2013-12-22  0:11   ` Linus Torvalds
2013-12-24  0:00 ` H. Peter Anvin
2013-12-24  0:12   ` Linus Torvalds
2013-12-24  6:00     ` H. Peter Anvin
2013-12-24 20:46 ` Ingo Molnar
2013-12-26 19:00   ` Linus Torvalds
2013-12-27  0:45     ` H. Peter Anvin
2013-12-27  3:18       ` H. Peter Anvin
2013-12-27  6:09     ` H. Peter Anvin
2013-12-27 23:30       ` H. Peter Anvin
2014-01-12 17:46         ` Ingo Molnar
2013-12-28  1:00     ` [tip:x86/asm] x86: Replace assembly access_ok() with a C variant tip-bot for Linus Torvalds
2013-12-28  1:00     ` [tip:x86/asm] x86: Slightly tweak the access_ok() C variant for better code tip-bot for H. Peter Anvin
2013-12-28  1:06     ` tip-bot for H. Peter Anvin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.