linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] speeding up the stat() family of system calls...
@ 2013-12-21 20:27 Linus Torvalds
  2013-12-21 22:54 ` John Stoffel
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Linus Torvalds @ 2013-12-21 20:27 UTC (permalink / raw)
  To: Peter Anvin, Ingo Molnar, Thomas Gleixner, Al Viro
  Cc: the arch/x86 maintainers, linux-fsdevel,
	Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 1774 bytes --]

Here's both x86 people and filesystem people involved, because this
hacky RFC patch touches both.

NOTE NOTE NOTE! I've modified "cp_new_stat()" in place, in a way that
is x86-64 specific. So the attached patch *only* works on x86-64, and
will very actively break on anything else. That's intentional, because
that way it's more obvious how the code changes, but a real patch
would create a *new* cp_new_stat() for x86-64, and conditionalize the
existing generic "cp_new_stat()" on not already having an
architecture-optimized one.

Basically, under some filesystem loads, "stat()" and friends are the
most common ops (think tree traversal, but also things like git
verifying the index). And our "cp_new_stat()" function (which is the
common interface, ignoring 32-bit legacy stuff) is generic, but
actually pretty disgusting. It copies things to a temporary 'struct
stat' buffer on the kernel stack, and then uses copy_to_user() to copy
it to user space. The double copy is quite noticeable in profiles, and
it generates a big stack frame too.

By doing an architecture-specific cp_new_stat() function, we can
improve on that.

HOWEVER. On x86, doing an efficient field-at-a-time copy also requires
us to use put_user_try() and put_user_catch() in order to not have
tons of clac/stac instructions for the extended permission testing.
And the implementation of that was actually fairly non-optimal, so to
actually get the code I wanted, I had to change how that all worked
too, using "asm_volatile_goto()".

Thus both x86 and FS people on the list.

Comments? This would obviously be a 3.14 issue, I'm not suggesting
we'd do this now. I just want to lay the ground-work..

It's tested in the sense that "it works for me", and profiles look nice, but..

               Linus

[-- Attachment #2: vfs-stat-improvement --]
[-- Type: application/octet-stream, Size: 6155 bytes --]

 arch/x86/include/asm/uaccess.h | 60 +++++++++++++++++++++----------------
 fs/stat.c                      | 67 +++++++++++++++++++++++++-----------------
 2 files changed, 74 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8ec57c07b125..b6f6816b9bba 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -197,13 +197,12 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
 		     : "A" (x), "r" (addr), "i" (errret), "0" (err))
 
 #define __put_user_asm_ex_u64(x, addr)					\
-	asm volatile(ASM_STAC "\n"					\
+	asm_volatile_goto(						\
 		     "1:	movl %%eax,0(%1)\n"			\
 		     "2:	movl %%edx,4(%1)\n"			\
-		     "3: " ASM_CLAC "\n"				\
-		     _ASM_EXTABLE_EX(1b, 2b)				\
-		     _ASM_EXTABLE_EX(2b, 3b)				\
-		     : : "A" (x), "r" (addr))
+		     _ASM_EXTABLE_EX(1b, %l[put_user_fail])		\
+		     _ASM_EXTABLE_EX(2b, %l[put_user_fail])		\
+		     : : "A" (x), "r" (addr) : : put_user_fail)
 
 #define __put_user_x8(x, ptr, __ret_pu)				\
 	asm volatile("call __put_user_8" : "=a" (__ret_pu)	\
@@ -424,23 +423,9 @@ struct __large_struct { unsigned long buf[100]; };
 		     : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
 
 #define __put_user_asm_ex(x, addr, itype, rtype, ltype)			\
-	asm volatile("1:	mov"itype" %"rtype"0,%1\n"		\
-		     "2:\n"						\
-		     _ASM_EXTABLE_EX(1b, 2b)				\
-		     : : ltype(x), "m" (__m(addr)))
-
-/*
- * uaccess_try and catch
- */
-#define uaccess_try	do {						\
-	current_thread_info()->uaccess_err = 0;				\
-	stac();								\
-	barrier();
-
-#define uaccess_catch(err)						\
-	clac();								\
-	(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);	\
-} while (0)
+	asm_volatile_goto("1:	mov"itype" %"rtype"0,%1\n"		\
+		     _ASM_EXTABLE_EX(1b, %l[put_user_fail])		\
+		     : : ltype(x), "m" (__m(addr)) : : put_user_fail)
 
 /**
  * __get_user: - Get a simple variable from user space, with less checking.
@@ -499,8 +484,18 @@ struct __large_struct { unsigned long buf[100]; };
  *	get_user_ex(...);
  * } get_user_catch(err)
  */
-#define get_user_try		uaccess_try
-#define get_user_catch(err)	uaccess_catch(err)
+/*
+ * uaccess_try and catch
+ */
+#define get_user_try	do {						\
+	current_thread_info()->uaccess_err = 0;				\
+	stac();								\
+	barrier();
+
+#define get_user_catch(err)						\
+	clac();								\
+	(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);	\
+} while (0)
 
 #define get_user_ex(x, ptr)	do {					\
 	unsigned long __gue_val;					\
@@ -508,8 +503,21 @@ struct __large_struct { unsigned long buf[100]; };
 	(x) = (__force __typeof__(*(ptr)))__gue_val;			\
 } while (0)
 
-#define put_user_try		uaccess_try
-#define put_user_catch(err)	uaccess_catch(err)
+/*
+ * uaccess_try and catch
+ */
+#define put_user_try	do {		\
+	stac();				\
+	barrier();
+
+#define put_user_catch(err)		\
+	if (0) {			\
+put_user_fail:				\
+		(err) = -EFAULT;	\
+	}				\
+	clac();				\
+} while (0)
+
 
 #define put_user_ex(x, ptr)						\
 	__put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
diff --git a/fs/stat.c b/fs/stat.c
index ae0c3cef9927..5bfa3dae5999 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -228,39 +228,52 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
 
 static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 {
-	struct stat tmp;
+	int err;
+	typeof(statbuf->st_ino) st_ino;
+	typeof(statbuf->st_nlink) st_nlink;
+
+	if (!access_ok(VERIFY_WRITE, statbuf, sizeof(*statbuf)))
+		return -EFAULT;
 
 	if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
 		return -EOVERFLOW;
-#if BITS_PER_LONG == 32
-	if (stat->size > MAX_NON_LFS)
-		return -EOVERFLOW;
-#endif
 
-	INIT_STRUCT_STAT_PADDING(tmp);
-	tmp.st_dev = encode_dev(stat->dev);
-	tmp.st_ino = stat->ino;
-	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+	st_ino = stat->ino;
+	if (sizeof(st_ino) < sizeof(stat->ino) && st_ino != stat->ino)
 		return -EOVERFLOW;
-	tmp.st_mode = stat->mode;
-	tmp.st_nlink = stat->nlink;
-	if (tmp.st_nlink != stat->nlink)
+	st_nlink = stat->nlink;
+	if (st_nlink != stat->nlink)
 		return -EOVERFLOW;
-	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
-	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
-	tmp.st_rdev = encode_dev(stat->rdev);
-	tmp.st_size = stat->size;
-	tmp.st_atime = stat->atime.tv_sec;
-	tmp.st_mtime = stat->mtime.tv_sec;
-	tmp.st_ctime = stat->ctime.tv_sec;
-#ifdef STAT_HAVE_NSEC
-	tmp.st_atime_nsec = stat->atime.tv_nsec;
-	tmp.st_mtime_nsec = stat->mtime.tv_nsec;
-	tmp.st_ctime_nsec = stat->ctime.tv_nsec;
-#endif
-	tmp.st_blocks = stat->blocks;
-	tmp.st_blksize = stat->blksize;
-	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
+
+	err = 0;
+	put_user_try {
+		typeof(statbuf->st_uid) st_uid;
+		typeof(statbuf->st_gid) st_gid;
+
+		put_user_ex(encode_dev(stat->dev), &statbuf->st_dev);
+		put_user_ex(st_ino, &statbuf->st_ino);
+		put_user_ex(st_nlink, &statbuf->st_nlink);
+		put_user_ex(stat->mode, &statbuf->st_mode);
+		SET_UID(st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+		put_user_ex(st_uid, &statbuf->st_uid);
+		SET_GID(st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+		put_user_ex(st_gid, &statbuf->st_gid);
+		put_user_ex(0, &statbuf->__pad0);
+		put_user_ex(encode_dev(stat->rdev), &statbuf->st_rdev);
+		put_user_ex(stat->size, &statbuf->st_size);
+		put_user_ex(stat->blksize, &statbuf->st_blksize);
+		put_user_ex(stat->blocks, &statbuf->st_blocks);
+		put_user_ex(stat->atime.tv_sec, &statbuf->st_atime);
+		put_user_ex(stat->atime.tv_nsec, &statbuf->st_atime_nsec);
+		put_user_ex(stat->mtime.tv_sec, &statbuf->st_mtime);
+		put_user_ex(stat->mtime.tv_nsec, &statbuf->st_mtime_nsec);
+		put_user_ex(stat->ctime.tv_sec, &statbuf->st_ctime);
+		put_user_ex(stat->ctime.tv_nsec, &statbuf->st_ctime_nsec);
+		put_user_ex(0, &statbuf->__unused[0]);
+		put_user_ex(0, &statbuf->__unused[1]);
+		put_user_ex(0, &statbuf->__unused[2]);
+	} put_user_catch(err);
+	return err;
 }
 
 SYSCALL_DEFINE2(newstat, const char __user *, filename,

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2014-01-12 17:46 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-12-21 20:27 [RFC] speeding up the stat() family of system calls Linus Torvalds
2013-12-21 22:54 ` John Stoffel
2013-12-22  0:11   ` Linus Torvalds
2013-12-24  0:00 ` H. Peter Anvin
2013-12-24  0:12   ` Linus Torvalds
2013-12-24  6:00     ` H. Peter Anvin
2013-12-24 20:46 ` Ingo Molnar
2013-12-26 19:00   ` Linus Torvalds
2013-12-27  0:45     ` H. Peter Anvin
2013-12-27  3:18       ` H. Peter Anvin
2013-12-27  6:09     ` H. Peter Anvin
2013-12-27 23:30       ` H. Peter Anvin
2014-01-12 17:46         ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).