LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 09/11] fs: remove compat_sys_vmsplice
From: Christoph Hellwig @ 2020-09-21 14:34 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, io-uring, linux-arm-kernel, Jens Axboe, linux-parisc,
	netdev, linux-kernel, linux-security-module, David Laight,
	linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200921143434.707844-1-hch@lst.de>

Now that import_iovec handles compat iovecs, the native vmsplice syscall
can be used for the compat case as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/arm64/include/asm/unistd32.h             |  2 +-
 arch/mips/kernel/syscalls/syscall_n32.tbl     |  2 +-
 arch/mips/kernel/syscalls/syscall_o32.tbl     |  2 +-
 arch/parisc/kernel/syscalls/syscall.tbl       |  2 +-
 arch/powerpc/kernel/syscalls/syscall.tbl      |  2 +-
 arch/s390/kernel/syscalls/syscall.tbl         |  2 +-
 arch/sparc/kernel/syscalls/syscall.tbl        |  2 +-
 arch/x86/entry/syscall_x32.c                  |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl        |  2 +-
 arch/x86/entry/syscalls/syscall_64.tbl        |  2 +-
 fs/splice.c                                   | 57 +++++--------------
 include/linux/compat.h                        |  4 --
 include/uapi/asm-generic/unistd.h             |  2 +-
 tools/include/uapi/asm-generic/unistd.h       |  2 +-
 .../arch/powerpc/entry/syscalls/syscall.tbl   |  2 +-
 .../perf/arch/s390/entry/syscalls/syscall.tbl |  2 +-
 .../arch/x86/entry/syscalls/syscall_64.tbl    |  2 +-
 17 files changed, 28 insertions(+), 62 deletions(-)

diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 4a236493dca5b9..11dfae3a8563bd 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -697,7 +697,7 @@ __SYSCALL(__NR_sync_file_range2, compat_sys_aarch32_sync_file_range2)
 #define __NR_tee 342
 __SYSCALL(__NR_tee, sys_tee)
 #define __NR_vmsplice 343
-__SYSCALL(__NR_vmsplice, compat_sys_vmsplice)
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages 344
 __SYSCALL(__NR_move_pages, compat_sys_move_pages)
 #define __NR_getcpu 345
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index c99a92646f8ee9..5a39d4de0ac85b 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -278,7 +278,7 @@
 267	n32	splice				sys_splice
 268	n32	sync_file_range			sys_sync_file_range
 269	n32	tee				sys_tee
-270	n32	vmsplice			compat_sys_vmsplice
+270	n32	vmsplice			sys_vmsplice
 271	n32	move_pages			compat_sys_move_pages
 272	n32	set_robust_list			compat_sys_set_robust_list
 273	n32	get_robust_list			compat_sys_get_robust_list
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 075064d10661bf..136efc6b8c5444 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -318,7 +318,7 @@
 304	o32	splice				sys_splice
 305	o32	sync_file_range			sys_sync_file_range		sys32_sync_file_range
 306	o32	tee				sys_tee
-307	o32	vmsplice			sys_vmsplice			compat_sys_vmsplice
+307	o32	vmsplice			sys_vmsplice
 308	o32	move_pages			sys_move_pages			compat_sys_move_pages
 309	o32	set_robust_list			sys_set_robust_list		compat_sys_set_robust_list
 310	o32	get_robust_list			sys_get_robust_list		compat_sys_get_robust_list
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 192abde0001d9d..a9e184192caedd 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -330,7 +330,7 @@
 292	32	sync_file_range		parisc_sync_file_range
 292	64	sync_file_range		sys_sync_file_range
 293	common	tee			sys_tee
-294	common	vmsplice		sys_vmsplice			compat_sys_vmsplice
+294	common	vmsplice		sys_vmsplice
 295	common	move_pages		sys_move_pages			compat_sys_move_pages
 296	common	getcpu			sys_getcpu
 297	common	epoll_pwait		sys_epoll_pwait			compat_sys_epoll_pwait
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 6f1e2ecf0edad9..0d4985919ca34d 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -369,7 +369,7 @@
 282	common	unshare				sys_unshare
 283	common	splice				sys_splice
 284	common	tee				sys_tee
-285	common	vmsplice			sys_vmsplice			compat_sys_vmsplice
+285	common	vmsplice			sys_vmsplice
 286	common	openat				sys_openat			compat_sys_openat
 287	common	mkdirat				sys_mkdirat
 288	common	mknodat				sys_mknodat
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 6101cf2e004cb4..b5495a42814bd1 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -316,7 +316,7 @@
 306  common	splice			sys_splice			sys_splice
 307  common	sync_file_range		sys_sync_file_range		compat_sys_s390_sync_file_range
 308  common	tee			sys_tee				sys_tee
-309  common	vmsplice		sys_vmsplice			compat_sys_vmsplice
+309  common	vmsplice		sys_vmsplice			sys_vmsplice
 310  common	move_pages		sys_move_pages			compat_sys_move_pages
 311  common	getcpu			sys_getcpu			sys_getcpu
 312  common	epoll_pwait		sys_epoll_pwait			compat_sys_epoll_pwait
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index a87ddb282ab16f..f1810c1a35caa5 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -38,7 +38,7 @@
 23	64    	setuid			sys_setuid
 24	32	getuid			sys_getuid16
 24	64   	getuid			sys_getuid
-25	common	vmsplice		sys_vmsplice			compat_sys_vmsplice
+25	common	vmsplice		sys_vmsplice
 26	common	ptrace			sys_ptrace			compat_sys_ptrace
 27	common	alarm			sys_alarm
 28	common	sigaltstack		sys_sigaltstack			compat_sys_sigaltstack
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index aa321444a41f63..a4840b9d50ad14 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -16,6 +16,7 @@
 #define __x32_sys_writev	__x64_sys_writev
 #define __x32_sys_getsockopt	__x64_sys_getsockopt
 #define __x32_sys_setsockopt	__x64_sys_setsockopt
+#define __x32_sys_vmsplice	__x64_sys_vmsplice
 
 #define __SYSCALL_64(nr, sym)
 
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 54ab4beb517f25..0fb2f172581e51 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -327,7 +327,7 @@
 313	i386	splice			sys_splice
 314	i386	sync_file_range		sys_ia32_sync_file_range
 315	i386	tee			sys_tee
-316	i386	vmsplice		sys_vmsplice			compat_sys_vmsplice
+316	i386	vmsplice		sys_vmsplice
 317	i386	move_pages		sys_move_pages			compat_sys_move_pages
 318	i386	getcpu			sys_getcpu
 319	i386	epoll_pwait		sys_epoll_pwait
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index b1e59957c5c51c..642af919183de4 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -388,7 +388,7 @@
 529	x32	waitid			compat_sys_waitid
 530	x32	set_robust_list		compat_sys_set_robust_list
 531	x32	get_robust_list		compat_sys_get_robust_list
-532	x32	vmsplice		compat_sys_vmsplice
+532	x32	vmsplice		sys_vmsplice
 533	x32	move_pages		compat_sys_move_pages
 534	x32	preadv			compat_sys_preadv64
 535	x32	pwritev			compat_sys_pwritev64
diff --git a/fs/splice.c b/fs/splice.c
index 132d42b9871f9b..18d84544030b39 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -33,7 +33,6 @@
 #include <linux/security.h>
 #include <linux/gfp.h>
 #include <linux/socket.h>
-#include <linux/compat.h>
 #include <linux/sched/signal.h>
 
 #include "internal.h"
@@ -1332,20 +1331,6 @@ static int vmsplice_type(struct fd f, int *type)
  * Currently we punt and implement it as a normal copy, see pipe_to_user().
  *
  */
-static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags)
-{
-	if (unlikely(flags & ~SPLICE_F_ALL))
-		return -EINVAL;
-
-	if (!iov_iter_count(iter))
-		return 0;
-
-	if (iov_iter_rw(iter) == WRITE)
-		return vmsplice_to_pipe(f, iter, flags);
-	else
-		return vmsplice_to_user(f, iter, flags);
-}
-
 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 		unsigned long, nr_segs, unsigned int, flags)
 {
@@ -1356,6 +1341,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	struct fd f;
 	int type;
 
+	if (unlikely(flags & ~SPLICE_F_ALL))
+		return -EINVAL;
+
 	f = fdget(fd);
 	error = vmsplice_type(f, &type);
 	if (error)
@@ -1363,40 +1351,21 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 
 	error = import_iovec(type, uiov, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (error >= 0) {
-		error = do_vmsplice(f.file, &iter, flags);
-		kfree(iov);
-	}
-	fdput(f);
-	return error;
-}
+	if (error < 0)
+		goto out_fdput;
 
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
-		    unsigned int, nr_segs, unsigned int, flags)
-{
-	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov = iovstack;
-	struct iov_iter iter;
-	ssize_t error;
-	struct fd f;
-	int type;
-
-	f = fdget(fd);
-	error = vmsplice_type(f, &type);
-	if (error)
-		return error;
+	if (!iov_iter_count(&iter))
+		error = 0;
+	else if (iov_iter_rw(&iter) == WRITE)
+		error = vmsplice_to_pipe(f.file, &iter, flags);
+	else
+		error = vmsplice_to_user(f.file, &iter, flags);
 
-	error = import_iovec(type, (struct iovec __user *)iov32, nr_segs,
-			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (error >= 0) {
-		error = do_vmsplice(f.file, &iter, flags);
-		kfree(iov);
-	}
+	kfree(iov);
+out_fdput:
 	fdput(f);
 	return error;
 }
-#endif
 
 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 		int, fd_out, loff_t __user *, off_out,
diff --git a/include/linux/compat.h b/include/linux/compat.h
index fa39b7a5488d70..14d8bf412bd02e 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -597,10 +597,6 @@ asmlinkage long compat_sys_signalfd4(int ufd,
 				     const compat_sigset_t __user *sigmask,
 				     compat_size_t sigsetsize, int flags);
 
-/* fs/splice.c */
-asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *,
-				    unsigned int nr_segs, unsigned int flags);
-
 /* fs/stat.c */
 asmlinkage long compat_sys_newfstatat(unsigned int dfd,
 				      const char __user *filename,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 211c9eacbda6eb..f2dcb0d5703014 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -237,7 +237,7 @@ __SC_COMP(__NR_signalfd4, sys_signalfd4, compat_sys_signalfd4)
 
 /* fs/splice.c */
 #define __NR_vmsplice 75
-__SC_COMP(__NR_vmsplice, sys_vmsplice, compat_sys_vmsplice)
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_splice 76
 __SYSCALL(__NR_splice, sys_splice)
 #define __NR_tee 77
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index 211c9eacbda6eb..f2dcb0d5703014 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -237,7 +237,7 @@ __SC_COMP(__NR_signalfd4, sys_signalfd4, compat_sys_signalfd4)
 
 /* fs/splice.c */
 #define __NR_vmsplice 75
-__SC_COMP(__NR_vmsplice, sys_vmsplice, compat_sys_vmsplice)
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_splice 76
 __SYSCALL(__NR_splice, sys_splice)
 #define __NR_tee 77
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 46be68029587f9..26f0347c15118b 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -363,7 +363,7 @@
 282	common	unshare				sys_unshare
 283	common	splice				sys_splice
 284	common	tee				sys_tee
-285	common	vmsplice			sys_vmsplice			compat_sys_vmsplice
+285	common	vmsplice			sys_vmsplice
 286	common	openat				sys_openat			compat_sys_openat
 287	common	mkdirat				sys_mkdirat
 288	common	mknodat				sys_mknodat
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index fb5e61ce9d5838..02ad81f69bb7e3 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -316,7 +316,7 @@
 306  common	splice			sys_splice			compat_sys_splice
 307  common	sync_file_range		sys_sync_file_range		compat_sys_s390_sync_file_range
 308  common	tee			sys_tee				compat_sys_tee
-309  common	vmsplice		sys_vmsplice			compat_sys_vmsplice
+309  common	vmsplice		sys_vmsplice			sys_vmsplice
 310  common	move_pages		sys_move_pages			compat_sys_move_pages
 311  common	getcpu			sys_getcpu			compat_sys_getcpu
 312  common	epoll_pwait		sys_epoll_pwait			compat_sys_epoll_pwait
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index b1e59957c5c51c..642af919183de4 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -388,7 +388,7 @@
 529	x32	waitid			compat_sys_waitid
 530	x32	set_robust_list		compat_sys_set_robust_list
 531	x32	get_robust_list		compat_sys_get_robust_list
-532	x32	vmsplice		compat_sys_vmsplice
+532	x32	vmsplice		sys_vmsplice
 533	x32	move_pages		compat_sys_move_pages
 534	x32	preadv			compat_sys_preadv64
 535	x32	pwritev			compat_sys_pwritev64
-- 
2.28.0


^ permalink raw reply related

* [PATCH 07/11] fs: remove various compat readv/writev helpers
From: Christoph Hellwig @ 2020-09-21 14:34 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, io-uring, linux-arm-kernel, Jens Axboe, linux-parisc,
	netdev, linux-kernel, linux-security-module, David Laight,
	linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200921143434.707844-1-hch@lst.de>

Now that import_iovec handles compat iovecs as well, all the duplicated
code in the compat readv/writev helpers is not needed.  Remove them
and switch the compat syscall handlers to use the native helpers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/read_write.c | 179 ++++++++----------------------------------------
 1 file changed, 30 insertions(+), 149 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 0a68037580b455..eab427b7cc0a3f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1068,226 +1068,107 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
 	return do_pwritev(fd, vec, vlen, pos, flags);
 }
 
+/*
+ * Various compat syscalls.  Note that they all pretend to take a native
+ * iovec - import_iovec will properly treat those as compat_iovecs based on
+ * in_compat_syscall().
+ */
 #ifdef CONFIG_COMPAT
-static size_t compat_readv(struct file *file,
-			   const struct compat_iovec __user *vec,
-			   unsigned long vlen, loff_t *pos, rwf_t flags)
-{
-	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov = iovstack;
-	struct iov_iter iter;
-	ssize_t ret;
-
-	ret = import_iovec(READ, (const struct iovec __user *)vec, vlen,
-			   UIO_FASTIOV, &iov, &iter);
-	if (ret >= 0) {
-		ret = do_iter_read(file, &iter, pos, flags);
-		kfree(iov);
-	}
-	if (ret > 0)
-		add_rchar(current, ret);
-	inc_syscr(current);
-	return ret;
-}
-
-static size_t do_compat_readv(compat_ulong_t fd,
-				 const struct compat_iovec __user *vec,
-				 compat_ulong_t vlen, rwf_t flags)
-{
-	struct fd f = fdget_pos(fd);
-	ssize_t ret;
-	loff_t pos;
-
-	if (!f.file)
-		return -EBADF;
-	pos = f.file->f_pos;
-	ret = compat_readv(f.file, vec, vlen, &pos, flags);
-	if (ret >= 0)
-		f.file->f_pos = pos;
-	fdput_pos(f);
-	return ret;
-
-}
-
 COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *, vec,
 		compat_ulong_t, vlen)
 {
-	return do_compat_readv(fd, vec, vlen, 0);
-}
-
-static long do_compat_preadv64(unsigned long fd,
-				  const struct compat_iovec __user *vec,
-				  unsigned long vlen, loff_t pos, rwf_t flags)
-{
-	struct fd f;
-	ssize_t ret;
-
-	if (pos < 0)
-		return -EINVAL;
-	f = fdget(fd);
-	if (!f.file)
-		return -EBADF;
-	ret = -ESPIPE;
-	if (f.file->f_mode & FMODE_PREAD)
-		ret = compat_readv(f.file, vec, vlen, &pos, flags);
-	fdput(f);
-	return ret;
+	return do_readv(fd, vec, vlen, 0);
 }
 
 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *, vec,
 		unsigned long, vlen, loff_t, pos)
 {
-	return do_compat_preadv64(fd, vec, vlen, pos, 0);
+	return do_preadv(fd, vec, vlen, pos, 0);
 }
 #endif
 
 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *, vec,
 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 
-	return do_compat_preadv64(fd, vec, vlen, pos, 0);
+	return do_preadv(fd, vec, vlen, pos, 0);
 }
 
 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *, vec,
 		unsigned long, vlen, loff_t, pos, rwf_t, flags)
 {
 	if (pos == -1)
-		return do_compat_readv(fd, vec, vlen, flags);
-
-	return do_compat_preadv64(fd, vec, vlen, pos, flags);
+		return do_readv(fd, vec, vlen, flags);
+	return do_preadv(fd, vec, vlen, pos, flags);
 }
 #endif
 
 COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *, vec,
 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
 		rwf_t, flags)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 
 	if (pos == -1)
-		return do_compat_readv(fd, vec, vlen, flags);
-
-	return do_compat_preadv64(fd, vec, vlen, pos, flags);
-}
-
-static size_t compat_writev(struct file *file,
-			    const struct compat_iovec __user *vec,
-			    unsigned long vlen, loff_t *pos, rwf_t flags)
-{
-	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov = iovstack;
-	struct iov_iter iter;
-	ssize_t ret;
-
-	ret = import_iovec(WRITE, (const struct iovec __user *)vec, vlen,
-			   UIO_FASTIOV, &iov, &iter);
-	if (ret >= 0) {
-		file_start_write(file);
-		ret = do_iter_write(file, &iter, pos, flags);
-		file_end_write(file);
-		kfree(iov);
-	}
-	if (ret > 0)
-		add_wchar(current, ret);
-	inc_syscw(current);
-	return ret;
-}
-
-static size_t do_compat_writev(compat_ulong_t fd,
-				  const struct compat_iovec __user* vec,
-				  compat_ulong_t vlen, rwf_t flags)
-{
-	struct fd f = fdget_pos(fd);
-	ssize_t ret;
-	loff_t pos;
-
-	if (!f.file)
-		return -EBADF;
-	pos = f.file->f_pos;
-	ret = compat_writev(f.file, vec, vlen, &pos, flags);
-	if (ret >= 0)
-		f.file->f_pos = pos;
-	fdput_pos(f);
-	return ret;
+		return do_readv(fd, vec, vlen, flags);
+	return do_preadv(fd, vec, vlen, pos, flags);
 }
 
 COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
-		const struct compat_iovec __user *, vec,
+		const struct iovec __user *, vec,
 		compat_ulong_t, vlen)
 {
-	return do_compat_writev(fd, vec, vlen, 0);
-}
-
-static long do_compat_pwritev64(unsigned long fd,
-				   const struct compat_iovec __user *vec,
-				   unsigned long vlen, loff_t pos, rwf_t flags)
-{
-	struct fd f;
-	ssize_t ret;
-
-	if (pos < 0)
-		return -EINVAL;
-	f = fdget(fd);
-	if (!f.file)
-		return -EBADF;
-	ret = -ESPIPE;
-	if (f.file->f_mode & FMODE_PWRITE)
-		ret = compat_writev(f.file, vec, vlen, &pos, flags);
-	fdput(f);
-	return ret;
+	return do_writev(fd, vec, vlen, 0);
 }
 
 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *, vec,
 		unsigned long, vlen, loff_t, pos)
 {
-	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+	return do_pwritev(fd, vec, vlen, pos, 0);
 }
 #endif
 
 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *,vec,
 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 
-	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+	return do_pwritev(fd, vec, vlen, pos, 0);
 }
 
 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *, vec,
 		unsigned long, vlen, loff_t, pos, rwf_t, flags)
 {
 	if (pos == -1)
-		return do_compat_writev(fd, vec, vlen, flags);
-
-	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+		return do_writev(fd, vec, vlen, flags);
+	return do_pwritev(fd, vec, vlen, pos, flags);
 }
 #endif
 
 COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
-		const struct compat_iovec __user *,vec,
+		const struct iovec __user *,vec,
 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 
 	if (pos == -1)
-		return do_compat_writev(fd, vec, vlen, flags);
-
-	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+		return do_writev(fd, vec, vlen, flags);
+	return do_pwritev(fd, vec, vlen, pos, flags);
 }
-
-#endif
+#endif /* CONFIG_COMPAT */
 
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		  	   size_t count, loff_t max)
-- 
2.28.0


^ permalink raw reply related

* [PATCH 11/11] security/keys: remove compat_keyctl_instantiate_key_iov
From: Christoph Hellwig @ 2020-09-21 14:34 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, io-uring, linux-arm-kernel, Jens Axboe, linux-parisc,
	netdev, linux-kernel, linux-security-module, David Laight,
	linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200921143434.707844-1-hch@lst.de>

Now that import_iovec handles compat iovecs, the native version of
keyctl_instantiate_key_iov can be used for the compat case as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 security/keys/compat.c   | 36 ++----------------------------------
 security/keys/internal.h |  5 -----
 security/keys/keyctl.c   |  2 +-
 3 files changed, 3 insertions(+), 40 deletions(-)

diff --git a/security/keys/compat.c b/security/keys/compat.c
index 7ae531db031cf8..1545efdca56227 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -11,38 +11,6 @@
 #include <linux/slab.h>
 #include "internal.h"
 
-/*
- * Instantiate a key with the specified compatibility multipart payload and
- * link the key into the destination keyring if one is given.
- *
- * The caller must have the appropriate instantiation permit set for this to
- * work (see keyctl_assume_authority).  No other permissions are required.
- *
- * If successful, 0 will be returned.
- */
-static long compat_keyctl_instantiate_key_iov(
-	key_serial_t id,
-	const struct compat_iovec __user *_payload_iov,
-	unsigned ioc,
-	key_serial_t ringid)
-{
-	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
-	struct iov_iter from;
-	long ret;
-
-	if (!_payload_iov)
-		ioc = 0;
-
-	ret = import_iovec(WRITE, (const struct iovec __user *)_payload_iov,
-			   ioc, ARRAY_SIZE(iovstack), &iov, &from);
-	if (ret < 0)
-		return ret;
-
-	ret = keyctl_instantiate_key_common(id, &from, ringid);
-	kfree(iov);
-	return ret;
-}
-
 /*
  * The key control system call, 32-bit compatibility version for 64-bit archs
  */
@@ -113,8 +81,8 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option,
 		return keyctl_reject_key(arg2, arg3, arg4, arg5);
 
 	case KEYCTL_INSTANTIATE_IOV:
-		return compat_keyctl_instantiate_key_iov(
-			arg2, compat_ptr(arg3), arg4, arg5);
+		return keyctl_instantiate_key_iov(arg2, compat_ptr(arg3), arg4,
+						  arg5);
 
 	case KEYCTL_INVALIDATE:
 		return keyctl_invalidate_key(arg2);
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 338a526cbfa516..9b9cf3b6fcbb4d 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -262,11 +262,6 @@ extern long keyctl_instantiate_key_iov(key_serial_t,
 				       const struct iovec __user *,
 				       unsigned, key_serial_t);
 extern long keyctl_invalidate_key(key_serial_t);
-
-struct iov_iter;
-extern long keyctl_instantiate_key_common(key_serial_t,
-					  struct iov_iter *,
-					  key_serial_t);
 extern long keyctl_restrict_keyring(key_serial_t id,
 				    const char __user *_type,
 				    const char __user *_restriction);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 9febd37a168fd0..e26bbccda7ccee 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1164,7 +1164,7 @@ static int keyctl_change_reqkey_auth(struct key *key)
  *
  * If successful, 0 will be returned.
  */
-long keyctl_instantiate_key_common(key_serial_t id,
+static long keyctl_instantiate_key_common(key_serial_t id,
 				   struct iov_iter *from,
 				   key_serial_t ringid)
 {
-- 
2.28.0


^ permalink raw reply related

* [PATCH 10/11] mm: remove compat_process_vm_{readv,writev}
From: Christoph Hellwig @ 2020-09-21 14:34 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, io-uring, linux-arm-kernel, Jens Axboe, linux-parisc,
	netdev, linux-kernel, linux-security-module, David Laight,
	linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200921143434.707844-1-hch@lst.de>

Now that import_iovec handles compat iovecs, the native syscalls
can be used for the compat case as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/arm64/include/asm/unistd32.h             |  4 +-
 arch/mips/kernel/syscalls/syscall_n32.tbl     |  4 +-
 arch/mips/kernel/syscalls/syscall_o32.tbl     |  4 +-
 arch/parisc/kernel/syscalls/syscall.tbl       |  4 +-
 arch/powerpc/kernel/syscalls/syscall.tbl      |  4 +-
 arch/s390/kernel/syscalls/syscall.tbl         |  4 +-
 arch/sparc/kernel/syscalls/syscall.tbl        |  4 +-
 arch/x86/entry/syscall_x32.c                  |  2 +
 arch/x86/entry/syscalls/syscall_32.tbl        |  4 +-
 arch/x86/entry/syscalls/syscall_64.tbl        |  4 +-
 include/linux/compat.h                        |  8 ---
 include/uapi/asm-generic/unistd.h             |  6 +-
 mm/process_vm_access.c                        | 71 -------------------
 tools/include/uapi/asm-generic/unistd.h       |  6 +-
 .../arch/powerpc/entry/syscalls/syscall.tbl   |  4 +-
 .../perf/arch/s390/entry/syscalls/syscall.tbl |  4 +-
 .../arch/x86/entry/syscalls/syscall_64.tbl    |  4 +-
 17 files changed, 30 insertions(+), 111 deletions(-)

diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 11dfae3a8563bd..0c280a05f699bf 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -763,9 +763,9 @@ __SYSCALL(__NR_sendmmsg, compat_sys_sendmmsg)
 #define __NR_setns 375
 __SYSCALL(__NR_setns, sys_setns)
 #define __NR_process_vm_readv 376
-__SYSCALL(__NR_process_vm_readv, compat_sys_process_vm_readv)
+__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
 #define __NR_process_vm_writev 377
-__SYSCALL(__NR_process_vm_writev, compat_sys_process_vm_writev)
+__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
 #define __NR_kcmp 378
 __SYSCALL(__NR_kcmp, sys_kcmp)
 #define __NR_finit_module 379
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 5a39d4de0ac85b..0bc2e0fcf1ee56 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -317,8 +317,8 @@
 306	n32	syncfs				sys_syncfs
 307	n32	sendmmsg			compat_sys_sendmmsg
 308	n32	setns				sys_setns
-309	n32	process_vm_readv		compat_sys_process_vm_readv
-310	n32	process_vm_writev		compat_sys_process_vm_writev
+309	n32	process_vm_readv		sys_process_vm_readv
+310	n32	process_vm_writev		sys_process_vm_writev
 311	n32	kcmp				sys_kcmp
 312	n32	finit_module			sys_finit_module
 313	n32	sched_setattr			sys_sched_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 136efc6b8c5444..b408c13b934296 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -356,8 +356,8 @@
 342	o32	syncfs				sys_syncfs
 343	o32	sendmmsg			sys_sendmmsg			compat_sys_sendmmsg
 344	o32	setns				sys_setns
-345	o32	process_vm_readv		sys_process_vm_readv		compat_sys_process_vm_readv
-346	o32	process_vm_writev		sys_process_vm_writev		compat_sys_process_vm_writev
+345	o32	process_vm_readv		sys_process_vm_readv
+346	o32	process_vm_writev		sys_process_vm_writev
 347	o32	kcmp				sys_kcmp
 348	o32	finit_module			sys_finit_module
 349	o32	sched_setattr			sys_sched_setattr
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index a9e184192caedd..2015a5124b78ad 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -372,8 +372,8 @@
 327	common	syncfs			sys_syncfs
 328	common	setns			sys_setns
 329	common	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
-330	common	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
-331	common	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+330	common	process_vm_readv	sys_process_vm_readv
+331	common	process_vm_writev	sys_process_vm_writev
 332	common	kcmp			sys_kcmp
 333	common	finit_module		sys_finit_module
 334	common	sched_setattr		sys_sched_setattr
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 0d4985919ca34d..66a472aa635d3f 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -449,8 +449,8 @@
 348	common	syncfs				sys_syncfs
 349	common	sendmmsg			sys_sendmmsg			compat_sys_sendmmsg
 350	common	setns				sys_setns
-351	nospu	process_vm_readv		sys_process_vm_readv		compat_sys_process_vm_readv
-352	nospu	process_vm_writev		sys_process_vm_writev		compat_sys_process_vm_writev
+351	nospu	process_vm_readv		sys_process_vm_readv
+352	nospu	process_vm_writev		sys_process_vm_writev
 353	nospu	finit_module			sys_finit_module
 354	nospu	kcmp				sys_kcmp
 355	common	sched_setattr			sys_sched_setattr
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index b5495a42814bd1..7485867a490bb2 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -347,8 +347,8 @@
 337  common	clock_adjtime		sys_clock_adjtime		sys_clock_adjtime32
 338  common	syncfs			sys_syncfs			sys_syncfs
 339  common	setns			sys_setns			sys_setns
-340  common	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
-341  common	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+340  common	process_vm_readv	sys_process_vm_readv		sys_process_vm_readv
+341  common	process_vm_writev	sys_process_vm_writev		sys_process_vm_writev
 342  common	s390_runtime_instr	sys_s390_runtime_instr		sys_s390_runtime_instr
 343  common	kcmp			sys_kcmp			sys_kcmp
 344  common	finit_module		sys_finit_module		sys_finit_module
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index f1810c1a35caa5..4a9365b2e340b2 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -406,8 +406,8 @@
 335	common	syncfs			sys_syncfs
 336	common	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
 337	common	setns			sys_setns
-338	common	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
-339	common	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+338	common	process_vm_readv	sys_process_vm_readv
+339	common	process_vm_writev	sys_process_vm_writev
 340	32	kern_features		sys_ni_syscall			sys_kern_features
 340	64	kern_features		sys_kern_features
 341	common	kcmp			sys_kcmp
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index a4840b9d50ad14..f2fe0a33bcfdd5 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -17,6 +17,8 @@
 #define __x32_sys_getsockopt	__x64_sys_getsockopt
 #define __x32_sys_setsockopt	__x64_sys_setsockopt
 #define __x32_sys_vmsplice	__x64_sys_vmsplice
+#define __x32_sys_process_vm_readv	__x64_sys_process_vm_readv
+#define __x32_sys_process_vm_writev	__x64_sys_process_vm_writev
 
 #define __SYSCALL_64(nr, sym)
 
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 0fb2f172581e51..5fbe10ad8a23fc 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -358,8 +358,8 @@
 344	i386	syncfs			sys_syncfs
 345	i386	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
 346	i386	setns			sys_setns
-347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
-348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+347	i386	process_vm_readv	sys_process_vm_readv
+348	i386	process_vm_writev	sys_process_vm_writev
 349	i386	kcmp			sys_kcmp
 350	i386	finit_module		sys_finit_module
 351	i386	sched_setattr		sys_sched_setattr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 642af919183de4..347809649ba28f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -395,8 +395,8 @@
 536	x32	rt_tgsigqueueinfo	compat_sys_rt_tgsigqueueinfo
 537	x32	recvmmsg		compat_sys_recvmmsg_time64
 538	x32	sendmmsg		compat_sys_sendmmsg
-539	x32	process_vm_readv	compat_sys_process_vm_readv
-540	x32	process_vm_writev	compat_sys_process_vm_writev
+539	x32	process_vm_readv	sys_process_vm_readv
+540	x32	process_vm_writev	sys_process_vm_writev
 541	x32	setsockopt		sys_setsockopt
 542	x32	getsockopt		sys_getsockopt
 543	x32	io_setup		compat_sys_io_setup
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 14d8bf412bd02e..d46e5905970817 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -780,14 +780,6 @@ asmlinkage long compat_sys_open_by_handle_at(int mountdirfd,
 					     int flags);
 asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
 				    unsigned vlen, unsigned int flags);
-asmlinkage ssize_t compat_sys_process_vm_readv(compat_pid_t pid,
-		const struct compat_iovec __user *lvec,
-		compat_ulong_t liovcnt, const struct compat_iovec __user *rvec,
-		compat_ulong_t riovcnt, compat_ulong_t flags);
-asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid,
-		const struct compat_iovec __user *lvec,
-		compat_ulong_t liovcnt, const struct compat_iovec __user *rvec,
-		compat_ulong_t riovcnt, compat_ulong_t flags);
 asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
 		     const compat_uptr_t __user *argv,
 		     const compat_uptr_t __user *envp, int flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f2dcb0d5703014..c1dfe99c9c3f70 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -727,11 +727,9 @@ __SYSCALL(__NR_setns, sys_setns)
 #define __NR_sendmmsg 269
 __SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg)
 #define __NR_process_vm_readv 270
-__SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \
-          compat_sys_process_vm_readv)
+__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
 #define __NR_process_vm_writev 271
-__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
-          compat_sys_process_vm_writev)
+__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
 #define __NR_kcmp 272
 __SYSCALL(__NR_kcmp, sys_kcmp)
 #define __NR_finit_module 273
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index b759ed264840d8..a5f49ae45643fc 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -14,10 +14,6 @@
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 
-#ifdef CONFIG_COMPAT
-#include <linux/compat.h>
-#endif
-
 /**
  * process_vm_rw_pages - read/write pages from task specified
  * @pages: array of pointers to pages we want to copy
@@ -309,70 +305,3 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
 {
 	return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
 }
-
-#ifdef CONFIG_COMPAT
-
-static ssize_t
-compat_process_vm_rw(compat_pid_t pid,
-		     const struct compat_iovec __user *lvec,
-		     unsigned long liovcnt,
-		     const struct compat_iovec __user *rvec,
-		     unsigned long riovcnt,
-		     unsigned long flags, int vm_write)
-{
-	struct iovec iovstack_l[UIO_FASTIOV];
-	struct iovec iovstack_r[UIO_FASTIOV];
-	struct iovec *iov_l = iovstack_l;
-	struct iovec *iov_r = iovstack_r;
-	struct iov_iter iter_l, iter_r;
-	ssize_t rc = -EFAULT;
-	int dir = vm_write ? WRITE : READ;
-
-	if (flags != 0)
-		return -EINVAL;
-
-	rc = import_iovec(dir, (const struct iovec __user *)iov_l, liovcnt,
-			  UIO_FASTIOV, &iov_l, &iter_l);
-	if (rc < 0)
-		return rc;
-	if (!iov_iter_count(&iter_l))
-		goto free_iovecs;
-	rc = import_iovec(CHECK_IOVEC_ONLY, iov_r, riovcnt, UIO_FASTIOV, &iov_r,
-			  &iter_r);
-	if (rc <= 0)
-		goto free_iovecs;
-
-	rc = process_vm_rw_core(pid, &iter_l, iter_r.iov, iter_r.nr_segs,
-				flags, vm_write);
-
-free_iovecs:
-	if (iov_r != iovstack_r)
-		kfree(iov_r);
-	if (iov_l != iovstack_l)
-		kfree(iov_l);
-	return rc;
-}
-
-COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid,
-		       const struct compat_iovec __user *, lvec,
-		       compat_ulong_t, liovcnt,
-		       const struct compat_iovec __user *, rvec,
-		       compat_ulong_t, riovcnt,
-		       compat_ulong_t, flags)
-{
-	return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
-				    riovcnt, flags, 0);
-}
-
-COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid,
-		       const struct compat_iovec __user *, lvec,
-		       compat_ulong_t, liovcnt,
-		       const struct compat_iovec __user *, rvec,
-		       compat_ulong_t, riovcnt,
-		       compat_ulong_t, flags)
-{
-	return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
-				    riovcnt, flags, 1);
-}
-
-#endif
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index f2dcb0d5703014..c1dfe99c9c3f70 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -727,11 +727,9 @@ __SYSCALL(__NR_setns, sys_setns)
 #define __NR_sendmmsg 269
 __SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg)
 #define __NR_process_vm_readv 270
-__SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \
-          compat_sys_process_vm_readv)
+__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
 #define __NR_process_vm_writev 271
-__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
-          compat_sys_process_vm_writev)
+__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
 #define __NR_kcmp 272
 __SYSCALL(__NR_kcmp, sys_kcmp)
 #define __NR_finit_module 273
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 26f0347c15118b..a188f053cbf90a 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -443,8 +443,8 @@
 348	common	syncfs				sys_syncfs
 349	common	sendmmsg			sys_sendmmsg			compat_sys_sendmmsg
 350	common	setns				sys_setns
-351	nospu	process_vm_readv		sys_process_vm_readv		compat_sys_process_vm_readv
-352	nospu	process_vm_writev		sys_process_vm_writev		compat_sys_process_vm_writev
+351	nospu	process_vm_readv		sys_process_vm_readv
+352	nospu	process_vm_writev		sys_process_vm_writev
 353	nospu	finit_module			sys_finit_module
 354	nospu	kcmp				sys_kcmp
 355	common	sched_setattr			sys_sched_setattr
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index 02ad81f69bb7e3..c44c83032c3a04 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -347,8 +347,8 @@
 337  common	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
 338  common	syncfs			sys_syncfs			sys_syncfs
 339  common	setns			sys_setns			sys_setns
-340  common	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
-341  common	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+340  common	process_vm_readv	sys_process_vm_readv		sys_process_vm_readv
+341  common	process_vm_writev	sys_process_vm_writev		sys_process_vm_writev
 342  common	s390_runtime_instr	sys_s390_runtime_instr		sys_s390_runtime_instr
 343  common	kcmp			sys_kcmp			compat_sys_kcmp
 344  common	finit_module		sys_finit_module		compat_sys_finit_module
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index 642af919183de4..347809649ba28f 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -395,8 +395,8 @@
 536	x32	rt_tgsigqueueinfo	compat_sys_rt_tgsigqueueinfo
 537	x32	recvmmsg		compat_sys_recvmmsg_time64
 538	x32	sendmmsg		compat_sys_sendmmsg
-539	x32	process_vm_readv	compat_sys_process_vm_readv
-540	x32	process_vm_writev	compat_sys_process_vm_writev
+539	x32	process_vm_readv	sys_process_vm_readv
+540	x32	process_vm_writev	sys_process_vm_writev
 541	x32	setsockopt		sys_setsockopt
 542	x32	getsockopt		sys_getsockopt
 543	x32	io_setup		compat_sys_io_setup
-- 
2.28.0


^ permalink raw reply related

* Re: [PATCH 02/11] mm: call import_iovec() instead of rw_copy_check_uvector() in process_vm_rw()
From: Matthew Wilcox @ 2020-09-21 14:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, Alexander Viro, io-uring, linux-arm-kernel,
	Jens Axboe, linux-parisc, netdev, linux-kernel,
	linux-security-module, David Laight, linux-fsdevel, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20200921143434.707844-3-hch@lst.de>

On Mon, Sep 21, 2020 at 04:34:25PM +0200, Christoph Hellwig wrote:
>  {
> -	WARN_ON(direction & ~(READ | WRITE));
> +	WARN_ON(direction & ~(READ | WRITE | CHECK_IOVEC_ONLY));

This is now a no-op because:

include/linux/fs.h:#define CHECK_IOVEC_ONLY -1

I'd suggest we renumber it to 2?

(READ is 0, WRITE is 1.  This WARN_ON should probably be
	WARN_ON(direction > CHECK_IOVEC_ONLY)

^ permalink raw reply

* Re: [PATCH 02/11] mm: call import_iovec() instead of rw_copy_check_uvector() in process_vm_rw()
From: Al Viro @ 2020-09-21 15:02 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, io-uring, linux-arm-kernel, Jens Axboe, linux-parisc,
	netdev, linux-kernel, linux-security-module, David Laight,
	linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200921143434.707844-3-hch@lst.de>

On Mon, Sep 21, 2020 at 04:34:25PM +0200, Christoph Hellwig wrote:
> From: David Laight <David.Laight@ACULAB.COM>
> 
> This is the only direct call of rw_copy_check_uvector().  Removing it
> will allow rw_copy_check_uvector() to be inlined into import_iovec(),
> while only paying a minor price by setting up an otherwise unused
> iov_iter in the process_vm_readv/process_vm_writev syscalls that aren't
> in a super hot path.

> @@ -443,7 +443,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
>  			const struct iovec *iov, unsigned long nr_segs,
>  			size_t count)
>  {
> -	WARN_ON(direction & ~(READ | WRITE));
> +	WARN_ON(direction & ~(READ | WRITE | CHECK_IOVEC_ONLY));
>  	direction &= READ | WRITE;

Ugh...

> -	rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
> -				   iovstack_r, &iov_r);
> +	rc = import_iovec(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, &iov_r,
> +			  &iter_r);
>  	if (rc <= 0)
>  		goto free_iovecs;
>  
> -	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
> +	rc = process_vm_rw_core(pid, &iter_l, iter_r.iov, iter_r.nr_segs,
> +				flags, vm_write);

... and ugh^2, since now you are not only setting a meaningless iov_iter,
you are creating a new place that pokes directly into struct iov_iter
guts.

Sure, moving rw_copy_check_uvector() over to lib/iov_iter.c makes sense.
But I would rather split the access_ok()-related checks out of that thing
and bury CHECK_IOVEC_ONLY.

Step 1: move the damn thing to lib/iov_iter.c (same as you do, but without
making it static)

Step 2: split it in two:

ssize_t rw_copy_check_uvector(const struct iovec __user * uvector,
                              unsigned long nr_segs, unsigned long fast_segs,
                              struct iovec *fast_pointer,
                              struct iovec **ret_pointer)
{
	unsigned long seg;
	ssize_t ret;
	struct iovec *iov = fast_pointer;

	*ret_pointer = fast_pointer;
	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return 0;

	/*
	 * First get the "struct iovec" from user memory and
	 * verify all the pointers
	 */
	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return -ENOMEM;
		*ret_pointer = iov;
	}
	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector)))
		return -EFAULT;

	/*
	 * According to the Single Unix Specification we should return EINVAL
	 * if an element length is < 0 when cast to ssize_t or if the
	 * total length would overflow the ssize_t return value of the
	 * system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	ret = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		void __user *buf = iov[seg].iov_base;
		ssize_t len = (ssize_t)iov[seg].iov_len;

		/* see if we we're about to use an invalid len or if
		 * it's about to overflow ssize_t */
		if (len < 0)
			return -EINVAL;
		if (len > MAX_RW_COUNT - ret) {
			len = MAX_RW_COUNT - ret;
			iov[seg].iov_len = len;
		}
		ret += len;
	}
	return ret;
}

/*
 *  This is merely an early sanity check; we do _not_ rely upon
 *  it when we get to the actual memory accesses.
 */
static bool check_iovecs(const struct iovec *iov, int nr_segs)
{
        for (seg = 0; seg < nr_segs; seg++) {
                void __user *buf = iov[seg].iov_base;
                ssize_t len = (ssize_t)iov[seg].iov_len;

                if (unlikely(!access_ok(buf, len)))
                        return false;
        }
	return true;
}

ssize_t import_iovec(int type, const struct iovec __user * uvector,
                 unsigned nr_segs, unsigned fast_segs,
                 struct iovec **iov, struct iov_iter *i)
{
	struct iovec *p;
	ssize_t n;

	n = rw_copy_check_uvector(uvector, nr_segs, fast_segs, *iov, &p);
	if (n > 0 && !check_iovecs(p, nr_segs))
		n = -EFAULT;
	if (n < 0) {
		if (p != *iov)
			kfree(p);
		*iov = NULL;
		return n;
	}
	iov_iter_init(i, type, p, nr_segs, n);
	*iov = p == *iov ? NULL : p;
	return n;
}

kill CHECK_IOVEC_ONLY and use rw_copy_check_uvector() without the type
argument in mm/process_vm_access.c

Saner that way, IMO...

^ permalink raw reply

* RE: [PATCH 04/11] iov_iter: explicitly check for CHECK_IOVEC_ONLY in rw_copy_check_uvector
From: David Laight @ 2020-09-21 15:05 UTC (permalink / raw)
  To: 'Christoph Hellwig', Alexander Viro
  Cc: Jens Axboe, linux-s390@vger.kernel.org,
	linux-arch@vger.kernel.org, linux-parisc@vger.kernel.org,
	Arnd Bergmann, linux-aio@kvack.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	David Howells, linux-fsdevel@vger.kernel.org,
	linux-security-module@vger.kernel.org, keyrings@vger.kernel.org,
	linux-block@vger.kernel.org, linux-mm@kvack.org,
	linux-scsi@vger.kernel.org, sparclinux@vger.kernel.org,
	Andrew Morton, linuxppc-dev@lists.ozlabs.org,
	io-uring@vger.kernel.org, linux-arm-kernel@lists.infradead.org
In-Reply-To: <20200921143434.707844-5-hch@lst.de>

From: Christoph Hellwig
> Sent: 21 September 2020 15:34
> 
> Explicitly check for the magic value insted of implicitly relying on
> its numeric representation.   Also drop the rather pointless unlikely
> annotation.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  lib/iov_iter.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index d7e72343c360eb..a64867501a7483 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -1709,8 +1709,7 @@ static ssize_t rw_copy_check_uvector(int type,
>  			ret = -EINVAL;
>  			goto out;
>  		}
> -		if (type >= 0
> -		    && unlikely(!access_ok(buf, len))) {
> +		if (type != CHECK_IOVEC_ONLY && !access_ok(buf, len)) {
>  			ret = -EFAULT;
>  			goto out;
>  		}
> @@ -1824,7 +1823,7 @@ static ssize_t compat_rw_copy_check_uvector(int type,
>  		}
>  		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
>  			goto out;
> -		if (type >= 0 &&
> +		if (type != CHECK_IOVEC_ONLY &&
>  		    !access_ok(compat_ptr(buf), len)) {
>  			ret = -EFAULT;
>  			goto out;
> --
> 2.28.0

I've actually no idea:
1) Why there is an access_ok() check here.
   It will be repeated by the user copy functions.
2) Why it isn't done when called from mm/process_vm_access.c.
   Ok, the addresses refer to a different process, but they
   must still be valid user addresses.

Is 2 a legacy from when access_ok() actually checked that the
addresses were mapped into the process's address space?

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply

* Re: [PATCH 04/11] iov_iter: explicitly check for CHECK_IOVEC_ONLY in rw_copy_check_uvector
From: Al Viro @ 2020-09-21 15:07 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, io-uring, linux-arm-kernel, Jens Axboe, linux-parisc,
	netdev, linux-kernel, linux-security-module, David Laight,
	linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200921143434.707844-5-hch@lst.de>

On Mon, Sep 21, 2020 at 04:34:27PM +0200, Christoph Hellwig wrote:
> Explicitly check for the magic value insted of implicitly relying on
> its numeric representation.   Also drop the rather pointless unlikely
> annotation.

See above - I would rather have CHECK_IOVEC_ONLY gone.

The reason for doing these access_ok() in the same loop is fairly weak,
especially since you are checking type on each iteration.  Might as
well do that in a separate loop afterwards.

^ permalink raw reply

* Re: [PATCH 04/11] iov_iter: explicitly check for CHECK_IOVEC_ONLY in rw_copy_check_uvector
From: Al Viro @ 2020-09-21 15:11 UTC (permalink / raw)
  To: David Laight
  Cc: linux-aio@kvack.org, linux-mips@vger.kernel.org, David Howells,
	linux-mm@kvack.org, keyrings@vger.kernel.org,
	sparclinux@vger.kernel.org, 'Christoph Hellwig',
	linux-arch@vger.kernel.org, linux-s390@vger.kernel.org,
	linux-scsi@vger.kernel.org, Arnd Bergmann,
	linux-block@vger.kernel.org, io-uring@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, Jens Axboe,
	linux-parisc@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-security-module@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, Andrew Morton,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <7336624280b8444fb4cb00407317741b@AcuMS.aculab.com>

On Mon, Sep 21, 2020 at 03:05:32PM +0000, David Laight wrote:

> I've actually no idea:
> 1) Why there is an access_ok() check here.
>    It will be repeated by the user copy functions.

Early sanity check.

> 2) Why it isn't done when called from mm/process_vm_access.c.
>    Ok, the addresses refer to a different process, but they
>    must still be valid user addresses.
> 
> Is 2 a legacy from when access_ok() actually checked that the
> addresses were mapped into the process's address space?

It never did.  2 is for the situation when a 32bit process
accesses 64bit one; addresses that are perfectly legitimate
for 64bit userland (and fitting into the first 4Gb of address
space, so they can be represented by 32bit pointers just fine)
might be rejected by access_ok() if the caller is 32bit.

^ permalink raw reply

* Re: [PATCH 05/11] iov_iter: merge the compat case into rw_copy_check_uvector
From: Al Viro @ 2020-09-21 15:14 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, io-uring, linux-arm-kernel, Jens Axboe, linux-parisc,
	netdev, linux-kernel, linux-security-module, David Laight,
	linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200921143434.707844-6-hch@lst.de>

On Mon, Sep 21, 2020 at 04:34:28PM +0200, Christoph Hellwig wrote:
> +static int compat_copy_iovecs_from_user(struct iovec *iov,
> +		const struct iovec __user *uvector, unsigned long nr_segs)
> +{
> +	const struct compat_iovec __user *uiov =
> +		(const struct compat_iovec __user *)uvector;
> +	unsigned long i;
	         ^^^^
Huh?

^ permalink raw reply

* Re: [PATCH 06/11] iov_iter: handle the compat case in import_iovec
From: Al Viro @ 2020-09-21 15:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, linux-arch, linux-s390, linux-scsi, Arnd Bergmann,
	linux-block, io-uring, linux-arm-kernel, Jens Axboe, linux-parisc,
	netdev, linux-kernel, linux-security-module, David Laight,
	linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200921143434.707844-7-hch@lst.de>

On Mon, Sep 21, 2020 at 04:34:29PM +0200, Christoph Hellwig wrote:
> Use in compat_syscall to import either native or the compat iovecs, and
> remove the now superflous compat_import_iovec, which removes the need for
> special compat logic in most callers.  Only io_uring needs special
> treatment given that it can call import_iovec from kernel threads acting
> on behalf of native or compat syscalls.  Expose the low-level
> __import_iovec helper and use it in io_uring to explicitly pick a iovec
> layout.

fs/aio.c part is not obvious...  Might be better to use __import_iovec()
there as well and leave the rest of aio.c changes to followup.

> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -1683,7 +1683,7 @@ static int compat_copy_iovecs_from_user(struct iovec *iov,
>  	return ret;
>  }
>  
> -static ssize_t __import_iovec(int type, const struct iovec __user *uvector,
> +ssize_t __import_iovec(int type, const struct iovec __user *uvector,
>  		unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
>  		struct iov_iter *i, bool compat)
>  {

Don't make it static in the first place, perhaps?

^ permalink raw reply

* RE: [PATCH 02/11] mm: call import_iovec() instead of rw_copy_check_uvector() in process_vm_rw()
From: David Laight @ 2020-09-21 15:21 UTC (permalink / raw)
  To: 'Al Viro', Christoph Hellwig
  Cc: Jens Axboe, linux-s390@vger.kernel.org,
	linux-arch@vger.kernel.org, linux-parisc@vger.kernel.org,
	Arnd Bergmann, linux-aio@kvack.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mips@vger.kernel.org,
	David Howells, linux-fsdevel@vger.kernel.org,
	linux-security-module@vger.kernel.org, keyrings@vger.kernel.org,
	linux-block@vger.kernel.org, linux-mm@kvack.org,
	linux-scsi@vger.kernel.org, sparclinux@vger.kernel.org,
	Andrew Morton, linuxppc-dev@lists.ozlabs.org,
	io-uring@vger.kernel.org, linux-arm-kernel@lists.infradead.org
In-Reply-To: <20200921150211.GS3421308@ZenIV.linux.org.uk>

From: Al Viro
> Sent: 21 September 2020 16:02
> 
> On Mon, Sep 21, 2020 at 04:34:25PM +0200, Christoph Hellwig wrote:
> > From: David Laight <David.Laight@ACULAB.COM>
> >
> > This is the only direct call of rw_copy_check_uvector().  Removing it
> > will allow rw_copy_check_uvector() to be inlined into import_iovec(),
> > while only paying a minor price by setting up an otherwise unused
> > iov_iter in the process_vm_readv/process_vm_writev syscalls that aren't
> > in a super hot path.
> 
> > @@ -443,7 +443,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
> >  			const struct iovec *iov, unsigned long nr_segs,
> >  			size_t count)
> >  {
> > -	WARN_ON(direction & ~(READ | WRITE));
> > +	WARN_ON(direction & ~(READ | WRITE | CHECK_IOVEC_ONLY));
> >  	direction &= READ | WRITE;
> 
> Ugh...
> 
> > -	rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
> > -				   iovstack_r, &iov_r);
> > +	rc = import_iovec(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, &iov_r,
> > +			  &iter_r);
> >  	if (rc <= 0)
> >  		goto free_iovecs;
> >
> > -	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
> > +	rc = process_vm_rw_core(pid, &iter_l, iter_r.iov, iter_r.nr_segs,
> > +				flags, vm_write);
> 
> ... and ugh^2, since now you are not only setting a meaningless iov_iter,
> you are creating a new place that pokes directly into struct iov_iter
> guts.
> 
> Sure, moving rw_copy_check_uvector() over to lib/iov_iter.c makes sense.
> But I would rather split the access_ok()-related checks out of that thing
> and bury CHECK_IOVEC_ONLY.
> 
> Step 1: move the damn thing to lib/iov_iter.c (same as you do, but without
> making it static)
> 
> Step 2: split it in two:
> 
> ssize_t rw_copy_check_uvector(const struct iovec __user * uvector,
>                               unsigned long nr_segs, unsigned long fast_segs,
>                               struct iovec *fast_pointer,
>                               struct iovec **ret_pointer)
> {
> 	unsigned long seg;
...
> 	ret = 0;
> 	for (seg = 0; seg < nr_segs; seg++) {
> 		void __user *buf = iov[seg].iov_base;
> 		ssize_t len = (ssize_t)iov[seg].iov_len;
> 
> 		/* see if we we're about to use an invalid len or if
> 		 * it's about to overflow ssize_t */
> 		if (len < 0)
> 			return -EINVAL;
> 		if (len > MAX_RW_COUNT - ret) {
> 			len = MAX_RW_COUNT - ret;
> 			iov[seg].iov_len = len;
> 		}
> 		ret += len;
> 	}
> 	return ret;
> }
> 
> /*
>  *  This is merely an early sanity check; we do _not_ rely upon
>  *  it when we get to the actual memory accesses.
>  */
> static bool check_iovecs(const struct iovec *iov, int nr_segs)
> {
>         for (seg = 0; seg < nr_segs; seg++) {
>                 void __user *buf = iov[seg].iov_base;
>                 ssize_t len = (ssize_t)iov[seg].iov_len;
> 
>                 if (unlikely(!access_ok(buf, len)))
>                         return false;
>         }
> 	return true;
> }

You really don't want to be looping through the array twice.
In fact you don't really want to be doing all those tests at all.
This code makes a significant fraction of the not-insignificant
difference between the 'costs' of send() and sendmsg().

I think the 'length' check can be optimised to do something like:
	for (...) {
		ssize_t len = (ssize_t)iov[seg].iov_len;
		ret += len;
		len_hi += (unsigned long)len >> 20;
	}
	if (len_hi) {
		/* Something potentially odd in the lengths.
		 * Might just be a very long fragment.
		 * Check the individial values. */
		Add the exiting loop here.
	}

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply

* RE: [PATCH 04/11] iov_iter: explicitly check for CHECK_IOVEC_ONLY in rw_copy_check_uvector
From: David Laight @ 2020-09-21 15:26 UTC (permalink / raw)
  To: 'Al Viro'
  Cc: linux-aio@kvack.org, linux-mips@vger.kernel.org, David Howells,
	linux-mm@kvack.org, keyrings@vger.kernel.org,
	sparclinux@vger.kernel.org, 'Christoph Hellwig',
	linux-arch@vger.kernel.org, linux-s390@vger.kernel.org,
	linux-scsi@vger.kernel.org, Arnd Bergmann,
	linux-block@vger.kernel.org, io-uring@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, Jens Axboe,
	linux-parisc@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-security-module@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, Andrew Morton,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <20200921151119.GU3421308@ZenIV.linux.org.uk>

From: Al Viro
> Sent: 21 September 2020 16:11
> On Mon, Sep 21, 2020 at 03:05:32PM +0000, David Laight wrote:
> 
> > I've actually no idea:
> > 1) Why there is an access_ok() check here.
> >    It will be repeated by the user copy functions.
> 
> Early sanity check.
> 
> > 2) Why it isn't done when called from mm/process_vm_access.c.
> >    Ok, the addresses refer to a different process, but they
> >    must still be valid user addresses.
> >
> > Is 2 a legacy from when access_ok() actually checked that the
> > addresses were mapped into the process's address space?
> 
> It never did.  2 is for the situation when a 32bit process
> accesses 64bit one; addresses that are perfectly legitimate
> for 64bit userland (and fitting into the first 4Gb of address
> space, so they can be represented by 32bit pointers just fine)
> might be rejected by access_ok() if the caller is 32bit.

Can't 32 bit processes on a 64bit system access all the way to 4GB?
Mapping things by default above 3GB will probably break things.
But there is no reason to disallow explicit maps.
And in any case access_ok() can use the same limit as it does for
64bit processes - the page fault handler will sort it all out.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply

* Re: [PATCH 02/11] mm: call import_iovec() instead of rw_copy_check_uvector() in process_vm_rw()
From: Al Viro @ 2020-09-21 15:29 UTC (permalink / raw)
  To: David Laight
  Cc: linux-aio@kvack.org, linux-mips@vger.kernel.org, David Howells,
	linux-mm@kvack.org, keyrings@vger.kernel.org,
	sparclinux@vger.kernel.org, Christoph Hellwig,
	linux-arch@vger.kernel.org, linux-s390@vger.kernel.org,
	linux-scsi@vger.kernel.org, Arnd Bergmann,
	linux-block@vger.kernel.org, io-uring@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, Jens Axboe,
	linux-parisc@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-security-module@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, Andrew Morton,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <ef67787edb2f48548d69caaaff6997ba@AcuMS.aculab.com>

On Mon, Sep 21, 2020 at 03:21:35PM +0000, David Laight wrote:

> You really don't want to be looping through the array twice.

Profiles, please.

> I think the 'length' check can be optimised to do something like:
> 	for (...) {
> 		ssize_t len = (ssize_t)iov[seg].iov_len;
> 		ret += len;
> 		len_hi += (unsigned long)len >> 20;
> 	}
> 	if (len_hi) {
> 		/* Something potentially odd in the lengths.
> 		 * Might just be a very long fragment.
> 		 * Check the individial values. */
> 		Add the exiting loop here.
> 	}

Far too ugly to live.

^ permalink raw reply

* Re: [PATCH -next] ocxl: simplify the return expression of free_function_dev()
From: Frederic Barrat @ 2020-09-21 15:34 UTC (permalink / raw)
  To: Qinglang Miao, Andrew Donnellan, Arnd Bergmann,
	Greg Kroah-Hartman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200921131047.92526-1-miaoqinglang@huawei.com>



Le 21/09/2020 à 15:10, Qinglang Miao a écrit :
> Simplify the return expression.
> 
> Signed-off-by: Qinglang Miao <miaoqinglang@huawei.com>
> ---


Thanks!
Acked-by: Frederic Barrat <fbarrat@linux.ibm.com>


>   drivers/misc/ocxl/core.c | 7 +------
>   1 file changed, 1 insertion(+), 6 deletions(-)
> 
> diff --git a/drivers/misc/ocxl/core.c b/drivers/misc/ocxl/core.c
> index b7a09b21a..aebfc53a2 100644
> --- a/drivers/misc/ocxl/core.c
> +++ b/drivers/misc/ocxl/core.c
> @@ -327,14 +327,9 @@ static void free_function_dev(struct device *dev)
>   
>   static int set_function_device(struct ocxl_fn *fn, struct pci_dev *dev)
>   {
> -	int rc;
> -
>   	fn->dev.parent = &dev->dev;
>   	fn->dev.release = free_function_dev;
> -	rc = dev_set_name(&fn->dev, "ocxlfn.%s", dev_name(&dev->dev));
> -	if (rc)
> -		return rc;
> -	return 0;
> +	return dev_set_name(&fn->dev, "ocxlfn.%s", dev_name(&dev->dev));
>   }
>   
>   static int assign_function_actag(struct ocxl_fn *fn)
> 

^ permalink raw reply

* RE: [PATCH 02/11] mm: call import_iovec() instead of rw_copy_check_uvector() in process_vm_rw()
From: David Laight @ 2020-09-21 15:44 UTC (permalink / raw)
  To: 'Al Viro'
  Cc: linux-aio@kvack.org, linux-mips@vger.kernel.org, David Howells,
	linux-mm@kvack.org, keyrings@vger.kernel.org,
	sparclinux@vger.kernel.org, Christoph Hellwig,
	linux-arch@vger.kernel.org, linux-s390@vger.kernel.org,
	linux-scsi@vger.kernel.org, Arnd Bergmann,
	linux-block@vger.kernel.org, io-uring@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, Jens Axboe,
	linux-parisc@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-security-module@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, Andrew Morton,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <20200921152937.GX3421308@ZenIV.linux.org.uk>

From: Al Viro
> Sent: 21 September 2020 16:30
> 
> On Mon, Sep 21, 2020 at 03:21:35PM +0000, David Laight wrote:
> 
> > You really don't want to be looping through the array twice.
> 
> Profiles, please.

I did some profiling of send() v sendmsg() much earlier in the year.
I can't remember the exact details but the extra cost of sendmsg()
is far more than you might expect.
(I was timing sending fully built IPv4 UDP packets using a raw socket.)

About half the difference does away if you change the
copy_from_user() to __copy_from_user() when reading the struct msghdr
and iov[] from userspace (user copy hardening is expensive).

The rest is just code path, my gut feeling is that a lot of that
is in import_iovec().

Remember semdmsg() is likely to be called with an iov count of 1.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply

* Re: [PATCH V2] Doc: admin-guide: Add entry for kvm_cma_resv_ratio kernel param
From: Randy Dunlap @ 2020-09-21 15:58 UTC (permalink / raw)
  To: sathnaga, linux-doc
  Cc: Jonathan Corbet, linux-kernel, kvm-ppc, Paul Mackerras,
	linuxppc-dev
In-Reply-To: <20200921090220.14981-1-sathnaga@linux.vnet.ibm.com>

On 9/21/20 2:02 AM, sathnaga@linux.vnet.ibm.com wrote:
> From: Satheesh Rajendran <sathnaga@linux.vnet.ibm.com>
> 
> Add document entry for kvm_cma_resv_ratio kernel param which
> is used to alter the KVM contiguous memory allocation percentage
> for hash pagetable allocation used by hash mode PowerPC KVM guests.
> 
> Cc: linux-kernel@vger.kernel.org
> Cc: kvm-ppc@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Jonathan Corbet <corbet@lwn.net>
> Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
> Signed-off-by: Satheesh Rajendran <sathnaga@linux.vnet.ibm.com>

Hi,

> ---
> 
> V2: 
> Addressed review comments from Randy.
> 
> V1: https://lkml.org/lkml/2020/9/16/72

In reply to V1, I didn't add a Reviewed-by: tag, but I can now.

Reviewed-by: Randy Dunlap <rdunlap@infradead.org>

> ---
>  Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index a1068742a6df..932ed45740c9 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2258,6 +2258,14 @@
>  			[KVM,ARM] Allow use of GICv4 for direct injection of
>  			LPIs.
>  
> +	kvm_cma_resv_ratio=n [PPC]
> +			Reserves given percentage from system memory area for
> +			contiguous memory allocation for KVM hash pagetable
> +			allocation.
> +			By default it reserves 5% of total system memory.
> +			Format: <integer>
> +			Default: 5
> +
>  	kvm-intel.ept=	[KVM,Intel] Disable extended page tables
>  			(virtualized MMU) support on capable Intel chips.
>  			Default is 1 (enabled)
> 

thanks.
-- 
~Randy


^ permalink raw reply

* Re: [PATCH 02/11] mm: call import_iovec() instead of rw_copy_check_uvector() in process_vm_rw()
From: Christoph Hellwig @ 2020-09-21 16:12 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-aio@kvack.org, linux-mips@vger.kernel.org, David Howells,
	linux-mm@kvack.org, keyrings@vger.kernel.org,
	sparclinux@vger.kernel.org, Christoph Hellwig,
	linux-arch@vger.kernel.org, linux-s390@vger.kernel.org,
	linux-scsi@vger.kernel.org, Arnd Bergmann,
	linux-block@vger.kernel.org, io-uring@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, Jens Axboe,
	linux-parisc@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-security-module@vger.kernel.org, David Laight,
	linux-fsdevel@vger.kernel.org, Andrew Morton,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <20200921152937.GX3421308@ZenIV.linux.org.uk>

On Mon, Sep 21, 2020 at 04:29:37PM +0100, Al Viro wrote:
> On Mon, Sep 21, 2020 at 03:21:35PM +0000, David Laight wrote:
> 
> > You really don't want to be looping through the array twice.
> 
> Profiles, please.

Given that the iov array should be cache hot I'd be surprised to
see a huge difference.  

^ permalink raw reply

* Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag
From: Pavel Begunkov @ 2020-09-21 16:10 UTC (permalink / raw)
  To: Andy Lutomirski, Arnd Bergmann
  Cc: linux-aio, open list:MIPS, David Howells, Linux-MM, keyrings,
	sparclinux, Christoph Hellwig, linux-arch, linux-s390,
	Linux SCSI List, X86 ML, linux-block, Al Viro, Andy Lutomirski,
	io-uring, linux-arm-kernel, Jens Axboe, Parisc List,
	Network Development, LKML, LSM List, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <D0791499-1190-4C3F-A984-0A313ECA81C7@amacapital.net>

On 20/09/2020 01:22, Andy Lutomirski wrote:
> 
>> On Sep 19, 2020, at 2:16 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>
>> On Sat, Sep 19, 2020 at 6:21 PM Andy Lutomirski <luto@kernel.org> wrote:
>>>> On Fri, Sep 18, 2020 at 8:16 AM Christoph Hellwig <hch@lst.de> wrote:
>>>> On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
>>>>> Said that, why not provide a variant that would take an explicit
>>>>> "is it compat" argument and use it there?  And have the normal
>>>>> one pass in_compat_syscall() to that...
>>>>
>>>> That would help to not introduce a regression with this series yes.
>>>> But it wouldn't fix existing bugs when io_uring is used to access
>>>> read or write methods that use in_compat_syscall().  One example that
>>>> I recently ran into is drivers/scsi/sg.c.
>>
>> Ah, so reading /dev/input/event* would suffer from the same issue,
>> and that one would in fact be broken by your patch in the hypothetical
>> case that someone tried to use io_uring to read /dev/input/event on x32...
>>
>> For reference, I checked the socket timestamp handling that has a
>> number of corner cases with time32/time64 formats in compat mode,
>> but none of those appear to be affected by the problem.
>>
>>> Aside from the potentially nasty use of per-task variables, one thing
>>> I don't like about PF_FORCE_COMPAT is that it's one-way.  If we're
>>> going to have a generic mechanism for this, shouldn't we allow a full
>>> override of the syscall arch instead of just allowing forcing compat
>>> so that a compat syscall can do a non-compat operation?
>>
>> The only reason it's needed here is that the caller is in a kernel
>> thread rather than a system call. Are there any possible scenarios
>> where one would actually need the opposite?
>>
> 
> I can certainly imagine needing to force x32 mode from a kernel thread.
> 
> As for the other direction: what exactly are the desired bitness/arch semantics of io_uring?  Is the operation bitness chosen by the io_uring creation or by the io_uring_enter() bitness?

It's rather the second one. Even though AFAIR it wasn't discussed
specifically, that how it works now (_partially_).

-- 
Pavel Begunkov

^ permalink raw reply

* Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag
From: Pavel Begunkov @ 2020-09-21 16:13 UTC (permalink / raw)
  To: Andy Lutomirski, Arnd Bergmann
  Cc: linux-aio, open list:MIPS, David Howells, Linux-MM, keyrings,
	sparclinux, Christoph Hellwig, linux-arch, linux-s390,
	Linux SCSI List, X86 ML, linux-block, Al Viro, Andy Lutomirski,
	io-uring, linux-arm-kernel, Jens Axboe, Parisc List,
	Network Development, LKML, LSM List, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <563138b5-7073-74bc-f0c5-b2bad6277e87@gmail.com>

On 21/09/2020 19:10, Pavel Begunkov wrote:
> On 20/09/2020 01:22, Andy Lutomirski wrote:
>>
>>> On Sep 19, 2020, at 2:16 PM, Arnd Bergmann <arnd@arndb.de> wrote:
>>>
>>> On Sat, Sep 19, 2020 at 6:21 PM Andy Lutomirski <luto@kernel.org> wrote:
>>>>> On Fri, Sep 18, 2020 at 8:16 AM Christoph Hellwig <hch@lst.de> wrote:
>>>>> On Fri, Sep 18, 2020 at 02:58:22PM +0100, Al Viro wrote:
>>>>>> Said that, why not provide a variant that would take an explicit
>>>>>> "is it compat" argument and use it there?  And have the normal
>>>>>> one pass in_compat_syscall() to that...
>>>>>
>>>>> That would help to not introduce a regression with this series yes.
>>>>> But it wouldn't fix existing bugs when io_uring is used to access
>>>>> read or write methods that use in_compat_syscall().  One example that
>>>>> I recently ran into is drivers/scsi/sg.c.
>>>
>>> Ah, so reading /dev/input/event* would suffer from the same issue,
>>> and that one would in fact be broken by your patch in the hypothetical
>>> case that someone tried to use io_uring to read /dev/input/event on x32...
>>>
>>> For reference, I checked the socket timestamp handling that has a
>>> number of corner cases with time32/time64 formats in compat mode,
>>> but none of those appear to be affected by the problem.
>>>
>>>> Aside from the potentially nasty use of per-task variables, one thing
>>>> I don't like about PF_FORCE_COMPAT is that it's one-way.  If we're
>>>> going to have a generic mechanism for this, shouldn't we allow a full
>>>> override of the syscall arch instead of just allowing forcing compat
>>>> so that a compat syscall can do a non-compat operation?
>>>
>>> The only reason it's needed here is that the caller is in a kernel
>>> thread rather than a system call. Are there any possible scenarios
>>> where one would actually need the opposite?
>>>
>>
>> I can certainly imagine needing to force x32 mode from a kernel thread.
>>
>> As for the other direction: what exactly are the desired bitness/arch semantics of io_uring?  Is the operation bitness chosen by the io_uring creation or by the io_uring_enter() bitness?
> 
> It's rather the second one. Even though AFAIR it wasn't discussed
> specifically, that how it works now (_partially_).

Double checked -- I'm wrong, that's the former one. Most of it is based
on a flag that was set an creation.

-- 
Pavel Begunkov

^ permalink raw reply

* Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag
From: Pavel Begunkov @ 2020-09-21 16:20 UTC (permalink / raw)
  To: William Kucharski, Matthew Wilcox
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, Christoph Hellwig, linux-arch, linux-s390, linux-scsi,
	x86, Arnd Bergmann, linux-block, Alexander Viro, io-uring,
	linux-arm-kernel, Jens Axboe, linux-parisc, netdev, linux-kernel,
	linux-security-module, linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <76A432F3-4532-42A4-900E-16C0AC2D21D8@gmail.com>

On 20/09/2020 18:55, William Kucharski wrote:
> I really like that as it’s self-documenting and anyone debugging it can see what is actually being used at a glance.

Also creates special cases for things that few people care about,
and makes it a pain for cross-platform (cross-bitness) development.

> 
>> On Sep 20, 2020, at 09:15, Matthew Wilcox <willy@infradead.org> wrote:
>>
>> On Fri, Sep 18, 2020 at 02:45:25PM +0200, Christoph Hellwig wrote:
>>> Add a flag to force processing a syscall as a compat syscall.  This is
>>> required so that in_compat_syscall() works for I/O submitted by io_uring
>>> helper threads on behalf of compat syscalls.
>>
>> Al doesn't like this much, but my suggestion is to introduce two new
>> opcodes -- IORING_OP_READV32 and IORING_OP_WRITEV32.  The compat code
>> can translate IORING_OP_READV to IORING_OP_READV32 and then the core
>> code can know what that user pointer is pointing to.

-- 
Pavel Begunkov

^ permalink raw reply

* Re: [PATCH 02/11] mm: call import_iovec() instead of rw_copy_check_uvector() in process_vm_rw()
From: Al Viro @ 2020-09-21 16:27 UTC (permalink / raw)
  To: David Laight
  Cc: linux-aio@kvack.org, linux-mips@vger.kernel.org, David Howells,
	linux-mm@kvack.org, keyrings@vger.kernel.org,
	sparclinux@vger.kernel.org, Christoph Hellwig,
	linux-arch@vger.kernel.org, linux-s390@vger.kernel.org,
	linux-scsi@vger.kernel.org, Arnd Bergmann,
	linux-block@vger.kernel.org, io-uring@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, Jens Axboe,
	linux-parisc@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-security-module@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, Andrew Morton,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <226e03bf941844eba4d64af31633c177@AcuMS.aculab.com>

On Mon, Sep 21, 2020 at 03:44:00PM +0000, David Laight wrote:
> From: Al Viro
> > Sent: 21 September 2020 16:30
> > 
> > On Mon, Sep 21, 2020 at 03:21:35PM +0000, David Laight wrote:
> > 
> > > You really don't want to be looping through the array twice.
> > 
> > Profiles, please.
> 
> I did some profiling of send() v sendmsg() much earlier in the year.
> I can't remember the exact details but the extra cost of sendmsg()
> is far more than you might expect.
> (I was timing sending fully built IPv4 UDP packets using a raw socket.)
> 
> About half the difference does away if you change the
> copy_from_user() to __copy_from_user() when reading the struct msghdr
> and iov[] from userspace (user copy hardening is expensive).

Wha...?  So the difference is within 4 times the overhead of the
hardening checks done for one call of copy_from_user()?

> The rest is just code path, my gut feeling is that a lot of that
> is in import_iovec().
> 
> Remember semdmsg() is likely to be called with an iov count of 1.

... which makes that loop unlikely to be noticable in the entire
mess, whether you pass it once or twice.  IOW, unless you can show
profiles where that loop is sufficiently hot or if you can show
the timings change from splitting it in two, I'll remain very
sceptical about that assertion.

^ permalink raw reply

* Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag
From: Pavel Begunkov @ 2020-09-21 16:26 UTC (permalink / raw)
  To: Matthew Wilcox, Al Viro
  Cc: linux-aio, linux-mips, David Howells, linux-mm, keyrings,
	sparclinux, Christoph Hellwig, linux-arch, linux-s390, linux-scsi,
	x86, Arnd Bergmann, linux-block, io-uring, linux-arm-kernel,
	Jens Axboe, linux-parisc, netdev, linux-kernel,
	linux-security-module, linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200920192259.GU32101@casper.infradead.org>

On 20/09/2020 22:22, Matthew Wilcox wrote:
> On Sun, Sep 20, 2020 at 08:10:31PM +0100, Al Viro wrote:
>> IMO it's much saner to mark those and refuse to touch them from io_uring...
> 
> Simpler solution is to remove io_uring from the 32-bit syscall list.
> If you're a 32-bit process, you don't get to use io_uring.  Would
> any real users actually care about that?

There were .net and\or wine (which AFAIK often works in compat) guys
experimenting with io_uring, they might want it.

-- 
Pavel Begunkov

^ permalink raw reply

* Re: [patch RFC 00/15] mm/highmem: Provide a preemptible variant of kmap_atomic & friends
From: Linus Torvalds @ 2020-09-21 16:24 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Juri Lelli, Peter Zijlstra, Sebastian Andrzej Siewior,
	Joonas Lahtinen, dri-devel, linux-mips, Ben Segall, Max Filippov,
	Guo Ren, linux-sparc, Vincent Chen, Will Deacon, Ard Biesheuvel,
	linux-arch, Vincent Guittot, Herbert Xu, the arch/x86 maintainers,
	Russell King, linux-csky, David Airlie, Mel Gorman,
	open list:SYNOPSYS ARC ARCHITECTURE, linux-xtensa, Paul McKenney,
	intel-gfx, linuxppc-dev, Steven Rostedt, Jani Nikula,
	Rodrigo Vivi, Dietmar Eggemann, Linux ARM, Chris Zankel,
	Michal Simek, Thomas Bogendoerfer, Nick Hu, Linux-MM,
	Vineet Gupta, LKML, Arnd Bergmann, Daniel Vetter, Paul Mackerras,
	Andrew Morton, Daniel Bristot de Oliveira, David S. Miller,
	Greentime Hu
In-Reply-To: <87a6xjd1dw.fsf@nanos.tec.linutronix.de>

On Mon, Sep 21, 2020 at 12:39 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> If a task is migrated to a different CPU then the mapping address will
> change which will explode in colourful ways.

Heh.

Right you are.

Maybe we really *could* call this new kmap functionality something
like "kmap_percpu()" (or maybe "local" is good enough), and make it
act like your RT code does for spinlocks - not disable preemption, but
only disabling CPU migration.

That would probably be good enough for a lot of users that don't want
to expose excessive latencies, but where it's really not a huge deal
to say "stick to this CPU for a short while".

The crypto code certainly sounds like one such case.

             Linus

^ permalink raw reply

* Re: [PATCH 1/9] kernel: add a PF_FORCE_COMPAT flag
From: Pavel Begunkov @ 2020-09-21 16:31 UTC (permalink / raw)
  To: David Laight, 'Arnd Bergmann', Andy Lutomirski
  Cc: linux-aio, open list:MIPS, David Howells, Linux-MM,
	keyrings@vger.kernel.org, sparclinux, Christoph Hellwig,
	linux-arch, linux-s390, Linux SCSI List, X86 ML, Matthew Wilcox,
	linux-block, Al Viro, io-uring@vger.kernel.org, linux-arm-kernel,
	Jens Axboe, Parisc List, Network Development, LKML, LSM List,
	Linux FS Devel, Andrew Morton, linuxppc-dev
In-Reply-To: <8363d874e503470f8caa201e85e9fbd4@AcuMS.aculab.com>

On 21/09/2020 00:13, David Laight wrote:
> From: Arnd Bergmann
>> Sent: 20 September 2020 21:49
>>
>> On Sun, Sep 20, 2020 at 9:28 PM Andy Lutomirski <luto@kernel.org> wrote:
>>> On Sun, Sep 20, 2020 at 12:23 PM Matthew Wilcox <willy@infradead.org> wrote:
>>>>
>>>> On Sun, Sep 20, 2020 at 08:10:31PM +0100, Al Viro wrote:
>>>>> IMO it's much saner to mark those and refuse to touch them from io_uring...
>>>>
>>>> Simpler solution is to remove io_uring from the 32-bit syscall list.
>>>> If you're a 32-bit process, you don't get to use io_uring.  Would
>>>> any real users actually care about that?
>>>
>>> We could go one step farther and declare that we're done adding *any*
>>> new compat syscalls :)
>>
>> Would you also stop adding system calls to native 32-bit systems then?
>>
>> On memory constrained systems (less than 2GB a.t.m.), there is still a
>> strong demand for running 32-bit user space, but all of the recent Arm
>> cores (after Cortex-A55) dropped the ability to run 32-bit kernels, so
>> that compat mode may eventually become the primary way to run
>> Linux on cheap embedded systems.
>>
>> I don't think there is any chance we can realistically take away io_uring
>> from the 32-bit ABI any more now.
> 
> Can't it just run requests from 32bit apps in a kernel thread that has
> the 'in_compat_syscall' flag set?
> Not that i recall seeing the code where it saves the 'compat' nature
> of any requests.
> 
> It is already completely f*cked if you try to pass the command ring
> to a child process - it uses the wrong 'mm'.

And how so? io_uring uses mm of a submitter. The exception is SQPOLL
mode, but it requires CAP_SYS_ADMIN or CAP_SYS_NICE anyway.

> I suspect there are some really horrid security holes in that area.
> 
> 	David.
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
> 

-- 
Pavel Begunkov

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox