* Re: [PATCH v2 02/12] ocxl: Change type of pasid to unsigned int
From: Fenghua Yu @ 2020-06-18 15:37 UTC (permalink / raw)
To: Frederic Barrat
Cc: Dave Hansen, H Peter Anvin, Dave Jiang, Ashok Raj, Joerg Roedel,
x86, amd-gfx, Ingo Molnar, Ravi V Shankar, Yu-cheng Yu,
Andrew Donnellan, Borislav Petkov, Sohil Mehta, Thomas Gleixner,
Tony Luck, David Woodhouse, Felix Kuehling, linux-kernel, iommu,
Jacob Jun Pan, linuxppc-dev, Lu Baolu
In-Reply-To: <972dc2cb-9643-53af-b11d-ebb56d96053d@linux.ibm.com>
Hi, Frederic,
On Thu, Jun 18, 2020 at 10:05:19AM +0200, Frederic Barrat wrote:
>
>
> Le 13/06/2020 à 02:41, Fenghua Yu a écrit :
> >PASID is defined as "int" although it's a 20-bit value and shouldn't be
> >negative int. To be consistent with type defined in iommu, define PASID
> >as "unsigned int".
>
>
> It looks like this patch was considered because of the use of 'pasid' in
> variable or function names. The ocxl driver only makes sense on powerpc and
> shouldn't compile on anything else, so it's probably useless in the context
> of that series.
> The pasid here is defined by the opencapi specification
> (https://opencapi.org), it is borrowed from the PCI world and you could
> argue it could be an unsigned int. But then I think the patch doesn't go far
> enough. But considering it's not used on x86, I think this patch can be
> dropped.
The first 3 patches clean up pasid and flag definitions to prepare for
the following patches.
If you think this patch can be dropped, we will drop it.
Thanks.
-Fenghua
^ permalink raw reply
* [PATCH 4/6] exec: split prepare_arg_pages
From: Christoph Hellwig @ 2020-06-18 14:46 UTC (permalink / raw)
To: Al Viro
Cc: linux-arch, linux-s390, linux-parisc, Arnd Bergmann, Brian Gerst,
x86, linux-mips, linux-kernel, linux-fsdevel, Luis Chamberlain,
sparclinux, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200618144627.114057-1-hch@lst.de>
Move counting the arguments and environment variables out of
prepare_arg_pages and rename the rest of the function to check_arg_limit.
This prepares for a version of do_execveat that takes kernel pointers.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/exec.c | 26 ++++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index a5d91f8b1341d5..34781db6bf6889 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -435,20 +435,10 @@ static int count_strings(const char __user *const __user *argv)
return i;
}
-static int prepare_arg_pages(struct linux_binprm *bprm,
- const char __user *const __user *argv,
- const char __user *const __user *envp)
+static int check_arg_limit(struct linux_binprm *bprm)
{
unsigned long limit, ptr_size;
- bprm->argc = count_strings(argv);
- if (bprm->argc < 0)
- return bprm->argc;
-
- bprm->envc = count_strings(envp);
- if (bprm->envc < 0)
- return bprm->envc;
-
/*
* Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
* (whichever is smaller) for the argv+env strings.
@@ -1886,7 +1876,19 @@ int do_execveat(int fd, struct filename *filename,
if (retval)
goto out_unmark;
- retval = prepare_arg_pages(bprm, argv, envp);
+ bprm->argc = count_strings(argv);
+ if (bprm->argc < 0) {
+ retval = bprm->argc;
+ goto out;
+ }
+
+ bprm->envc = count_strings(envp);
+ if (bprm->envc < 0) {
+ retval = bprm->envc;
+ goto out;
+ }
+
+ retval = check_arg_limit(bprm);
if (retval < 0)
goto out;
--
2.26.2
^ permalink raw reply related
* [PATCH 6/6] kernel: add a kernel_wait helper
From: Christoph Hellwig @ 2020-06-18 14:46 UTC (permalink / raw)
To: Al Viro
Cc: linux-arch, linux-s390, linux-parisc, Arnd Bergmann, Brian Gerst,
x86, linux-mips, linux-kernel, linux-fsdevel, Luis Chamberlain,
sparclinux, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200618144627.114057-1-hch@lst.de>
Add a helper that waits for a pid and stores the status in the
passed-in kernel pointer. Use it to fix the usage of kernel_wait4 in
call_usermodehelper_exec_sync that only happens to work due to the
implicit set_fs(KERNEL_DS) for kernel threads.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
include/linux/sched/task.h | 1 +
kernel/exit.c | 16 ++++++++++++++++
kernel/umh.c | 29 ++++-------------------------
3 files changed, 21 insertions(+), 25 deletions(-)
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 38359071236ad7..a80007df396e95 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -102,6 +102,7 @@ struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
+int kernel_wait(pid_t pid, int *stat);
extern void free_task(struct task_struct *tsk);
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f2810338..fd598846df0b17 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
return ret;
}
+int kernel_wait(pid_t pid, int *stat)
+{
+ struct wait_opts wo = {
+ .wo_type = PIDTYPE_PID,
+ .wo_pid = find_get_pid(pid),
+ .wo_flags = WEXITED,
+ };
+ int ret;
+
+ ret = do_wait(&wo);
+ if (ret > 0 && wo.wo_stat)
+ *stat = wo.wo_stat;
+ put_pid(wo.wo_pid);
+ return ret;
+}
+
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
diff --git a/kernel/umh.c b/kernel/umh.c
index 1284823dbad338..6fd948e478bec4 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -126,37 +126,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
pid_t pid;
- /* If SIGCLD is ignored kernel_wait4 won't populate the status. */
+ /* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
+ if (pid < 0)
sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- kernel_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
+ else
+ kernel_wait(pid, &sub_info->retval);
/* Restore default kernel sig handler */
kernel_sigaction(SIGCHLD, SIG_IGN);
-
umh_complete(sub_info);
}
--
2.26.2
^ permalink raw reply related
* [PATCH 5/6] exec: add a kernel_execveat helper
From: Christoph Hellwig @ 2020-06-18 14:46 UTC (permalink / raw)
To: Al Viro
Cc: linux-arch, linux-s390, linux-parisc, Arnd Bergmann, Brian Gerst,
x86, linux-mips, linux-kernel, linux-fsdevel, Luis Chamberlain,
sparclinux, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200618144627.114057-1-hch@lst.de>
Add a kernel_execveat helper to execute a binary with kernel space argv
and envp pointers. Switch executing init and user mode helpers to this
new helper instead of relying on the implicit set_fs(KERNEL_DS) for early
init code and kernel threads, and move the getname call into the
do_execve helper.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/exec.c | 109 ++++++++++++++++++++++++++++++++--------
include/linux/binfmts.h | 6 +--
init/main.c | 6 +--
kernel/umh.c | 8 ++-
4 files changed, 95 insertions(+), 34 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 34781db6bf6889..7923b8334ae600 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -435,6 +435,21 @@ static int count_strings(const char __user *const __user *argv)
return i;
}
+static int count_kernel_strings(const char *const *argv)
+{
+ int i;
+
+ if (!argv)
+ return 0;
+
+ for (i = 0; argv[i]; i++) {
+ if (i >= MAX_ARG_STRINGS)
+ return -E2BIG;
+ }
+
+ return i;
+}
+
static int check_arg_limit(struct linux_binprm *bprm)
{
unsigned long limit, ptr_size;
@@ -611,6 +626,19 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
}
EXPORT_SYMBOL(copy_string_kernel);
+static int copy_strings_kernel(int argc, const char *const *argv,
+ struct linux_binprm *bprm)
+{
+ int ret;
+
+ while (argc-- > 0) {
+ ret = copy_string_kernel(argv[argc], bprm);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
#ifdef CONFIG_MMU
/*
@@ -1793,9 +1821,11 @@ static int exec_binprm(struct linux_binprm *bprm)
return 0;
}
-int do_execveat(int fd, struct filename *filename,
+static int __do_execveat(int fd, struct filename *filename,
const char __user *const __user *argv,
const char __user *const __user *envp,
+ const char *const *kernel_argv,
+ const char *const *kernel_envp,
int flags, struct file *file)
{
char *pathbuf = NULL;
@@ -1876,16 +1906,30 @@ int do_execveat(int fd, struct filename *filename,
if (retval)
goto out_unmark;
- bprm->argc = count_strings(argv);
- if (bprm->argc < 0) {
- retval = bprm->argc;
- goto out;
- }
+ if (unlikely(kernel_argv)) {
+ bprm->argc = count_kernel_strings(kernel_argv);
+ if (bprm->argc < 0) {
+ retval = bprm->argc;
+ goto out;
+ }
- bprm->envc = count_strings(envp);
- if (bprm->envc < 0) {
- retval = bprm->envc;
- goto out;
+ bprm->envc = count_kernel_strings(kernel_envp);
+ if (bprm->envc < 0) {
+ retval = bprm->envc;
+ goto out;
+ }
+ } else {
+ bprm->argc = count_strings(argv);
+ if (bprm->argc < 0) {
+ retval = bprm->argc;
+ goto out;
+ }
+
+ bprm->envc = count_strings(envp);
+ if (bprm->envc < 0) {
+ retval = bprm->envc;
+ goto out;
+ }
}
retval = check_arg_limit(bprm);
@@ -1902,13 +1946,22 @@ int do_execveat(int fd, struct filename *filename,
goto out;
bprm->exec = bprm->p;
- retval = copy_strings(bprm->envc, envp, bprm);
- if (retval < 0)
- goto out;
- retval = copy_strings(bprm->argc, argv, bprm);
- if (retval < 0)
- goto out;
+ if (unlikely(kernel_argv)) {
+ retval = copy_strings_kernel(bprm->envc, kernel_envp, bprm);
+ if (retval < 0)
+ goto out;
+ retval = copy_strings_kernel(bprm->argc, kernel_argv, bprm);
+ if (retval < 0)
+ goto out;
+ } else {
+ retval = copy_strings(bprm->envc, envp, bprm);
+ if (retval < 0)
+ goto out;
+ retval = copy_strings(bprm->argc, argv, bprm);
+ if (retval < 0)
+ goto out;
+ }
retval = exec_binprm(bprm);
if (retval < 0)
@@ -1959,6 +2012,23 @@ int do_execveat(int fd, struct filename *filename,
return retval;
}
+static int do_execveat(int fd, const char *filename,
+ const char __user *const __user *argv,
+ const char __user *const __user *envp, int flags)
+{
+ int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+ struct filename *name = getname_flags(filename, lookup_flags, NULL);
+
+ return __do_execveat(fd, name, argv, envp, NULL, NULL, flags, NULL);
+}
+
+int kernel_execveat(int fd, const char *filename, const char *const *argv,
+ const char *const *envp, int flags, struct file *file)
+{
+ return __do_execveat(fd, getname_kernel(filename), NULL, NULL, argv,
+ envp, flags, file);
+}
+
void set_binfmt(struct linux_binfmt *new)
{
struct mm_struct *mm = current->mm;
@@ -1988,7 +2058,7 @@ SYSCALL_DEFINE3(execve,
const char __user *const __user *, argv,
const char __user *const __user *, envp)
{
- return do_execveat(AT_FDCWD, getname(filename), argv, envp, 0, NULL);
+ return do_execveat(AT_FDCWD, filename, argv, envp, 0);
}
SYSCALL_DEFINE5(execveat,
@@ -1997,8 +2067,5 @@ SYSCALL_DEFINE5(execveat,
const char __user *const __user *, envp,
int, flags)
{
- int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
- struct filename *name = getname_flags(filename, lookup_flags, NULL);
-
- return do_execveat(fd, name, argv, envp, flags, NULL);
+ return do_execveat(fd, filename, argv, envp, flags);
}
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index bed702e4b1fbd9..1e61c980c16354 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -134,9 +134,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm);
extern void set_binfmt(struct linux_binfmt *new);
extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t);
-int do_execveat(int fd, struct filename *filename,
- const char __user *const __user *__argv,
- const char __user *const __user *__envp,
- int flags, struct file *file);
+int kernel_execveat(int fd, const char *filename, const char *const *argv,
+ const char *const *envp, int flags, struct file *file);
#endif /* _LINUX_BINFMTS_H */
diff --git a/init/main.c b/init/main.c
index 838950ea7bca22..33de235dc2aa00 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1329,10 +1329,8 @@ static int run_init_process(const char *init_filename)
pr_debug(" with environment:\n");
for (p = envp_init; *p; p++)
pr_debug(" %s\n", *p);
- return do_execveat(AT_FDCWD, getname_kernel(init_filename),
- (const char __user *const __user *)argv_init,
- (const char __user *const __user *)envp_init,
- 0, NULL);
+ return kernel_execveat(AT_FDCWD, init_filename, argv_init, envp_init, 0,
+ NULL);
}
static int try_to_run_init_process(const char *init_filename)
diff --git a/kernel/umh.c b/kernel/umh.c
index 7aa9a5817582ca..1284823dbad338 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -103,11 +103,9 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
sub_info->pid = task_pid_nr(current);
- retval = do_execveat(AT_FDCWD,
- sub_info->path ? getname_kernel(sub_info->path) : NULL,
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp,
- 0, sub_info->file);
+ retval = kernel_execveat(AT_FDCWD, sub_info->path,
+ (const char *const *)sub_info->argv,
+ (const char *const *)sub_info->envp, 0, sub_info->file);
if (sub_info->file && !retval)
current->flags |= PF_UMH;
out:
--
2.26.2
^ permalink raw reply related
* [PATCH 3/6] exec: cleanup the count() function
From: Christoph Hellwig @ 2020-06-18 14:46 UTC (permalink / raw)
To: Al Viro
Cc: linux-arch, linux-s390, linux-parisc, Arnd Bergmann, Brian Gerst,
x86, linux-mips, linux-kernel, linux-fsdevel, Luis Chamberlain,
sparclinux, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200618144627.114057-1-hch@lst.de>
Remove the max argument as it is hard wired to MAX_ARG_STRINGS, and
give the function a slightly less generic name.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/exec.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 4e5db0e35797a5..a5d91f8b1341d5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -407,9 +407,9 @@ get_user_arg_ptr(const char __user *const __user *argv, int nr)
}
/*
- * count() counts the number of strings in array ARGV.
+ * count_strings() counts the number of strings in array ARGV.
*/
-static int count(const char __user *const __user *argv, int max)
+static int count_strings(const char __user *const __user *argv)
{
int i = 0;
@@ -423,7 +423,7 @@ static int count(const char __user *const __user *argv, int max)
if (IS_ERR(p))
return -EFAULT;
- if (i >= max)
+ if (i >= MAX_ARG_STRINGS)
return -E2BIG;
++i;
@@ -441,11 +441,11 @@ static int prepare_arg_pages(struct linux_binprm *bprm,
{
unsigned long limit, ptr_size;
- bprm->argc = count(argv, MAX_ARG_STRINGS);
+ bprm->argc = count_strings(argv);
if (bprm->argc < 0)
return bprm->argc;
- bprm->envc = count(envp, MAX_ARG_STRINGS);
+ bprm->envc = count_strings(envp);
if (bprm->envc < 0)
return bprm->envc;
--
2.26.2
^ permalink raw reply related
* [PATCH 2/6] exec: simplify the compat syscall handling
From: Christoph Hellwig @ 2020-06-18 14:46 UTC (permalink / raw)
To: Al Viro
Cc: linux-arch, linux-s390, linux-parisc, Arnd Bergmann, Brian Gerst,
x86, linux-mips, linux-kernel, linux-fsdevel, Luis Chamberlain,
sparclinux, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200618144627.114057-1-hch@lst.de>
The only difference between the compat exec* syscalls and their
native versions is the compat_ptr sign extension, and the fact that
the pointer arithmetic for the two-dimensional arrays needs to use
the compat pointer size. Instead of the compat wrappers and the
struct user_arg_ptr machinery just use in_compat_syscall() to do the
right thing for the compat case deep inside get_user_arg_ptr().
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
arch/arm64/include/asm/unistd32.h | 4 +-
arch/mips/kernel/syscalls/syscall_n32.tbl | 4 +-
arch/mips/kernel/syscalls/syscall_o32.tbl | 4 +-
arch/parisc/kernel/syscalls/syscall.tbl | 4 +-
arch/powerpc/kernel/syscalls/syscall.tbl | 4 +-
arch/s390/kernel/syscalls/syscall.tbl | 4 +-
arch/sparc/kernel/syscalls.S | 4 +-
arch/x86/entry/syscall_x32.c | 7 ++
arch/x86/entry/syscalls/syscall_32.tbl | 4 +-
arch/x86/entry/syscalls/syscall_64.tbl | 4 +-
fs/exec.c | 103 ++++--------------
include/linux/compat.h | 7 --
include/uapi/asm-generic/unistd.h | 4 +-
tools/include/uapi/asm-generic/unistd.h | 4 +-
.../arch/powerpc/entry/syscalls/syscall.tbl | 4 +-
.../perf/arch/s390/entry/syscalls/syscall.tbl | 4 +-
.../arch/x86/entry/syscalls/syscall_64.tbl | 4 +-
17 files changed, 56 insertions(+), 117 deletions(-)
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 6d95d0c8bf2f47..141f5d2ff1c34f 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -33,7 +33,7 @@ __SYSCALL(__NR_link, sys_link)
#define __NR_unlink 10
__SYSCALL(__NR_unlink, sys_unlink)
#define __NR_execve 11
-__SYSCALL(__NR_execve, compat_sys_execve)
+__SYSCALL(__NR_execve, sys_execve)
#define __NR_chdir 12
__SYSCALL(__NR_chdir, sys_chdir)
/* 13 was sys_time */
@@ -785,7 +785,7 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
#define __NR_bpf 386
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 387
-__SYSCALL(__NR_execveat, compat_sys_execveat)
+__SYSCALL(__NR_execveat, sys_execveat)
#define __NR_userfaultfd 388
__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
#define __NR_membarrier 389
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index f777141f52568f..e861b5ab7179c9 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -64,7 +64,7 @@
54 n32 getsockopt compat_sys_getsockopt
55 n32 clone __sys_clone
56 n32 fork __sys_fork
-57 n32 execve compat_sys_execve
+57 n32 execve sys_execve
58 n32 exit sys_exit
59 n32 wait4 compat_sys_wait4
60 n32 kill sys_kill
@@ -328,7 +328,7 @@
317 n32 getrandom sys_getrandom
318 n32 memfd_create sys_memfd_create
319 n32 bpf sys_bpf
-320 n32 execveat compat_sys_execveat
+320 n32 execveat sys_execveat
321 n32 userfaultfd sys_userfaultfd
322 n32 membarrier sys_membarrier
323 n32 mlock2 sys_mlock2
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 13280625d312e9..bba80f74e9968e 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -18,7 +18,7 @@
8 o32 creat sys_creat
9 o32 link sys_link
10 o32 unlink sys_unlink
-11 o32 execve sys_execve compat_sys_execve
+11 o32 execve sys_execve
12 o32 chdir sys_chdir
13 o32 time sys_time32
14 o32 mknod sys_mknod
@@ -367,7 +367,7 @@
353 o32 getrandom sys_getrandom
354 o32 memfd_create sys_memfd_create
355 o32 bpf sys_bpf
-356 o32 execveat sys_execveat compat_sys_execveat
+356 o32 execveat sys_execveat
357 o32 userfaultfd sys_userfaultfd
358 o32 membarrier sys_membarrier
359 o32 mlock2 sys_mlock2
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 5a758fa6ec5242..23fa0d0edf3384 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -18,7 +18,7 @@
8 common creat sys_creat
9 common link sys_link
10 common unlink sys_unlink
-11 common execve sys_execve compat_sys_execve
+11 common execve sys_execve
12 common chdir sys_chdir
13 32 time sys_time32
13 64 time sys_time
@@ -385,7 +385,7 @@
339 common getrandom sys_getrandom
340 common memfd_create sys_memfd_create
341 common bpf sys_bpf
-342 common execveat sys_execveat compat_sys_execveat
+342 common execveat sys_execveat
343 common membarrier sys_membarrier
344 common userfaultfd sys_userfaultfd
345 common mlock2 sys_mlock2
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index f833a319082247..c52cdab89dc0ae 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -20,7 +20,7 @@
8 common creat sys_creat
9 common link sys_link
10 common unlink sys_unlink
-11 nospu execve sys_execve compat_sys_execve
+11 nospu execve sys_execve
12 common chdir sys_chdir
13 32 time sys_time32
13 64 time sys_time
@@ -460,7 +460,7 @@
359 common getrandom sys_getrandom
360 common memfd_create sys_memfd_create
361 common bpf sys_bpf
-362 nospu execveat sys_execveat compat_sys_execveat
+362 nospu execveat sys_execveat
363 32 switch_endian sys_ni_syscall
363 64 switch_endian sys_switch_endian
363 spu switch_endian sys_ni_syscall
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index bfdcb763395735..bd2275db2026ea 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -18,7 +18,7 @@
8 common creat sys_creat sys_creat
9 common link sys_link sys_link
10 common unlink sys_unlink sys_unlink
-11 common execve sys_execve compat_sys_execve
+11 common execve sys_execve sys_execve
12 common chdir sys_chdir sys_chdir
13 32 time - sys_time32
14 common mknod sys_mknod sys_mknod
@@ -361,7 +361,7 @@
351 common bpf sys_bpf sys_bpf
352 common s390_pci_mmio_write sys_s390_pci_mmio_write sys_s390_pci_mmio_write
353 common s390_pci_mmio_read sys_s390_pci_mmio_read sys_s390_pci_mmio_read
-354 common execveat sys_execveat compat_sys_execveat
+354 common execveat sys_execveat sys_execveat
355 common userfaultfd sys_userfaultfd sys_userfaultfd
356 common membarrier sys_membarrier sys_membarrier
357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32
diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S
index db42b4fb370844..70463972152a92 100644
--- a/arch/sparc/kernel/syscalls.S
+++ b/arch/sparc/kernel/syscalls.S
@@ -16,12 +16,12 @@ sys64_execveat:
sunos_execv:
mov %g0, %o2
sys32_execve:
- set compat_sys_execve, %g1
+ set sys_execve, %g1
jmpl %g1, %g0
flushw
sys32_execveat:
- set compat_sys_execveat, %g1
+ set sys_execveat, %g1
jmpl %g1, %g0
flushw
#endif
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 3d8d70d3896c87..c9f39900a93b96 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -8,6 +8,13 @@
#include <asm/unistd.h>
#include <asm/syscall.h>
+/*
+ * Reuse the 64-bit entry points for the x32 versions that occupy different
+ * slots in the syscall table.
+ */
+#define __x32_sys_execve __x64_sys_execve
+#define __x32_sys_execveat __x64_sys_execveat
+
#define __SYSCALL_64(nr, sym)
#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *);
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index d8f8a1a69ed11f..2b1eccd3f8f697 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -22,7 +22,7 @@
8 i386 creat sys_creat
9 i386 link sys_link
10 i386 unlink sys_unlink
-11 i386 execve sys_execve compat_sys_execve
+11 i386 execve sys_execve
12 i386 chdir sys_chdir
13 i386 time sys_time32
14 i386 mknod sys_mknod
@@ -369,7 +369,7 @@
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
-358 i386 execveat sys_execveat compat_sys_execveat
+358 i386 execveat sys_execveat
359 i386 socket sys_socket
360 i386 socketpair sys_socketpair
361 i386 bind sys_bind
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 78847b32e1370f..cb3fce6ed63ebf 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,7 +375,7 @@
517 x32 recvfrom compat_sys_recvfrom
518 x32 sendmsg compat_sys_sendmsg
519 x32 recvmsg compat_sys_recvmsg
-520 x32 execve compat_sys_execve
+520 x32 execve sys_execve
521 x32 ptrace compat_sys_ptrace
522 x32 rt_sigpending compat_sys_rt_sigpending
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait_time64
@@ -400,6 +400,6 @@
542 x32 getsockopt compat_sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
-545 x32 execveat compat_sys_execveat
+545 x32 execveat sys_execveat
546 x32 preadv2 compat_sys_preadv64v2
547 x32 pwritev2 compat_sys_pwritev64v2
diff --git a/fs/exec.c b/fs/exec.c
index 354fdaa536ae7d..4e5db0e35797a5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -386,47 +386,34 @@ static int bprm_mm_init(struct linux_binprm *bprm)
return err;
}
-struct user_arg_ptr {
-#ifdef CONFIG_COMPAT
- bool is_compat;
-#endif
- union {
- const char __user *const __user *native;
-#ifdef CONFIG_COMPAT
- const compat_uptr_t __user *compat;
-#endif
- } ptr;
-};
-
-static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
+static const char __user *
+get_user_arg_ptr(const char __user *const __user *argv, int nr)
{
- const char __user *native;
-
-#ifdef CONFIG_COMPAT
- if (unlikely(argv.is_compat)) {
+ if (in_compat_syscall()) {
+ const compat_uptr_t __user *compat_argv =
+ compat_ptr((unsigned long)argv);
compat_uptr_t compat;
- if (get_user(compat, argv.ptr.compat + nr))
+ if (get_user(compat, compat_argv + nr))
return ERR_PTR(-EFAULT);
-
return compat_ptr(compat);
- }
-#endif
-
- if (get_user(native, argv.ptr.native + nr))
- return ERR_PTR(-EFAULT);
+ } else {
+ const char __user *native;
- return native;
+ if (get_user(native, argv + nr))
+ return ERR_PTR(-EFAULT);
+ return native;
+ }
}
/*
* count() counts the number of strings in array ARGV.
*/
-static int count(struct user_arg_ptr argv, int max)
+static int count(const char __user *const __user *argv, int max)
{
int i = 0;
- if (argv.ptr.native != NULL) {
+ if (argv) {
for (;;) {
const char __user *p = get_user_arg_ptr(argv, i);
@@ -449,7 +436,8 @@ static int count(struct user_arg_ptr argv, int max)
}
static int prepare_arg_pages(struct linux_binprm *bprm,
- struct user_arg_ptr argv, struct user_arg_ptr envp)
+ const char __user *const __user *argv,
+ const char __user *const __user *envp)
{
unsigned long limit, ptr_size;
@@ -497,7 +485,7 @@ static int prepare_arg_pages(struct linux_binprm *bprm,
* processes's memory to the new process's stack. The call to get_user_pages()
* ensures the destination page is created and not swapped out.
*/
-static int copy_strings(int argc, struct user_arg_ptr argv,
+static int copy_strings(int argc, const char __user *const __user *argv,
struct linux_binprm *bprm)
{
struct page *kmapped_page = NULL;
@@ -1815,10 +1803,10 @@ static int exec_binprm(struct linux_binprm *bprm)
return 0;
}
-static int __do_execveat(int fd, struct filename *filename,
- struct user_arg_ptr argv,
- struct user_arg_ptr envp,
- int flags, struct file *file)
+int do_execveat(int fd, struct filename *filename,
+ const char __user *const __user *argv,
+ const char __user *const __user *envp,
+ int flags, struct file *file)
{
char *pathbuf = NULL;
struct linux_binprm *bprm;
@@ -1969,17 +1957,6 @@ static int __do_execveat(int fd, struct filename *filename,
return retval;
}
-int do_execveat(int fd, struct filename *filename,
- const char __user *const __user *__argv,
- const char __user *const __user *__envp,
- int flags, struct file *file)
-{
- struct user_arg_ptr argv = { .ptr.native = __argv };
- struct user_arg_ptr envp = { .ptr.native = __envp };
-
- return __do_execveat(fd, filename, argv, envp, flags, file);
-}
-
void set_binfmt(struct linux_binfmt *new)
{
struct mm_struct *mm = current->mm;
@@ -2023,41 +2000,3 @@ SYSCALL_DEFINE5(execveat,
return do_execveat(fd, name, argv, envp, flags, NULL);
}
-
-#ifdef CONFIG_COMPAT
-static int do_compat_execve(int fd, struct filename *filename,
- const compat_uptr_t __user *__argv,
- const compat_uptr_t __user *__envp,
- int flags)
-{
- struct user_arg_ptr argv = {
- .is_compat = true,
- .ptr.compat = __argv,
- };
- struct user_arg_ptr envp = {
- .is_compat = true,
- .ptr.compat = __envp,
- };
-
- return __do_execveat(fd, filename, argv, envp, flags, NULL);
-}
-
-COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
- const compat_uptr_t __user *, argv,
- const compat_uptr_t __user *, envp)
-{
- return do_compat_execve(AT_FDCWD, getname(filename), argv, envp, 0);
-}
-
-COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
- const char __user *, filename,
- const compat_uptr_t __user *, argv,
- const compat_uptr_t __user *, envp,
- int, flags)
-{
- int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
- struct filename *name = getname_flags(filename, lookup_flags, NULL);
-
- return do_compat_execve(fd, name, argv, envp, flags);
-}
-#endif
diff --git a/include/linux/compat.h b/include/linux/compat.h
index e90100c0de72e4..5e8f6a588e0d43 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -752,10 +752,6 @@ asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg,
asmlinkage long compat_sys_keyctl(u32 option,
u32 arg2, u32 arg3, u32 arg4, u32 arg5);
-/* arch/example/kernel/sys_example.c */
-asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
- const compat_uptr_t __user *envp);
-
/* mm/fadvise.c: No generic prototype for fadvise64_64 */
/* mm/, CONFIG_MMU only */
@@ -806,9 +802,6 @@ asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid,
const struct compat_iovec __user *lvec,
compat_ulong_t liovcnt, const struct compat_iovec __user *rvec,
compat_ulong_t riovcnt, compat_ulong_t flags);
-asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
- const compat_uptr_t __user *argv,
- const compat_uptr_t __user *envp, int flags);
asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd,
const struct compat_iovec __user *vec,
compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f4a01305d9a65c..c639d04a094b8b 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -640,7 +640,7 @@ __SC_COMP(__NR_keyctl, sys_keyctl, compat_sys_keyctl)
#define __NR_clone 220
__SYSCALL(__NR_clone, sys_clone)
#define __NR_execve 221
-__SC_COMP(__NR_execve, sys_execve, compat_sys_execve)
+__SYSCALL(__NR_execve, sys_execve)
#define __NR3264_mmap 222
__SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap)
@@ -751,7 +751,7 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
#define __NR_bpf 280
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 281
-__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+__SYSCALL(__NR_execveat, sys_execveat)
#define __NR_userfaultfd 282
__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
#define __NR_membarrier 283
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index 3a3201e4618ef8..a6dbc8af8bd577 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -640,7 +640,7 @@ __SC_COMP(__NR_keyctl, sys_keyctl, compat_sys_keyctl)
#define __NR_clone 220
__SYSCALL(__NR_clone, sys_clone)
#define __NR_execve 221
-__SC_COMP(__NR_execve, sys_execve, compat_sys_execve)
+__SYSCALL(__NR_execve, sys_execve)
#define __NR3264_mmap 222
__SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap)
@@ -751,7 +751,7 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
#define __NR_bpf 280
__SYSCALL(__NR_bpf, sys_bpf)
#define __NR_execveat 281
-__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+__SYSCALL(__NR_execveat, sys_execveat)
#define __NR_userfaultfd 282
__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
#define __NR_membarrier 283
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 35b61bfc1b1ae9..42bf8b461a0ed6 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -18,7 +18,7 @@
8 common creat sys_creat
9 common link sys_link
10 common unlink sys_unlink
-11 nospu execve sys_execve compat_sys_execve
+11 nospu execve sys_execve sys_execve
12 common chdir sys_chdir
13 32 time sys_time32
13 64 time sys_time
@@ -454,7 +454,7 @@
359 common getrandom sys_getrandom
360 common memfd_create sys_memfd_create
361 common bpf sys_bpf
-362 nospu execveat sys_execveat compat_sys_execveat
+362 nospu execveat sys_execveat sys_execveat
363 32 switch_endian sys_ni_syscall
363 64 switch_endian ppc_switch_endian
363 spu switch_endian sys_ni_syscall
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index b38d48464368dc..f3c16f2d9746ac 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -18,7 +18,7 @@
8 common creat sys_creat compat_sys_creat
9 common link sys_link compat_sys_link
10 common unlink sys_unlink compat_sys_unlink
-11 common execve sys_execve compat_sys_execve
+11 common execve sys_execve sys_execve
12 common chdir sys_chdir compat_sys_chdir
13 32 time - compat_sys_time
14 common mknod sys_mknod compat_sys_mknod
@@ -361,7 +361,7 @@
351 common bpf sys_bpf compat_sys_bpf
352 common s390_pci_mmio_write sys_s390_pci_mmio_write compat_sys_s390_pci_mmio_write
353 common s390_pci_mmio_read sys_s390_pci_mmio_read compat_sys_s390_pci_mmio_read
-354 common execveat sys_execveat compat_sys_execveat
+354 common execveat sys_execveat sys_execveat
355 common userfaultfd sys_userfaultfd sys_userfaultfd
356 common membarrier sys_membarrier sys_membarrier
357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index 37b844f839bc4f..8b88868e622d32 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -374,7 +374,7 @@
517 x32 recvfrom compat_sys_recvfrom
518 x32 sendmsg compat_sys_sendmsg
519 x32 recvmsg compat_sys_recvmsg
-520 x32 execve compat_sys_execve
+520 x32 execve sys_execve
521 x32 ptrace compat_sys_ptrace
522 x32 rt_sigpending compat_sys_rt_sigpending
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait_time64
@@ -399,6 +399,6 @@
542 x32 getsockopt compat_sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
-545 x32 execveat compat_sys_execveat
+545 x32 execveat sys_execveat
546 x32 preadv2 compat_sys_preadv64v2
547 x32 pwritev2 compat_sys_pwritev64v2
--
2.26.2
^ permalink raw reply related
* [PATCH 1/6] exec: cleanup the execve wrappers
From: Christoph Hellwig @ 2020-06-18 14:46 UTC (permalink / raw)
To: Al Viro
Cc: linux-arch, linux-s390, linux-parisc, Arnd Bergmann, Brian Gerst,
x86, linux-mips, linux-kernel, linux-fsdevel, Luis Chamberlain,
sparclinux, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200618144627.114057-1-hch@lst.de>
Remove a whole bunch of wrappers that eventually all call
__do_execve_file, and consolidate the execve helpers to:
(1) __do_execveat, which is the lowest level helper implementing the
actual functionality
(2) do_execveat, which is used by all callers that want native
pointers
(3) do_compat_execve, which is used by all compat syscalls
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/exec.c | 98 +++++++++++------------------------------
include/linux/binfmts.h | 12 ++---
init/main.c | 7 +--
kernel/umh.c | 16 +++----
4 files changed, 41 insertions(+), 92 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a7032784..354fdaa536ae7d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1815,10 +1815,7 @@ static int exec_binprm(struct linux_binprm *bprm)
return 0;
}
-/*
- * sys_execve() executes a new program.
- */
-static int __do_execve_file(int fd, struct filename *filename,
+static int __do_execveat(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags, struct file *file)
@@ -1972,74 +1969,16 @@ static int __do_execve_file(int fd, struct filename *filename,
return retval;
}
-static int do_execveat_common(int fd, struct filename *filename,
- struct user_arg_ptr argv,
- struct user_arg_ptr envp,
- int flags)
-{
- return __do_execve_file(fd, filename, argv, envp, flags, NULL);
-}
-
-int do_execve_file(struct file *file, void *__argv, void *__envp)
-{
- struct user_arg_ptr argv = { .ptr.native = __argv };
- struct user_arg_ptr envp = { .ptr.native = __envp };
-
- return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
-}
-
-int do_execve(struct filename *filename,
- const char __user *const __user *__argv,
- const char __user *const __user *__envp)
-{
- struct user_arg_ptr argv = { .ptr.native = __argv };
- struct user_arg_ptr envp = { .ptr.native = __envp };
- return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
-}
-
int do_execveat(int fd, struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp,
- int flags)
+ int flags, struct file *file)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
- return do_execveat_common(fd, filename, argv, envp, flags);
-}
-
-#ifdef CONFIG_COMPAT
-static int compat_do_execve(struct filename *filename,
- const compat_uptr_t __user *__argv,
- const compat_uptr_t __user *__envp)
-{
- struct user_arg_ptr argv = {
- .is_compat = true,
- .ptr.compat = __argv,
- };
- struct user_arg_ptr envp = {
- .is_compat = true,
- .ptr.compat = __envp,
- };
- return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
-}
-
-static int compat_do_execveat(int fd, struct filename *filename,
- const compat_uptr_t __user *__argv,
- const compat_uptr_t __user *__envp,
- int flags)
-{
- struct user_arg_ptr argv = {
- .is_compat = true,
- .ptr.compat = __argv,
- };
- struct user_arg_ptr envp = {
- .is_compat = true,
- .ptr.compat = __envp,
- };
- return do_execveat_common(fd, filename, argv, envp, flags);
+ return __do_execveat(fd, filename, argv, envp, flags, file);
}
-#endif
void set_binfmt(struct linux_binfmt *new)
{
@@ -2070,7 +2009,7 @@ SYSCALL_DEFINE3(execve,
const char __user *const __user *, argv,
const char __user *const __user *, envp)
{
- return do_execve(getname(filename), argv, envp);
+ return do_execveat(AT_FDCWD, getname(filename), argv, envp, 0, NULL);
}
SYSCALL_DEFINE5(execveat,
@@ -2080,18 +2019,34 @@ SYSCALL_DEFINE5(execveat,
int, flags)
{
int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+ struct filename *name = getname_flags(filename, lookup_flags, NULL);
- return do_execveat(fd,
- getname_flags(filename, lookup_flags, NULL),
- argv, envp, flags);
+ return do_execveat(fd, name, argv, envp, flags, NULL);
}
#ifdef CONFIG_COMPAT
+static int do_compat_execve(int fd, struct filename *filename,
+ const compat_uptr_t __user *__argv,
+ const compat_uptr_t __user *__envp,
+ int flags)
+{
+ struct user_arg_ptr argv = {
+ .is_compat = true,
+ .ptr.compat = __argv,
+ };
+ struct user_arg_ptr envp = {
+ .is_compat = true,
+ .ptr.compat = __envp,
+ };
+
+ return __do_execveat(fd, filename, argv, envp, flags, NULL);
+}
+
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
const compat_uptr_t __user *, argv,
const compat_uptr_t __user *, envp)
{
- return compat_do_execve(getname(filename), argv, envp);
+ return do_compat_execve(AT_FDCWD, getname(filename), argv, envp, 0);
}
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
@@ -2101,9 +2056,8 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
int, flags)
{
int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+ struct filename *name = getname_flags(filename, lookup_flags, NULL);
- return compat_do_execveat(fd,
- getname_flags(filename, lookup_flags, NULL),
- argv, envp, flags);
+ return do_compat_execve(fd, name, argv, envp, flags);
}
#endif
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 4a20b7517dd036..bed702e4b1fbd9 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -134,13 +134,9 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm);
extern void set_binfmt(struct linux_binfmt *new);
extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t);
-extern int do_execve(struct filename *,
- const char __user * const __user *,
- const char __user * const __user *);
-extern int do_execveat(int, struct filename *,
- const char __user * const __user *,
- const char __user * const __user *,
- int);
-int do_execve_file(struct file *file, void *__argv, void *__envp);
+int do_execveat(int fd, struct filename *filename,
+ const char __user *const __user *__argv,
+ const char __user *const __user *__envp,
+ int flags, struct file *file);
#endif /* _LINUX_BINFMTS_H */
diff --git a/init/main.c b/init/main.c
index 0ead83e86b5aa2..838950ea7bca22 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1329,9 +1329,10 @@ static int run_init_process(const char *init_filename)
pr_debug(" with environment:\n");
for (p = envp_init; *p; p++)
pr_debug(" %s\n", *p);
- return do_execve(getname_kernel(init_filename),
- (const char __user *const __user *)argv_init,
- (const char __user *const __user *)envp_init);
+ return do_execveat(AT_FDCWD, getname_kernel(init_filename),
+ (const char __user *const __user *)argv_init,
+ (const char __user *const __user *)envp_init,
+ 0, NULL);
}
static int try_to_run_init_process(const char *init_filename)
diff --git a/kernel/umh.c b/kernel/umh.c
index 79f139a7ca03c6..7aa9a5817582ca 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -103,15 +103,13 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
sub_info->pid = task_pid_nr(current);
- if (sub_info->file) {
- retval = do_execve_file(sub_info->file,
- sub_info->argv, sub_info->envp);
- if (!retval)
- current->flags |= PF_UMH;
- } else
- retval = do_execve(getname_kernel(sub_info->path),
- (const char __user *const __user *)sub_info->argv,
- (const char __user *const __user *)sub_info->envp);
+ retval = do_execveat(AT_FDCWD,
+ sub_info->path ? getname_kernel(sub_info->path) : NULL,
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp,
+ 0, sub_info->file);
+ if (sub_info->file && !retval)
+ current->flags |= PF_UMH;
out:
sub_info->retval = retval;
/*
--
2.26.2
^ permalink raw reply related
* properly support exec and wait with kernel pointers v2
From: Christoph Hellwig @ 2020-06-18 14:46 UTC (permalink / raw)
To: Al Viro
Cc: linux-arch, linux-s390, linux-parisc, Arnd Bergmann, Brian Gerst,
x86, linux-mips, linux-kernel, linux-fsdevel, Luis Chamberlain,
sparclinux, linuxppc-dev, linux-arm-kernel
Hi all,
this series first cleans up the exec code and then adds proper
kernel_execveat and kernel_wait callers instead of relying on the fact
that the early init code and kernel threads implicitly run with
the address limit set to KERNEL_DS.
Note that the cleanup removes the compat execve(at) handlers entirely, as
we can handle the compat difference very nicely in a unified codebase.
x32 needs two hacky #defines for that for now, although those can go
away if the x32 syscall rework from Brian gets merged.
Changes since v1:
- remove a pointless ifdef from get_user_arg_ptr
- remove the need for a compat syscall handler for x32
Diffstat:
arch/arm64/include/asm/unistd32.h | 4
arch/mips/kernel/syscalls/syscall_n32.tbl | 4
arch/mips/kernel/syscalls/syscall_o32.tbl | 4
arch/parisc/kernel/syscalls/syscall.tbl | 4
arch/powerpc/kernel/syscalls/syscall.tbl | 4
arch/s390/kernel/syscalls/syscall.tbl | 4
arch/sparc/kernel/syscalls.S | 4
arch/x86/entry/syscall_x32.c | 7
arch/x86/entry/syscalls/syscall_32.tbl | 4
arch/x86/entry/syscalls/syscall_64.tbl | 4
fs/exec.c | 248 ++++++++-------------
include/linux/binfmts.h | 10
include/linux/compat.h | 7
include/linux/sched/task.h | 1
include/uapi/asm-generic/unistd.h | 4
init/main.c | 5
kernel/exit.c | 16 +
kernel/umh.c | 43 ---
tools/include/uapi/asm-generic/unistd.h | 4
tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 4
tools/perf/arch/s390/entry/syscalls/syscall.tbl | 4
tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 4
22 files changed, 170 insertions(+), 223 deletions(-)
^ permalink raw reply
* [PATCH] mm/debug_vm_pgtable: Fix build failure with powerpc 8xx
From: Christophe Leroy @ 2020-06-18 14:31 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Will Deacon, Andrew Morton, Peter Zijlstra (Intel),
Anshuman Khandual
Cc: linux-mm, linuxppc-dev, linux-kernel
Since commit 9e343b467c70 ("READ_ONCE: Enforce atomicity for
{READ,WRITE}_ONCE() memory accesses"), READ_ONCE() cannot be used
anymore to read complex page table entries. This leads to:
CC mm/debug_vm_pgtable.o
In file included from ./include/asm-generic/bug.h:5,
from ./arch/powerpc/include/asm/bug.h:109,
from ./include/linux/bug.h:5,
from ./include/linux/mmdebug.h:5,
from ./include/linux/gfp.h:5,
from mm/debug_vm_pgtable.c:13:
In function 'pte_clear_tests',
inlined from 'debug_vm_pgtable' at mm/debug_vm_pgtable.c:363:2:
./include/linux/compiler.h:392:38: error: call to '__compiletime_assert_210' declared with attribute error: Unsupported access size for {READ,WRITE}_ONCE().
392 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
| ^
./include/linux/compiler.h:373:4: note: in definition of macro '__compiletime_assert'
373 | prefix ## suffix(); \
| ^~~~~~
./include/linux/compiler.h:392:2: note: in expansion of macro '_compiletime_assert'
392 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
| ^~~~~~~~~~~~~~~~~~~
./include/linux/compiler.h:405:2: note: in expansion of macro 'compiletime_assert'
405 | compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
| ^~~~~~~~~~~~~~~~~~
./include/linux/compiler.h:291:2: note: in expansion of macro 'compiletime_assert_rwonce_type'
291 | compiletime_assert_rwonce_type(x); \
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mm/debug_vm_pgtable.c:249:14: note: in expansion of macro 'READ_ONCE'
249 | pte_t pte = READ_ONCE(*ptep);
| ^~~~~~~~~
make[2]: *** [mm/debug_vm_pgtable.o] Error 1
Fix it by using the recently added ptep_get() helper.
Fixes: 9e343b467c70 ("READ_ONCE: Enforce atomicity for {READ,WRITE}_ONCE() memory accesses")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
mm/debug_vm_pgtable.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index e45623016aea..61ab16fb2e36 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -246,13 +246,13 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
unsigned long vaddr)
{
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = ptep_get(ptep);
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
set_pte_at(mm, vaddr, ptep, pte);
barrier();
pte_clear(mm, vaddr, ptep);
- pte = READ_ONCE(*ptep);
+ pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
}
--
2.25.0
^ permalink raw reply related
* Re: [PATCH 3/3] powerpc/8xx: Provide ptep_get() with 16k pages
From: Christophe Leroy @ 2020-06-18 14:21 UTC (permalink / raw)
To: Michael Ellerman, Peter Zijlstra
Cc: Will Deacon, linux-kernel, linux-mm, Paul Mackerras,
Andrew Morton, linuxppc-dev
In-Reply-To: <87pn9xchql.fsf@mpe.ellerman.id.au>
Le 18/06/2020 à 02:58, Michael Ellerman a écrit :
> Peter Zijlstra <peterz@infradead.org> writes:
>> On Thu, Jun 18, 2020 at 12:21:22AM +1000, Michael Ellerman wrote:
>>> Peter Zijlstra <peterz@infradead.org> writes:
>>>> On Mon, Jun 15, 2020 at 12:57:59PM +0000, Christophe Leroy wrote:
>>
>>>>> +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
>>>>> +#define __HAVE_ARCH_PTEP_GET
>>>>> +static inline pte_t ptep_get(pte_t *ptep)
>>>>> +{
>>>>> + pte_t pte = {READ_ONCE(ptep->pte), 0, 0, 0};
>>>>> +
>>>>> + return pte;
>>>>> +}
>>>>> +#endif
>>>>
>>>> Would it make sense to have a comment with this magic? The casual reader
>>>> might wonder WTH just happened when he stumbles on this :-)
>>>
>>> I tried writing a helpful comment but it's too late for my brain to form
>>> sensible sentences.
>>>
>>> Christophe can you send a follow-up with a comment explaining it? In
>>> particular the zero entries stand out, it's kind of subtle that those
>>> entries are only populated with the right value when we write to the
>>> page table.
>>
>> static inline pte_t ptep_get(pte_t *ptep)
>> {
>> unsigned long val = READ_ONCE(ptep->pte);
>> /* 16K pages have 4 identical value 4K entries */
>> pte_t pte = {val, val, val, val};
>> return pte;
>> }
>>
>> Maybe something like that?
>
> I think val wants to be pte_basic_t, but otherwise yeah I like that much
> better.
>
I sent a patch for that.
I'll also send one to fix mm/debug_vm_pgtable.c which also uses
READ_ONCE() to access page table entries.
Christophe
^ permalink raw reply
* Re: [PATCH 3/3] powerpc/8xx: Provide ptep_get() with 16k pages
From: Christophe Leroy @ 2020-06-18 14:19 UTC (permalink / raw)
To: Michael Ellerman, Peter Zijlstra
Cc: Will Deacon, linux-kernel, linux-mm, Paul Mackerras,
Andrew Morton, linuxppc-dev
In-Reply-To: <87o8phchnu.fsf@mpe.ellerman.id.au>
Le 18/06/2020 à 03:00, Michael Ellerman a écrit :
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> Le 17/06/2020 à 16:38, Peter Zijlstra a écrit :
>>> On Thu, Jun 18, 2020 at 12:21:22AM +1000, Michael Ellerman wrote:
>>>> Peter Zijlstra <peterz@infradead.org> writes:
>>>>> On Mon, Jun 15, 2020 at 12:57:59PM +0000, Christophe Leroy wrote:
>>>
>>>>>> +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
>>>>>> +#define __HAVE_ARCH_PTEP_GET
>>>>>> +static inline pte_t ptep_get(pte_t *ptep)
>>>>>> +{
>>>>>> + pte_t pte = {READ_ONCE(ptep->pte), 0, 0, 0};
>>>>>> +
>>>>>> + return pte;
>>>>>> +}
>>>>>> +#endif
>>>>>
>>>>> Would it make sense to have a comment with this magic? The casual reader
>>>>> might wonder WTH just happened when he stumbles on this :-)
>>>>
>>>> I tried writing a helpful comment but it's too late for my brain to form
>>>> sensible sentences.
>>>>
>>>> Christophe can you send a follow-up with a comment explaining it? In
>>>> particular the zero entries stand out, it's kind of subtle that those
>>>> entries are only populated with the right value when we write to the
>>>> page table.
>>>
>>> static inline pte_t ptep_get(pte_t *ptep)
>>> {
>>> unsigned long val = READ_ONCE(ptep->pte);
>>> /* 16K pages have 4 identical value 4K entries */
>>> pte_t pte = {val, val, val, val};
>>> return pte;
>>> }
>>>
>>> Maybe something like that?
>>
>> This should work as well. Indeed nobody cares about what's in the other
>> three. They are only there to ensure that ptep++ increases the ptep
>> pointer by 16 bytes. Only the HW require 4 identical values, that's
>> taken care of in set_pte_at() and pte_update().
>
> Right, but it seems less error-prone to have the in-memory
> representation match what we have in the page table (well that's
> in-memory too but you know what I mean).
>
>> So we should use the most efficient. Thinking once more, maybe what you
>> propose is the most efficient as there is no need to load another
>> register with value 0 in order to write it in the stack.
>
> On 64-bit I'd say it makes zero difference, the only thing that's going
> to matter is the load from ptep->pte. I don't know whether that's true
> on the 8xx cores though.
On 8xx core, loading a register with value 0 will take one cycle unless
there is some bubble left by another instruction (like a load from
memory or a taken branch). But that's in the noise.
Christophe
^ permalink raw reply
* Re: [PATCH v2 0/2] powerpc/pci: unmap interrupts when a PHB is removed
From: Cédric Le Goater @ 2020-06-18 13:47 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev, Oliver O'Halloran
In-Reply-To: <20200617162938.743439-1-clg@kaod.org>
On 6/17/20 6:29 PM, Cédric Le Goater wrote:
> Hello,
>
> When a passthrough IO adapter is removed from a pseries machine using
> hash MMU and the XIVE interrupt mode, the POWER hypervisor expects the
> guest OS to clear all page table entries related to the adapter. If
> some are still present, the RTAS call which isolates the PCI slot
> returns error 9001 "valid outstanding translations" and the removal of
> the IO adapter fails. This is because when the PHBs are scanned, Linux
> maps automatically some interrupts in the Linux interrupt number space
> but these are never removed.
>
> To solve this problem, we introduce a PPC platform specific
> pcibios_remove_bus() routine which clears all interrupt mappings when
> the bus is removed. This also clears the associated page table entries
> of the ESB pages when using XIVE.
>
> For this purpose, we record the logical interrupt numbers of the
> mapped interrupt under the PHB structure and let pcibios_remove_bus()
> do the clean up.
>
> Tested on :
>
> - PowerNV with PCI, OpenCAPI, CAPI and GPU adapters. I don't know
> how to inject a failure on a PHB but that would be a good test.
I found out that powering down the slot is enough :
echo 0 > /sys/bus/pci/slots/<slot name>/power
The IRQ cleanup is done as expected on baremetal also.
Cheers,
C.
> - KVM P8+P9 guests with passthrough PCI adapters, but PHBs can not
> be removed under QEMU/KVM.
> - PowerVM with passthrough PCI adapters (main target)
>
> Thanks,
>
> C.
>
> Changes since v1:
>
> - extended the removal to interrupts other than the legacy INTx.
>
> Cédric Le Goater (2):
> powerpc/pci: unmap legacy INTx interrupts when a PHB is removed
> powerpc/pci: unmap all interrupts when a PHB is removed
>
> arch/powerpc/include/asm/pci-bridge.h | 6 ++
> arch/powerpc/kernel/pci-common.c | 114 ++++++++++++++++++++++++++
> 2 files changed, 120 insertions(+)
>
^ permalink raw reply
* Re: [PATCH v2 2/4] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
From: Laurent Dufour @ 2020-06-18 13:31 UTC (permalink / raw)
To: Ram Pai, kvm-ppc, linuxppc-dev
Cc: cclaudio, bharata, sathnaga, aneesh.kumar, sukadev, bauerman,
david
In-Reply-To: <1592471945-24786-3-git-send-email-linuxram@us.ibm.com>
Le 18/06/2020 à 11:19, Ram Pai a écrit :
> During the life of SVM, its GFNs transition through normal, secure and
> shared states. Since the kernel does not track GFNs that are shared, it
> is not possible to disambiguate a shared GFN from a GFN whose PFN has
> not yet been migrated to a secure-PFN. Also it is not possible to
> disambiguate a secure-GFN from a GFN whose GFN has been pagedout from
> the ultravisor.
>
> The ability to identify the state of a GFN is needed to skip migration of its
> PFN to secure-PFN during ESM transition.
>
> The code is re-organized to track the states of a GFN as explained
> below.
>
> ************************************************************************
> 1. States of a GFN
> ---------------
> The GFN can be in one of the following states.
>
> (a) Secure - The GFN is secure. The GFN is associated with
> a Secure VM, the contents of the GFN is not accessible
> to the Hypervisor. This GFN can be backed by a secure-PFN,
> or can be backed by a normal-PFN with contents encrypted.
> The former is true when the GFN is paged-in into the
> ultravisor. The latter is true when the GFN is paged-out
> of the ultravisor.
>
> (b) Shared - The GFN is shared. The GFN is associated with a
> a secure VM. The contents of the GFN is accessible to
> Hypervisor. This GFN is backed by a normal-PFN and its
> content is un-encrypted.
>
> (c) Normal - The GFN is a normal. The GFN is associated with
> a normal VM. The contents of the GFN is accessible to
> the Hypervisor. Its content is never encrypted.
>
> 2. States of a VM.
> ---------------
>
> (a) Normal VM: A VM whose contents are always accessible to
> the hypervisor. All its GFNs are normal-GFNs.
>
> (b) Secure VM: A VM whose contents are not accessible to the
> hypervisor without the VM's consent. Its GFNs are
> either Shared-GFN or Secure-GFNs.
>
> (c) Transient VM: A Normal VM that is transitioning to secure VM.
> The transition starts on successful return of
> H_SVM_INIT_START, and ends on successful return
> of H_SVM_INIT_DONE. This transient VM, can have GFNs
> in any of the three states; i.e Secure-GFN, Shared-GFN,
> and Normal-GFN. The VM never executes in this state
> in supervisor-mode.
>
> 3. Memory slot State.
> ------------------
> The state of a memory slot mirrors the state of the
> VM the memory slot is associated with.
>
> 4. VM State transition.
> --------------------
>
> A VM always starts in Normal Mode.
>
> H_SVM_INIT_START moves the VM into transient state. During this
> time the Ultravisor may request some of its GFNs to be shared or
> secured. So its GFNs can be in one of the three GFN states.
>
> H_SVM_INIT_DONE moves the VM entirely from transient state to
> secure-state. At this point any left-over normal-GFNs are
> transitioned to Secure-GFN.
>
> H_SVM_INIT_ABORT moves the transient VM back to normal VM.
> All its GFNs are moved to Normal-GFNs.
>
> UV_TERMINATE transitions the secure-VM back to normal-VM. All
> the secure-GFN and shared-GFNs are transitioned to normal-GFN
> Note: The contents of the normal-GFN is undefined at this point.
>
> 5. GFN state implementation:
> -------------------------
>
> Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
> when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
> set, and contains the value of the secure-PFN.
> It is associated with a normal-PFN; also called mem_pfn, when
> the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
> The value of the normal-PFN is not tracked.
>
> Shared GFN is associated with a normal-PFN. Its pfn[] has
> KVMPPC_UVMEM_SHARED_PFN flag set. The value of the normal-PFN
> is not tracked.
>
> Normal GFN is associated with normal-PFN. Its pfn[] has
> no flag set. The value of the normal-PFN is not tracked.
>
> 6. Life cycle of a GFN
> --------------------
> --------------------------------------------------------------
> | | Share | Unshare | SVM |H_SVM_INIT_DONE|
> | |operation |operation | abort/ | |
> | | | | terminate | |
> -------------------------------------------------------------
> | | | | | |
> | Secure | Shared | Secure |Normal |Secure |
> | | | | | |
> | Shared | Shared | Secure |Normal |Shared |
> | | | | | |
> | Normal | Shared | Secure |Normal |Secure |
> --------------------------------------------------------------
>
> 7. Life cycle of a VM
> --------------------
> --------------------------------------------------------------------
> | | start | H_SVM_ |H_SVM_ |H_SVM_ |UV_SVM_ |
> | | VM |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE |
> | | | | | | |
> --------- ----------------------------------------------------------
> | | | | | | |
> | Normal | Normal | Transient|Error |Error |Normal |
> | | | | | | |
> | Secure | Error | Error |Error |Error |Normal |
> | | | | | | |
> |Transient| N/A | Error |Secure |Normal |Normal |
> --------------------------------------------------------------------
>
> ************************************************************************
>
> Cc: Paul Mackerras <paulus@ozlabs.org>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Bharata B Rao <bharata@linux.ibm.com>
> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> Cc: Laurent Dufour <ldufour@linux.ibm.com>
> Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
> Cc: kvm-ppc@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
> arch/powerpc/include/asm/kvm_book3s_uvmem.h | 6 +-
> arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +-
> arch/powerpc/kvm/book3s_hv.c | 2 +-
> arch/powerpc/kvm/book3s_hv_uvmem.c | 195 +++++++++++++++++++++++++---
> 4 files changed, 180 insertions(+), 25 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
> index 5a9834e..f0c5708 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
> @@ -21,7 +21,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
> int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn);
> unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm);
> void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
> - struct kvm *kvm, bool skip_page_out);
> + struct kvm *kvm, bool skip_page_out,
> + bool purge_gfn);
> #else
> static inline int kvmppc_uvmem_init(void)
> {
> @@ -75,6 +76,7 @@ static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn)
>
> static inline void
> kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
> - struct kvm *kvm, bool skip_page_out) { }
> + struct kvm *kvm, bool skip_page_out,
> + bool purge_gfn) { }
> #endif /* CONFIG_PPC_UV */
> #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 803940d..3448459 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -1100,7 +1100,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm,
> unsigned int shift;
>
> if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
> - kvmppc_uvmem_drop_pages(memslot, kvm, true);
> + kvmppc_uvmem_drop_pages(memslot, kvm, true, false);
When reviewing the v1 of this series, I asked you the question about the fact
that the call here is made with purge_gfn = false. Your answer was:
> This function does not know, under what context it is called. Since
> its job is to just flush the memslot, it cannot assume anything
> about purging the pages in the memslot.
Indeed in the case of the memory hotplug operation, this function is called to
wipe the page from the secure device in the case the pages are secured. In that
case the purge is required. Indeed, I checked the other call to
kvmppc_radix_flush_memslot() in kvmppc_core_flush_memslot_hv() and I cannot see
why in that case too purge_gfn should be false, especially when the memslot is
reused as detailed in __kvm_set_memory_region() around the call to
kvm_arch_flush_shadow_memslot().
I'm sorry not to have asked this earlier, but could you please elaborate on this?
>
> if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
> return;
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 6717d24..6cf80e5 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5482,7 +5482,7 @@ static int kvmhv_svm_off(struct kvm *kvm)
> continue;
>
> kvm_for_each_memslot(memslot, slots) {
> - kvmppc_uvmem_drop_pages(memslot, kvm, true);
> + kvmppc_uvmem_drop_pages(memslot, kvm, true, true);
> uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
> }
> }
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 3599aaa..666d1bb 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -98,7 +98,127 @@
> static unsigned long *kvmppc_uvmem_bitmap;
> static DEFINE_SPINLOCK(kvmppc_uvmem_bitmap_lock);
>
> -#define KVMPPC_UVMEM_PFN (1UL << 63)
> +/*
> + * States of a GFN
> + * ---------------
> + * The GFN can be in one of the following states.
> + *
> + * (a) Secure - The GFN is secure. The GFN is associated with
> + * a Secure VM, the contents of the GFN is not accessible
> + * to the Hypervisor. This GFN can be backed by a secure-PFN,
> + * or can be backed by a normal-PFN with contents encrypted.
> + * The former is true when the GFN is paged-in into the
> + * ultravisor. The latter is true when the GFN is paged-out
> + * of the ultravisor.
> + *
> + * (b) Shared - The GFN is shared. The GFN is associated with a
> + * a secure VM. The contents of the GFN is accessible to
> + * Hypervisor. This GFN is backed by a normal-PFN and its
> + * content is un-encrypted.
> + *
> + * (c) Normal - The GFN is normal. The GFN is associated with
> + * a normal VM. The contents of the GFN is accessible to
> + * the Hypervisor. Its content is never encrypted.
> + *
> + * States of a VM.
> + * ---------------
> + *
> + * Normal VM: A VM whose contents are always accessible to
> + * the hypervisor. All its GFNs are normal-GFNs.
> + *
> + * Secure VM: A VM whose contents are not accessible to the
> + * hypervisor without the VM's consent. Its GFNs are
> + * either Shared-GFN or Secure-GFNs.
> + *
> + * Transient VM: A Normal VM that is transitioning to secure VM.
> + * The transition starts on successful return of
> + * H_SVM_INIT_START, and ends on successful return
> + * of H_SVM_INIT_DONE. This transient VM, can have GFNs
> + * in any of the three states; i.e Secure-GFN, Shared-GFN,
> + * and Normal-GFN. The VM never executes in this state
> + * in supervisor-mode.
> + *
> + * Memory slot State.
> + * -----------------------------
> + * The state of a memory slot mirrors the state of the
> + * VM the memory slot is associated with.
> + *
> + * VM State transition.
> + * --------------------
> + *
> + * A VM always starts in Normal Mode.
> + *
> + * H_SVM_INIT_START moves the VM into transient state. During this
> + * time the Ultravisor may request some of its GFNs to be shared or
> + * secured. So its GFNs can be in one of the three GFN states.
> + *
> + * H_SVM_INIT_DONE moves the VM entirely from transient state to
> + * secure-state. At this point any left-over normal-GFNs are
> + * transitioned to Secure-GFN.
> + *
> + * H_SVM_INIT_ABORT moves the transient VM back to normal VM.
> + * All its GFNs are moved to Normal-GFNs.
> + *
> + * UV_TERMINATE transitions the secure-VM back to normal-VM. All
> + * the secure-GFN and shared-GFNs are transitioned to normal-GFN.
> + * Note: The contents of the normal-GFN is undefined at this point.
> + *
> + * GFN state implementation:
> + * -------------------------
> + *
> + * Secure GFN is associated with a secure-PFN; also called uvmem_pfn,
> + * when the GFN is paged-in. Its pfn[] has KVMPPC_GFN_UVMEM_PFN flag
> + * set, and contains the value of the secure-PFN.
> + * It is associated with a normal-PFN; also called mem_pfn, when
> + * the GFN is pagedout. Its pfn[] has KVMPPC_GFN_MEM_PFN flag set.
> + * The value of the normal-PFN is not tracked.
> + *
> + * Shared GFN is associated with a normal-PFN. Its pfn[] has
> + * KVMPPC_GFN_SHARED flag set. The value of the normal-PFN
> + * is not tracked.
> + *
> + * Normal GFN is associated with normal-PFN. Its pfn[] has
> + * no flag set. The value of the normal-PFN is not tracked.
> + *
> + * Life cycle of a GFN
> + * --------------------
> + *
> + * --------------------------------------------------------------
> + * | | Share | Unshare | SVM |H_SVM_INIT_DONE|
> + * | |operation |operation | abort/ | |
> + * | | | | terminate | |
> + * -------------------------------------------------------------
> + * | | | | | |
> + * | Secure | Shared | Secure |Normal |Secure |
> + * | | | | | |
> + * | Shared | Shared | Secure |Normal |Shared |
> + * | | | | | |
> + * | Normal | Shared | Secure |Normal |Secure |
> + * --------------------------------------------------------------
> + *
> + * Life cycle of a VM
> + * --------------------
> + *
> + * --------------------------------------------------------------------
> + * | | start | H_SVM_ |H_SVM_ |H_SVM_ |UV_SVM_ |
> + * | | VM |INIT_START|INIT_DONE|INIT_ABORT |TERMINATE |
> + * | | | | | | |
> + * --------- ----------------------------------------------------------
> + * | | | | | | |
> + * | Normal | Normal | Transient|Error |Error |Normal |
> + * | | | | | | |
> + * | Secure | Error | Error |Error |Error |Normal |
> + * | | | | | | |
> + * |Transient| N/A | Error |Secure |Normal |Normal |
> + * --------------------------------------------------------------------
> + */
> +
> +#define KVMPPC_GFN_UVMEM_PFN (1UL << 63)
> +#define KVMPPC_GFN_MEM_PFN (1UL << 62)
> +#define KVMPPC_GFN_SHARED (1UL << 61)
> +#define KVMPPC_GFN_SECURE (KVMPPC_GFN_UVMEM_PFN | KVMPPC_GFN_MEM_PFN)
> +#define KVMPPC_GFN_FLAG_MASK (KVMPPC_GFN_SECURE | KVMPPC_GFN_SHARED)
> +#define KVMPPC_GFN_PFN_MASK (~KVMPPC_GFN_FLAG_MASK)
>
> struct kvmppc_uvmem_slot {
> struct list_head list;
> @@ -106,11 +226,11 @@ struct kvmppc_uvmem_slot {
> unsigned long base_pfn;
> unsigned long *pfns;
> };
> -
> struct kvmppc_uvmem_page_pvt {
> struct kvm *kvm;
> unsigned long gpa;
> bool skip_page_out;
> + bool purge_gfn;
> };
>
> int kvmppc_uvmem_slot_init(struct kvm *kvm, const struct kvm_memory_slot *slot)
> @@ -154,8 +274,8 @@ void kvmppc_uvmem_slot_free(struct kvm *kvm, const struct kvm_memory_slot *slot)
> mutex_unlock(&kvm->arch.uvmem_lock);
> }
>
> -static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
> - struct kvm *kvm)
> +static void kvmppc_mark_gfn(unsigned long gfn, struct kvm *kvm,
> + unsigned long flag, unsigned long uvmem_pfn)
> {
> struct kvmppc_uvmem_slot *p;
>
> @@ -163,24 +283,41 @@ static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn,
> if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
> unsigned long index = gfn - p->base_pfn;
>
> - p->pfns[index] = uvmem_pfn | KVMPPC_UVMEM_PFN;
> + if (flag == KVMPPC_GFN_UVMEM_PFN)
> + p->pfns[index] = uvmem_pfn | flag;
> + else
> + p->pfns[index] = flag;
That's minor, but I'm wondering if that check is really needed since all the
calls to kvmppc_mark_gfn() with flags != KVMPPC_GFN_UVMEM_PFN are made with
uvmem_pfn = 0.
> return;
> }
> }
> }
>
> -static void kvmppc_uvmem_pfn_remove(unsigned long gfn, struct kvm *kvm)
> +/* mark the GFN as secure-GFN associated with @uvmem_pfn device-PFN. */
> +static void kvmppc_gfn_secure_uvmem_pfn(unsigned long gfn,
> + unsigned long uvmem_pfn, struct kvm *kvm)
> {
> - struct kvmppc_uvmem_slot *p;
> + kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_UVMEM_PFN, uvmem_pfn);
> +}
>
> - list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
> - if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
> - p->pfns[gfn - p->base_pfn] = 0;
> - return;
> - }
> - }
> +/* mark the GFN as secure-GFN associated with a memory-PFN. */
> +static void kvmppc_gfn_secure_mem_pfn(unsigned long gfn, struct kvm *kvm)
> +{
> + kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_MEM_PFN, 0);
> }
>
> +/* mark the GFN as a shared GFN. */
> +static void kvmppc_gfn_shared(unsigned long gfn, struct kvm *kvm)
> +{
> + kvmppc_mark_gfn(gfn, kvm, KVMPPC_GFN_SHARED, 0);
> +}
> +
> +/* mark the GFN as a non-existent GFN. */
> +static void kvmppc_gfn_remove(unsigned long gfn, struct kvm *kvm)
> +{
> + kvmppc_mark_gfn(gfn, kvm, 0, 0);
> +}
> +
> +/* return true, if the GFN is a secure-GFN backed by a secure-PFN */
> static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
> unsigned long *uvmem_pfn)
> {
> @@ -190,10 +327,10 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
> if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
> unsigned long index = gfn - p->base_pfn;
>
> - if (p->pfns[index] & KVMPPC_UVMEM_PFN) {
> + if (p->pfns[index] & KVMPPC_GFN_UVMEM_PFN) {
> if (uvmem_pfn)
> *uvmem_pfn = p->pfns[index] &
> - ~KVMPPC_UVMEM_PFN;
> + KVMPPC_GFN_PFN_MASK;
> return true;
> } else
> return false;
> @@ -257,9 +394,13 @@ unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
> * is HV side fault on these pages. Next we *get* these pages, forcing
> * fault on them, do fault time migration to replace the device PTEs in
> * QEMU page table with normal PTEs from newly allocated pages.
> + *
> + * if @purge_gfn is set, invalidate the GFN. GFN is not shared nor secure
> + * anymore.
> */
> void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
> - struct kvm *kvm, bool skip_page_out)
> + struct kvm *kvm, bool skip_page_out,
> + bool purge_gfn)
> {
> int i;
> struct kvmppc_uvmem_page_pvt *pvt;
> @@ -270,14 +411,17 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
> struct page *uvmem_page;
>
> mutex_lock(&kvm->arch.uvmem_lock);
> +
> if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) {
> + if (purge_gfn)
> + kvmppc_gfn_remove(gfn, kvm);
> mutex_unlock(&kvm->arch.uvmem_lock);
> continue;
> }
> -
> uvmem_page = pfn_to_page(uvmem_pfn);
> pvt = uvmem_page->zone_device_data;
> pvt->skip_page_out = skip_page_out;
> + pvt->purge_gfn = purge_gfn;
> mutex_unlock(&kvm->arch.uvmem_lock);
>
> pfn = gfn_to_pfn(kvm, gfn);
> @@ -305,7 +449,7 @@ unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm)
> srcu_idx = srcu_read_lock(&kvm->srcu);
>
> kvm_for_each_memslot(memslot, kvm_memslots(kvm))
> - kvmppc_uvmem_drop_pages(memslot, kvm, false);
> + kvmppc_uvmem_drop_pages(memslot, kvm, false, true);
>
> srcu_read_unlock(&kvm->srcu, srcu_idx);
>
> @@ -347,7 +491,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
> goto out_clear;
>
> uvmem_pfn = bit + pfn_first;
> - kvmppc_uvmem_pfn_insert(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
> + kvmppc_gfn_secure_uvmem_pfn(gpa >> PAGE_SHIFT, uvmem_pfn, kvm);
>
> pvt->gpa = gpa;
> pvt->kvm = kvm;
> @@ -454,6 +598,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
> uvmem_page = pfn_to_page(uvmem_pfn);
> pvt = uvmem_page->zone_device_data;
> pvt->skip_page_out = true;
> + pvt->purge_gfn = false;
> }
>
> retry:
> @@ -467,12 +612,16 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
> uvmem_page = pfn_to_page(uvmem_pfn);
> pvt = uvmem_page->zone_device_data;
> pvt->skip_page_out = true;
> + pvt->purge_gfn = false;
> kvm_release_pfn_clean(pfn);
> goto retry;
> }
>
> - if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0, page_shift))
> + if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
> + page_shift)) {
> + kvmppc_gfn_shared(gfn, kvm);
> ret = H_SUCCESS;
> + }
> kvm_release_pfn_clean(pfn);
> mutex_unlock(&kvm->arch.uvmem_lock);
> out:
> @@ -530,6 +679,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
> &downgrade))
> ret = H_SUCCESS;
> +
> out_unlock:
> mutex_unlock(&kvm->arch.uvmem_lock);
> out:
> @@ -655,7 +805,10 @@ static void kvmppc_uvmem_page_free(struct page *page)
>
> pvt = page->zone_device_data;
> page->zone_device_data = NULL;
> - kvmppc_uvmem_pfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
> + if (pvt->purge_gfn)
> + kvmppc_gfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
> + else
> + kvmppc_gfn_secure_mem_pfn(pvt->gpa >> PAGE_SHIFT, pvt->kvm);
> kfree(pvt);
> }
>
>
^ permalink raw reply
* Re: [PATCH] powerpc/8xx: use pmd_off() to access a PMD entry in pte_update()
From: Michael Ellerman @ 2020-06-18 12:37 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton, Mike Rapoport
Cc: Christophe Leroy, linux-mm, linuxppc-dev, linux-kernel,
Mike Rapoport
In-Reply-To: <20200615092229.23142-1-rppt@kernel.org>
On Mon, 15 Jun 2020 12:22:29 +0300, Mike Rapoport wrote:
> The pte_update() implementation for PPC_8xx unfolds page table from the PGD
> level to access a PMD entry. Since 8xx has only 2-level page table this can
> be simplified with pmd_off() shortcut.
>
> Replace explicit unfolding with pmd_off() and drop defines of pgd_index()
> and pgd_offset() that are no longer needed.
Applied to powerpc/fixes.
[1/1] powerpc/8xx: use pmd_off() to access a PMD entry in pte_update()
https://git.kernel.org/powerpc/c/687993ccf3b05070598b89fad97410b26d7bc9d2
cheers
^ permalink raw reply
* Re: [PATCH] powerpc/64s: Fix KVM interrupt using wrong save area
From: Michael Ellerman @ 2020-06-18 12:37 UTC (permalink / raw)
To: linuxppc-dev, Nicholas Piggin; +Cc: Christian Zigotzky
In-Reply-To: <20200615061247.1310763-1-npiggin@gmail.com>
On Mon, 15 Jun 2020 16:12:47 +1000, Nicholas Piggin wrote:
> The CTR register reload in the KVM interrupt path used the wrong save
> area for SLB (and NMI) interrupts.
Applied to powerpc/fixes.
[1/1] powerpc/64s: Fix KVM interrupt using wrong save area
https://git.kernel.org/powerpc/c/0bdcfa182506526fbe4e088ff9ca86a31b81828d
cheers
^ permalink raw reply
* Re: [PATCH 1/2] powerpc/syscalls: Use the number when building SPU syscall table
From: Michael Ellerman @ 2020-06-18 12:37 UTC (permalink / raw)
To: Michael Ellerman, linuxppc-dev; +Cc: linux-arch, linux-kernel, arnd
In-Reply-To: <20200616135617.2937252-1-mpe@ellerman.id.au>
On Tue, 16 Jun 2020 23:56:16 +1000, Michael Ellerman wrote:
> Currently the macro that inserts entries into the SPU syscall table
> doesn't actually use the "nr" (syscall number) parameter.
>
> This does work, but it relies on the exact right number of syscall
> entries being emitted in order for the syscall numbers to line up with
> the array entries. If for example we had two entries with the same
> syscall number we wouldn't get an error, it would just cause all
> subsequent syscalls to be off by one in the spu_syscall_table.
>
> [...]
Applied to powerpc/fixes.
[1/2] powerpc/syscalls: Use the number when building SPU syscall table
https://git.kernel.org/powerpc/c/1497eea68624f6076bf3eaf66baec3771ea04045
[2/2] powerpc/syscalls: Split SPU-ness out of ABI
https://git.kernel.org/powerpc/c/35e32a6cb5f694fda54a5f391917e4ceefa0fece
cheers
^ permalink raw reply
* Re: [PATCH 0/3] Fix build failure with v5.8-rc1
From: Michael Ellerman @ 2020-06-18 12:37 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Christophe Leroy,
Michael Ellerman, Peter Zijlstra (Intel), Will Deacon,
Andrew Morton
Cc: linux-mm, linuxppc-dev, linux-kernel
In-Reply-To: <cover.1592225557.git.christophe.leroy@csgroup.eu>
On Mon, 15 Jun 2020 12:57:55 +0000 (UTC), Christophe Leroy wrote:
> Commit 2ab3a0a02905 ("READ_ONCE: Enforce atomicity for
> {READ,WRITE}_ONCE() memory accesses") leads to following build
> failure on powerpc 8xx.
>
> To fix it, this small series introduces a new helper named ptep_get()
> to replace the direct access with READ_ONCE(). This new helper
> can be overridden by architectures.
>
> [...]
Applied to powerpc/fixes.
[1/3] mm/gup: Use huge_ptep_get() in gup_hugepte()
https://git.kernel.org/powerpc/c/01a80ec6495f9e43f61b3231f3b283ca050a800e
[2/3] mm: Allow arches to provide ptep_get()
https://git.kernel.org/powerpc/c/f7583fd6bdcc4d0b43f68fb81ebfae9669ee9338
[3/3] powerpc/8xx: Provide ptep_get() with 16k pages
https://git.kernel.org/powerpc/c/b55129f97aeefd265314e12d98935330e011a14a
cheers
^ permalink raw reply
* Re: [PATCH v2 1/4] powerpc/instruction_dump: Fix kernel crash with show_instructions
From: Michael Ellerman @ 2020-06-18 12:37 UTC (permalink / raw)
To: linuxppc-dev, mpe, Aneesh Kumar K.V
In-Reply-To: <20200524093822.423487-1-aneesh.kumar@linux.ibm.com>
On Sun, 24 May 2020 15:08:19 +0530, Aneesh Kumar K.V wrote:
> With Hard Lockup watchdog, we can hit a BUG() if we take a watchdog
> interrupt when in OPAL mode. This happens in show_instructions()
> where the kernel takes the watchdog NMI IPI with MSR_IR == 0.
> With that show_instructions() updates the variable pc in the loop
> and the second iteration will result in BUG().
>
> We hit the BUG_ON due to the below check in __va()
>
> [...]
Patch 1 applied to powerpc/fixes.
[1/4] powerpc: Fix kernel crash in show_instructions() w/DEBUG_VIRTUAL
https://git.kernel.org/powerpc/c/a6e2c226c3d51fd93636320e47cabc8a8f0824c5
cheers
^ permalink raw reply
* [PATCH 2/2] powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show cpumask
From: Kajol Jain @ 2020-06-18 12:27 UTC (permalink / raw)
To: linuxppc-dev, mpe; +Cc: nathanl, kjain, suka, maddy, anju
In-Reply-To: <20200618122713.9030-1-kjain@linux.ibm.com>
Patch here adds a cpumask attr to hv_24x7 pmu along with ABI documentation.
command:# cat /sys/devices/hv_24x7/cpumask
0
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
---
.../sysfs-bus-event_source-devices-hv_24x7 | 6 ++++
arch/powerpc/perf/hv-24x7.c | 31 ++++++++++++++++++-
2 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7 b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
index e8698afcd952..281e7b367733 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
@@ -43,6 +43,12 @@ Description: read only
This sysfs interface exposes the number of cores per chip
present in the system.
+What: /sys/devices/hv_24x7/cpumask
+Date: June 2020
+Contact: Linux on PowerPC Developer List <linuxppc-dev@lists.ozlabs.org>
+Description: read only
+ This sysfs file exposes cpumask.
+
What: /sys/bus/event_source/devices/hv_24x7/event_descs/<event-name>
Date: February 2014
Contact: Linux on PowerPC Developer List <linuxppc-dev@lists.ozlabs.org>
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index fdc4ae155d60..03d870a9fc36 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -448,6 +448,12 @@ static ssize_t device_show_string(struct device *dev,
return sprintf(buf, "%s\n", (char *)d->var);
}
+static ssize_t cpumask_get_attr(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask);
+}
+
static ssize_t sockets_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -1116,6 +1122,17 @@ static DEVICE_ATTR_RO(sockets);
static DEVICE_ATTR_RO(chipspersocket);
static DEVICE_ATTR_RO(coresperchip);
+static DEVICE_ATTR(cpumask, S_IRUGO, cpumask_get_attr, NULL);
+
+static struct attribute *cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+ .attrs = cpumask_attrs,
+};
+
static struct bin_attribute *if_bin_attrs[] = {
&bin_attr_catalog,
NULL,
@@ -1143,6 +1160,11 @@ static const struct attribute_group *attr_groups[] = {
&event_desc_group,
&event_long_desc_group,
&if_group,
+ /*
+ * This NULL is a placeholder for the cpumask attr which will be updated
+ * only if cpuhotplug registration is successful
+ */
+ NULL,
NULL,
};
@@ -1727,8 +1749,15 @@ static int hv_24x7_init(void)
/* init cpuhotplug */
r = hv_24x7_cpu_hotplug_init();
- if (r)
+ if (r) {
pr_err("hv_24x7: CPU hotplug init failed\n");
+ } else {
+ /*
+ * Cpu hotplug init is successful, add the
+ * cpumask file as part of pmu attr group
+ */
+ attr_groups[5] = &cpumask_attr_group;
+ }
r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
if (r)
--
2.18.2
^ permalink raw reply related
* [PATCH 1/2] powerpc/perf/hv-24x7: Add cpu hotplug support
From: Kajol Jain @ 2020-06-18 12:27 UTC (permalink / raw)
To: linuxppc-dev, mpe; +Cc: nathanl, kjain, suka, maddy, anju
In-Reply-To: <20200618122713.9030-1-kjain@linux.ibm.com>
Patch here adds cpu hotplug functions to hv_24x7 pmu.
A new cpuhp_state "CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE" enum
is added.
The online function updates the cpumask only if it is empty,
as the primary intention for adding hotplug support
is to designate a CPU to make the HCALL that collects the
count data.
The offline function tests and clears the corresponding cpu in the cpumask
and updates the cpumask to any other active cpu.
With this patchset, perf tool side does not need "-C <cpu>"
to be added.
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
---
arch/powerpc/perf/hv-24x7.c | 45 +++++++++++++++++++++++++++++++++++++
include/linux/cpuhotplug.h | 1 +
2 files changed, 46 insertions(+)
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index db213eb7cb02..fdc4ae155d60 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -31,6 +31,8 @@ static int interface_version;
/* Whether we have to aggregate result data for some domains. */
static bool aggregate_result_elements;
+static cpumask_t hv_24x7_cpumask;
+
static bool domain_is_valid(unsigned domain)
{
switch (domain) {
@@ -1641,6 +1643,44 @@ static struct pmu h_24x7_pmu = {
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
};
+static int ppc_hv_24x7_cpu_online(unsigned int cpu)
+{
+ /* Make this CPU the designated target for counter collection */
+ if (cpumask_empty(&hv_24x7_cpumask))
+ cpumask_set_cpu(cpu, &hv_24x7_cpumask);
+
+ return 0;
+}
+
+static int ppc_hv_24x7_cpu_offline(unsigned int cpu)
+{
+ int target = -1;
+
+ /* Check if exiting cpu is used for collecting 24x7 events */
+ if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask))
+ return 0;
+
+ /* Find a new cpu to collect 24x7 events */
+ target = cpumask_any_but(cpu_active_mask, cpu);
+
+ if (target < 0 || target >= nr_cpu_ids)
+ return -1;
+
+ /* Migrate 24x7 events to the new target */
+ cpumask_set_cpu(target, &hv_24x7_cpumask);
+ perf_pmu_migrate_context(&h_24x7_pmu, cpu, target);
+
+ return 0;
+}
+
+static int hv_24x7_cpu_hotplug_init(void)
+{
+ return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
+ "perf/powerpc/hv_24x7:online",
+ ppc_hv_24x7_cpu_online,
+ ppc_hv_24x7_cpu_offline);
+}
+
static int hv_24x7_init(void)
{
int r;
@@ -1685,6 +1725,11 @@ static int hv_24x7_init(void)
if (r)
return r;
+ /* init cpuhotplug */
+ r = hv_24x7_cpu_hotplug_init();
+ if (r)
+ pr_err("hv_24x7: CPU hotplug init failed\n");
+
r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
if (r)
return r;
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 8377afef8806..16ed8f6f8774 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -180,6 +180,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+ CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
CPUHP_AP_WATCHDOG_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
--
2.18.2
^ permalink raw reply related
* [PATCH 0/2] Add cpu hotplug support for powerpc/perf/hv-24x7
From: Kajol Jain @ 2020-06-18 12:27 UTC (permalink / raw)
To: linuxppc-dev, mpe; +Cc: nathanl, kjain, suka, maddy, anju
This patchset adds cpu hotplug support for the hv_24x7 driver by adding
online/offline cpu hotplug functions. It also adds a sysfs file
"cpumask" to expose the current online cpu that can be used for
hv_24x7 event counts.
Kajol Jain (2):
powerpc/perf/hv-24x7: Add cpu hotplug support
powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show cpumask
.../sysfs-bus-event_source-devices-hv_24x7 | 6 ++
arch/powerpc/perf/hv-24x7.c | 74 +++++++++++++++++++
include/linux/cpuhotplug.h | 1 +
3 files changed, 81 insertions(+)
--
2.18.2
^ permalink raw reply
* [PATCH] powerpc/8xx: Modify ptep_get()
From: Christophe Leroy @ 2020-06-18 12:07 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
Will Deacon, Andrew Morton, Peter Zijlstra (Intel)
Cc: linux-mm, linuxppc-dev, linux-kernel
Move ptep_get() close to pte_update(), in an ifdef section already
dedicated to powerpc 8xx. This section contains explanation about
the layout of page table entries.
Also modify it to return the pte value replicated four times instead of
padding with zeroes.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/nohash/32/pgtable.h | 22 +++++++++++---------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index b0afbdd07740..b9e134d0f03a 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -249,6 +249,18 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
return old;
}
+
+#ifdef CONFIG_PPC_16K_PAGES
+#define __HAVE_ARCH_PTEP_GET
+static inline pte_t ptep_get(pte_t *ptep)
+{
+ pte_basic_t val = READ_ONCE(ptep->pte);
+ pte_t pte = {val, val, val, val};
+
+ return pte;
+}
+#endif /* CONFIG_PPC_16K_PAGES */
+
#else
static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p,
unsigned long clr, unsigned long set, int huge)
@@ -284,16 +296,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
return __pte(pte_update(mm, addr, ptep, ~0, 0, 0));
}
-#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
-#define __HAVE_ARCH_PTEP_GET
-static inline pte_t ptep_get(pte_t *ptep)
-{
- pte_t pte = {READ_ONCE(ptep->pte), 0, 0, 0};
-
- return pte;
-}
-#endif
-
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
--
2.25.0
^ permalink raw reply related
* [PATCH] ASoC: fsl_spdif: Add pm runtime function
From: Shengjiu Wang @ 2020-06-18 11:55 UTC (permalink / raw)
To: timur, nicoleotsuka, Xiubo.Lee, festevam, broonie, perex, tiwai,
alsa-devel
Cc: linuxppc-dev, linux-kernel
Add pm runtime support and move clock handling there.
Close the clocks at suspend to reduce the power consumption.
fsl_spdif_suspend is replaced by pm_runtime_force_suspend.
fsl_spdif_resume is replaced by pm_runtime_force_resume.
Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
---
sound/soc/fsl/fsl_spdif.c | 113 ++++++++++++++++++++++----------------
1 file changed, 67 insertions(+), 46 deletions(-)
diff --git a/sound/soc/fsl/fsl_spdif.c b/sound/soc/fsl/fsl_spdif.c
index 5bc0e4729341..46719fd2f1ec 100644
--- a/sound/soc/fsl/fsl_spdif.c
+++ b/sound/soc/fsl/fsl_spdif.c
@@ -16,6 +16,7 @@
#include <linux/of_device.h>
#include <linux/of_irq.h>
#include <linux/regmap.h>
+#include <linux/pm_runtime.h>
#include <sound/asoundef.h>
#include <sound/dmaengine_pcm.h>
@@ -495,25 +496,10 @@ static int fsl_spdif_startup(struct snd_pcm_substream *substream,
struct platform_device *pdev = spdif_priv->pdev;
struct regmap *regmap = spdif_priv->regmap;
u32 scr, mask;
- int i;
int ret;
/* Reset module and interrupts only for first initialization */
if (!snd_soc_dai_active(cpu_dai)) {
- ret = clk_prepare_enable(spdif_priv->coreclk);
- if (ret) {
- dev_err(&pdev->dev, "failed to enable core clock\n");
- return ret;
- }
-
- if (!IS_ERR(spdif_priv->spbaclk)) {
- ret = clk_prepare_enable(spdif_priv->spbaclk);
- if (ret) {
- dev_err(&pdev->dev, "failed to enable spba clock\n");
- goto err_spbaclk;
- }
- }
-
ret = spdif_softreset(spdif_priv);
if (ret) {
dev_err(&pdev->dev, "failed to soft reset\n");
@@ -531,18 +517,10 @@ static int fsl_spdif_startup(struct snd_pcm_substream *substream,
mask = SCR_TXFIFO_AUTOSYNC_MASK | SCR_TXFIFO_CTRL_MASK |
SCR_TXSEL_MASK | SCR_USRC_SEL_MASK |
SCR_TXFIFO_FSEL_MASK;
- for (i = 0; i < SPDIF_TXRATE_MAX; i++) {
- ret = clk_prepare_enable(spdif_priv->txclk[i]);
- if (ret)
- goto disable_txclk;
- }
} else {
scr = SCR_RXFIFO_FSEL_IF8 | SCR_RXFIFO_AUTOSYNC;
mask = SCR_RXFIFO_FSEL_MASK | SCR_RXFIFO_AUTOSYNC_MASK|
SCR_RXFIFO_CTL_MASK | SCR_RXFIFO_OFF_MASK;
- ret = clk_prepare_enable(spdif_priv->rxclk);
- if (ret)
- goto err;
}
regmap_update_bits(regmap, REG_SPDIF_SCR, mask, scr);
@@ -551,15 +529,7 @@ static int fsl_spdif_startup(struct snd_pcm_substream *substream,
return 0;
-disable_txclk:
- for (i--; i >= 0; i--)
- clk_disable_unprepare(spdif_priv->txclk[i]);
err:
- if (!IS_ERR(spdif_priv->spbaclk))
- clk_disable_unprepare(spdif_priv->spbaclk);
-err_spbaclk:
- clk_disable_unprepare(spdif_priv->coreclk);
-
return ret;
}
@@ -569,20 +539,17 @@ static void fsl_spdif_shutdown(struct snd_pcm_substream *substream,
struct snd_soc_pcm_runtime *rtd = substream->private_data;
struct fsl_spdif_priv *spdif_priv = snd_soc_dai_get_drvdata(asoc_rtd_to_cpu(rtd, 0));
struct regmap *regmap = spdif_priv->regmap;
- u32 scr, mask, i;
+ u32 scr, mask;
if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
scr = 0;
mask = SCR_TXFIFO_AUTOSYNC_MASK | SCR_TXFIFO_CTRL_MASK |
SCR_TXSEL_MASK | SCR_USRC_SEL_MASK |
SCR_TXFIFO_FSEL_MASK;
- for (i = 0; i < SPDIF_TXRATE_MAX; i++)
- clk_disable_unprepare(spdif_priv->txclk[i]);
} else {
scr = SCR_RXFIFO_OFF | SCR_RXFIFO_CTL_ZERO;
mask = SCR_RXFIFO_FSEL_MASK | SCR_RXFIFO_AUTOSYNC_MASK|
SCR_RXFIFO_CTL_MASK | SCR_RXFIFO_OFF_MASK;
- clk_disable_unprepare(spdif_priv->rxclk);
}
regmap_update_bits(regmap, REG_SPDIF_SCR, mask, scr);
@@ -591,9 +558,6 @@ static void fsl_spdif_shutdown(struct snd_pcm_substream *substream,
spdif_intr_status_clear(spdif_priv);
regmap_update_bits(regmap, REG_SPDIF_SCR,
SCR_LOW_POWER, SCR_LOW_POWER);
- if (!IS_ERR(spdif_priv->spbaclk))
- clk_disable_unprepare(spdif_priv->spbaclk);
- clk_disable_unprepare(spdif_priv->coreclk);
}
}
@@ -1350,6 +1314,8 @@ static int fsl_spdif_probe(struct platform_device *pdev)
/* Register with ASoC */
dev_set_drvdata(&pdev->dev, spdif_priv);
+ pm_runtime_enable(&pdev->dev);
+ regcache_cache_only(spdif_priv->regmap, true);
ret = devm_snd_soc_register_component(&pdev->dev, &fsl_spdif_component,
&spdif_priv->cpu_dai_drv, 1);
@@ -1365,36 +1331,91 @@ static int fsl_spdif_probe(struct platform_device *pdev)
return ret;
}
-#ifdef CONFIG_PM_SLEEP
-static int fsl_spdif_suspend(struct device *dev)
+#ifdef CONFIG_PM
+static int fsl_spdif_runtime_suspend(struct device *dev)
{
struct fsl_spdif_priv *spdif_priv = dev_get_drvdata(dev);
+ int i;
regmap_read(spdif_priv->regmap, REG_SPDIF_SRPC,
&spdif_priv->regcache_srpc);
-
regcache_cache_only(spdif_priv->regmap, true);
- regcache_mark_dirty(spdif_priv->regmap);
+
+ clk_disable_unprepare(spdif_priv->rxclk);
+
+ for (i = 0; i < SPDIF_TXRATE_MAX; i++)
+ clk_disable_unprepare(spdif_priv->txclk[i]);
+
+ if (!IS_ERR(spdif_priv->spbaclk))
+ clk_disable_unprepare(spdif_priv->spbaclk);
+ clk_disable_unprepare(spdif_priv->coreclk);
return 0;
}
-static int fsl_spdif_resume(struct device *dev)
+static int fsl_spdif_runtime_resume(struct device *dev)
{
struct fsl_spdif_priv *spdif_priv = dev_get_drvdata(dev);
+ int ret;
+ int i;
+
+ ret = clk_prepare_enable(spdif_priv->coreclk);
+ if (ret) {
+ dev_err(dev, "failed to enable core clock\n");
+ return ret;
+ }
+
+ if (!IS_ERR(spdif_priv->spbaclk)) {
+ ret = clk_prepare_enable(spdif_priv->spbaclk);
+ if (ret) {
+ dev_err(dev, "failed to enable spba clock\n");
+ goto disable_core_clk;
+ }
+ }
+
+ for (i = 0; i < SPDIF_TXRATE_MAX; i++) {
+ ret = clk_prepare_enable(spdif_priv->txclk[i]);
+ if (ret)
+ goto disable_spba_clk;
+ }
+
+ ret = clk_prepare_enable(spdif_priv->rxclk);
+ if (ret)
+ goto disable_tx_clk;
regcache_cache_only(spdif_priv->regmap, false);
+ regcache_mark_dirty(spdif_priv->regmap);
regmap_update_bits(spdif_priv->regmap, REG_SPDIF_SRPC,
SRPC_CLKSRC_SEL_MASK | SRPC_GAINSEL_MASK,
spdif_priv->regcache_srpc);
- return regcache_sync(spdif_priv->regmap);
+ ret = regcache_sync(spdif_priv->regmap);
+ if (ret)
+ goto disable_rx_clk;
+
+ return 0;
+
+disable_rx_clk:
+ clk_disable_unprepare(spdif_priv->rxclk);
+disable_tx_clk:
+disable_spba_clk:
+ for (i--; i >= 0; i--)
+ clk_disable_unprepare(spdif_priv->txclk[i]);
+ if (!IS_ERR(spdif_priv->spbaclk))
+ clk_disable_unprepare(spdif_priv->spbaclk);
+disable_core_clk:
+ clk_disable_unprepare(spdif_priv->coreclk);
+
+ return ret;
}
-#endif /* CONFIG_PM_SLEEP */
+#endif
static const struct dev_pm_ops fsl_spdif_pm = {
- SET_SYSTEM_SLEEP_PM_OPS(fsl_spdif_suspend, fsl_spdif_resume)
+ SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+ pm_runtime_force_resume)
+ SET_RUNTIME_PM_OPS(fsl_spdif_runtime_suspend, fsl_spdif_runtime_resume,
+ NULL)
};
static const struct of_device_id fsl_spdif_dt_ids[] = {
--
2.21.0
^ permalink raw reply related
* [PATCH v2 4/4] KVM: PPC: Book3S HV: migrate hot plugged memory
From: Ram Pai @ 2020-06-18 9:19 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev
Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
sukadev, bauerman, david
In-Reply-To: <1592471945-24786-1-git-send-email-linuxram@us.ibm.com>
From: Laurent Dufour <ldufour@linux.ibm.com>
When a memory slot is hot plugged to a SVM, PFNs associated with the
GFNs in that slot must be migrated to the secure-PFNs, aka device-PFNs.
kvmppc_uv_migrate_mem_slot() is called to accomplish this. UV_PAGE_IN
ucall is skipped, since the ultravisor does not trust the content of
those pages and hence ignores it.
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
[resolved conflicts, and modified the commit log]
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
---
arch/powerpc/include/asm/kvm_book3s_uvmem.h | 2 ++
arch/powerpc/kvm/book3s_hv.c | 10 ++++++----
arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +-
3 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
index f0c5708..05ae789 100644
--- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h
+++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h
@@ -23,6 +23,8 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
struct kvm *kvm, bool skip_page_out,
bool purge_gfn);
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot);
#else
static inline int kvmppc_uvmem_init(void)
{
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6cf80e5..bf7324d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4531,10 +4531,12 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
case KVM_MR_CREATE:
if (kvmppc_uvmem_slot_init(kvm, new))
return;
- uv_register_mem_slot(kvm->arch.lpid,
- new->base_gfn << PAGE_SHIFT,
- new->npages * PAGE_SIZE,
- 0, new->id);
+ if (uv_register_mem_slot(kvm->arch.lpid,
+ new->base_gfn << PAGE_SHIFT,
+ new->npages * PAGE_SIZE,
+ 0, new->id))
+ return;
+ kvmppc_uv_migrate_mem_slot(kvm, new);
break;
case KVM_MR_DELETE:
uv_unregister_mem_slot(kvm->arch.lpid, old->id);
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 78f8580..4d8f5bc 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -451,7 +451,7 @@ static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
return ret;
}
-static int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
unsigned long gfn = memslot->base_gfn;
--
1.8.3.1
^ permalink raw reply related
* [PATCH v2 3/4] KVM: PPC: Book3S HV: migrate remaining normal-GFNs to secure-GFNs in H_SVM_INIT_DONE
From: Ram Pai @ 2020-06-18 9:19 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev
Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
sukadev, bauerman, david
In-Reply-To: <1592471945-24786-1-git-send-email-linuxram@us.ibm.com>
H_SVM_INIT_DONE incorrectly assumes that the Ultravisor has explicitly
called H_SVM_PAGE_IN for all secure pages. These GFNs continue to be
normal GFNs associated with normal PFNs, when in fact they should
have been secure GFNs associated with device PFNs.
Move all the PFNs associated with the SVM's GFNs to secure-PFNs in
H_SVM_INIT_DONE. Skip the GFNs that are already paged-in or shared
through H_SVM_PAGE_IN, or paged-in followed by a page-out through
UV_PAGE_OUT.
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
Documentation/powerpc/ultravisor.rst | 2 +
arch/powerpc/kvm/book3s_hv_uvmem.c | 235 +++++++++++++++++++++++++----------
2 files changed, 171 insertions(+), 66 deletions(-)
diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst
index 363736d..3bc8957 100644
--- a/Documentation/powerpc/ultravisor.rst
+++ b/Documentation/powerpc/ultravisor.rst
@@ -933,6 +933,8 @@ Return values
* H_UNSUPPORTED if called from the wrong context (e.g.
from an SVM or before an H_SVM_INIT_START
hypercall).
+ * H_STATE if the hypervisor could not successfully
+ transition the VM to Secure VM.
Description
~~~~~~~~~~~
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 666d1bb..78f8580 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -339,6 +339,21 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
return false;
}
+/* return true, if the GFN is a shared-GFN, or a secure-GFN */
+bool kvmppc_gfn_has_transitioned(unsigned long gfn, struct kvm *kvm)
+{
+ struct kvmppc_uvmem_slot *p;
+
+ list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) {
+ if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) {
+ unsigned long index = gfn - p->base_pfn;
+
+ return (p->pfns[index] & KVMPPC_GFN_FLAG_MASK);
+ }
+ }
+ return false;
+}
+
unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
{
struct kvm_memslots *slots;
@@ -377,14 +392,152 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
return ret;
}
+static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm);
+
+/*
+ * Alloc a PFN from private device memory pool. If @pagein is true,
+ * copy page from normal memory to secure memory using UV_PAGE_IN uvcall.
+ */
+static int kvmppc_svm_migrate_page(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end, unsigned long gpa, struct kvm *kvm,
+ unsigned long page_shift,
+ bool pagein)
+{
+ unsigned long src_pfn, dst_pfn = 0;
+ struct migrate_vma mig;
+ struct page *dpage;
+ struct page *spage;
+ unsigned long pfn;
+ int ret = 0;
+
+ memset(&mig, 0, sizeof(mig));
+ mig.vma = vma;
+ mig.start = start;
+ mig.end = end;
+ mig.src = &src_pfn;
+ mig.dst = &dst_pfn;
+
+ ret = migrate_vma_setup(&mig);
+ if (ret)
+ return ret;
+
+ if (!(*mig.src & MIGRATE_PFN_MIGRATE)) {
+ ret = -1;
+ goto out_finalize;
+ }
+
+ dpage = kvmppc_uvmem_get_page(gpa, kvm);
+ if (!dpage) {
+ ret = -1;
+ goto out_finalize;
+ }
+
+ if (pagein) {
+ pfn = *mig.src >> MIGRATE_PFN_SHIFT;
+ spage = migrate_pfn_to_page(*mig.src);
+ if (spage) {
+ ret = uv_page_in(kvm->arch.lpid, pfn << page_shift,
+ gpa, 0, page_shift);
+ if (ret)
+ goto out_finalize;
+ }
+ }
+
+ *mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
+ migrate_vma_pages(&mig);
+out_finalize:
+ migrate_vma_finalize(&mig);
+ return ret;
+}
+
+static int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot)
+{
+ unsigned long gfn = memslot->base_gfn;
+ unsigned long end;
+ bool downgrade = false;
+ struct vm_area_struct *vma;
+ int i, ret = 0;
+ unsigned long start = gfn_to_hva(kvm, gfn);
+
+ if (kvm_is_error_hva(start))
+ return H_STATE;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+
+ down_write(&kvm->mm->mmap_sem);
+
+ mutex_lock(&kvm->arch.uvmem_lock);
+ vma = find_vma_intersection(kvm->mm, start, end);
+ if (!vma || vma->vm_start > start || vma->vm_end < end) {
+ ret = H_STATE;
+ goto out_unlock;
+ }
+
+ ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+ MADV_UNMERGEABLE, &vma->vm_flags);
+ downgrade_write(&kvm->mm->mmap_sem);
+ downgrade = true;
+ if (ret) {
+ ret = H_STATE;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < memslot->npages; i++, ++gfn) {
+ /*
+ * skip GFNs that have already tranistioned.
+ * paged-in GFNs, shared GFNs, paged-in GFNs
+ * that were later paged-out.
+ */
+ if (kvmppc_gfn_has_transitioned(gfn, kvm))
+ continue;
+
+ start = gfn_to_hva(kvm, gfn);
+ end = start + (1UL << PAGE_SHIFT);
+ ret = kvmppc_svm_migrate_page(vma, start, end,
+ (gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
+
+ if (ret)
+ goto out_unlock;
+ }
+
+out_unlock:
+ mutex_unlock(&kvm->arch.uvmem_lock);
+ if (downgrade)
+ up_read(&kvm->mm->mmap_sem);
+ else
+ up_write(&kvm->mm->mmap_sem);
+ return ret;
+}
+
unsigned long kvmppc_h_svm_init_done(struct kvm *kvm)
{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int srcu_idx;
+ long ret = H_SUCCESS;
+
if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
return H_UNSUPPORTED;
+ /* migrate any unmoved normal pfn to device pfns*/
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ slots = kvm_memslots(kvm);
+ kvm_for_each_memslot(memslot, slots) {
+ ret = kvmppc_uv_migrate_mem_slot(kvm, memslot);
+ if (ret) {
+ ret = H_STATE;
+ goto out;
+ }
+ }
+
kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE;
pr_info("LPID %d went secure\n", kvm->arch.lpid);
- return H_SUCCESS;
+
+out:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ return ret;
}
/*
@@ -510,68 +663,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
}
/*
- * Alloc a PFN from private device memory pool and copy page from normal
- * memory to secure memory using UV_PAGE_IN uvcall.
- */
-static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, unsigned long gpa, struct kvm *kvm,
- unsigned long page_shift, bool *downgrade)
-{
- unsigned long src_pfn, dst_pfn = 0;
- struct migrate_vma mig;
- struct page *spage;
- unsigned long pfn;
- struct page *dpage;
- int ret = 0;
-
- memset(&mig, 0, sizeof(mig));
- mig.vma = vma;
- mig.start = start;
- mig.end = end;
- mig.src = &src_pfn;
- mig.dst = &dst_pfn;
-
- /*
- * We come here with mmap_sem write lock held just for
- * ksm_madvise(), otherwise we only need read mmap_sem.
- * Hence downgrade to read lock once ksm_madvise() is done.
- */
- ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, &vma->vm_flags);
- downgrade_write(&kvm->mm->mmap_sem);
- *downgrade = true;
- if (ret)
- return ret;
-
- ret = migrate_vma_setup(&mig);
- if (ret)
- return ret;
-
- if (!(*mig.src & MIGRATE_PFN_MIGRATE)) {
- ret = -1;
- goto out_finalize;
- }
-
- dpage = kvmppc_uvmem_get_page(gpa, kvm);
- if (!dpage) {
- ret = -1;
- goto out_finalize;
- }
-
- pfn = *mig.src >> MIGRATE_PFN_SHIFT;
- spage = migrate_pfn_to_page(*mig.src);
- if (spage)
- uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0,
- page_shift);
-
- *mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
- migrate_vma_pages(&mig);
-out_finalize:
- migrate_vma_finalize(&mig);
- return ret;
-}
-
-/*
* Shares the page with HV, thus making it a normal page.
*
* - If the page is already secure, then provision a new page and share
@@ -676,9 +767,21 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
if (!vma || vma->vm_start > start || vma->vm_end < end)
goto out_unlock;
- if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift,
- &downgrade))
- ret = H_SUCCESS;
+ ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+ MADV_UNMERGEABLE, &vma->vm_flags);
+ downgrade_write(&kvm->mm->mmap_sem);
+ downgrade = true;
+ if (ret) {
+ ret = H_PARAMETER;
+ goto out_unlock;
+ }
+
+ ret = H_PARAMETER;
+ if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
+ true))
+ goto out_unlock;
+
+ ret = H_SUCCESS;
out_unlock:
mutex_unlock(&kvm->arch.uvmem_lock);
--
1.8.3.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox