LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 2/2] tests: add close_range() tests
From: Christian Brauner @ 2019-05-21 11:34 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, Christian Brauner,
	linux-mips, linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
	linux-arm-kernel, fweimer, linux-parisc, torvalds, oleg,
	linux-alpha, linuxppc-dev
In-Reply-To: <20190521113448.20654-1-christian@brauner.io>

This adds basic tests for the new close_range() syscall.
- test that no invalid flags can be passed
- test that a range of file descriptors is correctly closed
- test that a range of file descriptors is correctly closed if there there
  are already closed file descriptors in the range
- test that max_fd is correctly capped to the current fdtable maximum

Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dmitry V. Levin <ldv@altlinux.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
---
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/core/.gitignore       |   1 +
 tools/testing/selftests/core/Makefile         |   6 +
 .../testing/selftests/core/close_range_test.c | 128 ++++++++++++++++++
 4 files changed, 136 insertions(+)
 create mode 100644 tools/testing/selftests/core/.gitignore
 create mode 100644 tools/testing/selftests/core/Makefile
 create mode 100644 tools/testing/selftests/core/close_range_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 9781ca79794a..06e57fabbff9 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += bpf
 TARGETS += breakpoints
 TARGETS += capabilities
 TARGETS += cgroup
+TARGETS += core
 TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
diff --git a/tools/testing/selftests/core/.gitignore b/tools/testing/selftests/core/.gitignore
new file mode 100644
index 000000000000..6e6712ce5817
--- /dev/null
+++ b/tools/testing/selftests/core/.gitignore
@@ -0,0 +1 @@
+close_range_test
diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile
new file mode 100644
index 000000000000..de3ae68aa345
--- /dev/null
+++ b/tools/testing/selftests/core/Makefile
@@ -0,0 +1,6 @@
+CFLAGS += -g -I../../../../usr/include/ -I../../../../include
+
+TEST_GEN_PROGS := close_range_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
new file mode 100644
index 000000000000..ab10cd205ab9
--- /dev/null
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/kernel.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
+				  unsigned int flags)
+{
+	return syscall(__NR_close_range, fd, max_fd, flags);
+}
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+int main(int argc, char **argv)
+{
+	const char *test_name = "close_range";
+	int i, ret;
+	int open_fds[100];
+	int fd_max, fd_mid, fd_min;
+
+	ksft_set_plan(7);
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		if (fd < 0) {
+			if (errno == ENOENT)
+				ksft_exit_skip(
+					"%s test: skipping test since /dev/null does not exist\n",
+					test_name);
+
+			ksft_exit_fail_msg(
+				"%s test: %s - failed to open /dev/null\n",
+				strerror(errno), test_name);
+		}
+
+		open_fds[i] = fd;
+	}
+
+	fd_min = open_fds[0];
+	fd_max = open_fds[99];
+
+	ret = sys_close_range(fd_min, fd_max, 1);
+	if (!ret)
+		ksft_exit_fail_msg(
+			"%s test: managed to pass invalid flag value\n",
+			test_name);
+	ksft_test_result_pass("do not allow invalid flag values for close_range()\n");
+
+	fd_mid = open_fds[50];
+	ret = sys_close_range(fd_min, fd_mid, 0);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to close range of file descriptors from 4 to 50\n",
+			test_name);
+	ksft_test_result_pass("close_range() from %d to %d\n", fd_min, fd_mid);
+
+	for (i = 0; i <= 50; i++) {
+		ret = fcntl(open_fds[i], F_GETFL);
+		if (ret >= 0)
+			ksft_exit_fail_msg(
+				"%s test: Failed to close range of file descriptors from 4 to 50\n",
+				test_name);
+	}
+	ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_min, fd_mid);
+
+	/* create a couple of gaps */
+	close(57);
+	close(78);
+	close(81);
+	close(82);
+	close(84);
+	close(90);
+
+	fd_mid = open_fds[51];
+	/* Choose slightly lower limit and leave some fds for a later test */
+	fd_max = open_fds[92];
+	ret = sys_close_range(fd_mid, fd_max, 0);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to close range of file descriptors from 51 to 100\n",
+			test_name);
+	ksft_test_result_pass("close_range() from %d to %d\n", fd_mid, fd_max);
+
+	for (i = 51; i <= 92; i++) {
+		ret = fcntl(open_fds[i], F_GETFL);
+		if (ret >= 0)
+			ksft_exit_fail_msg(
+				"%s test: Failed to close range of file descriptors from 51 to 100\n",
+				test_name);
+	}
+	ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_mid, fd_max);
+
+	fd_mid = open_fds[93];
+	fd_max = open_fds[99];
+	/* test that the kernel caps and still closes all fds */
+	ret = sys_close_range(fd_mid, UINT_MAX, 0);
+	if (ret < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to close range of file descriptors from 51 to 100\n",
+			test_name);
+	ksft_test_result_pass("close_range() from %d to %d\n", fd_mid, fd_max);
+
+	for (i = 93; i < 100; i++) {
+		ret = fcntl(open_fds[i], F_GETFL);
+		if (ret >= 0)
+			ksft_exit_fail_msg(
+				"%s test: Failed to close range of file descriptors from 51 to 100\n",
+				test_name);
+	}
+	ksft_test_result_pass("fcntl() verify closed range from %d to %d\n", fd_mid, fd_max);
+
+	return ksft_exit_pass();
+}
-- 
2.21.0


^ permalink raw reply related

* [PATCH 1/2] open: add close_range()
From: Christian Brauner @ 2019-05-21 11:34 UTC (permalink / raw)
  To: viro, linux-kernel, linux-fsdevel, linux-api
  Cc: linux-ia64, linux-sh, ldv, dhowells, linux-kselftest, sparclinux,
	shuah, linux-arch, linux-s390, miklos, x86, Christian Brauner,
	linux-mips, linux-xtensa, tkjos, arnd, jannh, linux-m68k, tglx,
	linux-arm-kernel, fweimer, linux-parisc, torvalds, oleg,
	linux-alpha, linuxppc-dev

This adds the close_range() syscall. It allows to efficiently close a range
of file descriptors up to all file descriptors of a calling task.

The syscall came up in a recent discussion around the new mount API and
making new file descriptor types cloexec by default. During this
discussion, Al suggested the close_range() syscall (cf. [1]). Note, a
syscall in this manner has been requested by various people over time.

First, it helps to close all file descriptors of an exec()ing task. This
can be done safely via (quoting Al's example from [1] verbatim):

        /* that exec is sensitive */
        unshare(CLONE_FILES);
        /* we don't want anything past stderr here */
        close_range(3, ~0U);
        execve(....);

The code snippet above is one way of working around the problem that file
descriptors are not cloexec by default. This is aggravated by the fact that
we can't just switch them over without massively regressing userspace. For
a whole class of programs having an in-kernel method of closing all file
descriptors is very helpful (e.g. demons, service managers, programming
language standard libraries, container managers etc.).
(Please note, unshare(CLONE_FILES) should only be needed if the calling
 task is multi-threaded and shares the file descriptor table with another
 thread in which case two threads could race with one thread allocating
 file descriptors and the other one closing them via close_range(). For the
 general case close_range() before the execve() is sufficient.)

Second, it allows userspace to avoid implementing closing all file
descriptors by parsing through /proc/<pid>/fd/* and calling close() on each
file descriptor. From looking at various large(ish) userspace code bases
this or similar patterns are very common in:
- service managers (cf. [4])
- libcs (cf. [6])
- container runtimes (cf. [5])
- programming language runtimes/standard libraries
  - Python (cf. [2])
  - Rust (cf. [7], [8])
As Dmitry pointed out there's even a long-standing glibc bug about missing
kernel support for this task (cf. [3]).
In addition, the syscall will also work for tasks that do not have procfs
mounted and on kernels that do not have procfs support compiled in. In such
situations the only way to make sure that all file descriptors are closed
is to call close() on each file descriptor up to UINT_MAX or RLIMIT_NOFILE,
OPEN_MAX trickery (cf. comment [8] on Rust).

The performance is striking. For good measure, comparing the following
simple close_all_fds() userspace implementation that is essentially just
glibc's version in [6]:

static int close_all_fds(void)
{
        DIR *dir;
        struct dirent *direntp;

        dir = opendir("/proc/self/fd");
        if (!dir)
                return -1;

        while ((direntp = readdir(dir))) {
                int fd;
                if (strcmp(direntp->d_name, ".") == 0)
                        continue;
                if (strcmp(direntp->d_name, "..") == 0)
                        continue;
                fd = atoi(direntp->d_name);
                if (fd == 0 || fd == 1 || fd == 2)
                        continue;
                close(fd);
        }

        closedir(dir); /* cannot fail */
        return 0;
}

to close_range() yields:
1. closing 4 open files:
   - close_all_fds(): ~280 us
   - close_range():    ~24 us

2. closing 1000 open files:
   - close_all_fds(): ~5000 us
   - close_range():   ~800 us

close_range() is designed to allow for some flexibility. Specifically, it
does not simply always close all open file descriptors of a task. Instead,
callers can specify an upper bound.
This is e.g. useful for scenarios where specific file descriptors are
created with well-known numbers that are supposed to be excluded from
getting closed.
For extra paranoia close_range() comes with a flags argument. This can e.g.
be used to implement extension. Once can imagine userspace wanting to stop
at the first error instead of ignoring errors under certain circumstances.
There might be other valid ideas in the future. In any case, a flag
argument doesn't hurt and keeps us on the safe side.

From an implementation side this is kept rather dumb. It saw some input
from David and Jann but all nonsense is obviously my own!
- Errors to close file descriptors are currently ignored. (Could be changed
  by setting a flag in the future if needed.)
- __close_range() is a rather simplistic wrapper around __close_fd().
  My reasoning behind this is based on the nature of how __close_fd() needs
  to release an fd. But maybe I misunderstood specifics:
  We take the files_lock and rcu-dereference the fdtable of the calling
  task, we find the entry in the fdtable, get the file and need to release
  files_lock before calling filp_close().
  In the meantime the fdtable might have been altered so we can't just
  retake the spinlock and keep the old rcu-reference of the fdtable
  around. Instead we need to grab a fresh reference to the fdtable.
  If my reasoning is correct then there's really no point in fancyfying
  __close_range(): We just need to rcu-dereference the fdtable of the
  calling task once to cap the max_fd value correctly and then go on
  calling __close_fd() in a loop.

/* References */
[1]: https://lore.kernel.org/lkml/20190516165021.GD17978@ZenIV.linux.org.uk/
[2]: https://github.com/python/cpython/blob/9e4f2f3a6b8ee995c365e86d976937c141d867f8/Modules/_posixsubprocess.c#L220
[3]: https://sourceware.org/bugzilla/show_bug.cgi?id=10353#c7
[4]: https://github.com/systemd/systemd/blob/5238e9575906297608ff802a27e2ff9effa3b338/src/basic/fd-util.c#L217
[5]: https://github.com/lxc/lxc/blob/ddf4b77e11a4d08f09b7b9cd13e593f8c047edc5/src/lxc/start.c#L236
[6]: https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/grantpt.c;h=2030e07fa6e652aac32c775b8c6e005844c3c4eb;hb=HEAD#l17
     Note that this is an internal implementation that is not exported.
     Currently, libc seems to not provide an exported version of this
     because of missing kernel support to do this.
[7]: https://github.com/rust-lang/rust/issues/12148
[8]: https://github.com/rust-lang/rust/blob/5f47c0613ed4eb46fca3633c1297364c09e5e451/src/libstd/sys/unix/process2.rs#L303-L308
     Rust's solution is slightly different but is equally unperformant.
     Rust calls getdtablesize() which is a glibc library function that
     simply returns the current RLIMIT_NOFILE or OPEN_MAX values. Rust then
     goes on to call close() on each fd. That's obviously overkill for most
     tasks. Rarely, tasks - especially non-demons - hit RLIMIT_NOFILE or
     OPEN_MAX.
     Let's be nice and assume an unprivileged user with RLIMIT_NOFILE set
     to 1024. Even in this case, there's a very high chance that in the
     common case Rust is calling the close() syscall 1021 times pointlessly
     if the task just has 0, 1, and 2 open.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dmitry V. Levin <ldv@altlinux.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: linux-api@vger.kernel.org
---
 arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
 arch/arm/tools/syscall.tbl                  |  1 +
 arch/arm64/include/asm/unistd32.h           |  2 ++
 arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
 arch/s390/kernel/syscalls/syscall.tbl       |  1 +
 arch/sh/kernel/syscalls/syscall.tbl         |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
 fs/file.c                                   | 30 +++++++++++++++++++++
 fs/open.c                                   | 20 ++++++++++++++
 include/linux/fdtable.h                     |  2 ++
 include/linux/syscalls.h                    |  2 ++
 include/uapi/asm-generic/unistd.h           |  4 ++-
 22 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 9e7704e44f6d..b55d93af8096 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
 541	common	fsconfig			sys_fsconfig
 542	common	fsmount				sys_fsmount
 543	common	fspick				sys_fspick
+545	common	close_range			sys_close_range
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index aaf479a9e92d..0125c97c75dd 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -447,3 +447,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index c39e90600bb3..9a3270d29b42 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -886,6 +886,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_close_range 435
+__SYSCALL(__NR_close_range, sys_close_range)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index e01df3f2f80d..1a90b464e96f 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -354,3 +354,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 7e3d0734b2f3..2dee2050f9ef 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -433,3 +433,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 26339e417695..923ef69e5a76 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -439,3 +439,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 0e2dd68ade57..967ed9de51cd 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -372,3 +372,4 @@
 431	n32	fsconfig			sys_fsconfig
 432	n32	fsmount				sys_fsmount
 433	n32	fspick				sys_fspick
+435	n32	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 5eebfa0d155c..71de731102b1 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -348,3 +348,4 @@
 431	n64	fsconfig			sys_fsconfig
 432	n64	fsmount				sys_fsmount
 433	n64	fspick				sys_fspick
+435	n64	close_range			sys_close_range
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 3cc1374e02d0..5a325ab29f88 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -421,3 +421,4 @@
 431	o32	fsconfig			sys_fsconfig
 432	o32	fsmount				sys_fsmount
 433	o32	fspick				sys_fspick
+435	o32	close_range			sys_close_range
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index c9e377d59232..dcc0a0879139 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -430,3 +430,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 103655d84b4b..ba2c1f078cbd 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index e822b2964a83..d7c9043d2902 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
 431  common	fsconfig		sys_fsconfig			sys_fsconfig
 432  common	fsmount			sys_fsmount			sys_fsmount
 433  common	fspick			sys_fspick			sys_fspick
+435  common	close_range		sys_close_range			sys_close_range
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 016a727d4357..9b5e6bf0ce32 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -436,3 +436,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index e047480b1605..8c674a1e0072 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -479,3 +479,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ad968b7bac72..7f7a89a96707 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -438,3 +438,4 @@
 431	i386	fsconfig		sys_fsconfig			__ia32_sys_fsconfig
 432	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
 433	i386	fspick			sys_fspick			__ia32_sys_fspick
+435	i386	close_range		sys_close_range			__ia32_sys_close_range
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index b4e6f9e6204a..0f7d47ae921c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -355,6 +355,7 @@
 431	common	fsconfig		__x64_sys_fsconfig
 432	common	fsmount			__x64_sys_fsmount
 433	common	fspick			__x64_sys_fspick
+435	common	close_range		__x64_sys_close_range
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 5fa0ee1c8e00..b489532265d0 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -404,3 +404,4 @@
 431	common	fsconfig			sys_fsconfig
 432	common	fsmount				sys_fsmount
 433	common	fspick				sys_fspick
+435	common	close_range			sys_close_range
diff --git a/fs/file.c b/fs/file.c
index 3da91a112bab..3680977a685a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -641,6 +641,36 @@ int __close_fd(struct files_struct *files, unsigned fd)
 }
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
+/**
+ * __close_range() - Close all file descriptors in a given range.
+ *
+ * @fd:     starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ */
+int __close_range(struct files_struct *files, unsigned fd, unsigned max_fd)
+{
+	unsigned int cur_max;
+
+	if (fd > max_fd)
+		return -EINVAL;
+
+	rcu_read_lock();
+	cur_max = files_fdtable(files)->max_fds;
+	rcu_read_unlock();
+
+	/* cap to last valid index into fdtable */
+	if (max_fd >= cur_max)
+		max_fd = cur_max - 1;
+
+	while (fd <= max_fd)
+		__close_fd(files, fd++);
+
+	return 0;
+}
+
 /*
  * variant of __close_fd that gets a ref on the file for later fput
  */
diff --git a/fs/open.c b/fs/open.c
index 9c7d724a6f67..c7baaee7aa47 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1174,6 +1174,26 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 	return retval;
 }
 
+/**
+ * close_range() - Close all file descriptors in a given range.
+ *
+ * @fd:     starting file descriptor to close
+ * @max_fd: last file descriptor to close
+ * @flags:  reserved for future extensions
+ *
+ * This closes a range of file descriptors. All file descriptors
+ * from @fd up to and including @max_fd are closed.
+ * Currently, errors to close a given file descriptor are ignored.
+ */
+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
+		unsigned int, flags)
+{
+	if (flags)
+		return -EINVAL;
+
+	return __close_range(current->files, fd, max_fd);
+}
+
 /*
  * This routine simulates a hangup on the tty, to arrange that users
  * are given clean terminals at login time.
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f07c55ea0c22..fcd07181a365 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -121,6 +121,8 @@ extern void __fd_install(struct files_struct *files,
 		      unsigned int fd, struct file *file);
 extern int __close_fd(struct files_struct *files,
 		      unsigned int fd);
+extern int __close_range(struct files_struct *files, unsigned int fd,
+			 unsigned int max_fd);
 extern int __close_fd_get_file(unsigned int fd, struct file **res);
 
 extern struct kmem_cache *files_cachep;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..c0189e223255 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -441,6 +441,8 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
 asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
 			   umode_t mode);
 asmlinkage long sys_close(unsigned int fd);
+asmlinkage long sys_close_range(unsigned int fd, unsigned int max_fd,
+				unsigned int flags);
 asmlinkage long sys_vhangup(void);
 
 /* fs/pipe.c */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a87904daf103..3f36c8745d24 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_close_range 435
+__SYSCALL(__NR_close_range, sys_close_range)
 
 #undef __NR_syscalls
-#define __NR_syscalls 434
+#define __NR_syscalls 436
 
 /*
  * 32 bit systems traditionally used different
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v2] powerpc/mm/hash: Fix get_region_id() for invalid addresses
From: Michael Ellerman @ 2019-05-21 11:39 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: aneesh.kumar, npiggin
In-Reply-To: <20190517132958.22299-1-mpe@ellerman.id.au>

On Fri, 2019-05-17 at 13:29:58 UTC, Michael Ellerman wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
> 
> Accesses by userspace to random addresses outside the user or kernel
> address range will generate an SLB fault. When we handle that fault we
> classify the effective address into several classes, eg. user, kernel
> linear, kernel virtual etc.
> 
> For addresses that are completely outside of any valid range, we
> should not insert an SLB entry at all, and instead immediately an
> exception.
> 
> In the past this was handled in two ways. Firstly we would check the
> top nibble of the address (using REGION_ID(ea)) and that would tell us
> if the address was user (0), kernel linear (c), kernel virtual (d), or
> vmemmap (f). If the address didn't match any of these it was invalid.
> 
> Then for each type of address we would do a secondary check. For the
> user region we check against H_PGTABLE_RANGE, for kernel linear we
> would mask the top nibble of the address and then check the address
> against MAX_PHYSMEM_BITS.
> 
> As part of commit 0034d395f89d ("powerpc/mm/hash64: Map all the kernel
> regions in the same 0xc range") we replaced REGION_ID() with
> get_region_id() and changed the masking of the top nibble to only mask
> the top two bits, which introduced a bug.
> 
> Addresses less than (4 << 60) are still handled correctly, they are
> either less than (1 << 60) in which case they are subject to the
> H_PGTABLE_RANGE check, or they are correctly checked against
> MAX_PHYSMEM_BITS.
> 
> However addresses from (4 << 60) to ((0xc << 60) - 1), are incorrectly
> treated as kernel linear addresses in get_region_id(). Then the top
> two bits are cleared by EA_MASK in slb_allocate_kernel() and the
> address is checked against MAX_PHYSMEM_BITS, which it passes due to
> the masking. The end result is we incorrectly insert SLB entries for
> those addresses.
> 
> That is not actually catastrophic, having inserted the SLB entry we
> will then go on to take a page fault for the address and at that point
> we detect the problem and report it as a bad fault.
> 
> Still we should not be inserting those entries, or treating them as
> kernel linear addresses in the first place. So fix get_region_id() to
> detect addresses in that range and return an invalid region id, which
> we cause use to not insert an SLB entry and directly report an
> exception.
> 
> Fixes: 0034d395f89d ("powerpc/mm/hash64: Map all the kernel regions in the same 0xc range")
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> [mpe: Drop change to EA_MASK for now, rewrite change log]
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Applied to powerpc fixes.

https://git.kernel.org/powerpc/c/c179976cf4cbd2e65f29741d5bc07ccf

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/mm/hash: Improve address limit checks
From: Michael Ellerman @ 2019-05-21 11:38 UTC (permalink / raw)
  To: Michael Ellerman, Aneesh Kumar K.V, npiggin, paulus
  Cc: Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <455jHY5Y1zz9sBp@ozlabs.org>

Michael Ellerman <patch-notifications@ellerman.id.au> writes:
> On Thu, 2019-05-16 at 11:50:54 UTC, "Aneesh Kumar K.V" wrote:
>> Different parts of the code do the limit check by ignoring the top nibble
>> of EA. ie. we do checks like
>> 
>> 	if ((ea & EA_MASK)  >= H_PGTABLE_RANGE)
>> 	   	error
>> 
>> This patch makes sure we don't insert SLB entries for addresses whose top nibble
>> doesn't match the ignored bits.
>> 
>> With an address like 0x4000000008000000, we can result in wrong slb entries like
>> 
>> 13 4000000008000000 400ea1b217000510   1T ESID=   400000 VSID=   ea1b217000 LLP:110
>> 
>> without this patch we will map that EA with LINEAR_MAP_REGION_ID and further
>> those addr limit check will return false.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>
> Applied to powerpc fixes, thanks.
>
> https://git.kernel.org/powerpc/c/c179976cf4cbd2e65f29741d5bc07ccf

Actually this patch was superseeded. This should have been a reply to:

  https://lore.kernel.org/linuxppc-dev/20190517132958.22299-1-mpe@ellerman.id.au/

cheers

^ permalink raw reply

* Re: [PATCH] mm/nvdimm: Use correct #defines instead of opencoding
From: Aneesh Kumar K.V @ 2019-05-21  9:50 UTC (permalink / raw)
  To: Dan Williams; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <CAPcyv4jcSgg0wxY9FAM4ke9JzVc9Pu3qe6dviS3seNgHfG2oNw@mail.gmail.com>

Dan Williams <dan.j.williams@intel.com> writes:

> On Mon, May 13, 2019 at 9:46 PM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> On 5/14/19 9:42 AM, Dan Williams wrote:
>> > On Mon, May 13, 2019 at 9:05 PM Aneesh Kumar K.V
>> > <aneesh.kumar@linux.ibm.com> wrote:
>> >>
>> >> On 5/14/19 9:28 AM, Dan Williams wrote:
>> >>> On Mon, May 13, 2019 at 7:56 PM Aneesh Kumar K.V
>> >>> <aneesh.kumar@linux.ibm.com> wrote:
>> >>>>
>> >>>> The nfpn related change is needed to fix the kernel message
>> >>>>
>> >>>> "number of pfns truncated from 2617344 to 163584"
>> >>>>
>> >>>> The change makes sure the nfpns stored in the superblock is right value.
>> >>>>
>> >>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> >>>> ---
>> >>>>    drivers/nvdimm/pfn_devs.c    | 6 +++---
>> >>>>    drivers/nvdimm/region_devs.c | 8 ++++----
>> >>>>    2 files changed, 7 insertions(+), 7 deletions(-)
>> >>>>
>> >>>> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
>> >>>> index 347cab166376..6751ff0296ef 100644
>> >>>> --- a/drivers/nvdimm/pfn_devs.c
>> >>>> +++ b/drivers/nvdimm/pfn_devs.c
>> >>>> @@ -777,8 +777,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>> >>>>                    * when populating the vmemmap. This *should* be equal to
>> >>>>                    * PMD_SIZE for most architectures.
>> >>>>                    */
>> >>>> -               offset = ALIGN(start + reserve + 64 * npfns,
>> >>>> -                               max(nd_pfn->align, PMD_SIZE)) - start;
>> >>>> +               offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
>> >>>> +                              max(nd_pfn->align, PMD_SIZE)) - start;
>> >>>
>> >>> No, I think we need to record the page-size into the superblock format
>> >>> otherwise this breaks in debug builds where the struct-page size is
>> >>> extended.
>> >>>
>> >>>>           } else if (nd_pfn->mode == PFN_MODE_RAM)
>> >>>>                   offset = ALIGN(start + reserve, nd_pfn->align) - start;
>> >>>>           else
>> >>>> @@ -790,7 +790,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>> >>>>                   return -ENXIO;
>> >>>>           }
>> >>>>
>> >>>> -       npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
>> >>>> +       npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
>> >>>
>> >>> Similar comment, if the page size is variable then the superblock
>> >>> needs to explicitly account for it.
>> >>>
>> >>
>> >> PAGE_SIZE is not really variable. What we can run into is the issue you
>> >> mentioned above. The size of struct page can change which means the
>> >> reserved space for keeping vmemmap in device may not be sufficient for
>> >> certain kernel builds.
>> >>
>> >> I was planning to add another patch that fails namespace init if we
>> >> don't have enough space to keep the struct page.
>> >>
>> >> Why do you suggest we need to have PAGE_SIZE as part of pfn superblock?
>> >
>> > So that the kernel has a chance to identify cases where the superblock
>> > it is handling was created on a system with different PAGE_SIZE
>> > assumptions.
>> >
>>
>> The reason to do that is we don't have enough space to keep struct page
>> backing the total number of pfns? If so, what i suggested above should
>> handle that.
>>
>> or are you finding any other reason why we should fail a namespace init
>> with a different PAGE_SIZE value?
>
> I want the kernel to be able to start understand cross-architecture
> and cross-configuration geometries. Which to me means incrementing the
> info-block version and recording PAGE_SIZE and sizeof(struct page) in
> the info-block directly.
>
>> My another patch handle the details w.r.t devdax alignment for which
>> devdax got created with PAGE_SIZE 4K but we are now trying to load that
>> in a kernel with PAGE_SIZE 64k.
>
> Sure, but what about the reverse? These info-block format assumptions
> are as fundamental as the byte-order of the info-block, it needs to be
> cross-arch compatible and the x86 assumptions need to be fully lifted.

Something like the below (Not tested). I am not sure what we will init the page_size
for minor version < 3. This will mark the namespace disabled if the
PAGE_SIZE and sizeof(struct page) doesn't match with the values used
during namespace create. 

diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index dde9853453d3..d6e0933d0dd4 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -36,6 +36,9 @@ struct nd_pfn_sb {
 	__le32 end_trunc;
 	/* minor-version-2 record the base alignment of the mapping */
 	__le32 align;
+	/* minor-version-3 record the page size and struct page size */
+	__le32 page_size;
+	__le32 page_struct_size;
 	u8 padding[4000];
 	__le64 checksum;
 };
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 6f9f78858018..bbc1d792d7f3 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -477,6 +477,15 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 	if (__le16_to_cpu(pfn_sb->version_minor) < 2)
 		pfn_sb->align = 0;
 
+	if (__le16_to_cpu(pfn_sb->version_minor) < 3) {
+		/*
+		 * For a large part we use PAGE_SIZE. But we
+		 * do have some accounting code using SIZE_4K.
+		 */
+		pfn_sb->page_size = cpu_to_le32(PAGE_SIZE);
+		pfn_sb->page_struct_size = cpu_to_le32(64);
+	}
+
 	switch (le32_to_cpu(pfn_sb->mode)) {
 	case PFN_MODE_RAM:
 	case PFN_MODE_PMEM:
@@ -504,6 +513,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 		return -EOPNOTSUPP;
 	}
 
+	if (le32_to_cpu(pfn_sb->page_size) != PAGE_SIZE)
+		return -EOPNOTSUPP;
+
+	if (le32_to_cpu(pfn_sb->page_struct_size) != sizeof(struct page))
+		return -EOPNOTSUPP;
+
 	if (!nd_pfn->uuid) {
 		/*
 		 * When probing a namepace via nd_pfn_probe() the uuid
@@ -798,7 +813,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
 	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
 	pfn_sb->version_major = cpu_to_le16(1);
-	pfn_sb->version_minor = cpu_to_le16(2);
+	pfn_sb->version_minor = cpu_to_le16(3);
 	pfn_sb->start_pad = cpu_to_le32(start_pad);
 	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
 	pfn_sb->align = cpu_to_le32(nd_pfn->align);


^ permalink raw reply related

* [Bug 203517] WARNING: inconsistent lock state. inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
From: bugzilla-daemon @ 2019-05-21  8:58 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-203517-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=203517

David Sterba (dsterba@suse.com) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |RESOLVED
                 CC|                            |dsterba@suse.com
         Resolution|---                         |CODE_FIX

--- Comment #8 from David Sterba (dsterba@suse.com) ---
https://patchwork.kernel.org/patch/10948813/ fixes the problem and is scheduled
for 5.2 and for stable.

Thanks for the bisecting efforts!

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* [PATCH][next] soc: fsl: fix spelling mistake "Firmaware" -> "Firmware"
From: Colin King @ 2019-05-21  8:56 UTC (permalink / raw)
  To: Li Yang, linuxppc-dev, linux-arm-kernel; +Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

There is a spelling mistake in a pr_err message. Fix it.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/soc/fsl/dpaa2-console.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/dpaa2-console.c b/drivers/soc/fsl/dpaa2-console.c
index 9168d8ddc932..27243f706f37 100644
--- a/drivers/soc/fsl/dpaa2-console.c
+++ b/drivers/soc/fsl/dpaa2-console.c
@@ -73,7 +73,7 @@ static u64 get_mc_fw_base_address(void)
 
 	mcfbaregs = ioremap(mc_base_addr.start, resource_size(&mc_base_addr));
 	if (!mcfbaregs) {
-		pr_err("could not map MC Firmaware Base registers\n");
+		pr_err("could not map MC Firmware Base registers\n");
 		return 0;
 	}
 
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH] powerpc/mm: mark more tlb functions as __always_inline
From: Joe Perches @ 2019-05-21  7:57 UTC (permalink / raw)
  To: Masahiro Yamada, Christophe Leroy
  Cc: linuxppc-dev, Linux Kernel Mailing List, Paul Mackerras
In-Reply-To: <CAK7LNAQNp+wsvNK84oYcGwR24=Kf=_N8WJdyZ2aUL9T3qDsVsA@mail.gmail.com>

On Tue, 2019-05-21 at 16:27 +0900, Masahiro Yamada wrote:
> On Tue, May 21, 2019 at 3:54 PM Christophe Leroy
> > powerpc accepts lines up to 90 chars, see arch/powerpc/tools/checkpatch.pl
> 
> Ugh, I did not know this. Horrible.
> 
> The Linux coding style should be global in the kernel tree.
> No subsystem should adopts its own coding style.

I don't see a problem using 90 column lines by arch/<foo>

There are other subsystem specific variations like the net/

	/* multiline comments without initial blank comment lines
	 * look like this...
	 */

If there were arch specific drivers with style variations
in say drivers/net, then that might be more of an issue.


^ permalink raw reply

* Re: [PATCH] mm/nvdimm: Use correct #defines instead of opencoding
From: Dan Williams @ 2019-05-21  7:47 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Linux MM, linuxppc-dev, linux-nvdimm
In-Reply-To: <02d1d14d-650b-da38-0828-1af330f594d5@linux.ibm.com>

On Mon, May 13, 2019 at 9:46 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 5/14/19 9:42 AM, Dan Williams wrote:
> > On Mon, May 13, 2019 at 9:05 PM Aneesh Kumar K.V
> > <aneesh.kumar@linux.ibm.com> wrote:
> >>
> >> On 5/14/19 9:28 AM, Dan Williams wrote:
> >>> On Mon, May 13, 2019 at 7:56 PM Aneesh Kumar K.V
> >>> <aneesh.kumar@linux.ibm.com> wrote:
> >>>>
> >>>> The nfpn related change is needed to fix the kernel message
> >>>>
> >>>> "number of pfns truncated from 2617344 to 163584"
> >>>>
> >>>> The change makes sure the nfpns stored in the superblock is right value.
> >>>>
> >>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >>>> ---
> >>>>    drivers/nvdimm/pfn_devs.c    | 6 +++---
> >>>>    drivers/nvdimm/region_devs.c | 8 ++++----
> >>>>    2 files changed, 7 insertions(+), 7 deletions(-)
> >>>>
> >>>> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> >>>> index 347cab166376..6751ff0296ef 100644
> >>>> --- a/drivers/nvdimm/pfn_devs.c
> >>>> +++ b/drivers/nvdimm/pfn_devs.c
> >>>> @@ -777,8 +777,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
> >>>>                    * when populating the vmemmap. This *should* be equal to
> >>>>                    * PMD_SIZE for most architectures.
> >>>>                    */
> >>>> -               offset = ALIGN(start + reserve + 64 * npfns,
> >>>> -                               max(nd_pfn->align, PMD_SIZE)) - start;
> >>>> +               offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
> >>>> +                              max(nd_pfn->align, PMD_SIZE)) - start;
> >>>
> >>> No, I think we need to record the page-size into the superblock format
> >>> otherwise this breaks in debug builds where the struct-page size is
> >>> extended.
> >>>
> >>>>           } else if (nd_pfn->mode == PFN_MODE_RAM)
> >>>>                   offset = ALIGN(start + reserve, nd_pfn->align) - start;
> >>>>           else
> >>>> @@ -790,7 +790,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
> >>>>                   return -ENXIO;
> >>>>           }
> >>>>
> >>>> -       npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
> >>>> +       npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
> >>>
> >>> Similar comment, if the page size is variable then the superblock
> >>> needs to explicitly account for it.
> >>>
> >>
> >> PAGE_SIZE is not really variable. What we can run into is the issue you
> >> mentioned above. The size of struct page can change which means the
> >> reserved space for keeping vmemmap in device may not be sufficient for
> >> certain kernel builds.
> >>
> >> I was planning to add another patch that fails namespace init if we
> >> don't have enough space to keep the struct page.
> >>
> >> Why do you suggest we need to have PAGE_SIZE as part of pfn superblock?
> >
> > So that the kernel has a chance to identify cases where the superblock
> > it is handling was created on a system with different PAGE_SIZE
> > assumptions.
> >
>
> The reason to do that is we don't have enough space to keep struct page
> backing the total number of pfns? If so, what i suggested above should
> handle that.
>
> or are you finding any other reason why we should fail a namespace init
> with a different PAGE_SIZE value?

I want the kernel to be able to start understand cross-architecture
and cross-configuration geometries. Which to me means incrementing the
info-block version and recording PAGE_SIZE and sizeof(struct page) in
the info-block directly.

> My another patch handle the details w.r.t devdax alignment for which
> devdax got created with PAGE_SIZE 4K but we are now trying to load that
> in a kernel with PAGE_SIZE 64k.

Sure, but what about the reverse? These info-block format assumptions
are as fundamental as the byte-order of the info-block, it needs to be
cross-arch compatible and the x86 assumptions need to be fully lifted.

^ permalink raw reply

* Re: [PATCH] powerpc/mm: mark more tlb functions as __always_inline
From: Michael Ellerman @ 2019-05-21  7:42 UTC (permalink / raw)
  To: Masahiro Yamada, Christophe Leroy
  Cc: Paul Mackerras, linuxppc-dev, Linux Kernel Mailing List
In-Reply-To: <CAK7LNAQNp+wsvNK84oYcGwR24=Kf=_N8WJdyZ2aUL9T3qDsVsA@mail.gmail.com>

Masahiro Yamada <yamada.masahiro@socionext.com> writes:
> On Tue, May 21, 2019 at 3:54 PM Christophe Leroy
> <christophe.leroy@c-s.fr> wrote:
>> Le 21/05/2019 à 08:16, Masahiro Yamada a écrit :
>> > With CONFIG_OPTIMIZE_INLINING enabled, Laura Abbott reported error
>> > with gcc 9.1.1:
>> >
>> >    arch/powerpc/mm/book3s64/radix_tlb.c: In function '_tlbiel_pid':
>> >    arch/powerpc/mm/book3s64/radix_tlb.c:104:2: warning: asm operand 3 probably doesn't match constraints
>> >      104 |  asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
>> >          |  ^~~
>> >    arch/powerpc/mm/book3s64/radix_tlb.c:104:2: error: impossible constraint in 'asm'
>> >
>> > Fixing _tlbiel_pid() is enough to address the warning above, but I
>> > inlined more functions to fix all potential issues.
>> >
>> > To meet the 'i' (immediate) constraint for the asm operands, functions
>> > propagating propagated 'ric' must be always inlined.
>> >
>> > Fixes: 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING")
>> > Reported-by: Laura Abbott <labbott@redhat.com>
>> > Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
>> > ---
>> >
>> >   arch/powerpc/mm/book3s64/hash_native.c |  8 +++--
>> >   arch/powerpc/mm/book3s64/radix_tlb.c   | 44 +++++++++++++++-----------
>> >   2 files changed, 30 insertions(+), 22 deletions(-)
>> >
>> > diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
>> > index aaa28fd918fe..bc2c35c0d2b1 100644
>> > --- a/arch/powerpc/mm/book3s64/hash_native.c
>> > +++ b/arch/powerpc/mm/book3s64/hash_native.c
>> > @@ -60,9 +60,11 @@ static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
>> >    * tlbiel instruction for hash, set invalidation
>> >    * i.e., r=1 and is=01 or is=10 or is=11
>> >    */
>> > -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
>> > -                                     unsigned int pid,
>> > -                                     unsigned int ric, unsigned int prs)
>> > +static __always_inline void tlbiel_hash_set_isa300(unsigned int set,
>> > +                                                unsigned int is,
>> > +                                                unsigned int pid,
>> > +                                                unsigned int ric,
>> > +                                                unsigned int prs)
>>
>> Please don't split the line more than it is.
>>
>> powerpc accepts lines up to 90 chars, see arch/powerpc/tools/checkpatch.pl
>
> Ugh, I did not know this. Horrible.
>
> The Linux coding style should be global in the kernel tree.
> No subsystem should adopts its own coding style.

Well that ship sailed long ago.

But we don't have our own coding style, we just don't enforce 80 columns
rigidly, there are cases where a slightly longer line (up to ~90) is
preferable to a split line.

In a case like this with a long attribute and function name I think this
is probably the least worst option:

static __always_inline
void tlbiel_hash_set_isa300(unsigned int set, unsigned int is, unsigned int pid,
			    unsigned int ric, unsigned int prs)
{
	...

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/mm: mark more tlb functions as __always_inline
From: Masahiro Yamada @ 2019-05-21  7:27 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linuxppc-dev, Linux Kernel Mailing List, Paul Mackerras
In-Reply-To: <16d967dd-9f8f-4e9e-97fd-3f9761e5d97c@c-s.fr>

On Tue, May 21, 2019 at 3:54 PM Christophe Leroy
<christophe.leroy@c-s.fr> wrote:
>
>
>
> Le 21/05/2019 à 08:16, Masahiro Yamada a écrit :
> > With CONFIG_OPTIMIZE_INLINING enabled, Laura Abbott reported error
> > with gcc 9.1.1:
> >
> >    arch/powerpc/mm/book3s64/radix_tlb.c: In function '_tlbiel_pid':
> >    arch/powerpc/mm/book3s64/radix_tlb.c:104:2: warning: asm operand 3 probably doesn't match constraints
> >      104 |  asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
> >          |  ^~~
> >    arch/powerpc/mm/book3s64/radix_tlb.c:104:2: error: impossible constraint in 'asm'
> >
> > Fixing _tlbiel_pid() is enough to address the warning above, but I
> > inlined more functions to fix all potential issues.
> >
> > To meet the 'i' (immediate) constraint for the asm operands, functions
> > propagating propagated 'ric' must be always inlined.
> >
> > Fixes: 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING")
> > Reported-by: Laura Abbott <labbott@redhat.com>
> > Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
> > ---
> >
> >   arch/powerpc/mm/book3s64/hash_native.c |  8 +++--
> >   arch/powerpc/mm/book3s64/radix_tlb.c   | 44 +++++++++++++++-----------
> >   2 files changed, 30 insertions(+), 22 deletions(-)
> >
> > diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
> > index aaa28fd918fe..bc2c35c0d2b1 100644
> > --- a/arch/powerpc/mm/book3s64/hash_native.c
> > +++ b/arch/powerpc/mm/book3s64/hash_native.c
> > @@ -60,9 +60,11 @@ static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
> >    * tlbiel instruction for hash, set invalidation
> >    * i.e., r=1 and is=01 or is=10 or is=11
> >    */
> > -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
> > -                                     unsigned int pid,
> > -                                     unsigned int ric, unsigned int prs)
> > +static __always_inline void tlbiel_hash_set_isa300(unsigned int set,
> > +                                                unsigned int is,
> > +                                                unsigned int pid,
> > +                                                unsigned int ric,
> > +                                                unsigned int prs)
>
> Please don't split the line more than it is.
>
> powerpc accepts lines up to 90 chars, see arch/powerpc/tools/checkpatch.pl

Ugh, I did not know this. Horrible.

The Linux coding style should be global in the kernel tree.
No subsystem should adopts its own coding style.


-- 
Best Regards
Masahiro Yamada

^ permalink raw reply

* Re: [PATCH] powerpc/mm: mark more tlb functions as __always_inline
From: Joe Perches @ 2019-05-21  7:24 UTC (permalink / raw)
  To: Christophe Leroy, Masahiro Yamada, linuxppc-dev, Michael Ellerman
  Cc: Paul Mackerras, linux-kernel
In-Reply-To: <16d967dd-9f8f-4e9e-97fd-3f9761e5d97c@c-s.fr>

On Tue, 2019-05-21 at 08:53 +0200, Christophe Leroy wrote:
> powerpc accepts lines up to 90 chars, see arch/powerpc/tools/checkpatch.pl

arch/powerpc/tools/checkpatch.sh



^ permalink raw reply

* Re: [PATCH] powerpc/mm: mark more tlb functions as __always_inline
From: Christophe Leroy @ 2019-05-21  6:53 UTC (permalink / raw)
  To: Masahiro Yamada, linuxppc-dev, Michael Ellerman
  Cc: Paul Mackerras, linux-kernel
In-Reply-To: <20190521061659.6073-1-yamada.masahiro@socionext.com>



Le 21/05/2019 à 08:16, Masahiro Yamada a écrit :
> With CONFIG_OPTIMIZE_INLINING enabled, Laura Abbott reported error
> with gcc 9.1.1:
> 
>    arch/powerpc/mm/book3s64/radix_tlb.c: In function '_tlbiel_pid':
>    arch/powerpc/mm/book3s64/radix_tlb.c:104:2: warning: asm operand 3 probably doesn't match constraints
>      104 |  asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
>          |  ^~~
>    arch/powerpc/mm/book3s64/radix_tlb.c:104:2: error: impossible constraint in 'asm'
> 
> Fixing _tlbiel_pid() is enough to address the warning above, but I
> inlined more functions to fix all potential issues.
> 
> To meet the 'i' (immediate) constraint for the asm operands, functions
> propagating propagated 'ric' must be always inlined.
> 
> Fixes: 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING")
> Reported-by: Laura Abbott <labbott@redhat.com>
> Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
> ---
> 
>   arch/powerpc/mm/book3s64/hash_native.c |  8 +++--
>   arch/powerpc/mm/book3s64/radix_tlb.c   | 44 +++++++++++++++-----------
>   2 files changed, 30 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
> index aaa28fd918fe..bc2c35c0d2b1 100644
> --- a/arch/powerpc/mm/book3s64/hash_native.c
> +++ b/arch/powerpc/mm/book3s64/hash_native.c
> @@ -60,9 +60,11 @@ static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
>    * tlbiel instruction for hash, set invalidation
>    * i.e., r=1 and is=01 or is=10 or is=11
>    */
> -static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
> -					unsigned int pid,
> -					unsigned int ric, unsigned int prs)
> +static __always_inline void tlbiel_hash_set_isa300(unsigned int set,
> +						   unsigned int is,
> +						   unsigned int pid,
> +						   unsigned int ric,
> +						   unsigned int prs)

Please don't split the line more than it is.

powerpc accepts lines up to 90 chars, see arch/powerpc/tools/checkpatch.pl

>   {
>   	unsigned long rb;
>   	unsigned long rs;
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index 4d841369399f..91c4242c1be3 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -29,9 +29,11 @@
>    * tlbiel instruction for radix, set invalidation
>    * i.e., r=1 and is=01 or is=10 or is=11
>    */
> -static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
> -					unsigned int pid,
> -					unsigned int ric, unsigned int prs)
> +static __always_inline void tlbiel_radix_set_isa300(unsigned int set,
> +						    unsigned int is,
> +						    unsigned int pid,
> +						    unsigned int ric,
> +						    unsigned int prs)

Please don't split the line more than it is.

>   {
>   	unsigned long rb;
>   	unsigned long rs;
> @@ -150,8 +152,8 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
>   	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
>   }
>   
> -static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
> -				unsigned long ric)
> +static __always_inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
> +						unsigned long ric)
>   {
>   	unsigned long rb,rs,prs,r;
>   
> @@ -167,8 +169,8 @@ static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
>   }
>   
>   
> -static inline void __tlbiel_va(unsigned long va, unsigned long pid,
> -			       unsigned long ap, unsigned long ric)
> +static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid,
> +					unsigned long ap, unsigned long ric)
>   {
>   	unsigned long rb,rs,prs,r;
>   
> @@ -183,8 +185,8 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
>   	trace_tlbie(0, 1, rb, rs, ric, prs, r);
>   }
>   
> -static inline void __tlbie_va(unsigned long va, unsigned long pid,
> -			      unsigned long ap, unsigned long ric)
> +static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
> +				       unsigned long ap, unsigned long ric)
>   {
>   	unsigned long rb,rs,prs,r;
>   
> @@ -199,8 +201,10 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
>   	trace_tlbie(0, 0, rb, rs, ric, prs, r);
>   }
>   
> -static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
> -			      unsigned long ap, unsigned long ric)
> +static __always_inline void __tlbie_lpid_va(unsigned long va,
> +					    unsigned long lpid,
> +					    unsigned long ap,
> +					    unsigned long ric)

Please don't split the line more than it is.


>   {
>   	unsigned long rb,rs,prs,r;
>   
> @@ -239,7 +243,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid)
>   /*
>    * We use 128 set in radix mode and 256 set in hpt mode.
>    */
> -static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
> +static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
>   {
>   	int set;
>   
> @@ -341,7 +345,8 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
>   	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>   }
>   
> -static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
> +static __always_inline void _tlbiel_lpid_guest(unsigned long lpid,
> +					       unsigned long ric)

Please don't split the line more than it is.

>   {
>   	int set;
>   
> @@ -381,8 +386,8 @@ static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
>   		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
>   }
>   
> -static inline void _tlbiel_va(unsigned long va, unsigned long pid,
> -			      unsigned long psize, unsigned long ric)
> +static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid,
> +				       unsigned long psize, unsigned long ric)
>   {
>   	unsigned long ap = mmu_get_ap(psize);
>   
> @@ -413,8 +418,8 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
>   		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
>   }
>   
> -static inline void _tlbie_va(unsigned long va, unsigned long pid,
> -			      unsigned long psize, unsigned long ric)
> +static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
> +				      unsigned long psize, unsigned long ric)
>   {
>   	unsigned long ap = mmu_get_ap(psize);
>   
> @@ -424,8 +429,9 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
>   	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>   }
>   
> -static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
> -			      unsigned long psize, unsigned long ric)
> +static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
> +					   unsigned long psize,
> +					   unsigned long ric)

Please don't split the line more than it is.

>   {
>   	unsigned long ap = mmu_get_ap(psize);
>   
> 

Christophe

^ permalink raw reply

* [PATCH] powerpc/mm: mark more tlb functions as __always_inline
From: Masahiro Yamada @ 2019-05-21  6:16 UTC (permalink / raw)
  To: linuxppc-dev, Michael Ellerman
  Cc: Masahiro Yamada, Paul Mackerras, linux-kernel

With CONFIG_OPTIMIZE_INLINING enabled, Laura Abbott reported error
with gcc 9.1.1:

  arch/powerpc/mm/book3s64/radix_tlb.c: In function '_tlbiel_pid':
  arch/powerpc/mm/book3s64/radix_tlb.c:104:2: warning: asm operand 3 probably doesn't match constraints
    104 |  asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
        |  ^~~
  arch/powerpc/mm/book3s64/radix_tlb.c:104:2: error: impossible constraint in 'asm'

Fixing _tlbiel_pid() is enough to address the warning above, but I
inlined more functions to fix all potential issues.

To meet the 'i' (immediate) constraint for the asm operands, functions
propagating propagated 'ric' must be always inlined.

Fixes: 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING")
Reported-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---

 arch/powerpc/mm/book3s64/hash_native.c |  8 +++--
 arch/powerpc/mm/book3s64/radix_tlb.c   | 44 +++++++++++++++-----------
 2 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index aaa28fd918fe..bc2c35c0d2b1 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -60,9 +60,11 @@ static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
  * tlbiel instruction for hash, set invalidation
  * i.e., r=1 and is=01 or is=10 or is=11
  */
-static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
-					unsigned int pid,
-					unsigned int ric, unsigned int prs)
+static __always_inline void tlbiel_hash_set_isa300(unsigned int set,
+						   unsigned int is,
+						   unsigned int pid,
+						   unsigned int ric,
+						   unsigned int prs)
 {
 	unsigned long rb;
 	unsigned long rs;
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 4d841369399f..91c4242c1be3 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -29,9 +29,11 @@
  * tlbiel instruction for radix, set invalidation
  * i.e., r=1 and is=01 or is=10 or is=11
  */
-static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
-					unsigned int pid,
-					unsigned int ric, unsigned int prs)
+static __always_inline void tlbiel_radix_set_isa300(unsigned int set,
+						    unsigned int is,
+						    unsigned int pid,
+						    unsigned int ric,
+						    unsigned int prs)
 {
 	unsigned long rb;
 	unsigned long rs;
@@ -150,8 +152,8 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
 	trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
 }
 
-static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
-				unsigned long ric)
+static __always_inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
+						unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
@@ -167,8 +169,8 @@ static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
 }
 
 
-static inline void __tlbiel_va(unsigned long va, unsigned long pid,
-			       unsigned long ap, unsigned long ric)
+static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid,
+					unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
@@ -183,8 +185,8 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
-static inline void __tlbie_va(unsigned long va, unsigned long pid,
-			      unsigned long ap, unsigned long ric)
+static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
+				       unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
@@ -199,8 +201,10 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
-static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
-			      unsigned long ap, unsigned long ric)
+static __always_inline void __tlbie_lpid_va(unsigned long va,
+					    unsigned long lpid,
+					    unsigned long ap,
+					    unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
@@ -239,7 +243,7 @@ static inline void fixup_tlbie_lpid(unsigned long lpid)
 /*
  * We use 128 set in radix mode and 256 set in hpt mode.
  */
-static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 {
 	int set;
 
@@ -341,7 +345,8 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
-static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+static __always_inline void _tlbiel_lpid_guest(unsigned long lpid,
+					       unsigned long ric)
 {
 	int set;
 
@@ -381,8 +386,8 @@ static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
 		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
 }
 
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
-			      unsigned long psize, unsigned long ric)
+static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid,
+				       unsigned long psize, unsigned long ric)
 {
 	unsigned long ap = mmu_get_ap(psize);
 
@@ -413,8 +418,8 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
 		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
 }
 
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
-			      unsigned long psize, unsigned long ric)
+static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
+				      unsigned long psize, unsigned long ric)
 {
 	unsigned long ap = mmu_get_ap(psize);
 
@@ -424,8 +429,9 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
-static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
-			      unsigned long psize, unsigned long ric)
+static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+					   unsigned long psize,
+					   unsigned long ric)
 {
 	unsigned long ap = mmu_get_ap(psize);
 
-- 
2.17.1


^ permalink raw reply related

* [Bug 203647] Locking API testsuite fails "mixed read-lock/lock-write ABBA" rlock on kernels >=4.14.x
From: bugzilla-daemon @ 2019-05-21  5:35 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-203647-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=203647

Michael Ellerman (michael@ellerman.id.au) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|NEW                         |RESOLVED
                 CC|                            |michael@ellerman.id.au
         Resolution|---                         |DOCUMENTED

--- Comment #6 from Michael Ellerman (michael@ellerman.id.au) ---
This appears to be working as expected, which I admit is a little confusing.

The key thing is that at the end you see, eg:

  [    0.179788] Good, all 261 testcases passed! |

See the commit that added the test:

https://git.kernel.org/torvalds/c/e91498589746

  locking/lockdep/selftests: Add mixed read-write ABBA tests

  Currently lockdep has limited support for recursive readers, add a few
  mixed read-write ABBA selftests to show the extend of these
  limitations.

And in the code:

        print_testname("mixed read-lock/lock-write ABBA");
        pr_cont("             |");
        dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK);
#ifdef CONFIG_PROVE_LOCKING
        /*
         * Lockdep does indeed fail here, but there's nothing we can do about
         * that now.  Don't kill lockdep for it.
         */
        unexpected_testcase_failures--;
#endif

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [RFC PATCH v2 07/10] KVM: PPC: Ultravisor: Restrict LDBAR access
From: Madhavan Srinivasan @ 2019-05-21  5:24 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20190518142524.28528-8-cclaudio@linux.ibm.com>


On 18/05/19 7:55 PM, Claudio Carvalho wrote:
> From: Ram Pai <linuxram@us.ibm.com> When the ultravisor firmware is 
> available, it takes control over the LDBAR register. In this case, 
> thread-imc updates and save/restore operations on the LDBAR register 
> are handled by ultravisor.
we should remove the core and thread imc nodes in the skiboot
if the ultravisor is enabled. We dont need imc-pmu.c file changes, imc-pmu.c
inits only if we have the corresponding nodes. I will post a skiboot patch.

Maddy

> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> [Restrict LDBAR access in assembly code and some in C, update the commit
>   message]
> Signed-off-by: Claudio Carvalho <cclaudio@linux.ibm.com>
> ---
>   arch/powerpc/kvm/book3s_hv.c                 |  4 +-
>   arch/powerpc/kvm/book3s_hv_rmhandlers.S      |  2 +
>   arch/powerpc/perf/imc-pmu.c                  | 64 ++++++++++++--------
>   arch/powerpc/platforms/powernv/idle.c        |  6 +-
>   arch/powerpc/platforms/powernv/subcore-asm.S |  4 ++
>   5 files changed, 52 insertions(+), 28 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 0fab0a201027..81f35f955d16 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -75,6 +75,7 @@
>   #include <asm/xics.h>
>   #include <asm/xive.h>
>   #include <asm/hw_breakpoint.h>
> +#include <asm/firmware.h>
>
>   #include "book3s.h"
>
> @@ -3117,7 +3118,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>   			subcore_size = MAX_SMT_THREADS / split;
>   			split_info.rpr = mfspr(SPRN_RPR);
>   			split_info.pmmar = mfspr(SPRN_PMMAR);
> -			split_info.ldbar = mfspr(SPRN_LDBAR);
> +			if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
> +				split_info.ldbar = mfspr(SPRN_LDBAR);
>   			split_info.subcore_size = subcore_size;
>   		} else {
>   			split_info.subcore_size = 1;
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index dd014308f065..938cfa5dceed 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -375,8 +375,10 @@ BEGIN_FTR_SECTION
>   	mtspr	SPRN_RPR, r0
>   	ld	r0, KVM_SPLIT_PMMAR(r6)
>   	mtspr	SPRN_PMMAR, r0
> +BEGIN_FW_FTR_SECTION_NESTED(70)
>   	ld	r0, KVM_SPLIT_LDBAR(r6)
>   	mtspr	SPRN_LDBAR, r0
> +END_FW_FTR_SECTION_NESTED(FW_FEATURE_ULTRAVISOR, 0, 70)
>   	isync
>   FTR_SECTION_ELSE
>   	/* On P9 we use the split_info for coordinating LPCR changes */
> diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
> index 31fa753e2eb2..39c84de74da9 100644
> --- a/arch/powerpc/perf/imc-pmu.c
> +++ b/arch/powerpc/perf/imc-pmu.c
> @@ -17,6 +17,7 @@
>   #include <asm/cputhreads.h>
>   #include <asm/smp.h>
>   #include <linux/string.h>
> +#include <asm/firmware.h>
>
>   /* Nest IMC data structures and variables */
>
> @@ -816,6 +817,17 @@ static int core_imc_event_init(struct perf_event *event)
>   	return 0;
>   }
>
> +static void thread_imc_ldbar_disable(void *dummy)
> +{
> +	/*
> +	 * By Zeroing LDBAR, we disable thread-imc updates. When the ultravisor
> +	 * firmware is available, it is responsible for handling thread-imc
> +	 * updates, though
> +	 */
> +	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
> +		mtspr(SPRN_LDBAR, 0);
> +}
> +
>   /*
>    * Allocates a page of memory for each of the online cpus, and load
>    * LDBAR with 0.
> @@ -856,7 +868,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size)
>   		per_cpu(thread_imc_mem, cpu_id) = local_mem;
>   	}
>
> -	mtspr(SPRN_LDBAR, 0);
> +	thread_imc_ldbar_disable(NULL);
>   	return 0;
>   }
>
> @@ -867,7 +879,7 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu)
>
>   static int ppc_thread_imc_cpu_offline(unsigned int cpu)
>   {
> -	mtspr(SPRN_LDBAR, 0);
> +	thread_imc_ldbar_disable(NULL);
>   	return 0;
>   }
>
> @@ -1010,7 +1022,6 @@ static int thread_imc_event_add(struct perf_event *event, int flags)
>   {
>   	int core_id;
>   	struct imc_pmu_ref *ref;
> -	u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id());
>
>   	if (flags & PERF_EF_START)
>   		imc_event_start(event, flags);
> @@ -1019,8 +1030,14 @@ static int thread_imc_event_add(struct perf_event *event, int flags)
>   		return -EINVAL;
>
>   	core_id = smp_processor_id() / threads_per_core;
> -	ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;
> -	mtspr(SPRN_LDBAR, ldbar_value);
> +	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
> +		u64 ldbar_value, *local_mem;
> +
> +		local_mem = per_cpu(thread_imc_mem, smp_processor_id());
> +		ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) |
> +				THREAD_IMC_ENABLE;
> +		mtspr(SPRN_LDBAR, ldbar_value);
> +	}
>
>   	/*
>   	 * imc pmus are enabled only when it is used.
> @@ -1053,7 +1070,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags)
>   	int core_id;
>   	struct imc_pmu_ref *ref;
>
> -	mtspr(SPRN_LDBAR, 0);
> +	thread_imc_ldbar_disable(NULL);
>
>   	core_id = smp_processor_id() / threads_per_core;
>   	ref = &core_imc_refc[core_id];
> @@ -1109,7 +1126,7 @@ static int trace_imc_mem_alloc(int cpu_id, int size)
>   	trace_imc_refc[core_id].id = core_id;
>   	mutex_init(&trace_imc_refc[core_id].lock);
>
> -	mtspr(SPRN_LDBAR, 0);
> +	thread_imc_ldbar_disable(NULL);
>   	return 0;
>   }
>
> @@ -1120,7 +1137,7 @@ static int ppc_trace_imc_cpu_online(unsigned int cpu)
>
>   static int ppc_trace_imc_cpu_offline(unsigned int cpu)
>   {
> -	mtspr(SPRN_LDBAR, 0);
> +	thread_imc_ldbar_disable(NULL);
>   	return 0;
>   }
>
> @@ -1207,11 +1224,6 @@ static int trace_imc_event_add(struct perf_event *event, int flags)
>   {
>   	int core_id = smp_processor_id() / threads_per_core;
>   	struct imc_pmu_ref *ref = NULL;
> -	u64 local_mem, ldbar_value;
> -
> -	/* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */
> -	local_mem = get_trace_imc_event_base_addr();
> -	ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE;
>
>   	if (core_imc_refc)
>   		ref = &core_imc_refc[core_id];
> @@ -1222,14 +1234,25 @@ static int trace_imc_event_add(struct perf_event *event, int flags)
>   		if (!ref)
>   			return -EINVAL;
>   	}
> -	mtspr(SPRN_LDBAR, ldbar_value);
> +	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
> +		u64 local_mem, ldbar_value;
> +
> +		/*
> +		 * Set trace-imc bit in ldbar and load ldbar with per-thread
> +		 * memory address
> +		 */
> +		local_mem = get_trace_imc_event_base_addr();
> +		ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) |
> +				TRACE_IMC_ENABLE;
> +		mtspr(SPRN_LDBAR, ldbar_value);
> +	}
>   	mutex_lock(&ref->lock);
>   	if (ref->refc == 0) {
>   		if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE,
>   				get_hard_smp_processor_id(smp_processor_id()))) {
>   			mutex_unlock(&ref->lock);
>   			pr_err("trace-imc: Unable to start the counters for core %d\n", core_id);
> -			mtspr(SPRN_LDBAR, 0);
> +			thread_imc_ldbar_disable(NULL);
>   			return -EINVAL;
>   		}
>   	}
> @@ -1270,7 +1293,7 @@ static void trace_imc_event_del(struct perf_event *event, int flags)
>   		if (!ref)
>   			return;
>   	}
> -	mtspr(SPRN_LDBAR, 0);
> +	thread_imc_ldbar_disable(NULL);
>   	mutex_lock(&ref->lock);
>   	ref->refc--;
>   	if (ref->refc == 0) {
> @@ -1413,15 +1436,6 @@ static void cleanup_all_core_imc_memory(void)
>   	kfree(core_imc_refc);
>   }
>
> -static void thread_imc_ldbar_disable(void *dummy)
> -{
> -	/*
> -	 * By Zeroing LDBAR, we disable thread-imc
> -	 * updates.
> -	 */
> -	mtspr(SPRN_LDBAR, 0);
> -}
> -
>   void thread_imc_disable(void)
>   {
>   	on_each_cpu(thread_imc_ldbar_disable, NULL, 1);
> diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
> index c9133f7908ca..fd62435e3267 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -679,7 +679,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
>   		sprs.ptcr	= mfspr(SPRN_PTCR);
>   		sprs.rpr	= mfspr(SPRN_RPR);
>   		sprs.tscr	= mfspr(SPRN_TSCR);
> -		sprs.ldbar	= mfspr(SPRN_LDBAR);
> +		if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
> +			sprs.ldbar	= mfspr(SPRN_LDBAR);
>
>   		sprs_saved = true;
>
> @@ -762,7 +763,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
>   	mtspr(SPRN_PTCR,	sprs.ptcr);
>   	mtspr(SPRN_RPR,		sprs.rpr);
>   	mtspr(SPRN_TSCR,	sprs.tscr);
> -	mtspr(SPRN_LDBAR,	sprs.ldbar);
> +	if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
> +		mtspr(SPRN_LDBAR,	sprs.ldbar);
>
>   	if (pls >= pnv_first_tb_loss_level) {
>   		/* TB loss */
> diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S
> index 39bb24aa8f34..e4383fa5e150 100644
> --- a/arch/powerpc/platforms/powernv/subcore-asm.S
> +++ b/arch/powerpc/platforms/powernv/subcore-asm.S
> @@ -44,7 +44,9 @@ _GLOBAL(split_core_secondary_loop)
>
>   real_mode:
>   	/* Grab values from unsplit SPRs */
> +BEGIN_FW_FTR_SECTION
>   	mfspr	r6,  SPRN_LDBAR
> +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ULTRAVISOR)
>   	mfspr	r7,  SPRN_PMMAR
>   	mfspr	r8,  SPRN_PMCR
>   	mfspr	r9,  SPRN_RPR
> @@ -77,7 +79,9 @@ real_mode:
>   	mtspr	SPRN_HDEC, r4
>
>   	/* Restore SPR values now we are split */
> +BEGIN_FW_FTR_SECTION
>   	mtspr	SPRN_LDBAR, r6
> +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ULTRAVISOR)
>   	mtspr	SPRN_PMMAR, r7
>   	mtspr	SPRN_PMCR, r8
>   	mtspr	SPRN_RPR, r9


^ permalink raw reply

* Re: [PATCH 11/12] powerpc/pseries/svm: Force SWIOTLB for secure guests
From: Christoph Hellwig @ 2019-05-21  5:15 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: Anshuman Khandual, Alexey Kardashevskiy, Mike Anderson, Ram Pai,
	linux-kernel, Claudio Carvalho, Paul Mackerras, linuxppc-dev,
	Christoph Hellwig, Anshuman Khandual
In-Reply-To: <20190521044912.1375-12-bauerman@linux.ibm.com>

> diff --git a/arch/powerpc/include/asm/mem_encrypt.h b/arch/powerpc/include/asm/mem_encrypt.h
> new file mode 100644
> index 000000000000..45d5e4d0e6e0
> --- /dev/null
> +++ b/arch/powerpc/include/asm/mem_encrypt.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +/*
> + * SVM helper functions
> + *
> + * Copyright 2019 IBM Corporation
> + */
> +
> +#ifndef _ASM_POWERPC_MEM_ENCRYPT_H
> +#define _ASM_POWERPC_MEM_ENCRYPT_H
> +
> +#define sme_me_mask	0ULL
> +
> +static inline bool sme_active(void) { return false; }
> +static inline bool sev_active(void) { return false; }
> +
> +int set_memory_encrypted(unsigned long addr, int numpages);
> +int set_memory_decrypted(unsigned long addr, int numpages);
> +
> +#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */

S/390 seems to be adding a stub header just like this.  Can you please
clean up the Kconfig and generic headers bits for memory encryption so
that we don't need all this boilerplate code?

>  config PPC_SVM
>  	bool "Secure virtual machine (SVM) support for POWER"
>  	depends on PPC_PSERIES
> +	select SWIOTLB
> +	select ARCH_HAS_MEM_ENCRYPT
>  	default n

n is the default default, no need to explictly specify it.

^ permalink raw reply

* Re: [PATCH] powerpc/perf: Use cpumask_last() to determine the
From: Madhavan Srinivasan @ 2019-05-21  5:14 UTC (permalink / raw)
  To: Anju T Sudhakar, mpe; +Cc: aravinda, linuxppc-dev, ego
In-Reply-To: <20190520090501.20415-1-anju@linux.vnet.ibm.com>


On 20/05/19 2:35 PM, Anju T Sudhakar wrote:
> Nest and core imc(In-memory Collection counters) assigns a particular
> cpu as the designated target for counter data collection.
> During system boot, the first online cpu in a chip gets assigned as
> the designated cpu for that chip(for nest-imc) and the first online cpu
> in a core gets assigned as the designated cpu for that core(for core-imc).
>
> If the designated cpu goes offline, the next online cpu from the same
> chip(for nest-imc)/core(for core-imc) is assigned as the next target,
> and the event context is migrated to the target cpu.
> Currently, cpumask_any_but() function is used to find the target cpu.
> Though this function is expected to return a `random` cpu, this always
> returns the next online cpu.
>
> If all cpus in a chip/core is offlined in a sequential manner, starting
> from the first cpu, the event migration has to happen for all the cpus
> which goes offline. Since the migration process involves a grace period,
> the total time taken to offline all the cpus will be significantly high.
>
> Example:
> In a system which has 2 sockets, with
> NUMA node0 CPU(s):     0-87
> NUMA node8 CPU(s):     88-175
>
> Time taken to offline cpu 88-175:
> real    2m56.099s
> user    0m0.191s
> sys     0m0.000s
>
> Use cpumask_last() to choose the target cpu, when the designated cpu
> goes online, so the migration will happen only when the last_cpu in the
> mask goes offline. This way the time taken to offline all cpus in a
> chip/core can be reduced.
>
> With the patch,
>
> Time taken  to offline cpu 88-175:
> real    0m12.207s
> user    0m0.171s
> sys     0m0.000s
>
> cpumask_last() is a better way to find the target cpu, since in most of the
> cases cpuhotplug is performed in an increasing order(even in ppc64_cpu).
>
> cpumask_any_but() can still be used to check the possibility of other
> online cpus from the same chip/core if the last cpu in the mask goes
> offline.

Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>

Also add "Fixes:" tag and this should go to stable right?
>
> Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
> ---
>   arch/powerpc/perf/imc-pmu.c | 14 ++++++++++++--
>   1 file changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
> index 31fa753..fbfd6e7 100644
> --- a/arch/powerpc/perf/imc-pmu.c
> +++ b/arch/powerpc/perf/imc-pmu.c
> @@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
>   	 */
>   	nid = cpu_to_node(cpu);
>   	l_cpumask = cpumask_of_node(nid);
> -	target = cpumask_any_but(l_cpumask, cpu);
> +	target = cpumask_last(l_cpumask);
> +
> +	/*
> +	 * If this(target) is the last cpu in the cpumask for this chip,
> +	 * check for any possible online cpu in the chip.
> +	 */
> +	if (unlikely(target == cpu))
> +		target = cpumask_any_but(l_cpumask, cpu);
>
>   	/*
>   	 * Update the cpumask with the target cpu and
> @@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu)
>   		return 0;
>
>   	/* Find any online cpu in that core except the current "cpu" */
> -	ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
> +	ncpu = cpumask_last(cpu_sibling_mask(cpu));
> +
> +	if (unlikely(ncpu == cpu))
> +		ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu);
>
>   	if (ncpu >= 0 && ncpu < nr_cpu_ids) {
>   		cpumask_set_cpu(ncpu, &core_imc_cpumask);


^ permalink raw reply

* Re: [RFC PATCH 02/12] powerpc: Add support for adding an ESM blob to the zImage wrapper
From: Christoph Hellwig @ 2019-05-21  5:13 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: Anshuman Khandual, Alexey Kardashevskiy, Mike Anderson, Ram Pai,
	linux-kernel, Claudio Carvalho, Paul Mackerras, linuxppc-dev,
	Christoph Hellwig
In-Reply-To: <20190521044912.1375-3-bauerman@linux.ibm.com>

On Tue, May 21, 2019 at 01:49:02AM -0300, Thiago Jung Bauermann wrote:
> From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> 
> For secure VMs, the signing tool will create a ticket called the "ESM blob"
> for the Enter Secure Mode ultravisor call with the signatures of the kernel
> and initrd among other things.
> 
> This adds support to the wrapper script for adding that blob via the "-e"
> option to the zImage.pseries.
> 
> It also adds code to the zImage wrapper itself to retrieve and if necessary
> relocate the blob, and pass its address to Linux via the device-tree, to be
> later consumed by prom_init.

Where does the "BLOB" come from?  How is it licensed and how can we
satisfy the GPL with it?

^ permalink raw reply

* Re: [PATCH] powerpc/powernv: Return for invalid IMC domain
From: Madhavan Srinivasan @ 2019-05-21  5:11 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20190520085753.19670-1-anju@linux.vnet.ibm.com>


On 20/05/19 2:27 PM, Anju T Sudhakar wrote:
> Currently init_imc_pmu() can be failed either because
> an IMC unit with invalid domain(i.e an IMC node not
> supported by the kernel) is attempted a pmu-registration
> or something went wrong while registering a valid IMC unit.
> In both the cases kernel provides a 'Registration failed'
> error message.
>
> Example:
> Log message, when trace-imc node is not supported by the kernel, and the
> skiboot supports trace-imc node.
>
> So for kernel, trace-imc node is now an unknown domain.
>
> [    1.731870] nest_phb5_imc performance monitor hardware support registered
> [    1.731944] nest_powerbus0_imc performance monitor hardware support registered
> [    1.734458] thread_imc performance monitor hardware support registered
> [    1.734460] IMC Unknown Device type
> [    1.734462] IMC PMU (null) Register failed
> [    1.734558] nest_xlink0_imc performance monitor hardware support registered
> [    1.734614] nest_xlink1_imc performance monitor hardware support registered
> [    1.734670] nest_xlink2_imc performance monitor hardware support registered
> [    1.747043] Initialise system trusted keyrings
> [    1.747054] Key type blacklist registered
>
>
> To avoid ambiguity on the error message, return for invalid domain
> before attempting a pmu registration.

Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>


> Fixes: 8f95faaac56c1 (`powerpc/powernv: Detect and create IMC device`)
> Reported-by: Pavaman Subramaniyam <pavsubra@in.ibm.com>
> Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
> ---
>   arch/powerpc/platforms/powernv/opal-imc.c | 4 ++++
>   1 file changed, 4 insertions(+)
>
> diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
> index 58a0794..4e8b0e1 100644

> --- a/arch/powerpc/platforms/powernv/opal-imc.c
> +++ b/arch/powerpc/platforms/powernv/opal-imc.c
> @@ -161,6 +161,10 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
>   	struct imc_pmu *pmu_ptr;
>   	u32 offset;
>
> +	/* Return for unknown domain */
> +	if (domain < 0)
> +		return -EINVAL;
> +
>   	/* memory for pmu */
>   	pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
>   	if (!pmu_ptr)


^ permalink raw reply

* Re: [PATCH] crypto: vmx - convert to skcipher API
From: Michael Ellerman @ 2019-05-21  5:08 UTC (permalink / raw)
  To: Eric Biggers, linux-crypto, Herbert Xu
  Cc: Breno Leitão, linuxppc-dev, Nayna Jain,
	Paulo Flabiano Smorigo, Daniel Axtens
In-Reply-To: <20190520164448.160430-1-ebiggers@kernel.org>

Eric Biggers <ebiggers@kernel.org> writes:
> From: Eric Biggers <ebiggers@google.com>
>
> Convert the VMX implementations of AES-CBC, AES-CTR, and AES-XTS from
> the deprecated "blkcipher" API to the "skcipher" API.
>
> As part of this, I moved the skcipher_request for the fallback algorithm
> off the stack and into the request context of the parent algorithm.
>
> I tested this in a PowerPC VM with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y.

I booted it a few times on a Power9 bare metal machine with
panic_on_fail=1 and fuzz_iterations=400, no issues.

Tested-by: Michael Ellerman <mpe@ellerman.id.au>

cheers

> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
>  drivers/crypto/vmx/aes_cbc.c   | 183 ++++++++++++---------------------
>  drivers/crypto/vmx/aes_ctr.c   | 165 +++++++++++++----------------
>  drivers/crypto/vmx/aes_xts.c   | 175 ++++++++++++++-----------------
>  drivers/crypto/vmx/aesp8-ppc.h |   2 -
>  drivers/crypto/vmx/vmx.c       |  72 +++++++------
>  5 files changed, 252 insertions(+), 345 deletions(-)
>
> diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
> index dae8af3c46dce..92e75a05d6a9e 100644
> --- a/drivers/crypto/vmx/aes_cbc.c
> +++ b/drivers/crypto/vmx/aes_cbc.c
> @@ -7,64 +7,52 @@
>   * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
>   */
>  
> -#include <linux/types.h>
> -#include <linux/err.h>
> -#include <linux/crypto.h>
> -#include <linux/delay.h>
>  #include <asm/simd.h>
>  #include <asm/switch_to.h>
>  #include <crypto/aes.h>
>  #include <crypto/internal/simd.h>
> -#include <crypto/scatterwalk.h>
> -#include <crypto/skcipher.h>
> +#include <crypto/internal/skcipher.h>
>  
>  #include "aesp8-ppc.h"
>  
>  struct p8_aes_cbc_ctx {
> -	struct crypto_sync_skcipher *fallback;
> +	struct crypto_skcipher *fallback;
>  	struct aes_key enc_key;
>  	struct aes_key dec_key;
>  };
>  
> -static int p8_aes_cbc_init(struct crypto_tfm *tfm)
> +static int p8_aes_cbc_init(struct crypto_skcipher *tfm)
>  {
> -	const char *alg = crypto_tfm_alg_name(tfm);
> -	struct crypto_sync_skcipher *fallback;
> -	struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
> -
> -	fallback = crypto_alloc_sync_skcipher(alg, 0,
> -					      CRYPTO_ALG_NEED_FALLBACK);
> +	struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
> +	struct crypto_skcipher *fallback;
>  
> +	fallback = crypto_alloc_skcipher("cbc(aes)", 0,
> +					 CRYPTO_ALG_NEED_FALLBACK |
> +					 CRYPTO_ALG_ASYNC);
>  	if (IS_ERR(fallback)) {
> -		printk(KERN_ERR
> -		       "Failed to allocate transformation for '%s': %ld\n",
> -		       alg, PTR_ERR(fallback));
> +		pr_err("Failed to allocate cbc(aes) fallback: %ld\n",
> +		       PTR_ERR(fallback));
>  		return PTR_ERR(fallback);
>  	}
>  
> -	crypto_sync_skcipher_set_flags(
> -		fallback,
> -		crypto_skcipher_get_flags((struct crypto_skcipher *)tfm));
> +	crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
> +				    crypto_skcipher_reqsize(fallback));
>  	ctx->fallback = fallback;
> -
>  	return 0;
>  }
>  
> -static void p8_aes_cbc_exit(struct crypto_tfm *tfm)
> +static void p8_aes_cbc_exit(struct crypto_skcipher *tfm)
>  {
> -	struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
> +	struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
>  
> -	if (ctx->fallback) {
> -		crypto_free_sync_skcipher(ctx->fallback);
> -		ctx->fallback = NULL;
> -	}
> +	crypto_free_skcipher(ctx->fallback);
>  }
>  
> -static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
> +static int p8_aes_cbc_setkey(struct crypto_skcipher *tfm, const u8 *key,
>  			     unsigned int keylen)
>  {
> +	struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
>  	int ret;
> -	struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
>  
>  	preempt_disable();
>  	pagefault_disable();
> @@ -75,108 +63,71 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
>  	pagefault_enable();
>  	preempt_enable();
>  
> -	ret |= crypto_sync_skcipher_setkey(ctx->fallback, key, keylen);
> +	ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
>  
>  	return ret ? -EINVAL : 0;
>  }
>  
> -static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
> -			      struct scatterlist *dst,
> -			      struct scatterlist *src, unsigned int nbytes)
> +static int p8_aes_cbc_crypt(struct skcipher_request *req, int enc)
>  {
> +	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
> +	const struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
> +	struct skcipher_walk walk;
> +	unsigned int nbytes;
>  	int ret;
> -	struct blkcipher_walk walk;
> -	struct p8_aes_cbc_ctx *ctx =
> -		crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm));
>  
>  	if (!crypto_simd_usable()) {
> -		SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
> -		skcipher_request_set_sync_tfm(req, ctx->fallback);
> -		skcipher_request_set_callback(req, desc->flags, NULL, NULL);
> -		skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
> -		ret = crypto_skcipher_encrypt(req);
> -		skcipher_request_zero(req);
> -	} else {
> -		blkcipher_walk_init(&walk, dst, src, nbytes);
> -		ret = blkcipher_walk_virt(desc, &walk);
> -		while ((nbytes = walk.nbytes)) {
> -			preempt_disable();
> -			pagefault_disable();
> -			enable_kernel_vsx();
> -			aes_p8_cbc_encrypt(walk.src.virt.addr,
> -					   walk.dst.virt.addr,
> -					   nbytes & AES_BLOCK_MASK,
> -					   &ctx->enc_key, walk.iv, 1);
> -			disable_kernel_vsx();
> -			pagefault_enable();
> -			preempt_enable();
> -
> -			nbytes &= AES_BLOCK_SIZE - 1;
> -			ret = blkcipher_walk_done(desc, &walk, nbytes);
> -		}
> +		struct skcipher_request *subreq = skcipher_request_ctx(req);
> +
> +		*subreq = *req;
> +		skcipher_request_set_tfm(subreq, ctx->fallback);
> +		return enc ? crypto_skcipher_encrypt(subreq) :
> +			     crypto_skcipher_decrypt(subreq);
>  	}
>  
> +	ret = skcipher_walk_virt(&walk, req, false);
> +	while ((nbytes = walk.nbytes) != 0) {
> +		preempt_disable();
> +		pagefault_disable();
> +		enable_kernel_vsx();
> +		aes_p8_cbc_encrypt(walk.src.virt.addr,
> +				   walk.dst.virt.addr,
> +				   round_down(nbytes, AES_BLOCK_SIZE),
> +				   enc ? &ctx->enc_key : &ctx->dec_key,
> +				   walk.iv, enc);
> +		disable_kernel_vsx();
> +		pagefault_enable();
> +		preempt_enable();
> +
> +		ret = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE);
> +	}
>  	return ret;
>  }
>  
> -static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
> -			      struct scatterlist *dst,
> -			      struct scatterlist *src, unsigned int nbytes)
> +static int p8_aes_cbc_encrypt(struct skcipher_request *req)
>  {
> -	int ret;
> -	struct blkcipher_walk walk;
> -	struct p8_aes_cbc_ctx *ctx =
> -		crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm));
> -
> -	if (!crypto_simd_usable()) {
> -		SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
> -		skcipher_request_set_sync_tfm(req, ctx->fallback);
> -		skcipher_request_set_callback(req, desc->flags, NULL, NULL);
> -		skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
> -		ret = crypto_skcipher_decrypt(req);
> -		skcipher_request_zero(req);
> -	} else {
> -		blkcipher_walk_init(&walk, dst, src, nbytes);
> -		ret = blkcipher_walk_virt(desc, &walk);
> -		while ((nbytes = walk.nbytes)) {
> -			preempt_disable();
> -			pagefault_disable();
> -			enable_kernel_vsx();
> -			aes_p8_cbc_encrypt(walk.src.virt.addr,
> -					   walk.dst.virt.addr,
> -					   nbytes & AES_BLOCK_MASK,
> -					   &ctx->dec_key, walk.iv, 0);
> -			disable_kernel_vsx();
> -			pagefault_enable();
> -			preempt_enable();
> -
> -			nbytes &= AES_BLOCK_SIZE - 1;
> -			ret = blkcipher_walk_done(desc, &walk, nbytes);
> -		}
> -	}
> -
> -	return ret;
> +	return p8_aes_cbc_crypt(req, 1);
>  }
>  
> +static int p8_aes_cbc_decrypt(struct skcipher_request *req)
> +{
> +	return p8_aes_cbc_crypt(req, 0);
> +}
>  
> -struct crypto_alg p8_aes_cbc_alg = {
> -	.cra_name = "cbc(aes)",
> -	.cra_driver_name = "p8_aes_cbc",
> -	.cra_module = THIS_MODULE,
> -	.cra_priority = 2000,
> -	.cra_type = &crypto_blkcipher_type,
> -	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | CRYPTO_ALG_NEED_FALLBACK,
> -	.cra_alignmask = 0,
> -	.cra_blocksize = AES_BLOCK_SIZE,
> -	.cra_ctxsize = sizeof(struct p8_aes_cbc_ctx),
> -	.cra_init = p8_aes_cbc_init,
> -	.cra_exit = p8_aes_cbc_exit,
> -	.cra_blkcipher = {
> -			  .ivsize = AES_BLOCK_SIZE,
> -			  .min_keysize = AES_MIN_KEY_SIZE,
> -			  .max_keysize = AES_MAX_KEY_SIZE,
> -			  .setkey = p8_aes_cbc_setkey,
> -			  .encrypt = p8_aes_cbc_encrypt,
> -			  .decrypt = p8_aes_cbc_decrypt,
> -	},
> +struct skcipher_alg p8_aes_cbc_alg = {
> +	.base.cra_name = "cbc(aes)",
> +	.base.cra_driver_name = "p8_aes_cbc",
> +	.base.cra_module = THIS_MODULE,
> +	.base.cra_priority = 2000,
> +	.base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
> +	.base.cra_blocksize = AES_BLOCK_SIZE,
> +	.base.cra_ctxsize = sizeof(struct p8_aes_cbc_ctx),
> +	.setkey = p8_aes_cbc_setkey,
> +	.encrypt = p8_aes_cbc_encrypt,
> +	.decrypt = p8_aes_cbc_decrypt,
> +	.init = p8_aes_cbc_init,
> +	.exit = p8_aes_cbc_exit,
> +	.min_keysize = AES_MIN_KEY_SIZE,
> +	.max_keysize = AES_MAX_KEY_SIZE,
> +	.ivsize = AES_BLOCK_SIZE,
>  };
> diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c
> index dc31101178446..c4d2809a5d9ee 100644
> --- a/drivers/crypto/vmx/aes_ctr.c
> +++ b/drivers/crypto/vmx/aes_ctr.c
> @@ -7,62 +7,51 @@
>   * Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
>   */
>  
> -#include <linux/types.h>
> -#include <linux/err.h>
> -#include <linux/crypto.h>
> -#include <linux/delay.h>
>  #include <asm/simd.h>
>  #include <asm/switch_to.h>
>  #include <crypto/aes.h>
>  #include <crypto/internal/simd.h>
> -#include <crypto/scatterwalk.h>
> -#include <crypto/skcipher.h>
> +#include <crypto/internal/skcipher.h>
>  
>  #include "aesp8-ppc.h"
>  
>  struct p8_aes_ctr_ctx {
> -	struct crypto_sync_skcipher *fallback;
> +	struct crypto_skcipher *fallback;
>  	struct aes_key enc_key;
>  };
>  
> -static int p8_aes_ctr_init(struct crypto_tfm *tfm)
> +static int p8_aes_ctr_init(struct crypto_skcipher *tfm)
>  {
> -	const char *alg = crypto_tfm_alg_name(tfm);
> -	struct crypto_sync_skcipher *fallback;
> -	struct p8_aes_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
> +	struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
> +	struct crypto_skcipher *fallback;
>  
> -	fallback = crypto_alloc_sync_skcipher(alg, 0,
> -					      CRYPTO_ALG_NEED_FALLBACK);
> +	fallback = crypto_alloc_skcipher("ctr(aes)", 0,
> +					 CRYPTO_ALG_NEED_FALLBACK |
> +					 CRYPTO_ALG_ASYNC);
>  	if (IS_ERR(fallback)) {
> -		printk(KERN_ERR
> -		       "Failed to allocate transformation for '%s': %ld\n",
> -		       alg, PTR_ERR(fallback));
> +		pr_err("Failed to allocate ctr(aes) fallback: %ld\n",
> +		       PTR_ERR(fallback));
>  		return PTR_ERR(fallback);
>  	}
>  
> -	crypto_sync_skcipher_set_flags(
> -		fallback,
> -		crypto_skcipher_get_flags((struct crypto_skcipher *)tfm));
> +	crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
> +				    crypto_skcipher_reqsize(fallback));
>  	ctx->fallback = fallback;
> -
>  	return 0;
>  }
>  
> -static void p8_aes_ctr_exit(struct crypto_tfm *tfm)
> +static void p8_aes_ctr_exit(struct crypto_skcipher *tfm)
>  {
> -	struct p8_aes_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
> +	struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
>  
> -	if (ctx->fallback) {
> -		crypto_free_sync_skcipher(ctx->fallback);
> -		ctx->fallback = NULL;
> -	}
> +	crypto_free_skcipher(ctx->fallback);
>  }
>  
> -static int p8_aes_ctr_setkey(struct crypto_tfm *tfm, const u8 *key,
> +static int p8_aes_ctr_setkey(struct crypto_skcipher *tfm, const u8 *key,
>  			     unsigned int keylen)
>  {
> +	struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
>  	int ret;
> -	struct p8_aes_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
>  
>  	preempt_disable();
>  	pagefault_disable();
> @@ -72,13 +61,13 @@ static int p8_aes_ctr_setkey(struct crypto_tfm *tfm, const u8 *key,
>  	pagefault_enable();
>  	preempt_enable();
>  
> -	ret |= crypto_sync_skcipher_setkey(ctx->fallback, key, keylen);
> +	ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
>  
>  	return ret ? -EINVAL : 0;
>  }
>  
> -static void p8_aes_ctr_final(struct p8_aes_ctr_ctx *ctx,
> -			     struct blkcipher_walk *walk)
> +static void p8_aes_ctr_final(const struct p8_aes_ctr_ctx *ctx,
> +			     struct skcipher_walk *walk)
>  {
>  	u8 *ctrblk = walk->iv;
>  	u8 keystream[AES_BLOCK_SIZE];
> @@ -98,77 +87,63 @@ static void p8_aes_ctr_final(struct p8_aes_ctr_ctx *ctx,
>  	crypto_inc(ctrblk, AES_BLOCK_SIZE);
>  }
>  
> -static int p8_aes_ctr_crypt(struct blkcipher_desc *desc,
> -			    struct scatterlist *dst,
> -			    struct scatterlist *src, unsigned int nbytes)
> +static int p8_aes_ctr_crypt(struct skcipher_request *req)
>  {
> +	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
> +	const struct p8_aes_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
> +	struct skcipher_walk walk;
> +	unsigned int nbytes;
>  	int ret;
> -	u64 inc;
> -	struct blkcipher_walk walk;
> -	struct p8_aes_ctr_ctx *ctx =
> -		crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm));
>  
>  	if (!crypto_simd_usable()) {
> -		SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
> -		skcipher_request_set_sync_tfm(req, ctx->fallback);
> -		skcipher_request_set_callback(req, desc->flags, NULL, NULL);
> -		skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
> -		ret = crypto_skcipher_encrypt(req);
> -		skcipher_request_zero(req);
> -	} else {
> -		blkcipher_walk_init(&walk, dst, src, nbytes);
> -		ret = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
> -		while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
> -			preempt_disable();
> -			pagefault_disable();
> -			enable_kernel_vsx();
> -			aes_p8_ctr32_encrypt_blocks(walk.src.virt.addr,
> -						    walk.dst.virt.addr,
> -						    (nbytes &
> -						     AES_BLOCK_MASK) /
> -						    AES_BLOCK_SIZE,
> -						    &ctx->enc_key,
> -						    walk.iv);
> -			disable_kernel_vsx();
> -			pagefault_enable();
> -			preempt_enable();
> -
> -			/* We need to update IV mostly for last bytes/round */
> -			inc = (nbytes & AES_BLOCK_MASK) / AES_BLOCK_SIZE;
> -			if (inc > 0)
> -				while (inc--)
> -					crypto_inc(walk.iv, AES_BLOCK_SIZE);
> -
> -			nbytes &= AES_BLOCK_SIZE - 1;
> -			ret = blkcipher_walk_done(desc, &walk, nbytes);
> -		}
> -		if (walk.nbytes) {
> -			p8_aes_ctr_final(ctx, &walk);
> -			ret = blkcipher_walk_done(desc, &walk, 0);
> -		}
> +		struct skcipher_request *subreq = skcipher_request_ctx(req);
> +
> +		*subreq = *req;
> +		skcipher_request_set_tfm(subreq, ctx->fallback);
> +		return crypto_skcipher_encrypt(subreq);
>  	}
>  
> +	ret = skcipher_walk_virt(&walk, req, false);
> +	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
> +		preempt_disable();
> +		pagefault_disable();
> +		enable_kernel_vsx();
> +		aes_p8_ctr32_encrypt_blocks(walk.src.virt.addr,
> +					    walk.dst.virt.addr,
> +					    nbytes / AES_BLOCK_SIZE,
> +					    &ctx->enc_key, walk.iv);
> +		disable_kernel_vsx();
> +		pagefault_enable();
> +		preempt_enable();
> +
> +		do {
> +			crypto_inc(walk.iv, AES_BLOCK_SIZE);
> +		} while ((nbytes -= AES_BLOCK_SIZE) >= AES_BLOCK_SIZE);
> +
> +		ret = skcipher_walk_done(&walk, nbytes);
> +	}
> +	if (nbytes) {
> +		p8_aes_ctr_final(ctx, &walk);
> +		ret = skcipher_walk_done(&walk, 0);
> +	}
>  	return ret;
>  }
>  
> -struct crypto_alg p8_aes_ctr_alg = {
> -	.cra_name = "ctr(aes)",
> -	.cra_driver_name = "p8_aes_ctr",
> -	.cra_module = THIS_MODULE,
> -	.cra_priority = 2000,
> -	.cra_type = &crypto_blkcipher_type,
> -	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | CRYPTO_ALG_NEED_FALLBACK,
> -	.cra_alignmask = 0,
> -	.cra_blocksize = 1,
> -	.cra_ctxsize = sizeof(struct p8_aes_ctr_ctx),
> -	.cra_init = p8_aes_ctr_init,
> -	.cra_exit = p8_aes_ctr_exit,
> -	.cra_blkcipher = {
> -			  .ivsize = AES_BLOCK_SIZE,
> -			  .min_keysize = AES_MIN_KEY_SIZE,
> -			  .max_keysize = AES_MAX_KEY_SIZE,
> -			  .setkey = p8_aes_ctr_setkey,
> -			  .encrypt = p8_aes_ctr_crypt,
> -			  .decrypt = p8_aes_ctr_crypt,
> -	},
> +struct skcipher_alg p8_aes_ctr_alg = {
> +	.base.cra_name = "ctr(aes)",
> +	.base.cra_driver_name = "p8_aes_ctr",
> +	.base.cra_module = THIS_MODULE,
> +	.base.cra_priority = 2000,
> +	.base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
> +	.base.cra_blocksize = 1,
> +	.base.cra_ctxsize = sizeof(struct p8_aes_ctr_ctx),
> +	.setkey = p8_aes_ctr_setkey,
> +	.encrypt = p8_aes_ctr_crypt,
> +	.decrypt = p8_aes_ctr_crypt,
> +	.init = p8_aes_ctr_init,
> +	.exit = p8_aes_ctr_exit,
> +	.min_keysize = AES_MIN_KEY_SIZE,
> +	.max_keysize = AES_MAX_KEY_SIZE,
> +	.ivsize = AES_BLOCK_SIZE,
> +	.chunksize = AES_BLOCK_SIZE,
>  };
> diff --git a/drivers/crypto/vmx/aes_xts.c b/drivers/crypto/vmx/aes_xts.c
> index aee1339f134ec..965d8e03321cd 100644
> --- a/drivers/crypto/vmx/aes_xts.c
> +++ b/drivers/crypto/vmx/aes_xts.c
> @@ -7,67 +7,56 @@
>   * Author: Leonidas S. Barbosa <leosilva@linux.vnet.ibm.com>
>   */
>  
> -#include <linux/types.h>
> -#include <linux/err.h>
> -#include <linux/crypto.h>
> -#include <linux/delay.h>
>  #include <asm/simd.h>
>  #include <asm/switch_to.h>
>  #include <crypto/aes.h>
>  #include <crypto/internal/simd.h>
> -#include <crypto/scatterwalk.h>
> +#include <crypto/internal/skcipher.h>
>  #include <crypto/xts.h>
> -#include <crypto/skcipher.h>
>  
>  #include "aesp8-ppc.h"
>  
>  struct p8_aes_xts_ctx {
> -	struct crypto_sync_skcipher *fallback;
> +	struct crypto_skcipher *fallback;
>  	struct aes_key enc_key;
>  	struct aes_key dec_key;
>  	struct aes_key tweak_key;
>  };
>  
> -static int p8_aes_xts_init(struct crypto_tfm *tfm)
> +static int p8_aes_xts_init(struct crypto_skcipher *tfm)
>  {
> -	const char *alg = crypto_tfm_alg_name(tfm);
> -	struct crypto_sync_skcipher *fallback;
> -	struct p8_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
> +	struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
> +	struct crypto_skcipher *fallback;
>  
> -	fallback = crypto_alloc_sync_skcipher(alg, 0,
> -					      CRYPTO_ALG_NEED_FALLBACK);
> +	fallback = crypto_alloc_skcipher("xts(aes)", 0,
> +					 CRYPTO_ALG_NEED_FALLBACK |
> +					 CRYPTO_ALG_ASYNC);
>  	if (IS_ERR(fallback)) {
> -		printk(KERN_ERR
> -			"Failed to allocate transformation for '%s': %ld\n",
> -			alg, PTR_ERR(fallback));
> +		pr_err("Failed to allocate xts(aes) fallback: %ld\n",
> +		       PTR_ERR(fallback));
>  		return PTR_ERR(fallback);
>  	}
>  
> -	crypto_sync_skcipher_set_flags(
> -		fallback,
> -		crypto_skcipher_get_flags((struct crypto_skcipher *)tfm));
> +	crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) +
> +				    crypto_skcipher_reqsize(fallback));
>  	ctx->fallback = fallback;
> -
>  	return 0;
>  }
>  
> -static void p8_aes_xts_exit(struct crypto_tfm *tfm)
> +static void p8_aes_xts_exit(struct crypto_skcipher *tfm)
>  {
> -	struct p8_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
> +	struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
>  
> -	if (ctx->fallback) {
> -		crypto_free_sync_skcipher(ctx->fallback);
> -		ctx->fallback = NULL;
> -	}
> +	crypto_free_skcipher(ctx->fallback);
>  }
>  
> -static int p8_aes_xts_setkey(struct crypto_tfm *tfm, const u8 *key,
> +static int p8_aes_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
>  			     unsigned int keylen)
>  {
> +	struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
>  	int ret;
> -	struct p8_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
>  
> -	ret = xts_check_key(tfm, key, keylen);
> +	ret = xts_verify_key(tfm, key, keylen);
>  	if (ret)
>  		return ret;
>  
> @@ -81,100 +70,90 @@ static int p8_aes_xts_setkey(struct crypto_tfm *tfm, const u8 *key,
>  	pagefault_enable();
>  	preempt_enable();
>  
> -	ret |= crypto_sync_skcipher_setkey(ctx->fallback, key, keylen);
> +	ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen);
>  
>  	return ret ? -EINVAL : 0;
>  }
>  
> -static int p8_aes_xts_crypt(struct blkcipher_desc *desc,
> -			    struct scatterlist *dst,
> -			    struct scatterlist *src,
> -			    unsigned int nbytes, int enc)
> +static int p8_aes_xts_crypt(struct skcipher_request *req, int enc)
>  {
> -	int ret;
> +	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
> +	const struct p8_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
> +	struct skcipher_walk walk;
> +	unsigned int nbytes;
>  	u8 tweak[AES_BLOCK_SIZE];
> -	u8 *iv;
> -	struct blkcipher_walk walk;
> -	struct p8_aes_xts_ctx *ctx =
> -		crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm));
> +	int ret;
>  
>  	if (!crypto_simd_usable()) {
> -		SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
> -		skcipher_request_set_sync_tfm(req, ctx->fallback);
> -		skcipher_request_set_callback(req, desc->flags, NULL, NULL);
> -		skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
> -		ret = enc? crypto_skcipher_encrypt(req) : crypto_skcipher_decrypt(req);
> -		skcipher_request_zero(req);
> -	} else {
> -		blkcipher_walk_init(&walk, dst, src, nbytes);
> +		struct skcipher_request *subreq = skcipher_request_ctx(req);
> +
> +		*subreq = *req;
> +		skcipher_request_set_tfm(subreq, ctx->fallback);
> +		return enc ? crypto_skcipher_encrypt(subreq) :
> +			     crypto_skcipher_decrypt(subreq);
> +	}
> +
> +	ret = skcipher_walk_virt(&walk, req, false);
> +	if (ret)
> +		return ret;
> +
> +	preempt_disable();
> +	pagefault_disable();
> +	enable_kernel_vsx();
>  
> -		ret = blkcipher_walk_virt(desc, &walk);
> +	aes_p8_encrypt(walk.iv, tweak, &ctx->tweak_key);
> +
> +	disable_kernel_vsx();
> +	pagefault_enable();
> +	preempt_enable();
>  
> +	while ((nbytes = walk.nbytes) != 0) {
>  		preempt_disable();
>  		pagefault_disable();
>  		enable_kernel_vsx();
> -
> -		iv = walk.iv;
> -		memset(tweak, 0, AES_BLOCK_SIZE);
> -		aes_p8_encrypt(iv, tweak, &ctx->tweak_key);
> -
> +		if (enc)
> +			aes_p8_xts_encrypt(walk.src.virt.addr,
> +					   walk.dst.virt.addr,
> +					   round_down(nbytes, AES_BLOCK_SIZE),
> +					   &ctx->enc_key, NULL, tweak);
> +		else
> +			aes_p8_xts_decrypt(walk.src.virt.addr,
> +					   walk.dst.virt.addr,
> +					   round_down(nbytes, AES_BLOCK_SIZE),
> +					   &ctx->dec_key, NULL, tweak);
>  		disable_kernel_vsx();
>  		pagefault_enable();
>  		preempt_enable();
>  
> -		while ((nbytes = walk.nbytes)) {
> -			preempt_disable();
> -			pagefault_disable();
> -			enable_kernel_vsx();
> -			if (enc)
> -				aes_p8_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
> -						nbytes & AES_BLOCK_MASK, &ctx->enc_key, NULL, tweak);
> -			else
> -				aes_p8_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
> -						nbytes & AES_BLOCK_MASK, &ctx->dec_key, NULL, tweak);
> -			disable_kernel_vsx();
> -			pagefault_enable();
> -			preempt_enable();
> -
> -			nbytes &= AES_BLOCK_SIZE - 1;
> -			ret = blkcipher_walk_done(desc, &walk, nbytes);
> -		}
> +		ret = skcipher_walk_done(&walk, nbytes % AES_BLOCK_SIZE);
>  	}
>  	return ret;
>  }
>  
> -static int p8_aes_xts_encrypt(struct blkcipher_desc *desc,
> -			      struct scatterlist *dst,
> -			      struct scatterlist *src, unsigned int nbytes)
> +static int p8_aes_xts_encrypt(struct skcipher_request *req)
>  {
> -	return p8_aes_xts_crypt(desc, dst, src, nbytes, 1);
> +	return p8_aes_xts_crypt(req, 1);
>  }
>  
> -static int p8_aes_xts_decrypt(struct blkcipher_desc *desc,
> -			      struct scatterlist *dst,
> -			      struct scatterlist *src, unsigned int nbytes)
> +static int p8_aes_xts_decrypt(struct skcipher_request *req)
>  {
> -	return p8_aes_xts_crypt(desc, dst, src, nbytes, 0);
> +	return p8_aes_xts_crypt(req, 0);
>  }
>  
> -struct crypto_alg p8_aes_xts_alg = {
> -	.cra_name = "xts(aes)",
> -	.cra_driver_name = "p8_aes_xts",
> -	.cra_module = THIS_MODULE,
> -	.cra_priority = 2000,
> -	.cra_type = &crypto_blkcipher_type,
> -	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | CRYPTO_ALG_NEED_FALLBACK,
> -	.cra_alignmask = 0,
> -	.cra_blocksize = AES_BLOCK_SIZE,
> -	.cra_ctxsize = sizeof(struct p8_aes_xts_ctx),
> -	.cra_init = p8_aes_xts_init,
> -	.cra_exit = p8_aes_xts_exit,
> -	.cra_blkcipher = {
> -			.ivsize = AES_BLOCK_SIZE,
> -			.min_keysize = 2 * AES_MIN_KEY_SIZE,
> -			.max_keysize = 2 * AES_MAX_KEY_SIZE,
> -			.setkey	 = p8_aes_xts_setkey,
> -			.encrypt = p8_aes_xts_encrypt,
> -			.decrypt = p8_aes_xts_decrypt,
> -	}
> +struct skcipher_alg p8_aes_xts_alg = {
> +	.base.cra_name = "xts(aes)",
> +	.base.cra_driver_name = "p8_aes_xts",
> +	.base.cra_module = THIS_MODULE,
> +	.base.cra_priority = 2000,
> +	.base.cra_flags = CRYPTO_ALG_NEED_FALLBACK,
> +	.base.cra_blocksize = AES_BLOCK_SIZE,
> +	.base.cra_ctxsize = sizeof(struct p8_aes_xts_ctx),
> +	.setkey = p8_aes_xts_setkey,
> +	.encrypt = p8_aes_xts_encrypt,
> +	.decrypt = p8_aes_xts_decrypt,
> +	.init = p8_aes_xts_init,
> +	.exit = p8_aes_xts_exit,
> +	.min_keysize = 2 * AES_MIN_KEY_SIZE,
> +	.max_keysize = 2 * AES_MAX_KEY_SIZE,
> +	.ivsize = AES_BLOCK_SIZE,
>  };
> diff --git a/drivers/crypto/vmx/aesp8-ppc.h b/drivers/crypto/vmx/aesp8-ppc.h
> index 349646b73754f..01774a4d26a25 100644
> --- a/drivers/crypto/vmx/aesp8-ppc.h
> +++ b/drivers/crypto/vmx/aesp8-ppc.h
> @@ -2,8 +2,6 @@
>  #include <linux/types.h>
>  #include <crypto/aes.h>
>  
> -#define AES_BLOCK_MASK  (~(AES_BLOCK_SIZE-1))
> -
>  struct aes_key {
>  	u8 key[AES_MAX_KEYLENGTH];
>  	int rounds;
> diff --git a/drivers/crypto/vmx/vmx.c b/drivers/crypto/vmx/vmx.c
> index abd89c2bcec4d..eff03fdf964f2 100644
> --- a/drivers/crypto/vmx/vmx.c
> +++ b/drivers/crypto/vmx/vmx.c
> @@ -15,54 +15,58 @@
>  #include <linux/crypto.h>
>  #include <asm/cputable.h>
>  #include <crypto/internal/hash.h>
> +#include <crypto/internal/skcipher.h>
>  
>  extern struct shash_alg p8_ghash_alg;
>  extern struct crypto_alg p8_aes_alg;
> -extern struct crypto_alg p8_aes_cbc_alg;
> -extern struct crypto_alg p8_aes_ctr_alg;
> -extern struct crypto_alg p8_aes_xts_alg;
> -static struct crypto_alg *algs[] = {
> -	&p8_aes_alg,
> -	&p8_aes_cbc_alg,
> -	&p8_aes_ctr_alg,
> -	&p8_aes_xts_alg,
> -	NULL,
> -};
> +extern struct skcipher_alg p8_aes_cbc_alg;
> +extern struct skcipher_alg p8_aes_ctr_alg;
> +extern struct skcipher_alg p8_aes_xts_alg;
>  
>  static int __init p8_init(void)
>  {
> -	int ret = 0;
> -	struct crypto_alg **alg_it;
> +	int ret;
>  
> -	for (alg_it = algs; *alg_it; alg_it++) {
> -		ret = crypto_register_alg(*alg_it);
> -		printk(KERN_INFO "crypto_register_alg '%s' = %d\n",
> -		       (*alg_it)->cra_name, ret);
> -		if (ret) {
> -			for (alg_it--; alg_it >= algs; alg_it--)
> -				crypto_unregister_alg(*alg_it);
> -			break;
> -		}
> -	}
> +	ret = crypto_register_shash(&p8_ghash_alg);
>  	if (ret)
> -		return ret;
> +		goto err;
>  
> -	ret = crypto_register_shash(&p8_ghash_alg);
> -	if (ret) {
> -		for (alg_it = algs; *alg_it; alg_it++)
> -			crypto_unregister_alg(*alg_it);
> -	}
> +	ret = crypto_register_alg(&p8_aes_alg);
> +	if (ret)
> +		goto err_unregister_ghash;
> +
> +	ret = crypto_register_skcipher(&p8_aes_cbc_alg);
> +	if (ret)
> +		goto err_unregister_aes;
> +
> +	ret = crypto_register_skcipher(&p8_aes_ctr_alg);
> +	if (ret)
> +		goto err_unregister_aes_cbc;
> +
> +	ret = crypto_register_skcipher(&p8_aes_xts_alg);
> +	if (ret)
> +		goto err_unregister_aes_ctr;
> +
> +	return 0;
> +
> +err_unregister_aes_ctr:
> +	crypto_unregister_skcipher(&p8_aes_ctr_alg);
> +err_unregister_aes_cbc:
> +	crypto_unregister_skcipher(&p8_aes_cbc_alg);
> +err_unregister_aes:
> +	crypto_unregister_alg(&p8_aes_alg);
> +err_unregister_ghash:
> +	crypto_unregister_shash(&p8_ghash_alg);
> +err:
>  	return ret;
>  }
>  
>  static void __exit p8_exit(void)
>  {
> -	struct crypto_alg **alg_it;
> -
> -	for (alg_it = algs; *alg_it; alg_it++) {
> -		printk(KERN_INFO "Removing '%s'\n", (*alg_it)->cra_name);
> -		crypto_unregister_alg(*alg_it);
> -	}
> +	crypto_unregister_skcipher(&p8_aes_xts_alg);
> +	crypto_unregister_skcipher(&p8_aes_ctr_alg);
> +	crypto_unregister_skcipher(&p8_aes_cbc_alg);
> +	crypto_unregister_alg(&p8_aes_alg);
>  	crypto_unregister_shash(&p8_ghash_alg);
>  }
>  
> -- 
> 2.21.0.1020.gf2820cf01a-goog

^ permalink raw reply

* [PATCH 07/12] powerpc/pseries/svm: Use shared memory for Debug Trace Log (DTL)
From: Thiago Jung Bauermann @ 2019-05-21  4:49 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Anshuman Khandual, Alexey Kardashevskiy, Mike Anderson, Ram Pai,
	linux-kernel, Claudio Carvalho, Paul Mackerras, Christoph Hellwig,
	Thiago Jung Bauermann, Anshuman Khandual
In-Reply-To: <20190521044912.1375-1-bauerman@linux.ibm.com>

From: Anshuman Khandual <khandual@linux.vnet.ibm.com>

Secure guests need to share the DTL buffers with the hypervisor. To that
end, use a kmem_cache constructor which converts the underlying buddy
allocated SLUB cache pages into shared memory.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
 arch/powerpc/include/asm/svm.h          |  5 ++++
 arch/powerpc/platforms/pseries/Makefile |  1 +
 arch/powerpc/platforms/pseries/setup.c  |  5 +++-
 arch/powerpc/platforms/pseries/svm.c    | 40 +++++++++++++++++++++++++
 4 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
index fef3740f46a6..f253116c31fc 100644
--- a/arch/powerpc/include/asm/svm.h
+++ b/arch/powerpc/include/asm/svm.h
@@ -15,6 +15,9 @@ static inline bool is_secure_guest(void)
 	return mfmsr() & MSR_S;
 }
 
+void dtl_cache_ctor(void *addr);
+#define get_dtl_cache_ctor()	(is_secure_guest() ? dtl_cache_ctor : NULL)
+
 #else /* CONFIG_PPC_SVM */
 
 static inline bool is_secure_guest(void)
@@ -22,5 +25,7 @@ static inline bool is_secure_guest(void)
 	return false;
 }
 
+#define get_dtl_cache_ctor() NULL
+
 #endif /* CONFIG_PPC_SVM */
 #endif /* _ASM_POWERPC_SVM_H */
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index a43ec843c8e2..b7b6e6f52bd0 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_LPARCFG)		+= lparcfg.o
 obj-$(CONFIG_IBMVIO)		+= vio.o
 obj-$(CONFIG_IBMEBUS)		+= ibmebus.o
 obj-$(CONFIG_PAPR_SCM)		+= papr_scm.o
+obj-$(CONFIG_PPC_SVM)		+= svm.o
 
 ifdef CONFIG_PPC_PSERIES
 obj-$(CONFIG_SUSPEND)		+= suspend.o
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index e4f0dfd4ae33..c928e6e8a279 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -71,6 +71,7 @@
 #include <asm/isa-bridge.h>
 #include <asm/security_features.h>
 #include <asm/asm-const.h>
+#include <asm/svm.h>
 
 #include "pseries.h"
 #include "../../../../drivers/pci/pci.h"
@@ -329,8 +330,10 @@ static inline int alloc_dispatch_logs(void)
 
 static int alloc_dispatch_log_kmem_cache(void)
 {
+	void (*ctor)(void *) = get_dtl_cache_ctor();
+
 	dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
-						DISPATCH_LOG_BYTES, 0, NULL);
+						DISPATCH_LOG_BYTES, 0, ctor);
 	if (!dtl_cache) {
 		pr_warn("Failed to create dispatch trace log buffer cache\n");
 		pr_warn("Stolen time statistics will be unreliable\n");
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
new file mode 100644
index 000000000000..c508196f7c83
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Secure VM platform
+ *
+ * Copyright 2019 IBM Corporation
+ * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <asm/ultravisor.h>
+
+/* There's one dispatch log per CPU. */
+#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
+
+static struct page *dtl_page_store[NR_DTL_PAGE];
+static long dtl_nr_pages;
+
+static bool is_dtl_page_shared(struct page *page)
+{
+	long i;
+
+	for (i = 0; i < dtl_nr_pages; i++)
+		if (dtl_page_store[i] == page)
+			return true;
+
+	return false;
+}
+
+void dtl_cache_ctor(void *addr)
+{
+	unsigned long pfn = PHYS_PFN(__pa(addr));
+	struct page *page = pfn_to_page(pfn);
+
+	if (!is_dtl_page_shared(page)) {
+		dtl_page_store[dtl_nr_pages] = page;
+		dtl_nr_pages++;
+		WARN_ON(dtl_nr_pages >= NR_DTL_PAGE);
+		uv_share_page(pfn, 1);
+	}
+}

^ permalink raw reply related

* [PATCH 12/12] powerpc/configs: Enable secure guest support in pseries and ppc64 defconfigs
From: Thiago Jung Bauermann @ 2019-05-21  4:49 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Anshuman Khandual, Alexey Kardashevskiy, Mike Anderson, Ram Pai,
	linux-kernel, Claudio Carvalho, Ryan Grimm, Paul Mackerras,
	Christoph Hellwig, Thiago Jung Bauermann
In-Reply-To: <20190521044912.1375-1-bauerman@linux.ibm.com>

From: Ryan Grimm <grimm@linux.vnet.ibm.com>

Enables running as a secure guest in platforms with an Ultravisor.

Signed-off-by: Ryan Grimm <grimm@linux.vnet.ibm.com>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
 arch/powerpc/configs/ppc64_defconfig   | 1 +
 arch/powerpc/configs/pseries_defconfig | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index d7c381009636..725297438320 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -31,6 +31,7 @@ CONFIG_DTL=y
 CONFIG_SCANLOG=m
 CONFIG_PPC_SMLPAR=y
 CONFIG_IBMEBUS=y
+CONFIG_PPC_SVM=y
 CONFIG_PPC_MAPLE=y
 CONFIG_PPC_PASEMI=y
 CONFIG_PPC_PASEMI_IOMMU=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 62e12f61a3b2..724a574fe4b2 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -42,6 +42,7 @@ CONFIG_DTL=y
 CONFIG_SCANLOG=m
 CONFIG_PPC_SMLPAR=y
 CONFIG_IBMEBUS=y
+CONFIG_PPC_SVM=y
 # CONFIG_PPC_PMAC is not set
 CONFIG_RTAS_FLASH=m
 CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y


^ permalink raw reply related

* [PATCH 11/12] powerpc/pseries/svm: Force SWIOTLB for secure guests
From: Thiago Jung Bauermann @ 2019-05-21  4:49 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Anshuman Khandual, Alexey Kardashevskiy, Mike Anderson, Ram Pai,
	linux-kernel, Claudio Carvalho, Paul Mackerras, Christoph Hellwig,
	Thiago Jung Bauermann, Anshuman Khandual
In-Reply-To: <20190521044912.1375-1-bauerman@linux.ibm.com>

From: Anshuman Khandual <khandual@linux.vnet.ibm.com>

SWIOTLB checks range of incoming CPU addresses to be bounced and sees if
the device can access it through its DMA window without requiring bouncing.
In such cases it just chooses to skip bouncing. But for cases like secure
guests on powerpc platform all addresses need to be bounced into the shared
pool of memory because the host cannot access it otherwise. Hence the need
to do the bouncing is not related to device's DMA window and use of bounce
buffers is forced by setting swiotlb_force.

Also, connect the shared memory conversion functions into the
ARCH_HAS_MEM_ENCRYPT hooks and call swiotlb_update_mem_attributes() to
convert SWIOTLB's memory pool to shared memory.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
[ Use ARCH_HAS_MEM_ENCRYPT hooks to share swiotlb memory pool. ]
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
 arch/powerpc/include/asm/mem_encrypt.h | 19 +++++++++++
 arch/powerpc/platforms/pseries/Kconfig |  5 +++
 arch/powerpc/platforms/pseries/svm.c   | 45 ++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/arch/powerpc/include/asm/mem_encrypt.h b/arch/powerpc/include/asm/mem_encrypt.h
new file mode 100644
index 000000000000..45d5e4d0e6e0
--- /dev/null
+++ b/arch/powerpc/include/asm/mem_encrypt.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * SVM helper functions
+ *
+ * Copyright 2019 IBM Corporation
+ */
+
+#ifndef _ASM_POWERPC_MEM_ENCRYPT_H
+#define _ASM_POWERPC_MEM_ENCRYPT_H
+
+#define sme_me_mask	0ULL
+
+static inline bool sme_active(void) { return false; }
+static inline bool sev_active(void) { return false; }
+
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
+
+#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 82c16aa4f1ce..41b10f3bc729 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -145,9 +145,14 @@ config PAPR_SCM
 	help
 	  Enable access to hypervisor provided storage class memory.
 
+config ARCH_HAS_MEM_ENCRYPT
+	def_bool n
+
 config PPC_SVM
 	bool "Secure virtual machine (SVM) support for POWER"
 	depends on PPC_PSERIES
+	select SWIOTLB
+	select ARCH_HAS_MEM_ENCRYPT
 	default n
 	help
 	 Support secure guests on POWER. There are certain POWER platforms which
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index c508196f7c83..618622d636d5 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -7,8 +7,53 @@
  */
 
 #include <linux/mm.h>
+#include <asm/machdep.h>
+#include <asm/svm.h>
+#include <asm/swiotlb.h>
 #include <asm/ultravisor.h>
 
+static int __init init_svm(void)
+{
+	if (!is_secure_guest())
+		return 0;
+
+	/* Don't release the SWIOTLB buffer. */
+	ppc_swiotlb_enable = 1;
+
+	/*
+	 * Since the guest memory is inaccessible to the host, devices always
+	 * need to use the SWIOTLB buffer for DMA even if dma_capable() says
+	 * otherwise.
+	 */
+	swiotlb_force = SWIOTLB_FORCE;
+
+	/* Share the SWIOTLB buffer with the host. */
+	swiotlb_update_mem_attributes();
+
+	return 0;
+}
+machine_early_initcall(pseries, init_svm);
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+	if (!PAGE_ALIGNED(addr))
+		return -EINVAL;
+
+	uv_unshare_page(PHYS_PFN(__pa(addr)), numpages);
+
+	return 0;
+}
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+	if (!PAGE_ALIGNED(addr))
+		return -EINVAL;
+
+	uv_share_page(PHYS_PFN(__pa(addr)), numpages);
+
+	return 0;
+}
+
 /* There's one dispatch log per CPU. */
 #define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
 


^ permalink raw reply related

* [PATCH 10/12] powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests
From: Thiago Jung Bauermann @ 2019-05-21  4:49 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Anshuman Khandual, Alexey Kardashevskiy, Mike Anderson, Ram Pai,
	linux-kernel, Claudio Carvalho, Paul Mackerras, Christoph Hellwig,
	Thiago Jung Bauermann
In-Reply-To: <20190521044912.1375-1-bauerman@linux.ibm.com>

Secure guest memory is inacessible to devices so regular DMA isn't
possible.

In that case set devices' dma_map_ops to NULL so that the generic
DMA code path will use SWIOTLB and DMA to bounce buffers.

Signed-off-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 03bbb299320e..7d9550edb700 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -50,6 +50,7 @@
 #include <asm/udbg.h>
 #include <asm/mmzone.h>
 #include <asm/plpar_wrappers.h>
+#include <asm/svm.h>
 
 #include "pseries.h"
 
@@ -1332,7 +1333,10 @@ void iommu_init_early_pSeries(void)
 	of_reconfig_notifier_register(&iommu_reconfig_nb);
 	register_memory_notifier(&iommu_mem_nb);
 
-	set_pci_dma_ops(&dma_iommu_ops);
+	if (is_secure_guest())
+		set_pci_dma_ops(NULL);
+	else
+		set_pci_dma_ops(&dma_iommu_ops);
 }
 
 static int __init disable_multitce(char *str)


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox