LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v9 2/4] selftests/powerpc: Add test for strlen()
From: Christophe Leroy @ 2018-08-02  8:26 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	wei.guo.simon, segher
  Cc: linux-kernel, linuxppc-dev
In-Reply-To: <b3d7b6cdb89a48be06a2630bf0d762d9d17d931f.1533198182.git.christophe.leroy@c-s.fr>

This patch adds a test for strlen()

string.c contains a copy of strlen() from lib/string.c

The test first tests the correctness of strlen() by comparing
the result with libc strlen(). It tests all cases of alignment.

It them tests the duration of an aligned strlen() on a 4 bytes string,
on a 16 bytes string and on a 256 bytes string.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 v9: fixed relevant checkpatch warnings
 v8: no change
 v7: no change
 v6: refactorised the benchmark test
 v5: no change
 v4: new

 .../testing/selftests/powerpc/stringloops/Makefile |   5 +-
 .../testing/selftests/powerpc/stringloops/string.c |  37 ++++++
 .../testing/selftests/powerpc/stringloops/strlen.c | 131 +++++++++++++++++++++
 3 files changed, 172 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/string.c
 create mode 100644 tools/testing/selftests/powerpc/stringloops/strlen.c

diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
index 3862256c2b7d..779b644461c4 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -10,9 +10,12 @@ $(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
 $(OUTPUT)/memcmp_32: memcmp.c
 $(OUTPUT)/memcmp_32: CFLAGS += -m32
 
+$(OUTPUT)/strlen: strlen.c string.o
+$(OUTPUT)/string.o: string.c
+
 ASFLAGS = $(CFLAGS)
 
-TEST_GEN_PROGS := memcmp_32 memcmp_64
+TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/powerpc/stringloops/string.c b/tools/testing/selftests/powerpc/stringloops/string.c
new file mode 100644
index 000000000000..9dac5ef4f52c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/string.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  linux/lib/string.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * stupid library routines.. The optimized versions should generally be found
+ * as inline code in <asm-xx/string.h>
+ *
+ * These are buggy as well..
+ *
+ * * Fri Jun 25 1999, Ingo Oeser <ioe@informatik.tu-chemnitz.de>
+ * -  Added strsep() which will replace strtok() soon (because strsep() is
+ *    reentrant and should be faster). Use only strsep() in new code, please.
+ *
+ * * Sat Feb 09 2002, Jason Thomas <jason@topic.com.au>,
+ *                    Matthew Hawkins <matt@mh.dropbear.id.au>
+ * -  Kissed strtok() goodbye
+ */
+
+#include <stddef.h>
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t test_strlen(const char *s)
+{
+	const char *sc;
+
+	for (sc = s; *sc != '\0'; ++sc)
+		; /* nothing */
+
+	return sc - s;
+}
diff --git a/tools/testing/selftests/powerpc/stringloops/strlen.c b/tools/testing/selftests/powerpc/stringloops/strlen.c
new file mode 100644
index 000000000000..7ea6122ecacb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/strlen.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "utils.h"
+
+#define SIZE 256
+#define ITERATIONS 1000
+#define ITERATIONS_BENCH 100000
+
+int test_strlen(const void *s);
+
+/* test all offsets and lengths */
+static void test_one(char *s)
+{
+	unsigned long offset;
+
+	for (offset = 0; offset < SIZE; offset++) {
+		int x, y;
+		unsigned long i;
+
+		y = strlen(s + offset);
+		x = test_strlen(s + offset);
+
+		if (x == y)
+			continue;
+
+		printf("strlen() returned %d, should have returned %d (%p offset %ld)\n",
+		       x, y, s, offset);
+
+		for (i = offset; i < SIZE; i++)
+			printf("%02x ", s[i]);
+		printf("\n");
+	}
+}
+
+static void bench_test(char *s)
+{
+	struct timespec ts_start, ts_end;
+	int i;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+	for (i = 0; i < ITERATIONS_BENCH; i++)
+		test_strlen(s);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts_end);
+
+	printf("len %3.3d : time = %.6f\n", test_strlen(s),
+	       ts_end.tv_sec - ts_start.tv_sec +
+	       (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9);
+}
+
+static int testcase(void)
+{
+	char *s;
+	unsigned long i;
+
+	s = memalign(128, SIZE);
+	if (!s) {
+		perror("memalign");
+		exit(1);
+	}
+
+	srandom(1);
+
+	memset(s, 0, SIZE);
+	for (i = 0; i < SIZE; i++) {
+		char c;
+
+		do {
+			c = random() & 0x7f;
+		} while (!c);
+		s[i] = c;
+		test_one(s);
+	}
+
+	for (i = 0; i < ITERATIONS; i++) {
+		unsigned long j;
+
+		for (j = 0; j < SIZE; j++) {
+			char c;
+
+			do {
+				c = random() & 0x7f;
+			} while (!c);
+			s[j] = c;
+		}
+		for (j = 0; j < sizeof(long); j++) {
+			s[SIZE - 1 - j] = 0;
+			test_one(s);
+		}
+	}
+
+	for (i = 0; i < SIZE; i++) {
+		char c;
+
+		do {
+			c = random() & 0x7f;
+		} while (!c);
+		s[i] = c;
+	}
+
+	bench_test(s);
+
+	s[16] = 0;
+	bench_test(s);
+
+	s[8] = 0;
+	bench_test(s);
+
+	s[4] = 0;
+	bench_test(s);
+
+	s[3] = 0;
+	bench_test(s);
+
+	s[2] = 0;
+	bench_test(s);
+
+	s[1] = 0;
+	bench_test(s);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(testcase, "strlen");
+}
-- 
2.13.3

^ permalink raw reply related

* [PATCH v9 3/4] powerpc/lib: implement strlen() in assembly for PPC32
From: Christophe Leroy @ 2018-08-02  8:26 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	wei.guo.simon, segher
  Cc: linux-kernel, linuxppc-dev
In-Reply-To: <b3d7b6cdb89a48be06a2630bf0d762d9d17d931f.1533198182.git.christophe.leroy@c-s.fr>

The generic implementation of strlen() reads strings byte per byte.

This patch implements strlen() in assembly based on a read of entire
words, in the same spirit as what some other arches and glibc do.

On a 8xx the time spent in strlen is reduced by 3/4 for long strings.

strlen() selftest on an 8xx provides the following values:

Before the patch (ie with the generic strlen() in lib/string.c):

len 256 : time = 1.195055
len 016 : time = 0.083745
len 008 : time = 0.046828
len 004 : time = 0.028390

After the patch:

len 256 : time = 0.272185 ==> 78% improvment
len 016 : time = 0.040632 ==> 51% improvment
len 008 : time = 0.033060 ==> 29% improvment
len 004 : time = 0.029149 ==> 2% degradation

On a 832x:

Before the patch:

len 256 : time = 0.236125
len 016 : time = 0.018136
len 008 : time = 0.011000
len 004 : time = 0.007229

After the patch:

len 256 : time = 0.094950 ==> 60% improvment
len 016 : time = 0.013357 ==> 26% improvment
len 008 : time = 0.010586 ==> 4% improvment
len 004 : time = 0.008784

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
Changes in v9 and v8:
 - No change

Changes in v7:
 - Reduced the scope to PPC32
 - Modified the missalignment handling to be branchless and loopless

Changes in v6:
 - Reworked for having branchless conclusion

Changes in v5:
 - Fixed for PPC64 LITTLE ENDIAN

Changes in v4:
 - Added alignment of the loop
 - doing the andc only if still not 0 as it happends only for bytes above 0x7f which is pretty rare in a string

Changes in v3:
 - Made it common to PPC32 and PPC64

Changes in v2:
 - Moved handling of unaligned strings outside of the main path as it is very unlikely.
 - Removed the verification of the fourth byte in case none of the three first ones are NUL.

 arch/powerpc/include/asm/string.h |  2 +
 arch/powerpc/lib/Makefile         |  2 +-
 arch/powerpc/lib/strlen_32.S      | 78 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/strlen_32.S

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 9b8cedf618f4..1647de15a31e 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -50,6 +50,8 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
 	return __memset64(p, v, n * 8);
 }
 #else
+#define __HAVE_ARCH_STRLEN
+
 extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
 #endif
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d0ca13ad8231..670286808928 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
 
 obj-y += string.o alloc.o code-patching.o feature-fixups.o
 
-obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o
+obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o strlen_32.o
 
 # See corresponding test in arch/powerpc/Makefile
 # 64-bit linker creates .sfpr on demand for final link (vmlinux),
diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S
new file mode 100644
index 000000000000..0a8d3f64d493
--- /dev/null
+++ b/arch/powerpc/lib/strlen_32.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * strlen() for PPC32
+ *
+ * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information.
+ *
+ * Inspired from glibc implementation
+ */
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+#include <asm/cache.h>
+
+	.text
+
+/*
+ * Algorithm:
+ *
+ * 1) Given a word 'x', we can test to see if it contains any 0 bytes
+ *    by subtracting 0x01010101, and seeing if any of the high bits of each
+ *    byte changed from 0 to 1. This works because the least significant
+ *    0 byte must have had no incoming carry (otherwise it's not the least
+ *    significant), so it is 0x00 - 0x01 == 0xff. For all other
+ *    byte values, either they have the high bit set initially, or when
+ *    1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ *    have their high bit set. The expression here is
+ *    (x - 0x01010101) & ~x & 0x80808080), which gives 0x00000000 when
+ *    there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+ *    match, but possibly false 0x80 matches in the next more significant
+ *    byte to a true match due to carries.  For little-endian this is
+ *    of no consequence since the least significant match is the one
+ *    we're interested in, but big-endian needs method 2 to find which
+ *    byte matches.
+ * 2) Given a word 'x', we can test to see _which_ byte was zero by
+ *    calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080).
+ *    This produces 0x80 in each byte that was zero, and 0x00 in all
+ *    the other bytes. The '| ~0x80808080' clears the low 7 bits in each
+ *    byte, and the '| x' part ensures that bytes with the high bit set
+ *    produce 0x00. The addition will carry into the high bit of each byte
+ *    iff that byte had one of its low 7 bits set. We can then just see
+ *    which was the most significant bit set and divide by 8 to find how
+ *    many to add to the index.
+ *    This is from the book 'The PowerPC Compiler Writer's Guide',
+ *    by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
+ */
+
+_GLOBAL(strlen)
+	andi.   r0, r3, 3
+	lis	r7, 0x0101
+	addi	r10, r3, -4
+	addic	r7, r7, 0x0101	/* r7 = 0x01010101 (lomagic) & clear XER[CA] */
+	rotlwi	r6, r7, 31 	/* r6 = 0x80808080 (himagic) */
+	bne-	3f
+	.balign IFETCH_ALIGN_BYTES
+1:	lwzu	r9, 4(r10)
+2:	subf	r8, r7, r9
+	and.	r8, r8, r6
+	beq+	1b
+	andc.	r8, r8, r9
+	beq+	1b
+	andc	r8, r9, r6
+	orc	r9, r9, r6
+	subfe	r8, r6, r8
+	nor	r8, r8, r9
+	cntlzw	r8, r8
+	subf	r3, r3, r10
+	srwi	r8, r8, 3
+	add	r3, r3, r8
+	blr
+
+	/* Missaligned string: make sure bytes before string are seen not 0 */
+3:	xor	r10, r10, r0
+	orc	r8, r8, r8
+	lwzu	r9, 4(r10)
+	slwi	r0, r0, 3
+	srw	r8, r8, r0
+	orc	r9, r9, r8
+	b	2b
+EXPORT_SYMBOL(strlen)
-- 
2.13.3

^ permalink raw reply related

* [PATCH v9 4/4] selftests/powerpc: update strlen() test to test the new assembly function for PPC32
From: Christophe Leroy @ 2018-08-02  8:26 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	wei.guo.simon, segher
  Cc: linux-kernel, linuxppc-dev
In-Reply-To: <b3d7b6cdb89a48be06a2630bf0d762d9d17d931f.1533198182.git.christophe.leroy@c-s.fr>

This patch adds a test for testing the new assembly strlen() for PPC32

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 v9: fixed relevant checkpatch warnings
 v8: removed defines in ppc_asm.h that were added in v6 (not used anymore since v7) ; added missing link to strlen_32.S
 v7: reduced the scope to PPC32
 v6: added additional necessary defines in ppc_asm.h
 v5: no change
 v4: new

 tools/testing/selftests/powerpc/stringloops/Makefile    | 5 ++++-
 tools/testing/selftests/powerpc/stringloops/asm/cache.h | 3 +++
 tools/testing/selftests/powerpc/stringloops/strlen_32.S | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/cache.h
 create mode 120000 tools/testing/selftests/powerpc/stringloops/strlen_32.S

diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
index 779b644461c4..9e510de2c07d 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -13,9 +13,12 @@ $(OUTPUT)/memcmp_32: CFLAGS += -m32
 $(OUTPUT)/strlen: strlen.c string.o
 $(OUTPUT)/string.o: string.c
 
+$(OUTPUT)/strlen_32: strlen.c
+$(OUTPUT)/strlen_32: CFLAGS += -m32
+
 ASFLAGS = $(CFLAGS)
 
-TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen
+TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen strlen_32
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/cache.h b/tools/testing/selftests/powerpc/stringloops/asm/cache.h
new file mode 100644
index 000000000000..38685554b6c1
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/cache.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define	IFETCH_ALIGN_BYTES 4
diff --git a/tools/testing/selftests/powerpc/stringloops/strlen_32.S b/tools/testing/selftests/powerpc/stringloops/strlen_32.S
new file mode 120000
index 000000000000..72b13731b24c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/strlen_32.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/strlen_32.S
\ No newline at end of file
-- 
2.13.3

^ permalink raw reply related

* Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig
From: Geert Uytterhoeven @ 2018-08-02  8:57 UTC (permalink / raw)
  To: alex.bou9
  Cc: Christoph Hellwig, acolin, Linux MIPS Mailing List,
	Catalin Marinas, the arch/x86 maintainers, Will Deacon,
	Russell King, Linux Kernel Mailing List, jwalters, Andrew Morton,
	linuxppc-dev, Linux ARM
In-Reply-To: <fad8661c-cd8c-3a9c-ca03-5d2f63893a24@gmail.com>

Hi Alex,

On Wed, Aug 1, 2018 at 3:16 PM Alex Bounine <alex.bou9@gmail.com> wrote:
> On 2018-08-01 05:54 AM, Christoph Hellwig wrote:
> > On Tue, Jul 31, 2018 at 10:29:54AM -0400, Alexei Colin wrote:
> >> Platforms with a PCI bus will be offered the RapidIO menu since they may
> >> be want support for a RapidIO PCI device. Platforms without a PCI bus
> >> that might include a RapidIO IP block will need to "select HAS_RAPIDIO"
> >> in the platform-/machine-specific "config ARCH_*" Kconfig entry.
> >>
> >> Tested that kernel builds for arm64 with RapidIO subsystem and
> >> switch drivers enabled, also that the modules load successfully
> >> on a custom Aarch64 Qemu model.
> >
> > As said before, please include it from drivers/Kconfig so that _all_
> > architectures supporting PCI (or other Rapidio attachements) get it
> > and not some arbitrary selection of architectures.

+1

> As it was replied earlier this is not a random selection of
> architectures but only ones that implement support for RapidIO as system
> bus. If other architectures choose to adopt RapidIO we will include them
> as well.
>
> On some platforms RapidIO can be the only system bus available replacing
> PCI/PCIe or RapidIO can coexist with PCIe.
>
> As it is done now, RapidIO is configured in "Bus Options" (x86/PPC) or
> "Bus Support" (ARMs) sub-menu and from system configuration option it
> should be kept this way.
>
> Current location of RAPIDIO configuration option is familiar to users of
> PowerPC and x86 platforms, and is similarly available in some ARM
> manufacturers kernel code trees.
>
> drivers/Kconfig will be used for configuring drivers for peripheral
> RapidIO devices if/when such device drivers will be published.

Everything in drivers/rapidio/Kconfig depends on RAPIDIO (probably it should
use a big if RAPIDIO/endif instead), so it can just be included from
drivers/Kconfig now.

The sooner you do that, the less treewide changes are needed (currently
limited to mips, powerpc, and x86; your patch adds arm64).

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [PATCH v2] powerpc/mm: fix always true/false warning in slice.c
From: Christophe Leroy @ 2018-08-02  9:25 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, malat,
	aneesh.kumar
  Cc: linux-kernel, linuxppc-dev

This patch fixes the following warnings (obtained with make W=1).

arch/powerpc/mm/slice.c: In function 'slice_range_to_mask':
arch/powerpc/mm/slice.c:73:12: error: comparison is always true due to limited range of data type [-Werror=type-limits]
  if (start < SLICE_LOW_TOP) {
            ^
arch/powerpc/mm/slice.c:81:20: error: comparison is always false due to limited range of data type [-Werror=type-limits]
  if ((start + len) > SLICE_LOW_TOP) {
                    ^
arch/powerpc/mm/slice.c: In function 'slice_mask_for_free':
arch/powerpc/mm/slice.c:136:17: error: comparison is always true due to limited range of data type [-Werror=type-limits]
  if (high_limit <= SLICE_LOW_TOP)
                 ^
arch/powerpc/mm/slice.c: In function 'slice_check_range_fits':
arch/powerpc/mm/slice.c:185:12: error: comparison is always true due to limited range of data type [-Werror=type-limits]
  if (start < SLICE_LOW_TOP) {
            ^
arch/powerpc/mm/slice.c:195:39: error: comparison is always false due to limited range of data type [-Werror=type-limits]
  if (SLICE_NUM_HIGH && ((start + len) > SLICE_LOW_TOP)) {
                                       ^
arch/powerpc/mm/slice.c: In function 'slice_scan_available':
arch/powerpc/mm/slice.c:306:11: error: comparison is always true due to limited range of data type [-Werror=type-limits]
  if (addr < SLICE_LOW_TOP) {
           ^
arch/powerpc/mm/slice.c: In function 'get_slice_psize':
arch/powerpc/mm/slice.c:709:11: error: comparison is always true due to limited range of data type [-Werror=type-limits]
  if (addr < SLICE_LOW_TOP) {
           ^

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 v2: fixed issue reported by Michael

 arch/powerpc/mm/slice.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 9530c6db406a..c774d43e038c 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -62,6 +62,13 @@ static void slice_print_mask(const char *label, const struct slice_mask *mask) {
 
 #endif
 
+static inline bool slice_addr_is_low(unsigned long addr)
+{
+	u64 tmp = (u64)addr;
+
+	return tmp < SLICE_LOW_TOP;
+}
+
 static void slice_range_to_mask(unsigned long start, unsigned long len,
 				struct slice_mask *ret)
 {
@@ -71,7 +78,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len,
 	if (SLICE_NUM_HIGH)
 		bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
-	if (start < SLICE_LOW_TOP) {
+	if (slice_addr_is_low(start)) {
 		unsigned long mend = min(end,
 					 (unsigned long)(SLICE_LOW_TOP - 1));
 
@@ -79,7 +86,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len,
 			- (1u << GET_LOW_SLICE_INDEX(start));
 	}
 
-	if ((start + len) > SLICE_LOW_TOP) {
+	if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {
 		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
 		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
 		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
@@ -134,7 +141,7 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret,
 		if (!slice_low_has_vma(mm, i))
 			ret->low_slices |= 1u << i;
 
-	if (high_limit <= SLICE_LOW_TOP)
+	if (slice_addr_is_low(high_limit - 1))
 		return;
 
 	for (i = 0; i < GET_HIGH_SLICE_INDEX(high_limit); i++)
@@ -183,7 +190,7 @@ static bool slice_check_range_fits(struct mm_struct *mm,
 	unsigned long end = start + len - 1;
 	u64 low_slices = 0;
 
-	if (start < SLICE_LOW_TOP) {
+	if (slice_addr_is_low(start)) {
 		unsigned long mend = min(end,
 					 (unsigned long)(SLICE_LOW_TOP - 1));
 
@@ -193,7 +200,7 @@ static bool slice_check_range_fits(struct mm_struct *mm,
 	if ((low_slices & available->low_slices) != low_slices)
 		return false;
 
-	if (SLICE_NUM_HIGH && ((start + len) > SLICE_LOW_TOP)) {
+	if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {
 		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
 		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
 		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
@@ -304,7 +311,7 @@ static bool slice_scan_available(unsigned long addr,
 				 int end, unsigned long *boundary_addr)
 {
 	unsigned long slice;
-	if (addr < SLICE_LOW_TOP) {
+	if (slice_addr_is_low(addr)) {
 		slice = GET_LOW_SLICE_INDEX(addr);
 		*boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
 		return !!(available->low_slices & (1u << slice));
@@ -707,7 +714,7 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
 
 	VM_BUG_ON(radix_enabled());
 
-	if (addr < SLICE_LOW_TOP) {
+	if (slice_addr_is_low(addr)) {
 		psizes = mm->context.low_slices_psize;
 		index = GET_LOW_SLICE_INDEX(addr);
 	} else {
-- 
2.13.3

^ permalink raw reply related

* Re: [PATCH] powerpc/mm: fix always true/false warning in slice.c
From: Christophe Leroy @ 2018-08-02  9:30 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras, malat,
	aneesh.kumar
  Cc: linux-kernel, linuxppc-dev
In-Reply-To: <871scqwbbo.fsf@concordia.ellerman.id.au>



On 06/29/2018 02:55 AM, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
> 
>> This patch fixes the following warnings (obtained with make W=1).
>>
>> arch/powerpc/mm/slice.c: In function 'slice_range_to_mask':
>> arch/powerpc/mm/slice.c:73:12: error: comparison is always true due to limited range of data type [-Werror=type-limits]
>>    if (start < SLICE_LOW_TOP) {
> 
> Presumably only on 32-bit ?

Sure

> 
>> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
>> index 9530c6db406a..17c57760e06c 100644
>> --- a/arch/powerpc/mm/slice.c
>> +++ b/arch/powerpc/mm/slice.c
>> @@ -79,7 +86,7 @@ static void slice_range_to_mask(unsigned long start, unsigned long len,
>>   			- (1u << GET_LOW_SLICE_INDEX(start));
>>   	}
>>   
>> -	if ((start + len) > SLICE_LOW_TOP) {
>> +	if (!slice_addr_is_low(end)) {
>>   		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
>>   		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
>>   		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
> 
> This worries me.

I'll change it in v2 to:
  	if (SLICE_NUM_HIGH && !slice_addr_is_low(end)) {


> 
> By casting before the comparison in the helper you squash the compiler
> warning, but the code is still broken if (start + len) overflows.
> 
> Presumably that "never happens", but it just seems fishy.
> 
> The other similar check in that file does:
> 
>    if (SLICE_NUM_HIGH && ((start + len) > SLICE_LOW_TOP)) {
> 
> Where SLICE_NUM_HIGH == 0 on 32-bit.
> 
> 
> Could we fix the less than comparisons with SLICE_LOW_TOP with something
> similar, eg:
> 
> 	if (!SLICE_NUM_HIGH || start < SLICE_LOW_TOP) {
> 
> ie. limit them to the 64-bit code?

That's not enough to make GCC happy:

arch/powerpc/mm/slice.c: In function ‘slice_range_to_mask’:
arch/powerpc/mm/slice.c:74:31: error: comparison is always true due to 
limited range of data type [-Werror=type-limits]
   if (!SLICE_NUM_HIGH || start < SLICE_LOW_TOP) {

Christophe


> 
> cheers
> 

^ permalink raw reply

* Re: powerpc/64s/radix: Fix missing global invalidations when removing copro
From: Michael Ellerman @ 2018-08-02 13:01 UTC (permalink / raw)
  To: Frederic Barrat, linuxppc-dev, vaibhav, npiggin; +Cc: clombard, felix
In-Reply-To: <20180731132452.15994-1-fbarrat@linux.ibm.com>

On Tue, 2018-07-31 at 13:24:52 UTC, Frederic Barrat wrote:
> With the optimizations for TLB invalidation from commit 0cef77c7798a
> ("powerpc/64s/radix: flush remote CPUs out of single-threaded
> mm_cpumask"), the scope of a TLBI (global vs. local) can now be
> influenced by the value of the 'copros' counter of the memory context.
> 
> When calling mm_context_remove_copro(), the 'copros' counter is
> decremented first before flushing. It may have the unintended side
> effect of sending local TLBIs when we explicitly need global
> invalidations in this case. Thus breaking any nMMU user in a bad and
> unpredictable way.
> 
> Fix it by flushing first, before updating the 'copros' counter, so
> that invalidations will be global.
> 
> Fixes: 0cef77c7798a ("powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask")
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
> Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
> Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/cca19f0b684f4ed6aabf6ad07ae3e1

cheers

^ permalink raw reply

* [PATCH] ppc64_cpu: utilize cpu/present info to cope with dynamic sysfs
From: Pingfan Liu @ 2018-08-02 13:39 UTC (permalink / raw)
  To: powerpc-utils-devel
  Cc: Pingfan Liu, Tyrel Datwyler, Benjamin Herrenschmidt,
	Michael Ellerman, linuxppc-dev

At present, ppc64_cpu takes the assumption of statically contiguous cpu
ids, i.e from 0 to threads_in_system. This does not face problem, since
the kernel code ensures the continuity. But due to kexec-tools needs the
CPU_ADD/_REMOVE udev event message, instead of CPU_ONLINE/_OFFLINE, the
kernel will resort to register_cpu/unregister_cpu API to acheive this.
Now, unplugging a core will make a hole in cpu_present_mask, which breaks
the continuity. To address this, this patch utilizes the cpu/present to
build a bitmap, and iterate over bitmap to cope with discontinuity.
By this way, ppc64_cpu can work with old/new kernel.

Notes about the kexec-tools issue: (tested with Fedora28)
Some user space tools such as kexec-tools resorts to the event add/remove
to automatically rebuild dtb. If the dtb is not rebuilt correctly, we
may hang on 2nd kernel due to lack the info of boot-cpu-hwid in dtb.

The steps to trigger the bug: (suppose 8 threads/core)
    drmgr -c cpu -r -q 1
    systemctl restart kdump.service
    drmgr -c cpu -a -q 1
    taskset -c 11 sh -c "echo c > /proc/sysrq-trigger"

Then, failure info:
    [  205.299528] SysRq : Trigger a crash
    [  205.299551] Unable to handle kernel paging request for data at address 0x00000000
    [  205.299558] Faulting instruction address: 0xc0000000006001a0
    [  205.299564] Oops: Kernel access of bad area, sig: 11 [#1]
    [  205.299569] SMP NR_CPUS=2048 NUMA pSeries
    [-- cut --]
    [  205.301829] Sending IPI to other CPUs
    [  205.302846] IPI complete
    I'm in purgatory
          -- > hang up here

Cc: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
---
 src/ppc64_cpu.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 176 insertions(+), 29 deletions(-)

diff --git a/src/ppc64_cpu.c b/src/ppc64_cpu.c
index 34654b4..cd5997d 100644
--- a/src/ppc64_cpu.c
+++ b/src/ppc64_cpu.c
@@ -23,6 +23,7 @@
 #include <unistd.h>
 #include <string.h>
 #include <dirent.h>
+#include <malloc.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
@@ -49,7 +50,8 @@
 
 #define PPC64_CPU_VERSION	"1.2"
 
-#define SYSFS_CPUDIR	"/sys/devices/system/cpu/cpu%d"
+#define SYSFS_CPUDIR "/sys/devices/system/cpu"
+#define SYSFS_PERCPUDIR	"/sys/devices/system/cpu/cpu%d"
 #define SYSFS_SUBCORES	"/sys/devices/system/cpu/subcores_per_core"
 #define DSCR_DEFAULT_PATH "/sys/devices/system/cpu/dscr_default"
 #define INTSERV_PATH	"/proc/device-tree/cpus/%s/ibm,ppc-interrupt-server#s"
@@ -75,17 +77,161 @@ struct cpu_freq {
 
 static int threads_per_cpu = 0;
 static int cpus_in_system = 0;
-static int threads_in_system = 0;
 
 static int do_info(void);
 
+/* 64 bits system */
+#define BITS_PER_LONG	64
+#define BIT_MASK(nr)	(1UL << ((nr) % BITS_PER_LONG))
+#define BIT_WORD(nr)	((nr) / BITS_PER_LONG)
+
+static unsigned long *cpu_present_mask;
+static unsigned int max_cpu_id = (unsigned int)-1;
+
+/* @n: the position prior to the place to search */
+static unsigned int cpumask_next(int nr, unsigned long *addr)
+{
+	unsigned int bit_num, i, j;
+	unsigned long *p;
+
+	p = addr + BIT_WORD(nr);
+	for (i = nr+1; i < max_cpu_id; ) {
+		for (j = i % BITS_PER_LONG; j < BITS_PER_LONG; j++) {
+			if ((*p >> j) & 0x1) {
+				bit_num = BIT_WORD(i)*BITS_PER_LONG + j;
+				return bit_num;
+			}
+		}
+		p++;
+		i = ((i >> 6) + 1) << 6;
+	}
+	return -1;
+}
+
+#define for_each_cpu(cpu, mask)				\
+	for ((cpu) = -1;				\
+		(cpu) = cpumask_next((cpu), (mask)),	\
+		(cpu) < max_cpu_id;)
+
+static inline int test_bit(int nr, const unsigned long *addr)
+{
+	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+static inline void set_bit(int nr, const unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+	*p  |= mask;
+}
+
+static void set_bitmap(int start, int end, const unsigned long *addr)
+{
+	int i;
+
+	for ( i = start; i <= end; i++)
+		set_bit(i, addr);
+}
+
+/* @n: the place prior to search */
+static unsigned int cpumask_next_hthread(int nr, const unsigned long *mask)
+{
+	int i, start;
+
+	start = (nr/threads_per_cpu +1)*threads_per_cpu;
+	for (i = start; i < max_cpu_id; i += threads_per_cpu) {
+		if (test_bit(i, mask))
+			return i;
+	}
+	return -1;
+}
+
+/* @bitmap: allocated internally
+ * max_idx: the max cpu logical id
+ * return the num of bits in bitmap
+ */
+static int parse_cpu_mask(char *buf, int bz, unsigned long **bitmap,
+	unsigned int *max_idx)
+{
+	int a, b, i, bm_sz;
+	bool range = false;
+	char *s, *p;
+#define TMP_BUF_SIZE 32
+	char tbuf[TMP_BUF_SIZE];
+
+	a = b = i = 0;
+	/* get the max id in order to alloc bitmap */
+
+	for (s = p = buf + bz; s >= buf; s--) {
+		if (*s == '-' ||*s == ',') {
+			break;
+		}
+	}
+	memset(tbuf, '\0', TMP_BUF_SIZE);
+	memcpy(tbuf, s+1, p-s-1);
+	sscanf(tbuf, "%d", &b);
+	if (max_idx)
+		*max_idx = b;
+	/* in worst case waste 7 bytes */
+	bm_sz = (b + BITS_PER_LONG-1)/8;
+	*bitmap = memalign(sizeof(unsigned long), bm_sz);
+	memset(*bitmap, 0, bm_sz);
+
+	/* set the bitmap */
+
+	range = false;
+	for (s = p = buf; p - buf < bz; p++) {
+		if (*p == '-')
+			range = true;
+		if (*p == ',' || *p == '\n') {
+			memset(tbuf, '\0', TMP_BUF_SIZE);
+			memcpy(tbuf, s, p-s);
+			if (range) {
+				sscanf(tbuf, "%d-%d", &a, &b);
+				set_bitmap(a, b, *bitmap);
+				i += (b -a) +1;
+			} else {
+				sscanf(tbuf, "%d", &a);
+				set_bitmap(a, a, *bitmap);
+				i++;
+			}
+			range = false;
+			if (*p == ',' )
+				s = p + 1;
+			else
+				break;
+		}
+	}
+	return i;
+}
+
+static int get_cpu_present_mask(void)
+{
+	char path[SYSFS_PATH_MAX];
+	char buf[256] = {0};
+	int fd, sz, ret = 0;
+
+	sprintf(path, SYSFS_CPUDIR"/%s", "present");
+	fd = open(path, O_RDONLY);
+	sz = read(fd, buf, 256);
+	close(fd);
+	if (sz > 0)
+		parse_cpu_mask(buf, sz, &cpu_present_mask, &max_cpu_id);
+	else {
+		ret = -1;
+		printf("can not parse %s\n", path);
+	}
+	return ret;
+}
+
 static int test_sysattr(char *attribute, int perms)
 {
 	char path[SYSFS_PATH_MAX];
 	int i;
 
-	for (i = 0; i < threads_in_system; i++) {
-		sprintf(path, SYSFS_CPUDIR"/%s", i, attribute);
+	for_each_cpu(i, cpu_present_mask) {
+		sprintf(path, SYSFS_PERCPUDIR"/%s", i, attribute);
 		if (access(path, F_OK))
 			continue;
 
@@ -160,7 +306,7 @@ static int cpu_online(int thread)
 	char path[SYSFS_PATH_MAX];
 	int rc, online;
 
-	sprintf(path, SYSFS_CPUDIR"/online", thread);
+	sprintf(path, SYSFS_PERCPUDIR"/online", thread);
 	rc = get_attribute(path, "%d", &online);
 
 	/* This attribute does not exist in kernels without hotplug enabled */
@@ -180,13 +326,13 @@ static int get_system_attribute(char *attribute, const char *fmt, int *value,
 	int i, rc;
 	int system_attribute = -1;
 
-	for (i = 0; i < threads_in_system; i++) {
+	for_each_cpu(i, cpu_present_mask) {
 		int cpu_attribute;
 
 		if (!cpu_online(i))
 			continue;
 
-		sprintf(path, SYSFS_CPUDIR"/%s", i, attribute);
+		sprintf(path, SYSFS_PERCPUDIR"/%s", i, attribute);
 		rc = get_attribute(path, fmt, &cpu_attribute);
 		if (rc)
 			return rc;
@@ -208,8 +354,8 @@ static int set_system_attribute(char *attribute, const char *fmt, int state)
 	char path[SYSFS_PATH_MAX];
 	int i, rc;
 
-	for (i = 0; i < threads_in_system; i++) {
-		sprintf(path, SYSFS_CPUDIR"/%s", i, attribute);
+	for_each_cpu(i, cpu_present_mask) {
+		sprintf(path, SYSFS_PERCPUDIR"/%s", i, attribute);
 		rc = set_attribute(path, fmt, state);
 		/* When a CPU is offline some sysfs files are removed from the CPU
 		 * directory, for example smt_snooze_delay and dscr. The absence of the
@@ -360,14 +506,13 @@ static int get_cpu_info(void)
 	}
 
 	closedir(d);
-	threads_in_system = cpus_in_system * threads_per_cpu;
 
 	subcores = num_subcores();
 	if (is_subcore_capable() && subcores > 0) {
 		threads_per_cpu /= subcores;
 		cpus_in_system *= subcores;
 	}
-	return 0;
+	return get_cpu_present_mask();
 }
 
 static int is_smt_capable(void)
@@ -376,8 +521,8 @@ static int is_smt_capable(void)
 	char path[SYSFS_PATH_MAX];
 	int i;
 
-	for (i = 0; i < threads_in_system; i++) {
-		sprintf(path, SYSFS_CPUDIR"/smt_snooze_delay", i);
+	for_each_cpu(i, cpu_present_mask) {
+		sprintf(path, SYSFS_PERCPUDIR"/smt_snooze_delay", i);
 		if (stat(path, &sb))
 			continue;
 		return 1;
@@ -431,7 +576,7 @@ static int set_one_smt_state(int thread, int online_threads)
 	int i, rc = 0;
 
 	for (i = 0; i < threads_per_cpu; i++) {
-		snprintf(path, SYSFS_PATH_MAX, SYSFS_CPUDIR"/%s", thread + i,
+		snprintf(path, SYSFS_PATH_MAX, SYSFS_PERCPUDIR"/%s", thread + i,
 			 "online");
 		if (i < online_threads)
 			rc = online_thread(path);
@@ -452,7 +597,8 @@ static int set_one_smt_state(int thread, int online_threads)
 
 static int set_smt_state(int smt_state)
 {
-	int i, j, rc;
+	unsigned int i;
+	int j, rc;
 	int ssd, update_ssd = 1;
 	int inconsistent = 0;
 	int error = 0;
@@ -465,8 +611,9 @@ static int set_smt_state(int smt_state)
 	rc = get_smt_snooze_delay(&ssd, &inconsistent);
 	if (rc)
 		update_ssd = 0;
+	if (smt_state )
 
-	for (i = 0; i < threads_in_system; i += threads_per_cpu) {
+	for (i = 0; i < max_cpu_id; ) {
 		/* Online means any thread on this core running, so check all
 		 * threads in the core, not just the first. */
 		for (j = 0; j < threads_per_cpu; j++) {
@@ -481,6 +628,7 @@ static int set_smt_state(int smt_state)
 				error = 1;
 			break;
 		}
+		i = cpumask_next_hthread(i, cpu_present_mask);
 	}
 
 	if (update_ssd)
@@ -501,9 +649,8 @@ static int is_dscr_capable(void)
 
 	if (dscr_default_exists())
 		return 1;
-
-	for (i = 0; i < threads_in_system; i++) {
-		sprintf(path, SYSFS_CPUDIR"/dscr", i);
+	for_each_cpu(i, cpu_present_mask) {
+		sprintf(path, SYSFS_PERCPUDIR"/dscr", i);
 		if (stat(path, &sb))
 			continue;
 		return 1;
@@ -863,7 +1010,7 @@ static int setup_counters(struct cpu_freq *cpu_freqs)
 	/* Record how long the event ran for */
 	attr.read_format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
 
-	for (i = 0; i < threads_in_system; i++) {
+	for_each_cpu(i, cpu_present_mask) {
 		if (!cpu_online(i)) {
 			cpu_freqs[i].offline = 1;
 			continue;
@@ -890,7 +1037,7 @@ static void start_counters(struct cpu_freq *cpu_freqs)
 {
 	int i;
 
-	for (i = 0; i < threads_in_system; i++) {
+	for_each_cpu(i, cpu_present_mask) {
 		if (cpu_freqs[i].offline)
 			continue;
 
@@ -902,7 +1049,7 @@ static void stop_counters(struct cpu_freq *cpu_freqs)
 {
 	int i;
 
-	for (i = 0; i < threads_in_system; i++) {
+	for_each_cpu(i, cpu_present_mask) {
 		if (cpu_freqs[i].offline)
 			continue;
 
@@ -920,7 +1067,7 @@ static void read_counters(struct cpu_freq *cpu_freqs)
 	int i;
 	struct read_format vals;
 
-	for (i = 0; i < threads_in_system; i++) {
+	for_each_cpu(i, cpu_present_mask) {
 		size_t res;
 
 		if (cpu_freqs[i].offline)
@@ -945,7 +1092,7 @@ static void check_threads(struct cpu_freq *cpu_freqs)
 {
 	int i;
 
-	for (i = 0; i < threads_in_system; i++) {
+	for_each_cpu(i, cpu_present_mask) {
 		if (cpu_freqs[i].offline)
 			continue;
 
@@ -1051,7 +1198,7 @@ static void report_system_power_mode(void)
 static void setrlimit_open_files(void)
 {
 	struct rlimit old_rlim, new_rlim;
-	int new = threads_in_system + 8;
+	int new = max_cpu_id + 8;
 
 	getrlimit(RLIMIT_NOFILE, &old_rlim);
 
@@ -1077,7 +1224,7 @@ static int do_cpu_frequency(int sleep_time)
 
 	setrlimit_open_files();
 
-	cpu_freqs = calloc(threads_in_system, sizeof(*cpu_freqs));
+	cpu_freqs = calloc(max_cpu_id, sizeof(*cpu_freqs));
 	if (!cpu_freqs)
 		return -ENOMEM;
 
@@ -1088,7 +1235,7 @@ static int do_cpu_frequency(int sleep_time)
 	}
 
 	/* Start a soak thread on each CPU */
-	for (i = 0; i < threads_in_system; i++) {
+	for_each_cpu(i, cpu_present_mask) {
 		if (cpu_freqs[i].offline)
 			continue;
 
@@ -1111,7 +1258,7 @@ static int do_cpu_frequency(int sleep_time)
 	check_threads(cpu_freqs);
 	read_counters(cpu_freqs);
 
-	for (i = 0; i < threads_in_system; i++) {
+	for_each_cpu(i, cpu_present_mask) {
 		double frequency;
 
 		if (cpu_freqs[i].offline)
@@ -1163,7 +1310,7 @@ static int set_all_threads_off(int cpu, int smt_state)
 	int rc = 0;
 
 	for (i = cpu + smt_state - 1; i >= cpu; i--) {
-		snprintf(path, SYSFS_PATH_MAX, SYSFS_CPUDIR"/%s", i, "online");
+		snprintf(path, SYSFS_PATH_MAX, SYSFS_PERCPUDIR"/%s", i, "online");
 		rc = offline_thread(path);
 		if (rc == -1)
 			printf("Unable to take cpu%d offline", i);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 1/2] powerpc/cpuidle: dynamically register/unregister cpuidle_device during hotplug
From: Pingfan Liu @ 2018-08-02 13:42 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Pingfan Liu, Benjamin Herrenschmidt, Michael Ellerman,
	Rafael J. Wysocki, Tyrel Datwyler, linux-pm

cpuidle_device is touched during the cpu hotplug. At present, ppc64 just
online/offline cpu during hotplug/unplug. But if using the
register_cpu/unregister_cpu() API to implement the hotplug, the dir
/sys/../cpuX is created/destroyed during hotplug, hence we also need to
create the file cpuX/cpuidle dynamically.

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Cc: linux-pm@vger.kernel.org
Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
---
 drivers/cpuidle/cpuidle-powernv.c | 2 ++
 drivers/cpuidle/cpuidle-pseries.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index d29e4f0..94d0def 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -166,6 +166,7 @@ static int powernv_cpuidle_cpu_online(unsigned int cpu)
 	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
 
 	if (dev && cpuidle_get_driver()) {
+		cpuidle_register_device(dev);
 		cpuidle_pause_and_lock();
 		cpuidle_enable_device(dev);
 		cpuidle_resume_and_unlock();
@@ -181,6 +182,7 @@ static int powernv_cpuidle_cpu_dead(unsigned int cpu)
 		cpuidle_pause_and_lock();
 		cpuidle_disable_device(dev);
 		cpuidle_resume_and_unlock();
+		cpuidle_unregister_device(dev);
 	}
 	return 0;
 }
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 9e56bc4..a53be8a 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -193,6 +193,7 @@ static int pseries_cpuidle_cpu_online(unsigned int cpu)
 	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
 
 	if (dev && cpuidle_get_driver()) {
+		cpuidle_register_device(dev);
 		cpuidle_pause_and_lock();
 		cpuidle_enable_device(dev);
 		cpuidle_resume_and_unlock();
@@ -208,6 +209,7 @@ static int pseries_cpuidle_cpu_dead(unsigned int cpu)
 		cpuidle_pause_and_lock();
 		cpuidle_disable_device(dev);
 		cpuidle_resume_and_unlock();
+		cpuidle_unregister_device(dev);
 	}
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/2] powerpc/cpu: post the event cpux add/remove instead of online/offline during hotplug
From: Pingfan Liu @ 2018-08-02 13:42 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Pingfan Liu, Benjamin Herrenschmidt, Michael Ellerman,
	Rafael J. Wysocki, Tyrel Datwyler, linux-pm
In-Reply-To: <1533217359-11420-1-git-send-email-kernelfans@gmail.com>

Technically speaking, echo 1/0 > cpuX/online is only a subset of cpu
hotplug/unplug, i.e. add/remove. The latter one includes the physical
adding/removing of a cpu device. Some user space tools such as kexec-tools
resort to the event add/remove to automatically rebuild dtb.
If the dtb is not rebuilt correctly, we may hang on 2nd kernel due to
lack the info of boot-cpu-hwid in dtb.

The steps to trigger the bug: (suppose 8 threads/core)
    drmgr -c cpu -r -q 1
    systemctl restart kdump.service
    drmgr -c cpu -a -q 1
    taskset -c 11 sh -c "echo c > /proc/sysrq-trigger"

Then, failure info:
    [  205.299528] SysRq : Trigger a crash
    [  205.299551] Unable to handle kernel paging request for data at address 0x00000000
    [  205.299558] Faulting instruction address: 0xc0000000006001a0
    [  205.299564] Oops: Kernel access of bad area, sig: 11 [#1]
    [  205.299569] SMP NR_CPUS=2048 NUMA pSeries
    [  205.299575] Modules linked in: macsec sctp_diag sctp tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6
    xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw iptable_nat
    nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter xfs libcrc32c sg
    pseries_rng binfmt_misc ip_tables ext4 mbcache jbd2 sd_mod crc_t10dif crct10dif_generic crct10dif_common ibmvscsi scsi_transport_srp ibmveth scsi_tgt dm_mirror dm_region_hash dm_log dm_mod
    [  205.299658] CPU: 11 PID: 2521 Comm: bash Not tainted 3.10.0-799.el7.ppc64le #1
    [  205.299664] task: c00000017bcd15e0 ti: c00000014f410000 task.ti: c00000014f410000
    [  205.299670] NIP: c0000000006001a0 LR: c000000000600ddc CTR: c000000000600180
    [  205.299676] REGS: c00000014f413a70 TRAP: 0300   Not tainted  (3.10.0-799.el7.ppc64le)
    [  205.299681] MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 28222822  XER: 00000001
    [  205.299696] CFAR: c000000000009368 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 1
    GPR00: c000000000600dbc c00000014f413cf0 c000000001263200 0000000000000063
    GPR04: c0000000019ca818 c0000000019db5f8 00000000000000c2 c00000000140aa30
    GPR08: 0000000000000007 0000000000000001 0000000000000000 c00000000140fc60
    GPR12: c000000000600180 c000000007b36300 0000000010139e58 0000000040000000
    GPR16: 000000001013b5d0 0000000000000000 00000000101306fc 0000000010139de4
    GPR20: 0000000010139de8 0000000010093150 0000000000000000 0000000000000000
    GPR24: 000000001013b5e0 00000000100fa0e8 0000000000000007 c0000000011af1c8
    GPR28: 0000000000000063 c0000000011af588 c000000001179ba8 0000000000000002
    [  205.299770] NIP [c0000000006001a0] sysrq_handle_crash+0x20/0x30
    [  205.299776] LR [c000000000600ddc] write_sysrq_trigger+0x10c/0x230
    [  205.299781] Call Trace:
    [  205.299786] [c00000014f413cf0] [c000000000600dbc] write_sysrq_trigger+0xec/0x230 (unreliable)
    [  205.299794] [c00000014f413d90] [c0000000003eb2c4] proc_reg_write+0x84/0x120
    [  205.299801] [c00000014f413dd0] [c000000000330a80] SyS_write+0x150/0x400
    [  205.299808] [c00000014f413e30] [c00000000000a184] system_call+0x38/0xb4
    [  205.299813] Instruction dump:
    [  205.299816] 409effb8 7fc3f378 4bfff381 4bffffac 3c4c00c6 38423080 3d42fff1 394a6930
    [  205.299827] 39200001 912a0000 7c0004ac 39400000 <992a0000> 4e800020 60000000 60420000
    [  205.299838] ---[ end trace f590a5dbd3f63aab ]---
    [  205.301812]
    [  205.301829] Sending IPI to other CPUs
    [  205.302846] IPI complete
    I'm in purgatory
          -- > hang up here

This patch uses the interface register_/unregister_cpu to fix the problem.

Test the compatibility with ppc64_cpu on a powerKVM guest, with the
following topo:
  Thread(s) per core:  8
  Core(s) per socket:  2
  Socket(s):           2
  NUMA node(s):        1
and the following instructions:
  ppc64_cpu --smt=off
  drmgr -c cpu -r 1
  drmgr -c cpu -a 1
  ppc64_cpu --smt=on
or
  drmgr -c cpu -r 1
  ppc64_cpu --smt=off
  drmgr -c cpu -a 1
  ppc64_cpu --smt=on

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Cc: linux-pm@vger.kernel.org
Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
---
v2 -> v3
  create sysfs file "dev_attr_physical_id" before cpu online callbacks,
  hence it can work with ppc64_cpu
---
 arch/powerpc/include/asm/setup.h             |  4 +++
 arch/powerpc/include/asm/smp.h               |  1 +
 arch/powerpc/kernel/sysfs.c                  | 38 +++++++++++++++++++++++-----
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 16 +++++++++++-
 4 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index 8721fd0..e0597f2 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -40,6 +40,10 @@ static inline void pseries_big_endian_exceptions(void) {}
 static inline void pseries_little_endian_exceptions(void) {}
 #endif /* CONFIG_PPC_PSERIES */
 
+extern bool cpudev_is_dummy(int cpu);
+extern int register_ppc_cpu(int cpu);
+extern int unregister_ppc_cpu(int cpu);
+
 void rfi_flush_enable(bool enable);
 
 /* These are bit flags */
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 29ffaab..344180a 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -36,6 +36,7 @@ extern u32 *cpu_to_phys_id;
 extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
 
+DECLARE_PER_CPU(struct cpu, cpu_devices);
 #ifdef CONFIG_SMP
 
 struct smp_ops_t {
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 755dc98..2f6c9f9 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -27,7 +27,7 @@
 #include <asm/lppaca.h>
 #endif
 
-static DEFINE_PER_CPU(struct cpu, cpu_devices);
+DEFINE_PER_CPU(struct cpu, cpu_devices);
 
 /*
  * SMT snooze delay stuff, 64-bit only for now
@@ -1025,6 +1025,35 @@ static ssize_t show_physical_id(struct device *dev,
 }
 static DEVICE_ATTR(physical_id, 0444, show_physical_id, NULL);
 
+/* unplugged cpu is true */
+bool cpudev_is_dummy(int cpu)
+{
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+	return !kref_read(&c->dev.kobj.kref);
+}
+
+/* create all files in sysfs, which can not be postponed till cpu online
+ * callbacks
+ */
+int register_ppc_cpu(int cpu)
+{
+	struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+	register_cpu(c, cpu);
+	device_create_file(&c->dev, &dev_attr_physical_id);
+	return 0;
+}
+
+int unregister_ppc_cpu(int cpu)
+{
+	device_remove_file(&per_cpu(cpu_devices, cpu).dev,
+		&dev_attr_physical_id);
+	unregister_cpu(container_of(get_cpu_device(cpu),
+		struct cpu, dev));
+	return 0;
+}
+
 static int __init topology_init(void)
 {
 	int cpu, r;
@@ -1044,11 +1073,8 @@ static int __init topology_init(void)
 		if (ppc_md.cpu_die)
 			c->hotpluggable = 1;
 
-		if (cpu_online(cpu) || c->hotpluggable) {
-			register_cpu(c, cpu);
-
-			device_create_file(&c->dev, &dev_attr_physical_id);
-		}
+		if (cpu_online(cpu) || c->hotpluggable)
+			register_ppc_cpu(cpu);
 	}
 	r = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/topology:online",
 			      register_cpu_online, unregister_cpu_online);
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 6ef77ca..f361fc8 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -37,6 +37,7 @@
 #include <asm/xive.h>
 #include <asm/plpar_wrappers.h>
 #include <asm/topology.h>
+#include <asm/setup.h>
 
 #include "pseries.h"
 #include "offline_states.h"
@@ -367,6 +368,11 @@ static int dlpar_online_cpu(struct device_node *dn)
 			cpu_maps_update_done();
 			timed_topology_update(1);
 			find_and_online_cpu_nid(cpu);
+			/* protect against the offline's failure,
+			 * then re-online
+			 */
+			if (cpudev_is_dummy(cpu))
+				register_ppc_cpu(cpu);
 			rc = device_online(get_cpu_device(cpu));
 			if (rc)
 				goto out;
@@ -530,8 +536,12 @@ static int dlpar_offline_cpu(struct device_node *dn)
 			if (get_hard_smp_processor_id(cpu) != thread)
 				continue;
 
-			if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
+			if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) {
+				cpu_maps_update_done();
+				unregister_ppc_cpu(cpu);
+				cpu_maps_update_begin();
 				break;
+			}
 
 			if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
 				set_preferred_offline_state(cpu,
@@ -541,6 +551,7 @@ static int dlpar_offline_cpu(struct device_node *dn)
 				rc = device_offline(get_cpu_device(cpu));
 				if (rc)
 					goto out;
+				unregister_ppc_cpu(cpu);
 				cpu_maps_update_begin();
 				break;
 
@@ -554,6 +565,9 @@ static int dlpar_offline_cpu(struct device_node *dn)
 			BUG_ON(plpar_hcall_norets(H_PROD, thread)
 								!= H_SUCCESS);
 			__cpu_die(cpu);
+			cpu_maps_update_done();
+			unregister_ppc_cpu(cpu);
+			cpu_maps_update_begin();
 			break;
 		}
 		if (cpu == num_possible_cpus())
-- 
2.7.4

^ permalink raw reply related

* Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig
From: Alexei Colin @ 2018-08-02 13:45 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: alex.bou9, Christoph Hellwig, Linux MIPS Mailing List,
	Catalin Marinas, the arch/x86 maintainers, Will Deacon,
	Russell King, Linux Kernel Mailing List, jwalters, Andrew Morton,
	linuxppc-dev, Linux ARM
In-Reply-To: <CAMuHMdVDra1MKcuuD0SqEYXSggr0iVFcbcjL33z7JuiE1_y8yw@mail.gmail.com>

On Thu, Aug 02, 2018 at 10:57:00AM +0200, Geert Uytterhoeven wrote:
> On Wed, Aug 1, 2018 at 3:16 PM Alex Bounine <alex.bou9@gmail.com> wrote:
> > On 2018-08-01 05:54 AM, Christoph Hellwig wrote:
> > > On Tue, Jul 31, 2018 at 10:29:54AM -0400, Alexei Colin wrote:
> > >> Platforms with a PCI bus will be offered the RapidIO menu since they may
> > >> be want support for a RapidIO PCI device. Platforms without a PCI bus
> > >> that might include a RapidIO IP block will need to "select HAS_RAPIDIO"
> > >> in the platform-/machine-specific "config ARCH_*" Kconfig entry.
> > >>
> > >> Tested that kernel builds for arm64 with RapidIO subsystem and
> > >> switch drivers enabled, also that the modules load successfully
> > >> on a custom Aarch64 Qemu model.
> > >
> > > As said before, please include it from drivers/Kconfig so that _all_
> > > architectures supporting PCI (or other Rapidio attachements) get it
> > > and not some arbitrary selection of architectures.
> 
> +1
> 
> > As it was replied earlier this is not a random selection of
> > architectures but only ones that implement support for RapidIO as system
> > bus. If other architectures choose to adopt RapidIO we will include them
> > as well.
> >
> > On some platforms RapidIO can be the only system bus available replacing
> > PCI/PCIe or RapidIO can coexist with PCIe.
> >
> > As it is done now, RapidIO is configured in "Bus Options" (x86/PPC) or
> > "Bus Support" (ARMs) sub-menu and from system configuration option it
> > should be kept this way.
> >
> > Current location of RAPIDIO configuration option is familiar to users of
> > PowerPC and x86 platforms, and is similarly available in some ARM
> > manufacturers kernel code trees.
> >
> > drivers/Kconfig will be used for configuring drivers for peripheral
> > RapidIO devices if/when such device drivers will be published.
> 
> Everything in drivers/rapidio/Kconfig depends on RAPIDIO (probably it should
> use a big if RAPIDIO/endif instead), so it can just be included from
> drivers/Kconfig now.
> 
> The sooner you do that, the less treewide changes are needed (currently
> limited to mips, powerpc, and x86; your patch adds arm64).

If I move RapidIO option to drivers/Kconfig, then it won't appear under
the Bus Options/System Support menu, along with other choices for the
system bus (PCI, PCMCIA, ISA, TC, ...). Alex explains above that RapidIO
may be selected as a system bus on some architectures, and users expect
it to be in the menu in which it has been for some time now.  What
problem is the current organization causing?

This patch does not intend to propose any changes to the current
organization of the menus, if such changes are needed, another patch can
be made with that purpose. What problem is this patch introducing?

^ permalink raw reply

* Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig
From: Alexei Colin @ 2018-08-02 14:00 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Geert Uytterhoeven, alex.bou9, Linux MIPS Mailing List,
	Catalin Marinas, the arch/x86 maintainers, Will Deacon,
	Russell King, Linux Kernel Mailing List, jwalters, Andrew Morton,
	linuxppc-dev, Linux ARM
In-Reply-To: <20180802135421.GA13512@infradead.org>

On Thu, Aug 02, 2018 at 06:54:21AM -0700, Christoph Hellwig wrote:
> On Thu, Aug 02, 2018 at 09:45:44AM -0400, Alexei Colin wrote:
> > If I move RapidIO option to drivers/Kconfig, then it won't appear under
> > the Bus Options/System Support menu, along with other choices for the
> > system bus (PCI, PCMCIA, ISA, TC, ...).
> 
> It's not like anyone cares.  And FYI, moving all those to
> drivers/Kconfig is osmething I will submit for the next merge window.

Ok, I would appreciate a CC on that patch. After it lands, if
there is time, I'll resubmit a rebased version of this patch.

^ permalink raw reply

* Re: [RFC PATCH 3/3] cpuidle/powernv: Conditionally save-restore sprs using opal
From: Nicholas Piggin @ 2018-08-02 14:05 UTC (permalink / raw)
  To: Akshay Adiga; +Cc: linux-kernel, linuxppc-dev, mpe, benh, ego, huntbag
In-Reply-To: <20180802045132.12432-4-akshay.adiga@linux.vnet.ibm.com>

On Thu,  2 Aug 2018 10:21:32 +0530
Akshay Adiga <akshay.adiga@linux.vnet.ibm.com> wrote:

> From: Abhishek Goel <huntbag@linux.vnet.ibm.com>
> 
> If a state has "opal-supported" compat flag in device-tree, an opal call
> needs to be made during the entry and exit of the stop state. This patch
> passes a hint to the power9_idle_stop and power9_offline_stop.
> 
> This patch moves the saving and restoring of sprs for P9 cpuidle
> from kernel to opal. This patch still uses existing code to detect
> first thread in core.
> In an attempt to make the powernv idle code backward compatible,
> and to some extent forward compatible, add support for pre-stop entry
> and post-stop exit actions in OPAL. If a kernel knows about this
> opal call, then just a firmware supporting newer hardware is required,
> instead of waiting for kernel updates.

Still think we should make these do-everything calls. Including
executing nap/stop instructions, restoring timebase, possibly even
saving and restoring SLB (although a return code could be used to
tell the kernel to do that maybe if performance advantage is enough).

I haven't had a lot of time to go through it, I'm working on moving
~all of idle_book3s.S to C code, I'd like to do that before this
OPAL idle driver if possible.

A minor thing I just noticed, you don't have to allocate the opal
spr save space in Linux, just do it all in OPAL.

Thanks,
Nick

^ permalink raw reply

* Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig
From: Alex Bounine @ 2018-08-02 14:16 UTC (permalink / raw)
  To: Alexei Colin
  Cc: Christoph Hellwig, Geert Uytterhoeven, Linux MIPS Mailing List,
	Catalin Marinas, the arch/x86 maintainers, Will Deacon,
	Russell King, Linux Kernel Mailing List, jwalters, Andrew Morton,
	linuxppc-dev, Linux ARM
In-Reply-To: <20180802140033.GH38497@guest228.east.isi.edu>

I will experiment with this idea.
Please keep me in CC as well.

Sent from my iPad

> On Aug 2, 2018, at 10:00 AM, Alexei Colin <acolin@isi.edu> wrote:
> 
>> On Thu, Aug 02, 2018 at 06:54:21AM -0700, Christoph Hellwig wrote:
>>> On Thu, Aug 02, 2018 at 09:45:44AM -0400, Alexei Colin wrote:
>>> If I move RapidIO option to drivers/Kconfig, then it won't appear under
>>> the Bus Options/System Support menu, along with other choices for the
>>> system bus (PCI, PCMCIA, ISA, TC, ...).
>> 
>> It's not like anyone cares.  And FYI, moving all those to
>> drivers/Kconfig is osmething I will submit for the next merge window.
> 
> Ok, I would appreciate a CC on that patch. After it lands, if
> there is time, I'll resubmit a rebased version of this patch.

^ permalink raw reply

* Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig
From: Christoph Hellwig @ 2018-08-02 13:54 UTC (permalink / raw)
  To: Alexei Colin
  Cc: Geert Uytterhoeven, alex.bou9, Christoph Hellwig,
	Linux MIPS Mailing List, Catalin Marinas,
	the arch/x86 maintainers, Will Deacon, Russell King,
	Linux Kernel Mailing List, jwalters, Andrew Morton, linuxppc-dev,
	Linux ARM
In-Reply-To: <20180802134544.GG38497@guest228.east.isi.edu>

On Thu, Aug 02, 2018 at 09:45:44AM -0400, Alexei Colin wrote:
> If I move RapidIO option to drivers/Kconfig, then it won't appear under
> the Bus Options/System Support menu, along with other choices for the
> system bus (PCI, PCMCIA, ISA, TC, ...).

It's not like anyone cares.  And FYI, moving all those to
drivers/Kconfig is osmething I will submit for the next merge window.

> Alex explains above that RapidIO
> may be selected as a system bus on some architectures, and users expect
> it to be in the menu in which it has been for some time now.  What
> problem is the current organization causing?

A "system bus" seems like a rather outdated concept to start with.

^ permalink raw reply

* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Benjamin Herrenschmidt @ 2018-08-02 15:24 UTC (permalink / raw)
  To: Christoph Hellwig, Will Deacon
  Cc: Michael S. Tsirkin, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, linuxram, haren, paulus, srikar, robin.murphy,
	jean-philippe.brucker, marc.zyngier
In-Reply-To: <20180801083639.GF26378@infradead.org>

On Wed, 2018-08-01 at 01:36 -0700, Christoph Hellwig wrote:
> We just need to figure out how to deal with devices that deviate
> from the default.  One things is that VIRTIO_F_IOMMU_PLATFORM really
> should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu
> dma tweaks (offsets, cache flushing), which seems well in spirit of
> the original design. 

I don't completely agree:

1 - VIRTIO_F_IOMMU_PLATFORM is a property of the "other side", ie qemu
for example. It indicates that the peer bypasses the normal platform
iommu. The platform code in the guest has no real way to know that this
is the case, this is a specific "feature" of the qemu implementation.

2 - VIRTIO_F_PLATFORM_DMA (or whatever you want to call it), is a
property of the guest platform itself (not qemu), there's no way the
"peer" can advertize it via the virtio negociated flags. At least for
us. I don't know for sure whether that would be workable for the ARM
case. In our case, qemu has no idea at VM creation time that the VM
will turn itself into a secure VM and thus will require bounce
buffering for IOs (including virtio).

So unless we have another hook for the arch code to set
VIRTIO_F_PLATFORM_DMA on selected (or all) virtio devices from the
guest itself, I don't see that as a way to deal with it.

>  The other issue is VIRTIO_F_IO_BARRIER
> which is very vaguely defined, and which needs a better definition.
> And last but not least we'll need some text explaining the challenges
> of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
> is what would basically cover them, but a good description including
> an explanation of why these matter.

Ben.

^ permalink raw reply

* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Benjamin Herrenschmidt @ 2018-08-02 15:33 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christoph Hellwig, Will Deacon, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, linuxram, haren, paulus, srikar, robin.murphy,
	jean-philippe.brucker, marc.zyngier
In-Reply-To: <20180802003823-mutt-send-email-mst@kernel.org>

On Thu, 2018-08-02 at 00:56 +0300, Michael S. Tsirkin wrote:
> > but it's not, VMs are
> > created in "legacy" mode all the times and we don't know at VM creation
> > time whether it will become a secure VM or not. The way our secure VMs
> > work is that they start as a normal VM, load a secure "payload" and
> > call the Ultravisor to "become" secure.
> > 
> > So we're in a bit of a bind here. We need that one-liner optional arch
> > hook to make virtio use swiotlb in that "IOMMU bypass" case.
> > 
> > Ben.
> 
> And just to make sure I understand, on your platform DMA APIs do include
> some of the cache flushing tricks and this is why you don't want to
> declare iommu support in the hypervisor?

I'm not sure I parse what you mean.

We don't need cache flushing tricks. The problem we have with our
"secure" VMs is that:

 - At VM creation time we have no idea it's going to become a secure
VM, qemu doesn't know anything about it, and thus qemu (or other
management tools, libvirt etc...) are going to create "legacy" (ie
iommu bypass) virtio devices.

 - Once the VM goes secure (early during boot but too late for qemu),
it will need to make virtio do bounce-buffering via swiotlb because
qemu cannot physically access most VM pages (blocked by HW security
features), we need to bounce buffer using some unsecure pages that are
accessible to qemu.

That said, I wouldn't object for us to more generally switch long run
to changing qemu so that virtio on powerpc starts using the IOMMU as a
default provided we fix our guest firmware to understand it (it
currently doesn't), and provided we verify that the performance impact
on things like vhost is negligible.

Cheers,
Ben.

^ permalink raw reply

* [PATCH] powernv/cpuidle: Fix idle states all being marked invalid
From: Nicholas Piggin @ 2018-08-02 15:39 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Akshay Adiga, Gautham R . Shenoy

Commit 9c7b185ab2 ("powernv/cpuidle: Parse dt idle properties into
global structure") parses dt idle states into structs, but never
marks them valid. This results in all idle states being lost.

Cc: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Cc: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/platforms/powernv/idle.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 3116bab10aa3..ecb002c5db83 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -651,11 +651,12 @@ static int __init pnv_power9_idle_init(void)
 					      &state->psscr_mask,
 					      state->flags);
 		if (err) {
-			state->valid = false;
 			report_invalid_psscr_val(state->psscr_val, err);
 			continue;
 		}

+		state->valid = true;
+
 		if (max_residency_ns < state->residency_ns) {
 			max_residency_ns = state->residency_ns;
 			pnv_deepest_stop_psscr_val = state->psscr_val;
-- 
2.17.0

^ permalink raw reply related

* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Michael S. Tsirkin @ 2018-08-02 15:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christoph Hellwig, Will Deacon, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, linuxram, haren, paulus, srikar, robin.murphy,
	jean-philippe.brucker, marc.zyngier
In-Reply-To: <26c1d3d50d8e081eed44fe9940fbefed34598cbd.camel@kernel.crashing.org>

On Thu, Aug 02, 2018 at 10:24:57AM -0500, Benjamin Herrenschmidt wrote:
> On Wed, 2018-08-01 at 01:36 -0700, Christoph Hellwig wrote:
> > We just need to figure out how to deal with devices that deviate
> > from the default.  One things is that VIRTIO_F_IOMMU_PLATFORM really
> > should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu
> > dma tweaks (offsets, cache flushing), which seems well in spirit of
> > the original design. 
> 
> I don't completely agree:
> 
> 1 - VIRTIO_F_IOMMU_PLATFORM is a property of the "other side", ie qemu
> for example. It indicates that the peer bypasses the normal platform
> iommu. The platform code in the guest has no real way to know that this
> is the case, this is a specific "feature" of the qemu implementation.
> 
> 2 - VIRTIO_F_PLATFORM_DMA (or whatever you want to call it), is a
> property of the guest platform itself (not qemu), there's no way the
> "peer" can advertize it via the virtio negociated flags. At least for
> us. I don't know for sure whether that would be workable for the ARM
> case. In our case, qemu has no idea at VM creation time that the VM
> will turn itself into a secure VM and thus will require bounce
> buffering for IOs (including virtio).
> 
> So unless we have another hook for the arch code to set
> VIRTIO_F_PLATFORM_DMA on selected (or all) virtio devices from the
> guest itself, I don't see that as a way to deal with it.
> 
> >  The other issue is VIRTIO_F_IO_BARRIER
> > which is very vaguely defined, and which needs a better definition.
> > And last but not least we'll need some text explaining the challenges
> > of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
> > is what would basically cover them, but a good description including
> > an explanation of why these matter.
> 
> Ben.
> 

So is it true that from qemu point of view there is nothing special
going on?  You pass in a PA, host writes there.


-- 
MST

^ permalink raw reply

* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Benjamin Herrenschmidt @ 2018-08-02 16:01 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christoph Hellwig, Will Deacon, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, linuxram, haren, paulus, srikar, robin.murphy,
	jean-philippe.brucker, marc.zyngier
In-Reply-To: <20180802182959-mutt-send-email-mst@kernel.org>

On Thu, 2018-08-02 at 18:41 +0300, Michael S. Tsirkin wrote:
> 
> > I don't completely agree:
> > 
> > 1 - VIRTIO_F_IOMMU_PLATFORM is a property of the "other side", ie qemu
> > for example. It indicates that the peer bypasses the normal platform
> > iommu. The platform code in the guest has no real way to know that this
> > is the case, this is a specific "feature" of the qemu implementation.
> > 
> > 2 - VIRTIO_F_PLATFORM_DMA (or whatever you want to call it), is a
> > property of the guest platform itself (not qemu), there's no way the
> > "peer" can advertize it via the virtio negociated flags. At least for
> > us. I don't know for sure whether that would be workable for the ARM
> > case. In our case, qemu has no idea at VM creation time that the VM
> > will turn itself into a secure VM and thus will require bounce
> > buffering for IOs (including virtio).
> > 
> > So unless we have another hook for the arch code to set
> > VIRTIO_F_PLATFORM_DMA on selected (or all) virtio devices from the
> > guest itself, I don't see that as a way to deal with it.
> > 
> > >  The other issue is VIRTIO_F_IO_BARRIER
> > > which is very vaguely defined, and which needs a better definition.
> > > And last but not least we'll need some text explaining the challenges
> > > of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
> > > is what would basically cover them, but a good description including
> > > an explanation of why these matter.
> > 
> > Ben.
> > 
> 
> So is it true that from qemu point of view there is nothing special
> going on?  You pass in a PA, host writes there.

Yes, qemu doesn't see a different. It's the guest that will bounce the
pages via a pool of "insecure" pages that qemu can access. Normal pages
in a secure VM come from PAs that qemu cannot physically access.

Cheers,
Ben.

^ permalink raw reply

* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Michael S. Tsirkin @ 2018-08-02 17:19 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christoph Hellwig, Will Deacon, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, linuxram, haren, paulus, srikar, robin.murphy,
	jean-philippe.brucker, marc.zyngier
In-Reply-To: <82ccef6ec3d95ee43f3990a4a2d0aea87eb45e89.camel@kernel.crashing.org>

On Thu, Aug 02, 2018 at 11:01:26AM -0500, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-02 at 18:41 +0300, Michael S. Tsirkin wrote:
> > 
> > > I don't completely agree:
> > > 
> > > 1 - VIRTIO_F_IOMMU_PLATFORM is a property of the "other side", ie qemu
> > > for example. It indicates that the peer bypasses the normal platform
> > > iommu. The platform code in the guest has no real way to know that this
> > > is the case, this is a specific "feature" of the qemu implementation.
> > > 
> > > 2 - VIRTIO_F_PLATFORM_DMA (or whatever you want to call it), is a
> > > property of the guest platform itself (not qemu), there's no way the
> > > "peer" can advertize it via the virtio negociated flags. At least for
> > > us. I don't know for sure whether that would be workable for the ARM
> > > case. In our case, qemu has no idea at VM creation time that the VM
> > > will turn itself into a secure VM and thus will require bounce
> > > buffering for IOs (including virtio).
> > > 
> > > So unless we have another hook for the arch code to set
> > > VIRTIO_F_PLATFORM_DMA on selected (or all) virtio devices from the
> > > guest itself, I don't see that as a way to deal with it.
> > > 
> > > >  The other issue is VIRTIO_F_IO_BARRIER
> > > > which is very vaguely defined, and which needs a better definition.
> > > > And last but not least we'll need some text explaining the challenges
> > > > of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
> > > > is what would basically cover them, but a good description including
> > > > an explanation of why these matter.
> > > 
> > > Ben.
> > > 
> > 
> > So is it true that from qemu point of view there is nothing special
> > going on?  You pass in a PA, host writes there.
> 
> Yes, qemu doesn't see a different. It's the guest that will bounce the
> pages via a pool of "insecure" pages that qemu can access. Normal pages
> in a secure VM come from PAs that qemu cannot physically access.
> 
> Cheers,
> Ben.
> 

I see. So yes, given that device does not know or care, using
virtio features is an awkward fit.

So let's say as a quick fix for you maybe we could generalize the
xen_domain hack, instead of just checking xen_domain check some static
branch.  Then teach xen and others to enable that.
OK but problem then becomes this: if you do this and virtio device appears
behind a vIOMMU and it does not advertize the IOMMU flag, the
code will try to use the vIOMMU mappings and fail.

It does look like even with trick above, you need a special version of
DMA ops that does just swiotlb but not any of the other things DMA API
might do.

Thoughts?

-- 
MST

^ permalink raw reply

* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Benjamin Herrenschmidt @ 2018-08-02 17:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Christoph Hellwig, Will Deacon, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, linuxram, haren, paulus, srikar, robin.murphy,
	jean-philippe.brucker, marc.zyngier
In-Reply-To: <20180802200646-mutt-send-email-mst@kernel.org>

On Thu, 2018-08-02 at 20:19 +0300, Michael S. Tsirkin wrote:
> 
> I see. So yes, given that device does not know or care, using
> virtio features is an awkward fit.
> 
> So let's say as a quick fix for you maybe we could generalize the
> xen_domain hack, instead of just checking xen_domain check some static
> branch.  Then teach xen and others to enable that.>

> OK but problem then becomes this: if you do this and virtio device appears
> behind a vIOMMU and it does not advertize the IOMMU flag, the
> code will try to use the vIOMMU mappings and fail.
>
> It does look like even with trick above, you need a special version of
> DMA ops that does just swiotlb but not any of the other things DMA API
> might do.
> 
> Thoughts?

Yes, this is the purpose of Anshuman original patch (I haven't looked
at the details of the patch in a while but that's what I told him to
implement ;-) :

 - Make virtio always use DMA ops to simplify the code path (with a set
of "transparent" ops for legacy)

 and

 -  Provide an arch hook allowing us to "override" those "transparent"
DMA ops with some custom ones that do the appropriate swiotlb gunk.

Cheers,
Ben.

^ permalink raw reply

* vio.c:__vio_register_driver && LPAR Migration issue
From: Michael Bringmann @ 2018-08-02 18:15 UTC (permalink / raw)
  To: linuxppc-dev, Tyrel Datwyler, Michael Ellerman, engebret, santil,
	hollisb, rcjenn

Hello:
    I have been observing an anomaly during LPAR migrations between
a couple of P8 systems.

This is the problem.  After migrating an LPAR, the PPC mobility code
receives RTAS requests to delete nodes with platform-/hardware-specific
attributes when restarting the kernel after a migration.  My example is
for migration between a P8 Alpine and a P8 Brazos.  Among the nodes
that I see being deleted are 'ibm,random-v1', 'ibm,compression-v1',
'ibm,platform-facilities', and 'ibm,sym-encryption-v1'.  Of these
nodes, the following are created during initial boot by calls to
vio_register_driver:

drivers/char/hw_random/pseries-rng.c
        ibm,random-v1

drivers/crypto/nx/nx-842-pseries.c
        ibm,compression-v1

drivers/crypto/nx/nx.c
        ibm,sym-encryption-v1

After the migration, these nodes are deleted, but nothing recreates
them.  If I boot the LPAR on the target system, the nodes are added
again.

My question is how do we recreate these nodes after migration?

Thanks.

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:       (512) 466-0650
mwb@linux.vnet.ibm.com

^ permalink raw reply

* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Michael S. Tsirkin @ 2018-08-02 20:52 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christoph Hellwig, Will Deacon, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, linuxram, haren, paulus, srikar, robin.murphy,
	jean-philippe.brucker, marc.zyngier
In-Reply-To: <e9beb6bba5a7d05741e27e85b24a3b23ba7c2762.camel@kernel.crashing.org>

On Thu, Aug 02, 2018 at 12:53:41PM -0500, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-02 at 20:19 +0300, Michael S. Tsirkin wrote:
> > 
> > I see. So yes, given that device does not know or care, using
> > virtio features is an awkward fit.
> > 
> > So let's say as a quick fix for you maybe we could generalize the
> > xen_domain hack, instead of just checking xen_domain check some static
> > branch.  Then teach xen and others to enable that.>
> 
> > OK but problem then becomes this: if you do this and virtio device appears
> > behind a vIOMMU and it does not advertize the IOMMU flag, the
> > code will try to use the vIOMMU mappings and fail.
> >
> > It does look like even with trick above, you need a special version of
> > DMA ops that does just swiotlb but not any of the other things DMA API
> > might do.
> > 
> > Thoughts?
> 
> Yes, this is the purpose of Anshuman original patch (I haven't looked
> at the details of the patch in a while but that's what I told him to
> implement ;-) :
> 
>  - Make virtio always use DMA ops to simplify the code path (with a set
> of "transparent" ops for legacy)
> 
>  and
> 
>  -  Provide an arch hook allowing us to "override" those "transparent"
> DMA ops with some custom ones that do the appropriate swiotlb gunk.
> 
> Cheers,
> Ben.
> 

Right but as I tried to say doing that brings us to a bunch of issues
with using DMA APIs in virtio. Put simply DMA APIs weren't designed for
guest to hypervisor communication.

When we do (as is the case with PLATFORM_IOMMU right now) this adds a
bunch of overhead which we need to get rid of if we are to switch to
PLATFORM_IOMMU by default.  We need to fix that.

-- 
MST

^ permalink raw reply

* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Michael S. Tsirkin @ 2018-08-02 20:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Christoph Hellwig, Will Deacon, Anshuman Khandual, virtualization,
	linux-kernel, linuxppc-dev, aik, robh, joe, elfring, david,
	jasowang, mpe, linuxram, haren, paulus, srikar, robin.murphy,
	jean-philippe.brucker, marc.zyngier
In-Reply-To: <7779442d7889ee943b3e4ff6c63ec90b4a58b79d.camel@kernel.crashing.org>

On Thu, Aug 02, 2018 at 10:33:05AM -0500, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-02 at 00:56 +0300, Michael S. Tsirkin wrote:
> > > but it's not, VMs are
> > > created in "legacy" mode all the times and we don't know at VM creation
> > > time whether it will become a secure VM or not. The way our secure VMs
> > > work is that they start as a normal VM, load a secure "payload" and
> > > call the Ultravisor to "become" secure.
> > > 
> > > So we're in a bit of a bind here. We need that one-liner optional arch
> > > hook to make virtio use swiotlb in that "IOMMU bypass" case.
> > > 
> > > Ben.
> > 
> > And just to make sure I understand, on your platform DMA APIs do include
> > some of the cache flushing tricks and this is why you don't want to
> > declare iommu support in the hypervisor?
> 
> I'm not sure I parse what you mean.
> 
> We don't need cache flushing tricks.

You don't but do real devices on same platform need them?

> The problem we have with our
> "secure" VMs is that:
> 
>  - At VM creation time we have no idea it's going to become a secure
> VM, qemu doesn't know anything about it, and thus qemu (or other
> management tools, libvirt etc...) are going to create "legacy" (ie
> iommu bypass) virtio devices.
> 
>  - Once the VM goes secure (early during boot but too late for qemu),
> it will need to make virtio do bounce-buffering via swiotlb because
> qemu cannot physically access most VM pages (blocked by HW security
> features), we need to bounce buffer using some unsecure pages that are
> accessible to qemu.
> 
> That said, I wouldn't object for us to more generally switch long run
> to changing qemu so that virtio on powerpc starts using the IOMMU as a
> default provided we fix our guest firmware to understand it (it
> currently doesn't), and provided we verify that the performance impact
> on things like vhost is negligible.
> 
> Cheers,
> Ben.
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox