* [PATCH v4 1/9] selftests/powerpc: Test the preservation of FPU and VMX regs across syscall
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
2016-02-15 16:59 ` Naveen N. Rao
2016-02-15 5:07 ` [PATCH v4 2/9] selftests/powerpc: Test preservation of FPU and VMX regs across preemption Cyril Bur
` (7 subsequent siblings)
8 siblings, 1 reply; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
Test that the non volatile floating point and Altivec registers get
correctly preserved across the fork() syscall.
fork() works nicely for this purpose, the registers should be the same for
both parent and child
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
tools/testing/selftests/powerpc/Makefile | 3 +-
tools/testing/selftests/powerpc/basic_asm.h | 30 ++++
tools/testing/selftests/powerpc/math/.gitignore | 2 +
tools/testing/selftests/powerpc/math/Makefile | 14 ++
tools/testing/selftests/powerpc/math/fpu_asm.S | 161 +++++++++++++++++
tools/testing/selftests/powerpc/math/fpu_syscall.c | 90 ++++++++++
tools/testing/selftests/powerpc/math/vmx_asm.S | 193 +++++++++++++++++++++
tools/testing/selftests/powerpc/math/vmx_syscall.c | 92 ++++++++++
8 files changed, 584 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/powerpc/basic_asm.h
create mode 100644 tools/testing/selftests/powerpc/math/.gitignore
create mode 100644 tools/testing/selftests/powerpc/math/Makefile
create mode 100644 tools/testing/selftests/powerpc/math/fpu_asm.S
create mode 100644 tools/testing/selftests/powerpc/math/fpu_syscall.c
create mode 100644 tools/testing/selftests/powerpc/math/vmx_asm.S
create mode 100644 tools/testing/selftests/powerpc/math/vmx_syscall.c
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 0c2706b..19e8191 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -22,7 +22,8 @@ SUB_DIRS = benchmarks \
switch_endian \
syscalls \
tm \
- vphn
+ vphn \
+ math
endif
diff --git a/tools/testing/selftests/powerpc/basic_asm.h b/tools/testing/selftests/powerpc/basic_asm.h
new file mode 100644
index 0000000..f243da0
--- /dev/null
+++ b/tools/testing/selftests/powerpc/basic_asm.h
@@ -0,0 +1,30 @@
+#include <ppc-asm.h>
+#include <asm/unistd.h>
+
+/*
+ * Build the 64-bit value of expr in reg, 16 bits at a time: lis/ori load the
+ * @highest and @higher halfwords, rldicr moves them into the upper 32 bits
+ * (clearing the lower 32), then oris/ori fill in the @high and @l halfwords.
+ */
+#define LOAD_REG_IMMEDIATE(reg,expr) \
+ lis reg,(expr)@highest; \
+ ori reg,reg,(expr)@higher; \
+ rldicr reg,reg,32,31; \
+ oris reg,reg,(expr)@high; \
+ ori reg,reg,(expr)@l;
+
+/*
+ * Open a stack frame of (_extra + 32) bytes.
+ *
+ * LR is saved at 16(sp) before sp is moved, so it lands in the frame that was
+ * current on entry. After the stdu, CR is saved at 8(sp) and r2 at 24(sp) of
+ * the new frame.
+ *
+ * It is very important to note here that _extra is the extra amount of
+ * stack space needed.
+ * This space must be accessed at sp + 32!
+ */
+#define PUSH_BASIC_STACK(_extra) \
+ mflr r0; \
+ std r0,16(sp); \
+ stdu sp,-(_extra + 32)(sp); \
+ mfcr r0; \
+ stw r0,8(sp); \
+ std 2,24(sp);
+
+/*
+ * Undo PUSH_BASIC_STACK(_extra): reload r2 and CR from the frame, release the
+ * (_extra + 32) bytes, then reload LR from 16(sp) of the frame that is current
+ * again. _extra must be the same value that was given to PUSH_BASIC_STACK.
+ */
+#define POP_BASIC_STACK(_extra) \
+ ld 2,24(sp); \
+ lwz r0,8(sp); \
+ mtcr r0; \
+ addi sp,sp,(_extra + 32); \
+ ld r0,16(sp); \
+ mtlr r0;
+
diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore
new file mode 100644
index 0000000..b19b269
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/.gitignore
@@ -0,0 +1,2 @@
+fpu_syscall
+vmx_syscall
diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile
new file mode 100644
index 0000000..418bef1
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/Makefile
@@ -0,0 +1,14 @@
+# Test binaries built in this directory
+TEST_PROGS := fpu_syscall vmx_syscall
+
+all: $(TEST_PROGS)
+
+# Every test depends on the shared harness source and gets these extra flags
+$(TEST_PROGS): ../harness.c
+$(TEST_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec
+
+# Each C test also depends on its assembly helpers
+fpu_syscall: fpu_asm.S
+vmx_syscall: vmx_asm.S
+
+include ../../lib.mk
+
+clean:
+ rm -f $(TEST_PROGS) *.o
diff --git a/tools/testing/selftests/powerpc/math/fpu_asm.S b/tools/testing/selftests/powerpc/math/fpu_asm.S
new file mode 100644
index 0000000..8733874
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_asm.S
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "../basic_asm.h"
+
+/*
+ * Save f14-f31 (18 doubles, 144 bytes) into the current stack frame,
+ * f14 at pos(sp) and each following register 8 bytes further on.
+ */
+#define PUSH_FPU(pos) \
+ stfd f14,pos(sp); \
+ stfd f15,pos+8(sp); \
+ stfd f16,pos+16(sp); \
+ stfd f17,pos+24(sp); \
+ stfd f18,pos+32(sp); \
+ stfd f19,pos+40(sp); \
+ stfd f20,pos+48(sp); \
+ stfd f21,pos+56(sp); \
+ stfd f22,pos+64(sp); \
+ stfd f23,pos+72(sp); \
+ stfd f24,pos+80(sp); \
+ stfd f25,pos+88(sp); \
+ stfd f26,pos+96(sp); \
+ stfd f27,pos+104(sp); \
+ stfd f28,pos+112(sp); \
+ stfd f29,pos+120(sp); \
+ stfd f30,pos+128(sp); \
+ stfd f31,pos+136(sp);
+
+/*
+ * Reload f14-f31 from the 144 bytes starting at pos(sp); the layout is the
+ * one written by PUSH_FPU(pos).
+ */
+#define POP_FPU(pos) \
+ lfd f14,pos(sp); \
+ lfd f15,pos+8(sp); \
+ lfd f16,pos+16(sp); \
+ lfd f17,pos+24(sp); \
+ lfd f18,pos+32(sp); \
+ lfd f19,pos+40(sp); \
+ lfd f20,pos+48(sp); \
+ lfd f21,pos+56(sp); \
+ lfd f22,pos+64(sp); \
+ lfd f23,pos+72(sp); \
+ lfd f24,pos+80(sp); \
+ lfd f25,pos+88(sp); \
+ lfd f26,pos+96(sp); \
+ lfd f27,pos+104(sp); \
+ lfd f28,pos+112(sp); \
+ lfd f29,pos+120(sp); \
+ lfd f30,pos+128(sp); \
+ lfd f31,pos+136(sp);
+
+#Careful calling this, it will 'clobber' fpu (by design)
+#Don't call this from C
+/*
+ * load_fpu: r3 holds the address of 18 consecutive doubles, which are loaded
+ * into f14-f31 in order. The previous contents of f14-f31 are not saved here,
+ * and r3 is left unchanged.
+ */
+FUNC_START(load_fpu)
+ lfd f14,0(r3)
+ lfd f15,8(r3)
+ lfd f16,16(r3)
+ lfd f17,24(r3)
+ lfd f18,32(r3)
+ lfd f19,40(r3)
+ lfd f20,48(r3)
+ lfd f21,56(r3)
+ lfd f22,64(r3)
+ lfd f23,72(r3)
+ lfd f24,80(r3)
+ lfd f25,88(r3)
+ lfd f26,96(r3)
+ lfd f27,104(r3)
+ lfd f28,112(r3)
+ lfd f29,120(r3)
+ lfd f30,128(r3)
+ lfd f31,136(r3)
+ blr
+FUNC_END(load_fpu)
+
+/*
+ * check_fpu: compare f14-f31 against the 18 consecutive doubles at r3.
+ * Returns 0 in r3 when every register matches, 1 as soon as one differs.
+ * Uses r4, f0 and cr1 only and needs no stack frame.
+ */
+FUNC_START(check_fpu)
+	mr	r4,r3
+	li	r3,1 #assume a bad result
+	lfd	f0,0(r4)
+	fcmpu	cr1,f0,f14
+	bne	cr1,1f
+	lfd	f0,8(r4)
+	fcmpu	cr1,f0,f15
+	bne	cr1,1f
+	lfd	f0,16(r4)
+	fcmpu	cr1,f0,f16
+	bne	cr1,1f
+	lfd	f0,24(r4)
+	fcmpu	cr1,f0,f17
+	bne	cr1,1f
+	lfd	f0,32(r4)
+	fcmpu	cr1,f0,f18
+	bne	cr1,1f
+	lfd	f0,40(r4)
+	fcmpu	cr1,f0,f19
+	bne	cr1,1f
+	lfd	f0,48(r4)
+	fcmpu	cr1,f0,f20
+	bne	cr1,1f
+	lfd	f0,56(r4)
+	fcmpu	cr1,f0,f21
+	bne	cr1,1f
+	lfd	f0,64(r4)
+	fcmpu	cr1,f0,f22
+	bne	cr1,1f
+	lfd	f0,72(r4)
+	fcmpu	cr1,f0,f23
+	bne	cr1,1f
+	lfd	f0,80(r4)
+	fcmpu	cr1,f0,f24
+	bne	cr1,1f
+	lfd	f0,88(r4)
+	fcmpu	cr1,f0,f25
+	bne	cr1,1f
+	lfd	f0,96(r4)
+	fcmpu	cr1,f0,f26
+	bne	cr1,1f
+	lfd	f0,104(r4)
+	fcmpu	cr1,f0,f27
+	bne	cr1,1f
+	lfd	f0,112(r4)
+	fcmpu	cr1,f0,f28
+	bne	cr1,1f
+	lfd	f0,120(r4)
+	fcmpu	cr1,f0,f29
+	bne	cr1,1f
+	lfd	f0,128(r4)
+	fcmpu	cr1,f0,f30
+	bne	cr1,1f
+	lfd	f0,136(r4)
+	fcmpu	cr1,f0,f31
+	bne	cr1,1f
+	li	r3,0 #Success!!!
+1:	blr
+FUNC_END(check_fpu)
+
+/*
+ * int test_fpu(double *darray, pid_t *pid)
+ *
+ * Loads f14-f31 from darray, issues a raw fork syscall, stores the fork
+ * result through pid (-1 when the syscall failed) and returns 0 when f14-f31
+ * still match darray afterwards, 1 otherwise. Both the parent and the child
+ * return from here. The caller's f14-f31 are saved and restored around it.
+ */
+FUNC_START(test_fpu)
+	#r3 holds pointer to the array of doubles to load and check
+	#r4 holds pointer to where to put the result of fork (the pid)
+	#f14-f31 are non volatiles
+	PUSH_BASIC_STACK(256)
+	std	r3,40(sp) #Address of darray
+	std	r4,48(sp) #Address of pid
+	PUSH_FPU(56)
+
+	bl	load_fpu
+	nop
+	li	r0,__NR_fork
+	sc
+	/*
+	 * A failed syscall sets cr0.SO and leaves a positive errno in r3.
+	 * The C caller tests the pid for -1, so report the failure that way.
+	 */
+	bns	2f
+	li	r3,-1
+2:
+	#pass the result of the fork to the caller
+	#pid_t is 32 bits wide, a doubleword store would overrun it
+	ld	r9,48(sp)
+	stw	r3,0(r9)
+
+	ld	r3,40(sp)
+	bl	check_fpu
+	nop
+
+	POP_FPU(56)
+	POP_BASIC_STACK(256)
+	blr
+FUNC_END(test_fpu)
diff --git a/tools/testing/selftests/powerpc/math/fpu_syscall.c b/tools/testing/selftests/powerpc/math/fpu_syscall.c
new file mode 100644
index 0000000..949e672
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_syscall.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the FPU registers change across a syscall (fork).
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+
+#include "utils.h"
+
+/*
+ * Implemented in fpu_asm.S: loads f14-f31 from darray, forks with a raw
+ * syscall, writes the fork result through pid and returns 0 if the registers
+ * still hold the darray values afterwards, 1 otherwise.
+ */
+extern int test_fpu(double *darray, pid_t *pid);
+
+/* Reference values; the assembly reads only the first 18 (one per f14-f31) */
+double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+ 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+ 2.1};
+
+/*
+ * Call test_fpu() 1000 times. Each call forks; the child exits with its own
+ * register check result and the parent collects it.
+ *
+ * Returns 0 when every parent and child check passed, 1 on the first failed
+ * check (or when the child could not be waited for), -1 when the fork failed.
+ */
+int syscall_fpu(void)
+{
+	pid_t fork_pid;
+	int i;
+	int ret;
+	int child_ret;
+	for (i = 0; i < 1000; i++) {
+		/* test_fpu will fork() */
+		ret = test_fpu(darray, &fork_pid);
+		if (fork_pid == -1)
+			return -1;
+		if (fork_pid == 0)
+			exit(ret);
+		/* A failed waitpid() leaves child_ret unset: count it as a failure */
+		if (waitpid(fork_pid, &child_ret, 0) == -1)
+			return 1;
+		if (ret || child_ret)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Harness entry point: fork twice so that up to four processes run
+ * syscall_fpu() concurrently, then fold every result back into the original
+ * process. Returns 0 on success; FAIL_IF() reports any failure.
+ */
+int test_syscall_fpu(void)
+{
+	/*
+	 * Setup an environment with much context switching
+	 */
+	pid_t pid2;
+	pid_t pid = fork();
+	int ret;
+	int child_ret;
+	FAIL_IF(pid == -1);
+
+	pid2 = fork();
+	/* Can't FAIL_IF(pid2 == -1); because already forked once */
+	if (pid2 == -1) {
+		/*
+		 * Couldn't fork, ensure test is a fail
+		 */
+		child_ret = ret = 1;
+	} else {
+		ret = syscall_fpu();
+		if (pid2) {
+			/* An uncollected child counts as a failure */
+			if (waitpid(pid2, &child_ret, 0) == -1)
+				child_ret = 1;
+		} else {
+			exit(!!ret);
+		}
+	}
+
+	ret |= child_ret;
+
+	if (pid) {
+		if (waitpid(pid, &child_ret, 0) == -1)
+			child_ret = 1;
+	} else {
+		/*
+		 * ret may now hold a raw wait status such as 0x100. Only the
+		 * low 8 bits of the exit() argument reach the parent, so
+		 * collapse it to 0/1 or the failure would be reported as 0.
+		 */
+		exit(!!ret);
+	}
+
+	FAIL_IF(ret || child_ret);
+	return 0;
+}
+
+/* Run the FPU syscall test under the selftest harness and pass on its result */
+int main(int argc, char *argv[])
+{
+	int rc;
+
+	rc = test_harness(test_syscall_fpu, "syscall_fpu");
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_asm.S b/tools/testing/selftests/powerpc/math/vmx_asm.S
new file mode 100644
index 0000000..9ed32e7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_asm.S
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "../basic_asm.h"
+
+/*
+ * Save v20-v31 (12 vectors, 192 bytes) into the current stack frame starting
+ * at pos(sp). reg is used as the index register and is left holding the
+ * offset of the last store.
+ * NOTE(review): stvx ignores the low four bits of the effective address, so
+ * sp + pos is expected to be 16 byte aligned - confirm at each use.
+ */
+#define PUSH_VMX(pos,reg) \
+ li reg,pos; \
+ stvx v20,reg,sp; \
+ addi reg,reg,16; \
+ stvx v21,reg,sp; \
+ addi reg,reg,16; \
+ stvx v22,reg,sp; \
+ addi reg,reg,16; \
+ stvx v23,reg,sp; \
+ addi reg,reg,16; \
+ stvx v24,reg,sp; \
+ addi reg,reg,16; \
+ stvx v25,reg,sp; \
+ addi reg,reg,16; \
+ stvx v26,reg,sp; \
+ addi reg,reg,16; \
+ stvx v27,reg,sp; \
+ addi reg,reg,16; \
+ stvx v28,reg,sp; \
+ addi reg,reg,16; \
+ stvx v29,reg,sp; \
+ addi reg,reg,16; \
+ stvx v30,reg,sp; \
+ addi reg,reg,16; \
+ stvx v31,reg,sp;
+
+/*
+ * Reload v20-v31 from the 192 bytes starting at pos(sp), the layout written
+ * by PUSH_VMX(pos,reg). reg is used as the index register.
+ * NOTE(review): lvx ignores the low four bits of the effective address, so
+ * sp + pos is expected to be 16 byte aligned - confirm at each use.
+ */
+#define POP_VMX(pos,reg) \
+ li reg,pos; \
+ lvx v20,reg,sp; \
+ addi reg,reg,16; \
+ lvx v21,reg,sp; \
+ addi reg,reg,16; \
+ lvx v22,reg,sp; \
+ addi reg,reg,16; \
+ lvx v23,reg,sp; \
+ addi reg,reg,16; \
+ lvx v24,reg,sp; \
+ addi reg,reg,16; \
+ lvx v25,reg,sp; \
+ addi reg,reg,16; \
+ lvx v26,reg,sp; \
+ addi reg,reg,16; \
+ lvx v27,reg,sp; \
+ addi reg,reg,16; \
+ lvx v28,reg,sp; \
+ addi reg,reg,16; \
+ lvx v29,reg,sp; \
+ addi reg,reg,16; \
+ lvx v30,reg,sp; \
+ addi reg,reg,16; \
+ lvx v31,reg,sp;
+
+#Careful this will 'clobber' vmx (by design)
+#Don't call this from C
+/*
+ * load_vmx: r3 holds the address of 12 consecutive 16 byte vectors, which are
+ * loaded into v20-v31 in order. r5 is used as the running offset; r3 is left
+ * unchanged and the previous contents of v20-v31 are not saved here.
+ */
+FUNC_START(load_vmx)
+ li r5,0
+ lvx v20,r5,r3
+ addi r5,r5,16
+ lvx v21,r5,r3
+ addi r5,r5,16
+ lvx v22,r5,r3
+ addi r5,r5,16
+ lvx v23,r5,r3
+ addi r5,r5,16
+ lvx v24,r5,r3
+ addi r5,r5,16
+ lvx v25,r5,r3
+ addi r5,r5,16
+ lvx v26,r5,r3
+ addi r5,r5,16
+ lvx v27,r5,r3
+ addi r5,r5,16
+ lvx v28,r5,r3
+ addi r5,r5,16
+ lvx v29,r5,r3
+ addi r5,r5,16
+ lvx v30,r5,r3
+ addi r5,r5,16
+ lvx v31,r5,r3
+ blr
+FUNC_END(load_vmx)
+
+/*
+ * check_vmx: compare v20-v31 against the 12 consecutive vectors at r3.
+ * Returns 0 in r3 when every register matches, 1 otherwise.
+ *
+ * Each vcmpequd. leaves all ones in every doubleword of v1 that compared
+ * equal; v2 accumulates the AND of those masks across all twelve registers.
+ */
+#Should be safe from C, only touches r0, r4, r5 and v0,v1,v2
+FUNC_START(check_vmx)
+	PUSH_BASIC_STACK(16)
+	mr	r4,r3
+	li	r3,1 #assume a bad result
+	li	r5,0
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v20
+	vmr	v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v21
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v22
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v23
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v24
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v25
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v26
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v27
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v28
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v29
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v30
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v31
+	vand	v2,v2,v1
+
+	/*
+	 * Spill v2 into the 16 bytes of local space, which PUSH_BASIC_STACK
+	 * places at sp + 32. Storing at sp + 0 would overwrite the back chain
+	 * and the CR value that POP_BASIC_STACK is about to restore.
+	 */
+	li	r5,32
+	stvx	v2,r5,sp
+	ld	r0,32(sp)
+	ld	r5,40(sp)
+	and	r0,r0,r5 #both doublewords must be all ones
+	cmpdi	r0,-1
+	bne	1f
+	li	r3,0
+1:	POP_BASIC_STACK(16)
+	blr
+FUNC_END(check_vmx)
+
+/*
+ * int test_vmx(v4si *varray, pid_t *pid)
+ *
+ * Loads v20-v31 from varray, issues a raw fork syscall, stores the fork
+ * result through pid (-1 when the syscall failed) and returns 0 when v20-v31
+ * still match varray afterwards, 1 otherwise. Both the parent and the child
+ * return from here. The caller's v20-v31 are saved and restored around it.
+ */
+#Safe from C
+FUNC_START(test_vmx)
+	#r3 holds pointer to the array of vectors to load and check
+	#r4 holds pointer to where to put the result of fork (the pid)
+	#v20-v31 are non-volatile
+	PUSH_BASIC_STACK(512)
+	std	r3,40(sp) #Address of varray
+	std	r4,48(sp) #address of pid
+	/*
+	 * stvx/lvx ignore the low four bits of the address, so the save area
+	 * has to start on a 16 byte boundary. An offset of 56 would be rounded
+	 * down to 48 and overwrite the pid pointer saved just above.
+	 */
+	PUSH_VMX(64,r4)
+
+	bl	load_vmx
+
+	li	r0,__NR_fork
+	sc
+	/*
+	 * A failed syscall sets cr0.SO and leaves a positive errno in r3.
+	 * The C caller tests the pid for -1, so report the failure that way.
+	 */
+	bns	2f
+	li	r3,-1
+2:
+	#Pass the result of fork back to the caller
+	#pid_t is 32 bits wide, a doubleword store would overrun it
+	ld	r9,48(sp)
+	stw	r3,0(r9)
+
+	ld	r3,40(sp)
+	bl	check_vmx
+
+	POP_VMX(64,r4)
+	POP_BASIC_STACK(512)
+	blr
+FUNC_END(test_vmx)
diff --git a/tools/testing/selftests/powerpc/math/vmx_syscall.c b/tools/testing/selftests/powerpc/math/vmx_syscall.c
new file mode 100644
index 0000000..6989d96
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_syscall.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the VMX registers change across a syscall (fork).
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "utils.h"
+
+/* 16 byte vector of four ints, the size of one VMX register */
+typedef int v4si __attribute__ ((vector_size (16)));
+/* Reference values: one vector for each of v20-v31 */
+v4si varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+ {13,14,15,16},{17,18,19,20},{21,22,23,24},
+ {25,26,27,28},{29,30,31,32},{33,34,35,36},
+ {37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+/*
+ * Implemented in vmx_asm.S: loads v20-v31 from varray, forks with a raw
+ * syscall, writes the fork result through pid and returns 0 if the registers
+ * still hold the varray values afterwards, 1 otherwise.
+ */
+extern int test_vmx(v4si *varray, pid_t *pid);
+
+/*
+ * Call test_vmx() 1000 times. Each call forks; the child exits with its own
+ * register check result and the parent collects it.
+ *
+ * Returns 0 when every parent and child check passed, 1 on the first failed
+ * check (or when the child could not be waited for), -1 when the fork failed.
+ */
+int vmx_syscall(void)
+{
+	pid_t fork_pid;
+	int i;
+	int ret;
+	int child_ret;
+	for (i = 0; i < 1000; i++) {
+		/* test_vmx will fork() */
+		ret = test_vmx(varray, &fork_pid);
+		if (fork_pid == -1)
+			return -1;
+		if (fork_pid == 0)
+			exit(ret);
+		/* A failed waitpid() leaves child_ret unset: count it as a failure */
+		if (waitpid(fork_pid, &child_ret, 0) == -1)
+			return 1;
+		if (ret || child_ret)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Harness entry point: fork twice so that up to four processes run
+ * vmx_syscall() concurrently, then fold every result back into the original
+ * process. Returns 0 on success; FAIL_IF() reports any failure.
+ */
+int test_vmx_syscall(void)
+{
+	/*
+	 * Setup an environment with much context switching
+	 */
+	pid_t pid2;
+	pid_t pid = fork();
+	int ret;
+	int child_ret;
+	FAIL_IF(pid == -1);
+
+	pid2 = fork();
+	/* Can't FAIL_IF(pid2 == -1); because we've already forked */
+	if (pid2 == -1) {
+		/*
+		 * Couldn't fork, ensure child_ret is set and is a fail
+		 */
+		ret = child_ret = 1;
+	} else {
+		/* Only run the test once the second fork is known to have worked */
+		ret = vmx_syscall();
+		if (pid2) {
+			/* An uncollected child counts as a failure */
+			if (waitpid(pid2, &child_ret, 0) == -1)
+				child_ret = 1;
+		} else {
+			exit(!!ret);
+		}
+	}
+
+	ret |= child_ret;
+
+	if (pid) {
+		if (waitpid(pid, &child_ret, 0) == -1)
+			child_ret = 1;
+	} else {
+		/*
+		 * ret may now hold a raw wait status such as 0x100. Only the
+		 * low 8 bits of the exit() argument reach the parent, so
+		 * collapse it to 0/1 or the failure would be reported as 0.
+		 */
+		exit(!!ret);
+	}
+
+	FAIL_IF(ret || child_ret);
+	return 0;
+}
+
+/* Run the VMX syscall test under the selftest harness and pass on its result */
+int main(int argc, char *argv[])
+{
+	int rc;
+
+	rc = test_harness(test_vmx_syscall, "vmx_syscall");
+	return rc;
+}
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v4 1/9] selftests/powerpc: Test the preservation of FPU and VMX regs across syscall
2016-02-15 5:07 ` [PATCH v4 1/9] selftests/powerpc: Test the preservation of FPU and VMX regs across syscall Cyril Bur
@ 2016-02-15 16:59 ` Naveen N. Rao
2016-02-16 0:06 ` Cyril Bur
0 siblings, 1 reply; 14+ messages in thread
From: Naveen N. Rao @ 2016-02-15 16:59 UTC (permalink / raw)
To: Cyril Bur; +Cc: linuxppc-dev, mikey, anton
On 2016/02/15 04:07PM, Cyril Bur wrote:
> Test that the non volatile floating point and Altivec registers get
> correctly preserved across the fork() syscall.
>
> fork() works nicely for this purpose, the registers should be the same for
> both parent and child
>
> Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> ---
> tools/testing/selftests/powerpc/Makefile | 3 +-
> tools/testing/selftests/powerpc/basic_asm.h | 30 ++++
> tools/testing/selftests/powerpc/math/.gitignore | 2 +
> tools/testing/selftests/powerpc/math/Makefile | 14 ++
> tools/testing/selftests/powerpc/math/fpu_asm.S | 161 +++++++++++++++++
> tools/testing/selftests/powerpc/math/fpu_syscall.c | 90 ++++++++++
> tools/testing/selftests/powerpc/math/vmx_asm.S | 193 +++++++++++++++++++++
> tools/testing/selftests/powerpc/math/vmx_syscall.c | 92 ++++++++++
> 8 files changed, 584 insertions(+), 1 deletion(-)
> create mode 100644 tools/testing/selftests/powerpc/basic_asm.h
> create mode 100644 tools/testing/selftests/powerpc/math/.gitignore
> create mode 100644 tools/testing/selftests/powerpc/math/Makefile
> create mode 100644 tools/testing/selftests/powerpc/math/fpu_asm.S
> create mode 100644 tools/testing/selftests/powerpc/math/fpu_syscall.c
> create mode 100644 tools/testing/selftests/powerpc/math/vmx_asm.S
> create mode 100644 tools/testing/selftests/powerpc/math/vmx_syscall.c
>
> diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
> index 0c2706b..19e8191 100644
> --- a/tools/testing/selftests/powerpc/Makefile
> +++ b/tools/testing/selftests/powerpc/Makefile
> @@ -22,7 +22,8 @@ SUB_DIRS = benchmarks \
> switch_endian \
> syscalls \
> tm \
> - vphn
> + vphn \
> + math
>
> endif
>
> diff --git a/tools/testing/selftests/powerpc/basic_asm.h b/tools/testing/selftests/powerpc/basic_asm.h
> new file mode 100644
> index 0000000..f243da0
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/basic_asm.h
> @@ -0,0 +1,30 @@
> +#include <ppc-asm.h>
> +#include <asm/unistd.h>
> +
> +#define LOAD_REG_IMMEDIATE(reg,expr) \
> + lis reg,(expr)@highest; \
> + ori reg,reg,(expr)@higher; \
> + rldicr reg,reg,32,31; \
> + oris reg,reg,(expr)@high; \
> + ori reg,reg,(expr)@l;
> +
> +/* It is very important to note here that _extra is the extra amount of
> + * stack space needed.
> + * This space must be accessed at sp + 32!
This looks to be specific to ABIv2. Is this series limited to ppc64le?
If so, you might want to ensure this only builds there.
Also:
#define PPC_ABIV2_MIN_STACK_SIZE 32
or just:
#define PPC_MIN_STACK 32
... is helpful. And, you might want to base the rest of your code that
use PUSH_BASIC_STACK() on that. If we ever want to have these tests run
anywhere else, that'll help a lot. (See further below)
> + */
> +#define PUSH_BASIC_STACK(_extra) \
> + mflr r0; \
> + std r0,16(sp); \
> + stdu sp,-(_extra + 32)(sp); \
> + mfcr r0; \
> + stw r0,8(sp); \
> + std 2,24(sp);
^^
Better to use r2 here and below.
> +
> +#define POP_BASIC_STACK(_extra) \
> + ld 2,24(sp); \
> + lwz r0,8(sp); \
> + mtcr r0; \
> + addi sp,sp,(_extra + 32); \
> + ld r0,16(sp); \
> + mtlr r0;
> +
> diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore
> new file mode 100644
> index 0000000..b19b269
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/math/.gitignore
> @@ -0,0 +1,2 @@
> +fpu_syscall
> +vmx_syscall
> diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile
> new file mode 100644
> index 0000000..418bef1
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/math/Makefile
> @@ -0,0 +1,14 @@
> +TEST_PROGS := fpu_syscall vmx_syscall
> +
> +all: $(TEST_PROGS)
> +
> +$(TEST_PROGS): ../harness.c
> +$(TEST_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec
> +
> +fpu_syscall: fpu_asm.S
> +vmx_syscall: vmx_asm.S
> +
> +include ../../lib.mk
> +
> +clean:
> + rm -f $(TEST_PROGS) *.o
> diff --git a/tools/testing/selftests/powerpc/math/fpu_asm.S b/tools/testing/selftests/powerpc/math/fpu_asm.S
> new file mode 100644
> index 0000000..8733874
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/math/fpu_asm.S
> @@ -0,0 +1,161 @@
> +/*
> + * Copyright 2015, Cyril Bur, IBM Corp.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +
> +#include "../basic_asm.h"
> +
> +#define PUSH_FPU(pos) \
> + stfd f14,pos(sp); \
> + stfd f15,pos+8(sp); \
> + stfd f16,pos+16(sp); \
> + stfd f17,pos+24(sp); \
> + stfd f18,pos+32(sp); \
> + stfd f19,pos+40(sp); \
> + stfd f20,pos+48(sp); \
> + stfd f21,pos+56(sp); \
> + stfd f22,pos+64(sp); \
> + stfd f23,pos+72(sp); \
> + stfd f24,pos+80(sp); \
> + stfd f25,pos+88(sp); \
> + stfd f26,pos+96(sp); \
> + stfd f27,pos+104(sp); \
> + stfd f28,pos+112(sp); \
> + stfd f29,pos+120(sp); \
> + stfd f30,pos+128(sp); \
> + stfd f31,pos+136(sp);
> +
> +#define POP_FPU(pos) \
> + lfd f14,pos(sp); \
> + lfd f15,pos+8(sp); \
> + lfd f16,pos+16(sp); \
> + lfd f17,pos+24(sp); \
> + lfd f18,pos+32(sp); \
> + lfd f19,pos+40(sp); \
> + lfd f20,pos+48(sp); \
> + lfd f21,pos+56(sp); \
> + lfd f22,pos+64(sp); \
> + lfd f23,pos+72(sp); \
> + lfd f24,pos+80(sp); \
> + lfd f25,pos+88(sp); \
> + lfd f26,pos+96(sp); \
> + lfd f27,pos+104(sp); \
> + lfd f28,pos+112(sp); \
> + lfd f29,pos+120(sp); \
> + lfd f30,pos+128(sp); \
> + lfd f31,pos+136(sp);
> +
> +#Careful calling this, it will 'clobber' fpu (by design)
> +#Don't call this from C
> +FUNC_START(load_fpu)
> + lfd f14,0(r3)
> + lfd f15,8(r3)
> + lfd f16,16(r3)
> + lfd f17,24(r3)
> + lfd f18,32(r3)
> + lfd f19,40(r3)
> + lfd f20,48(r3)
> + lfd f21,56(r3)
> + lfd f22,64(r3)
> + lfd f23,72(r3)
> + lfd f24,80(r3)
> + lfd f25,88(r3)
> + lfd f26,96(r3)
> + lfd f27,104(r3)
> + lfd f28,112(r3)
> + lfd f29,120(r3)
> + lfd f30,128(r3)
> + lfd f31,136(r3)
> + blr
> +FUNC_END(load_fpu)
> +
> +FUNC_START(check_fpu)
> + mr r4,r3
> + li r3,1 #assume a bad result
> + lfd f0,0(r4)
> + fcmpu cr1,f0,f14
> + bne cr1,1f
> + lfd f0,8(r4)
> + fcmpu cr1,f0,f15
> + bne cr1,1f
> + lfd f0,16(r4)
> + fcmpu cr1,f0,f16
> + bne cr1,1f
> + lfd f0,24(r4)
> + fcmpu cr1,f0,f17
> + bne cr1,1f
> + lfd f0,32(r4)
> + fcmpu cr1,f0,f18
> + bne cr1,1f
> + lfd f0,40(r4)
> + fcmpu cr1,f0,f19
> + bne cr1,1f
> + lfd f0,48(r4)
> + fcmpu cr1,f0,f20
> + bne cr1,1f
> + lfd f0,56(r4)
> + fcmpu cr1,f0,f21
> + bne cr1,1f
> + lfd f0,64(r4)
> + fcmpu cr1,f0,f22
> + bne cr1,1f
> + lfd f0,72(r4)
> + fcmpu cr1,f0,f23
> + bne cr1,1f
> + lfd f0,80(r4)
> + fcmpu cr1,f0,f24
> + bne cr1,1f
> + lfd f0,88(r4)
> + fcmpu cr1,f0,f25
> + bne cr1,1f
> + lfd f0,96(r4)
> + fcmpu cr1,f0,f26
> + bne cr1,1f
> + lfd f0,104(r4)
> + fcmpu cr1,f0,f27
> + bne cr1,1f
> + lfd f0,112(r4)
> + fcmpu cr1,f0,f28
> + bne cr1,1f
> + lfd f0,120(r4)
> + fcmpu cr1,f0,f29
> + bne cr1,1f
> + lfd f0,128(r4)
> + fcmpu cr1,f0,f30
> + bne cr1,1f
> + lfd f0,136(r4)
> + fcmpu cr1,f0,f31
> + bne cr1,1f
> + li r3,0 #Sucess!!!
> +1: blr
> +
> +FUNC_START(test_fpu)
> + #r3 holds pointer to where to put the result of fork
> + #r4 holds pointer to the pid
> + #f14-f31 are non volatiles
> + PUSH_BASIC_STACK(256)
> + std r3,40(sp) #Address of darray
So, this could be:
PUSH_BASIC_STACK(256)
std r3,PPC_MIN_STACK+8(sp)
... though I wonder why there is +8 here?
- Naveen
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v4 1/9] selftests/powerpc: Test the preservation of FPU and VMX regs across syscall
2016-02-15 16:59 ` Naveen N. Rao
@ 2016-02-16 0:06 ` Cyril Bur
2016-02-16 5:02 ` Naveen N. Rao
2016-02-16 5:32 ` Michael Ellerman
0 siblings, 2 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-16 0:06 UTC (permalink / raw)
To: Naveen N. Rao; +Cc: linuxppc-dev, mikey, anton
On Mon, 15 Feb 2016 22:29:17 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> wrote:
> On 2016/02/15 04:07PM, Cyril Bur wrote:
> > Test that the non volatile floating point and Altivec registers get
> > correctly preserved across the fork() syscall.
> >
> > fork() works nicely for this purpose, the registers should be the same for
> > both parent and child
> >
> > Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> > ---
> > tools/testing/selftests/powerpc/Makefile | 3 +-
> > tools/testing/selftests/powerpc/basic_asm.h | 30 ++++
> > tools/testing/selftests/powerpc/math/.gitignore | 2 +
> > tools/testing/selftests/powerpc/math/Makefile | 14 ++
> > tools/testing/selftests/powerpc/math/fpu_asm.S | 161 +++++++++++++++++
> > tools/testing/selftests/powerpc/math/fpu_syscall.c | 90 ++++++++++
> > tools/testing/selftests/powerpc/math/vmx_asm.S | 193 +++++++++++++++++++++
> > tools/testing/selftests/powerpc/math/vmx_syscall.c | 92 ++++++++++
> > 8 files changed, 584 insertions(+), 1 deletion(-)
> > create mode 100644 tools/testing/selftests/powerpc/basic_asm.h
> > create mode 100644 tools/testing/selftests/powerpc/math/.gitignore
> > create mode 100644 tools/testing/selftests/powerpc/math/Makefile
> > create mode 100644 tools/testing/selftests/powerpc/math/fpu_asm.S
> > create mode 100644 tools/testing/selftests/powerpc/math/fpu_syscall.c
> > create mode 100644 tools/testing/selftests/powerpc/math/vmx_asm.S
> > create mode 100644 tools/testing/selftests/powerpc/math/vmx_syscall.c
> >
> > diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
> > index 0c2706b..19e8191 100644
> > --- a/tools/testing/selftests/powerpc/Makefile
> > +++ b/tools/testing/selftests/powerpc/Makefile
> > @@ -22,7 +22,8 @@ SUB_DIRS = benchmarks \
> > switch_endian \
> > syscalls \
> > tm \
> > - vphn
> > + vphn \
> > + math
> >
> > endif
> >
> > diff --git a/tools/testing/selftests/powerpc/basic_asm.h b/tools/testing/selftests/powerpc/basic_asm.h
> > new file mode 100644
> > index 0000000..f243da0
> > --- /dev/null
> > +++ b/tools/testing/selftests/powerpc/basic_asm.h
> > @@ -0,0 +1,30 @@
> > +#include <ppc-asm.h>
> > +#include <asm/unistd.h>
> > +
> > +#define LOAD_REG_IMMEDIATE(reg,expr) \
> > + lis reg,(expr)@highest; \
> > + ori reg,reg,(expr)@higher; \
> > + rldicr reg,reg,32,31; \
> > + oris reg,reg,(expr)@high; \
> > + ori reg,reg,(expr)@l;
> > +
> > +/* It is very important to note here that _extra is the extra amount of
> > + * stack space needed.
> > + * This space must be accessed at sp + 32!
>
Hi Naveen,
Thanks for the review.
> This looks to be specific to ABIv2. Is this series limited to ppc64le?
> If so, you might want to ensure this only builds there.
>
Is ABIv1 still in use? Can we still compile for v1?
This series is 64bit only, I've not really got any reason to believe this
is LE only, shouldn't this work BE? The makefile enforces 64bit, I believe it is
ok for kernel selftests to fail to compile if they aren't going to be able to
run.
> Also:
> #define PPC_ABIV2_MIN_STACK_SIZE 32
>
> or just:
> #define PPC_MIN_STACK 32
>
> ... is helpful. And, you might want to base the rest of your code that
> use PUSH_BASIC_STACK() on that. If we ever want to have these tests run
> anywhere else, that'll help a lot. (See further below)
>
So I thought about it. I agree that it would be nice, I just worry that I might
get rabbitholed, I can see it going further and then providing stack accessors
to abstract out even PPC_MIN_STACK except in a bunch of macros, and that's when
I know I've gone too far.
Perhaps I could look at adding this when I write more tests, I have grand plans
to push way more tests.
> > + */
> > +#define PUSH_BASIC_STACK(_extra) \
> > + mflr r0; \
> > + std r0,16(sp); \
> > + stdu sp,-(_extra + 32)(sp); \
> > + mfcr r0; \
> > + stw r0,8(sp); \
> > + std 2,24(sp);
> ^^
> Better to use r2 here and below.
>
I think the reason I used '2' is that 'r2' isn't actually defined in ppc-asm.h
for userspace, due to conventions, like 'sp', 'toc' has been used. So I could
have used 'toc' but then there was an issue with toc NOT being defined, or
getting undefined in some situations.
> > +
> > +#define POP_BASIC_STACK(_extra) \
> > + ld 2,24(sp); \
> > + lwz r0,8(sp); \
> > + mtcr r0; \
> > + addi sp,sp,(_extra + 32); \
> > + ld r0,16(sp); \
> > + mtlr r0;
> > +
> > diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore
> > new file mode 100644
> > index 0000000..b19b269
> > --- /dev/null
> > +++ b/tools/testing/selftests/powerpc/math/.gitignore
> > @@ -0,0 +1,2 @@
> > +fpu_syscall
> > +vmx_syscall
> > diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile
> > new file mode 100644
> > index 0000000..418bef1
> > --- /dev/null
> > +++ b/tools/testing/selftests/powerpc/math/Makefile
> > @@ -0,0 +1,14 @@
> > +TEST_PROGS := fpu_syscall vmx_syscall
> > +
> > +all: $(TEST_PROGS)
> > +
> > +$(TEST_PROGS): ../harness.c
> > +$(TEST_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec
> > +
> > +fpu_syscall: fpu_asm.S
> > +vmx_syscall: vmx_asm.S
> > +
> > +include ../../lib.mk
> > +
> > +clean:
> > + rm -f $(TEST_PROGS) *.o
> > diff --git a/tools/testing/selftests/powerpc/math/fpu_asm.S b/tools/testing/selftests/powerpc/math/fpu_asm.S
> > new file mode 100644
> > index 0000000..8733874
> > --- /dev/null
> > +++ b/tools/testing/selftests/powerpc/math/fpu_asm.S
> > @@ -0,0 +1,161 @@
> > +/*
> > + * Copyright 2015, Cyril Bur, IBM Corp.
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; either version
> > + * 2 of the License, or (at your option) any later version.
> > + */
> > +
> > +#include "../basic_asm.h"
> > +
> > +#define PUSH_FPU(pos) \
> > + stfd f14,pos(sp); \
> > + stfd f15,pos+8(sp); \
> > + stfd f16,pos+16(sp); \
> > + stfd f17,pos+24(sp); \
> > + stfd f18,pos+32(sp); \
> > + stfd f19,pos+40(sp); \
> > + stfd f20,pos+48(sp); \
> > + stfd f21,pos+56(sp); \
> > + stfd f22,pos+64(sp); \
> > + stfd f23,pos+72(sp); \
> > + stfd f24,pos+80(sp); \
> > + stfd f25,pos+88(sp); \
> > + stfd f26,pos+96(sp); \
> > + stfd f27,pos+104(sp); \
> > + stfd f28,pos+112(sp); \
> > + stfd f29,pos+120(sp); \
> > + stfd f30,pos+128(sp); \
> > + stfd f31,pos+136(sp);
> > +
> > +#define POP_FPU(pos) \
> > + lfd f14,pos(sp); \
> > + lfd f15,pos+8(sp); \
> > + lfd f16,pos+16(sp); \
> > + lfd f17,pos+24(sp); \
> > + lfd f18,pos+32(sp); \
> > + lfd f19,pos+40(sp); \
> > + lfd f20,pos+48(sp); \
> > + lfd f21,pos+56(sp); \
> > + lfd f22,pos+64(sp); \
> > + lfd f23,pos+72(sp); \
> > + lfd f24,pos+80(sp); \
> > + lfd f25,pos+88(sp); \
> > + lfd f26,pos+96(sp); \
> > + lfd f27,pos+104(sp); \
> > + lfd f28,pos+112(sp); \
> > + lfd f29,pos+120(sp); \
> > + lfd f30,pos+128(sp); \
> > + lfd f31,pos+136(sp);
> > +
> > +#Careful calling this, it will 'clobber' fpu (by design)
> > +#Don't call this from C
> > +FUNC_START(load_fpu)
> > + lfd f14,0(r3)
> > + lfd f15,8(r3)
> > + lfd f16,16(r3)
> > + lfd f17,24(r3)
> > + lfd f18,32(r3)
> > + lfd f19,40(r3)
> > + lfd f20,48(r3)
> > + lfd f21,56(r3)
> > + lfd f22,64(r3)
> > + lfd f23,72(r3)
> > + lfd f24,80(r3)
> > + lfd f25,88(r3)
> > + lfd f26,96(r3)
> > + lfd f27,104(r3)
> > + lfd f28,112(r3)
> > + lfd f29,120(r3)
> > + lfd f30,128(r3)
> > + lfd f31,136(r3)
> > + blr
> > +FUNC_END(load_fpu)
> > +
> > +FUNC_START(check_fpu)
> > + mr r4,r3
> > + li r3,1 #assume a bad result
> > + lfd f0,0(r4)
> > + fcmpu cr1,f0,f14
> > + bne cr1,1f
> > + lfd f0,8(r4)
> > + fcmpu cr1,f0,f15
> > + bne cr1,1f
> > + lfd f0,16(r4)
> > + fcmpu cr1,f0,f16
> > + bne cr1,1f
> > + lfd f0,24(r4)
> > + fcmpu cr1,f0,f17
> > + bne cr1,1f
> > + lfd f0,32(r4)
> > + fcmpu cr1,f0,f18
> > + bne cr1,1f
> > + lfd f0,40(r4)
> > + fcmpu cr1,f0,f19
> > + bne cr1,1f
> > + lfd f0,48(r4)
> > + fcmpu cr1,f0,f20
> > + bne cr1,1f
> > + lfd f0,56(r4)
> > + fcmpu cr1,f0,f21
> > + bne cr1,1f
> > + lfd f0,64(r4)
> > + fcmpu cr1,f0,f22
> > + bne cr1,1f
> > + lfd f0,72(r4)
> > + fcmpu cr1,f0,f23
> > + bne cr1,1f
> > + lfd f0,80(r4)
> > + fcmpu cr1,f0,f24
> > + bne cr1,1f
> > + lfd f0,88(r4)
> > + fcmpu cr1,f0,f25
> > + bne cr1,1f
> > + lfd f0,96(r4)
> > + fcmpu cr1,f0,f26
> > + bne cr1,1f
> > + lfd f0,104(r4)
> > + fcmpu cr1,f0,f27
> > + bne cr1,1f
> > + lfd f0,112(r4)
> > + fcmpu cr1,f0,f28
> > + bne cr1,1f
> > + lfd f0,120(r4)
> > + fcmpu cr1,f0,f29
> > + bne cr1,1f
> > + lfd f0,128(r4)
> > + fcmpu cr1,f0,f30
> > + bne cr1,1f
> > + lfd f0,136(r4)
> > + fcmpu cr1,f0,f31
> > + bne cr1,1f
> > + li r3,0 #Sucess!!!
> > +1: blr
> > +
> > +FUNC_START(test_fpu)
> > + #r3 holds pointer to where to put the result of fork
> > + #r4 holds pointer to the pid
> > + #f14-f31 are non volatiles
> > + PUSH_BASIC_STACK(256)
> > + std r3,40(sp) #Address of darray
>
> So, this could be:
> PUSH_BASIC_STACK(256)
> std r3,PPC_MIN_STACK+8(sp)
>
> ... though I wonder why there is +8 here?
>
I think the +8 is left over from my using +0 for something else and then not
going back and being all neat about stack usage. Admittedly I didn't
look over that too hard it being a selftest and all, I'm not sure optimal
stack usage is super important here.
Thanks,
Cyril
>
> - Naveen
>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v4 1/9] selftests/powerpc: Test the preservation of FPU and VMX regs across syscall
2016-02-16 0:06 ` Cyril Bur
@ 2016-02-16 5:02 ` Naveen N. Rao
2016-02-16 5:32 ` Michael Ellerman
1 sibling, 0 replies; 14+ messages in thread
From: Naveen N. Rao @ 2016-02-16 5:02 UTC (permalink / raw)
To: Cyril Bur; +Cc: linuxppc-dev, mikey, anton
On 2016/02/16 11:06AM, Cyril Bur wrote:
> On Mon, 15 Feb 2016 22:29:17 +0530
> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> wrote:
>
> > On 2016/02/15 04:07PM, Cyril Bur wrote:
> > > Test that the non volatile floating point and Altivec registers get
> > > correctly preserved across the fork() syscall.
> > >
> > > fork() works nicely for this purpose, the registers should be the same for
> > > both parent and child
> > >
> > > Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> > > ---
<snip>
> > > +
> > > +/* It is very important to note here that _extra is the extra amount of
> > > + * stack space needed.
> > > + * This space must be accessed at sp + 32!
> >
>
> Hi Naveen,
>
> Thanks for the review.
>
> > This looks to be specific to ABIv2. Is this series limited to ppc64le?
> > If so, you might want to ensure this only builds there.
> >
>
> Is ABIv1 still in use? Can we still compile for v1?
Yes, that's the earlier ppc64 BE (I'm assuming these tests can be run
when booted in LPARs as well)
>
> This is for series 64bit only, I've not really got any reason to believe this
> is LE only, shouldn't this work BE? The makefile enforces 64bit, I believe it is
This won't work for ABIv1 BE since the stack setup is a bit different. I
think your patches assume that 32 bytes is the minimum stack size, but
that's only for ABIv2. Also, the locations of CR and TOC save areas on
the stack are quite different:
http://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html#STACK
> ok for kernel selftests to fail to compile if they aren't going to be able to
> run.
>
> > Also:
> > #define PPC_ABIV2_MIN_STACK_SIZE 32
> >
> > or just:
> > #define PPC_MIN_STACK 32
> >
> > ... is helpful. And, you might want to base the rest of your code that
> > use PUSH_BASIC_STACK() on that. If we ever want to have these tests run
> > anywhere else, that'll help a lot. (See further below)
> >
>
> So I thought about it. I agree that it would be nice, I just worry that I might
> get rabbitholed, I can see it going further and then providing stack accessors
> to abstract out even PPC_MIN_STACK except in a bunch of macros, and that's when
> I know I've gone too far.
>
> Perhaps I could look at adding this when I write more tests, I have grand plans
> to push way more tests.
Sure - just that if you ever intend to have these for ABIv1, it will be
way easier to put together macros now rather than later.
- Naveen
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v4 1/9] selftests/powerpc: Test the preservation of FPU and VMX regs across syscall
2016-02-16 0:06 ` Cyril Bur
2016-02-16 5:02 ` Naveen N. Rao
@ 2016-02-16 5:32 ` Michael Ellerman
1 sibling, 0 replies; 14+ messages in thread
From: Michael Ellerman @ 2016-02-16 5:32 UTC (permalink / raw)
To: Cyril Bur, Naveen N. Rao; +Cc: linuxppc-dev, mikey, anton
On Tue, 2016-02-16 at 11:06 +1100, Cyril Bur wrote:
> On Mon, 15 Feb 2016 22:29:17 +0530
> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> wrote:
>
> > On 2016/02/15 04:07PM, Cyril Bur wrote:
> > > Test that the non volatile floating point and Altivec registers get
> > > correctly preserved across the fork() syscall.
> > >
> > > fork() works nicely for this purpose, the registers should be the same for
> > > both parent and child
> > >
> > > diff --git a/tools/testing/selftests/powerpc/basic_asm.h b/tools/testing/selftests/powerpc/basic_asm.h
> > > new file mode 100644
> > > index 0000000..f243da0
> > > --- /dev/null
> > > +++ b/tools/testing/selftests/powerpc/basic_asm.h
> > > @@ -0,0 +1,30 @@
> > > +#include <ppc-asm.h>
> > > +#include <asm/unistd.h>
> > > +
> > > +#define LOAD_REG_IMMEDIATE(reg,expr) \
> > > + lis reg,(expr)@highest; \
> > > + ori reg,reg,(expr)@higher; \
> > > + rldicr reg,reg,32,31; \
> > > + oris reg,reg,(expr)@high; \
> > > + ori reg,reg,(expr)@l;
> > > +
> > > +/* It is very important to note here that _extra is the extra amount of
> > > + * stack space needed.
> > > + * This space must be accessed at sp + 32!
> >
>
> Hi Naveen,
>
> Thanks for the review.
> > This looks to be specific to ABIv2. Is this series limited to ppc64le?
> > If so, you might want to ensure this only builds there.
> >
>
> Is ABIv1 still in use? Can we still compile for v1?
YES! >:E
> This is for series 64bit only, I've not really got any reason to believe this
> is LE only, shouldn't this work BE? The makefile enforces 64bit, I believe it is
> ok for kernel selftests to fail to compile if they aren't going to be able to
> run.
> > Also:
> > #define PPC_ABIV2_MIN_STACK_SIZE 32
> >
> > or just:
> > #define PPC_MIN_STACK 32
> >
> > ... is helpful. And, you might want to base the rest of your code that
> > use PUSH_BASIC_STACK() on that. If we ever want to have these tests run
> > anywhere else, that'll help a lot. (See further below)
> >
>
> So I thought about it. I agree that it would be nice, I just worry that I might
> get rabbitholed, I can see it going further and then providing stack accessors
> to abstract out even PPC_MIN_STACK except in a bunch of macros, and that's when
> I know I've gone too far.
>
> Perhaps I could look at adding this when I write more tests, I have grand plans
> to push way more tests.
You definitely need a #define for the minimum stack frame size, based on the
ABI version. You can basically do what the kernel does for STACK_FRAME_MIN_SIZE.
You also need to cope with the TOC save slot moving between ABIv1 & 2, which
shouldn't be hard with a macro for it.
> > > + */
> > > +#define PUSH_BASIC_STACK(_extra) \
> > > + mflr r0; \
> > > + std r0,16(sp); \
> > > + stdu sp,-(_extra + 32)(sp); \
> > > + mfcr r0; \
> > > + stw r0,8(sp); \
> > > + std 2,24(sp);
> > ^^
> > Better to use r2 here and below.
> >
>
> I think the reason I used '2' is that 'r2' isn't actually defined in ppc-asm.h
> for userspace, due to conventions, like 'sp', 'toc' has been used. So I could
> have used 'toc' but then there was an issue with toc NOT being defined, or
> getting undefined in some situations.
That's true, ppc-asm.h doesn't define r2, instead it defines toc.
But you can always use %r2, which is preferable to 2 IMHO.
Personally I'd rather you use %r1 than sp, but I won't make you. As someone who
has read lots of powerpc assembler %r1 translates as "stack pointer" where as
"sp" translates as "huh?".
> > > +FUNC_START(test_fpu)
> > > + #r3 holds pointer to where to put the result of fork
> > > + #r4 holds pointer to the pid
> > > + #f14-f31 are non volatiles
> > > + PUSH_BASIC_STACK(256)
> > > + std r3,40(sp) #Address of darray
> >
> > So, this could be:
> > PUSH_BASIC_STACK(256)
> > std r3,PPC_MIN_STACK+8(sp)
> >
> > ... though I wonder why there is +8 here?
>
> I think the +8 is left over from my using +0 for something else and then not
> going back and being all neat about stack usage. Admittedly I didn't
> look over that too hard, it being a selftest and all, I'm not sure optimal
> stack usage is super important here.
The first free slot is at PPC_MIN_STACK(%r1), so that's what you should use.
cheers
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v4 2/9] selftests/powerpc: Test preservation of FPU and VMX regs across preemption
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
2016-02-15 5:07 ` [PATCH v4 1/9] selftests/powerpc: Test the preservation of FPU and VMX regs across syscall Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
2016-02-15 5:07 ` [PATCH v4 3/9] selftests/powerpc: Test FPU and VMX regs in signal ucontext Cyril Bur
` (6 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
Loop in assembly checking the registers with many threads.
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
tools/testing/selftests/powerpc/math/.gitignore | 2 +
tools/testing/selftests/powerpc/math/Makefile | 5 +-
tools/testing/selftests/powerpc/math/fpu_asm.S | 34 +++++++
tools/testing/selftests/powerpc/math/fpu_preempt.c | 113 +++++++++++++++++++++
tools/testing/selftests/powerpc/math/vmx_asm.S | 44 +++++++-
tools/testing/selftests/powerpc/math/vmx_preempt.c | 113 +++++++++++++++++++++
6 files changed, 306 insertions(+), 5 deletions(-)
create mode 100644 tools/testing/selftests/powerpc/math/fpu_preempt.c
create mode 100644 tools/testing/selftests/powerpc/math/vmx_preempt.c
diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore
index b19b269..1a6f09e 100644
--- a/tools/testing/selftests/powerpc/math/.gitignore
+++ b/tools/testing/selftests/powerpc/math/.gitignore
@@ -1,2 +1,4 @@
fpu_syscall
vmx_syscall
+fpu_preempt
+vmx_preempt
diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile
index 418bef1..b6f4158 100644
--- a/tools/testing/selftests/powerpc/math/Makefile
+++ b/tools/testing/selftests/powerpc/math/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := fpu_syscall vmx_syscall
+TEST_PROGS := fpu_syscall fpu_preempt vmx_syscall vmx_preempt
all: $(TEST_PROGS)
@@ -6,7 +6,10 @@ $(TEST_PROGS): ../harness.c
$(TEST_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec
fpu_syscall: fpu_asm.S
+fpu_preempt: fpu_asm.S
+
vmx_syscall: vmx_asm.S
+vmx_preempt: vmx_asm.S
include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/math/fpu_asm.S b/tools/testing/selftests/powerpc/math/fpu_asm.S
index 8733874..46bbe99 100644
--- a/tools/testing/selftests/powerpc/math/fpu_asm.S
+++ b/tools/testing/selftests/powerpc/math/fpu_asm.S
@@ -159,3 +159,37 @@ FUNC_START(test_fpu)
POP_BASIC_STACK(256)
blr
FUNC_END(test_fpu)
+
+#int preempt_fpu(double *darray, int *threads_running, int *running)
+#On starting will (atomically) decrement not_ready as a signal that the FPU
+#has been loaded with darray. Will proceed to check the validity of the FPU
+#registers while running is not zero.
+FUNC_START(preempt_fpu)
+ PUSH_BASIC_STACK(256)
+ std r3,32(sp) #double *darray
+ std r4,40(sp) #volatile int *not_ready
+ std r5,48(sp) #int *running
+ PUSH_FPU(56)
+
+ bl load_fpu
+
+ #Atomic DEC
+ ld r3,40(sp)
+1: lwarx r4,0,r3
+ addi r4,r4,-1
+ stwcx. r4,0,r3
+ bne- 1b
+
+2: ld r3, 32(sp)
+ bl check_fpu
+ cmpdi r3,0
+ bne 3f
+ ld r4, 48(sp)
+ ld r5, 0(r4)
+ cmpwi r5,0
+ bne 2b
+
+3: POP_FPU(56)
+ POP_BASIC_STACK(256)
+ blr
+FUNC_END(preempt_fpu)
diff --git a/tools/testing/selftests/powerpc/math/fpu_preempt.c b/tools/testing/selftests/powerpc/math/fpu_preempt.c
new file mode 100644
index 0000000..0f85b79
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_preempt.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the FPU registers change across preemption.
+ * Two things should be noted here a) The check_fpu function in asm only checks
+ * the non volatile registers as it is reused from the syscall test b) There is
+ * no way to be sure preemption happened so this test just uses many threads
+ * and a long wait. As such, a successful test doesn't mean much but a failure
+ * is bad.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Time to wait for workers to get preempted (seconds) */
+#define PREEMPT_TIME 20
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+
+__thread double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+ 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+ 2.1};
+
+int threads_starting;
+int running;
+
+extern void preempt_fpu(double *darray, int *threads_starting, int *running);
+
+void *preempt_fpu_c(void *p)
+{
+ int i;
+ srand(pthread_self());
+ for (i = 0; i < 21; i++)
+ darray[i] = rand();
+
+ /* Test failed if it ever returns */
+ preempt_fpu(darray, &threads_starting, &running);
+
+ return p;
+}
+
+int test_preempt_fpu(void)
+{
+ int i, rc, threads;
+ pthread_t *tids;
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc((threads) * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ threads_starting = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, preempt_fpu_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ /* Not really necessary but nice to wait for every thread to start */
+ printf("\tWaiting for all workers to start...");
+ while(threads_starting)
+ asm volatile("": : :"memory");
+ printf("done\n");
+
+ printf("\tWaiting for %d seconds to let some workers get preempted...", PREEMPT_TIME);
+ sleep(PREEMPT_TIME);
+ printf("done\n");
+
+ printf("\tStopping workers...");
+ /*
+ * Workers are checking this value every loop. In preempt_fpu 'cmpwi r5,0; bne 2b'.
+ * r5 will have loaded the value of running.
+ */
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ void *rc_p;
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why preempt_fpu
+ * returned
+ */
+ if ((long) rc_p)
+ printf("oops\n");
+ FAIL_IF((long) rc_p);
+ }
+ printf("done\n");
+
+ free(tids);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_preempt_fpu, "fpu_preempt");
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_asm.S b/tools/testing/selftests/powerpc/math/vmx_asm.S
index 9ed32e7..416b5b0 100644
--- a/tools/testing/selftests/powerpc/math/vmx_asm.S
+++ b/tools/testing/selftests/powerpc/math/vmx_asm.S
@@ -9,6 +9,7 @@
#include "../basic_asm.h"
+#POS MUST BE 16 ALIGNED!
#define PUSH_VMX(pos,reg) \
li reg,pos; \
stvx v20,reg,sp; \
@@ -35,6 +36,7 @@
addi reg,reg,16; \
stvx v31,reg,sp;
+#POS MUST BE 16 ALIGNED!
#define POP_VMX(pos,reg) \
li reg,pos; \
lvx v20,reg,sp; \
@@ -93,7 +95,7 @@ FUNC_END(load_vmx)
#Should be safe from C, only touches r4, r5 and v0,v1,v2
FUNC_START(check_vmx)
- PUSH_BASIC_STACK(16)
+ PUSH_BASIC_STACK(32)
mr r4,r3
li r3,1 #assume a bad result
li r5,0
@@ -162,7 +164,7 @@ FUNC_START(check_vmx)
cmpdi r0,0xffffffff
bne 1f
li r3,0
-1: POP_BASIC_STACK(16)
+1: POP_BASIC_STACK(32)
blr
FUNC_END(check_vmx)
@@ -174,7 +176,7 @@ FUNC_START(test_vmx)
PUSH_BASIC_STACK(512)
std r3,40(sp) #Address of varray
std r4,48(sp) #address of pid
- PUSH_VMX(56, r4)
+ PUSH_VMX(64, r4)
bl load_vmx
@@ -187,7 +189,41 @@ FUNC_START(test_vmx)
ld r3,40(sp)
bl check_vmx
- POP_VMX(56,r4)
+ POP_VMX(64,r4)
POP_BASIC_STACK(512)
blr
FUNC_END(test_vmx)
+
+#int preempt_vmx(v4si *varray, int *threads_starting, int *running)
+#On starting will (atomically) decrement not_ready as a signal that the VMX
+#has been loaded with varray. Will proceed to check the validity of the VMX
+#registers while running is not zero.
+FUNC_START(preempt_vmx)
+ PUSH_BASIC_STACK(512)
+ std r3,32(sp) #v4si *varray
+ std r4,40(sp) #volatile int *not_ready
+ std r5,48(sp) #int *running
+ PUSH_VMX(64,r4)
+
+ bl load_vmx
+
+ #Atomic DEC
+ ld r3,40(sp)
+1: lwarx r4,0,r3
+ addi r4,r4,-1
+ stwcx. r4,0,r3
+ bne- 1b
+
+2: ld r3,32(sp)
+ bl check_vmx
+ cmpdi r3,0
+ bne 3f
+ ld r4,48(sp)
+ ld r5,0(r4)
+ cmpwi r5,0
+ bne 2b
+
+3: POP_VMX(64,r4)
+ POP_BASIC_STACK(512)
+ blr
+FUNC_END(preempt_vmx)
diff --git a/tools/testing/selftests/powerpc/math/vmx_preempt.c b/tools/testing/selftests/powerpc/math/vmx_preempt.c
new file mode 100644
index 0000000..9474e78
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_preempt.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the VMX registers change across preemption.
+ * Two things should be noted here a) The check_vmx function in asm only checks
+ * the non volatile registers as it is reused from the syscall test b) There is
+ * no way to be sure preemption happened so this test just uses many threads
+ * and a long wait. As such, a successful test doesn't mean much but a failure
+ * is bad.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Time to wait for workers to get preempted (seconds) */
+#define PREEMPT_TIME 20
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+typedef int v4si __attribute__ ((vector_size (16)));
+__thread v4si varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+ {13,14,15,16},{17,18,19,20},{21,22,23,24},
+ {25,26,27,28},{29,30,31,32},{33,34,35,36},
+ {37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+int threads_starting;
+int running;
+
+extern void preempt_vmx(v4si *varray, int *threads_starting, int *running);
+
+void *preempt_vmx_c(void *p)
+{
+ int i, j;
+ srand(pthread_self());
+ for (i = 0; i < 12; i++)
+ for (j = 0; j < 4; j++)
+ varray[i][j] = rand();
+
+ /* Test fails if it ever returns */
+ preempt_vmx(varray, &threads_starting, &running);
+ return p;
+}
+
+int test_preempt_vmx(void)
+{
+ int i, rc, threads;
+ pthread_t *tids;
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc(threads * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ threads_starting = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, preempt_vmx_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ /* Not really necessary but nice to wait for every thread to start */
+ printf("\tWaiting for all workers to start...");
+ while(threads_starting)
+ asm volatile("": : :"memory");
+ printf("done\n");
+
+ printf("\tWaiting for %d seconds to let some workers get preempted...", PREEMPT_TIME);
+ sleep(PREEMPT_TIME);
+ printf("done\n");
+
+ printf("\tStopping workers...");
+ /*
+ * Workers are checking this value every loop. In preempt_vmx 'cmpwi r5,0; bne 2b'.
+ * r5 will have loaded the value of running.
+ */
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ void *rc_p;
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why preempt_vmx
+ * returned
+ */
+ if ((long) rc_p)
+ printf("oops\n");
+ FAIL_IF((long) rc_p);
+ }
+ printf("done\n");
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_preempt_vmx, "vmx_preempt");
+}
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 3/9] selftests/powerpc: Test FPU and VMX regs in signal ucontext
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
2016-02-15 5:07 ` [PATCH v4 1/9] selftests/powerpc: Test the preservation of FPU and VMX regs across syscall Cyril Bur
2016-02-15 5:07 ` [PATCH v4 2/9] selftests/powerpc: Test preservation of FPU and VMX regs across preemption Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
2016-02-15 5:07 ` [PATCH v4 4/9] powerpc: Explicitly disable math features when copying thread Cyril Bur
` (5 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
Load up the non volatile FPU and VMX regs and ensure that they are the
expected value in a signal handler
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
tools/testing/selftests/powerpc/math/.gitignore | 2 +
tools/testing/selftests/powerpc/math/Makefile | 4 +-
tools/testing/selftests/powerpc/math/fpu_signal.c | 135 +++++++++++++++++++++
tools/testing/selftests/powerpc/math/vmx_signal.c | 138 ++++++++++++++++++++++
4 files changed, 278 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/powerpc/math/fpu_signal.c
create mode 100644 tools/testing/selftests/powerpc/math/vmx_signal.c
diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore
index 1a6f09e..4fe13a4 100644
--- a/tools/testing/selftests/powerpc/math/.gitignore
+++ b/tools/testing/selftests/powerpc/math/.gitignore
@@ -2,3 +2,5 @@ fpu_syscall
vmx_syscall
fpu_preempt
vmx_preempt
+fpu_signal
+vmx_signal
diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile
index b6f4158..5b88875 100644
--- a/tools/testing/selftests/powerpc/math/Makefile
+++ b/tools/testing/selftests/powerpc/math/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := fpu_syscall fpu_preempt vmx_syscall vmx_preempt
+TEST_PROGS := fpu_syscall fpu_preempt fpu_signal vmx_syscall vmx_preempt vmx_signal
all: $(TEST_PROGS)
@@ -7,9 +7,11 @@ $(TEST_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec
fpu_syscall: fpu_asm.S
fpu_preempt: fpu_asm.S
+fpu_signal: fpu_asm.S
vmx_syscall: vmx_asm.S
vmx_preempt: vmx_asm.S
+vmx_signal: vmx_asm.S
include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/math/fpu_signal.c b/tools/testing/selftests/powerpc/math/fpu_signal.c
new file mode 100644
index 0000000..888aa51
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_signal.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the FPU registers are correctly reported in a
+ * signal context. Each worker just spins checking its FPU registers, at some
+ * point a signal will interrupt it and C code will check the signal context
+ * ensuring it is also the same.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Number of times each thread should receive the signal */
+#define ITERATIONS 10
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+__thread double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+ 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+ 2.1};
+
+bool bad_context;
+int threads_starting;
+int running;
+
+extern long preempt_fpu(double *darray, int *threads_starting, int *running);
+
+void signal_fpu_sig(int sig, siginfo_t *info, void *context)
+{
+ int i;
+ ucontext_t *uc = context;
+ mcontext_t *mc = &uc->uc_mcontext;
+
+ /* Only the non volatiles were loaded up */
+ for (i = 14; i < 32; i++) {
+ if (mc->fp_regs[i] != darray[i - 14]) {
+ bad_context = true;
+ break;
+ }
+ }
+}
+
+void *signal_fpu_c(void *p)
+{
+ int i;
+ long rc;
+ struct sigaction act;
+ act.sa_sigaction = signal_fpu_sig;
+ act.sa_flags = SA_SIGINFO;
+ rc = sigaction(SIGUSR1, &act, NULL);
+ if (rc)
+ return p;
+
+ srand(pthread_self());
+ for (i = 0; i < 21; i++)
+ darray[i] = rand();
+
+ rc = preempt_fpu(darray, &threads_starting, &running);
+
+ return (void *) rc;
+}
+
+int test_signal_fpu(void)
+{
+ int i, j, rc, threads;
+ void *rc_p;
+ pthread_t *tids;
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc(threads * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ threads_starting = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, signal_fpu_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ printf("\tWaiting for all workers to start...");
+ while (threads_starting)
+ asm volatile("": : :"memory");
+ printf("done\n");
+
+ printf("\tSending signals to all threads %d times...", ITERATIONS);
+ for (i = 0; i < ITERATIONS; i++) {
+ for (j = 0; j < threads; j++) {
+ pthread_kill(tids[j], SIGUSR1);
+ }
+ sleep(1);
+ }
+ printf("done\n");
+
+ printf("\tStopping workers...");
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why signal_fpu
+ * returned
+ */
+ if ((long) rc_p || bad_context)
+ printf("oops\n");
+ if (bad_context)
+ fprintf(stderr, "\t!! bad_context is true\n");
+ FAIL_IF((long) rc_p || bad_context);
+ }
+ printf("done\n");
+
+ free(tids);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_signal_fpu, "fpu_signal");
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_signal.c b/tools/testing/selftests/powerpc/math/vmx_signal.c
new file mode 100644
index 0000000..093af5a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_signal.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the VMX registers are correctly reported in a
+ * signal context. Each worker just spins checking its VMX registers, at some
+ * point a signal will interrupt it and C code will check the signal context
+ * ensuring it is also the same.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Number of times each thread should receive the signal */
+#define ITERATIONS 10
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+typedef int v4si __attribute__ ((vector_size (16)));
+__thread v4si varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+ {13,14,15,16},{17,18,19,20},{21,22,23,24},
+ {25,26,27,28},{29,30,31,32},{33,34,35,36},
+ {37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+bool bad_context;
+int running;
+int threads_starting;
+
+extern int preempt_vmx(v4si *varray, volatile int *not_ready, int *sentinal);
+
+void signal_vmx_sig(int sig, siginfo_t *info, void *context)
+{
+ int i;
+ ucontext_t *uc = context;
+ mcontext_t *mc = &uc->uc_mcontext;
+
+ /* Only the non volatiles were loaded up */
+ for (i = 20; i < 32; i++) {
+ if (memcmp(mc->v_regs->vrregs[i], &varray[i - 20], 16)) {
+ bad_context = true;
+ break;
+ }
+ }
+}
+
+void *signal_vmx_c(void *p)
+{
+ int i, j;
+ long rc;
+ struct sigaction act;
+ act.sa_sigaction = signal_vmx_sig;
+ act.sa_flags = SA_SIGINFO;
+ rc = sigaction(SIGUSR1, &act, NULL);
+ if (rc)
+ return p;
+
+ srand(pthread_self());
+ for (i = 0; i < 12; i++)
+ for (j = 0; j < 4; j++)
+ varray[i][j] = rand();
+
+ rc = preempt_vmx(varray, &not_ready, &running);
+
+ return (void *) rc;
+}
+
+int test_signal_vmx(void)
+{
+ int i, j, rc, threads;
+ void *rc_p;
+ pthread_t *tids;
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc(threads * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ not_ready = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, signal_vmx_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ printf("\tWaiting for all workers to start...");
+ while (not_ready);
+ printf("done\n");
+
+ printf("\tSending signals to all threads %d times...", ITERATIONS);
+ for (i = 0; i < ITERATIONS; i++) {
+ for (j = 0; j < threads; j++) {
+ pthread_kill(tids[j], SIGUSR1);
+ }
+ sleep(1);
+ }
+ printf("done\n");
+
+ printf("\tKilling workers...");
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why signal_vmx
+ * returned
+ */
+ if ((long) rc_p || bad_context)
+ printf("oops\n");
+ if (bad_context)
+ fprintf(stderr, "\t!! bad_context is true\n");
+ FAIL_IF((long) rc_p || bad_context);
+ }
+ printf("done\n");
+
+ free(tids);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_signal_vmx, "vmx_signal");
+}
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 4/9] powerpc: Explicitly disable math features when copying thread
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
` (2 preceding siblings ...)
2016-02-15 5:07 ` [PATCH v4 3/9] selftests/powerpc: Test FPU and VMX regs in signal ucontext Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
2016-02-15 5:07 ` [PATCH v4 5/9] powerpc: Restore FPU/VEC/VSX if previously used Cyril Bur
` (4 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
Currently when threads get scheduled off they always giveup the FPU,
Altivec (VMX) and Vector (VSX) units if they were using them. When they are
scheduled back on a fault is then taken to enable each facility and load
registers. As a result explicitly disabling FPU/VMX/VSX has not been
necessary.
Future changes and optimisations remove this mandatory giveup and fault
which could cause calls such as clone() and fork() to copy threads and run
them later with FPU/VMX/VSX enabled but no registers loaded.
This patch starts the process of having MSR_{FP,VEC,VSX} mean that a
thread's registers are hot while not having MSR_{FP,VEC,VSX} means that the
registers must be loaded. This allows for a smarter return to userspace.
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
arch/powerpc/kernel/process.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dccc87e..e0c3d2d 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1307,6 +1307,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
f = ret_from_fork;
}
+ childregs->msr &= ~(MSR_FP|MSR_VEC|MSR_VSX);
sp -= STACK_FRAME_OVERHEAD;
/*
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 5/9] powerpc: Restore FPU/VEC/VSX if previously used
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
` (3 preceding siblings ...)
2016-02-15 5:07 ` [PATCH v4 4/9] powerpc: Explicitly disable math features when copying thread Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
2016-02-15 5:07 ` [PATCH v4 6/9] powerpc: Prepare for splitting giveup_{fpu, altivec, vsx} in two Cyril Bur
` (3 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
Currently the FPU, VEC and VSX facilities are lazily loaded. This is not a
problem unless a process is using these facilities.
Modern versions of GCC are very good at automatically vectorising code, new
and modernised workloads make use of floating point and vector facilities,
even the kernel makes use of vectorised memcpy.
All this combined greatly increases the cost of a syscall since the kernel
uses the facilities sometimes even in syscall fast-path making it
increasingly common for a thread to take an *_unavailable exception soon
after a syscall, not to mention potentially taking all three.
The obvious overcompensation to this problem is to simply always load all
the facilities on every exit to userspace. Loading up all FPU, VEC and VSX
registers every time can be expensive and if a workload does avoid using
them, it should not be forced to incur this penalty.
An 8bit counter is used to detect if the registers have been used in the
past and the registers are always loaded until the value wraps back to
zero.
Several versions of the assembly in entry_64.S were tried: 1. Always calling C,
2. Performing a common case check and then calling C and 3. A complex check in
asm. After some benchmarking it was determined that avoiding C in the
common case is a performance benefit. The full check in asm greatly
complicated that codepath for a negligible performance gain and the
trade-off was deemed not worth it.
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
arch/powerpc/include/asm/processor.h | 2 +
arch/powerpc/kernel/asm-offsets.c | 2 +
arch/powerpc/kernel/entry_64.S | 21 +++++++--
arch/powerpc/kernel/fpu.S | 4 ++
arch/powerpc/kernel/process.c | 88 +++++++++++++++++++++++++++++++-----
arch/powerpc/kernel/vector.S | 4 ++
6 files changed, 107 insertions(+), 14 deletions(-)
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index ac23308..dcab21f 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -236,11 +236,13 @@ struct thread_struct {
#endif
struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */
unsigned long trap_nr; /* last trap # on this thread */
+ u8 load_fp;
#ifdef CONFIG_ALTIVEC
struct thread_vr_state vr_state;
struct thread_vr_state *vr_save_area;
unsigned long vrsave;
int used_vr; /* set if process has used altivec */
+ u8 load_vec;
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
/* VSR status */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 07cebc3..10d5eab 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -95,12 +95,14 @@ int main(void)
DEFINE(THREAD_FPSTATE, offsetof(struct thread_struct, fp_state));
DEFINE(THREAD_FPSAVEAREA, offsetof(struct thread_struct, fp_save_area));
DEFINE(FPSTATE_FPSCR, offsetof(struct thread_fp_state, fpscr));
+ DEFINE(THREAD_LOAD_FP, offsetof(struct thread_struct, load_fp));
#ifdef CONFIG_ALTIVEC
DEFINE(THREAD_VRSTATE, offsetof(struct thread_struct, vr_state));
DEFINE(THREAD_VRSAVEAREA, offsetof(struct thread_struct, vr_save_area));
DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave));
DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr));
DEFINE(VRSTATE_VSCR, offsetof(struct thread_vr_state, vscr));
+ DEFINE(THREAD_LOAD_VEC, offsetof(struct thread_struct, load_vec));
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
DEFINE(THREAD_USED_VSR, offsetof(struct thread_struct, used_vsr));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 0d525ce..038e0a1 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -210,7 +210,20 @@ system_call: /* label this so stack traces look sane */
li r11,-MAX_ERRNO
andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
bne- syscall_exit_work
- cmpld r3,r11
+
+ andi. r0,r8,MSR_FP
+ beq 2f
+#ifdef CONFIG_ALTIVEC
+ andis. r0,r8,MSR_VEC@h
+ bne 3f
+#endif
+2: addi r3,r1,STACK_FRAME_OVERHEAD
+ bl restore_math
+ ld r8,_MSR(r1)
+ ld r3,RESULT(r1)
+ li r11,-MAX_ERRNO
+
+3: cmpld r3,r11
ld r5,_CCR(r1)
bge- syscall_error
.Lsyscall_error_cont:
@@ -602,8 +615,8 @@ _GLOBAL(ret_from_except_lite)
/* Check current_thread_info()->flags */
andi. r0,r4,_TIF_USER_WORK_MASK
-#ifdef CONFIG_PPC_BOOK3E
bne 1f
+#ifdef CONFIG_PPC_BOOK3E
/*
* Check to see if the dbcr0 register is set up to debug.
* Use the internal debug mode bit to do this.
@@ -618,7 +631,9 @@ _GLOBAL(ret_from_except_lite)
mtspr SPRN_DBSR,r10
b restore
#else
- beq restore
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl restore_math
+ b restore
#endif
1: andi. r0,r4,_TIF_NEED_RESCHED
beq 2f
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 2117eac..b063524 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -130,6 +130,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
or r12,r12,r4
std r12,_MSR(r1)
#endif
+ /* Don't care if r4 overflows, this is desired behaviour */
+ lbz r4,THREAD_LOAD_FP(r5)
+ addi r4,r4,1
+ stb r4,THREAD_LOAD_FP(r5)
addi r10,r5,THREAD_FPSTATE
lfd fr0,FPSTATE_FPSCR(r10)
MTFSF_L(fr0)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e0c3d2d..55c1eb0 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -187,9 +187,22 @@ void enable_kernel_fp(void)
}
}
EXPORT_SYMBOL(enable_kernel_fp);
+
+static int restore_fp(struct task_struct *tsk) {
+ if (tsk->thread.load_fp) {
+ load_fp_state(&current->thread.fp_state);
+ current->thread.load_fp++;
+ return 1;
+ }
+ return 0;
+}
+#else
+static int restore_fp(struct task_struct *tsk) { return 0; }
#endif /* CONFIG_PPC_FPU */
#ifdef CONFIG_ALTIVEC
+#define loadvec(thr) ((thr).load_vec)
+
void giveup_altivec(struct task_struct *tsk)
{
check_if_tm_restore_required(tsk);
@@ -229,6 +242,21 @@ void flush_altivec_to_thread(struct task_struct *tsk)
}
}
EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
+
+static int restore_altivec(struct task_struct *tsk)
+{
+ if (cpu_has_feature(CPU_FTR_ALTIVEC) && tsk->thread.load_vec) {
+ load_vr_state(&tsk->thread.vr_state);
+ tsk->thread.used_vr = 1;
+ tsk->thread.load_vec++;
+
+ return 1;
+ }
+ return 0;
+}
+#else
+#define loadvec(thr) 0
+static inline int restore_altivec(struct task_struct *tsk) { return 0; }
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
@@ -275,6 +303,18 @@ void flush_vsx_to_thread(struct task_struct *tsk)
}
}
EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
+
+static int restore_vsx(struct task_struct *tsk)
+{
+ if (cpu_has_feature(CPU_FTR_VSX)) {
+ tsk->thread.used_vsr = 1;
+ return 1;
+ }
+
+ return 0;
+}
+#else
+static inline int restore_vsx(struct task_struct *tsk) { return 0; }
#endif /* CONFIG_VSX */
#ifdef CONFIG_SPE
@@ -374,6 +414,36 @@ void giveup_all(struct task_struct *tsk)
}
EXPORT_SYMBOL(giveup_all);
+void restore_math(struct pt_regs *regs)
+{
+ unsigned long msr;
+
+ if (!current->thread.load_fp && !loadvec(current->thread))
+ return;
+
+ msr = regs->msr;
+ msr_check_and_set(msr_all_available);
+
+ /*
+ * Only reload if the bit is not set in the user MSR, the bit BEING set
+ * indicates that the registers are hot
+ */
+ if ((!(msr & MSR_FP)) && restore_fp(current))
+ msr |= MSR_FP | current->thread.fpexc_mode;
+
+ if ((!(msr & MSR_VEC)) && restore_altivec(current))
+ msr |= MSR_VEC;
+
+ if ((msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC) &&
+ restore_vsx(current)) {
+ msr |= MSR_VSX;
+ }
+
+ msr_check_and_clear(msr_all_available);
+
+ regs->msr = msr;
+}
+
void flush_all_to_thread(struct task_struct *tsk)
{
if (tsk->thread.regs) {
@@ -832,17 +902,9 @@ void restore_tm_state(struct pt_regs *regs)
msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
- if (msr_diff & MSR_FP) {
- msr_check_and_set(MSR_FP);
- load_fp_state(&current->thread.fp_state);
- msr_check_and_clear(MSR_FP);
- regs->msr |= current->thread.fpexc_mode;
- }
- if (msr_diff & MSR_VEC) {
- msr_check_and_set(MSR_VEC);
- load_vr_state(&current->thread.vr_state);
- msr_check_and_clear(MSR_VEC);
- }
+
+ restore_math(regs);
+
regs->msr |= msr_diff;
}
@@ -1006,6 +1068,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
batch = this_cpu_ptr(&ppc64_tlb_batch);
batch->active = 1;
}
+
+ if (current_thread_info()->task->thread.regs)
+ restore_math(current_thread_info()->task->thread.regs);
+
#endif /* CONFIG_PPC_BOOK3S_64 */
return last;
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 162d0f7..038cff8 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -91,6 +91,10 @@ _GLOBAL(load_up_altivec)
oris r12,r12,MSR_VEC@h
std r12,_MSR(r1)
#endif
+ /* Don't care if r4 overflows, this is desired behaviour */
+ lbz r4,THREAD_LOAD_VEC(r5)
+ addi r4,r4,1
+ stb r4,THREAD_LOAD_VEC(r5)
addi r6,r5,THREAD_VRSTATE
li r4,1
li r10,VRSTATE_VSCR
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 6/9] powerpc: Prepare for splitting giveup_{fpu, altivec, vsx} in two
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
` (4 preceding siblings ...)
2016-02-15 5:07 ` [PATCH v4 5/9] powerpc: Restore FPU/VEC/VSX if previously used Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
2016-02-15 5:07 ` [PATCH v4 7/9] powerpc: Add the ability to save FPU without giving it up Cyril Bur
` (2 subsequent siblings)
8 siblings, 0 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
This prepares for the decoupling of saving {fpu,altivec,vsx} registers and
marking {fpu,altivec,vsx} as being unused by a thread.
Currently giveup_{fpu,altivec,vsx}() does both; however, optimisations to
task switching can be made if these two operations are decoupled.
save_all() will permit the saving of registers to thread structs and leave
the thread's MSR with the bits enabled.
This patch introduces no functional change.
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
arch/powerpc/include/asm/reg.h | 8 ++++++++
arch/powerpc/include/asm/switch_to.h | 7 +++++++
arch/powerpc/kernel/process.c | 31 ++++++++++++++++++++++++++++++-
3 files changed, 45 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c4cb2ff..d07b110 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -75,6 +75,14 @@
#define MSR_HV 0
#endif
+/*
+ * To be used in shared book E/book S, this avoids needing to worry about
+ * book S/book E in shared code
+ */
+#ifndef MSR_SPE
+#define MSR_SPE 0
+#endif
+
#define MSR_VEC __MASK(MSR_VEC_LG) /* Enable AltiVec */
#define MSR_VSX __MASK(MSR_VSX_LG) /* Enable VSX */
#define MSR_POW __MASK(MSR_POW_LG) /* Enable Power Management */
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 5b268b6..3690041 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -34,6 +34,7 @@ static inline void disable_kernel_fp(void)
msr_check_and_clear(MSR_FP);
}
#else
+static inline void __giveup_fpu(struct task_struct *t) { }
static inline void flush_fp_to_thread(struct task_struct *t) { }
#endif
@@ -46,6 +47,8 @@ static inline void disable_kernel_altivec(void)
{
msr_check_and_clear(MSR_VEC);
}
+#else
+static inline void __giveup_altivec(struct task_struct *t) { }
#endif
#ifdef CONFIG_VSX
@@ -57,6 +60,8 @@ static inline void disable_kernel_vsx(void)
{
msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
}
+#else
+static inline void __giveup_vsx(struct task_struct *t) { }
#endif
#ifdef CONFIG_SPE
@@ -68,6 +73,8 @@ static inline void disable_kernel_spe(void)
{
msr_check_and_clear(MSR_SPE);
}
+#else
+static inline void __giveup_spe(struct task_struct *t) { }
#endif
static inline void clear_task_ebb(struct task_struct *t)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 55c1eb0..29da07f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -444,12 +444,41 @@ void restore_math(struct pt_regs *regs)
regs->msr = msr;
}
+void save_all(struct task_struct *tsk)
+{
+ unsigned long usermsr;
+
+ if (!tsk->thread.regs)
+ return;
+
+ usermsr = tsk->thread.regs->msr;
+
+ if ((usermsr & msr_all_available) == 0)
+ return;
+
+ msr_check_and_set(msr_all_available);
+
+ if (usermsr & MSR_FP)
+ __giveup_fpu(tsk);
+
+ if (usermsr & MSR_VEC)
+ __giveup_altivec(tsk);
+
+ if (usermsr & MSR_VSX)
+ __giveup_vsx(tsk);
+
+ if (usermsr & MSR_SPE)
+ __giveup_spe(tsk);
+
+ msr_check_and_clear(msr_all_available);
+}
+
void flush_all_to_thread(struct task_struct *tsk)
{
if (tsk->thread.regs) {
preempt_disable();
BUG_ON(tsk != current);
- giveup_all(tsk);
+ save_all(tsk);
#ifdef CONFIG_SPE
if (tsk->thread.regs->msr & MSR_SPE)
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 7/9] powerpc: Add the ability to save FPU without giving it up
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
` (5 preceding siblings ...)
2016-02-15 5:07 ` [PATCH v4 6/9] powerpc: Prepare for splitting giveup_{fpu, altivec, vsx} in two Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
2016-02-15 5:07 ` [PATCH v4 8/9] powerpc: Add the ability to save Altivec " Cyril Bur
2016-02-15 5:07 ` [PATCH v4 9/9] powerpc: Add the ability to save VSX " Cyril Bur
8 siblings, 0 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
This patch adds the ability to save the FPU registers to the thread struct
without giving up the facility (disabling it) the next time the process
returns to userspace.
This patch optimises the thread copy path (as a result of a fork() or
clone()) so that the parent thread can return to userspace with hot
registers avoiding a possibly pointless reload of FPU register state.
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
arch/powerpc/include/asm/switch_to.h | 3 ++-
arch/powerpc/kernel/fpu.S | 21 ++++-----------------
arch/powerpc/kernel/process.c | 12 +++++++++++-
3 files changed, 17 insertions(+), 19 deletions(-)
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 3690041..6a201e8 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -28,13 +28,14 @@ extern void giveup_all(struct task_struct *);
extern void enable_kernel_fp(void);
extern void flush_fp_to_thread(struct task_struct *);
extern void giveup_fpu(struct task_struct *);
-extern void __giveup_fpu(struct task_struct *);
+extern void save_fpu(struct task_struct *);
static inline void disable_kernel_fp(void)
{
msr_check_and_clear(MSR_FP);
}
#else
static inline void __giveup_fpu(struct task_struct *t) { }
+static inline void save_fpu(struct task_struct *t) { }
static inline void flush_fp_to_thread(struct task_struct *t) { }
#endif
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index b063524..15da2b5 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -143,33 +143,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
blr
/*
- * __giveup_fpu(tsk)
- * Disable FP for the task given as the argument,
- * and save the floating-point registers in its thread_struct.
+ * save_fpu(tsk)
+ * Save the floating-point registers in its thread_struct.
* Enables the FPU for use in the kernel on return.
*/
-_GLOBAL(__giveup_fpu)
+_GLOBAL(save_fpu)
addi r3,r3,THREAD /* want THREAD of task */
PPC_LL r6,THREAD_FPSAVEAREA(r3)
PPC_LL r5,PT_REGS(r3)
PPC_LCMPI 0,r6,0
bne 2f
addi r6,r3,THREAD_FPSTATE
-2: PPC_LCMPI 0,r5,0
- SAVE_32FPVSRS(0, R4, R6)
+2: SAVE_32FPVSRS(0, R4, R6)
mffs fr0
stfd fr0,FPSTATE_FPSCR(r6)
- beq 1f
- PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5)
- li r3,MSR_FP|MSR_FE0|MSR_FE1
-#ifdef CONFIG_VSX
-BEGIN_FTR_SECTION
- oris r3,r3,MSR_VSX@h
-END_FTR_SECTION_IFSET(CPU_FTR_VSX)
-#endif
- andc r4,r4,r3 /* disable FP for previous task */
- PPC_STL r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
blr
/*
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 29da07f..a7e5061 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -133,6 +133,16 @@ void __msr_check_and_clear(unsigned long bits)
EXPORT_SYMBOL(__msr_check_and_clear);
#ifdef CONFIG_PPC_FPU
+void __giveup_fpu(struct task_struct *tsk)
+{
+ save_fpu(tsk);
+ tsk->thread.regs->msr &= ~MSR_FP;
+#ifdef CONFIG_VSX
+ if (cpu_has_feature(CPU_FTR_VSX))
+ tsk->thread.regs->msr &= ~MSR_VSX;
+#endif
+}
+
void giveup_fpu(struct task_struct *tsk)
{
check_if_tm_restore_required(tsk);
@@ -459,7 +469,7 @@ void save_all(struct task_struct *tsk)
msr_check_and_set(msr_all_available);
if (usermsr & MSR_FP)
- __giveup_fpu(tsk);
+ save_fpu(tsk);
if (usermsr & MSR_VEC)
__giveup_altivec(tsk);
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 8/9] powerpc: Add the ability to save Altivec without giving it up
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
` (6 preceding siblings ...)
2016-02-15 5:07 ` [PATCH v4 7/9] powerpc: Add the ability to save FPU without giving it up Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
2016-02-15 5:07 ` [PATCH v4 9/9] powerpc: Add the ability to save VSX " Cyril Bur
8 siblings, 0 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
This patch adds the ability to save the VEC registers to the thread struct
without giving up the facility (disabling it) the next time the process
returns to userspace.
This patch builds on a previous optimisation for the FPU registers in the
thread copy path to avoid a possibly pointless reload of VEC state.
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
arch/powerpc/include/asm/switch_to.h | 3 ++-
arch/powerpc/kernel/process.c | 12 +++++++++++-
arch/powerpc/kernel/vector.S | 24 ++++--------------------
3 files changed, 17 insertions(+), 22 deletions(-)
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 6a201e8..9028822 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -43,12 +43,13 @@ static inline void flush_fp_to_thread(struct task_struct *t) { }
extern void enable_kernel_altivec(void);
extern void flush_altivec_to_thread(struct task_struct *);
extern void giveup_altivec(struct task_struct *);
-extern void __giveup_altivec(struct task_struct *);
+extern void save_altivec(struct task_struct *);
static inline void disable_kernel_altivec(void)
{
msr_check_and_clear(MSR_VEC);
}
#else
+static inline void save_altivec(struct task_struct *t) { }
static inline void __giveup_altivec(struct task_struct *t) { }
#endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a7e5061..14c09d2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -213,6 +213,16 @@ static int restore_fp(struct task_struct *tsk) { return 0; }
#ifdef CONFIG_ALTIVEC
#define loadvec(thr) ((thr).load_vec)
+static void __giveup_altivec(struct task_struct *tsk)
+{
+ save_altivec(tsk);
+ tsk->thread.regs->msr &= ~MSR_VEC;
+#ifdef CONFIG_VSX
+ if (cpu_has_feature(CPU_FTR_VSX))
+ tsk->thread.regs->msr &= ~MSR_VSX;
+#endif
+}
+
void giveup_altivec(struct task_struct *tsk)
{
check_if_tm_restore_required(tsk);
@@ -472,7 +482,7 @@ void save_all(struct task_struct *tsk)
save_fpu(tsk);
if (usermsr & MSR_VEC)
- __giveup_altivec(tsk);
+ save_altivec(tsk);
if (usermsr & MSR_VSX)
__giveup_vsx(tsk);
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 038cff8..51b0c17 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -106,36 +106,20 @@ _GLOBAL(load_up_altivec)
blr
/*
- * __giveup_altivec(tsk)
- * Disable VMX for the task given as the argument,
- * and save the vector registers in its thread_struct.
+ * save_altivec(tsk)
+ * Save the vector registers to its thread_struct
*/
-_GLOBAL(__giveup_altivec)
+_GLOBAL(save_altivec)
addi r3,r3,THREAD /* want THREAD of task */
PPC_LL r7,THREAD_VRSAVEAREA(r3)
PPC_LL r5,PT_REGS(r3)
PPC_LCMPI 0,r7,0
bne 2f
addi r7,r3,THREAD_VRSTATE
-2: PPC_LCMPI 0,r5,0
- SAVE_32VRS(0,r4,r7)
+2: SAVE_32VRS(0,r4,r7)
mfvscr v0
li r4,VRSTATE_VSCR
stvx v0,r4,r7
- beq 1f
- PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-#ifdef CONFIG_VSX
-BEGIN_FTR_SECTION
- lis r3,(MSR_VEC|MSR_VSX)@h
-FTR_SECTION_ELSE
- lis r3,MSR_VEC@h
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
-#else
- lis r3,MSR_VEC@h
-#endif
- andc r4,r4,r3 /* disable FP for previous task */
- PPC_STL r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
blr
#ifdef CONFIG_VSX
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 9/9] powerpc: Add the ability to save VSX without giving it up
2016-02-15 5:07 [PATCH v4 0/9] FP/VEC/VSX switching optimisations Cyril Bur
` (7 preceding siblings ...)
2016-02-15 5:07 ` [PATCH v4 8/9] powerpc: Add the ability to save Altivec " Cyril Bur
@ 2016-02-15 5:07 ` Cyril Bur
8 siblings, 0 replies; 14+ messages in thread
From: Cyril Bur @ 2016-02-15 5:07 UTC (permalink / raw)
To: linuxppc-dev; +Cc: mikey, anton
This patch adds the ability to save the VSX registers to the thread struct
without giving up the facility (disabling it) the next time the process
returns to userspace.
This patch builds on a previous optimisation for the FPU and VEC registers
in the thread copy path to avoid a possibly pointless reload of VSX state.
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
---
arch/powerpc/include/asm/switch_to.h | 4 ----
arch/powerpc/kernel/ppc_ksyms.c | 4 ----
arch/powerpc/kernel/process.c | 42 +++++++++++++++++++++++++-----------
arch/powerpc/kernel/vector.S | 17 ---------------
4 files changed, 30 insertions(+), 37 deletions(-)
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 9028822..17c8380 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -56,14 +56,10 @@ static inline void __giveup_altivec(struct task_struct *t) { }
#ifdef CONFIG_VSX
extern void enable_kernel_vsx(void);
extern void flush_vsx_to_thread(struct task_struct *);
-extern void giveup_vsx(struct task_struct *);
-extern void __giveup_vsx(struct task_struct *);
static inline void disable_kernel_vsx(void)
{
msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
}
-#else
-static inline void __giveup_vsx(struct task_struct *t) { }
#endif
#ifdef CONFIG_SPE
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 41e1607..ef7024da 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -28,10 +28,6 @@ EXPORT_SYMBOL(load_vr_state);
EXPORT_SYMBOL(store_vr_state);
#endif
-#ifdef CONFIG_VSX
-EXPORT_SYMBOL_GPL(__giveup_vsx);
-#endif
-
#ifdef CONFIG_EPAPR_PARAVIRT
EXPORT_SYMBOL(epapr_hypercall_start);
#endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 14c09d2..d7a9df5 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -280,19 +280,31 @@ static inline int restore_altivec(struct task_struct *tsk) { return 0; }
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
-void giveup_vsx(struct task_struct *tsk)
+static void __giveup_vsx(struct task_struct *tsk)
{
- check_if_tm_restore_required(tsk);
-
- msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
if (tsk->thread.regs->msr & MSR_FP)
__giveup_fpu(tsk);
if (tsk->thread.regs->msr & MSR_VEC)
__giveup_altivec(tsk);
+ tsk->thread.regs->msr &= ~MSR_VSX;
+}
+
+static void giveup_vsx(struct task_struct *tsk)
+{
+ check_if_tm_restore_required(tsk);
+
+ msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
__giveup_vsx(tsk);
msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
}
-EXPORT_SYMBOL(giveup_vsx);
+
+static void save_vsx(struct task_struct *tsk)
+{
+ if (tsk->thread.regs->msr & MSR_FP)
+ save_fpu(tsk);
+ if (tsk->thread.regs->msr & MSR_VEC)
+ save_altivec(tsk);
+}
void enable_kernel_vsx(void)
{
@@ -335,6 +347,7 @@ static int restore_vsx(struct task_struct *tsk)
}
#else
static inline int restore_vsx(struct task_struct *tsk) { return 0; }
+static inline void save_vsx(struct task_struct *tsk) { }
#endif /* CONFIG_VSX */
#ifdef CONFIG_SPE
@@ -478,14 +491,19 @@ void save_all(struct task_struct *tsk)
msr_check_and_set(msr_all_available);
- if (usermsr & MSR_FP)
- save_fpu(tsk);
-
- if (usermsr & MSR_VEC)
- save_altivec(tsk);
+ /*
+ * Given the way the register space is laid out in hardware, save_vsx()
+ * boils down to a save_fpu() and a save_altivec()
+ */
+ if (usermsr & MSR_VSX) {
+ save_vsx(tsk);
+ } else {
+ if (usermsr & MSR_FP)
+ save_fpu(tsk);
- if (usermsr & MSR_VSX)
- __giveup_vsx(tsk);
+ if (usermsr & MSR_VEC)
+ save_altivec(tsk);
+ }
if (usermsr & MSR_SPE)
__giveup_spe(tsk);
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 51b0c17..1c2e7a3 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -151,23 +151,6 @@ _GLOBAL(load_up_vsx)
std r12,_MSR(r1)
b fast_exception_return
-/*
- * __giveup_vsx(tsk)
- * Disable VSX for the task given as the argument.
- * Does NOT save vsx registers.
- */
-_GLOBAL(__giveup_vsx)
- addi r3,r3,THREAD /* want THREAD of task */
- ld r5,PT_REGS(r3)
- cmpdi 0,r5,0
- beq 1f
- ld r4,_MSR-STACK_FRAME_OVERHEAD(r5)
- lis r3,MSR_VSX@h
- andc r4,r4,r3 /* disable VSX for previous task */
- std r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
- blr
-
#endif /* CONFIG_VSX */
--
2.7.1
^ permalink raw reply related [flat|nested] 14+ messages in thread