LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] KVM: PPC: remove mmio_vsx_tx_sx_enabled in PR KVM MMIO emulation
From: wei.guo.simon @ 2018-05-24  9:01 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Paul Mackerras, kvm, linuxppc-dev, Simon Guo

From: Simon Guo <wei.guo.simon@gmail.com>

Originally PR KVM MMIO emulation uses only 0~31#(5 bits) for VSR
reg number, and use mmio_vsx_tx_sx_enabled field together for
0~63# VSR regs.

Currently PR KVM MMIO emulation is reimplemented with analyse_instr()
assistence. analyse_instr() returns 0~63 for VSR register number, so
it is not necessary to use additional mmio_vsx_tx_sx_enabled field
any more.

This patch extends related reg bits(expand io_gpr to u16 from u8
and use 6 bits for VSR reg#), so that mmio_vsx_tx_sx_enabled can
be removed.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/kvm_host.h  | 17 ++++++++---------
 arch/powerpc/kvm/emulate_loadstore.c |  7 +++----
 arch/powerpc/kvm/powerpc.c           | 30 +++++++++++++++---------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 8dc5e43..bd220a3 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -673,7 +673,7 @@ struct kvm_vcpu_arch {
 	gva_t vaddr_accessed;
 	pgd_t *pgdir;
 
-	u8 io_gpr; /* GPR used as IO source/target */
+	u16 io_gpr; /* GPR used as IO source/target */
 	u8 mmio_host_swabbed;
 	u8 mmio_sign_extend;
 	/* conversion between single and double precision */
@@ -689,7 +689,6 @@ struct kvm_vcpu_arch {
 	 */
 	u8 mmio_vsx_copy_nums;
 	u8 mmio_vsx_offset;
-	u8 mmio_vsx_tx_sx_enabled;
 	u8 mmio_vmx_copy_nums;
 	u8 mmio_vmx_offset;
 	u8 mmio_copy_type;
@@ -802,14 +801,14 @@ struct kvm_vcpu_arch {
 #define KVMPPC_VCPU_BUSY_IN_HOST	2
 
 /* Values for vcpu->arch.io_gpr */
-#define KVM_MMIO_REG_MASK	0x001f
-#define KVM_MMIO_REG_EXT_MASK	0xffe0
+#define KVM_MMIO_REG_MASK	0x003f
+#define KVM_MMIO_REG_EXT_MASK	0xffc0
 #define KVM_MMIO_REG_GPR	0x0000
-#define KVM_MMIO_REG_FPR	0x0020
-#define KVM_MMIO_REG_QPR	0x0040
-#define KVM_MMIO_REG_FQPR	0x0060
-#define KVM_MMIO_REG_VSX	0x0080
-#define KVM_MMIO_REG_VMX	0x00c0
+#define KVM_MMIO_REG_FPR	0x0040
+#define KVM_MMIO_REG_QPR	0x0080
+#define KVM_MMIO_REG_FQPR	0x00c0
+#define KVM_MMIO_REG_VSX	0x0100
+#define KVM_MMIO_REG_VMX	0x0180
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
index dca7f1c..64b325b 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -106,7 +106,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 	 * if mmio_vsx_tx_sx_enabled == 1, copy data between
 	 * VSR[32..63] and memory
 	 */
-	vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst);
 	vcpu->arch.mmio_vsx_copy_nums = 0;
 	vcpu->arch.mmio_vsx_offset = 0;
 	vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
@@ -242,8 +241,8 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			}
 
 			emulated = kvmppc_handle_vsx_load(run, vcpu,
-					KVM_MMIO_REG_VSX | (op.reg & 0x1f),
-					io_size_each, 1, op.type & SIGNEXT);
+					KVM_MMIO_REG_VSX|op.reg, io_size_each,
+					1, op.type & SIGNEXT);
 			break;
 		}
 #endif
@@ -363,7 +362,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			}
 
 			emulated = kvmppc_handle_vsx_store(run, vcpu,
-					op.reg & 0x1f, io_size_each, 1);
+					op.reg, io_size_each, 1);
 			break;
 		}
 #endif
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 05eccdc..dcc7982 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -881,10 +881,10 @@ static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu,
 	if (offset == -1)
 		return;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsxval[offset] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		VCPU_VSX_FPR(vcpu, index, offset) = gpr;
 	}
@@ -896,11 +896,11 @@ static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu,
 	union kvmppc_one_reg val;
 	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsxval[0] = gpr;
 		val.vsxval[1] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		VCPU_VSX_FPR(vcpu, index, 0) = gpr;
 		VCPU_VSX_FPR(vcpu, index, 1) = gpr;
@@ -913,12 +913,12 @@ static inline void kvmppc_set_vsr_word_dump(struct kvm_vcpu *vcpu,
 	union kvmppc_one_reg val;
 	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
+	if (index >= 32) {
 		val.vsx32val[0] = gpr;
 		val.vsx32val[1] = gpr;
 		val.vsx32val[2] = gpr;
 		val.vsx32val[3] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		val.vsx32val[0] = gpr;
 		val.vsx32val[1] = gpr;
@@ -938,10 +938,10 @@ static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu,
 	if (offset == -1)
 		return;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsx32val[offset] = gpr32;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		dword_offset = offset / 2;
 		word_offset = offset % 2;
@@ -1362,10 +1362,10 @@ static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
 			break;
 		}
 
-		if (!vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		if (rs < 32) {
 			*val = VCPU_VSX_FPR(vcpu, rs, vsx_offset);
 		} else {
-			reg.vval = VCPU_VSX_VR(vcpu, rs);
+			reg.vval = VCPU_VSX_VR(vcpu, rs - 32);
 			*val = reg.vsxval[vsx_offset];
 		}
 		break;
@@ -1379,13 +1379,13 @@ static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
 			break;
 		}
 
-		if (!vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		if (rs < 32) {
 			dword_offset = vsx_offset / 2;
 			word_offset = vsx_offset % 2;
 			reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset);
 			*val = reg.vsx32val[word_offset];
 		} else {
-			reg.vval = VCPU_VSX_VR(vcpu, rs);
+			reg.vval = VCPU_VSX_VR(vcpu, rs - 32);
 			*val = reg.vsx32val[vsx_offset];
 		}
 		break;
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH bpf-next v4 02/10] bpf: powerpc64: pad function address loads with NOPs
From: Daniel Borkmann @ 2018-05-24  9:25 UTC (permalink / raw)
  To: Sandipan Das; +Cc: ast, netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <8826a2e1-71c5-06fc-7a66-3c33c1a54c78@linux.vnet.ibm.com>

On 05/24/2018 10:25 AM, Sandipan Das wrote:
> On 05/24/2018 01:04 PM, Daniel Borkmann wrote:
>> On 05/24/2018 08:56 AM, Sandipan Das wrote:
>>> For multi-function programs, loading the address of a callee
>>> function to a register requires emitting instructions whose
>>> count varies from one to five depending on the nature of the
>>> address.
>>>
>>> Since we come to know of the callee's address only before the
>>> extra pass, the number of instructions required to load this
>>> address may vary from what was previously generated. This can
>>> make the JITed image grow or shrink.
>>>
>>> To avoid this, we should generate a constant five-instruction
>>> when loading function addresses by padding the optimized load
>>> sequence with NOPs.
>>>
>>> Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
>>> ---
>>>  arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
>>>  1 file changed, 23 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
>>> index 1bdb1aff0619..e4582744a31d 100644
>>> --- a/arch/powerpc/net/bpf_jit_comp64.c
>>> +++ b/arch/powerpc/net/bpf_jit_comp64.c
>>> @@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
>>>  
>>>  static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
>>>  {
>>> +	unsigned int i, ctx_idx = ctx->idx;
>>> +
>>> +	/* Load function address into r12 */
>>> +	PPC_LI64(12, func);
>>> +
>>> +	/* For bpf-to-bpf function calls, the callee's address is unknown
>>> +	 * until the last extra pass. As seen above, we use PPC_LI64() to
>>> +	 * load the callee's address, but this may optimize the number of
>>> +	 * instructions required based on the nature of the address.
>>> +	 *
>>> +	 * Since we don't want the number of instructions emitted to change,
>>> +	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
>>> +	 * we always have a five-instruction sequence, which is the maximum
>>> +	 * that PPC_LI64() can emit.
>>> +	 */
>>> +	for (i = ctx->idx - ctx_idx; i < 5; i++)
>>> +		PPC_NOP();
>>
>> By the way, I think you can still optimize this. The nops are not really
>> needed in case of insn->src_reg != BPF_PSEUDO_CALL since the address of
>> a normal BPF helper call will always be at a fixed location and known a
>> priori.
> 
> Ah, true. Thanks for pointing this out. There are a few other things that
> we are planning to do for the ppc64 JIT compiler. Will put out a patch for
> this with that series.

Awesome, thanks Sandipan!

^ permalink raw reply

* [PATCH 4.4 25/92] futex: Remove duplicated code and fix undefined behaviour
From: Greg Kroah-Hartman @ 2018-05-24  9:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: Greg Kroah-Hartman, stable, Jiri Slaby, Thomas Gleixner,
	Russell King, Darren Hart (VMware), linux-mips, Rich Felker,
	linux-ia64, linux-sh, peterz, Benjamin Herrenschmidt,
	Max Filippov, Paul Mackerras, sparclinux, Jonas Bonn, linux-s390,
	linux-arch, Yoshinori Sato, linux-hexagon, Helge Deller,
	James E.J. Bottomley, Catalin Marinas, Matt Turner,
	linux-snps-arc, Fenghua Yu, Arnd Bergmann, linux-xtensa,
	Stefan Kristiansson, openrisc, Ivan Kokshaysky, Stafford Horne,
	linux-arm-kernel, Richard Henderson, Chris Zankel, Michal Simek,
	Tony Luck, linux-parisc, Vineet Gupta, Ralf Baechle, Richard Kuo,
	linux-alpha, Martin Schwidefsky, linuxppc-dev, David S. Miller,
	Ben Hutchings
In-Reply-To: <20180524093159.286472249@linuxfoundation.org>

4.4-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Jiri Slaby <jslaby@suse.cz>

commit 30d6e0a4190d37740e9447e4e4815f06992dd8c3 upstream.

There is code duplicated over all architecture's headers for
futex_atomic_op_inuser. Namely op decoding, access_ok check for uaddr,
and comparison of the result.

Remove this duplication and leave up to the arches only the needed
assembly which is now in arch_futex_atomic_op_inuser.

This effectively distributes the Will Deacon's arm64 fix for undefined
behaviour reported by UBSAN to all architectures. The fix was done in
commit 5f16a046f8e1 (arm64: futex: Fix undefined behaviour with
FUTEX_OP_OPARG_SHIFT usage). Look there for an example dump.

And as suggested by Thomas, check for negative oparg too, because it was
also reported to cause undefined behaviour report.

Note that s390 removed access_ok check in d12a29703 ("s390/uaccess:
remove pointless access_ok() checks") as access_ok there returns true.
We introduce it back to the helper for the sake of simplicity (it gets
optimized away anyway).

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Russell King <rmk+kernel@armlinux.org.uk>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com> [s390]
Acked-by: Chris Metcalf <cmetcalf@mellanox.com> [for tile]
Reviewed-by: Darren Hart (VMware) <dvhart@infradead.org>
Reviewed-by: Will Deacon <will.deacon@arm.com> [core/arm64]
Cc: linux-mips@linux-mips.org
Cc: Rich Felker <dalias@libc.org>
Cc: linux-ia64@vger.kernel.org
Cc: linux-sh@vger.kernel.org
Cc: peterz@infradead.org
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: sparclinux@vger.kernel.org
Cc: Jonas Bonn <jonas@southpole.se>
Cc: linux-s390@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: linux-hexagon@vger.kernel.org
Cc: Helge Deller <deller@gmx.de>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: linux-snps-arc@lists.infradead.org
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-xtensa@linux-xtensa.org
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: openrisc@lists.librecores.org
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Stafford Horne <shorne@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Richard Henderson <rth@twiddle.net>
Cc: Chris Zankel <chris@zankel.net>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-parisc@vger.kernel.org
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: linux-alpha@vger.kernel.org
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: "David S. Miller" <davem@davemloft.net>
Link: http://lkml.kernel.org/r/20170824073105.3901-1-jslaby@suse.cz
Cc: Ben Hutchings <ben.hutchings@codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/alpha/include/asm/futex.h      |   26 +++---------------
 arch/arc/include/asm/futex.h        |   40 +++-------------------------
 arch/arm/include/asm/futex.h        |   26 ++----------------
 arch/arm64/include/asm/futex.h      |   26 ++----------------
 arch/frv/include/asm/futex.h        |    3 +-
 arch/frv/kernel/futex.c             |   27 ++-----------------
 arch/hexagon/include/asm/futex.h    |   38 ++-------------------------
 arch/ia64/include/asm/futex.h       |   25 ++----------------
 arch/microblaze/include/asm/futex.h |   38 ++-------------------------
 arch/mips/include/asm/futex.h       |   25 ++----------------
 arch/parisc/include/asm/futex.h     |   25 ++----------------
 arch/powerpc/include/asm/futex.h    |   26 +++---------------
 arch/s390/include/asm/futex.h       |   23 +++-------------
 arch/sh/include/asm/futex.h         |   26 ++----------------
 arch/sparc/include/asm/futex_64.h   |   26 +++---------------
 arch/tile/include/asm/futex.h       |   40 +++-------------------------
 arch/x86/include/asm/futex.h        |   40 +++-------------------------
 arch/xtensa/include/asm/futex.h     |   27 +++----------------
 include/asm-generic/futex.h         |   50 ++++++------------------------------
 kernel/futex.c                      |   39 ++++++++++++++++++++++++++++
 20 files changed, 126 insertions(+), 470 deletions(-)

--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -29,18 +29,10 @@
 	:	"r" (uaddr), "r"(oparg)				\
 	:	"memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -73,20 +73,11 @@
 
 #endif
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
-		return -EFAULT;
-
 #ifndef CONFIG_ARC_HAS_LLSC
 	preempt_disable();	/* to guarantee atomic r-m-w of futex op */
 #endif
@@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser
 	preempt_enable();
 #endif
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
 #endif /* !SMP */
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 #ifndef CONFIG_SMP
 	preempt_disable();
 #endif
@@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op,
 	preempt_enable();
 #endif
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -53,20 +53,10 @@
 	: "memory")
 
 static inline int
-futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (int)(encoded_op << 8) >> 20;
-	int cmparg = (int)(encoded_op << 20) >> 20;
 	int oldval = 0, ret, tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1U << (oparg & 0x1f);
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -96,17 +86,9 @@ futex_atomic_op_inuser(unsigned int enco
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/frv/include/asm/futex.h
+++ b/arch/frv/include/asm/futex.h
@@ -7,7 +7,8 @@
 #include <asm/errno.h>
 #include <asm/uaccess.h>
 
-extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr);
+extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr);
 
 static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_x
 /*
  * do the futex operations
  */
-int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_o
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS; break;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
 
 	return ret;
 
-} /* end futex_atomic_op_inuser() */
+} /* end arch_futex_atomic_op_inuser() */
--- a/arch/hexagon/include/asm/futex.h
+++ b/arch/hexagon/include/asm/futex.h
@@ -31,18 +31,9 @@
 
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, i
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -45,18 +45,9 @@ do {									\
 } while (0)
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op,
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -29,18 +29,9 @@
 })
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -83,18 +83,9 @@
 }
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -32,20 +32,11 @@ _futex_spin_unlock_irqrestore(u32 __user
 }
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
 	unsigned long int flags;
 	u32 val;
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -98,17 +89,9 @@ futex_atomic_op_inuser (int encoded_op,
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -31,18 +31,10 @@
 	: "b" (uaddr), "i" (-EFAULT), "r" (oparg) \
 	: "cr0", "memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -68,17 +60,9 @@ static inline int futex_atomic_op_inuser
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -21,17 +21,12 @@
 		: "0" (-EFAULT), "d" (oparg), "a" (uaddr),		\
 		  "m" (*uaddr) : "cc");
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, newval, ret;
 
 	load_kernel_asce();
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
 
 	pagefault_disable();
 	switch (op) {
@@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser
 	}
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -10,20 +10,11 @@
 /* XXX: UP variants, fix for SH-4A and SMP.. */
 #include <asm/futex-irq.h>
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -49,17 +40,8 @@ static inline int futex_atomic_op_inuser
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
 
 	return ret;
 }
--- a/arch/sparc/include/asm/futex_64.h
+++ b/arch/sparc/include/asm/futex_64.h
@@ -29,22 +29,14 @@
 	: "r" (uaddr), "r" (oparg), "i" (-EFAULT)	\
 	: "memory")
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tem;
 
-	if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
-		return -EFAULT;
 	if (unlikely((((unsigned long) uaddr) & 0x3UL)))
 		return -EINVAL;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -106,12 +106,9 @@
 	lock = __atomic_hashed_lock((int __force *)uaddr)
 #endif
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int uninitialized_var(val), ret;
 
 	__futex_prolog();
@@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser
 	/* The 32-bit futex code makes this assumption, so validate it here. */
 	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser
 	}
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (val == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (val != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (val < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (val >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (val <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (val > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = val;
+
 	return ret;
 }
 
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -41,20 +41,11 @@
 		       "+m" (*uaddr), "=&r" (tem)		\
 		     : "r" (oparg), "i" (-EFAULT), "1" (0))
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tem;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/arch/xtensa/include/asm/futex.h
+++ b/arch/xtensa/include/asm/futex.h
@@ -44,18 +44,10 @@
 	: "r" (uaddr), "I" (-EFAULT), "r" (oparg)	\
 	: "memory")
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 #if !XCHAL_HAVE_S32C1I
 	return -ENOSYS;
@@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser
 
 	pagefault_enable();
 
-	if (ret)
-		return ret;
-
-	switch (cmp) {
-	case FUTEX_OP_CMP_EQ: return (oldval == cmparg);
-	case FUTEX_OP_CMP_NE: return (oldval != cmparg);
-	case FUTEX_OP_CMP_LT: return (oldval < cmparg);
-	case FUTEX_OP_CMP_GE: return (oldval >= cmparg);
-	case FUTEX_OP_CMP_LE: return (oldval <= cmparg);
-	case FUTEX_OP_CMP_GT: return (oldval > cmparg);
-	}
+	if (!ret)
+		*oval = oldval;
 
-	return -ENOSYS;
+	return ret;
 }
 
 static inline int
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -13,7 +13,7 @@
  */
 
 /**
- * futex_atomic_op_inuser() - Atomic arithmetic operation with constant
+ * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant
  *			  argument and comparison of the previous
  *			  futex value with another constant.
  *
@@ -25,18 +25,11 @@
  * <0 - On error
  */
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval, ret;
 	u32 tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
 	preempt_disable();
 	pagefault_disable();
 
@@ -74,17 +67,9 @@ out_pagefault_enable:
 	pagefault_enable();
 	preempt_enable();
 
-	if (ret == 0) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (ret == 0)
+		*oval = oldval;
+
 	return ret;
 }
 
@@ -126,18 +111,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
 
 #else
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -153,17 +129,9 @@ futex_atomic_op_inuser (int encoded_op,
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1453,6 +1453,45 @@ out:
 	return ret;
 }
 
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+	unsigned int op =	  (encoded_op & 0x70000000) >> 28;
+	unsigned int cmp =	  (encoded_op & 0x0f000000) >> 24;
+	int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
+	int cmparg = sign_extend32(encoded_op & 0x00000fff, 12);
+	int oldval, ret;
+
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+		if (oparg < 0 || oparg > 31)
+			return -EINVAL;
+		oparg = 1 << oparg;
+	}
+
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+		return -EFAULT;
+
+	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+	if (ret)
+		return ret;
+
+	switch (cmp) {
+	case FUTEX_OP_CMP_EQ:
+		return oldval == cmparg;
+	case FUTEX_OP_CMP_NE:
+		return oldval != cmparg;
+	case FUTEX_OP_CMP_LT:
+		return oldval < cmparg;
+	case FUTEX_OP_CMP_GE:
+		return oldval >= cmparg;
+	case FUTEX_OP_CMP_LE:
+		return oldval <= cmparg;
+	case FUTEX_OP_CMP_GT:
+		return oldval > cmparg;
+	default:
+		return -ENOSYS;
+	}
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:

^ permalink raw reply

* Re: [PATCH v3] powerpc: Implement csum_ipv6_magic in assembly
From: Christophe Leroy @ 2018-05-24 10:18 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: linux-kernel, Paul Mackerras, netdev, linuxppc-dev
In-Reply-To: <3848a4ad-2c0e-691f-e98f-347cfe3484e8@c-s.fr>



On 05/24/2018 06:20 AM, Christophe LEROY wrote:
> 
> 
> Le 23/05/2018 à 20:34, Segher Boessenkool a écrit :
>> On Tue, May 22, 2018 at 08:57:01AM +0200, Christophe Leroy wrote:
>>> The generic csum_ipv6_magic() generates a pretty bad result
>>
>> <snip>
>>
>> Please try with a more recent compiler, what you used is pretty ancient.
>> It's not like recent compilers do great on this either, but it's not
>> *that* bad anymore ;-)


Here is what I get with GCC 8.1
It doesn't look much better, does it ?


net/ipv6/ip6_checksum.o:     file format elf32-powerpc


Disassembly of section .text:

00000000 <csum_ipv6_magic>:
    0:	94 21 ff f0 	stwu    r1,-16(r1)
    4:	80 04 00 00 	lwz     r0,0(r4)
    8:	81 64 00 04 	lwz     r11,4(r4)
    c:	81 04 00 08 	lwz     r8,8(r4)
   10:	93 e1 00 0c 	stw     r31,12(r1)
   14:	81 43 00 00 	lwz     r10,0(r3)
   18:	83 e3 00 04 	lwz     r31,4(r3)
   1c:	81 23 00 08 	lwz     r9,8(r3)
   20:	81 83 00 0c 	lwz     r12,12(r3)
   24:	7c ea 3a 14 	add     r7,r10,r7
   28:	7d 4a 38 10 	subfc   r10,r10,r7
   2c:	7c ff 3a 14 	add     r7,r31,r7
   30:	81 44 00 0c 	lwz     r10,12(r4)
   34:	7c 63 19 10 	subfe   r3,r3,r3
   38:	7c 63 38 50 	subf    r3,r3,r7
   3c:	7f ff 18 10 	subfc   r31,r31,r3
   40:	7c e9 1a 14 	add     r7,r9,r3
   44:	83 e1 00 0c 	lwz     r31,12(r1)
   48:	7c 63 19 10 	subfe   r3,r3,r3
   4c:	38 21 00 10 	addi    r1,r1,16
   50:	7c 63 38 50 	subf    r3,r3,r7
   54:	7d 29 18 10 	subfc   r9,r9,r3
   58:	7d 2c 1a 14 	add     r9,r12,r3
   5c:	7c 63 19 10 	subfe   r3,r3,r3
   60:	7c 63 48 50 	subf    r3,r3,r9
   64:	7d 8c 18 10 	subfc   r12,r12,r3
   68:	7d 20 1a 14 	add     r9,r0,r3
   6c:	7c 63 19 10 	subfe   r3,r3,r3
   70:	7c 63 48 50 	subf    r3,r3,r9
   74:	7c 00 18 10 	subfc   r0,r0,r3
   78:	7d 2b 1a 14 	add     r9,r11,r3
   7c:	7c 63 19 10 	subfe   r3,r3,r3
   80:	7c 63 48 50 	subf    r3,r3,r9
   84:	7d 6b 18 10 	subfc   r11,r11,r3
   88:	7d 28 1a 14 	add     r9,r8,r3
   8c:	7c 63 19 10 	subfe   r3,r3,r3
   90:	7c 63 48 50 	subf    r3,r3,r9
   94:	7d 08 18 10 	subfc   r8,r8,r3
   98:	7d 2a 1a 14 	add     r9,r10,r3
   9c:	7c 63 19 10 	subfe   r3,r3,r3
   a0:	7c 63 48 50 	subf    r3,r3,r9
   a4:	7d 4a 18 10 	subfc   r10,r10,r3
   a8:	7d 23 2a 14 	add     r9,r3,r5
   ac:	7c 63 19 10 	subfe   r3,r3,r3
   b0:	7c 63 48 50 	subf    r3,r3,r9
   b4:	7c a5 18 10 	subfc   r5,r5,r3
   b8:	7c 63 32 14 	add     r3,r3,r6
   bc:	7d 29 49 10 	subfe   r9,r9,r9
   c0:	7d 29 18 50 	subf    r9,r9,r3
   c4:	7c c6 48 10 	subfc   r6,r6,r9
   c8:	7c 63 19 10 	subfe   r3,r3,r3
   cc:	7c 63 48 50 	subf    r3,r3,r9
   d0:	54 69 80 3e 	rotlwi  r9,r3,16
   d4:	7c 63 4a 14 	add     r3,r3,r9
   d8:	7c 63 18 f8 	not     r3,r3
   dc:	54 63 84 3e 	rlwinm  r3,r3,16,16,31
   e0:	4e 80 00 20 	blr

net/ipv6/ip6_checksum.o:     file format elf64-powerpc


Disassembly of section .text:

0000000000000000 <.csum_ipv6_magic>:
    0:	fb e1 ff f8 	std     r31,-8(r1)
    4:	81 43 00 00 	lwz     r10,0(r3)
    8:	81 83 00 04 	lwz     r12,4(r3)
    c:	81 23 00 08 	lwz     r9,8(r3)
   10:	80 03 00 0c 	lwz     r0,12(r3)
   14:	7c e7 52 14 	add     r7,r7,r10
   18:	80 64 00 08 	lwz     r3,8(r4)
   1c:	81 04 00 00 	lwz     r8,0(r4)
   20:	78 ff 00 20 	clrldi  r31,r7,32
   24:	7c ec 3a 14 	add     r7,r12,r7
   28:	81 64 00 04 	lwz     r11,4(r4)
   2c:	7f ea f8 50 	subf    r31,r10,r31
   30:	81 44 00 0c 	lwz     r10,12(r4)
   34:	7b ff 0f e0 	rldicl  r31,r31,1,63
   38:	7c ff 3a 14 	add     r7,r31,r7
   3c:	eb e1 ff f8 	ld      r31,-8(r1)
   40:	78 e4 00 20 	clrldi  r4,r7,32
   44:	7c e9 3a 14 	add     r7,r9,r7
   48:	7d 8c 20 50 	subf    r12,r12,r4
   4c:	79 8c 0f e0 	rldicl  r12,r12,1,63
   50:	7d 8c 3a 14 	add     r12,r12,r7
   54:	79 87 00 20 	clrldi  r7,r12,32
   58:	7d 80 62 14 	add     r12,r0,r12
   5c:	7d 29 38 50 	subf    r9,r9,r7
   60:	79 29 0f e0 	rldicl  r9,r9,1,63
   64:	7d 29 62 14 	add     r9,r9,r12
   68:	79 27 00 20 	clrldi  r7,r9,32
   6c:	7d 28 4a 14 	add     r9,r8,r9
   70:	7c 00 38 50 	subf    r0,r0,r7
   74:	78 00 0f e0 	rldicl  r0,r0,1,63
   78:	7c 00 4a 14 	add     r0,r0,r9
   7c:	78 09 00 20 	clrldi  r9,r0,32
   80:	7c 0b 02 14 	add     r0,r11,r0
   84:	7d 08 48 50 	subf    r8,r8,r9
   88:	79 08 0f e0 	rldicl  r8,r8,1,63
   8c:	7d 08 02 14 	add     r8,r8,r0
   90:	79 09 00 20 	clrldi  r9,r8,32
   94:	7d 03 42 14 	add     r8,r3,r8
   98:	7d 2b 48 50 	subf    r9,r11,r9
   9c:	79 29 0f e0 	rldicl  r9,r9,1,63
   a0:	7d 29 42 14 	add     r9,r9,r8
   a4:	79 28 00 20 	clrldi  r8,r9,32
   a8:	7d 2a 4a 14 	add     r9,r10,r9
   ac:	7d 03 40 50 	subf    r8,r3,r8
   b0:	79 08 0f e0 	rldicl  r8,r8,1,63
   b4:	7d 08 4a 14 	add     r8,r8,r9
   b8:	79 09 00 20 	clrldi  r9,r8,32
   bc:	7d 08 2a 14 	add     r8,r8,r5
   c0:	7d 2a 48 50 	subf    r9,r10,r9
   c4:	79 29 0f e0 	rldicl  r9,r9,1,63
   c8:	7d 29 42 14 	add     r9,r9,r8
   cc:	79 2a 00 20 	clrldi  r10,r9,32
   d0:	7d 29 32 14 	add     r9,r9,r6
   d4:	7c a5 50 50 	subf    r5,r5,r10
   d8:	78 a5 0f e0 	rldicl  r5,r5,1,63
   dc:	7d 25 4a 14 	add     r9,r5,r9
   e0:	79 2a 00 20 	clrldi  r10,r9,32
   e4:	7c c6 50 50 	subf    r6,r6,r10
   e8:	78 c6 0f e0 	rldicl  r6,r6,1,63
   ec:	7c c6 4a 14 	add     r6,r6,r9
   f0:	54 c3 80 3e 	rotlwi  r3,r6,16
   f4:	7c c6 1a 14 	add     r6,r6,r3
   f8:	7c c3 30 f8 	not     r3,r6
   fc:	78 63 84 22 	rldicl  r3,r3,48,48
  100:	4e 80 00 20 	blr

Christophe

>>
>>> --- a/arch/powerpc/lib/checksum_32.S
>>> +++ b/arch/powerpc/lib/checksum_32.S
>>> @@ -293,3 +293,36 @@ dst_error:
>>>       EX_TABLE(51b, dst_error);
>>>   EXPORT_SYMBOL(csum_partial_copy_generic)
>>> +
>>> +/*
>>> + * static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
>>> + *                      const struct in6_addr *daddr,
>>> + *                      __u32 len, __u8 proto, __wsum sum)
>>> + */
>>> +
>>> +_GLOBAL(csum_ipv6_magic)
>>> +    lwz    r8, 0(r3)
>>> +    lwz    r9, 4(r3)
>>> +    lwz    r10, 8(r3)
>>> +    lwz    r11, 12(r3)
>>> +    addc    r0, r5, r6
>>> +    adde    r0, r0, r7
>>> +    adde    r0, r0, r8
>>> +    adde    r0, r0, r9
>>> +    adde    r0, r0, r10
>>> +    adde    r0, r0, r11
>>> +    lwz    r8, 0(r4)
>>> +    lwz    r9, 4(r4)
>>> +    lwz    r10, 8(r4)
>>> +    lwz    r11, 12(r4)
>>> +    adde    r0, r0, r8
>>> +    adde    r0, r0, r9
>>> +    adde    r0, r0, r10
>>> +    adde    r0, r0, r11
>>> +    addze    r0, r0
>>> +    rotlwi    r3, r0, 16
>>> +    add    r3, r0, r3
>>> +    not    r3, r3
>>> +    rlwinm    r3, r3, 16, 16, 31
>>> +    blr
>>> +EXPORT_SYMBOL(csum_ipv6_magic)
>>
>> Clustering the loads and carry insns together is pretty much the worst 
>> you
>> can do on most 32-bit CPUs.
> 
> Oh, really ? __csum_partial is written that way too.
> 
> Right, now I tried interleaving the lwz and adde. I get no improvment at 
> all on a 885, but I get a 15% improvment on a 8321.
> 
> Christophe
> 
>>
>>
>> Segher
>>

^ permalink raw reply

* Re: [PATCH 2/2] selftests/powerpc: Add perf breakpoint test
From: Michael Ellerman @ 2018-05-24 10:30 UTC (permalink / raw)
  To: Michael Neuling; +Cc: linuxppc-dev, mikey
In-Reply-To: <20180522061428.5142-2-mikey@neuling.org>

Michael Neuling <mikey@neuling.org> writes:

> This tests perf hardware breakpoints (ie PERF_TYPE_BREAKPOINT) on
> powerpc.

This doesn't work for me on a P8 guest:

  test: perf-hwbreak
  tags: git_version:bb5602e
  !! killing perf-hwbreak
  !! child died by signal 15
  failure: perf-hwbreak


That means the harness killed it after 2 minutes.

Sometimes it gets further:

  test: perf-hwbreak
  tags: git_version:v4.17-rc3-109-gbb20240fb508
  threads=16 loops=1048576 scalar test
  !! killing perf-hwbreak
  !! child died by signal 15
  failure: perf-hwbreak


Do we just need to bump the timeout up, or is something going wrong
here?

cheers

^ permalink raw reply

* Re: [PATCH v2 1/7] powerpc/64s/radix: do not flush TLB on spurious fault
From: Nicholas Piggin @ 2018-05-24 10:37 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <87efi5y23v.fsf@linux.vnet.ibm.com>

On Mon, 21 May 2018 11:36:12 +0530
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> Nicholas Piggin <npiggin@gmail.com> writes:
> 
> > In the case of a spurious fault (which can happen due to a race with
> > another thread that changes the page table), the default Linux mm code
> > calls flush_tlb_page for that address. This is not required because
> > the pte will be re-fetched. Hash does not wire this up to a hardware
> > TLB flush for this reason. This patch avoids the flush for radix.
> >
> > From Power ISA v3.0B, p.1090:
> >
> >     Setting a Reference or Change Bit or Upgrading Access Authority
> >     (PTE Subject to Atomic Hardware Updates)
> >
> >     If the only change being made to a valid PTE that is subject to
> >     atomic hardware updates is to set the Refer- ence or Change bit to
> >     1 or to add access authorities, a simpler sequence suffices
> >     because the translation hardware will refetch the PTE if an access
> >     is attempted for which the only problems were reference and/or
> >     change bits needing to be set or insufficient access authority.
> >
> > The nest MMU on POWER9 does not re-fetch the PTE after such an access
> > attempt before faulting, so address spaces with a coprocessor
> > attached will continue to flush in these cases.
> >
> > This reduces tlbies for a kernel compile workload from 0.95M to 0.90M.
> >
> > fork --fork --exec benchmark improved 0.5% (12300->12400).
> >  
> 
> 
> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> 
> Do you want to use flush_tlb_fix_spurious_fault in
> ptep_set_access_flags() also?. That would bring it closer to generic version?

I'm not sure it adds much, it does bring it closer to generic version
and they do happen to do the same thing, but it's not really fixing a
spurious fault in ptep_set_access_flags(). I think adding that just
means you would have to follow another indirection to work out what it
does.

Thanks,
Nick

^ permalink raw reply

* [PATCH] powerpc/8xx: fix invalid register expression in head_8xx.S
From: Christophe Leroy @ 2018-05-24 11:02 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

New binutils generate the following warning

  AS      arch/powerpc/kernel/head_8xx.o
arch/powerpc/kernel/head_8xx.S: Assembler messages:
arch/powerpc/kernel/head_8xx.S:916: Warning: invalid register expression

This patch fixes it.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/kernel/head_8xx.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index d8670a37d70c..6cab07e76732 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -913,7 +913,7 @@ start_here:
 	tovirt(r6,r6)
 	lis	r5, abatron_pteptrs@h
 	ori	r5, r5, abatron_pteptrs@l
-	stw	r5, 0xf0(r0)	/* Must match your Abatron config file */
+	stw	r5, 0xf0(0)	/* Must match your Abatron config file */
 	tophys(r5,r5)
 	stw	r6, 0(r5)
 
-- 
2.13.3

^ permalink raw reply related

* [PATCH] powerpc/32: Optimise __csum_partial()
From: Christophe Leroy @ 2018-05-24 11:22 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
  Cc: linux-kernel, linuxppc-dev

Improve __csum_partial by interleaving loads and adds.

On a 8xx, it brings neither improvement nor degradation.
On a 83xx, it brings a 25% improvement.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/lib/checksum_32.S | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index d2238ea82209..aa224069f93a 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -47,16 +47,25 @@ _GLOBAL(__csum_partial)
 	bdnz	2b
 21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
 	beq	3f
+	lwz	r0,4(r3)
 	mtctr	r6
-22:	lwz	r0,4(r3)
 	lwz	r6,8(r3)
+	adde	r5,r5,r0
 	lwz	r7,12(r3)
+	adde	r5,r5,r6
 	lwzu	r8,16(r3)
+	adde	r5,r5,r7
+	bdz	23f
+22:	lwz	r0,4(r3)
+	adde	r5,r5,r8
+	lwz	r6,8(r3)
 	adde	r5,r5,r0
+	lwz	r7,12(r3)
 	adde	r5,r5,r6
+	lwzu	r8,16(r3)
 	adde	r5,r5,r7
-	adde	r5,r5,r8
 	bdnz	22b
+23:	adde	r5,r5,r8
 3:	andi.	r0,r4,2
 	beq+	4f
 	lhz	r0,4(r3)
-- 
2.13.3

^ permalink raw reply related

* [PATCH v4] powerpc: Implement csum_ipv6_magic in assembly
From: Christophe Leroy @ 2018-05-24 11:33 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
  Cc: linux-kernel, linuxppc-dev

The generic csum_ipv6_magic() generates a pretty bad result

00000000 <csum_ipv6_magic>: (PPC32)
   0:	81 23 00 00 	lwz     r9,0(r3)
   4:	81 03 00 04 	lwz     r8,4(r3)
   8:	7c e7 4a 14 	add     r7,r7,r9
   c:	7d 29 38 10 	subfc   r9,r9,r7
  10:	7d 4a 51 10 	subfe   r10,r10,r10
  14:	7d 27 42 14 	add     r9,r7,r8
  18:	7d 2a 48 50 	subf    r9,r10,r9
  1c:	80 e3 00 08 	lwz     r7,8(r3)
  20:	7d 08 48 10 	subfc   r8,r8,r9
  24:	7d 4a 51 10 	subfe   r10,r10,r10
  28:	7d 29 3a 14 	add     r9,r9,r7
  2c:	81 03 00 0c 	lwz     r8,12(r3)
  30:	7d 2a 48 50 	subf    r9,r10,r9
  34:	7c e7 48 10 	subfc   r7,r7,r9
  38:	7d 4a 51 10 	subfe   r10,r10,r10
  3c:	7d 29 42 14 	add     r9,r9,r8
  40:	7d 2a 48 50 	subf    r9,r10,r9
  44:	80 e4 00 00 	lwz     r7,0(r4)
  48:	7d 08 48 10 	subfc   r8,r8,r9
  4c:	7d 4a 51 10 	subfe   r10,r10,r10
  50:	7d 29 3a 14 	add     r9,r9,r7
  54:	7d 2a 48 50 	subf    r9,r10,r9
  58:	81 04 00 04 	lwz     r8,4(r4)
  5c:	7c e7 48 10 	subfc   r7,r7,r9
  60:	7d 4a 51 10 	subfe   r10,r10,r10
  64:	7d 29 42 14 	add     r9,r9,r8
  68:	7d 2a 48 50 	subf    r9,r10,r9
  6c:	80 e4 00 08 	lwz     r7,8(r4)
  70:	7d 08 48 10 	subfc   r8,r8,r9
  74:	7d 4a 51 10 	subfe   r10,r10,r10
  78:	7d 29 3a 14 	add     r9,r9,r7
  7c:	7d 2a 48 50 	subf    r9,r10,r9
  80:	81 04 00 0c 	lwz     r8,12(r4)
  84:	7c e7 48 10 	subfc   r7,r7,r9
  88:	7d 4a 51 10 	subfe   r10,r10,r10
  8c:	7d 29 42 14 	add     r9,r9,r8
  90:	7d 2a 48 50 	subf    r9,r10,r9
  94:	7d 08 48 10 	subfc   r8,r8,r9
  98:	7d 4a 51 10 	subfe   r10,r10,r10
  9c:	7d 29 2a 14 	add     r9,r9,r5
  a0:	7d 2a 48 50 	subf    r9,r10,r9
  a4:	7c a5 48 10 	subfc   r5,r5,r9
  a8:	7c 63 19 10 	subfe   r3,r3,r3
  ac:	7d 29 32 14 	add     r9,r9,r6
  b0:	7d 23 48 50 	subf    r9,r3,r9
  b4:	7c c6 48 10 	subfc   r6,r6,r9
  b8:	7c 63 19 10 	subfe   r3,r3,r3
  bc:	7c 63 48 50 	subf    r3,r3,r9
  c0:	54 6a 80 3e 	rotlwi  r10,r3,16
  c4:	7c 63 52 14 	add     r3,r3,r10
  c8:	7c 63 18 f8 	not     r3,r3
  cc:	54 63 84 3e 	rlwinm  r3,r3,16,16,31
  d0:	4e 80 00 20 	blr

0000000000000000 <.csum_ipv6_magic>: (PPC64)
   0:	81 23 00 00 	lwz     r9,0(r3)
   4:	80 03 00 04 	lwz     r0,4(r3)
   8:	81 63 00 08 	lwz     r11,8(r3)
   c:	7c e7 4a 14 	add     r7,r7,r9
  10:	7f 89 38 40 	cmplw   cr7,r9,r7
  14:	7d 47 02 14 	add     r10,r7,r0
  18:	7d 30 10 26 	mfocrf  r9,1
  1c:	55 29 f7 fe 	rlwinm  r9,r9,30,31,31
  20:	7d 4a 4a 14 	add     r10,r10,r9
  24:	7f 80 50 40 	cmplw   cr7,r0,r10
  28:	7d 2a 5a 14 	add     r9,r10,r11
  2c:	80 03 00 0c 	lwz     r0,12(r3)
  30:	81 44 00 00 	lwz     r10,0(r4)
  34:	7d 10 10 26 	mfocrf  r8,1
  38:	55 08 f7 fe 	rlwinm  r8,r8,30,31,31
  3c:	7d 29 42 14 	add     r9,r9,r8
  40:	81 04 00 04 	lwz     r8,4(r4)
  44:	7f 8b 48 40 	cmplw   cr7,r11,r9
  48:	7d 29 02 14 	add     r9,r9,r0
  4c:	7d 70 10 26 	mfocrf  r11,1
  50:	55 6b f7 fe 	rlwinm  r11,r11,30,31,31
  54:	7d 29 5a 14 	add     r9,r9,r11
  58:	7f 80 48 40 	cmplw   cr7,r0,r9
  5c:	7d 29 52 14 	add     r9,r9,r10
  60:	7c 10 10 26 	mfocrf  r0,1
  64:	54 00 f7 fe 	rlwinm  r0,r0,30,31,31
  68:	7d 69 02 14 	add     r11,r9,r0
  6c:	7f 8a 58 40 	cmplw   cr7,r10,r11
  70:	7c 0b 42 14 	add     r0,r11,r8
  74:	81 44 00 08 	lwz     r10,8(r4)
  78:	7c f0 10 26 	mfocrf  r7,1
  7c:	54 e7 f7 fe 	rlwinm  r7,r7,30,31,31
  80:	7c 00 3a 14 	add     r0,r0,r7
  84:	7f 88 00 40 	cmplw   cr7,r8,r0
  88:	7d 20 52 14 	add     r9,r0,r10
  8c:	80 04 00 0c 	lwz     r0,12(r4)
  90:	7d 70 10 26 	mfocrf  r11,1
  94:	55 6b f7 fe 	rlwinm  r11,r11,30,31,31
  98:	7d 29 5a 14 	add     r9,r9,r11
  9c:	7f 8a 48 40 	cmplw   cr7,r10,r9
  a0:	7d 29 02 14 	add     r9,r9,r0
  a4:	7d 70 10 26 	mfocrf  r11,1
  a8:	55 6b f7 fe 	rlwinm  r11,r11,30,31,31
  ac:	7d 29 5a 14 	add     r9,r9,r11
  b0:	7f 80 48 40 	cmplw   cr7,r0,r9
  b4:	7d 29 2a 14 	add     r9,r9,r5
  b8:	7c 10 10 26 	mfocrf  r0,1
  bc:	54 00 f7 fe 	rlwinm  r0,r0,30,31,31
  c0:	7d 29 02 14 	add     r9,r9,r0
  c4:	7f 85 48 40 	cmplw   cr7,r5,r9
  c8:	7c 09 32 14 	add     r0,r9,r6
  cc:	7d 50 10 26 	mfocrf  r10,1
  d0:	55 4a f7 fe 	rlwinm  r10,r10,30,31,31
  d4:	7c 00 52 14 	add     r0,r0,r10
  d8:	7f 80 30 40 	cmplw   cr7,r0,r6
  dc:	7d 30 10 26 	mfocrf  r9,1
  e0:	55 29 ef fe 	rlwinm  r9,r9,29,31,31
  e4:	7c 09 02 14 	add     r0,r9,r0
  e8:	54 03 80 3e 	rotlwi  r3,r0,16
  ec:	7c 03 02 14 	add     r0,r3,r0
  f0:	7c 03 00 f8 	not     r3,r0
  f4:	78 63 84 22 	rldicl  r3,r3,48,48
  f8:	4e 80 00 20 	blr

This patch implements it in assembly for both PPC32 and PPC64

Link: https://github.com/linuxppc/linux/issues/9
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 v4: Interleaved loads and adds in PPC32. No performance change on 8xx, 15% improvment on 83xx

 v3: Add support for PPC64 (please review, especially whether instructions order in optimal)

 v2: Fix number of args in final addze

 arch/powerpc/include/asm/checksum.h |  6 ++++++
 arch/powerpc/lib/checksum_32.S      | 33 +++++++++++++++++++++++++++++++++
 arch/powerpc/lib/checksum_64.S      | 28 ++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 54065caa40b3..a78a57e5058d 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -13,6 +13,7 @@
 #include <asm-generic/checksum.h>
 #else
 #include <linux/bitops.h>
+#include <linux/in6.h>
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
@@ -211,6 +212,11 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
 	return csum_fold(csum_partial(buff, len, 0));
 }
 
+#define _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum sum);
+
 #endif
 #endif /* __KERNEL__ */
 #endif
diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 9a671c774b22..d2238ea82209 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -293,3 +293,36 @@ dst_error:
 	EX_TABLE(51b, dst_error);
 
 EXPORT_SYMBOL(csum_partial_copy_generic)
+
+/*
+ * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ *			   const struct in6_addr *daddr,
+ *			   __u32 len, __u8 proto, __wsum sum)
+ */
+
+_GLOBAL(csum_ipv6_magic)
+	lwz	r8, 0(r3)
+	lwz	r9, 4(r3)
+	addc	r0, r7, r8
+	lwz	r10, 8(r3)
+	adde	r0, r0, r9
+	lwz	r11, 12(r3)
+	adde	r0, r0, r10
+	lwz	r8, 0(r4)
+	adde	r0, r0, r11
+	lwz	r9, 4(r4)
+	adde	r0, r0, r8
+	lwz	r10, 8(r4)
+	adde	r0, r0, r9
+	lwz	r11, 12(r4)
+	adde	r0, r0, r10
+	add	r5, r5, r6	/* assumption: len + proto doesn't carry */
+	adde	r0, r0, r11
+	adde	r0, r0, r5
+	addze	r0, r0
+	rotlwi	r3, r0, 16
+	add	r3, r0, r3
+	not	r3, r3
+	rlwinm	r3, r3, 16, 16, 31
+	blr
+EXPORT_SYMBOL(csum_ipv6_magic)
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index d7f1a966136e..bf0546e546fc 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -429,3 +429,31 @@ dstnr;	stb	r6,0(r4)
 	stw	r6,0(r8)
 	blr
 EXPORT_SYMBOL(csum_partial_copy_generic)
+
+/*
+ * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ *			   const struct in6_addr *daddr,
+ *			   __u32 len, __u8 proto, __wsum sum)
+ */
+
+_GLOBAL(csum_ipv6_magic)
+	ld	r8, 0(r3)
+	ld	r9, 8(r3)
+	add	r5, r5, r6
+	addc	r0, r8, r9
+	ld	r10, 0(r4)
+	ld	r11, 8(r4)
+	adde	r0, r0, r10
+	add	r5, r5, r7
+	adde	r0, r0, r11
+	adde	r0, r0, r5
+	addze	r0, r0
+	rotldi  r3 ,r0, 32		/* fold two 32 bit halves together */
+	add	r3, r0, r3
+	srdi	r0, r3, 32
+	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
+	add	r3, r0, r3
+	not	r3, r3
+	rlwinm	r3, r3, 16, 16, 31
+	blr
+EXPORT_SYMBOL(csum_ipv6_magic)
-- 
2.13.3

^ permalink raw reply related

* Re: [PATCH v2 2/2] selftests/powerpc: Add test to verify rfi flush across a system call
From: Michael Ellerman @ 2018-05-24 13:20 UTC (permalink / raw)
  To: Naveen N. Rao; +Cc: linuxppc-dev
In-Reply-To: <5777d4ff6a9216b189f349f98217d47b38278661.1526915409.git.naveen.n.rao@linux.vnet.ibm.com>

"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> writes:
> diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c b/tools/testing/selftests/powerpc/security/rfi_flush.c
> new file mode 100644
> index 000000000000..a20fe8eca161
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/security/rfi_flush.c
> @@ -0,0 +1,132 @@
...
> +
> +int rfi_flush_test(void)
> +{
> +	char *p;
> +	int repetitions = 10;
> +	int fd, passes = 0, iter, rc = 0;
> +	struct perf_event_read v;
> +	uint64_t l1d_misses_total = 0;
> +	unsigned long iterations = 100000, zero_size = 24*1024;
> +	int rfi_flush_org, rfi_flush;
> +
> +	SKIP_IF(geteuid() != 0);
> +
> +	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_org)) {
> +		perror("error reading powerpc/rfi_flush debugfs file");
> +	        printf("unable to determine current rfi_flush setting");
> +		return 1;
> +	}

This leads to a hard error on old kernels, which I don't want (breaks my CI).

So I changed it to:

	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_org)) {
		perror("Unable to read powerpc/rfi_flush debugfs file");
		SKIP_IF(1);
	}

cheers

^ permalink raw reply

* Re: [PATCH 2/2] i2c: opal: don't check number of messages in the driver
From: Peter Rosin @ 2018-05-24 14:13 UTC (permalink / raw)
  To: Wolfram Sang, linux-i2c
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel
In-Reply-To: <20180520065035.7920-3-wsa@the-dreams.de>

On 2018-05-20 08:50, Wolfram Sang wrote:
> Since commit 1eace8344c02 ("i2c: add param sanity check to
> i2c_transfer()") and b7f625840267 ("i2c: add quirk checks to core"), the
> I2C core does this check now. We can remove it here.
> 
> Signed-off-by: Wolfram Sang <wsa@the-dreams.de>

Reviewed-by: Peter Rosin <peda@axentia.se>

> ---
> 
> Only build tested.
> 
>  drivers/i2c/busses/i2c-opal.c | 4 ----
>  1 file changed, 4 deletions(-)
> 
> diff --git a/drivers/i2c/busses/i2c-opal.c b/drivers/i2c/busses/i2c-opal.c
> index 0aabb7eca0c5..dc2a23f4fb52 100644
> --- a/drivers/i2c/busses/i2c-opal.c
> +++ b/drivers/i2c/busses/i2c-opal.c
> @@ -94,8 +94,6 @@ static int i2c_opal_master_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
>  	 */
>  	memset(&req, 0, sizeof(req));
>  	switch(num) {
> -	case 0:
> -		return 0;
>  	case 1:
>  		req.type = (msgs[0].flags & I2C_M_RD) ?
>  			OPAL_I2C_RAW_READ : OPAL_I2C_RAW_WRITE;
> @@ -114,8 +112,6 @@ static int i2c_opal_master_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
>  		req.size = cpu_to_be32(msgs[1].len);
>  		req.buffer_ra = cpu_to_be64(__pa(msgs[1].buf));
>  		break;
> -	default:
> -		return -EOPNOTSUPP;
>  	}
>  
>  	rc = i2c_opal_send_request(opal_id, &req);
> 

^ permalink raw reply

* [PATCH] powerpc/32: implement strlen() in assembly
From: Christophe Leroy @ 2018-05-24 16:17 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
  Cc: linux-kernel, linuxppc-dev

The generic implementation of strlen() reads strings byte per byte.

This patch implements strlen() in assembly for PPC32 based on
a read of entire words, in the same spirit as what some other
arches and glibc do.

For long strings, the time spent in strlen is reduced by 50-60%

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 Applies after the patch 'powerpc/lib: move PPC32 specific functions out of string.S'

 arch/powerpc/include/asm/string.h |  3 +++
 arch/powerpc/lib/string_32.S      | 40 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 0f41686b6243..23ee2a0f2b21 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -15,6 +15,9 @@
 #define __HAVE_ARCH_MEMCHR
 #define __HAVE_ARCH_MEMSET16
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE
+#ifdef CONFIG_PPC32
+#define __HAVE_ARCH_STRLEN
+#endif
 
 extern char * strcpy(char *,const char *);
 extern __kernel_size_t strlen(const char *);
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index c4e70123d245..31575a698c97 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -62,6 +62,46 @@ _GLOBAL(memcmp)
 	blr
 EXPORT_SYMBOL(memcmp)
 
+_GLOBAL(strlen)
+	andi.   r9, r3, 3
+	addi	r10, r3, -4
+	beq+	2f
+1:	lbz	r9, 4(r10)
+	addi	r10, r10, 1
+	cmpwi	cr0, r9, 0
+	beq	19f
+	andi.	r9, r10, 3
+	bne	1b
+2:	lis	r6, 0x8080
+	ori	r6, r6, 0x8080
+	rlwinm	r7, r6, 1, 0xffffffff
+3:	lwzu	r9, 4(r10)
+	subf	r8, r7, r9
+	andc	r11, r6, r9
+	and.	r8, r8, r11
+	beq+	3b
+	rlwinm.	r8, r9, 0, 0xff000000
+	beq	20f
+	rlwinm.	r8, r9, 0, 0x00ff0000
+	beq	21f
+	rlwinm.	r8, r9, 0, 0x0000ff00
+	beq	22f
+	rlwinm.	r8, r9, 0, 0x000000ff
+	bne	3b
+23:	subf	r3, r3, r10
+	addi	r3, r3, 3
+	blr
+22:	subf	r3, r3, r10
+	addi	r3, r3, 2
+	blr
+21:	subf	r3, r3, r10
+	addi	r3, r3, 1
+	blr
+19:	addi	r10, r10, 3
+20:	subf	r3, r3, r10
+	blr
+EXPORT_SYMBOL(strlen)
+
 CACHELINE_BYTES = L1_CACHE_BYTES
 LG_CACHELINE_BYTES = L1_CACHE_SHIFT
 CACHELINE_MASK = (L1_CACHE_BYTES-1)
-- 
2.13.3

^ permalink raw reply related

* Re: [PATCH v4 3/3] powerpc/lib: optimise PPC32 memcmp
From: Segher Boessenkool @ 2018-05-24 17:24 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev, linux-kernel
In-Reply-To: <bb7ef10a-4777-7635-3e54-830dd3eeb8bb@c-s.fr>

On Wed, May 23, 2018 at 09:47:32AM +0200, Christophe Leroy wrote:
> At the time being, memcmp() compares two chunks of memory
> byte per byte.
> 
> This patch optimises the comparison by comparing word by word.
> 
> A small benchmark performed on an 8xx comparing two chuncks
> of 512 bytes performed 100000 times gives:
> 
> Before : 5852274 TB ticks
> After:   1488638 TB ticks

> diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
> index 40a576d56ac7..542e6cecbcaf 100644
> --- a/arch/powerpc/lib/string_32.S
> +++ b/arch/powerpc/lib/string_32.S
> @@ -16,17 +16,45 @@
>  	.text
> 
>  _GLOBAL(memcmp)
> -	cmpwi	cr0, r5, 0
> -	beq-	2f
> -	mtctr	r5
> -	addi	r6,r3,-1
> -	addi	r4,r4,-1
> -1:	lbzu	r3,1(r6)
> -	lbzu	r0,1(r4)
> -	subf.	r3,r0,r3
> -	bdnzt	2,1b
> +	srawi.	r7, r5, 2		/* Divide len by 4 */
> +	mr	r6, r3
> +	beq-	3f
> +	mtctr	r7
> +	li	r7, 0
> +1:
> +#ifdef __LITTLE_ENDIAN__
> +	lwbrx	r3, r6, r7
> +	lwbrx	r0, r4, r7
> +#else
> +	lwzx	r3, r6, r7
> +	lwzx	r0, r4, r7
> +#endif

You don't test whether the pointers are word-aligned.  Does that work?
Say, when a load is crossing a page boundary, or segment boundary.


Segher

^ permalink raw reply

* [PATCH v3 0/7] Various TLB and PTE improvements
From: Nicholas Piggin @ 2018-05-24 17:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin

Since last time:
- Fixed compile error on ppc32
- Significantly reworked mm_cpumask reset patch to restore the
  lazy PID context switch optimisation, and not over-flush the
  local CPU when flushing remotes (using IPIs).
- Moved mm_cpumask reset patch to the end of the series.

Nicholas Piggin (7):
  powerpc/64s/radix: do not flush TLB when relaxing access
  powerpc/64s/radix: do not flush TLB on spurious fault
  powerpc/64s/radix: make ptep_get_and_clear_full non-atomic for the
    full case
  powerpc/64s/radix: prefetch user address in update_mmu_cache
  powerpc/64s/radix: avoid ptesync after set_pte and
    ptep_set_access_flags
  powerpc/64s/radix: optimise pte_update
  powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask

 arch/powerpc/include/asm/book3s/64/radix.h    |  37 ++--
 arch/powerpc/include/asm/book3s/64/tlbflush.h |  12 +-
 arch/powerpc/include/asm/cacheflush.h         |  13 ++
 arch/powerpc/include/asm/tlb.h                |  13 ++
 arch/powerpc/mm/mem.c                         |   4 +-
 arch/powerpc/mm/mmu_context.c                 |   6 +-
 arch/powerpc/mm/pgtable-book3s64.c            |  13 +-
 arch/powerpc/mm/pgtable.c                     |  25 ++-
 arch/powerpc/mm/tlb-radix.c                   | 159 +++++++++++++++---
 9 files changed, 222 insertions(+), 60 deletions(-)

-- 
2.17.0

^ permalink raw reply

* [PATCH v3 1/7] powerpc/64s/radix: do not flush TLB when relaxing access
From: Nicholas Piggin @ 2018-05-24 17:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180524175853.19695-1-npiggin@gmail.com>

Radix flushes the TLB when updating ptes to increase permissiveness
of protection (increase access authority). Book3S does not require
TLB flushing in this case, and it is not done on hash. This patch
avoids the flush for radix.

>From Power ISA v3.0B, p.1090:

    Setting a Reference or Change Bit or Upgrading Access Authority
    (PTE Subject to Atomic Hardware Updates)

    If the only change being made to a valid PTE that is subject to
    atomic hardware updates is to set the Reference or Change bit to 1
    or to add access authorities, a simpler sequence suffices because
    the translation hardware will refetch the PTE if an access is
    attempted for which the only problems were reference and/or change
    bits needing to be set or insufficient access authority.

The nest MMU on POWER9 does not re-fetch the PTE after such an access
attempt before faulting, so address spaces with a coprocessor
attached will continue to flush in these cases.

This reduces tlbies for a kernel compile workload from 1.28M to 0.95M,
tlbiels from 20.17M 19.68M.

fork --fork --exec benchmark improved 2.77% (12000->12300).

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/pgtable-book3s64.c | 10 +++++++---
 arch/powerpc/mm/pgtable.c          | 25 +++++++++++++++++++++++--
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 518518fb7c45..994492453f0e 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -31,16 +31,20 @@ int (*register_process_table)(unsigned long base, unsigned long page_size,
 int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 			  pmd_t *pmdp, pmd_t entry, int dirty)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	int changed;
 #ifdef CONFIG_DEBUG_VM
 	WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
-	assert_spin_locked(&vma->vm_mm->page_table_lock);
+	assert_spin_locked(&mm->page_table_lock);
 #endif
 	changed = !pmd_same(*(pmdp), entry);
 	if (changed) {
-		__ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp),
+		__ptep_set_access_flags(mm, pmdp_ptep(pmdp),
 					pmd_pte(entry), address);
-		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+		/* See ptep_set_access_flags comments */
+		if (atomic_read(&mm->context.copros) > 0)
+			flush_pmd_tlb_range(vma, address,
+					address + HPAGE_PMD_SIZE);
 	}
 	return changed;
 }
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9f361ae571e9..02a24bce7e51 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -217,14 +217,35 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 			  pte_t *ptep, pte_t entry, int dirty)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	int changed;
+
 	entry = set_access_flags_filter(entry, vma, dirty);
 	changed = !pte_same(*(ptep), entry);
 	if (changed) {
 		if (!is_vm_hugetlb_page(vma))
-			assert_pte_locked(vma->vm_mm, address);
-		__ptep_set_access_flags(vma->vm_mm, ptep, entry, address);
+			assert_pte_locked(mm, address);
+		__ptep_set_access_flags(mm, ptep, entry, address);
+#ifdef CONFIG_PPC_BOOK3S_64
+		/*
+		 * Book3S does not require a TLB flush when relaxing access
+		 * restrictions because the core MMU will reload the pte after
+		 * taking an access fault. However the NMMU on POWER9 does not
+		 * re-load the pte, so flush if we have a coprocessor attached
+		 * to this address space.
+		 *
+		 * This could be further refined and pushed out to NMMU drivers
+		 * so TLBIEs are only done for NMMU faults, but this is a more
+		 * minimal fix. The NMMU fault handler does
+		 * get_user_pages_remote or similar to bring the page tables
+		 * in, and this flush_tlb_page will do a global TLBIE because
+		 * the coprocessor is attached to the address space.
+		 */
+		if (atomic_read(&mm->context.copros) > 0)
+			flush_tlb_page(vma, address);
+#else
 		flush_tlb_page(vma, address);
+#endif
 	}
 	return changed;
 }
-- 
2.17.0

^ permalink raw reply related

* [PATCH v3 2/7] powerpc/64s/radix: do not flush TLB on spurious fault
From: Nicholas Piggin @ 2018-05-24 17:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180524175853.19695-1-npiggin@gmail.com>

In the case of a spurious fault (which can happen due to a race with
another thread that changes the page table), the default Linux mm code
calls flush_tlb_page for that address. This is not required because
the pte will be re-fetched. Hash does not wire this up to a hardware
TLB flush for this reason. This patch avoids the flush for radix.

>From Power ISA v3.0B, p.1090:

    Setting a Reference or Change Bit or Upgrading Access Authority
    (PTE Subject to Atomic Hardware Updates)

    If the only change being made to a valid PTE that is subject to
    atomic hardware updates is to set the Refer- ence or Change bit to
    1 or to add access authorities, a simpler sequence suffices
    because the translation hardware will refetch the PTE if an access
    is attempted for which the only problems were reference and/or
    change bits needing to be set or insufficient access authority.

The nest MMU on POWER9 does not re-fetch the PTE after such an access
attempt before faulting, so address spaces with a coprocessor
attached will continue to flush in these cases.

This reduces tlbies for a kernel compile workload from 0.95M to 0.90M.

fork --fork --exec benchmark improved 0.5% (12300->12400).

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/tlbflush.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 0cac17253513..ebf572ea621e 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -4,7 +4,7 @@
 
 #define MMU_NO_CONTEXT	~0UL
 
-
+#include <linux/mm_types.h>
 #include <asm/book3s/64/tlbflush-hash.h>
 #include <asm/book3s/64/tlbflush-radix.h>
 
@@ -137,6 +137,16 @@ static inline void flush_all_mm(struct mm_struct *mm)
 #define flush_tlb_page(vma, addr)	local_flush_tlb_page(vma, addr)
 #define flush_all_mm(mm)		local_flush_all_mm(mm)
 #endif /* CONFIG_SMP */
+
+#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
+static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
+						unsigned long address)
+{
+	/* See ptep_set_access_flags comment */
+	if (atomic_read(&vma->vm_mm->context.copros) > 0)
+		flush_tlb_page(vma, address);
+}
+
 /*
  * flush the page walk cache for the address
  */
-- 
2.17.0

^ permalink raw reply related

* [PATCH v3 3/7] powerpc/64s/radix: make ptep_get_and_clear_full non-atomic for the full case
From: Nicholas Piggin @ 2018-05-24 17:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180524175853.19695-1-npiggin@gmail.com>

This matches other architectures, when we know there will be no
further accesses to the address (e.g., for teardown), page table
entries can be cleared non-atomically.

The comments about NMMU are bogus: all MMU notifiers (including NMMU)
are released at this point, with their TLBs flushed. An NMMU access at
this point would be a bug.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 705193e7192f..fcd92f9b6ec0 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -176,14 +176,8 @@ static inline pte_t radix__ptep_get_and_clear_full(struct mm_struct *mm,
 	unsigned long old_pte;
 
 	if (full) {
-		/*
-		 * If we are trying to clear the pte, we can skip
-		 * the DD1 pte update sequence and batch the tlb flush. The
-		 * tlb flush batching is done by mmu gather code. We
-		 * still keep the cmp_xchg update to make sure we get
-		 * correct R/C bit which might be updated via Nest MMU.
-		 */
-		old_pte = __radix_pte_update(ptep, ~0ul, 0);
+		old_pte = pte_val(*ptep);
+		*ptep = __pte(0);
 	} else
 		old_pte = radix__pte_update(mm, addr, ptep, ~0ul, 0, 0);
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH v3 4/7] powerpc/64s/radix: prefetch user address in update_mmu_cache
From: Nicholas Piggin @ 2018-05-24 17:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180524175853.19695-1-npiggin@gmail.com>

Prefetch the faulting address in update_mmu_cache to give the page
table walker perhaps 100 cycles head start as locks are dropped and
the interrupt completed.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/mem.c              | 4 +++-
 arch/powerpc/mm/pgtable-book3s64.c | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c3c39b02b2ba..8cecda4bd66a 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -509,8 +509,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 	 */
 	unsigned long access, trap;
 
-	if (radix_enabled())
+	if (radix_enabled()) {
+		prefetch((void *)address);
 		return;
+	}
 
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
 	if (!pte_young(*ptep) || address >= TASK_SIZE)
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 994492453f0e..7ce889a7e5ce 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -145,7 +145,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 			  pmd_t *pmd)
 {
-	return;
+	if (radix_enabled())
+		prefetch((void *)addr);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH v3 5/7] powerpc/64s/radix: avoid ptesync after set_pte and ptep_set_access_flags
From: Nicholas Piggin @ 2018-05-24 17:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180524175853.19695-1-npiggin@gmail.com>

The ISA suggests ptesync after setting a pte, to prevent a table walk
initiated by a subsequent access from missing that store and causing a
spurious fault. This is an architectual allowance that allows an
implementation's page table walker to be incoherent with the store
queue.

However there is no correctness problem in taking a spurious fault in
userspace -- the kernel copes with these at any time, so the updated
pte will be found eventually. Spurious kernel faults on vmap memory
must be avoided, so a ptesync is put into flush_cache_vmap.

On POWER9 so far I have not found a measurable window where this can
result in more minor faults, so as an optimisation, remove the costly
ptesync from pte updates. If an implementation benefits from ptesync,
it would be better to add it back in update_mmu_cache, so it's not
done for things like fork(2).

fork --fork --exec benchmark improved 5.2% (12400->13100).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |  2 --
 arch/powerpc/include/asm/cacheflush.h      | 13 +++++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index fcd92f9b6ec0..45bf1e1b1d33 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -209,7 +209,6 @@ static inline void radix__ptep_set_access_flags(struct mm_struct *mm,
 		__radix_pte_update(ptep, 0, new_pte);
 	} else
 		__radix_pte_update(ptep, 0, set);
-	asm volatile("ptesync" : : : "memory");
 }
 
 static inline int radix__pte_same(pte_t pte_a, pte_t pte_b)
@@ -226,7 +225,6 @@ static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr,
 				 pte_t *ptep, pte_t pte, int percpu)
 {
 	*ptep = pte;
-	asm volatile("ptesync" : : : "memory");
 }
 
 static inline int radix__pmd_bad(pmd_t pmd)
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index 11843e37d9cf..e9662648e72d 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -26,6 +26,19 @@
 #define flush_cache_vmap(start, end)		do { } while (0)
 #define flush_cache_vunmap(start, end)		do { } while (0)
 
+#ifdef CONFIG_BOOK3S_64
+/*
+ * Book3s has no ptesync after setting a pte, so without this ptesync it's
+ * possible for a kernel virtual mapping access to return a spurious fault
+ * if it's accessed right after the pte is set. The page fault handler does
+ * not expect this type of fault. flush_cache_vmap is not exactly the right
+ * place to put this, but it seems to work well enough.
+ */
+#define flush_cache_vmap(start, end)		do { asm volatile("ptesync"); } while (0)
+#else
+#define flush_cache_vmap(start, end)		do { } while (0)
+#endif
+
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
-- 
2.17.0

^ permalink raw reply related

* [PATCH v3 6/7] powerpc/64s/radix: optimise pte_update
From: Nicholas Piggin @ 2018-05-24 17:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180524175853.19695-1-npiggin@gmail.com>

Implementing pte_update with pte_xchg (which uses cmpxchg) is
inefficient. A single larx/stcx. works fine, no need for the less
efficient cmpxchg sequence.

Then remove the memory barriers from the operation. There is a
requirement for TLB flushing to load mm_cpumask after the store
that reduces pte permissions, which is moved into the TLB flush
code.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 25 +++++++++++-----------
 arch/powerpc/mm/mmu_context.c              |  6 ++++--
 arch/powerpc/mm/tlb-radix.c                | 11 +++++++++-
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 45bf1e1b1d33..cc9437a542cc 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -127,20 +127,21 @@ extern void radix__mark_initmem_nx(void);
 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
 					       unsigned long set)
 {
-	pte_t pte;
-	unsigned long old_pte, new_pte;
-
-	do {
-		pte = READ_ONCE(*ptep);
-		old_pte = pte_val(pte);
-		new_pte = (old_pte | set) & ~clr;
-
-	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
-
-	return old_pte;
+	__be64 old_be, tmp_be;
+
+	__asm__ __volatile__(
+	"1:	ldarx	%0,0,%3		# pte_update\n"
+	"	andc	%1,%0,%5	\n"
+	"	or	%1,%1,%4	\n"
+	"	stdcx.	%1,0,%3		\n"
+	"	bne-	1b"
+	: "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep)
+	: "r" (ptep), "r" (cpu_to_be64(set)), "r" (cpu_to_be64(clr))
+	: "cc" );
+
+	return be64_to_cpu(old_be);
 }
 
-
 static inline unsigned long radix__pte_update(struct mm_struct *mm,
 					unsigned long addr,
 					pte_t *ptep, unsigned long clr,
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 0ab297c4cfad..f84e14f23e50 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -57,8 +57,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * in switch_slb(), and/or the store of paca->mm_ctx_id in
 		 * copy_mm_to_paca().
 		 *
-		 * On the read side the barrier is in pte_xchg(), which orders
-		 * the store to the PTE vs the load of mm_cpumask.
+		 * On the other side, the barrier is in mm/tlb-radix.c for
+		 * radix which orders earlier stores to clear the PTEs vs
+		 * the load of mm_cpumask. And pte_xchg which does the same
+		 * thing for hash.
 		 *
 		 * This full barrier is needed by membarrier when switching
 		 * between processes after store to rq->curr, before user-space
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 5ac3206c51cc..cdc50398fd60 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -524,6 +524,11 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 		return;
 
 	preempt_disable();
+	/*
+	 * Order loads of mm_cpumask vs previous stores to clear ptes before
+	 * the invalidate. See barrier in switch_mm_irqs_off
+	 */
+	smp_mb();
 	if (!mm_is_thread_local(mm)) {
 		if (mm_needs_flush_escalation(mm))
 			_tlbie_pid(pid, RIC_FLUSH_ALL);
@@ -544,6 +549,7 @@ void radix__flush_all_mm(struct mm_struct *mm)
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
 	else
@@ -568,6 +574,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (!mm_is_thread_local(mm))
 		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	else
@@ -630,6 +637,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		local = true;
 		full = (end == TLB_FLUSH_ALL ||
@@ -791,6 +799,7 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 		return;
 
 	preempt_disable();
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		local = true;
 		full = (end == TLB_FLUSH_ALL ||
@@ -849,7 +858,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 
 	/* Otherwise first do the PWC, then iterate the pages. */
 	preempt_disable();
-
+	smp_mb(); /* see radix__flush_tlb_mm */
 	if (mm_is_thread_local(mm)) {
 		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	} else {
-- 
2.17.0

^ permalink raw reply related

* [PATCH v3 7/7] powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask
From: Nicholas Piggin @ 2018-05-24 17:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180524175853.19695-1-npiggin@gmail.com>

When a single-threaded process has a non-local mm_cpumask, try to use
that point to flush the TLBs out of other CPUs in the cpumask.

An IPI is used for clearing remote CPUs for a few reasons:
- An IPI can end lazy TLB use of the mm, which is required to prevent
  TLB entries being created on the remote CPU. The alternative is to
  drop lazy TLB switching completely, which costs 7.5% in a context
  switch ping-pong test betwee a process and kernel idle thread.
- An IPI can have remote CPUs flush the entire PID, but the local CPU
  can flush a specific VA. tlbie would require over-flushing of the
  local CPU (where the process is running).
- A single threaded process that is migrated to a different CPU is
  likely to have a relatively small mm_cpumask, so IPI is reasonable.

No other thread can concurrently switch to this mm, because it must
have been given a reference to mm_users by the current thread before it
can use_mm. mm_users can be asynchronously incremented (by
mm_activate or mmget_not_zero), but those users must use remote mm
access and can't use_mm or access user address space. Existing code
makes the this assumption already, for example sparc64 has reset
mm_cpumask using this condition since the start of history, see
arch/sparc/kernel/smp_64.c.

This reduces tlbies for a kernel compile workload from 0.90M to 0.12M,
tlbiels are increased significantly due to the PID flushing for the
cleaning up remote CPUs, and increased local flushes (PID flushes take
128 tlbiels vs 1 tlbie).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/tlb.h |  13 +++
 arch/powerpc/mm/tlb-radix.c    | 148 +++++++++++++++++++++++++++------
 2 files changed, 134 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index a7eabff27a0f..9138baccebb0 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -76,6 +76,19 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
 		return false;
 	return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
 }
+static inline void mm_reset_thread_local(struct mm_struct *mm)
+{
+	WARN_ON(atomic_read(&mm->context.copros) > 0);
+	/*
+	 * It's possible for mm_access to take a reference on mm_users to
+	 * access the remote mm from another thread, but it's not allowed
+	 * to set mm_cpumask, so mm_users may be > 1 here.
+	 */
+	WARN_ON(current->mm != mm);
+	atomic_set(&mm->context.active_cpus, 1);
+	cpumask_clear(mm_cpumask(mm));
+	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
+}
 #else /* CONFIG_PPC_BOOK3S_64 */
 static inline int mm_is_thread_local(struct mm_struct *mm)
 {
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index cdc50398fd60..67a6e86d3e7e 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -12,6 +12,8 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
 
 #include <asm/ppc-opcode.h>
 #include <asm/tlb.h>
@@ -504,6 +506,15 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
 }
 EXPORT_SYMBOL(radix__local_flush_tlb_page);
 
+static bool mm_is_singlethreaded(struct mm_struct *mm)
+{
+	if (atomic_read(&mm->context.copros) > 0)
+		return false;
+	if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
+		return true;
+	return false;
+}
+
 static bool mm_needs_flush_escalation(struct mm_struct *mm)
 {
 	/*
@@ -511,10 +522,47 @@ static bool mm_needs_flush_escalation(struct mm_struct *mm)
 	 * caching PTEs and not flushing them properly when
 	 * RIC = 0 for a PID/LPID invalidate
 	 */
-	return atomic_read(&mm->context.copros) != 0;
+	if (atomic_read(&mm->context.copros) > 0)
+		return true;
+	return false;
 }
 
 #ifdef CONFIG_SMP
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+	unsigned long pid = mm->context.id;
+
+	if (current->mm == mm)
+		return; /* Local CPU */
+
+	if (current->active_mm == mm) {
+		/*
+		 * Must be a kernel thread because sender is single-threaded.
+		 */
+		BUG_ON(current->mm);
+		mmgrab(&init_mm);
+		switch_mm(mm, &init_mm, current);
+		current->active_mm = &init_mm;
+		mmdrop(mm);
+	}
+	_tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+static void exit_flush_lazy_tlbs(struct mm_struct *mm)
+{
+	/*
+	 * Would be nice if this was async so it could be run in
+	 * parallel with our local flush, but generic code does not
+	 * give a good API for it. Could extend the generic code or
+	 * make a special powerpc IPI for flushing TLBs.
+	 * For now it's not too performance critical.
+	 */
+	smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
+				(void *)mm, 1);
+	mm_reset_thread_local(mm);
+}
+
 void radix__flush_tlb_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
@@ -530,17 +578,24 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 	 */
 	smp_mb();
 	if (!mm_is_thread_local(mm)) {
+		if (unlikely(mm_is_singlethreaded(mm))) {
+			exit_flush_lazy_tlbs(mm);
+			goto local;
+		}
+
 		if (mm_needs_flush_escalation(mm))
 			_tlbie_pid(pid, RIC_FLUSH_ALL);
 		else
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
-	} else
+	} else {
+local:
 		_tlbiel_pid(pid, RIC_FLUSH_TLB);
+	}
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
 
-void radix__flush_all_mm(struct mm_struct *mm)
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
 {
 	unsigned long pid;
 
@@ -550,12 +605,24 @@ void radix__flush_all_mm(struct mm_struct *mm)
 
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	if (!mm_is_thread_local(mm))
+	if (!mm_is_thread_local(mm)) {
+		if (unlikely(mm_is_singlethreaded(mm))) {
+			if (!fullmm) {
+				exit_flush_lazy_tlbs(mm);
+				goto local;
+			}
+		}
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
-	else
+	} else {
+local:
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
+	}
 	preempt_enable();
 }
+void radix__flush_all_mm(struct mm_struct *mm)
+{
+	__flush_all_mm(mm, false);
+}
 EXPORT_SYMBOL(radix__flush_all_mm);
 
 void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
@@ -575,10 +642,16 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	if (!mm_is_thread_local(mm))
+	if (!mm_is_thread_local(mm)) {
+		if (unlikely(mm_is_singlethreaded(mm))) {
+			exit_flush_lazy_tlbs(mm);
+			goto local;
+		}
 		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
-	else
+	} else {
+local:
 		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+	}
 	preempt_enable();
 }
 
@@ -638,14 +711,21 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	if (mm_is_thread_local(mm)) {
-		local = true;
-		full = (end == TLB_FLUSH_ALL ||
-				nr_pages > tlb_local_single_page_flush_ceiling);
-	} else {
+	if (!mm_is_thread_local(mm)) {
+		if (unlikely(mm_is_singlethreaded(mm))) {
+			if (end != TLB_FLUSH_ALL) {
+				exit_flush_lazy_tlbs(mm);
+				goto is_local;
+			}
+		}
 		local = false;
 		full = (end == TLB_FLUSH_ALL ||
 				nr_pages > tlb_single_page_flush_ceiling);
+	} else {
+is_local:
+		local = true;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_local_single_page_flush_ceiling);
 	}
 
 	if (full) {
@@ -766,7 +846,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	 * See the comment for radix in arch_exit_mmap().
 	 */
 	if (tlb->fullmm) {
-		radix__flush_all_mm(mm);
+		__flush_all_mm(mm, true);
 	} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
 		if (!tlb->need_flush_all)
 			radix__flush_tlb_mm(mm);
@@ -800,24 +880,32 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	if (mm_is_thread_local(mm)) {
-		local = true;
-		full = (end == TLB_FLUSH_ALL ||
-				nr_pages > tlb_local_single_page_flush_ceiling);
-	} else {
+	if (!mm_is_thread_local(mm)) {
+		if (unlikely(mm_is_singlethreaded(mm))) {
+			if (end != TLB_FLUSH_ALL) {
+				exit_flush_lazy_tlbs(mm);
+				goto is_local;
+			}
+		}
 		local = false;
 		full = (end == TLB_FLUSH_ALL ||
 				nr_pages > tlb_single_page_flush_ceiling);
+	} else {
+is_local:
+		local = true;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_local_single_page_flush_ceiling);
 	}
 
 	if (full) {
-		if (!local && mm_needs_flush_escalation(mm))
-			also_pwc = true;
-
-		if (local)
+		if (local) {
 			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
-		else
-			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
+		} else {
+			if (mm_needs_flush_escalation(mm))
+				also_pwc = true;
+
+			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+		}
 	} else {
 		if (local)
 			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
@@ -859,10 +947,16 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 	/* Otherwise first do the PWC, then iterate the pages. */
 	preempt_disable();
 	smp_mb(); /* see radix__flush_tlb_mm */
-	if (mm_is_thread_local(mm)) {
-		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
-	} else {
+	if (!mm_is_thread_local(mm)) {
+		if (unlikely(mm_is_singlethreaded(mm))) {
+			exit_flush_lazy_tlbs(mm);
+			goto local;
+		}
 		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+		goto local;
+	} else {
+local:
+		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	}
 
 	preempt_enable();
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH v2 2/2] selftests/powerpc: Add test to verify rfi flush across a system call
From: Naveen N. Rao @ 2018-05-24 18:29 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev
In-Reply-To: <87r2m142ci.fsf@concordia.ellerman.id.au>

Michael Ellerman wrote:
> "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> writes:
>> diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c b/tool=
s/testing/selftests/powerpc/security/rfi_flush.c
>> new file mode 100644
>> index 000000000000..a20fe8eca161
>> --- /dev/null
>> +++ b/tools/testing/selftests/powerpc/security/rfi_flush.c
>> @@ -0,0 +1,132 @@
> ...
>> +
>> +int rfi_flush_test(void)
>> +{
>> +	char *p;
>> +	int repetitions =3D 10;
>> +	int fd, passes =3D 0, iter, rc =3D 0;
>> +	struct perf_event_read v;
>> +	uint64_t l1d_misses_total =3D 0;
>> +	unsigned long iterations =3D 100000, zero_size =3D 24*1024;
>> +	int rfi_flush_org, rfi_flush;
>> +
>> +	SKIP_IF(geteuid() !=3D 0);
>> +
>> +	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_org)) {
>> +		perror("error reading powerpc/rfi_flush debugfs file");
>> +	        printf("unable to determine current rfi_flush setting");
>> +		return 1;
>> +	}
>=20
> This leads to a hard error on old kernels, which I don't want (breaks my =
CI).

Hmm... didn't realize new tests would be run on older kernels. Are those=20
older stable/distro releases, or actually just earlier kernel versions?

>=20
> So I changed it to:
>=20
> 	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_org)) {
> 		perror("Unable to read powerpc/rfi_flush debugfs file");
> 		SKIP_IF(1);
> 	}

Sure, thanks!
- Naveen

=

^ permalink raw reply

* Re: [PATCH v3] powerpc: Implement csum_ipv6_magic in assembly
From: Segher Boessenkool @ 2018-05-24 19:42 UTC (permalink / raw)
  To: Christophe LEROY
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev, netdev
In-Reply-To: <3848a4ad-2c0e-691f-e98f-347cfe3484e8@c-s.fr>

On Thu, May 24, 2018 at 08:20:16AM +0200, Christophe LEROY wrote:
> Le 23/05/2018 à 20:34, Segher Boessenkool a écrit :
> >On Tue, May 22, 2018 at 08:57:01AM +0200, Christophe Leroy wrote:
> >>+_GLOBAL(csum_ipv6_magic)
> >>+	lwz	r8, 0(r3)
> >>+	lwz	r9, 4(r3)
> >>+	lwz	r10, 8(r3)
> >>+	lwz	r11, 12(r3)
> >>+	addc	r0, r5, r6
> >>+	adde	r0, r0, r7
> >>+	adde	r0, r0, r8
> >>+	adde	r0, r0, r9
> >>+	adde	r0, r0, r10
> >>+	adde	r0, r0, r11
> >>+	lwz	r8, 0(r4)
> >>+	lwz	r9, 4(r4)
> >>+	lwz	r10, 8(r4)
> >>+	lwz	r11, 12(r4)
> >>+	adde	r0, r0, r8
> >>+	adde	r0, r0, r9
> >>+	adde	r0, r0, r10
> >>+	adde	r0, r0, r11
> >>+	addze	r0, r0
> >>+	rotlwi	r3, r0, 16
> >>+	add	r3, r0, r3
> >>+	not	r3, r3
> >>+	rlwinm	r3, r3, 16, 16, 31
> >>+	blr
> >>+EXPORT_SYMBOL(csum_ipv6_magic)
> >
> >Clustering the loads and carry insns together is pretty much the worst you
> >can do on most 32-bit CPUs.
> 
> Oh, really ? __csum_partial is written that way too.

I thought I told you about this before?  Maybe not.

> Right, now I tried interleaving the lwz and adde. I get no improvment at 
> all on a 885, but I get a 15% improvment on a 8321.

It won't likely help on single-issue cores (like the one 885 has), yes.


Segher

^ permalink raw reply

* Re: [PATCH v3] powerpc: Implement csum_ipv6_magic in assembly
From: Segher Boessenkool @ 2018-05-24 19:55 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: linux-kernel, Paul Mackerras, netdev, linuxppc-dev
In-Reply-To: <1dac2356-5d8a-2892-109e-6e1b26c2bd8c@c-s.fr>

On Thu, May 24, 2018 at 10:18:44AM +0000, Christophe Leroy wrote:
> On 05/24/2018 06:20 AM, Christophe LEROY wrote:
> >Le 23/05/2018 à 20:34, Segher Boessenkool a écrit :
> >>On Tue, May 22, 2018 at 08:57:01AM +0200, Christophe Leroy wrote:
> >>>The generic csum_ipv6_magic() generates a pretty bad result
> >>
> >><snip>
> >>
> >>Please try with a more recent compiler, what you used is pretty ancient.
> >>It's not like recent compilers do great on this either, but it's not
> >>*that* bad anymore ;-)
> 
> Here is what I get with GCC 8.1
> It doesn't look much better, does it ?

There are no more mfocrf, which is a big speedup.  Other than that it is
pretty lousy still, I totally agree.  This improvement happened quite a
while ago, it's fixed in GCC 6.


Segher

^ permalink raw reply

* Re: [PATCH] powerpc/32: Optimise __csum_partial()
From: Segher Boessenkool @ 2018-05-24 19:58 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev
In-Reply-To: <484bcfaccc1ec3d91b74aeaaa26a0ae66fe0955a.1527160868.git.christophe.leroy@c-s.fr>

On Thu, May 24, 2018 at 11:22:27AM +0000, Christophe Leroy wrote:
> Improve __csum_partial by interleaving loads and adds.
> 
> On a 8xx, it brings neither improvement nor degradation.
> On a 83xx, it brings a 25% improvement.

Thanks!  Looks fine to me.

> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>

Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org>

> ---
>  arch/powerpc/lib/checksum_32.S | 13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
> index d2238ea82209..aa224069f93a 100644
> --- a/arch/powerpc/lib/checksum_32.S
> +++ b/arch/powerpc/lib/checksum_32.S
> @@ -47,16 +47,25 @@ _GLOBAL(__csum_partial)
>  	bdnz	2b
>  21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
>  	beq	3f
> +	lwz	r0,4(r3)
>  	mtctr	r6
> -22:	lwz	r0,4(r3)
>  	lwz	r6,8(r3)
> +	adde	r5,r5,r0
>  	lwz	r7,12(r3)
> +	adde	r5,r5,r6
>  	lwzu	r8,16(r3)
> +	adde	r5,r5,r7
> +	bdz	23f
> +22:	lwz	r0,4(r3)
> +	adde	r5,r5,r8
> +	lwz	r6,8(r3)
>  	adde	r5,r5,r0
> +	lwz	r7,12(r3)
>  	adde	r5,r5,r6
> +	lwzu	r8,16(r3)
>  	adde	r5,r5,r7
> -	adde	r5,r5,r8
>  	bdnz	22b
> +23:	adde	r5,r5,r8
>  3:	andi.	r0,r4,2
>  	beq+	4f
>  	lhz	r0,4(r3)
> -- 
> 2.13.3

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox