LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] powerpc/lib: implement strlen() in assembly
From: Christophe Leroy @ 2018-05-30  8:33 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
  Cc: linux-kernel, linuxppc-dev

The generic implementation of strlen() reads strings byte per byte.

This patch implements strlen() in assembly based on a read of entire
words, in the same spirit as what some other arches and glibc do.

On a 8xx the time spent in strlen is reduced by 50-60% for long strings.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
Not tested on PPC64.

To avoid trivial conflict, apply on top of serie "[v6 0/2] powerpc/lib: Optimisation of memcmp() and __clear_user() for PPC32".

Changes in v3:
 - Made it common to PPC32 and PPC64

Changes in v2:
 - Moved handling of unaligned strings outside of the main path as it is very unlikely.
 - Removed the verification of the fourth byte in case none of the three first ones are NUL.

 arch/powerpc/include/asm/asm-compat.h |  4 +++
 arch/powerpc/include/asm/string.h     |  1 +
 arch/powerpc/lib/string.S             | 55 +++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h
index 7f2a7702596c..0e99fe7570c0 100644
--- a/arch/powerpc/include/asm/asm-compat.h
+++ b/arch/powerpc/include/asm/asm-compat.h
@@ -20,8 +20,10 @@
 
 /* operations for longs and pointers */
 #define PPC_LL		stringify_in_c(ld)
+#define PPC_LLU		stringify_in_c(ldu)
 #define PPC_STL		stringify_in_c(std)
 #define PPC_STLU	stringify_in_c(stdu)
+#define PPC_ROTLI	stringify_in_c(rotldi)
 #define PPC_LCMPI	stringify_in_c(cmpdi)
 #define PPC_LCMPLI	stringify_in_c(cmpldi)
 #define PPC_LCMP	stringify_in_c(cmpd)
@@ -53,8 +55,10 @@
 
 /* operations for longs and pointers */
 #define PPC_LL		stringify_in_c(lwz)
+#define PPC_LLU		stringify_in_c(lwzu)
 #define PPC_STL		stringify_in_c(stw)
 #define PPC_STLU	stringify_in_c(stwu)
+#define PPC_ROTLI	stringify_in_c(rotlwi)
 #define PPC_LCMPI	stringify_in_c(cmpwi)
 #define PPC_LCMPLI	stringify_in_c(cmplwi)
 #define PPC_LCMP	stringify_in_c(cmpw)
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 9b8cedf618f4..8fdcb532de72 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -13,6 +13,7 @@
 #define __HAVE_ARCH_MEMCHR
 #define __HAVE_ARCH_MEMSET16
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE
+#define __HAVE_ARCH_STRLEN
 
 extern char * strcpy(char *,const char *);
 extern char * strncpy(char *,const char *, __kernel_size_t);
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 4b41970e9ed8..cf8a86c9feb5 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -67,3 +67,58 @@ _GLOBAL(memchr)
 2:	li	r3,0
 	blr
 EXPORT_SYMBOL(memchr)
+
+_GLOBAL(strlen)
+	andi.   r9, r3, (SZL - 1)
+	addi	r10, r3, -SZL
+	bne-	1f
+2:	lis	r6, 0x8080
+	ori	r6, r6, 0x8080		/* r6 = 0x80808080 (himagic) */
+#ifdef CONFIG_PPC64
+	rldimi	r6, r6, 32, 0		/* r6 = 0x8080808080808080 (himagic) */
+#endif
+	PPC_ROTLI  r7, r6, 1 		/* r7 = 0x01010101(01010101) (lomagic)*/
+3:	PPC_LLU	r9, SZL(r10)
+	/* ((x - lomagic) & ~x & himagic) == 0 means no byte in x is NUL */
+	subf	r8, r7, r9
+	andc	r11, r6, r9
+	and.	r8, r8, r11
+	beq+	3b
+#ifdef CONFIG_PPC64
+	rldicl.	r8, r9, 8, 56
+	beq	20f
+	rldicl.	r8, r9, 16, 56
+	beq	21f
+	rldicl.	r8, r9, 24, 56
+	beq	22f
+	rldicl.	r8, r9, 32, 56
+	beq	23f
+	addi	r10, r10, 4
+#endif
+	rlwinm.	r8, r9, 0, 0xff000000
+	beq	20f
+	rlwinm.	r8, r9, 0, 0x00ff0000
+	beq	21f
+	rlwinm.	r8, r9, 0, 0x0000ff00
+	beq	22f
+23:	subf	r3, r3, r10
+	addi	r3, r3, 3
+	blr
+22:	subf	r3, r3, r10
+	addi	r3, r3, 2
+	blr
+21:	subf	r3, r3, r10
+	addi	r3, r3, 1
+	blr
+19:	addi	r10, r10, (SZL - 1)
+20:	subf	r3, r3, r10
+	blr
+
+1:	lbz	r9, SZL(r10)
+	addi	r10, r10, 1
+	cmpwi	cr1, r9, 0
+	andi.	r9, r10, (SZL - 1)
+	beq	cr1, 19b
+	bne	1b
+	b	2b
+EXPORT_SYMBOL(strlen)
-- 
2.13.3

^ permalink raw reply related

* Re: [PATCH v6 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: Segher Boessenkool @ 2018-05-30  8:35 UTC (permalink / raw)
  To: Simon Guo; +Cc: linuxppc-dev, Naveen N.  Rao, Cyril Bur
In-Reply-To: <20180530081402.GB5951@simonLocalRHEL7.x64>

On Wed, May 30, 2018 at 04:14:02PM +0800, Simon Guo wrote:
> Hi Segher,
> On Mon, May 28, 2018 at 06:05:59AM -0500, Segher Boessenkool wrote:
> > On Fri, May 25, 2018 at 12:07:34PM +0800, wei.guo.simon@gmail.com wrote:
> > > +	/* save and restore cr0 */
> > > +	mfocrf  r5,64
> > > +	EXIT_VMX_OPS
> > > +	mtocrf	64,r5
> > > +	b	.LcmpAB_lightweight
> > 
> > That's cr1, not cr0.  You can use mcrf instead, it is cheaper (esp. if
> > you have it in a non-volatile CR field before so you need only one, if any).
> > 
> You are right :) How about using mtcr/mfcr instead, I think they are
> fast as well and more readable.

Those are much worse than m[ft]ocrf.

You probably should just shuffle things around so that EXIT_VMX_OPS
does not clobber the CR field you need to keep.


Segher

^ permalink raw reply

* Re: [PATCH 1/2] error-injection: Simplify arch specific helpers
From: Masami Hiramatsu @ 2018-05-30  8:41 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: Ingo Molnar, Michael Ellerman, Josef Bacik, linuxppc-dev,
	linux-kernel
In-Reply-To: <8f4883f08feaf6e040255015af2da7bbc7741e41.1527596631.git.naveen.n.rao@linux.vnet.ibm.com>

On Tue, 29 May 2018 18:06:02 +0530
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> wrote:

> We already have an arch-independent way to set the instruction pointer
> with instruction_pointer_set(). Using this allows us to get rid of the
> need for override_function_with_return() that each architecture has to
> implement.
> 
> Furthermore, just_return_func() only has to encode arch-specific
> assembly instructions to return from a function. Introduce a macro
> ARCH_FUNC_RET to provide the arch-specific instruction and move over
> just_return_func() to generic code.
> 
> With these changes, architectures that already support kprobes, only
> just need to ensure they provide regs_set_return_value(), GET_IP() (for
> instruction_pointer_set()), and ARCH_FUNC_RET to support error
> injection.

Nice! the code basically good to me. Just one comment, ARCH_FUNC_RET sounds
like a function. Maybe ARCH_RETURN_INSTRUCTION will be better name, isn't it? :)

Thank you,

> 
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> ---
>  arch/x86/include/asm/error-injection.h |  6 +-----
>  arch/x86/lib/Makefile                  |  1 -
>  arch/x86/lib/error-inject.c            | 20 --------------------
>  include/asm-generic/error-injection.h  |  6 ++++++
>  include/linux/error-injection.h        |  1 +
>  kernel/fail_function.c                 |  2 +-
>  kernel/trace/bpf_trace.c               |  2 +-
>  lib/error-inject.c                     |  8 ++++++++
>  8 files changed, 18 insertions(+), 28 deletions(-)
>  delete mode 100644 arch/x86/lib/error-inject.c
> 
> diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h
> index 47b7a1296245..f3f22e237b86 100644
> --- a/arch/x86/include/asm/error-injection.h
> +++ b/arch/x86/include/asm/error-injection.h
> @@ -2,12 +2,8 @@
>  #ifndef _ASM_ERROR_INJECTION_H
>  #define _ASM_ERROR_INJECTION_H
>  
> -#include <linux/compiler.h>
> -#include <linux/linkage.h>
> -#include <asm/ptrace.h>
>  #include <asm-generic/error-injection.h>
>  
> -asmlinkage void just_return_func(void);
> -void override_function_with_return(struct pt_regs *regs);
> +#define ARCH_FUNC_RET	"ret"
>  
>  #endif /* _ASM_ERROR_INJECTION_H */
> diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
> index 25a972c61b0a..f23934bbaf4e 100644
> --- a/arch/x86/lib/Makefile
> +++ b/arch/x86/lib/Makefile
> @@ -26,7 +26,6 @@ lib-y += memcpy_$(BITS).o
>  lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
>  lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
>  lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
> -lib-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
>  lib-$(CONFIG_RETPOLINE) += retpoline.o
>  
>  obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
> diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
> deleted file mode 100644
> index 3cdf06128d13..000000000000
> --- a/arch/x86/lib/error-inject.c
> +++ /dev/null
> @@ -1,20 +0,0 @@
> -// SPDX-License-Identifier: GPL-2.0
> -
> -#include <linux/error-injection.h>
> -#include <linux/kprobes.h>
> -
> -asmlinkage void just_return_func(void);
> -
> -asm(
> -	".type just_return_func, @function\n"
> -	".globl just_return_func\n"
> -	"just_return_func:\n"
> -	"	ret\n"
> -	".size just_return_func, .-just_return_func\n"
> -);
> -
> -void override_function_with_return(struct pt_regs *regs)
> -{
> -	regs->ip = (unsigned long)&just_return_func;
> -}
> -NOKPROBE_SYMBOL(override_function_with_return);
> diff --git a/include/asm-generic/error-injection.h b/include/asm-generic/error-injection.h
> index 296c65442f00..8ac152cc204a 100644
> --- a/include/asm-generic/error-injection.h
> +++ b/include/asm-generic/error-injection.h
> @@ -3,6 +3,9 @@
>  #define _ASM_GENERIC_ERROR_INJECTION_H
>  
>  #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
> +#include <linux/compiler.h>
> +#include <linux/linkage.h>
> +
>  enum {
>  	EI_ETYPE_NONE,		/* Dummy value for undefined case */
>  	EI_ETYPE_NULL,		/* Return NULL if failure */
> @@ -27,6 +30,9 @@ static struct error_injection_entry __used				\
>  		.addr = (unsigned long)fname,				\
>  		.etype = EI_ETYPE_##_etype,				\
>  	};
> +
> +asmlinkage void just_return_func(void);
> +
>  #else
>  #define ALLOW_ERROR_INJECTION(fname, _etype)
>  #endif
> diff --git a/include/linux/error-injection.h b/include/linux/error-injection.h
> index 280c61ecbf20..f4a0b23423d2 100644
> --- a/include/linux/error-injection.h
> +++ b/include/linux/error-injection.h
> @@ -4,6 +4,7 @@
>  
>  #ifdef CONFIG_FUNCTION_ERROR_INJECTION
>  
> +#include <linux/types.h>
>  #include <asm/error-injection.h>
>  
>  extern bool within_error_injection_list(unsigned long addr);
> diff --git a/kernel/fail_function.c b/kernel/fail_function.c
> index 1d5632d8bbcc..0ae2ca4a29e8 100644
> --- a/kernel/fail_function.c
> +++ b/kernel/fail_function.c
> @@ -183,7 +183,7 @@ static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
>  
>  	if (should_fail(&fei_fault_attr, 1)) {
>  		regs_set_return_value(regs, attr->retval);
> -		override_function_with_return(regs);
> +		instruction_pointer_set(regs, (unsigned long)&just_return_func);
>  		/* Kprobe specific fixup */
>  		reset_current_kprobe();
>  		preempt_enable_no_resched();
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 56ba0f2a01db..23f1f4ffda6c 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
>  BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
>  {
>  	regs_set_return_value(regs, rc);
> -	override_function_with_return(regs);
> +	instruction_pointer_set(regs, (unsigned long)&just_return_func);
>  	return 0;
>  }
>  
> diff --git a/lib/error-inject.c b/lib/error-inject.c
> index c0d4600f4896..7fdc92b5babc 100644
> --- a/lib/error-inject.c
> +++ b/lib/error-inject.c
> @@ -20,6 +20,14 @@ struct ei_entry {
>  	void *priv;
>  };
>  
> +asm(
> +	".type just_return_func, @function\n"
> +	".globl just_return_func\n"
> +	"just_return_func:\n"
> +	ARCH_FUNC_RET "\n"
> +	".size just_return_func, .-just_return_func\n"
> +);
> +
>  bool within_error_injection_list(unsigned long addr)
>  {
>  	struct ei_entry *ent;
> -- 
> 2.17.0
> 


-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v5 3/3] powerpc/lib: optimise PPC32 memcmp
From: Christophe LEROY @ 2018-05-30  8:47 UTC (permalink / raw)
  To: Mathieu Malaterre
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Segher Boessenkool, linuxppc-dev, LKML
In-Reply-To: <CA+7wUszdfG+u4YSnxjFaujy3s_Vu92POPamK+=zjYe6Pbu+kxw@mail.gmail.com>



Le 29/05/2018 à 22:03, Mathieu Malaterre a écrit :
> On Mon, May 28, 2018 at 12:49 PM, Christophe Leroy
> <christophe.leroy@c-s.fr> wrote:
>> At the time being, memcmp() compares two chunks of memory
>> byte per byte.
>>
>> This patch optimises the comparison by comparing word by word.
>>
>> A small benchmark performed on an 8xx comparing two chuncks
>> of 512 bytes performed 100000 times gives:
>>
>> Before : 5852274 TB ticks
>> After:   1488638 TB ticks
>>
>> This is almost 4 times faster
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>> ---
>>   arch/powerpc/lib/string_32.S | 37 +++++++++++++++++++++++++++----------
>>   1 file changed, 27 insertions(+), 10 deletions(-)
> 
> Would it possible for you to move the actual code instead to:
> 
> ./arch/powerpc/lib/memcmp_32.S
> 
> This will seat right next to memcmp_64.S implementation.

Good idea, thanks.
Done in v6

Christophe

> 
>> diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
>> index 40a576d56ac7..4fbaa046aa84 100644
>> --- a/arch/powerpc/lib/string_32.S
>> +++ b/arch/powerpc/lib/string_32.S
>> @@ -16,17 +16,34 @@
>>          .text
>>
>>   _GLOBAL(memcmp)
>> -       cmpwi   cr0, r5, 0
>> -       beq-    2f
>> -       mtctr   r5
>> -       addi    r6,r3,-1
>> -       addi    r4,r4,-1
>> -1:     lbzu    r3,1(r6)
>> -       lbzu    r0,1(r4)
>> -       subf.   r3,r0,r3
>> -       bdnzt   2,1b
>> +       srawi.  r7, r5, 2               /* Divide len by 4 */
>> +       mr      r6, r3
>> +       beq-    3f
>> +       mtctr   r7
>> +       li      r7, 0
>> +1:     lwzx    r3, r6, r7
>> +       lwzx    r0, r4, r7
>> +       addi    r7, r7, 4
>> +       cmplw   cr0, r3, r0
>> +       bdnzt   eq, 1b
>> +       bne     5f
>> +3:     andi.   r3, r5, 3
>> +       beqlr
>> +       cmplwi  cr1, r3, 2
>> +       blt-    cr1, 4f
>> +       lhzx    r3, r6, r7
>> +       lhzx    r0, r4, r7
>> +       addi    r7, r7, 2
>> +       subf.   r3, r0, r3
>> +       beqlr   cr1
>> +       bnelr
>> +4:     lbzx    r3, r6, r7
>> +       lbzx    r0, r4, r7
>> +       subf.   r3, r0, r3
>>          blr
>> -2:     li      r3,0
>> +5:     li      r3, 1
>> +       bgtlr
>> +       li      r3, -1
>>          blr
>>   EXPORT_SYMBOL(memcmp)
>>
>> --
>> 2.13.3
>>

^ permalink raw reply

* Re: [PATCH v6 1/4] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()
From: Simon Guo @ 2018-05-30  9:02 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: linuxppc-dev, Naveen N.  Rao, Cyril Bur
In-Reply-To: <20180530082739.GF17342@gate.crashing.org>

On Wed, May 30, 2018 at 03:27:39AM -0500, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, May 30, 2018 at 04:11:50PM +0800, Simon Guo wrote:
> > On Mon, May 28, 2018 at 05:35:12AM -0500, Segher Boessenkool wrote:
> > > On Fri, May 25, 2018 at 12:07:33PM +0800, wei.guo.simon@gmail.com wrote:
> > > If this doesn't use cr0 anymore, you can do  rlwinm r6,r6,0,7  instead of
> > > andi r6,r6,7 .
> > > 
> > CR0 is used at .Lno_short handling.
> 
> Tricky.
> 
> > > > +	subfc.	r5,r6,r5
> > > 
> > > Why subfc?  You don't use the carry.
> > OK. I will use subfc instead.
> 
> I meant subf -- no carry.  If you want CR0 set there is subf. just fine.
> 
> > > > +	bgt	cr0,8f
> > > > +	li	r3,-1
> > > > +8:
> > > > +	blr
> > > 
> > >   blelr
> > >   li r3,-1
> > >   blr
> > Sure. That looks more impact.
> 
> Should have been bgtlr of course -- well check please :-)
Yes :)

Thanks,
- Simon

^ permalink raw reply

* Re: [PATCH v6 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: Simon Guo @ 2018-05-30  9:03 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: linuxppc-dev, Naveen N.  Rao, Cyril Bur
In-Reply-To: <20180530083540.GG17342@gate.crashing.org>

On Wed, May 30, 2018 at 03:35:40AM -0500, Segher Boessenkool wrote:
> On Wed, May 30, 2018 at 04:14:02PM +0800, Simon Guo wrote:
> > Hi Segher,
> > On Mon, May 28, 2018 at 06:05:59AM -0500, Segher Boessenkool wrote:
> > > On Fri, May 25, 2018 at 12:07:34PM +0800, wei.guo.simon@gmail.com wrote:
> > > > +	/* save and restore cr0 */
> > > > +	mfocrf  r5,64
> > > > +	EXIT_VMX_OPS
> > > > +	mtocrf	64,r5
> > > > +	b	.LcmpAB_lightweight
> > > 
> > > That's cr1, not cr0.  You can use mcrf instead, it is cheaper (esp. if
> > > you have it in a non-volatile CR field before so you need only one, if any).
> > > 
> > You are right :) How about using mtcr/mfcr instead, I think they are
> > fast as well and more readable.
> 
> Those are much worse than m[ft]ocrf.
> 
> You probably should just shuffle things around so that EXIT_VMX_OPS
> does not clobber the CR field you need to keep.
Let me use mcrf then :)

Thanks,
- Simon

^ permalink raw reply

* Re: [PATCH v4 04/29] KVM: PPC: Book3S PR: Move kvmppc_save_tm/kvmppc_restore_tm to separate file
From: Simon Guo @ 2018-05-30  9:04 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <20180529234027.GA28144@fergus.ozlabs.ibm.com>

On Wed, May 30, 2018 at 09:40:27AM +1000, Paul Mackerras wrote:
> On Wed, May 23, 2018 at 03:01:47PM +0800, wei.guo.simon@gmail.com wrote:
> > From: Simon Guo <wei.guo.simon@gmail.com>
> > 
> > It is a simple patch just for moving kvmppc_save_tm/kvmppc_restore_tm()
> > functionalities to tm.S. There is no logic change. The reconstruct of
> > those APIs will be done in later patches to improve readability.
> > 
> > It is for preparation of reusing those APIs on both HV/PR PPC KVM.
> > 
> > Some slight change during move the functions includes:
> > - surrounds some HV KVM specific code with CONFIG_KVM_BOOK3S_HV_POSSIBLE
> > for compilation.
> > - use _GLOBAL() to define kvmppc_save_tm/kvmppc_restore_tm()
> > 
> > Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
> > ---
> >  arch/powerpc/kvm/Makefile               |   3 +
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 322 ----------------------------
> >  arch/powerpc/kvm/tm.S                   | 363 ++++++++++++++++++++++++++++++++
> >  3 files changed, 366 insertions(+), 322 deletions(-)
> >  create mode 100644 arch/powerpc/kvm/tm.S
> > 
> > diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> > index 4b19da8..f872c04 100644
> > --- a/arch/powerpc/kvm/Makefile
> > +++ b/arch/powerpc/kvm/Makefile
> > @@ -63,6 +63,9 @@ kvm-pr-y := \
> >  	book3s_64_mmu.o \
> >  	book3s_32_mmu.o
> >  
> > +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> > +	tm.o
> > +
> >  ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> >  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> >  	book3s_rmhandlers.o
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index 5e6e493..4db2b10 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -39,8 +39,6 @@ BEGIN_FTR_SECTION;				\
> >  	extsw	reg, reg;			\
> >  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> >  
> > -#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
> > -
> >  /* Values in HSTATE_NAPPING(r13) */
> >  #define NAPPING_CEDE	1
> >  #define NAPPING_NOVCPU	2
> > @@ -3119,326 +3117,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
> >  	mr	r4,r31
> >  	blr
> >  
> > -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> > -/*
> > - * Save transactional state and TM-related registers.
> > - * Called with r9 pointing to the vcpu struct.
> > - * This can modify all checkpointed registers, but
> > - * restores r1, r2 and r9 (vcpu pointer) before exit.
> > - */
> > -kvmppc_save_tm:
> > -	mflr	r0
> > -	std	r0, PPC_LR_STKOFF(r1)
> > -	stdu	r1, -PPC_MIN_STKFRM(r1)
> > -
> > -	/* Turn on TM. */
> > -	mfmsr	r8
> > -	li	r0, 1
> > -	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
> > -	mtmsrd	r8
> > -
> > -	ld	r5, VCPU_MSR(r9)
> > -	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
> > -	beq	1f	/* TM not active in guest. */
> > -
> > -	std	r1, HSTATE_HOST_R1(r13)
> > -	li	r3, TM_CAUSE_KVM_RESCHED
> > -
> > -BEGIN_FTR_SECTION
> > -	lbz	r0, HSTATE_FAKE_SUSPEND(r13) /* Were we fake suspended? */
> > -	cmpwi	r0, 0
> > -	beq	3f
> > -	rldicl. r8, r8, 64 - MSR_TS_S_LG, 62 /* Did we actually hrfid? */
> > -	beq	4f
> > -BEGIN_FTR_SECTION_NESTED(96)
> > -	bl	pnv_power9_force_smt4_catch
> > -END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
> > -	nop
> > -	b	6f
> > -3:
> > -	/* Emulation of the treclaim instruction needs TEXASR before treclaim */
> > -	mfspr	r6, SPRN_TEXASR
> > -	std	r6, VCPU_ORIG_TEXASR(r9)
> > -6:
> > -END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
> 
> It worries me that we now have this TM hypervisor assist stuff in a
> place where it could be active with PR KVM.  I think it would be
> better to factor out the TM assist code into a separate function which
> then calls kvmppc_save_tm if it needs to do an actual treclaim.  I'll
> look at doing that.
Paul,
Thanks for the info. Please let me know if anything I can help.

BR,
- Simon

^ permalink raw reply

* [PATCH] powerpc/powermac: fix spelling mistake "Usupported" -> "Unsupported"
From: Colin King @ 2018-05-30  9:15 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linuxppc-dev
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Trivial fix to spelling mistake in bootx_printf message text.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 arch/powerpc/platforms/powermac/bootx_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powermac/bootx_init.c b/arch/powerpc/platforms/powermac/bootx_init.c
index ba0964c17620..3b3b0b9b3577 100644
--- a/arch/powerpc/platforms/powermac/bootx_init.c
+++ b/arch/powerpc/platforms/powermac/bootx_init.c
@@ -519,7 +519,7 @@ void __init bootx_init(unsigned long r3, unsigned long r4)
 			;
 	}
 	if (bi->architecture != BOOT_ARCH_PCI) {
-		bootx_printf(" !!! WARNING - Usupported machine"
+		bootx_printf(" !!! WARNING - Unsupported machine"
 			     " architecture !\n");
 		for (;;)
 			;
-- 
2.17.0

^ permalink raw reply related

* [PATCH v7 0/5] powerpc/64: memcmp() optimization
From: wei.guo.simon @ 2018-05-30  9:20 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo

From: Simon Guo <wei.guo.simon@gmail.com>

There is some room to optimize memcmp() in powerpc 64 bits version for
following 2 cases:
(1) Even src/dst addresses are not aligned with 8 bytes at the beginning,
memcmp() can align them and go with .Llong comparision mode without
fallback to .Lshort comparision mode do compare buffer byte by byte.
(2) VMX instructions can be used to speed up for large size comparision,
currently the threshold is set for 4K bytes. Notes the VMX instructions
will lead to VMX regs save/load penalty. This patch set includes a
patch to add a 32 bytes pre-checking to minimize the penalty.

It did the similar with glibc commit dec4a7105e (powerpc: Improve memcmp 
performance for POWER8). Thanks Cyril Bur's information.
This patch set also updates memcmp selftest case to make it compiled and
incorporate large size comparison case.

v6 -> v7:
- add vcmpequd/vcmpequdb .long macro
- add CPU_FTR pair so that Power7 won't invoke Altivec instrs.
- rework some instructions for higher performance or more readable.

v5 -> v6:
- correct some comments/commit messsage.
- rename VMX_OPS_THRES to VMX_THRESH

v4 -> v5:
- Expand 32 bytes prechk to src/dst different offset case, and remove
KSM specific label/comment.

v3 -> v4:
- Add 32 bytes pre-checking before using VMX instructions.

v2 -> v3:
- add optimization for src/dst with different offset against 8 bytes
boundary.
- renamed some label names.
- reworked some comments from Cyril Bur, such as fill the pipeline, 
and use VMX when size == 4K.
- fix a bug of enter/exit_vmx_ops pairness issue. And revised test 
case to test whether enter/exit_vmx_ops are paired.

v1 -> v2:
- update 8bytes unaligned bytes comparison method.
- fix a VMX comparision bug.
- enhanced the original memcmp() selftest.
- add powerpc/64 to subject/commit message.


Simon Guo (5):
  powerpc/64: Align bytes before fall back to .Lshort in powerpc64
    memcmp()
  powerpc: add vcmpequd/vcmpequb ppc instruction macro
  powerpc/64: enhance memcmp() with VMX instruction for long bytes
    comparision
  powerpc/64: add 32 bytes prechecking before using VMX optimization on
    memcmp()
  powerpc:selftest update memcmp_64 selftest for VMX implementation

 arch/powerpc/include/asm/asm-prototypes.h          |   4 +-
 arch/powerpc/include/asm/ppc-opcode.h              |  11 +
 arch/powerpc/lib/copypage_power7.S                 |   4 +-
 arch/powerpc/lib/memcmp_64.S                       | 412 ++++++++++++++++++++-
 arch/powerpc/lib/memcpy_power7.S                   |   6 +-
 arch/powerpc/lib/vmx-helper.c                      |   4 +-
 .../selftests/powerpc/copyloops/asm/ppc_asm.h      |   4 +-
 .../selftests/powerpc/stringloops/asm/ppc-opcode.h |  39 ++
 .../selftests/powerpc/stringloops/asm/ppc_asm.h    |  24 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c |  98 +++--
 10 files changed, 566 insertions(+), 40 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v7 1/5] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()
From: wei.guo.simon @ 2018-05-30  9:20 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527672063-6953-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

Currently memcmp() 64bytes version in powerpc will fall back to .Lshort
(compare per byte mode) if either src or dst address is not 8 bytes aligned.
It can be opmitized in 2 situations:

1) if both addresses are with the same offset with 8 bytes boundary:
memcmp() can compare the unaligned bytes within 8 bytes boundary firstly
and then compare the rest 8-bytes-aligned content with .Llong mode.

2)  If src/dst addrs are not with the same offset of 8 bytes boundary:
memcmp() can align src addr with 8 bytes, increment dst addr accordingly,
 then load src with aligned mode and load dst with unaligned mode.

This patch optmizes memcmp() behavior in the above 2 situations.

Tested with both little/big endian. Performance result below is based on
little endian.

Following is the test result with src/dst having the same offset case:
(a similar result was observed when src/dst having different offset):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
	29.773018302 seconds time elapsed                                          ( +- 0.09% )
- with patch
	16.485568173 seconds time elapsed                                          ( +-  0.02% )
		-> There is ~+80% percent improvement

(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 10000

 int test_memcmp(const void *s1, const void *s2, size_t n);
--------

- Without patch
	0.244746482 seconds time elapsed                                          ( +-  0.36%)
- with patch
	0.215069477 seconds time elapsed                                          ( +-  0.51%)
		-> There is ~+13% improvement

(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000

 int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
       1.845642503 seconds time elapsed                                          ( +- 0.12% )
- With patch
       1.849767135 seconds time elapsed                                          ( +- 0.26% )
		-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/lib/memcmp_64.S | 140 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 133 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..5776f91 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_xxxx
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_xxxx
+ */
 _GLOBAL(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not
+	 * with the same offset of 8 bytes align boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to short loop if compare at aligned addrs
+	 * with less than 8 bytes.
+	 */
+	cmpdi   cr6,r5,7
 
 	beq	cr1,.Lzero
-	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.Lno_short
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,11 +91,89 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.Lno_short:
+	dcbt	0,r3
+	dcbt	0,r4
+	bne	.Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+	/* attempt to compare bytes not aligned with 8 bytes so that
+	 * rest comparison can run based on 8 bytes alignment.
+	 */
+	andi.   r6,r3,7
+
+	/* Try to compare the first double word which is not 8 bytes aligned:
+	 * load the first double word at (src & ~7UL) and shift left appropriate
+	 * bits before comparision.
+	 */
+	rlwinm  r6,r3,3,26,28
+	beq     .Lsameoffset_8bytes_aligned
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+	subfic  r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	addi	r4,r4,8
+	beq	.Lzero
+
+.Lsameoffset_8bytes_aligned:
+	/* now we are aligned with 8 bytes.
+	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+	 */
+	cmpdi   cr6,r5,31
+	bgt	cr6,.Llong
+
+.Lcmp_lt32bytes:
+	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	cmpdi   cr5,r5,7
+	srdi    r0,r5,3
+	ble	cr5,.Lcmp_rest_lt8bytes
+
+	/* handle 8 ~ 31 bytes */
+	clrldi  r5,r5,61
+	mtctr   r0
+2:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	bdnz	2b
+
+	cmpwi   r5,0
+	beq	.Lzero
+
+.Lcmp_rest_lt8bytes:
+	/* Here we have only less than 8 bytes to compare with. at least s1
+	 * Address is aligned with 8 bytes.
+	 * The next double words are load and shift right with appropriate
+	 * bits.
+	 */
+	subfic  r6,r5,8
+	slwi	r6,r6,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
 
 .Llong:
+	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -232,4 +323,39 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:   /* skip NV GPRS restore */
+	li	r3,1
+	bgtlr
+	li	r3,-1
+	blr
+
+.Ldiffoffset_8bytes_make_align_start:
+	/* now try to align s1 with 8 bytes */
+	rlwinm  r6,r3,3,26,28
+	beq     .Ldiffoffset_align_s1_8bytes
+
+	clrrdi	r3,r3,3
+	LD	rA,0,r3
+	LD	rB,0,r4  /* unaligned load */
+	sld	rA,rA,r6
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+
+	subfic  r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	add	r4,r4,r6
+
+	beq	.Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+	/* now s1 is aligned with 8 bytes. */
+	cmpdi   cr5,r5,31
+	ble	cr5,.Lcmp_lt32bytes
+	b	.Llong
+
 EXPORT_SYMBOL(memcmp)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v7 2/5] powerpc: add vcmpequd/vcmpequb ppc instruction macro
From: wei.guo.simon @ 2018-05-30  9:21 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527672063-6953-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

Some old tool chains don't know about instructions like vcmpequd.

This patch adds .long macro for vcmpequd and vcmpequb, which is
a preparation to optimize ppc64 memcmp with VMX instructions.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/ppc-opcode.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 18883b8..1866a97 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -366,6 +366,8 @@
 #define PPC_INST_STFDX			0x7c0005ae
 #define PPC_INST_LVX			0x7c0000ce
 #define PPC_INST_STVX			0x7c0001ce
+#define PPC_INST_VCMPEQUD		0x100000c7
+#define PPC_INST_VCMPEQUB		0x10000006
 
 /* macros to insert fields into opcodes */
 #define ___PPC_RA(a)	(((a) & 0x1f) << 16)
@@ -396,6 +398,7 @@
 #define __PPC_BI(s)	(((s) & 0x1f) << 16)
 #define __PPC_CT(t)	(((t) & 0x0f) << 21)
 #define __PPC_SPR(r)	((((r) & 0x1f) << 16) | ((((r) >> 5) & 0x1f) << 11))
+#define __PPC_RC21	(0x1 << 10)
 
 /*
  * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a
@@ -567,4 +570,12 @@
 				       ((IH & 0x7) << 21))
 #define PPC_INVALIDATE_ERAT	PPC_SLBIA(7)
 
+#define VCMPEQUD_RC(vrt, vra, vrb)	stringify_in_c(.long PPC_INST_VCMPEQUD | \
+			      ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+			      ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb)	stringify_in_c(.long PPC_INST_VCMPEQUB | \
+			      ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+			      ___PPC_RB(vrb) | __PPC_RC21)
+
 #endif /* _ASM_POWERPC_PPC_OPCODE_H */
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v7 3/5] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: wei.guo.simon @ 2018-05-30  9:21 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527672063-6953-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch add VMX primitives to do memcmp() in case the compare size
is equal or greater than 4K bytes. KSM feature can benefit from this.

Test result with following test program(replace the "^>" with ""):
------
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include <malloc.h>
>#include <stdlib.h>
>#include <string.h>
>#include <time.h>
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
        char *s1;
        char *s2;
        unsigned long i;

        s1 = memalign(128, SIZE);
        if (!s1) {
                perror("memalign");
                exit(1);
        }

        s2 = memalign(128, SIZE);
        if (!s2) {
                perror("memalign");
                exit(1);
        }

        for (i = 0; i < SIZE; i++)  {
                s1[i] = i & 0xff;
                s2[i] = i & 0xff;
        }
        for (i = 0; i < ITERATIONS; i++) {
		int ret = test_memcmp(s1, s2, SIZE);

		if (ret) {
			printf("return %d at[%ld]! should have returned zero\n", ret, i);
			abort();
		}
	}

        return 0;
}

int main(void)
{
        return test_harness(testcase, "memcmp");
}
------
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
	4.726728762 seconds time elapsed                                          ( +-  3.54%)
With VMX patch:
	4.234335473 seconds time elapsed                                          ( +-  2.63%)
		There is ~+10% improvement.

Testing with unaligned and different offset version (make s1 and s2 shift
random offset within 16 bytes) can archieve higher improvement than 10%..

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S        |   4 +-
 arch/powerpc/lib/memcmp_64.S              | 239 +++++++++++++++++++++++++++++-
 arch/powerpc/lib/memcpy_power7.S          |   6 +-
 arch/powerpc/lib/vmx-helper.c             |   4 +-
 5 files changed, 247 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 5776f91..aef0e41 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
  */
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/ppc-opcode.h>
 
 #define off8	r6
 #define off16	r7
@@ -27,12 +28,73 @@
 #define LH	lhbrx
 #define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH	lhzx
 #define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS	\
+	mflr    r0;	\
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      enter_vmx_ops; \
+	cmpwi   cr1,r3,0; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+#define EXIT_VMX_OPS \
+	mflr    r0; \
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      exit_vmx_ops; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbb30
+ *                                 ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+        lvx     _v2nd_qw,_vaddr,off16; \
+        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
 /*
  * There are 2 categories for memcmp:
  * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -132,7 +194,7 @@ _GLOBAL(memcmp)
 	bgt	cr6,.Llong
 
 .Lcmp_lt32bytes:
-	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
 	cmpdi   cr5,r5,7
 	srdi    r0,r5,3
 	ble	cr5,.Lcmp_rest_lt8bytes
@@ -173,6 +235,15 @@ _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* Try to use vmx loop if length is equal or greater than 4K */
+	cmpldi  cr6,r5,VMX_THRESH
+	bge	cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Llong_novmx_cmp:
+#endif
 	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
@@ -330,7 +401,97 @@ _GLOBAL(memcmp)
 	li	r3,-1
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Enter with src/dst addrs has the same offset with 8 bytes
+	 * align boundary
+	 */
+	ENTER_VMX_OPS
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset with r3
+	 * for 16 bytes boundary.
+	 */
+	xor	r0,r3,r4
+	andi.	r0,r0,0xf
+	bne	.Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	LD	rA,0,r3
+	beq	4f
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+	beq	cr0,4f
+	/* save and restore cr0 */
+	mcrf	7,0
+	EXIT_VMX_OPS
+	mcrf	0,7
+	b	.LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	clrldi  r5,r5,59
+	li	off16,16
+
+.balign 16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl	cr6,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl	cr6,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	EXIT_VMX_OPS
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	li	off8,8
+	bne	cr0,.LcmpAB_lightweight
+
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
+
 .Ldiffoffset_8bytes_make_align_start:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* only do vmx ops when the size equal or greater than 4K bytes */
+	cmpdi	cr5,r5,VMX_THRESH
+	bge	cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
 	/* now try to align s1 with 8 bytes */
 	rlwinm  r6,r3,3,26,28
 	beq     .Ldiffoffset_align_s1_8bytes
@@ -356,6 +517,82 @@ _GLOBAL(memcmp)
 	/* now s1 is aligned with 8 bytes. */
 	cmpdi   cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b	.Llong_novmx_cmp
+#else
 	b	.Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	ENTER_VMX_OPS
+	beq     cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi.   r6,r3,0xf
+	li	off16,16
+	beq     .Ldiffoffset_vmx_s1_16bytes_align
 
+	LVS	v3,0,r3
+	LVS	v4,0,r4
+
+	lvx     v5,0,r3
+	lvx     v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	VCMPEQUB_RC(v7,v9,v10)
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic  r6,r6,16
+	subf    r5,r6,r5
+	add     r3,r3,r6
+	add     r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx     v6,0,r4
+	LVS	v4,0,r4
+	srdi	r6,r5,5  /* loop for 32 bytes each */
+	clrldi  r5,r5,59
+	mtctr	r6
+
+.balign	16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	bdnz	.Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* anyway, the diff will appear in next 16 bytes */
+	li	r5,16
+	b	.Lcmp_lt32bytes
+
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d..070cdf6 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..9f34049 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v7 4/5] powerpc/64: add 32 bytes prechecking before using VMX optimization on memcmp()
From: wei.guo.simon @ 2018-05-30  9:21 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527672063-6953-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch is based on the previous VMX patch on memcmp().

To optimize ppc64 memcmp() with VMX instruction, we need to think about
the VMX penalty brought with: If kernel uses VMX instruction, it needs
to save/restore current thread's VMX registers. There are 32 x 128 bits
VMX registers in PPC, which means 32 x 16 = 512 bytes for load and store.

The major concern regarding the memcmp() performance in kernel is KSM,
who will use memcmp() frequently to merge identical pages. So it will
make sense to take some measures/enhancement on KSM to see whether any
improvement can be done here.  Cyril Bur indicates that the memcmp() for
KSM has a higher possibility to fail (unmatch) early in previous bytes
in following mail.
	https://patchwork.ozlabs.org/patch/817322/#1773629
And I am taking a follow-up on this with this patch.

Per some testing, it shows KSM memcmp() will fail early at previous 32
bytes.  More specifically:
    - 76% cases will fail/unmatch before 16 bytes;
    - 83% cases will fail/unmatch before 32 bytes;
    - 84% cases will fail/unmatch before 64 bytes;
So 32 bytes looks a better choice than other bytes for pre-checking.

The early failure is also true for memcmp() for non-KSM case. With a
non-typical call load, it shows ~73% cases fail before first 32 bytes.

This patch adds a 32 bytes pre-checking firstly before jumping into VMX
operations, to avoid the unnecessary VMX penalty. It is not limited to
KSM case. And the testing shows ~20% improvement on memcmp() average
execution time with this patch.

And note the 32B pre-checking is only performed when the compare size
is long enough (>=4K currently) to allow VMX operation.

The detail data and analysis is at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/lib/memcmp_64.S | 57 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index aef0e41..5eba497 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -404,8 +404,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #ifdef CONFIG_ALTIVEC
 .Lsameoffset_vmx_cmp:
 	/* Enter with src/dst addrs has the same offset with 8 bytes
-	 * align boundary
+	 * align boundary.
+	 *
+	 * There is an optimization based on following fact: memcmp()
+	 * prones to fail early at the first 32 bytes.
+	 * Before applying VMX instructions which will lead to 32x128bits
+	 * VMX regs load/restore penalty, we compare the first 32 bytes
+	 * so that we can catch the ~80% fail cases.
 	 */
+
+	li	r0,4
+	mtctr	r0
+.Lsameoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Lsameoffset_prechk_32B_loop
+
 	ENTER_VMX_OPS
 	beq     cr1,.Llong_novmx_cmp
 
@@ -482,16 +501,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #endif
 
 .Ldiffoffset_8bytes_make_align_start:
-#ifdef CONFIG_ALTIVEC
-BEGIN_FTR_SECTION
-	/* only do vmx ops when the size equal or greater than 4K bytes */
-	cmpdi	cr5,r5,VMX_THRESH
-	bge	cr5,.Ldiffoffset_vmx_cmp
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-
-.Ldiffoffset_novmx_cmp:
-#endif
-
 	/* now try to align s1 with 8 bytes */
 	rlwinm  r6,r3,3,26,28
 	beq     .Ldiffoffset_align_s1_8bytes
@@ -515,6 +524,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 .Ldiffoffset_align_s1_8bytes:
 	/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* only do vmx ops when the size equal or greater than 4K bytes */
+	cmpdi	cr5,r5,VMX_THRESH
+	bge	cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
 	cmpdi   cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
 
@@ -526,6 +546,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_ALTIVEC
 .Ldiffoffset_vmx_cmp:
+	/* perform a 32 bytes pre-checking before
+	 * enable VMX operations.
+	 */
+	li	r0,4
+	mtctr	r0
+.Ldiffoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Ldiffoffset_prechk_32B_loop
+
 	ENTER_VMX_OPS
 	beq     cr1,.Ldiffoffset_novmx_cmp
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v7 5/5] powerpc:selftest update memcmp_64 selftest for VMX implementation
From: wei.guo.simon @ 2018-05-30  9:21 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1527672063-6953-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch reworked selftest memcmp_64 so that memcmp selftest can
cover more test cases.

It adds testcases for:
- memcmp over 4K bytes size.
- s1/s2 with different/random offset on 16 bytes boundary.
- enter/exit_vmx_ops pairness.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 .../selftests/powerpc/copyloops/asm/ppc_asm.h      |  4 +-
 .../selftests/powerpc/stringloops/asm/ppc-opcode.h | 39 +++++++++
 .../selftests/powerpc/stringloops/asm/ppc_asm.h    | 24 ++++++
 .../testing/selftests/powerpc/stringloops/memcmp.c | 98 +++++++++++++++++-----
 4 files changed, 141 insertions(+), 24 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 5ffe04d..dfce161 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -36,11 +36,11 @@
 	li	r3,0
 	blr
 
-FUNC_START(enter_vmx_copy)
+FUNC_START(enter_vmx_ops)
 	li	r3,1
 	blr
 
-FUNC_START(exit_vmx_copy)
+FUNC_START(exit_vmx_ops)
 	blr
 
 FUNC_START(memcpy_power7)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
new file mode 100644
index 0000000..9de413c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2009 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * provides masks and opcode images for use by code generation, emulation
+ * and for instructions that older assemblers might not know about
+ */
+#ifndef _ASM_POWERPC_PPC_OPCODE_H
+#define _ASM_POWERPC_PPC_OPCODE_H
+
+
+#  define stringify_in_c(...)	__VA_ARGS__
+#  define ASM_CONST(x)		x
+
+
+#define PPC_INST_VCMPEQUD_RC		0x100000c7
+#define PPC_INST_VCMPEQUB_RC		0x10000006
+
+#define __PPC_RC21     (0x1 << 10)
+
+/* macros to insert fields into opcodes */
+#define ___PPC_RA(a)	(((a) & 0x1f) << 16)
+#define ___PPC_RB(b)	(((b) & 0x1f) << 11)
+#define ___PPC_RS(s)	(((s) & 0x1f) << 21)
+#define ___PPC_RT(t)	___PPC_RS(t)
+
+#define VCMPEQUD_RC(vrt, vra, vrb)	stringify_in_c(.long PPC_INST_VCMPEQUD_RC | \
+			      ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+			      ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb)	stringify_in_c(.long PPC_INST_VCMPEQUB_RC | \
+			      ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+			      ___PPC_RB(vrb) | __PPC_RC21)
+
+#endif /* _ASM_POWERPC_PPC_OPCODE_H */
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
index 136242e..33912bb 100644
--- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_ASM_H
+#define __PPC_ASM_H
 #include <ppc-asm.h>
 
 #ifndef r1
@@ -6,3 +8,25 @@
 #endif
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE	256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+#define BEGIN_FTR_SECTION
+#define END_FTR_SECTION_IFSET(val)
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 8250db2..b5cf717 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,20 +2,40 @@
 #include <malloc.h>
 #include <stdlib.h>
 #include <string.h>
+#include <time.h>
 #include "utils.h"
 
 #define SIZE 256
 #define ITERATIONS 10000
 
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+int vmx_count;
+int enter_vmx_ops(void)
+{
+	vmx_count++;
+	return 1;
+}
+
+void exit_vmx_ops(void)
+{
+	vmx_count--;
+}
 int test_memcmp(const void *s1, const void *s2, size_t n);
 
 /* test all offsets and lengths */
-static void test_one(char *s1, char *s2)
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+		unsigned long size_start, unsigned long max_size)
 {
 	unsigned long offset, size;
 
-	for (offset = 0; offset < SIZE; offset++) {
-		for (size = 0; size < (SIZE-offset); size++) {
+	for (offset = 0; offset < max_offset; offset++) {
+		for (size = size_start; size < (max_size - offset); size++) {
 			int x, y;
 			unsigned long i;
 
@@ -35,70 +55,104 @@ static void test_one(char *s1, char *s2)
 				printf("\n");
 				abort();
 			}
+
+			if (vmx_count != 0) {
+				printf("vmx enter/exit not paired.(offset:%ld size:%ld s1:%p s2:%p vc:%d\n",
+					offset, size, s1, s2, vmx_count);
+				printf("\n");
+				abort();
+			}
 		}
 	}
 }
 
-static int testcase(void)
+static int testcase(bool islarge)
 {
 	char *s1;
 	char *s2;
 	unsigned long i;
 
-	s1 = memalign(128, SIZE);
+	unsigned long comp_size = (islarge ? LARGE_SIZE : SIZE);
+	unsigned long alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
+	int iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
+
+	s1 = memalign(128, alloc_size);
 	if (!s1) {
 		perror("memalign");
 		exit(1);
 	}
 
-	s2 = memalign(128, SIZE);
+	s2 = memalign(128, alloc_size);
 	if (!s2) {
 		perror("memalign");
 		exit(1);
 	}
 
-	srandom(1);
+	srandom(time(0));
 
-	for (i = 0; i < ITERATIONS; i++) {
+	for (i = 0; i < iterations; i++) {
 		unsigned long j;
 		unsigned long change;
+		char *rand_s1 = s1;
+		char *rand_s2 = s2;
 
-		for (j = 0; j < SIZE; j++)
+		for (j = 0; j < alloc_size; j++)
 			s1[j] = random();
 
-		memcpy(s2, s1, SIZE);
+		rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+		rand_s2 += random() % MAX_OFFSET_DIFF_S1_S2;
+		memcpy(rand_s2, rand_s1, comp_size);
 
 		/* change one byte */
-		change = random() % SIZE;
-		s2[change] = random() & 0xff;
-
-		test_one(s1, s2);
+		change = random() % comp_size;
+		rand_s2[change] = random() & 0xff;
+
+		if (islarge)
+			test_one(rand_s1, rand_s2, LARGE_MAX_OFFSET,
+					LARGE_SIZE_START, comp_size);
+		else
+			test_one(rand_s1, rand_s2, SIZE, 0, comp_size);
 	}
 
-	srandom(1);
+	srandom(time(0));
 
-	for (i = 0; i < ITERATIONS; i++) {
+	for (i = 0; i < iterations; i++) {
 		unsigned long j;
 		unsigned long change;
+		char *rand_s1 = s1;
+		char *rand_s2 = s2;
 
-		for (j = 0; j < SIZE; j++)
+		for (j = 0; j < alloc_size; j++)
 			s1[j] = random();
 
-		memcpy(s2, s1, SIZE);
+		rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+		rand_s2 += random() % MAX_OFFSET_DIFF_S1_S2;
+		memcpy(rand_s2, rand_s1, comp_size);
 
 		/* change multiple bytes, 1/8 of total */
-		for (j = 0; j < SIZE / 8; j++) {
-			change = random() % SIZE;
+		for (j = 0; j < comp_size / 8; j++) {
+			change = random() % comp_size;
 			s2[change] = random() & 0xff;
 		}
 
-		test_one(s1, s2);
+		if (islarge)
+			test_one(rand_s1, rand_s2, LARGE_MAX_OFFSET,
+					LARGE_SIZE_START, comp_size);
+		else
+			test_one(rand_s1, rand_s2, SIZE, 0, comp_size);
 	}
 
 	return 0;
 }
 
+static int testcases(void)
+{
+	testcase(0);
+	testcase(1);
+	return 0;
+}
+
 int main(void)
 {
-	return test_harness(testcase, "memcmp");
+	return test_harness(testcases, "memcmp");
 }
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH kernel] powerpc/powernv/ioda2: Remove redundand free of TCE pages
From: Alexey Kardashevskiy @ 2018-05-30  9:22 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Alexey Kardashevskiy, Andrew Donnellan, Michael Ellerman,
	Russell Currey, Oliver O'Halloran

When IODA2 creates a PE, it creates an IOMMU table with it_ops::free
set to pnv_ioda2_table_free() which calls pnv_pci_ioda2_table_free_pages().

Since iommu_tce_table_put() calls it_ops::free when the last reference
to the table is released, explicit call to pnv_pci_ioda2_table_free_pages()
is not needed so let's remove it.

This should fix double free in the case of PCI hotuplug as
pnv_pci_ioda2_table_free_pages() does not reset neither
iommu_table::it_base nor ::it_size.

This was not exposed by SRIOV as it uses different code path via
pnv_pcibios_sriov_disable().

IODA1 does not inialize it_ops::free so it does not have this issue.

Fixes: c5f7700bb "powerpc/powernv: Dynamically release PE"
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

We have not seen any bug reports because we have not tested the actual
PCI hotunplug in hardware very well.


---
 arch/powerpc/platforms/powernv/pci-ioda.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9731098..92ca662 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3670,7 +3670,6 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
 		WARN_ON(pe->table_group.group);
 	}
 
-	pnv_pci_ioda2_table_free_pages(tbl);
 	iommu_tce_table_put(tbl);
 }
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH] powerpc/64s: Fix compiler store ordering to SLB shadow area
From: Nicholas Piggin @ 2018-05-30 10:31 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Aneesh Kumar K . V

The stores to update the SLB shadow area must be made as they appear
in the C code, so that the hypervisor does not see an entry with
mismatched vsid and esid. Use WRITE_ONCE for this.

GCC has been observed to elide the first store to esid in the update,
which means that if the hypervisor interrupts the guest after storing
to vsid, it could see an entry with old esid and new vsid, which may
possibly result in memory corruption.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/slb.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 66577cc66dc9..2f4b33b24b3b 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -63,14 +63,14 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
 	 * updating it.  No write barriers are needed here, provided
 	 * we only update the current CPU's SLB shadow buffer.
 	 */
-	p->save_area[index].esid = 0;
-	p->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags));
-	p->save_area[index].esid = cpu_to_be64(mk_esid_data(ea, ssize, index));
+	WRITE_ONCE(p->save_area[index].esid, 0);
+	WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
+	WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
 }
 
 static inline void slb_shadow_clear(enum slb_index index)
 {
-	get_slb_shadow()->save_area[index].esid = 0;
+	WRITE_ONCE(get_slb_shadow()->save_area[index].esid, 0);
 }
 
 static inline void create_shadowed_slbe(unsigned long ea, int ssize,
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH v4 1/4] powerpc/kbuild: set default generic machine type for 32-bit compile
From: Nicholas Piggin @ 2018-05-30 10:43 UTC (permalink / raw)
  To: Masahiro Yamada
  Cc: Linux Kbuild mailing list, linuxppc-dev, Segher Boessenkool
In-Reply-To: <CAK7LNASsx_gDQePtNRdiY8v4RbysQqoKKfr1LW6dsz=7fq-oEQ@mail.gmail.com>

On Tue, 29 May 2018 22:39:48 +0900
Masahiro Yamada <yamada.masahiro@socionext.com> wrote:

> 2018-05-16 23:14 GMT+09:00 Nicholas Piggin <npiggin@gmail.com>:
> > Some 64-bit toolchains uses the wrong ISA variant for compiling 32-bit
> > kernels, even with -m32. Debian's powerpc64le is one such case, and
> > that is because it is built with --with-cpu=power8.
> >
> > So when cross compiling a 32-bit kernel with a 64-bit toolchain, set
> > -mcpu=powerpc initially, which is the generic 32-bit powerpc machine
> > type and scheduling model. CPU and platform code can override this
> > with subsequent -mcpu flags if necessary.
> >
> > This is not done for 32-bit toolchains otherwise it would override
> > their defaults, which are presumably set appropriately for the
> > environment (moreso than a 64-bit cross compiler).
> >
> > This fixes a lot of build failures due to incompatible assembly when
> > compiling 32-bit kernel with th Debian powerpc64le 64-bit toolchain.
> >
> > Cc: Segher Boessenkool <segher@kernel.crashing.org>
> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> > ---  
> 
> 
> Can you please remove the noise changes?
> 
> 1/4 adds some blank lines, then 2/4 removes them.

Okay sure, I will change that.

Thanks,
Nick

^ permalink raw reply

* [PATCH v5 0/4] powerpc patches for new Kconfig language
From: Nicholas Piggin @ 2018-05-30 12:19 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Nicholas Piggin, linuxppc-dev, Masahiro Yamada,
	Segher Boessenkool

This series of patches improves th powerpc kbuild system. The
motivation was to to be compatible with the new Kconfig scripting
language that Yamada-san has implemented here for merge:

https://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git/log/?h=kconfig

I have tested on top of that tree, powerpc now builds there.

To avoid build breakage, the first 3 patches must go before the
kconfig change, and the 4th patch must go after it.

v5 changes:
- Patch 4 update to syntax changed since kconfig-shell-v3 release.
- Patch 4 suggestions from Masahiro Yamada, remove unnecessary "OK"
  output from check mprofile script, and fold CC_USING_MPROFILE_KERNEL
  into CONFIG_MPROFILE_KERNEL.
- Reduce whitespace disturbance in patch 1.

Thanks,
Nick


Nicholas Piggin (4):
  powerpc/kbuild: set default generic machine type for 32-bit compile
  powerpc/kbuild: remove CROSS32 defines from top level powerpc Makefile
  powerpc/kbuild: Use flags variables rather than overriding LD/CC/AS
  powerpc/kbuild: move -mprofile-kernel check to Kconfig

 arch/powerpc/Kconfig                          | 16 +------
 arch/powerpc/Makefile                         | 44 ++++++++-----------
 arch/powerpc/boot/Makefile                    | 16 ++++---
 arch/powerpc/include/asm/module.h             |  2 +-
 arch/powerpc/kernel/module_64.c               |  4 +-
 arch/powerpc/kernel/trace/ftrace.c            |  6 +--
 arch/powerpc/kernel/vdso32/Makefile           | 15 +++++--
 .../tools/gcc-check-mprofile-kernel.sh        | 13 +++---
 scripts/recordmcount.pl                       | 18 +++++++-
 9 files changed, 72 insertions(+), 62 deletions(-)

-- 
2.17.0

^ permalink raw reply

* [PATCH v5 1/4] powerpc/kbuild: set default generic machine type for 32-bit compile
From: Nicholas Piggin @ 2018-05-30 12:19 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Nicholas Piggin, linuxppc-dev, Masahiro Yamada,
	Segher Boessenkool
In-Reply-To: <20180530121922.22122-1-npiggin@gmail.com>

Some 64-bit toolchains uses the wrong ISA variant for compiling 32-bit
kernels, even with -m32. Debian's powerpc64le is one such case, and
that is because it is built with --with-cpu=power8.

So when cross compiling a 32-bit kernel with a 64-bit toolchain, set
-mcpu=powerpc initially, which is the generic 32-bit powerpc machine
type and scheduling model. CPU and platform code can override this
with subsequent -mcpu flags if necessary.

This is not done for 32-bit toolchains otherwise it would override
their defaults, which are presumably set appropriately for the
environment (moreso than a 64-bit cross compiler).

This fixes a lot of build failures due to incompatible assembly when
compiling 32-bit kernel with th Debian powerpc64le 64-bit toolchain.

Cc: Segher Boessenkool <segher@kernel.crashing.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Since v1: reworded changelog to explain the cause of the problem (thanks
Segher) and moved the flags into the 64-32 cross compile case.
---
 arch/powerpc/Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 95813df90801..d628724087c6 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -24,6 +24,14 @@ ifeq ($(HAS_BIARCH),y)
 ifeq ($(CROSS32_COMPILE),)
 CROSS32CC	:= $(CC) -m32
 KBUILD_ARFLAGS	+= --target=elf32-powerpc
+ifdef CONFIG_PPC32
+# These options will be overridden by any -mcpu option that the CPU
+# or platform code sets later on the command line, but they are needed
+# to set a sane 32-bit cpu target for the 64-bit cross compiler which
+# may default to the wrong ISA.
+KBUILD_CFLAGS		+= -mcpu=powerpc
+KBUILD_AFLAGS		+= -mcpu=powerpc
+endif
 endif
 endif
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH v5 2/4] powerpc/kbuild: remove CROSS32 defines from top level powerpc Makefile
From: Nicholas Piggin @ 2018-05-30 12:19 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Nicholas Piggin, linuxppc-dev, Masahiro Yamada,
	Segher Boessenkool
In-Reply-To: <20180530121922.22122-1-npiggin@gmail.com>

Switch VDSO32 build over to use CROSS32_COMPILE directly, and have
it pass in -m32 after the standard c_flags. This allows endianness
overrides to be removed and the endian and bitness flags moved into
standard flags variables.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Makefile               |  7 -------
 arch/powerpc/boot/Makefile          | 16 +++++++++++-----
 arch/powerpc/kernel/vdso32/Makefile | 15 +++++++++++----
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index d628724087c6..167b26a0780c 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -17,13 +17,8 @@ HAS_BIARCH	:= $(call cc-option-yn, -m32)
 # Set default 32 bits cross compilers for vdso and boot wrapper
 CROSS32_COMPILE ?=
 
-CROSS32CC		:= $(CROSS32_COMPILE)gcc
-CROSS32AR		:= $(CROSS32_COMPILE)ar
-
 ifeq ($(HAS_BIARCH),y)
 ifeq ($(CROSS32_COMPILE),)
-CROSS32CC	:= $(CC) -m32
-KBUILD_ARFLAGS	+= --target=elf32-powerpc
 ifdef CONFIG_PPC32
 # These options will be overridden by any -mcpu option that the CPU
 # or platform code sets later on the command line, but they are needed
@@ -35,8 +30,6 @@ endif
 endif
 endif
 
-export CROSS32CC CROSS32AR
-
 ifeq ($(CROSS_COMPILE),)
 KBUILD_DEFCONFIG := $(shell uname -m)_defconfig
 else
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 26d5d2a5b8e9..49767e06202c 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -23,19 +23,23 @@ all: $(obj)/zImage
 compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
 compress-$(CONFIG_KERNEL_XZ)   := CONFIG_KERNEL_XZ
 
+ifdef CROSS32_COMPILE
+    BOOTCC := $(CROSS32_COMPILE)gcc
+    BOOTAR := $(CROSS32_COMPILE)ar
+else
+    BOOTCC := $(CC)
+    BOOTAR := $(AR)
+endif
+
 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 		 -fno-strict-aliasing -Os -msoft-float -pipe \
 		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
 		 -D$(compress-y)
 
-BOOTCC := $(CC)
 ifdef CONFIG_PPC64_BOOT_WRAPPER
 BOOTCFLAGS	+= -m64
 else
 BOOTCFLAGS	+= -m32
-ifdef CROSS32_COMPILE
-    BOOTCC := $(CROSS32_COMPILE)gcc
-endif
 endif
 
 BOOTCFLAGS	+= -isystem $(shell $(BOOTCC) -print-file-name=include)
@@ -49,6 +53,8 @@ endif
 
 BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc
 
+BOOTARFLAGS	:= -cr$(KBUILD_ARFLAGS)
+
 ifdef CONFIG_DEBUG_INFO
 BOOTCFLAGS	+= -g
 endif
@@ -202,7 +208,7 @@ quiet_cmd_bootas = BOOTAS  $@
       cmd_bootas = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $<
 
 quiet_cmd_bootar = BOOTAR  $@
-      cmd_bootar = $(CROSS32AR) -cr$(KBUILD_ARFLAGS) $@.$$$$ $(filter-out FORCE,$^); mv $@.$$$$ $@
+      cmd_bootar = $(BOOTAR) $(BOOTARFLAGS) $@.$$$$ $(filter-out FORCE,$^); mv $@.$$$$ $@
 
 $(obj-libfdt): $(obj)/%.o: $(srctree)/scripts/dtc/libfdt/%.c FORCE
 	$(call if_changed_dep,bootcc)
diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index b8c434d1d459..50112d4473bb 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -8,8 +8,15 @@ obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \
 
 # Build rules
 
-ifeq ($(CONFIG_PPC32),y)
-CROSS32CC := $(CC)
+ifdef CROSS32_COMPILE
+    VDSOCC := $(CROSS32_COMPILE)gcc
+else
+    VDSOCC := $(CC)
+endif
+
+CC32FLAGS :=
+ifdef CONFIG_PPC64
+CC32FLAGS += -m32
 endif
 
 targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
@@ -45,9 +52,9 @@ $(obj-vdso32): %.o: %.S FORCE
 
 # actual build commands
 quiet_cmd_vdso32ld = VDSO32L $@
-      cmd_vdso32ld = $(CROSS32CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
+      cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
 quiet_cmd_vdso32as = VDSO32A $@
-      cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $<
+      cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $<
 
 # install commands for the unstripped file
 quiet_cmd_vdso_install = INSTALL $@
-- 
2.17.0

^ permalink raw reply related

* [PATCH v5 3/4] powerpc/kbuild: Use flags variables rather than overriding LD/CC/AS
From: Nicholas Piggin @ 2018-05-30 12:19 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Nicholas Piggin, linuxppc-dev, Masahiro Yamada,
	Segher Boessenkool
In-Reply-To: <20180530121922.22122-1-npiggin@gmail.com>

The powerpc toolchain can compile combinations of 32/64 bit and
big/little endian, so it's convenient to consider, e.g.,

  `CC -m64 -mbig-endian`

To be the C compiler for the purpose of invoking it to build target
artifacts. So overriding the the CC variable to include thse flags
works for this purpose.

Unfortunately that is not compatible with the way the proposed new
Kconfig macro language will work.

After previous patches in this series, these flags can be carefully
passed in using flags instead.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Since v1: removed extra -EB in the recordmcount script (thanks mpe)
---
 arch/powerpc/Makefile                          | 16 +++++++++-------
 .../powerpc/tools/gcc-check-mprofile-kernel.sh | 12 ++++++++----
 scripts/recordmcount.pl                        | 18 +++++++++++++++++-
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 167b26a0780c..6faf1d6ad9dd 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -75,13 +75,15 @@ endif
 endif
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
-override LD	+= -EL
+KBUILD_CFLAGS	+= -mlittle-endian
+LDFLAGS		+= -EL
 LDEMULATION	:= lppc
 GNUTARGET	:= powerpcle
 MULTIPLEWORD	:= -mno-multiple
 KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-save-toc-indirect)
 else
-override LD	+= -EB
+KBUILD_CFLAGS += $(call cc-option,-mbig-endian)
+LDFLAGS		+= -EB
 LDEMULATION	:= ppc
 GNUTARGET	:= powerpc
 MULTIPLEWORD	:= -mmultiple
@@ -94,19 +96,19 @@ aflags-$(CONFIG_CPU_BIG_ENDIAN)		+= $(call cc-option,-mabi=elfv1)
 aflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mabi=elfv2
 endif
 
-cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mlittle-endian
-cflags-$(CONFIG_CPU_BIG_ENDIAN)		+= $(call cc-option,-mbig-endian)
 ifneq ($(cc-name),clang)
   cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mno-strict-align
 endif
 
+cflags-$(CONFIG_CPU_BIG_ENDIAN)		+= $(call cc-option,-mbig-endian)
+cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mlittle-endian
 aflags-$(CONFIG_CPU_BIG_ENDIAN)		+= $(call cc-option,-mbig-endian)
 aflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mlittle-endian
 
 ifeq ($(HAS_BIARCH),y)
-override AS	+= -a$(BITS)
-override LD	+= -m elf$(BITS)$(LDEMULATION)
-override CC	+= -m$(BITS)
+KBUILD_CFLAGS	+= -m$(BITS)
+KBUILD_AFLAGS	+= -m$(BITS) -Wl,-a$(BITS)
+LDFLAGS		+= -m elf$(BITS)$(LDEMULATION)
 KBUILD_ARFLAGS	+= --target=elf$(BITS)-$(GNUTARGET)
 endif
 
diff --git a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
index 061f8035bdbe..a7dd0e5d9f98 100755
--- a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
+++ b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
@@ -7,17 +7,21 @@ set -o pipefail
 # To debug, uncomment the following line
 # set -x
 
+# -mprofile-kernel is only supported on 64le, so this should not be invoked
+# for other targets. Therefore we can pass in -m64 and -mlittle-endian
+# explicitly, to take care of toolchains defaulting to other targets.
+
 # Test whether the compile option -mprofile-kernel exists and generates
 # profiling code (ie. a call to _mcount()).
 echo "int func() { return 0; }" | \
-    $* -S -x c -O2 -p -mprofile-kernel - -o - 2> /dev/null | \
-    grep -q "_mcount"
+    $* -m64 -mlittle-endian -S -x c -O2 -p -mprofile-kernel - -o - \
+    2> /dev/null | grep -q "_mcount"
 
 # Test whether the notrace attribute correctly suppresses calls to _mcount().
 
 echo -e "#include <linux/compiler.h>\nnotrace int func() { return 0; }" | \
-    $* -S -x c -O2 -p -mprofile-kernel - -o - 2> /dev/null | \
-    grep -q "_mcount" && \
+    $* -m64 -mlittle-endian -S -x c -O2 -p -mprofile-kernel - -o - \
+    2> /dev/null | grep -q "_mcount" && \
     exit 1
 
 echo "OK"
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 191eb949d52c..fe06e77c15eb 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -266,13 +266,29 @@ if ($arch eq "x86_64") {
     $objcopy .= " -O elf32-sh-linux";
 
 } elsif ($arch eq "powerpc") {
+    my $ldemulation;
+
     $local_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)";
     # See comment in the sparc64 section for why we use '\w'.
     $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?\\w*?)>:";
     $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$";
 
+    if ($endian eq "big") {
+	    $cc .= " -mbig-endian ";
+	    $ld .= " -EB ";
+	    $ldemulation = "ppc"
+    } else {
+	    $cc .= " -mlittle-endian ";
+	    $ld .= " -EL ";
+	    $ldemulation = "lppc"
+    }
     if ($bits == 64) {
-	$type = ".quad";
+        $type = ".quad";
+        $cc .= " -m64 ";
+        $ld .= " -m elf64".$ldemulation." ";
+    } else {
+        $cc .= " -m32 ";
+        $ld .= " -m elf32".$ldemulation." ";
     }
 
 } elsif ($arch eq "arm") {
-- 
2.17.0

^ permalink raw reply related

* [PATCH v5 4/4] powerpc/kbuild: move -mprofile-kernel check to Kconfig
From: Nicholas Piggin @ 2018-05-30 12:19 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Nicholas Piggin, linuxppc-dev, Masahiro Yamada,
	Segher Boessenkool
In-Reply-To: <20180530121922.22122-1-npiggin@gmail.com>

This eliminates the workaround that requires disabling
-mprofile-kernel by default in Kconfig.

[ Note: this depends on https://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git kconfig-shell-v3 ]

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig                            | 16 +---------------
 arch/powerpc/Makefile                           | 13 +------------
 arch/powerpc/include/asm/module.h               |  2 +-
 arch/powerpc/kernel/module_64.c                 |  4 ++--
 arch/powerpc/kernel/trace/ftrace.c              |  6 +++---
 arch/powerpc/tools/gcc-check-mprofile-kernel.sh |  1 -
 6 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 0cc41146bc51..736fe35a657d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -461,23 +461,9 @@ config LD_HEAD_STUB_CATCH
 
 	  If unsure, say "N".
 
-config DISABLE_MPROFILE_KERNEL
-	bool "Disable use of mprofile-kernel for kernel tracing"
-	depends on PPC64 && CPU_LITTLE_ENDIAN
-	default y
-	help
-	  Selecting this options disables use of the mprofile-kernel ABI for
-	  kernel tracing. That will cause options such as live patching
-	  (CONFIG_LIVEPATCH) which depend on CONFIG_DYNAMIC_FTRACE_WITH_REGS to
-	  be disabled also.
-
-	  If you have a toolchain which supports mprofile-kernel, then you can
-	  disable this. Otherwise leave it enabled. If you're not sure, say
-	  "Y".
-
 config MPROFILE_KERNEL
 	depends on PPC64 && CPU_LITTLE_ENDIAN
-	def_bool !DISABLE_MPROFILE_KERNEL
+	def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__)
 
 config IOMMU_HELPER
 	def_bool PPC64
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 6faf1d6ad9dd..8eda91e0b8df 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -161,18 +161,7 @@ CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64
 endif
 
 ifdef CONFIG_MPROFILE_KERNEL
-    ifeq ($(shell $(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
-        CC_FLAGS_FTRACE := -pg -mprofile-kernel
-        KBUILD_CPPFLAGS += -DCC_USING_MPROFILE_KERNEL
-    else
-        # If the user asked for mprofile-kernel but the toolchain doesn't
-        # support it, emit a warning and deliberately break the build later
-        # with mprofile-kernel-not-supported. We would prefer to make this an
-        # error right here, but then the user would never be able to run
-        # oldconfig to change their configuration.
-        $(warning Compiler does not support mprofile-kernel, set CONFIG_DISABLE_MPROFILE_KERNEL)
-        CC_FLAGS_FTRACE := -mprofile-kernel-not-supported
-    endif
+	CC_FLAGS_FTRACE := -pg -mprofile-kernel
 endif
 
 CFLAGS-$(CONFIG_CELL_CPU) += $(call cc-option,-mcpu=cell)
diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index 4f6573934792..c38b125b7a57 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -14,7 +14,7 @@
 #include <asm-generic/module.h>
 
 
-#ifdef CC_USING_MPROFILE_KERNEL
+#ifdef CONFIG_MPROFILE_KERNEL
 #define MODULE_ARCH_VERMAGIC_FTRACE	"mprofile-kernel "
 #else
 #define MODULE_ARCH_VERMAGIC_FTRACE	""
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index a2636c250b7b..55bccc315e1a 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -462,7 +462,7 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs,
 	return (unsigned long)&stubs[i];
 }
 
-#ifdef CC_USING_MPROFILE_KERNEL
+#ifdef CONFIG_MPROFILE_KERNEL
 static bool is_early_mcount_callsite(u32 *instruction)
 {
 	/*
@@ -746,7 +746,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
-#ifdef CC_USING_MPROFILE_KERNEL
+#ifdef CONFIG_MPROFILE_KERNEL
 
 #define PACATOC offsetof(struct paca_struct, kernel_toc)
 
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 4741fe112f05..b2272308225a 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -144,7 +144,7 @@ __ftrace_make_nop(struct module *mod,
 		return -EINVAL;
 	}
 
-#ifdef CC_USING_MPROFILE_KERNEL
+#ifdef CONFIG_MPROFILE_KERNEL
 	/* When using -mkernel_profile there is no load to jump over */
 	pop = PPC_INST_NOP;
 
@@ -188,7 +188,7 @@ __ftrace_make_nop(struct module *mod,
 		pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op);
 		return -EINVAL;
 	}
-#endif /* CC_USING_MPROFILE_KERNEL */
+#endif /* CONFIG_MPROFILE_KERNEL */
 
 	if (patch_instruction((unsigned int *)ip, pop)) {
 		pr_err("Patching NOP failed.\n");
@@ -324,7 +324,7 @@ int ftrace_make_nop(struct module *mod,
  * They should effectively be a NOP, and follow formal constraints,
  * depending on the ABI. Return false if they don't.
  */
-#ifndef CC_USING_MPROFILE_KERNEL
+#ifndef CONFIG_MPROFILE_KERNEL
 static int
 expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
 {
diff --git a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
index a7dd0e5d9f98..137f3376ac2b 100755
--- a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
+++ b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
@@ -24,5 +24,4 @@ echo -e "#include <linux/compiler.h>\nnotrace int func() { return 0; }" | \
     2> /dev/null | grep -q "_mcount" && \
     exit 1
 
-echo "OK"
 exit 0
-- 
2.17.0

^ permalink raw reply related

* [PATCH] powerpc/mm: Fix kernel crash on page table free
From: Aneesh Kumar K.V @ 2018-05-30 12:32 UTC (permalink / raw)
  To: benh, paulus, mpe; +Cc: linuxppc-dev, Aneesh Kumar K.V

Fix the below crash on BookE 64. pgtable_page_dtor expects struct page *arg.

Also call the destructor on non book3s platforms correctly. This free up the
split ptl locks correctly if we had allocated them before.

Call Trace:
[c0000000f30c7520] [c00000000021eeec] .kmem_cache_free+0x9c/0x44c (unreliable)
[c0000000f30c75c0] [c0000000001ee07c] .ptlock_free+0x1c/0x30
[c0000000f30c7630] [c0000000001ee260] .tlb_remove_table+0xdc/0x224
[c0000000f30c76c0] [c0000000001ee640] .free_pgd_range+0x298/0x500
[c0000000f30c77d0] [c000000000232afc] .shift_arg_pages+0x10c/0x1e0
[c0000000f30c7910] [c000000000232dd0] .setup_arg_pages+0x200/0x25c
[c0000000f30c79c0] [c0000000002ad4fc] .load_elf_binary+0x450/0x16c8
[c0000000f30c7b10] [c000000000234914] .search_binary_handler.part.11+0x9c/0x248
[c0000000f30c7bb0] [c00000000023595c] .do_execveat_common.isra.13+0x868/0xc18
[c0000000f30c7cb0] [c00000000000184c] .run_init_process+0x34/0x4c
[c0000000f30c7d30] [c000000000001880] .try_to_run_init_process+0x1c/0x68
[c0000000f30c7db0] [c000000000002bd8] .kernel_init+0xdc/0x130
[c0000000f30c7e30] [c0000000000009dc] .ret_from_kernel_thread+0x58/0x7c

Fixes: 702346768 ("powerpc/mm/nohash: Remove pte fragment dependency from nohash")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/32/pgalloc.h | 1 +
 arch/powerpc/include/asm/nohash/32/pgalloc.h | 1 +
 arch/powerpc/include/asm/nohash/64/pgalloc.h | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 5073cc75f1c8..6a6673907e45 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -99,6 +99,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 static inline void pgtable_free(void *table, unsigned index_size)
 {
 	if (!index_size) {
+		pgtable_page_dtor(virt_to_page(table));
 		free_page((unsigned long)table);
 	} else {
 		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index 29d37bd1f3b3..1707781d2f20 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -100,6 +100,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 static inline void pgtable_free(void *table, unsigned index_size)
 {
 	if (!index_size) {
+		pgtable_page_dtor(virt_to_page(table));
 		free_page((unsigned long)table);
 	} else {
 		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h
index 21624ff1f065..0e693f322cb2 100644
--- a/arch/powerpc/include/asm/nohash/64/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
@@ -133,7 +133,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
 static inline void pgtable_free(void *table, int shift)
 {
 	if (!shift) {
-		pgtable_page_dtor(table);
+		pgtable_page_dtor(virt_to_page(table));
 		free_page((unsigned long)table);
 	} else {
 		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
-- 
2.17.0

^ permalink raw reply related

* [PATCH] powerpc/mm/hash: Add missing update in slb update sequence.
From: Aneesh Kumar K.V @ 2018-05-30 13:18 UTC (permalink / raw)
  To: benh, paulus, mpe, npiggin; +Cc: linuxppc-dev, Aneesh Kumar K.V

>From ISA

"For data accesses, the context synchronizing instruction before the slbie,
slbieg, slbia, slbmte, tlbie, or tlbiel instruction ensures that all preceding
instructions that access data storage have completed to a point at which they
have reported all exceptions they will cause."

Add the missing isync when updating Kernel stack slb entry.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/kernel/entry_64.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 51695608c68b..3d1af55e09dc 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -596,6 +596,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	 * actually hit this code path.
 	 */
 
+	isync
 	slbie	r6
 	slbie	r6		/* Workaround POWER5 < DD2.1 issue */
 	slbmte	r7,r0
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH v2 1/1] powerpc/pseries: fix EEH recovery of some IOV devices
From: Bryant G. Ly @ 2018-05-30 13:55 UTC (permalink / raw)
  To: Sam Bobroff, linuxppc-dev, linux-pci; +Cc: mpe, bhelgaas, bryantly
In-Reply-To: <7598ffeb48c16c88a34937ad93b18f806222b8df.1527208281.git.sbobroff@linux.ibm.com>



On 5/24/18 7:31 PM, Sam Bobroff wrote:
> EEH recovery currently fails on pSeries for some IOV capable PCI
> devices, if CONFIG_PCI_IOV is on and the hypervisor doesn't provide
> certain device tree properties for the device. (Found on an IOV
> capable device using the ipr driver.)
>
> Recovery fails in pci_enable_resources() at the check on r->parent,
> because r->flags is set and r->parent is not.  This state is due to
> sriov_init() setting the start, end and flags members of the IOV BARs
> but the parent not being set later in
> pseries_pci_fixup_iov_resources(), because the
> "ibm,open-sriov-vf-bar-info" property is missing.
>
> Correct this by zeroing the resource flags for IOV BARs when they
> can't be configured.
>
> Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
> ---
> Hi,
>
> This is a fix to allow EEH recovery to succeed in a specific situation,
> which I've tried to explain in the commit message.
>
> As with the RFC version, the IOV BARs are disabled by setting the resource
> flags to 0 but the other fields are now left as-is because that is what is done
> elsewhere (see sriov_init() and __pci_read_base()).
>
> I've also examined the concern raised by Bjorn Helgaas, that VFs could be
> enabled later after the BARs are disabled, and it already seems safe: enabling
> VFs (on pseries) depends on another device tree property,
> "ibm,number-of-configurable-vfs" as well as support for the RTAS function
> "ibm_map_pes". Since these are all part of the hypervisor's support for IOV it
> seems unlikely that we would ever see some of them but not all. (None are
> currently provided by QEMU/KVM.) (Additionally, the ipr driver on which the EEH
> recovery failure was discovered doesn't even seem to have SR-IOV support so it
> certainly can't enable VFs.)
>
> Cheers,
> Sam.
> ====== v1 -> v2: ======
>
> Patch 1/1: powerpc/pseries: fix EEH recovery of some IOV devices
> * Moved the BAR disabling code to a function.
> * Also check in pseries_pci_fixup_resources().
>
> ====== v1: ======
>
> Patch 1/1: powerpc/pseries: fix EEH recovery of IOV devices
>
>  arch/powerpc/platforms/pseries/setup.c | 25 +++++++++++++++++--------
>  1 file changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index b55ad4286dc7..0a9e4243ae1d 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -645,6 +645,15 @@ void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
>  	}
>  }
>
> +static void pseries_disable_sriov_resources(struct pci_dev *pdev)
> +{
> +	int i;
> +
> +	pci_warn(pdev, "No hypervisor support for SR-IOV on this device, IOV BARs disabled.\n");
> +	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
> +		pdev->resource[i + PCI_IOV_RESOURCES].flags = 0;
> +}
> +
>  static void pseries_pci_fixup_resources(struct pci_dev *pdev)
>  {
>  	const int *indexes;
> @@ -652,10 +661,10 @@ static void pseries_pci_fixup_resources(struct pci_dev *pdev)
>
>  	/*Firmware must support open sriov otherwise dont configure*/
>  	indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
> -	if (!indexes)
> -		return;
> -	/* Assign the addresses from device tree*/
> -	of_pci_set_vf_bar_size(pdev, indexes);
> +	if (indexes)
> +		of_pci_set_vf_bar_size(pdev, indexes);
> +	else
> +		pseries_disable_sriov_resources(pdev);
>  }
>
>  static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
> @@ -667,10 +676,10 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
>  		return;
>  	/*Firmware must support open sriov otherwise dont configure*/
>  	indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
> -	if (!indexes)
> -		return;
> -	/* Assign the addresses from device tree*/
> -	of_pci_parse_iov_addrs(pdev, indexes);
> +	if (indexes)
> +		of_pci_parse_iov_addrs(pdev, indexes);
> +	else
> +		pseries_disable_sriov_resources(pdev);
>  }
>
>  static resource_size_t pseries_pci_iov_resource_alignment(struct pci_dev *pdev,

Acked-by: Bryant G. Ly <bryantly@linux.vnet.ibm.com>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox