LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v3 4/6] powerpc/64s: implement queued spinlocks and rwlocks
From: Michal Suchánek @ 2020-07-23 14:37 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Peter Zijlstra, Will Deacon, Boqun Feng, linux-kernel,
	kvm-ppc, virtualization, Ingo Molnar, Waiman Long, linuxppc-dev
In-Reply-To: <20200706043540.1563616-5-npiggin@gmail.com>

On Mon, Jul 06, 2020 at 02:35:38PM +1000, Nicholas Piggin wrote:
> These have shown significantly improved performance and fairness when
> spinlock contention is moderate to high on very large systems.
> 
>  [ Numbers hopefully forthcoming after more testing, but initial
>    results look good ]
> 
> Thanks to the fast path, single threaded performance is not noticeably
> hurt.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/powerpc/Kconfig                      | 13 ++++++++++++
>  arch/powerpc/include/asm/Kbuild           |  2 ++
>  arch/powerpc/include/asm/qspinlock.h      | 25 +++++++++++++++++++++++
>  arch/powerpc/include/asm/spinlock.h       |  5 +++++
>  arch/powerpc/include/asm/spinlock_types.h |  5 +++++
>  arch/powerpc/lib/Makefile                 |  3 +++
>  include/asm-generic/qspinlock.h           |  2 ++
>  7 files changed, 55 insertions(+)
>  create mode 100644 arch/powerpc/include/asm/qspinlock.h
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 24ac85c868db..17663ea57697 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -146,6 +146,8 @@ config PPC
>  	select ARCH_SUPPORTS_ATOMIC_RMW
>  	select ARCH_USE_BUILTIN_BSWAP
>  	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
> +	select ARCH_USE_QUEUED_RWLOCKS		if PPC_QUEUED_SPINLOCKS
> +	select ARCH_USE_QUEUED_SPINLOCKS	if PPC_QUEUED_SPINLOCKS
>  	select ARCH_WANT_IPC_PARSE_VERSION
>  	select ARCH_WEAK_RELEASE_ACQUIRE
>  	select BINFMT_ELF
> @@ -492,6 +494,17 @@ config HOTPLUG_CPU
>  
>  	  Say N if you are unsure.
>  
> +config PPC_QUEUED_SPINLOCKS
> +	bool "Queued spinlocks"
> +	depends on SMP
> +	default "y" if PPC_BOOK3S_64
> +	help
> +	  Say Y here to use to use queued spinlocks which are more complex
> +	  but give better salability and fairness on large SMP and NUMA
                           ^ +c?
Thanks

Michal
> +	  systems.
> +
> +	  If unsure, say "Y" if you have lots of cores, otherwise "N".
> +
>  config ARCH_CPU_PROBE_RELEASE
>  	def_bool y
>  	depends on HOTPLUG_CPU
> diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
> index dadbcf3a0b1e..1dd8b6adff5e 100644
> --- a/arch/powerpc/include/asm/Kbuild
> +++ b/arch/powerpc/include/asm/Kbuild
> @@ -6,5 +6,7 @@ generated-y += syscall_table_spu.h
>  generic-y += export.h
>  generic-y += local64.h
>  generic-y += mcs_spinlock.h
> +generic-y += qrwlock.h
> +generic-y += qspinlock.h
>  generic-y += vtime.h
>  generic-y += early_ioremap.h
> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> new file mode 100644
> index 000000000000..c49e33e24edd
> --- /dev/null
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -0,0 +1,25 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_QSPINLOCK_H
> +#define _ASM_POWERPC_QSPINLOCK_H
> +
> +#include <asm-generic/qspinlock_types.h>
> +
> +#define _Q_PENDING_LOOPS	(1 << 9) /* not tuned */
> +
> +#define smp_mb__after_spinlock()   smp_mb()
> +
> +static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
> +{
> +	/*
> +	 * This barrier was added to simple spinlocks by commit 51d7d5205d338,
> +	 * but it should now be possible to remove it, asm arm64 has done with
> +	 * commit c6f5d02b6a0f.
> +	 */
> +	smp_mb();
> +	return atomic_read(&lock->val);
> +}
> +#define queued_spin_is_locked queued_spin_is_locked
> +
> +#include <asm-generic/qspinlock.h>
> +
> +#endif /* _ASM_POWERPC_QSPINLOCK_H */
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 21357fe05fe0..434615f1d761 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -3,7 +3,12 @@
>  #define __ASM_SPINLOCK_H
>  #ifdef __KERNEL__
>  
> +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
> +#include <asm/qspinlock.h>
> +#include <asm/qrwlock.h>
> +#else
>  #include <asm/simple_spinlock.h>
> +#endif
>  
>  #endif /* __KERNEL__ */
>  #endif /* __ASM_SPINLOCK_H */
> diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
> index 3906f52dae65..c5d742f18021 100644
> --- a/arch/powerpc/include/asm/spinlock_types.h
> +++ b/arch/powerpc/include/asm/spinlock_types.h
> @@ -6,6 +6,11 @@
>  # error "please don't include this file directly"
>  #endif
>  
> +#ifdef CONFIG_PPC_QUEUED_SPINLOCKS
> +#include <asm-generic/qspinlock_types.h>
> +#include <asm-generic/qrwlock_types.h>
> +#else
>  #include <asm/simple_spinlock_types.h>
> +#endif
>  
>  #endif
> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
> index 5e994cda8e40..d66a645503eb 100644
> --- a/arch/powerpc/lib/Makefile
> +++ b/arch/powerpc/lib/Makefile
> @@ -41,7 +41,10 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
>  obj64-y	+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
>  	   memcpy_64.o memcpy_mcsafe_64.o
>  
> +ifndef CONFIG_PPC_QUEUED_SPINLOCKS
>  obj64-$(CONFIG_SMP)	+= locks.o
> +endif
> +
>  obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
>  obj64-$(CONFIG_KPROBES_SANITY_TEST)	+= test_emulate_step.o \
>  					   test_emulate_step_exec_instr.o
> diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
> index fde943d180e0..fb0a814d4395 100644
> --- a/include/asm-generic/qspinlock.h
> +++ b/include/asm-generic/qspinlock.h
> @@ -12,6 +12,7 @@
>  
>  #include <asm-generic/qspinlock_types.h>
>  
> +#ifndef queued_spin_is_locked
>  /**
>   * queued_spin_is_locked - is the spinlock locked?
>   * @lock: Pointer to queued spinlock structure
> @@ -25,6 +26,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
>  	 */
>  	return atomic_read(&lock->val);
>  }
> +#endif
>  
>  /**
>   * queued_spin_value_unlocked - is the spinlock structure unlocked?
> -- 
> 2.23.0
> 

^ permalink raw reply

* Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks
From: Waiman Long @ 2020-07-23 14:29 UTC (permalink / raw)
  To: Nicholas Piggin, Peter Zijlstra
  Cc: linux-arch, Will Deacon, Boqun Feng, linux-kernel, kvm-ppc,
	virtualization, Ingo Molnar, linuxppc-dev
In-Reply-To: <1595510571.u39qfc8d1o.astroid@bobo.none>

On 7/23/20 9:30 AM, Nicholas Piggin wrote:
>> I would prefer to extract out the pending bit handling code out into a
>> separate helper function which can be overridden by the arch code
>> instead of breaking the slowpath into 2 pieces.
> You mean have the arch provide a queued_spin_lock_slowpath_pending
> function that the slow path calls?
>
> I would actually prefer the pending handling can be made inline in
> the queued_spin_lock function, especially with out-of-line locks it
> makes sense to put it there.
>
> We could ifdef out queued_spin_lock_slowpath_queue if it's not used,
> then __queued_spin_lock_slowpath_queue would be inlined into the
> caller so there would be no split?

The pending code is an optimization for lightly contended locks. That is 
why I think it is appropriate to extract it into a helper function and 
mark it as such.

You can certainly put the code in the arch's spin_lock code; you just
have to override the generic pending code with a null function.

Cheers,
Longman


^ permalink raw reply

* Re: [RFC PATCH 4/5] powerpc/mm: Remove custom stack expansion checking
From: Daniel Axtens @ 2020-07-23 14:11 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: linux-arch, hughd, linux-kernel
In-Reply-To: <20200703141327.1732550-4-mpe@ellerman.id.au>

Hi Michael,

> We have powerpc specific logic in our page fault handling to decide if
> an access to an unmapped address below the stack pointer should expand
> the stack VMA.
>
> The logic aims to prevent userspace from doing bad accesses below the
> stack pointer. However as long as the stack is < 1MB in size, we allow
> all accesses without further checks. Adding some debug I see that I
> can do a full kernel build and LTP run, and not a single process has
> used more than 1MB of stack. So for the majority of processes the
> logic never even fires.
>
> We also recently found a nasty bug in this code which could cause
> userspace programs to be killed during signal delivery. It went
> unnoticed presumably because most processes use < 1MB of stack.
>
> The generic mm code has also grown support for stack guard pages since
> this code was originally written, so the most heinous case of the
> stack expanding into other mappings is now handled for us.
>
> Finally although some other arches have special logic in this path,
> from what I can tell none of x86, arm64, arm and s390 impose any extra
> checks other than those in expand_stack().
>
> So drop our complicated logic and like other architectures just let
> the stack expand as long as its within the rlimit.
>

I applied and tested this. While I wouldn't call my testing
comprehensive, I have not been able to reproduce the crash with this
patch applied.

Kind regards,
Daniel


> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/mm/fault.c | 106 ++--------------------------------------
>  1 file changed, 5 insertions(+), 101 deletions(-)
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index ed01329dd12b..925a7231abb3 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -42,39 +42,7 @@
>  #include <asm/kup.h>
>  #include <asm/inst.h>
>  
> -/*
> - * Check whether the instruction inst is a store using
> - * an update addressing form which will update r1.
> - */
> -static bool store_updates_sp(struct ppc_inst inst)
> -{
> -	/* check for 1 in the rA field */
> -	if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1)
> -		return false;
> -	/* check major opcode */
> -	switch (ppc_inst_primary_opcode(inst)) {
> -	case OP_STWU:
> -	case OP_STBU:
> -	case OP_STHU:
> -	case OP_STFSU:
> -	case OP_STFDU:
> -		return true;
> -	case OP_STD:	/* std or stdu */
> -		return (ppc_inst_val(inst) & 3) == 1;
> -	case OP_31:
> -		/* check minor opcode */
> -		switch ((ppc_inst_val(inst) >> 1) & 0x3ff) {
> -		case OP_31_XOP_STDUX:
> -		case OP_31_XOP_STWUX:
> -		case OP_31_XOP_STBUX:
> -		case OP_31_XOP_STHUX:
> -		case OP_31_XOP_STFSUX:
> -		case OP_31_XOP_STFDUX:
> -			return true;
> -		}
> -	}
> -	return false;
> -}
> +
>  /*
>   * do_page_fault error handling helpers
>   */
> @@ -267,54 +235,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
>  	return false;
>  }
>  
> -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> -				struct vm_area_struct *vma, unsigned int flags,
> -				bool *must_retry)
> -{
> -	/*
> -	 * N.B. The POWER/Open ABI allows programs to access up to
> -	 * 288 bytes below the stack pointer.
> -	 * The kernel signal delivery code writes up to 4KB
> -	 * below the stack pointer (r1) before decrementing it.
> -	 * The exec code can write slightly over 640kB to the stack
> -	 * before setting the user r1.  Thus we allow the stack to
> -	 * expand to 1MB without further checks.
> -	 */
> -	if (address + 0x100000 < vma->vm_end) {
> -		struct ppc_inst __user *nip = (struct ppc_inst __user *)regs->nip;
> -		/* get user regs even if this fault is in kernel mode */
> -		struct pt_regs *uregs = current->thread.regs;
> -		if (uregs == NULL)
> -			return true;
> -
> -		/*
> -		 * A user-mode access to an address a long way below
> -		 * the stack pointer is only valid if the instruction
> -		 * is one which would update the stack pointer to the
> -		 * address accessed if the instruction completed,
> -		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
> -		 * (or the byte, halfword, float or double forms).
> -		 *
> -		 * If we don't check this then any write to the area
> -		 * between the last mapped region and the stack will
> -		 * expand the stack rather than segfaulting.
> -		 */
> -		if (address + 4096 >= uregs->gpr[1])
> -			return false;
> -
> -		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> -		    access_ok(nip, sizeof(*nip))) {
> -			struct ppc_inst inst;
> -
> -			if (!probe_user_read_inst(&inst, nip))
> -				return !store_updates_sp(inst);
> -			*must_retry = true;
> -		}
> -		return true;
> -	}
> -	return false;
> -}
> -
>  #ifdef CONFIG_PPC_MEM_KEYS
>  static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
>  			      struct vm_area_struct *vma)
> @@ -480,7 +400,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  	int is_user = user_mode(regs);
>  	int is_write = page_fault_is_write(error_code);
>  	vm_fault_t fault, major = 0;
> -	bool must_retry = false;
>  	bool kprobe_fault = kprobe_page_fault(regs, 11);
>  
>  	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
> @@ -569,30 +488,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  	vma = find_vma(mm, address);
>  	if (unlikely(!vma))
>  		return bad_area(regs, address);
> -	if (likely(vma->vm_start <= address))
> -		goto good_area;
> -	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
> -		return bad_area(regs, address);
>  
> -	/* The stack is being expanded, check if it's valid */
> -	if (unlikely(bad_stack_expansion(regs, address, vma, flags,
> -					 &must_retry))) {
> -		if (!must_retry)
> +	if (unlikely(vma->vm_start > address)) {
> +		if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
>  			return bad_area(regs, address);
>  
> -		mmap_read_unlock(mm);
> -		if (fault_in_pages_readable((const char __user *)regs->nip,
> -					    sizeof(unsigned int)))
> -			return bad_area_nosemaphore(regs, address);
> -		goto retry;
> +		if (unlikely(expand_stack(vma, address)))
> +			return bad_area(regs, address);
>  	}
>  
> -	/* Try to expand it */
> -	if (unlikely(expand_stack(vma, address)))
> -		return bad_area(regs, address);
> -
> -good_area:
> -
>  #ifdef CONFIG_PPC_MEM_KEYS
>  	if (unlikely(access_pkey_error(is_write, is_exec,
>  				       (error_code & DSISR_KEYFAULT), vma)))
> -- 
> 2.25.1

^ permalink raw reply

* Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
From: Nicholas Piggin @ 2020-07-23 14:09 UTC (permalink / raw)
  To: linuxppc-dev, Michael Ellerman
  Cc: linux-arch, Peter Zijlstra, Boqun Feng, linux-kernel, kvm-ppc,
	virtualization, Ingo Molnar, Waiman Long, Will Deacon
In-Reply-To: <874kqhvu1v.fsf@mpe.ellerman.id.au>

Excerpts from Michael Ellerman's message of July 9, 2020 8:53 pm:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/powerpc/include/asm/paravirt.h           | 28 ++++++++
>>  arch/powerpc/include/asm/qspinlock.h          | 66 +++++++++++++++++++
>>  arch/powerpc/include/asm/qspinlock_paravirt.h |  7 ++
>>  arch/powerpc/platforms/pseries/Kconfig        |  5 ++
>>  arch/powerpc/platforms/pseries/setup.c        |  6 +-
>>  include/asm-generic/qspinlock.h               |  2 +
> 
> Another ack?
> 
>> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
>> index 7a8546660a63..f2d51f929cf5 100644
>> --- a/arch/powerpc/include/asm/paravirt.h
>> +++ b/arch/powerpc/include/asm/paravirt.h
>> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
>>  {
>>  	___bad_yield_to_preempted(); /* This would be a bug */
>>  }
>> +
>> +extern void ___bad_yield_to_any(void);
>> +static inline void yield_to_any(void)
>> +{
>> +	___bad_yield_to_any(); /* This would be a bug */
>> +}
> 
> Why do we do that rather than just not defining yield_to_any() at all
> and letting the build fail on that?
> 
> There's a condition somewhere that we know will be false at compile time
> and drop the call before linking?

Mainly so you could use it in if (IS_ENABLED()) blocks, but would still
catch the (presumably buggy) case where something calls it without the
option set.

I think I had it arranged a different way that was using IS_ENABLED 
earlier and changed it but might as well keep it this way.

> 
>> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> new file mode 100644
>> index 000000000000..750d1b5e0202
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> @@ -0,0 +1,7 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
>> +#define __ASM_QSPINLOCK_PARAVIRT_H
> 
> _ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.
> 
>> +
>> +EXPORT_SYMBOL(__pv_queued_spin_unlock);
> 
> Why's that in a header? Should that (eventually) go with the generic implementation?

Yeah the qspinlock_paravirt.h header is a bit weird and only gets 
included into kernel/locking/qspinlock.c

>> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
>> index 24c18362e5ea..756e727b383f 100644
>> --- a/arch/powerpc/platforms/pseries/Kconfig
>> +++ b/arch/powerpc/platforms/pseries/Kconfig
>> @@ -25,9 +25,14 @@ config PPC_PSERIES
>>  	select SWIOTLB
>>  	default y
>>  
>> +config PARAVIRT_SPINLOCKS
>> +	bool
>> +	default n
> 
> default n is the default.
> 
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index 2db8469e475f..747a203d9453 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
>>  	if (firmware_has_feature(FW_FEATURE_LPAR)) {
>>  		vpa_init(boot_cpuid);
>>  
>> -		if (lppaca_shared_proc(get_lppaca()))
>> +		if (lppaca_shared_proc(get_lppaca())) {
>>  			static_branch_enable(&shared_processor);
>> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
>> +			pv_spinlocks_init();
>> +#endif
>> +		}
> 
> We could avoid the ifdef with this I think?

Yes I think so.

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH v2 2/2] KVM: PPC: Book3S HV: rework secure mem slot dropping
From: Laurent Dufour @ 2020-07-23 14:06 UTC (permalink / raw)
  To: bharata, linuxram
  Cc: linux-kernel, kvm-ppc, paulus, sukadev, linuxppc-dev, bauerman
In-Reply-To: <4a3caeaf-cd0c-fcd7-0a97-f367a5f78dac@linux.ibm.com>

Le 23/07/2020 à 14:32, Laurent Dufour a écrit :
> Le 23/07/2020 à 05:36, Bharata B Rao a écrit :
>> On Tue, Jul 21, 2020 at 12:42:02PM +0200, Laurent Dufour wrote:
>>> When a secure memslot is dropped, all the pages backed in the secure device
>>> (aka really backed by secure memory by the Ultravisor) should be paged out
>>> to a normal page. Previously, this was achieved by triggering the page
>>> fault mechanism which is calling kvmppc_svm_page_out() on each pages.
>>>
>>> This can't work when hot unplugging a memory slot because the memory slot
>>> is flagged as invalid and gfn_to_pfn() is then not trying to access the
>>> page, so the page fault mechanism is not triggered.
>>>
>>> Since the final goal is to make a call to kvmppc_svm_page_out() it seems
>>> simpler to directly calling it instead of triggering such a mechanism. This
>>> way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
>>> memslot.
>>>
>>> Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
>>> the call to __kvmppc_svm_page_out() is made.
>>> As __kvmppc_svm_page_out needs the vma pointer to migrate the pages, the
>>> VMA is fetched in a lazy way, to not trigger find_vma() all the time. In
>>> addition, the mmap_sem is held in read mode during that time, not in write
>>> mode since the virtual memory layout is not impacted, and
>>> kvm->arch.uvmem_lock prevents concurrent operation on the secure device.
>>>
>>> Cc: Ram Pai <linuxram@us.ibm.com>
>>> Cc: Bharata B Rao <bharata@linux.ibm.com>
>>> Cc: Paul Mackerras <paulus@ozlabs.org>
>>> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
>>> ---
>>>   arch/powerpc/kvm/book3s_hv_uvmem.c | 54 ++++++++++++++++++++----------
>>>   1 file changed, 37 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
>>> b/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> index 5a4b02d3f651..ba5c7c77cc3a 100644
>>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> @@ -624,35 +624,55 @@ static inline int kvmppc_svm_page_out(struct 
>>> vm_area_struct *vma,
>>>    * fault on them, do fault time migration to replace the device PTEs in
>>>    * QEMU page table with normal PTEs from newly allocated pages.
>>>    */
>>> -void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
>>> +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
>>>                    struct kvm *kvm, bool skip_page_out)
>>>   {
>>>       int i;
>>>       struct kvmppc_uvmem_page_pvt *pvt;
>>> -    unsigned long pfn, uvmem_pfn;
>>> -    unsigned long gfn = free->base_gfn;
>>> +    struct page *uvmem_page;
>>> +    struct vm_area_struct *vma = NULL;
>>> +    unsigned long uvmem_pfn, gfn;
>>> +    unsigned long addr, end;
>>> +
>>> +    mmap_read_lock(kvm->mm);
>>> +
>>> +    addr = slot->userspace_addr;
>>
>> We typically use gfn_to_hva() for that, but that won't work for a
>> memslot that is already marked INVALID which is the case here.
>> I think it is ok to access slot->userspace_addr here of an INVALID
>> memslot, but just thought of explictly bringing this up.
> 
> Which is explicitly mentioned above in the patch's description:
> 
> This can't work when hot unplugging a memory slot because the memory slot
> is flagged as invalid and gfn_to_pfn() is then not trying to access the
> page, so the page fault mechanism is not triggered.
> 
>>
>>> +    end = addr + (slot->npages * PAGE_SIZE);
>>> -    for (i = free->npages; i; --i, ++gfn) {
>>> -        struct page *uvmem_page;
>>> +    gfn = slot->base_gfn;
>>> +    for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
>>> +
>>> +        /* Fetch the VMA if addr is not in the latest fetched one */
>>> +        if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) {
>>> +            vma = find_vma_intersection(kvm->mm, addr, end);
>>> +            if (!vma ||
>>> +                vma->vm_start > addr || vma->vm_end < end) {
>>> +                pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
>>> +                break;
>>> +            }
>>> +        }
>>
>> In Ram's series, kvmppc_memslot_page_merge() also walks the VMAs spanning
>> the memslot, but it uses a different logic for the same. Why can't these
>> two cases use the same method to walk the VMAs? Is there anything subtly
>> different between the two cases?
> 
> This is probably doable. At the time I wrote that patch, the 
> kvmppc_memslot_page_merge() was not yet introduced AFAIR.
> 
> This being said, it'd help a lot to factorize that code... I'll let Ram deal with
> that ;)

Indeed, I don't think this is relevant: the loop in kvmppc_memslot_page_merge()
deals with one call (to ksm_advise) per VMA, while this code is dealing with one
call per page of the VMA, which is completely different.

I don't think merging the two would be a good idea.

Cheers,
Laurent.

^ permalink raw reply

* Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
From: Peter Zijlstra @ 2020-07-23 14:00 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-arch, Boqun Feng, virtualization, linuxppc-dev,
	Nicholas Piggin, linux-kernel, Ingo Molnar, kvm-ppc, Will Deacon
In-Reply-To: <8265d782-4e50-a9b2-a908-0cb588ffa09c@redhat.com>

On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote:
> We don't really need to do a pv_spinlocks_init() if pv_kick() isn't
> supported.

Waiman, if you cannot explain how not having kick is a sane thing, what
are you saying here?

^ permalink raw reply

* Re: [PATCH v2 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Ard Biesheuvel @ 2020-07-23 12:42 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Thomas Gleixner, Jiri Kosina, Anup Patel,
	Linux Kernel Mailing List, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, Omar Sandoval, open list:S390,
	Joe Lawrence, Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Yonghong Song, Iurii Zaikin, Andrii Nakryiko, Thomas Huth,
	Vasily Gorbik, moderated list:ARM PORT, Daniel Axtens,
	Damien Le Moal, Martin KaFai Lau, Song Liu, Josh Poimboeuf,
	Heiko Carstens, Alexei Starovoitov, Atish Patra, Will Deacon,
	Daniel Borkmann, Masahiro Yamada, Nayna Jain, Ley Foon Tan,
	Christian Borntraeger, Sami Tolvanen, Naveen N. Rao, Mao Han,
	Marco Elver, Steven Rostedt, Babu Moger, Borislav Petkov,
	Greentime Hu, Ben Dooks, Guan Xuetao, Thomas Bogendoerfer,
	open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra, David Howells,
	Amit Daniel Kachhap, Sandipan Das, H. Peter Anvin,
	open list:SPARC + UltraSPARC sparc/sparc64,
	open list:RISC-V ARCHITECTURE, Miroslav Benes, Jiri Olsa,
	Tiezhu Yang, Vincenzo Frascino, Anders Roxell, Sven Schnelle,
	maintainer:X86 ARCHITECTURE 32-BIT AND 64-BIT, Russell King,
	Mike Rapoport, Ingo Molnar, Albert Ou, Paul E. McKenney,
	Paul Walmsley, KP Singh, Dmitry Vyukov, Nick Hu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, open list:MIPS,
	Palmer Dabbelt, open list:LINUX FOR POWERPC 32-BIT AND 64-BIT
In-Reply-To: <20200723015127.GE45081@linux.intel.com>

On Thu, 23 Jul 2020 at 04:52, Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> On Thu, Jul 16, 2020 at 06:49:09PM +0200, Christophe Leroy wrote:
> > Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> a écrit :
> >
> > > Rename module_alloc() to text_alloc() and module_memfree() to
> > > text_memfree(), and move them to kernel/text.c, which is unconditionally
> > > compiled to the kernel proper. This allows kprobes, ftrace and bpf to
> > > allocate space for executable code without requiring to compile the modules
> > > support (CONFIG_MODULES=y) in.
> >
> > You are not changing enough in powerpc to have this work.
> > On powerpc 32 bits (6xx), when STRICT_KERNEL_RWX is selected, the vmalloc
> > space is set to NX (no exec) at segment level (ie by 256Mbytes zone) unless
> > CONFIG_MODULES is selected.
> >
> > Christophe
>
> This has been deduced down to:
>
> https://lore.kernel.org/lkml/20200717030422.679972-1-jarkko.sakkinen@linux.intel.com/
>
> I.e. not intruding PPC anymore :-)
>

Ok, so after the elaborate discussion we had between Jessica, Russell,
Peter, Will, Mark, you and myself, where we pointed out that
a) a single text_alloc() abstraction for bpf, kprobes and ftrace does
not fit other architectures very well, and
b) that module_alloc() is not suitable as a default to base text_alloc() on,

you went ahead and implemented that anyway, but only cc'ing Peter,
akpm, Masami and the mm list this time?

Sorry, but that is not how it works. Once people get pulled into a
discussion, you cannot dismiss them or their feedback like that and go
off and do your own thing anyway. Generic features like this are
tricky to get right, and it will likely take many iterations and input
from many different people.

^ permalink raw reply

* Re: [PATCH 2/5] powerpc: Allow 4096 bytes of stack expansion for the signal frame
From: Daniel Axtens @ 2020-07-23 13:35 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: linux-arch, hughd, linux-kernel
In-Reply-To: <20200703141327.1732550-2-mpe@ellerman.id.au>

Hi Michael,

Unfortunately, this patch doesn't completely solve the problem.

Trying the original reproducer, I'm still able to trigger the crash even
with this patch, although not 100% of the time. (If I turn ASLR off
outside of tmux it reliably crashes, if I turn ASLR off _inside_ of tmux
it reliably succeeds; all of this is on a serial console.)

./foo 1241000 & sleep 1; killall -USR1 foo; echo ok

If I add some debugging information, I see that I'm getting
address + 4096 = 7fffffed0fa0
gpr1 =           7fffffed1020

So address + 4096 is 0x80 bytes below the 4k window. I haven't been able
to figure out why, gdb gives me a NIP in __kernel_sigtramp_rt64 but I
don't know what to make of that.

Kind regards,
Daniel

P.S. I don't know what your policy on linking to kernel bugzilla is, but
if you want:

Link: https://bugzilla.kernel.org/show_bug.cgi?id=205183


> Reported-by: Tom Lane <tgl@sss.pgh.pa.us>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/mm/fault.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 641fc5f3d7dd..ed01329dd12b 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -274,7 +274,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>  	/*
>  	 * N.B. The POWER/Open ABI allows programs to access up to
>  	 * 288 bytes below the stack pointer.
> -	 * The kernel signal delivery code writes up to about 1.5kB
> +	 * The kernel signal delivery code writes up to 4KB
>  	 * below the stack pointer (r1) before decrementing it.
>  	 * The exec code can write slightly over 640kB to the stack
>  	 * before setting the user r1.  Thus we allow the stack to
> @@ -299,7 +299,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>  		 * between the last mapped region and the stack will
>  		 * expand the stack rather than segfaulting.
>  		 */
> -		if (address + 2048 >= uregs->gpr[1])
> +		if (address + 4096 >= uregs->gpr[1])
>  			return false;
>  
>  		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> -- 
> 2.25.1

^ permalink raw reply

* Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks
From: Nicholas Piggin @ 2020-07-23 13:30 UTC (permalink / raw)
  To: Waiman Long, Peter Zijlstra
  Cc: linux-arch, Will Deacon, Boqun Feng, linux-kernel, kvm-ppc,
	virtualization, Ingo Molnar, linuxppc-dev
In-Reply-To: <eaabf501-80fe-dd15-c03c-f75ce4f75877@redhat.com>

Excerpts from Waiman Long's message of July 22, 2020 12:36 am:
> On 7/21/20 7:08 AM, Nicholas Piggin wrote:
>> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
>> index b752d34517b3..26d8766a1106 100644
>> --- a/arch/powerpc/include/asm/qspinlock.h
>> +++ b/arch/powerpc/include/asm/qspinlock.h
>> @@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
>>   
>>   #else
>>   extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
>> +extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>>   #endif
>>   
>>   static __always_inline void queued_spin_lock(struct qspinlock *lock)
>>   {
>> -	u32 val = 0;
>> -
>> -	if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
>> +	atomic_t *a = &lock->val;
>> +	u32 val;
>> +
>> +again:
>> +	asm volatile(
>> +"1:\t"	PPC_LWARX(%0,0,%1,1) "	# queued_spin_lock			\n"
>> +	: "=&r" (val)
>> +	: "r" (&a->counter)
>> +	: "memory");
>> +
>> +	if (likely(val == 0)) {
>> +		asm_volatile_goto(
>> +	"	stwcx.	%0,0,%1							\n"
>> +	"	bne-	%l[again]						\n"
>> +	"\t"	PPC_ACQUIRE_BARRIER "						\n"
>> +		:
>> +		: "r"(_Q_LOCKED_VAL), "r" (&a->counter)
>> +		: "cr0", "memory"
>> +		: again );
>>   		return;
>> -
>> -	queued_spin_lock_slowpath(lock, val);
>> +	}
>> +
>> +	if (likely(val == _Q_LOCKED_VAL)) {
>> +		asm_volatile_goto(
>> +	"	stwcx.	%0,0,%1							\n"
>> +	"	bne-	%l[again]						\n"
>> +		:
>> +		: "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r" (&a->counter)
>> +		: "cr0", "memory"
>> +		: again );
>> +
>> +		atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
>> +//		clear_pending_set_locked(lock);
>> +		WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
>> +//		lockevent_inc(lock_pending);
>> +		return;
>> +	}
>> +
>> +	if (val == _Q_PENDING_VAL) {
>> +		int cnt = _Q_PENDING_LOOPS;
>> +		val = atomic_cond_read_relaxed(a,
>> +					       (VAL != _Q_PENDING_VAL) || !cnt--);
>> +		if (!(val & ~_Q_LOCKED_MASK))
>> +			goto again;
>> +        }
>> +	queued_spin_lock_slowpath_queue(lock);
>>   }
>>   #define queued_spin_lock queued_spin_lock
>>   
> 
> I am fine with the arch code override some part of the generic code.

Cool.

>> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
>> index b9515fcc9b29..ebcc6f5d99d5 100644
>> --- a/kernel/locking/qspinlock.c
>> +++ b/kernel/locking/qspinlock.c
>> @@ -287,10 +287,14 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
>>   
>>   #ifdef CONFIG_PARAVIRT_SPINLOCKS
>>   #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
>> +#define queued_spin_lock_slowpath_queue	native_queued_spin_lock_slowpath_queue
>>   #endif
>>   
>>   #endif /* _GEN_PV_LOCK_SLOWPATH */
>>   
>> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
>> +
>>   /**
>>    * queued_spin_lock_slowpath - acquire the queued spinlock
>>    * @lock: Pointer to queued spinlock structure
>> @@ -314,12 +318,6 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
>>    */
>>   void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>>   {
>> -	struct mcs_spinlock *prev, *next, *node;
>> -	u32 old, tail;
>> -	int idx;
>> -
>> -	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> -
>>   	if (pv_enabled())
>>   		goto pv_queue;
>>   
>> @@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>>   queue:
>>   	lockevent_inc(lock_slowpath);
>>   pv_queue:
>> +	__queued_spin_lock_slowpath_queue(lock);
>> +}
>> +EXPORT_SYMBOL(queued_spin_lock_slowpath);
>> +
>> +void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
>> +{
>> +	lockevent_inc(lock_slowpath);
>> +	__queued_spin_lock_slowpath_queue(lock);
>> +}
>> +EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
>> +
>> +static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
>> +{
>> +	struct mcs_spinlock *prev, *next, *node;
>> +	u32 old, tail;
>> +	u32 val;
>> +	int idx;
>> +
>> +	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
>> +
>>   	node = this_cpu_ptr(&qnodes[0].mcs);
>>   	idx = node->count++;
>>   	tail = encode_tail(smp_processor_id(), idx);
>> @@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>>   	 */
>>   	__this_cpu_dec(qnodes[0].mcs.count);
>>   }
>> -EXPORT_SYMBOL(queued_spin_lock_slowpath);
>>   
>>   /*
>>    * Generate the paravirt code for queued_spin_unlock_slowpath().
>>
> I would prefer to extract out the pending bit handling code out into a 
> separate helper function which can be overridden by the arch code 
> instead of breaking the slowpath into 2 pieces.

You mean have the arch provide a queued_spin_lock_slowpath_pending 
function that the slow path calls?

I would actually prefer the pending handling can be made inline in
the queued_spin_lock function, especially with out-of-line locks it 
makes sense to put it there.

We could ifdef out queued_spin_lock_slowpath_queue if it's not used,
then __queued_spin_lock_slowpath_queue would be inlined into the
caller so there would be no split?

Thanks,
Nick

^ permalink raw reply

* Re: [PATCHv3 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents
From: Nathan Lynch @ 2020-07-23 13:27 UTC (permalink / raw)
  To: Pingfan Liu; +Cc: cheloha, kexec, ldufour, linuxppc-dev, Hari Bathini
In-Reply-To: <1595382730-10565-2-git-send-email-kernelfans@gmail.com>

Pingfan Liu <kernelfans@gmail.com> writes:
> A bug is observed on pseries by taking the following steps on rhel:
> -1. drmgr -c mem -r -q 5
> -2. echo c > /proc/sysrq-trigger
>
> And then, the failure looks like:
> kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/
> kdump: saving vmcore-dmesg.txt
> kdump: saving vmcore-dmesg.txt complete
> kdump: saving vmcore
>  Checking for memory holes                         : [  0.0 %] /                   Checking for memory holes                         : [100.0 %] |                   Excluding unnecessary pages                       : [100.0 %] \                   Copying data                                      : [  0.3 %] -          eta: 38s[   44.337636] hash-mmu: mm: Hashing failure ! EA=0x7fffba400000 access=0x8000000000000004 current=makedumpfile
> [   44.337663] hash-mmu:     trap=0x300 vsid=0x13a109c ssize=1 base psize=2 psize 2 pte=0xc000000050000504
> [   44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba400000 access=0x8000000000000004 current=makedumpfile
> [   44.337692] hash-mmu:     trap=0x300 vsid=0x13a109c ssize=1 base psize=2 psize 2 pte=0xc000000050000504
> [   44.337708] makedumpfile[469]: unhandled signal 7 at 00007fffba400000 nip 00007fffbbc4d7fc lr 000000011356ca3c code 2
> [   44.338548] Core dump to |/bin/false pipe failed
> /lib/kdump-lib-initramfs.sh: line 98:   469 Bus error               $CORE_COLLECTOR /proc/vmcore $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete
> kdump: saving vmcore failed
>
> * Root cause *
>   After analyzing, it turns out that in the current implementation,
> when hot-removing lmb, the KOBJ_REMOVE event ejects before the dt updating as
> the code __remove_memory() comes before drmem_update_dt().
> So in kdump kernel, when read_from_oldmem() resorts to
> pSeries_lpar_hpte_insert() to install hpte, but fails with -2 due to
> non-exist pfn. And finally, low_hash_fault() raise SIGBUS to process, as it
> can be observed "Bus error"
>
> From a viewpoint of listener and publisher, the publisher notifies the
> listener before data is ready.  This introduces a problem where udev
> launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before
> updating. And in capture kernel, makedumpfile will access the memory based
> on the stale dt info, and hit a SIGBUS error due to an un-existed lmb.
>
> * Fix *
>   In order to fix this issue, update dt before __remove_memory(), and
> accordingly the same rule in hot-add path.
>
> This will introduce extra dt updating payload for each involved lmb when hotplug.
> But it should be fine since drmem_update_dt() is memory based operation and
> hotplug is not a hot path.

This is a great analysis, but the performance implications of the change
are grave. The add/remove paths here are already O(n), where n is the
quantity of memory assigned to the LP; this change would make them O(n^2):

dlpar_memory_add_by_count
  for_each_drmem_lmb             <--
    dlpar_add_lmb
      drmem_update_dt(_v1|_v2)
        for_each_drmem_lmb       <--

Memory add/remove isn't a hot path but quadratic runtime complexity
isn't acceptable. Its current performance is bad enough that I have
internal bugs open on it.

Not to mention we leak memory every time drmem_update_dt is called
because we can't safely free device tree properties :-(

Also note that this sort of reverts (fixes?) 063b8b1251fd
("powerpc/pseries/memory-hotplug: Only update DT once per memory DLPAR
request").

^ permalink raw reply

* Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
From: Nicholas Piggin @ 2020-07-23 13:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Alexey Kardashevskiy, Will Deacon, linux-kernel,
	Ingo Molnar, linuxppc-dev
In-Reply-To: <20200723114010.GO5523@worktop.programming.kicks-ass.net>

Excerpts from Peter Zijlstra's message of July 23, 2020 9:40 pm:
> On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:
> 
>> diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
>> index 3a0db7b0b46e..35060be09073 100644
>> --- a/arch/powerpc/include/asm/hw_irq.h
>> +++ b/arch/powerpc/include/asm/hw_irq.h
>> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
>>  #define powerpc_local_irq_pmu_save(flags)			\
>>  	 do {							\
>>  		raw_local_irq_pmu_save(flags);			\
>> -		trace_hardirqs_off();				\
>> +		if (!raw_irqs_disabled_flags(flags))		\
>> +			trace_hardirqs_off();			\
>>  	} while(0)
>>  #define powerpc_local_irq_pmu_restore(flags)			\
>>  	do {							\
>> -		if (raw_irqs_disabled_flags(flags)) {		\
>> -			raw_local_irq_pmu_restore(flags);	\
>> -			trace_hardirqs_off();			\
>> -		} else {					\
>> +		if (!raw_irqs_disabled_flags(flags))		\
>>  			trace_hardirqs_on();			\
>> -			raw_local_irq_pmu_restore(flags);	\
>> -		}						\
>> +		raw_local_irq_pmu_restore(flags);		\
>>  	} while(0)
> 
> You shouldn't be calling lockdep from NMI context!

After this patch it doesn't.

trace_hardirqs_on/off implementation appears to expect to be called in NMI 
context though, for some reason.

> That is, I recently
> added suport for that on x86:
> 
>   https://lkml.kernel.org/r/20200623083721.155449112@infradead.org
>   https://lkml.kernel.org/r/20200623083721.216740948@infradead.org
> 
> But you need to be very careful on how you order things, as you can see
> the above relies on preempt_count() already having been incremented with
> NMI_MASK.

Hmm. My patch seems simpler.

I don't know this stuff very well, I don't really understand what your patch 
enables for x86 but at least it shouldn't be incompatible with this one 
AFAIKS.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v2 2/2] KVM: PPC: Book3S HV: rework secure mem slot dropping
From: Laurent Dufour @ 2020-07-23 12:32 UTC (permalink / raw)
  To: bharata, linuxram
  Cc: linux-kernel, kvm-ppc, paulus, sukadev, linuxppc-dev, bauerman
In-Reply-To: <20200723033600.GS7902@in.ibm.com>

Le 23/07/2020 à 05:36, Bharata B Rao a écrit :
> On Tue, Jul 21, 2020 at 12:42:02PM +0200, Laurent Dufour wrote:
>> When a secure memslot is dropped, all the pages backed in the secure device
>> (aka really backed by secure memory by the Ultravisor) should be paged out
>> to a normal page. Previously, this was achieved by triggering the page
>> fault mechanism which is calling kvmppc_svm_page_out() on each pages.
>>
>> This can't work when hot unplugging a memory slot because the memory slot
>> is flagged as invalid and gfn_to_pfn() is then not trying to access the
>> page, so the page fault mechanism is not triggered.
>>
>> Since the final goal is to make a call to kvmppc_svm_page_out() it seems
>> simpler to directly calling it instead of triggering such a mechanism. This
>> way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
>> memslot.
>>
>> Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
>> the call to __kvmppc_svm_page_out() is made.
>> As __kvmppc_svm_page_out needs the vma pointer to migrate the pages, the
>> VMA is fetched in a lazy way, to not trigger find_vma() all the time. In
>> addition, the mmap_sem is help in read mode during that time, not in write
>> mode since the virual memory layout is not impacted, and
>> kvm->arch.uvmem_lock prevents concurrent operation on the secure device.
>>
>> Cc: Ram Pai <linuxram@us.ibm.com>
>> Cc: Bharata B Rao <bharata@linux.ibm.com>
>> Cc: Paul Mackerras <paulus@ozlabs.org>
>> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
>> ---
>>   arch/powerpc/kvm/book3s_hv_uvmem.c | 54 ++++++++++++++++++++----------
>>   1 file changed, 37 insertions(+), 17 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> index 5a4b02d3f651..ba5c7c77cc3a 100644
>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>> @@ -624,35 +624,55 @@ static inline int kvmppc_svm_page_out(struct vm_area_struct *vma,
>>    * fault on them, do fault time migration to replace the device PTEs in
>>    * QEMU page table with normal PTEs from newly allocated pages.
>>    */
>> -void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free,
>> +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot,
>>   			     struct kvm *kvm, bool skip_page_out)
>>   {
>>   	int i;
>>   	struct kvmppc_uvmem_page_pvt *pvt;
>> -	unsigned long pfn, uvmem_pfn;
>> -	unsigned long gfn = free->base_gfn;
>> +	struct page *uvmem_page;
>> +	struct vm_area_struct *vma = NULL;
>> +	unsigned long uvmem_pfn, gfn;
>> +	unsigned long addr, end;
>> +
>> +	mmap_read_lock(kvm->mm);
>> +
>> +	addr = slot->userspace_addr;
> 
> We typically use gfn_to_hva() for that, but that won't work for a
> memslot that is already marked INVALID which is the case here.
> I think it is ok to access slot->userspace_addr here of an INVALID
> memslot, but just thought of explictly bringing this up.

Which is explicitly mentioned above in the patch's description:

This can't work when hot unplugging a memory slot because the memory slot
is flagged as invalid and gfn_to_pfn() is then not trying to access the
page, so the page fault mechanism is not triggered.

> 
>> +	end = addr + (slot->npages * PAGE_SIZE);
>>   
>> -	for (i = free->npages; i; --i, ++gfn) {
>> -		struct page *uvmem_page;
>> +	gfn = slot->base_gfn;
>> +	for (i = slot->npages; i; --i, ++gfn, addr += PAGE_SIZE) {
>> +
>> +		/* Fetch the VMA if addr is not in the latest fetched one */
>> +		if (!vma || (addr < vma->vm_start || addr >= vma->vm_end)) {
>> +			vma = find_vma_intersection(kvm->mm, addr, end);
>> +			if (!vma ||
>> +			    vma->vm_start > addr || vma->vm_end < end) {
>> +				pr_err("Can't find VMA for gfn:0x%lx\n", gfn);
>> +				break;
>> +			}
>> +		}
> 
> In Ram's series, kvmppc_memslot_page_merge() also walks the VMAs spanning
> the memslot, but it uses a different logic for the same. Why can't these
> two cases use the same method to walk the VMAs? Is there anything subtly
> different between the two cases?

This is probably doable. At the time I wrote that patch, the 
kvmppc_memslot_page_merge() was not yet introduced AFAIR.

This being said, it'd help a lot to factorize that code... I'll let Ram deal with
that ;)

Cheers,
Laurent.



^ permalink raw reply

* Re: [v4 4/5] KVM: PPC: Book3S HV: retry page migration before erroring-out
From: Ram Pai @ 2020-07-23 11:44 UTC (permalink / raw)
  To: Bharata B Rao
  Cc: ldufour, cclaudio, kvm-ppc, sathnaga, aneesh.kumar, sukadev,
	linuxppc-dev, bauerman, david
In-Reply-To: <20200723061344.GB1082478@in.ibm.com>

I am dropping this patch based on our conversation, where we agreed that we
need to root-cause the migration failure.

On Thu, Jul 23, 2020 at 11:43:44AM +0530, Bharata B Rao wrote:
> On Fri, Jul 17, 2020 at 01:00:26AM -0700, Ram Pai wrote:
> > @@ -812,7 +842,7 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> >  	struct vm_area_struct *vma;
> >  	int srcu_idx;
> >  	unsigned long gfn = gpa >> page_shift;
> > -	int ret;
> > +	int ret, repeat_count = REPEAT_COUNT;
> >  
> >  	if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
> >  		return H_UNSUPPORTED;
> > @@ -826,34 +856,44 @@ unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> >  	if (flags & H_PAGE_IN_SHARED)
> >  		return kvmppc_share_page(kvm, gpa, page_shift);
> >  
> > -	ret = H_PARAMETER;
> >  	srcu_idx = srcu_read_lock(&kvm->srcu);
> > -	mmap_read_lock(kvm->mm);
> >  
> > -	start = gfn_to_hva(kvm, gfn);
> > -	if (kvm_is_error_hva(start))
> > -		goto out;
> > -
> > -	mutex_lock(&kvm->arch.uvmem_lock);
> >  	/* Fail the page-in request of an already paged-in page */
> > -	if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL))
> > -		goto out_unlock;
> > +	mutex_lock(&kvm->arch.uvmem_lock);
> > +	ret = kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL);
> > +	mutex_unlock(&kvm->arch.uvmem_lock);
> 
> Same comment as for the prev patch. I don't think you can release
> the lock here.
> 
> > +	if (ret) {
> > +		srcu_read_unlock(&kvm->srcu, srcu_idx);
> > +		return H_PARAMETER;
> > +	}
> >  
> > -	end = start + (1UL << page_shift);
> > -	vma = find_vma_intersection(kvm->mm, start, end);
> > -	if (!vma || vma->vm_start > start || vma->vm_end < end)
> > -		goto out_unlock;
> > +	do {
> > +		ret = H_PARAMETER;
> > +		mmap_read_lock(kvm->mm);
> >  
> > -	if (kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift,
> > -				true))
> > -		goto out_unlock;
> > +		start = gfn_to_hva(kvm, gfn);
> > +		if (kvm_is_error_hva(start)) {
> > +			mmap_read_unlock(kvm->mm);
> > +			break;
> > +		}
> >  
> > -	ret = H_SUCCESS;
> > +		end = start + (1UL << page_shift);
> > +		vma = find_vma_intersection(kvm->mm, start, end);
> > +		if (!vma || vma->vm_start > start || vma->vm_end < end) {
> > +			mmap_read_unlock(kvm->mm);
> > +			break;
> > +		}
> > +
> > +		mutex_lock(&kvm->arch.uvmem_lock);
> > +		ret = kvmppc_svm_migrate_page(vma, start, end, gpa, kvm, page_shift, true);
> > +		mutex_unlock(&kvm->arch.uvmem_lock);
> > +
> > +		mmap_read_unlock(kvm->mm);
> > +	} while (ret == -2 && repeat_count--);
> > +
> > +	if (ret == -2)
> > +		ret = H_BUSY;
> >  
> > -out_unlock:
> > -	mutex_unlock(&kvm->arch.uvmem_lock);
> > -out:
> > -	mmap_read_unlock(kvm->mm);
> >  	srcu_read_unlock(&kvm->srcu, srcu_idx);
> >  	return ret;
> >  }
> > -- 
> > 1.8.3.1

-- 
Ram Pai

^ permalink raw reply

* Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
From: Peter Zijlstra @ 2020-07-23 11:40 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Alexey Kardashevskiy, linuxppc-dev, linux-kernel,
	Ingo Molnar, Will Deacon
In-Reply-To: <20200723105615.1268126-1-npiggin@gmail.com>

On Thu, Jul 23, 2020 at 08:56:14PM +1000, Nicholas Piggin wrote:

> diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
> index 3a0db7b0b46e..35060be09073 100644
> --- a/arch/powerpc/include/asm/hw_irq.h
> +++ b/arch/powerpc/include/asm/hw_irq.h
> @@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
>  #define powerpc_local_irq_pmu_save(flags)			\
>  	 do {							\
>  		raw_local_irq_pmu_save(flags);			\
> -		trace_hardirqs_off();				\
> +		if (!raw_irqs_disabled_flags(flags))		\
> +			trace_hardirqs_off();			\
>  	} while(0)
>  #define powerpc_local_irq_pmu_restore(flags)			\
>  	do {							\
> -		if (raw_irqs_disabled_flags(flags)) {		\
> -			raw_local_irq_pmu_restore(flags);	\
> -			trace_hardirqs_off();			\
> -		} else {					\
> +		if (!raw_irqs_disabled_flags(flags))		\
>  			trace_hardirqs_on();			\
> -			raw_local_irq_pmu_restore(flags);	\
> -		}						\
> +		raw_local_irq_pmu_restore(flags);		\
>  	} while(0)

You shouldn't be calling lockdep from NMI context! That is, I recently
added support for that on x86:

  https://lkml.kernel.org/r/20200623083721.155449112@infradead.org
  https://lkml.kernel.org/r/20200623083721.216740948@infradead.org

But you need to be very careful on how you order things, as you can see
the above relies on preempt_count() already having been incremented with
NMI_MASK.

^ permalink raw reply

* Re: [v4 3/5] KVM: PPC: Book3S HV: in H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs.
From: Ram Pai @ 2020-07-23 11:39 UTC (permalink / raw)
  To: Bharata B Rao
  Cc: ldufour, cclaudio, kvm-ppc, sathnaga, aneesh.kumar, sukadev,
	linuxppc-dev, bauerman, david
In-Reply-To: <20200723061037.GA1082478@in.ibm.com>

On Thu, Jul 23, 2020 at 11:40:37AM +0530, Bharata B Rao wrote:
> On Fri, Jul 17, 2020 at 01:00:25AM -0700, Ram Pai wrote:
> >  
> > +int kvmppc_uv_migrate_mem_slot(struct kvm *kvm,
> > +		const struct kvm_memory_slot *memslot)
> 
> Don't see any callers for this outside of this file, so why not static?
> 
> > +{
> > +	unsigned long gfn = memslot->base_gfn;
> > +	struct vm_area_struct *vma;
> > +	unsigned long start, end;
> > +	int ret = 0;
> > +
> > +	while (kvmppc_next_nontransitioned_gfn(memslot, kvm, &gfn)) {
> 
> So you checked the state of gfn under uvmem_lock above, but release
> it too.
> 
> > +
> > +		mmap_read_lock(kvm->mm);
> > +		start = gfn_to_hva(kvm, gfn);
> > +		if (kvm_is_error_hva(start)) {
> > +			ret = H_STATE;
> > +			goto next;
> > +		}
> > +
> > +		end = start + (1UL << PAGE_SHIFT);
> > +		vma = find_vma_intersection(kvm->mm, start, end);
> > +		if (!vma || vma->vm_start > start || vma->vm_end < end) {
> > +			ret = H_STATE;
> > +			goto next;
> > +		}
> > +
> > +		mutex_lock(&kvm->arch.uvmem_lock);
> > +		ret = kvmppc_svm_migrate_page(vma, start, end,
> > +				(gfn << PAGE_SHIFT), kvm, PAGE_SHIFT, false);
> 
> What is the guarantee that the gfn is in the same earlier state when you do
> do migration here?

Are you worried about the case where some other thread will sneak in and
migrate the GFN, and this migration request will become a duplicate one?

That is theoretically possible, though practically improbable. This
transition is attempted only when there is one vcpu active in the VM.

However, maybe we should not bake that assumption into this code.
Will remove that assumption.

RP

^ permalink raw reply

* Re: [v4 2/5] KVM: PPC: Book3S HV: track the state GFNs associated with secure VMs
From: Ram Pai @ 2020-07-23 11:14 UTC (permalink / raw)
  To: Bharata B Rao
  Cc: ldufour, cclaudio, kvm-ppc, sathnaga, aneesh.kumar, sukadev,
	linuxppc-dev, bauerman, david
In-Reply-To: <20200723044830.GT7902@in.ibm.com>

On Thu, Jul 23, 2020 at 10:18:30AM +0530, Bharata B Rao wrote:
> On Fri, Jul 17, 2020 at 01:00:24AM -0700, Ram Pai wrote:
> >  	pvt->gpa = gpa;
..snip..
> >  	pvt->kvm = kvm;
> > @@ -524,6 +663,7 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
> >  		uvmem_page = pfn_to_page(uvmem_pfn);
> >  		pvt = uvmem_page->zone_device_data;
> >  		pvt->skip_page_out = true;
> > +		pvt->remove_gfn = false;
> >  	}
> >  
> >  retry:
> > @@ -537,12 +677,16 @@ static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
> >  		uvmem_page = pfn_to_page(uvmem_pfn);
> >  		pvt = uvmem_page->zone_device_data;
> >  		pvt->skip_page_out = true;
> > +		pvt->remove_gfn = false;
> 
> This is the case of making an already secure page as shared page.
> A comment here as to why remove_gfn is set to false here will help.
> 
> Also isn't it by default false? Is there a situation where it starts
> out by default false, becomes true later and you are required to
> explicitly mark it false here?

It is by default false. And will be true when the GFN is
released/invalidated through kvmppc_uvmem_drop_pages().

It is marked false explicitly here, just to be safe, and protect
against any implicit changes.

> 
> Otherwise, Reviewed-by: Bharata B Rao <bharata@linux.ibm.com>
> 
Thanks for the review.

RP

^ permalink raw reply

* [PATCH 2/2] lockdep: warn on redundant or incorrect irq state changes
From: Nicholas Piggin @ 2020-07-23 10:56 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Nicholas Piggin,
	Alexey Kardashevskiy, Ingo Molnar, Will Deacon
In-Reply-To: <20200723105615.1268126-1-npiggin@gmail.com>

With the previous patch, lockdep hardirq state changes should not be
redundant. Softirq state changes already follow that pattern.

So warn on unexpected enable-when-enabled or disable-when-disabled
conditions, to catch possible errors or sloppy patterns that could
lead to similar bad behavior due to NMIs etc.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 kernel/locking/lockdep.c           | 80 +++++++++++++-----------------
 kernel/locking/lockdep_internals.h |  4 --
 kernel/locking/lockdep_proc.c      | 10 +---
 3 files changed, 35 insertions(+), 59 deletions(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..138458fb2234 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3649,15 +3649,8 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
-	if (unlikely(current->hardirqs_enabled)) {
-		/*
-		 * Neither irq nor preemption are disabled here
-		 * so this is racy by nature but losing one hit
-		 * in a stat is not a big deal.
-		 */
-		__debug_atomic_inc(redundant_hardirqs_on);
+	if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled))
 		return;
-	}
 
 	/*
 	 * We're enabling irqs and according to our state above irqs weren't
@@ -3695,15 +3688,8 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	if (unlikely(!debug_locks || curr->lockdep_recursion))
 		return;
 
-	if (curr->hardirqs_enabled) {
-		/*
-		 * Neither irq nor preemption are disabled here
-		 * so this is racy by nature but losing one hit
-		 * in a stat is not a big deal.
-		 */
-		__debug_atomic_inc(redundant_hardirqs_on);
+	if (DEBUG_LOCKS_WARN_ON(curr->hardirqs_enabled))
 		return;
-	}
 
 	/*
 	 * We're enabling irqs and according to our state above irqs weren't
@@ -3738,6 +3724,9 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 	if (unlikely(!debug_locks || curr->lockdep_recursion))
 		return;
 
+	if (DEBUG_LOCKS_WARN_ON(!curr->hardirqs_enabled))
+		return;
+
 	/*
 	 * So we're supposed to get called after you mask local IRQs, but for
 	 * some reason the hardware doesn't quite think you did a proper job.
@@ -3745,17 +3734,13 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->hardirqs_enabled) {
-		/*
-		 * We have done an ON -> OFF transition:
-		 */
-		curr->hardirqs_enabled = 0;
-		curr->hardirq_disable_ip = ip;
-		curr->hardirq_disable_event = ++curr->irq_events;
-		debug_atomic_inc(hardirqs_off_events);
-	} else {
-		debug_atomic_inc(redundant_hardirqs_off);
-	}
+	/*
+	 * We have done an ON -> OFF transition:
+	 */
+	curr->hardirqs_enabled = 0;
+	curr->hardirq_disable_ip = ip;
+	curr->hardirq_disable_event = ++curr->irq_events;
+	debug_atomic_inc(hardirqs_off_events);
 }
 EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
 
@@ -3769,6 +3754,9 @@ void lockdep_softirqs_on(unsigned long ip)
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
+	if (DEBUG_LOCKS_WARN_ON(curr->softirqs_enabled))
+		return;
+
 	/*
 	 * We fancy IRQs being disabled here, see softirq.c, avoids
 	 * funny state and nesting things.
@@ -3776,11 +3764,6 @@ void lockdep_softirqs_on(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->softirqs_enabled) {
-		debug_atomic_inc(redundant_softirqs_on);
-		return;
-	}
-
 	current->lockdep_recursion++;
 	/*
 	 * We'll do an OFF -> ON transition:
@@ -3809,26 +3792,26 @@ void lockdep_softirqs_off(unsigned long ip)
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
+	if (DEBUG_LOCKS_WARN_ON(!curr->softirqs_enabled))
+		return;
+
 	/*
 	 * We fancy IRQs being disabled here, see softirq.c
 	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->softirqs_enabled) {
-		/*
-		 * We have done an ON -> OFF transition:
-		 */
-		curr->softirqs_enabled = 0;
-		curr->softirq_disable_ip = ip;
-		curr->softirq_disable_event = ++curr->irq_events;
-		debug_atomic_inc(softirqs_off_events);
-		/*
-		 * Whoops, we wanted softirqs off, so why aren't they?
-		 */
-		DEBUG_LOCKS_WARN_ON(!softirq_count());
-	} else
-		debug_atomic_inc(redundant_softirqs_off);
+	/*
+	 * We have done an ON -> OFF transition:
+	 */
+	curr->softirqs_enabled = 0;
+	curr->softirq_disable_ip = ip;
+	curr->softirq_disable_event = ++curr->irq_events;
+	debug_atomic_inc(softirqs_off_events);
+	/*
+	 * Whoops, we wanted softirqs off, so why aren't they?
+	 */
+	DEBUG_LOCKS_WARN_ON(!softirq_count());
 }
 
 static int
@@ -5684,6 +5667,11 @@ void __init lockdep_init(void)
 
 	printk(" per task-struct memory footprint: %zu bytes\n",
 	       sizeof(((struct task_struct *)NULL)->held_locks));
+
+	WARN_ON(irqs_disabled());
+
+	current->hardirqs_enabled = 1;
+	current->softirqs_enabled = 1;
 }
 
 static void
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index baca699b94e9..6dd8b1f06dc4 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -180,12 +180,8 @@ struct lockdep_stats {
 	unsigned int   chain_lookup_misses;
 	unsigned long  hardirqs_on_events;
 	unsigned long  hardirqs_off_events;
-	unsigned long  redundant_hardirqs_on;
-	unsigned long  redundant_hardirqs_off;
 	unsigned long  softirqs_on_events;
 	unsigned long  softirqs_off_events;
-	unsigned long  redundant_softirqs_on;
-	unsigned long  redundant_softirqs_off;
 	int            nr_unused_locks;
 	unsigned int   nr_redundant_checks;
 	unsigned int   nr_redundant;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 5525cd3ba0c8..98f204220ed9 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -172,12 +172,8 @@ static void lockdep_stats_debug_show(struct seq_file *m)
 #ifdef CONFIG_DEBUG_LOCKDEP
 	unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
 			   hi2 = debug_atomic_read(hardirqs_off_events),
-			   hr1 = debug_atomic_read(redundant_hardirqs_on),
-			   hr2 = debug_atomic_read(redundant_hardirqs_off),
 			   si1 = debug_atomic_read(softirqs_on_events),
-			   si2 = debug_atomic_read(softirqs_off_events),
-			   sr1 = debug_atomic_read(redundant_softirqs_on),
-			   sr2 = debug_atomic_read(redundant_softirqs_off);
+			   si2 = debug_atomic_read(softirqs_off_events);
 
 	seq_printf(m, " chain lookup misses:           %11llu\n",
 		debug_atomic_read(chain_lookup_misses));
@@ -196,12 +192,8 @@ static void lockdep_stats_debug_show(struct seq_file *m)
 
 	seq_printf(m, " hardirq on events:             %11llu\n", hi1);
 	seq_printf(m, " hardirq off events:            %11llu\n", hi2);
-	seq_printf(m, " redundant hardirq ons:         %11llu\n", hr1);
-	seq_printf(m, " redundant hardirq offs:        %11llu\n", hr2);
 	seq_printf(m, " softirq on events:             %11llu\n", si1);
 	seq_printf(m, " softirq off events:            %11llu\n", si2);
-	seq_printf(m, " redundant softirq ons:         %11llu\n", sr1);
-	seq_printf(m, " redundant softirq offs:        %11llu\n", sr2);
 #endif
 }
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
From: Nicholas Piggin @ 2020-07-23 10:56 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Nicholas Piggin,
	Alexey Kardashevskiy, Ingo Molnar, Will Deacon

If an interrupt is not masked by local_irq_disable (e.g., a powerpc perf
interrupt), then it can hit in local_irq_enable() after trace_hardirqs_on()
and before raw_local_irq_enable().

If that interrupt handler calls local_irq_save(), it will call
trace_hardirqs_off() but the local_irq_restore() will not call
trace_hardirqs_on() again because raw_irqs_disabled_flags(flags) is true.

This can lead lockdep_assert_irqs_enabled() to trigger false positive
warnings.

Fix this by being careful to only enable and disable trace_hardirqs with
the outer-most irq enable/disable.

Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---

I haven't tested on other architectures but I imagine NMIs in general
might cause a similar problem.

Other architectures might have to be updated for patch 2, but there's
a lot of asm around interrupt/return, so I didn't have a very good
look. The warnings should be harmless enough and uncover most places
that need updating.

 arch/powerpc/include/asm/hw_irq.h | 11 ++++-------
 include/linux/irqflags.h          | 29 ++++++++++++++++++-----------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 3a0db7b0b46e..35060be09073 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -200,17 +200,14 @@ static inline bool arch_irqs_disabled(void)
 #define powerpc_local_irq_pmu_save(flags)			\
 	 do {							\
 		raw_local_irq_pmu_save(flags);			\
-		trace_hardirqs_off();				\
+		if (!raw_irqs_disabled_flags(flags))		\
+			trace_hardirqs_off();			\
 	} while(0)
 #define powerpc_local_irq_pmu_restore(flags)			\
 	do {							\
-		if (raw_irqs_disabled_flags(flags)) {		\
-			raw_local_irq_pmu_restore(flags);	\
-			trace_hardirqs_off();			\
-		} else {					\
+		if (!raw_irqs_disabled_flags(flags))		\
 			trace_hardirqs_on();			\
-			raw_local_irq_pmu_restore(flags);	\
-		}						\
+		raw_local_irq_pmu_restore(flags);		\
 	} while(0)
 #else
 #define powerpc_local_irq_pmu_save(flags)			\
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 6384d2813ded..571ee29ecefc 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -163,26 +163,33 @@ do {						\
  * if !TRACE_IRQFLAGS.
  */
 #ifdef CONFIG_TRACE_IRQFLAGS
-#define local_irq_enable() \
-	do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
-#define local_irq_disable() \
-	do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)
+#define local_irq_enable()				\
+	do {						\
+		trace_hardirqs_on();			\
+		raw_local_irq_enable();			\
+	} while (0)
+
+#define local_irq_disable()				\
+	do {						\
+		bool was_disabled = raw_irqs_disabled(); \
+		raw_local_irq_disable();		\
+		if (!was_disabled)			\
+			trace_hardirqs_off();		\
+	} while (0)
+
 #define local_irq_save(flags)				\
 	do {						\
 		raw_local_irq_save(flags);		\
-		trace_hardirqs_off();			\
+		if (!raw_irqs_disabled_flags(flags))	\
+			trace_hardirqs_off();		\
 	} while (0)
 
 
 #define local_irq_restore(flags)			\
 	do {						\
-		if (raw_irqs_disabled_flags(flags)) {	\
-			raw_local_irq_restore(flags);	\
-			trace_hardirqs_off();		\
-		} else {				\
+		if (!raw_irqs_disabled_flags(flags))	\
 			trace_hardirqs_on();		\
-			raw_local_irq_restore(flags);	\
-		}					\
+		raw_local_irq_restore(flags);		\
 	} while (0)
 
 #define safe_halt()				\
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH 0/7] powerpc/watchpoint: 2nd DAWR kvm enablement + selftests
From: Ravi Bangoria @ 2020-07-23 10:48 UTC (permalink / raw)
  To: mpe, paulus
  Cc: christophe.leroy, Ravi Bangoria, mikey, kvm, jniethe5,
	linux-kernel, npiggin, kvm-ppc, linux-kselftest, rogealve,
	pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200723102058.312282-1-ravi.bangoria@linux.ibm.com>



On 7/23/20 3:50 PM, Ravi Bangoria wrote:
> Patch #1, #2 and #3 enables p10 2nd DAWR feature for Book3S kvm guest. DAWR
> is a hypervisor resource and thus H_SET_MODE hcall is used to set/unset it.
> A new case H_SET_MODE_RESOURCE_SET_DAWR1 is introduced in H_SET_MODE hcall
> for setting/unsetting 2nd DAWR. Also, new capability KVM_CAP_PPC_DAWR1 has
> been added to query 2nd DAWR support via kvm ioctl.
> 
> This feature also needs to be enabled in Qemu to really use it. I'll reply
> link to qemu patches once I post them in qemu-devel mailing list.

Qemu patches: https://lore.kernel.org/kvm/20200723104220.314671-1-ravi.bangoria@linux.ibm.com

^ permalink raw reply

* Re: [PATCH] powerpc/64s: Fix irq tracing corruption in interrupt/syscall return caused by perf interrupts
From: Nicholas Piggin @ 2020-07-23 10:29 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev
In-Reply-To: <dcf4ee37-202b-794a-189b-895e59293c68@ozlabs.ru>

Excerpts from Alexey Kardashevskiy's message of July 22, 2020 8:50 pm:
> 
> 
> On 22/07/2020 17:34, Nicholas Piggin wrote:
>> Alexey reports lockdep_assert_irqs_enabled() warnings when stress testing perf, e.g.,
>> 
>> WARNING: CPU: 0 PID: 1556 at kernel/softirq.c:169 __local_bh_enable_ip+0x258/0x270
>> CPU: 0 PID: 1556 Comm: syz-executor
>> NIP:  c0000000001ec888 LR: c0000000001ec884 CTR: c000000000ef0610
>> REGS: c000000022d4f8a0 TRAP: 0700   Not tainted  (5.8.0-rc3-x)
>> MSR:  8000000000029033 <SF,EE,ME,IR,DR,RI,LE>  CR: 28008844  XER: 20040000
>> CFAR: c0000000001dc1d0 IRQMASK: 0
>> 
>> The interesting thing is MSR[EE] and IRQMASK shows interrupts are enabled,
>> suggesting the current->hardirqs_enabled irq tracing state is going out of sync
>> with the actual interrupt enable state.
>> 
>> The cause is a window in interrupt/syscall return where irq tracing state is being
>> adjusted for an irqs-enabled return while MSR[EE] is still enabled. A perf
>> interrupt hits and ends up calling trace_hardirqs_off() when restoring
>> interrupt flags to a disable state.
>> 
>> Fix this by disabling perf interrupts as well while adjusting irq tracing state.
>> 
>> Add a debug check that catches the condition sooner.
>> 
>> Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic in C")
>> Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>> 
>> I can reproduce similar symptoms and this patch fixes my test case,
>> still trying to confirm Alexey's test case or whether there's another
>> similar bug causing it.
> 
> 
> This does not fix my testcase. I applied this on top of 4fa640dc5230
> ("Merge tag 'vfio-v5.8-rc7' of git://github.com/awilliam/linux-vfio into
> master")  without any of my testing code, just to be clear. Sorry...

Okay it seems to be a bigger problem and not actually caused by that
patch, but it was possible for lockdep hardirqs_enabled state to get out
of sync with the local_irq_disable() state before that too. Root
cause is similar -- perf interrupts hitting between updating the two
different bits of state.

Not quite sure why Alexey's test wasn't hitting it before the patch,
but possibly because of the way masked interrupts get replayed. But I was able
to hit the problem with a different assertion.

I think I have a fix, but it seems to be a generic irq tracing code
issue. So this patch can be dropped, and it's not an urgent issue for
the next release (it only triggers warns on rare occasions and only
when lockdep is enabled).

Thanks,
Nick


^ permalink raw reply

* [PATCH 7/7] powerpc/selftests: Add selftest to test concurrent perf/ptrace events
From: Ravi Bangoria @ 2020-07-23 10:20 UTC (permalink / raw)
  To: mpe, paulus
  Cc: christophe.leroy, ravi.bangoria, mikey, kvm, jniethe5,
	linux-kernel, npiggin, kvm-ppc, linux-kselftest, rogealve,
	pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200723102058.312282-1-ravi.bangoria@linux.ibm.com>

ptrace and perf watchpoints can't co-exist if their address ranges
overlap. See commit 29da4f91c0c1 ("powerpc/watchpoint: Don't allow
concurrent perf and ptrace events") for more detail. Add selftest
for the same.

Sample o/p:
  # ./ptrace-perf-hwbreak
  test: ptrace-perf-hwbreak
  tags: git_version:powerpc-5.8-7-118-g937fa174a15d-dirty
  perf cpu event -> ptrace thread event (Overlapping): Ok
  perf cpu event -> ptrace thread event (Non-overlapping): Ok
  perf thread event -> ptrace same thread event (Overlapping): Ok
  perf thread event -> ptrace same thread event (Non-overlapping): Ok
  perf thread event -> ptrace other thread event: Ok
  ptrace thread event -> perf kernel event: Ok
  ptrace thread event -> perf same thread event (Overlapping): Ok
  ptrace thread event -> perf same thread event (Non-overlapping): Ok
  ptrace thread event -> perf other thread event: Ok
  ptrace thread event -> perf cpu event (Overlapping): Ok
  ptrace thread event -> perf cpu event (Non-overlapping): Ok
  ptrace thread event -> perf same thread & cpu event (Overlapping): Ok
  ptrace thread event -> perf same thread & cpu event (Non-overlapping): Ok
  ptrace thread event -> perf other thread & cpu event: Ok
  success: ptrace-perf-hwbreak

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 .../selftests/powerpc/ptrace/.gitignore       |   1 +
 .../testing/selftests/powerpc/ptrace/Makefile |   2 +-
 .../powerpc/ptrace/ptrace-perf-hwbreak.c      | 659 ++++++++++++++++++
 3 files changed, 661 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c

diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore
index 0e96150b7c7e..eb75e5360e31 100644
--- a/tools/testing/selftests/powerpc/ptrace/.gitignore
+++ b/tools/testing/selftests/powerpc/ptrace/.gitignore
@@ -14,3 +14,4 @@ perf-hwbreak
 core-pkey
 ptrace-pkey
 ptrace-syscall
+ptrace-perf-hwbreak
diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile
index 8d3f006c98cc..a500639da97a 100644
--- a/tools/testing/selftests/powerpc/ptrace/Makefile
+++ b/tools/testing/selftests/powerpc/ptrace/Makefile
@@ -2,7 +2,7 @@
 TEST_GEN_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \
               ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx ptrace-tm-vsx \
               ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey core-pkey \
-              perf-hwbreak ptrace-syscall
+              perf-hwbreak ptrace-syscall ptrace-perf-hwbreak
 
 top_srcdir = ../../../../..
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c
new file mode 100644
index 000000000000..6b8804a4942e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c
@@ -0,0 +1,659 @@
+// SPDX-License-Identifier: GPL-2.0+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
+#include <asm/unistd.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+#include "ptrace.h"
+
+char data[16];
+
+/* Overlapping address range */
+volatile __u64 *ptrace_data1 = (__u64 *)&data[0];
+volatile __u64 *perf_data1 = (__u64 *)&data[4];
+
+/* Non-overlapping address range */
+volatile __u64 *ptrace_data2 = (__u64 *)&data[0];
+volatile __u64 *perf_data2 = (__u64 *)&data[8];
+
+static unsigned long pid_max_addr(void)
+{
+	FILE *fp;
+	char *line, *c;
+	char addr[100];
+	size_t len = 0;
+
+	fp = fopen("/proc/kallsyms", "r");
+	if (!fp) {
+		printf("Failed to read /proc/kallsyms. Exiting..\n");
+		exit(EXIT_FAILURE);
+	}
+
+	while (getline(&line, &len, fp) != -1) {
+		if (!strstr(line, "pid_max") || strstr(line, "pid_max_max") ||
+		    strstr(line, "pid_max_min"))
+			continue;
+
+		strncpy(addr, line, len < 100 ? len : 100);
+		c = strchr(addr, ' ');
+		*c = '\0';
+		return strtoul(addr, &c, 16);
+	}
+	fclose(fp);
+	printf("Could not find pix_max. Exiting..\n");
+	exit(EXIT_FAILURE);
+	return -1;
+}
+
+static void perf_user_event_attr_set(struct perf_event_attr *attr, __u64 addr, __u64 len)
+{
+	memset(attr, 0, sizeof(struct perf_event_attr));
+	attr->type           = PERF_TYPE_BREAKPOINT;
+	attr->size           = sizeof(struct perf_event_attr);
+	attr->bp_type        = HW_BREAKPOINT_R;
+	attr->bp_addr        = addr;
+	attr->bp_len         = len;
+	attr->exclude_kernel = 1;
+	attr->exclude_hv     = 1;
+}
+
+static void perf_kernel_event_attr_set(struct perf_event_attr *attr)
+{
+	memset(attr, 0, sizeof(struct perf_event_attr));
+	attr->type           = PERF_TYPE_BREAKPOINT;
+	attr->size           = sizeof(struct perf_event_attr);
+	attr->bp_type        = HW_BREAKPOINT_R;
+	attr->bp_addr        = pid_max_addr();
+	attr->bp_len         = sizeof(unsigned long);
+	attr->exclude_user   = 1;
+	attr->exclude_hv     = 1;
+}
+
+static int perf_cpu_event_open(int cpu, __u64 addr, __u64 len)
+{
+	struct perf_event_attr attr;
+
+	perf_user_event_attr_set(&attr, addr, len);
+	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
+}
+
+static int perf_thread_event_open(pid_t child_pid, __u64 addr, __u64 len)
+{
+	struct perf_event_attr attr;
+
+	perf_user_event_attr_set(&attr, addr, len);
+	return syscall(__NR_perf_event_open, &attr, child_pid, -1, -1, 0);
+}
+
+static int perf_thread_cpu_event_open(pid_t child_pid, int cpu, __u64 addr, __u64 len)
+{
+	struct perf_event_attr attr;
+
+	perf_user_event_attr_set(&attr, addr, len);
+	return syscall(__NR_perf_event_open, &attr, child_pid, cpu, -1, 0);
+}
+
+static int perf_thread_kernel_event_open(pid_t child_pid)
+{
+	struct perf_event_attr attr;
+
+	perf_kernel_event_attr_set(&attr);
+	return syscall(__NR_perf_event_open, &attr, child_pid, -1, -1, 0);
+}
+
+static int perf_cpu_kernel_event_open(int cpu)
+{
+	struct perf_event_attr attr;
+
+	perf_kernel_event_attr_set(&attr);
+	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
+}
+
+static int child(void)
+{
+	int ret;
+
+	ret = ptrace(PTRACE_TRACEME, 0, NULL, 0);
+	if (ret) {
+		printf("Error: PTRACE_TRACEME failed\n");
+		return 0;
+	}
+	kill(getpid(), SIGUSR1); /* --> parent (SIGUSR1) */
+
+	return 0;
+}
+
+static void ptrace_ppc_hw_breakpoint(struct ppc_hw_breakpoint *info, int type,
+				     __u64 addr, int len)
+{
+	info->version = 1;
+	info->trigger_type = type;
+	info->condition_mode = PPC_BREAKPOINT_CONDITION_NONE;
+	info->addr = addr;
+	info->addr2 = addr + len;
+	info->condition_value = 0;
+	if (!len)
+		info->addr_mode = PPC_BREAKPOINT_MODE_EXACT;
+	else
+		info->addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE;
+}
+
+static int ptrace_open(pid_t child_pid, __u64 wp_addr, int len)
+{
+	struct ppc_hw_breakpoint info;
+
+	ptrace_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_RW, wp_addr, len);
+	return ptrace(PPC_PTRACE_SETHWDEBUG, child_pid, 0, &info);
+}
+
+static int test1(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread event by ptrace)
+	 *	if (existing cpu event by perf)
+	 *		if (addr range overlaps)
+	 *			fail;
+	 */
+
+	perf_fd = perf_cpu_event_open(0, (__u64)perf_data1, sizeof(*perf_data1));
+	if (perf_fd < 0)
+		return -1;
+
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd > 0 || errno != ENOSPC)
+		ret = -1;
+
+	close(perf_fd);
+	return ret;
+}
+
+static int test2(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread event by ptrace)
+	 *	if (existing cpu event by perf)
+	 *		if (addr range does not overlaps)
+	 *			allow;
+	 */
+
+	perf_fd = perf_cpu_event_open(0, (__u64)perf_data2, sizeof(*perf_data2));
+	if (perf_fd < 0)
+		return -1;
+
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2));
+	if (ptrace_fd < 0) {
+		ret = -1;
+		goto perf_close;
+	}
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+
+perf_close:
+	close(perf_fd);
+	return ret;
+}
+
+static int test3(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread event by ptrace)
+	 *	if (existing thread event by perf on the same thread)
+	 *		if (addr range overlaps)
+	 *			fail;
+	 */
+	perf_fd = perf_thread_event_open(child_pid, (__u64)perf_data1,
+					 sizeof(*perf_data1));
+	if (perf_fd < 0)
+		return -1;
+
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd > 0 || errno != ENOSPC)
+		ret = -1;
+
+	close(perf_fd);
+	return ret;
+}
+
+static int test4(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread event by ptrace)
+	 *	if (existing thread event by perf on the same thread)
+	 *		if (addr range does not overlaps)
+	 *			fail;
+	 */
+	perf_fd = perf_thread_event_open(child_pid, (__u64)perf_data2,
+					 sizeof(*perf_data2));
+	if (perf_fd < 0)
+		return -1;
+
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2));
+	if (ptrace_fd < 0) {
+		ret = -1;
+		goto perf_close;
+	}
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+
+perf_close:
+	close(perf_fd);
+	return ret;
+}
+
+static int test5(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int cpid;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread event by ptrace)
+	 *	if (existing thread event by perf on the different thread)
+	 *		allow;
+	 */
+	cpid = fork();
+	if (!cpid) {
+		/* Temporary Child */
+		pause();
+		exit(EXIT_SUCCESS);
+	}
+
+	perf_fd = perf_thread_event_open(cpid, (__u64)perf_data1, sizeof(*perf_data1));
+	if (perf_fd < 0) {
+		ret = -1;
+		goto kill_child;
+	}
+
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd < 0) {
+		ret = -1;
+		goto perf_close;
+	}
+
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+perf_close:
+	close(perf_fd);
+kill_child:
+	kill(cpid, SIGINT);
+	return ret;
+}
+
+static int test6(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread kernel event by perf)
+	 *	if (existing thread event by ptrace on the same thread)
+	 *		allow;
+	 * -- OR --
+	 * if (new per cpu kernel event by perf)
+	 *	if (existing thread event by ptrace)
+	 *		allow;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd < 0)
+		return -1;
+
+	perf_fd = perf_thread_kernel_event_open(child_pid);
+	if (perf_fd < 0) {
+		ret = -1;
+		goto ptrace_close;
+	}
+	close(perf_fd);
+
+	perf_fd = perf_cpu_kernel_event_open(0);
+	if (perf_fd < 0) {
+		ret = -1;
+		goto ptrace_close;
+	}
+	close(perf_fd);
+
+ptrace_close:
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+static int test7(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread event by perf)
+	 *	if (existing thread event by ptrace on the same thread)
+	 *		if (addr range overlaps)
+	 *			fail;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd < 0)
+		return -1;
+
+	perf_fd = perf_thread_event_open(child_pid, (__u64)perf_data1,
+					 sizeof(*perf_data1));
+	if (perf_fd > 0 || errno != ENOSPC)
+		ret = -1;
+
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+static int test8(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread event by perf)
+	 *	if (existing thread event by ptrace on the same thread)
+	 *		if (addr range does not overlaps)
+	 *			allow;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2));
+	if (ptrace_fd < 0)
+		return -1;
+
+	perf_fd = perf_thread_event_open(child_pid, (__u64)perf_data2,
+					 sizeof(*perf_data2));
+	if (perf_fd < 0) {
+		ret = -1;
+		goto ptrace_close;
+	}
+	close(perf_fd);
+
+ptrace_close:
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+static int test9(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int cpid;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread event by perf)
+	 *	if (existing thread event by ptrace on the other thread)
+	 *		allow;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd < 0)
+		return -1;
+
+	cpid = fork();
+	if (!cpid) {
+		/* Temporary Child */
+		pause();
+		exit(EXIT_SUCCESS);
+	}
+
+	perf_fd = perf_thread_event_open(cpid, (__u64)perf_data1, sizeof(*perf_data1));
+	if (perf_fd < 0) {
+		ret = -1;
+		goto kill_child;
+	}
+	close(perf_fd);
+
+kill_child:
+	kill(cpid, SIGINT);
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+static int test10(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per cpu event by perf)
+	 *	if (existing thread event by ptrace on the same thread)
+	 *		if (addr range overlaps)
+	 *			fail;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd < 0)
+		return -1;
+
+	perf_fd = perf_cpu_event_open(0, (__u64)perf_data1, sizeof(*perf_data1));
+	if (perf_fd > 0 || errno != ENOSPC)
+		ret = -1;
+
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+static int test11(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per cpu event by perf)
+	 *	if (existing thread event by ptrace on the same thread)
+	 *		if (addr range does not overlap)
+	 *			allow;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2));
+	if (ptrace_fd < 0)
+		return -1;
+
+	perf_fd = perf_cpu_event_open(0, (__u64)perf_data2, sizeof(*perf_data2));
+	if (perf_fd < 0) {
+		ret = -1;
+		goto ptrace_close;
+	}
+	close(perf_fd);
+
+ptrace_close:
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+static int test12(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread and per cpu event by perf)
+	 *	if (existing thread event by ptrace on the same thread)
+	 *		if (addr range overlaps)
+	 *			fail;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd < 0)
+		return -1;
+
+	perf_fd = perf_thread_cpu_event_open(child_pid, 0, (__u64)perf_data1,
+						sizeof(*perf_data1));
+	if (perf_fd > 0 || errno != ENOSPC)
+		ret = -1;
+
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+static int test13(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread and per cpu event by perf)
+	 *	if (existing thread event by ptrace on the same thread)
+	 *		if (addr range does not overlap)
+	 *			allow;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data2, sizeof(*ptrace_data2));
+	if (ptrace_fd < 0)
+		return -1;
+
+	perf_fd = perf_thread_cpu_event_open(child_pid, 0, (__u64)perf_data2,
+						sizeof(*perf_data2));
+	if (perf_fd < 0) {
+		ret = -1;
+		goto ptrace_close;
+	}
+	close(perf_fd);
+
+ptrace_close:
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+static int test14(pid_t child_pid)
+{
+	int perf_fd;
+	int ptrace_fd;
+	int cpid;
+	int ret = 0;
+
+	/* Test:
+	 * if (new per thread and per cpu event by perf)
+	 *	if (existing thread event by ptrace on the other thread)
+	 *		allow;
+	 */
+	ptrace_fd = ptrace_open(child_pid, (__u64)ptrace_data1, sizeof(*ptrace_data1));
+	if (ptrace_fd < 0)
+		return -1;
+
+	cpid = fork();
+	if (!cpid) {
+		/* Temporary Child */
+		pause();
+		exit(EXIT_SUCCESS);
+	}
+
+	perf_fd = perf_thread_cpu_event_open(cpid, 0, (__u64)perf_data1,
+					     sizeof(*perf_data1));
+	if (perf_fd < 0) {
+		ret = -1;
+		goto kill_child;
+	}
+	close(perf_fd);
+
+kill_child:
+	kill(cpid, SIGINT);
+	ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, ptrace_fd);
+	return ret;
+}
+
+#define TEST(msg, fun, arg, ret) {		\
+	int r;					\
+	r = fun(arg);				\
+	if (r)					\
+		printf("%s: Error\n", msg);	\
+	else					\
+		printf("%s: Ok\n", msg);	\
+	ret |= r;				\
+}
+
+char *desc[14] = {
+	"perf cpu event -> ptrace thread event (Overlapping)",
+	"perf cpu event -> ptrace thread event (Non-overlapping)",
+	"perf thread event -> ptrace same thread event (Overlapping)",
+	"perf thread event -> ptrace same thread event (Non-overlapping)",
+	"perf thread event -> ptrace other thread event",
+	"ptrace thread event -> perf kernel event",
+	"ptrace thread event -> perf same thread event (Overlapping)",
+	"ptrace thread event -> perf same thread event (Non-overlapping)",
+	"ptrace thread event -> perf other thread event",
+	"ptrace thread event -> perf cpu event (Overlapping)",
+	"ptrace thread event -> perf cpu event (Non-overlapping)",
+	"ptrace thread event -> perf same thread & cpu event (Overlapping)",
+	"ptrace thread event -> perf same thread & cpu event (Non-overlapping)",
+	"ptrace thread event -> perf other thread & cpu event",
+};
+
+static int test(pid_t child_pid)
+{
+	int ret = TEST_PASS;
+
+	TEST(desc[0], test1, child_pid, ret);
+	TEST(desc[1], test2, child_pid, ret);
+	TEST(desc[2], test3, child_pid, ret);
+	TEST(desc[3], test4, child_pid, ret);
+	TEST(desc[4], test5, child_pid, ret);
+	TEST(desc[5], test6, child_pid, ret);
+	TEST(desc[6], test7, child_pid, ret);
+	TEST(desc[7], test8, child_pid, ret);
+	TEST(desc[8], test9, child_pid, ret);
+	TEST(desc[9], test10, child_pid, ret);
+	TEST(desc[10], test11, child_pid, ret);
+	TEST(desc[11], test12, child_pid, ret);
+	TEST(desc[12], test13, child_pid, ret);
+	TEST(desc[13], test14, child_pid, ret);
+
+	return ret;
+}
+
+static void get_dbginfo(pid_t child_pid, struct ppc_debug_info *dbginfo)
+{
+	if (ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, dbginfo)) {
+		perror("Can't get breakpoint info");
+		exit(-1);
+	}
+}
+
+static int ptrace_perf_hwbreak(void)
+{
+	int ret;
+	pid_t child_pid;
+	struct ppc_debug_info dbginfo;
+
+	child_pid = fork();
+	if (!child_pid)
+		return child();
+
+	/* parent */
+	wait(NULL); /* <-- child (SIGUSR1) */
+
+	get_dbginfo(child_pid, &dbginfo);
+	SKIP_IF(dbginfo.num_data_bps <= 1);
+
+	ret = perf_cpu_event_open(0, (__u64)perf_data1, sizeof(*perf_data1));
+	SKIP_IF(ret < 0);
+	close(ret);
+
+	ret = test(child_pid);
+
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	return ret;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(ptrace_perf_hwbreak, "ptrace-perf-hwbreak");
+}
-- 
2.26.2


^ permalink raw reply related

* [PATCH 6/7] powerpc/selftests/perf-hwbreak: Add testcases for 2nd DAWR
From: Ravi Bangoria @ 2020-07-23 10:20 UTC (permalink / raw)
  To: mpe, paulus
  Cc: christophe.leroy, ravi.bangoria, mikey, kvm, jniethe5,
	linux-kernel, npiggin, kvm-ppc, linux-kselftest, rogealve,
	pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200723102058.312282-1-ravi.bangoria@linux.ibm.com>

Extend perf-hwbreak.c selftest to test multiple DAWRs. Also add
testcase for testing 512 byte boundary removal.

Sample o/p:
  # ./perf-hwbreak
  ...
  TESTED: Process specific, Two events, diff addr
  TESTED: Process specific, Two events, same addr
  TESTED: Process specific, Two events, diff addr, one is RO, other is WO
  TESTED: Process specific, Two events, same addr, one is RO, other is WO
  TESTED: Systemwide, Two events, diff addr
  TESTED: Systemwide, Two events, same addr
  TESTED: Systemwide, Two events, diff addr, one is RO, other is WO
  TESTED: Systemwide, Two events, same addr, one is RO, other is WO
  TESTED: Process specific, 512 bytes, unaligned
  success: perf_hwbreak

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 .../selftests/powerpc/ptrace/perf-hwbreak.c   | 568 +++++++++++++++++-
 1 file changed, 567 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
index bde475341c8a..5df08738884d 100644
--- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -21,8 +21,13 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <signal.h>
 #include <string.h>
 #include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <sys/ptrace.h>
+#include <sys/sysinfo.h>
+#include <asm/ptrace.h>
 #include <elf.h>
 #include <pthread.h>
 #include <sys/syscall.h>
@@ -34,6 +39,12 @@
 
 #define DAWR_LENGTH_MAX ((0x3f + 1) * 8)
 
+int nprocs;
+
+static volatile int a = 10;
+static volatile int b = 10;
+static volatile char c[512 + 8] __attribute__((aligned(512)));
+
 static void perf_event_attr_set(struct perf_event_attr *attr,
 				__u32 type, __u64 addr, __u64 len,
 				bool exclude_user)
@@ -68,6 +79,76 @@ static int perf_process_event_open(__u32 type, __u64 addr, __u64 len)
 	return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
 }
 
+static int perf_cpu_event_open(long cpu, __u32 type, __u64 addr, __u64 len)
+{
+	struct perf_event_attr attr;
+
+	perf_event_attr_set(&attr, type, addr, len, 0);
+	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
+}
+
+static void close_fds(int *fd, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		close(fd[i]);
+}
+
+static unsigned long read_fds(int *fd, int n)
+{
+	int i;
+	unsigned long c = 0;
+	unsigned long count = 0;
+	size_t res;
+
+	for (i = 0; i < n; i++) {
+		res = read(fd[i], &c, sizeof(c));
+		assert(res == sizeof(unsigned long long));
+		count += c;
+	}
+	return count;
+}
+
+static void reset_fds(int *fd, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		ioctl(fd[i], PERF_EVENT_IOC_RESET);
+}
+
+static void enable_fds(int *fd, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		ioctl(fd[i], PERF_EVENT_IOC_ENABLE);
+}
+
+static void disable_fds(int *fd, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		ioctl(fd[i], PERF_EVENT_IOC_DISABLE);
+}
+
+static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 len)
+{
+	int i = 0;
+
+	/* Assume online processors are 0 to nprocs for simplisity */
+	for (i = 0; i < nprocs; i++) {
+		fd[i] = perf_cpu_event_open(i, type, addr, len);
+		if (fd[i] < 0) {
+			close_fds(fd, i);
+			return fd[i];
+		}
+	}
+	return 0;
+}
+
 static inline bool breakpoint_test(int len)
 {
 	int fd;
@@ -261,11 +342,483 @@ static int runtest_dar_outside(void)
 	return fail;
 }
 
+static void multi_dawr_workload(void)
+{
+	a += 10;
+	b += 10;
+	c[512 + 1] += 'a';
+}
+
+static int test_process_multi_diff_addr(void)
+{
+	unsigned long long breaks1 = 0, breaks2 = 0;
+	int fd1, fd2;
+	char *desc = "Process specific, Two events, diff addr";
+	size_t res;
+
+	fd1 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a));
+	if (fd1 < 0) {
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	fd2 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&b, (__u64)sizeof(b));
+	if (fd2 < 0) {
+		close(fd1);
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ioctl(fd1, PERF_EVENT_IOC_RESET);
+	ioctl(fd2, PERF_EVENT_IOC_RESET);
+	ioctl(fd1, PERF_EVENT_IOC_ENABLE);
+	ioctl(fd2, PERF_EVENT_IOC_ENABLE);
+	multi_dawr_workload();
+	ioctl(fd1, PERF_EVENT_IOC_DISABLE);
+	ioctl(fd2, PERF_EVENT_IOC_DISABLE);
+
+	res = read(fd1, &breaks1, sizeof(breaks1));
+	assert(res == sizeof(unsigned long long));
+	res = read(fd2, &breaks2, sizeof(breaks2));
+	assert(res == sizeof(unsigned long long));
+
+	close(fd1);
+	close(fd2);
+
+	if (breaks1 != 2 || breaks2 != 2) {
+		printf("FAILED: %s: %lld != 2 || %lld != 2\n", desc,
+			breaks1, breaks2);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+static int test_process_multi_same_addr(void)
+{
+	unsigned long long breaks1 = 0, breaks2 = 0;
+	int fd1, fd2;
+	char *desc = "Process specific, Two events, same addr";
+	size_t res;
+
+	fd1 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a));
+	if (fd1 < 0) {
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	fd2 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a));
+	if (fd2 < 0) {
+		close(fd1);
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ioctl(fd1, PERF_EVENT_IOC_RESET);
+	ioctl(fd2, PERF_EVENT_IOC_RESET);
+	ioctl(fd1, PERF_EVENT_IOC_ENABLE);
+	ioctl(fd2, PERF_EVENT_IOC_ENABLE);
+	multi_dawr_workload();
+	ioctl(fd1, PERF_EVENT_IOC_DISABLE);
+	ioctl(fd2, PERF_EVENT_IOC_DISABLE);
+
+	res = read(fd1, &breaks1, sizeof(breaks1));
+	assert(res == sizeof(unsigned long long));
+	res = read(fd2, &breaks2, sizeof(breaks2));
+	assert(res == sizeof(unsigned long long));
+
+	close(fd1);
+	close(fd2);
+
+	if (breaks1 != 2 || breaks2 != 2) {
+		printf("FAILED: %s: %lld != 2 || %lld != 2\n", desc,
+			breaks1, breaks2);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+static int test_process_multi_diff_addr_ro_wo(void)
+{
+	unsigned long long breaks1 = 0, breaks2 = 0;
+	int fd1, fd2;
+	char *desc = "Process specific, Two events, diff addr, one is RO, other is WO";
+	size_t res;
+
+	fd1 = perf_process_event_open(HW_BREAKPOINT_W, (__u64)&a, (__u64)sizeof(a));
+	if (fd1 < 0) {
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	fd2 = perf_process_event_open(HW_BREAKPOINT_R, (__u64)&b, (__u64)sizeof(b));
+	if (fd2 < 0) {
+		close(fd1);
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ioctl(fd1, PERF_EVENT_IOC_RESET);
+	ioctl(fd2, PERF_EVENT_IOC_RESET);
+	ioctl(fd1, PERF_EVENT_IOC_ENABLE);
+	ioctl(fd2, PERF_EVENT_IOC_ENABLE);
+	multi_dawr_workload();
+	ioctl(fd1, PERF_EVENT_IOC_DISABLE);
+	ioctl(fd2, PERF_EVENT_IOC_DISABLE);
+
+	res = read(fd1, &breaks1, sizeof(breaks1));
+	assert(res == sizeof(unsigned long long));
+	res = read(fd2, &breaks2, sizeof(breaks2));
+	assert(res == sizeof(unsigned long long));
+
+	close(fd1);
+	close(fd2);
+
+	if (breaks1 != 1 || breaks2 != 1) {
+		printf("FAILED: %s: %lld != 1 || %lld != 1\n", desc,
+			breaks1, breaks2);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+static int test_process_multi_same_addr_ro_wo(void)
+{
+	unsigned long long breaks1 = 0, breaks2 = 0;
+	int fd1, fd2;
+	char *desc = "Process specific, Two events, same addr, one is RO, other is WO";
+	size_t res;
+
+	fd1 = perf_process_event_open(HW_BREAKPOINT_R, (__u64)&a, (__u64)sizeof(a));
+	if (fd1 < 0) {
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	fd2 = perf_process_event_open(HW_BREAKPOINT_W, (__u64)&a, (__u64)sizeof(a));
+	if (fd2 < 0) {
+		close(fd1);
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ioctl(fd1, PERF_EVENT_IOC_RESET);
+	ioctl(fd2, PERF_EVENT_IOC_RESET);
+	ioctl(fd1, PERF_EVENT_IOC_ENABLE);
+	ioctl(fd2, PERF_EVENT_IOC_ENABLE);
+	multi_dawr_workload();
+	ioctl(fd1, PERF_EVENT_IOC_DISABLE);
+	ioctl(fd2, PERF_EVENT_IOC_DISABLE);
+
+	res = read(fd1, &breaks1, sizeof(breaks1));
+	assert(res == sizeof(unsigned long long));
+	res = read(fd2, &breaks2, sizeof(breaks2));
+	assert(res == sizeof(unsigned long long));
+
+	close(fd1);
+	close(fd2);
+
+	if (breaks1 != 1 || breaks2 != 1) {
+		printf("FAILED: %s: %lld != 1 || %lld != 1\n", desc,
+			breaks1, breaks2);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+static int test_syswide_multi_diff_addr(void)
+{
+	unsigned long long breaks1 = 0, breaks2 = 0;
+	int *fd1 = malloc(nprocs * sizeof(int));
+	int *fd2 = malloc(nprocs * sizeof(int));
+	char *desc = "Systemwide, Two events, diff addr";
+	int ret;
+
+	ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_RW, (__u64)&a,
+					(__u64)sizeof(a));
+	if (ret) {
+		perror("perf_systemwide_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_RW, (__u64)&b,
+					(__u64)sizeof(b));
+	if (ret) {
+		close_fds(fd1, nprocs);
+		perror("perf_systemwide_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	reset_fds(fd1, nprocs);
+	reset_fds(fd2, nprocs);
+	enable_fds(fd1, nprocs);
+	enable_fds(fd2, nprocs);
+	multi_dawr_workload();
+	disable_fds(fd1, nprocs);
+	disable_fds(fd2, nprocs);
+
+	breaks1 = read_fds(fd1, nprocs);
+	breaks2 = read_fds(fd2, nprocs);
+
+	close_fds(fd1, nprocs);
+	close_fds(fd2, nprocs);
+
+	free(fd1);
+	free(fd2);
+
+	if (breaks1 != 2 || breaks2 != 2) {
+		printf("FAILED: %s: %lld != 2 || %lld != 2\n", desc,
+			breaks1, breaks2);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+static int test_syswide_multi_same_addr(void)
+{
+	unsigned long long breaks1 = 0, breaks2 = 0;
+	int *fd1 = malloc(nprocs * sizeof(int));
+	int *fd2 = malloc(nprocs * sizeof(int));
+	char *desc = "Systemwide, Two events, same addr";
+	int ret;
+
+	ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_RW, (__u64)&a,
+					(__u64)sizeof(a));
+	if (ret) {
+		perror("perf_systemwide_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_RW, (__u64)&a,
+					(__u64)sizeof(a));
+	if (ret) {
+		close_fds(fd1, nprocs);
+		perror("perf_systemwide_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	reset_fds(fd1, nprocs);
+	reset_fds(fd2, nprocs);
+	enable_fds(fd1, nprocs);
+	enable_fds(fd2, nprocs);
+	multi_dawr_workload();
+	disable_fds(fd1, nprocs);
+	disable_fds(fd2, nprocs);
+
+	breaks1 = read_fds(fd1, nprocs);
+	breaks2 = read_fds(fd2, nprocs);
+
+	close_fds(fd1, nprocs);
+	close_fds(fd2, nprocs);
+
+	free(fd1);
+	free(fd2);
+
+	if (breaks1 != 2 || breaks2 != 2) {
+		printf("FAILED: %s: %lld != 2 || %lld != 2\n", desc,
+			breaks1, breaks2);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+static int test_syswide_multi_diff_addr_ro_wo(void)
+{
+	unsigned long long breaks1 = 0, breaks2 = 0;
+	int *fd1 = malloc(nprocs * sizeof(int));
+	int *fd2 = malloc(nprocs * sizeof(int));
+	char *desc = "Systemwide, Two events, diff addr, one is RO, other is WO";
+	int ret;
+
+	ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_W, (__u64)&a,
+					(__u64)sizeof(a));
+	if (ret) {
+		perror("perf_systemwide_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_R, (__u64)&b,
+					(__u64)sizeof(b));
+	if (ret) {
+		close_fds(fd1, nprocs);
+		perror("perf_systemwide_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	reset_fds(fd1, nprocs);
+	reset_fds(fd2, nprocs);
+	enable_fds(fd1, nprocs);
+	enable_fds(fd2, nprocs);
+	multi_dawr_workload();
+	disable_fds(fd1, nprocs);
+	disable_fds(fd2, nprocs);
+
+	breaks1 = read_fds(fd1, nprocs);
+	breaks2 = read_fds(fd2, nprocs);
+
+	close_fds(fd1, nprocs);
+	close_fds(fd2, nprocs);
+
+	free(fd1);
+	free(fd2);
+
+	if (breaks1 != 1 || breaks2 != 1) {
+		printf("FAILED: %s: %lld != 1 || %lld != 1\n", desc,
+			breaks1, breaks2);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+static int test_syswide_multi_same_addr_ro_wo(void)
+{
+	unsigned long long breaks1 = 0, breaks2 = 0;
+	int *fd1 = malloc(nprocs * sizeof(int));
+	int *fd2 = malloc(nprocs * sizeof(int));
+	char *desc = "Systemwide, Two events, same addr, one is RO, other is WO";
+	int ret;
+
+	ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_W, (__u64)&a,
+					(__u64)sizeof(a));
+	if (ret) {
+		perror("perf_systemwide_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_R, (__u64)&a,
+					(__u64)sizeof(a));
+	if (ret) {
+		close_fds(fd1, nprocs);
+		perror("perf_systemwide_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	reset_fds(fd1, nprocs);
+	reset_fds(fd2, nprocs);
+	enable_fds(fd1, nprocs);
+	enable_fds(fd2, nprocs);
+	multi_dawr_workload();
+	disable_fds(fd1, nprocs);
+	disable_fds(fd2, nprocs);
+
+	breaks1 = read_fds(fd1, nprocs);
+	breaks2 = read_fds(fd2, nprocs);
+
+	close_fds(fd1, nprocs);
+	close_fds(fd2, nprocs);
+
+	free(fd1);
+	free(fd2);
+
+	if (breaks1 != 1 || breaks2 != 1) {
+		printf("FAILED: %s: %lld != 1 || %lld != 1\n", desc,
+			breaks1, breaks2);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+static int runtest_multi_dawr(void)
+{
+	int ret = 0;
+
+	ret |= test_process_multi_diff_addr();
+	ret |= test_process_multi_same_addr();
+	ret |= test_process_multi_diff_addr_ro_wo();
+	ret |= test_process_multi_same_addr_ro_wo();
+	ret |= test_syswide_multi_diff_addr();
+	ret |= test_syswide_multi_same_addr();
+	ret |= test_syswide_multi_diff_addr_ro_wo();
+	ret |= test_syswide_multi_same_addr_ro_wo();
+
+	return ret;
+}
+
+static int runtest_unaligned_512bytes(void)
+{
+	unsigned long long breaks = 0;
+	int fd;
+	char *desc = "Process specific, 512 bytes, unaligned";
+	__u64 addr = (__u64)&c + 8;
+	size_t res;
+
+	fd = perf_process_event_open(HW_BREAKPOINT_RW, addr, 512);
+	if (fd < 0) {
+		perror("perf_process_event_open");
+		exit(EXIT_FAILURE);
+	}
+
+	ioctl(fd, PERF_EVENT_IOC_RESET);
+	ioctl(fd, PERF_EVENT_IOC_ENABLE);
+	multi_dawr_workload();
+	ioctl(fd, PERF_EVENT_IOC_DISABLE);
+
+	res = read(fd, &breaks, sizeof(breaks));
+	assert(res == sizeof(unsigned long long));
+
+	close(fd);
+
+	if (breaks != 2) {
+		printf("FAILED: %s: %lld != 2\n", desc, breaks);
+		return 1;
+	}
+
+	printf("TESTED: %s\n", desc);
+	return 0;
+}
+
+/* There is no perf api to find number of available watchpoints. Use ptrace. */
+static int get_nr_wps(bool *arch_31)
+{
+	struct ppc_debug_info dbginfo;
+	int child_pid;
+
+	child_pid = fork();
+	if (!child_pid) {
+		int ret = ptrace(PTRACE_TRACEME, 0, NULL, 0);
+		if (ret) {
+			perror("PTRACE_TRACEME failed\n");
+			exit(EXIT_FAILURE);
+		}
+		kill(getpid(), SIGUSR1);
+
+		sleep(1);
+		exit(EXIT_SUCCESS);
+	}
+
+	wait(NULL);
+	if (ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, &dbginfo)) {
+		perror("Can't get breakpoint info");
+		exit(EXIT_FAILURE);
+	}
+
+	*arch_31 = !!(dbginfo.features & PPC_DEBUG_FEATURE_DATA_BP_DAWR_ARCH_31);
+	return dbginfo.num_data_bps;
+}
+
 static int runtest(void)
 {
 	int rwflag;
 	int exclude_user;
 	int ret;
+	bool dawr = dawr_supported();
+	bool arch_31 = false;
+	int nr_wps = get_nr_wps(&arch_31);
 
 	/*
 	 * perf defines rwflag as two bits read and write and at least
@@ -278,7 +831,7 @@ static int runtest(void)
 				return ret;
 
 			/* if we have the dawr, we can do an array test */
-			if (!dawr_supported())
+			if (!dawr)
 				continue;
 			ret = runtestsingle(rwflag, exclude_user, 1);
 			if (ret)
@@ -287,6 +840,19 @@ static int runtest(void)
 	}
 
 	ret = runtest_dar_outside();
+	if (ret)
+		return ret;
+
+	if (dawr && nr_wps > 1) {
+		nprocs = get_nprocs();
+		ret = runtest_multi_dawr();
+		if (ret)
+			return ret;
+	}
+
+	if (dawr && arch_31)
+		ret = runtest_unaligned_512bytes();
+
 	return ret;
 }
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH 5/7] powerpc/selftests/perf-hwbreak: Coalesce event creation code
From: Ravi Bangoria @ 2020-07-23 10:20 UTC (permalink / raw)
  To: mpe, paulus
  Cc: christophe.leroy, ravi.bangoria, mikey, kvm, jniethe5,
	linux-kernel, npiggin, kvm-ppc, linux-kselftest, rogealve,
	pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200723102058.312282-1-ravi.bangoria@linux.ibm.com>

The perf-hwbreak selftest opens hw-breakpoint events in multiple places,
repeating the same setup code each time. Coalesce that code into helper
functions.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 .../selftests/powerpc/ptrace/perf-hwbreak.c   | 78 +++++++++----------
 1 file changed, 38 insertions(+), 40 deletions(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
index c1f324afdbf3..bde475341c8a 100644
--- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -34,28 +34,46 @@
 
 #define DAWR_LENGTH_MAX ((0x3f + 1) * 8)
 
-static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
-				      int cpu, int group_fd,
-				      unsigned long flags)
+static void perf_event_attr_set(struct perf_event_attr *attr,
+				__u32 type, __u64 addr, __u64 len,
+				bool exclude_user)
 {
-	attr->size = sizeof(*attr);
-	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+	memset(attr, 0, sizeof(struct perf_event_attr));
+	attr->type           = PERF_TYPE_BREAKPOINT;
+	attr->size           = sizeof(struct perf_event_attr);
+	attr->bp_type        = type;
+	attr->bp_addr        = addr;
+	attr->bp_len         = len;
+	attr->exclude_kernel = 1;
+	attr->exclude_hv     = 1;
+	attr->exclude_guest  = 1;
+	attr->exclude_user   = exclude_user;
+	attr->disabled       = 1;
 }
 
-static inline bool breakpoint_test(int len)
+static int
+perf_process_event_open_exclude_user(__u32 type, __u64 addr, __u64 len, bool exclude_user)
 {
 	struct perf_event_attr attr;
+
+	perf_event_attr_set(&attr, type, addr, len, exclude_user);
+	return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+}
+
+static int perf_process_event_open(__u32 type, __u64 addr, __u64 len)
+{
+	struct perf_event_attr attr;
+
+	perf_event_attr_set(&attr, type, addr, len, 0);
+	return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+}
+
+static inline bool breakpoint_test(int len)
+{
 	int fd;
 
-	/* setup counters */
-	memset(&attr, 0, sizeof(attr));
-	attr.disabled = 1;
-	attr.type = PERF_TYPE_BREAKPOINT;
-	attr.bp_type = HW_BREAKPOINT_R;
 	/* bp_addr can point anywhere but needs to be aligned */
-	attr.bp_addr = (__u64)(&attr) & 0xfffffffffffff800;
-	attr.bp_len = len;
-	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+	fd = perf_process_event_open(HW_BREAKPOINT_R, (__u64)(&fd) & 0xfffffffffffff800, len);
 	if (fd < 0)
 		return false;
 	close(fd);
@@ -75,7 +93,6 @@ static inline bool dawr_supported(void)
 static int runtestsingle(int readwriteflag, int exclude_user, int arraytest)
 {
 	int i,j;
-	struct perf_event_attr attr;
 	size_t res;
 	unsigned long long breaks, needed;
 	int readint;
@@ -94,19 +111,11 @@ static int runtestsingle(int readwriteflag, int exclude_user, int arraytest)
 	if (arraytest)
 		ptr = &readintalign[0];
 
-	/* setup counters */
-	memset(&attr, 0, sizeof(attr));
-	attr.disabled = 1;
-	attr.type = PERF_TYPE_BREAKPOINT;
-	attr.bp_type = readwriteflag;
-	attr.bp_addr = (__u64)ptr;
-	attr.bp_len = sizeof(int);
-	if (arraytest)
-		attr.bp_len = DAWR_LENGTH_MAX;
-	attr.exclude_user = exclude_user;
-	break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+	break_fd = perf_process_event_open_exclude_user(readwriteflag, (__u64)ptr,
+				arraytest ? DAWR_LENGTH_MAX : sizeof(int),
+				exclude_user);
 	if (break_fd < 0) {
-		perror("sys_perf_event_open");
+		perror("perf_process_event_open_exclude_user");
 		exit(1);
 	}
 
@@ -153,7 +162,6 @@ static int runtest_dar_outside(void)
 	void *target;
 	volatile __u16 temp16;
 	volatile __u64 temp64;
-	struct perf_event_attr attr;
 	int break_fd;
 	unsigned long long breaks;
 	int fail = 0;
@@ -165,21 +173,11 @@ static int runtest_dar_outside(void)
 		exit(EXIT_FAILURE);
 	}
 
-	/* setup counters */
-	memset(&attr, 0, sizeof(attr));
-	attr.disabled = 1;
-	attr.type = PERF_TYPE_BREAKPOINT;
-	attr.exclude_kernel = 1;
-	attr.exclude_hv = 1;
-	attr.exclude_guest = 1;
-	attr.bp_type = HW_BREAKPOINT_RW;
 	/* watch middle half of target array */
-	attr.bp_addr = (__u64)(target + 2);
-	attr.bp_len = 4;
-	break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+	break_fd = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)(target + 2), 4);
 	if (break_fd < 0) {
 		free(target);
-		perror("sys_perf_event_open");
+		perror("perf_process_event_open");
 		exit(EXIT_FAILURE);
 	}
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH 4/7] powerpc/selftests/ptrace-hwbreak: Add testcases for 2nd DAWR
From: Ravi Bangoria @ 2020-07-23 10:20 UTC (permalink / raw)
  To: mpe, paulus
  Cc: christophe.leroy, ravi.bangoria, mikey, kvm, jniethe5,
	linux-kernel, npiggin, kvm-ppc, linux-kselftest, rogealve,
	pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200723102058.312282-1-ravi.bangoria@linux.ibm.com>

Add selftests that exercise multiple active DAWRs through the ptrace interface.

Sample output:
  $ ./ptrace-hwbreak
  ...
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO, len: 6: Ok

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
index fc477dfe86a2..65781f4035c1 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
@@ -185,6 +185,18 @@ static void test_workload(void)
 		big_var[rand() % DAWR_MAX_LEN] = 'a';
 	else
 		cvar = big_var[rand() % DAWR_MAX_LEN];
+
+	/* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */
+	gstruct.a[rand() % A_LEN] = 'a';
+
+	/* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */
+	cvar = gstruct.b[rand() % B_LEN];
+
+	/* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */
+	gstruct.a[rand() % A_LEN] = 'a';
+
+	/* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */
+	cvar = gstruct.a[rand() % A_LEN];
 }
 
 static void check_success(pid_t child_pid, const char *name, const char *type,
@@ -374,6 +386,69 @@ static void test_sethwdebug_range_aligned(pid_t child_pid)
 	ptrace_delhwdebug(child_pid, wh);
 }
 
+static void test_multi_sethwdebug_range(pid_t child_pid)
+{
+	struct ppc_hw_breakpoint info1, info2;
+	unsigned long wp_addr1, wp_addr2;
+	char *name1 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED";
+	char *name2 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED";
+	int len1, len2;
+	int wh1, wh2;
+
+	wp_addr1 = (unsigned long)&gstruct.a;
+	wp_addr2 = (unsigned long)&gstruct.b;
+	len1 = A_LEN;
+	len2 = B_LEN;
+	get_ppc_hw_breakpoint(&info1, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1);
+	get_ppc_hw_breakpoint(&info2, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2);
+
+	/* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */
+	wh1 = ptrace_sethwdebug(child_pid, &info1);
+
+	/* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */
+	wh2 = ptrace_sethwdebug(child_pid, &info2);
+
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success(child_pid, name1, "WO", wp_addr1, len1);
+
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success(child_pid, name2, "RO", wp_addr2, len2);
+
+	ptrace_delhwdebug(child_pid, wh1);
+	ptrace_delhwdebug(child_pid, wh2);
+}
+
+static void test_multi_sethwdebug_range_dawr_overlap(pid_t child_pid)
+{
+	struct ppc_hw_breakpoint info1, info2;
+	unsigned long wp_addr1, wp_addr2;
+	char *name = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap";
+	int len1, len2;
+	int wh1, wh2;
+
+	wp_addr1 = (unsigned long)&gstruct.a;
+	wp_addr2 = (unsigned long)&gstruct.a;
+	len1 = A_LEN;
+	len2 = A_LEN;
+	get_ppc_hw_breakpoint(&info1, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1);
+	get_ppc_hw_breakpoint(&info2, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2);
+
+	/* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */
+	wh1 = ptrace_sethwdebug(child_pid, &info1);
+
+	/* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */
+	wh2 = ptrace_sethwdebug(child_pid, &info2);
+
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success(child_pid, name, "WO", wp_addr1, len1);
+
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success(child_pid, name, "RO", wp_addr2, len2);
+
+	ptrace_delhwdebug(child_pid, wh1);
+	ptrace_delhwdebug(child_pid, wh2);
+}
+
 static void test_sethwdebug_range_unaligned(pid_t child_pid)
 {
 	struct ppc_hw_breakpoint info;
@@ -460,6 +535,10 @@ run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, bool dawr)
 			test_sethwdebug_range_unaligned(child_pid);
 			test_sethwdebug_range_unaligned_dar(child_pid);
 			test_sethwdebug_dawr_max_range(child_pid);
+			if (dbginfo->num_data_bps > 1) {
+				test_multi_sethwdebug_range(child_pid);
+				test_multi_sethwdebug_range_dawr_overlap(child_pid);
+			}
 		}
 	}
 }
-- 
2.26.2


^ permalink raw reply related

* [PATCH 1/7] powerpc/watchpoint/kvm: Rename current DAWR macros and variables
From: Ravi Bangoria @ 2020-07-23 10:20 UTC (permalink / raw)
  To: mpe, paulus
  Cc: christophe.leroy, ravi.bangoria, mikey, kvm, jniethe5,
	linux-kernel, npiggin, kvm-ppc, linux-kselftest, rogealve,
	pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200723102058.312282-1-ravi.bangoria@linux.ibm.com>

Power10 introduces a second DAWR. Use the real register names from the ISA
(with suffix 0) for the existing macros and variables used by KVM.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
 Documentation/virt/kvm/api.rst            |  4 +--
 arch/powerpc/include/asm/kvm_host.h       |  4 +--
 arch/powerpc/include/uapi/asm/kvm.h       |  4 +--
 arch/powerpc/kernel/asm-offsets.c         |  4 +--
 arch/powerpc/kvm/book3s_hv.c              | 32 +++++++++++------------
 arch/powerpc/kvm/book3s_hv_nested.c       |  8 +++---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 20 +++++++-------
 tools/arch/powerpc/include/uapi/asm/kvm.h |  4 +--
 8 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 426f94582b7a..4dc18fe6a2bf 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -2219,8 +2219,8 @@ registers, find a list below:
   PPC     KVM_REG_PPC_BESCR               64
   PPC     KVM_REG_PPC_TAR                 64
   PPC     KVM_REG_PPC_DPDES               64
-  PPC     KVM_REG_PPC_DAWR                64
-  PPC     KVM_REG_PPC_DAWRX               64
+  PPC     KVM_REG_PPC_DAWR0               64
+  PPC     KVM_REG_PPC_DAWRX0              64
   PPC     KVM_REG_PPC_CIABR               64
   PPC     KVM_REG_PPC_IC                  64
   PPC     KVM_REG_PPC_VTB                 64
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7e2d061d0445..9aa3854f0e1e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -582,8 +582,8 @@ struct kvm_vcpu_arch {
 	u32 ctrl;
 	u32 dabrx;
 	ulong dabr;
-	ulong dawr;
-	ulong dawrx;
+	ulong dawr0;
+	ulong dawrx0;
 	ulong ciabr;
 	ulong cfar;
 	ulong ppr;
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 264e266a85bf..38d61b73f5ed 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -608,8 +608,8 @@ struct kvm_ppc_cpu_char {
 #define KVM_REG_PPC_BESCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7)
 #define KVM_REG_PPC_TAR		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8)
 #define KVM_REG_PPC_DPDES	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9)
-#define KVM_REG_PPC_DAWR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
-#define KVM_REG_PPC_DAWRX	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
+#define KVM_REG_PPC_DAWR0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
+#define KVM_REG_PPC_DAWRX0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
 #define KVM_REG_PPC_CIABR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac)
 #define KVM_REG_PPC_IC		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad)
 #define KVM_REG_PPC_VTB		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6657dc6b2336..e76bffe348e1 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -547,8 +547,8 @@ int main(void)
 	OFFSET(VCPU_CTRL, kvm_vcpu, arch.ctrl);
 	OFFSET(VCPU_DABR, kvm_vcpu, arch.dabr);
 	OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx);
-	OFFSET(VCPU_DAWR, kvm_vcpu, arch.dawr);
-	OFFSET(VCPU_DAWRX, kvm_vcpu, arch.dawrx);
+	OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0);
+	OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0);
 	OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr);
 	OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags);
 	OFFSET(VCPU_DEC, kvm_vcpu, arch.dec);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 89afcc5f60ca..28200e4f5d27 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -778,8 +778,8 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
 			return H_UNSUPPORTED_FLAG_START;
 		if (value2 & DABRX_HYP)
 			return H_P4;
-		vcpu->arch.dawr  = value1;
-		vcpu->arch.dawrx = value2;
+		vcpu->arch.dawr0  = value1;
+		vcpu->arch.dawrx0 = value2;
 		return H_SUCCESS;
 	case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
 		/* KVM does not support mflags=2 (AIL=2) */
@@ -1724,11 +1724,11 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_VTB:
 		*val = get_reg_val(id, vcpu->arch.vcore->vtb);
 		break;
-	case KVM_REG_PPC_DAWR:
-		*val = get_reg_val(id, vcpu->arch.dawr);
+	case KVM_REG_PPC_DAWR0:
+		*val = get_reg_val(id, vcpu->arch.dawr0);
 		break;
-	case KVM_REG_PPC_DAWRX:
-		*val = get_reg_val(id, vcpu->arch.dawrx);
+	case KVM_REG_PPC_DAWRX0:
+		*val = get_reg_val(id, vcpu->arch.dawrx0);
 		break;
 	case KVM_REG_PPC_CIABR:
 		*val = get_reg_val(id, vcpu->arch.ciabr);
@@ -1938,11 +1938,11 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_VTB:
 		vcpu->arch.vcore->vtb = set_reg_val(id, *val);
 		break;
-	case KVM_REG_PPC_DAWR:
-		vcpu->arch.dawr = set_reg_val(id, *val);
+	case KVM_REG_PPC_DAWR0:
+		vcpu->arch.dawr0 = set_reg_val(id, *val);
 		break;
-	case KVM_REG_PPC_DAWRX:
-		vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
+	case KVM_REG_PPC_DAWRX0:
+		vcpu->arch.dawrx0 = set_reg_val(id, *val) & ~DAWRX_HYP;
 		break;
 	case KVM_REG_PPC_CIABR:
 		vcpu->arch.ciabr = set_reg_val(id, *val);
@@ -3397,8 +3397,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 	int trap;
 	unsigned long host_hfscr = mfspr(SPRN_HFSCR);
 	unsigned long host_ciabr = mfspr(SPRN_CIABR);
-	unsigned long host_dawr = mfspr(SPRN_DAWR0);
-	unsigned long host_dawrx = mfspr(SPRN_DAWRX0);
+	unsigned long host_dawr0 = mfspr(SPRN_DAWR0);
+	unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0);
 	unsigned long host_psscr = mfspr(SPRN_PSSCR);
 	unsigned long host_pidr = mfspr(SPRN_PID);
 
@@ -3427,8 +3427,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 	mtspr(SPRN_SPURR, vcpu->arch.spurr);
 
 	if (dawr_enabled()) {
-		mtspr(SPRN_DAWR0, vcpu->arch.dawr);
-		mtspr(SPRN_DAWRX0, vcpu->arch.dawrx);
+		mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
+		mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
 	}
 	mtspr(SPRN_CIABR, vcpu->arch.ciabr);
 	mtspr(SPRN_IC, vcpu->arch.ic);
@@ -3480,8 +3480,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 	      (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
 	mtspr(SPRN_HFSCR, host_hfscr);
 	mtspr(SPRN_CIABR, host_ciabr);
-	mtspr(SPRN_DAWR0, host_dawr);
-	mtspr(SPRN_DAWRX0, host_dawrx);
+	mtspr(SPRN_DAWR0, host_dawr0);
+	mtspr(SPRN_DAWRX0, host_dawrx0);
 	mtspr(SPRN_PID, host_pidr);
 
 	/*
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 2c849a65db77..629f74edab22 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -33,8 +33,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
 	hr->dpdes = vc->dpdes;
 	hr->hfscr = vcpu->arch.hfscr;
 	hr->tb_offset = vc->tb_offset;
-	hr->dawr0 = vcpu->arch.dawr;
-	hr->dawrx0 = vcpu->arch.dawrx;
+	hr->dawr0 = vcpu->arch.dawr0;
+	hr->dawrx0 = vcpu->arch.dawrx0;
 	hr->ciabr = vcpu->arch.ciabr;
 	hr->purr = vcpu->arch.purr;
 	hr->spurr = vcpu->arch.spurr;
@@ -151,8 +151,8 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
 	vc->pcr = hr->pcr | PCR_MASK;
 	vc->dpdes = hr->dpdes;
 	vcpu->arch.hfscr = hr->hfscr;
-	vcpu->arch.dawr = hr->dawr0;
-	vcpu->arch.dawrx = hr->dawrx0;
+	vcpu->arch.dawr0 = hr->dawr0;
+	vcpu->arch.dawrx0 = hr->dawrx0;
 	vcpu->arch.ciabr = hr->ciabr;
 	vcpu->arch.purr = hr->purr;
 	vcpu->arch.spurr = hr->spurr;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 71943892c81c..e562a9acbc2a 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -52,8 +52,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define STACK_SLOT_PID		(SFS-32)
 #define STACK_SLOT_IAMR		(SFS-40)
 #define STACK_SLOT_CIABR	(SFS-48)
-#define STACK_SLOT_DAWR		(SFS-56)
-#define STACK_SLOT_DAWRX	(SFS-64)
+#define STACK_SLOT_DAWR0	(SFS-56)
+#define STACK_SLOT_DAWRX0	(SFS-64)
 #define STACK_SLOT_HFSCR	(SFS-72)
 #define STACK_SLOT_AMR		(SFS-80)
 #define STACK_SLOT_UAMOR	(SFS-88)
@@ -711,8 +711,8 @@ BEGIN_FTR_SECTION
 	mfspr	r7, SPRN_DAWRX0
 	mfspr	r8, SPRN_IAMR
 	std	r5, STACK_SLOT_CIABR(r1)
-	std	r6, STACK_SLOT_DAWR(r1)
-	std	r7, STACK_SLOT_DAWRX(r1)
+	std	r6, STACK_SLOT_DAWR0(r1)
+	std	r7, STACK_SLOT_DAWRX0(r1)
 	std	r8, STACK_SLOT_IAMR(r1)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
@@ -801,8 +801,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	lbz	r5, 0(r5)
 	cmpdi	r5, 0
 	beq	1f
-	ld	r5, VCPU_DAWR(r4)
-	ld	r6, VCPU_DAWRX(r4)
+	ld	r5, VCPU_DAWR0(r4)
+	ld	r6, VCPU_DAWRX0(r4)
 	mtspr	SPRN_DAWR0, r5
 	mtspr	SPRN_DAWRX0, r6
 1:
@@ -1759,8 +1759,8 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 	/* Restore host values of some registers */
 BEGIN_FTR_SECTION
 	ld	r5, STACK_SLOT_CIABR(r1)
-	ld	r6, STACK_SLOT_DAWR(r1)
-	ld	r7, STACK_SLOT_DAWRX(r1)
+	ld	r6, STACK_SLOT_DAWR0(r1)
+	ld	r7, STACK_SLOT_DAWRX0(r1)
 	mtspr	SPRN_CIABR, r5
 	/*
 	 * If the DAWR doesn't work, it's ok to write these here as
@@ -2566,8 +2566,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	rlwimi	r5, r4, 5, DAWRX_DR | DAWRX_DW
 	rlwimi	r5, r4, 2, DAWRX_WT
 	clrrdi	r4, r4, 3
-	std	r4, VCPU_DAWR(r3)
-	std	r5, VCPU_DAWRX(r3)
+	std	r4, VCPU_DAWR0(r3)
+	std	r5, VCPU_DAWRX0(r3)
 	/*
 	 * If came in through the real mode hcall handler then it is necessary
 	 * to write the registers since the return path won't. Otherwise it is
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index 264e266a85bf..38d61b73f5ed 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -608,8 +608,8 @@ struct kvm_ppc_cpu_char {
 #define KVM_REG_PPC_BESCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa7)
 #define KVM_REG_PPC_TAR		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa8)
 #define KVM_REG_PPC_DPDES	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa9)
-#define KVM_REG_PPC_DAWR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
-#define KVM_REG_PPC_DAWRX	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
+#define KVM_REG_PPC_DAWR0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xaa)
+#define KVM_REG_PPC_DAWRX0	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xab)
 #define KVM_REG_PPC_CIABR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xac)
 #define KVM_REG_PPC_IC		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xad)
 #define KVM_REG_PPC_VTB		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xae)
-- 
2.26.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox