LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH] mm/migrate.c: Remove MIGRATE_PFN_LOCKED
From: Felix Kuehling @ 2021-10-28 15:33 UTC (permalink / raw)
  To: Alistair Popple, akpm
  Cc: rcampbell, amd-gfx, nouveau, linux-kernel, dri-devel, linux-mm,
	jglisse, kvm-ppc, ziy, jhubbard, alexander.deucher, linuxppc-dev,
	hch, bskeggs
In-Reply-To: <2096706.TdNOD7Y7u4@nvdebian>

Am 2021-10-27 um 9:42 p.m. schrieb Alistair Popple:
> On Wednesday, 27 October 2021 3:09:57 AM AEDT Felix Kuehling wrote:
>> Am 2021-10-25 um 12:16 a.m. schrieb Alistair Popple:
>>> MIGRATE_PFN_LOCKED is used to indicate to migrate_vma_prepare() that a
>>> source page was already locked during migrate_vma_collect(). If it
>>> wasn't then the a second attempt is made to lock the page. However if
>>> the first attempt failed it's unlikely a second attempt will succeed,
>>> and the retry adds complexity. So clean this up by removing the retry
>>> and MIGRATE_PFN_LOCKED flag.
>>>
>>> Destination pages are also meant to have the MIGRATE_PFN_LOCKED flag
>>> set, but nothing actually checks that.
>>>
>>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> It makes sense to me. Do you have any empirical data on how much more
>> likely migrations are going to fail with this change due to contested
>> page locks?
> Thanks Felix. I do not have any empirical data on this but I've mostly seen
> migrations fail due to the reference count check failing rather than failure to
> lock the page. Even then it's mostly been due to thrashing on the same page, so
> I would be surprised if this change made any noticeable difference.

We have seen more page locking contention on NUMA systems that disappear
when we disable NUMA balancing. Probably NUMA balancing migrations
result in the page lock being more contended, which can cause HMM
migration of some pages to fail.

Also, for migrations to system memory, multiple threads page faulting
concurrently can cause contention. I was just helping debug such an
issue. Having migrations to system memory be only partially successful
is problematic. We'll probably have to implement some retry logic in the
driver to handle this.

Regards,
  Felix


>
>> Either way, the patch is
>>
>> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>
>>
>>> ---
>>>  Documentation/vm/hmm.rst                 |   2 +-
>>>  arch/powerpc/kvm/book3s_hv_uvmem.c       |   4 +-
>>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |   2 -
>>>  drivers/gpu/drm/nouveau/nouveau_dmem.c   |   4 +-
>>>  include/linux/migrate.h                  |   1 -
>>>  lib/test_hmm.c                           |   5 +-
>>>  mm/migrate.c                             | 145 +++++------------------
>>>  7 files changed, 35 insertions(+), 128 deletions(-)
>>>
>>> diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
>>> index a14c2938e7af..f2a59ed82ed3 100644
>>> --- a/Documentation/vm/hmm.rst
>>> +++ b/Documentation/vm/hmm.rst
>>> @@ -360,7 +360,7 @@ between device driver specific code and shared common code:
>>>     system memory page, locks the page with ``lock_page()``, and fills in the
>>>     ``dst`` array entry with::
>>>  
>>> -     dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
>>> +     dst[i] = migrate_pfn(page_to_pfn(dpage));
>>>  
>>>     Now that the driver knows that this page is being migrated, it can
>>>     invalidate device private MMU mappings and copy device private memory
>>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> index a7061ee3b157..28c436df9935 100644
>>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
>>> @@ -560,7 +560,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
>>>  				  gpa, 0, page_shift);
>>>  
>>>  	if (ret == U_SUCCESS)
>>> -		*mig.dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
>>> +		*mig.dst = migrate_pfn(pfn);
>>>  	else {
>>>  		unlock_page(dpage);
>>>  		__free_page(dpage);
>>> @@ -774,7 +774,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
>>>  		}
>>>  	}
>>>  
>>> -	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
>>> +	*mig.dst = migrate_pfn(page_to_pfn(dpage));
>>>  	migrate_vma_pages(&mig);
>>>  out_finalize:
>>>  	migrate_vma_finalize(&mig);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>>> index 4a16e3c257b9..41d9417f182b 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
>>> @@ -300,7 +300,6 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
>>>  			migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
>>>  			svm_migrate_get_vram_page(prange, migrate->dst[i]);
>>>  			migrate->dst[i] = migrate_pfn(migrate->dst[i]);
>>> -			migrate->dst[i] |= MIGRATE_PFN_LOCKED;
>>>  			src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE,
>>>  					      DMA_TO_DEVICE);
>>>  			r = dma_mapping_error(dev, src[i]);
>>> @@ -580,7 +579,6 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
>>>  			      dst[i] >> PAGE_SHIFT, page_to_pfn(dpage));
>>>  
>>>  		migrate->dst[i] = migrate_pfn(page_to_pfn(dpage));
>>> -		migrate->dst[i] |= MIGRATE_PFN_LOCKED;
>>>  		j++;
>>>  	}
>>>  
>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
>>> index 92987daa5e17..3828aafd3ac4 100644
>>> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
>>> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
>>> @@ -166,7 +166,7 @@ static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
>>>  		goto error_dma_unmap;
>>>  	mutex_unlock(&svmm->mutex);
>>>  
>>> -	args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
>>> +	args->dst[0] = migrate_pfn(page_to_pfn(dpage));
>>>  	return 0;
>>>  
>>>  error_dma_unmap:
>>> @@ -602,7 +602,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
>>>  		((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT);
>>>  	if (src & MIGRATE_PFN_WRITE)
>>>  		*pfn |= NVIF_VMM_PFNMAP_V0_W;
>>> -	return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
>>> +	return migrate_pfn(page_to_pfn(dpage));
>>>  
>>>  out_dma_unmap:
>>>  	dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>>> index c8077e936691..479b861ae490 100644
>>> --- a/include/linux/migrate.h
>>> +++ b/include/linux/migrate.h
>>> @@ -119,7 +119,6 @@ static inline int migrate_misplaced_page(struct page *page,
>>>   */
>>>  #define MIGRATE_PFN_VALID	(1UL << 0)
>>>  #define MIGRATE_PFN_MIGRATE	(1UL << 1)
>>> -#define MIGRATE_PFN_LOCKED	(1UL << 2)
>>>  #define MIGRATE_PFN_WRITE	(1UL << 3)
>>>  #define MIGRATE_PFN_SHIFT	6
>>>  
>>> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
>>> index c259842f6d44..e2ce8f9b7605 100644
>>> --- a/lib/test_hmm.c
>>> +++ b/lib/test_hmm.c
>>> @@ -613,8 +613,7 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
>>>  		 */
>>>  		rpage->zone_device_data = dmirror;
>>>  
>>> -		*dst = migrate_pfn(page_to_pfn(dpage)) |
>>> -			    MIGRATE_PFN_LOCKED;
>>> +		*dst = migrate_pfn(page_to_pfn(dpage));
>>>  		if ((*src & MIGRATE_PFN_WRITE) ||
>>>  		    (!spage && args->vma->vm_flags & VM_WRITE))
>>>  			*dst |= MIGRATE_PFN_WRITE;
>>> @@ -1137,7 +1136,7 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
>>>  		lock_page(dpage);
>>>  		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
>>>  		copy_highpage(dpage, spage);
>>> -		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
>>> +		*dst = migrate_pfn(page_to_pfn(dpage));
>>>  		if (*src & MIGRATE_PFN_WRITE)
>>>  			*dst |= MIGRATE_PFN_WRITE;
>>>  	}
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index a6a7743ee98f..915e969811d0 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -2369,7 +2369,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>>  		 * can't be dropped from it).
>>>  		 */
>>>  		get_page(page);
>>> -		migrate->cpages++;
>>>  
>>>  		/*
>>>  		 * Optimize for the common case where page is only mapped once
>>> @@ -2379,7 +2378,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>>  		if (trylock_page(page)) {
>>>  			pte_t swp_pte;
>>>  
>>> -			mpfn |= MIGRATE_PFN_LOCKED;
>>> +			migrate->cpages++;
>>>  			ptep_get_and_clear(mm, addr, ptep);
>>>  
>>>  			/* Setup special migration page table entry */
>>> @@ -2413,6 +2412,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>>  
>>>  			if (pte_present(pte))
>>>  				unmapped++;
>>> +		} else {
>>> +			put_page(page);
>>> +			mpfn = 0;
>>>  		}
>>>  
>>>  next:
>>> @@ -2517,15 +2519,17 @@ static bool migrate_vma_check_page(struct page *page)
>>>  }
>>>  
>>>  /*
>>> - * migrate_vma_prepare() - lock pages and isolate them from the lru
>>> + * migrate_vma_unmap() - replace page mapping with special migration pte entry
>>>   * @migrate: migrate struct containing all migration information
>>>   *
>>> - * This locks pages that have been collected by migrate_vma_collect(). Once each
>>> - * page is locked it is isolated from the lru (for non-device pages). Finally,
>>> - * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
>>> - * migrated by concurrent kernel threads.
>>> + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
>>> + * special migration pte entry and check if it has been pinned. Pinned pages are
>>> + * restored because we cannot migrate them.
>>> + *
>>> + * This is the last step before we call the device driver callback to allocate
>>> + * destination memory and copy contents of original page over to new page.
>>>   */
>>> -static void migrate_vma_prepare(struct migrate_vma *migrate)
>>> +static void migrate_vma_unmap(struct migrate_vma *migrate)
>>>  {
>>>  	const unsigned long npages = migrate->npages;
>>>  	const unsigned long start = migrate->start;
>>> @@ -2534,32 +2538,12 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
>>>  
>>>  	lru_add_drain();
>>>  
>>> -	for (i = 0; (i < npages) && migrate->cpages; i++) {
>>> +	for (i = 0; i < npages; i++) {
>>>  		struct page *page = migrate_pfn_to_page(migrate->src[i]);
>>> -		bool remap = true;
>>>  
>>>  		if (!page)
>>>  			continue;
>>>  
>>> -		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
>>> -			/*
>>> -			 * Because we are migrating several pages there can be
>>> -			 * a deadlock between 2 concurrent migration where each
>>> -			 * are waiting on each other page lock.
>>> -			 *
>>> -			 * Make migrate_vma() a best effort thing and backoff
>>> -			 * for any page we can not lock right away.
>>> -			 */
>>> -			if (!trylock_page(page)) {
>>> -				migrate->src[i] = 0;
>>> -				migrate->cpages--;
>>> -				put_page(page);
>>> -				continue;
>>> -			}
>>> -			remap = false;
>>> -			migrate->src[i] |= MIGRATE_PFN_LOCKED;
>>> -		}
>>> -
>>>  		/* ZONE_DEVICE pages are not on LRU */
>>>  		if (!is_zone_device_page(page)) {
>>>  			if (!PageLRU(page) && allow_drain) {
>>> @@ -2569,16 +2553,9 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
>>>  			}
>>>  
>>>  			if (isolate_lru_page(page)) {
>>> -				if (remap) {
>>> -					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>> -					migrate->cpages--;
>>> -					restore++;
>>> -				} else {
>>> -					migrate->src[i] = 0;
>>> -					unlock_page(page);
>>> -					migrate->cpages--;
>>> -					put_page(page);
>>> -				}
>>> +				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>> +				migrate->cpages--;
>>> +				restore++;
>>>  				continue;
>>>  			}
>>>  
>>> @@ -2586,80 +2563,20 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
>>>  			put_page(page);
>>>  		}
>>>  
>>> -		if (!migrate_vma_check_page(page)) {
>>> -			if (remap) {
>>> -				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>> -				migrate->cpages--;
>>> -				restore++;
>>> -
>>> -				if (!is_zone_device_page(page)) {
>>> -					get_page(page);
>>> -					putback_lru_page(page);
>>> -				}
>>> -			} else {
>>> -				migrate->src[i] = 0;
>>> -				unlock_page(page);
>>> -				migrate->cpages--;
>>> +		if (page_mapped(page))
>>> +			try_to_migrate(page, 0);
>>>  
>>> -				if (!is_zone_device_page(page))
>>> -					putback_lru_page(page);
>>> -				else
>>> -					put_page(page);
>>> +		if (page_mapped(page) || !migrate_vma_check_page(page)) {
>>> +			if (!is_zone_device_page(page)) {
>>> +				get_page(page);
>>> +				putback_lru_page(page);
>>>  			}
>>> -		}
>>> -	}
>>> -
>>> -	for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
>>> -		struct page *page = migrate_pfn_to_page(migrate->src[i]);
>>> -
>>> -		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
>>> -			continue;
>>>  
>>> -		remove_migration_pte(page, migrate->vma, addr, page);
>>> -
>>> -		migrate->src[i] = 0;
>>> -		unlock_page(page);
>>> -		put_page(page);
>>> -		restore--;
>>> -	}
>>> -}
>>> -
>>> -/*
>>> - * migrate_vma_unmap() - replace page mapping with special migration pte entry
>>> - * @migrate: migrate struct containing all migration information
>>> - *
>>> - * Replace page mapping (CPU page table pte) with a special migration pte entry
>>> - * and check again if it has been pinned. Pinned pages are restored because we
>>> - * cannot migrate them.
>>> - *
>>> - * This is the last step before we call the device driver callback to allocate
>>> - * destination memory and copy contents of original page over to new page.
>>> - */
>>> -static void migrate_vma_unmap(struct migrate_vma *migrate)
>>> -{
>>> -	const unsigned long npages = migrate->npages;
>>> -	const unsigned long start = migrate->start;
>>> -	unsigned long addr, i, restore = 0;
>>> -
>>> -	for (i = 0; i < npages; i++) {
>>> -		struct page *page = migrate_pfn_to_page(migrate->src[i]);
>>> -
>>> -		if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
>>> +			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>> +			migrate->cpages--;
>>> +			restore++;
>>>  			continue;
>>> -
>>> -		if (page_mapped(page)) {
>>> -			try_to_migrate(page, 0);
>>> -			if (page_mapped(page))
>>> -				goto restore;
>>>  		}
>>> -
>>> -		if (migrate_vma_check_page(page))
>>> -			continue;
>>> -
>>> -restore:
>>> -		migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>> -		migrate->cpages--;
>>> -		restore++;
>>>  	}
>>>  
>>>  	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
>>> @@ -2672,12 +2589,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
>>>  
>>>  		migrate->src[i] = 0;
>>>  		unlock_page(page);
>>> +		put_page(page);
>>>  		restore--;
>>> -
>>> -		if (is_zone_device_page(page))
>>> -			put_page(page);
>>> -		else
>>> -			putback_lru_page(page);
>>>  	}
>>>  }
>>>  
>>> @@ -2700,8 +2613,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
>>>   * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
>>>   * flag set).  Once these are allocated and copied, the caller must update each
>>>   * corresponding entry in the dst array with the pfn value of the destination
>>> - * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
>>> - * (destination pages must have their struct pages locked, via lock_page()).
>>> + * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
>>> + * lock_page().
>>>   *
>>>   * Note that the caller does not have to migrate all the pages that are marked
>>>   * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
>>> @@ -2770,8 +2683,6 @@ int migrate_vma_setup(struct migrate_vma *args)
>>>  
>>>  	migrate_vma_collect(args);
>>>  
>>> -	if (args->cpages)
>>> -		migrate_vma_prepare(args);
>>>  	if (args->cpages)
>>>  		migrate_vma_unmap(args);
>>>  
>
>

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/watchdog: prevent printk and send IPI while holding the wd lock
From: Laurent Dufour @ 2021-10-28 15:45 UTC (permalink / raw)
  To: Nicholas Piggin, benh, mpe, paulus; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1635327848.ktks46hzts.astroid@bobo.none>

Le 27/10/2021 à 11:49, Nicholas Piggin a écrit :
> Excerpts from Nicholas Piggin's message of October 27, 2021 1:29 pm:
>> Excerpts from Laurent Dufour's message of October 27, 2021 2:27 am:
>>> When handling the Watchdog interrupt, long processing should not be done
>>> while holding the __wd_smp_lock. This prevents the other CPUs to grab it
>>> and to process Watchdog timer interrupts. Furhtermore, this could lead to
>>> the following situation:
>>>
>>> CPU x detect lockup on CPU y and grab the __wd_smp_lock
>>>        in watchdog_smp_panic()
>>> CPU y caught the watchdog interrupt and try to grab the __wd_smp_lock
>>>        in soft_nmi_interrupt()
>>> CPU x wait for CPU y to catch the IPI for 1s in __smp_send_nmi_ipi()
>>
>> CPU y should get the IPI here if it's a NMI IPI (which will be true for
>>> = POWER9 64s).
>>
>> That said, not all platforms support it and the console lock problem
>> seems real, so okay.
>>
>>> CPU x will timeout and so has spent 1s waiting while holding the
>>>        __wd_smp_lock.
>>>
>>> A deadlock may also happen between the __wd_smp_lock and the console_owner
>>> 'lock' this way:
>>> CPU x grab the console_owner
>>> CPU y grab the __wd_smp_lock
>>> CPU x catch the watchdog timer interrupt and needs to grab __wd_smp_lock
>>> CPU y wants to print something and wait for console_owner
>>> -> deadlock
>>>
>>> Doing all the long processing without holding the _wd_smp_lock prevents
>>> these situations.
>>
>> The intention was to avoid logs getting garbled e.g., if multiple
>> different CPUs fire at once.
>>
>> I wonder if instead we could deal with that by protecting the IPI
>> sending and printing stuff with a trylock, and if you don't get the
>> trylock then just return, and you'll come back with the next timer
>> interrupt.
> 
> Something like this (untested) should basically hold off concurrency on
> watchdog panics. It does not serialize output from IPI targets but it
> should prevent multiple CPUs trying to send IPIs at once, without
> holding the lock.

Got it, I'll work this way, despite the minor comments below.

> 
> ---
> 
> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
> index 2ffeb3f8b5a7..3a0db577da56 100644
> --- a/arch/powerpc/kernel/watchdog.c
> +++ b/arch/powerpc/kernel/watchdog.c
> @@ -85,6 +85,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
>   
>   /* SMP checker bits */
>   static unsigned long __wd_smp_lock;
> +static unsigned long __wd_printing;
>   static cpumask_t wd_smp_cpus_pending;
>   static cpumask_t wd_smp_cpus_stuck;
>   static u64 wd_smp_last_reset_tb;
> @@ -131,10 +132,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
>   	/* Do not panic from here because that can recurse into NMI IPI layer */
>   }
>   
> -static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
> +static void set_cpu_stuck(int cpu, u64 tb)
>   {
> -	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
> -	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
> +	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
> +	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
>   	if (cpumask_empty(&wd_smp_cpus_pending)) {
>   		wd_smp_last_reset_tb = tb;
>   		cpumask_andnot(&wd_smp_cpus_pending,
> @@ -142,10 +143,6 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
>   				&wd_smp_cpus_stuck);
>   	}
>   }
> -static void set_cpu_stuck(int cpu, u64 tb)
> -{
> -	set_cpumask_stuck(cpumask_of(cpu), tb);
> -}
>   
>   static void watchdog_smp_panic(int cpu, u64 tb)
>   {
> @@ -160,6 +157,10 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   		goto out;
>   	if (cpumask_weight(&wd_smp_cpus_pending) == 0)
>   		goto out;
> +	if (__wd_printing)
> +		goto out;
> +	__wd_printing = 1;
> +	wd_smp_unlock(&flags);
>   
>   	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
>   		 cpu, cpumask_pr_args(&wd_smp_cpus_pending));
> @@ -172,24 +173,31 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   		 * Try to trigger the stuck CPUs, unless we are going to
>   		 * get a backtrace on all of them anyway.
>   		 */
> -		for_each_cpu(c, &wd_smp_cpus_pending) {
> +		for_each_online_cpu(c) {
>   			if (c == cpu)
>   				continue;
> +			if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
> +				continue;
> +			wd_smp_lock(&flags);
> +			if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
> +				wd_smp_unlock(&flags);
> +				continue;
> +			}
> +			/* Take the stuck CPU out of the watch group */
> +			set_cpu_stuck(cpu, tb);
> +			wd_smp_unlock(&flags);
> +
>   			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
>   		}
>   	}
>   
> -	/* Take the stuck CPUs out of the watch group */
> -	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
> -
> -	wd_smp_unlock(&flags);
> -
>   	if (sysctl_hardlockup_all_cpu_backtrace)
>   		trigger_allbutself_cpu_backtrace();
>   
>   	if (hardlockup_panic)
>   		nmi_panic(NULL, "Hard LOCKUP");
>   
> +	__wd_printing = 0;

I think WRITE_ONCE() is required here, to prevent any compiler trick.
Also, I think it might be safer (I don't have clear example in mind, just a bad 
feeling) to do so before the check of hardlockup_panic and the call to panic.

>   	return;
>   
>   out:
> @@ -204,8 +212,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>   		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
>   			struct pt_regs *regs = get_irq_regs();
>   
> -			wd_smp_lock(&flags);
> -
>   			pr_emerg("CPU %d became unstuck TB:%lld\n",
>   				 cpu, tb);
>   			print_irqtrace_events(current);
> @@ -214,6 +220,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>   			else
>   				dump_stack();
>   
> +			wd_smp_lock(&flags);
>   			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
>   			wd_smp_unlock(&flags);
>   		}
> @@ -263,8 +270,16 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>   			wd_smp_unlock(&flags);
>   			return 0;
>   		}
> +		if (__wd_printing) {
> +			wd_smp_unlock(&flags);
> +			return 0;

Here the CPU is not flagged as stuck and nothing is printed. We have to wait for 
the next soft_nmi_interrupt() to happen for this CPU to detect itself stuck and 
provide the backstack. Depending on the number of CPU and the stress on the 
system, this CPU may net see __wd_printing=0 when entering soft_nmi_interrupt() 
until a long period of time.
I think we should introduce a per CPU counter, when that counter reach a limit 
(let's say 10), the CPU is not taking care of __wd_printing and continue it's 
processing.

> +		} > +		__wd_printing = 1;
> +
>   		set_cpu_stuck(cpu, tb);
>   
> +		wd_smp_unlock(&flags);
> +
>   		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
>   			 cpu, (void *)regs->nip);
>   		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
> @@ -274,13 +289,13 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>   		print_irqtrace_events(current);
>   		show_regs(regs);
>   
> -		wd_smp_unlock(&flags);
> -
>   		if (sysctl_hardlockup_all_cpu_backtrace)
>   			trigger_allbutself_cpu_backtrace();
>   
>   		if (hardlockup_panic)
>   			nmi_panic(regs, "Hard LOCKUP");
> +
> +		__wd_printing = 0;

I think WRITE_ONCE() is required here, to prevent any compiler trick.

>   	}
>   	if (wd_panic_timeout_tb < 0x7fffffff)
>   		mtspr(SPRN_DEC, wd_panic_timeout_tb);
> 


^ permalink raw reply

* [PATCH v2] powerpc/powernv/prd: Unregister OPAL_MSG_PRD2 notifier during module unload
From: Vasant Hegde @ 2021-10-28 16:57 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Vasant Hegde, dja

Commit 587164cd, introduced new opal message type (OPAL_MSG_PRD2) and added
opal notifier. But I missed to unregister the notifier during module unload
path. This results in below call trace if you try to unload and load
opal_prd module.

Also add new notifier_block for OPAL_MSG_PRD2 message.

Sample calltrace (modprobe -r opal_prd; modprobe opal_prd)
  [  213.335261] BUG: Unable to handle kernel data access on read at 0xc0080000192200e0
  [  213.335287] Faulting instruction address: 0xc00000000018d1cc
  [  213.335301] Oops: Kernel access of bad area, sig: 11 [#1]
  [  213.335313] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
  [  213.335736] CPU: 66 PID: 7446 Comm: modprobe Kdump: loaded Tainted: G            E     5.14.0prd #759
  [  213.335772] NIP:  c00000000018d1cc LR: c00000000018d2a8 CTR: c0000000000cde10
  [  213.335805] REGS: c0000003c4c0f0a0 TRAP: 0300   Tainted: G            E      (5.14.0prd)
  [  213.335848] MSR:  9000000002009033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE>  CR: 24224824  XER: 20040000
  [  213.335893] CFAR: c00000000018d2a4 DAR: c0080000192200e0 DSISR: 40000000 IRQMASK: 1
  [  213.335893] GPR00: c00000000018d2a8 c0000003c4c0f340 c000000001995300 c000000001a5ad08
  [  213.335893] GPR04: c00800000e3700d0 c0000003c4c0f434 c0000000010a8c08 6e616d6500000000
  [  213.335893] GPR08: 0000000000000000 c0080000192200d0 0000000000000001 c00800000e351020
  [  213.335893] GPR12: c0000000000cde10 c000000ffffecb80 c0000003c4c0fd00 0000000000000000
  [  213.335893] GPR16: 0000000000000990 c00800000d950000 c00800000d950990 c00000000103fd10
  [  213.335893] GPR20: c0000003c4c0fbc0 0000000000000001 c0000003c4c0fbc0 c00800000e370498
  [  213.335893] GPR24: 0000000000000000 c000000000dab5c8 c000000001a5ad18 c000000001a5ac80
  [  213.335893] GPR28: 0000000000000008 0000000000000001 c000000001a5ad00 c000000001a5ad00
  [  213.336170] NIP [c00000000018d1cc] notifier_chain_register+0x2c/0xc0
  [  213.336205] LR [c00000000018d2a8] atomic_notifier_chain_register+0x48/0x80
  [  213.336238] Call Trace:
  [  213.336255] [c0000003c4c0f340] [c000000002090610] 0xc000000002090610 (unreliable)
  [  213.336281] [c0000003c4c0f3a0] [c00000000018d2b8] atomic_notifier_chain_register+0x58/0x80
  [  213.336309] [c0000003c4c0f3f0] [c0000000000cde8c] opal_message_notifier_register+0x7c/0x1e0
  [  213.336345] [c0000003c4c0f4b0] [c00800000e3508ac] opal_prd_probe+0x84/0x150 [opal_prd]
  [  213.336382] [c0000003c4c0f530] [c00000000097acc8] platform_probe+0x78/0x130
  [  213.336416] [c0000003c4c0f5b0] [c000000000976520] really_probe+0x110/0x5d0
  [  213.336467] [c0000003c4c0f630] [c000000000976b5c] __driver_probe_device+0x17c/0x230
  [  213.336512] [c0000003c4c0f6b0] [c000000000976c70] driver_probe_device+0x60/0x130
  [  213.336556] [c0000003c4c0f700] [c00000000097746c] __driver_attach+0xfc/0x220
  [  213.336592] [c0000003c4c0f780] [c000000000972e68] bus_for_each_dev+0xa8/0x130
  [  213.336627] [c0000003c4c0f7e0] [c000000000975b04] driver_attach+0x34/0x50
  [  213.336661] [c0000003c4c0f800] [c000000000974e20] bus_add_driver+0x1b0/0x300
  [  213.336696] [c0000003c4c0f890] [c000000000978468] driver_register+0x98/0x1a0
  [  213.336732] [c0000003c4c0f900] [c00000000097a878] __platform_driver_register+0x38/0x50
  [  213.336768] [c0000003c4c0f920] [c00800000e350dc0] opal_prd_driver_init+0x34/0x50 [opal_prd]
  [  213.336804] [c0000003c4c0f940] [c000000000012410] do_one_initcall+0x60/0x2d0
  [  213.336821] [c0000003c4c0fa10] [c000000000262c8c] do_init_module+0x7c/0x320
  [  213.336845] [c0000003c4c0fa90] [c000000000266544] load_module+0x3394/0x3650
  [  213.336880] [c0000003c4c0fc90] [c000000000266b34] __do_sys_finit_module+0xd4/0x160
  [  213.336914] [c0000003c4c0fdb0] [c000000000031e00] system_call_exception+0x140/0x290
  [  213.336958] [c0000003c4c0fe10] [c00000000000c764] system_call_common+0xf4/0x258

Fixes: 587164cd ("powerpc/powernv: Add new opal message type")
Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/opal-prd.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index a191f4c60ce7..113bdb151f68 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -369,6 +369,12 @@ static struct notifier_block opal_prd_event_nb = {
 	.priority	= 0,
 };
 
+static struct notifier_block opal_prd_event_nb2 = {
+	.notifier_call	= opal_prd_msg_notifier,
+	.next		= NULL,
+	.priority	= 0,
+};
+
 static int opal_prd_probe(struct platform_device *pdev)
 {
 	int rc;
@@ -390,9 +396,10 @@ static int opal_prd_probe(struct platform_device *pdev)
 		return rc;
 	}
 
-	rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb);
+	rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb2);
 	if (rc) {
 		pr_err("Couldn't register PRD2 event notifier\n");
+		opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
 		return rc;
 	}
 
@@ -401,6 +408,8 @@ static int opal_prd_probe(struct platform_device *pdev)
 		pr_err("failed to register miscdev\n");
 		opal_message_notifier_unregister(OPAL_MSG_PRD,
 				&opal_prd_event_nb);
+		opal_message_notifier_unregister(OPAL_MSG_PRD2,
+				&opal_prd_event_nb2);
 		return rc;
 	}
 
@@ -411,6 +420,7 @@ static int opal_prd_remove(struct platform_device *pdev)
 {
 	misc_deregister(&opal_prd_dev);
 	opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
+	opal_message_notifier_unregister(OPAL_MSG_PRD2, &opal_prd_event_nb2);
 	return 0;
 }
 
-- 
2.31.1


^ permalink raw reply related

* Re: [PATCH] powerpc/powernv/prd: Unregister OPAL_MSG_PRD2 notifier during module unload
From: Vasant Hegde @ 2021-10-28 16:57 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev
In-Reply-To: <87r1d5mg1a.fsf@linkitivity.dja.id.au>

On 10/1/21 11:40 AM, Daniel Axtens wrote:
> Hi Vasant,
> 
>> Commit 587164cd, introduced new opal message type (OPAL_MSG_PRD2) and added
>> opal notifier. But I missed to unregister the notifier during module unload
>> path. This results in below call trace if you try to unload and load
>> opal_prd module.
>>
>> Fixes: 587164cd ("powerpc/powernv: Add new opal message type")
> 
> In reviewing this patch, I've become a bit worried the underlying patch
> has another issue that we should fix.

Thanks for the review. I have addressed below comments in v2.

-Vasant


^ permalink raw reply

* Re: [PATCH] powerpc/32e: Ignore ESR in instruction storage interrupt handler
From: Daniel Axtens @ 2021-10-28 22:13 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Nicholas Piggin, Jacques de Laval
In-Reply-To: <20211028133043.4159501-1-npiggin@gmail.com>

Hi Nick,

> A e5500 machine running a 32-bit kernel sometimes hangs at boot,
> seemingly going into an infinite loop of instruction storage interrupts.
> The ESR SPR has a value of 0x800000 (store) when this happens, which is
> likely set by a previous store. An instruction TLB miss interrupt would
> then leave ESR unchanged, and if no PTE exists it calls directly to the
> instruction storage interrupt handler without changing ESR.

I hadn't heard of the ESR before and your patch just uses the acronym:
apparently it is the Exception Syndrome Register. In ISA 2.07 it's in
Section 7.2.13 of Book III-E. (e5500 implements 2.06 but I assume it's
roughly the same there.)

The ISA says that ESR_ST is set for the following types of exception:
 - Alignment
 - Data Storage
 - Data TLB
 - LRAT Error

So it makes sense to assume that it was not set by the instruction
storage exception. (I see you have a discussion as to how that might
occur in
https://lore.kernel.org/linuxppc-dev/1635413197.83rhc4s3fc.astroid@bobo.none/#t
and you concluded that in the comment you add that we arrive here via
the TLB handler jumping to the ISI.)

> access_error() does not cause a segfault due to a store to a read-only
> vma because is_exec is true. Most subsequent fault handling does not
> check for a write fault on a read-only vma, and might do strange things
> like create a writeable PTE or call page_mkwrite on a read only vma or
> file. It's not clear what happens here to cause the infinite faulting in
> this case, a fault handler failure or low level PTE or TLB handling.

I'm just trying to unpick this:

 - INSTRUCTION_STORAGE_EXCEPTION ends up branching to do_page_fault ->
   __do_page_fault -> ___do_page_fault

 - ___do_page_fault has is_exec = true because the exception is a
   instruction storage interrupt

 - ___do_page_fault also decides that is_write = true because the
   ESR_DST bit is set.

This puts us in a bad position because we're taking some information
from the current exception and some information from a previous
exception, so it makes sense that things would go wrong from here!

> In any case this can be fixed by having the instruction storage
> interrupt zero regs->dsisr rather than storing the ESR value to it.

Is it valid to just ignore the contents of ESR for an Instruction
Storage exception?

The 2.07 ISA at least says that the following ESR bits can be set by an
Instruction Storage exception:
 - Byte Ordering
 - TLB Inelligible
 - Page Table

It sounds from

> + * In any case, do_page_fault for BOOK3E does not use ESR and always expects
> + * dsisr to be 0. ESR_DST from a prior store in particular would confuse fault
> + * handling.

that we don't actually ever check any of those three bits in the
do_page_fault path. reg_booke.h defines ESR_BO but the definition is
never used, and there is no ESR_TLBI or ESR_PT constant defined.

So much as it seems a bit dodgy to me, I guess it is right to say that
we're not changing behaviour by setting dsisr to 0 and just ignoring
those 3 bits? Should we document that in the comment?

I probably would have masked ESR_DST but I guess extra bit-twiddling in
an interrupt path when zeroing would do is probably not worth it for
theoretical correctness?

Kind regards,
Daniel

> Link: https://lore.kernel.org/linuxppc-dev/1635306738.0z8wt7619v.astroid@bobo.none/
> Fixes: a01a3f2ddbcd ("powerpc: remove arguments from fault handler functions")
> Reported-by: Jacques de Laval <jacques.delaval@protonmail.com>
> Tested-by: Jacques de Laval <jacques.delaval@protonmail.com>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/powerpc/kernel/head_booke.h | 15 ++++++++++++---
>  1 file changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
> index e5503420b6c6..ef8d1b1c234e 100644
> --- a/arch/powerpc/kernel/head_booke.h
> +++ b/arch/powerpc/kernel/head_booke.h
> @@ -465,12 +465,21 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
>  	bl	do_page_fault;						      \
>  	b	interrupt_return
>  
> +/*
> + * Instruction TLB Error interrupt handlers may call InstructionStorage
> + * directly without clearing ESR, so the ESR at this point may be left over
> + * from a prior interrupt.
> + *
> + * In any case, do_page_fault for BOOK3E does not use ESR and always expects
> + * dsisr to be 0. ESR_DST from a prior store in particular would confuse fault
> + * handling.
> + */
>  #define INSTRUCTION_STORAGE_EXCEPTION					      \
>  	START_EXCEPTION(InstructionStorage)				      \
> -	NORMAL_EXCEPTION_PROLOG(0x400, INST_STORAGE);		      \
> -	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
> +	NORMAL_EXCEPTION_PROLOG(0x400, INST_STORAGE);			      \
> +	li	r5,0;			/* Store 0 in regs->esr (dsisr) */    \
>  	stw	r5,_ESR(r11);						      \
> -	stw	r12, _DEAR(r11);	/* Pass SRR0 as arg2 */		      \
> +	stw	r12, _DEAR(r11);	/* Set regs->dear (dar) to SRR0 */    \
>  	prepare_transfer_to_handler;					      \
>  	bl	do_page_fault;						      \
>  	b	interrupt_return
> -- 
> 2.23.0

^ permalink raw reply

* Re: [PATCH v3] KVM: PPC: Tick accounting should defer vtime accounting 'til after IRQ handling
From: Nicholas Piggin @ 2021-10-29  0:35 UTC (permalink / raw)
  To: linuxppc-dev, Laurent Vivier; +Cc: stable
In-Reply-To: <3d621619-e6b2-9388-06dd-0ea4cc805ed7@redhat.com>

Excerpts from Laurent Vivier's message of October 28, 2021 10:48 pm:
> On 27/10/2021 16:21, Nicholas Piggin wrote:
>> From: Laurent Vivier <lvivier@redhat.com>
>> 
>> Commit 112665286d08 ("KVM: PPC: Book3S HV: Context tracking exit guest
>> context before enabling irqs") moved guest_exit() into the interrupt
>> protected area to avoid wrong context warning (or worse). The problem is
>> that tick-based time accounting has not yet been updated at this point
>> (because it depends on the timer interrupt firing), so the guest time
>> gets incorrectly accounted to system time.
>> 
>> To fix the problem, follow the x86 fix in commit 160457140187 ("Defer
>> vtime accounting 'til after IRQ handling"), and allow host IRQs to run
>> before accounting the guest exit time.
>> 
>> In the case vtime accounting is enabled, this is not required because TB
>> is used directly for accounting.
>> 
>> Before this patch, with CONFIG_TICK_CPU_ACCOUNTING=y in the host and a
>> guest running a kernel compile, the 'guest' fields of /proc/stat are
>> stuck at zero. With the patch they can be observed increasing roughly as
>> expected.
>> 
>> Fixes: e233d54d4d97 ("KVM: booke: use __kvm_guest_exit")
>> Fixes: 112665286d08 ("KVM: PPC: Book3S HV: Context tracking exit guest context before enabling irqs")
>> Cc: <stable@vger.kernel.org> # 5.12
>> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
>> [np: only required for tick accounting, add Book3E fix, tweak changelog]
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>> Since v2:
>> - I took over the patch with Laurent's blessing.
>> - Changed to avoid processing IRQs if we do have vtime accounting
>>    enabled.
>> - Changed so in either case the accounting is called with irqs disabled.
>> - Added similar Book3E fix.
>> - Rebased on upstream, tested, observed bug and confirmed fix.
>> 
>>   arch/powerpc/kvm/book3s_hv.c | 30 ++++++++++++++++++++++++++++--
>>   arch/powerpc/kvm/booke.c     | 16 +++++++++++++++-
>>   2 files changed, 43 insertions(+), 3 deletions(-)
>> 
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 2acb1c96cfaf..7b74fc0a986b 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -3726,7 +3726,20 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>>   
>>   	kvmppc_set_host_core(pcpu);
>>   
>> -	guest_exit_irqoff();
>> +	context_tracking_guest_exit();
>> +	if (!vtime_accounting_enabled_this_cpu()) {
>> +		local_irq_enable();
>> +		/*
>> +		 * Service IRQs here before vtime_account_guest_exit() so any
>> +		 * ticks that occurred while running the guest are accounted to
>> +		 * the guest. If vtime accounting is enabled, accounting uses
>> +		 * TB rather than ticks, so it can be done without enabling
>> +		 * interrupts here, which has the problem that it accounts
>> +		 * interrupt processing overhead to the host.
>> +		 */
>> +		local_irq_disable();
>> +	}
>> +	vtime_account_guest_exit();
>>   
>>   	local_irq_enable();
>>   
>> @@ -4510,7 +4523,20 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
>>   
>>   	kvmppc_set_host_core(pcpu);
>>   
>> -	guest_exit_irqoff();
>> +	context_tracking_guest_exit();
>> +	if (!vtime_accounting_enabled_this_cpu()) {
>> +		local_irq_enable();
>> +		/*
>> +		 * Service IRQs here before vtime_account_guest_exit() so any
>> +		 * ticks that occurred while running the guest are accounted to
>> +		 * the guest. If vtime accounting is enabled, accounting uses
>> +		 * TB rather than ticks, so it can be done without enabling
>> +		 * interrupts here, which has the problem that it accounts
>> +		 * interrupt processing overhead to the host.
>> +		 */
>> +		local_irq_disable();
>> +	}
>> +	vtime_account_guest_exit();
>>   
>>   	local_irq_enable();
>>   
>> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
>> index 977801c83aff..8c15c90dd3a9 100644
>> --- a/arch/powerpc/kvm/booke.c
>> +++ b/arch/powerpc/kvm/booke.c
>> @@ -1042,7 +1042,21 @@ int kvmppc_handle_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr)
>>   	}
>>   
>>   	trace_kvm_exit(exit_nr, vcpu);
>> -	guest_exit_irqoff();
>> +
>> +	context_tracking_guest_exit();
>> +	if (!vtime_accounting_enabled_this_cpu()) {
>> +		local_irq_enable();
>> +		/*
>> +		 * Service IRQs here before vtime_account_guest_exit() so any
>> +		 * ticks that occurred while running the guest are accounted to
>> +		 * the guest. If vtime accounting is enabled, accounting uses
>> +		 * TB rather than ticks, so it can be done without enabling
>> +		 * interrupts here, which has the problem that it accounts
>> +		 * interrupt processing overhead to the host.
>> +		 */
>> +		local_irq_disable();
>> +	}
>> +	vtime_account_guest_exit();
>>   
>>   	local_irq_enable();
>>   
>> 
> 
> I'm wondering if we should put the context_tracking_guest_exit() just after the 
> "srcu_read_unlock(&vc->kvm->srcu, srcu_idx);" as it was before 61bd0f66ff92 ("KVM: PPC: 
> Book3S HV: Fix guest time accounting with VIRT_CPU_ACCOUNTING_GEN")?

For the run_single_vcpu path, I _think_ it shouldn't matter. It's mostly 
just fixing up low level powerpc details.

But actually I wonder whether we should move the guest context entirely 
inside the SRCU lock because it's a high level host locking primitive.

For the kvmppc_run_core path, we are actually taking the vc->lock spin 
lock as well. Arguably it's waiting for secondary threads in the guest
but I think changing to host context as soon as possible could make
sense. If we don't have an actual bug identified it could wait for next
merge perhaps.

Thanks,
Nick
> 

^ permalink raw reply

* Re: Linux kernel: powerpc: KVM guest can trigger host crash on Power8
From: Nicholas Piggin @ 2021-10-29  0:41 UTC (permalink / raw)
  To: John Paul Adrian Glaubitz, Michael Ellerman
  Cc: debian-powerpc@lists.debian.org, linuxppc-dev
In-Reply-To: <9ed788c0-b99b-f327-0814-a2d92db6bd8b@physik.fu-berlin.de>

Excerpts from John Paul Adrian Glaubitz's message of October 29, 2021 12:05 am:
> Hi Michael!
> 
> On 10/28/21 13:20, John Paul Adrian Glaubitz wrote:
>> It seems I also can no longer reproduce the issue, even when building the most problematic
>> packages and I think we should consider it fixed for now. I will keep monitoring the server,
>> of course, and will let you know in case the problem shows again.
> 
> The host machine is stuck again but I'm not 100% sure what triggered the problem:
> 
> [194817.984249] watchdog: BUG: soft lockup - CPU#80 stuck for 246s! [CPU 2/KVM:1836]
> [194818.012248] watchdog: BUG: soft lockup - CPU#152 stuck for 246s! [CPU 3/KVM:1837]
> [194825.960164] watchdog: BUG: soft lockup - CPU#24 stuck for 246s! [khugepaged:318]
> [194841.983991] watchdog: BUG: soft lockup - CPU#80 stuck for 268s! [CPU 2/KVM:1836]
> [194842.011991] watchdog: BUG: soft lockup - CPU#152 stuck for 268s! [CPU 3/KVM:1837]
> [194849.959906] watchdog: BUG: soft lockup - CPU#24 stuck for 269s! [khugepaged:318]
> [194865.983733] watchdog: BUG: soft lockup - CPU#80 stuck for 291s! [CPU 2/KVM:1836]
> [194866.011733] watchdog: BUG: soft lockup - CPU#152 stuck for 291s! [CPU 3/KVM:1837]
> [194873.959648] watchdog: BUG: soft lockup - CPU#24 stuck for 291s! [khugepaged:318]
> [194889.983475] watchdog: BUG: soft lockup - CPU#80 stuck for 313s! [CPU 2/KVM:1836]
> [194890.011475] watchdog: BUG: soft lockup - CPU#152 stuck for 313s! [CPU 3/KVM:1837]
> [194897.959390] watchdog: BUG: soft lockup - CPU#24 stuck for 313s! [khugepaged:318]
> [194913.983218] watchdog: BUG: soft lockup - CPU#80 stuck for 335s! [CPU 2/KVM:1836]
> [194914.011217] watchdog: BUG: soft lockup - CPU#152 stuck for 335s! [CPU 3/KVM:1837]
> [194921.959133] watchdog: BUG: soft lockup - CPU#24 stuck for 336s! [khugepaged:318]

Soft lockup should mean it's taking timer interrupts still, just not 
scheduling. Do you have the hard lockup detector enabled as well? Is
there anything stuck spinning on another CPU?

Do you have the full dmesg / kernel log for this boot?

Could you try a sysrq+w to get a trace of blocked tasks?

Are you able to shut down the guests and exit qemu normally?

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH] powerpc/32e: Ignore ESR in instruction storage interrupt handler
From: Nicholas Piggin @ 2021-10-29  0:43 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev; +Cc: Jacques de Laval
In-Reply-To: <89fbff81-f70f-9e3e-eb5b-de7969b20638@csgroup.eu>

Excerpts from Christophe Leroy's message of October 28, 2021 11:52 pm:
> 
> 
> Le 28/10/2021 à 15:30, Nicholas Piggin a écrit :
>> A e5500 machine running a 32-bit kernel sometimes hangs at boot,
>> seemingly going into an infinite loop of instruction storage interrupts.
>> The ESR SPR has a value of 0x800000 (store) when this happens, which is
>> likely set by a previous store. An instruction TLB miss interrupt would
>> then leave ESR unchanged, and if no PTE exists it calls directly to the
>> instruction storage interrupt handler without changing ESR.
>> 
>> access_error() does not cause a segfault due to a store to a read-only
>> vma because is_exec is true. Most subsequent fault handling does not
>> check for a write fault on a read-only vma, and might do strange things
>> like create a writeable PTE or call page_mkwrite on a read only vma or
>> file. It's not clear what happens here to cause the infinite faulting in
>> this case, a fault handler failure or low level PTE or TLB handling.
>> 
>> In any case this can be fixed by having the instruction storage
>> interrupt zero regs->dsisr rather than storing the ESR value to it.
>> 
>> Link: https://lore.kernel.org/linuxppc-dev/1635306738.0z8wt7619v.astroid@bobo.none/
>> Fixes: a01a3f2ddbcd ("powerpc: remove arguments from fault handler functions")
> 
> Should it go to stable as well ?

Yeah, I'm used to Fixes: tags getting picked up automatically, are we 
not doing that anymore since someone flamed stable maintainers? :(

> 
>> Reported-by: Jacques de Laval <jacques.delaval@protonmail.com>
>> Tested-by: Jacques de Laval <jacques.delaval@protonmail.com>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> 
> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>

Thanks,
Nick


^ permalink raw reply

* Re: [PATCH] powerpc/32e: Ignore ESR in instruction storage interrupt handler
From: Nicholas Piggin @ 2021-10-29  0:50 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev; +Cc: Jacques de Laval
In-Reply-To: <87ee84g5mx.fsf@dja-thinkpad.axtens.net>

Excerpts from Daniel Axtens's message of October 29, 2021 8:13 am:
> Hi Nick,
> 
>> A e5500 machine running a 32-bit kernel sometimes hangs at boot,
>> seemingly going into an infinite loop of instruction storage interrupts.
>> The ESR SPR has a value of 0x800000 (store) when this happens, which is
>> likely set by a previous store. An instruction TLB miss interrupt would
>> then leave ESR unchanged, and if no PTE exists it calls directly to the
>> instruction storage interrupt handler without changing ESR.
> 
> I hadn't heard of the ESR before and your patch just uses the acronym:
> apparently it is the Exception Syndrome Register. In ISA 2.07 it's in
> Section 7.2.13 of Book III-E. (e5500 implements 2.06 but I assume it's
> roughly the same there.)
> 
> The ISA says that ESR_ST is set for the following types of exception:
>  - Alignment
>  - Data Storage
>  - Data TLB
>  - LRAT Error
> 
> So it makes sense to assume that it was not set by the instruction
> storage exception. (I see you have a discussion as to how that might
> occur in
> https://lore.kernel.org/linuxppc-dev/1635413197.83rhc4s3fc.astroid@bobo.none/#t
> and you concluded that in the comment you add that we arrive here via
> the TLB handler jumping to the ISI.)

I think that's most likely. I don't know book E arch very well, *maybe* 
you could have an ISI which does not change ESR without violating the
letter of the architecture, but not sure. Either way, this fix should 
take care of both.

> 
>> access_error() does not cause a segfault due to a store to a read-only
>> vma because is_exec is true. Most subsequent fault handling does not
>> check for a write fault on a read-only vma, and might do strange things
>> like create a writeable PTE or call page_mkwrite on a read only vma or
>> file. It's not clear what happens here to cause the infinite faulting in
>> this case, a fault handler failure or low level PTE or TLB handling.
> 
> I'm just trying to unpick this:
> 
>  - INSTRUCTION_STORAGE_EXCEPTION ends up branching to do_page_fault ->
>    __do_page_fault -> ___do_page_fault
> 
>  - ___do_page_fault has is_exec = true because the exception is a
>    instruction storage interrupt
> 
>  - ___do_page_fault also decides that is_write = true because the
>    ESR_DST bit is set.
> 
> This puts us in a bad position because we're taking some information
> from the current exception and some information from a previous
> exception, so it makes sense that things would go wrong from here!

Yeah, we miss the store into read-only vma check because is_exec which 
causes us to go into write path of the core handle_mm_fault. Although 
even if we caught it and segfaulted obviously that's still not the right 
thing to do.

> 
>> In any case this can be fixed by having the instruction storage
>> interrupt zero regs->dsisr rather than storing the ESR value to it.
> 
> Is it valid to just ignore the contents of ESR for an Instruction
> Storage exception?
> 
> The 2.07 ISA at least says that the following ESR bits can be set by an
> Instruction Storage exception:
>  - Byte Ordering
>  - TLB Inelligible
>  - Page Table
> 
> It sounds from
> 
>> + * In any case, do_page_fault for BOOK3E does not use ESR and always expects
>> + * dsisr to be 0. ESR_DST from a prior store in particular would confuse fault
>> + * handling.
> 
> that we don't actually ever check any of those three bits in the
> do_page_fault path. reg_booke.h defines ESR_BO but the definition is
> never used, and there is no ESR_TLBI or ESR_PT constant defined.

Yeah that's right. I don't quite know what we would do with the other
exception types. Possibly just check them in page_fault_is_bad and cause 
a sigbus. Maybe there is some low level fixup that would be possible.
But nothing is implemented.

> So much as it seems a bit dodgy to me, I guess it is right to say that
> we're not changing behaviour by setting dsisr to 0 and just ignoring
> those 3 bits? Should we document that in the comment?

The comment does say do_page_fault expects it to be 0. Did you have more 
in mind? I can tweak the comment.

> I probably would have masked ESR_DST but I guess extra bit-twiddling in
> an interrupt path when zeroing would do is probably not worth it for
> theoretical correctness?

Well the ESR is completely un-set so none of the bits make sense at this
point. Reading it is jsut overhead.

If we wanted to do it really "cleanly", the TLB error interrupt handler
might set ESR before calling here, or it might call a few instructions
later with the register already set to a sane ESR value. But that would
really only matter if do_page_fault started doing anything useful with
the other bits.

Thanks,
Nick

> 
> Kind regards,
> Daniel
> 
>> Link: https://lore.kernel.org/linuxppc-dev/1635306738.0z8wt7619v.astroid@bobo.none/
>> Fixes: a01a3f2ddbcd ("powerpc: remove arguments from fault handler functions")
>> Reported-by: Jacques de Laval <jacques.delaval@protonmail.com>
>> Tested-by: Jacques de Laval <jacques.delaval@protonmail.com>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/powerpc/kernel/head_booke.h | 15 ++++++++++++---
>>  1 file changed, 12 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
>> index e5503420b6c6..ef8d1b1c234e 100644
>> --- a/arch/powerpc/kernel/head_booke.h
>> +++ b/arch/powerpc/kernel/head_booke.h
>> @@ -465,12 +465,21 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
>>  	bl	do_page_fault;						      \
>>  	b	interrupt_return
>>  
>> +/*
>> + * Instruction TLB Error interrupt handlers may call InstructionStorage
>> + * directly without clearing ESR, so the ESR at this point may be left over
>> + * from a prior interrupt.
>> + *
>> + * In any case, do_page_fault for BOOK3E does not use ESR and always expects
>> + * dsisr to be 0. ESR_DST from a prior store in particular would confuse fault
>> + * handling.
>> + */
>>  #define INSTRUCTION_STORAGE_EXCEPTION					      \
>>  	START_EXCEPTION(InstructionStorage)				      \
>> -	NORMAL_EXCEPTION_PROLOG(0x400, INST_STORAGE);		      \
>> -	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
>> +	NORMAL_EXCEPTION_PROLOG(0x400, INST_STORAGE);			      \
>> +	li	r5,0;			/* Store 0 in regs->esr (dsisr) */    \
>>  	stw	r5,_ESR(r11);						      \
>> -	stw	r12, _DEAR(r11);	/* Pass SRR0 as arg2 */		      \
>> +	stw	r12, _DEAR(r11);	/* Set regs->dear (dar) to SRR0 */    \
>>  	prepare_transfer_to_handler;					      \
>>  	bl	do_page_fault;						      \
>>  	b	interrupt_return
>> -- 
>> 2.23.0
> 

^ permalink raw reply

* Re: [PATCH v2 03/45] notifier: Add atomic/blocking_notifier_has_unique_priority()
From: Dmitry Osipenko @ 2021-10-28 21:32 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Ulf Hansson, Rich Felker, linux-ia64, Tomer Maimon,
	Santosh Shilimkar, Rafael J . Wysocki, Boris Ostrovsky,
	Catalin Marinas, Linus Walleij, Dave Hansen, x86, Tali Perry,
	James E.J. Bottomley, Thierry Reding, Guo Ren, Pavel Machek,
	H. Peter Anvin, linux-riscv, Vincent Chen, Will Deacon,
	Greg Ungerer, Stefano Stabellini, Benjamin Fair, Yoshinori Sato,
	Krzysztof Kozlowski, linux-sh, Lee Jones, Helge Deller,
	Daniel Lezcano, Russell King, linux-csky, Jonathan Hunter,
	Tony Lindgren, Chen-Yu Tsai, Ingo Molnar, Geert Uytterhoeven,
	xen-devel, linux-mips, Guenter Roeck, Len Brown, Albert Ou,
	linux-pm, Jonathan Neuschäfer, Vladimir Zapolskiy,
	linux-acpi, linux-m68k, Mark Brown, Borislav Petkov, Greentime Hu,
	Paul Walmsley, linux-tegra, Thomas Gleixner, linux-omap,
	Nancy Yuen, linux-arm-kernel, Juergen Gross, Thomas Bogendoerfer,
	linux-parisc, Nick Hu, Avi Fishman, Patrick Venture,
	Liam Girdwood, linux-kernel, Palmer Dabbelt, Philipp Zabel,
	Paul Mackerras, Andrew Morton, linuxppc-dev, openbmc,
	Joshua Thompson
In-Reply-To: <YXqCz/utp2DFJJ45@smile.fi.intel.com>

28.10.2021 14:00, Andy Shevchenko пишет:
> On Thu, Oct 28, 2021 at 12:16:33AM +0300, Dmitry Osipenko wrote:
>> Add atomic/blocking_notifier_has_unique_priority() helpers which return
>> true if given handler has unique priority.
> 
> ...
> 
>> +/**
>> + *	atomic_notifier_has_unique_priority - Checks whether notifier's priority is unique
>> + *	@nh: Pointer to head of the atomic notifier chain
>> + *	@n: Entry in notifier chain to check
>> + *
>> + *	Checks whether there is another notifier in the chain with the same priority.
>> + *	Must be called in process context.
>> + *
>> + *	Returns true if priority is unique, false otherwise.
> 
> Why this indentation?

This is the same doc-comment style used by this file in general. I
haven't tried to invent anything new.


> ...
> 
>> +	/*
>> +	 * This code gets used during boot-up, when task switching is
>> +	 * not yet working and interrupts must remain disabled.  At
> 
> One space is enough.

This comment is replicated multiple times over this source file. You can
find it before each down_write(). I borrowed the text as-is, for
consistency.

^ permalink raw reply

* Re: [PATCH v2 08/45] kernel: Add combined power-off+restart handler call chain API
From: Dmitry Osipenko @ 2021-10-28 21:36 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Ulf Hansson, Rich Felker, linux-ia64, Tomer Maimon,
	Santosh Shilimkar, Linux-sh list, Boris Ostrovsky,
	Catalin Marinas, Linus Walleij, Dave Hansen,
	the arch/x86 maintainers, Tali Perry, James E.J. Bottomley,
	Thierry Reding, Guo Ren, Pavel Machek, H. Peter Anvin,
	linux-riscv, Vincent Chen, Will Deacon, Greg Ungerer,
	Stefano Stabellini, Benjamin Fair, Yoshinori Sato,
	Krzysztof Kozlowski, Lee Jones, Helge Deller, Daniel Lezcano,
	Russell King, linux-csky, Jonathan Hunter, Tony Lindgren,
	Chen-Yu Tsai, Ingo Molnar, Geert Uytterhoeven, xen-devel,
	linux-mips, Guenter Roeck, Len Brown, Albert Ou,
	Linux OMAP Mailing List, Jonathan Neuschäfer,
	Vladimir Zapolskiy, ACPI Devel Maling List, linux-m68k,
	Mark Brown, Borislav Petkov, Greentime Hu, Paul Walmsley,
	linux-tegra, Thomas Gleixner, Andy Shevchenko, Nancy Yuen,
	Linux ARM, Juergen Gross, Thomas Bogendoerfer, linux-parisc,
	Nick Hu, Avi Fishman, Patrick Venture, Linux PM, Liam Girdwood,
	Linux Kernel Mailing List, Palmer Dabbelt, Philipp Zabel,
	Paul Mackerras, Andrew Morton, linuxppc-dev, openbmc,
	Joshua Thompson
In-Reply-To: <CAJZ5v0jMdSjmkswzu18LSxcNk+k92Oz5XFFXmu-h=W8aPP4Oig@mail.gmail.com>

28.10.2021 12:53, Rafael J. Wysocki пишет:
>> +/**
>> + * struct power_handler - Machine power-off + restart handler
>> + *
>> + * Describes power-off and restart handlers which are invoked by kernel
>> + * to power off or restart this machine.  Supports prioritized chaining for
>> + * both restart and power-off handlers.  Callback's priority must be unique.
>> + * Intended to be used by device drivers that are responsible for restarting
>> + * and powering off hardware which kernel is running on.
>> + *
>> + * Struct power_handler can be static.  Members of this structure must not be
>> + * altered while handler is registered.
>> + *
>> + * Fill the structure members and pass it to register_power_handler().
>> + */
>> +struct power_handler {
> The name of this structure is too generic IMV.  There are many things
> that it might apply to in principle.
> 
> What about calling power_off_handler or sys_off_handler as it need not
> be about power at all?

I didn't like much the 'power' either, but couldn't come up with a
better variant. Will change it in v3, thank you.

^ permalink raw reply

* Re: [PATCH v2 08/45] kernel: Add combined power-off+restart handler call chain API
From: Dmitry Osipenko @ 2021-10-28 21:58 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Ulf Hansson, Rich Felker, Thomas Gleixner, Tomer Maimon,
	Santosh Shilimkar, Linux-sh list, Boris Ostrovsky,
	Catalin Marinas, Linus Walleij, Dave Hansen,
	the arch/x86 maintainers, Tali Perry, James E.J. Bottomley,
	Thierry Reding, Guo Ren, Pavel Machek, H. Peter Anvin, linux-ia64,
	linux-riscv, Vincent Chen, Will Deacon, Greg Ungerer,
	Stefano Stabellini, Benjamin Fair, Yoshinori Sato,
	Krzysztof Kozlowski, Helge Deller, Daniel Lezcano, Russell King,
	linux-csky, Jonathan Hunter, Tony Lindgren, Chen-Yu Tsai,
	Ingo Molnar, Geert Uytterhoeven, xen-devel, linux-mips,
	Guenter Roeck, Len Brown, Albert Ou, Linux OMAP Mailing List,
	Jonathan Neuschäfer, Vladimir Zapolskiy,
	ACPI Devel Maling List, linux-m68k, Mark Brown, Borislav Petkov,
	Greentime Hu, Paul Walmsley, linux-tegra, Lee Jones,
	Andy Shevchenko, Nancy Yuen, Linux ARM, Juergen Gross,
	Thomas Bogendoerfer, linux-parisc, Avi Fishman, Patrick Venture,
	Linux PM, Liam Girdwood, Linux Kernel Mailing List,
	Palmer Dabbelt, Philipp Zabel, Paul Mackerras, Andrew Morton,
	linuxppc-dev, openbmc, Joshua Thompson
In-Reply-To: <CAJZ5v0gpu2ezMhWr=grg6M8aWAx58DQozbXHoZaiPqUaZxJi4Q@mail.gmail.com>

28.10.2021 12:59, Rafael J. Wysocki пишет:
>> +#define RESTART_PRIO_RESERVED          0
>> +#define RESTART_PRIO_DEFAULT           128
>> +#define RESTART_PRIO_HIGH              192
>>
>>  enum reboot_mode {
>>         REBOOT_UNDEFINED = -1,
>> @@ -49,6 +55,167 @@ int register_restart_handler(struct notifier_block *);
>>  int unregister_restart_handler(struct notifier_block *);
>>  void do_kernel_restart(char *cmd);
>>
>> +/*
>> + * Unified poweroff + restart API.
>> + */
>> +
>> +#define POWEROFF_PRIO_RESERVED         0
>> +#define POWEROFF_PRIO_PLATFORM         1
>> +#define POWEROFF_PRIO_DEFAULT          128
>> +#define POWEROFF_PRIO_HIGH             192
>> +#define POWEROFF_PRIO_FIRMWARE         224
> Also I'm wondering why these particular numbers were chosen, here and above?

These values are chosen based on priorities that drivers already use. I looked thorough them all and ended with this scheme that fulfills the needs of the current API users.

I'll add these comments in v3:

/*
 * Standard restart priority levels. Intended to be set in the
 * sys_off_handler.restart_priority field.
 *
 * Use `RESTART_PRIO_XXX +- prio` style for additional levels.
 *
 * RESTART_PRIO_RESERVED:	Falls back to RESTART_PRIO_DEFAULT.
 *				Drivers may leave priority initialized
 *				to zero, to auto-set it to the default level.
 *
 * RESTART_PRIO_DEFAULT:	Use this for generic handler.
 *
 * RESTART_PRIO_HIGH:		Use this if you have multiple handlers and
 *				this handler has higher priority than the
 *				default handler.
 */

/*
 * Standard power-off priority levels. Intended to be set in the
 * sys_off_handler.power_off_priority field.
 *
 * Use `POWEROFF_PRIO_XXX +- prio` style for additional levels.
 *
 * POWEROFF_PRIO_RESERVED:	Falls back to POWEROFF_PRIO_DEFAULT.
 *				Drivers may leave priority initialized
 *				to zero, to auto-set it to the default level.
 *
 * POWEROFF_PRIO_PLATFORM:	Intended to be used by platform-level handler.
 *				Has lowest priority since device drivers are
 *				expected to take over platform handler which
 *				doesn't allow further callback chaining.
 *
 * POWEROFF_PRIO_DEFAULT:	Use this for generic handler.
 *
 * POWEROFF_PRIO_HIGH:		Use this if you have multiple handlers and
 *				this handler has higher priority than the
 *				default handler.
 *
 * POWEROFF_PRIO_FIRMWARE:	Use this if handler uses firmware call.
 *				Has highest priority since firmware is expected
 *				to know best how to power-off hardware properly.
 */

^ permalink raw reply

* Re: [PATCH v2 08/45] kernel: Add combined power-off+restart handler call chain API
From: Dmitry Osipenko @ 2021-10-28 21:59 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Ulf Hansson, Rich Felker, Thomas Gleixner, Tomer Maimon,
	Santosh Shilimkar, Linux-sh list, Boris Ostrovsky,
	Catalin Marinas, Linus Walleij, Dave Hansen,
	the arch/x86 maintainers, Tali Perry, James E.J. Bottomley,
	Thierry Reding, Guo Ren, Pavel Machek, H. Peter Anvin, linux-ia64,
	linux-riscv, Vincent Chen, Will Deacon, Greg Ungerer,
	Stefano Stabellini, Benjamin Fair, Yoshinori Sato,
	Krzysztof Kozlowski, Helge Deller, Daniel Lezcano, Russell King,
	linux-csky, Jonathan Hunter, Tony Lindgren, Chen-Yu Tsai,
	Ingo Molnar, Geert Uytterhoeven, xen-devel, linux-mips,
	Guenter Roeck, Len Brown, Albert Ou, Linux OMAP Mailing List,
	Jonathan Neuschäfer, Vladimir Zapolskiy,
	ACPI Devel Maling List, linux-m68k, Mark Brown, Borislav Petkov,
	Greentime Hu, Paul Walmsley, linux-tegra, Lee Jones,
	Andy Shevchenko, Nancy Yuen, Linux ARM, Juergen Gross,
	Thomas Bogendoerfer, linux-parisc, Avi Fishman, Patrick Venture,
	Linux PM, Liam Girdwood, Linux Kernel Mailing List,
	Palmer Dabbelt, Philipp Zabel, Paul Mackerras, Andrew Morton,
	linuxppc-dev, openbmc, Joshua Thompson
In-Reply-To: <CAJZ5v0gpu2ezMhWr=grg6M8aWAx58DQozbXHoZaiPqUaZxJi4Q@mail.gmail.com>

28.10.2021 12:59, Rafael J. Wysocki пишет:
>> +/**
>> + * struct power_handler - Machine power-off + restart handler
>> + *
>> + * Describes power-off and restart handlers which are invoked by kernel
>> + * to power off or restart this machine.  Supports prioritized chaining for
>> + * both restart and power-off handlers.  Callback's priority must be unique.
>> + * Intended to be used by device drivers that are responsible for restarting
>> + * and powering off hardware which kernel is running on.
>> + *
>> + * Struct power_handler can be static.  Members of this structure must not be
>> + * altered while handler is registered.
>> + *
>> + * Fill the structure members and pass it to register_power_handler().
>> + */
>> +struct power_handler {
>> +       /**
>> +        * @cb_data:
>> +        *
>> +        * User data included in callback's argument.
>> +        */
> And here I would document the structure fields in the main kerneldoc
> comment above.
> 
> As is, it is a bit hard to grasp the whole definition.
> 

I'll move the comments in v3, thanks.

^ permalink raw reply

* Re: [PATCH v2 00/45] Introduce power-off+restart call chain API
From: Dmitry Osipenko @ 2021-10-28 22:05 UTC (permalink / raw)
  To: Thierry Reding, Jonathan Hunter, Lee Jones, Rafael J . Wysocki,
	Mark Brown, Andrew Morton, Guenter Roeck, Russell King,
	Daniel Lezcano, Andy Shevchenko, Ulf Hansson
  Cc: Rich Felker, linux-ia64, Tomer Maimon, Santosh Shilimkar,
	linux-sh, Boris Ostrovsky, Linus Walleij, Dave Hansen, linux-acpi,
	Tali Perry, James E.J. Bottomley, Paul Mackerras, Pavel Machek,
	H. Peter Anvin, linux-riscv, Vincent Chen, Will Deacon,
	Greg Ungerer, Stefano Stabellini, Benjamin Fair, Yoshinori Sato,
	Krzysztof Kozlowski, Helge Deller, x86, linux-csky, Tony Lindgren,
	Chen-Yu Tsai, Ingo Molnar, Geert Uytterhoeven, Catalin Marinas,
	xen-devel, linux-mips, Len Brown, Albert Ou, linux-pm,
	Jonathan Neuschäfer, Vladimir Zapolskiy, linux-m68k,
	Borislav Petkov, Greentime Hu, Paul Walmsley, linux-tegra,
	Thomas Gleixner, linux-omap, Nancy Yuen, linux-arm-kernel,
	Juergen Gross, Thomas Bogendoerfer, linux-parisc, Nick Hu,
	Avi Fishman, Patrick Venture, Liam Girdwood, linux-kernel,
	Palmer Dabbelt, Philipp Zabel, Guo Ren, linuxppc-dev, openbmc,
	Joshua Thompson
In-Reply-To: <20211027211715.12671-1-digetx@gmail.com>

28.10.2021 00:16, Dmitry Osipenko пишет:
>   mfd: ab8500: Use devm_register_trivial_power_off_handler()
>   reset: ath79: Use devm_register_simple_restart_handler()
>   reset: intel-gw: Use devm_register_simple_restart_handler()
>   reset: lpc18xx: Use devm_register_prioritized_restart_handler()
>   reset: npcm: Use devm_register_prioritized_restart_handler()

These patches got lost because Gmail gave me ban after 40's email. I
think it doesn't worth to re-send them now since you should get an idea
about how API usage looks like without the lost patches.

^ permalink raw reply

* Re: [PATCH v2 03/45] notifier: Add atomic/blocking_notifier_has_unique_priority()
From: Dmitry Osipenko @ 2021-10-28 22:10 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Ulf Hansson, Rich Felker, linux-ia64, Tomer Maimon,
	Santosh Shilimkar, Rafael J . Wysocki, Boris Ostrovsky,
	Catalin Marinas, Linus Walleij, Dave Hansen, x86, Tali Perry,
	James E.J. Bottomley, Thierry Reding, Guo Ren, Pavel Machek,
	H. Peter Anvin, linux-riscv, Vincent Chen, Will Deacon,
	Greg Ungerer, Stefano Stabellini, Benjamin Fair, Yoshinori Sato,
	Krzysztof Kozlowski, linux-sh, Lee Jones, Helge Deller,
	Daniel Lezcano, Russell King, linux-csky, Jonathan Hunter,
	Tony Lindgren, Chen-Yu Tsai, Ingo Molnar, Geert Uytterhoeven,
	xen-devel, linux-mips, Guenter Roeck, Len Brown, Albert Ou,
	linux-pm, Jonathan Neuschäfer, Vladimir Zapolskiy,
	linux-acpi, linux-m68k, Mark Brown, Borislav Petkov, Greentime Hu,
	Paul Walmsley, linux-tegra, Thomas Gleixner, linux-omap,
	Nancy Yuen, linux-arm-kernel, Juergen Gross, Thomas Bogendoerfer,
	linux-parisc, Nick Hu, Avi Fishman, Patrick Venture,
	Liam Girdwood, linux-kernel, Palmer Dabbelt, Philipp Zabel,
	Paul Mackerras, Andrew Morton, linuxppc-dev, openbmc,
	Joshua Thompson
In-Reply-To: <YXqCz/utp2DFJJ45@smile.fi.intel.com>

28.10.2021 14:00, Andy Shevchenko пишет:
>> +	while ((*nl) != NULL && (*nl)->priority >= n->priority) {
> ' != NULL' is not needed.
> 

I'll change it in v3, thanks.

^ permalink raw reply

* Re: [PATCH v2 03/45] notifier: Add atomic/blocking_notifier_has_unique_priority()
From: Dmitry Osipenko @ 2021-10-28 23:28 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Ulf Hansson, Rich Felker, linux-ia64, Tomer Maimon,
	Santosh Shilimkar, Rafael J . Wysocki, Boris Ostrovsky,
	Catalin Marinas, Linus Walleij, Dave Hansen, x86, Tali Perry,
	James E.J. Bottomley, Thierry Reding, Guo Ren, Pavel Machek,
	H. Peter Anvin, linux-riscv, Vincent Chen, Will Deacon,
	Greg Ungerer, Stefano Stabellini, Benjamin Fair, Yoshinori Sato,
	Krzysztof Kozlowski, linux-sh, Lee Jones, Helge Deller,
	Daniel Lezcano, Russell King, linux-csky, Jonathan Hunter,
	Tony Lindgren, Chen-Yu Tsai, Ingo Molnar, Geert Uytterhoeven,
	xen-devel, linux-mips, Guenter Roeck, Len Brown, Albert Ou,
	linux-pm, Jonathan Neuschäfer, Vladimir Zapolskiy,
	linux-acpi, linux-m68k, Mark Brown, Borislav Petkov, Greentime Hu,
	Paul Walmsley, linux-tegra, Thomas Gleixner, linux-omap,
	Nancy Yuen, linux-arm-kernel, Juergen Gross, Thomas Bogendoerfer,
	linux-parisc, Nick Hu, Avi Fishman, Patrick Venture,
	Liam Girdwood, linux-kernel, Palmer Dabbelt, Philipp Zabel,
	Paul Mackerras, Andrew Morton, linuxppc-dev, openbmc,
	Joshua Thompson
In-Reply-To: <c5fb7590-03a7-0eea-4040-07472a5c9710@gmail.com>

29.10.2021 00:32, Dmitry Osipenko пишет:
>>> +	/*
>>> +	 * This code gets used during boot-up, when task switching is
>>> +	 * not yet working and interrupts must remain disabled.  At
>> One space is enough.
> This comment is replicated multiple times over this source file. You can
> find it before each down_write(). I borrowed the text as-is, for
> consistency.

Actually, it should be down_read() here since there are no writes. I'll
correct it in v3.

^ permalink raw reply

* Re: [PATCH] powerpc/bpf: fix write protecting JIT code
From: Michael Ellerman @ 2021-10-29  1:50 UTC (permalink / raw)
  To: Daniel Borkmann, Naveen N. Rao, ast, christophe.leroy,
	Hari Bathini, jniethe5
  Cc: songliubraving, netdev, john.fastabend, kpsingh, stable, andrii,
	paulus, yhs, bpf, linuxppc-dev, kafai
In-Reply-To: <c8d7390b-c07c-75cd-e9e9-4b8f0b786cc6@iogearbox.net>

Daniel Borkmann <daniel@iogearbox.net> writes:
> On 10/25/21 8:15 AM, Naveen N. Rao wrote:
>> Hari Bathini wrote:
>>> Running program with bpf-to-bpf function calls results in data access
>>> exception (0x300) with the below call trace:
>>>
>>>     [c000000000113f28] bpf_int_jit_compile+0x238/0x750 (unreliable)
>>>     [c00000000037d2f8] bpf_check+0x2008/0x2710
>>>     [c000000000360050] bpf_prog_load+0xb00/0x13a0
>>>     [c000000000361d94] __sys_bpf+0x6f4/0x27c0
>>>     [c000000000363f0c] sys_bpf+0x2c/0x40
>>>     [c000000000032434] system_call_exception+0x164/0x330
>>>     [c00000000000c1e8] system_call_vectored_common+0xe8/0x278
>>>
>>> as bpf_int_jit_compile() tries writing to write protected JIT code
>>> location during the extra pass.
>>>
>>> Fix it by holding off write protection of JIT code until the extra
>>> pass, where branch target addresses fixup happens.
>>>
>>> Cc: stable@vger.kernel.org
>>> Fixes: 62e3d4210ac9 ("powerpc/bpf: Write protect JIT code")
>>> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
>>> ---
>>>  arch/powerpc/net/bpf_jit_comp.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> Thanks for the fix!
>> 
>> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>
> LGTM, I presume this fix will be routed via Michael.

Thanks for reviewing, I've picked it up.

> BPF selftests have plenty of BPF-to-BPF calls in there, too bad this was
> caught so late. :/

Yeah :/

STRICT_KERNEL_RWX is not on by default in all our defconfigs, so that's
probably why no one caught it.

I used to run the BPF selftests but they stopped building for me a while
back, I'll see if I can get them going again.

cheers

^ permalink raw reply

* [V3] powerpc/perf: Enable PMU counters post partition migration if PMU is active
From: Athira Rajeev @ 2021-10-29  3:05 UTC (permalink / raw)
  To: mpe; +Cc: nathanl, maddy, rnsastry, kjain, npiggin, linuxppc-dev

During Live Partition Migration (LPM), it is observed that perf
counter values reports zero post migration completion. However
'perf stat' with workload continues to show counts post migration
since PMU gets disabled/enabled during sched switches. But incase
of system/cpu wide monitoring, zero counts were reported with 'perf
stat' after migration completion.

Example:
 ./perf stat -e r1001e -I 1000
           time             counts unit events
     1.001010437         22,137,414      r1001e
     2.002495447         15,455,821      r1001e
<<>> As seen in next below logs, the counter values shows zero
        after migration is completed.
<<>>
    86.142535370    129,392,333,440      r1001e
    87.144714617                  0      r1001e
    88.146526636                  0      r1001e
    89.148085029                  0      r1001e

Here PMU is enabled during start of perf session and counter
values are read at intervals. Counters are only disabled at the
end of session. The powerpc mobility code presently does not handle
disabling and enabling back of PMU counters during partition
migration. Also since the PMU register values are not saved/restored
during migration, PMU registers like Monitor Mode Control Register 0
(MMCR0), Monitor Mode Control Register 1 (MMCR1) will not contain
the value it was programmed with. Hence PMU counters will not be
enabled correctly post migration.

Fix this in mobility code by handling disabling and enabling of
PMU in all cpu's before and after migration. Patch introduces two
functions 'mobility_pmu_disable' and 'mobility_pmu_enable'.
mobility_pmu_disable() is called before the processor threads goes
to suspend state so as to disable the PMU counters. And disable is
done only if there are any active events running on that cpu.
mobility_pmu_enable() is called after the migrate is done to enable
back the PMU counters.

Since the performance Monitor counters ( PMCs) are not
saved/restored during LPM, results in PMC value being zero and the
'event->hw.prev_count' being non-zero value. This causes problem
during updation of event->count since we always accumulate
(event->hw.prev_count - PMC value) in event->count.  If
event->hw.prev_count is greater PMC value, event->count becomes
negative. To fix this, 'prev_count' also needs to be re-initialised
for all events while enabling back the events. Hence read the
existing events and clear the PMC index (stored in event->hw.idx)
for all events im mobility_pmu_disable. By this way, event count
settings will get re-initialised correctly in power_pmu_enable.

Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
[ Fixed compilation error reported by kernel test robot ]
Reported-by: kernel test robot <lkp@intel.com>
---
Changelog:
Change from v2 -> v3:
Addressed review comments from Nicholas Piggin.
 - Removed the "migrate" field which was added in initial
   patch to address updation of event count settings correctly
   in power_pmu_enable. Instead read off existing events in
   mobility_pmu_disable before power_pmu_enable.
 - Moved the mobility_pmu_disable/enable declaration from
   rtas.h to perf event header file.

Addressed review comments from Nathan.
 - Moved the mobility function calls from stop_machine
   context out to pseries_migrate_partition. Also now this
   is a per cpu invocation.

Change from v1 -> v2:
 - Moved the mobility_pmu_enable and mobility_pmu_disable
   declarations under CONFIG_PPC_PERF_CTRS in rtas.h.
   Also included 'asm/rtas.h' in core-book3s to fix the
   compilation warning reported by kernel test robot.

 arch/powerpc/include/asm/perf_event.h     |  8 +++++
 arch/powerpc/perf/core-book3s.c           | 39 +++++++++++++++++++++++
 arch/powerpc/platforms/pseries/mobility.c |  7 ++++
 3 files changed, 54 insertions(+)

diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
index 164e910bf654..88aab6cf840c 100644
--- a/arch/powerpc/include/asm/perf_event.h
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -17,6 +17,14 @@ static inline bool is_sier_available(void) { return false; }
 static inline unsigned long get_pmcs_ext_regs(int idx) { return 0; }
 #endif
 
+#ifdef CONFIG_PPC_PERF_CTRS
+void mobility_pmu_disable(void *unused);
+void mobility_pmu_enable(void *unused);
+#else
+static inline void mobility_pmu_disable(void *unused) { }
+static inline void mobility_pmu_enable(void *unused) { }
+#endif
+
 #ifdef CONFIG_FSL_EMB_PERF_EVENT
 #include <asm/perf_event_fsl_emb.h>
 #endif
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 73e62e9b179b..2e8c4c668fa3 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1343,6 +1343,33 @@ static void power_pmu_disable(struct pmu *pmu)
 	local_irq_restore(flags);
 }
 
+/*
+ * Called from pseries_migrate_partition() function
+ * before migration, from powerpc/mobility code.
+ */
+void mobility_pmu_disable(void *unused)
+{
+	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+	struct perf_event *event;
+
+	if (cpuhw->n_events != 0) {
+		int i;
+
+		power_pmu_disable(NULL);
+		/*
+		 * Read off any pre-existing events because the register
+		 * values may not be migrated.
+		 */
+		for (i = 0; i < cpuhw->n_events; ++i) {
+			event = cpuhw->event[i];
+			if (event->hw.idx) {
+				power_pmu_read(event);
+				event->hw.idx = 0;
+			}
+		}
+	}
+}
+
 /*
  * Re-enable all events if disable == 0.
  * If we were previously disabled and events were added, then
@@ -1515,6 +1542,18 @@ static void power_pmu_enable(struct pmu *pmu)
 	local_irq_restore(flags);
 }
 
+/*
+ * Called from pseries_migrate_partition() function
+ * after migration, from powerpc/mobility code.
+ */
+void mobility_pmu_enable(void *unused)
+{
+	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
+
+	cpuhw->n_added = cpuhw->n_events;
+	power_pmu_enable(NULL);
+}
+
 static int collect_events(struct perf_event *group, int max_count,
 			  struct perf_event *ctrs[], u64 *events,
 			  unsigned int *flags)
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index e83e0891272d..3e96485ccba4 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -22,6 +22,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/stringify.h>
+#include <linux/perf_event.h>
 
 #include <asm/machdep.h>
 #include <asm/rtas.h>
@@ -631,12 +632,18 @@ static int pseries_migrate_partition(u64 handle)
 	if (ret)
 		return ret;
 
+	/* Disable PMU before suspend */
+	on_each_cpu(&mobility_pmu_disable, NULL, 0);
+
 	ret = pseries_suspend(handle);
 	if (ret == 0)
 		post_mobility_fixup();
 	else
 		pseries_cancel_migration(handle, ret);
 
+	/* Enable PMU after resuming */
+	on_each_cpu(&mobility_pmu_enable, NULL, 0);
+
 	return ret;
 }
 
-- 
2.33.0


^ permalink raw reply related

* Re: [PATCH] powerpc: Enhance pmem DMA bypass handling
From: Alexey Kardashevskiy @ 2021-10-29  5:57 UTC (permalink / raw)
  To: Brian King, linuxppc-dev
In-Reply-To: <2075ad89-09d5-6906-f8fe-83a2347bbe9d@linux.vnet.ibm.com>



On 28/10/2021 08:30, Brian King wrote:
> On 10/26/21 12:39 AM, Alexey Kardashevskiy wrote:
>>
>>
>> On 10/26/21 01:40, Brian King wrote:
>>> On 10/23/21 7:18 AM, Alexey Kardashevskiy wrote:
>>>>
>>>>
>>>> On 23/10/2021 07:18, Brian King wrote:
>>>>> On 10/22/21 7:24 AM, Alexey Kardashevskiy wrote:
>>>>>>
>>>>>>
>>>>>> On 22/10/2021 04:44, Brian King wrote:
>>>>>>> If ibm,pmemory is installed in the system, it can appear anywhere
>>>>>>> in the address space. This patch enhances how we handle DMA for devices when
>>>>>>> ibm,pmemory is present. In the case where we have enough DMA space to
>>>>>>> direct map all of RAM, but not ibm,pmemory, we use direct DMA for
>>>>>>> I/O to RAM and use the default window to dynamically map ibm,pmemory.
>>>>>>> In the case where we only have a single DMA window, this won't work, > so if the window is not big enough to map the entire address range,
>>>>>>> we cannot direct map.
>>>>>>
>>>>>> but we want the pmem range to be mapped into the huge DMA window too if we can, why skip it?
>>>>>
>>>>> This patch should simply do what the comment in this commit mentioned below suggests, which says that
>>>>> ibm,pmemory can appear anywhere in the address space. If the DMA window is large enough
>>>>> to map all of MAX_PHYSMEM_BITS, we will indeed simply do direct DMA for everything,
>>>>> including the pmem. If we do not have a big enough window to do that, we will do
>>>>> direct DMA for DRAM and dynamic mapping for pmem.
>>>>
>>>>
>>>> Right, and this is what we do already, do not we? I missing something here.
>>>
>>> The upstream code does not work correctly that I can see. If I boot an upstream kernel
>>> with an nvme device and vpmem assigned to the LPAR, and enable dev_dbg in arch/powerpc/platforms/pseries/iommu.c,
>>> I see the following in the logs:
>>>
>>> [    2.157549] nvme 0121:50:00.0: ibm,query-pe-dma-windows(53) 500000 8000000 20000121 returned 0
>>> [    2.157561] nvme 0121:50:00.0: Skipping ibm,pmemory
>>> [    2.157567] nvme 0121:50:00.0: can't map partition max 0x8000000000000 with 16777216 65536-sized pages
>>> [    2.170150] nvme 0121:50:00.0: ibm,create-pe-dma-window(54) 500000 8000000 20000121 10 28 returned 0 (liobn = 0x70000121 starting addr = 8000000 0)
>>> [    2.170170] nvme 0121:50:00.0: created tce table LIOBN 0x70000121 for /pci@800000020000121/pci1014,683@0
>>> [    2.356260] nvme 0121:50:00.0: node is /pci@800000020000121/pci1014,683@0
>>>
>>> This means we are heading down the leg in enable_ddw where we do not set direct_mapping to true. We use
>>> create the DDW window, but don't do any direct DMA. This is because the window is not large enough to
>>> map 2PB of memory, which is what ddw_memory_hotplug_max returns without my patch.
>>>
>>> With my patch applied, I get this in the logs:
>>>
>>> [    2.204866] nvme 0121:50:00.0: ibm,query-pe-dma-windows(53) 500000 8000000 20000121 returned 0
>>> [    2.204875] nvme 0121:50:00.0: Skipping ibm,pmemory
>>> [    2.205058] nvme 0121:50:00.0: ibm,create-pe-dma-window(54) 500000 8000000 20000121 10 21 returned 0 (liobn = 0x70000121 starting addr = 8000000 0)
>>> [    2.205068] nvme 0121:50:00.0: created tce table LIOBN 0x70000121 for /pci@800000020000121/pci1014,683@0
>>> [    2.215898] nvme 0121:50:00.0: iommu: 64-bit OK but direct DMA is limited by 800000200000000
>>>
>>
>>
>> ah I see. then...
>>
>>
>>>
>>> Thanks,
>>>
>>> Brian
>>>
>>>
>>>>
>>>>>
>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/powerpc/platforms/pseries/iommu.c?id=bf6e2d562bbc4d115cf322b0bca57fe5bbd26f48
>>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Brian
>>>>>
>>>>>
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
>>>>>>> ---
>>>>>>>     arch/powerpc/platforms/pseries/iommu.c | 19 ++++++++++---------
>>>>>>>     1 file changed, 10 insertions(+), 9 deletions(-)
>>>>>>>
>>>>>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>>>>>>> index 269f61d519c2..d9ae985d10a4 100644
>>>>>>> --- a/arch/powerpc/platforms/pseries/iommu.c
>>>>>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>>>>>>> @@ -1092,15 +1092,6 @@ static phys_addr_t ddw_memory_hotplug_max(void)
>>>>>>>         phys_addr_t max_addr = memory_hotplug_max();
>>>>>>>         struct device_node *memory;
>>>>>>>     -    /*
>>>>>>> -     * The "ibm,pmemory" can appear anywhere in the address space.
>>>>>>> -     * Assuming it is still backed by page structs, set the upper limit
>>>>>>> -     * for the huge DMA window as MAX_PHYSMEM_BITS.
>>>>>>> -     */
>>>>>>> -    if (of_find_node_by_type(NULL, "ibm,pmemory"))
>>>>>>> -        return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
>>>>>>> -            (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS);
>>>>>>> -
>>>>>>>         for_each_node_by_type(memory, "memory") {
>>>>>>>             unsigned long start, size;
>>>>>>>             int n_mem_addr_cells, n_mem_size_cells, len;
>>>>>>> @@ -1341,6 +1332,16 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>>>>>>>          */
>>>>>>>         len = max_ram_len;
>>>>>>>         if (pmem_present) {
>>>>>>> +        if (default_win_removed) {
>>>>>>> +            /*
>>>>>>> +             * If we only have one DMA window and have pmem present,
>>>>>>> +             * then we need to be able to map the entire address
>>>>>>> +             * range in order to be able to do direct DMA to RAM.
>>>>>>> +             */
>>>>>>> +            len = order_base_2((sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ?
>>>>>>> +                    (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS));


Sorry I am still not following. If pmem is present, this new chunk will 
extend @len to MAX_PHYSMEM_BITS but the same will do the code right 
after this new chunk. How does removing the default window alters any of 
this?

If the only window is removed and we can only have one window which does 
not cover MAX_PHYSMEM_BITS, then we do not try 1:1 at all. Reverting
54fc3c681ded9437e4548e250 (which added pmem to phys_addr_t 
ddw_memory_hotplug_max(void) should be enough, no?


>>
>>
>> ... len = (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ? 31 :
>> MAX_PHYSMEM_BITS  ?
>>
>> Or actually simply drop this hunk and only leave the first one and add
>> this instead:
>>
>>
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c
>> b/arch/powerpc/platforms/pseries/iommu.c
>> index 591ec9e94edb..68bfcd2227d9 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -1518,7 +1518,7 @@ static bool enable_ddw(struct pci_dev *dev, struct
>> device_node *pdn)
>>           * as RAM, then we failed to create a window to cover persistent
>>           * memory and need to set the DMA limit.
>>           */
>> -       if (pmem_present && ddw_enabled && direct_mapping && len ==
>> max_ram_len)
>> +       if (pmem_present && ddw_enabled && direct_mapping)
>>
>> ?
> 
> 
> So, this would change the handling of devices that have a single window when pmem
> is present. 

Yeah, that was not right, never mind.

> With your proposed change, we would then direct map for DRAM
> and attempt to use whatever TCE space is left to do the dynamic mapping
> when DMA'ing to the pmem, all from a single window. We don't account for this
> in the code from what I can see, so we could get into the scenario where we have
> a DMA window just large enough to map all of DRAM, we direct map that, and then
> have nothing left over for the pmem.
> 
> I would actually like to get this working, as it would be helpful for the performance
> of SR-IOV devices when pmem is present. However, I think we'd need to ensure we
> have at least a certain amount of reserved DMA space for the dynamic mapping
> before we do. There might be other things to consider as well...
> 
> Should we handle that as a further enhancement in a future patch, and move forward with this
> as a bug fix?

I am still struggling to see what the second hunk fixes exactly. Thanks,


> 
> Thanks,
> 
> Brian
> 
>>
>> Thanks,
>>
>>
>>
>>>>>>> +        }
>>>>>>> +
>>>>>>>             if (query.largest_available_block >=
>>>>>>>                 (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
>>>>>>>                 len = MAX_PHYSMEM_BITS;
>>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>
>>>
>>>
>>
> 
> 

-- 
Alexey

^ permalink raw reply

* Re: [V3] powerpc/perf: Enable PMU counters post partition migration if PMU is active
From: Nicholas Piggin @ 2021-10-29  6:16 UTC (permalink / raw)
  To: Athira Rajeev, mpe; +Cc: nathanl, kjain, maddy, linuxppc-dev, rnsastry
In-Reply-To: <20211029030510.58797-1-atrajeev@linux.vnet.ibm.com>

Excerpts from Athira Rajeev's message of October 29, 2021 1:05 pm:
> During Live Partition Migration (LPM), it is observed that perf
> counter values reports zero post migration completion. However
> 'perf stat' with workload continues to show counts post migration
> since PMU gets disabled/enabled during sched switches. But incase
> of system/cpu wide monitoring, zero counts were reported with 'perf
> stat' after migration completion.
> 
> Example:
>  ./perf stat -e r1001e -I 1000
>            time             counts unit events
>      1.001010437         22,137,414      r1001e
>      2.002495447         15,455,821      r1001e
> <<>> As seen in next below logs, the counter values shows zero
>         after migration is completed.
> <<>>
>     86.142535370    129,392,333,440      r1001e
>     87.144714617                  0      r1001e
>     88.146526636                  0      r1001e
>     89.148085029                  0      r1001e

This is the output without the patch? After the patch it keeps counting 
I suppose? And does the very large count go away too?

> 
> Here PMU is enabled during start of perf session and counter
> values are read at intervals. Counters are only disabled at the
> end of session. The powerpc mobility code presently does not handle
> disabling and enabling back of PMU counters during partition
> migration. Also since the PMU register values are not saved/restored
> during migration, PMU registers like Monitor Mode Control Register 0
> (MMCR0), Monitor Mode Control Register 1 (MMCR1) will not contain
> the value it was programmed with. Hence PMU counters will not be
> enabled correctly post migration.
> 
> Fix this in mobility code by handling disabling and enabling of
> PMU in all cpu's before and after migration. Patch introduces two
> functions 'mobility_pmu_disable' and 'mobility_pmu_enable'.
> mobility_pmu_disable() is called before the processor threads goes
> to suspend state so as to disable the PMU counters. And disable is
> done only if there are any active events running on that cpu.
> mobility_pmu_enable() is called after the migrate is done to enable
> back the PMU counters.
> 
> Since the performance Monitor counters ( PMCs) are not
> saved/restored during LPM, results in PMC value being zero and the
> 'event->hw.prev_count' being non-zero value. This causes problem
> during updation of event->count since we always accumulate
> (event->hw.prev_count - PMC value) in event->count.  If
> event->hw.prev_count is greater PMC value, event->count becomes
> negative. To fix this, 'prev_count' also needs to be re-initialised
> for all events while enabling back the events. Hence read the
> existing events and clear the PMC index (stored in event->hw.idx)
> for all events im mobility_pmu_disable. By this way, event count
> settings will get re-initialised correctly in power_pmu_enable.
> 
> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
> [ Fixed compilation error reported by kernel test robot ]
> Reported-by: kernel test robot <lkp@intel.com>
> ---
> Changelog:
> Change from v2 -> v3:
> Addressed review comments from Nicholas Piggin.
>  - Removed the "migrate" field which was added in initial
>    patch to address updation of event count settings correctly
>    in power_pmu_enable. Instead read off existing events in
>    mobility_pmu_disable before power_pmu_enable.
>  - Moved the mobility_pmu_disable/enable declaration from
>    rtas.h to perf event header file.
> 
> Addressed review comments from Nathan.
>  - Moved the mobility function calls from stop_machine
>    context out to pseries_migrate_partition. Also now this
>    is a per cpu invocation.
> 
> Change from v1 -> v2:
>  - Moved the mobility_pmu_enable and mobility_pmu_disable
>    declarations under CONFIG_PPC_PERF_CTRS in rtas.h.
>    Also included 'asm/rtas.h' in core-book3s to fix the
>    compilation warning reported by kernel test robot.
> 
>  arch/powerpc/include/asm/perf_event.h     |  8 +++++
>  arch/powerpc/perf/core-book3s.c           | 39 +++++++++++++++++++++++
>  arch/powerpc/platforms/pseries/mobility.c |  7 ++++
>  3 files changed, 54 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
> index 164e910bf654..88aab6cf840c 100644
> --- a/arch/powerpc/include/asm/perf_event.h
> +++ b/arch/powerpc/include/asm/perf_event.h
> @@ -17,6 +17,14 @@ static inline bool is_sier_available(void) { return false; }
>  static inline unsigned long get_pmcs_ext_regs(int idx) { return 0; }
>  #endif
>  
> +#ifdef CONFIG_PPC_PERF_CTRS
> +void mobility_pmu_disable(void *unused);
> +void mobility_pmu_enable(void *unused);
> +#else
> +static inline void mobility_pmu_disable(void *unused) { }
> +static inline void mobility_pmu_enable(void *unused) { }
> +#endif
> +
>  #ifdef CONFIG_FSL_EMB_PERF_EVENT
>  #include <asm/perf_event_fsl_emb.h>
>  #endif
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
> index 73e62e9b179b..2e8c4c668fa3 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -1343,6 +1343,33 @@ static void power_pmu_disable(struct pmu *pmu)
>  	local_irq_restore(flags);
>  }
>  
> +/*
> + * Called from pseries_migrate_partition() function
> + * before migration, from powerpc/mobility code.
> + */
> +void mobility_pmu_disable(void *unused)
> +{
> +	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
> +	struct perf_event *event;
> +
> +	if (cpuhw->n_events != 0) {
> +		int i;
> +
> +		power_pmu_disable(NULL);
> +		/*
> +		 * Read off any pre-existing events because the register
> +		 * values may not be migrated.
> +		 */
> +		for (i = 0; i < cpuhw->n_events; ++i) {
> +			event = cpuhw->event[i];
> +			if (event->hw.idx) {
> +				power_pmu_read(event);
> +				event->hw.idx = 0;
> +			}
> +		}
> +	}
> +}
> +
>  /*
>   * Re-enable all events if disable == 0.
>   * If we were previously disabled and events were added, then
> @@ -1515,6 +1542,18 @@ static void power_pmu_enable(struct pmu *pmu)
>  	local_irq_restore(flags);
>  }
>  
> +/*
> + * Called from pseries_migrate_partition() function
> + * after migration, from powerpc/mobility code.
> + */
> +void mobility_pmu_enable(void *unused)
> +{
> +	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
> +
> +	cpuhw->n_added = cpuhw->n_events;
> +	power_pmu_enable(NULL);
> +}
> +
>  static int collect_events(struct perf_event *group, int max_count,
>  			  struct perf_event *ctrs[], u64 *events,
>  			  unsigned int *flags)
> diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
> index e83e0891272d..3e96485ccba4 100644
> --- a/arch/powerpc/platforms/pseries/mobility.c
> +++ b/arch/powerpc/platforms/pseries/mobility.c
> @@ -22,6 +22,7 @@
>  #include <linux/delay.h>
>  #include <linux/slab.h>
>  #include <linux/stringify.h>
> +#include <linux/perf_event.h>
>  
>  #include <asm/machdep.h>
>  #include <asm/rtas.h>
> @@ -631,12 +632,18 @@ static int pseries_migrate_partition(u64 handle)
>  	if (ret)
>  		return ret;
>  
> +	/* Disable PMU before suspend */
> +	on_each_cpu(&mobility_pmu_disable, NULL, 0);

Why was this moved out of stop machine and to an IPI?

My concern would be, what are the other CPUs doing at this time? Is it 
possible they could take interrupts and schedule? Could that mess up the
perf state here?

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH] mm/migrate.c: Remove MIGRATE_PFN_LOCKED
From: Alistair Popple @ 2021-10-29  6:38 UTC (permalink / raw)
  To: akpm, Felix Kuehling
  Cc: rcampbell, amd-gfx, nouveau, linux-kernel, dri-devel, linux-mm,
	jglisse, kvm-ppc, ziy, jhubbard, alexander.deucher, linuxppc-dev,
	hch, bskeggs
In-Reply-To: <130148f1-2676-16e2-b2a5-bf21f2a5481a@amd.com>

On Friday, 29 October 2021 2:33:31 AM AEDT Felix Kuehling wrote:
> Am 2021-10-27 um 9:42 p.m. schrieb Alistair Popple:
> > On Wednesday, 27 October 2021 3:09:57 AM AEDT Felix Kuehling wrote:
> >> Am 2021-10-25 um 12:16 a.m. schrieb Alistair Popple:
> >>> MIGRATE_PFN_LOCKED is used to indicate to migrate_vma_prepare() that a
> >>> source page was already locked during migrate_vma_collect(). If it
> >>> wasn't then the a second attempt is made to lock the page. However if
> >>> the first attempt failed it's unlikely a second attempt will succeed,
> >>> and the retry adds complexity. So clean this up by removing the retry
> >>> and MIGRATE_PFN_LOCKED flag.
> >>>
> >>> Destination pages are also meant to have the MIGRATE_PFN_LOCKED flag
> >>> set, but nothing actually checks that.
> >>>
> >>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> >> It makes sense to me. Do you have any empirical data on how much more
> >> likely migrations are going to fail with this change due to contested
> >> page locks?
> > Thanks Felix. I do not have any empirical data on this but I've mostly seen
> > migrations fail due to the reference count check failing rather than failure to
> > lock the page. Even then it's mostly been due to thrashing on the same page, so
> > I would be surprised if this change made any noticeable difference.
> 
> We have seen more page locking contention on NUMA systems that disappear
> when we disable NUMA balancing. Probably NUMA balancing migrations
> result in the page lock being more contended, which can cause HMM
> migration of some pages to fail.

Yeah, we've found NUMA balancing in general is pretty unhelpful for HMM based
migrations and mappings so have been looking into ways of disabling it for HMM
ranges.

> Also, for migrations to system memory, multiple threads page faulting
> concurrently can cause contention. I was just helping debug such an
> issue. Having migrations to system memory be only partially successful
> is problematic. We'll probably have to implement some retry logic in the
> driver to handle this.

Sounds similar to the problem I was referring to, except in my case I was
seeing contention on the page reference checks due to lots of threads hitting
__migration_entry_wait() at just the wrong time. I am working on a fix for that
that avoids taking the reference at all, however I think retry logic will still
be needed and suspect a driver is probably the best place to implement that.

> Regards,
>   Felix
> 
> 
> >
> >> Either way, the patch is
> >>
> >> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
> >>
> >>
> >>> ---
> >>>  Documentation/vm/hmm.rst                 |   2 +-
> >>>  arch/powerpc/kvm/book3s_hv_uvmem.c       |   4 +-
> >>>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |   2 -
> >>>  drivers/gpu/drm/nouveau/nouveau_dmem.c   |   4 +-
> >>>  include/linux/migrate.h                  |   1 -
> >>>  lib/test_hmm.c                           |   5 +-
> >>>  mm/migrate.c                             | 145 +++++------------------
> >>>  7 files changed, 35 insertions(+), 128 deletions(-)
> >>>
> >>> diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
> >>> index a14c2938e7af..f2a59ed82ed3 100644
> >>> --- a/Documentation/vm/hmm.rst
> >>> +++ b/Documentation/vm/hmm.rst
> >>> @@ -360,7 +360,7 @@ between device driver specific code and shared common code:
> >>>     system memory page, locks the page with ``lock_page()``, and fills in the
> >>>     ``dst`` array entry with::
> >>>  
> >>> -     dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> >>> +     dst[i] = migrate_pfn(page_to_pfn(dpage));
> >>>  
> >>>     Now that the driver knows that this page is being migrated, it can
> >>>     invalidate device private MMU mappings and copy device private memory
> >>> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> >>> index a7061ee3b157..28c436df9935 100644
> >>> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> >>> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> >>> @@ -560,7 +560,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma,
> >>>  				  gpa, 0, page_shift);
> >>>  
> >>>  	if (ret == U_SUCCESS)
> >>> -		*mig.dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> >>> +		*mig.dst = migrate_pfn(pfn);
> >>>  	else {
> >>>  		unlock_page(dpage);
> >>>  		__free_page(dpage);
> >>> @@ -774,7 +774,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
> >>>  		}
> >>>  	}
> >>>  
> >>> -	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> >>> +	*mig.dst = migrate_pfn(page_to_pfn(dpage));
> >>>  	migrate_vma_pages(&mig);
> >>>  out_finalize:
> >>>  	migrate_vma_finalize(&mig);
> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> >>> index 4a16e3c257b9..41d9417f182b 100644
> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> >>> @@ -300,7 +300,6 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
> >>>  			migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
> >>>  			svm_migrate_get_vram_page(prange, migrate->dst[i]);
> >>>  			migrate->dst[i] = migrate_pfn(migrate->dst[i]);
> >>> -			migrate->dst[i] |= MIGRATE_PFN_LOCKED;
> >>>  			src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE,
> >>>  					      DMA_TO_DEVICE);
> >>>  			r = dma_mapping_error(dev, src[i]);
> >>> @@ -580,7 +579,6 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
> >>>  			      dst[i] >> PAGE_SHIFT, page_to_pfn(dpage));
> >>>  
> >>>  		migrate->dst[i] = migrate_pfn(page_to_pfn(dpage));
> >>> -		migrate->dst[i] |= MIGRATE_PFN_LOCKED;
> >>>  		j++;
> >>>  	}
> >>>  
> >>> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> >>> index 92987daa5e17..3828aafd3ac4 100644
> >>> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> >>> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> >>> @@ -166,7 +166,7 @@ static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
> >>>  		goto error_dma_unmap;
> >>>  	mutex_unlock(&svmm->mutex);
> >>>  
> >>> -	args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> >>> +	args->dst[0] = migrate_pfn(page_to_pfn(dpage));
> >>>  	return 0;
> >>>  
> >>>  error_dma_unmap:
> >>> @@ -602,7 +602,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
> >>>  		((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT);
> >>>  	if (src & MIGRATE_PFN_WRITE)
> >>>  		*pfn |= NVIF_VMM_PFNMAP_V0_W;
> >>> -	return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> >>> +	return migrate_pfn(page_to_pfn(dpage));
> >>>  
> >>>  out_dma_unmap:
> >>>  	dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
> >>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> >>> index c8077e936691..479b861ae490 100644
> >>> --- a/include/linux/migrate.h
> >>> +++ b/include/linux/migrate.h
> >>> @@ -119,7 +119,6 @@ static inline int migrate_misplaced_page(struct page *page,
> >>>   */
> >>>  #define MIGRATE_PFN_VALID	(1UL << 0)
> >>>  #define MIGRATE_PFN_MIGRATE	(1UL << 1)
> >>> -#define MIGRATE_PFN_LOCKED	(1UL << 2)
> >>>  #define MIGRATE_PFN_WRITE	(1UL << 3)
> >>>  #define MIGRATE_PFN_SHIFT	6
> >>>  
> >>> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> >>> index c259842f6d44..e2ce8f9b7605 100644
> >>> --- a/lib/test_hmm.c
> >>> +++ b/lib/test_hmm.c
> >>> @@ -613,8 +613,7 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
> >>>  		 */
> >>>  		rpage->zone_device_data = dmirror;
> >>>  
> >>> -		*dst = migrate_pfn(page_to_pfn(dpage)) |
> >>> -			    MIGRATE_PFN_LOCKED;
> >>> +		*dst = migrate_pfn(page_to_pfn(dpage));
> >>>  		if ((*src & MIGRATE_PFN_WRITE) ||
> >>>  		    (!spage && args->vma->vm_flags & VM_WRITE))
> >>>  			*dst |= MIGRATE_PFN_WRITE;
> >>> @@ -1137,7 +1136,7 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> >>>  		lock_page(dpage);
> >>>  		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> >>>  		copy_highpage(dpage, spage);
> >>> -		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> >>> +		*dst = migrate_pfn(page_to_pfn(dpage));
> >>>  		if (*src & MIGRATE_PFN_WRITE)
> >>>  			*dst |= MIGRATE_PFN_WRITE;
> >>>  	}
> >>> diff --git a/mm/migrate.c b/mm/migrate.c
> >>> index a6a7743ee98f..915e969811d0 100644
> >>> --- a/mm/migrate.c
> >>> +++ b/mm/migrate.c
> >>> @@ -2369,7 +2369,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >>>  		 * can't be dropped from it).
> >>>  		 */
> >>>  		get_page(page);
> >>> -		migrate->cpages++;
> >>>  
> >>>  		/*
> >>>  		 * Optimize for the common case where page is only mapped once
> >>> @@ -2379,7 +2378,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >>>  		if (trylock_page(page)) {
> >>>  			pte_t swp_pte;
> >>>  
> >>> -			mpfn |= MIGRATE_PFN_LOCKED;
> >>> +			migrate->cpages++;
> >>>  			ptep_get_and_clear(mm, addr, ptep);
> >>>  
> >>>  			/* Setup special migration page table entry */
> >>> @@ -2413,6 +2412,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >>>  
> >>>  			if (pte_present(pte))
> >>>  				unmapped++;
> >>> +		} else {
> >>> +			put_page(page);
> >>> +			mpfn = 0;
> >>>  		}
> >>>  
> >>>  next:
> >>> @@ -2517,15 +2519,17 @@ static bool migrate_vma_check_page(struct page *page)
> >>>  }
> >>>  
> >>>  /*
> >>> - * migrate_vma_prepare() - lock pages and isolate them from the lru
> >>> + * migrate_vma_unmap() - replace page mapping with special migration pte entry
> >>>   * @migrate: migrate struct containing all migration information
> >>>   *
> >>> - * This locks pages that have been collected by migrate_vma_collect(). Once each
> >>> - * page is locked it is isolated from the lru (for non-device pages). Finally,
> >>> - * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
> >>> - * migrated by concurrent kernel threads.
> >>> + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
> >>> + * special migration pte entry and check if it has been pinned. Pinned pages are
> >>> + * restored because we cannot migrate them.
> >>> + *
> >>> + * This is the last step before we call the device driver callback to allocate
> >>> + * destination memory and copy contents of original page over to new page.
> >>>   */
> >>> -static void migrate_vma_prepare(struct migrate_vma *migrate)
> >>> +static void migrate_vma_unmap(struct migrate_vma *migrate)
> >>>  {
> >>>  	const unsigned long npages = migrate->npages;
> >>>  	const unsigned long start = migrate->start;
> >>> @@ -2534,32 +2538,12 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
> >>>  
> >>>  	lru_add_drain();
> >>>  
> >>> -	for (i = 0; (i < npages) && migrate->cpages; i++) {
> >>> +	for (i = 0; i < npages; i++) {
> >>>  		struct page *page = migrate_pfn_to_page(migrate->src[i]);
> >>> -		bool remap = true;
> >>>  
> >>>  		if (!page)
> >>>  			continue;
> >>>  
> >>> -		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
> >>> -			/*
> >>> -			 * Because we are migrating several pages there can be
> >>> -			 * a deadlock between 2 concurrent migration where each
> >>> -			 * are waiting on each other page lock.
> >>> -			 *
> >>> -			 * Make migrate_vma() a best effort thing and backoff
> >>> -			 * for any page we can not lock right away.
> >>> -			 */
> >>> -			if (!trylock_page(page)) {
> >>> -				migrate->src[i] = 0;
> >>> -				migrate->cpages--;
> >>> -				put_page(page);
> >>> -				continue;
> >>> -			}
> >>> -			remap = false;
> >>> -			migrate->src[i] |= MIGRATE_PFN_LOCKED;
> >>> -		}
> >>> -
> >>>  		/* ZONE_DEVICE pages are not on LRU */
> >>>  		if (!is_zone_device_page(page)) {
> >>>  			if (!PageLRU(page) && allow_drain) {
> >>> @@ -2569,16 +2553,9 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
> >>>  			}
> >>>  
> >>>  			if (isolate_lru_page(page)) {
> >>> -				if (remap) {
> >>> -					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> >>> -					migrate->cpages--;
> >>> -					restore++;
> >>> -				} else {
> >>> -					migrate->src[i] = 0;
> >>> -					unlock_page(page);
> >>> -					migrate->cpages--;
> >>> -					put_page(page);
> >>> -				}
> >>> +				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> >>> +				migrate->cpages--;
> >>> +				restore++;
> >>>  				continue;
> >>>  			}
> >>>  
> >>> @@ -2586,80 +2563,20 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
> >>>  			put_page(page);
> >>>  		}
> >>>  
> >>> -		if (!migrate_vma_check_page(page)) {
> >>> -			if (remap) {
> >>> -				migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> >>> -				migrate->cpages--;
> >>> -				restore++;
> >>> -
> >>> -				if (!is_zone_device_page(page)) {
> >>> -					get_page(page);
> >>> -					putback_lru_page(page);
> >>> -				}
> >>> -			} else {
> >>> -				migrate->src[i] = 0;
> >>> -				unlock_page(page);
> >>> -				migrate->cpages--;
> >>> +		if (page_mapped(page))
> >>> +			try_to_migrate(page, 0);
> >>>  
> >>> -				if (!is_zone_device_page(page))
> >>> -					putback_lru_page(page);
> >>> -				else
> >>> -					put_page(page);
> >>> +		if (page_mapped(page) || !migrate_vma_check_page(page)) {
> >>> +			if (!is_zone_device_page(page)) {
> >>> +				get_page(page);
> >>> +				putback_lru_page(page);
> >>>  			}
> >>> -		}
> >>> -	}
> >>> -
> >>> -	for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
> >>> -		struct page *page = migrate_pfn_to_page(migrate->src[i]);
> >>> -
> >>> -		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
> >>> -			continue;
> >>>  
> >>> -		remove_migration_pte(page, migrate->vma, addr, page);
> >>> -
> >>> -		migrate->src[i] = 0;
> >>> -		unlock_page(page);
> >>> -		put_page(page);
> >>> -		restore--;
> >>> -	}
> >>> -}
> >>> -
> >>> -/*
> >>> - * migrate_vma_unmap() - replace page mapping with special migration pte entry
> >>> - * @migrate: migrate struct containing all migration information
> >>> - *
> >>> - * Replace page mapping (CPU page table pte) with a special migration pte entry
> >>> - * and check again if it has been pinned. Pinned pages are restored because we
> >>> - * cannot migrate them.
> >>> - *
> >>> - * This is the last step before we call the device driver callback to allocate
> >>> - * destination memory and copy contents of original page over to new page.
> >>> - */
> >>> -static void migrate_vma_unmap(struct migrate_vma *migrate)
> >>> -{
> >>> -	const unsigned long npages = migrate->npages;
> >>> -	const unsigned long start = migrate->start;
> >>> -	unsigned long addr, i, restore = 0;
> >>> -
> >>> -	for (i = 0; i < npages; i++) {
> >>> -		struct page *page = migrate_pfn_to_page(migrate->src[i]);
> >>> -
> >>> -		if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
> >>> +			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> >>> +			migrate->cpages--;
> >>> +			restore++;
> >>>  			continue;
> >>> -
> >>> -		if (page_mapped(page)) {
> >>> -			try_to_migrate(page, 0);
> >>> -			if (page_mapped(page))
> >>> -				goto restore;
> >>>  		}
> >>> -
> >>> -		if (migrate_vma_check_page(page))
> >>> -			continue;
> >>> -
> >>> -restore:
> >>> -		migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> >>> -		migrate->cpages--;
> >>> -		restore++;
> >>>  	}
> >>>  
> >>>  	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
> >>> @@ -2672,12 +2589,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
> >>>  
> >>>  		migrate->src[i] = 0;
> >>>  		unlock_page(page);
> >>> +		put_page(page);
> >>>  		restore--;
> >>> -
> >>> -		if (is_zone_device_page(page))
> >>> -			put_page(page);
> >>> -		else
> >>> -			putback_lru_page(page);
> >>>  	}
> >>>  }
> >>>  
> >>> @@ -2700,8 +2613,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
> >>>   * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
> >>>   * flag set).  Once these are allocated and copied, the caller must update each
> >>>   * corresponding entry in the dst array with the pfn value of the destination
> >>> - * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
> >>> - * (destination pages must have their struct pages locked, via lock_page()).
> >>> + * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
> >>> + * lock_page().
> >>>   *
> >>>   * Note that the caller does not have to migrate all the pages that are marked
> >>>   * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
> >>> @@ -2770,8 +2683,6 @@ int migrate_vma_setup(struct migrate_vma *args)
> >>>  
> >>>  	migrate_vma_collect(args);
> >>>  
> >>> -	if (args->cpages)
> >>> -		migrate_vma_prepare(args);
> >>>  	if (args->cpages)
> >>>  		migrate_vma_unmap(args);
> >>>  
> >
> >
> 





^ permalink raw reply

* [PATCH 1/2] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race
From: Nicholas Piggin @ 2021-10-29  8:39 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Laurent Dufour, Nicholas Piggin

It is possible for all CPUs to miss the pending cpumask becoming clear,
and then nobody resetting it, which will cause the lockup detector to
stop working. It will eventually expire, but watchdog_smp_panic will
avoid doing anything if the pending mask is clear and it will never be
reset.

Order the cpumask clear vs the subsequent test to close this race.

Add an extra check for an empty pending mask when the watchdog fires and
finds its bit still clear, to try to catch any other possible races or
bugs here and keep the watchdog working. The extra test in
arch_touch_nmi_watchdog is required to prevent the new warning from
firing off.

Debugged-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/watchdog.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index f9ea0e5357f9..4bb7c8e371a2 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -215,13 +215,38 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 
 			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
 			wd_smp_unlock(&flags);
+		} else {
+			/*
+			 * The last CPU to clear pending should have reset the
+			 * watchdog, yet we find it empty here. This should not
+			 * happen but we can try to recover and avoid a false
+			 * positive if it does.
+			 */
+			if (WARN_ON_ONCE(cpumask_empty(&wd_smp_cpus_pending)))
+				goto none_pending;
 		}
 		return;
 	}
+
 	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+	/*
+	 * Order the store to clear pending with the load(s) to check all
+	 * words in the pending mask to check they are all empty. This orders
+	 * with the same barrier on another CPU. This prevents two CPUs
+	 * clearing the last 2 pending bits, but neither seeing the other's
+	 * store when checking if the mask is empty, and missing an empty
+	 * mask, which ends with a false positive.
+	 */
+	smp_mb();
 	if (cpumask_empty(&wd_smp_cpus_pending)) {
 		unsigned long flags;
 
+none_pending:
+		/*
+		 * Double check under lock because more than one CPU could see
+		 * a clear mask with the lockless check after clearing their
+		 * pending bits.
+		 */
 		wd_smp_lock(&flags);
 		if (cpumask_empty(&wd_smp_cpus_pending)) {
 			wd_smp_last_reset_tb = tb;
@@ -312,8 +337,12 @@ void arch_touch_nmi_watchdog(void)
 {
 	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
-	u64 tb = get_tb();
+	u64 tb;
 
+	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
+		return;
+
+	tb = get_tb();
 	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
 		per_cpu(wd_timer_tb, cpu) = tb;
 		wd_smp_clear_cpu_pending(cpu, tb);
-- 
2.23.0


^ permalink raw reply related

* [PATCH 2/2] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi
From: Nicholas Piggin @ 2021-10-29  8:39 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Laurent Dufour, Nicholas Piggin
In-Reply-To: <20211029083908.87931-1-npiggin@gmail.com>

There is a deadlock with the console_owner lock and the wd_smp_lock:

CPU x takes the console_owner lock
CPU y takes a watchdog timer interrupt and takes __wd_smp_lock
CPU x takes a watchdog timer interrupt and spins on __wd_smp_lock
CPU y detects a deadlock, tries to print something and spins on console_owner
-> deadlock

Change the watchdog locking scheme so wd_smp_lock protects the watchdog
internal data, but "reporting" (printing, issuing NMI IPIs, taking any
action outside of watchdog) uses a non-waiting exclusion. If a CPU detects
a problem but can not take the reporting lock, it just returns because
something else is already reporting. It will try again at some point.

Typically hard lockup watchdog report usefulness is not impacted due to
failure to spewing a large enough amount of data in as short a time as
possible, but by messages getting garbled.

Laurent debugged this and found the deadlock, and this patch is based on
his general approach to avoid expensive operations while holding the lock.
With the addition of the reporting exclusion.

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
[np: rework to add reporting exclusion update changelog]
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/watchdog.c | 76 ++++++++++++++++++++++++++--------
 1 file changed, 59 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 4bb7c8e371a2..69a475aa0f44 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -85,10 +85,32 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
 
 /* SMP checker bits */
 static unsigned long __wd_smp_lock;
+static unsigned long __wd_reporting;
 static cpumask_t wd_smp_cpus_pending;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
 
+/*
+ * Try to take the exclusive watchdog action / NMI IPI / printing lock.
+ * wd_smp_lock must be held. If this fails, we should return and wait
+ * for the watchdog to kick in again (or another CPU to trigger it).
+ */
+static bool wd_try_report(void)
+{
+	if (__wd_reporting)
+		return false;
+	__wd_reporting = 1;
+	return true;
+}
+
+/* End printing after successful wd_try_report. wd_smp_lock not required. */
+static void wd_end_reporting(void)
+{
+	smp_mb(); /* End printing "critical section" */
+	WARN_ON_ONCE(__wd_reporting == 0);
+	WRITE_ONCE(__wd_reporting, 0);
+}
+
 static inline void wd_smp_lock(unsigned long *flags)
 {
 	/*
@@ -131,10 +153,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
-static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
+static void set_cpu_stuck(int cpu, u64 tb)
 {
-	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
-	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
+	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
+	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
 	if (cpumask_empty(&wd_smp_cpus_pending)) {
 		wd_smp_last_reset_tb = tb;
 		cpumask_andnot(&wd_smp_cpus_pending,
@@ -142,10 +164,6 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
 				&wd_smp_cpus_stuck);
 	}
 }
-static void set_cpu_stuck(int cpu, u64 tb)
-{
-	set_cpumask_stuck(cpumask_of(cpu), tb);
-}
 
 static void watchdog_smp_panic(int cpu, u64 tb)
 {
@@ -160,6 +178,9 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 		goto out;
 	if (cpumask_weight(&wd_smp_cpus_pending) == 0)
 		goto out;
+	if (!wd_try_report())
+		goto out;
+	wd_smp_unlock(&flags);
 
 	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
 		 cpu, cpumask_pr_args(&wd_smp_cpus_pending));
@@ -172,24 +193,32 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 		 * Try to trigger the stuck CPUs, unless we are going to
 		 * get a backtrace on all of them anyway.
 		 */
-		for_each_cpu(c, &wd_smp_cpus_pending) {
+		for_each_online_cpu(c) {
 			if (c == cpu)
 				continue;
+			if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
+				continue;
+			wd_smp_lock(&flags);
+			if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
+				wd_smp_unlock(&flags);
+				continue;
+			}
+			/* Take the stuck CPU out of the watch group */
+			set_cpu_stuck(cpu, tb);
+			wd_smp_unlock(&flags);
+
 			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
 		}
 	}
 
-	/* Take the stuck CPUs out of the watch group */
-	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
-
-	wd_smp_unlock(&flags);
-
 	if (sysctl_hardlockup_all_cpu_backtrace)
 		trigger_allbutself_cpu_backtrace();
 
 	if (hardlockup_panic)
 		nmi_panic(NULL, "Hard LOCKUP");
 
+	wd_end_reporting();
+
 	return;
 
 out:
@@ -203,8 +232,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 			struct pt_regs *regs = get_irq_regs();
 			unsigned long flags;
 
-			wd_smp_lock(&flags);
-
 			pr_emerg("CPU %d became unstuck TB:%lld\n",
 				 cpu, tb);
 			print_irqtrace_events(current);
@@ -213,6 +240,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 			else
 				dump_stack();
 
+			wd_smp_lock(&flags);
 			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
 			wd_smp_unlock(&flags);
 		} else {
@@ -291,8 +319,17 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 			wd_smp_unlock(&flags);
 			return 0;
 		}
+		if (!wd_try_report()) {
+			wd_smp_unlock(&flags);
+			/* Couldn't report, try again in 100ms */
+			mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
+			return 0;
+		}
+
 		set_cpu_stuck(cpu, tb);
 
+		wd_smp_unlock(&flags);
+
 		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
 			 cpu, (void *)regs->nip);
 		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
@@ -302,14 +339,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 		print_irqtrace_events(current);
 		show_regs(regs);
 
-		wd_smp_unlock(&flags);
-
 		if (sysctl_hardlockup_all_cpu_backtrace)
 			trigger_allbutself_cpu_backtrace();
 
 		if (hardlockup_panic)
 			nmi_panic(regs, "Hard LOCKUP");
+
+		wd_end_reporting();
 	}
+	/*
+	 * We are okay to change DEC in soft_nmi_interrupt because the masked
+	 * handler has marked a DEC as pending, so the timer interrupt will be
+	 * replayed as soon as local irqs are enabled again.
+	 */
 	if (wd_panic_timeout_tb < 0x7fffffff)
 		mtspr(SPRN_DEC, wd_panic_timeout_tb);
 
-- 
2.23.0


^ permalink raw reply related

* Re: [V3] powerpc/perf: Enable PMU counters post partition migration if PMU is active
From: kernel test robot @ 2021-10-29 10:20 UTC (permalink / raw)
  To: Athira Rajeev, mpe
  Cc: nathanl, maddy, kbuild-all, rnsastry, kjain, npiggin,
	linuxppc-dev
In-Reply-To: <20211029030510.58797-1-atrajeev@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 3091 bytes --]

Hi Athira,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on tip/perf/core v5.15-rc7 next-20211028]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Athira-Rajeev/powerpc-perf-Enable-PMU-counters-post-partition-migration-if-PMU-is-active/20211029-110804
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc64-randconfig-r003-20211028 (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/caeb6bb8d2f36e6b15151587193c1e0a9e62ab16
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Athira-Rajeev/powerpc-perf-Enable-PMU-counters-post-partition-migration-if-PMU-is-active/20211029-110804
        git checkout caeb6bb8d2f36e6b15151587193c1e0a9e62ab16
        # save the attached .config to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/powerpc/platforms/pseries/mobility.c: In function 'pseries_migrate_partition':
>> arch/powerpc/platforms/pseries/mobility.c:670:22: error: 'mobility_pmu_disable' undeclared (first use in this function)
     670 |         on_each_cpu(&mobility_pmu_disable, NULL, 0);
         |                      ^~~~~~~~~~~~~~~~~~~~
   arch/powerpc/platforms/pseries/mobility.c:670:22: note: each undeclared identifier is reported only once for each function it appears in
>> arch/powerpc/platforms/pseries/mobility.c:679:22: error: 'mobility_pmu_enable' undeclared (first use in this function)
     679 |         on_each_cpu(&mobility_pmu_enable, NULL, 0);
         |                      ^~~~~~~~~~~~~~~~~~~


vim +/mobility_pmu_disable +670 arch/powerpc/platforms/pseries/mobility.c

   660	
   661	static int pseries_migrate_partition(u64 handle)
   662	{
   663		int ret;
   664	
   665		ret = wait_for_vasi_session_suspending(handle);
   666		if (ret)
   667			return ret;
   668	
   669		/* Disable PMU before suspend */
 > 670		on_each_cpu(&mobility_pmu_disable, NULL, 0);
   671	
   672		ret = pseries_suspend(handle);
   673		if (ret == 0)
   674			post_mobility_fixup();
   675		else
   676			pseries_cancel_migration(handle, ret);
   677	
   678		/* Enable PMU after resuming */
 > 679		on_each_cpu(&mobility_pmu_enable, NULL, 0);
   680	
   681		return ret;
   682	}
   683	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 36333 bytes --]

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi
From: Laurent Dufour @ 2021-10-29 12:09 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev
In-Reply-To: <20211029083908.87931-2-npiggin@gmail.com>

Le 29/10/2021 à 10:39, Nicholas Piggin a écrit :
> There is a deadlock with the console_owner lock and the wd_smp_lock:
> 
> CPU x takes the console_owner lock
> CPU y takes a watchdog timer interrupt and takes __wd_smp_lock
> CPU x takes a watchdog timer interrupt and spins on __wd_smp_lock
> CPU y detects a deadlock, tries to print something and spins on console_owner
> -> deadlock
> 
> Change the watchdog locking scheme so wd_smp_lock protects the watchdog
> internal data, but "reporting" (printing, issuing NMI IPIs, taking any
> action outside of watchdog) uses a non-waiting exclusion. If a CPU detects
> a problem but can not take the reporting lock, it just returns because
> something else is already reporting. It will try again at some point.
> 
> Typically hard lockup watchdog report usefulness is not impacted due to
> failure to spewing a large enough amount of data in as short a time as
> possible, but by messages getting garbled.
> 
> Laurent debugged this and found the deadlock, and this patch is based on
> his general approach to avoid expensive operations while holding the lock.
> With the addition of the reporting exclusion.
> 
> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
> [np: rework to add reporting exclusion update changelog]
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   arch/powerpc/kernel/watchdog.c | 76 ++++++++++++++++++++++++++--------
>   1 file changed, 59 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
> index 4bb7c8e371a2..69a475aa0f44 100644
> --- a/arch/powerpc/kernel/watchdog.c
> +++ b/arch/powerpc/kernel/watchdog.c
> @@ -85,10 +85,32 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
>   
>   /* SMP checker bits */
>   static unsigned long __wd_smp_lock;
> +static unsigned long __wd_reporting;
>   static cpumask_t wd_smp_cpus_pending;
>   static cpumask_t wd_smp_cpus_stuck;
>   static u64 wd_smp_last_reset_tb;
>   
> +/*
> + * Try to take the exclusive watchdog action / NMI IPI / printing lock.
> + * wd_smp_lock must be held. If this fails, we should return and wait
> + * for the watchdog to kick in again (or another CPU to trigger it).
> + */
> +static bool wd_try_report(void)
> +{
> +	if (__wd_reporting)
> +		return false;
> +	__wd_reporting = 1;
> +	return true;
> +}
> +
> +/* End printing after successful wd_try_report. wd_smp_lock not required. */
> +static void wd_end_reporting(void)
> +{
> +	smp_mb(); /* End printing "critical section" */
> +	WARN_ON_ONCE(__wd_reporting == 0);
> +	WRITE_ONCE(__wd_reporting, 0);
> +}
> +
>   static inline void wd_smp_lock(unsigned long *flags)
>   {
>   	/*
> @@ -131,10 +153,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
>   	/* Do not panic from here because that can recurse into NMI IPI layer */
>   }
>   
> -static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
> +static void set_cpu_stuck(int cpu, u64 tb)
>   {
> -	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
> -	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
> +	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
> +	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
>   	if (cpumask_empty(&wd_smp_cpus_pending)) {
>   		wd_smp_last_reset_tb = tb;
>   		cpumask_andnot(&wd_smp_cpus_pending,
> @@ -142,10 +164,6 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
>   				&wd_smp_cpus_stuck);
>   	}
>   }
> -static void set_cpu_stuck(int cpu, u64 tb)
> -{
> -	set_cpumask_stuck(cpumask_of(cpu), tb);
> -}
>   
>   static void watchdog_smp_panic(int cpu, u64 tb)
>   {
> @@ -160,6 +178,9 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   		goto out;
>   	if (cpumask_weight(&wd_smp_cpus_pending) == 0)
>   		goto out;
> +	if (!wd_try_report())
> +		goto out;
> +	wd_smp_unlock(&flags);
>   
>   	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
>   		 cpu, cpumask_pr_args(&wd_smp_cpus_pending));
> @@ -172,24 +193,32 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   		 * Try to trigger the stuck CPUs, unless we are going to
>   		 * get a backtrace on all of them anyway.
>   		 */
> -		for_each_cpu(c, &wd_smp_cpus_pending) {
> +		for_each_online_cpu(c) {
>   			if (c == cpu)
>   				continue;
> +			if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
                                               ^ c
cpu is the reporting CPU, c is the target here.

> +				continue;
> +			wd_smp_lock(&flags);
> +			if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
                                               ^ again c
> +				wd_smp_unlock(&flags);
> +				continue;
> +			}
> +			/* Take the stuck CPU out of the watch group */
> +			set_cpu_stuck(cpu, tb);
                                       ^ c
> +			wd_smp_unlock(&flags);
> +
>   			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
>   		}
>   	}
>   
> -	/* Take the stuck CPUs out of the watch group */
> -	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
> -
> -	wd_smp_unlock(&flags);
> -
>   	if (sysctl_hardlockup_all_cpu_backtrace)
>   		trigger_allbutself_cpu_backtrace();
>   
>   	if (hardlockup_panic)
>   		nmi_panic(NULL, "Hard LOCKUP");
>   
> +	wd_end_reporting();
> +
>   	return;
>   
>   out:
> @@ -203,8 +232,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>   			struct pt_regs *regs = get_irq_regs();
>   			unsigned long flags;
>   
> -			wd_smp_lock(&flags);
> -
>   			pr_emerg("CPU %d became unstuck TB:%lld\n",
>   				 cpu, tb);
>   			print_irqtrace_events(current);
> @@ -213,6 +240,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>   			else
>   				dump_stack();
>   
> +			wd_smp_lock(&flags);
>   			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
>   			wd_smp_unlock(&flags);
>   		} else {
> @@ -291,8 +319,17 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>   			wd_smp_unlock(&flags);
>   			return 0;
>   		}
> +		if (!wd_try_report()) {
> +			wd_smp_unlock(&flags);
> +			/* Couldn't report, try again in 100ms */
> +			mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
> +			return 0;
> +		}
> +
>   		set_cpu_stuck(cpu, tb);
>   
> +		wd_smp_unlock(&flags);
> +
>   		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
>   			 cpu, (void *)regs->nip);
>   		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
> @@ -302,14 +339,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>   		print_irqtrace_events(current);
>   		show_regs(regs);
>   
> -		wd_smp_unlock(&flags);
> -
>   		if (sysctl_hardlockup_all_cpu_backtrace)
>   			trigger_allbutself_cpu_backtrace();
>   
>   		if (hardlockup_panic)
>   			nmi_panic(regs, "Hard LOCKUP");
> +
> +		wd_end_reporting();
>   	}
> +	/*
> +	 * We are okay to change DEC in soft_nmi_interrupt because the masked
> +	 * handler has marked a DEC as pending, so the timer interrupt will be
> +	 * replayed as soon as local irqs are enabled again.
> +	 */
>   	if (wd_panic_timeout_tb < 0x7fffffff)
>   		mtspr(SPRN_DEC, wd_panic_timeout_tb);
>   
> 


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox