LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v4 04/13] dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
From: Alexey Kardashevskiy @ 2026-05-18  8:19 UTC (permalink / raw)
  To: Aneesh Kumar K.V (Arm), iommu, linux-arm-kernel, linux-kernel,
	linux-coco
  Cc: Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Jason Gunthorpe, Mostafa Saleh, Petr Tesarik, Dan Williams,
	Xu Yilun, linuxppc-dev, linux-s390, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Alexander Gordeev, Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-5-aneesh.kumar@kernel.org>

On 12/5/26 19:03, Aneesh Kumar K.V (Arm) wrote:
> Teach swiotlb to distinguish between encrypted and decrypted bounce
> buffer pools, and make allocation and mapping paths select a pool whose
> state matches the requested DMA attributes.
> 
> Add a decrypted flag to io_tlb_mem, initialize it for the default and
> restricted pools, and propagate DMA_ATTR_CC_SHARED into swiotlb pool
> allocation. Reject swiotlb alloc/map requests when the selected pool does
> not match the required encrypted/decrypted state.
> 
> Also return DMA addresses with the matching phys_to_dma_{encrypted,
> unencrypted} helper so the DMA address encoding stays consistent with the
> chosen pool.
> 
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
>   include/linux/dma-direct.h |  10 ++++
>   include/linux/swiotlb.h    |   8 ++-
>   kernel/dma/direct.c        |  14 +++--
>   kernel/dma/swiotlb.c       | 108 +++++++++++++++++++++++++++----------
>   4 files changed, 107 insertions(+), 33 deletions(-)
> 
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index c249912456f9..94fad4e7c11e 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -77,6 +77,10 @@ static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map)
>   #ifndef phys_to_dma_unencrypted
>   #define phys_to_dma_unencrypted		phys_to_dma
>   #endif
> +
> +#ifndef phys_to_dma_encrypted
> +#define phys_to_dma_encrypted		phys_to_dma
> +#endif
>   #else
>   static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
>   {
> @@ -90,6 +94,12 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev,
>   {
>   	return dma_addr_unencrypted(__phys_to_dma(dev, paddr));
>   }
> +
> +static inline dma_addr_t phys_to_dma_encrypted(struct device *dev,
> +		phys_addr_t paddr)
> +{
> +	return dma_addr_encrypted(__phys_to_dma(dev, paddr));
> +}
>   /*
>    * If memory encryption is supported, phys_to_dma will set the memory encryption
>    * bit in the DMA address, and dma_to_phys will clear it.
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 3dae0f592063..b3fa3c6e0169 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -81,6 +81,7 @@ struct io_tlb_pool {
>   	struct list_head node;
>   	struct rcu_head rcu;
>   	bool transient;
> +	bool unencrypted;
>   #endif
>   };
>   
> @@ -111,6 +112,7 @@ struct io_tlb_mem {
>   	struct dentry *debugfs;
>   	bool force_bounce;
>   	bool for_alloc;
> +	bool unencrypted;
>   #ifdef CONFIG_SWIOTLB_DYNAMIC
>   	bool can_grow;
>   	u64 phys_limit;
> @@ -282,7 +284,8 @@ static inline void swiotlb_sync_single_for_cpu(struct device *dev,
>   extern void swiotlb_print_info(void);
>   
>   #ifdef CONFIG_DMA_RESTRICTED_POOL
> -struct page *swiotlb_alloc(struct device *dev, size_t size);
> +struct page *swiotlb_alloc(struct device *dev, size_t size,
> +		unsigned long attrs);
>   bool swiotlb_free(struct device *dev, struct page *page, size_t size);
>   
>   static inline bool is_swiotlb_for_alloc(struct device *dev)
> @@ -290,7 +293,8 @@ static inline bool is_swiotlb_for_alloc(struct device *dev)
>   	return dev->dma_io_tlb_mem->for_alloc;
>   }
>   #else
> -static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
> +static inline struct page *swiotlb_alloc(struct device *dev, size_t size,
> +		unsigned long attrs)
>   {
>   	return NULL;
>   }
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index dc2907439b3d..97ae4fa10521 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -104,9 +104,10 @@ static void __dma_direct_free_pages(struct device *dev, struct page *page,
>   	dma_free_contiguous(dev, page, size);
>   }
>   
> -static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
> +static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size,
> +		unsigned long attrs)
>   {
> -	struct page *page = swiotlb_alloc(dev, size);
> +	struct page *page = swiotlb_alloc(dev, size, attrs);
>   
>   	if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
>   		swiotlb_free(dev, page, size);
> @@ -266,8 +267,12 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>   						  gfp, attrs);
>   
>   	if (is_swiotlb_for_alloc(dev)) {

here we know it is shared so ...


> -		page = dma_direct_alloc_swiotlb(dev, size);
> +		page = dma_direct_alloc_swiotlb(dev, size, attrs);
>   		if (page) {
> +			/*
> +			 * swiotlb allocations comes from pool already marked
> +			 * decrypted
> +			 */

... isn't this needed here:

attrs |= DMA_ATTR_CC_SHARED;

?

Then the setup_page label below can do the right thing. I tried this with io_tlb_default_mem.for_alloc forced to 1, and it works - an accepted device can still do DMA to shared memory. Thanks,



>   			mark_mem_decrypt = false;
>   			goto setup_page;
>   		}
> @@ -374,6 +379,7 @@ void dma_direct_free(struct device *dev, size_t size,
>   		return;
>   
>   	if (swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr)))
> +		/* Swiotlb doesn't need a page attribute update on free */
>   		mark_mem_encrypted = false;
>   
>   	if (is_vmalloc_addr(cpu_addr)) {
> @@ -403,7 +409,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
>   						  gfp, attrs);
>   
>   	if (is_swiotlb_for_alloc(dev)) {
> -		page = dma_direct_alloc_swiotlb(dev, size);
> +		page = dma_direct_alloc_swiotlb(dev, size, attrs);
>   		if (!page)
>   			return NULL;
>   
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index ab4eccbaa076..065663be282c 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -259,10 +259,21 @@ void __init swiotlb_update_mem_attributes(void)
>   	struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
>   	unsigned long bytes;
>   
> +	/*
> +	 * if platform support memory encryption, swiotlb buffers are
> +	 * decrypted by default.
> +	 */
> +	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> +		io_tlb_default_mem.unencrypted = true;
> +	else
> +		io_tlb_default_mem.unencrypted = false;
> +
>   	if (!mem->nslabs || mem->late_alloc)
>   		return;
>   	bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
> -	set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
> +
> +	if (io_tlb_default_mem.unencrypted)
> +		set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
>   }
>   
>   static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
> @@ -505,8 +516,10 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
>   	if (!mem->slots)
>   		goto error_slots;
>   
> -	set_memory_decrypted((unsigned long)vstart,
> -			     (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
> +	if (io_tlb_default_mem.unencrypted)
> +		set_memory_decrypted((unsigned long)vstart,
> +				     (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
> +
>   	swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
>   				 nareas);
>   	add_mem_pool(&io_tlb_default_mem, mem);
> @@ -539,7 +552,9 @@ void __init swiotlb_exit(void)
>   	tbl_size = PAGE_ALIGN(mem->end - mem->start);
>   	slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
>   
> -	set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
> +	if (io_tlb_default_mem.unencrypted)
> +		set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
> +
>   	if (mem->late_alloc) {
>   		area_order = get_order(array_size(sizeof(*mem->areas),
>   			mem->nareas));
> @@ -563,6 +578,7 @@ void __init swiotlb_exit(void)
>    * @gfp:	GFP flags for the allocation.
>    * @bytes:	Size of the buffer.
>    * @phys_limit:	Maximum allowed physical address of the buffer.
> + * @unencrypted: true to allocate unencrypted memory, false for encrypted memory
>    *
>    * Allocate pages from the buddy allocator. If successful, make the allocated
>    * pages decrypted that they can be used for DMA.
> @@ -570,7 +586,8 @@ void __init swiotlb_exit(void)
>    * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
>    * if the allocated physical address was above @phys_limit.
>    */
> -static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
> +static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes,
> +		u64 phys_limit, bool unencrypted)
>   {
>   	unsigned int order = get_order(bytes);
>   	struct page *page;
> @@ -588,13 +605,13 @@ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
>   	}
>   
>   	vaddr = phys_to_virt(paddr);
> -	if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
> +	if (unencrypted && set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
>   		goto error;
>   	return page;
>   
>   error:
>   	/* Intentional leak if pages cannot be encrypted again. */
> -	if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> +	if (unencrypted && !set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
>   		__free_pages(page, order);
>   	return NULL;
>   }
> @@ -604,30 +621,26 @@ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
>    * @dev:	Device for which a memory pool is allocated.
>    * @bytes:	Size of the buffer.
>    * @phys_limit:	Maximum allowed physical address of the buffer.
> + * @attrs:	DMA attributes for the allocation.
>    * @gfp:	GFP flags for the allocation.
>    *
>    * Return: Allocated pages, or %NULL on allocation failure.
>    */
>   static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> -		u64 phys_limit, gfp_t gfp)
> +		u64 phys_limit, unsigned long attrs, gfp_t gfp)
>   {
>   	struct page *page;
> -	unsigned long attrs = 0;
>   
>   	/*
>   	 * Allocate from the atomic pools if memory is encrypted and
>   	 * the allocation is atomic, because decrypting may block.
>   	 */
> -	if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
> +	if (!gfpflags_allow_blocking(gfp) && (attrs & DMA_ATTR_CC_SHARED)) {
>   		void *vaddr;
>   
>   		if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
>   			return NULL;
>   
> -		/* swiotlb considered decrypted by default */
> -		if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> -			attrs = DMA_ATTR_CC_SHARED;
> -
>   		return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
>   					   attrs, dma_coherent_ok);
>   	}
> @@ -638,7 +651,8 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
>   	else if (phys_limit <= DMA_BIT_MASK(32))
>   		gfp |= __GFP_DMA32;
>   
> -	while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
> +	while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit,
> +					     !!(attrs & DMA_ATTR_CC_SHARED)))) {
>   		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
>   		    phys_limit < DMA_BIT_MASK(64) &&
>   		    !(gfp & (__GFP_DMA32 | __GFP_DMA)))
> @@ -657,15 +671,18 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
>    * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
>    * @vaddr:	Virtual address of the buffer.
>    * @bytes:	Size of the buffer.
> + * @unencrypted: true if @vaddr was allocated decrypted and must be
> + *	re-encrypted before being freed
>    */
> -static void swiotlb_free_tlb(void *vaddr, size_t bytes)
> +static void swiotlb_free_tlb(void *vaddr, size_t bytes, bool unencrypted)
>   {
>   	if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
>   	    dma_free_from_pool(NULL, vaddr, bytes))
>   		return;
>   
>   	/* Intentional leak if pages cannot be encrypted again. */
> -	if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> +	if (!unencrypted ||
> +	    !set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
>   		__free_pages(virt_to_page(vaddr), get_order(bytes));
>   }
>   
> @@ -676,6 +693,7 @@ static void swiotlb_free_tlb(void *vaddr, size_t bytes)
>    * @nslabs:	Desired (maximum) number of slabs.
>    * @nareas:	Number of areas.
>    * @phys_limit:	Maximum DMA buffer physical address.
> + * @attrs:	DMA attributes for the allocation.
>    * @gfp:	GFP flags for the allocations.
>    *
>    * Allocate and initialize a new IO TLB memory pool. The actual number of
> @@ -686,7 +704,8 @@ static void swiotlb_free_tlb(void *vaddr, size_t bytes)
>    */
>   static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
>   		unsigned long minslabs, unsigned long nslabs,
> -		unsigned int nareas, u64 phys_limit, gfp_t gfp)
> +		unsigned int nareas, u64 phys_limit, unsigned long attrs,
> +		gfp_t gfp)
>   {
>   	struct io_tlb_pool *pool;
>   	unsigned int slot_order;
> @@ -704,9 +723,10 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
>   	if (!pool)
>   		goto error;
>   	pool->areas = (void *)pool + sizeof(*pool);
> +	pool->unencrypted = !!(attrs & DMA_ATTR_CC_SHARED);
>   
>   	tlb_size = nslabs << IO_TLB_SHIFT;
> -	while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
> +	while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, attrs, gfp))) {
>   		if (nslabs <= minslabs)
>   			goto error_tlb;
>   		nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
> @@ -724,7 +744,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
>   	return pool;
>   
>   error_slots:
> -	swiotlb_free_tlb(page_address(tlb), tlb_size);
> +	swiotlb_free_tlb(page_address(tlb), tlb_size,
> +			 !!(attrs & DMA_ATTR_CC_SHARED));
>   error_tlb:
>   	kfree(pool);
>   error:
> @@ -742,7 +763,9 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
>   	struct io_tlb_pool *pool;
>   
>   	pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
> -				  default_nareas, mem->phys_limit, GFP_KERNEL);
> +				  default_nareas, mem->phys_limit,
> +				  mem->unencrypted ? DMA_ATTR_CC_SHARED : 0,
> +				  GFP_KERNEL);
>   	if (!pool) {
>   		pr_warn_ratelimited("Failed to allocate new pool");
>   		return;
> @@ -762,7 +785,7 @@ static void swiotlb_dyn_free(struct rcu_head *rcu)
>   	size_t tlb_size = pool->end - pool->start;
>   
>   	free_pages((unsigned long)pool->slots, get_order(slots_size));
> -	swiotlb_free_tlb(pool->vaddr, tlb_size);
> +	swiotlb_free_tlb(pool->vaddr, tlb_size, pool->unencrypted);
>   	kfree(pool);
>   }
>   
> @@ -1232,6 +1255,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
>   	nslabs = nr_slots(alloc_size);
>   	phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
>   	pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
> +				  mem->unencrypted ? DMA_ATTR_CC_SHARED : 0,
>   				  GFP_NOWAIT);
>   	if (!pool)
>   		return -1;
> @@ -1394,6 +1418,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
>   		enum dma_data_direction dir, unsigned long attrs)
>   {
>   	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> +	bool require_decrypted = false;
>   	unsigned int offset;
>   	struct io_tlb_pool *pool;
>   	unsigned int i;
> @@ -1411,6 +1436,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
>   	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
>   		pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
>   
> +	/*
> +	 * if we are trying to swiotlb map a decrypted paddr or the paddr is encrypted
> +	 * but the device is forcing decryption, use decrypted io_tlb_mem
> +	 */
> +	if ((attrs & DMA_ATTR_CC_SHARED) || force_dma_unencrypted(dev))
> +		require_decrypted = true;
> +
> +	if (require_decrypted != mem->unencrypted)
> +		return (phys_addr_t)DMA_MAPPING_ERROR;
> +
>   	/*
>   	 * The default swiotlb memory pool is allocated with PAGE_SIZE
>   	 * alignment. If a mapping is requested with larger alignment,
> @@ -1608,8 +1643,14 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
>   	if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
>   		return DMA_MAPPING_ERROR;
>   
> -	/* Ensure that the address returned is DMA'ble */
> -	dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
> +	/*
> +	 * Use the allocated io_tlb_mem encryption type to determine dma addr.
> +	 */
> +	if (dev->dma_io_tlb_mem->unencrypted)
> +		dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
> +	else
> +		dma_addr = phys_to_dma_encrypted(dev, swiotlb_addr);
> +
>   	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
>   		__swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
>   			attrs | DMA_ATTR_SKIP_CPU_SYNC,
> @@ -1773,7 +1814,8 @@ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
>   
>   #ifdef CONFIG_DMA_RESTRICTED_POOL
>   
> -struct page *swiotlb_alloc(struct device *dev, size_t size)
> +struct page *swiotlb_alloc(struct device *dev, size_t size,
> +		unsigned long attrs)
>   {
>   	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
>   	struct io_tlb_pool *pool;
> @@ -1784,6 +1826,9 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
>   	if (!mem)
>   		return NULL;
>   
> +	if (mem->unencrypted != !!(attrs & DMA_ATTR_CC_SHARED))
> +		return NULL;
> +
>   	align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
>   	index = swiotlb_find_slots(dev, 0, size, align, &pool);
>   	if (index == -1)
> @@ -1853,9 +1898,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
>   			kfree(mem);
>   			return -ENOMEM;
>   		}
> +		/*
> +		 * if platform supports memory encryption,
> +		 * restricted mem pool is decrypted by default
> +		 */
> +		if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
> +			mem->unencrypted = true;
> +			set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
> +					     rmem->size >> PAGE_SHIFT);
> +		} else {
> +			mem->unencrypted = false;
> +		}
>   
> -		set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
> -				     rmem->size >> PAGE_SHIFT);
>   		swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
>   					 false, nareas);
>   		mem->force_bounce = true;

-- 
Alexey



^ permalink raw reply

* Re: [PATCH 0/3] powerpc: fix preempt_count imbalances in perf and kexec paths
From: Shrikanth Hegde @ 2026-05-18  8:08 UTC (permalink / raw)
  To: Aboorva Devarajan, Madhavan Srinivasan, linuxppc-dev
  Cc: Athira Rajeev, Christophe Leroy, linux-kernel, Sourabh Jain,
	Ritesh Harjani
In-Reply-To: <20260518050855.1147242-1-aboorvad@linux.ibm.com>

Hi Aboorva,

On 5/18/26 10:38 AM, Aboorva Devarajan wrote:
> Hi all,
> 
> This patch series fixes some minor preempt_count bookkeeping issues in
> arch/powerpc/ found during a preemption leak audit prompted by the
> lazy/full preemption model changes. These are get_cpu/put_cpu and
> get_cpu_var/put_cpu_var pairing errors that leave preempt_count
> incorrectly elevated or underflowed.
> 

Thanks for fixing some of these.

While we are at it, can you also fix the mismatched preempt disable/enable
pairs in the files below?

1. kernel/kprobes.c - kprobe_handler - It disables preemption, but doesn't re-enable it in some return paths.
    A definite leak.

2. Maybe platforms/pseries/lpar.c and platforms/powernv/opal-tracepoints.c,
    in __trace_hcall_entry/exit.  It may be a rare corner case, and
    I don't see a big concern there, but a mismatch may be remotely possible.
    We need to evaluate whether it should be fixed or not.


^ permalink raw reply

* Re: [PATCH 2/3] powerpc/powernv: fix preempt count leak in pnv_kexec_wait_secondaries_down
From: Shrikanth Hegde @ 2026-05-18  7:56 UTC (permalink / raw)
  To: Aboorva Devarajan, Madhavan Srinivasan, linuxppc-dev
  Cc: Athira Rajeev, Christophe Leroy, linux-kernel, Sourabh Jain,
	Ritesh Harjani
In-Reply-To: <20260518050855.1147242-3-aboorvad@linux.ibm.com>


Hi Aboorva.

On 5/18/26 10:38 AM, Aboorva Devarajan wrote:
> pnv_kexec_wait_secondaries_down() calls get_cpu() to obtain the current
> CPU id but never calls the matching put_cpu(), leaking one
> preempt_disable() nesting level on every invocation.
> 
> In practice the imbalance does not trigger a visible splat because the
> kexec teardown path is a one-way trip: IRQs are already disabled, no
> schedule() occurs after the leak, and default_machine_kexec() overwrites
> preempt_count with HARDIRQ_OFFSET before jumping into kexec_sequence()
> which never returns. However the bookkeeping is still wrong.
> 
> In the kexec teardown path IRQs are already disabled and the CPU is
> pinned, so get_cpu()'s preempt_disable() side-effect is unnecessary.
> Replace get_cpu() with raw_smp_processor_id() which returns the CPU id
> without touching preempt_count.
> 
> Fixes: 298b34d7d578 ("powerpc/powernv: Fix kexec races going back to OPAL")
> Signed-off-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> ---
>   arch/powerpc/platforms/powernv/setup.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
> index 4dbb47ddbdcc4..177da0defcb36 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -396,7 +396,7 @@ static void pnv_kexec_wait_secondaries_down(void)
>   {
>   	int my_cpu, i, notified = -1;
>   
> -	my_cpu = get_cpu();
> +	my_cpu = raw_smp_processor_id();
>   

Is it always called with IRQs disabled?
How about !CONFIG_SMP, in kexec_prepare_cpus()? I see it disables interrupts later.
(though it is a less common config)

So use smp_processor_id() instead? One could compile with CONFIG_DEBUG_PREEMPT=y and
see whether there are any reports.

>   	for_each_online_cpu(i) {
>   		uint8_t status;


^ permalink raw reply

* Re: [PATCH v4 2/5] powerpc/bpf: Move out dummy_tramp_addr after Long branch stub
From: Hari Bathini @ 2026-05-18  7:53 UTC (permalink / raw)
  To: adubey, bpf
  Cc: linuxppc-dev, maddy, ast, andrii, daniel, shuah, linux-kselftest,
	stable
In-Reply-To: <e074658e-401f-4d12-8997-4007d86b9826@linux.ibm.com>



On 18/05/26 12:55 pm, Hari Bathini wrote:
> 
> 
> On 18/05/26 3:10 am, adubey@linux.ibm.com wrote:
>> From: Abhishek Dubey <adubey@linux.ibm.com>
>>
>> Move the long branch address space to the bottom of the long
>> branch stub. This allows uninterrupted disassembly until the
>> last 8 bytes. Exclude these last bytes from the overall
>> program length to prevent failure in assembly generation.
>> Also, align dummy_tramp_addr field with 8-byte boundary.
>>
>> Following is disassembler output for test program with moved down
>> dummy_tramp_addr field:
>> .....
>> .....
>> pc:68    left:44     a6 03 08 7c  :  mtlr 0
>> pc:72    left:40     bc ff ff 4b  :  b .-68
>> pc:76    left:36     a6 02 68 7d  :  mflr 11
>> pc:80    left:32     05 00 9f 42  :  bcl 20, 31, .+4
>> pc:84    left:28     a6 02 88 7d  :  mflr 12
>> pc:88    left:24     14 00 8c e9  :  ld 12, 20(12)
>> pc:92    left:20     a6 03 89 7d  :  mtctr 12
>> pc:96    left:16     a6 03 68 7d  :  mtlr 11
>> pc:100   left:12     20 04 80 4e  :  bctr
>> pc:104   left:8      c0 34 1d 00  :
>>
>> Failure log:
>> Can't disasm instruction at offset 104: c0 34 1d 00 00 00 00 c0
>> Disassembly logic can truncate at 104, ignoring last 8 bytes.
>>
>> Update the dummy_tramp_addr field offset calculation from the end
>> of the program to reflect its new location, for bpf_arch_text_poke()
>> to update the actual trampoline's address in this field.
>>
>> All BPF trampoline selftests continue to pass with this patch applied.
>>
>> Signed-off-by: Abhishek Dubey <adubey@linux.ibm.com>
>> ---
>>   arch/powerpc/net/bpf_jit_comp.c | 34 +++++++++++++++++++--------------
>>   1 file changed, 20 insertions(+), 14 deletions(-)
>>
>> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/ 
>> bpf_jit_comp.c
>> index ef7614177cb1..b73bc9295c31 100644
>> --- a/arch/powerpc/net/bpf_jit_comp.c
>> +++ b/arch/powerpc/net/bpf_jit_comp.c
>> @@ -57,19 +57,21 @@ void bpf_jit_build_fentry_stubs(u32 *image, u32 
>> *fimage, struct codegen_context
>>        * In the final pass, align the mis-aligned dummy_tramp_addr field
>>        * in the fimage. The alignment NOP must appear before OOL stub,
>>        * to make ool_stub_idx & long_branch_stub_idx constant from end.
>> +     *
>> +     * The dummy_tramp_addr field is placed at bottom of Long branch 
>> stub.
>>        */
>>   #ifdef CONFIG_PPC64
>>       if (fimage && image) {
>>           /*
>>            * pc points to first instruction of OOL stub,
>> -         * dummy_tramp_addr is past 4/3 instructions depending on
>> +         * dummy_tramp_addr is past 11/10 instructions depending on
>>            * CONFIG_PPC_FTRACE_OUT_OF_LINE is enabled/not respectively.
>>            *
>>            * The decision to emit alignment NOP must depend on the 
>> alignment
>>            * of dummy_tramp_addr field.
>>            */
>>           unsigned long pc = (unsigned long)fimage + CTX_NIA(ctx);
> 
>> -        pc += IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 4 : 3;
>> +        pc += IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 11 : 10;
> 
> To get the address, should multiply the instruction count with 4..
> 
>      pc += (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 11 : 10) * 4;
> 
> Also, pc may not be appropriate name here. We are essentially
> calculating the pointer address of dummy_tramp_addr. `addrp` maybe?

Something like this:

+		u32 *addrp = fimage + ctx->idx;
+
+		addrp += IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 4 : 3;
+		if (!IS_ALIGNED((unsigned long)addrp, 8))
+			EMIT(PPC_RAW_NOP());

- Hari


^ permalink raw reply

* Re: [PATCH 2/8] mm/bootmem_info: drop initialization of page->lru
From: Lance Yang @ 2026-05-18  7:49 UTC (permalink / raw)
  To: david
  Cc: davem, andreas, rppt, akpm, agordeev, gerald.schaefer, hca, gor,
	borntraeger, svens, maddy, mpe, npiggin, chleroy, ljs, liam,
	vbabka, surenb, mhocko, sparclinux, linux-kernel, linux-mm,
	linux-s390, linuxppc-dev, Lance Yang
In-Reply-To: <20260511-bootmem_info_prep-v1-2-3fb0be6fc688@kernel.org>


On Mon, May 11, 2026 at 04:05:30PM +0200, David Hildenbrand (Arm) wrote:
>In the past, we used to store the type in page->lru.next, introduced by
>commit 5f24ce5fd34c ("thp: remove PG_buddy"). The location changed over
>the years; ever since commit 0386aaa6e9c8 ("bootmem: stop using
>page->index"), we store it alongside the info in page->private.
>
>Consequently, there is no need to reset page->lru anymore.
>
>Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
>---
> mm/bootmem_info.c | 1 -
> 1 file changed, 1 deletion(-)
>
>diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
>index 3d7675a3ae04..a0a1ecdec8d0 100644
>--- a/mm/bootmem_info.c
>+++ b/mm/bootmem_info.c
>@@ -34,7 +34,6 @@ void put_page_bootmem(struct page *page)
> 	if (page_ref_dec_return(page) == 1) {
> 		ClearPagePrivate(page);
> 		set_page_private(page, 0);
>-		INIT_LIST_HEAD(&page->lru);

Yep, that old INIT_LIST_HEAD() call was dead cleanup. page->lru and
page->buddy_list are in the same union:

			union {
				struct list_head lru;

				/* Or, free page */
				struct list_head buddy_list;
			};

and free_reserved_page() passes the page to the buddy allocator. The
later buddy list insertion will overwrite the values written by
INIT_LIST_HEAD(&page->lru) anyway.

> 		kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
> 		free_reserved_page(page);
> 	}

LGTM, feel free to add:
Reviewed-by: Lance Yang <lance.yang@linux.dev>


^ permalink raw reply

* Re: [PATCH v2 08/69] mm/mm_init: Defer sparse_init() until after zone initialization
From: Oscar Salvador (SUSE) @ 2026-05-18  6:32 UTC (permalink / raw)
  To: Muchun Song
  Cc: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan, Lorenzo Stoakes,
	Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel
In-Reply-To: <agqxs2-EQ06mFKun@localhost.localdomain>

On Mon, May 18, 2026 at 08:29:07AM +0200, Oscar Salvador (SUSE) wrote:
> On Wed, May 13, 2026 at 09:04:36PM +0800, Muchun Song wrote:
> >  void __init mm_core_init_early(void)
> >  {
> > +	int nid;
> > +
> >  	hugetlb_cma_reserve();
> >  	hugetlb_bootmem_alloc();
> >  
> >  	free_area_init();
> > +
> > +	sparse_init();
> > +	for_each_node_state(nid, N_MEMORY)
> > +		sparse_vmemmap_init_nid_late(nid);
> 
> Would it not make more sense to hide sparse_vmemmap_init_nid_late()
> within sparse_init() and have it called at the end of the function?
> 
> The flow would be:
> 
>  sparse_init()
>   sparse_init_nid
>    sparse_vmemmap_init_nid_early
>    ...
>   sparse_vmemmap_init_nid_late
> 
> I think it is better to have sparse stuff all together in one place instead of scattered.

Ah, I see that you later remove this stuff, so disregard this please.



-- 
Oscar Salvador
SUSE Labs


^ permalink raw reply

* Re: [PATCH v2 08/69] mm/mm_init: Defer sparse_init() until after zone initialization
From: Oscar Salvador (SUSE) @ 2026-05-18  6:32 UTC (permalink / raw)
  To: Muchun Song
  Cc: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan, Lorenzo Stoakes,
	Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel
In-Reply-To: <20260513130542.35604-9-songmuchun@bytedance.com>

On Wed, May 13, 2026 at 09:04:36PM +0800, Muchun Song wrote:
> free_area_init() is responsible for initializing pgdat and zone state.
> Calling sparse_init() from there mixes in later vmemmap and struct page
> setup, which makes the initialization flow less clear.
> 
> Defer sparse_init(), sparse_vmemmap_init_nid_late(), and memmap_init()
> until after free_area_init() completes, when zone initialization is fully
> done. This keeps free_area_init() focused on zone setup and ensures that
> sparse_init() runs with the relevant zone state already available.
> 
> This is also a prerequisite for later hugetlb vmemmap changes that need
> zone information during early sparse vmemmap setup.
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Reviewed-by: Oscar Salvador (SUSE) <osalvador@kernel.org>

> ---
> v1->v2:
> - Restore the set_pageblock_order() change suggested by Mike Rapoport
> - Add Mike Rapoport's Reviewed-by
> ---
>  mm/mm_init.c | 12 +++++++-----
>  1 file changed, 7 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 12fe21c4e26c..c14491c2dad3 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1826,7 +1826,6 @@ static void __init free_area_init(void)
>  	bool descending;
>  
>  	arch_zone_limits_init(max_zone_pfn);
> -	sparse_init();
>  
>  	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
>  	descending = arch_has_descending_max_zone_pfns();
> @@ -1915,11 +1914,7 @@ static void __init free_area_init(void)
>  		}
>  	}
>  
> -	for_each_node_state(nid, N_MEMORY)
> -		sparse_vmemmap_init_nid_late(nid);
> -
>  	calc_nr_kernel_pages();
> -	memmap_init();
>  
>  	/* disable hash distribution for systems with a single node */
>  	fixup_hashdist();
> @@ -2691,10 +2686,17 @@ void __init __weak mem_init(void)
>  
>  void __init mm_core_init_early(void)
>  {
> +	int nid;
> +
>  	hugetlb_cma_reserve();
>  	hugetlb_bootmem_alloc();
>  
>  	free_area_init();
> +
> +	sparse_init();
> +	for_each_node_state(nid, N_MEMORY)
> +		sparse_vmemmap_init_nid_late(nid);
> +	memmap_init();
>  }
>  
>  /*
> -- 
> 2.54.0
> 

-- 
Oscar Salvador
SUSE Labs


^ permalink raw reply

* Re: [PATCH v4 2/5] powerpc/bpf: Move out dummy_tramp_addr after Long branch stub
From: Hari Bathini @ 2026-05-18  7:25 UTC (permalink / raw)
  To: adubey, bpf
  Cc: linuxppc-dev, maddy, ast, andrii, daniel, shuah, linux-kselftest,
	stable
In-Reply-To: <20260517214043.12975-3-adubey@linux.ibm.com>



On 18/05/26 3:10 am, adubey@linux.ibm.com wrote:
> From: Abhishek Dubey <adubey@linux.ibm.com>
> 
> Move the long branch address space to the bottom of the long
> branch stub. This allows uninterrupted disassembly until the
> last 8 bytes. Exclude these last bytes from the overall
> program length to prevent failure in assembly generation.
> Also, align dummy_tramp_addr field with 8-byte boundary.
> 
> Following is disassembler output for test program with moved down
> dummy_tramp_addr field:
> .....
> .....
> pc:68    left:44     a6 03 08 7c  :  mtlr 0
> pc:72    left:40     bc ff ff 4b  :  b .-68
> pc:76    left:36     a6 02 68 7d  :  mflr 11
> pc:80    left:32     05 00 9f 42  :  bcl 20, 31, .+4
> pc:84    left:28     a6 02 88 7d  :  mflr 12
> pc:88    left:24     14 00 8c e9  :  ld 12, 20(12)
> pc:92    left:20     a6 03 89 7d  :  mtctr 12
> pc:96    left:16     a6 03 68 7d  :  mtlr 11
> pc:100   left:12     20 04 80 4e  :  bctr
> pc:104   left:8      c0 34 1d 00  :
> 
> Failure log:
> Can't disasm instruction at offset 104: c0 34 1d 00 00 00 00 c0
> Disassembly logic can truncate at 104, ignoring last 8 bytes.
> 
> Update the dummy_tramp_addr field offset calculation from the end
> of the program to reflect its new location, for bpf_arch_text_poke()
> to update the actual trampoline's address in this field.
> 
> All BPF trampoline selftests continue to pass with this patch applied.
> 
> Signed-off-by: Abhishek Dubey <adubey@linux.ibm.com>
> ---
>   arch/powerpc/net/bpf_jit_comp.c | 34 +++++++++++++++++++--------------
>   1 file changed, 20 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index ef7614177cb1..b73bc9295c31 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -57,19 +57,21 @@ void bpf_jit_build_fentry_stubs(u32 *image, u32 *fimage, struct codegen_context
>   	 * In the final pass, align the mis-aligned dummy_tramp_addr field
>   	 * in the fimage. The alignment NOP must appear before OOL stub,
>   	 * to make ool_stub_idx & long_branch_stub_idx constant from end.
> +	 *
> +	 * The dummy_tramp_addr field is placed at bottom of Long branch stub.
>   	 */
>   #ifdef CONFIG_PPC64
>   	if (fimage && image) {
>   		/*
>   		 * pc points to first instruction of OOL stub,
> -		 * dummy_tramp_addr is past 4/3 instructions depending on
> +		 * dummy_tramp_addr is past 11/10 instructions depending on
>   		 * CONFIG_PPC_FTRACE_OUT_OF_LINE is enabled/not respectively.
>   		 *
>   		 * The decision to emit alignment NOP must depend on the alignment
>   		 * of dummy_tramp_addr field.
>   		 */
>   		unsigned long pc = (unsigned long)fimage + CTX_NIA(ctx);

> -		pc += IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 4 : 3;
> +		pc += IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 11 : 10;

To get the address, the instruction count should be multiplied by 4:

     pc += (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 11 : 10) * 4;

Also, pc may not be an appropriate name here. We are essentially
calculating the pointer address of dummy_tramp_addr. Maybe `addrp`?

- Hari


^ permalink raw reply

* Re: [PATCH v4 1/5] powerpc/bpf: fix alignment of long branch trampoline address
From: Hari Bathini @ 2026-05-18  7:18 UTC (permalink / raw)
  To: adubey, bpf
  Cc: linuxppc-dev, maddy, ast, andrii, daniel, shuah, linux-kselftest,
	stable
In-Reply-To: <20260517214043.12975-2-adubey@linux.ibm.com>



On 18/05/26 3:10 am, adubey@linux.ibm.com wrote:
> From: Abhishek Dubey <adubey@linux.ibm.com>
> 
> Ensure the dummy trampoline address field present between the OOL stub
> and the long branch stub is 8-byte aligned, for memory compatibility
> when content loaded to a register.
> 
> Reported-by: Hari Bathini <hbathini@linux.ibm.com>
> Fixes: d243b62b7bd3 ("powerpc64/bpf: Add support for bpf trampolines")
> Cc: stable@vger.kernel.org
> Signed-off-by: Abhishek Dubey <adubey@linux.ibm.com>
> ---
>   arch/powerpc/net/bpf_jit.h        |  4 ++--
>   arch/powerpc/net/bpf_jit_comp.c   | 34 ++++++++++++++++++++++++++-----
>   arch/powerpc/net/bpf_jit_comp64.c |  4 ++--
>   3 files changed, 33 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
> index f32de8704d4d..71e6e7d01057 100644
> --- a/arch/powerpc/net/bpf_jit.h
> +++ b/arch/powerpc/net/bpf_jit.h
> @@ -214,8 +214,8 @@ int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *
>   int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct codegen_context *ctx,
>   		       u32 *addrs, int pass, bool extra_pass);
>   void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
> -void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
> -void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx);
> +void bpf_jit_build_epilogue(u32 *image, u32 *fimage, struct codegen_context *ctx);
> +void bpf_jit_build_fentry_stubs(u32 *image, u32 *fimage, struct codegen_context *ctx);
>   void bpf_jit_realloc_regs(struct codegen_context *ctx);
>   int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, long exit_addr);
>   void prepare_for_fsession_fentry(u32 *image, struct codegen_context *ctx, int cookie_cnt,
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 53ab97ad6074..ef7614177cb1 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -49,11 +49,34 @@ asm (
>   "	.popsection				;"
>   );
>   
> -void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx)
> +void bpf_jit_build_fentry_stubs(u32 *image, u32 *fimage, struct codegen_context *ctx)
>   {
>   	int ool_stub_idx, long_branch_stub_idx;
>   
>   	/*
> +	 * In the final pass, align the mis-aligned dummy_tramp_addr field
> +	 * in the fimage. The alignment NOP must appear before OOL stub,
> +	 * to make ool_stub_idx & long_branch_stub_idx constant from end.
> +	 */
> +#ifdef CONFIG_PPC64
> +	if (fimage && image) {
> +		/*
> +		 * pc points to first instruction of OOL stub,
> +		 * dummy_tramp_addr is past 4/3 instructions depending on
> +		 * CONFIG_PPC_FTRACE_OUT_OF_LINE is enabled/not respectively.
> +		 *
> +		 * The decision to emit alignment NOP must depend on the alignment
> +		 * of dummy_tramp_addr field.

This makes it easier to read instead of the XOR matrix..

> +		 */
> +		unsigned long pc = (unsigned long)fimage + CTX_NIA(ctx);

> +		pc += IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 4 : 3;

The above line should be:

pc += (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) ? 4 : 3) * 4;


> +
> +		if (!IS_ALIGNED(pc, 8))
> +			EMIT(PPC_RAW_NOP());
> +	}
> +#endif
> +
> +	/*      nop     // optional, for alignment of dummy_tramp_addr
>   	 * Out-of-line stub:
>   	 *	mflr	r0
>   	 *	[b|bl]	tramp
> @@ -70,7 +93,7 @@ void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx)
>   
>   	/*
>   	 * Long branch stub:
> -	 *	.long	<dummy_tramp_addr>
> +	 *	.long	<dummy_tramp_addr>  // 8-byte aligned
>   	 *	mflr	r11
>   	 *	bcl	20,31,$+4
>   	 *	mflr	r12
> @@ -81,6 +104,7 @@ void bpf_jit_build_fentry_stubs(u32 *image, struct codegen_context *ctx)
>   	 */
>   	if (image)
>   		*((unsigned long *)&image[ctx->idx]) = (unsigned long)dummy_tramp;
> +
>   	ctx->idx += SZL / 4;
>   	long_branch_stub_idx = ctx->idx;
>   	EMIT(PPC_RAW_MFLR(_R11));
> @@ -107,7 +131,7 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg,
>   		PPC_JMP(ctx->alt_exit_addr);
>   	} else {
>   		ctx->alt_exit_addr = ctx->idx * 4;
> -		bpf_jit_build_epilogue(image, ctx);
> +		bpf_jit_build_epilogue(image, NULL, ctx);
>   	}
>   
>   	return 0;
> @@ -286,7 +310,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
>   	 */
>   	bpf_jit_build_prologue(NULL, &cgctx);
>   	addrs[fp->len] = cgctx.idx * 4;
> -	bpf_jit_build_epilogue(NULL, &cgctx);
> +	bpf_jit_build_epilogue(NULL, NULL, &cgctx);
>   
>   	fixup_len = fp->aux->num_exentries * BPF_FIXUP_LEN * 4;
>   	extable_len = fp->aux->num_exentries * sizeof(struct exception_table_entry);
> @@ -318,7 +342,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
>   			bpf_jit_binary_pack_free(fhdr, hdr);
>   			goto out_err;
>   		}
> -		bpf_jit_build_epilogue(code_base, &cgctx);
> +		bpf_jit_build_epilogue(code_base, fcode_base, &cgctx);
>   
>   		if (bpf_jit_enable > 1)
>   			pr_info("Pass %d: shrink = %d, seen = 0x%x\n", pass,
> diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
> index db364d9083e7..885dc8cf55a2 100644
> --- a/arch/powerpc/net/bpf_jit_comp64.c
> +++ b/arch/powerpc/net/bpf_jit_comp64.c
> @@ -398,7 +398,7 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx
>   	}
>   }
>   
> -void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
> +void bpf_jit_build_epilogue(u32 *image, u32 *fimage, struct codegen_context *ctx)
>   {
>   	bpf_jit_emit_common_epilogue(image, ctx);
>   
> @@ -407,7 +407,7 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
>   
>   	EMIT(PPC_RAW_BLR());
>   
> -	bpf_jit_build_fentry_stubs(image, ctx);
> +	bpf_jit_build_fentry_stubs(image, fimage, ctx);
>   }
>   
>   /*



^ permalink raw reply

* Re: [PATCH 1/8] sparc/mm: remove register_page_bootmem_info()
From: Lance Yang @ 2026-05-18  6:55 UTC (permalink / raw)
  To: david
  Cc: davem, andreas, rppt, akpm, agordeev, gerald.schaefer, hca, gor,
	borntraeger, svens, maddy, mpe, npiggin, chleroy, ljs, liam,
	vbabka, surenb, mhocko, sparclinux, linux-kernel, linux-mm,
	linux-s390, linuxppc-dev, Lance Yang
In-Reply-To: <20260511-bootmem_info_prep-v1-1-3fb0be6fc688@kernel.org>


On Mon, May 11, 2026 at 04:05:29PM +0200, David Hildenbrand (Arm) wrote:
>sparc does not select CONFIG_HAVE_BOOTMEM_INFO_NODE, therefore,
>register_page_bootmem_info_node() is a nop.
>
>Let's just get rid of register_page_bootmem_info().
>
>Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
>---

Nice cleanup!

With CONFIG_NUMA=n, the removed helper did nothing.
With CONFIG_NUMA=y, it only looped over nodes and called the empty inline
stub.

So, feel free to add:
Reviewed-by: Lance Yang <lance.yang@linux.dev>


^ permalink raw reply

* [PATCH v2 2/2] lkdtm/powerpc: add PPC_RADIX_TLBIEL test for radix MCE validation
From: Sayali Patil @ 2026-05-18  6:56 UTC (permalink / raw)
  To: linuxppc-dev, maddy; +Cc: linux-kernel, Ritesh Harjani, Mahesh Salgaonkar
In-Reply-To: <cover.1778975974.git.sayalip@linux.ibm.com>

Add a new LKDTM trigger (PPC_RADIX_TLBIEL) that executes a process-scoped
radix TLBIEL instruction to exercise the radix MMU behaviour and
associated machine check exception (MCE) handling paths.

This provides a way to validate MCE handling in radix mode. Currently,
there is no dedicated LKDTM test that exercises this path or allows
triggering radix-specific machine check behaviour for validation.

The test is only enabled on ppc64 systems with radix MMU support. If
radix is not active, the trigger is skipped and reported as XFAIL.

Co-developed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 drivers/misc/lkdtm/Makefile             |  2 +-
 drivers/misc/lkdtm/core.c               |  2 +-
 drivers/misc/lkdtm/powerpc.c            | 44 +++++++++++++++++++++++++
 tools/testing/selftests/lkdtm/tests.txt |  1 +
 4 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile
index 03ebe33185f9..4e58d16fc01e 100644
--- a/drivers/misc/lkdtm/Makefile
+++ b/drivers/misc/lkdtm/Makefile
@@ -11,7 +11,7 @@ lkdtm-$(CONFIG_LKDTM)		+= usercopy.o
 lkdtm-$(CONFIG_LKDTM)		+= kstack_erase.o
 lkdtm-$(CONFIG_LKDTM)		+= cfi.o
 lkdtm-$(CONFIG_LKDTM)		+= fortify.o
-lkdtm-$(CONFIG_PPC_64S_HASH_MMU)	+= powerpc.o
+lkdtm-$(CONFIG_PPC_BOOK3S_64)	+= powerpc.o
 
 KASAN_SANITIZE_stackleak.o	:= n
 
diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index 5732fd59a227..ededa32d6744 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -96,7 +96,7 @@ static const struct crashtype_category *crashtype_categories[] = {
 	&stackleak_crashtypes,
 	&cfi_crashtypes,
 	&fortify_crashtypes,
-#ifdef CONFIG_PPC_64S_HASH_MMU
+#ifdef CONFIG_PPC_BOOK3S_64
 	&powerpc_crashtypes,
 #endif
 };
diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c
index ef07e5201edf..6eaac79ea26b 100644
--- a/drivers/misc/lkdtm/powerpc.c
+++ b/drivers/misc/lkdtm/powerpc.c
@@ -5,6 +5,7 @@
 #include <linux/vmalloc.h>
 #include <asm/mmu.h>
 
+#ifdef CONFIG_PPC_64S_HASH_MMU
 /* Inserts new slb entries */
 static void insert_slb_entry(unsigned long p, int ssize, int page_size)
 {
@@ -104,9 +105,35 @@ static void insert_dup_slb_entry_0(void)
 
 	preempt_enable();
 }
+#endif /* CONFIG_PPC_64S_HASH_MMU */
+
+static __always_inline void tlbiel_va(unsigned long va,
+				      unsigned long pid,
+				      unsigned long ap,
+				      unsigned long ric)
+{
+	unsigned long rb, rs, prs, r;
+
+	rb = va & ~(PPC_BITMASK(52, 63));
+	rb |= ap << PPC_BITLSHIFT(58);
+	rs = pid << PPC_BITLSHIFT(31);
+
+	prs = 1; /* process scoped */
+	r = 1;   /* radix format */
+
+	/*
+	 * Trigger an MCE by issuing radix tlbiel with an invalid operand combination.
+	 * The combination of RIC = 2 with IS = 0 (Invalidation selector specified
+	 * in the RB register) is invalid.
+	 * This invalid combination causes hardware to raise a machine check.
+	 */
+	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+			: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+}
 
 static void lkdtm_PPC_SLB_MULTIHIT(void)
 {
+#ifdef CONFIG_PPC_64S_HASH_MMU
 	if (!radix_enabled()) {
 		pr_info("Injecting SLB multihit errors\n");
 		/*
@@ -122,10 +149,27 @@ static void lkdtm_PPC_SLB_MULTIHIT(void)
 	} else {
 		pr_err("XFAIL: This test is for ppc64 and with hash mode MMU only\n");
 	}
+#else
+	pr_err("XFAIL: This test requires CONFIG_PPC_64S_HASH_MMU\n");
+#endif
+}
+
+static void lkdtm_PPC_RADIX_TLBIEL(void)
+{
+	unsigned long addr = PAGE_OFFSET;
+
+	if (radix_enabled()) {
+		pr_info("Injecting Radix TLB invalidation MCE\n");
+		tlbiel_va(addr, 0, 0, RIC_FLUSH_ALL);
+		pr_info("Recovered from radix tlbiel attempt\n");
+	} else {
+		pr_err("XFAIL: This test is for ppc64 and with radix mode MMU only\n");
+	}
 }
 
 static struct crashtype crashtypes[] = {
 	CRASHTYPE(PPC_SLB_MULTIHIT),
+	CRASHTYPE(PPC_RADIX_TLBIEL),
 };
 
 struct crashtype_category powerpc_crashtypes = {
diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt
index 3245032db34d..d8180bbe31e8 100644
--- a/tools/testing/selftests/lkdtm/tests.txt
+++ b/tools/testing/selftests/lkdtm/tests.txt
@@ -86,3 +86,4 @@ FORTIFY_STR_MEMBER detected buffer overflow
 FORTIFY_MEM_OBJECT detected buffer overflow
 FORTIFY_MEM_MEMBER detected field-spanning write
 PPC_SLB_MULTIHIT Recovered
+#PPC_RADIX_TLBIEL Triggers unrecoverable MCE
-- 
2.52.0



^ permalink raw reply related

* [PATCH v2 1/2] lkdtm/powerpc: add isync after slbmte to enforce SLB update ordering
From: Sayali Patil @ 2026-05-18  6:56 UTC (permalink / raw)
  To: linuxppc-dev, maddy; +Cc: linux-kernel, Ritesh Harjani, Mahesh Salgaonkar
In-Reply-To: <cover.1778975974.git.sayalip@linux.ibm.com>

The slbmte instruction modifies the Segment Lookaside Buffer, but without
a context synchronizing operation the CPU is not guaranteed to observe
the updated SLB state for subsequent instructions. This can result in
use of stale translation state when memory is accessed immediately after
SLB modifications.

Add isync after each slbmte in the PPC_SLB_MULTIHIT test to ensure proper
ordering of SLB updates before subsequent memory accesses.

This aligns with Power ISA context synchronization requirements for changes
in address translation state and improves the reliability of SLB multihit
injection tests in hash MMU mode.

Suggested-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Sayali Patil <sayalip@linux.ibm.com>
---
 drivers/misc/lkdtm/powerpc.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c
index be385449911a..ef07e5201edf 100644
--- a/drivers/misc/lkdtm/powerpc.c
+++ b/drivers/misc/lkdtm/powerpc.c
@@ -17,11 +17,14 @@ static void insert_slb_entry(unsigned long p, int ssize, int page_size)
 		     : "r" (mk_vsid_data(p, ssize, flags)),
 		       "r" (mk_esid_data(p, ssize, SLB_NUM_BOLTED))
 		     : "memory");
+	isync();
 
 	asm volatile("slbmte %0,%1" :
 			: "r" (mk_vsid_data(p, ssize, flags)),
 			  "r" (mk_esid_data(p, ssize, SLB_NUM_BOLTED + 1))
 			: "memory");
+	isync();
+
 	preempt_enable();
 }
 
@@ -84,6 +87,7 @@ static void insert_dup_slb_entry_0(void)
 			: "r" (vsid),
 			  "r" (esid | SLB_NUM_BOLTED)
 			: "memory");
+	isync();
 
 	asm volatile("slbmfee  %0,%1" : "=r" (esid) : "r" (i));
 	asm volatile("slbmfev  %0,%1" : "=r" (vsid) : "r" (i));
@@ -93,6 +97,7 @@ static void insert_dup_slb_entry_0(void)
 			: "r" (vsid),
 			  "r" (esid | (SLB_NUM_BOLTED + 1))
 			: "memory");
+	isync();
 
 	pr_info("%s accessing test address 0x%lx: 0x%lx\n",
 		__func__, test_address, *test_ptr);
-- 
2.52.0



^ permalink raw reply related

* [PATCH v2 0/2] LKDTM powerpc enhancements - Part2
From: Sayali Patil @ 2026-05-18  6:56 UTC (permalink / raw)
  To: linuxppc-dev, maddy; +Cc: linux-kernel, Ritesh Harjani, Mahesh Salgaonkar

Hi all,

This series adds a new LKDTM trigger PPC_RADIX_TLBIEL, to validate
machine check handling on radix MMU systems and improves reliability of
the PPC_SLB_MULTIHIT test by adding isync instructions after slbmte
operations.

Please review the patches and provide any feedback or suggestions
for improvement.

Thanks,
Sayali

---

v1->v2
 - Split the patch series into two parts.
 - Updated "lkdtm/powerpc: add PPC_RADIX_TLBIEL test for radix MCE
   validation" as per review comments:
   Wrapped Hash-MMU specific functions with #ifdef CONFIG_PPC_64S_HASH_MMU.
   Guarded powerpc_crashtypes registration with #ifdef CONFIG_PPC_BOOK3S_64
   Updated comment explaining the MCE trigger condition for radix MMU.

v1: https://lore.kernel.org/all/cover.1778057685.git.sayalip@linux.ibm.com/
---

Sayali Patil (2):
  lkdtm/powerpc: add isync after slbmte to enforce SLB update ordering
  lkdtm/powerpc: add PPC_RADIX_TLBIEL test for radix MCE validation

 drivers/misc/lkdtm/Makefile             |  2 +-
 drivers/misc/lkdtm/core.c               |  2 +-
 drivers/misc/lkdtm/powerpc.c            | 49 +++++++++++++++++++++++++
 tools/testing/selftests/lkdtm/tests.txt |  1 +
 4 files changed, 52 insertions(+), 2 deletions(-)

-- 
2.52.0



^ permalink raw reply

* Re: [PATCH v2 08/69] mm/mm_init: Defer sparse_init() until after zone initialization
From: Oscar Salvador (SUSE) @ 2026-05-18  6:29 UTC (permalink / raw)
  To: Muchun Song
  Cc: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan, Lorenzo Stoakes,
	Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel
In-Reply-To: <20260513130542.35604-9-songmuchun@bytedance.com>

On Wed, May 13, 2026 at 09:04:36PM +0800, Muchun Song wrote:
>  void __init mm_core_init_early(void)
>  {
> +	int nid;
> +
>  	hugetlb_cma_reserve();
>  	hugetlb_bootmem_alloc();
>  
>  	free_area_init();
> +
> +	sparse_init();
> +	for_each_node_state(nid, N_MEMORY)
> +		sparse_vmemmap_init_nid_late(nid);

Would it not make more sense to hide sparse_vmemmap_init_nid_late()
within sparse_init() and have it called at the end of the function?

The flow would be:

 sparse_init()
  sparse_init_nid
   sparse_vmemmap_init_nid_early
   ...
  sparse_vmemmap_init_nid_late

I think it is better to have sparse stuff all together in one place instead of scattered.

 

-- 
Oscar Salvador
SUSE Labs


^ permalink raw reply

* Re: [PATCH v7 0/4] PCI: Add support for resetting the Root Ports in a platform specific way
From: Manivannan Sadhasivam @ 2026-05-18  6:21 UTC (permalink / raw)
  To: Niklas Cassel
  Cc: manivannan.sadhasivam, Bjorn Helgaas, Mahesh J Salgaonkar,
	Oliver O'Halloran, Will Deacon, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Rob Herring, Heiko Stuebner,
	Philipp Zabel, linux-pci, linux-kernel, linuxppc-dev,
	linux-arm-kernel, linux-arm-msm, linux-rockchip, Wilfred Mallawa,
	Krishna Chaitanya Chundru, Lukas Wunner, Richard Zhu,
	Brian Norris, Wilson Ding, Frank Li
In-Reply-To: <abk3rU2EDKjkefUD@ryzen>

On Tue, Mar 17, 2026 at 12:16:47PM +0100, Niklas Cassel wrote:
> On Wed, Mar 11, 2026 at 08:44:15PM +0530, Manivannan Sadhasivam wrote:
> > On Wed, Mar 11, 2026 at 08:09:53PM +0530, Manivannan Sadhasivam wrote:
> > > On Wed, Mar 11, 2026 at 12:05:15PM +0100, Niklas Cassel wrote:
> > > > On Tue, Mar 10, 2026 at 07:31:58PM +0530, Manivannan Sadhasivam via B4 Relay wrote:
> > > > > Changes in v7:
> > > > > - Dropped Rockchip Root port reset patch due to reported issues. But the series
> > > > >   works on other platforms as tested by others.
> > > > 
> > > > Are you referring to
> > > > 
> > > > ## On EP side:
> > > > # echo 0 > /sys/kernel/config/pci_ep/controllers/a40000000.pcie-ep/start && \
> > > >   sleep 0.1 && echo 1 > /sys/kernel/config/pci_ep/controllers/a40000000.pcie-ep/start
> > > > 
> > > > Then running pcitest only having 7 / 16 tests passed ?
> > > > 
> > > > If so, isn't that a problem also for qcom?
> > > > 
> > > 
> > > No, tests are passing on my setup after link up.
> > > 
> > > > 
> > > > There is no chance that the patch:
> > > > "misc: pci_endpoint_test: Add AER error handlers"
> > > > improves things in this regard?
> > > > 
> > > > Or will it simply avoid the "AER: device recovery failed" print?
> > > > 
> > > 
> > > Yes, as mentioned in the commit message, it just avoids the AER recovery failure
> > > message.
> > > 
> > 
> > I also realized that Endpoint state is not saved in all the code paths. So the
> > pci_endpoint_test driver has to save/restore the state also. But it is still not
> > clear why that didn't help you.
> > 
> > Can you share the snapshot of the entire config space before and after reset
> > using 'lspci -xxxx -s "0000:01:00"'?
> 
> If I don't add something like:
> 
> diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
> index 1eced7a419eb..9d7ee39164d4 100644
> --- a/drivers/misc/pci_endpoint_test.c
> +++ b/drivers/misc/pci_endpoint_test.c
> @@ -1059,6 +1059,9 @@ static int pci_endpoint_test_set_irq(struct pci_endpoint_test *test,
>                 return ret;
>         }
>  
> +       pr_info("saving PCI state (irq_type: %d)\n", req_irq_type);
> +       pci_save_state(pdev);
> +
>         return 0;
>  }
>  
> @@ -1453,6 +1456,7 @@ static pci_ers_result_t pci_endpoint_test_error_detected(struct pci_dev *pdev,
>  
>  static pci_ers_result_t pci_endpoint_test_slot_reset(struct pci_dev *pdev)
>  {
> +       pci_restore_state(pdev);
>         return PCI_ERS_RESULT_RECOVERED;
>  }
> 
> On top of your patch.
> 
> Then all the BAR tests + MSI and MSI-X tests fail.
> 
> There is a huge difference in lspci -vvv output (as I guess is expected),
> including all BARs being marked as disabled.
> 
> 
> With the patch above. There is zero difference before/after reset, and all
> the BAR tests pass. However, MSI/MSI-X tests still fail with:
> 
> # pci_endpoint_test.c:143:MSI_TEST:Expected 0 (0) == ret (-110) 
> # pci_endpoint_test.c:143:MSI_TEST:Test failed for MSI1
> 
> ETIMEDOUT.
> 
> This suggests that pci_endpoint_test on the host side did not receive an
> interrupt.
> 
> I don't know why, but considering that lspci output is now (with the
> save+restore) identical, I assume that the problem is not related to
> the host. Unless somehow the host will use a new/different MSI address
> after the root port has been reset, and we restore the old MSI address,
> but looking at the code, dw_pcie_msi_init() is called by
> dw_pcie_setup_rc(), so I would expect the MSI address to be the same.
> 

Hi Niklas,

When I rebased this series on top of v7.1-rc1, I ended up seeing the issue that
you described here (not sure why I didn't see it earlier). So after the Root
Port reset, MSI tests fail, but BAR tests succeed. Also, I got IOMMU faults on
the host after the endpoint triggers MSI.

I investigated it and found that the MSI iATU mapping gets cleared in hw after
LDn happens. But the host continues to use the same address/size for the
endpoint MSI even after reset. Due to this, the existing checks in
dw_pcie_ep_raise_msi_irq() don't pass and the stale MSI iATU mapping gets
reused.

The fix would be to clear the mapping in dw_pcie_ep_cleanup(), which gets called
as part of the PERST# assert/deassert sequence post LDn, and also to set the
msi_iatu_mapped flag to 'false'. This will force dw_pcie_ep_raise_msi_irq() to
use a fresh iATU mapping when it gets called for the first time:

diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index d4dc3b24da60..4ae0e1b55f39 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -1035,6 +1035,11 @@ void dw_pcie_ep_cleanup(struct dw_pcie_ep *ep)
 {
        struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
+       if (ep->msi_iatu_mapped) {
+               dw_pcie_ep_unmap_addr(ep->epc, 0, 0, ep->msi_mem_phys);
+               ep->msi_iatu_mapped = false;
+       }
+
        dwc_pcie_debugfs_deinit(pci);
        dw_pcie_edma_remove(pci);
 }

With this change, MSI works after Root Port reset without any issues on our Qcom
endpoint/host setup.

Please test this change on your rockchip setup as well. You have to make sure
that dw_pcie_ep_cleanup() is called during PERST# assert/deassert.

I'm going to respin the series with this fix. If you confirm it works for you,
then we can merge your Rockchip Root Port change.

- Mani

-- 
மணிவண்ணன் சதாசிவம்


^ permalink raw reply related

* Re: [PATCH 1/3] powerpc/perf: fix preempt count underflow in fsl_emb_pmu_del
From: Shrikanth Hegde @ 2026-05-18  6:13 UTC (permalink / raw)
  To: Aboorva Devarajan, Madhavan Srinivasan, linuxppc-dev
  Cc: Athira Rajeev, Christophe Leroy, linux-kernel, Sourabh Jain,
	Ritesh Harjani
In-Reply-To: <20260518050855.1147242-2-aboorvad@linux.ibm.com>



On 5/18/26 10:38 AM, Aboorva Devarajan wrote:
> fsl_emb_pmu_del() unconditionally calls put_cpu_var(cpu_hw_events) at
> the 'out:' label, but only calls the matching get_cpu_var() after the
> 'i < 0' early-return check. When event->hw.idx is negative the
> function jumps to 'out:' without having taken get_cpu_var(), and the
> trailing put_cpu_var() then issues an unmatched preempt_enable(),
> underflowing preempt_count.
> 
> On a CONFIG_PREEMPT=y kernel preempt_count would underflow and
> eventually present as a 'scheduling while atomic' BUG.
> 
> Move put_cpu_var() to pair with get_cpu_var() so the percpu access is
> correctly bracketed and the 'out:' label only handles perf_pmu_enable.
> 
> Fixes: a11106544f33c ("powerpc/perf: e500 support")
> Signed-off-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> ---
>   arch/powerpc/perf/core-fsl-emb.c | 3 ++-
>   1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c
> index 7120ab20cbfec..02b5dd74c187a 100644
> --- a/arch/powerpc/perf/core-fsl-emb.c
> +++ b/arch/powerpc/perf/core-fsl-emb.c
> @@ -366,9 +366,10 @@ static void fsl_emb_pmu_del(struct perf_event *event, int flags)
>   
>   	cpuhw->n_events--;
>   
> +	put_cpu_var(cpu_hw_events);
> +
>    out:
>   	perf_pmu_enable(event->pmu);
> -	put_cpu_var(cpu_hw_events);
>   }
>   
>   static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags)

Thanks for fixing this. Looks good to me.

Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>


^ permalink raw reply

* Re: [PATCH 3/3] powerpc/kexec: fix double get_cpu() imbalance in kexec_prepare_cpus
From: Shrikanth Hegde @ 2026-05-18  6:02 UTC (permalink / raw)
  To: Aboorva Devarajan, Madhavan Srinivasan, linuxppc-dev
  Cc: Athira Rajeev, Christophe Leroy, linux-kernel, Sourabh Jain,
	Ritesh Harjani
In-Reply-To: <20260518050855.1147242-4-aboorvad@linux.ibm.com>

Hi Aboorva.

On 5/18/26 10:38 AM, Aboorva Devarajan wrote:
> kexec_prepare_cpus_wait() calls get_cpu() internally to obtain the
> current CPU id. kexec_prepare_cpus() calls kexec_prepare_cpus_wait()
> twice -- once for KEXEC_STATE_IRQS_OFF and once for
> KEXEC_STATE_REAL_MODE -- but only issues a single put_cpu() at the end,
> leaving preempt_count elevated by one extra nesting level.
> 
> In practice the imbalance does not trigger a 'scheduling while atomic'
> splat because the kexec path is a one-way trip: IRQs are already
> disabled, no schedule() occurs after the leak, and
> default_machine_kexec() overwrites preempt_count with HARDIRQ_OFFSET
> before jumping into kexec_sequence() which never returns. However the
> bookkeeping is still wrong.
> 
> Lift the get_cpu()/put_cpu() pair into kexec_prepare_cpus() so it is
> called exactly once, and pass the CPU id to kexec_prepare_cpus_wait()
> as a parameter. This keeps preempt_count correctly balanced.
> 
> Fixes: 1fc711f7ffb01 ("powerpc/kexec: Fix race in kexec shutdown")
> Signed-off-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> ---
>   arch/powerpc/kexec/core_64.c | 15 ++++++++-------
>   1 file changed, 8 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
> index 825ab8a88f18e..9d7e5a1e6e5b8 100644
> --- a/arch/powerpc/kexec/core_64.c
> +++ b/arch/powerpc/kexec/core_64.c
> @@ -164,12 +164,11 @@ static void kexec_smp_down(void *arg)
>   	/* NOTREACHED */
>   }
>   
> -static void kexec_prepare_cpus_wait(int wait_state)
> +static void kexec_prepare_cpus_wait(int wait_state, int my_cpu)
>   {
> -	int my_cpu, i, notified=-1;
> +	int i, notified = -1;
>   
>   	hw_breakpoint_disable();
> -	my_cpu = get_cpu();
>   	/* Make sure each CPU has at least made it to the state we need.
>   	 *
>   	 * FIXME: There is a (slim) chance of a problem if not all of the CPUs
> @@ -246,6 +245,8 @@ static void wake_offline_cpus(void)
>   
>   static void kexec_prepare_cpus(void)
>   {
> +	int my_cpu;
> +
>   	wake_offline_cpus();
>   	smp_call_function(kexec_smp_down, NULL, /* wait */0);
>   	local_irq_disable();
> @@ -254,7 +255,8 @@ static void kexec_prepare_cpus(void)
>   	mb(); /* make sure IRQs are disabled before we say they are */
>   	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
>   
> -	kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF);
> +	my_cpu = get_cpu();

raw_smp_processor_id() is better here. All it needs is to get the current CPU, right?
The caller does local_irq_disable() above, which makes the get_cpu() call unnecessary.


> +	kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF, my_cpu);
>   	/* we are sure every CPU has IRQs off at this point */
>   	kexec_all_irq_disabled = 1;
>   
> @@ -262,13 +264,12 @@ static void kexec_prepare_cpus(void)
>   	 * Before removing MMU mappings make sure all CPUs have entered real
>   	 * mode:
>   	 */
> -	kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE);
> +	kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE, my_cpu);
> +	put_cpu();
>   
>   	/* after we tell the others to go down */
>   	if (ppc_md.kexec_cpu_down)
>   		ppc_md.kexec_cpu_down(0, 0);
> -
> -	put_cpu();
>   }
>   
>   #else /* ! SMP */



^ permalink raw reply

* Re: [PATCH] powerpc/warp: Fix error handling in pika_dtm_thread
From: Madhavan Srinivasan @ 2026-05-18  5:43 UTC (permalink / raw)
  To: mpe, npiggin, benh, smaclennan, Christophe Leroy, Ma Ke
  Cc: linuxppc-dev, linux-kernel, akpm, stable
In-Reply-To: <20251116024411.21968-1-make24@iscas.ac.cn>

On Sun, 16 Nov 2025 10:44:11 +0800, Ma Ke wrote:
> pika_dtm_thread() acquires client through of_find_i2c_device_by_node()
> but fails to release it in error handling path. This could result in a
> reference count leak, preventing proper cleanup and potentially
> leading to resource exhaustion. Add put_device() to release the
> reference in the error handling path.
> 
> Found by code review.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/warp: Fix error handling in pika_dtm_thread
      https://git.kernel.org/powerpc/c/108d7f951271cbd36ca36efc5e5d106966f5180c

cheers


^ permalink raw reply

* Re: [PATCH v2] powerpc/time: Remove redundant preempt_disable|enable() calls from arch_irq_work_raise() - part1
From: Madhavan Srinivasan @ 2026-05-18  5:43 UTC (permalink / raw)
  To: linuxppc-dev, Sayali Patil
  Cc: linux-kernel, Ritesh Harjani, Mahesh Salgaonkar, sshegde, chleroy
In-Reply-To: <20260513081413.222490-1-sayalip@linux.ibm.com>

On Wed, 13 May 2026 13:44:13 +0530, Sayali Patil wrote:
> A kernel panic is observed when handling machine check exceptions from
> real mode.
> 
>   BUG: Unable to handle kernel data access on read at 0xc00000006be21300
>   Oops: Kernel access of bad area, sig: 11 [#1]
>   MSR:  8000000000001003 <SF,ME,RI,LE>  CR: 88222248  XER: 00000005
>   CFAR: c00000000003ffc4 DAR: c00000006be21300 DSISR: 40000000 IRQMASK: 0
>   NIP [c000000000029e40] arch_irq_work_raise+0x10/0x70
>   LR [c00000000003ffc8] machine_check_queue_event+0xa8/0x150
>   Call Trace:
>   [c0000000179d3c70] [c00000000003ff64] machine_check_queue_event+0x44/0x150
>   [c0000000179d3d30] [c0000000000084e0] machine_check_early_common+0x1f0/0x2c0
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/time: Remove redundant preempt_disable|enable() calls from arch_irq_work_raise() - part1
      https://git.kernel.org/powerpc/c/31467b23823ffec1f6fff407f8e3ca9af8b7491a

cheers


^ permalink raw reply

* Re: [PATCH v2] powerpc/powermac: Remove pmac_low_i2c_{lock,unlock}()
From: Madhavan Srinivasan @ 2026-05-18  5:43 UTC (permalink / raw)
  To: Michael Ellerman, Bart Van Assche
  Cc: Nicholas Piggin, Christophe Leroy, linuxppc-dev, Thomas Gleixner,
	Ingo Molnar, Kees Cook
In-Reply-To: <20260316174747.3871924-1-bvanassche@acm.org>

On Mon, 16 Mar 2026 10:47:42 -0700, Bart Van Assche wrote:
> Commit a28d3af2a26c ("[PATCH] 2/5 powerpc: Rework PowerMac i2c part 2")
> removed the last calls to the pmac_low_i2c_{lock,unlock}() functions.
> Hence, remove these two functions.
> 
> 

Applied to powerpc/fixes.

[1/1] powerpc/powermac: Remove pmac_low_i2c_{lock,unlock}()
      https://git.kernel.org/powerpc/c/f98020c22ad9be07d6feefd0496e70f9f36a2f63

cheers


^ permalink raw reply

* Re: [PATCH] powerpc: fix dead default for GUEST_STATE_BUFFER_TEST
From: Madhavan Srinivasan @ 2026-05-18  5:43 UTC (permalink / raw)
  To: mpe, npiggin, chleroy, jniethe5, Julian Braha; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20260405161545.161006-1-julianbraha@gmail.com>

On Sun, 05 Apr 2026 17:15:45 +0100, Julian Braha wrote:
> The GUEST_STATE_BUFFER_TEST config option should default
> to KUNIT_ALL_TESTS so that if all tests are enabled then
> it is included, but currently the 'default KUNIT_ALL_TESTS'
> statement is shadowed by 'def_tristate n',
> meaning that this second default statement is currently dead code.
> 
> It looks to me like the commit
> 6ccbbc33f06a ("KVM: PPC: Add helper library for Guest State Buffers")
> intended to set the default to KUNIT_ALL_TESTS, but mistakenly
> missed the def_tristate.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc: fix dead default for GUEST_STATE_BUFFER_TEST
      https://git.kernel.org/powerpc/c/aef656a0e6c01796190bb5bd2bdba1c644ed7811

cheers


^ permalink raw reply

* Re: [PATCH v3] powerpc/g5: Enable all windfarms by default
From: Madhavan Srinivasan @ 2026-05-18  5:43 UTC (permalink / raw)
  To: Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Linus Walleij
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20260505-powermac-g5-config-v3-1-7747bf72f874@kernel.org>

On Tue, 05 May 2026 20:47:56 +0200, Linus Walleij wrote:
> The G5 defconfig is clearly intended for the G5 Powermac
> series, and that should enable all the available
> windfarm drivers, or the machine will overheat a short
> while after booting and shut itself down, which is
> annoying.
> 
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/g5: Enable all windfarms by default
      https://git.kernel.org/powerpc/c/8d57bb61734b23f6342e9de781173f1d83f90d3a

cheers


^ permalink raw reply

* Re: [PATCH v2] powerpc: 82xx: fix uninitialized pointers with free attribute
From: Madhavan Srinivasan @ 2026-05-18  5:43 UTC (permalink / raw)
  To: Michael Ellerman, Nicholas Piggin, Krzysztof Kozlowski,
	Christophe Leroy, Ally Heev
  Cc: linuxppc-dev, linux-kernel, Dan Carpenter
In-Reply-To: <20251116-aheev-uninitialized-free-attr-km82xx-v2-1-4307e2b5300d@gmail.com>

On Sun, 16 Nov 2025 19:55:44 +0530, Ally Heev wrote:
> Uninitialized pointers with `__free` attribute can cause undefined
> behavior as the memory allocated to the pointer is freed automatically
> when the pointer goes out of scope.
> 
> powerpc/km82xx doesn't have any bugs related to this as of now, but,
> it is better to initialize and assign pointers with `__free` attribute
> in one statement to ensure proper scope-based cleanup
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc: 82xx: fix uninitialized pointers with free attribute
      https://git.kernel.org/powerpc/c/acd1e47db03d4b528fd5efb8565dd0de1c79f62a

cheers


^ permalink raw reply

* Re: [PATCH] powerpc/hv-gpci: fix preempt count leak in sysfs show paths
From: Madhavan Srinivasan @ 2026-05-18  5:43 UTC (permalink / raw)
  To: Athira Rajeev, linuxppc-dev, Aboorva Devarajan
  Cc: Christophe Leroy, Kajol Jain, linux-kernel
In-Reply-To: <20260508041256.3447113-1-aboorvad@linux.ibm.com>

On Fri, 08 May 2026 09:42:56 +0530, Aboorva Devarajan wrote:
> Four sysfs show() callbacks in hv-gpci take get_cpu_var(hv_gpci_reqb)
> (which calls preempt_disable()) but only call the matching put_cpu_var()
> on the error path under the 'out:' label. Every successful read leaks
> one preempt_disable():
> 
>   processor_bus_topology_show()
>   processor_config_show()
>   affinity_domain_via_virtual_processor_show()
>   affinity_domain_via_domain_show()
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/hv-gpci: fix preempt count leak in sysfs show paths
      https://git.kernel.org/powerpc/c/dbc30a57bd8e026995e9fa8e8c31cffd18542c01

cheers


^ permalink raw reply

* [PATCH 18/18] raid6_kunit: randomize buffer alignment
From: Christoph Hellwig @ 2026-05-18  5:18 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Catalin Marinas, Will Deacon, Ard Biesheuvel, Huacai Chen,
	WANG Xuerui, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Paul Walmsley,
	Palmer Dabbelt, Albert Ou, Alexandre Ghiti, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, Herbert Xu, Dan Williams,
	Chris Mason, David Sterba, Arnd Bergmann, Song Liu, Yu Kuai,
	Li Nan, linux-kernel, linux-arm-kernel, loongarch, linuxppc-dev,
	linux-riscv, linux-s390, linux-crypto, linux-btrfs, linux-arch,
	linux-raid
In-Reply-To: <20260518051804.462141-1-hch@lst.de>

Add code to add random alignment to the buffers to test the case where
they are not page aligned, and to move the buffers to the end of the
allocation so that they are next to the vmalloc guard page.

This does not include the recovery buffers as the recovery requires
page alignment.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Ard Biesheuvel <ardb@kernel.org> # kunit only on arm64
---
 lib/raid/raid6/tests/raid6_kunit.c | 41 +++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/lib/raid/raid6/tests/raid6_kunit.c b/lib/raid/raid6/tests/raid6_kunit.c
index 71adf8932e93..9f3e671a1224 100644
--- a/lib/raid/raid6/tests/raid6_kunit.c
+++ b/lib/raid/raid6/tests/raid6_kunit.c
@@ -21,6 +21,7 @@ MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
 
 static struct rnd_state rng;
 static void *test_buffers[RAID6_KUNIT_MAX_BUFFERS];
+static void *aligned_buffers[RAID6_KUNIT_MAX_BUFFERS];
 static void *test_recov_buffers[RAID6_KUNIT_MAX_FAILURES];
 static size_t test_buflen;
 
@@ -50,6 +51,14 @@ static unsigned int random_nr_buffers(void)
 			RAID6_MIN_DISKS;
 }
 
+/* Generate a random alignment that is a multiple of 64. */
+static unsigned int random_alignment(unsigned int max_alignment)
+{
+	if (max_alignment == 0)
+		return 0;
+	return (rand32() % (max_alignment + 1)) & ~63;
+}
+
 static void makedata(int start, int stop)
 {
 	int i;
@@ -80,7 +89,7 @@ static void test_recover_one(struct kunit *test, unsigned int nr_buffers,
 	for (i = 0; i < RAID6_KUNIT_MAX_FAILURES; i++)
 		memset(test_recov_buffers[i], 0xf0, test_buflen);
 
-	memcpy(dataptrs, test_buffers, sizeof(dataptrs));
+	memcpy(dataptrs, aligned_buffers, sizeof(dataptrs));
 	dataptrs[faila] = test_recov_buffers[0];
 	dataptrs[failb] = test_recov_buffers[1];
 
@@ -102,13 +111,13 @@ static void test_recover_one(struct kunit *test, unsigned int nr_buffers,
 		ta->recov->data2(nr_buffers, len, faila, failb, dataptrs);
 	}
 
-	KUNIT_EXPECT_MEMEQ_MSG(test, test_buffers[faila], test_recov_buffers[0],
+	KUNIT_EXPECT_MEMEQ_MSG(test, aligned_buffers[faila], dataptrs[faila],
 			len,
 			"faila miscompared: %3d[%c] buffers %u len %u (failb=%3d[%c])\n",
 			faila, member_type(nr_buffers, faila),
 			nr_buffers, len,
 			failb, member_type(nr_buffers, failb));
-	KUNIT_EXPECT_MEMEQ_MSG(test, test_buffers[failb], test_recov_buffers[1],
+	KUNIT_EXPECT_MEMEQ_MSG(test, aligned_buffers[failb], dataptrs[failb],
 			len,
 			"failb miscompared: %3d[%c] buffers %u len %u (faila=%3d[%c])\n",
 			failb, member_type(nr_buffers, failb),
@@ -152,9 +161,9 @@ static void test_rmw_one(struct kunit *test, unsigned int nr_buffers,
 {
 	const struct test_args *ta = test->param_value;
 
-	ta->gen->xor_syndrome(nr_buffers, p1, p2, len, test_buffers);
+	ta->gen->xor_syndrome(nr_buffers, p1, p2, len, aligned_buffers);
 	makedata(p1, p2);
-	ta->gen->xor_syndrome(nr_buffers, p1, p2, len, test_buffers);
+	ta->gen->xor_syndrome(nr_buffers, p1, p2, len, aligned_buffers);
 	test_recover(test, nr_buffers, len);
 }
 
@@ -178,13 +187,33 @@ static void raid6_test_one(struct kunit *test)
 	const struct test_args *ta = test->param_value;
 	unsigned int nr_buffers = random_nr_buffers();
 	unsigned int len = random_length(RAID6_KUNIT_MAX_BYTES);
+	unsigned int max_alignment;
+	int i;
 
 	/* Nuke syndromes */
 	memset(test_buffers[nr_buffers - 2], 0xee, test_buflen);
 	memset(test_buffers[nr_buffers - 1], 0xee, test_buflen);
 
+	/*
+	 * If we're not using the entire buffer size, inject randomized
+	 * alignment into the buffer.
+	 */
+	max_alignment = RAID6_KUNIT_MAX_BYTES - len;
+	if (rand32() % 2 == 0) {
+		/* Use random alignments mod 64 */
+		for (i = 0; i < nr_buffers; i++)
+			aligned_buffers[i] = test_buffers[i] +
+				random_alignment(max_alignment);
+	} else {
+		/* Go up to the guard page, to catch buffer overreads */
+		unsigned int align = test_buflen - len;
+
+		for (i = 0; i < nr_buffers; i++)
+			aligned_buffers[i] = test_buffers[i] + align;
+	}
+
 	/* Generate assumed good syndrome */
-	ta->gen->gen_syndrome(nr_buffers, len, test_buffers);
+	ta->gen->gen_syndrome(nr_buffers, len, aligned_buffers);
 
 	test_recover(test, nr_buffers, len);
 
-- 
2.53.0



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox