LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v6 02/15] arm64: mm: Drop redundant pgd_t* argument from map_mem()
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh,
	Kevin Brodsky
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

__map_memblock() and map_mem() always operate on swapper_pg_dir, so
there is no need to pass around a pgd_t pointer between them.

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 25 ++++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 112fa4a3b0eb..aa0e2c6435f7 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1035,11 +1035,11 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 	flush_tlb_kernel_range(virt, virt + size);
 }
 
-static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
-				  phys_addr_t end, pgprot_t prot, int flags)
+static void __init __map_memblock(phys_addr_t start, phys_addr_t end,
+				  pgprot_t prot, int flags)
 {
-	early_create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
-				 prot, early_pgtable_alloc, flags);
+	early_create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
+				 end - start, prot, early_pgtable_alloc, flags);
 }
 
 void __init mark_linear_text_alias_ro(void)
@@ -1087,13 +1087,13 @@ static phys_addr_t __init arm64_kfence_alloc_pool(void)
 	return kfence_pool;
 }
 
-static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
+static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool)
 {
 	if (!kfence_pool)
 		return;
 
 	/* KFENCE pool needs page-level mapping. */
-	__map_memblock(pgdp, kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
+	__map_memblock(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
 			pgprot_tagged(PAGE_KERNEL),
 			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
 	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
@@ -1129,11 +1129,11 @@ bool arch_kfence_init_pool(void)
 #else /* CONFIG_KFENCE */
 
 static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
-static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { }
+static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool) { }
 
 #endif /* CONFIG_KFENCE */
 
-static void __init map_mem(pgd_t *pgdp)
+static void __init map_mem(void)
 {
 	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
 	phys_addr_t kernel_start = __pa_symbol(_text);
@@ -1178,7 +1178,7 @@ static void __init map_mem(pgd_t *pgdp)
 		 * if MTE is present. Otherwise, it has the same attributes as
 		 * PAGE_KERNEL.
 		 */
-		__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
+		__map_memblock(start, end, pgprot_tagged(PAGE_KERNEL),
 			       flags);
 	}
 
@@ -1192,10 +1192,9 @@ static void __init map_mem(pgd_t *pgdp)
 	 * Note that contiguous mappings cannot be remapped in this way,
 	 * so we should avoid them here.
 	 */
-	__map_memblock(pgdp, kernel_start, kernel_end,
-		       PAGE_KERNEL, NO_CONT_MAPPINGS);
+	__map_memblock(kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS);
 	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
-	arm64_kfence_map_pool(early_kfence_pool, pgdp);
+	arm64_kfence_map_pool(early_kfence_pool);
 }
 
 void mark_rodata_ro(void)
@@ -1417,7 +1416,7 @@ static void __init create_idmap(void)
 
 void __init paging_init(void)
 {
-	map_mem(swapper_pg_dir);
+	map_mem();
 
 	memblock_allow_resize();
 
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH mm-unstable RFC v4 7/7] mm: add PMD-level PFNMAP support for remap_pfn_range()
From: Yin Tirui @ 2026-05-26 14:50 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Juergen Gross, Jonathan Cameron, Will Deacon
  Cc: Catalin Marinas, Peter Xu, Luiz Capitulino, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H . Peter Anvin,
	Andy Lutomirski, Peter Zijlstra, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Liam R . Howlett, Zi Yan, Baolin Wang, Nico Pache, Ryan Roberts,
	Dev Jain, Barry Song, Lance Yang, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Anshuman Khandual, Rohan McLure,
	Kevin Brodsky, Alistair Popple, Andrew Donnellan, Pasha Tatashin,
	Baoquan He, Thomas Huth, Coiby Xu, Dan Williams, Yu-cheng Yu,
	Lu Baolu, Conor Dooley, Rik van Riel, wangkefeng.wang, chenjun102,
	yintirui, linux-mm, linux-kernel, x86, linux-arm-kernel,
	linuxppc-dev, linux-pm
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

Teach remap_pfn_range() to install PMD-sized PFNMAP entries when the
virtual range and PFN are PMD-aligned, the architecture exposes PMD
PFNMAP support, and PMD leaves are available at runtime. The path only
runs on VMAs without ->fault or ->huge_fault, so the resulting PMDs
are known to be non-refaultable.

Non-refaultable PFNMAP PMDs cannot be rebuilt on demand and are
therefore installed with a deposited pgtable.
vma_pfnmap_has_deposited_pgtable() becomes the common predicate that,
through has_deposited_pgtable(), drives the deposit logic in
copy_huge_pmd(), zap_huge_pmd() and the new __split_huge_pfnmap_pmd().

The split path withdraws the pgtable and populates it with special PTEs
derived from the original PMD using pmd_pfn() and pmd_pgprot(). With
pmd_pgprot() returning PTE-level pgprot_t, this preserves protection and
cache attributes without reintroducing pte_clrhuge().

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
 mm/huge_memory.c | 60 ++++++++++++++++++++++++++++-----
 mm/internal.h    | 21 ++++++++++++
 mm/memory.c      | 87 +++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 148 insertions(+), 20 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index be9b637c813b..19e6d856e8bf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1879,6 +1879,8 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return false;
 }
 
+static bool has_deposited_pgtable(struct vm_area_struct *vma, pmd_t pmdval,
+		struct folio *folio);
 static int copy_present_huge_pmd(
 		struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
@@ -1912,8 +1914,12 @@ static int copy_present_huge_pmd(
 		 * able to wrongly write to the backend MMIO.
 		 */
 		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
-		pte_free(dst_mm, pgtable);
-		pgtable = NULL;
+
+		if (!has_deposited_pgtable(dst_vma, pmd, NULL)) {
+			pte_free(dst_mm, pgtable);
+			pgtable = NULL;
+		}
+
 		wrprotect = false;
 		goto set_pmd;
 	}
@@ -2495,11 +2501,19 @@ static bool has_deposited_pgtable(struct vm_area_struct *vma, pmd_t pmdval,
 	if (is_huge_zero_pmd(pmdval))
 		return !vma_is_dax(vma);
 
+	/*
+	 * PMD-sized PFNMAP mappings installed without fault handlers cannot be
+	 * refaulted after the PMD is cleared, so they carry a deposited page
+	 * table for later partial unmap/mprotect.
+	 */
+	if (!folio)
+		return pmd_present(pmdval) && vma_pfnmap_has_deposited_pgtable(vma);
+
 	/*
 	 * Otherwise, only anonymous folios are deposited, see
 	 * __do_huge_pmd_anonymous_page().
 	 */
-	return folio && folio_test_anon(folio);
+	return folio_test_anon(folio);
 }
 
 /**
@@ -3118,6 +3132,32 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmd_populate(mm, pmd, pgtable);
 }
 
+static void __split_huge_pfnmap_pmd(struct vm_area_struct *vma,
+		unsigned long haddr, pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgtable_t pgtable;
+	pmd_t old_pmd, _pmd;
+	pte_t *pte, entry;
+
+	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
+	if (!has_deposited_pgtable(vma, old_pmd, NULL))
+		return;
+
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	pte = pte_offset_map(&_pmd, haddr);
+	VM_BUG_ON(!pte);
+
+	entry = pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd));
+	set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
+	pte_unmap(pte);
+
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+}
+
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long haddr, bool freeze)
 {
@@ -3157,11 +3200,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 				return __split_huge_zero_page_pmd(vma, haddr, pmd);
 			}
 
-			/* Present but not a normal folio: drop the PMD. */
-			old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
-			if (arch_needs_pgtable_deposit())
-				zap_deposited_table(mm, pmd);
-			return;
+			/*
+			 * Present PMDs without a normal folio are special mappings. Huge zero PMDs
+			 * are handled above; the remaining PMD-level special mappings are PFNMAP
+			 * mappings.
+			 */
+			return __split_huge_pfnmap_pmd(vma, haddr, pmd);
 		}
 
 		if (unlikely(!folio_test_anon(folio))) {
diff --git a/mm/internal.h b/mm/internal.h
index 5a2ddcf68e0b..f82bd987131d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -198,6 +198,27 @@ static inline void vma_close(struct vm_area_struct *vma)
 	}
 }
 
+static inline bool vma_has_fault_handler(const struct vm_area_struct *vma)
+{
+	const struct vm_operations_struct *vm_ops = vma->vm_ops;
+
+	return vm_ops && (vm_ops->fault || vm_ops->huge_fault);
+}
+
+/*
+ * PMD-sized PFNMAP mappings installed without fault handlers cannot be
+ * recreated after the PMD is cleared. Such mappings need a deposited page
+ * table so they can be split into PTEs for partial unmap/mprotect.
+ *
+ * Faultable PFNMAP VMAs can drop the PMD and refault it later, so they do
+ * not need a deposited page table.
+ */
+static inline bool
+vma_pfnmap_has_deposited_pgtable(const struct vm_area_struct *vma)
+{
+	return vma_test(vma, VMA_PFNMAP_BIT) && !vma_has_fault_handler(vma);
+}
+
 /* unmap_vmas is in mm/memory.c */
 void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
 
diff --git a/mm/memory.c b/mm/memory.c
index 56886d1ddaf3..226e3a53a48e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2943,9 +2943,66 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	return err;
 }
 
-static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
-			unsigned long addr, unsigned long end,
-			unsigned long pfn, pgprot_t prot)
+static int remap_try_install_pmd_leaf(struct mm_struct *mm,
+		pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr,
+		unsigned long end, unsigned long pfn, pgprot_t prot)
+{
+	pgtable_t pgtable;
+	spinlock_t *ptl;
+	unsigned long i;
+	pmd_t entry;
+
+	if (!pgtable_level_has_pxx_special(PGTABLE_LEVEL_PMD))
+		return 0;
+
+	if (!pgtable_has_pmd_leaves())
+		return 0;
+
+	/*
+	 * Do not install PMD leaves through remap_pfn_range() for VMAs that have
+	 * a fault handler. With this restriction, a PFNMAP PMD in a VMA without
+	 * a fault handler is known to have been installed by remap_pfn_range()
+	 * and to have a deposited page table for later split; see
+	 * vma_pfnmap_has_deposited_pgtable().
+	 */
+	if (vma_has_fault_handler(vma))
+		return 0;
+
+	if (!IS_ALIGNED(addr | end, PMD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(PFN_PHYS(pfn), PMD_SIZE))
+		return 0;
+
+	for (i = 0; i < PFN_DOWN(PMD_SIZE); i++) {
+		if (!pfn_modify_allowed(pfn + i, prot))
+			return -EACCES;
+	}
+
+	pgtable = pte_alloc_one(mm);
+	if (unlikely(!pgtable))
+		return 0;
+
+	ptl = pmd_lock(mm, pmd);
+	if (!pmd_none(*pmd)) {
+		spin_unlock(ptl);
+		pte_free(mm, pgtable);
+		return 0;
+	}
+
+	entry = pfn_pmd(pfn, prot);
+	entry = pmd_mkspecial(entry);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	mm_inc_nr_ptes(mm);
+	set_pmd_at(mm, addr, pmd, entry);
+	spin_unlock(ptl);
+
+	return 1;
+}
+
+static inline int remap_pmd_range(struct mm_struct *mm,
+		struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
+		unsigned long end, unsigned long pfn, pgprot_t prot)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -2958,6 +3015,12 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	VM_BUG_ON(pmd_trans_huge(*pmd));
 	do {
 		next = pmd_addr_end(addr, end);
+		err = remap_try_install_pmd_leaf(mm, pmd, vma, addr, next,
+				pfn + (addr >> PAGE_SHIFT), prot);
+		if (err < 0)
+			return err;
+		if (err > 0)
+			continue;
 		err = remap_pte_range(mm, pmd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
@@ -2966,9 +3029,9 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	return 0;
 }
 
-static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
-			unsigned long addr, unsigned long end,
-			unsigned long pfn, pgprot_t prot)
+static inline int remap_pud_range(struct mm_struct *mm,
+		struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
+		unsigned long end, unsigned long pfn, pgprot_t prot)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -2980,7 +3043,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
-		err = remap_pmd_range(mm, pud, addr, next,
+		err = remap_pmd_range(mm, vma, pud, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
 			return err;
@@ -2988,9 +3051,9 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
 	return 0;
 }
 
-static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
-			unsigned long addr, unsigned long end,
-			unsigned long pfn, pgprot_t prot)
+static inline int remap_p4d_range(struct mm_struct *mm,
+		struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
+		unsigned long end, unsigned long pfn, pgprot_t prot)
 {
 	p4d_t *p4d;
 	unsigned long next;
@@ -3002,7 +3065,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 		return -ENOMEM;
 	do {
 		next = p4d_addr_end(addr, end);
-		err = remap_pud_range(mm, p4d, addr, next,
+		err = remap_pud_range(mm, vma, p4d, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
 			return err;
@@ -3049,7 +3112,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
 	flush_cache_range(vma, addr, end);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = remap_p4d_range(mm, pgd, addr, next,
+		err = remap_p4d_range(mm, vma, pgd, addr, next,
 				pfn + (addr >> PAGE_SHIFT), prot);
 		if (err)
 			return err;
-- 
2.43.0



^ permalink raw reply related

* [PATCH mm-unstable RFC v4 5/7] mm/huge_memory: refactor __split_huge_pmd_locked()
From: Yin Tirui @ 2026-05-26 14:50 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Juergen Gross, Jonathan Cameron, Will Deacon
  Cc: Catalin Marinas, Peter Xu, Luiz Capitulino, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H . Peter Anvin,
	Andy Lutomirski, Peter Zijlstra, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Liam R . Howlett, Zi Yan, Baolin Wang, Nico Pache, Ryan Roberts,
	Dev Jain, Barry Song, Lance Yang, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Anshuman Khandual, Rohan McLure,
	Kevin Brodsky, Alistair Popple, Andrew Donnellan, Pasha Tatashin,
	Baoquan He, Thomas Huth, Coiby Xu, Dan Williams, Yu-cheng Yu,
	Lu Baolu, Conor Dooley, Rik van Riel, wangkefeng.wang, chenjun102,
	yintirui, linux-mm, linux-kernel, x86, linux-arm-kernel,
	linuxppc-dev, linux-pm
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

Rework __split_huge_pmd_locked() to classify huge PMDs by the PMD entry
itself instead of starting from vma_is_anonymous().

Present PMDs are classified with vm_normal_folio_pmd(): file/shmem THPs
are dropped and refaulted later, anonymous THPs are split into PTEs, and
PMDs without a normal folio are handled as huge zero or special PMDs.

Non-present PMDs are classified with pmd_to_softleaf_folio(): file/shmem
migration entries are dropped, while anonymous migration/device-private
entries are split into PTEs.

This also makes the anonymous decision folio-based.  A private file
mapping that has CoW'ed to an anonymous THP now follows the anonymous
path even though the VMA is file-backed.

No intended behavioural change.

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
 mm/huge_memory.c | 197 +++++++++++++++++++++++++++--------------------
 1 file changed, 114 insertions(+), 83 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3964258ff91d..8cd77389d52f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3136,25 +3136,38 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	count_vm_event(THP_SPLIT_PMD);
 
-	if (!vma_is_anonymous(vma)) {
-		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
-		/*
-		 * We are going to unmap this huge page. So
-		 * just go ahead and zap it
-		 */
-		if (arch_needs_pgtable_deposit())
-			zap_deposited_table(mm, pmd);
-		if (vma_is_special_huge(vma))
-			return;
-		if (unlikely(pmd_is_migration_entry(old_pmd))) {
-			const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
+	if (pmd_present(*pmd)) {
+		folio = vm_normal_folio_pmd(vma, haddr, *pmd);
+
+		if (unlikely(!folio)) {
+			if (is_huge_zero_pmd(*pmd)) {
+				/*
+				 * FIXME: Do we want to invalidate secondary mmu by calling
+				 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
+				 * inside __split_huge_pmd() ?
+				 *
+				 * We are going from a zero huge page write protected to zero
+				 * small page also write protected so it does not seem useful
+				 * to invalidate secondary mmu at this time.
+				 */
+				return __split_huge_zero_page_pmd(vma, haddr, pmd);
+			}
 
-			folio = softleaf_to_folio(old_entry);
-		} else if (is_huge_zero_pmd(old_pmd)) {
+			/* Present but not a normal folio: drop the PMD. */
+			old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
+			if (arch_needs_pgtable_deposit())
+				zap_deposited_table(mm, pmd);
 			return;
-		} else {
+		}
+
+		if (unlikely(!folio_test_anon(folio))) {
+			old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
+			if (arch_needs_pgtable_deposit())
+				zap_deposited_table(mm, pmd);
+			if (vma_is_special_huge(vma))
+				return;
+
 			page = pmd_page(old_pmd);
-			folio = page_folio(page);
 			if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
 				folio_mark_dirty(folio);
 			if (!folio_test_referenced(folio) && pmd_young(old_pmd))
@@ -3164,72 +3177,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			folio_put(folio);
 			return;
 		}
-		add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
-		return;
-	}
-
-	if (is_huge_zero_pmd(*pmd)) {
-		/*
-		 * FIXME: Do we want to invalidate secondary mmu by calling
-		 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
-		 * inside __split_huge_pmd() ?
-		 *
-		 * We are going from a zero huge page write protected to zero
-		 * small page also write protected so it does not seems useful
-		 * to invalidate secondary mmu at this time.
-		 */
-		return __split_huge_zero_page_pmd(vma, haddr, pmd);
-	}
-
-	if (pmd_is_migration_entry(*pmd)) {
-		softleaf_t entry;
-
-		old_pmd = *pmd;
-		entry = softleaf_from_pmd(old_pmd);
-		page = softleaf_to_page(entry);
-		folio = page_folio(page);
-
-		soft_dirty = pmd_swp_soft_dirty(old_pmd);
-		uffd_wp = pmd_swp_uffd_wp(old_pmd);
-
-		write = softleaf_is_migration_write(entry);
-		if (PageAnon(page))
-			anon_exclusive = softleaf_is_migration_read_exclusive(entry);
-		young = softleaf_is_migration_young(entry);
-		dirty = softleaf_is_migration_dirty(entry);
-	} else if (pmd_is_device_private_entry(*pmd)) {
-		softleaf_t entry;
-
-		old_pmd = *pmd;
-		entry = softleaf_from_pmd(old_pmd);
-		page = softleaf_to_page(entry);
-		folio = page_folio(page);
-
-		soft_dirty = pmd_swp_soft_dirty(old_pmd);
-		uffd_wp = pmd_swp_uffd_wp(old_pmd);
-
-		write = softleaf_is_device_private_write(entry);
-		anon_exclusive = PageAnonExclusive(page);
-
-		/*
-		 * Device private THP should be treated the same as regular
-		 * folios w.r.t anon exclusive handling. See the comments for
-		 * folio handling and anon_exclusive below.
-		 */
-		if (freeze && anon_exclusive &&
-		    folio_try_share_anon_rmap_pmd(folio, page))
-			freeze = false;
-		if (!freeze) {
-			rmap_t rmap_flags = RMAP_NONE;
-
-			folio_ref_add(folio, HPAGE_PMD_NR - 1);
-			if (anon_exclusive)
-				rmap_flags |= RMAP_EXCLUSIVE;
 
-			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
-						 vma, haddr, rmap_flags);
-		}
-	} else {
 		/*
 		 * Up to this point the pmd is present and huge and userland has
 		 * the whole access to the hugepage during the split (which
@@ -3255,7 +3203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		old_pmd = pmdp_invalidate(vma, haddr, pmd);
 		page = pmd_page(old_pmd);
-		folio = page_folio(page);
 		if (pmd_dirty(old_pmd)) {
 			dirty = true;
 			folio_set_dirty(folio);
@@ -3266,7 +3213,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		uffd_wp = pmd_uffd_wp(old_pmd);
 
 		VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
-		VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
 
 		/*
 		 * Without "freeze", we'll simply split the PMD, propagating the
@@ -3296,6 +3242,85 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
 						 vma, haddr, rmap_flags);
 		}
+	} else {
+		/*
+		 * Non-present PMD: a softleaf-encoded migration or
+		 * device-private entry. pmd_to_softleaf_folio() warns and
+		 * returns NULL for any other encoding.
+		 */
+		folio = pmd_to_softleaf_folio(*pmd);
+		if (unlikely(!folio))
+			return;
+
+		if (unlikely(!folio_test_anon(folio))) {
+			/*
+			 * File/shmem migration entry: drop the PMD without
+			 * splitting. Unlike the present case the entry holds
+			 * neither a folio reference nor an rmap to release,
+			 * so just adjust the RSS counter.
+			 */
+			pmdp_huge_clear_flush(vma, haddr, pmd);
+			if (arch_needs_pgtable_deposit())
+				zap_deposited_table(mm, pmd);
+			if (unlikely(vma_is_special_huge(vma))) {
+				VM_WARN_ONCE(1,
+					     "unexpected special huge PMD migration entry\n");
+				return;
+			}
+			add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
+			return;
+		}
+
+		if (pmd_is_migration_entry(*pmd)) {
+			softleaf_t entry;
+
+			old_pmd = *pmd;
+			entry = softleaf_from_pmd(old_pmd);
+			page = softleaf_to_page(entry);
+
+			soft_dirty = pmd_swp_soft_dirty(old_pmd);
+			uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+			write = softleaf_is_migration_write(entry);
+			if (PageAnon(page))
+				anon_exclusive = softleaf_is_migration_read_exclusive(entry);
+			young = softleaf_is_migration_young(entry);
+			dirty = softleaf_is_migration_dirty(entry);
+		} else if (pmd_is_device_private_entry(*pmd)) {
+			softleaf_t entry;
+
+			old_pmd = *pmd;
+			entry = softleaf_from_pmd(old_pmd);
+			page = softleaf_to_page(entry);
+
+			soft_dirty = pmd_swp_soft_dirty(old_pmd);
+			uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+			write = softleaf_is_device_private_write(entry);
+			anon_exclusive = PageAnonExclusive(page);
+
+			/*
+			 * Device-private THP should be treated the same as
+			 * regular folios w.r.t. anon-exclusive handling. See
+			 * the matching code for present anon folios above.
+			 */
+			if (freeze && anon_exclusive &&
+			    folio_try_share_anon_rmap_pmd(folio, page))
+				freeze = false;
+			if (!freeze) {
+				rmap_t rmap_flags = RMAP_NONE;
+
+				folio_ref_add(folio, HPAGE_PMD_NR - 1);
+				if (anon_exclusive)
+					rmap_flags |= RMAP_EXCLUSIVE;
+
+				folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+							 vma, haddr, rmap_flags);
+			}
+		} else {
+			VM_WARN_ON_ONCE(1);
+			return;
+		}
 	}
 
 	/*
-- 
2.43.0



^ permalink raw reply related

* [PATCH mm-unstable RFC v4 6/7] mm/huge_memory: make move_huge_pmd() use has_deposited_pgtable()
From: Yin Tirui @ 2026-05-26 14:50 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Juergen Gross, Jonathan Cameron, Will Deacon
  Cc: Catalin Marinas, Peter Xu, Luiz Capitulino, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H . Peter Anvin,
	Andy Lutomirski, Peter Zijlstra, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Liam R . Howlett, Zi Yan, Baolin Wang, Nico Pache, Ryan Roberts,
	Dev Jain, Barry Song, Lance Yang, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Anshuman Khandual, Rohan McLure,
	Kevin Brodsky, Alistair Popple, Andrew Donnellan, Pasha Tatashin,
	Baoquan He, Thomas Huth, Coiby Xu, Dan Williams, Yu-cheng Yu,
	Lu Baolu, Conor Dooley, Rik van Riel, wangkefeng.wang, chenjun102,
	yintirui, linux-mm, linux-kernel, x86, linux-arm-kernel,
	linuxppc-dev, linux-pm
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

In move_huge_pmd(), use has_deposited_pgtable() rather than the VMA type
to decide whether pmd_move_must_withdraw() should move a deposited
pgtable.

PowerPC radix follows the generic rule.  PowerPC hash keeps returning
true.

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  5 ++---
 arch/powerpc/mm/book3s64/pgtable.c           | 11 +++++------
 mm/huge_memory.c                             | 20 ++++++++++++--------
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b6629c041e75..a0042cacac8d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1424,9 +1424,8 @@ extern pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
 
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;
-extern int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
-				  struct spinlock *old_pmd_ptl,
-				  struct vm_area_struct *vma);
+extern bool pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+		struct spinlock *old_pmd_ptl, bool has_deposit);
 /*
  * Hash translation mode use the deposited table to store hash pte
  * slot information.
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 85ab6723c8f2..4c45b5762d57 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -548,15 +548,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
  * pmd page. Hence if we have different pmd page we need to withdraw during pmd
  * move.
  *
- * With hash we use deposited table always irrespective of anon or not.
- * With radix we use deposited table only for anonymous mapping.
+ * With hash we always use the deposited table, irrespective of has_deposit.
+ * With radix we use the same rule as the generic implementation.
  */
-int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
-			   struct spinlock *old_pmd_ptl,
-			   struct vm_area_struct *vma)
+bool pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+		struct spinlock *old_pmd_ptl, bool has_deposit)
 {
 	if (radix_enabled())
-		return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+		return (new_pmd_ptl != old_pmd_ptl) && has_deposit;
 
 	return true;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8cd77389d52f..be9b637c813b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2552,17 +2552,14 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 }
 
 #ifndef pmd_move_must_withdraw
-static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
-					 spinlock_t *old_pmd_ptl,
-					 struct vm_area_struct *vma)
+static inline bool pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+		spinlock_t *old_pmd_ptl, bool has_deposit)
 {
 	/*
 	 * With split pmd lock we also need to move preallocated
 	 * PTE page table if new_pmd is on different PMD page table.
-	 *
-	 * We also don't deposit and withdraw tables for file pages.
 	 */
-	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+	return (new_pmd_ptl != old_pmd_ptl) && has_deposit;
 }
 #endif
 
@@ -2595,8 +2592,11 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 {
 	spinlock_t *old_ptl, *new_ptl;
 	pmd_t pmd;
+	struct folio *folio = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	bool force_flush = false;
+	bool has_deposit;
+	bool is_present;
 
 	/*
 	 * The destination pmd shouldn't be established, free_pgtables()
@@ -2618,11 +2618,15 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		if (new_ptl != old_ptl)
 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
-		if (pmd_present(pmd))
+		is_present = pmd_present(pmd);
+		if (is_present)
 			force_flush = true;
 		VM_BUG_ON(!pmd_none(*new_pmd));
 
-		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
+		folio = normal_or_softleaf_folio_pmd(vma, old_addr, pmd, is_present);
+		has_deposit = has_deposited_pgtable(vma, pmd, folio);
+
+		if (pmd_move_must_withdraw(new_ptl, old_ptl, has_deposit)) {
 			pgtable_t pgtable;
 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-- 
2.43.0



^ permalink raw reply related

* [PATCH mm-unstable RFC v4 1/7] x86/mm: use PTE-level pgprot for huge PFN helpers
From: Yin Tirui @ 2026-05-26 14:49 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Juergen Gross, Jonathan Cameron, Will Deacon
  Cc: Catalin Marinas, Peter Xu, Luiz Capitulino, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H . Peter Anvin,
	Andy Lutomirski, Peter Zijlstra, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Liam R . Howlett, Zi Yan, Baolin Wang, Nico Pache, Ryan Roberts,
	Dev Jain, Barry Song, Lance Yang, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Anshuman Khandual, Rohan McLure,
	Kevin Brodsky, Alistair Popple, Andrew Donnellan, Pasha Tatashin,
	Baoquan He, Thomas Huth, Coiby Xu, Dan Williams, Yu-cheng Yu,
	Lu Baolu, Conor Dooley, Rik van Riel, wangkefeng.wang, chenjun102,
	yintirui, linux-mm, linux-kernel, x86, linux-arm-kernel,
	linuxppc-dev, linux-pm
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

Make the x86 PMD/PUD PFN helpers use PTE-level pgprot_t as the basic
format.

pfn_pmd() and pfn_pud() now translate PTE-level attributes into
large-page entries, including the x86 PAT/PSE encoding. pmd_pgprot()
and pud_pgprot() translate large-page attributes back to PTE-level
pgprot_t, hiding _PAGE_PSE and converting large-page PAT encoding back
to the PTE PAT position.

Rework pmd_mkinvalid() and pud_mkinvalid() to use the same helpers:
extract a PTE-level pgprot_t with pmd_pgprot()/pud_pgprot(), clear
PRESENT/PROTNONE, and rebuild the PMD/PUD entry with
pfn_pmd()/pfn_pud().

The old explicit huge pgprot conversion helpers are no longer needed.
Remove pte_clrhuge(), pgprot_large_2_4k(), pgprot_4k_2_large(),
PAGE_KERNEL_LARGE and PAGE_KERNEL_LARGE_EXEC, and update x86 callers to
construct PMD/PUD entries through the normal PFN helpers.

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
 arch/x86/include/asm/pgtable.h       | 68 +++++++++++++++++++---------
 arch/x86/include/asm/pgtable_types.h | 12 +----
 arch/x86/mm/init_32.c                |  8 ++--
 arch/x86/mm/init_64.c                | 30 ++++--------
 arch/x86/mm/pat/set_memory.c         | 51 ++++++---------------
 arch/x86/mm/pgtable.c                |  8 +---
 arch/x86/power/hibernate_32.c        |  6 +--
 7 files changed, 77 insertions(+), 106 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2edd6c9d789c..fe63a2f6d183 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -475,11 +475,6 @@ static inline pte_t pte_mkhuge(pte_t pte)
 	return pte_set_flags(pte, _PAGE_PSE);
 }
 
-static inline pte_t pte_clrhuge(pte_t pte)
-{
-	return pte_clear_flags(pte, _PAGE_PSE);
-}
-
 static inline pte_t pte_mkglobal(pte_t pte)
 {
 	return pte_set_flags(pte, _PAGE_GLOBAL);
@@ -741,29 +736,31 @@ static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 {
 	phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
-	pfn ^= protnone_mask(pgprot_val(pgprot));
+	pgprotval_t protval = protval_4k_2_large(pgprot_val(pgprot));
+
+	protval = check_pgprot(__pgprot(protval));
+	if (protval)
+		protval |= _PAGE_PSE;
+
+	pfn ^= protnone_mask(protval);
 	pfn &= PHYSICAL_PMD_PAGE_MASK;
-	return __pmd(pfn | check_pgprot(pgprot));
+
+	return __pmd(pfn | protval);
 }
 
 static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
 {
 	phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
-	pfn ^= protnone_mask(pgprot_val(pgprot));
-	pfn &= PHYSICAL_PUD_PAGE_MASK;
-	return __pud(pfn | check_pgprot(pgprot));
-}
+	pgprotval_t protval = protval_4k_2_large(pgprot_val(pgprot));
 
-static inline pmd_t pmd_mkinvalid(pmd_t pmd)
-{
-	return pfn_pmd(pmd_pfn(pmd),
-		      __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
-}
+	protval = check_pgprot(__pgprot(protval));
+	if (protval)
+		protval |= _PAGE_PSE;
 
-static inline pud_t pud_mkinvalid(pud_t pud)
-{
-	return pfn_pud(pud_pfn(pud),
-		       __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+	pfn ^= protnone_mask(protval);
+	pfn &= PHYSICAL_PUD_PAGE_MASK;
+
+	return __pud(pfn | protval);
 }
 
 static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
@@ -860,10 +857,37 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 }
 
 #define pte_pgprot(x) __pgprot(pte_flags(x))
-#define pmd_pgprot(x) __pgprot(pmd_flags(x))
-#define pud_pgprot(x) __pgprot(pud_flags(x))
+static inline pgprot_t pmd_pgprot(pmd_t pmd)
+{
+	return __pgprot(protval_large_2_4k(pmd_flags(pmd)));
+}
+
+#define pmd_pgprot pmd_pgprot
+
+static inline pgprot_t pud_pgprot(pud_t pud)
+{
+	return __pgprot(protval_large_2_4k(pud_flags(pud)));
+}
+
+#define pud_pgprot pud_pgprot
 #define p4d_pgprot(x) __pgprot(p4d_flags(x))
 
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
+{
+	pgprot_t prot = pmd_pgprot(pmd);
+
+	pgprot_val(prot) &= ~(_PAGE_PRESENT | _PAGE_PROTNONE);
+	return pfn_pmd(pmd_pfn(pmd), prot);
+}
+
+static inline pud_t pud_mkinvalid(pud_t pud)
+{
+	pgprot_t prot = pud_pgprot(pud);
+
+	pgprot_val(prot) &= ~(_PAGE_PRESENT | _PAGE_PROTNONE);
+	return pfn_pud(pud_pfn(pud), prot);
+}
+
 #define canon_pgprot(p) __pgprot(massage_pgprot(p))
 
 static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 2ec250ba467e..135f6f1f826c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -251,8 +251,6 @@ enum page_cache_mode {
 #define PAGE_KERNEL_EXEC_NOENC	__pgprot_mask(__PAGE_KERNEL_EXEC       |    0)
 #define PAGE_KERNEL_ROX		__pgprot_mask(__PAGE_KERNEL_ROX        | _ENC)
 #define PAGE_KERNEL_NOCACHE	__pgprot_mask(__PAGE_KERNEL_NOCACHE    | _ENC)
-#define PAGE_KERNEL_LARGE	__pgprot_mask(__PAGE_KERNEL_LARGE      | _ENC)
-#define PAGE_KERNEL_LARGE_EXEC	__pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
 #define PAGE_KERNEL_VVAR	__pgprot_mask(__PAGE_KERNEL_VVAR       | _ENC)
 
 #define PAGE_KERNEL_IO		__pgprot_mask(__PAGE_KERNEL_IO)
@@ -497,21 +495,13 @@ static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
 	return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
 		((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
 }
-static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
-{
-	return __pgprot(protval_4k_2_large(pgprot_val(pgprot)));
-}
+
 static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
 {
 	return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
 		((val & _PAGE_PAT_LARGE) >>
 		 (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
 }
-static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
-{
-	return __pgprot(protval_large_2_4k(pgprot_val(pgprot)));
-}
-
 
 typedef struct page *pgtable_t;
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0908c44d51e6..3c2c0af5a2d2 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -311,14 +311,12 @@ kernel_physical_mapping_init(unsigned long start,
 			 */
 			if (use_pse) {
 				unsigned int addr2;
-				pgprot_t prot = PAGE_KERNEL_LARGE;
+				pgprot_t prot = PAGE_KERNEL;
 				/*
 				 * first pass will use the same initial
 				 * identity mapping attribute + _PAGE_PSE.
 				 */
-				pgprot_t init_prot =
-					__pgprot(PTE_IDENT_ATTR |
-						 _PAGE_PSE);
+				pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
 
 				pfn &= PMD_MASK >> PAGE_SHIFT;
 				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
@@ -326,7 +324,7 @@ kernel_physical_mapping_init(unsigned long start,
 
 				if (is_x86_32_kernel_text(addr) ||
 				    is_x86_32_kernel_text(addr2))
-					prot = PAGE_KERNEL_LARGE_EXEC;
+					prot = PAGE_KERNEL_EXEC;
 
 				pages_2m++;
 				if (mapping_iter == 1)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 77b889b71cf3..9e83fac8df4e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -90,13 +90,6 @@ DEFINE_ENTRY(pud, pud, init)
 DEFINE_ENTRY(pmd, pmd, init)
 DEFINE_ENTRY(pte, pte, init)
 
-static inline pgprot_t prot_sethuge(pgprot_t prot)
-{
-	WARN_ON_ONCE(pgprot_val(prot) & _PAGE_PAT);
-
-	return __pgprot(pgprot_val(prot) | _PAGE_PSE);
-}
-
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
@@ -390,8 +383,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
 	pmd_t *pmd;
 	pgprot_t prot;
 
-	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
-		protval_4k_2_large(cachemode2protval(cache));
+	pgprot_val(prot) = pgprot_val(PAGE_KERNEL) | cachemode2protval(cache);
 	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
 	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
 		pgd = pgd_offset_k((unsigned long)__va(phys));
@@ -414,7 +406,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
 		}
 		pmd = pmd_offset(pud, phys);
 		BUG_ON(!pmd_none(*pmd));
-		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+		set_pmd(pmd, pfn_pmd(phys >> PAGE_SHIFT, prot));
 	}
 }
 
@@ -572,15 +564,13 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
 				paddr_last = paddr_next;
 				continue;
 			}
-			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
+			new_prot = pmd_pgprot(*pmd);
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_2M)) {
 			pages++;
 			spin_lock(&init_mm.page_table_lock);
-			set_pmd_init(pmd,
-				     pfn_pmd(paddr >> PAGE_SHIFT, prot_sethuge(prot)),
-				     init);
+			set_pmd_init(pmd, pfn_pmd(paddr >> PAGE_SHIFT, prot), init);
 			spin_unlock(&init_mm.page_table_lock);
 			paddr_last = paddr_next;
 			continue;
@@ -658,15 +648,13 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
 				paddr_last = paddr_next;
 				continue;
 			}
-			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
+			prot = pud_pgprot(*pud);
 		}
 
 		if (page_size_mask & (1<<PG_LEVEL_1G)) {
 			pages++;
 			spin_lock(&init_mm.page_table_lock);
-			set_pud_init(pud,
-				     pfn_pud(paddr >> PAGE_SHIFT, prot_sethuge(prot)),
-				     init);
+			set_pud_init(pud, pfn_pud(paddr >> PAGE_SHIFT, prot), init);
 			spin_unlock(&init_mm.page_table_lock);
 			paddr_last = paddr_next;
 			continue;
@@ -1519,11 +1507,9 @@ static int __meminitdata node_start;
 void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
 			       unsigned long addr, unsigned long next)
 {
-	pte_t entry;
+	pmd_t entry = pfn_pmd(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
 
-	entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
-			PAGE_KERNEL_LARGE);
-	set_pmd(pmd, __pmd(pte_val(entry)));
+	set_pmd(pmd, entry);
 
 	/* check to see if we have contiguous blocks */
 	if (p_end != p || node_start != node) {
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index d023a40a1e03..a26b2397c4cf 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -971,25 +971,16 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
 
 	/*
 	 * We are safe now. Check whether the new pgprot is the same:
-	 * Convert protection attributes to 4k-format, as cpa->mask* are set
-	 * up accordingly.
+	 * Note that old_prot is already in the ideal 4k-format, so we can
+	 * directly apply cpa->mask* to it.
 	 */
 
-	/* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
-	req_prot = pgprot_large_2_4k(old_prot);
+	req_prot = old_prot;
 
 	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
 	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
 
-	/*
-	 * req_prot is in format of 4k pages. It must be converted to large
-	 * page format: the caching mode includes the PAT bit located at
-	 * different bit positions in the two formats.
-	 */
-	req_prot = pgprot_4k_2_large(req_prot);
 	req_prot = pgprot_clear_protnone_bits(req_prot);
-	if (pgprot_val(req_prot) & _PAGE_PRESENT)
-		pgprot_val(req_prot) |= _PAGE_PSE;
 
 	/*
 	 * old_pfn points to the large page base pfn. So we need to add the
@@ -1065,7 +1056,10 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
 		return 1;
 
 	/* All checks passed. Update the large page mapping. */
-	new_pte = pfn_pte(old_pfn, new_prot);
+	if (level == PG_LEVEL_2M)
+		new_pte = __pte(pmd_val(pfn_pmd(old_pfn, new_prot)));
+	else
+		new_pte = __pte(pud_val(pfn_pud(old_pfn, new_prot)));
 	__set_pmd_pte(kpte, address, new_pte);
 	cpa->flags |= CPA_FLUSHTLB;
 	cpa_inc_lp_preserved(level);
@@ -1120,7 +1114,10 @@ static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
 	else
 		pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
 set:
-	set_pte(pte, pfn_pte(pfn, ref_prot));
+	if (size == PMD_SIZE)
+		set_pmd((pmd_t *)pte, pfn_pmd(pfn, ref_prot));
+	else
+		set_pte(pte, pfn_pte(pfn, ref_prot));
 }
 
 static int
@@ -1151,11 +1148,6 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	switch (level) {
 	case PG_LEVEL_2M:
 		ref_prot = pmd_pgprot(*(pmd_t *)kpte);
-		/*
-		 * Clear PSE (aka _PAGE_PAT) and move
-		 * PAT bit to correct position.
-		 */
-		ref_prot = pgprot_large_2_4k(ref_prot);
 		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
 		lpaddr = address & PMD_MASK;
 		lpinc = PAGE_SIZE;
@@ -1167,13 +1159,6 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 		pfninc = PMD_SIZE >> PAGE_SHIFT;
 		lpaddr = address & PUD_MASK;
 		lpinc = PMD_SIZE;
-		/*
-		 * Clear the PSE flags if the PRESENT flag is not set
-		 * otherwise pmd_present() will return true even on a non
-		 * present pmd.
-		 */
-		if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
-			pgprot_val(ref_prot) &= ~_PAGE_PSE;
 		break;
 
 	default:
@@ -1289,8 +1274,7 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
 	old_pmd = *pmd;
 
 	/* Success: set up a large page */
-	pgprot = pgprot_4k_2_large(pte_pgprot(first));
-	pgprot_val(pgprot) |= _PAGE_PSE;
+	pgprot = pte_pgprot(first);
 	_pmd = pfn_pmd(pfn, pgprot);
 	set_pmd(pmd, _pmd);
 
@@ -1593,7 +1577,6 @@ static long populate_pmd(struct cpa_data *cpa,
 {
 	long cur_pages = 0;
 	pmd_t *pmd;
-	pgprot_t pmd_pgprot;
 
 	/*
 	 * Not on a 2M boundary?
@@ -1625,8 +1608,6 @@ static long populate_pmd(struct cpa_data *cpa,
 	if (num_pages == cur_pages)
 		return cur_pages;
 
-	pmd_pgprot = pgprot_4k_2_large(pgprot);
-
 	while (end - start >= PMD_SIZE) {
 
 		/*
@@ -1638,8 +1619,7 @@ static long populate_pmd(struct cpa_data *cpa,
 
 		pmd = pmd_offset(pud, start);
 
-		set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
-					canon_pgprot(pmd_pgprot))));
+		set_pmd(pmd, pfn_pmd(cpa->pfn, canon_pgprot(pgprot)));
 
 		start	  += PMD_SIZE;
 		cpa->pfn  += PMD_SIZE >> PAGE_SHIFT;
@@ -1667,7 +1647,6 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
 	pud_t *pud;
 	unsigned long end;
 	long cur_pages = 0;
-	pgprot_t pud_pgprot;
 
 	end = start + (cpa->numpages << PAGE_SHIFT);
 
@@ -1705,14 +1684,12 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
 		return cur_pages;
 
 	pud = pud_offset(p4d, start);
-	pud_pgprot = pgprot_4k_2_large(pgprot);
 
 	/*
 	 * Map everything starting from the Gb boundary, possibly with 1G pages
 	 */
 	while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
-		set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
-				   canon_pgprot(pud_pgprot))));
+		set_pud(pud, pfn_pud(cpa->pfn, canon_pgprot(pgprot)));
 
 		start	  += PUD_SIZE;
 		cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index da7f0a03cf90..cd9a62f4d437 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -644,9 +644,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 	if (pud_present(*pud) && !pud_leaf(*pud))
 		return 0;
 
-	set_pte((pte_t *)pud, pfn_pte(
-		(u64)addr >> PAGE_SHIFT,
-		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
+	set_pud(pud, pfn_pud((u64)addr >> PAGE_SHIFT, prot));
 
 	return 1;
 }
@@ -676,9 +674,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
 		return 0;
 
-	set_pte((pte_t *)pmd, pfn_pte(
-		(u64)addr >> PAGE_SHIFT,
-		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
+	set_pmd(pmd, pfn_pmd((u64)addr >> PAGE_SHIFT, prot));
 
 	return 1;
 }
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 223d5bca29b8..2f18f8223376 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -107,7 +107,7 @@ static int resume_physical_mapping_init(pgd_t *pgd_base)
 			 * NOTE: We can mark everything as executable here
 			 */
 			if (boot_cpu_has(X86_FEATURE_PSE)) {
-				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
+				set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_EXEC));
 				pfn += PTRS_PER_PTE;
 			} else {
 				pte_t *max_pte;
@@ -156,13 +156,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd_base)
 
 	if (boot_cpu_has(X86_FEATURE_PSE)) {
 		set_pmd(pmd + pmd_index(restore_jump_address),
-		__pmd((jump_address_phys & PMD_MASK) | pgprot_val(PAGE_KERNEL_LARGE_EXEC)));
+			pfn_pmd(jump_address_phys >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
 	} else {
 		pte = resume_one_page_table_init(pmd);
 		if (!pte)
 			return -ENOMEM;
 		set_pte(pte + pte_index(restore_jump_address),
-		__pte((jump_address_phys & PAGE_MASK) | pgprot_val(PAGE_KERNEL_EXEC)));
+			pfn_pte(jump_address_phys >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
 	}
 
 	return 0;
-- 
2.43.0



^ permalink raw reply related

* [PATCH mm-unstable RFC v4 0/7] mm: add huge pfnmap support for remap_pfn_range()
From: Yin Tirui @ 2026-05-26 14:49 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Juergen Gross, Jonathan Cameron, Will Deacon
  Cc: Catalin Marinas, Peter Xu, Luiz Capitulino, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H . Peter Anvin,
	Andy Lutomirski, Peter Zijlstra, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Liam R . Howlett, Zi Yan, Baolin Wang, Nico Pache, Ryan Roberts,
	Dev Jain, Barry Song, Lance Yang, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Anshuman Khandual, Rohan McLure,
	Kevin Brodsky, Alistair Popple, Andrew Donnellan, Pasha Tatashin,
	Baoquan He, Thomas Huth, Coiby Xu, Dan Williams, Yu-cheng Yu,
	Lu Baolu, Conor Dooley, Rik van Riel, wangkefeng.wang, chenjun102,
	yintirui, linux-mm, linux-kernel, x86, linux-arm-kernel,
	linuxppc-dev, linux-pm

This series is based on mm-unstable and depends on:
1. pgtable_has_pmd_leaves(), introduced by Luiz's series:
     https://lore.kernel.org/linux-mm/cover.1777663129.git.luizcap@redhat.com/
2. mm/huge_memory: update file PMD counter before folio_put()
     https://lore.kernel.org/linux-mm/20260526101337.1984081-1-yintirui@huawei.com/T/#u

v4:
- Following Matthew Wilcox's feedback that huge-page attribute handling
  should stay in architecture helpers:
  https://lore.kernel.org/all/aapXRN4KjWtUUJ0g@casper.infradead.org/

  Reworked the pgprot contract for architectures that enable
  CONFIG_ARCH_SUPPORTS_PMD_PFNMAP: pfn_pmd()/pfn_pud() construct PMD/PUD
  leaf entries from base-PTE pgprot_t, while pmd_pgprot()/pud_pgprot()
  return base-PTE pgprot_t.  Added the required x86, arm64 and powerpc
  support; RISC-V already satisfies the required semantics.
- Refactored copy_huge_pmd() and __split_huge_pmd_locked() to first
  classify PMDs by pmd_present(), and then use vm_normal_folio_pmd() for
  present PMDs, and make move_huge_pmd() use has_deposited_pgtable().
- Introduced a restriction, following the discussion with Lorenzo and
  David, that remap_pfn_range() does not create PMD-sized mappings for
  VMAs that have a fault handler:
  https://lore.kernel.org/linux-mm/6417587a-7e43-4615-9e2c-50a245842f59@kernel.org/

  With this restriction, PMD PFNMAP entries in VMAs without fault handlers
  are known to have been installed by remap_pfn_range(), which deposits a
  page table when installing such mappings; PMD PFNMAP entries in VMAs
  with fault handlers are created through fault-time insertion paths such
  as vmf_insert_pfn_pmd().

v3: https://lore.kernel.org/all/20260228070906.1418911-1-yintirui@huawei.com/
1. Architectural Type Safety (Matthew Wilcox):
Following the insightful architectural feedback from Matthew Wilcox in v2,
the approach to clearing huge page attributes has been completely redesigned.
Instead of spreading the `pte_clrhuge()` anti-pattern to ARM64 and RISC-V,
this series enforces strict type safety at the lowest level: `pfn_pte()`
must never natively return a PTE with huge page attributes set.

To achieve this without breaking the x86 core MM, the series is structured as:
  - Fix historical type-casting abuses in x86 (vmemmap, vmalloc, CPA) where
    `pfn_pte()` was wrongly used to generate huge PMDs/PUDs.
  - Update `pfn_pte()` on x86 and ARM64 to inherently filter out huge page
    attributes. (RISC-V leaf PMDs and PTEs share the exact same hardware
    format without a specific "huge" bit, so it is naturally compliant).
  - Completely eradicate `pte_clrhuge()` from the x86 tree and clean up
    the type-casting mess in `arch/x86/mm/init_64.c`.

2. Page Table Deposit fix during clone() (syzbot):
Previously, `copy_huge_pmd()` was unaware of special PMDs created by pfnmap,
failing to deposit a page table for the child process during `clone()`.
This led to crashes during process teardown or PMD splitting. The logic is now
updated to properly allocate and deposit pgtables for `pmd_special()` entries.

v2: https://lore.kernel.org/linux-mm/20251016112704.179280-1-yintirui@huawei.com/#t
- remove "nohugepfnmap" boot option and "pfnmap_max_page_shift" variable.
- zap_deposited_table for non-special pmd.
- move set_pmd_at() inside pmd_lock.
- prevent PMD mapping creation when pgtable allocation fails.
- defer the refactor of pte_clrhuge() to a separate patch series. For now,
  add a TODO to track this.

v1: https://lore.kernel.org/linux-mm/20250923133104.926672-1-yintirui@huawei.com/

Overview
========
This patch series adds huge page support for remap_pfn_range(),
automatically creating huge mappings when prerequisites are satisfied
(size, alignment, architecture support, etc.) and falling back to
normal page mappings otherwise.

This work builds on Peter Xu's previous efforts on huge pfnmap
support [0].

TODO
====
- Add PUD-level huge page support. Currently, only PMD-level huge
  pages are supported.

Tests Done
==========
- Cross-build tests.
- Core MM Regression Tests
   - Booted x86 kernel with `debug_pagealloc=on` to heavily stress the
     large page splitting logic in direct mapping. No panics observed.
   - Ran `make -C tools/testing/selftests/vm run_tests`. Both THP and
     Hugetlbfs tests passed successfully, proving the `pfn_pte()` changes
     do not interfere with native huge page generation.
- Functional Tests (with a custom device driver & PTDUMP):
   - Verified that `remap_pfn_range()` successfully creates 2MB mappings
     by observing `/sys/kernel/debug/page_tables/current_user`.
   - Triggered PMD splits via 4K-granular `mprotect()` and partial `munmap()`,
     verifying correct fallback to 512 PTEs without corrupting permissions
     or causing kernel crashes.
   - Triggered `fork()`/`clone()` on the mapped regions, validating the
     syzbot fix and ensuring safe pgtable deposit/withdraw lifecycle.
- Performance tests with custom device driver implementing mmap()
  with remap_pfn_range():
    - lat_mem_rd benchmark modified to use mmap(device_fd) instead of
      malloc() shows around 40% improvement in memory access latency with
      huge page support compared to normal page mappings.

      numactl -C 0 lat_mem_rd -t 4096M (stride=64)
      Memory Size (MB)    Without Huge Mapping With Huge Mapping Improvement
      ----------------    -----------------    --------------    -----------
      64.00               148.858 ns           100.780 ns        32.3%
      128.00              164.745 ns           103.537 ns        37.2%
      256.00              169.907 ns           103.179 ns        39.3%
      512.00              171.285 ns           103.072 ns        39.8%
      1024.00             173.054 ns           103.055 ns        40.4%
      2048.00             172.820 ns           103.091 ns        40.3%
      4096.00             172.877 ns           103.115 ns        40.4%

    - Custom memory copy operations on mmap(device_fd) show a performance
      improvement of around 18% with huge page support compared to normal
      page mappings.

      numactl -C 0 memcpy_test (memory copy performance test)
      Memory Size (MB)    Without Huge Mapping With Huge Mapping Improvement
      ----------------    -----------------    --------------    -----------
      1024.00             95.76 ms             77.91 ms          18.6%
      2048.00             190.87 ms            155.64 ms         18.5%
      4096.00             380.84 ms            311.45 ms         18.2%

[0] https://lore.kernel.org/all/20240826204353.2228736-2-peterx@redhat.com/T/#u

Yin Tirui (7):
  x86/mm: use PTE-level pgprot for huge PFN helpers
  arm64/mm: use PTE-level pgprot for huge PFN helpers
  powerpc/mm: use PTE-level pgprot for huge PFN helpers
  mm/huge_memory: refactor copy_huge_pmd()
  mm/huge_memory: refactor __split_huge_pmd_locked()
  mm/huge_memory: make move_huge_pmd() use has_deposited_pgtable()
  mm: add PMD-level PFNMAP support for remap_pfn_range()

 arch/arm64/include/asm/pgtable.h             |  48 +-
 arch/arm64/mm/mmu.c                          |   4 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h |   5 +-
 arch/powerpc/include/asm/pgtable.h           |  11 +-
 arch/powerpc/mm/book3s64/pgtable.c           |  11 +-
 arch/x86/include/asm/pgtable.h               |  68 ++-
 arch/x86/include/asm/pgtable_types.h         |  12 +-
 arch/x86/mm/init_32.c                        |   8 +-
 arch/x86/mm/init_64.c                        |  30 +-
 arch/x86/mm/pat/set_memory.c                 |  51 +--
 arch/x86/mm/pgtable.c                        |   8 +-
 arch/x86/power/hibernate_32.c                |   6 +-
 mm/huge_memory.c                             | 440 +++++++++++--------
 mm/internal.h                                |  21 +
 mm/memory.c                                  |  87 +++-
 15 files changed, 493 insertions(+), 317 deletions(-)

-- 
2.43.0



^ permalink raw reply

* [PATCH mm-unstable RFC v4 3/7] powerpc/mm: use PTE-level pgprot for huge PFN helpers
From: Yin Tirui @ 2026-05-26 14:49 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Juergen Gross, Jonathan Cameron, Will Deacon
  Cc: Catalin Marinas, Peter Xu, Luiz Capitulino, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H . Peter Anvin,
	Andy Lutomirski, Peter Zijlstra, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Liam R . Howlett, Zi Yan, Baolin Wang, Nico Pache, Ryan Roberts,
	Dev Jain, Barry Song, Lance Yang, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Anshuman Khandual, Rohan McLure,
	Kevin Brodsky, Alistair Popple, Andrew Donnellan, Pasha Tatashin,
	Baoquan He, Thomas Huth, Coiby Xu, Dan Williams, Yu-cheng Yu,
	Lu Baolu, Conor Dooley, Rik van Riel, wangkefeng.wang, chenjun102,
	yintirui, linux-mm, linux-kernel, x86, linux-arm-kernel,
	linuxppc-dev, linux-pm
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

Make the powerpc PMD PFN helper use PTE-level pgprot_t as the basic
format.

pmd_pgprot() currently derives pgprot_t from the PMD entry through
pte_pgprot(). Some PMD leaf entries can carry H_PAGE_THP_HUGE, which is
specific to huge PMDs and should not be propagated into PTE-level
pgprot_t.

Mask H_PAGE_THP_HUGE out in pmd_pgprot().

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
 arch/powerpc/include/asm/pgtable.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index d20ff2ae02f5..0f368ea64b1f 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -67,7 +67,16 @@ static inline pgprot_t pte_pgprot(pte_t pte)
 #define pmd_pgprot pmd_pgprot
 static inline pgprot_t pmd_pgprot(pmd_t pmd)
 {
-	return pte_pgprot(pmd_pte(pmd));
+	pgprot_t prot = pte_pgprot(pmd_pte(pmd));
+
+	/*
+	 * pmd_pgprot() returns PTE-level pgprot_t. H_PAGE_THP_HUGE is specific
+	 * to huge PMDs.
+	 */
+#ifdef H_PAGE_THP_HUGE
+	prot = __pgprot(pgprot_val(prot) & ~H_PAGE_THP_HUGE);
+#endif
+	return prot;
 }
 
 #define pud_pgprot pud_pgprot
-- 
2.43.0



^ permalink raw reply related

* [PATCH mm-unstable RFC v4 4/7] mm/huge_memory: refactor copy_huge_pmd()
From: Yin Tirui @ 2026-05-26 14:50 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Juergen Gross, Jonathan Cameron, Will Deacon
  Cc: Catalin Marinas, Peter Xu, Luiz Capitulino, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H . Peter Anvin,
	Andy Lutomirski, Peter Zijlstra, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Liam R . Howlett, Zi Yan, Baolin Wang, Nico Pache, Ryan Roberts,
	Dev Jain, Barry Song, Lance Yang, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Anshuman Khandual, Rohan McLure,
	Kevin Brodsky, Alistair Popple, Andrew Donnellan, Pasha Tatashin,
	Baoquan He, Thomas Huth, Coiby Xu, Dan Williams, Yu-cheng Yu,
	Lu Baolu, Conor Dooley, Rik van Riel, wangkefeng.wang, chenjun102,
	yintirui, linux-mm, linux-kernel, x86, linux-arm-kernel,
	linuxppc-dev, linux-pm
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

Classify the source PMD via pmd_present() and vm_normal_folio_pmd(),
matching the way the PTE path uses pte_present() and vm_normal_page().
This moves the present-PMD decision from VMA identity checks to the
actual PMD/folio state.

Drop the defensive "if (!pmd_trans_huge(pmd)) goto out_unlock" branch:
with mmap_write_lock held during fork, it should not occur.

Extract the present-PMD side of copy_huge_pmd() into
copy_present_huge_pmd(). The helper owns the child pgtable passed by the
caller: it either deposits the pgtable when installing a copied PMD, or
frees it on paths that do not install one.

The child pgtable is now allocated once up front and freed on every skip
path. This makes file/shmem and PFNMAP/special skip paths take the PMD
locks and free the preallocated pgtable before returning. These are not
expected to be hot paths, and the PFNMAP case is reused by the follow-up
PMD PFNMAP copy support.

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
 mm/huge_memory.c | 175 +++++++++++++++++++++++++----------------------
 1 file changed, 95 insertions(+), 80 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9832ee910d5e..3964258ff91d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1879,6 +1879,82 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return false;
 }
 
+static int copy_present_huge_pmd(
+		struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+		pmd_t pmd, pgtable_t pgtable, bool *need_split)
+{
+	struct folio *src_folio;
+	bool wrprotect = true;
+
+	src_folio = vm_normal_folio_pmd(src_vma, addr, pmd);
+	if (!src_folio) {
+		/*
+		 * When page table lock is held, the huge zero pmd should not be
+		 * under splitting since we don't split the page itself, only pmd to
+		 * a page table.
+		 */
+		if (is_huge_zero_pmd(pmd)) {
+			/*
+			 * mm_get_huge_zero_folio() will never allocate a new
+			 * folio here, since we already have a zero page to
+			 * copy. It just takes a reference.
+			 */
+			mm_get_huge_zero_folio(dst_mm);
+			goto set_pmd;
+		}
+
+		/*
+		 * Making sure it's not a CoW VMA with writable
+		 * mapping, otherwise it means either the anon page wrongly
+		 * applied special bit, or we made the PRIVATE mapping be
+		 * able to wrongly write to the backend MMIO.
+		 */
+		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
+		pte_free(dst_mm, pgtable);
+		pgtable = NULL;
+		wrprotect = false;
+		goto set_pmd;
+	}
+
+	/* File THPs are copied lazily by refaulting. */
+	if (!folio_test_anon(src_folio)) {
+		pte_free(dst_mm, pgtable);
+		return 0;
+	}
+
+	folio_get(src_folio);
+	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio,
+							&src_folio->page,
+							dst_vma, src_vma))) {
+		/* Page maybe pinned: split and retry the fault on PTEs. */
+		folio_put(src_folio);
+		pte_free(dst_mm, pgtable);
+		*need_split = true;
+		return -EAGAIN;
+	}
+	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+set_pmd:
+	if (pgtable) {
+		mm_inc_nr_ptes(dst_mm);
+		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+	}
+
+	if (wrprotect) {
+		pmdp_set_wrprotect(src_mm, addr, src_pmd);
+		if (!userfaultfd_wp(dst_vma))
+			pmd = pmd_clear_uffd_wp(pmd);
+		pmd = pmd_wrprotect(pmd);
+	}
+
+	pmd = pmd_mkold(pmd);
+	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+
+	return 0;
+}
+
 static void copy_huge_non_present_pmd(
 		struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
@@ -1940,104 +2016,43 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
 	spinlock_t *dst_ptl, *src_ptl;
-	struct page *src_page;
-	struct folio *src_folio;
-	pmd_t pmd;
 	pgtable_t pgtable = NULL;
-	int ret = -ENOMEM;
-
-	pmd = pmdp_get_lockless(src_pmd);
-	if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
-		     !is_huge_zero_pmd(pmd))) {
-		dst_ptl = pmd_lock(dst_mm, dst_pmd);
-		src_ptl = pmd_lockptr(src_mm, src_pmd);
-		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
-		/*
-		 * No need to recheck the pmd, it can't change with write
-		 * mmap lock held here.
-		 *
-		 * Meanwhile, making sure it's not a CoW VMA with writable
-		 * mapping, otherwise it means either the anon page wrongly
-		 * applied special bit, or we made the PRIVATE mapping be
-		 * able to wrongly write to the backend MMIO.
-		 */
-		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
-		goto set_pmd;
-	}
-
-	/* Skip if can be re-fill on fault */
-	if (!vma_is_anonymous(dst_vma))
-		return 0;
+	bool need_split = false;
+	int ret = 0;
+	pmd_t pmd;
 
 	pgtable = pte_alloc_one(dst_mm);
 	if (unlikely(!pgtable))
-		goto out;
+		return -ENOMEM;
 
 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
 	src_ptl = pmd_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 
-	ret = -EAGAIN;
 	pmd = *src_pmd;
 
-	if (unlikely(thp_migration_supported() &&
-		     pmd_is_valid_softleaf(pmd))) {
-		copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
+	if (likely(pmd_present(pmd))) {
+		ret = copy_present_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
+					  dst_vma, src_vma, pmd, pgtable, &need_split);
+	} else if (unlikely(thp_migration_supported() && pmd_is_valid_softleaf(pmd))) {
+		if (unlikely(!vma_is_anonymous(dst_vma)))
+			pte_free(dst_mm, pgtable);
+		else
+			copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
 					  dst_vma, src_vma, pmd, pgtable);
-		ret = 0;
-		goto out_unlock;
-	}
-
-	if (unlikely(!pmd_trans_huge(pmd))) {
+	} else {
+		VM_WARN_ONCE(1, "unexpected non-present PMD %llx\n",
+				(unsigned long long)pmd_val(pmd));
 		pte_free(dst_mm, pgtable);
-		goto out_unlock;
-	}
-	/*
-	 * When page table lock is held, the huge zero pmd should not be
-	 * under splitting since we don't split the page itself, only pmd to
-	 * a page table.
-	 */
-	if (is_huge_zero_pmd(pmd)) {
-		/*
-		 * mm_get_huge_zero_folio() will never allocate a new
-		 * folio here, since we already have a zero page to
-		 * copy. It just takes a reference.
-		 */
-		mm_get_huge_zero_folio(dst_mm);
-		goto out_zero_page;
+		ret = -EAGAIN;
 	}
 
-	src_page = pmd_page(pmd);
-	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
-	src_folio = page_folio(src_page);
+	spin_unlock(src_ptl);
+	spin_unlock(dst_ptl);
 
-	folio_get(src_folio);
-	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
-		/* Page maybe pinned: split and retry the fault on PTEs. */
-		folio_put(src_folio);
-		pte_free(dst_mm, pgtable);
-		spin_unlock(src_ptl);
-		spin_unlock(dst_ptl);
+	if (unlikely(need_split))
 		__split_huge_pmd(src_vma, src_pmd, addr, false);
-		return -EAGAIN;
-	}
-	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-out_zero_page:
-	mm_inc_nr_ptes(dst_mm);
-	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-	pmdp_set_wrprotect(src_mm, addr, src_pmd);
-	if (!userfaultfd_wp(dst_vma))
-		pmd = pmd_clear_uffd_wp(pmd);
-	pmd = pmd_wrprotect(pmd);
-set_pmd:
-	pmd = pmd_mkold(pmd);
-	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 
-	ret = 0;
-out_unlock:
-	spin_unlock(src_ptl);
-	spin_unlock(dst_ptl);
-out:
 	return ret;
 }
 
-- 
2.43.0



^ permalink raw reply related

* [PATCH mm-unstable RFC v4 2/7] arm64/mm: use PTE-level pgprot for huge PFN helpers
From: Yin Tirui @ 2026-05-26 14:49 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox, David Hildenbrand, Lorenzo Stoakes,
	Juergen Gross, Jonathan Cameron, Will Deacon
  Cc: Catalin Marinas, Peter Xu, Luiz Capitulino, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H . Peter Anvin,
	Andy Lutomirski, Peter Zijlstra, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy,
	Liam R . Howlett, Zi Yan, Baolin Wang, Nico Pache, Ryan Roberts,
	Dev Jain, Barry Song, Lance Yang, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Anshuman Khandual, Rohan McLure,
	Kevin Brodsky, Alistair Popple, Andrew Donnellan, Pasha Tatashin,
	Baoquan He, Thomas Huth, Coiby Xu, Dan Williams, Yu-cheng Yu,
	Lu Baolu, Conor Dooley, Rik van Riel, wangkefeng.wang, chenjun102,
	yintirui, linux-mm, linux-kernel, x86, linux-arm-kernel,
	linuxppc-dev, linux-pm
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

Make the arm64 PMD/PUD PFN helpers use PTE-level pgprot_t as the basic
format.

pfn_pmd() and pfn_pud() now translate PTE-level attributes into block
entries. pmd_pgprot() and pud_pgprot() translate block descriptor
attributes back into PTE-level attributes.

Remove mk_pmd_sect_prot() and mk_pud_sect_prot().

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
 arch/arm64/include/asm/pgtable.h | 48 ++++++++++++++++++++++----------
 arch/arm64/mm/mmu.c              |  4 +--
 2 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 4dfa42b7d053..c3ee12e14f86 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -511,16 +511,6 @@ static inline pmd_t pte_pmd(pte_t pte)
 	return __pmd(pte_val(pte));
 }
 
-static inline pgprot_t mk_pud_sect_prot(pgprot_t prot)
-{
-	return __pgprot((pgprot_val(prot) & ~PUD_TYPE_MASK) | PUD_TYPE_SECT);
-}
-
-static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot)
-{
-	return __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
-}
-
 static inline pte_t pte_swp_mkexclusive(pte_t pte)
 {
 	return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
@@ -628,7 +618,13 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd)
 #define __pmd_to_phys(pmd)	__pte_to_phys(pmd_pte(pmd))
 #define __phys_to_pmd_val(phys)	__phys_to_pte_val(phys)
 #define pmd_pfn(pmd)		((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT)
-#define pfn_pmd(pfn,prot)	__pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
+static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd = __pmd(__phys_to_pmd_val((phys_addr_t)pfn << PAGE_SHIFT) |
+			  pgprot_val(prot));
+
+	return pmd_mkhuge(pmd);
+}
 
 #define pud_young(pud)		pte_young(pud_pte(pud))
 #define pud_mkyoung(pud)	pte_pud(pte_mkyoung(pud_pte(pud)))
@@ -652,22 +648,46 @@ static inline pud_t pud_mkhuge(pud_t pud)
 #define __pud_to_phys(pud)	__pte_to_phys(pud_pte(pud))
 #define __phys_to_pud_val(phys)	__phys_to_pte_val(phys)
 #define pud_pfn(pud)		((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
-#define pfn_pud(pfn,prot)	__pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
+static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
+{
+	pud_t pud = __pud(__phys_to_pud_val((phys_addr_t)pfn << PAGE_SHIFT) |
+			  pgprot_val(prot));
+
+	return pud_mkhuge(pud);
+}
 
 #define pmd_pgprot pmd_pgprot
 static inline pgprot_t pmd_pgprot(pmd_t pmd)
 {
 	unsigned long pfn = pmd_pfn(pmd);
+	pmdval_t protval = pmd_val(pmd) ^
+		__phys_to_pmd_val((phys_addr_t)pfn << PAGE_SHIFT);
+
+	/*
+	 * pgprot_t represents PTE-level attributes. Convert the PMD
+	 * block descriptor type into a PTE page descriptor type.
+	 */
+	pmdval_t mask = PMD_TYPE_MASK & ~PTE_VALID;
+	pmdval_t val = PTE_TYPE_PAGE & ~PTE_VALID;
 
-	return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
+	return __pgprot((protval & ~mask) | val);
 }
 
 #define pud_pgprot pud_pgprot
 static inline pgprot_t pud_pgprot(pud_t pud)
 {
 	unsigned long pfn = pud_pfn(pud);
+	pudval_t protval = pud_val(pud) ^
+		__phys_to_pud_val((phys_addr_t)pfn << PAGE_SHIFT);
+
+	/*
+	 * pgprot_t represents PTE-level attributes. Convert the PUD
+	 * block descriptor type into a PTE page descriptor type.
+	 */
+	pudval_t mask = PUD_TYPE_MASK & ~PTE_VALID;
+	pudval_t val = PTE_TYPE_PAGE & ~PTE_VALID;
 
-	return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud));
+	return __pgprot((protval & ~mask) | val);
 }
 
 static inline void __set_ptes_anysz(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index e5a42b7a0160..2dd99d595f19 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1816,7 +1816,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
 
 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
-	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
+	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), prot);
 
 	/* Only allow permission changes for now */
 	if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
@@ -1830,7 +1830,7 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 
 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 {
-	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
+	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), prot);
 
 	/* Only allow permission changes for now */
 	if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
-- 
2.43.0



^ permalink raw reply related

* [PATCH] ibmvnic: fix krealloc() memory leak
From: Alexander A. Klimov @ 2026-05-26 18:41 UTC (permalink / raw)
  To: Haren Myneni, Rick Lindsley, Nick Child, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Thomas Falcon, Desnes Augusto Nunes do Rosario,
	open list:IBM Power SRIOV Virtual NIC Device Driver,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT), open list
  Cc: Alexander A. Klimov
In-Reply-To: <20260526184105.18962-1-grandmaster@al2klimov.de>

Don't just overwrite the original pointer passed to krealloc()
with its return value without checking the latter:

    MEM = krealloc(MEM, SZ, GFP);

If krealloc() returns NULL, that erases the pointer
to the still allocated memory, hence leaks this memory.
Instead, use a temporary variable, check it's not NULL
and only then assign it to the original pointer:

    TMP = krealloc(MEM, SZ, GFP);
    if (!TMP) return;
    MEM = TMP;

Fixes: 4e6759be28e4 ("ibmvnic: Feature implementation of Vital Product Data (VPD) for the ibmvnic driver")
Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 5a510eed335e..25d1d844ad19 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1761,8 +1761,9 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 	union ibmvnic_crq crq;
 	int len = 0;
 	int rc;
+	unsigned char *buff = adapter->vpd->buff;
 
-	if (adapter->vpd->buff)
+	if (buff)
 		len = adapter->vpd->len;
 
 	mutex_lock(&adapter->fw_lock);
@@ -1788,17 +1789,17 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 	if (!adapter->vpd->len)
 		return -ENODATA;
 
-	if (!adapter->vpd->buff)
-		adapter->vpd->buff = kzalloc(adapter->vpd->len, GFP_KERNEL);
+	if (!buff)
+		buff = kzalloc(adapter->vpd->len, GFP_KERNEL);
 	else if (adapter->vpd->len != len)
-		adapter->vpd->buff =
-			krealloc(adapter->vpd->buff,
-				 adapter->vpd->len, GFP_KERNEL);
+		buff = krealloc(buff,
+				adapter->vpd->len, GFP_KERNEL);
 
-	if (!adapter->vpd->buff) {
+	if (!buff) {
 		dev_err(dev, "Could allocate VPD buffer\n");
 		return -ENOMEM;
 	}
+	adapter->vpd->buff = buff;
 
 	adapter->vpd->dma_addr =
 		dma_map_single(dev, adapter->vpd->buff, adapter->vpd->len,
-- 
2.54.0



^ permalink raw reply related

* Re: [PATCH v5 10/20] dma-direct: make dma_direct_map_phys() honor DMA_ATTR_CC_SHARED
From: Jason Gunthorpe @ 2026-05-26 15:39 UTC (permalink / raw)
  To: Michael Kelley
  Cc: Aneesh Kumar K.V (Arm), iommu@lists.linux.dev,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
	Robin Murphy, Marek Szyprowski, Will Deacon, Marc Zyngier,
	Steven Price, Suzuki K Poulose, Catalin Marinas, Jiri Pirko,
	Mostafa Saleh, Petr Tesarik, Alexey Kardashevskiy, Dan Williams,
	Xu Yilun, linuxppc-dev@lists.ozlabs.org,
	linux-s390@vger.kernel.org, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86@kernel.org, Jiri Pirko
In-Reply-To: <SN6PR02MB41574064D14D4A2734222C51D40B2@SN6PR02MB4157.namprd02.prod.outlook.com>

On Tue, May 26, 2026 at 02:56:57AM +0000, Michael Kelley wrote:

> With this patch removing SWIOTLB_FORCE from four places in
> kernel code, there are no remaining places where it is set.
> The test of SWIOTLB_FORCE could be removed from
> swiotlb_init_remap(), and its definition could be deleted
> from include/linux/swiotlb.h.

That's great! I think it shows this is the right approach!

Jason


^ permalink raw reply

* Re: [PATCH mm-unstable RFC v4 0/7] mm: add huge pfnmap support for remap_pfn_range()
From: Lorenzo Stoakes @ 2026-05-26 15:33 UTC (permalink / raw)
  To: Yin Tirui
  Cc: Andrew Morton, Matthew Wilcox, David Hildenbrand, Juergen Gross,
	Jonathan Cameron, Will Deacon, Catalin Marinas, Peter Xu,
	Luiz Capitulino, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, H . Peter Anvin, Andy Lutomirski, Peter Zijlstra,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy, Liam R . Howlett, Zi Yan, Baolin Wang,
	Nico Pache, Ryan Roberts, Dev Jain, Barry Song, Lance Yang,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Anshuman Khandual, Rohan McLure, Kevin Brodsky, Alistair Popple,
	Andrew Donnellan, Pasha Tatashin, Baoquan He, Thomas Huth,
	Coiby Xu, Dan Williams, Yu-cheng Yu, Lu Baolu, Conor Dooley,
	Rik van Riel, wangkefeng.wang, chenjun102, linux-mm, linux-kernel,
	x86, linux-arm-kernel, linuxppc-dev, linux-pm
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

Allow me to be mildly pedantic (sorry :)

One thing I'd like to highlight here is that remap_pfn_range() is, in the long
run, deprecated.

mmap_prepare callbacks will indicate a PFN remap mmap_action, which will do the
heavy lifting (see [0]).

So perhaps worth referring to 'PFN remapping' or something?

(Since we already have mmap_prepare() in use, it's also kinda inaccurate to
say remap_pfn_range() :)

[0]: https://www.kernel.org/doc/html/next/filesystems/mmap_prepare.html

Sorry, this is pretty nitpicky :)

Cheers, Lorenzo


^ permalink raw reply

* Re: [BUG] sched/cache: "Make LLC id continuous" causes NULL cpumask
From: Chen Yu @ 2026-05-26 14:08 UTC (permalink / raw)
  To: kprateek.nayak
  Cc: srikar, venkat88, maddy, sshegde, riteshh, chleroy, tim.c.chen,
	peterz, linux-kernel, linuxppc-dev, linux-sched, Chen Yu
In-Reply-To: <058664ab-0982-4c13-9d4b-caa2f7616b0f@amd.com>

Hi Prateek,

On Tue, 26 May 2026 11:23:59 +0530, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
> Hello Srikar,
>
> On 5/26/2026 10:28 AM, Srikar Dronamraju wrote:
> > L2 Cache reported here is for SMT8 Core aka CACHE domain.
>
> Apart from the scheduler, nothing in tree currently cares about
> cpu_coregroup_mask() except for drivers/base/arch_topology.c but
> Power doesn't select GENERIC_ARCH_TOPOLOGY.
>
> Why can't Power have an internal mask for MC domain (tl_mc_mask) and
> the scheduler can use cpu_coregroup_mask() for the actual LLc? (The L2
> mask in this case.)
>
> Power anyways adds its own topology via set_sched_topology() so the
> default_topology from kernel/sched/topology.c remains unused.
>
> ...
>
> > Shouldn't cache-aware scheduling be worried about cpuset partitions too?
> > If a cpuset has a subset of the LLC cores, should the scheduler assume it
> > can control the complete LLC?
>
> Well, the scheduling takes care of partitions and the cache aware
> scheduling bits take care of looking at the full system perspective
> for stats aggregation and pointing to a particular LLc.
>
> We don't compare llc_id across cpusets, so keeping one unique llc_id
> per H/W LLC instance is feasible and it enables us to keep llc_id space
> limited for optimizing cache-aware scheduling.
>
> Now if we have threads of same process across partitions, we'll
> still aggregate the utilization numbers across the full LLC but
> the load balancers at individual partitions will make a call on
> the aggregation.
>
> -- 
> Thanks and Regards,
> Prateek
>
>

I suppose what you suggested looks like below:

powerpc/smp: make cpu_coregroup_mask() return the LLC

On pSeries shared LPARs (or when coregroup_enabled is false on
Power9 and earlier) the hemisphere map is not allocated, so
build_sched_domains() dereferences a NULL cpumask and crashes.

The generic scheduler expects cpu_coregroup_mask() to span the LLC.
On powerpc the LLC is the L2. Return cpu_l2_cache_mask() instead of
the hemisphere map. Use a coregroup_map() helper for the in-file
hemisphere users, and a powerpc_tl_mc_mask() wrapper for the MC
sched-domain level.

Fixes: b5ea300a17e3 ("sched/cache: Make LLC id continuous")
Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
 arch/powerpc/kernel/smp.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1040,11 +1040,22 @@ static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_
 }
 #endif
 
+static inline struct cpumask *coregroup_map(int cpu)
+{
+	return per_cpu(cpu_coregroup_map, cpu);
+}
+
 struct cpumask *cpu_coregroup_mask(int cpu)
 {
-	return per_cpu(cpu_coregroup_map, cpu);
+	return cpu_l2_cache_mask(cpu);
+}
+
+static const struct cpumask *
+powerpc_tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return coregroup_map(cpu);
 }
 
 static bool has_coregroup_support(void)
 {
 	if (is_shared_processor())
@@ -1155,7 +1166,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
 	if (has_coregroup_support())
-		cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
+		cpumask_set_cpu(boot_cpuid, coregroup_map(boot_cpuid));
 
 	init_big_cores();
 	if (has_big_cores) {
@@ -1520,8 +1531,8 @@ static void remove_cpu_from_masks(int cpu)
 		set_cpus_unrelated(cpu, i, cpu_core_mask);
 
 	if (has_coregroup_support()) {
-		for_each_cpu(i, cpu_coregroup_mask(cpu))
-			set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
+		for_each_cpu(i, coregroup_map(cpu))
+			set_cpus_unrelated(cpu, i, coregroup_map);
 	}
 }
 #endif
@@ -1553,7 +1564,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
 	if (!*mask) {
 		/* Assume only siblings are part of this CPU's coregroup */
 		for_each_cpu(i, submask_fn(cpu))
-			set_cpus_related(cpu, i, cpu_coregroup_mask);
+			set_cpus_related(cpu, i, coregroup_map);
 
 		return;
 	}
@@ -1561,18 +1572,18 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
 	cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));
 
 	/* Update coregroup mask with all the CPUs that are part of submask */
-	or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
+	or_cpumasks_related(cpu, cpu, submask_fn, coregroup_map);
 
 	/* Skip all CPUs already part of coregroup mask */
-	cpumask_andnot(*mask, *mask, cpu_coregroup_mask(cpu));
+	cpumask_andnot(*mask, *mask, coregroup_map(cpu));
 
 	for_each_cpu(i, *mask) {
 		/* Skip all CPUs not part of this coregroup */
 		if (coregroup_id == cpu_to_coregroup_id(i)) {
-			or_cpumasks_related(cpu, i, submask_fn, cpu_coregroup_mask);
+			or_cpumasks_related(cpu, i, submask_fn, coregroup_map);
 			cpumask_andnot(*mask, *mask, submask_fn(i));
 		} else {
-			cpumask_andnot(*mask, *mask, cpu_coregroup_mask(i));
+			cpumask_andnot(*mask, *mask, coregroup_map(i));
 		}
 	}
 }
@@ -1733,7 +1744,7 @@ static void __init build_sched_topology(void)
 
 	if (has_coregroup_support()) {
 		powerpc_topology[i++] =
-			SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC);
+			SDTL_INIT(powerpc_tl_mc_mask, powerpc_shared_proc_flags, MC);
 	}
 
 	powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG);
-- 
2.43.0

Thanks,
Yu


^ permalink raw reply

* Re: [PATCH v5 10/14] module: Prepare for additional module authentication mechanisms
From: Petr Pavlu @ 2026-05-26 13:14 UTC (permalink / raw)
  To: Thomas Weißschuh
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Nathan Chancellor,
	Nicolas Schier, Arnd Bergmann, Luis Chamberlain, Sami Tolvanen,
	Daniel Gomez, Paul Moore, James Morris, Serge E. Hallyn,
	Jonathan Corbet, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Naveen N Rao, Mimi Zohar, Roberto Sassu,
	Dmitry Kasatkin, Eric Snowberg, Nicolas Schier, Daniel Gomez,
	Aaron Tomlin, Christophe Leroy (CS GROUP), Nicolas Bouchinet,
	Xiu Jianfeng, Martin KaFai Lau, Song Liu, Yonghong Song,
	Jiri Olsa, bpf, Fabian Grünbichler, Arnout Engelen,
	Mattia Rizzolo, kpcyrd, Christian Heusel, Câju Mihai-Drosi,
	Eric Biggers, Sebastian Andrzej Siewior, linux-kbuild,
	linux-kernel, linux-arch, linux-modules, linux-security-module,
	linux-doc, linuxppc-dev, linux-integrity, debian-kernel
In-Reply-To: <20260505-module-hashes-v5-10-e174a5a49fce@weissschuh.net>

On 5/5/26 11:05 AM, Thomas Weißschuh wrote:
> Reorganize the code to make it easier to add the new hash-based module
> authentication.
> 
> Also drop the now unnecessary stub for module_sig_check().
> 
> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>

Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>

-- Petr


^ permalink raw reply

* Re: [PATCH v5 09/14] module: Move signature type check out of mod_check_sig()
From: Petr Pavlu @ 2026-05-26 13:03 UTC (permalink / raw)
  To: Thomas Weißschuh
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Nathan Chancellor,
	Nicolas Schier, Arnd Bergmann, Luis Chamberlain, Sami Tolvanen,
	Daniel Gomez, Paul Moore, James Morris, Serge E. Hallyn,
	Jonathan Corbet, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Naveen N Rao, Mimi Zohar, Roberto Sassu,
	Dmitry Kasatkin, Eric Snowberg, Nicolas Schier, Daniel Gomez,
	Aaron Tomlin, Christophe Leroy (CS GROUP), Nicolas Bouchinet,
	Xiu Jianfeng, Martin KaFai Lau, Song Liu, Yonghong Song,
	Jiri Olsa, bpf, Fabian Grünbichler, Arnout Engelen,
	Mattia Rizzolo, kpcyrd, Christian Heusel, Câju Mihai-Drosi,
	Eric Biggers, Sebastian Andrzej Siewior, linux-kbuild,
	linux-kernel, linux-arch, linux-modules, linux-security-module,
	linux-doc, linuxppc-dev, linux-integrity, debian-kernel
In-Reply-To: <20260505-module-hashes-v5-9-e174a5a49fce@weissschuh.net>

On 5/5/26 11:05 AM, Thomas Weißschuh wrote:
> Additional signature types are about to be added.
> As each caller of mod_check_sig() can have different support for these,
> move the type validation into the callers.
> 
> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>

Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>

-- Petr


^ permalink raw reply

* Re: [PATCH] [net-next] net: dsa: netc: fix enetc dependencies
From: Arnd Bergmann @ 2026-05-26 12:39 UTC (permalink / raw)
  To: Wei Fang, Arnd Bergmann, Claudiu Manoil, Clark Wang,
	Christophe Leroy
  Cc: Andrew Lunn, Vladimir Oltean, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, imx@lists.linux.dev, Netdev,
	linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <DBBPR04MB7500D92831D793CDD1D562E2880B2@DBBPR04MB7500.eurprd04.prod.outlook.com>

On Tue, May 26, 2026, at 13:03, Wei Fang wrote:
>
> Thanks for fixing this issue; I sent a patch last Sunday.
> Link: https://lore.kernel.org/netdev/20260524070310.2429819-1-wei.fang@nxp.com/
>
> I think the solution should simply be to add
> "depends on NET_VENDOR_FREESCALE", right? The changes in
> enetc_mdio.h seem more like improvements to me.

Yes, the added dependency is sufficient. I removed the other
change only because that was intended as a workaround
for the same problem but was incorrect.

      Arnd


^ permalink raw reply

* Re: [PATCH v5 07/14] module: Make module authentication usable without MODULE_SIG
From: kpcyrd @ 2026-05-26 12:27 UTC (permalink / raw)
  To: Thomas Weißschuh, Petr Pavlu
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Nathan Chancellor,
	Nicolas Schier, Arnd Bergmann, Luis Chamberlain, Sami Tolvanen,
	Daniel Gomez, Paul Moore, James Morris, Serge E. Hallyn,
	Jonathan Corbet, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Naveen N Rao, Mimi Zohar, Roberto Sassu,
	Dmitry Kasatkin, Eric Snowberg, Nicolas Schier, Daniel Gomez,
	Aaron Tomlin, Christophe Leroy (CS GROUP), Nicolas Bouchinet,
	Xiu Jianfeng, Martin KaFai Lau, Song Liu, Yonghong Song,
	Jiri Olsa, bpf, Fabian Grünbichler, Arnout Engelen,
	Mattia Rizzolo, Christian Heusel, Câju Mihai-Drosi,
	Eric Biggers, Sebastian Andrzej Siewior, linux-kbuild,
	linux-kernel, linux-arch, linux-modules, linux-security-module,
	linux-doc, linuxppc-dev, linux-integrity, debian-kernel,
	Holger Levsen
In-Reply-To: <4ee3c775-1fbf-45e1-8b77-5f9034f45125@t-8ch.de>

On 5/26/26 1:38 PM, Thomas Weißschuh wrote:
> On 2026-05-26 12:53:22+0200, Petr Pavlu wrote:
>> Should MODULE_SIG_FORCE be renamed to MODULE_AUTH_FORCE, along with
>> renaming the sig_enforce functionality in kernel/module/auth.c to
>> auth_enforce?
> 
> Given that it is a user-visible symbol we'll need to be a bit careful
> not to break existing configurations.
> I'll try to use the new "transitional" kconfig attribute.
A slightly softer worded alternative (yet semantically equivalent) name could be 
MODULE_AUTH_REQUIRE. No strong opinion though, I think MODULE_AUTH_* does make 
sense.

I initially shared the concern about renaming well established config options, 
but the transitional feature does seem to be a good fit for this.

Sincerely,
kpcyrd


^ permalink raw reply

* Re: [PATCH V16 4/7] rust/powerpc: Set min rustc version for powerpc
From: Miguel Ojeda @ 2026-05-26 12:21 UTC (permalink / raw)
  To: Mukesh Kumar Chaurasiya
  Cc: maddy, mpe, npiggin, chleroy, peterz, jpoimboe, jbaron, aliceryhl,
	rostedt, ardb, ojeda, boqun, gary, bjorn3_gh, lossin, a.hindborg,
	tmgross, dakr, nathan, nick.desaulniers+lkml, morbo, justinstitt,
	daniel.almeida, acourbot, fujita.tomonori, gregkh, prafulrai522,
	tamird, kees, lyude, airlied, linuxppc-dev, linux-kernel,
	rust-for-linux, llvm
In-Reply-To: <ahVb4k-2rC4EYZF9@li-1a3e774c-28e4-11b2-a85c-acc9f2883e29.ibm.com>

On Tue, May 26, 2026 at 10:53 AM Mukesh Kumar Chaurasiya
<mkchauras@gmail.com> wrote:
>
> I wanted inline asm to be stable; I was wary of inline asm being
> unstable and potentially messing up the whole system. That's the reason
> I waited for the stable support to get merged before sending out this
> patch series.

Yeah, sometimes unstable may mean there were remaining issues, but it
is also common for features to remain unstable for a long time without
any changes. So, yeah, we should at least try to know what is the
overall status, i.e. the latest changes that landed regarding ppc64
inline asm, e.g. when was the last time a significant bug was fixed
etc.

(And I guess IBM prefers that it works with more versions rather than less?)

> Yeah it may be true. I'll test out the 1.85 rustc and come back with the
> results.

Thanks, that would be great.

> I am not aware of s390x way of approaching support for rust so can't say
> anything about that.

Ah, OK -- both series added the conditional to
`scripts/min-tool-version.sh` and both are from IBM, so I was just
wondering :)

Cheers,
Miguel


^ permalink raw reply

* Re: [PATCH v5 08/14] module: Move authentication logic into dedicated new file
From: Petr Pavlu @ 2026-05-26 11:58 UTC (permalink / raw)
  To: Thomas Weißschuh
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Nathan Chancellor,
	Nicolas Schier, Arnd Bergmann, Luis Chamberlain, Sami Tolvanen,
	Daniel Gomez, Paul Moore, James Morris, Serge E. Hallyn,
	Jonathan Corbet, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Naveen N Rao, Mimi Zohar, Roberto Sassu,
	Dmitry Kasatkin, Eric Snowberg, Nicolas Schier, Daniel Gomez,
	Aaron Tomlin, Christophe Leroy (CS GROUP), Nicolas Bouchinet,
	Xiu Jianfeng, Martin KaFai Lau, Song Liu, Yonghong Song,
	Jiri Olsa, bpf, Fabian Grünbichler, Arnout Engelen,
	Mattia Rizzolo, kpcyrd, Christian Heusel, Câju Mihai-Drosi,
	Eric Biggers, Sebastian Andrzej Siewior, linux-kbuild,
	linux-kernel, linux-arch, linux-modules, linux-security-module,
	linux-doc, linuxppc-dev, linux-integrity, debian-kernel
In-Reply-To: <20260505-module-hashes-v5-8-e174a5a49fce@weissschuh.net>

On 5/5/26 11:05 AM, Thomas Weißschuh wrote:
> The module authentication functionality will also be used by the
> hash-based module authentication. To make it usable even if
> CONFIG_MODULE_SIG is disabled, move it to a new file.
> 
> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
> ---
> [...]
> diff --git a/kernel/module/auth.c b/kernel/module/auth.c
> index 956ac63d9d33..831a13eb0c9b 100644
> --- a/kernel/module/auth.c
> +++ b/kernel/module/auth.c
> @@ -5,10 +5,16 @@
>   * Written by David Howells (dhowells@redhat.com)
>   */
>  
> +#include <linux/errno.h>
>  #include <linux/export.h>
>  #include <linux/module.h>
> +#include <linux/module_signature.h>
>  #include <linux/moduleparam.h>
> +#include <linux/security.h>
> +#include <linux/string.h>
>  #include <linux/types.h>
> +#include <uapi/linux/module.h>
> +#include "internal.h"
>  
>  #undef MODULE_PARAM_PREFIX
>  #define MODULE_PARAM_PREFIX "module."
> @@ -30,3 +36,82 @@ void set_module_sig_enforced(void)
>  {
>  	sig_enforce = true;
>  }
> +
> +static int mod_verify_sig(const void *mod, struct load_info *info)
> +{
> +	struct module_signature ms;
> +	size_t sig_len, modlen = info->len;
> +	int ret;
> +
> +	if (modlen <= sizeof(ms))
> +		return -EBADMSG;
> +
> +	memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
> +
> +	ret = mod_check_sig(&ms, modlen, "module");
> +	if (ret)
> +		return ret;
> +
> +	sig_len = be32_to_cpu(ms.sig_len);
> +	modlen -= sig_len + sizeof(ms);
> +	info->len = modlen;
> +
> +	return module_sig_check(mod, modlen, mod + modlen, sig_len);
> +}
> +
> +int module_auth_check(struct load_info *info, int flags)
> +{
> +	int err = -ENODATA;
> +	const unsigned long markerlen = sizeof(MODULE_SIGNATURE_MARKER) - 1;
> +	const char *reason;
> +	const void *mod = info->hdr;
> +	bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
> +				       MODULE_INIT_IGNORE_VERMAGIC);
> +	/*
> +	 * Do not allow mangled modules as a module with version information
> +	 * removed is no longer the module that was signed.
> +	 */
> +	if (!mangled_module &&
> +	    info->len > markerlen &&
> +	    memcmp(mod + info->len - markerlen, MODULE_SIGNATURE_MARKER, markerlen) == 0) {
> +		/* We truncate the module to discard the signature */
> +		info->len -= markerlen;
> +		err = mod_verify_sig(mod, info);
> +		if (!err) {
> +			info->auth_ok = true;
> +			return 0;
> +		}
> +	}
> +
> +	/*
> +	 * We don't permit modules to be loaded into the trusted kernels
> +	 * without a valid signature on them, but if we're not enforcing,
> +	 * certain errors are non-fatal.
> +	 */
> +	switch (err) {
> +	case -ENODATA:
> +		reason = "unsigned module";
> +		break;
> +	case -ENOPKG:
> +		reason = "module with unsupported crypto";
> +		break;
> +	case -ENOKEY:
> +		reason = "module with unavailable key";
> +		break;
> +
> +	default:
> +		/*
> +		 * All other errors are fatal, including lack of memory,
> +		 * unparseable signatures, and signature check failures --
> +		 * even if signatures aren't required.
> +		 */
> +		return err;
> +	}
> +
> +	if (is_module_sig_enforced()) {
> +		pr_notice("Loading of %s is rejected\n", reason);
> +		return -EKEYREJECTED;
> +	}
> +
> +	return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
> +}

The resulting call chain of the module authentication/signature
functions is as follows:

ima_read_modsig() -----------------------------,
                                               v
module_auth_check() -> mod_verify_sig() -> mod_check_sig()
                             |
                             |-> module_sig_check()
                             '-> module_hash_check()

I think this logic is quite hard to follow because mod_verify_sig(),
mod_check_sig() and module_sig_check() have very similar names.

The naming of module_auth_check(), module_sig_check() and
module_hash_check() looks good to me, but I would prefer to rename
mod_check_sig() and mod_verify_sig(). Perhaps mod_check_sig() could be
renamed to mod_check_sig_header(), and mod_verify_sig() to
mod_dispatch_auth_check()?

Otherwise, the patch looks ok to me. Feel free to add:

Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>

-- 
Thanks,
Petr


^ permalink raw reply

* Re: [PATCH 2/2] powerpc: export memory encryption helper functions
From: Christophe Leroy (CS GROUP) @ 2026-05-26 11:53 UTC (permalink / raw)
  To: Arnd Bergmann, Madhavan Srinivasan, Michael Ellerman,
	T.J. Mercier, Maxime Ripard, Sumit Semwal, Andrew Davis,
	Christoph Hellwig
  Cc: Arnd Bergmann, Nicholas Piggin, linuxppc-dev, linux-kernel
In-Reply-To: <20260526102113.2594501-2-arnd@kernel.org>

Hi Arnd,

Le 26/05/2026 à 12:20, Arnd Bergmann a écrit :
> From: Arnd Bergmann <arnd@arndb.de>
> 
> The set_memory_encrypted/set_memory_decrypted functions are exported
> on x86 and arm64 but not on powerpc, which leads to a new build failure
> because they are now used in a loadable module:
> 
> ERROR: modpost: "set_memory_encrypted" [drivers/dma-buf/heaps/system_heap.ko] undefined!
> ERROR: modpost: "set_memory_decrypted" [drivers/dma-buf/heaps/system_heap.ko] undefined!
> 
> Export these the same way we do on the other architectures.

The same fix was rejected already, see 
https://lore.kernel.org/all/ahPqbfH54R3JJyaV@infradead.org/

Christophe

> 
> Fixes: fd55edff8a0a ("dma-buf: heaps: system: Turn the heap into a module")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> ---
>   arch/powerpc/platforms/pseries/svm.c | 3 +++
>   1 file changed, 3 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
> index 384c9dc1899a..ab8f8c722741 100644
> --- a/arch/powerpc/platforms/pseries/svm.c
> +++ b/arch/powerpc/platforms/pseries/svm.c
> @@ -6,6 +6,7 @@
>    * Author: Anshuman Khandual <khandual@linux.vnet.ibm.com>
>    */
>   
> +#include <linux/export.h>
>   #include <linux/mm.h>
>   #include <linux/memblock.h>
>   #include <linux/mem_encrypt.h>
> @@ -50,6 +51,7 @@ int set_memory_encrypted(unsigned long addr, int numpages)
>   
>   	return 0;
>   }
> +EXPORT_SYMBOL_GPL(set_memory_encrypted);
>   
>   int set_memory_decrypted(unsigned long addr, int numpages)
>   {
> @@ -63,6 +65,7 @@ int set_memory_decrypted(unsigned long addr, int numpages)
>   
>   	return 0;
>   }
> +EXPORT_SYMBOL_GPL(set_memory_decrypted);
>   
>   /* There's one dispatch log per CPU. */
>   #define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)



^ permalink raw reply

* Re: [PATCH v5 07/14] module: Make module authentication usable without MODULE_SIG
From: Thomas Weißschuh @ 2026-05-26 11:38 UTC (permalink / raw)
  To: Petr Pavlu
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Nathan Chancellor,
	Nicolas Schier, Arnd Bergmann, Luis Chamberlain, Sami Tolvanen,
	Daniel Gomez, Paul Moore, James Morris, Serge E. Hallyn,
	Jonathan Corbet, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Naveen N Rao, Mimi Zohar, Roberto Sassu,
	Dmitry Kasatkin, Eric Snowberg, Nicolas Schier, Daniel Gomez,
	Aaron Tomlin, Christophe Leroy (CS GROUP), Nicolas Bouchinet,
	Xiu Jianfeng, Martin KaFai Lau, Song Liu, Yonghong Song,
	Jiri Olsa, bpf, Fabian Grünbichler, Arnout Engelen,
	Mattia Rizzolo, kpcyrd, Christian Heusel, Câju Mihai-Drosi,
	Eric Biggers, Sebastian Andrzej Siewior, linux-kbuild,
	linux-kernel, linux-arch, linux-modules, linux-security-module,
	linux-doc, linuxppc-dev, linux-integrity, debian-kernel
In-Reply-To: <0a0736a4-2cdd-49f2-9062-e2f18d769fc0@suse.com>

On 2026-05-26 12:53:22+0200, Petr Pavlu wrote:
> On 5/5/26 11:05 AM, Thomas Weißschuh wrote:
> > The module authentication functionality will also be used by the
> > hash-based module authentication. Split it out from CONFIG_MODULE_SIG
> > so it is usable by both.
> > 
> > Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
> > [...]
> > diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
> > index f535181e0d98..84297da666ff 100644
> > --- a/kernel/module/Kconfig
> > +++ b/kernel/module/Kconfig
> > @@ -271,9 +271,12 @@ config MODULE_SIG
> >  	  debuginfo strip done by some packagers (such as rpmbuild) and
> >  	  inclusion into an initramfs that wants the module size reduced.
> >  
> > +config MODULE_AUTH
> > +	def_bool MODULE_SIG
> > +
> >  config MODULE_SIG_FORCE
> >  	bool "Require modules to be validly signed"
> > -	depends on MODULE_SIG
> > +	depends on MODULE_AUTH
> >  	help
> >  	  Reject unsigned modules or signed modules for which we don't have a
> >  	  key.  Without this, such modules will simply taint the kernel.
> 
> Should MODULE_SIG_FORCE be renamed to MODULE_AUTH_FORCE, along with
> renaming the sig_enforce functionality in kernel/module/auth.c to
> auth_enforce?

Given that it is a user-visible symbol we'll need to be a bit careful
not to break existing configurations.
I'll try to use the new "transitional" kconfig attribute.


Thomas


^ permalink raw reply

* Re: [PATCH v5 06/14] module: Switch load_info::len to size_t
From: Thomas Weißschuh @ 2026-05-26 11:35 UTC (permalink / raw)
  To: Petr Pavlu
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Nathan Chancellor,
	Nicolas Schier, Arnd Bergmann, Luis Chamberlain, Sami Tolvanen,
	Daniel Gomez, Paul Moore, James Morris, Serge E. Hallyn,
	Jonathan Corbet, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Naveen N Rao, Mimi Zohar, Roberto Sassu,
	Dmitry Kasatkin, Eric Snowberg, Nicolas Schier, Daniel Gomez,
	Aaron Tomlin, Christophe Leroy (CS GROUP), Nicolas Bouchinet,
	Xiu Jianfeng, Martin KaFai Lau, Song Liu, Yonghong Song,
	Jiri Olsa, bpf, Fabian Grünbichler, Arnout Engelen,
	Mattia Rizzolo, kpcyrd, Christian Heusel, Câju Mihai-Drosi,
	Eric Biggers, Sebastian Andrzej Siewior, linux-kbuild,
	linux-kernel, linux-arch, linux-modules, linux-security-module,
	linux-doc, linuxppc-dev, linux-integrity, debian-kernel
In-Reply-To: <8de0e6ad-987a-4729-bbd0-8399968dbb48@suse.com>

On 2026-05-26 11:47:09+0200, Petr Pavlu wrote:
> On 5/5/26 11:05 AM, Thomas Weißschuh wrote:
> > Switching the types will make some later changes cleaner.
> 
> Since the updated version drops the patch "module: Deduplicate signature
> extraction", I believe this change is no longer necessary.

Ack.

(...)

Thomas


^ permalink raw reply

* RE: [PATCH] [net-next] net: dsa: netc: fix enetc dependencies
From: Wei Fang @ 2026-05-26 11:03 UTC (permalink / raw)
  To: Arnd Bergmann, Claudiu Manoil, Clark Wang,
	Christophe Leroy (CS GROUP)
  Cc: Arnd Bergmann, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, imx@lists.linux.dev,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <20260526102708.2837129-1-arnd@kernel.org>

> Add the required 'NET_VENDOR_FREESCALE' dependency to make it possible
> to select both the PHY and NTMP library modules. Originally this was
> meant to be done through an 'IS_REACHABLE' check in the header file,
> but that did not work because the drivers/net/ethernet/freescale
> directory is not even entered in this case. IS_REACHABLE() is generally
> counterproductive because even when it works as intended, it turns
> a helpful link-time error into a silent runtime failure that is
> harder to debug. In this case, it clearly did not even do anything.
> 

Hi Arnd,

Thanks for fixing this issue. I sent a patch last Sunday.
Link: https://lore.kernel.org/netdev/20260524070310.2429819-1-wei.fang@nxp.com/

I think the solution should simply be to add
"depends on NET_VENDOR_FREESCALE", right? The changes in
enetc_mdio.h seem more like improvements to me.



^ permalink raw reply

* Re: [PATCH v5 07/14] module: Make module authentication usable without MODULE_SIG
From: Petr Pavlu @ 2026-05-26 10:53 UTC (permalink / raw)
  To: Thomas Weißschuh
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Nathan Chancellor,
	Nicolas Schier, Arnd Bergmann, Luis Chamberlain, Sami Tolvanen,
	Daniel Gomez, Paul Moore, James Morris, Serge E. Hallyn,
	Jonathan Corbet, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Naveen N Rao, Mimi Zohar, Roberto Sassu,
	Dmitry Kasatkin, Eric Snowberg, Nicolas Schier, Daniel Gomez,
	Aaron Tomlin, Christophe Leroy (CS GROUP), Nicolas Bouchinet,
	Xiu Jianfeng, Martin KaFai Lau, Song Liu, Yonghong Song,
	Jiri Olsa, bpf, Fabian Grünbichler, Arnout Engelen,
	Mattia Rizzolo, kpcyrd, Christian Heusel, Câju Mihai-Drosi,
	Eric Biggers, Sebastian Andrzej Siewior, linux-kbuild,
	linux-kernel, linux-arch, linux-modules, linux-security-module,
	linux-doc, linuxppc-dev, linux-integrity, debian-kernel
In-Reply-To: <20260505-module-hashes-v5-7-e174a5a49fce@weissschuh.net>

On 5/5/26 11:05 AM, Thomas Weißschuh wrote:
> The module authentication functionality will also be used by the
> hash-based module authentication. Split it out from CONFIG_MODULE_SIG
> so it is usable by both.
> 
> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
> [...]
> diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
> index f535181e0d98..84297da666ff 100644
> --- a/kernel/module/Kconfig
> +++ b/kernel/module/Kconfig
> @@ -271,9 +271,12 @@ config MODULE_SIG
>  	  debuginfo strip done by some packagers (such as rpmbuild) and
>  	  inclusion into an initramfs that wants the module size reduced.
>  
> +config MODULE_AUTH
> +	def_bool MODULE_SIG
> +
>  config MODULE_SIG_FORCE
>  	bool "Require modules to be validly signed"
> -	depends on MODULE_SIG
> +	depends on MODULE_AUTH
>  	help
>  	  Reject unsigned modules or signed modules for which we don't have a
>  	  key.  Without this, such modules will simply taint the kernel.

Should MODULE_SIG_FORCE be renamed to MODULE_AUTH_FORCE, along with
renaming the sig_enforce functionality in kernel/module/auth.c to
auth_enforce?

-- 
Thanks,
Petr


^ permalink raw reply

* Re: [PATCH 00/15 v4] tick/sched: Refactor idle cputime accounting
From: Frederic Weisbecker @ 2026-05-26 10:42 UTC (permalink / raw)
  To: LKML, Peter Zijlstra, Thomas Gleixner
  Cc: Madhavan Srinivasan, Jan Kiszka, Dietmar Eggemann,
	Shrikanth Hegde, Nicholas Piggin, Alexander Gordeev, Ben Segall,
	Vasily Gorbik, Rafael J. Wysocki, linux-pm, Sashiko, Ingo Molnar,
	Michael Ellerman, Boqun Feng, Valentin Schneider, linuxppc-dev,
	Sven Schnelle, Ingo Molnar, Vincent Guittot,
	Christian Borntraeger, Mel Gorman, Steven Rostedt, Joel Fernandes,
	Paul E . McKenney, Neeraj Upadhyay, Anna-Maria Behnsen,
	Christophe Leroy (CS GROUP), Juri Lelli, Uladzislau Rezki,
	Viresh Kumar, Kieran Bingham, Xin Zhao, linux-s390,
	Heiko Carstens
In-Reply-To: <20260508131647.43868-1-frederic@kernel.org>

Hi,

I don't see any further concern. What should we do with this? It could
either go through the scheduler tree or the timer tree.

Thanks.


Le Fri, May 08, 2026 at 03:16:32PM +0200, Frederic Weisbecker a écrit :
> Hi,
> 
> After the issue reported here:
> 
>         https://lore.kernel.org/all/20251210083135.3993562-1-jackzxcui1989@163.com/
> 
> It turns out that the idle cputime accounting is a big mess that
> accumulates within two concurrent statistics, each having their own
> shortcomings:
> 
> * The accounting for online CPUs which is based on the delta between
>   tick_nohz_start_idle() and tick_nohz_stop_idle().
> 
>   Pros:
>        - Works when the tick is off
> 
>        - Has nsecs granularity
> 
>   Cons:
>        - Accounts idle steal time but doesn't subtract it from idle
>          cputime.
> 
>        - Assumes CONFIG_IRQ_TIME_ACCOUNTING by not accounting IRQs but
>          the IRQ time is simply ignored when
>          CONFIG_IRQ_TIME_ACCOUNTING=n
> 
>        - The windows between 1) idle task scheduling and the first call
>          to tick_nohz_start_idle() and 2) idle task between the last
>          tick_nohz_stop_idle() and the rest of the idle time are
>          blindspots wrt. cputime accounting (though mostly insignificant
>          amount)
> 
>        - Relies on private fields outside of kernel stats, with specific
>          accessors.
> 
> * The accounting for offline CPUs which is based on ticks and the
>   jiffies delta during which the tick was stopped.
> 
>   Pros:
>        - Handles steal time correctly
> 
>        - Handles CONFIG_IRQ_TIME_ACCOUNTING=y and
>          CONFIG_IRQ_TIME_ACCOUNTING=n correctly.
> 
>        - Handles the whole idle task
> 
>        - Accounts directly to kernel stats, without midlayer accumulator.
> 
>    Cons:
>        - Doesn't elapse when the tick is off, which doesn't make it
>          suitable for online CPUs.
> 
>        - Has TICK_NSEC granularity (jiffies)
> 
>        - Needs to track the dyntick-idle ticks that were accounted and
>          subtract them from the total jiffies time spent while the tick
>          was stopped. This is an ugly workaround.
> 
> Having two different accounting for a single context is not the only
> problem: since those accountings are of different natures, it is
> possible to observe the global idle time going backward after a CPU goes
> offline, as reported by Xin Zhao.
> 
> Clean up the situation by introducing a hybrid approach that stays
> coherent, fixes the backward jumps and works for both online and offline
> CPUs:
> 
> * Tick based or native vtime accounting operate before the tick is
>   stopped and resumes once the tick is restarted.
> 
> * When the idle loop starts, switch to dynticks-idle accounting as is
>   done currently, except that the statistics accumulate directly to the
>   relevant kernel stat fields.
> 
> * Private dyntick cputime accounting fields are removed.
> 
> * Works in both the online and offline cases.
> 
> * Move most of the relevant code to the common sched/cputime subsystem
> 
> * Handle CONFIG_IRQ_TIME_ACCOUNTING=n correctly such that the
>   dynticks-idle accounting still elapses while on IRQs.
> 
> * Correctly subtract idle steal cputime from idle time
> 
> Changes since v3 (among which a lot of relevant reviews from Sashiko):
> 
> - Add new tags
> 
> - Rebase on latest -rc1
> 
> - Add "tick/sched: Fix TOCTOU in nohz idle time fetch" (Sashiko)
> 
> - Fix buggy state refetch in kcpustat_cpu_fetch_vtime() (Sashiko)
> 
> - Fix build issue on powerpc (Christophe Leroy)
> 
> - Fix s390 lost steal time occurring on idle IRQs (call vtime_flush() on
>   vtime_account_hardirq() and vtime_account_softirq()) (Sashiko)
> 
> - Fix build issue on s390
> 
> - Fix uninitialized idle_sleeptime_seq (Sashiko)
> 
> - Fix irqtime being disabled or enabled in the middle of an idle IRQ
>   (Sashiko)
>   
> - Fix tick restart and then restop in the same idle loop (Sashiko)
> 
> - Fix "sched/cputime: Handle idle irqtime gracefully" changelog (Sashiko)
> 
> - Fix idle steal time subtracted from the wrong index between idle and
>   iowait kcpustat. (Sashiko)
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
> 	timers/core-v4
> 
> HEAD: e64ba052ce04e363ff76d3cb8bedc5f812188acb
> Thanks,
> 	Frederic
> ---
> 
> Frederic Weisbecker (15):
>       tick/sched: Fix TOCTOU in nohz idle time fetch
>       sched/idle: Handle offlining first in idle loop
>       sched/cputime: Remove superfluous and error prone kcpustat_field() parameter
>       sched/cputime: Correctly support generic vtime idle time
>       powerpc/time: Prepare to stop elapsing in dynticks-idle
>       s390/time: Prepare to stop elapsing in dynticks-idle
>       tick/sched: Unify idle cputime accounting
>       tick/sched: Remove nohz disabled special case in cputime fetch
>       tick/sched: Move dyntick-idle cputime accounting to cputime code
>       tick/sched: Remove unused fields
>       tick/sched: Account tickless idle cputime only when tick is stopped
>       tick/sched: Consolidate idle time fetching APIs
>       sched/cputime: Provide get_cpu_[idle|iowait]_time_us() off-case
>       sched/cputime: Handle idle irqtime gracefully
>       sched/cputime: Handle dyntick-idle steal time correctly
> 
>  arch/powerpc/kernel/time.c         |  41 +++++
>  arch/s390/include/asm/idle.h       |   2 +
>  arch/s390/kernel/idle.c            |   5 +-
>  arch/s390/kernel/vtime.c           |  75 ++++++++-
>  drivers/cpufreq/cpufreq.c          |  29 +---
>  drivers/cpufreq/cpufreq_governor.c |   6 +-
>  drivers/macintosh/rack-meter.c     |   2 +-
>  fs/proc/stat.c                     |  40 +----
>  fs/proc/uptime.c                   |   8 +-
>  include/linux/kernel_stat.h        |  76 +++++++--
>  include/linux/tick.h               |   4 -
>  include/linux/vtime.h              |  22 ++-
>  kernel/rcu/tree.c                  |   9 +-
>  kernel/rcu/tree_stall.h            |   7 +-
>  kernel/sched/core.c                |   6 +-
>  kernel/sched/cputime.c             | 308 +++++++++++++++++++++++++++++++------
>  kernel/sched/idle.c                |  13 +-
>  kernel/time/tick-sched.c           | 212 ++++++-------------------
>  kernel/time/tick-sched.h           |  12 --
>  kernel/time/timer_list.c           |   6 +-
>  scripts/gdb/linux/timerlist.py     |   4 -
>  21 files changed, 529 insertions(+), 358 deletions(-)

-- 
Frederic Weisbecker
SUSE Labs


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox