LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v6 02/15] arm64: mm: Drop redundant pgd_t* argument from map_mem()
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh,
	Kevin Brodsky
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

__map_memblock() and map_mem() always operate on swapper_pg_dir, so
there is no need to pass around a pgd_t pointer between them.

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 25 ++++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 112fa4a3b0eb..aa0e2c6435f7 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1035,11 +1035,11 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 	flush_tlb_kernel_range(virt, virt + size);
 }
 
-static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
-				  phys_addr_t end, pgprot_t prot, int flags)
+static void __init __map_memblock(phys_addr_t start, phys_addr_t end,
+				  pgprot_t prot, int flags)
 {
-	early_create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
-				 prot, early_pgtable_alloc, flags);
+	early_create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
+				 end - start, prot, early_pgtable_alloc, flags);
 }
 
 void __init mark_linear_text_alias_ro(void)
@@ -1087,13 +1087,13 @@ static phys_addr_t __init arm64_kfence_alloc_pool(void)
 	return kfence_pool;
 }
 
-static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
+static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool)
 {
 	if (!kfence_pool)
 		return;
 
 	/* KFENCE pool needs page-level mapping. */
-	__map_memblock(pgdp, kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
+	__map_memblock(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
 			pgprot_tagged(PAGE_KERNEL),
 			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
 	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
@@ -1129,11 +1129,11 @@ bool arch_kfence_init_pool(void)
 #else /* CONFIG_KFENCE */
 
 static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
-static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { }
+static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool) { }
 
 #endif /* CONFIG_KFENCE */
 
-static void __init map_mem(pgd_t *pgdp)
+static void __init map_mem(void)
 {
 	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
 	phys_addr_t kernel_start = __pa_symbol(_text);
@@ -1178,7 +1178,7 @@ static void __init map_mem(pgd_t *pgdp)
 		 * if MTE is present. Otherwise, it has the same attributes as
 		 * PAGE_KERNEL.
 		 */
-		__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
+		__map_memblock(start, end, pgprot_tagged(PAGE_KERNEL),
 			       flags);
 	}
 
@@ -1192,10 +1192,9 @@ static void __init map_mem(pgd_t *pgdp)
 	 * Note that contiguous mappings cannot be remapped in this way,
 	 * so we should avoid them here.
 	 */
-	__map_memblock(pgdp, kernel_start, kernel_end,
-		       PAGE_KERNEL, NO_CONT_MAPPINGS);
+	__map_memblock(kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS);
 	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
-	arm64_kfence_map_pool(early_kfence_pool, pgdp);
+	arm64_kfence_map_pool(early_kfence_pool);
 }
 
 void mark_rodata_ro(void)
@@ -1417,7 +1416,7 @@ static void __init create_idmap(void)
 
 void __init paging_init(void)
 {
-	map_mem(swapper_pg_dir);
+	map_mem();
 
 	memblock_allow_resize();
 
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 01/15] arm64: mm: Remove bogus stop condition from map_mem() loop
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh,
	Kevin Brodsky
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

The memblock API guarantees that start is not greater than or equal to
end, so there is no need to test it. And if it were, it is doubtful that
breaking out of the loop would be a reasonable course of action here
(rather than attempting to map the remaining regions).

So let's drop this check.

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index dd85e093ffdb..112fa4a3b0eb 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1173,8 +1173,6 @@ static void __init map_mem(pgd_t *pgdp)
 
 	/* map all the memory banks */
 	for_each_mem_range(i, &start, &end) {
-		if (start >= end)
-			break;
 		/*
 		 * The linear map must allow allocation tags reading/writing
 		 * if MTE is present. Otherwise, it has the same attributes as
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 03/15] arm64: mm: Check for pud_/pmd_set_huge() failures on kernel mappings
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

Sashiko reports:

| If pmd_set_huge() rejects an unsafe page table transition (such as
| mapping a different physical address over an existing block mapping),
| it returns 0 and leaves the page table entry unmodified.
|
| Because *pmdp remains unmodified, READ_ONCE(pmd_val(*pmdp)) will equal
| pmd_val(old_pmd). The transition from old_pmd to old_pmd is evaluated
| as safe by pgattr_change_is_safe(), so the BUG_ON never triggers.
|
| This allows invalid and unsafe mapping updates to be silently dropped
| instead of panicking, leaving stale memory mappings active while the
| caller assumes the update was successful.

The same applies to pud_set_huge() in alloc_init_pud().

Given how it is generally preferred to limp on rather than blow up the
system if an unexpected condition such as this one occurs, and the fact
that there are no known cases where this disparity results in real
problems, let's WARN on these failures rather than BUG, allowing the
system to survive to the point where it can actually report them.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index aa0e2c6435f7..b2ba5b35c35f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -257,7 +257,7 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		/* try section mapping first */
 		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
 		    (flags & NO_BLOCK_MAPPINGS) == 0) {
-			pmd_set_huge(pmdp, phys, prot);
+			WARN_ON(!pmd_set_huge(pmdp, phys, prot));

 			/*
 			 * After the PMD entry has been populated once, we
@@ -380,7 +380,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 		if (pud_sect_supported() &&
 		   ((addr | next | phys) & ~PUD_MASK) == 0 &&
 		    (flags & NO_BLOCK_MAPPINGS) == 0) {
-			pud_set_huge(pudp, phys, prot);
+			WARN_ON(!pud_set_huge(pudp, phys, prot));

 			/*
 			 * After the PUD entry has been populated once, we
-- 
2.54.0.794.g4f17f83d09-goog

^ permalink raw reply related

* [PATCH v6 04/15] arm64: mm: Preserve existing table mappings when mapping DRAM
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

Instead of blindly overwriting an existing table entry when mapping DRAM
regions, take care not to replace a pre-existing table entry with a
block entry. This permits the logic of mapping the kernel's linear alias
to be simplified in a subsequent patch.

Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b2ba5b35c35f..5c827fa3cd38 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -256,7 +256,8 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 
 		/* try section mapping first */
 		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
-		    (flags & NO_BLOCK_MAPPINGS) == 0) {
+		    (flags & NO_BLOCK_MAPPINGS) == 0 &&
+		    !pmd_table(old_pmd)) {
 			WARN_ON(!pmd_set_huge(pmdp, phys, prot));
 
 			/*
@@ -379,7 +380,8 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 		 */
 		if (pud_sect_supported() &&
 		   ((addr | next | phys) & ~PUD_MASK) == 0 &&
-		    (flags & NO_BLOCK_MAPPINGS) == 0) {
+		    (flags & NO_BLOCK_MAPPINGS) == 0 &&
+		    !pud_table(old_pud)) {
 			WARN_ON(!pud_set_huge(pudp, phys, prot));
 
 			/*
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 00/15] arm64: Unmap linear alias of kernel data/bss
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh

From: Ard Biesheuvel <ardb@kernel.org>

One of the reasons the lack of randomization of the linear map on arm64
is considered problematic is the fact that bootloaders adhering to the
original arm64 boot protocol (i.e., a substantial fraction of all
Android phones) may place the kernel at the base of DRAM, and therefore
at the base of the non-randomized linear map. This puts a writable alias
of the kernel's data and bss regions at a predictable location, removing
the need for an attacker to guess where KASLR mapped the kernel.

Let's unmap this linear, writable alias entirely, so that knowing the
location of the linear alias does not give write access to the kernel's
data and bss regions.

Changes since v5:
- Reorder series in ascending order of impact, so that the first few can
  be merged earlier if desired. This also makes the patch that remaps
  the data/bss linear alias as tagged redundant, which is therefore
  dropped.
- Add patch #3 to address an existing issue spotted by Sashiko
- Fix thinko in contiguous region check (#5), where the whole region
  needs to be considered and not only the first entry (dropped Rb as
  well) - this addresses the kfence issue Sashiko reported on v5 [0]
- Update commit log on #6 to clarify that changing permission bits on
  PTE_CONT entries is safe as long as PTE_CONT itself does not change
- Likewise, drop hunk that adds the PTE_CONT bit to the 'permitted' mask
  in pgattr_change_is_safe(), as changing it is not safe. (#8)
- Move kasan's additional page table to pgdir BSS as well
- Use (NOLOAD) on the .pgdir.bss section so it does not get emitted into
  vmlinux
- Add powerpc and SuperH patches to deal with empty_zero_page[] being
  made const

Changes since v4:
- Update the correct [early] mapping in patch #1
- Make empty_zero_page[] const instead of __ro_after_init
- Drop patches that remap the fixmap page tables r/o for now
- Don't force page mappings for the data/bss linear alias, as it is no
  longer needed for set_memory_valid()
- Add acks

Changes since v3:
- Drop bogus patch adding hierarchical PXN to the fixmap mapping, which
  breaks the KPTI trampoline (thanks to Sashiko)
- Add generic patch to move the empty_zero_page to __ro_after_init, as
  it now lives in generic code.
- Add patches to remap the linear aliases of the fixmap page tables
  read-only too - these live at an a priori known offset in the linear
  map if physical KASLR was omitted, and control a priori known
  addresses in the virtual kernel space.
- Rebase onto v7.1-rc1

Changes since v2:
- Keep bm_pte[] in the region that is remapped r/o or unmapped, as it is
  only manipulated via its kernel alias
- Drop check that prohibits any manipulation of descriptors with the
  CONT bit set
- Add Ryan's ack to a couple of patches
- Rebase onto v7.0-rc4

Changes since v1:
- Put zero page patch at the start of the series
- Tweak __map_memblock() API to respect existing table and contiguous
  mappings, so that the logic to map the kernel alias can be simplified
- Stop abusing the MEMBLOCK_NOMAP flag to initially omit the kernel
  linear alias from the linear map
- Some additional cleanup patches
- Use proper API [set_memory_valid()] to (un)map the linear alias of
  data/bss.

Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Liz Prucka <lizprucka@google.com>
Cc: Seth Jenkins <sethjenkins@google.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jann Horn <jannh@google.com>
Cc: linux-mm@kvack.org
Cc: linux-hardening@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-sh@vger.kernel.org

[0] https://sashiko.dev/#/patchset/20260519151616.2557018-15-ardb%2Bgit%40google.com

Ard Biesheuvel (15):
  arm64: mm: Remove bogus stop condition from map_mem() loop
  arm64: mm: Drop redundant pgd_t* argument from map_mem()
  arm64: mm: Check for pud_/pmd_set_huge() failures on kernel mappings
  arm64: mm: Preserve existing table mappings when mapping DRAM
  arm64: mm: Preserve non-contiguous descriptors when mapping DRAM
  arm64: mm: Permit contiguous descriptors to be manipulated
  arm64: kfence: Avoid NOMAP tricks when mapping the early pool
  arm64: mm: Permit contiguous attribute for preliminary mappings
  arm64: Move fixmap and kasan page tables to end of kernel image
  arm64: mm: Don't abuse memblock NOMAP to check for overlaps
  arm64: mm: Map the kernel data/bss read-only in the linear map
  powerpc/code-patching: Avoid r/w mapping of the zero page
  sh: cast away constness from the zero page when flushing it from the
    cache
  mm: Make empty_zero_page[] const
  arm64: mm: Unmap kernel data/bss entirely from the linear map

 arch/arm64/include/asm/mmu.h     |   2 +
 arch/arm64/include/asm/pgtable.h |   4 +
 arch/arm64/kernel/vmlinux.lds.S  |   8 +-
 arch/arm64/mm/fixmap.c           |   6 +-
 arch/arm64/mm/kasan_init.c       |   2 +-
 arch/arm64/mm/mmu.c              | 153 ++++++++++++--------
 arch/powerpc/lib/code-patching.c |  52 +------
 arch/sh/mm/init.c                |   2 +-
 include/linux/pgtable.h          |   2 +-
 mm/mm_init.c                     |   2 +-
 10 files changed, 111 insertions(+), 122 deletions(-)


base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply

* [PATCH v6 05/15] arm64: mm: Preserve non-contiguous descriptors when mapping DRAM
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

Instead of blindly overwriting existing live entries regardless of the
value of their contiguous bit when mapping DRAM regions at
contiguous-hint granularity, check whether the contiguous region in
question contains any valid descriptors that have the contiguous bit
cleared, and in that case, leave the contiguous bit unset on the entire
region. This permits the logic of mapping the kernel's linear alias to
be simplified in a subsequent patch.

Note that this can only result in a misprogrammed contiguous bit (as per
ARM ARM RNGLXZ) if the region in question already contains a mix of
valid contiguous and valid non-contiguous descriptors, in which case it
was already misprogrammed to begin with.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/include/asm/pgtable.h |  4 ++++
 arch/arm64/mm/mmu.c              | 22 ++++++++++++++++++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 4dfa42b7d053..a1c5894332d9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -181,6 +181,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
  * Returns true if the pte is valid and has the contiguous bit set.
  */
 #define pte_valid_cont(pte)	(pte_valid(pte) && pte_cont(pte))
+/*
+ * Returns true if the pte is valid and has the contiguous bit cleared.
+ */
+#define pte_valid_noncont(pte)	(pte_valid(pte) && !pte_cont(pte))
 /*
  * Could the pte be present in the TLB? We must check mm_tlb_flush_pending
  * so that we don't erroneously return false for pages that have been
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 5c827fa3cd38..6b42d724bd1b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -187,6 +187,14 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
 	} while (ptep++, addr += PAGE_SIZE, addr != end);
 }
 
+static bool pte_range_has_valid_noncont(pte_t *ptep)
+{
+	for (int i = 0; i < CONT_PTES; i++)
+		if (pte_valid_noncont(__ptep_get(&ptep[i])))
+			return true;
+	return false;
+}
+
 static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 			       unsigned long end, phys_addr_t phys,
 			       pgprot_t prot,
@@ -224,7 +232,8 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 
 		/* use a contiguous mapping if the range is suitably aligned */
 		if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
-		    (flags & NO_CONT_MAPPINGS) == 0)
+		    (flags & NO_CONT_MAPPINGS) == 0 &&
+		    !pte_range_has_valid_noncont(ptep))
 			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 
 		init_pte(ptep, addr, next, phys, __prot);
@@ -283,6 +292,14 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	return 0;
 }
 
+static bool pmd_range_has_valid_noncont(pmd_t *pmdp)
+{
+	for (int i = 0; i < CONT_PMDS; i++)
+		if (pte_valid_noncont(pmd_pte(READ_ONCE(pmdp[i]))))
+			return true;
+	return false;
+}
+
 static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 			       unsigned long end, phys_addr_t phys,
 			       pgprot_t prot,
@@ -324,7 +341,8 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 
 		/* use a contiguous mapping if the range is suitably aligned */
 		if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
-		    (flags & NO_CONT_MAPPINGS) == 0)
+		    (flags & NO_CONT_MAPPINGS) == 0 &&
+		    !pmd_range_has_valid_noncont(pmdp))
 			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 
 		ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 06/15] arm64: mm: Permit contiguous descriptors to be manipulated
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

Currently, pgattr_change_is_safe() is overly pedantic when it comes to
descriptors with the contiguous hint attribute set, as it rejects
assignments even if the old and the new value are the same.

In fact, as per ARM ARM RJQQTC, manipulating descriptors with the
contiguous bit set is safe as long as the bit itself does not change
value, in the sense that no TLB conflict aborts or other exceptions may
be raised as a result. Inconsistent permission attributes within the
contiguous region may result in any one of the alternatives being applied
to the entire region, which might be a programming error, but it
does not constitute an unsafe manipulation in terms of what
pgattr_change_is_safe() is intended to detect.

So drop the special PTE_CONT check, but still omit PTE_CONT from 'mask'
so that modifying the bit is still regarded as unsafe.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 6b42d724bd1b..d7a6991e1844 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -134,10 +134,6 @@ bool pgattr_change_is_safe(pteval_t old, pteval_t new)
 	if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
 		return false;

-	/* live contiguous mappings may not be manipulated at all */
-	if ((old | new) & PTE_CONT)
-		return false;
-
 	/* Transitioning from Non-Global to Global is unsafe */
 	if (old & ~new & PTE_NG)
 		return false;
-- 
2.54.0.794.g4f17f83d09-goog

^ permalink raw reply related

* [PATCH v6 07/15] arm64: kfence: Avoid NOMAP tricks when mapping the early pool
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

Now that the map_mem() routines respect existing page mappings and
contiguous granule sized blocks with the contiguous bit cleared, there
is no longer a reason to play tricks with the memblock NOMAP attribute.

Instead, the kfence pool can be allocated and mapped with page
granularity first, and this granularity will be respected when the rest
of DRAM is mapped later, even if block and contiguous mappings are
allowed for the remainder of those mappings.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 25 ++++----------------
 1 file changed, 5 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d7a6991e1844..55bb40348a47 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1083,36 +1083,24 @@ static int __init parse_kfence_early_init(char *arg)
 }
 early_param("kfence.sample_interval", parse_kfence_early_init);
 
-static phys_addr_t __init arm64_kfence_alloc_pool(void)
+static void __init arm64_kfence_map_pool(void)
 {
 	phys_addr_t kfence_pool;
 
 	if (!kfence_early_init)
-		return 0;
+		return;
 
 	kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
 	if (!kfence_pool) {
 		pr_err("failed to allocate kfence pool\n");
 		kfence_early_init = false;
-		return 0;
-	}
-
-	/* Temporarily mark as NOMAP. */
-	memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
-
-	return kfence_pool;
-}
-
-static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool)
-{
-	if (!kfence_pool)
 		return;
+	}
 
 	/* KFENCE pool needs page-level mapping. */
 	__map_memblock(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
 			pgprot_tagged(PAGE_KERNEL),
 			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
-	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
 	__kfence_pool = phys_to_virt(kfence_pool);
 }
 
@@ -1144,8 +1132,7 @@ bool arch_kfence_init_pool(void)
 }
 #else /* CONFIG_KFENCE */
 
-static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
-static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool) { }
+static inline void arm64_kfence_map_pool(void) { }
 
 #endif /* CONFIG_KFENCE */
 
@@ -1155,7 +1142,6 @@ static void __init map_mem(void)
 	phys_addr_t kernel_start = __pa_symbol(_text);
 	phys_addr_t kernel_end = __pa_symbol(__init_begin);
 	phys_addr_t start, end;
-	phys_addr_t early_kfence_pool;
 	int flags = NO_EXEC_MAPPINGS;
 	u64 i;
 
@@ -1172,7 +1158,7 @@ static void __init map_mem(void)
 	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) &&
 		     pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1);
 
-	early_kfence_pool = arm64_kfence_alloc_pool();
+	arm64_kfence_map_pool();
 
 	linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map();
 
@@ -1210,7 +1196,6 @@ static void __init map_mem(void)
 	 */
 	__map_memblock(kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS);
 	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
-	arm64_kfence_map_pool(early_kfence_pool);
 }
 
 void mark_rodata_ro(void)
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 08/15] arm64: mm: Permit contiguous attribute for preliminary mappings
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

There are a few cases where we omit the contiguous hint for mappings
that start out as read-write and are remapped read-only later, on the
basis that manipulating live descriptors with the PTE_CONT attribute set
is unsafe. When support for the contiguous hint was added to the code,
the ARM ARM was ambiguous about this, and so we erred on the side of
caution.

In the meantime, this has been clarified [0], and regions that will be
remapped in their entirety, retaining the contiguous bit on all entries,
can use the contiguous hint both in the initial mapping as well as the
one that replaces it. Note that this requires that the logic that may be
called to remap overlapping regions respects existing valid descriptors
that have the contiguous bit cleared.

So omit the NO_CONT_MAPPINGS flag in places where it is unneeded.

Thanks to Ryan for the reference.

[0] RJQQTC

For a TLB lookup in a contiguous region mapped by translation table entries that
have consistent values for the Contiguous bit, but have the OA, attributes, or
permissions misprogrammed, that TLB lookup is permitted to produce an OA, access
permissions, and memory attributes that are consistent with any one of the
programmed translation table values.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 55bb40348a47..04cc579c7a15 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1016,8 +1016,7 @@ void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
 			&phys, virt);
 		return;
 	}
-	early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
-				 NO_CONT_MAPPINGS);
+	early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0);
 }

 void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
@@ -1044,8 +1043,7 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 		return;
 	}

-	early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
-				 NO_CONT_MAPPINGS);
+	early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0);

 	/* flush the TLBs after updating live kernel mappings */
 	flush_tlb_kernel_range(virt, virt + size);
@@ -1191,10 +1189,8 @@ static void __init map_mem(void)
 	 * alternative patching has completed). This makes the contents
 	 * of the region accessible to subsystems such as hibernate,
 	 * but protects it from inadvertent modification or execution.
-	 * Note that contiguous mappings cannot be remapped in this way,
-	 * so we should avoid them here.
 	 */
-	__map_memblock(kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS);
+	__map_memblock(kernel_start, kernel_end, PAGE_KERNEL, 0);
 	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 }

-- 
2.54.0.794.g4f17f83d09-goog

^ permalink raw reply related

* [PATCH v6 09/15] arm64: Move fixmap and kasan page tables to end of kernel image
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh,
	Kevin Brodsky
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

Move the fixmap and kasan page tables out of the BSS section, and place
them at the end of the image, right before the init_pg_dir section where
some of the other statically allocated page tables live.

These page tables are currently the only data objects in vmlinux that
are meant to be accessed via the kernel image's linear alias, and so
placing them together allows the remainder of the data/bss section to be
remapped read-only or unmapped entirely.

Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/include/asm/mmu.h    | 2 ++
 arch/arm64/kernel/vmlinux.lds.S | 8 +++++++-
 arch/arm64/mm/fixmap.c          | 6 +++---
 arch/arm64/mm/kasan_init.c      | 2 +-
 4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 5e1211c540ab..fb95754f2876 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -13,6 +13,8 @@
 
 #ifndef __ASSEMBLER__
 
+#define __pgtbl_bss __section(".pgdir.bss") __aligned(PAGE_SIZE)
+
 #include <linux/refcount.h>
 #include <asm/cpufeature.h>
 
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index e1ac876200a3..2b0ebfb30c63 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -349,9 +349,15 @@ SECTIONS
 	_edata = .;
 
 	/* start of zero-init region */
-	BSS_SECTION(SBSS_ALIGN, 0, 0)
+	BSS_SECTION(SBSS_ALIGN, 0, PAGE_SIZE)
 	__pi___bss_start = __bss_start;
 
+	/* fixmap BSS starts here - preceding data/BSS is omitted from the linear map */
+	.pgdir.bss (NOLOAD) : ALIGN(PAGE_SIZE) {
+		*(.pgdir.bss)
+	}
+	ASSERT(ADDR(.pgdir.bss) == __bss_stop, ".pgdir.bss must follow BSS")
+
 	. = ALIGN(PAGE_SIZE);
 	__pi_init_pg_dir = .;
 	. += INIT_DIR_SIZE;
diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c
index c5c5425791da..1a3bbd67dd76 100644
--- a/arch/arm64/mm/fixmap.c
+++ b/arch/arm64/mm/fixmap.c
@@ -31,9 +31,9 @@ static_assert(NR_BM_PMD_TABLES == 1);
 
 #define BM_PTE_TABLE_IDX(addr)	__BM_TABLE_IDX(addr, PMD_SHIFT)
 
-static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __page_aligned_bss;
-static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
-static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
+static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __pgtbl_bss;
+static pmd_t bm_pmd[PTRS_PER_PMD] __pgtbl_bss __maybe_unused;
+static pud_t bm_pud[PTRS_PER_PUD] __pgtbl_bss __maybe_unused;
 
 static inline pte_t *fixmap_pte(unsigned long addr)
 {
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index abeb81bf6ebd..dbf22cae82ee 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -214,7 +214,7 @@ asmlinkage void __init kasan_early_init(void)
 		 * shadow pud_t[]/p4d_t[], which could end up getting corrupted
 		 * when the linear region is mapped.
 		 */
-		static pte_t tbl[PTRS_PER_PTE] __page_aligned_bss;
+		static pte_t tbl[PTRS_PER_PTE] __pgtbl_bss;
 		pgd_t *pgdp = pgd_offset_k(KASAN_SHADOW_START);
 
 		set_pgd(pgdp, __pgd(__pa_symbol(tbl) | PGD_TYPE_TABLE));
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 10/15] arm64: mm: Don't abuse memblock NOMAP to check for overlaps
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

Now that the linear region mapping routines respect existing table
mappings and contiguous block and page mappings, it is no longer
necessary to fiddle with the memblock tables to set and clear the NOMAP
attribute in order to omit text and rodata when creating the linear map.

Instead, map the kernel text and rodata alias first with the desired
initial attributes and granularity, so that the loop iterating over the
memblocks will not remap it in a manner that prevents it from being
remapped with updated attributes later.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 23 ++++++--------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 04cc579c7a15..b20c76b8381d 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1164,12 +1164,14 @@ static void __init map_mem(void)
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	/*
-	 * Take care not to create a writable alias for the
-	 * read-only text and rodata sections of the kernel image.
-	 * So temporarily mark them as NOMAP to skip mappings in
-	 * the following for-loop
+	 * Map the linear alias of the [_text, __init_begin) interval
+	 * as non-executable now, and remove the write permission in
+	 * mark_linear_text_alias_ro() above (which will be called after
+	 * alternative patching has completed). This makes the contents
+	 * of the region accessible to subsystems such as hibernate,
+	 * but protects it from inadvertent modification or execution.
 	 */
-	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
+	__map_memblock(kernel_start, kernel_end, PAGE_KERNEL, flags);
 
 	/* map all the memory banks */
 	for_each_mem_range(i, &start, &end) {
@@ -1181,17 +1183,6 @@ static void __init map_mem(void)
 		__map_memblock(start, end, pgprot_tagged(PAGE_KERNEL),
 			       flags);
 	}
-
-	/*
-	 * Map the linear alias of the [_text, __init_begin) interval
-	 * as non-executable now, and remove the write permission in
-	 * mark_linear_text_alias_ro() below (which will be called after
-	 * alternative patching has completed). This makes the contents
-	 * of the region accessible to subsystems such as hibernate,
-	 * but protects it from inadvertent modification or execution.
-	 */
-	__map_memblock(kernel_start, kernel_end, PAGE_KERNEL, 0);
-	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 }
 
 void mark_rodata_ro(void)
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 11/15] arm64: mm: Map the kernel data/bss read-only in the linear map
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

On systems where the bootloader adheres to the original arm64 boot
protocol, the placement of the kernel in the physical address space is
highly predictable, and this makes the placement of its linear alias in
the kernel virtual address space equally predictable, given the lack of
randomization of the linear map.

The linear aliases of the kernel text and rodata regions are already
mapped read-only, but the kernel data and bss are mapped read-write in
this region. This is not needed, so map them read-only as well.

Note that the statically allocated kernel page tables do need to be
modifiable via the linear map, so leave these mapped read-write.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b20c76b8381d..e7ca53d20b87 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1138,7 +1138,9 @@ static void __init map_mem(void)
 {
 	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
 	phys_addr_t kernel_start = __pa_symbol(_text);
-	phys_addr_t kernel_end = __pa_symbol(__init_begin);
+	phys_addr_t init_begin = __pa_symbol(__init_begin);
+	phys_addr_t init_end = __pa_symbol(__init_end);
+	phys_addr_t kernel_end = __pa_symbol(__bss_stop);
 	phys_addr_t start, end;
 	int flags = NO_EXEC_MAPPINGS;
 	u64 i;
@@ -1171,7 +1173,11 @@ static void __init map_mem(void)
 	 * of the region accessible to subsystems such as hibernate,
 	 * but protects it from inadvertent modification or execution.
 	 */
-	__map_memblock(kernel_start, kernel_end, PAGE_KERNEL, flags);
+	__map_memblock(kernel_start, init_begin, PAGE_KERNEL, flags);
+
+	/* Map the kernel data/bss so it can be remapped later */
+	__map_memblock(init_end, kernel_end, pgprot_tagged(PAGE_KERNEL),
+		       flags);
 
 	/* map all the memory banks */
 	for_each_mem_range(i, &start, &end) {
@@ -1183,6 +1189,11 @@ static void __init map_mem(void)
 		__map_memblock(start, end, pgprot_tagged(PAGE_KERNEL),
 			       flags);
 	}
+
+	/* Map the kernel data/bss read-only in the linear map */
+	__map_memblock(init_end, kernel_end, PAGE_KERNEL_RO, flags);
+	flush_tlb_kernel_range((unsigned long)lm_alias(__init_end),
+			       (unsigned long)lm_alias(__bss_stop));
 }
 
 void mark_rodata_ro(void)
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 12/15] powerpc/code-patching: Avoid r/w mapping of the zero page
From: Ard Biesheuvel @ 2026-05-26 17:58 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy (CS GROUP)
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

The only remaining use of map_patch_area() is mapping the zero page, and
immediately unmapping it again so that the intermediate page table
levels are all guaranteed to be populated.

The use of the zero page here is completely arbitrary, and not harmful
per se, but currently, it creates a writable mapping, and does so in a
manner that requires that the empty_zero_page[] symbol is not
const-qualified.

Given that this is about to change, and that map_patch_area() now never
maps anything other than the zero page, let's simplify the code and
- remove the helpers and call [un]map_kernel_page() directly
- take the PA of empty_zero_page directly
- create a read-only temporary mapping.

This allows empty_zero_page[] to be repainted as const u8[] in a
subsequent patch, without making substantial changes to this code
patching logic.

Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: "Christophe Leroy (CS GROUP)" <chleroy@kernel.org>
Link: https://lore.kernel.org/all/20260520085423.485402-1-ardb@kernel.org/
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/powerpc/lib/code-patching.c | 52 +-------------------
 1 file changed, 2 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index f84e0337cc02..44ff9f684bef 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -60,9 +60,6 @@ struct patch_context {
 
 static DEFINE_PER_CPU(struct patch_context, cpu_patching_context);
 
-static int map_patch_area(void *addr, unsigned long text_poke_addr);
-static void unmap_patch_area(unsigned long addr);
-
 static bool mm_patch_enabled(void)
 {
 	return IS_ENABLED(CONFIG_SMP) && radix_enabled();
@@ -117,11 +114,11 @@ static int text_area_cpu_up(unsigned int cpu)
 
 	// Map/unmap the area to ensure all page tables are pre-allocated
 	addr = (unsigned long)area->addr;
-	err = map_patch_area(empty_zero_page, addr);
+	err = map_kernel_page(addr, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
 	if (err)
 		return err;
 
-	unmap_patch_area(addr);
+	unmap_kernel_page(addr);
 
 	this_cpu_write(cpu_patching_context.area, area);
 	this_cpu_write(cpu_patching_context.addr, addr);
@@ -233,51 +230,6 @@ static unsigned long get_patch_pfn(void *addr)
 		return __pa_symbol(addr) >> PAGE_SHIFT;
 }
 
-/*
- * This can be called for kernel text or a module.
- */
-static int map_patch_area(void *addr, unsigned long text_poke_addr)
-{
-	unsigned long pfn = get_patch_pfn(addr);
-
-	return map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
-}
-
-static void unmap_patch_area(unsigned long addr)
-{
-	pte_t *ptep;
-	pmd_t *pmdp;
-	pud_t *pudp;
-	p4d_t *p4dp;
-	pgd_t *pgdp;
-
-	pgdp = pgd_offset_k(addr);
-	if (WARN_ON(pgd_none(*pgdp)))
-		return;
-
-	p4dp = p4d_offset(pgdp, addr);
-	if (WARN_ON(p4d_none(*p4dp)))
-		return;
-
-	pudp = pud_offset(p4dp, addr);
-	if (WARN_ON(pud_none(*pudp)))
-		return;
-
-	pmdp = pmd_offset(pudp, addr);
-	if (WARN_ON(pmd_none(*pmdp)))
-		return;
-
-	ptep = pte_offset_kernel(pmdp, addr);
-	if (WARN_ON(pte_none(*ptep)))
-		return;
-
-	/*
-	 * In hash, pte_clear flushes the tlb, in radix, we have to
-	 */
-	pte_clear(&init_mm, addr, ptep);
-	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
-}
-
 static int __do_patch_mem_mm(void *addr, unsigned long val, bool is_dword)
 {
 	int err;
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 13/15] sh: cast away constness from the zero page when flushing it from the cache
From: Ard Biesheuvel @ 2026-05-26 17:59 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh,
	Yoshinori Sato, Rich Felker, John Paul Adrian Glaubitz
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

SH performs cache maintenance on the zero page during boot, presumably
to ensure that any clearing of BSS that has occurred at startup is
visible to other CPUs and DMA devices.

The __flush_wback_region() function takes a void* argument, which is
conceptually sound, but given that empty_zero_page[] must never be
modified, it is being repainted as const, making it incompatible with a
void* formal parameter.

Given the above, and the fact that __flush_wback_region() is in fact a
function pointer variable with multiple implementations, take the easy
way out, and cast away the constness in this particular invocation.

Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/sh/mm/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 4e40d5e96be9..acbb481cdbfe 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -332,7 +332,7 @@ void __init mem_init(void)
 	cpu_cache_init();

 	/* clear the zero-page */
-	__flush_wback_region(empty_zero_page, PAGE_SIZE);
+	__flush_wback_region((void *)empty_zero_page, PAGE_SIZE);

 	vsyscall_init();

-- 
2.54.0.794.g4f17f83d09-goog

^ permalink raw reply related

* [PATCH v6 14/15] mm: Make empty_zero_page[] const
From: Ard Biesheuvel @ 2026-05-26 17:59 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh,
	Kevin Brodsky, Feng Tang
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

The empty zero page is used to back any kernel or user space mapping
that is supposed to remain cleared, and so the page itself is never
supposed to be modified.

So mark it as const, which moves it into .rodata rather than .bss: on
most architectures, this ensures that both the kernel's mapping of it
and any aliases that are accessible via the kernel direct (linear) map
are mapped read-only, and cannot be used (inadvertently or maliciously)
to corrupt the contents of the zero page.

Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Reviewed-by: Jann Horn <jannh@google.com>
Reviewed-by: Feng Tang <feng.tang@linux.alibaba.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/pgtable.h | 2 +-
 mm/mm_init.c            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cdd68ed3ae1a..67aa23814010 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1993,7 +1993,7 @@ static inline unsigned long zero_pfn(unsigned long addr)
 	return zero_page_pfn;
 }
 
-extern uint8_t empty_zero_page[PAGE_SIZE];
+extern const uint8_t empty_zero_page[PAGE_SIZE];
 extern struct page *__zero_page;
 
 static inline struct page *_zero_page(unsigned long addr)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f9f8e1af921c..46cf001238c5 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -57,7 +57,7 @@ unsigned long zero_page_pfn __ro_after_init;
 EXPORT_SYMBOL(zero_page_pfn);
 
 #ifndef __HAVE_COLOR_ZERO_PAGE
-uint8_t empty_zero_page[PAGE_SIZE] __page_aligned_bss;
+const uint8_t empty_zero_page[PAGE_SIZE] __aligned(PAGE_SIZE);
 EXPORT_SYMBOL(empty_zero_page);
 
 struct page *__zero_page __ro_after_init;
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v6 15/15] arm64: mm: Unmap kernel data/bss entirely from the linear map
From: Ard Biesheuvel @ 2026-05-26 17:59 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, will, catalin.marinas, mark.rutland, Ard Biesheuvel,
	Ryan Roberts, Anshuman Khandual, Liz Prucka, Seth Jenkins,
	Kees Cook, Mike Rapoport, David Hildenbrand, Andrew Morton,
	Jann Horn, linux-mm, linux-hardening, linuxppc-dev, linux-sh
In-Reply-To: <20260526175846.2694125-17-ardb+git@google.com>

From: Ard Biesheuvel <ardb@kernel.org>

The linear aliases of the kernel text and rodata are mapped read-only in
the linear map as well. Given that the contents of these regions are
mostly identical to the version in the loadable image, mapping them
read-only and leaving their contents visible is a reasonable hardening
measure.

Data and bss, however, are now also mapped read-only but the contents of
these regions are more likely to contain data that we'd rather not leak.
So let's unmap these entirely in the linear map when the kernel is
running normally.

When going into hibernation or waking up from it, these regions need to
be mapped, so map the region initially, and toggle the valid bit to
map/unmap the region as needed. (While the hibernation snapshot logic
seems able to map inaccessible pages as needed, it currently disregards
non-present pages entirely.)

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/mmu.c | 39 +++++++++++++++++---
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index e7ca53d20b87..cb00e42abbe1 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -24,6 +24,7 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/set_memory.h>
+#include <linux/suspend.h>
 #include <linux/kfence.h>
 #include <linux/pkeys.h>
 #include <linux/mm_inline.h>
@@ -1056,6 +1057,29 @@ static void __init __map_memblock(phys_addr_t start, phys_addr_t end,
 				 end - start, prot, early_pgtable_alloc, flags);
 }
 
+static void remap_linear_data_alias(bool unmap)
+{
+	set_memory_valid((unsigned long)lm_alias(__init_end),
+			 (unsigned long)(__bss_stop - __init_end) / PAGE_SIZE,
+			 !unmap);
+}
+
+static int arm64_hibernate_pm_notify(struct notifier_block *nb,
+				     unsigned long mode, void *unused)
+{
+	switch (mode) {
+	default:
+		break;
+	case PM_POST_HIBERNATION:
+		remap_linear_data_alias(true);
+		break;
+	case PM_HIBERNATION_PREPARE:
+		remap_linear_data_alias(false);
+		break;
+	}
+	return 0;
+}
+
 void __init mark_linear_text_alias_ro(void)
 {
 	/*
@@ -1064,6 +1088,16 @@ void __init mark_linear_text_alias_ro(void)
 	update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
 			    (unsigned long)__init_begin - (unsigned long)_text,
 			    PAGE_KERNEL_RO);
+
+	remap_linear_data_alias(true);
+
+	if (IS_ENABLED(CONFIG_HIBERNATION)) {
+		static struct notifier_block nb = {
+			.notifier_call = arm64_hibernate_pm_notify
+		};
+
+		register_pm_notifier(&nb);
+	}
 }
 
 #ifdef CONFIG_KFENCE
@@ -1189,11 +1223,6 @@ static void __init map_mem(void)
 		__map_memblock(start, end, pgprot_tagged(PAGE_KERNEL),
 			       flags);
 	}
-
-	/* Map the kernel data/bss read-only in the linear map */
-	__map_memblock(init_end, kernel_end, PAGE_KERNEL_RO, flags);
-	flush_tlb_kernel_range((unsigned long)lm_alias(__init_end),
-			       (unsigned long)lm_alias(__bss_stop));
 }
 
 void mark_rodata_ro(void)
-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* Re: [PATCH] ibmvnic: fix krealloc() memory leak
From: Nicolai Buchwitz @ 2026-05-26 20:50 UTC (permalink / raw)
  To: Alexander A. Klimov
  Cc: Haren Myneni, Rick Lindsley, Nick Child, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Thomas Falcon, Desnes Augusto Nunes do Rosario,
	netdev, linuxppc-dev, linux-kernel
In-Reply-To: <20260526184105.18962-6-grandmaster@al2klimov.de>

Hi Alex

Your patch is missing the prefix with the target tree. Please have
a look at [1] for more details on the workflow.

On 26.5.2026 20:41, Alexander A. Klimov wrote:
> Don't just overwrite the original pointer passed to krealloc()
> with its return value without checking latter:
> 
>     MEM = krealloc(MEM, SZ, GFP);
> 
> If krealloc() returns NULL, that erases the pointer
> to the still allocated memory, hence leaks this memory.
> Instead, use a temporary variable, check it's not NULL
> and only then assign it to the original pointer:
> 
>     TMP = krealloc(MEM, SZ, GFP);
>     if (!TMP) return;
>     MEM = TMP;
> 
> Fixes: 4e6759be28e4 ("ibmvnic: Feature implementation of Vital Product 
> Data (VPD) for the ibmvnic driver")
> Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>
> ---
>  drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++-------
>  1 file changed, 8 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
> b/drivers/net/ethernet/ibm/ibmvnic.c
> index 5a510eed335e..25d1d844ad19 100644
> --- a/drivers/net/ethernet/ibm/ibmvnic.c
> +++ b/drivers/net/ethernet/ibm/ibmvnic.c
> @@ -1761,8 +1761,9 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter 
> *adapter)
>  	union ibmvnic_crq crq;
>  	int len = 0;
>  	int rc;
> +	unsigned char *buff = adapter->vpd->buff;

Should be reverse x-mas tree (longest to shortest).

> 
> -	if (adapter->vpd->buff)
> +	if (buff)
>  		len = adapter->vpd->len;
> 
>  	mutex_lock(&adapter->fw_lock);
> @@ -1788,17 +1789,17 @@ static int ibmvnic_get_vpd(struct 
> ibmvnic_adapter *adapter)
>  	if (!adapter->vpd->len)
>  		return -ENODATA;
> 
> -	if (!adapter->vpd->buff)
> -		adapter->vpd->buff = kzalloc(adapter->vpd->len, GFP_KERNEL);
> +	if (!buff)
> +		buff = kzalloc(adapter->vpd->len, GFP_KERNEL);
>  	else if (adapter->vpd->len != len)
> -		adapter->vpd->buff =
> -			krealloc(adapter->vpd->buff,
> -				 adapter->vpd->len, GFP_KERNEL);
> +		buff = krealloc(buff,
> +				adapter->vpd->len, GFP_KERNEL);

Dead branch? The only caller, init_resources(), kzalloc()s a fresh vpd
right before, and resets run release_vpd_data() first, so vpd->buff is
always NULL here and kzalloc() above always wins. The leak can't 
trigger,
which makes the Fixes tag misleading.

> 
> -	if (!adapter->vpd->buff) {
> +	if (!buff) {
>  		dev_err(dev, "Could allocate VPD buffer\n");
>  		return -ENOMEM;
>  	}
> +	adapter->vpd->buff = buff;

If you keep it anyway: on failure the old buffer stays in vpd->buff 
while
vpd->len is already the new size, a mismatch the original avoided by
NULLing. kfree() it (krealloc() does not free on failure) and NULL 
before
-ENOMEM.

> 
>  	adapter->vpd->dma_addr =
>  		dma_map_single(dev, adapter->vpd->buff, adapter->vpd->len,

[1] https://docs.kernel.org/process/maintainer-netdev.html

Thanks,
Nicolai


^ permalink raw reply

* [PATCH] powerpc/kexec_file: use snprintf to simplify setup_kdump_cmdline
From: Thorsten Blum @ 2026-05-27  0:01 UTC (permalink / raw)
  To: Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy (CS GROUP)
  Cc: Thorsten Blum, linuxppc-dev, linux-kernel

Replace the manual string length accounting, memcpy(), and NUL
termination with a single snprintf() call to prepend the elfcorehdr=
address and to detect string truncation at the same time.

Use kmalloc() to avoid unnecessarily zeroing the memory. While at it,
also use "prepending" instead of "appending" in the error message.

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
---
 arch/powerpc/kexec/file_load.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kexec/file_load.c b/arch/powerpc/kexec/file_load.c
index 4284f76cbef5..597998235136 100644
--- a/arch/powerpc/kexec/file_load.c
+++ b/arch/powerpc/kexec/file_load.c
@@ -36,25 +36,19 @@
 char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
 			  unsigned long cmdline_len)
 {
-	int elfcorehdr_strlen;
 	char *cmdline_ptr;
 
-	cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL);
+	cmdline_ptr = kmalloc(COMMAND_LINE_SIZE, GFP_KERNEL);
 	if (!cmdline_ptr)
 		return NULL;
 
-	elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ",
-				    image->elf_load_addr);
-
-	if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) {
-		pr_err("Appending elfcorehdr=<addr> exceeds cmdline size\n");
+	if (snprintf(cmdline_ptr, COMMAND_LINE_SIZE, "elfcorehdr=0x%lx %s",
+		     image->elf_load_addr, cmdline_len ? cmdline : "") >= COMMAND_LINE_SIZE) {
+		pr_err("Prepending elfcorehdr=<addr> exceeds cmdline size\n");
 		kfree(cmdline_ptr);
 		return NULL;
 	}
 
-	memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len);
-	// Ensure it's nul terminated
-	cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0';
 	return cmdline_ptr;
 }
 


^ permalink raw reply related

* Re: [PATCH mm-unstable RFC v4 0/7] mm: add huge pfnmap support for remap_pfn_range()
From: Yin Tirui @ 2026-05-27  2:57 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Andrew Morton, Matthew Wilcox, David Hildenbrand, Juergen Gross,
	Jonathan Cameron, Will Deacon, Catalin Marinas, Peter Xu,
	Luiz Capitulino, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, H . Peter Anvin, Andy Lutomirski, Peter Zijlstra,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy, Liam R . Howlett, Zi Yan, Baolin Wang,
	Nico Pache, Ryan Roberts, Dev Jain, Barry Song, Lance Yang,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Anshuman Khandual, Rohan McLure, Kevin Brodsky, Alistair Popple,
	Andrew Donnellan, Pasha Tatashin, Baoquan He, Thomas Huth,
	Coiby Xu, Dan Williams, Yu-cheng Yu, Lu Baolu, Conor Dooley,
	Rik van Riel, wangkefeng.wang, chenjun102, linux-mm, linux-kernel,
	x86, linux-arm-kernel, linuxppc-dev, linux-pm
In-Reply-To: <ahW7rut_WP17KADW@lucifer>


On 5/26/2026 11:33 PM, Lorenzo Stoakes wrote:
> Allow me to be mildly pedantic (sorry :)
>
> One thing I'd like to highlight here is that remap_pfn_range() is, in the long
> run, deprecated.
>
> mmap_prepare callbacks will indicate a PFN remap mmap_action, which will do the
> heavy lifting (see [0]).

Thanks, I missed that mmap_action_remap() may also reach the same common
PFN remap path.

> So perhaps worth referring to 'PFN remapping' or something?
>
> (Since we already have mmap_prepare() in use, it's also kinda inaccurate to
> say remap_pfn_range() :)

Agreed, describing this as remap_pfn_range() support is indeed
inaccurate. I'll reword it.

>
> [0]: https://www.kernel.org/doc/html/next/filesystems/mmap_prepare.html
>
> Sorry, this is pretty nitpicky :)
Not at all, this is really helpful.
>
> Cheers, Lorenzo
>
-- 
Yin Tirui



^ permalink raw reply

* [PATCH] smp: prevent soft lockup in smp_call_function_many_cond
From: Mark Tomlinson @ 2026-05-27  3:16 UTC (permalink / raw)
  To: Madhavan Srinivasan, Thomas Gleixner
  Cc: linux-kernel, linuxppc-dev, Mark Tomlinson

Using the PowerPC P2040 (e500mc) CPU, soft lockups can occasionally be
seen in smp_call_function_many_cond(). The conclusion is that this CPU
does not process the doorbell interrupt while in a data-storage (MMU)
exception. If more than one CPU in a multi core environment is calling
this function at the same time, it is possible for a deadlock to occur.

The fix for this is to call flush_smp_call_function_queue() before
waiting for responses from other CPUs. If there is something in the
queue, this is a good time to process it before busy-waiting on other
CPUs. On other architectures this call will quickly do nothing, as the
queue will be empty.

Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
---
 kernel/smp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/smp.c b/kernel/smp.c
index a0bb56bd8dda..3c4467654ab0 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -884,6 +884,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 		local_irq_restore(flags);
 	}

+	flush_smp_call_function_queue();
+
 	if (run_remote && wait) {
 		for_each_cpu(cpu, cfd->cpumask) {
 			call_single_data_t *csd;
-- 
2.54.0

^ permalink raw reply related

* Re: [BUG] sched/cache: "Make LLC id continuous" causes NULL cpumask
From: Shrikanth Hegde @ 2026-05-27  7:01 UTC (permalink / raw)
  To: Chen Yu, kprateek.nayak
  Cc: srikar, venkat88, maddy, riteshh, chleroy, tim.c.chen, peterz,
	linux-kernel, linuxppc-dev, linux-sched
In-Reply-To: <20260526140856.139657-1-yu.c.chen@intel.com>

Hi Chen, Prateek.

I got back to work today, sorry for the delay.
I am trying to go through the mails.
Apologies in case I have missed any bits.

On 5/26/26 7:38 PM, Chen Yu wrote:
> Hi Prateek,
> 
> On Tue, 26 May 2026 11:23:59 +0530, K Prateek Nayak <kprateek.nayak@amd.com> wrote:
>> Hello Srikar,
>>
>> On 5/26/2026 10:28 AM, Srikar Dronamraju wrote:
>>> L2 Cache reported here is for SMT8 Core aka CACHE domain.
>>
>> Apart for the scheduler, nothing in tree currently cares about
>> cpu_coregroup_mask() except for drivers/base/arch_topology.c but
>> Power doesn't select GENERIC_ARCH_TOPOLOGY.
>>
>> Why can't Power have an internal mask for MC domain (tl_mc_mask) and
>> the scheduler can use cpu_coregroup_mask() for the actual LLc? (The L2
>> mask in this case.)

This seems wrong. There is no notion that coregroup_mask
(MC domain) has to point at the LLC domain.

For example, on Shared LPAR, there is no MC domain and LLC is at SMT core level.
In that case, having coregroup_mask point at the SMT mask is wrong.

If we need a mask pointing to the LLC that the arch has to return, then we would
need a new API, say cpu_llc_mask, that can point accordingly.

I don't like mixing MC domain and LLC into one bit.

>>
>> Power anyways adds its own topology via set_sched_topology() so the
>> default_topology from kernel/sched/topology.c remains unused.
>>
>> ...
>>
>>> Shouldnt cache-aware scheduling be worried about cpuset partitions too.
>>> If a cpuset has subset of LLC cores, then we should Scheduler assume it can
>>> control complete LLC?
>>
>> Well, the scheduling takes care of partitions and the cache aware
>> scheduling bits take care of looking at the full system perspective
>> for stats aggregation and pointing to a particular LLc.
>>
>> We don't compare llc_id across cpusets so we keeping one unique llc_id
>> per H/W LLC instance is feasible and it enables us to keep llc_id space
>> limited for optimizing cache-aware scheduling.
>>
>> Now if we have threads of same process across partitions, we'll
>> still aggregate the utilization numbers across the full LLC but
>> the load balancers at individual partitions will make a call on
>> the aggregation.
>>
>> -- 
>> Thanks and Regards,
>> Prateek
>>
>>
> 
> I suppose what you suggested looks like below:
> 
> powerpc/smp: make cpu_coregroup_mask() return the LLC
> 
> On pSeries shared LPARs(or coregroup_enabled is false on
> Power9 and earlier) the hemisphere map is not allocated, so
> build_sched_domains() dereferences a NULL cpumask and crashes.
> 
> The generic scheduler expects cpu_coregroup_mask() to span the LLC.
> On powerpc the LLC is the L2. Return cpu_l2_cache_mask() instead of
> the hemisphere map. Use a coregroup_map() helper for the in-file
> hemisphere users, and a powerpc_tl_mc_mask() wrapper for the MC
> sched-domain level.
> 
> Fixes: b5ea300a17e3 ("sched/cache: Make LLC id continuous")
> Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
> Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> ---
>   arch/powerpc/kernel/smp.c | 35 +++++++++++++++++++++++------------
>   1 file changed, 23 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1040,11 +1040,22 @@ static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_
>   }
>   #endif
>   
> +static inline struct cpumask *coregroup_map(int cpu)
> +{
> +	return per_cpu(cpu_coregroup_map, cpu);
> +}
> +
>   struct cpumask *cpu_coregroup_mask(int cpu)
>   {
> -	return per_cpu(cpu_coregroup_map, cpu);
> +	return cpu_l2_cache_mask(cpu);
> +}

This looks wrong to me too. In different hardware topologies
there may be a distinction between coregroup and L2 mask.

Let me go through the code and see if there is better way.

> +
> +static const struct cpumask *
> +powerpc_tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
> +{
> +	return coregroup_map(cpu);
>   }
>   
>   static bool has_coregroup_support(void)
>   {
>   	if (is_shared_processor())
> @@ -1155,7 +1166,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   	cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
>   
>   	if (has_coregroup_support())
> -		cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
> +		cpumask_set_cpu(boot_cpuid, coregroup_map(boot_cpuid));
>   
>   	init_big_cores();
>   	if (has_big_cores) {
> @@ -1520,8 +1531,8 @@ static void remove_cpu_from_masks(int cpu)
>   		set_cpus_unrelated(cpu, i, cpu_core_mask);
>   
>   	if (has_coregroup_support()) {
> -		for_each_cpu(i, cpu_coregroup_mask(cpu))
> -			set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
> +		for_each_cpu(i, coregroup_map(cpu))
> +			set_cpus_unrelated(cpu, i, coregroup_map);
>   	}
>   }
>   #endif
> @@ -1553,7 +1564,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
>   	if (!*mask) {
>   		/* Assume only siblings are part of this CPU's coregroup */
>   		for_each_cpu(i, submask_fn(cpu))
> -			set_cpus_related(cpu, i, cpu_coregroup_mask);
> +			set_cpus_related(cpu, i, coregroup_map);
>   
>   		return;
>   	}
> @@ -1561,18 +1572,18 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask)
>   	cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu));
>   
>   	/* Update coregroup mask with all the CPUs that are part of submask */
> -	or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask);
> +	or_cpumasks_related(cpu, cpu, submask_fn, coregroup_map);
>   
>   	/* Skip all CPUs already part of coregroup mask */
> -	cpumask_andnot(*mask, *mask, cpu_coregroup_mask(cpu));
> +	cpumask_andnot(*mask, *mask, coregroup_map(cpu));
>   
>   	for_each_cpu(i, *mask) {
>   		/* Skip all CPUs not part of this coregroup */
>   		if (coregroup_id == cpu_to_coregroup_id(i)) {
> -			or_cpumasks_related(cpu, i, submask_fn, cpu_coregroup_mask);
> +			or_cpumasks_related(cpu, i, submask_fn, coregroup_map);
>   			cpumask_andnot(*mask, *mask, submask_fn(i));
>   		} else {
> -			cpumask_andnot(*mask, *mask, cpu_coregroup_mask(i));
> +			cpumask_andnot(*mask, *mask, coregroup_map(i));
>   		}
>   	}
>   }
> @@ -1733,7 +1744,7 @@ static void __init build_sched_topology(void)
>   
>   	if (has_coregroup_support()) {
>   		powerpc_topology[i++] =
> -			SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC);

I would prefer not to do this rename. Having tl_mc_mask helps to find the usage across
the codebase.

> +			SDTL_INIT(powerpc_tl_mc_mask, powerpc_shared_proc_flags, MC);
>   	}
>   
>   	powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG);



^ permalink raw reply

* Re: [BUG] sched/cache: "Make LLC id continuous" causes NULL cpumask dereference in build_sched_domains on POWER9
From: Shrikanth Hegde @ 2026-05-27  7:05 UTC (permalink / raw)
  To: Venkat Rao Bagalkote
  Cc: Madhavan Srinivasan, Ritesh Harjani, Christophe Leroy (CS GROUP),
	LKML, linuxppc-dev, linux-sched, tim.c.chen, K Prateek Nayak,
	Peter Zijlstra, Chen, Yu C, Srikar Dronamraju
In-Reply-To: <8675afde-5c7f-4717-be7a-a473bd1af381@linux.ibm.com>



On 5/26/26 10:54 AM, Venkat Rao Bagalkote wrote:
> 
> On 26/05/26 9:38 am, Chen, Yu C wrote:
>> Hi Venkat,
>>
>> On 5/26/2026 11:14 AM, Srikar Dronamraju wrote:
>>> * Chen, Yu C <yu.c.chen@intel.com> [2026-05-25 23:35:45]:
>>>
>>>> Hi Venkat,
>>>>
>>>> On 5/25/2026 10:07 PM, Venkat Rao Bagalkote wrote:
>>>>> Greetings!!!
>>>>>
>>>>> I am seeing an early boot kernel panic due to NULL pointer dereference
>>>>> on a POWER9 (pSeries) system when testing linux-next (next-20260522).
> 
> 
> This issue is seen on P11 as well.
> 
> [    0.006697] smp: Brought up 1 node, 16 CPUs
> [    0.006702] Big cores detected but using small core scheduling
> [    0.006752] BUG: Kernel NULL pointer dereference on read at 0x00000000
> [    0.006755] Faulting instruction address: 0xc000000020adbb6c
> [    0.006759] Oops: Kernel access of bad area, sig: 7 [#1]
> [    0.006762] LE PAGE_SIZE=4K MMU=Radix  SMP NR_CPUS=8192 NUMA pSeries
> [    0.006767] Modules linked in:
> [    0.006772] CPU: 4 UID: 0 PID: 1 Comm: swapper/4 Not tainted 7.1.0- 
> rc5-next-20260525 #1 PREEMPT(lazy)
> [    0.006777] Hardware name: IBM,9080-HEX Power11 (architected) 
> 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries
> [    0.006781] NIP:  c000000020adbb6c LR: c0000000202e5a58 CTR: 
> 0000000000000000
> [    0.006784] REGS: c0000000283d7890 TRAP: 0300   Not tainted (7.1.0- 
> rc5-next-20260525)
> [    0.006788] MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 
> 44002242  XER: 20040003
> [    0.006796] CFAR: c0000000202e5a54 DAR: 0000000000000000 DSISR: 
> 00080000 IRQMASK: 0
> [    0.006796] GPR00: 0000000000000000 c0000000283d7b50 c000000021abf100 
> 0000000000000010
> [    0.006796] GPR04: 0000000000000010 0000000000000030 0000000000000000 
> c000000028365500
> [    0.006796] GPR08: 0000000000000000 c000000022213598 000000003b77d000 
> 0000000000000000
> [    0.006796] GPR12: c00000002005d8f0 c000000000008000 c0000000283cb578 
> c0000000283cb400
> [    0.006796] GPR16: c0000000283c9000 c000000022218b20 c0000000222330e8 
> 00000000ffffffff
> [    0.006796] GPR20: fffffffffffffff6 0000000000000000 c000000022da36e0 
> 0000000000000000
> [    0.006796] GPR24: 0000000000000000 0000000000000000 c0000000283c9178 
> c0000000227b5f00
> [    0.006796] GPR28: c00000002831c1e8 c000000022db5980 0000000000000000 
> 0000000000000000
> [    0.006835] NIP [c000000020adbb6c] _find_first_bit+0xc/0xc0
> [    0.006842] LR [c0000000202e5a58] build_sched_domains+0x7d8/0xb40
> [    0.006847] Call Trace:
> [    0.006849] [c0000000283d7b50] [c0000000202e5408] 
> build_sched_domains+0x188/0xb40 (unreliable)
> [    0.006854] [c0000000283d7c90] [c000000022034380] 
> sched_init_domains+0x118/0x168
> [    0.006860] [c0000000283d7ce0] [c000000022032b14] 
> sched_init_smp+0xa8/0x158
> [    0.006865] [c0000000283d7d30] [c000000022005674] 
> kernel_init_freeable+0x1ac/0x294
> [    0.006870] [c0000000283d7dd0] [c000000020011718] kernel_init+0x2c/0x1c4
> [    0.006874] [c0000000283d7e30] [c00000002000debc] 
> ret_from_kernel_user_thread+0x14/0x1c
> [    0.006878] ---- interrupt: 0 at 0x0
> [    0.006881] Code: eb610038 7fc3f378 eb810040 eba10048 38210060 
> ebc1fff0 ebe1fff8 7c0803a6 4e800020 7c681b78 7c832379 4d820020 
> <e9280000> 38e3ffff 39400000 78e7d7e2
> [    0.006895] ---[ end trace 0000000000000000 ]---
> [    0.006898]
> 
> 
> Regards,
> 
> Venkat.
> 

Venkat,

Was it on P11 on Shared LPAR?


^ permalink raw reply

* [PATCH v2 0/5] powerpc/pseries: Add full RTAS-based error injection support
From: Narayana Murty N @ 2026-05-27  7:24 UTC (permalink / raw)
  To: mahesh, maddy, mpe, christophe.leroy, gregkh, oohall, npiggin
  Cc: linuxppc-dev, linux-kernel, tyreld, vaibhav, sbhat, ganeshgr,
	sourabhjain, haren, nnmlinux, thuth

The series introduces complete support for RTAS-based hardware error
injection on pseries platforms. The implementation replaces the legacy
MMIO-based approach with a full PAPR-compliant workflow built around the
RTAS services:
	ibm,open-errinjct
	ibm,errinjct
	ibm,close-errinjct

The new pseries_eeh_err_inject() interface enables controlled injection
of synthetic PCI, memory, and cache/TLB faults for platform validation,
EEH testing, firmware diagnostics, and tooling (e.g., bpftrace-based
tracing).

Current testing scope:
At this stage, the feature can be triggered only from VFIO-passthrough
devices assigned to a guest, and from userspace-exposed VFIO devices
using the VFIO_EEH_PE_INJECT_ERR ioctl.

Key Highlights

Dynamic acquisition of all required RTAS tokens and correct
open/errinjct/close session handling as defined by PAPR. A 1KB
naturally-aligned, zero-initialized RTAS working buffer is populated
exactly per PAPR buffer definitions.

Support for a wide range of error types:

0x03 - recovered-special-event
0x04 - corrupted-page
0x07 - ioa-bus-error (32-bit)
0x0F - ioa-bus-error-64 (64-bit)
0x09 - corrupted-dcache-start
0x0A - corrupted-dcache-end
0x0B - corrupted-icache-start
0x0C - corrupted-icache-end
0x0D - corrupted-tlb-start
0x0E - corrupted-tlb-end

All RTAS parameters use proper big-endian formatting (cpu_to_be32()).
Robust status-handling, printk-based diagnostics, and thorough
validation for invalid or unsupported conditions.

Error-specific buffer population logic is factored into helpers for
clarity and maintainability.

Fully tested on PowerVM with firmware that supports RTAS error
injection, along with the companion QEMU support posted here:
https://lore.kernel.org/qemu-devel/20260520095446.64206-1-nnmlinux@linux.ibm.com/

Signed-off-by: Narayana Murty N <nnmlinux@linux.ibm.com>
---
Change Log:
v1 -> v2:
 * Addressed all review comments from Sourabh Jain
   - Removed unnecessary empty line in rtas_call()
   - Enhanced comment to explain PAPR specification requirements
   - Corrected misleading comment about output handling
   - Improved else block comment for better code clarity
 * Fixed kernel test robot warnings
   - Fixed kernel-doc warning for __maybe_unused parameter
   - Confirmed sparse warnings are false positives (correct endianness handling)
 * Added PowerNV platform abstraction layer (new Patch 5)
   - Maps EEH error types to OPAL-specific types
   - Simplifies type handling by direct variable update
 * Improved code comments and documentation throughout
 * Added Reported-by tags for kernel test robot findings
 * Split into logical 5-patch series for better review

RFC -> v1: https://lore.kernel.org/all/20251205094510.4671-1-nnmlinux@linux.ibm.com/
 * Initial 4-patch series
 * Fixed PAPR ibm,open-errinjct output format (token,status order)
 * Added pr_fmt handling for EEH subsystem compatibility
 * Implemented comprehensive validation helpers

RFC: https://lore.kernel.org/all/20251107091009.43034-1-nnmlinux@linux.ibm.com/
 * Initial RFC implementation

Narayana Murty N (5):
  powerpc/rtas: Handle special return format for
    RTAS_FN_IBM_OPEN_ERRINJCT
  powerpc/pseries: Add RTAS error injection buffer infrastructure
  powerpc/pseries: Add RTAS error injection validation helpers
  powerpc/pseries: Implement RTAS error injection via
    pseries_eeh_err_inject
  powerpc/powernv: Map EEH error types to OPAL error injection types

 arch/powerpc/include/asm/rtas.h              |  21 +
 arch/powerpc/include/uapi/asm/eeh.h          |  18 +
 arch/powerpc/kernel/rtas.c                   |  59 ++-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  11 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 423 ++++++++++++++++++-
 5 files changed, 505 insertions(+), 27 deletions(-)

-- 
2.54.0



^ permalink raw reply

* [PATCH v2 1/5] powerpc/rtas: Handle special return format for RTAS_FN_IBM_OPEN_ERRINJCT
From: Narayana Murty N @ 2026-05-27  7:24 UTC (permalink / raw)
  To: mahesh, maddy, mpe, christophe.leroy, gregkh, oohall, npiggin
  Cc: linuxppc-dev, linux-kernel, tyreld, vaibhav, sbhat, ganeshgr,
	sourabhjain, haren, nnmlinux, thuth
In-Reply-To: <20260527072433.94510-1-nnmlinux@linux.ibm.com>

RTAS_FN_IBM_OPEN_ERRINJCT returns results in a special format:
  rets[0] = session token (output)
  rets[1] = status code
  rets[2..] = additional outputs (if any)

Unlike standard RTAS calls where:
  rets[0] = status code
  rets[1..] = outputs

This patch adds special handling for OPEN_ERRINJCT to:
1. Check correct status position (rets[1]) for __fetch_rtas_last_error()
2. Copy all rets[0..nret-1] to outputs[] (including token at rets[0])
3. Return status from rets[1] instead of rets[0]

Reference: OpenPOWER PAPR documentation
	   https://files.openpower.foundation/s/XFgfMaqLMD5Bcm8
Signed-off-by: Narayana Murty N <nnmlinux@linux.ibm.com>
---
 arch/powerpc/kernel/rtas.c | 47 ++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 8d81c1e7a8db..a2dd94eed9d0 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1183,7 +1183,7 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...)
 	unsigned long flags;
 	struct rtas_args *args;
 	char *buff_copy = NULL;
-	int ret;
+	int ret = 0;
 
 	if (!rtas.entry || token == RTAS_UNKNOWN_SERVICE)
 		return -1;
@@ -1213,15 +1213,48 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...)
 	va_rtas_call_unlocked(args, token, nargs, nret, list);
 	va_end(list);
 
+	/*
+	 * Special handling for RTAS_FN_IBM_OPEN_ERRINJCT:
+	 * Per PAPR, ibm,open-errinjct has a unique return format:
+	 *   rets[0] = injection session token (output parameter)
+	 *   rets[1] = status code
+	 *
+	 * This differs from standard RTAS calls which return:
+	 *   rets[0] = status code
+	 *   rets[1..] = output parameters
+	 *
+	 * We must extract status from rets[1] (not rets[0]) to correctly
+	 * detect errors and trigger __fetch_rtas_last_error() when status == -1.
+	 */
 	/* A -1 return code indicates that the last command couldn't
-	   be completed due to a hardware error. */
-	if (be32_to_cpu(args->rets[0]) == -1)
+	 * be completed due to a hardware error.
+	 */
+	if (token == rtas_function_token(RTAS_FN_IBM_OPEN_ERRINJCT) && nret > 1)
+		ret = be32_to_cpu(args->rets[1]);
+	else if (nret > 0)
+		ret = be32_to_cpu(args->rets[0]);
+
+	if (ret == -1)
 		buff_copy = __fetch_rtas_last_error(NULL);
 
-	if (nret > 1 && outputs != NULL)
-		for (i = 0; i < nret-1; ++i)
-			outputs[i] = be32_to_cpu(args->rets[i + 1]);
-	ret = (nret > 0) ? be32_to_cpu(args->rets[0]) : 0;
+	/* Copy all return values to caller's outputs buffer if provided */
+	if (nret > 1 && outputs != NULL) {
+		if (token == rtas_function_token(RTAS_FN_IBM_OPEN_ERRINJCT)) {
+			/* Special case: rets[0]=token, rets[1]=status, rets[2..]=outputs */
+			for (i = 0; i < nret; ++i)
+				outputs[i] = be32_to_cpu(args->rets[i]);
+		} else {
+			/* Normal case: rets[0]=status, rets[1..]=outputs */
+			for (i = 0; i < nret - 1; ++i)
+				outputs[i] = be32_to_cpu(args->rets[i + 1]);
+		}
+	} else {
+		/* Either no outputs to copy (nret <= 1) or caller
+		 * didn't provide output buffer ensure ret contains
+		 * the status code for standard RTAS calls.
+		 */
+		ret = (nret > 0) ? be32_to_cpu(args->rets[0]) : 0;
+	}
 
 	lockdep_unpin_lock(&rtas_lock, cookie);
 	raw_spin_unlock_irqrestore(&rtas_lock, flags);
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 2/5] powerpc/pseries: Add RTAS error injection buffer infrastructure
From: Narayana Murty N @ 2026-05-27  7:24 UTC (permalink / raw)
  To: mahesh, maddy, mpe, christophe.leroy, gregkh, oohall, npiggin
  Cc: linuxppc-dev, linux-kernel, tyreld, vaibhav, sbhat, ganeshgr,
	sourabhjain, haren, nnmlinux, thuth
In-Reply-To: <20260527072433.94510-1-nnmlinux@linux.ibm.com>

Adds global infrastructure required by the injection engine:
 - a 1KB aligned RTAS working buffer in rtas.c
 - a spinlock to serialize buffer access
 - UAPI definitions for error-injection tokens (added to eeh.h)

Signed-off-by: Narayana Murty N <nnmlinux@linux.ibm.com>
---
 arch/powerpc/include/asm/rtas.h     | 21 +++++++++++++++++++++
 arch/powerpc/include/uapi/asm/eeh.h | 18 ++++++++++++++++++
 arch/powerpc/kernel/rtas.c          | 12 ++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index d046bbd5017d..82512f822c7a 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -519,6 +519,27 @@ int rtas_get_error_log_max(void);
 extern spinlock_t rtas_data_buf_lock;
 extern char rtas_data_buf[RTAS_DATA_BUF_SIZE];
 
+/*
+ * RTAS Error Injection Buffer (PAPR-compliant)
+ * ============================================
+ *
+ * 1KB aligned, zero-initialized buffer for ibm,errinjct RTAS work area.
+ * Protected by rtas_errinjct_buf_lock for concurrent access safety.
+ *
+ * PAPR Requirement: ibm,errinjct requires a caller-allocated buffer passed
+ * via physical address. Buffer must accommodate largest error type layouts:
+ * - IOA bus error (64-bit): 8x32-bit words (32 bytes)
+ * - All other types: <=4x32-bit words (16 bytes)
+ *
+ * Usage:
+ * prepare_errinjct_buffer() -> spin_lock() -> rtas_call() -> spin_unlock()
+ *
+ * Alignment: SZ_1K ensures PAPR firmware requirements and cache-line safety.
+ */
+#define RTAS_ERRINJCT_BUF_SIZE 1024
+extern spinlock_t rtas_errinjct_buf_lock;
+extern char rtas_errinjct_buf[RTAS_ERRINJCT_BUF_SIZE];
+
 /* RMO buffer reserved for user-space RTAS use */
 extern unsigned long rtas_rmo_buf;
 
diff --git a/arch/powerpc/include/uapi/asm/eeh.h b/arch/powerpc/include/uapi/asm/eeh.h
index 3b5c47ff3fc4..86645cab2827 100644
--- a/arch/powerpc/include/uapi/asm/eeh.h
+++ b/arch/powerpc/include/uapi/asm/eeh.h
@@ -41,4 +41,22 @@
 #define EEH_ERR_FUNC_DMA_WR_TARGET	19
 #define EEH_ERR_FUNC_MAX		19
 
+/* RTAS PCI Error Injection Token Types */
+#define RTAS_ERR_TYPE_FATAL                     0x1
+#define RTAS_ERR_TYPE_RECOVERED_RANDOM_EVENT    0x2
+#define RTAS_ERR_TYPE_RECOVERED_SPECIAL_EVENT   0x3
+#define RTAS_ERR_TYPE_CORRUPTED_PAGE            0x4
+#define RTAS_ERR_TYPE_CORRUPTED_SLB             0x5
+#define RTAS_ERR_TYPE_TRANSLATOR_FAILURE        0x6
+#define RTAS_ERR_TYPE_IOA_BUS_ERROR             0x7
+#define RTAS_ERR_TYPE_PLATFORM_SPECIFIC         0x8
+#define RTAS_ERR_TYPE_CORRUPTED_DCACHE_START    0x9
+#define RTAS_ERR_TYPE_CORRUPTED_DCACHE_END      0xA
+#define RTAS_ERR_TYPE_CORRUPTED_ICACHE_START    0xB
+#define RTAS_ERR_TYPE_CORRUPTED_ICACHE_END      0xC
+#define RTAS_ERR_TYPE_CORRUPTED_TLB_START       0xD
+#define RTAS_ERR_TYPE_CORRUPTED_TLB_END         0xE
+#define RTAS_ERR_TYPE_IOA_BUS_ERROR_64          0xF
+#define RTAS_ERR_TYPE_UPSTREAM_IO_ERROR         0x10
+
 #endif /* _ASM_POWERPC_EEH_H */
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index a2dd94eed9d0..c110965ea1d9 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -769,6 +769,18 @@ EXPORT_SYMBOL_GPL(rtas_data_buf);
 
 unsigned long rtas_rmo_buf;
 
+/*
+ * RTAS Error Injection Buffer - Global Definitions
+ * Global 1KB buffer and spinlock for ibm,errinjct RTAS service.
+ * Exported for pseries EEH error injection usage.
+ */
+
+DEFINE_SPINLOCK(rtas_errinjct_buf_lock);
+EXPORT_SYMBOL_GPL(rtas_errinjct_buf_lock);
+
+char rtas_errinjct_buf[1024] __aligned(SZ_1K);
+EXPORT_SYMBOL_GPL(rtas_errinjct_buf);
+
 /*
  * If non-NULL, this gets called when the kernel terminates.
  * This is done like this so rtas_flash can be a module.
-- 
2.54.0



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox