Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [RFC PATCH 5/8] mm/vmalloc: map contiguous pages in batches for vmap() if possible
From: Dev Jain @ 2026-04-08 11:22 UTC (permalink / raw)
  To: Barry Song
  Cc: linux-mm, linux-arm-kernel, catalin.marinas, will, akpm, urezki,
	linux-kernel, anshuman.khandual, ryan.roberts, ajd, rppt, david,
	Xueyuan.chen21
In-Reply-To: <CAGsJ_4xCtFe=5ofj4FW6cqu-fgR+K9BM7FPZRdAWOGP3YKtNzQ@mail.gmail.com>



On 08/04/26 10:42 am, Barry Song wrote:
> On Wed, Apr 8, 2026 at 12:20 PM Dev Jain <dev.jain@arm.com> wrote:
>>
>>
>>
>> On 08/04/26 8:21 am, Barry Song (Xiaomi) wrote:
>>> In many cases, the pages passed to vmap() may include high-order
>>> pages allocated with __GFP_COMP flags. For example, the systemheap
>>> often allocates pages in descending order: order 8, then 4, then 0.
>>> Currently, vmap() iterates over every page individually—even pages
>>> inside a high-order block are handled one by one.
>>>
>>> This patch detects high-order pages and maps them as a single
>>> contiguous block whenever possible.
>>>
>>> An alternative would be to implement a new API, vmap_sg(), but that
>>> change seems to be large in scope.
>>>
>>> Signed-off-by: Barry Song (Xiaomi) <baohua@kernel.org>
>>> ---
>>
>> Coincidentally, I was working on the same thing :)
> 
> Interesting, thanks — at least I’ve got one good reviewer :-)
> 
>>
>> We have a usecase regarding Arm TRBE and SPE aux buffers.
>>
>> I'll take a look at your patches later, but my implementation is the
> 
> Yes. Please.
> 
> 
>> following, if you have any comments. I have squashed the patches into
>> a single diff.
> 
> Thanks very much, Dev. What you’ve done is quite similar to
> patches 5/8 and 6/8, although the code differs somewhat.
> 
>>
>>
>>
>> From ccb9670a52b7f50b1f1e07b579a1316f76b84811 Mon Sep 17 00:00:00 2001
>> From: Dev Jain <dev.jain@arm.com>
>> Date: Thu, 26 Feb 2026 16:21:29 +0530
>> Subject: [PATCH] arm64/perf: map AUX buffer with large pages
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>  .../hwtracing/coresight/coresight-etm-perf.c  |  3 +-
>>  drivers/hwtracing/coresight/coresight-trbe.c  |  3 +-
>>  drivers/perf/arm_spe_pmu.c                    |  5 +-
>>  mm/vmalloc.c                                  | 86 ++++++++++++++++---
>>  4 files changed, 79 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
>> index 72017dcc3b7f1..e90a430af86bb 100644
>> --- a/drivers/hwtracing/coresight/coresight-etm-perf.c
>> +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
>> @@ -984,7 +984,8 @@ int __init etm_perf_init(void)
>>
>>         etm_pmu.capabilities            = (PERF_PMU_CAP_EXCLUSIVE |
>>                                            PERF_PMU_CAP_ITRACE |
>> -                                          PERF_PMU_CAP_AUX_PAUSE);
>> +                                          PERF_PMU_CAP_AUX_PAUSE |
>> +                                          PERF_PMU_CAP_AUX_PREFER_LARGE);
>>
>>         etm_pmu.attr_groups             = etm_pmu_attr_groups;
>>         etm_pmu.task_ctx_nr             = perf_sw_context;
>> diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
>> index 1511f8eb95afb..74e6ad891e236 100644
>> --- a/drivers/hwtracing/coresight/coresight-trbe.c
>> +++ b/drivers/hwtracing/coresight/coresight-trbe.c
>> @@ -760,7 +760,8 @@ static void *arm_trbe_alloc_buffer(struct coresight_device *csdev,
>>         for (i = 0; i < nr_pages; i++)
>>                 pglist[i] = virt_to_page(pages[i]);
>>
>> -       buf->trbe_base = (unsigned long)vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
>> +       buf->trbe_base = (unsigned long)vmap(pglist, nr_pages,
>> +                        VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);
>>         if (!buf->trbe_base) {
>>                 kfree(pglist);
>>                 kfree(buf);
>> diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
>> index dbd0da1116390..90c349fd66b2c 100644
>> --- a/drivers/perf/arm_spe_pmu.c
>> +++ b/drivers/perf/arm_spe_pmu.c
>> @@ -1027,7 +1027,7 @@ static void *arm_spe_pmu_setup_aux(struct perf_event *event, void **pages,
>>         for (i = 0; i < nr_pages; ++i)
>>                 pglist[i] = virt_to_page(pages[i]);
>>
>> -       buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
>> +       buf->base = vmap(pglist, nr_pages, VM_MAP | VM_ALLOW_HUGE_VMAP, PAGE_KERNEL);
>>         if (!buf->base)
>>                 goto out_free_pglist;
>>
>> @@ -1064,7 +1064,8 @@ static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu)
>>         spe_pmu->pmu = (struct pmu) {
>>                 .module = THIS_MODULE,
>>                 .parent         = &spe_pmu->pdev->dev,
>> -               .capabilities   = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
>> +               .capabilities   = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE |
>> +                                 PERF_PMU_CAP_AUX_PREFER_LARGE,
>>                 .attr_groups    = arm_spe_pmu_attr_groups,
>>                 /*
>>                  * We hitch a ride on the software context here, so that
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index 61caa55a44027..8482463d41203 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -660,14 +660,14 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>>                 pgprot_t prot, struct page **pages, unsigned int page_shift)
>>  {
>>         unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
>> -
>> +       unsigned long step = 1UL << (page_shift - PAGE_SHIFT);
>>         WARN_ON(page_shift < PAGE_SHIFT);
>>
>>         if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
>>                         page_shift == PAGE_SHIFT)
>>                 return vmap_small_pages_range_noflush(addr, end, prot, pages);
>>
>> -       for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
>> +       for (i = 0; i < ALIGN_DOWN(nr, step); i += step) {
>>                 int err;
>>
>>                 err = vmap_range_noflush(addr, addr + (1UL << page_shift),
>> @@ -678,8 +678,9 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>>
>>                 addr += 1UL << page_shift;
>>         }
>> -
>> -       return 0;
>> +       if (IS_ALIGNED(nr, step))
>> +               return 0;
>> +       return vmap_small_pages_range_noflush(addr, end, prot, pages + i);
>>  }
>>
>>  int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
>> @@ -3514,6 +3515,50 @@ void vunmap(const void *addr)
>>  }
>>  EXPORT_SYMBOL(vunmap);
>>
>> +static inline unsigned int vm_shift(pgprot_t prot, unsigned long size)
>> +{
>> +       if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
>> +               return PMD_SHIFT;
>> +
>> +       return arch_vmap_pte_supported_shift(size);
>> +}
>> +
>> +static inline int __vmap_huge(struct page **pages, pgprot_t prot,
>> +               unsigned long addr, unsigned int count)
>> +{
>> +       unsigned int i = 0;
>> +       unsigned int shift;
>> +       unsigned long nr;
>> +
>> +       while (i < count) {
>> +               nr = num_pages_contiguous(pages + i, count - i);
>> +               shift = vm_shift(prot, nr << PAGE_SHIFT);
>> +               if (vmap_pages_range(addr, addr + (nr << PAGE_SHIFT),
>> +                                    pgprot_nx(prot), pages + i, shift) < 0) {
>> +                       return 1;
>> +               }
> 
> One observation on my side is that the performance gain is somewhat
> offset by page table zigzagging caused by what you are doing here -
> iterating over each memory segment with vmap_pages_range().

I recall having observed this problem half a year ago, and I wrote
code similar to what you did with patch 3 - but I didn't observe any
performance improvement. I think that was because I was testing
vmalloc - most of the cost there lies in the page allocation.

So it looks like this is indeed a benefit for vmap().

> 
> In patch 3/8, I enhanced vmap_small_pages_range_noflush() to
> avoid repeated pgd → p4d → pud → pmd → pte traversals for page
> shifts other than PAGE_SHIFT. This improves performance for
> vmalloc as well as vmap(). Then, in patch 7/8, I adopt the new
> vmap_small_pages_range_noflush() and eliminate the iteration.
> 
>> +               i += nr;
>> +               addr += (nr << PAGE_SHIFT);
>> +       }
>> +       return 0;
>> +}
>> +
>> +static unsigned long max_contiguous_stride_order(struct page **pages,
>> +               pgprot_t prot, unsigned int count)
>> +{
>> +       unsigned long max_shift = PAGE_SHIFT;
>> +       unsigned int i = 0;
>> +
>> +       while (i < count) {
>> +               unsigned long nr = num_pages_contiguous(pages + i, count - i);
>> +               unsigned long shift = vm_shift(prot, nr << PAGE_SHIFT);
>> +
>> +               max_shift = max(max_shift, shift);
>> +               i += nr;
>> +       }
>> +       return max_shift;
>> +}
>> +
>>  /**
>>   * vmap - map an array of pages into virtually contiguous space
>>   * @pages: array of page pointers
>> @@ -3552,15 +3597,32 @@ void *vmap(struct page **pages, unsigned int count,
>>                 return NULL;
>>
>>         size = (unsigned long)count << PAGE_SHIFT;
>> -       area = get_vm_area_caller(size, flags, __builtin_return_address(0));
>> +       if (flags & VM_ALLOW_HUGE_VMAP) {
>> +               /* determine from page array, the max alignment */
>> +               unsigned long max_shift = max_contiguous_stride_order(pages, prot, count);
>> +
>> +               area = __get_vm_area_node(size, 1 << max_shift, max_shift, flags,
>> +                                         VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
>> +                                         GFP_KERNEL, __builtin_return_address(0));
>> +       } else {
>> +               area = get_vm_area_caller(size, flags, __builtin_return_address(0));
>> +       }
>>         if (!area)
>>                 return NULL;
>>
>>         addr = (unsigned long)area->addr;
>> -       if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
>> -                               pages, PAGE_SHIFT) < 0) {
>> -               vunmap(area->addr);
>> -               return NULL;
>> +
>> +       if (flags & VM_ALLOW_HUGE_VMAP) {
>> +               if (__vmap_huge(pages, prot, addr, count)) {
>> +                       vunmap(area->addr);
>> +                       return NULL;
>> +               }
>> +       } else {
>> +               if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
>> +                                       pages, PAGE_SHIFT) < 0) {
>> +                       vunmap(area->addr);
>> +                       return NULL;
>> +               }
>>         }
>>
>>         if (flags & VM_MAP_PUT_PAGES) {
>> @@ -4011,11 +4073,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
>>                  * their allocations due to apply_to_page_range not
>>                  * supporting them.
>>                  */
>> -
>> -               if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
>> -                       shift = PMD_SHIFT;
>> -               else
>> -                       shift = arch_vmap_pte_supported_shift(size);
>> +               shift = vm_shift(prot, size);
> 
> What I actually did is different. In patches 1/8 and 2/8, I
> extended the arm64 levels to support N * CONT_PTE, and let the
> final PTE mapping use the maximum possible batch after avoiding
> zigzag. This further improves all orders greater than CONT_PTE.
> 
> Thanks
> Barry



^ permalink raw reply

* [PATCH v2 3/5] KVM: arm64: selftests: Add vgic IIDR revision test
From: David Woodhouse @ 2026-04-08 11:30 UTC (permalink / raw)
  To: Marc Zyngier, Oliver Upton, Joey Gouly, Suzuki K Poulose,
	Zenghui Yu, Catalin Marinas, Will Deacon, Paolo Bonzini,
	Shuah Khan, David Woodhouse, Raghavendra Rao Ananta, Eric Auger,
	Kees Cook, Arnd Bergmann, Nathan Chancellor, linux-arm-kernel,
	kvmarm, linux-kernel, kvm, linux-kselftest
In-Reply-To: <20260408113256.2095505-1-dwmw2@infradead.org>

From: David Woodhouse <dwmw@amazon.co.uk>

Test that the GICD_IIDR implementation revision correctly controls
guest-visible behaviour for GICv3:

  Revision 1: IGROUPR reads as all-ones (group 1), writes are ignored.
              GICR_CTLR.{IR,CES} not advertised.
  Revision 2: IGROUPR is guest-configurable (read/write).
              GICR_CTLR.{IR,CES} not advertised.
  Revision 3: IGROUPR is guest-configurable (read/write).
              GICR_CTLR.{IR,CES} advertised.

For each revision, the test sets the IIDR via KVM_DEV_ARM_VGIC_GRP_DIST_REGS
before initializing the vGIC, then runs a guest that verifies the
expected IGROUPR and GICR_CTLR behaviour.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../selftests/kvm/arm64/vgic_group_iidr.c     | 118 ++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/arm64/vgic_group_iidr.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 6471fa214a9f..df729a70124f 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -177,6 +177,7 @@ TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config
 TEST_GEN_PROGS_arm64 += arm64/vgic_init
 TEST_GEN_PROGS_arm64 += arm64/vgic_irq
 TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress
+TEST_GEN_PROGS_arm64 += arm64/vgic_group_iidr
 TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access
 TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3
 TEST_GEN_PROGS_arm64 += arm64/idreg-idst
diff --git a/tools/testing/selftests/kvm/arm64/vgic_group_iidr.c b/tools/testing/selftests/kvm/arm64/vgic_group_iidr.c
new file mode 100644
index 000000000000..0073ccc19e92
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/vgic_group_iidr.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vgic_group_iidr.c - Test IGROUPR behaviour across IIDR revisions
+ *
+ * Validate that the GICD_IIDR implementation revision controls
+ * IGROUPR semantics for GICv3:
+ *   Rev 1: IGROUPR reads as all-ones (group 1), writes ignored
+ *   Rev 2+: IGROUPR is guest-configurable (read/write)
+ */
+#include <linux/sizes.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "gic.h"
+#include "gic_v3.h"
+#include "vgic.h"
+
+#define NR_IRQS		128
+#define SPI_IGROUPR	(GICD_IGROUPR + (32 / 32) * 4) /* intids 32-63 */
+
+static uint64_t shared_rev;
+
+static void guest_code(void)
+{
+	uint32_t val;
+
+	val = readl(GICD_BASE_GVA + SPI_IGROUPR);
+
+	if (shared_rev == 1) {
+		/* Rev 1: all group 1, guest writes must be ignored */
+		GUEST_ASSERT_EQ(val, 0xffffffff);
+		writel(0x0, GICD_BASE_GVA + SPI_IGROUPR);
+		val = readl(GICD_BASE_GVA + SPI_IGROUPR);
+		GUEST_ASSERT_EQ(val, 0xffffffff);
+		writel(0x55aa55aa, GICD_BASE_GVA + SPI_IGROUPR);
+		val = readl(GICD_BASE_GVA + SPI_IGROUPR);
+		GUEST_ASSERT_EQ(val, 0xffffffff);
+	} else {
+		/* Rev 2/3: guest-configurable */
+		writel(0xa5a5a5a5, GICD_BASE_GVA + SPI_IGROUPR);
+		val = readl(GICD_BASE_GVA + SPI_IGROUPR);
+		GUEST_ASSERT_EQ(val, 0xa5a5a5a5);
+		writel(0x0, GICD_BASE_GVA + SPI_IGROUPR);
+		val = readl(GICD_BASE_GVA + SPI_IGROUPR);
+		GUEST_ASSERT_EQ(val, 0x0);
+	}
+
+	/* Rev 3: GICR_CTLR advertises IR and CES. Rev 1/2: it does not. */
+	val = readl(GICR_BASE_GVA + GICR_CTLR);
+	if (shared_rev >= 3)
+		GUEST_ASSERT(val & (GICR_CTLR_IR | GICR_CTLR_CES));
+	else
+		GUEST_ASSERT(!(val & (GICR_CTLR_IR | GICR_CTLR_CES)));
+
+	GUEST_DONE();
+}
+
+static void run_test(int rev)
+{
+	struct kvm_vcpu *vcpus[1];
+	struct kvm_vm *vm;
+	struct ucall uc;
+	uint32_t iidr;
+	int gic_fd;
+
+	pr_info("Testing IIDR revision %d\n", rev);
+
+	test_disable_default_vgic();
+	vm = vm_create_with_vcpus(1, guest_code, vcpus);
+
+	gic_fd = __vgic_v3_setup(vm, 1, NR_IRQS);
+	TEST_ASSERT(gic_fd >= 0, "Failed to create vGICv3");
+
+	/* Set the requested IIDR revision before init. */
+	kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
+			    GICD_IIDR, &iidr);
+	iidr &= ~GICD_IIDR_REVISION_MASK;
+	iidr |= rev << GICD_IIDR_REVISION_SHIFT;
+	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
+			    GICD_IIDR, &iidr);
+
+	__vgic_v3_init(gic_fd);
+
+	/* Verify the revision was applied. */
+	kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
+			    GICD_IIDR, &iidr);
+	TEST_ASSERT(((iidr & GICD_IIDR_REVISION_MASK) >> GICD_IIDR_REVISION_SHIFT) == rev,
+		    "IIDR revision readback: expected %d, got %d",
+		    rev, (iidr & GICD_IIDR_REVISION_MASK) >> GICD_IIDR_REVISION_SHIFT);
+
+	/* Tell the guest which revision we set. */
+	sync_global_to_guest(vm, shared_rev);
+	shared_rev = rev;
+	sync_global_to_guest(vm, shared_rev);
+
+	vcpu_run(vcpus[0]);
+	switch (get_ucall(vcpus[0], &uc)) {
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);
+		break;
+	case UCALL_DONE:
+		break;
+	default:
+		TEST_FAIL("Unexpected ucall %lu", uc.cmd);
+	}
+
+	close(gic_fd);
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	run_test(1);
+	run_test(2);
+	run_test(3);
+	return 0;
+}
-- 
2.51.0



^ permalink raw reply related

* [PATCH v2 1/5] KVM: arm64: vgic: Fix IIDR revision field extracted from wrong value
From: David Woodhouse @ 2026-04-08 11:30 UTC (permalink / raw)
  To: Marc Zyngier, Oliver Upton, Joey Gouly, Suzuki K Poulose,
	Zenghui Yu, Catalin Marinas, Will Deacon, Paolo Bonzini,
	Shuah Khan, David Woodhouse, Raghavendra Rao Ananta, Eric Auger,
	Kees Cook, Arnd Bergmann, Nathan Chancellor, linux-arm-kernel,
	kvmarm, linux-kernel, kvm, linux-kselftest
In-Reply-To: <20260408113256.2095505-1-dwmw2@infradead.org>

From: David Woodhouse <dwmw@amazon.co.uk>

The uaccess write handlers for GICD_IIDR in both GICv2 and GICv3
extract the revision field from 'reg' (the current IIDR value read back
from the emulated distributor) instead of 'val' (the value userspace is
trying to write). This means userspace can never actually change the
implementation revision — the extracted value is always the current one.

Fix the FIELD_GET to use 'val' so that userspace can select a different
revision for migration compatibility.

Fixes: 49a1a2c70a7f ("KVM: arm64: vgic-v3: Advertise GICR_CTLR.{IR, CES} as a new GICD_IIDR revision")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/arm64/kvm/vgic/vgic-mmio-v2.c | 2 +-
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index 406845b3117c..0643e333db35 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -91,7 +91,7 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
 		 * migration from old kernels to new kernels with legacy
 		 * userspace.
 		 */
-		reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg);
+		reg = FIELD_GET(GICD_IIDR_REVISION_MASK, val);
 		switch (reg) {
 		case KVM_VGIC_IMP_REV_2:
 		case KVM_VGIC_IMP_REV_3:
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 89edb84d1ac6..5913a20d8301 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -194,7 +194,7 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
 		if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK)
 			return -EINVAL;
 
-		reg = FIELD_GET(GICD_IIDR_REVISION_MASK, reg);
+		reg = FIELD_GET(GICD_IIDR_REVISION_MASK, val);
 		switch (reg) {
 		case KVM_VGIC_IMP_REV_2:
 		case KVM_VGIC_IMP_REV_3:
-- 
2.51.0



^ permalink raw reply related

* [PATCH v2 0/5] KVM: arm64: vgic: Fix IIDR revision handling and add revision 1
From: David Woodhouse @ 2026-04-08 11:30 UTC (permalink / raw)
  To: Marc Zyngier, Oliver Upton, Joey Gouly, Suzuki K Poulose,
	Zenghui Yu, Catalin Marinas, Will Deacon, Paolo Bonzini,
	Shuah Khan, David Woodhouse, Raghavendra Rao Ananta, Eric Auger,
	Kees Cook, Arnd Bergmann, Nathan Chancellor, linux-arm-kernel,
	kvmarm, linux-kernel, kvm, linux-kselftest

The uaccess write handlers for GICD_IIDR extract the revision field
from the wrong variable, making it impossible for userspace to actually
change the implementation revision. Fix that.

Additionally, allow userspace to select IIDR revision 1, restoring the
behaviour from before commit d53c2c29ae0d ("KVM: arm/arm64: vgic: Allow
configuration of interrupt groups") where interrupt groups are not
guest-configurable. This is needed by hypervisors that were reverting
that commit to preserve the original guest-visible semantics, and to
allow for a safely controlled deployment of the new behaviour.

For GICv2, kill the v2_groups_user_writable flag and make the behaviour 
depend directly on the IIDR. The existing default behaviour of setting 
the IIDR to revision 3 and allowing the groups to be writable by the 
*guest* but just not by userspace was just weird, and almost certainly
not intentional. (New in v2 posting).

Tested on Graviton 3 (Neoverse-V1) metal for GICv3 selftests, and
under QEMU TCG with GICv2 emulation for GICv2 selftests.

v2:
 • Fixed -Wdiscarded-qualifiers warning from 0-day bot.
 • Remove GICv2 v2_groups_user_writable flag and just use IIDR.
 • Address Marc's review feedback (no special cases in read_group,
   other minor cleanups).

v1: https://lore.kernel.org/all/20260407210949.2076251-1-dwmw2@infradead.org/

David Woodhouse (5):
      KVM: arm64: vgic: Fix IIDR revision field extracted from wrong value
      KVM: arm64: vgic: Allow userspace to set IIDR revision 1
      KVM: arm64: selftests: Add vgic IIDR revision test
      KVM: arm64: vgic: Remove v2_groups_user_writable and use IIDR revision directly
      KVM: arm64: selftests: Add GICv2 IGROUPR writability test

 arch/arm64/kvm/vgic/vgic-mmio-v2.c                 |  18 +--
 arch/arm64/kvm/vgic/vgic-mmio-v3.c                 |   6 +-
 arch/arm64/kvm/vgic/vgic-mmio.c                    |   4 +
 include/kvm/arm_vgic.h                             |   4 +-
 tools/testing/selftests/kvm/Makefile.kvm           |   2 +
 .../testing/selftests/kvm/arm64/vgic_group_iidr.c  | 118 +++++++++++++++
 tools/testing/selftests/kvm/arm64/vgic_group_v2.c  | 168 +++++++++++++++++++++
 7 files changed, 306 insertions(+), 14 deletions(-)



^ permalink raw reply

* [PATCH v2 2/5] KVM: arm64: vgic: Allow userspace to set IIDR revision 1
From: David Woodhouse @ 2026-04-08 11:30 UTC (permalink / raw)
  To: Marc Zyngier, Oliver Upton, Joey Gouly, Suzuki K Poulose,
	Zenghui Yu, Catalin Marinas, Will Deacon, Paolo Bonzini,
	Shuah Khan, David Woodhouse, Raghavendra Rao Ananta, Eric Auger,
	Kees Cook, Arnd Bergmann, Nathan Chancellor, linux-arm-kernel,
	kvmarm, linux-kernel, kvm, linux-kselftest
In-Reply-To: <20260408113256.2095505-1-dwmw2@infradead.org>

From: David Woodhouse <dwmw@amazon.co.uk>

Allow userspace to select GICD_IIDR revision 1, which restores the
original pre-d53c2c29ae0d ("KVM: arm/arm64: vgic: Allow configuration
of interrupt groups") behaviour where interrupt groups are not
guest-configurable.

When revision 1 is selected:
 - GICv2: IGROUPR reads as zero (group 0), writes are ignored
 - GICv3: IGROUPR reads as all-ones (group 1), writes are ignored
 - v2_groups_user_writable is not set

This is implemented by checking the implementation revision in
vgic_mmio_write_group() and suppressing writes when the revision is
below 2. The read side needs no change since the per-IRQ group reset
values already match the expected behaviour.

Note that d53c2c29ae0d wired guest IGROUPR writes directly to
vgic_mmio_write_group() without any revision check, while only gating
the userspace write path via v2_groups_user_writable. This meant a
guest could modify interrupt groups even at revision 1, which was
never intended. The write_group revision check fixes both the guest
and GICv3 userspace paths.

Fixes: d53c2c29ae0d ("KVM: arm/arm64: vgic: Allow configuration of interrupt groups")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/arm64/kvm/vgic/vgic-mmio-v2.c | 3 +++
 arch/arm64/kvm/vgic/vgic-mmio-v3.c | 4 ++++
 arch/arm64/kvm/vgic/vgic-mmio.c    | 4 ++++
 include/kvm/arm_vgic.h             | 1 +
 4 files changed, 12 insertions(+)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index 0643e333db35..e5714f7fd2ec 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -20,6 +20,7 @@
  * Revision 1: Report GICv2 interrupts as group 0 instead of group 1
  * Revision 2: Interrupt groups are guest-configurable and signaled using
  * 	       their configured groups.
+ * Revision 3: GICv2 behaviour is unchanged from revision 2.
  */
 
 static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
@@ -96,6 +97,8 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
 		case KVM_VGIC_IMP_REV_2:
 		case KVM_VGIC_IMP_REV_3:
 			vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
+			fallthrough;
+		case KVM_VGIC_IMP_REV_1:
 			dist->implementation_rev = reg;
 			return 0;
 		default:
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 5913a20d8301..0130db71cfc9 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -74,8 +74,11 @@ bool vgic_supports_direct_sgis(struct kvm *kvm)
 /*
  * The Revision field in the IIDR have the following meanings:
  *
+ * Revision 1: Interrupt groups are not guest-configurable.
+ * 	       IGROUPR reads as all-ones (group 1), writes ignored.
  * Revision 2: Interrupt groups are guest-configurable and signaled using
  * 	       their configured groups.
+ * Revision 3: GICR_CTLR.{IR,CES} are advertised.
  */
 
 static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
@@ -196,6 +199,7 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
 
 		reg = FIELD_GET(GICD_IIDR_REVISION_MASK, val);
 		switch (reg) {
+		case KVM_VGIC_IMP_REV_1:
 		case KVM_VGIC_IMP_REV_2:
 		case KVM_VGIC_IMP_REV_3:
 			dist->implementation_rev = reg;
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index a573b1f0c6cb..4fbe0ad22adf 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -73,6 +73,10 @@ void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
 	int i;
 	unsigned long flags;
 
+	/* Revision 1 and below: groups are not guest-configurable. */
+	if (vgic_get_implementation_rev(vcpu) < KVM_VGIC_IMP_REV_2)
+		return;
+
 	for (i = 0; i < len * 8; i++) {
 		struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index f2eafc65bbf4..90fb6cd3c91c 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -248,6 +248,7 @@ struct vgic_dist {
 
 	/* Implementation revision as reported in the GICD_IIDR */
 	u32			implementation_rev;
+#define KVM_VGIC_IMP_REV_1	1 /* GICv2 interrupts as group 0 */
 #define KVM_VGIC_IMP_REV_2	2 /* GICv2 restorable groups */
 #define KVM_VGIC_IMP_REV_3	3 /* GICv3 GICR_CTLR.{IW,CES,RWP} */
 #define KVM_VGIC_IMP_REV_LATEST	KVM_VGIC_IMP_REV_3
-- 
2.51.0



^ permalink raw reply related

* [PATCH v2 4/5] KVM: arm64: vgic: Remove v2_groups_user_writable and use IIDR revision directly
From: David Woodhouse @ 2026-04-08 11:30 UTC (permalink / raw)
  To: Marc Zyngier, Oliver Upton, Joey Gouly, Suzuki K Poulose,
	Zenghui Yu, Catalin Marinas, Will Deacon, Paolo Bonzini,
	Shuah Khan, David Woodhouse, Raghavendra Rao Ananta, Eric Auger,
	Kees Cook, Arnd Bergmann, Nathan Chancellor, linux-arm-kernel,
	kvmarm, linux-kernel, kvm, linux-kselftest
In-Reply-To: <20260408113256.2095505-1-dwmw2@infradead.org>

From: David Woodhouse <dwmw@amazon.co.uk>

The v2_groups_user_writable flag was introduced to gate GICv2 userspace
IGROUPR writes until userspace explicitly wrote the IIDR, signalling
awareness of the group semantics. However, the guest write path through
vgic_mmio_write_group() was never gated by this flag, allowing a GICv2
guest to modify interrupt groups regardless of whether userspace had
opted in.

Rather than adding the same flag check to the guest path, remove the
flag entirely and make both guest and userspace IGROUPR writability
follow the IIDR implementation revision directly. Groups are writable
when the revision is >= 2, which is the case when userspace explicitly
sets the IIDR to revision 2 or 3. When userspace does not write the
IIDR, vgic_init() defaults to KVM_VGIC_IMP_REV_LATEST (currently 3),
so the behaviour is unchanged for userspace that doesn't set the IIDR.

This also fixes the inconsistency where a GICv2 guest could write
IGROUPR even when the IIDR had not been explicitly set by userspace.

Fixes: d53c2c29ae0d ("KVM: arm/arm64: vgic: Allow configuration of interrupt groups")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/arm64/kvm/vgic/vgic-mmio-v2.c | 16 +++++-----------
 include/kvm/arm_vgic.h             |  3 ---
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index e5714f7fd2ec..e5fc673a1ea9 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -84,21 +84,15 @@ static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
 			return -EINVAL;
 
 		/*
-		 * If we observe a write to GICD_IIDR we know that userspace
-		 * has been updated and has had a chance to cope with older
-		 * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting
-		 * interrupts as group 1, and therefore we now allow groups to
-		 * be user writable.  Doing this by default would break
-		 * migration from old kernels to new kernels with legacy
-		 * userspace.
+		 * Allow userspace to select the GICv2 IIDR revision.
+		 * Group writability follows the revision directly:
+		 * groups are guest/user writable for revision >= 2.
 		 */
 		reg = FIELD_GET(GICD_IIDR_REVISION_MASK, val);
 		switch (reg) {
+		case KVM_VGIC_IMP_REV_1:
 		case KVM_VGIC_IMP_REV_2:
 		case KVM_VGIC_IMP_REV_3:
-			vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
-			fallthrough;
-		case KVM_VGIC_IMP_REV_1:
 			dist->implementation_rev = reg;
 			return 0;
 		default:
@@ -114,7 +108,7 @@ static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu,
 					    gpa_t addr, unsigned int len,
 					    unsigned long val)
 {
-	if (vcpu->kvm->arch.vgic.v2_groups_user_writable)
+	if (vgic_get_implementation_rev(vcpu) >= KVM_VGIC_IMP_REV_2)
 		vgic_mmio_write_group(vcpu, addr, len, val);
 
 	return 0;
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 90fb6cd3c91c..cdfab2c20877 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -253,9 +253,6 @@ struct vgic_dist {
 #define KVM_VGIC_IMP_REV_3	3 /* GICv3 GICR_CTLR.{IW,CES,RWP} */
 #define KVM_VGIC_IMP_REV_LATEST	KVM_VGIC_IMP_REV_3
 
-	/* Userspace can write to GICv2 IGROUPR */
-	bool			v2_groups_user_writable;
-
 	/* Do injected MSIs require an additional device ID? */
 	bool			msis_require_devid;
 
-- 
2.51.0



^ permalink raw reply related

* [PATCH v2 5/5] KVM: arm64: selftests: Add GICv2 IGROUPR writability test
From: David Woodhouse @ 2026-04-08 11:30 UTC (permalink / raw)
  To: Marc Zyngier, Oliver Upton, Joey Gouly, Suzuki K Poulose,
	Zenghui Yu, Catalin Marinas, Will Deacon, Paolo Bonzini,
	Shuah Khan, David Woodhouse, Raghavendra Rao Ananta, Eric Auger,
	Kees Cook, Arnd Bergmann, Nathan Chancellor, linux-arm-kernel,
	kvmarm, linux-kernel, kvm, linux-kselftest
In-Reply-To: <20260408113256.2095505-1-dwmw2@infradead.org>

From: David Woodhouse <dwmw@amazon.co.uk>

Test that GICv2 IGROUPR writability is consistently gated by the IIDR
implementation revision for both guest and userspace paths:

  Default (no IIDR write): implementation_rev defaults to 3, groups
    writable from both guest and userspace.
  Rev 1: IGROUPR reads as zero (group 0), writes ignored from both
    guest and userspace.
  Rev 2: IGROUPR is writable from both guest and userspace.

This test requires GICv2 emulation support (GICv3 with GICv2 compat
CPU interface) and will be skipped on hardware without it.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../selftests/kvm/arm64/vgic_group_v2.c       | 168 ++++++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/arm64/vgic_group_v2.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index df729a70124f..878d7cb92555 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -178,6 +178,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_init
 TEST_GEN_PROGS_arm64 += arm64/vgic_irq
 TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress
 TEST_GEN_PROGS_arm64 += arm64/vgic_group_iidr
+TEST_GEN_PROGS_arm64 += arm64/vgic_group_v2
 TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access
 TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3
 TEST_GEN_PROGS_arm64 += arm64/idreg-idst
diff --git a/tools/testing/selftests/kvm/arm64/vgic_group_v2.c b/tools/testing/selftests/kvm/arm64/vgic_group_v2.c
new file mode 100644
index 000000000000..6d4bad44bae7
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/vgic_group_v2.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vgic_group_v2.c - Test GICv2 IGROUPR behaviour across IIDR revisions
+ *
+ * Validate that the GICD_IIDR implementation revision controls GICv2
+ * IGROUPR writability for both guest and userspace:
+ *   Default (no IIDR write): groups writable (implementation_rev defaults to 3)
+ *   Rev 1: IGROUPR reads as zero (group 0), writes ignored
+ *   Rev 2: IGROUPR is guest and userspace configurable
+ */
+#include <linux/sizes.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "gic.h"
+#include "gic_v3.h"
+#include "vgic.h"
+
+#define NR_IRQS		64
+
+#define V2_DIST_BASE	0x8000000ULL
+#define V2_CPU_BASE	0x8010000ULL
+#define V2_DIST_GVA	((volatile void *)V2_DIST_BASE)
+
+#define SPI_IGROUPR	(GICD_IGROUPR + (32 / 32) * 4)
+
+static uint64_t shared_rev;
+static uint64_t guest_result;
+
+static void guest_code(void)
+{
+	uint32_t before, after;
+
+	before = readl(V2_DIST_GVA + SPI_IGROUPR);
+	writel(0x5a5a5a5a, V2_DIST_GVA + SPI_IGROUPR);
+	after = readl(V2_DIST_GVA + SPI_IGROUPR);
+
+	guest_result = ((uint64_t)before << 32) | after;
+	GUEST_DONE();
+}
+
+static int create_v2_gic(struct kvm_vm *vm)
+{
+	uint32_t nr_irqs = NR_IRQS;
+	uint64_t addr;
+	int gic_fd;
+
+	gic_fd = __kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V2);
+	if (gic_fd < 0)
+		return gic_fd;
+
+	addr = V2_DIST_BASE;
+	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+			    KVM_VGIC_V2_ADDR_TYPE_DIST, &addr);
+	addr = V2_CPU_BASE;
+	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+			    KVM_VGIC_V2_ADDR_TYPE_CPU, &addr);
+
+	virt_map(vm, V2_DIST_BASE, V2_DIST_BASE,
+		 vm_calc_num_guest_pages(vm->mode, SZ_64K));
+	virt_map(vm, V2_CPU_BASE, V2_CPU_BASE,
+		 vm_calc_num_guest_pages(vm->mode, SZ_64K));
+
+	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS,
+			    0, &nr_irqs);
+	return gic_fd;
+}
+
+static void run_test(int set_iidr_rev)
+{
+	struct kvm_vcpu *vcpus[1];
+	struct kvm_vm *vm;
+	struct ucall uc;
+	uint32_t before, after, igroupr, iidr;
+	int gic_fd;
+	bool expect_writable;
+
+	if (set_iidr_rev >= 0)
+		pr_info("Testing GICv2 IIDR revision %d\n", set_iidr_rev);
+	else
+		pr_info("Testing GICv2 IIDR default (no write)\n");
+
+	test_disable_default_vgic();
+	vm = vm_create_with_vcpus(1, guest_code, vcpus);
+
+	gic_fd = create_v2_gic(vm);
+	TEST_REQUIRE(gic_fd >= 0);
+
+	if (set_iidr_rev >= 0) {
+		kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
+				    GICD_IIDR, &iidr);
+		iidr &= ~GICD_IIDR_REVISION_MASK;
+		iidr |= set_iidr_rev << GICD_IIDR_REVISION_SHIFT;
+		kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
+				    GICD_IIDR, &iidr);
+	}
+
+	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+			    KVM_DEV_ARM_VGIC_CTRL_INIT, NULL);
+
+	/*
+	 * Default (no IIDR write) gets implementation_rev=3 from vgic_init(),
+	 * so groups should be writable. Rev 1 = not writable. Rev 2+ = writable.
+	 */
+	expect_writable = (set_iidr_rev != 1);
+
+	/* Test userspace IGROUPR write */
+	igroupr = 0xa5a5a5a5;
+	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
+			    SPI_IGROUPR, &igroupr);
+	igroupr = 0;
+	kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
+			    SPI_IGROUPR, &igroupr);
+
+	if (expect_writable)
+		TEST_ASSERT(igroupr == 0xa5a5a5a5,
+			    "Userspace write should succeed: got 0x%08x", igroupr);
+	else
+		TEST_ASSERT(igroupr == 0x00000000,
+			    "Userspace write should be ignored: got 0x%08x", igroupr);
+
+	/* Reset IGROUPR to 0 via userspace for rev 2+ before guest test */
+	if (expect_writable) {
+		igroupr = 0;
+		kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
+				    SPI_IGROUPR, &igroupr);
+	}
+
+	/* Test guest IGROUPR write */
+	sync_global_to_guest(vm, guest_result);
+	vcpu_run(vcpus[0]);
+
+	switch (get_ucall(vcpus[0], &uc)) {
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);
+		break;
+	case UCALL_DONE:
+		break;
+	default:
+		TEST_FAIL("Unexpected ucall %lu", uc.cmd);
+	}
+
+	sync_global_from_guest(vm, guest_result);
+	before = guest_result >> 32;
+	after = guest_result & 0xffffffff;
+
+	TEST_ASSERT(before == 0x00000000,
+		    "Initial IGROUPR should be 0 (group 0): got 0x%08x", before);
+
+	if (expect_writable)
+		TEST_ASSERT(after == 0x5a5a5a5a,
+			    "Guest write should succeed: got 0x%08x", after);
+	else
+		TEST_ASSERT(after == 0x00000000,
+			    "Guest write should be ignored: got 0x%08x", after);
+
+	close(gic_fd);
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	run_test(-1);  /* default */
+	run_test(1);   /* rev 1 */
+	run_test(2);   /* rev 2 */
+	return 0;
+}
-- 
2.51.0



^ permalink raw reply related

* Re: [RFC PATCH 7/8] mm/vmalloc: Coalesce same page_shift mappings in vmap to avoid pgtable zigzag
From: Dev Jain @ 2026-04-08 11:36 UTC (permalink / raw)
  To: Barry Song (Xiaomi), linux-mm, linux-arm-kernel, catalin.marinas,
	will, akpm, urezki
  Cc: linux-kernel, anshuman.khandual, ryan.roberts, ajd, rppt, david,
	Xueyuan.chen21
In-Reply-To: <20260408025115.27368-8-baohua@kernel.org>



On 08/04/26 8:21 am, Barry Song (Xiaomi) wrote:
> For vmap(), detect pages with the same page_shift and map them in
> batches, avoiding the pgtable zigzag caused by per-page mapping.
> 
> Signed-off-by: Barry Song (Xiaomi) <baohua@kernel.org>
> ---

In patch 4, you eliminate the pagetable rewalk, and in patch 5,
you re-introduce it, then in this patch you eliminate it again.
So please just squash this into #5.

>  mm/vmalloc.c | 24 ++++++++++++++++++++----
>  1 file changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 6643ec0288cd..3c3b7217693a 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3551,6 +3551,8 @@ static int vmap_contig_pages_range(unsigned long addr, unsigned long end,
>  		pgprot_t prot, struct page **pages)
>  {
>  	unsigned int count = (end - addr) >> PAGE_SHIFT;
> +	unsigned int prev_shift = 0, idx = 0;
> +	unsigned long map_addr = addr;
>  	int err;
>  
>  	err = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
> @@ -3562,15 +3564,29 @@ static int vmap_contig_pages_range(unsigned long addr, unsigned long end,
>  		unsigned int shift = PAGE_SHIFT +
>  			get_vmap_batch_order(pages, count - i, i);
>  
> -		err = vmap_range_noflush(addr, addr + (1UL << shift),
> -				page_to_phys(pages[i]), prot, shift);
> -		if (err)
> -			goto out;
> +		if (!i)
> +			prev_shift = shift;
> +
> +		if (shift != prev_shift) {
> +			err = vmap_small_pages_range_noflush(map_addr, addr,
> +					prot, pages + idx,
> +					min(prev_shift, PMD_SHIFT));
> +			if (err)
> +				goto out;
> +			prev_shift = shift;
> +			map_addr = addr;
> +			idx = i;
> +		}
>  
>  		addr += 1UL << shift;
>  		i += 1U << (shift - PAGE_SHIFT);
>  	}
>  
> +	/* Remaining */
> +	if (map_addr < end)
> +		err = vmap_small_pages_range_noflush(map_addr, end,
> +				prot, pages + idx, min(prev_shift, PMD_SHIFT));
> +
>  out:
>  	flush_cache_vmap(addr, end);
>  	return err;



^ permalink raw reply

* [PATCH v2] KVM: arm64: Reject non compliant SMCCC function calls in pKVM
From: Sebastian Ene @ 2026-04-08 11:41 UTC (permalink / raw)
  To: catalin.marinas, kvmarm, linux-arm-kernel, linux-kernel,
	android-kvm
  Cc: joey.gouly, korneld, maz, mrigendra.chaubey, oupton, perlarsen,
	sebastianene, suzuki.poulose, will, yuzenghui

Prevent the propagation of a function-id that has the top bits set, since
this is not compliant with the SMCCC spec and can overlap with the
already known function-id decoders (e.g. if we invoke an SMC with
0xffffffffc4000012 it will be decoded as a PSCI reset call). Instead,
make it clear that we don't support it and return an error.

Signed-off-by: Sebastian Ene <sebastianene@google.com>
---
NOTE: This is based on linux-next, next-20260407 to avoid a minor
conflict with a previously submitted patch (commit cf6348af645b).  

Changelog:

v1 -> v2:
* dropped the changes to the function signature that were accepting
  64-bit function-ids.
* applied Mark's suggestion to make it clear that we don't accept non
  standard SMCCC calls.
* revised commit message & updated the title.

Link to v1: 
https://lore.kernel.org/all/20260401123201.389906-1-sebastianene@google.com/

---
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 73f2e0221e70..cca4b07c8d61 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -805,6 +805,10 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
 	}
 
 	func_id &= ~ARM_SMCCC_CALL_HINTS;
+	if (upper_32_bits(func_id)) {
+		cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED;
+		goto exit_skip_instr;
+	}
 
 	handled = kvm_host_psci_handler(host_ctxt, func_id);
 	if (!handled)
-- 
2.53.0.1213.gd9a14994de-goog



^ permalink raw reply related

* [PATCH 1/2] Documentation: ABI: add sysfs interface for ZynqMP CSU registers
From: Ronak Jain @ 2026-04-08 11:42 UTC (permalink / raw)
  To: michal.simek, senthilnathan.thangaraj
  Cc: linux-kernel, linux-arm-kernel, ronak.jain
In-Reply-To: <20260408114244.2852015-1-ronak.jain@amd.com>

Document the new sysfs interface that exposes Configuration Security
Unit (CSU) registers through the zynqmp-firmware driver.

The interface is available under:

  /sys/devices/platform/firmware:zynqmp-firmware/csu_registers/

The CSU registers are discovered at boot time using the PM_QUERY_DATA
firmware API. The following registers are currently supported:

  - multiboot     (CSU_MULTI_BOOT)
  - idcode        (CSU_IDCODE, read-only)
  - pcap-status   (CSU_PCAP_STATUS, read-only)

Read operations use the existing IOCTL_READ_REG firmware interface,
while write operations use IOCTL_MASK_WRITE_REG.

Access control is enforced by the firmware. Write attempts to
read-only registers are rejected by firmware even though the sysfs file
permissions allow writes.

Document the ABI entry accordingly.

Signed-off-by: Ronak Jain <ronak.jain@amd.com>
---
 .../ABI/stable/sysfs-driver-firmware-zynqmp   | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp
index c3fec3c835af..f537f7d9bb55 100644
--- a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp
+++ b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp
@@ -254,3 +254,36 @@ Description:
 		The expected result is 500.
 
 Users:		Xilinx
+
+What:		/sys/devices/platform/firmware\:zynqmp-firmware/csu_registers/*
+Date:		March 2026
+KernelVersion:	7.1
+Contact:	"Ronak Jain" <ronak.jain@amd.com>
+Description:
+		Read/Write CSU (Configuration Security Unit) registers.
+
+		This interface provides dynamic access to CSU registers that are
+		discovered from the firmware at boot time using PM_QUERY_DATA API.
+
+		The supported registers are:
+
+		- multiboot: CSU_MULTI_BOOT register
+		- idcode: CSU_IDCODE register (read-only)
+		- pcap-status: CSU_PCAP_STATUS register (read-only)
+
+		Read operations use the existing IOCTL_READ_REG API.
+		Write operations use the existing IOCTL_MASK_WRITE_REG API.
+
+		The firmware enforces access control - read-only registers will reject
+		write attempts even though the sysfs permissions show write access.
+
+		Usage for reading::
+
+		    # cat /sys/devices/platform/firmware\:zynqmp-firmware/csu_registers/multiboot
+		    # cat /sys/devices/platform/firmware\:zynqmp-firmware/csu_registers/idcode
+
+		Usage for writing (mask and value are in hexadecimal)::
+
+		    # echo 0xFFFFFFFF 0x0 > /sys/devices/platform/firmware\:zynqmp-firmware/csu_registers/multiboot
+
+Users:		Xilinx/AMD
-- 
2.34.1



^ permalink raw reply related

* [PATCH 0/2] Add dynamic CSU register sysfs interface
From: Ronak Jain @ 2026-04-08 11:42 UTC (permalink / raw)
  To: michal.simek, senthilnathan.thangaraj
  Cc: linux-kernel, linux-arm-kernel, ronak.jain

This patch series adds support for exposing CSU registers through a
sysfs interface. The implementation uses dynamic discovery via the
PM_QUERY_DATA firmware API to determine available registers at
runtime, making the interface flexible and maintainable without
requiring kernel changes when firmware capabilities evolve.

Background:

The ZynqMP platform has several CSU registers that are useful for
system configuration and debugging. Previously, accessing these
registers required direct memory access or custom tools. This series
provides a standardized sysfs interface that leverages existing
firmware APIs for secure access.

Key Features:

- Dynamic register discovery using PM_QUERY_DATA API
  * PM_QID_GET_NODE_COUNT: Query number of available registers
  * PM_QID_GET_NODE_NAME: Query register names by index
- Automatic sysfs attribute creation under csu_registers/ group
- Read operations via existing IOCTL_READ_REG firmware API
- Write operations via existing IOCTL_MASK_WRITE_REG firmware API
- Firmware-enforced access control for read-only registers

Currently Supported Registers:

- multiboot (CSU_MULTI_BOOT): Boot mode configuration
- idcode (CSU_IDCODE): Device identification (read-only)
- pcap-status (CSU_PCAP_STATUS): PCAP status (read-only)

The sysfs interface is available at:
  /sys/devices/platform/firmware:zynqmp-firmware/csu_registers/

Usage Examples:

Reading a register:
  # cat /sys/devices/platform/firmware:zynqmp-firmware/csu_registers/idcode

Writing a register (mask and value in hex):
  # echo "0xFFFFFFFF 0x0" > /sys/devices/platform/firmware:zynqmp-firmware/csu_registers/multiboot


Testing:

- Verified register read operations return correct values
- Verified write operations update registers correctly
- Verified read-only registers reject write attempts
- Verified dynamic discovery works with different firmware versions


Ronak Jain (2):
  Documentation: ABI: add sysfs interface for ZynqMP CSU registers
  firmware: zynqmp: Add dynamic CSU register discovery and sysfs
    interface

 .../ABI/stable/sysfs-driver-firmware-zynqmp   |  33 +++
 MAINTAINERS                                   |  10 +
 drivers/firmware/xilinx/Makefile              |   2 +-
 drivers/firmware/xilinx/zynqmp-csu-reg.c      | 249 ++++++++++++++++++
 drivers/firmware/xilinx/zynqmp-csu-reg.h      |  18 ++
 drivers/firmware/xilinx/zynqmp.c              |   6 +
 include/linux/firmware/xlnx-zynqmp.h          |   4 +-
 7 files changed, 320 insertions(+), 2 deletions(-)
 create mode 100644 drivers/firmware/xilinx/zynqmp-csu-reg.c
 create mode 100644 drivers/firmware/xilinx/zynqmp-csu-reg.h

-- 
2.34.1



^ permalink raw reply

* [PATCH 2/2] firmware: zynqmp: Add dynamic CSU register discovery and sysfs interface
From: Ronak Jain @ 2026-04-08 11:42 UTC (permalink / raw)
  To: michal.simek, senthilnathan.thangaraj
  Cc: linux-kernel, linux-arm-kernel, ronak.jain
In-Reply-To: <20260408114244.2852015-1-ronak.jain@amd.com>

Add support for dynamically discovering and exposing Configuration
Security Unit (CSU) registers through sysfs. Leverage the existing
PM_QUERY_DATA API to discover available registers at runtime, making
the interface flexible and maintainable.

Key features:
- Dynamic register discovery using PM_QUERY_DATA API
  * PM_QID_GET_NODE_COUNT: Query number of available registers
  * PM_QID_GET_NODE_NAME: Query register names by index
- Automatic sysfs attribute creation under csu_registers/ group
- Read operations via existing IOCTL_READ_REG API
- Write operations via existing IOCTL_MASK_WRITE_REG API
- Firmware-enforced access control (read-only registers reject writes)

The sysfs interface is created at:
  /sys/devices/platform/firmware:zynqmp-firmware/csu_registers/

Currently supported registers include:
  - multiboot (CSU_MULTI_BOOT)
  - idcode (CSU_IDCODE, read-only)
  - pcap-status (CSU_PCAP_STATUS, read-only)

The dynamic discovery approach allows firmware to control which
registers are exposed without requiring kernel changes, improving
maintainability and security.

Signed-off-by: Ronak Jain <ronak.jain@amd.com>
---
 MAINTAINERS                              |  10 +
 drivers/firmware/xilinx/Makefile         |   2 +-
 drivers/firmware/xilinx/zynqmp-csu-reg.c | 249 +++++++++++++++++++++++
 drivers/firmware/xilinx/zynqmp-csu-reg.h |  18 ++
 drivers/firmware/xilinx/zynqmp.c         |   6 +
 include/linux/firmware/xlnx-zynqmp.h     |   4 +-
 6 files changed, 287 insertions(+), 2 deletions(-)
 create mode 100644 drivers/firmware/xilinx/zynqmp-csu-reg.c
 create mode 100644 drivers/firmware/xilinx/zynqmp-csu-reg.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 10d12b51b1f6..37fe2b7e0ccf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -29212,6 +29212,16 @@ F:	drivers/dma/xilinx/xdma.c
 F:	include/linux/dma/amd_xdma.h
 F:	include/linux/platform_data/amd_xdma.h
 
+XILINX ZYNQMP CSU REGISTER DRIVER
+M:	Senthil Nathan Thangaraj <senthilnathan.thangaraj@amd.com>
+R:	Michal Simek <michal.simek@amd.com>
+R:	Ronak Jain <ronak.jain@amd.com>
+L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+S:	Maintained
+F:	Documentation/ABI/stable/sysfs-driver-firmware-zynqmp
+F:	drivers/firmware/xilinx/zynqmp-csu-reg.c
+F:	drivers/firmware/xilinx/zynqmp-csu-reg.h
+
 XILINX ZYNQMP DPDMA DRIVER
 M:	Laurent Pinchart <laurent.pinchart@ideasonboard.com>
 L:	dmaengine@vger.kernel.org
diff --git a/drivers/firmware/xilinx/Makefile b/drivers/firmware/xilinx/Makefile
index 8db0e66b6b7e..6203f41daaa6 100644
--- a/drivers/firmware/xilinx/Makefile
+++ b/drivers/firmware/xilinx/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for Xilinx firmwares
 
-obj-$(CONFIG_ZYNQMP_FIRMWARE) += zynqmp.o zynqmp-ufs.o zynqmp-crypto.o
+obj-$(CONFIG_ZYNQMP_FIRMWARE) += zynqmp.o zynqmp-ufs.o zynqmp-crypto.o zynqmp-csu-reg.o
 obj-$(CONFIG_ZYNQMP_FIRMWARE_DEBUG) += zynqmp-debug.o
diff --git a/drivers/firmware/xilinx/zynqmp-csu-reg.c b/drivers/firmware/xilinx/zynqmp-csu-reg.c
new file mode 100644
index 000000000000..1f304ce858b1
--- /dev/null
+++ b/drivers/firmware/xilinx/zynqmp-csu-reg.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xilinx Zynq MPSoC CSU Register Access
+ *
+ * Copyright (C) 2026 Advanced Micro Devices, Inc.
+ *
+ *  Michal Simek <michal.simek@amd.com>
+ *  Ronak Jain <ronak.jain@amd.com>
+ */
+
+#include <linux/firmware/xlnx-zynqmp.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "zynqmp-csu-reg.h"
+
+/* Node ID for CSU module in firmware */
+#define CSU_NODE_ID 0
+
+/* Maximum number of CSU registers supported */
+#define MAX_CSU_REGS 50
+
+/* Size of register name returned by firmware (3 u32 words = 12 bytes) */
+#define CSU_REG_NAME_LEN 12
+
+/**
+ * struct zynqmp_csu_reg - CSU register information
+ * @id: Register index from firmware
+ * @name: Register name
+ * @attr: Device attribute for sysfs
+ */
+struct zynqmp_csu_reg {
+	u32 id;
+	char name[CSU_REG_NAME_LEN];
+	struct device_attribute attr;
+};
+
+/**
+ * struct zynqmp_csu_data - Per-device CSU data
+ * @csu_regs: Array of CSU registers
+ * @csu_reg_count: Number of CSU registers
+ * @csu_attr_group: Attribute group for sysfs
+ */
+struct zynqmp_csu_data {
+	struct zynqmp_csu_reg *csu_regs;
+	int csu_reg_count;
+	struct attribute_group csu_attr_group;
+};
+
+/**
+ * zynqmp_pm_get_node_count() - Get number of supported nodes via QUERY_DATA
+ *
+ * Return: Number of nodes on success, or negative error code
+ */
+static int zynqmp_pm_get_node_count(void)
+{
+	struct zynqmp_pm_query_data qdata = {0};
+	u32 ret_payload[PAYLOAD_ARG_CNT];
+	int ret;
+
+	qdata.qid = PM_QID_GET_NODE_COUNT;
+
+	ret = zynqmp_pm_query_data(qdata, ret_payload);
+	if (ret)
+		return ret;
+
+	return ret_payload[1];
+}
+
+/**
+ * zynqmp_pm_get_node_name() - Get node name via QUERY_DATA
+ * @index: Register index
+ * @name: Buffer to store register name
+ *
+ * Return: 0 on success, error code otherwise
+ */
+static int zynqmp_pm_get_node_name(u32 index, char *name)
+{
+	struct zynqmp_pm_query_data qdata = {0};
+	u32 ret_payload[PAYLOAD_ARG_CNT];
+	int ret;
+
+	qdata.qid = PM_QID_GET_NODE_NAME;
+	qdata.arg1 = index;
+
+	ret = zynqmp_pm_query_data(qdata, ret_payload);
+	if (ret)
+		return ret;
+
+	memcpy(name, &ret_payload[1], CSU_REG_NAME_LEN);
+	name[CSU_REG_NAME_LEN - 1] = '\0';
+
+	return 0;
+}
+
+/**
+ * zynqmp_csu_reg_show() - Generic show function for all registers
+ * @dev: Device pointer
+ * @attr: Device attribute
+ * @buf: Output buffer
+ *
+ * Return: Number of bytes written to buffer, or error code
+ */
+static ssize_t zynqmp_csu_reg_show(struct device *dev,
+				   struct device_attribute *attr,
+				   char *buf)
+{
+	struct zynqmp_csu_reg *reg;
+	u32 value;
+	int ret;
+
+	/* Use container_of to get register directly */
+	reg = container_of(attr, struct zynqmp_csu_reg, attr);
+
+	ret = zynqmp_pm_sec_read_reg(CSU_NODE_ID, reg->id, &value);
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "0x%08x\n", value);
+}
+
+/**
+ * zynqmp_csu_reg_store() - Generic store function for writable registers
+ * @dev: Device pointer
+ * @attr: Device attribute
+ * @buf: Input buffer
+ * @count: Buffer size
+ *
+ * Format: "mask value" - both mask and value required
+ * Example: echo "0xFFFFFFFF 0x12345678" > register
+ *
+ * Return: count on success, error code otherwise
+ */
+static ssize_t zynqmp_csu_reg_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct zynqmp_csu_reg *reg;
+	u32 mask, value;
+	int ret;
+
+	reg = container_of(attr, struct zynqmp_csu_reg, attr);
+
+	if (sscanf(buf, "%x %x", &mask, &value) != 2)
+		return -EINVAL;
+
+	ret = zynqmp_pm_sec_mask_write_reg(CSU_NODE_ID, reg->id, mask, value);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/**
+ * zynqmp_csu_discover_registers() - Discover CSU registers from firmware
+ * @pdev: Platform device pointer
+ *
+ * This function uses PM_QUERY_DATA to discover all available CSU registers
+ * and creates sysfs group under /sys/devices/platform/firmware:zynqmp-firmware/
+ *
+ * Return: 0 on success, error code otherwise
+ */
+int zynqmp_csu_discover_registers(struct platform_device *pdev)
+{
+	struct zynqmp_csu_data *csu_data;
+	struct attribute **attrs;
+	int count, ret, i;
+
+	ret = zynqmp_pm_is_function_supported(PM_QUERY_DATA, PM_QID_GET_NODE_COUNT);
+	if (ret) {
+		dev_dbg(&pdev->dev, "CSU register discovery not supported by current firmware\n");
+		return 0;
+	}
+
+	count = zynqmp_pm_get_node_count();
+	if (count < 0)
+		return count;
+	if (count == 0) {
+		dev_dbg(&pdev->dev, "No nodes available from firmware\n");
+		return 0;
+	}
+
+	/* Validate count to prevent excessive memory allocation */
+	if (count > MAX_CSU_REGS) {
+		dev_err(&pdev->dev, "Register count %d exceeds maximum %d\n",
+			count, MAX_CSU_REGS);
+		return -EINVAL;
+	}
+
+	dev_dbg(&pdev->dev, "Discovered %d nodes from firmware\n", count);
+
+	csu_data = devm_kzalloc(&pdev->dev, sizeof(*csu_data), GFP_KERNEL);
+	if (!csu_data)
+		return -ENOMEM;
+
+	csu_data->csu_reg_count = count;
+
+	csu_data->csu_regs = devm_kcalloc(&pdev->dev, count, sizeof(*csu_data->csu_regs),
+					  GFP_KERNEL);
+	if (!csu_data->csu_regs) {
+		devm_kfree(&pdev->dev, csu_data);
+		return -ENOMEM;
+	}
+
+	attrs = devm_kcalloc(&pdev->dev, count + 1, sizeof(*attrs), GFP_KERNEL);
+	if (!attrs) {
+		devm_kfree(&pdev->dev, csu_data->csu_regs);
+		devm_kfree(&pdev->dev, csu_data);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < count; i++) {
+		struct zynqmp_csu_reg *reg = &csu_data->csu_regs[i];
+		struct device_attribute *dev_attr = &reg->attr;
+
+		reg->id = i;
+
+		ret = zynqmp_pm_get_node_name(i, reg->name);
+		if (ret) {
+			dev_warn(&pdev->dev, "Failed to get name for register %d\n", i);
+			snprintf(reg->name, sizeof(reg->name), "csu_reg_%d", i);
+		}
+
+		/* Create sysfs attribute - firmware enforces actual access control */
+		sysfs_attr_init(&dev_attr->attr);
+		dev_attr->attr.name = reg->name;
+		dev_attr->attr.mode = 0644;
+		dev_attr->show = zynqmp_csu_reg_show;
+		dev_attr->store = zynqmp_csu_reg_store;
+
+		attrs[i] = &dev_attr->attr;
+
+		dev_dbg(&pdev->dev, "Register %d: id=%d name=%s\n", i, reg->id, reg->name);
+	}
+
+	csu_data->csu_attr_group.name = "csu_registers";
+	csu_data->csu_attr_group.attrs = attrs;
+
+	ret = devm_device_add_group(&pdev->dev, &csu_data->csu_attr_group);
+	if (ret) {
+		devm_kfree(&pdev->dev, attrs);
+		devm_kfree(&pdev->dev, csu_data->csu_regs);
+		devm_kfree(&pdev->dev, csu_data);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(zynqmp_csu_discover_registers);
diff --git a/drivers/firmware/xilinx/zynqmp-csu-reg.h b/drivers/firmware/xilinx/zynqmp-csu-reg.h
new file mode 100644
index 000000000000..b12415db3496
--- /dev/null
+++ b/drivers/firmware/xilinx/zynqmp-csu-reg.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Xilinx Zynq MPSoC CSU Register Access
+ *
+ * Copyright (C) 2026 Advanced Micro Devices, Inc.
+ *
+ *  Michal Simek <michal.simek@amd.com>
+ *  Ronak Jain <ronak.jain@amd.com>
+ */
+
+#ifndef __ZYNQMP_CSU_REG_H__
+#define __ZYNQMP_CSU_REG_H__
+
+#include <linux/platform_device.h>
+
+int zynqmp_csu_discover_registers(struct platform_device *pdev);
+
+#endif /* __ZYNQMP_CSU_REG_H__ */
diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index fbe8510f4927..b549d07f7497 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -27,6 +27,7 @@
 
 #include <linux/firmware/xlnx-zynqmp.h>
 #include <linux/firmware/xlnx-event-manager.h>
+#include "zynqmp-csu-reg.h"
 #include "zynqmp-debug.h"
 
 /* Max HashMap Order for PM API feature check (1<<7 = 128) */
@@ -2120,6 +2121,11 @@ static int zynqmp_firmware_probe(struct platform_device *pdev)
 			dev_err_probe(&pdev->dev, PTR_ERR(em_dev), "EM register fail with error\n");
 	}
 
+	/* Discover CSU registers dynamically */
+	ret = zynqmp_csu_discover_registers(pdev);
+	if (ret)
+		dev_warn(&pdev->dev, "CSU register discovery failed: %d\n", ret);
+
 	return of_platform_populate(dev->of_node, NULL, NULL, dev);
 }
 
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index d70dcd462b44..a4b293eb96ce 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -3,7 +3,7 @@
  * Xilinx Zynq MPSoC Firmware layer
  *
  *  Copyright (C) 2014-2021 Xilinx
- *  Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc.
+ *  Copyright (C) 2022 - 2026 Advanced Micro Devices, Inc.
  *
  *  Michal Simek <michal.simek@amd.com>
  *  Davorin Mista <davorin.mista@aggios.com>
@@ -262,6 +262,8 @@ enum pm_query_id {
 	PM_QID_CLOCK_GET_NUM_CLOCKS = 12,
 	PM_QID_CLOCK_GET_MAX_DIVISOR = 13,
 	PM_QID_PINCTRL_GET_ATTRIBUTES = 15,
+	PM_QID_GET_NODE_NAME = 16,
+	PM_QID_GET_NODE_COUNT = 17,
 };
 
 enum rpu_oper_mode {
-- 
2.34.1



^ permalink raw reply related

* Re: [PATCH] nvmem: imx-ocotp: Initialize in subsys_initcall
From: Greg KH @ 2026-04-08 11:46 UTC (permalink / raw)
  To: Paul Geurts
  Cc: srini, Frank.Li, s.hauer, kernel, festevam, p.zabel, imx,
	linux-arm-kernel, linux-kernel, martijn.de.gouw
In-Reply-To: <20260408101901.2111140-1-paul.geurts@prodrive-technologies.com>

On Wed, Apr 08, 2026 at 12:19:01PM +0200, Paul Geurts wrote:
> The i.MX OCOTP driver is implemented as module_platform_driver();,
> which makes it initialize in device_initcall(). This means that all
> drivers referencing the clock driver nodes in the device tree are
> deferred by fw_devlink.
> 
> As the OCOTP driver is arch specific, but dependent on the i.MX clock
> driver, which is also initialized in arch_initcall(), explicitly
> initialize the driver in subsys_initcall(). This makes sure the drivers
> depending on fuses defined by OCOTP, which are initialized in
> device_initcall() are not deferred.
> 
> Fixes: 3edba6b47e42 ("nvmem: imx-ocotp: Add i.MX6 OCOTP driver")
> Signed-off-by: Paul Geurts <paul.geurts@prodrive-technologies.com>
> ---
>  drivers/nvmem/imx-ocotp.c | 13 ++++++++++++-
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvmem/imx-ocotp.c b/drivers/nvmem/imx-ocotp.c
> index 108d78d7f6cb..9b1e7bb14ced 100644
> --- a/drivers/nvmem/imx-ocotp.c
> +++ b/drivers/nvmem/imx-ocotp.c
> @@ -638,7 +638,18 @@ static struct platform_driver imx_ocotp_driver = {
>  		.of_match_table = imx_ocotp_dt_ids,
>  	},
>  };
> -module_platform_driver(imx_ocotp_driver);
> +
> +static int __init imx_ocotp_init(void)
> +{
> +	return platform_driver_register(&imx_ocotp_driver);
> +}
> +subsys_initcall(imx_ocotp_init);

This is not a subsystem, sorry, but this isn't ok for a single driver to
use.

Please use the default level here, module_platform_driver() is correct,
and handle the deferred probe correctly, that is what it is designed to
do.  Playing games with init levels will not solve the root problem
here, as was pointed out by the fact that you could load this module in
any order and have the exact same problem you are attempting to "solve"
here.

thanks,

greg k-h


^ permalink raw reply

* [PATCH] ARM: dts: exynos: Add bluetooth support to manta
From: Lukas Timmermann @ 2026-04-08 11:56 UTC (permalink / raw)
  To: Rob Herring, Krzysztof Kozlowski, Conor Dooley, Alim Akhtar
  Cc: devicetree, linux-arm-kernel, linux-samsung-soc, linux-kernel,
	Lukas Timmermann, Alexandre Marquet

Enable the bcm4330-bt device for manta boards on serial0.
Also add the necessary pin definitions and interrupt handling for
wakeup.

Signed-off-by: Lukas Timmermann <linux@timmermann.space>
Co-developed-by: Alexandre Marquet <tb@a-marquet.fr>
Signed-off-by: Alexandre Marquet <tb@a-marquet.fr>
---
This patch depends on previous patches which are
currently only found in linux-next.
See: https://lore.kernel.org/all/177214038655.341086.4114348823043257597.b4-ty@kernel.org/
---
 arch/arm/boot/dts/samsung/exynos5250-manta.dts | 41 +++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/samsung/exynos5250-manta.dts b/arch/arm/boot/dts/samsung/exynos5250-manta.dts
index 24a27b342227..76d3657eb22f 100644
--- a/arch/arm/boot/dts/samsung/exynos5250-manta.dts
+++ b/arch/arm/boot/dts/samsung/exynos5250-manta.dts
@@ -461,6 +461,13 @@ acc_int: acc-int-pins {
 		samsung,pin-pud = <EXYNOS_PIN_PULL_UP>;
 	};
 
+	bt_host_wakeup: bt-host-wakeup-pins {
+		samsung,pins = "gpx2-6";
+		samsung,pin-function = <EXYNOS_PIN_FUNC_INPUT>;
+		samsung,pin-pud = <EXYNOS_PIN_PULL_NONE>;
+		samsung,pin-drv = <EXYNOS4_PIN_DRV_LV1>;
+	};
+
 	max77686_irq: max77686-irq-pins {
 		samsung,pins = "gpx0-2";
 		samsung,pin-function = <EXYNOS_PIN_FUNC_F>;
@@ -488,6 +495,20 @@ bh1721fvc_reset: bh1721fvc-reset-pins {
 		samsung,pin-pud = <EXYNOS_PIN_PULL_NONE>;
 	};
 
+	bt_reg_on: bt-reg-on-pins {
+		samsung,pins = "gph0-0";
+		samsung,pin-function = <EXYNOS_PIN_FUNC_OUTPUT>;
+		samsung,pin-con-pdn = <EXYNOS_PIN_PDN_PREV>;
+		samsung,pin-pud-pdn = <EXYNOS_PIN_PULL_NONE>;
+	};
+
+	bt_wake: bt-wake-pins {
+		samsung,pins = "gph1-3";
+		samsung,pin-function = <EXYNOS_PIN_FUNC_OUTPUT>;
+		samsung,pin-con-pdn = <EXYNOS_PIN_PDN_PREV>;
+		samsung,pin-pud-pdn = <EXYNOS_PIN_PULL_NONE>;
+	};
+
 	msense_reset: msense-reset-pins {
 		samsung,pins = "gpg2-0";
 		samsung,pin-function = <EXYNOS_PIN_FUNC_OUTPUT>;
@@ -536,7 +557,25 @@ &sd1_cmd {
 
 /* Bluetooth */
 &serial_0 {
-	status = "disabled";
+	pinctrl-0 = <&uart0_data &uart0_fctl>;
+	pinctrl-names = "default";
+
+	bluetooth {
+		compatible = "brcm,bcm4330-bt";
+
+		pinctrl-0 = <&bt_reg_on &bt_wake &bt_host_wakeup>;
+		pinctrl-names = "default";
+
+		shutdown-gpios = <&gph0 0 GPIO_ACTIVE_HIGH>;
+		device-wakeup-gpios = <&gph1 3 GPIO_ACTIVE_HIGH>;
+
+		interrupt-parent = <&gpx2>;
+		interrupts = <6 IRQ_TYPE_EDGE_FALLING>;
+		interrupt-names = "host-wakeup";
+
+		clocks = <&max77686 MAX77686_CLK_PMIC>;
+		clock-names = "lpo";
+	};
 };
 
 /* GPS */

---
base-commit: e5f7e05a699f41275d6380c497293446034bc8af
change-id: 20260404-manta-bluetooth-836133028bb6

Best regards,
--  
Lukas Timmermann <linux@timmermann.space>



^ permalink raw reply related

* Re: [RFC V1 05/16] arm64/mm: Convert READ_ONCE() as pmdp_get() while accessing PMD
From: David Hildenbrand (Arm) @ 2026-04-08 12:11 UTC (permalink / raw)
  To: Anshuman Khandual, linux-arm-kernel
  Cc: Catalin Marinas, Will Deacon, Ryan Roberts, Mark Rutland,
	Lorenzo Stoakes, Andrew Morton, Mike Rapoport, Linu Cherian,
	linux-kernel, linux-mm, kasan-dev
In-Reply-To: <20260224051153.3150613-6-anshuman.khandual@arm.com>

On 2/24/26 06:11, Anshuman Khandual wrote:
> Convert all READ_ONCE() based PMD accesses as pmdp_get() instead which will
> support both D64 and D128 translation regime going forward.

You should mention the move from pmdp_test_and_clear_young(), and why it
is performed.

Nothing else jumped at me :)

-- 
Cheers,

David


^ permalink raw reply

* Re: [RFC V1 00/16] arm64/mm: Enable 128 bit page table entries
From: David Hildenbrand (Arm) @ 2026-04-08 12:13 UTC (permalink / raw)
  To: Anshuman Khandual, linux-arm-kernel
  Cc: Catalin Marinas, Will Deacon, Ryan Roberts, Mark Rutland,
	Lorenzo Stoakes, Andrew Morton, Mike Rapoport, Linu Cherian,
	linux-kernel, linux-mm
In-Reply-To: <8d2c9ecb-ae33-42f2-a8ed-66b3286b9286@arm.com>

On 4/8/26 12:53, Anshuman Khandual wrote:
> On 07/04/26 8:14 PM, David Hildenbrand (Arm) wrote:
>> On 2/24/26 06:11, Anshuman Khandual wrote:
>>> FEAT_D128 is a new arm architecture feature adding support for VMSAv9-128
>>> translation system. FEAT_D128 is an optional feature from ARMV9.3 onwards.
>>> So with this feature arm64 platforms could have two different translation
>>> systems, VMSAv8-64 and VMSAv9-128 could selectively be enabled.
>>>
>>> FEAT_D128 adds 128 bit page table entries, thus supporting larger physical
>>> and virtual address range while also expanding available room for more MMU
>>> management feature bits both for HW and SW. 
>>>
>>> This series has been split into two parts. Generic MM changes followed by
>>> arm64 platform changes, finally enabling D128 with a new config ARM64_D128.
>>>
>>> READ_ONCE() on page table entries get routed via level specific pxdp_get()
>>> helpers which platforms could then override when required. These accessors
>>> on arm64 platform help in ensuring page table accesses are performed in an
>>> atomic manner while reading 128 bit page table entries.
>>>
>>> All ARM64_VA_BITS and ARM64_PA_BITS combinations for all page sizes are now
>>> supported both on D64 and D128 translation regimes. Although new 56 bits VA
>>> space is not yet supported. Similarly FEAT_D128 skip level is not supported
>>> currently.
>>>
>>> Basic page table geometry has been changed with D128 as there are now fewer
>>> entries per level. Please refer to the following table for leaf entry sizes
>>>
>>>                     D64              D128
>>> ------------------------------------------------
>>> | PAGE_SIZE |   PMD  |  PUD  |   PMD  |   PUD  |
>>> -----------------------------|-----------------|
>>> |     4K    |    2M  |  1G   |    1M  |  256M  |
>>> |    16K    |   32M  | 64G   |   16M  |   16G  |
>>> |    64K    |  512M  |  4T   |  256M  |    1T  |
>>> ------------------------------------------------
>>>
>>
>> Interesting. That means user space will have it even harder to optimize
>> for THP sizes.
>>
>> What's the effect on cont-pte? Do they still span the same number of
>> entries and there is effectively no change?
> 
> The numbers are the same for 4K base page size but will need
> some changes for 16K and 64K base page sizes. Something that
> got missed in this series, will fix it.

Oh, and it would be great to also clearly spell out the effect on
hugetlb as well. I assume the available hugetlb sizes will change as well.

-- 
Cheers,

David


^ permalink raw reply

* Re: [RFC V1 06/16] arm64/mm: Convert READ_ONCE() as pudp_get() while accessing PUD
From: David Hildenbrand (Arm) @ 2026-04-08 12:15 UTC (permalink / raw)
  To: Anshuman Khandual, linux-arm-kernel
  Cc: Catalin Marinas, Will Deacon, Ryan Roberts, Mark Rutland,
	Lorenzo Stoakes, Andrew Morton, Mike Rapoport, Linu Cherian,
	linux-kernel, linux-mm, kasan-dev
In-Reply-To: <20260224051153.3150613-7-anshuman.khandual@arm.com>

On 2/24/26 06:11, Anshuman Khandual wrote:
> Convert all READ_ONCE() based PUD accesses as pudp_get() instead which will
> support both D64 and D128 translation regime going forward.
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Cc: kasan-dev@googlegroups.com
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---

I was wondering for a second whether it would be better to structure
this as "convert READ_ONCE to use pxxxp_get() in fault.c" instead,
essentially, to touch each file only once.

Anyhow

Reviewed-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David


^ permalink raw reply

* Re: [RFC V1 07/16] arm64/mm: Convert READ_ONCE() as p4dp_get() while accessing P4D
From: David Hildenbrand (Arm) @ 2026-04-08 12:17 UTC (permalink / raw)
  To: Anshuman Khandual, linux-arm-kernel
  Cc: Catalin Marinas, Will Deacon, Ryan Roberts, Mark Rutland,
	Lorenzo Stoakes, Andrew Morton, Mike Rapoport, Linu Cherian,
	linux-kernel, linux-mm, kasan-dev
In-Reply-To: <20260224051153.3150613-8-anshuman.khandual@arm.com>

On 2/24/26 06:11, Anshuman Khandual wrote:
> Convert all READ_ONCE() based P4D accesses as p4dp_get() instead which will
> support both D64 and D128 translation regime going forward.
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Cc: kasan-dev@googlegroups.com
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---


[...]

>  static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index a80d06db4de6..16ae11b29f66 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -354,7 +354,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>  {
>  	int ret = 0;
>  	unsigned long next;
> -	p4d_t p4d = READ_ONCE(*p4dp);
> +	p4d_t p4d = p4dp_get(p4dp);
>  	pud_t *pudp;
>  
>  	if (p4d_none(p4d)) {
> @@ -443,7 +443,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  	}
>  
>  	do {
> -		p4d_t old_p4d = READ_ONCE(*p4dp);
> +		p4d_t old_p4d = p4dp_get(p4dp);
>  
>  		next = p4d_addr_end(addr, end);
>  
> @@ -453,7 +453,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  			goto out;
>  
>  		BUG_ON(p4d_val(old_p4d) != 0 &&
> -		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));
> +		       p4d_val(old_p4d) != (p4d_val(p4dp_get(p4dp))));

Same here, while at it remove the BUG_ON. (see below)

>  
>  		phys += next - addr;
>  	} while (p4dp++, addr = next, addr != end);
> @@ -1541,7 +1541,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
>  	do {
>  		next = p4d_addr_end(addr, end);
>  		p4dp = p4d_offset(pgdp, addr);
> -		p4d = READ_ONCE(*p4dp);
> +		p4d = p4dp_get(p4dp);
>  		if (p4d_none(p4d))
>  			continue;
>  
> @@ -1703,7 +1703,7 @@ static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
>  	do {
>  		next = p4d_addr_end(addr, end);
>  		p4dp = p4d_offset(pgdp, addr);
> -		p4d = READ_ONCE(*p4dp);
> +		p4d = p4dp_get(p4dp);
>  		if (p4d_none(p4d))
>  			continue;
>  
> @@ -1724,7 +1724,7 @@ static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
>  	 */
>  	p4dp = p4d_offset(pgdp, 0UL);
>  	for (i = 0; i < PTRS_PER_P4D; i++) {
> -		if (!p4d_none(READ_ONCE(p4dp[i])))
> +		if (!p4d_none(p4dp_get(p4dp + i)))
>  			return;
>  	}
>  
> @@ -2258,4 +2258,21 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
>  
> +#if CONFIG_PGTABLE_LEVELS > 3
> +phys_addr_t pud_offset_phys(p4d_t *p4dp, unsigned long addr)
> +{
> +	p4d_t p4d = p4dp_get(p4dp);
> +
> +	BUG_ON(!pgtable_l4_enabled());

Heh, while at it, convert that to a VM_WARN_ON_ONCE() or anything else
that is not a BUG.

I strongly assume CONFIG_DEBUG_VM checks are sufficient.

> +
> +	return p4d_page_paddr(p4d) + pud_index(addr) * sizeof(pud_t);
> +}
> +

-- 
Cheers,

David


^ permalink raw reply

* Re: [RFC V1 08/16] arm64/mm: Convert READ_ONCE() as pgdp_get() while accessing PGD
From: David Hildenbrand (Arm) @ 2026-04-08 12:19 UTC (permalink / raw)
  To: Anshuman Khandual, linux-arm-kernel
  Cc: Catalin Marinas, Will Deacon, Ryan Roberts, Mark Rutland,
	Lorenzo Stoakes, Andrew Morton, Mike Rapoport, Linu Cherian,
	linux-kernel, linux-mm, kasan-dev
In-Reply-To: <20260224051153.3150613-9-anshuman.khandual@arm.com>

On 2/24/26 06:11, Anshuman Khandual wrote:
> Convert all READ_ONCE() based PGD accesses as pgdp_get() instead which will
> support both D64 and D128 translation regime going forward.

Please mention here why you move p4d_offset_phys/p4d_offset. (same
applies to other patches)

Do we get additional function calls that might degrade some page table
walkers?

Same comment regarding BUG_ON.

-- 
Cheers,

David


^ permalink raw reply

* Re: [PATCH v2 1/4] perf/arm_pmuv3: Fix NULL pointer dereference in armv8pmu_sched_task()
From: Usama Arif @ 2026-04-08 12:23 UTC (permalink / raw)
  To: Puranjay Mohan
  Cc: Usama Arif, bpf, Puranjay Mohan, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Will Deacon,
	Mark Rutland, Catalin Marinas, Leo Yan, Rob Herring, Breno Leitao,
	linux-arm-kernel, linux-perf-users, kernel-team
In-Reply-To: <20260318171706.2840512-2-puranjay@kernel.org>

On Wed, 18 Mar 2026 10:16:55 -0700 Puranjay Mohan <puranjay@kernel.org> wrote:

> This is easily triggered with:
> 
>   perf record -b -e cycles -a -- ls
> 
> which crashes on the first context switch with:
> 
>   Unable to handle kernel NULL pointer dereference at virtual address 00[.]
>   PC is at armv8pmu_sched_task+0x14/0x50
>   LR is at perf_pmu_sched_task+0xac/0x108
>   Call trace:
>     armv8pmu_sched_task+0x14/0x50 (P)
>     perf_pmu_sched_task+0xac/0x108
>     __perf_event_task_sched_out+0x6c/0xe0
>     prepare_task_switch+0x120/0x268
>     __schedule+0x1e8/0x828
>     ...
> 
> perf_pmu_sched_task() invokes the PMU sched callback with cpc->task_epc,
> which is NULL when no per-task events exist for this PMU. With CPU-wide
> branch-stack events, armv8pmu_sched_task() is still registered and
> dereferences pmu_ctx->pmu unconditionally, causing the crash.
> 
> The bug was introduced by commit fa9d27773873 ("perf: arm_pmu: Kill last
> use of per-CPU cpu_armpmu pointer") which changed the function from
> using the per-CPU cpu_armpmu pointer (always valid) to dereferencing
> pmu_ctx->pmu without adding a NULL check.
> 
> Add a NULL check for pmu_ctx to avoid the crash.
> 
> Fixes: fa9d27773873 ("perf: arm_pmu: Kill last use of per-CPU cpu_armpmu pointer")
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
>  drivers/perf/arm_pmuv3.c | 11 +++++++++--
>  1 file changed, 9 insertions(+), 2 deletions(-)
> 

Acked-by: Usama Arif <usama.arif@linux.dev>


^ permalink raw reply

* Re: [RFC V1 09/16] arm64/mm: Route all pgtable reads via ptdesc_get()
From: David Hildenbrand (Arm) @ 2026-04-08 12:25 UTC (permalink / raw)
  To: Anshuman Khandual, Mike Rapoport
  Cc: linux-arm-kernel, Catalin Marinas, Will Deacon, Ryan Roberts,
	Mark Rutland, Lorenzo Stoakes, Andrew Morton, Linu Cherian,
	linux-kernel, linux-mm
In-Reply-To: <7f7130d0-f348-451e-960e-b7e6ae9c9ee7@arm.com>

On 3/2/26 05:34, Anshuman Khandual wrote:
> 
> 
> On 28/02/26 4:47 PM, Mike Rapoport wrote:
>> Hi Anshuman,
>>
>> On Tue, Feb 24, 2026 at 10:41:46AM +0530, Anshuman Khandual wrote:
>>> Define arm64 platform specific implementations for new pXdp_get() helpers.
>>> These resolve into READ_ONCE(), thus ensuring required single copy atomic
>>> semantics for the page table entry reads.
>>>
>>> In future this infrastructure can be used for D128 to maintain single copy
>>> atomicity semantics with inline asm blocks.
>>>
>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>> Cc: Will Deacon <will@kernel.org>
>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>> Cc: Mark Rutland <mark.rutland@arm.com>
>>> Cc: linux-arm-kernel@lists.infradead.org
>>> Cc: linux-kernel@vger.kernel.org
>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>> ---
>>>  arch/arm64/include/asm/pgtable.h | 28 +++++++++++++++++++++++++++-
>>>  1 file changed, 27 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>>> index 257af1c3015d..804ef49aea88 100644
>>> --- a/arch/arm64/include/asm/pgtable.h
>>> +++ b/arch/arm64/include/asm/pgtable.h
>>> @@ -84,6 +84,32 @@ static inline void arch_leave_lazy_mmu_mode(void)
>>>  	arch_flush_lazy_mmu_mode();
>>>  }
>>>  
>>> +#define ptdesc_get(x)		READ_ONCE(x)
>>
>> This will be confusing with 'struct ptdesc' APIs, maybe ptent_get()?
> 
> Created 'ptdesc_t' earlier on arm64 platform as an unified data type, which could
> represent page table entries including their protection fields and masks for any
> level.
> 
> typedef u64 ptdesc_t;
> 
> typedef ptdesc_t pteval_t;
> typedef ptdesc_t pmdval_t;
> typedef ptdesc_t pudval_t;
> typedef ptdesc_t p4dval_t;
> typedef ptdesc_t pgdval_t;
> 
> But now it conflicts with generic 'struct ptdesc'. Agreed that overall renaming is
> required. Probably ptent_t along with ptent_get/set() could be an option. But that
> is probably orthogonal to the series and can be done later in a separate patch.

We often refer to *ptep as ptent. Can we find something that does not
contain "pte" to avoid confusion with pteval_t ?

We often refer to the set of functions as "pXX" or "pxx". pxxval_t would
certainly be an option, but it would be the first "official" such
occurrence.

-- 
Cheers,

David


^ permalink raw reply

* [PATCH v11 01/14] asm-generic: barrier: Add smp_cond_load_relaxed_timeout()
From: Ankur Arora @ 2026-04-08 12:25 UTC (permalink / raw)
  To: linux-kernel, linux-arch, linux-arm-kernel, linux-pm, bpf
  Cc: arnd, catalin.marinas, will, peterz, akpm, mark.rutland, harisokn,
	cl, ast, rafael, daniel.lezcano, memxor, zhenglifeng1, xueshuai,
	rdunlap, david.laight.linux, joao.m.martins, boris.ostrovsky,
	konrad.wilk, ashok.bhat, Ankur Arora
In-Reply-To: <20260408122538.3610871-1-ankur.a.arora@oracle.com>

Add smp_cond_load_relaxed_timeout(), which extends
smp_cond_load_relaxed() to allow waiting for a duration.

We loop around waiting for the condition variable to change while
periodically doing a time-check. The loop uses cpu_poll_relax() to slow
down the busy-wait, which, unless overridden by the architecture
code, amounts to a cpu_relax().

Note that there are two ways for the time-check to fail: the timeout
case, or @time_expr_ns returning an invalid value (negative or zero).
The second failure mode allows for clocks attached to the clock-domain
of @cond_expr --  which might cease to operate meaningfully once some
state internal to @cond_expr has changed -- to fail.

Evaluation of @time_expr_ns: in the fastpath we want to keep the
performance close to smp_cond_load_relaxed(). So defer evaluation
of the potentially costly @time_expr_ns to the slowpath.

This also means that there will always be some hardware dependent
duration that has passed in cpu_poll_relax() iterations at the time
of first evaluation. Additionally cpu_poll_relax() is not guaranteed
to return at timeout boundary. In sum, expect timeout overshoot when
we exit due to expiration of the timeout.

The number of spin iterations before time-check, SMP_TIMEOUT_POLL_COUNT
is chosen to be 200 by default. With a cpu_poll_relax() iteration
taking ~20-30 cycles (measured on a variety of x86 platforms), we
expect a time-check every ~4000-6000 cycles.

The outer limit of the overshoot is double that when working with the
parameters above. This might be higher or lower depending on the
implementation of cpu_poll_relax() across architectures.

Lastly, config option ARCH_HAS_CPU_RELAX indicates availability of a
cpu_poll_relax() that is cheaper than polling. This might be relevant
for cases with a long timeout.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-arch@vger.kernel.org
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
Notes:
   - add a comment mentioning that smp_cond_load_relaxed_timeout() might
     be using architectural primitives that don't support MMIO.
     (David Laight, Catalin Marinas)

 include/asm-generic/barrier.h | 69 +++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index d4f581c1e21d..e5a6a1c04649 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -273,6 +273,75 @@ do {									\
 })
 #endif
 
+/*
+ * Number of times we iterate in the loop before doing the time check.
+ * Note that the iteration count assumes that the loop condition is
+ * relatively cheap.
+ */
+#ifndef SMP_TIMEOUT_POLL_COUNT
+#define SMP_TIMEOUT_POLL_COUNT		200
+#endif
+
+/*
+ * Platforms with ARCH_HAS_CPU_RELAX have a cpu_poll_relax() implementation
+ * that is expected to be cheaper (lower power) than pure polling.
+ */
+#ifndef cpu_poll_relax
+#define cpu_poll_relax(ptr, val, timeout_ns)	cpu_relax()
+#endif
+
+/**
+ * smp_cond_load_relaxed_timeout() - (Spin) wait for cond with no ordering
+ * guarantees until a timeout expires.
+ * @ptr: pointer to the variable to wait on.
+ * @cond_expr: boolean expression to wait for.
+ * @time_expr_ns: expression that evaluates to monotonic time (in ns) or,
+ *  on failure, returns a non-positive value (negative or zero).
+ * @timeout_ns: timeout value in ns
+ * Both of the above are assumed to be compatible with s64; the signed
+ * value is used to handle the failure case in @time_expr_ns.
+ *
+ * Equivalent to using READ_ONCE() on the condition variable.
+ *
+ * Callers that expect to wait for prolonged durations might want
+ * to take into account the availability of ARCH_HAS_CPU_RELAX.
+ *
+ * Note that @ptr is expected to point to a memory address. Using this
+ * interface with MMIO will be slower (since SMP_TIMEOUT_POLL_COUNT is
+ * tuned for memory) and might also break in interesting architecture
+ * dependent ways.
+ */
+#ifndef smp_cond_load_relaxed_timeout
+#define smp_cond_load_relaxed_timeout(ptr, cond_expr,			\
+				      time_expr_ns, timeout_ns)		\
+({									\
+	typeof(ptr) __PTR = (ptr);					\
+	__unqual_scalar_typeof(*ptr) VAL;				\
+	u32 __n = 0, __spin = SMP_TIMEOUT_POLL_COUNT;			\
+	s64 __timeout = (s64)timeout_ns;				\
+	s64 __time_now, __time_end = 0;					\
+									\
+	for (;;) {							\
+		VAL = READ_ONCE(*__PTR);				\
+		if (cond_expr)						\
+			break;						\
+		cpu_poll_relax(__PTR, VAL, (u64)__timeout);		\
+		if (++__n < __spin)					\
+			continue;					\
+		__time_now = (s64)(time_expr_ns);			\
+		if (unlikely(__time_end == 0))				\
+			__time_end = __time_now + __timeout;		\
+		__timeout = __time_end - __time_now;			\
+		if (__time_now <= 0 || __timeout <= 0) {		\
+			VAL = READ_ONCE(*__PTR);			\
+			break;						\
+		}							\
+		__n = 0;						\
+	}								\
+	(typeof(*ptr))VAL;						\
+})
+#endif
+
 /*
  * pmem_wmb() ensures that all stores for which the modification
  * are written to persistent storage by preceding instructions have
-- 
2.31.1



^ permalink raw reply related

* [PATCH v11 00/14] barrier: Add smp_cond_load_{relaxed,acquire}_timeout()
From: Ankur Arora @ 2026-04-08 12:25 UTC (permalink / raw)
  To: linux-kernel, linux-arch, linux-arm-kernel, linux-pm, bpf
  Cc: arnd, catalin.marinas, will, peterz, akpm, mark.rutland, harisokn,
	cl, ast, rafael, daniel.lezcano, memxor, zhenglifeng1, xueshuai,
	rdunlap, david.laight.linux, joao.m.martins, boris.ostrovsky,
	konrad.wilk, ashok.bhat, Ankur Arora

Hi,

Main change in this version:
  - adds a kunit validation test.

What remains?:
  - Review by PeterZ of the new interface tif_need_resched_relaxed_wait()
    (patch 11, "sched: add need-resched timed wait interface").
    (Peter had originally proposed using smp_cond_load_relaxed() in
     poll_idle() [11]).

The core kernel often uses smp_cond_load_{relaxed,acquire}() to spin
on condition variables with architectural primitives used to avoid
hammering the relevant cachelines.

(This primitive can vary greatly across architectures: on x86 it's a
cpu_relax() to slow down the pipeline. On arm64, this is a __cmpwait()
which waits for a cacheline to change state in a time limited fashion.)

Regardless of architectural details, typical smp_cond_load*() usage
does not allow for termination until the condition change occurs.

Beyond the core kernel, there are cases where it is useful to additionally
terminate on a timeout. Two cases:

  - cpuidle poll_idle(): wait for need-resched until the cpuidle polling
    duration expires.

  - rqspinlock: nested qspinlock acquisition that terminates on timeout
    or deadlock.

Accordingly add two interfaces (with their generic and arm64 specific
implementations):

   smp_cond_load_relaxed_timeout(ptr, cond_expr, time_expr, timeout)
   smp_cond_load_acquire_timeout(ptr, cond_expr, time_expr, timeout)

Also add tif_need_resched_relaxed_wait() which wraps the polling
pattern and its scheduler specific details in poll_idle().
In addition add atomic_cond_read_*_timeout(),
atomic64_cond_read_*_timeout(), and atomic_long wrappers.

Structurally, both the smp_cond_load_*_timeout() interfaces are similar
to smp_cond_load*(), with the addition of a rate-limited time-check.

Usage
==

These interfaces drop straight-forwardly into the rqspinlock logic
since qspinlock already uses smp_cond_load*(), and the time-check
extension can now be used for timeout and deadlock handling.

Using tif_need_resched_relaxed_wait() in poll_idle() removes any
architectural details allowing arm64 to straight-forwardly support
that path.
(However, for efficiency reasons cpuidle/poll_state.c continues to
depend on ARCH_HAS_CPU_RELAX since that is defined on architectures
with an optimized architectural primitive.)


Performance
==

Apart from simplifications due to this change, supporting polling in
cpuidle on arm64 helps improve wakeup latency (needs a few cpuidle/acpi
patches):


  # perf stat -r 5 --cpu 4,5 -e task-clock,cycles,instructions,sched:sched_wake_idle_without_ipi \
  perf bench sched pipe -l 1000000 -c 4

  # No haltpoll (and, no TIF_POLLING_NRFLAG):

  Performance counter stats for 'CPU(s) 4,5' (5 runs):

         25,229.57 msec task-clock                       #    2.000 CPUs utilized               ( +-  7.75% )
    45,821,250,284      cycles                           #    1.816 GHz                         ( +- 10.07% )
    26,557,496,665      instructions                     #    0.58  insn per cycle              ( +-  0.21% )
                 0      sched:sched_wake_idle_without_ipi #    0.000 /sec

       12.615 +- 0.977 seconds time elapsed  ( +-  7.75% )


  # Haltpoll:

  Performance counter stats for 'CPU(s) 4,5' (5 runs):

         15,131.58 msec task-clock                       #    2.000 CPUs utilized               ( +- 10.00% )
    34,158,188,839      cycles                           #    2.257 GHz                         ( +-  6.91% )
    20,824,950,916      instructions                     #    0.61  insn per cycle              ( +-  0.09% )
         1,983,822      sched:sched_wake_idle_without_ipi #  131.105 K/sec                       ( +-  0.78% )

        7.566 +- 0.756 seconds time elapsed  ( +- 10.00% )

  We get improved latency because we don't switch in and out of a
  deeper sleep state or from the hypervisor. This also causes us to
  execute ~20% fewer instructions.


Haris Okanovic also saw improvement in real workloads due to the
cpuidle changes: "observed 4-6% improvements in memcached, cassandra,
mysql, and postgresql under certain loads. Other applications likely
benefit too." [12]


Changelog:
  v10 [10]:
   - add a comment mentioning that smp_cond_load_relaxed_timeout() might
     be using architectural primitives that don't support MMIO.
     (David Laight, Catalin Marinas)
   - added a kunit test for smp_cond_load_relaxed_timeout() (Andrew
     Morton.)

  v9 [9]:
   - s/@cond/@cond_expr/ (Randy Dunlap)
   - Clarify that SMP_TIMEOUT_POLL_COUNT is only around memory
     addresses. (David Laight)
   - Add the missing config ARCH_HAS_CPU_RELAX in arch/arm64/Kconfig.
     (Catalin Marinas).
   - Switch to arch_counter_get_cntvct_stable() (via __delay_cycles())
     in the cmpwait path instead of using arch_timer_read_counter().
     (Catalin Marinas)

  v8 [0]:
   - Defer evaluation of @time_expr_ns to when we hit the slowpath.
      (comment from Alexei Starovoitov).

   - Mention that cpu_poll_relax() is better than raw CPU polling
     only where ARCH_HAS_CPU_RELAX is defined.
     - also define ARCH_HAS_CPU_RELAX for arm64.
      (Came out of a discussion with Will Deacon.)

   - Split out WFET and WFE handling. I was doing both of these
     in a common handler.
     (From Will Deacon and in an earlier revision by Catalin Marinas.)

   - Add mentions of atomic_cond_read_{relaxed,acquire}(),
     atomic_cond_read_{relaxed,acquire}_timeout() in
     Documentation/atomic_t.txt.

   - Use the BIT() macro to do the checking in tif_bitset_relaxed_wait().

   - Cleanup unnecessary assignments, casts etc in poll_idle().
     (From Rafael Wysocki.)

   - Fixup warnings from kernel build robot


  v7 [1]:
   - change the interface to separately provide the timeout. This is
     useful for supporting WFET and similar primitives which can do
     timed waiting (suggested by Arnd Bergmann).

   - Adapting rqspinlock code to this changed interface also
     necessitated allowing time_expr to fail.
   - rqspinlock changes to adapt to the new smp_cond_load_acquire_timeout().

   - add WFET support (suggested by Arnd Bergmann).
   - add support for atomic-long wrappers.
   - add a new scheduler interface tif_need_resched_relaxed_wait() which
     encapsulates the polling logic used by poll_idle().
     - interface suggested by (Rafael J. Wysocki).


  v6 [2]:
   - fixup missing timeout parameters in atomic64_cond_read_*_timeout()
   - remove a race between setting of TIF_NEED_RESCHED and the call to
     smp_cond_load_relaxed_timeout(). This would mean that dev->poll_time_limit
     would be set even if we hadn't spent any time waiting.
     (The original check compared against local_clock(), which would have been
     fine, but I was instead using a cheaper check against _TIF_NEED_RESCHED.)
   (Both from meta-CI bot)


  v5 [3]:
   - use cpu_poll_relax() instead of cpu_relax().
   - instead of defining an arm64 specific
     smp_cond_load_relaxed_timeout(), just define the appropriate
     cpu_poll_relax().
   - re-read the target pointer when we exit due to the time-check.
   - s/SMP_TIMEOUT_SPIN_COUNT/SMP_TIMEOUT_POLL_COUNT/
   (Suggested by Will Deacon)

   - add atomic_cond_read_*_timeout() and atomic64_cond_read_*_timeout()
     interfaces.
   - rqspinlock: use atomic_cond_read_acquire_timeout().
   - cpuidle: use smp_cond_load_relaxed_timeout() for polling.
   (Suggested by Catalin Marinas)

   - rqspinlock: define SMP_TIMEOUT_POLL_COUNT to be 16k for non arm64


  v4 [4]:
    - naming change 's/timewait/timeout/'
    - resilient spinlocks: get rid of res_smp_cond_load_acquire_waiting()
      and fixup use of RES_CHECK_TIMEOUT().
    (Both suggested by Catalin Marinas)

  v3 [5]:
    - further interface simplifications (suggested by Catalin Marinas)

  v2 [6]:
    - simplified the interface (suggested by Catalin Marinas)
       - get rid of wait_policy, and a multitude of constants
       - adds a slack parameter
      This helped remove a fair amount of code duplication and, in
      hindsight, unnecessary constants.

  v1 [7]:
     - add wait_policy (coarse and fine)
     - derive spin-count etc at runtime instead of using arbitrary
       constants.

Haris Okanovic tested v4 of this series with poll_idle()/haltpoll patches. [8]

Comments appreciated!

Thanks
Ankur

 [0] https://lore.kernel.org/lkml/20251215044919.460086-1-ankur.a.arora@oracle.com/
 [1] https://lore.kernel.org/lkml/20251028053136.692462-1-ankur.a.arora@oracle.com/
 [2] https://lore.kernel.org/lkml/20250911034655.3916002-1-ankur.a.arora@oracle.com/
 [3] https://lore.kernel.org/lkml/20250911034655.3916002-1-ankur.a.arora@oracle.com/
 [4] https://lore.kernel.org/lkml/20250829080735.3598416-1-ankur.a.arora@oracle.com/
 [5] https://lore.kernel.org/lkml/20250627044805.945491-1-ankur.a.arora@oracle.com/
 [6] https://lore.kernel.org/lkml/20250502085223.1316925-1-ankur.a.arora@oracle.com/
 [7] https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com/
 [8] https://lore.kernel.org/lkml/2cecbf7fb23ee83a4ce027e1be3f46f97efd585c.camel@amazon.com/
 [9] https://lore.kernel.org/lkml/20260209023153.2661784-1-ankur.a.arora@oracle.com/
 [10] https://lore.kernel.org/lkml/20260316013651.3225328-1-ankur.a.arora@oracle.com/
 [11] https://lore.kernel.org/lkml/20230809134837.GM212435@hirez.programming.kicks-ass.net/
 [12] https://lore.kernel.org/lkml/c6f3c8d3f1f2e89a9dc7ae22482973b5a51b08cb.camel@amazon.com/

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: bpf@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-pm@vger.kernel.org

Ankur Arora (14):
  asm-generic: barrier: Add smp_cond_load_relaxed_timeout()
  arm64: barrier: Support smp_cond_load_relaxed_timeout()
  arm64/delay: move some constants out to a separate header
  arm64: support WFET in smp_cond_load_relaxed_timeout()
  arm64: rqspinlock: Remove private copy of
    smp_cond_load_acquire_timewait()
  asm-generic: barrier: Add smp_cond_load_acquire_timeout()
  atomic: Add atomic_cond_read_*_timeout()
  locking/atomic: scripts: build atomic_long_cond_read_*_timeout()
  bpf/rqspinlock: switch check_timeout() to a clock interface
  bpf/rqspinlock: Use smp_cond_load_acquire_timeout()
  sched: add need-resched timed wait interface
  cpuidle/poll_state: Wait for need-resched via
    tif_need_resched_relaxed_wait()
  kunit: enable testing smp_cond_load_relaxed_timeout()
  kunit: add tests for smp_cond_load_relaxed_timeout()

 Documentation/atomic_t.txt           |  14 +--
 arch/arm64/Kconfig                   |   3 +
 arch/arm64/include/asm/barrier.h     |  23 +++++
 arch/arm64/include/asm/cmpxchg.h     |  62 ++++++++++---
 arch/arm64/include/asm/delay-const.h |  27 ++++++
 arch/arm64/include/asm/rqspinlock.h  |  85 ------------------
 arch/arm64/lib/delay.c               |  17 ++--
 drivers/clocksource/arm_arch_timer.c |   2 +
 drivers/cpuidle/poll_state.c         |  21 +----
 drivers/soc/qcom/rpmh-rsc.c          |   8 +-
 include/asm-generic/barrier.h        |  95 ++++++++++++++++++++
 include/linux/atomic.h               |  10 +++
 include/linux/atomic/atomic-long.h   |  18 ++--
 include/linux/sched/idle.h           |  29 +++++++
 kernel/bpf/rqspinlock.c              |  77 +++++++++++------
 lib/Kconfig.debug                    |  10 +++
 lib/tests/Makefile                   |   1 +
 lib/tests/barrier-timeout-test.c     | 125 +++++++++++++++++++++++++++
 scripts/atomic/gen-atomic-long.sh    |  16 ++--
 19 files changed, 465 insertions(+), 178 deletions(-)
 create mode 100644 arch/arm64/include/asm/delay-const.h
 create mode 100644 lib/tests/barrier-timeout-test.c

-- 
2.31.1



^ permalink raw reply

* [PATCH v11 03/14] arm64/delay: move some constants out to a separate header
From: Ankur Arora @ 2026-04-08 12:25 UTC (permalink / raw)
  To: linux-kernel, linux-arch, linux-arm-kernel, linux-pm, bpf
  Cc: arnd, catalin.marinas, will, peterz, akpm, mark.rutland, harisokn,
	cl, ast, rafael, daniel.lezcano, memxor, zhenglifeng1, xueshuai,
	rdunlap, david.laight.linux, joao.m.martins, boris.ostrovsky,
	konrad.wilk, ashok.bhat, Ankur Arora, Bjorn Andersson,
	Konrad Dybcio, Christoph Lameter
In-Reply-To: <20260408122538.3610871-1-ankur.a.arora@oracle.com>

Move some constants and functions related to xloops, cycles computation
out to a new header. Also make __delay_cycles() available outside of
arch/arm64/lib/delay.c.

Rename some macros in qcom/rpmh-rsc.c which were occupying the same
namespace.

No functional change.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Bjorn Andersson <andersson@kernel.org>
Cc: Konrad Dybcio <konradybcio@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Reviewed-by: Christoph Lameter <cl@linux.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 arch/arm64/include/asm/delay-const.h | 27 +++++++++++++++++++++++++++
 arch/arm64/lib/delay.c               | 15 ++++-----------
 drivers/soc/qcom/rpmh-rsc.c          |  8 ++++----
 3 files changed, 35 insertions(+), 15 deletions(-)
 create mode 100644 arch/arm64/include/asm/delay-const.h

diff --git a/arch/arm64/include/asm/delay-const.h b/arch/arm64/include/asm/delay-const.h
new file mode 100644
index 000000000000..cb3988ff4e41
--- /dev/null
+++ b/arch/arm64/include/asm/delay-const.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_DELAY_CONST_H
+#define _ASM_DELAY_CONST_H
+
+#include <asm/param.h>	/* For HZ */
+
+/* 2**32 / 1000000 (rounded up) */
+#define __usecs_to_xloops_mult	0x10C7UL
+
+/* 2**32 / 1000000000 (rounded up) */
+#define __nsecs_to_xloops_mult	0x5UL
+
+extern unsigned long loops_per_jiffy;
+static inline unsigned long xloops_to_cycles(unsigned long xloops)
+{
+	return (xloops * loops_per_jiffy * HZ) >> 32;
+}
+
+#define USECS_TO_CYCLES(time_usecs) \
+	xloops_to_cycles((time_usecs) * __usecs_to_xloops_mult)
+
+#define NSECS_TO_CYCLES(time_nsecs) \
+	xloops_to_cycles((time_nsecs) * __nsecs_to_xloops_mult)
+
+u64 notrace __delay_cycles(void);
+
+#endif	/* _ASM_DELAY_CONST_H */
diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index e278e060e78a..c660a7ea26dd 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -12,17 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/timex.h>
+#include <asm/delay-const.h>
 
 #include <clocksource/arm_arch_timer.h>
 
-#define USECS_TO_CYCLES(time_usecs)			\
-	xloops_to_cycles((time_usecs) * 0x10C7UL)
-
-static inline unsigned long xloops_to_cycles(unsigned long xloops)
-{
-	return (xloops * loops_per_jiffy * HZ) >> 32;
-}
-
 /*
  * Force the use of CNTVCT_EL0 in order to have the same base as WFxT.
  * This avoids some annoying issues when CNTVOFF_EL2 is not reset 0 on a
@@ -32,7 +25,7 @@ static inline unsigned long xloops_to_cycles(unsigned long xloops)
  * Note that userspace cannot change the offset behind our back either,
  * as the vcpu mutex is held as long as KVM_RUN is in progress.
  */
-static cycles_t notrace __delay_cycles(void)
+u64 notrace __delay_cycles(void)
 {
 	guard(preempt_notrace)();
 	return __arch_counter_get_cntvct_stable();
@@ -73,12 +66,12 @@ EXPORT_SYMBOL(__const_udelay);
 
 void __udelay(unsigned long usecs)
 {
-	__const_udelay(usecs * 0x10C7UL); /* 2**32 / 1000000 (rounded up) */
+	__const_udelay(usecs * __usecs_to_xloops_mult);
 }
 EXPORT_SYMBOL(__udelay);
 
 void __ndelay(unsigned long nsecs)
 {
-	__const_udelay(nsecs * 0x5UL); /* 2**32 / 1000000000 (rounded up) */
+	__const_udelay(nsecs * __nsecs_to_xloops_mult);
 }
 EXPORT_SYMBOL(__ndelay);
diff --git a/drivers/soc/qcom/rpmh-rsc.c b/drivers/soc/qcom/rpmh-rsc.c
index c6f7d5c9c493..ad5ec5c0de0a 100644
--- a/drivers/soc/qcom/rpmh-rsc.c
+++ b/drivers/soc/qcom/rpmh-rsc.c
@@ -146,10 +146,10 @@ enum {
  *  +---------------------------------------------------+
  */
 
-#define USECS_TO_CYCLES(time_usecs)			\
-	xloops_to_cycles((time_usecs) * 0x10C7UL)
+#define RPMH_USECS_TO_CYCLES(time_usecs)		\
+	rpmh_xloops_to_cycles((time_usecs) * 0x10C7UL)
 
-static inline unsigned long xloops_to_cycles(u64 xloops)
+static inline unsigned long rpmh_xloops_to_cycles(u64 xloops)
 {
 	return (xloops * loops_per_jiffy * HZ) >> 32;
 }
@@ -819,7 +819,7 @@ void rpmh_rsc_write_next_wakeup(struct rsc_drv *drv)
 	wakeup_us = ktime_to_us(wakeup);
 
 	/* Convert the wakeup to arch timer scale */
-	wakeup_cycles = USECS_TO_CYCLES(wakeup_us);
+	wakeup_cycles = RPMH_USECS_TO_CYCLES(wakeup_us);
 	wakeup_cycles += arch_timer_read_counter();
 
 exit:
-- 
2.31.1



^ permalink raw reply related

* [PATCH v11 04/14] arm64: support WFET in smp_cond_load_relaxed_timeout()
From: Ankur Arora @ 2026-04-08 12:25 UTC (permalink / raw)
  To: linux-kernel, linux-arch, linux-arm-kernel, linux-pm, bpf
  Cc: arnd, catalin.marinas, will, peterz, akpm, mark.rutland, harisokn,
	cl, ast, rafael, daniel.lezcano, memxor, zhenglifeng1, xueshuai,
	rdunlap, david.laight.linux, joao.m.martins, boris.ostrovsky,
	konrad.wilk, ashok.bhat, Ankur Arora
In-Reply-To: <20260408122538.3610871-1-ankur.a.arora@oracle.com>

To handle WFET, use __cmpwait_timeout() similarly to __cmpwait(). These
call out to the respective __cmpwait_case_timeout_##sz() and
__cmpwait_case_##sz() functions.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 arch/arm64/include/asm/barrier.h |  8 +++--
 arch/arm64/include/asm/cmpxchg.h | 62 +++++++++++++++++++++++++-------
 2 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 6190e178db51..fbd71cd4ef4e 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -224,8 +224,8 @@ do {									\
 extern bool arch_timer_evtstrm_available(void);
 
 /*
- * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()
- * for the ptr value to change.
+ * In the common case, cpu_poll_relax() sits waiting in __cmpwait_relaxed()/
+ * __cmpwait_relaxed_timeout() for the ptr value to change.
  *
  * Since this period is reasonably long, choose SMP_TIMEOUT_POLL_COUNT
  * to be 1, so smp_cond_load_{relaxed,acquire}_timeout() does a
@@ -234,7 +234,9 @@ extern bool arch_timer_evtstrm_available(void);
 #define SMP_TIMEOUT_POLL_COUNT	1
 
 #define cpu_poll_relax(ptr, val, timeout_ns) do {			\
-	if (arch_timer_evtstrm_available())				\
+	if (alternative_has_cap_unlikely(ARM64_HAS_WFXT))		\
+		__cmpwait_relaxed_timeout(ptr, val, timeout_ns);	\
+	else if (arch_timer_evtstrm_available())			\
 		__cmpwait_relaxed(ptr, val);				\
 	else								\
 		cpu_relax();						\
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 6cf3cd6873f5..9e4cdc9e41d1 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -12,6 +12,7 @@
 
 #include <asm/barrier.h>
 #include <asm/lse.h>
+#include <asm/delay-const.h>
 
 /*
  * We need separate acquire parameters for ll/sc and lse, since the full
@@ -212,7 +213,8 @@ __CMPXCHG_GEN(_mb)
 
 #define __CMPWAIT_CASE(w, sfx, sz)					\
 static inline void __cmpwait_case_##sz(volatile void *ptr,		\
-				       unsigned long val)		\
+				       unsigned long val,		\
+				       u64 __maybe_unused timeout_ns)	\
 {									\
 	unsigned long tmp;						\
 									\
@@ -235,20 +237,52 @@ __CMPWAIT_CASE( ,  , 64);
 
 #undef __CMPWAIT_CASE
 
-#define __CMPWAIT_GEN(sfx)						\
-static __always_inline void __cmpwait##sfx(volatile void *ptr,		\
-				  unsigned long val,			\
-				  int size)				\
+#define __CMPWAIT_TIMEOUT_CASE(w, sfx, sz)				\
+static inline void __cmpwait_case_timeout_##sz(volatile void *ptr,	\
+					       unsigned long val,	\
+					       u64 timeout_ns)		\
+{									\
+	unsigned long tmp;						\
+	u64 ecycles = __delay_cycles() +				\
+			NSECS_TO_CYCLES(timeout_ns);			\
+	asm volatile(							\
+	"	sevl\n"							\
+	"	wfe\n"							\
+	"	ldxr" #sfx "\t%" #w "[tmp], %[v]\n"			\
+	"	eor	%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"	\
+	"	cbnz	%" #w "[tmp], 2f\n"				\
+	"	msr s0_3_c1_c0_0, %[ecycles]\n"				\
+	"2:"								\
+	: [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr)			\
+	: [val] "r" (val), [ecycles] "r" (ecycles));			\
+}
+
+__CMPWAIT_TIMEOUT_CASE(w, b, 8);
+__CMPWAIT_TIMEOUT_CASE(w, h, 16);
+__CMPWAIT_TIMEOUT_CASE(w,  , 32);
+__CMPWAIT_TIMEOUT_CASE( ,  , 64);
+
+#undef __CMPWAIT_TIMEOUT_CASE
+
+#define __CMPWAIT_GEN(timeout, sfx)					\
+static __always_inline void __cmpwait##timeout##sfx(volatile void *ptr,	\
+						    unsigned long val,	\
+						    u64 timeout_ns,	\
+						    int size)		\
 {									\
 	switch (size) {							\
 	case 1:								\
-		return __cmpwait_case##sfx##_8(ptr, (u8)val);		\
+		return __cmpwait_case##timeout##sfx##_8(ptr, (u8)val,	\
+							timeout_ns);	\
 	case 2:								\
-		return __cmpwait_case##sfx##_16(ptr, (u16)val);		\
+		return __cmpwait_case##timeout##sfx##_16(ptr, (u16)val,	\
+							 timeout_ns);	\
 	case 4:								\
-		return __cmpwait_case##sfx##_32(ptr, val);		\
+		return __cmpwait_case##timeout##sfx##_32(ptr, val,	\
+							 timeout_ns);	\
 	case 8:								\
-		return __cmpwait_case##sfx##_64(ptr, val);		\
+		return __cmpwait_case##timeout##sfx##_64(ptr, val,	\
+							 timeout_ns);	\
 	default:							\
 		BUILD_BUG();						\
 	}								\
@@ -256,11 +290,15 @@ static __always_inline void __cmpwait##sfx(volatile void *ptr,		\
 	unreachable();							\
 }
 
-__CMPWAIT_GEN()
+__CMPWAIT_GEN(        , )
+__CMPWAIT_GEN(_timeout, )
 
 #undef __CMPWAIT_GEN
 
-#define __cmpwait_relaxed(ptr, val) \
-	__cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))
+#define __cmpwait_relaxed_timeout(ptr, val, timeout_ns)			\
+	__cmpwait_timeout((ptr), (unsigned long)(val), timeout_ns, sizeof(*(ptr)))
+
+#define __cmpwait_relaxed(ptr, val)					\
+	__cmpwait((ptr), (unsigned long)(val), 0, sizeof(*(ptr)))
 
 #endif	/* __ASM_CMPXCHG_H */
-- 
2.31.1



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox