Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v7 14/20] KVM: selftests: Verify non-postable IRQ remapping in IRQ test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

Extend the eventfd IRQ test with an '-n' flag to route a subset of device
interrupts as NMIs (Non-Maskable Interrupts) into the guest using an
alternating pattern of 4 regular interrupts followed by 4 NMIs.

While this adds coverage for NMI injection, the primary goal is to
validate KVM's handling of non-postable interrupt delivery (AMD and Intel
IOMMUs only support posting fixed IRQs targeting a single vCPU).  KVM
has historically bungled handling transitions between posted and remapped
modes.  Use NMIs to stress the transitions, because they are a reliable,
architectural way to force these code paths.

Signed-off-by: David Matlack <dmatlack@google.com>
Co-developed-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: add GUEST_RECEIVED_INTERRUPT(), massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/irq_test.c | 48 ++++++++++++++++++++------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/kvm/irq_test.c b/tools/testing/selftests/kvm/irq_test.c
index 2cfb6c24e8d6..d2d861119854 100644
--- a/tools/testing/selftests/kvm/irq_test.c
+++ b/tools/testing/selftests/kvm/irq_test.c
@@ -17,11 +17,17 @@
 static u64 timeout_ns = 2ULL * 1000 * 1000 * 1000;
 static bool guest_ready_for_irqs[KVM_MAX_VCPUS];
 static bool guest_received_irq[KVM_MAX_VCPUS];
+static bool guest_received_nmi[KVM_MAX_VCPUS];
 static bool irq_affinity;
 static bool done;
 
 #define GUEST_RECEIVED_IRQ(__vcpu)	\
 	SYNC_FROM_GUEST_AND_READ((__vcpu)->vm, guest_received_irq[(__vcpu)->id])
+#define GUEST_RECEIVED_NMI(__vcpu)	\
+	SYNC_FROM_GUEST_AND_READ((__vcpu)->vm, guest_received_nmi[(__vcpu)->id])
+
+#define GUEST_RECEIVED_INTERRUPT(__vcpu, __nmi)	\
+	((__nmi) ? GUEST_RECEIVED_NMI(__vcpu) : GUEST_RECEIVED_IRQ(__vcpu))
 
 static u32 guest_get_vcpu_id(void)
 {
@@ -35,6 +41,11 @@ static void guest_irq_handler(struct ex_regs *regs)
 	x2apic_write_reg(APIC_EOI, 0);
 }
 
+static void guest_nmi_handler(struct ex_regs *regs)
+{
+	WRITE_ONCE(guest_received_nmi[guest_get_vcpu_id()], true);
+}
+
 static void guest_code(void)
 {
 	x2apic_enable();
@@ -91,7 +102,7 @@ static void trigger_interrupt(struct vfio_pci_device *device, int eventfd)
 
 
 static void kvm_route_msi(struct kvm_vm *vm, u32 gsi, struct kvm_vcpu *vcpu,
-			  u8 vector)
+			  u8 vector, bool use_nmi)
 {
 	struct {
 		struct kvm_irq_routing header;
@@ -102,7 +113,7 @@ static void kvm_route_msi(struct kvm_vm *vm, u32 gsi, struct kvm_vcpu *vcpu,
 			.gsi = gsi,
 			.type = KVM_IRQ_ROUTING_MSI,
 			.u.msi.address_lo = 0xFEE00000 | (vcpu->id << 12),
-			.u.msi.data = vector,
+			.u.msi.data = use_nmi ? NMI_VECTOR | (4 << 8) : vector,
 		},
 	};
 
@@ -134,13 +145,14 @@ static const char *probe_iommu_type(void)
 
 static void help(const char *name)
 {
-	printf("Usage: %s [-a] [-d <segment:bus:device.function>] [-e] [-h] [-i nr_irqs] [-t iommu_type]\n", name);
+	printf("Usage: %s [-a] [-d <segment:bus:device.function>] [-e] [-h] [-i nr_irqs] [-n] [-t iommu_type]\n", name);
 	printf("\n");
 	printf("Tests KVM interrupt routing and delivery via irqfd.\n");
 	printf("-a	Affine the device's host IRQ to a random physical CPU\n");
 	printf("-d	Use a VFIO device to send MSI-X interrupts instead of manually signaling the eventfd\n");
 	printf("-e	Set empty GSI routing in-between some interrupts\n");
 	printf("-i	The number of IRQs to generate during the test\n");
+	printf("-n	Deliver 50 percent of IRQs as non-maskable interrupts\n");
 	printf("-t	Override the IOMMU type to use (vfio_type1_iommu or iommufd)\n");
 	printf("\n");
 	exit(KSFT_FAIL);
@@ -171,11 +183,12 @@ int main(int argc, char **argv)
 	const char *device_bdf = NULL;
 	const char *iommu_type = NULL;
 	int i, j, c, msix, eventfd;
+	bool use_nmi = false;
 	struct iommu *iommu;
 	struct kvm_vm *vm;
 	int irq, irq_cpu;
 
-	while ((c = getopt(argc, argv, "ad:ehi:t:")) != -1) {
+	while ((c = getopt(argc, argv, "ad:ehi:nt:")) != -1) {
 		switch (c) {
 		case 'a':
 			irq_affinity = true;
@@ -189,6 +202,9 @@ int main(int argc, char **argv)
 		case 'i':
 			nr_irqs = atoi_positive("Number of IRQs", optarg);
 			break;
+		case 'n':
+			use_nmi = true;
+			break;
 		case 't':
 			iommu_type = optarg;
 			break;
@@ -202,6 +218,7 @@ int main(int argc, char **argv)
 
 	vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus);
 	vm_install_exception_handler(vm, vector, guest_irq_handler);
+	vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler);
 
 	if (device_bdf) {
 		if (!iommu_type)
@@ -240,36 +257,45 @@ int main(int argc, char **argv)
 
 	for (i = 0; i < nr_irqs; i++) {
 		const bool do_set_empty_routing = set_empty_routing && (i & BIT(3));
+		const bool do_use_nmi = use_nmi && (i & BIT(2));
 		struct kvm_vcpu *vcpu = vcpus[i % nr_vcpus];
 		struct timespec start;
 
 		if (do_set_empty_routing)
 			kvm_set_empty_gsi_routing(vm);
 
-		kvm_route_msi(vm, gsi, vcpu, vector);
+		kvm_route_msi(vm, gsi, vcpu, vector, do_use_nmi);
 
 		if (irq_affinity) {
 			irq_cpu = kvm_random_u64(&kvm_rng) % get_nprocs();
 			proc_irq_set_smp_affinity(irq, irq_cpu);
 		}
 
-		for (j = 0; j < nr_vcpus; j++)
+		for (j = 0; j < nr_vcpus; j++) {
 			TEST_ASSERT(!GUEST_RECEIVED_IRQ(vcpus[j]),
 				    "IRQ flag for vCPU %d not clear prior to test",
 				    vcpus[j]->id);
+			TEST_ASSERT(!GUEST_RECEIVED_NMI(vcpus[j]),
+				    "NMI flag for vCPU %d not clear prior to test",
+				    vcpus[j]->id);
+		}
 
 		trigger_interrupt(device, eventfd);
 
 		clock_gettime(CLOCK_MONOTONIC, &start);
-		while (!GUEST_RECEIVED_IRQ(vcpu) &&
+		while (!GUEST_RECEIVED_INTERRUPT(vcpu, do_use_nmi) &&
 		       timespec_to_ns(timespec_elapsed(start)) <= timeout_ns)
 			cpu_relax();
 
-		TEST_ASSERT(GUEST_RECEIVED_IRQ(vcpu),
-			    "vCPU %d timed out waiting for IRQ (vector 0x%x) from GSI %d (via CPU %d)\n",
-			    vcpu->id, vector, gsi, irq_cpu);
+		TEST_ASSERT(GUEST_RECEIVED_INTERRUPT(vcpu, do_use_nmi),
+			    "vCPU %d timed out waiting for %s (vector 0x%x) from GSI %d (via CPU %d)\n",
+			    vcpu->id, do_use_nmi ? "NMI" : "IRQ",
+			    do_use_nmi ? NMI_VECTOR : vector, gsi, irq_cpu);
 
-		WRITE_AND_SYNC_TO_GUEST(vm, guest_received_irq[vcpu->id], false);
+		if (do_use_nmi)
+			WRITE_AND_SYNC_TO_GUEST(vm, guest_received_nmi[vcpu->id], false);
+		else
+			WRITE_AND_SYNC_TO_GUEST(vm, guest_received_irq[vcpu->id], false);
 	}
 
 	WRITE_AND_SYNC_TO_GUEST(vm, done, true);
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 16/20] KVM: selftests: Add kvm_sched_getaffinity() wrapper and convert users
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: Josh Hilke <jrhilke@google.com>

Add and use a KVM wrapper for the sched_getaffinity() syscall so that
selftests don't need to manually assert that the syscall succeeded.

Note, some tests didn't actually assert success, but they all obviously
rely on the syscall to succeed.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/arch_timer.c                  | 2 +-
 tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c | 2 +-
 tools/testing/selftests/kvm/include/kvm_syscalls.h        | 2 ++
 tools/testing/selftests/kvm/lib/kvm_util.c                | 5 ++---
 tools/testing/selftests/kvm/mmu_stress_test.c             | 6 +-----
 tools/testing/selftests/kvm/rseq_test.c                   | 4 +---
 6 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c
index 90c475a61b22..f8b02597897b 100644
--- a/tools/testing/selftests/kvm/arch_timer.c
+++ b/tools/testing/selftests/kvm/arch_timer.c
@@ -85,7 +85,7 @@ static u32 test_get_pcpu(void)
 	cpu_set_t online_cpuset;
 
 	nproc_conf = get_nprocs_conf();
-	sched_getaffinity(0, sizeof(cpu_set_t), &online_cpuset);
+	kvm_sched_getaffinity(0, sizeof(cpu_set_t), &online_cpuset);
 
 	/* Randomly find an available pCPU to place a vCPU on */
 	do {
diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
index f7625eb711d6..d9c9377a6325 100644
--- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
+++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c
@@ -1039,7 +1039,7 @@ int main(int argc, char *argv[])
 	if (!parse_args(argc, argv))
 		exit(KSFT_SKIP);
 
-	sched_getaffinity(0, sizeof(default_cpuset), &default_cpuset);
+	kvm_sched_getaffinity(0, sizeof(default_cpuset), &default_cpuset);
 	set_counter_defaults();
 
 	if (test_args.test_virtual) {
diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h
index dc4fb97aef8d..5dae6143ddb0 100644
--- a/tools/testing/selftests/kvm/include/kvm_syscalls.h
+++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h
@@ -12,6 +12,7 @@
 #include <sys/mman.h>
 #include <sys/syscall.h>
 
+#include <sched.h>
 #include <test_util.h>
 
 #define MAP_ARGS0(m,...)
@@ -93,6 +94,7 @@ __KVM_SYSCALL_DEFINE(close, 1, int, fd);
 __KVM_SYSCALL_DEFINE(fallocate, 4, int, fd, int, mode, loff_t, offset, loff_t, len);
 __KVM_SYSCALL_DEFINE(ftruncate, 2, unsigned int, fd, off_t, length);
 __KVM_SYSCALL_DEFINE(madvise, 3, void *, addr, size_t, length, int, advice);
+__KVM_SYSCALL_DEFINE(sched_getaffinity, 3, pid_t, pid, size_t, cpusetsize, cpu_set_t *, mask);
 
 #define kvm_free_fd(fd)		\
 do {				\
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 2e08d9fcefc7..9bf28b7d9d7a 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -673,13 +673,12 @@ void kvm_parse_vcpu_pinning(const char *pcpus_string, u32 vcpu_to_pcpu[],
 	cpu_set_t allowed_mask;
 	char *cpu, *cpu_list;
 	char delim[2] = ",";
-	int i, r;
+	int i;
 
 	cpu_list = strdup(pcpus_string);
 	TEST_ASSERT(cpu_list, "strdup() allocation failed.");
 
-	r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask);
-	TEST_ASSERT(!r, "sched_getaffinity() failed");
+	kvm_sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask);
 
 	cpu = strtok(cpu_list, delim);
 
diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c
index 473ef4c0ea9f..3d5f33a63b2b 100644
--- a/tools/testing/selftests/kvm/mmu_stress_test.c
+++ b/tools/testing/selftests/kvm/mmu_stress_test.c
@@ -255,11 +255,7 @@ static void rendezvous_with_vcpus(struct timespec *time, const char *name)
 static void calc_default_nr_vcpus(void)
 {
 	cpu_set_t possible_mask;
-	int r;
-
-	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
-	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)",
-		    errno, strerror(errno));
+	kvm_sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
 
 	nr_vcpus = CPU_COUNT(&possible_mask);
 	TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?");
diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c
index 6510fbfd64f1..557e393c223b 100644
--- a/tools/testing/selftests/kvm/rseq_test.c
+++ b/tools/testing/selftests/kvm/rseq_test.c
@@ -226,9 +226,7 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
-	TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno,
-		    strerror(errno));
+	kvm_sched_getaffinity(0, sizeof(possible_mask), &possible_mask);
 
 	calc_min_max_cpu();
 
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 11/20] KVM: selftests: Verify interrupts are received when IRQ affinity changes in IRQ test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: David Matlack <dmatlack@google.com>

Extend the eventfd IRQ test with a '-a' flag to randomly affinitize the
device's host IRQ to different physical CPUs throughout the test.  This
stresses the kernel's ability to maintain correct interrupt routing and
delivery even as the underlying hardware IRQ affinity is changed
dynamically via /proc/<irq>/smp_affinity{,_list}.

Signed-off-by: David Matlack <dmatlack@google.com>
Co-developed-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/irq_test.c | 27 +++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/kvm/irq_test.c b/tools/testing/selftests/kvm/irq_test.c
index 6888be54ee4a..c01aa313f719 100644
--- a/tools/testing/selftests/kvm/irq_test.c
+++ b/tools/testing/selftests/kvm/irq_test.c
@@ -12,10 +12,12 @@
 #include <unistd.h>
 #include <pthread.h>
 #include <sys/eventfd.h>
+#include <sys/sysinfo.h>
 
 static u64 timeout_ns = 2ULL * 1000 * 1000 * 1000;
 static bool guest_ready_for_irqs[KVM_MAX_VCPUS];
 static bool guest_received_irq[KVM_MAX_VCPUS];
+static bool irq_affinity;
 static bool done;
 
 #define GUEST_RECEIVED_IRQ(__vcpu)	\
@@ -125,9 +127,10 @@ static const char *probe_iommu_type(void)
 
 static void help(const char *name)
 {
-	printf("Usage: %s [-d <segment:bus:device.function>] [-h] [-t iommu_type]\n", name);
+	printf("Usage: %s [-a] [-d <segment:bus:device.function>] [-h] [-t iommu_type]\n", name);
 	printf("\n");
 	printf("Tests KVM interrupt routing and delivery via irqfd.\n");
+	printf("-a	Affine the device's host IRQ to a random physical CPU\n");
 	printf("-d	Use a VFIO device to send MSI-X interrupts instead of manually signaling the eventfd\n");
 	printf("-t	Override the IOMMU type to use (vfio_type1_iommu or iommufd)\n");
 	printf("\n");
@@ -160,10 +163,13 @@ int main(int argc, char **argv)
 	int i, j, c, msix, eventfd;
 	struct iommu *iommu;
 	struct kvm_vm *vm;
-	int irq;
+	int irq, irq_cpu;
 
-	while ((c = getopt(argc, argv, "d:ht:")) != -1) {
+	while ((c = getopt(argc, argv, "ad:ht:")) != -1) {
 		switch (c) {
+		case 'a':
+			irq_affinity = true;
+			break;
 		case 'd':
 			device_bdf = optarg;
 			break;
@@ -192,7 +198,11 @@ int main(int argc, char **argv)
 		printf("Using device %s MSI-X[%d] (IRQ-%u)\n", device_bdf, msix,
 		       irq);
 	} else {
+		TEST_ASSERT(!irq_affinity,
+			    "Setting IRQ affinity (-a) requires a backing device (-d)");
+
 		eventfd = kvm_new_eventfd();
+		irq = -1;
 	}
 
 	pr_info("Injecting interrupts for GSI %d (guest vector 0x%x) %d times\n",
@@ -210,12 +220,19 @@ int main(int argc, char **argv)
 			continue;
 	}
 
+	irq_cpu = -1;
+
 	for (i = 0; i < nr_irqs; i++) {
 		struct kvm_vcpu *vcpu = vcpus[i % nr_vcpus];
 		struct timespec start;
 
 		kvm_route_msi(vm, gsi, vcpu, vector);
 
+		if (irq_affinity) {
+			irq_cpu = kvm_random_u64(&kvm_rng) % get_nprocs();
+			proc_irq_set_smp_affinity(irq, irq_cpu);
+		}
+
 		for (j = 0; j < nr_vcpus; j++)
 			TEST_ASSERT(!GUEST_RECEIVED_IRQ(vcpus[j]),
 				    "IRQ flag for vCPU %d not clear prior to test",
@@ -229,8 +246,8 @@ int main(int argc, char **argv)
 			cpu_relax();
 
 		TEST_ASSERT(GUEST_RECEIVED_IRQ(vcpu),
-			    "vCPU %d timed out waiting for IRQ (vector 0x%x) from GSI %d\n",
-			    vcpu->id, vector, gsi);
+			    "vCPU %d timed out waiting for IRQ (vector 0x%x) from GSI %d (via CPU %d)\n",
+			    vcpu->id, vector, gsi, irq_cpu);
 
 		WRITE_AND_SYNC_TO_GUEST(vm, guest_received_irq[vcpu->id], false);
 	}
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 13/20] KVM: selftests: Make number of IRQs configurable in IRQ test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: David Matlack <dmatlack@google.com>

Extend the eventfd IRQ test with a '-i' flag to let the user specify the
number of IRQs to generate (instead of hardcoding the test to always
generate 1000 interrupts).

Signed-off-by: David Matlack <dmatlack@google.com>
Co-developed-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/irq_test.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/irq_test.c b/tools/testing/selftests/kvm/irq_test.c
index 2356912e2272..2cfb6c24e8d6 100644
--- a/tools/testing/selftests/kvm/irq_test.c
+++ b/tools/testing/selftests/kvm/irq_test.c
@@ -134,12 +134,13 @@ static const char *probe_iommu_type(void)
 
 static void help(const char *name)
 {
-	printf("Usage: %s [-a] [-d <segment:bus:device.function>] [-e] [-h] [-t iommu_type]\n", name);
+	printf("Usage: %s [-a] [-d <segment:bus:device.function>] [-e] [-h] [-i nr_irqs] [-t iommu_type]\n", name);
 	printf("\n");
 	printf("Tests KVM interrupt routing and delivery via irqfd.\n");
 	printf("-a	Affine the device's host IRQ to a random physical CPU\n");
 	printf("-d	Use a VFIO device to send MSI-X interrupts instead of manually signaling the eventfd\n");
 	printf("-e	Set empty GSI routing in-between some interrupts\n");
+	printf("-i	The number of IRQs to generate during the test\n");
 	printf("-t	Override the IOMMU type to use (vfio_type1_iommu or iommufd)\n");
 	printf("\n");
 	exit(KSFT_FAIL);
@@ -174,7 +175,7 @@ int main(int argc, char **argv)
 	struct kvm_vm *vm;
 	int irq, irq_cpu;
 
-	while ((c = getopt(argc, argv, "ad:eht:")) != -1) {
+	while ((c = getopt(argc, argv, "ad:ehi:t:")) != -1) {
 		switch (c) {
 		case 'a':
 			irq_affinity = true;
@@ -185,6 +186,9 @@ int main(int argc, char **argv)
 		case 'e':
 			set_empty_routing = true;
 			break;
+		case 'i':
+			nr_irqs = atoi_positive("Number of IRQs", optarg);
+			break;
 		case 't':
 			iommu_type = optarg;
 			break;
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 10/20] KVM: selftests: Add a helper to set proc IRQ affinity for IRQ test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: Josh Hilke <jrhilke@google.com>

Add a utility, proc_irq_set_smp_affinity(), to set the CPU affinity of a
Linux host IRQ via the proc filesystem.  Use smp_affinity_list instead of
smp_affinity to avoid having to convert the single CPU to a bitmask.

The helper will be used by the eventfd IRQ test to verify delivery of IRQs
when the affinity is randomized/modified.

Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: make the utility self-contained, drop "list", massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/proc_util.h |  2 ++
 tools/testing/selftests/kvm/lib/proc_util.c     | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/proc_util.h b/tools/testing/selftests/kvm/include/proc_util.h
index 704839b6d7af..d1ddc967d11d 100644
--- a/tools/testing/selftests/kvm/include/proc_util.h
+++ b/tools/testing/selftests/kvm/include/proc_util.h
@@ -6,4 +6,6 @@
 
 unsigned int vfio_msix_to_host_irq(const char *vfio_device_bdf, int msix);
 
+void proc_irq_set_smp_affinity(unsigned int irq, int cpu);
+
 #endif /* SELFTEST_KVM_PROC_UTIL_H */
diff --git a/tools/testing/selftests/kvm/lib/proc_util.c b/tools/testing/selftests/kvm/lib/proc_util.c
index 84d30f055a0a..3960b3841d63 100644
--- a/tools/testing/selftests/kvm/lib/proc_util.c
+++ b/tools/testing/selftests/kvm/lib/proc_util.c
@@ -38,3 +38,17 @@ unsigned int vfio_msix_to_host_irq(const char *device_bdf, int msix)
 	return (unsigned int)irq;
 }
 
+void proc_irq_set_smp_affinity(unsigned int irq, int cpu)
+{
+	char path[PATH_MAX];
+	int r, fd;
+
+	snprintf(path, sizeof(path), "/proc/irq/%u/smp_affinity_list", irq);
+	fd = open(path, O_RDWR);
+	TEST_ASSERT(fd >= 0, "Failed to open %s", path);
+
+	r = dprintf(fd, "%d\n", cpu);
+	TEST_ASSERT(r > 0, "Failed to affinitize IRQ-%u to CPU %d", irq, cpu);
+
+	kvm_close(fd);
+}
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 12/20] KVM: selftests: Add option to set empty routing between IRQs in eventfd IRQ test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: David Matlack <dmatlack@google.com>

Extend the eventfd IRQ test with an '-e' flag to set empty GSI routing
between interrupts.  Clobbering the GSI routing table verifies that KVM
correctly handles CPUx => NULL => CPUy transitions, not just CPUx => CPUy
transitions, and verifies that KVM can "rebuild" an entire routing setup.

Signed-off-by: David Matlack <dmatlack@google.com>
Co-developed-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: '-e' for "empty" instead of '-c' for "clear", massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/irq_test.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/irq_test.c b/tools/testing/selftests/kvm/irq_test.c
index c01aa313f719..2356912e2272 100644
--- a/tools/testing/selftests/kvm/irq_test.c
+++ b/tools/testing/selftests/kvm/irq_test.c
@@ -109,6 +109,13 @@ static void kvm_route_msi(struct kvm_vm *vm, u32 gsi, struct kvm_vcpu *vcpu,
 	vm_ioctl(vm, KVM_SET_GSI_ROUTING, &routing.header);
 }
 
+static void kvm_set_empty_gsi_routing(struct kvm_vm *vm)
+{
+	struct kvm_irq_routing routing = {};
+
+	vm_ioctl(vm, KVM_SET_GSI_ROUTING, &routing);
+}
+
 static const char *probe_iommu_type(void)
 {
 	int io_fd;
@@ -127,11 +134,12 @@ static const char *probe_iommu_type(void)
 
 static void help(const char *name)
 {
-	printf("Usage: %s [-a] [-d <segment:bus:device.function>] [-h] [-t iommu_type]\n", name);
+	printf("Usage: %s [-a] [-d <segment:bus:device.function>] [-e] [-h] [-t iommu_type]\n", name);
 	printf("\n");
 	printf("Tests KVM interrupt routing and delivery via irqfd.\n");
 	printf("-a	Affine the device's host IRQ to a random physical CPU\n");
 	printf("-d	Use a VFIO device to send MSI-X interrupts instead of manually signaling the eventfd\n");
+	printf("-e	Set empty GSI routing in-between some interrupts\n");
 	printf("-t	Override the IOMMU type to use (vfio_type1_iommu or iommufd)\n");
 	printf("\n");
 	exit(KSFT_FAIL);
@@ -158,6 +166,7 @@ int main(int argc, char **argv)
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	struct vfio_pci_device *device = NULL;
 	int nr_irqs = 1000, nr_vcpus = 1;
+	bool set_empty_routing = false;
 	const char *device_bdf = NULL;
 	const char *iommu_type = NULL;
 	int i, j, c, msix, eventfd;
@@ -165,7 +174,7 @@ int main(int argc, char **argv)
 	struct kvm_vm *vm;
 	int irq, irq_cpu;
 
-	while ((c = getopt(argc, argv, "ad:ht:")) != -1) {
+	while ((c = getopt(argc, argv, "ad:eht:")) != -1) {
 		switch (c) {
 		case 'a':
 			irq_affinity = true;
@@ -173,6 +182,9 @@ int main(int argc, char **argv)
 		case 'd':
 			device_bdf = optarg;
 			break;
+		case 'e':
+			set_empty_routing = true;
+			break;
 		case 't':
 			iommu_type = optarg;
 			break;
@@ -223,9 +235,13 @@ int main(int argc, char **argv)
 	irq_cpu = -1;
 
 	for (i = 0; i < nr_irqs; i++) {
+		const bool do_set_empty_routing = set_empty_routing && (i & BIT(3));
 		struct kvm_vcpu *vcpu = vcpus[i % nr_vcpus];
 		struct timespec start;
 
+		if (do_set_empty_routing)
+			kvm_set_empty_gsi_routing(vm);
+
 		kvm_route_msi(vm, gsi, vcpu, vector);
 
 		if (irq_affinity) {
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 09/20] KVM: selftests: Add VFIO device support to eventfd IRQ test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: David Matlack <dmatlack@google.com>

Extend the eventfd IRQ test with a '-d' argument that takes a BDF (in the
format segment:bus:device.function) of an interrupt-capable PCI(e) device
bound to VFIO, and use said device to trigger interrupts instead of always
synthesizing interrupts via direct writes to the eventfd.

Using a VFIO device to trigger interrupts validates the end-to-end delivery
of IRQs for "real" devices, and when supported by hardware (and KVM), also
validates interrupt delivery via IRQ bypass, i.e. via device posted IRQs.

Now that IOMMUFD is a thing, auto-probe IOMMUFD vs. "legacy" VFIO by
temporarily opening /dev/iommu, and skip the test if neither IOMMUFD nor
legacy VFIO is available.  Add a '-t' option to let the user override the
probe logic, e.g. in case IOMMUFD is available but the system is configured
for legacy usage.

Note, the device must have a VFIO selftest driver in order to work with
the test.  A helper script to list supported devices will hopefully be
available in the near future at
tools/testing/selftests/vfio/scripts/list_supported_devices.sh[1].

Example:
$ ./tools/testing/selftests/kvm/irq_test -d 0000:06:0a.1

Link: https://lore.kernel.org/all/20260602222941.3133236-1-jrhilke%40google.com [1]
Signed-off-by: David Matlack <dmatlack@google.com>
Co-developed-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/irq_test.c | 86 ++++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/kvm/irq_test.c b/tools/testing/selftests/kvm/irq_test.c
index 9f8895b89821..6888be54ee4a 100644
--- a/tools/testing/selftests/kvm/irq_test.c
+++ b/tools/testing/selftests/kvm/irq_test.c
@@ -3,7 +3,10 @@
 #include "test_util.h"
 #include "apic.h"
 #include "processor.h"
+#include "proc_util.h"
 
+#include <libvfio.h>
+#include <linux/sizes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -55,6 +58,36 @@ static void *vcpu_thread_main(void *arg)
 	return NULL;
 }
 
+static int vfio_setup_msi(struct vfio_pci_device *device)
+{
+	const int flags = MAP_SHARED | MAP_ANONYMOUS;
+	const int prot = PROT_READ | PROT_WRITE;
+	struct dma_region *region;
+
+	/* A driver is required to generate an MSI. */
+	TEST_REQUIRE(device->driver.ops);
+
+	/* Set up a DMA-able region for the driver to use. */
+	region = &device->driver.region;
+	region->iova = 0;
+	region->size = SZ_2M;
+	region->vaddr = kvm_mmap(region->size, prot, flags, -1);
+	TEST_ASSERT(region->vaddr != MAP_FAILED, "mmap() failed\n");
+	iommu_map(device->iommu, region);
+
+	vfio_pci_driver_init(device);
+	return device->driver.msi;
+}
+
+static void trigger_interrupt(struct vfio_pci_device *device, int eventfd)
+{
+	if (device)
+		vfio_pci_driver_send_msi(device);
+	else
+		eventfd_write(eventfd, 1);
+}
+
+
 static void kvm_route_msi(struct kvm_vm *vm, u32 gsi, struct kvm_vcpu *vcpu,
 			  u8 vector)
 {
@@ -74,11 +107,29 @@ static void kvm_route_msi(struct kvm_vm *vm, u32 gsi, struct kvm_vcpu *vcpu,
 	vm_ioctl(vm, KVM_SET_GSI_ROUTING, &routing.header);
 }
 
+static const char *probe_iommu_type(void)
+{
+	int io_fd;
+
+	io_fd = open("/dev/iommu", O_RDONLY);
+	if (io_fd >= 0) {
+		close(io_fd);
+		return MODE_IOMMUFD;
+	}
+
+	io_fd = __open_path_or_exit("/dev/vfio", O_RDONLY,
+				    "Is VFIO (or IOMMUFD) loaded and enabled?");
+	close(io_fd);
+	return MODE_VFIO_TYPE1_IOMMU;
+}
+
 static void help(const char *name)
 {
-	printf("Usage: %s [-h]\n", name);
+	printf("Usage: %s [-d <segment:bus:device.function>] [-h] [-t iommu_type]\n", name);
 	printf("\n");
 	printf("Tests KVM interrupt routing and delivery via irqfd.\n");
+	printf("-d	Use a VFIO device to send MSI-X interrupts instead of manually signaling the eventfd\n");
+	printf("-t	Override the IOMMU type to use (vfio_type1_iommu or iommufd)\n");
 	printf("\n");
 	exit(KSFT_FAIL);
 }
@@ -100,14 +151,25 @@ int main(int argc, char **argv)
 	u32 gsi = kvm_random_u64_in_range(&kvm_rng, 24, KVM_MAX_IRQ_ROUTES - 1);
 	u8 vector = kvm_random_u64_in_range(&kvm_rng, 32, UINT8_MAX);
 
-	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	pthread_t vcpu_threads[KVM_MAX_VCPUS];
+	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+	struct vfio_pci_device *device = NULL;
 	int nr_irqs = 1000, nr_vcpus = 1;
-	int i, j, c, eventfd;
+	const char *device_bdf = NULL;
+	const char *iommu_type = NULL;
+	int i, j, c, msix, eventfd;
+	struct iommu *iommu;
 	struct kvm_vm *vm;
+	int irq;
 
-	while ((c = getopt(argc, argv, "h")) != -1) {
+	while ((c = getopt(argc, argv, "d:ht:")) != -1) {
 		switch (c) {
+		case 'd':
+			device_bdf = optarg;
+			break;
+		case 't':
+			iommu_type = optarg;
+			break;
 		case 'h':
 		default:
 			help(argv[0]);
@@ -119,7 +181,19 @@ int main(int argc, char **argv)
 	vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus);
 	vm_install_exception_handler(vm, vector, guest_irq_handler);
 
-	eventfd = kvm_new_eventfd();
+	if (device_bdf) {
+		if (!iommu_type)
+			iommu_type = probe_iommu_type();
+		iommu = iommu_init(iommu_type);
+		device = vfio_pci_device_init(device_bdf, iommu);
+		msix = vfio_setup_msi(device);
+		irq = vfio_msix_to_host_irq(device_bdf, msix);
+		eventfd = device->msi_eventfds[msix];
+		printf("Using device %s MSI-X[%d] (IRQ-%u)\n", device_bdf, msix,
+		       irq);
+	} else {
+		eventfd = kvm_new_eventfd();
+	}
 
 	pr_info("Injecting interrupts for GSI %d (guest vector 0x%x) %d times\n",
 		gsi, vector, nr_irqs);
@@ -147,7 +221,7 @@ int main(int argc, char **argv)
 				    "IRQ flag for vCPU %d not clear prior to test",
 				    vcpus[j]->id);
 
-		eventfd_write(eventfd, 1);
+		trigger_interrupt(device, eventfd);
 
 		clock_gettime(CLOCK_MONOTONIC, &start);
 		while (!GUEST_RECEIVED_IRQ(vcpu) &&
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 07/20] KVM: selftests: Add an irqfd send+receive (and later IRQ bypass) test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: David Matlack <dmatlack@google.com>

Add a new test, irq_test, to verify that KVM correctly delivers interrupts
to a running vCPU when triggered via an eventfd bound to a KVM GSI using
KVM's irqfd mechanism.

This test is intentionally simple, for now.  Support for sending interrupts
via VFIO devices, for IRQ bypass, and for other features will be added in
the near future.

Add the test in common code, even though it currently will only build and
run on x86, as the concept and the bulk of the host-side code aren't
specific to x86.

Suggested-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/kvm/20250404193923.1413163-68-seanjc@google.com
Signed-off-by: David Matlack <dmatlack@google.com>
Co-developed-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: use while() and TEST_ASSERT() instead of if-statement => TEST_FAIL()]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm |   1 +
 tools/testing/selftests/kvm/irq_test.c   | 170 +++++++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/irq_test.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 59ec232b1e93..de5f6e91203b 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -157,6 +157,7 @@ TEST_GEN_PROGS_x86 += coalesced_io_test
 TEST_GEN_PROGS_x86 += dirty_log_perf_test
 TEST_GEN_PROGS_x86 += guest_memfd_test
 TEST_GEN_PROGS_x86 += hardware_disable_test
+TEST_GEN_PROGS_x86 += irq_test
 TEST_GEN_PROGS_x86 += mmu_stress_test
 TEST_GEN_PROGS_x86 += rseq_test
 TEST_GEN_PROGS_x86 += steal_time
diff --git a/tools/testing/selftests/kvm/irq_test.c b/tools/testing/selftests/kvm/irq_test.c
new file mode 100644
index 000000000000..9f8895b89821
--- /dev/null
+++ b/tools/testing/selftests/kvm/irq_test.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "kvm_util.h"
+#include "test_util.h"
+#include "apic.h"
+#include "processor.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/eventfd.h>
+
+static u64 timeout_ns = 2ULL * 1000 * 1000 * 1000;
+static bool guest_ready_for_irqs[KVM_MAX_VCPUS];
+static bool guest_received_irq[KVM_MAX_VCPUS];
+static bool done;
+
+#define GUEST_RECEIVED_IRQ(__vcpu)	\
+	SYNC_FROM_GUEST_AND_READ((__vcpu)->vm, guest_received_irq[(__vcpu)->id])
+
+static u32 guest_get_vcpu_id(void)
+{
+	return x2apic_read_reg(APIC_ID);
+}
+
+static void guest_irq_handler(struct ex_regs *regs)
+{
+	WRITE_ONCE(guest_received_irq[guest_get_vcpu_id()], true);
+
+	x2apic_write_reg(APIC_EOI, 0);
+}
+
+static void guest_code(void)
+{
+	x2apic_enable();
+
+	sti_nop();
+
+	WRITE_ONCE(guest_ready_for_irqs[guest_get_vcpu_id()], true);
+
+	while (!READ_ONCE(done))
+		cpu_relax();
+
+	GUEST_DONE();
+}
+
+static void *vcpu_thread_main(void *arg)
+{
+	struct kvm_vcpu *vcpu = arg;
+	struct ucall uc;
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_EQ(UCALL_DONE, get_ucall(vcpu, &uc));
+
+	return NULL;
+}
+
+static void kvm_route_msi(struct kvm_vm *vm, u32 gsi, struct kvm_vcpu *vcpu,
+			  u8 vector)
+{
+	struct {
+		struct kvm_irq_routing header;
+		struct kvm_irq_routing_entry entry;
+	} routing = {
+		.header.nr = 1,
+		.entry = {
+			.gsi = gsi,
+			.type = KVM_IRQ_ROUTING_MSI,
+			.u.msi.address_lo = 0xFEE00000 | (vcpu->id << 12),
+			.u.msi.data = vector,
+		},
+	};
+
+	vm_ioctl(vm, KVM_SET_GSI_ROUTING, &routing.header);
+}
+
+static void help(const char *name)
+{
+	printf("Usage: %s [-h]\n", name);
+	printf("\n");
+	printf("Tests KVM interrupt routing and delivery via irqfd.\n");
+	printf("\n");
+	exit(KSFT_FAIL);
+}
+
+int main(int argc, char **argv)
+{
+	/*
+	 * Pick a random vector and a random GSI to use for device IRQ.
+	 *
+	 * Pick an IRQ vector in range [32, UINT8_MAX]. Min value is 32 because
+	 * Linux/x86 reserves vectors 0-31 for exceptions and architecture
+	 * defined NMIs and interrupts.
+	 *
+	 * Pick a GSI in range [24, KVM_MAX_IRQ_ROUTES - 1]. The min value is 24
+	 * because KVM reserves GSIs 0-15 for legacy ISA IRQs and 16-23 only go
+	 * to the IOAPIC. The max is KVM_MAX_IRQ_ROUTES - 1, because
+	 * KVM_MAX_IRQ_ROUTES is exclusive.
+	 */
+	u32 gsi = kvm_random_u64_in_range(&kvm_rng, 24, KVM_MAX_IRQ_ROUTES - 1);
+	u8 vector = kvm_random_u64_in_range(&kvm_rng, 32, UINT8_MAX);
+
+	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+	pthread_t vcpu_threads[KVM_MAX_VCPUS];
+	int nr_irqs = 1000, nr_vcpus = 1;
+	int i, j, c, eventfd;
+	struct kvm_vm *vm;
+
+	while ((c = getopt(argc, argv, "h")) != -1) {
+		switch (c) {
+		case 'h':
+		default:
+			help(argv[0]);
+		}
+	}
+
+	TEST_REQUIRE(kvm_arch_has_default_irqchip());
+
+	vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus);
+	vm_install_exception_handler(vm, vector, guest_irq_handler);
+
+	eventfd = kvm_new_eventfd();
+
+	pr_info("Injecting interrupts for GSI %d (guest vector 0x%x) %d times\n",
+		gsi, vector, nr_irqs);
+
+	kvm_assign_irqfd(vm, gsi, eventfd);
+
+	for (i = 0; i < nr_vcpus; i++)
+		pthread_create(&vcpu_threads[i], NULL, vcpu_thread_main, vcpus[i]);
+
+	for (i = 0; i < nr_vcpus; i++) {
+		struct kvm_vcpu *vcpu = vcpus[i];
+
+		while (!SYNC_FROM_GUEST_AND_READ(vm, guest_ready_for_irqs[vcpu->id]))
+			continue;
+	}
+
+	for (i = 0; i < nr_irqs; i++) {
+		struct kvm_vcpu *vcpu = vcpus[i % nr_vcpus];
+		struct timespec start;
+
+		kvm_route_msi(vm, gsi, vcpu, vector);
+
+		for (j = 0; j < nr_vcpus; j++)
+			TEST_ASSERT(!GUEST_RECEIVED_IRQ(vcpus[j]),
+				    "IRQ flag for vCPU %d not clear prior to test",
+				    vcpus[j]->id);
+
+		eventfd_write(eventfd, 1);
+
+		clock_gettime(CLOCK_MONOTONIC, &start);
+		while (!GUEST_RECEIVED_IRQ(vcpu) &&
+		       timespec_to_ns(timespec_elapsed(start)) <= timeout_ns)
+			cpu_relax();
+
+		TEST_ASSERT(GUEST_RECEIVED_IRQ(vcpu),
+			    "vCPU %d timed out waiting for IRQ (vector 0x%x) from GSI %d\n",
+			    vcpu->id, vector, gsi);
+
+		WRITE_AND_SYNC_TO_GUEST(vm, guest_received_irq[vcpu->id], false);
+	}
+
+	WRITE_AND_SYNC_TO_GUEST(vm, done, true);
+
+	for (i = 0; i < nr_vcpus; i++)
+		pthread_join(vcpu_threads[i], NULL);
+
+	return 0;
+}
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 05/20] KVM: selftests: Seed libc's RNG before using it to generate a seed for KVM's pRNG
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

Seed the RNG used by random() using the de facto standard method of
srandom(time(0)), so that a different seed is actually used in each test
run.  E.g. without seeding the RNG, literally every test on x86 will use
0x6b8b4567 to seed the KVM RNG.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 1016865d3f7a..2e08d9fcefc7 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2284,6 +2284,7 @@ void __attribute((constructor)) kvm_selftest_init(void)
 	sigaction(SIGILL, &sig_sa, NULL);
 	sigaction(SIGFPE, &sig_sa, NULL);
 
+	srandom(time(0));
 	kvm_seed_rng(random());
 
 	kvm_selftest_arch_init();
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 08/20] KVM: selftests: Add helper to get host IRQ from device MSI-X for IRQ bypass test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: David Matlack <dmatlack@google.com>

Introduce proc_util.c and proc_util.h to house utility functions for
interacting with the proc filesystem.

Add vfio_msix_to_host_irq(), which parses /proc/interrupts, to get the host
Linux IRQ for a given VFIO device BDF and MSI-X vector.

This helper will be used by the eventfd IRQ test to print the host IRQ
number when triggering IRQs via a VFIO device, e.g. to aid in debugging if
the test fails.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: David Matlack <dmatlack@google.com>
Co-developed-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm      |  1 +
 .../testing/selftests/kvm/include/proc_util.h |  9 +++++
 tools/testing/selftests/kvm/lib/proc_util.c   | 40 +++++++++++++++++++
 3 files changed, 50 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/include/proc_util.h
 create mode 100644 tools/testing/selftests/kvm/lib/proc_util.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index de5f6e91203b..c112cedd3a2a 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -11,6 +11,7 @@ LIBKVM += lib/kvm_util.c
 LIBKVM += lib/lru_gen_util.c
 LIBKVM += lib/memstress.c
 LIBKVM += lib/guest_sprintf.c
+LIBKVM += lib/proc_util.c
 LIBKVM += lib/rbtree.c
 LIBKVM += lib/sparsebit.c
 LIBKVM += lib/test_util.c
diff --git a/tools/testing/selftests/kvm/include/proc_util.h b/tools/testing/selftests/kvm/include/proc_util.h
new file mode 100644
index 000000000000..704839b6d7af
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/proc_util.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_KVM_PROC_UTIL_H
+#define SELFTEST_KVM_PROC_UTIL_H
+
+#include <stdint.h>
+
+unsigned int vfio_msix_to_host_irq(const char *vfio_device_bdf, int msix);
+
+#endif /* SELFTEST_KVM_PROC_UTIL_H */
diff --git a/tools/testing/selftests/kvm/lib/proc_util.c b/tools/testing/selftests/kvm/lib/proc_util.c
new file mode 100644
index 000000000000..84d30f055a0a
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/proc_util.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "kvm_util.h"
+#include "test_util.h"
+#include "proc_util.h"
+
+static FILE *open_proc_interrupts(void)
+{
+	FILE *fp;
+
+	fp = fopen("/proc/interrupts", "r");
+	TEST_ASSERT(fp, "fopen(/proc/interrupts) failed");
+
+	return fp;
+}
+
+unsigned int vfio_msix_to_host_irq(const char *device_bdf, int msix)
+{
+	char search_string[64];
+	char line[4096];
+	int irq = -1;
+	FILE *fp;
+
+	fp = open_proc_interrupts();
+
+	snprintf(search_string, sizeof(search_string), "vfio-msix[%d]", msix);
+
+	while (fgets(line, sizeof(line), fp)) {
+		if (strstr(line, device_bdf) && strstr(line, search_string)) {
+			TEST_ASSERT_EQ(1, sscanf(line, "%d:", &irq));
+			break;
+		}
+	}
+
+	fclose(fp);
+
+	TEST_ASSERT(irq != -1, "Failed to locate IRQ for %s %s", device_bdf,
+		    search_string);
+	return (unsigned int)irq;
+}
+
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 04/20] KVM: selftests: Initialize the default/global pRNG during kvm_selftest_init()
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

Initialize the default kvm_rng during selftest initialization so that the
pRNG can be used by tests before creating a VM.  As pointed out by Sashiko,
failure to actually initialize the generator makes it decidedly not random.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 875030c22d07..1016865d3f7a 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -24,6 +24,13 @@ u32 kvm_random_seed;
 struct kvm_random_state kvm_rng;
 static u32 last_kvm_seed;
 
+static void kvm_seed_rng(u32 seed)
+{
+	kvm_random_seed = last_kvm_seed = seed;
+	pr_info("Random seed: 0x%x\n", kvm_random_seed);
+	kvm_rng = new_kvm_random_state(kvm_random_seed);
+}
+
 static size_t vcpu_mmap_sz(void);
 
 int __open_path_or_exit(const char *path, int flags, const char *enoent_help)
@@ -515,11 +522,9 @@ struct kvm_vm *__vm_create(struct vm_shape shape, u32 nr_runnable_vcpus,
 	slot0 = memslot2region(vm, 0);
 	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
 
-	if (kvm_random_seed != last_kvm_seed) {
-		pr_info("Random seed: 0x%x\n", kvm_random_seed);
-		last_kvm_seed = kvm_random_seed;
-	}
-	kvm_rng = new_kvm_random_state(kvm_random_seed);
+	if (kvm_random_seed != last_kvm_seed)
+		kvm_seed_rng(kvm_random_seed);
+
 	sync_global_to_guest(vm, kvm_rng);
 
 	kvm_arch_vm_post_create(vm, nr_runnable_vcpus);
@@ -2279,8 +2284,7 @@ void __attribute((constructor)) kvm_selftest_init(void)
 	sigaction(SIGILL, &sig_sa, NULL);
 	sigaction(SIGFPE, &sig_sa, NULL);
 
-	kvm_random_seed = last_kvm_seed = random();
-	pr_info("Random seed: 0x%x\n", kvm_random_seed);
+	kvm_seed_rng(random());
 
 	kvm_selftest_arch_init();
 }
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 06/20] KVM: selftests: Add helper to generate random u64 in range [min,max]
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: Josh Hilke <jrhilke@google.com>

Introduce kvm_random_u64_in_range(state, min, max). This function
returns a random u64 in the inclusive range of [min, max] using a struct
kvm_random_state.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../testing/selftests/kvm/include/test_util.h  |  3 +++
 tools/testing/selftests/kvm/lib/test_util.c    | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 44c0104d60ac..d64c8a228207 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -134,6 +134,9 @@ static inline u64 kvm_random_u64(struct kvm_random_state *state)
 	return ((u64)kvm_random_u32(state) << 32) | kvm_random_u32(state);
 }
 
+u64 kvm_random_u64_in_range(struct kvm_random_state *state, u64 min,
+			    u64 max);
+
 enum vm_mem_backing_src_type {
 	VM_MEM_SRC_ANONYMOUS,
 	VM_MEM_SRC_ANONYMOUS_THP,
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index e98ca7ef439c..e208a57f190c 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -42,6 +42,24 @@ u32 kvm_random_u32(struct kvm_random_state *state)
 	return state->seed;
 }
 
+/* Returns a random u64 in the inclusive range [min, max] */
+u64 kvm_random_u64_in_range(struct kvm_random_state *state, u64 min,
+			    u64 max)
+{
+	u64 value;
+	u64 range;
+
+	TEST_ASSERT(min <= max, "PEBKAC, min = 0x%lx, max = 0x%lx", min, max);
+
+	value = kvm_random_u64(state);
+
+	range = max - min;
+	if (range == ULLONG_MAX)
+		return value;
+
+	return min + (value % (range + 1));
+}
+
 /*
  * Parses "[0-9]+[kmgt]?".
  */
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 02/20] KVM: selftests: Add macros to read/write+sync to/from guest memory
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: David Matlack <dmatlack@google.com>

Add SYNC_FROM_GUEST_AND_READ(vm, variable) to read a variable's value
from the guest. Add WRITE_AND_SYNC_TO_GUEST(vm, variable, value) to
write a value to a guest variable. These macros improve the readability
of code which reads and writes data between host and guest in tests.

Use the new macros in existing tests that do back-to-back write+sync or
sync+read.

No functional changes are intended.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: David Matlack <dmatlack@google.com>
Co-developed-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/dirty_log_test.c  |  9 +++-----
 .../testing/selftests/kvm/include/kvm_util.h  | 10 +++++++++
 tools/testing/selftests/kvm/mmu_stress_test.c |  9 +++-----
 tools/testing/selftests/kvm/steal_time.c      | 22 +++++++------------
 4 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 74ca096bf976..087e94a8a81a 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -708,8 +708,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 
 		sync_global_to_guest(vm, iteration);
 
-		WRITE_ONCE(nr_writes, 0);
-		sync_global_to_guest(vm, nr_writes);
+		WRITE_AND_SYNC_TO_GUEST(vm, nr_writes, 0);
 
 		dirty_ring_prev_iteration_last_page = dirty_ring_last_page;
 		WRITE_ONCE(dirty_ring_vcpu_ring_full, false);
@@ -775,16 +774,14 @@ static void run_test(enum vm_guest_mode mode, void *arg)
 		 * writing memory during verification, pages that this thread
 		 * sees as clean may be written with this iteration's value.
 		 */
-		WRITE_ONCE(vcpu_stop, true);
-		sync_global_to_guest(vm, vcpu_stop);
+		WRITE_AND_SYNC_TO_GUEST(vm, vcpu_stop, true);
 		sem_wait(&sem_vcpu_stop);
 
 		/*
 		 * Clear vcpu_stop after the vCPU thread has acknowledge the
 		 * stop request and is waiting, i.e. is definitely not running!
 		 */
-		WRITE_ONCE(vcpu_stop, false);
-		sync_global_to_guest(vm, vcpu_stop);
+		WRITE_AND_SYNC_TO_GUEST(vm, vcpu_stop, false);
 
 		/*
 		 * Sync the number of writes performed before verification, the
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 04a910164a29..c1f588154398 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -1138,6 +1138,16 @@ vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
 	memcpy(&(g), _p, sizeof(g));				\
 })
 
+#define SYNC_FROM_GUEST_AND_READ(_vm, _variable) ({		\
+	sync_global_from_guest(_vm, _variable);			\
+	READ_ONCE(_variable);					\
+})
+
+#define WRITE_AND_SYNC_TO_GUEST(_vm, _variable, _value) do {	\
+	WRITE_ONCE(_variable, _value);				\
+	sync_global_to_guest(_vm, _variable);			\
+} while (0)
+
 /*
  * Write a global value, but only in the VM's (guest's) domain.  Primarily used
  * for "globals" that hold per-VM values (VMs always duplicate code and global
diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c
index 54d281419d31..473ef4c0ea9f 100644
--- a/tools/testing/selftests/kvm/mmu_stress_test.c
+++ b/tools/testing/selftests/kvm/mmu_stress_test.c
@@ -155,10 +155,8 @@ static void *vcpu_worker(void *data)
 		    "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);
 
 	atomic_inc(&nr_ro_faults);
-	if (atomic_read(&nr_ro_faults) == nr_vcpus) {
-		WRITE_ONCE(all_vcpus_hit_ro_fault, true);
-		sync_global_to_guest(vm, all_vcpus_hit_ro_fault);
-	}
+	if (atomic_read(&nr_ro_faults) == nr_vcpus)
+		WRITE_AND_SYNC_TO_GUEST(vm, all_vcpus_hit_ro_fault, true);
 
 #if defined(__x86_64__) || defined(__aarch64__)
 	/*
@@ -383,8 +381,7 @@ int main(int argc, char *argv[])
 	rendezvous_with_vcpus(&time_run2, "run 2");
 
 	mprotect(mem, slot_size, PROT_READ);
-	mprotect_ro_done = true;
-	sync_global_to_guest(vm, mprotect_ro_done);
+	WRITE_AND_SYNC_TO_GUEST(vm, mprotect_ro_done, true);
 
 	rendezvous_with_vcpus(&time_ro, "mprotect RO");
 	mprotect(mem, slot_size, PROT_READ | PROT_WRITE);
diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c
index 76fcdd1fd3cb..2de87549fcc0 100644
--- a/tools/testing/selftests/kvm/steal_time.c
+++ b/tools/testing/selftests/kvm/steal_time.c
@@ -70,8 +70,8 @@ static bool is_steal_time_supported(struct kvm_vcpu *vcpu)
 static void steal_time_init(struct kvm_vcpu *vcpu, u32 i)
 {
 	/* ST_GPA_BASE is identity mapped */
-	st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
-	sync_global_to_guest(vcpu->vm, st_gva[i]);
+	WRITE_AND_SYNC_TO_GUEST(vcpu->vm, st_gva[i],
+				(void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE));
 
 	vcpu_set_msr(vcpu, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED);
 }
@@ -187,8 +187,7 @@ static void steal_time_init(struct kvm_vcpu *vcpu, u32 i)
 	};
 
 	/* ST_GPA_BASE is identity mapped */
-	st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
-	sync_global_to_guest(vm, st_gva[i]);
+	WRITE_AND_SYNC_TO_GUEST(vm, st_gva[i], (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE));
 
 	st_ipa = (ulong)st_gva[i];
 	vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev);
@@ -310,10 +309,8 @@ static bool is_steal_time_supported(struct kvm_vcpu *vcpu)
 static void steal_time_init(struct kvm_vcpu *vcpu, u32 i)
 {
 	/* ST_GPA_BASE is identity mapped */
-	st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
-	st_gpa[i] = addr_gva2gpa(vcpu->vm, (gva_t)st_gva[i]);
-	sync_global_to_guest(vcpu->vm, st_gva[i]);
-	sync_global_to_guest(vcpu->vm, st_gpa[i]);
+	WRITE_AND_SYNC_TO_GUEST(vcpu->vm, st_gva[i], (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE));
+	WRITE_AND_SYNC_TO_GUEST(vcpu->vm, st_gpa[i], addr_gva2gpa(vcpu->vm, (gva_t)st_gva[i]));
 }
 
 static void steal_time_dump(struct kvm_vm *vm, u32 vcpu_idx)
@@ -442,8 +439,7 @@ static void steal_time_init(struct kvm_vcpu *vcpu, u32 i)
 	};
 
 	/* ST_GPA_BASE is identity mapped */
-	st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
-	sync_global_to_guest(vm, st_gva[i]);
+	WRITE_AND_SYNC_TO_GUEST(vm, st_gva[i], (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE));
 
 	err = __vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &attr);
 	TEST_ASSERT(err == 0, "No PV stealtime Feature");
@@ -549,8 +545,7 @@ int main(int ac, char **av)
 
 		/* Second VCPU run, expect guest stolen time to be <= run_delay */
 		run_vcpu(vcpus[i]);
-		sync_global_from_guest(vm, guest_stolen_time[i]);
-		stolen_time = guest_stolen_time[i];
+		stolen_time = SYNC_FROM_GUEST_AND_READ(vm, guest_stolen_time[i]);
 		run_delay = get_run_delay();
 		TEST_ASSERT(stolen_time <= run_delay,
 			    "Expected stolen time <= %ld, got %ld",
@@ -570,8 +565,7 @@ int main(int ac, char **av)
 
 		/* Run VCPU again to confirm stolen time is consistent with run_delay */
 		run_vcpu(vcpus[i]);
-		sync_global_from_guest(vm, guest_stolen_time[i]);
-		stolen_time = guest_stolen_time[i] - stolen_time;
+		stolen_time = SYNC_FROM_GUEST_AND_READ(vm, guest_stolen_time[i]) - stolen_time;
 		TEST_ASSERT(stolen_time >= run_delay,
 			    "Expected stolen time >= %ld, got %ld",
 			    run_delay, stolen_time);
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 03/20] KVM: selftests: Rename guest_rng to kvm_rng
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: Josh Hilke <jrhilke@google.com>

Rename functions prefixed with 'guest_random_' to 'kvm_random_' and the
global random state variable 'guest_rng' to 'kvm_rng', as the pRNG isn't
strictly limited to guest code.  This will allow using the pRNG in host
code without creating confusing/misleading function calls.

No functional changes are intended.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/dirty_log_perf_test.c       |  4 ++--
 tools/testing/selftests/kvm/dirty_log_test.c  |  2 +-
 .../testing/selftests/kvm/include/test_util.h | 22 +++++++++----------
 .../selftests/kvm/include/x86/kvm_util_arch.h |  4 ++--
 tools/testing/selftests/kvm/lib/kvm_util.c    | 20 ++++++++---------
 tools/testing/selftests/kvm/lib/memstress.c   |  8 +++----
 tools/testing/selftests/kvm/lib/test_util.c   |  6 ++---
 .../testing/selftests/kvm/x86/sev_dbg_test.c  |  2 +-
 8 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index ef779fa91827..7c5abe1ae9e0 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -311,7 +311,7 @@ int main(int argc, char *argv[])
 	int opt;
 
 	/* Override the seed to be deterministic by default. */
-	guest_random_seed = 1;
+	kvm_random_seed = 1;
 
 	dirty_log_manual_caps =
 		kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
@@ -357,7 +357,7 @@ int main(int argc, char *argv[])
 			p.phys_offset = strtoull(optarg, NULL, 0);
 			break;
 		case 'r':
-			guest_random_seed = atoi_positive("Random seed", optarg);
+			kvm_random_seed = atoi_positive("Random seed", optarg);
 			break;
 		case 's':
 			p.backing_src = parse_backing_src_type(optarg);
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 087e94a8a81a..e8419d7da1ea 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -121,7 +121,7 @@ static void guest_code(void)
 	while (true) {
 		while (!READ_ONCE(vcpu_stop)) {
 			addr = guest_test_virt_mem;
-			addr += (guest_random_u64(&guest_rng) % guest_num_pages)
+			addr += (kvm_random_u64(&kvm_rng) % guest_num_pages)
 				* guest_page_size;
 			addr = align_down(addr, host_page_size);
 
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index a56271c237ae..44c0104d60ac 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -108,30 +108,30 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2);
 struct timespec timespec_elapsed(struct timespec start);
 struct timespec timespec_div(struct timespec ts, int divisor);
 
-struct guest_random_state {
+struct kvm_random_state {
 	u32 seed;
 };
 
-extern u32 guest_random_seed;
-extern struct guest_random_state guest_rng;
+extern u32 kvm_random_seed;
+extern struct kvm_random_state kvm_rng;
 
-struct guest_random_state new_guest_random_state(u32 seed);
-u32 guest_random_u32(struct guest_random_state *state);
+struct kvm_random_state new_kvm_random_state(u32 seed);
+u32 kvm_random_u32(struct kvm_random_state *state);
 
-static inline bool __guest_random_bool(struct guest_random_state *state,
+static inline bool __kvm_random_bool(struct kvm_random_state *state,
 				       u8 percent)
 {
-	return (guest_random_u32(state) % 100) < percent;
+	return (kvm_random_u32(state) % 100) < percent;
 }
 
-static inline bool guest_random_bool(struct guest_random_state *state)
+static inline bool kvm_random_bool(struct kvm_random_state *state)
 {
-	return __guest_random_bool(state, 50);
+	return __kvm_random_bool(state, 50);
 }
 
-static inline u64 guest_random_u64(struct guest_random_state *state)
+static inline u64 kvm_random_u64(struct kvm_random_state *state)
 {
-	return ((u64)guest_random_u32(state) << 32) | guest_random_u32(state);
+	return ((u64)kvm_random_u32(state) << 32) | kvm_random_u32(state);
 }
 
 enum vm_mem_backing_src_type {
diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
index c33ab6e04171..6904dbda79f9 100644
--- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
@@ -55,9 +55,9 @@ static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch)
 do {											\
 	const typeof(mem) val = (__val);						\
 											\
-	if (!is_forced_emulation_enabled || guest_random_bool(&guest_rng)) {		\
+	if (!is_forced_emulation_enabled || kvm_random_bool(&kvm_rng)) {		\
 		(mem) = val;								\
-	} else if (guest_random_bool(&guest_rng)) {					\
+	} else if (kvm_random_bool(&kvm_rng)) {					\
 		__asm__ __volatile__(KVM_FEP "mov %1, %0"				\
 				     : "+m" (mem)					\
 				     : "r" (val) : "memory");				\
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 195f3fdae1e3..875030c22d07 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -20,9 +20,9 @@
 
 #define KVM_UTIL_MIN_PFN	2
 
-u32 guest_random_seed;
-struct guest_random_state guest_rng;
-static u32 last_guest_seed;
+u32 kvm_random_seed;
+struct kvm_random_state kvm_rng;
+static u32 last_kvm_seed;
 
 static size_t vcpu_mmap_sz(void);
 
@@ -515,12 +515,12 @@ struct kvm_vm *__vm_create(struct vm_shape shape, u32 nr_runnable_vcpus,
 	slot0 = memslot2region(vm, 0);
 	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
 
-	if (guest_random_seed != last_guest_seed) {
-		pr_info("Random seed: 0x%x\n", guest_random_seed);
-		last_guest_seed = guest_random_seed;
+	if (kvm_random_seed != last_kvm_seed) {
+		pr_info("Random seed: 0x%x\n", kvm_random_seed);
+		last_kvm_seed = kvm_random_seed;
 	}
-	guest_rng = new_guest_random_state(guest_random_seed);
-	sync_global_to_guest(vm, guest_rng);
+	kvm_rng = new_kvm_random_state(kvm_random_seed);
+	sync_global_to_guest(vm, kvm_rng);
 
 	kvm_arch_vm_post_create(vm, nr_runnable_vcpus);
 
@@ -2279,8 +2279,8 @@ void __attribute((constructor)) kvm_selftest_init(void)
 	sigaction(SIGILL, &sig_sa, NULL);
 	sigaction(SIGFPE, &sig_sa, NULL);
 
-	guest_random_seed = last_guest_seed = random();
-	pr_info("Random seed: 0x%x\n", guest_random_seed);
+	kvm_random_seed = last_kvm_seed = random();
+	pr_info("Random seed: 0x%x\n", kvm_random_seed);
 
 	kvm_selftest_arch_init();
 }
diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c
index 6dcd15910a06..3599b75d97c9 100644
--- a/tools/testing/selftests/kvm/lib/memstress.c
+++ b/tools/testing/selftests/kvm/lib/memstress.c
@@ -48,14 +48,14 @@ void memstress_guest_code(u32 vcpu_idx)
 {
 	struct memstress_args *args = &memstress_args;
 	struct memstress_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx];
-	struct guest_random_state rand_state;
+	struct kvm_random_state rand_state;
 	gva_t gva;
 	u64 pages;
 	u64 addr;
 	u64 page;
 	int i;
 
-	rand_state = new_guest_random_state(guest_random_seed + vcpu_idx);
+	rand_state = new_kvm_random_state(kvm_random_seed + vcpu_idx);
 
 	gva = vcpu_args->gva;
 	pages = vcpu_args->pages;
@@ -69,13 +69,13 @@ void memstress_guest_code(u32 vcpu_idx)
 
 		for (i = 0; i < pages; i++) {
 			if (args->random_access)
-				page = guest_random_u32(&rand_state) % pages;
+				page = kvm_random_u32(&rand_state) % pages;
 			else
 				page = i;
 
 			addr = gva + (page * args->guest_page_size);
 
-			if (__guest_random_bool(&rand_state, args->write_percent))
+			if (__kvm_random_bool(&rand_state, args->write_percent))
 				*(u64 *)addr = 0x0123456789ABCDEF;
 			else
 				READ_ONCE(*(u64 *)addr);
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index bab1bd2b775b..e98ca7ef439c 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -30,13 +30,13 @@ void __attribute__((used)) expect_sigbus_handler(int signum)
  * Park-Miller LCG using standard constants.
  */
 
-struct guest_random_state new_guest_random_state(u32 seed)
+struct kvm_random_state new_kvm_random_state(u32 seed)
 {
-	struct guest_random_state s = {.seed = seed};
+	struct kvm_random_state s = {.seed = seed};
 	return s;
 }
 
-u32 guest_random_u32(struct guest_random_state *state)
+u32 kvm_random_u32(struct kvm_random_state *state)
 {
 	state->seed = (u64)state->seed * 48271 % ((u32)(1 << 31) - 1);
 	return state->seed;
diff --git a/tools/testing/selftests/kvm/x86/sev_dbg_test.c b/tools/testing/selftests/kvm/x86/sev_dbg_test.c
index a9d8e4c059f9..eaa8201b937d 100644
--- a/tools/testing/selftests/kvm/x86/sev_dbg_test.c
+++ b/tools/testing/selftests/kvm/x86/sev_dbg_test.c
@@ -34,7 +34,7 @@ static void validate_buffers(void)
 
 static void ____test_sev_dbg(struct kvm_vm *vm, int i, int j, int nr_bytes)
 {
-	u8 pattern = guest_random_u32(&guest_rng);
+	u8 pattern = kvm_random_u32(&kvm_rng);
 
 	if (i + nr_bytes > BUFFER_SIZE || j + nr_bytes > BUFFER_SIZE)
 		return;
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 01/20] KVM: selftests: Build and link selftests/vfio/lib into KVM selftests
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke
In-Reply-To: <20260613002031.745413-1-seanjc@google.com>

From: David Matlack <dmatlack@google.com>

Include libvfio.mk into the KVM selftests Makefile and link it into all
KVM selftests by adding it to LIBKVM_OBJS.

This lays the groundwork for future changes to utilize VFIO devices to
verify IRQ bypass in KVM selftests.

Note that KVM selftests build their own copy of selftests/vfio/lib and
the resulting object files are placed in $(OUTPUT)/lib. This allows the
KVM and VFIO selftests to apply different CFLAGS when building without
conflicting with each other.

Signed-off-by: David Matlack <dmatlack@google.com>
Signed-off-by: Josh Hilke <jrhilke@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 9462919d2660..59ec232b1e93 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -258,6 +258,7 @@ OVERRIDE_TARGETS = 1
 # which causes the environment variable to override the makefile).
 include ../lib.mk
 include ../cgroup/lib/libcgroup.mk
+include ../vfio/lib/libvfio.mk
 
 INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
@@ -312,7 +313,9 @@ LIBKVM_S := $(filter %.S,$(LIBKVM))
 LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
 LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
 LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
-LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) $(LIBCGROUP_O)
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
+LIBKVM_OBJS += $(LIBCGROUP_O)
+LIBKVM_OBJS += $(LIBVFIO_O)
 SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS))
 SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH)/%.o, $(SPLIT_TESTS))
 
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply related

* [PATCH v7 00/20] KVM: selftests: Add eventfd+VFIO IRQ test
From: Sean Christopherson @ 2026-06-13  0:20 UTC (permalink / raw)
  To: Paolo Bonzini, Marc Zyngier, Oliver Upton, Sean Christopherson
  Cc: Joey Gouly, Steffen Eiden, Suzuki K Poulose, Zenghui Yu, kvm,
	linux-arm-kernel, kvmarm, linux-kernel, David Matlack, Josh Hilke

David and Josh's series to add a selftest for verifying interrupt delivery
via eventfd (via KVM_IRQFD), and also from a real device, wired up via VFIO.

I originally wanted to get this into 7.2, but that's not going to happen.  But
I hope to get this applied early in the 7.3 cycle so that additional features
and whatnot can be developed on top without too much pain (hopefully).

Gory details in the patches, and in the v5 cover letter.

v7:
 - Seed kvm_rng during kvm_selftest_init(). [Sashiko]
 - Seed libc's RNG during kvm_selftest_init(), so that the seed (from random())
   that's printed and passed to kvm_rng isn't the same every time.
 - Init irq_cpu to -1 in all paths. [Sashiko]

v6:
 - Massage most changelogs.
 - Fix SoB ordering issues.
 - Clean up KVM_SET_GSI_ROUTING helper.
 - Remove misleading "IRQ injection" and "emulated eventfd" terminology.
 - Add GUEST_RECEIVED_INTERRUPT() to simplify the core loop.
 - Use cpu_relax() in tight loops while waiting for interrupts.
 - Print as much information as possible in the actual assert, instead of
   printing to stdout separately.
 - Make a best guess as to the right VFIO vs. IOMMUFD mode instead of
   assuming IOMMUFD, and give the user the option to override said guess.
 - Simplify open_proc_irq_smp_affinity_list() +
   write_proc_irq_smp_affinity_list() into proc_irq_set_smp_affinity().
 - Drop print_proc_irq_file() and kvm_print_vcpu_affinity() (for now) to avoid
   potential issues on systems with high CPU counts.
 - Drop the blocking/HLT testing as it was at best broken.
 - Use -e for "empty", not -c for "clear", when completely tearing down GSI
   routing, because routing can be "cleared" without completely emptying the
   routing information.
 - Use the main task's CPU affinity as the available_cpus set.
 - Allow overcommitting vCPUs:pCPUs.
 - Set the target vCPU's affinity instead of batching when vCPU0 is targeted.
 - Add support for 256+ vCPUs with x2APIC.
 - Restrict xAPIC mode to 255 vCPUs.
 - Restrict the test to KVM selftest's max supported vCPUs.

v5:
 - https://lore.kernel.org/all/20260604020143.748245-1-jrhilke@google.com
 - Rename get_proc_vfio_irq_number() to vfio_msix_to_host_irq()
 - Rename open_proc_irq_affinity() and write_proc_irq_affinity() to include "_smp_affinity_list"
 - Print /proc/irq/<irq>/smp_affinity and effective_affinity on timeout failures
 - Convert IRQ type from 'int' to 'unsigned int' across helpers and the test
 - Fix compiler warnings for uninitialized variables in irq_test.c
 - Remove rate-limiting on affinity changes

v4: https://lore.kernel.org/kvm/20260530002134.558837-1-jrhilke@google.com

David Matlack (11):
  KVM: selftests: Build and link selftests/vfio/lib into KVM selftests
  KVM: selftests: Add macros to read/write+sync to/from guest memory
  KVM: selftests: Add an irqfd send+receive (and later IRQ bypass) test
  KVM: selftests: Add helper to get host IRQ from device MSI-X for IRQ
    bypass test
  KVM: selftests: Add VFIO device support to eventfd IRQ test
  KVM: selftests: Verify interrupts are received when IRQ affinity
    changes in IRQ test
  KVM: selftests: Add option to set empty routing between IRQs in
    eventfd IRQ test
  KVM: selftests: Make number of IRQs configurable in IRQ test
  KVM: selftests: Verify vCPU migration during IRQ delivery in IRQ test
  KVM: selftests: Make number of vCPUs configurable in IRQ test
  KVM: selftests: Add xAPIC support in eventfd IRQ test

Josh Hilke (6):
  KVM: selftests: Rename guest_rng to kvm_rng
  KVM: selftests: Add helper to generate random u64 in range [min,max]
  KVM: selftests: Add a helper to set proc IRQ affinity for IRQ test
  KVM: selftests: Add kvm_gettid() wrapper and convert users
  KVM: selftests: Add kvm_sched_getaffinity() wrapper and convert users
  KVM: selftests: Add a utility to pin a task to a random CPU, given a
    CPU set

Sean Christopherson (3):
  KVM: selftests: Initialize the default/global pRNG during
    kvm_selftest_init()
  KVM: selftests: Seed libc's RNG before using it to generate a seed for
    KVM's pRNG
  KVM: selftests: Verify non-postable IRQ remapping in IRQ test

 tools/testing/selftests/kvm/Makefile.kvm      |   7 +-
 tools/testing/selftests/kvm/arch_timer.c      |   2 +-
 .../kvm/arm64/arch_timer_edge_cases.c         |   2 +-
 .../selftests/kvm/demand_paging_test.c        |   2 +-
 .../selftests/kvm/dirty_log_perf_test.c       |   4 +-
 tools/testing/selftests/kvm/dirty_log_test.c  |  11 +-
 .../selftests/kvm/include/kvm_syscalls.h      |   7 +
 .../testing/selftests/kvm/include/kvm_util.h  |  12 +
 .../testing/selftests/kvm/include/proc_util.h |  11 +
 .../testing/selftests/kvm/include/test_util.h |  25 +-
 .../selftests/kvm/include/x86/kvm_util_arch.h |   4 +-
 tools/testing/selftests/kvm/irq_test.c        | 350 ++++++++++++++++++
 tools/testing/selftests/kvm/lib/assert.c      |   8 +-
 tools/testing/selftests/kvm/lib/kvm_util.c    |  53 ++-
 tools/testing/selftests/kvm/lib/memstress.c   |   8 +-
 tools/testing/selftests/kvm/lib/proc_util.c   |  54 +++
 tools/testing/selftests/kvm/lib/test_util.c   |  27 +-
 tools/testing/selftests/kvm/mmu_stress_test.c |  15 +-
 tools/testing/selftests/kvm/rseq_test.c       |   6 +-
 tools/testing/selftests/kvm/steal_time.c      |  22 +-
 .../testing/selftests/kvm/x86/sev_dbg_test.c  |   2 +-
 21 files changed, 548 insertions(+), 84 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/include/proc_util.h
 create mode 100644 tools/testing/selftests/kvm/irq_test.c
 create mode 100644 tools/testing/selftests/kvm/lib/proc_util.c


base-commit: c1f7303302927f9cbf4efedf70f0512cde168c65
-- 
2.54.0.1136.gdb2ca164c4-goog



^ permalink raw reply

* Re: [RFC PATCH 0/2] kasan: hw_tags: Add option to tag only at allocation time
From: Isaac Manjarres @ 2026-06-13  0:16 UTC (permalink / raw)
  To: Dev Jain
  Cc: ryabinin.a.a, akpm, corbet, glider, andreyknvl, dvyukov,
	vincenzo.frascino, kasan-dev, linux-mm, linux-kernel, skhan,
	workflows, linux-doc, linux-arm-kernel, ryan.roberts,
	anshuman.khandual, kaleshsingh, 21cnbao, david, will,
	catalin.marinas
In-Reply-To: <20260612044425.763060-1-dev.jain@arm.com>

On Fri, Jun 12, 2026 at 04:44:22AM +0000, Dev Jain wrote:
> Introduce a boot option to tag only at allocation time of the objects. This
> reduces KASAN MTE overhead, the tradeoff being reduced ability of
> catching bugs.
> 
> Now, when a memory object will be freed, it will retain the random tag it
> had at allocation time. This compromises on catching UAF bugs, till the
> time the object is not reallocated, at which point it will have a new
> random tag.
> 
> Hence, not catching "use-after-free-before-reallocation" and not catching
> "double-free" will be the compromise for reduced KASAN overhead.
> 
> This is an RFC because we are not clear about the performance benefit.
> 
> Android folks, please help with testing!
> 
> ---
> Applies on Linus master (9716c086c8e8).
> 
> Dev Jain (2):
>   kasan: hw_tags: Use KASAN_PAGE_REDZONE for vmalloc redzoning
>   kasan: hw_tags: Add boot option to elide free time poisoning
> 
>  Documentation/dev-tools/kasan.rst |  4 +++
>  mm/kasan/hw_tags.c                | 45 +++++++++++++++++++++++++++++--
>  mm/kasan/kasan.h                  | 23 +++++++++++++++-
>  3 files changed, 69 insertions(+), 3 deletions(-)
> 
> -- 
> 2.43.0

I tested out this series on one of our devices that has MTE support,
and didn't see any functional issues.

One thing I did notice though, and it's independent of this patch, is
that the vmalloc_oob test is failing, but that happens even if these
patches aren't present.

Thanks,
Isaac


^ permalink raw reply

* Re: [PATCH v2 3/3] arm64: escalate smp_send_stop() to an SDEI NMI as a last resort
From: Doug Anderson @ 2026-06-12 23:44 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Catalin Marinas, Will Deacon, James Morse, Mark Rutland,
	Marc Zyngier, Petr Mladek, Thomas Gleixner, Andrew Morton,
	Baoquan He, Puranjay Mohan, Usama Arif, Breno Leitao,
	Julien Thierry, Lecopzer Chen, Sumit Garg, kernel-team, kexec,
	linux-arm-kernel, linux-kernel
In-Reply-To: <airhxVP7vAVehIXQ@thinkstation>

Hi,

On Thu, Jun 11, 2026 at 10:47 AM Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> > FWIW, I'm not totally sure I followed the logic for why "die_on_crash"
> > needs to be "false" for the SDEI case,
>
> It's not about kexec mechanics, it's about the SDEI dispatch state.
>
> The SDEI stop handler parks inside an SDEI event that it deliberately
> never completes — completing it makes firmware resume the wedged
> context, which is the opposite of what we want. PSCI CPU_OFF from inside
> that not-yet-completed event silently wedges EL3 on at least one
> production firmware (still root-causing on the firmware side), so the
> SDEI path saves the crashed context and parks instead of powering off.
>
> The only consequence is that an SMP capture kernel can't re-online that
> CPU. The dump itself is complete. I've left "power the SDEI-stopped CPU
> off too" as a follow-up and called it out in the cover letter. The IPI
> crash path is unaffected and still does CPU_OFF, exactly as before.

Ah, OK. This makes sense. I read the cover letter, but I guess this
part of it didn't stick in my mind. I wouldn't mind an explanation of
what's going on being included as a comment in the code. Then someone
down the line won't be left wondering.


> > Do you have any reasoning for why you don't pick a separate EVENT ID
> > for "backtrace" vs. "stop". If you absolutely have to share an ID
> > because they're a limited resource then I guess it's fine, but it
> > would make the code easier to understand / reason about if they were
> > separate IDs.
> >
> > If you had a separate EVENT ID, then it seems like you could
> > completely eliminate the (potentially large) `sdei_nmi_stop_mask`
> > variable, right? Any time a "STOP" event fires you can unconditionally
> > consider it to be a stop w/ no globals needed, right?
>
> Separate event IDs aren't available: SDEI_EVENT_SIGNAL only ever signals
> event 0 — it's the one architecturally software-signalled event. Every
> other event number is an interrupt-bound event that firmware has to
> define and bind, which is the firmware dependency this series is
> specifically trying not to add. So backtrace and stop are stuck sharing
> event 0.

Oh well.


> But you're right that the mask should go — just not via a second event. A
> stop is terminal and system-wide (sdei_nmi_stop_cpus() is only reached
> from smp_send_stop(), which never returns), so once a stop is requested
> every later event-0 fire is a stop too. I replaced the cpumask with a
> single write-once flag the handler reads; a backtrace that races in
> after a stop has begun just stops that CPU, which is fine. So the
> (potentially large) variable is gone.

Yeah, this sounds much better, thanks!


^ permalink raw reply

* [PATCH v2] arm64: tlbflush: Reset active_cpu on ASID rollover
From: Sayali Kulkarni @ 2026-06-12 23:21 UTC (permalink / raw)
  To: catalin.marinas
  Cc: linux-arm-kernel, linux-kernel, will, ryan.roberts, linu.cherian,
	yang, cl, sskulkarni
In-Reply-To: <airWIxSd_a5kR65-@arm.com>

From: Sayali Kulkarni <sskulkarni@amperecomputing.com>

Hi Catalin,  

Thank you for the review. I’ve addressed your feedback in v2:  

- Moved `WRITE_ONCE(mm->context.active_cpu, ACTIVE_CPU_NONE)` from `check_and_switch_context()` to `new_context()` after the `set_asid` label. At this point, a brand new ASID has been allocated that no CPU has ever used, so the reset is safe even for multi-threaded processes where other CPUs may still be running with the old ASID via `reserved_asids`.  
- Updated the commit message to correct the safety reasoning: `flush_context()` only sets `tlb_flush_pending`; it does not issue a global TLB flush.  

Thanks,  
Sayali


Once active_cpu flips to ACTIVE_CPU_MULTIPLE it never resets, even if
the process settles back to one CPU. Reset it to ACTIVE_CPU_NONE in
new_context() after a new ASID is allocated at the set_asid label.

At this point a brand new ASID has been assigned that no CPU has ever
used, so ACTIVE_CPU_NONE accurately reflects reality. Any other threads
of the same process continue running with the old ASID via
reserved_asids and are unaffected.

This gives processes a fresh chance at the local-only flush fast path
after each ASID generation rollover.

Signed-off-by: Sayali Kulkarni <sskulkarni@amperecomputing.com> (Ampere)
---
 arch/arm64/mm/context.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index f34ed78393e0..46c7fd07b9bf 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -209,6 +209,7 @@ static u64 new_context(struct mm_struct *mm)
 set_asid:
 	__set_bit(asid, asid_map);
 	cur_idx = asid;
+	WRITE_ONCE(mm->context.active_cpu, ACTIVE_CPU_NONE);
 	return asid2ctxid(asid, generation);
 }
 
-- 
2.47.3



^ permalink raw reply related

* [PATCH v4 00/31] Introduce SCMI Telemetry FS support
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
  To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
	linux-doc
  Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
	etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
	elif.topuz, lukasz.luba, philip.radford, brauner,
	souvik.chakravarty, leitao, kas, puranjay, usama.arif,
	kernel-team, Cristian Marussi

Hi all,

--------------------------------------------------------------------------------
[TLDR Summary]
This series introduces a new SCMI driver which uses a new Telemetry FS to expose
and configure SCMI Telemetry Data Events retrieved from the platform SCMI FW
at runtime. The patches carrying the new STLMFS Filesystem support are tagged
with 'stlmfs'.
--------------------------------------------------------------------------------

The upcoming SCMI v4.0 specification [0] introduces a new SCMI protocol
dedicated to System Telemetry.

In a nutshell, the SCMI Telemetry protocol allows an agent to discover at
runtime the set of Telemetry Data Events (DEs) available on a specific
platform and provides the means to configure the set of DEs that a user is
interested in, while reading them back using the collection method that
is deemed more suitable for the usecase at hand. (...amongst the various
possible collection methods allowed by SCMI specification)

Without delving into the gory details of the whole SCMI Telemetry protocol
let's just say that the SCMI platform/server firmware advertises a number
of Telemetry Data Events, each one identified by a 32bit unique ID, and an
SCMI agent/client, like Linux, can discover them and read back at will the
associated data value in a number of ways.
Data collection is mainly intended to happen on demand via shared memory
areas exposed by the platform firmware, discovered dynamically via SCMI
Telemetry and accessed by Linux on-demand, but some DE can also be reported
via SCMI Notifications asynchronous messages or via direct dedicated
FastChannels (another kind of SCMI memory based access): all of this
underlying mechanism is anyway hidden to the user since it is mediated by
the kernel driver which will return the proper data value when queried.

Anyway, the set of well-known architected DE IDs defined by the spec is
limited to a dozen IDs, which means that the vast majority of DE IDs are
customizable per-platform: as a consequence, though, the same ID, say
'0x1234', could represent completely different things on different systems.

Precise definitions and semantics of such custom Data Event IDs are out of
the scope of the SCMI Telemetry specification and of this implementation:
they are supposed to be provided using some kind of JSON-like description
file that will have to be consumed by a userspace tool which would be
finally in charge of making sense of the set of available DEs.

IOW, in turn, this means that even though the DEs enumerated via SCMI come
with some sort of topological and qualitative description provided by the
protocol (like unit of measurements, name, topology info etc), kernel-wise
we CANNOT be completely sure of "what is what" without being fed-back some
sort of information about the DEs by the aforementioned userspace tool.

For these reasons, currently this series does NOT attempt to register any
of these DEs with any of the usual in-kernel subsystems (like HWMON, IIO,
PERF etc), simply because we cannot be sure which DE is suitable, or even
desirable, for a given subsystem. This also means there are NO in-kernel
users of these Telemetry data events as of now.

So, while we do not rule out feeding/registering some of the discovered DEs
to/with some of the above mentioned Kernel subsystems in the future, as
of now we have ONLY modeled a custom userspace API to make SCMI Telemetry
available to userspace tools.

In deciding which kind of interface to expose SCMI Telemetry data to a
user, this new SCMI Telemetry driver aims at satisfying 2 main reqs:

 - exposing an FS-based human-readable interface that can be used to
   discover, configure and access our Telemetry data directly also from
   the shell without special tools

 - exposing alternative machine-friendly, more-performant, binary
   interfaces that can be used to avoid the overhead of multiple accesses
   to the VFS and that can be more suitable to access with custom tools

In the initial RFC posted a few months ago [1], the above was achieved
with a combination of a SysFS interface, for the human-readable side of
the story, and a classic chardev/ioctl for the plain binary access.

Since V1, instead, we moved away from this combined approach, especially
away from SysFS, for the following reasons:

 1. "Abusing SysFS": SysFS is a handy way to expose device related
      properties in a common way, using a few common helpers built on
      kernfs; this means, though, that unfortunately in our scenario I had
      to generate a dummy simple device for EACH SCMI Telemetry DataEvent
      that I got to discover at runtime and attach to each of them all of
      the properties I need.
      This by itself seemed to me abusing the SysFS framework, but, even
      ignoring this, the impact on the system when we have to deal with
      hundreds or tens of thousands of DEs is significant.
      In some test scenarios I ended up with 50k DE devices and half a
      million related property files ... O_o

 2. "SysFS constraints": SysFS usage itself has its well-known constraints
      and best practices, like the one-file/one-value rule, and due to the
      fact that any virtual file with a complex structure or handling logic
      is frowned upon, you can forget about IOCTLs and mmap'ing to provide
      a more performant interface within SysFs, which is the reason why,
      in the previous RFC, there was an additional alternative chardev
      interface.
      These latter limitations around the implementation of files with a
      more complex semantics (i.e. with a broader set of file_operations)
      derive from the underlying KernFS support, so KernFS is equally not
      suitable as a building block for our implementation.

 3. "Chardev limitations": Given the nature of the protocol, the hybrid
      approach employing character devices was itself problematic: first
      of all because there is an upper limit on the number of chardevs we
      can create, dictated by the range of available minor numbers, and
      then because having to maintain 2 completely different interfaces
      (FS + chardev) is itself painful.

As a final remark, please NOTE THAT all of this is supposed to be available
in production systems across a number of heterogeneous platforms: for these
reasons the easy choice, debugFS, is NOT an option here.

Due to the above reasoning, since V1 we opted for a new approach with the
proposed interfaces now based on a full fledged, unified, virtual pseudo
filesystem implemented from scratch, so that we can:

 - expose all the DE properties we like as before with SysFS, but without
   any of the constraints imposed by the usage of SysFs or kernfs.

 - easily expose additional alternative views of the same set of DEs
   using symlinking capabilities (e.g. alternative topological view)

 - additionally expose a few alternative and more performant interfaces
   by embedding in that same FS, a few special virtual files:

   + 'control': to issue IOCTLs for quicker discovery and on-demand access
   		to data
   + 'pipe' [TBD]: to provide a stream of events using a virtual
   		   infinite-style file
   + 'raw_<N>' [TBD]: to provide direct memory mapped access to the raw
   		      SCMI Telemetry data from userspace

 - use a mount option to enable a lazy enumeration operation mode to delay
   SCMI related background discovery activities to the effective point in
   time when the user needs it (if ever) so as to mitigate the effect at
   boot-time of the initial SCMI full discovery process


INTERFACES
===========

We propose a couple of interfaces, both rooted in the same unified
SCMI Telemetry Filesystem STLMFS, which can be mounted with:

	mount -t stlmfs none /sys/fs/arm_telemetry/

The new pseudo FS rationale, design and related ABI interface is documented
in detail at:

 - Documentation/filesystems/stlmfs.rst
 - Documentation/ABI/testing/stlmfs

...anyway, roughly, STLMFS exposes the following interfaces, rooted at
different points in the FS:

 1. a FS based human-readable API tree

   This API presents the discovered DEs and DEs-groups rooted under a
   structure like this:

	/sys/fs/arm_telemetry/tlm_0/
	|-- all_des_enable
	|-- all_des_tstamp_enable
	|-- available_update_intervals_ms
	|-- current_update_interval_ms
	|-- de_implementation_version
	|-- des
	|   |-- 0x00000000/
	|   |-- 0x00000016/
	|   |-- 0x00001010/
	|   |-- 0x0000A000/
	|   |-- 0x0000A001/
	|   |-- 0x0000A002/
	|   |-- 0x0000A005/
	|   |-- 0x0000A007/
	|   |-- 0x0000A008/
	|   |-- 0x0000A00A/
	|   |-- 0x0000A00B/
	|   |-- 0x0000A00C/
	|   `-- 0x0000A010/
	|-- des_bulk_read
	|-- des_single_sample_read
	|-- groups
	|   |-- 0/
	|   `-- 1/
	|-- intervals_discrete
	|-- reset
	|-- tlm_enable
	`-- version

	At the top level we have general configuration knobs to:

	- enable/disable all DEs with or without tstamp
	- configure the update interval that the platform will use
	- enable Telemetry as a whole
	- des_bulk_read to read all the enabled DEs in a single buffer one-per-line
		<DE_ID> <TIMESTAMP> <DATA_VALUE>
	- des_single_sample_read to request an immediate updated read of
	  all the enabled DEs in a single buffer one-per-line:
		<DE_ID> <TIMESTAMP> <DATA_VALUE>
        
	where each DE in turn is represented by a flat subtree like:

	tlm_0/des/0x0000A001/
	|-- compo_instance_id
	|-- compo_type
	|-- enable
	|-- instance_id
	|-- name
	|-- persistent
	|-- tstamp_enable
	|-- tstamp_exp
	|-- type
	|-- unit
	|-- unit_exp
	`-- value

	where, besides a bunch of description items, you can:

	- enable/disable a single DE
	- read back its tstamp and data from 'value' as in:
		<TIMESTAMP>: <DATA_VALUE>

	then for each (optionally) discovered group of DEs:

	scmi_tlm_0/groups/0/
	|-- available_update_intervals_ms
	|-- composing_des
	|-- current_update_interval_ms
	|-- des_bulk_read
	|-- des_single_sample_read
	|-- enable
	|-- intervals_discrete
	`-- tstamp_enable

	you can find the knobs to:
	
	- enable/disable the group as a whole
	- lookup group composition
	- set a per-group update interval (if supported)
	- des_bulk_read to read all the enabled DEs for this group in a
	  single buffer one-per-line:
		<DE_ID> <TIMESTAMP> <DATA_VALUE>
	- des_single_sample_read to request an immediate updated read of
	  all the enabled DEs for this group in a single buffer
	  one-per-line:
		<DE_ID> <TIMESTAMP> <DATA_VALUE>

 2. Leveraging the capabilities offered by the full-fledged filesystem
    implementation and the topological information provided by SCMI
    Telemetry we also expose an alternative view of the above tree, by
    symlinking a few of the same entries above under another, topologically
    sorted, subtree:


        by-components/                                                           
        ├── cpu                                                                  
        │   ├── 0                                                                
        │   │   ├── celsius                                                      
        │   │   │   └── 0                                                        
        │   │   │       └── 0x00000001[pe_0] -> ../../../../../des/0x00000001    
        │   │   └── cycles                                                       
        │   │       ├── 0                                                        
        │   │       │   └── 0x00001010[] -> ../../../../../des/0x00001010        
        │   │       └── 1                                                        
        │   │           └── 0x00002020[] -> ../../../../../des/0x00002020        
        │   ├── 1                                                                
        │   │   └── celsius                                                      
        │   │       └── 0                                                        
        │   │           └── 0x00000002[pe_1] -> ../../../../../des/0x00000002    
        │   └── 2                                                                
        │       └── celsius                                                      
        │           └── 0                                                        
        │               └── 0x00000003[pe_2] -> ../../../../../des/0x00000003    
        ├── interconnnect                                                        
        │   └── 0                                                                
        │       └── hertz                                                        
        │           └── 0                                                        
        │               ├── 0x0000A008[A008_de] -> ../../../../../des/0x0000A008 
        │               └── 0x0000A00B[] -> ../../../../../des/0x0000A00B        
        ├── mem_cntrl                                                            
        │   └── 0                                                                
        │       ├── bps                                                          
        │       │   └── 0                                                        
        │       │       └── 0x0000A00A[] -> ../../../../../des/0x0000A00A        
        │       ├── celsius                                                      
        │       │   └── 0                                                        
        │       │       └── 0x0000A007[DRAM_temp] -> ../../../../../des/0x0000A007
        │       └── joules                                                       
        │           └── 0                                                        
        │               └── 0x0000A002[DRAM_energy] -> ../../../../../des/0x0000A002
        ├── periph                                                               
        │   ├── 0                                                                
        │   │   └── messages                                                     
        │   │       └── 0                                                        
        │   │           └── 0x00000016[device_16] -> ../../../../../des/0x00000016
        │   ├── 1                                                                
        │   │   └── messages                                                     
        │   │       └── 0                                                        
        │   │           └── 0x00000017[device_17] -> ../../../../../des/0x00000017
        │   └── 2                                                                
        │       └── messages                                                     
        │           └── 0                                                        
        │               └── 0x00000018[device_18] -> ../../../../../des/0x00000018
        └── unspec                                                               
                └── 0                                                            
                    ├── celsius                                                  
                    │   └── 0                                                    
                    │       └── 0x0000A005[] -> ../../../../../des/0x0000A005    
                    ├── counts                                                   
                    │   └── 0                                                    
                    │       └── 0x0000A00C[] -> ../../../../../des/0x0000A00C    
                    ├── joules                                                   
                    │   └── 0                                                    
                    │       ├── 0x0000A000[SOC_Energy] -> ../../../../../des/0x0000A000
                    │       └── 0x0000A001[] -> ../../../../../des/0x0000A001    
                    └── state                                                    
                        └── 0                                                    
                            └── 0x0000A010[] -> ../../../../../des/0x0000A010    
                                                                                 
  ...so as to provide the human user with a more understandable topological
  layout of the madness...

All of this is nice and fancy human-readable, easily scriptable, but
certainly not the fastest possible to access especially on huge trees...

 ... so for the aforementioned reasons we alternatively expose

 3. a more performant API based on IOCTLs as described fully in:

	include/uapi/linux/scmi.h

   As described succinctly in the above UAPI header too, this API is meant
   to be called on a few special files named 'control' that are populated
   into the tree:

   .
   |-- all_des_enable
   .....
   |-- components
   |   |-- cpu
   |   |-- interconnnect
   |   |-- mem_cntrl
   |   |-- periph
   |   `-- unspec
   |-- control
   .....................

   |-- groups
   |   |-- 0
   |   |   |-- available_update_intervals_ms
   |   |   |-- composing_des
   |   |   |-- control
   .....................
   |   |-- 1
   |   |   |-- available_update_intervals_ms
   |   |   |-- composing_des
   |   |   |-- control
   .....................
   |   `-- 2
   |       |-- available_update_intervals_ms
   |       |-- composing_des
   |       |-- control
   .....................

  This allows a tool to:

   - use some IOCTLs to configure a set of properties equivalent to the
     ones above in FS
   - use some other IOCTLs for direct access to data in binary format
     for a single DE or all of them

 4. [FUTURE/NOT IN THIS SERIES]
    Add another alternative, completely binary, direct raw access interface
    via a new set of memory mappable special files so as to allow userspace
    tools to access SCMI Telemetry data directly in binary form without any
    kernel mediation.

NOTE THAT this series, at the firmware interface level NOW supports ONLY
the latest SCMI v4.0 specification [0].

Missing feats & future steps
----------------------------
 - add direct access interface via mmap-able 'raw' files
 - add streaming mode interface via 'pipe' file (tentative)
 - evolve/enhance app in tools/testing/scmi/stlm to be interactive

KNOWN ISSUES
------------
 - STLMFS code layout and location...nothing lives in fs/ and no distinct
   FS Kconfig...but the SCMI Telemetry driver itself has no point in existing
   without the FS that exposes...so should I split the pure FS part into fs/
   anyway or not ?
 - residual sparse/smatch static analyzers errors
 - stlm tool utility is minimal for testing or development

Based on V7.1-rc7, tested on an emulated setup.

This series is available also at [2].

If you are still reading... any feedback is welcome :P

Thanks,
Cristian

----
v3 --> v4
 - rebased on v7.1-rc7
 - updated doc to detail Concurrency model
 - bail out on FW_BUG errors
 - make all_des_enable/all_des_tstamp_enable entry readable
 - refactored access to TDE values
 - refactored common accessors for tlm_priv (FIX WARN on kfree)
 - make all files by default world readable and user writable (if needed)
 - added uid/gid/umask mount options (and docs)
 - added generation counter to aid spotting config changes (and docs)
 - added DebugFS configurable support to debug/dump SHMTI areas (and docs)
 - hide FS entries when NOT supported (like des_simple_sample_read)
 - fixed output format of des/<NNN>/value to -> <TS> <VALUE>
 - renamed top-dir by_components to by-components
 - add a .remove method to SCMI System Telemetry Driver
 - use kzalloc_obj
V2 --> V3
 - rebased on v7.0-rc5
 - ported the firmware interface to SCMI v4.0 BETA
 - split the SCMI protocol layer in a lot of small patches
 - completed filesystem and ABI documentation
 - renamed components subtree to by_components
 - fixed uninitialized var in scmi_telemetry_de_subdir_symlink
 - renamed tstamp_exp to tstamp_rate
 - swap logic in scmi_telemetry_initial_state_lookup
 - use memcpy_from_le32 where required
 - changed a few dev_err into Telemetry traces
 - define and use new helper scmi_telemetry_de_unlink
 - simplify a few assignments with ternary ops
 - added a missing __must_check on the internal SCMI API
 - reworked and clarified de_data_read returned errno:
 	ENODATA vs EINVAL vs ENODEV/ENOENT
 - removed some risky/unneeded devres allocations
 - various checkpatch fixes
 - reworked and clarified usage of traces in Telemetry
 - added the missing DT binding for protocol 0x1B
 - split out unrelated change around notification from patch
   adding support for protocol internal notifier
 - more comments

V1 --> V2
 - rebased on v6.19-rc3
 - harden TDCF shared memory areas accesses by using proper accessors
 - reworked protocol resources lifecycle to allow lazy enumeration
 - using NEW FS mount API
 - reworked FS inode allocation to use a std kmem_cache
 - fixed a few IOCTLs support routine to support lazy enumeration
 - added (RFC) a new FS lazy mount option to support lazy population of
   some subtrees of the FS (des/ groups/ components/)
 - reworked implementation of components/ alternative FS view to use
   symlinks instead of hardlinks
 - added a basic simple (RFC) testing tool to exercise UAPI ioctls interface
 - hardened Telemetry protocol and driver to support partial out-of-spec FW
   lacking some cmds (best effort)
 - reworked probing races handling
 - reviewed behaviour on unmount/unload
 - added support for Boot_ON Telemetry by supporting SCMI Telemetry cmds:
   + DE_ENABLED_LIST
   + CONFIG_GET
 - added FS and ABI docs

RFC --> V1
---
 - moved from SysFS/chardev to a full fledged FS
 - added support for SCMI Telemetry BLK timestamps


Thanks,
Cristian

[0]: https://developer.arm.com/documentation/den0056/f/?lang=en
[1]: https://lore.kernel.org/arm-scmi/20250620192813.2463367-1-cristian.marussi@arm.com/
[2]: https://git.kernel.org/pub/scm/linux/kernel/git/cris/linux.git/log/?h=scmi_telemetry_unified_fs_V4

Cristian Marussi (31):
  firmware: arm_scmi: Add new SCMIv4.0 error codes definitions
  firmware: arm_scmi: Reduce the scope of protocols mutex
  firmware: arm_scmi: Allow registration of unknown-size events/reports
  firmware: arm_scmi: Allow protocols to register for notifications
  uapi: Add ARM SCMI definitions
  dt-bindings: firmware: arm,scmi: Add support for telemetry protocol
  include: trace: Add Telemetry trace events
  firmware: arm_scmi: Add basic Telemetry support
  firmware: arm_scmi: Add support to parse SHMTIs areas
  firmware: arm_scmi: Add Telemetry configuration operations
  firmware: arm_scmi: Add Telemetry DataEvent read capabilities
  firmware: arm_scmi: Add support for Telemetry reset
  firmware: arm_scmi: Add Telemetry notification support
  firmware: arm_scmi: Add support for boot-on Telemetry
  firmware: arm_scmi: Add Telemetry generation counter
  firmware: arm_scmi: Add common per-protocol debugfs support
  firmware: arm_scmi: Add Telemetry debugfs SHMTI dump support
  firmware: arm_scmi: Add Telemetry debugfs ABI documentation
  firmware: arm_scmi: stlmfs: Add System Telemetry filesystem driver
  fs/stlmfs: Document ARM SCMI Telemetry filesystem
  firmware: arm_scmi: stlmfs: Add basic mount options
  fs/stlmfs: Document ARM SCMI Telemetry FS mount options
  firmware: arm_scmi: stlmfs: Add ioctls support
  fs/stlmfs: Document alternative ioctl based binary interface
  firmware: arm_scmi: stlmfs: Add by-components view
  fs/stlmfs: Document alternative topological view
  firmware: arm_scmi: stlmfs: Add generation file
  [RFC] docs: stlmfs: Document ARM SCMI Telemetry FS ABI
  firmware: arm_scmi: stlmfs: Add lazy population support
  fs/stlmfs: Document lazy mode and related mount option
  [RFC] tools/scmi: Add SCMI Telemetry testing tool

 Documentation/ABI/testing/debugfs-scmi        |   22 +
 Documentation/ABI/testing/stlmfs              |  348 ++
 .../bindings/firmware/arm,scmi.yaml           |    8 +
 Documentation/filesystems/stlmfs.rst          |  342 ++
 MAINTAINERS                                   |    1 +
 drivers/firmware/arm_scmi/Kconfig             |   24 +
 drivers/firmware/arm_scmi/Makefile            |    3 +-
 drivers/firmware/arm_scmi/common.h            |   10 +
 drivers/firmware/arm_scmi/driver.c            |   93 +-
 drivers/firmware/arm_scmi/notify.c            |   30 +-
 drivers/firmware/arm_scmi/notify.h            |    8 +-
 drivers/firmware/arm_scmi/protocols.h         |   13 +
 .../firmware/arm_scmi/scmi_system_telemetry.c | 3146 ++++++++++++++++
 drivers/firmware/arm_scmi/telemetry.c         | 3300 +++++++++++++++++
 include/linux/scmi_protocol.h                 |  203 +-
 include/trace/events/scmi.h                   |   48 +-
 include/uapi/linux/scmi.h                     |  289 ++
 tools/testing/scmi/Makefile                   |   25 +
 tools/testing/scmi/stlm.c                     |  434 +++
 19 files changed, 8307 insertions(+), 40 deletions(-)
 create mode 100644 Documentation/ABI/testing/stlmfs
 create mode 100644 Documentation/filesystems/stlmfs.rst
 create mode 100644 drivers/firmware/arm_scmi/scmi_system_telemetry.c
 create mode 100644 drivers/firmware/arm_scmi/telemetry.c
 create mode 100644 include/uapi/linux/scmi.h
 create mode 100644 tools/testing/scmi/Makefile
 create mode 100644 tools/testing/scmi/stlm.c

-- 
2.54.0



^ permalink raw reply

* Re: [PATCH v14 10/44] arm64: RMI: Add support for SRO
From: Dan Williams (nvidia) @ 2026-06-12 23:07 UTC (permalink / raw)
  To: Steven Price, Gavin Shan, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <50d70588-2ebc-4c9b-98ec-68f3d04a9d21@arm.com>

Steven Price wrote:
[..]
> > alloc_pages_exact() will fail if the requested size exceeds the maximal
> > allowed
> > size (1 << MAX_PAGE_ORDER). The maximal size is usually smaller than
> > PUD_SIZE
> > but PUD_SIZE is allowed by the RMM.
> 
> This is an area where to be honest I'm really not sure what to do.
> Technically the RMM is allowed to ask for a contiguous range of 512GB
> pages (on a 4K system - larger with larger page sizes) - but clearly no
> real OS is going to be able to provide anything like that.
> 
> In practise we don't expect the RMM to do anything so crazy. It's not
> really clear to be whether even 2MB (PMD_SIZE) is needed. But the spec
> is written to be generic.
> 
> So my current approach is to calculate the required size and pass it
> into alloc_pages_exact(). For "stupidly large" values this will fail and
> Linux just doesn't support an RMM which attempts this. If there is ever
> a usecase which needs this then we'd need to find a different method of
> providing the memory (most likely some form of carveout to avoid
> fragmentation). But my view is we should wait for that usecase to be
> identified first.

Just some comparison comments as I am also going through the TDX patches
which enable "Extension SEAMCALLs". These new SEAMCALLs are similar to
the SRO mechanism [1].

TDX asks for an upfront delegation of memory at init time using
alloc_contig_pages() that is never returned until the entire module is
shut down. alloc_contig_pages() is not subject to the MAX_ORDER limit,
but not sure that alloc_contig_pages() is suitable for small+dynamic
runtime memory add / release that SRO potentially wants to do?

Does SRO always balance the size of RMI_OP_MEM_REQ_DONATE with
RMI_OP_MEM_REQ_RECLAIM, or might some donate requests be a one way
donation like TDX? Just poking to see if there is a path to preallocate
a pool vs the fine grained per-operation alloc/free.

[1]: http://lore.kernel.org/20260522034128.3144354-3-yilun.xu@linux.intel.com


^ permalink raw reply

* [GIT PULL] Qualcomm clock updates for v7.2
From: Bjorn Andersson @ 2026-06-12 22:48 UTC (permalink / raw)
  To: Stephen Boyd, linux-clk
  Cc: linux-arm-msm, linux-arm-kernel, Vivek Aknurwar, Luca Weiss,
	Jagadeesh Kona, Krzysztof Kozlowski, Luo Jie, Bartosz Golaszewski,
	Kathiravan Thirumoorthy, Alexander Koskovich, Biswapriyo Nath,
	Konrad Dybcio, Phillip Varney


The following changes since commit 254f49634ee16a731174d2ae34bc50bd5f45e731:

  Linux 7.1-rc1 (2026-04-26 14:19:00 -0700)

are available in the Git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux.git tags/qcom-clk-for-7.2

for you to fetch changes up to e108373c54fbc844b7f541c6fd7ecb31772afd3c:

  clk: qcom: regmap-phy-mux: Rework the implementation (2026-06-08 09:17:24 -0500)

----------------------------------------------------------------
Qualcomm clock updates for v7.2

Introduce global, TCSR, and RPMh clock controllers for the Hawi mobile
SoC.

Introduce GX clock for Milos, and ensure that camera clock controller
votes for interconnect bandwidth in order to ensure the TOP_GDSC can be
turned on.

Introduce camera and video clock controllers for Hamoa and Purwa. Reduce
the max_register of the display clock controller to avoid regmap
attempting to dump protected registers.

Introduce global clock controller for the IPQ9650 SoC and add IPQ5332
support to the cmnpll driver.

Add missing USB2 PHY reset to the Nord NegCC.

Rework the PHY mux clock implementation as necessary for upcoming USB4
support.

----------------------------------------------------------------
Alexander Koskovich (1):
      clk: qcom: clk-rpmh: Make all VRMs optional

Bartosz Golaszewski (2):
      dt-bindings: clock: qcom: add the definition for the USB2 PHY reset
      clk: qcom: nord: negcc: add support for the USB2 PHY reset

Biswapriyo Nath (1):
      dt-bindings: clock: qcom,sm6125-dispcc: reference qcom,gcc.yaml

Bjorn Andersson (2):
      Merge branch '20260507-ipq9650_boot_to_shell-v3-1-62742b49c991@oss.qualcomm.com' into clk-for-7.2
      Merge branch '20260106-qcom_ipq5332_cmnpll-v2-2-f9f7e4efbd79@oss.qualcomm.com' into clk-for-7.2

Jagadeesh Kona (5):
      dt-bindings: clock: qcom: Add X1P42100 video clock controller
      dt-bindings: clock: qcom: Add X1P42100 camera clock controller
      clk: qcom: videocc-x1p42100: Add support for video clock controller
      clk: qcom: camcc-x1e80100: Add support for camera QDSS debug clocks
      clk: qcom: camcc-x1p42100: Add support for camera clock controller

Kathiravan Thirumoorthy (2):
      dt-bindings: clock: add Qualcomm IPQ9650 GCC
      clk: qcom: add Global Clock controller (GCC) driver for IPQ9650 SoC

Konrad Dybcio (1):
      clk: qcom: regmap-phy-mux: Rework the implementation

Krzysztof Kozlowski (3):
      clk: qcom: dispcc-x1e80100: Fix (possibly) dumping regmap
      clk: qcom: Constify qcom_cc_driver_data and list of critical CBCR registers
      dt-bindings: clock: qcom,kaanapali-gxclkctl: Correctly use additionalProperties

Luca Weiss (6):
      dt-bindings: clock: qcom: document the Milos GX clock controller
      clk: qcom: Add support for GXCLK for Milos
      interconnect: Add devm_of_icc_get_by_index() as exported API for users
      dt-bindings: clock: qcom,milos-camcc: Document interconnect path
      clk: qcom: gdsc: Support enabling interconnect path for power domain
      clk: qcom: camcc-milos: Declare icc path dependency for CAMSS_TOP_GDSC

Luo Jie (3):
      dt-bindings: clock: qcom: Add CMN PLL support for IPQ5332 SoC
      clk: qcom: cmnpll: Account for reference clock divider
      clk: qcom: cmnpll: Add IPQ5332 SoC support

Phillip Varney (1):
      clk: qcom: a53: Corrected frequency multiplier for 1152MHz

Vivek Aknurwar (7):
      dt-bindings: clock: qcom-rpmhcc: Add RPMHCC bindings for Hawi
      dt-bindings: clock: qcom: Add Hawi TCSR clock controller
      dt-bindings: clock: qcom: Add Hawi global clock controller
      clk: qcom: rpmh: Add support for Hawi RPMH clocks
      clk: qcom: Add Hawi TCSR clock controller driver
      clk: qcom: clk-alpha-pll: Add support for Taycan EHA_T PLL
      clk: qcom: Add support for global clock controller on Hawi

 .../bindings/clock/qcom,dispcc-sm6125.yaml         |   17 +-
 .../devicetree/bindings/clock/qcom,hawi-gcc.yaml   |   63 +
 .../bindings/clock/qcom,ipq9574-cmn-pll.yaml       |    1 +
 .../bindings/clock/qcom,ipq9650-gcc.yaml           |   68 +
 .../bindings/clock/qcom,kaanapali-gxclkctl.yaml    |    2 +-
 .../bindings/clock/qcom,milos-camcc.yaml           |    8 +
 .../bindings/clock/qcom,milos-gxclkctl.yaml        |   61 +
 .../devicetree/bindings/clock/qcom,rpmhcc.yaml     |    1 +
 .../bindings/clock/qcom,sm8450-videocc.yaml        |    3 +
 .../bindings/clock/qcom,sm8550-tcsr.yaml           |    2 +
 .../bindings/clock/qcom,x1e80100-camcc.yaml        |    1 +
 drivers/clk/qcom/Kconfig                           |   48 +
 drivers/clk/qcom/Makefile                          |    7 +-
 drivers/clk/qcom/a53-pll.c                         |    2 +-
 drivers/clk/qcom/camcc-milos.c                     |    7 +
 drivers/clk/qcom/camcc-x1e80100.c                  |   64 +
 drivers/clk/qcom/camcc-x1p42100.c                  | 2223 ++++++++++++
 drivers/clk/qcom/clk-alpha-pll.h                   |    6 +
 drivers/clk/qcom/clk-regmap-phy-mux.c              |   52 +-
 drivers/clk/qcom/clk-rpmh.c                        |   41 +-
 drivers/clk/qcom/dispcc-x1e80100.c                 |    2 +-
 drivers/clk/qcom/gcc-hawi.c                        | 3657 ++++++++++++++++++++
 drivers/clk/qcom/gcc-ipq9650.c                     | 3445 ++++++++++++++++++
 drivers/clk/qcom/gcc-nord.c                        |    2 +-
 drivers/clk/qcom/gdsc.c                            |   33 +
 drivers/clk/qcom/gdsc.h                            |    5 +
 drivers/clk/qcom/gpucc-sm8750.c                    |    4 +-
 drivers/clk/qcom/gxclkctl-kaanapali.c              |    1 +
 drivers/clk/qcom/ipq-cmn-pll.c                     |   30 +-
 drivers/clk/qcom/negcc-nord.c                      |    3 +-
 drivers/clk/qcom/nwgcc-nord.c                      |    4 +-
 drivers/clk/qcom/segcc-nord.c                      |    2 +-
 drivers/clk/qcom/tcsrcc-hawi.c                     |  158 +
 drivers/clk/qcom/videocc-x1p42100.c                |  585 ++++
 drivers/interconnect/core.c                        |   20 +
 include/dt-bindings/clock/qcom,hawi-gcc.h          |  253 ++
 include/dt-bindings/clock/qcom,hawi-tcsrcc.h       |   16 +
 include/dt-bindings/clock/qcom,ipq5332-cmn-pll.h   |   19 +
 include/dt-bindings/clock/qcom,ipq9650-gcc.h       |  172 +
 include/dt-bindings/clock/qcom,nord-negcc.h        |    1 +
 include/dt-bindings/clock/qcom,rpmh.h              |    2 +
 include/dt-bindings/clock/qcom,x1e80100-camcc.h    |    3 +
 include/dt-bindings/clock/qcom,x1p42100-videocc.h  |   48 +
 include/dt-bindings/reset/qcom,ipq9650-gcc.h       |  215 ++
 include/linux/interconnect.h                       |    6 +
 45 files changed, 11313 insertions(+), 50 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,hawi-gcc.yaml
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,ipq9650-gcc.yaml
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,milos-gxclkctl.yaml
 create mode 100644 drivers/clk/qcom/camcc-x1p42100.c
 create mode 100644 drivers/clk/qcom/gcc-hawi.c
 create mode 100644 drivers/clk/qcom/gcc-ipq9650.c
 create mode 100644 drivers/clk/qcom/tcsrcc-hawi.c
 create mode 100644 drivers/clk/qcom/videocc-x1p42100.c
 create mode 100644 include/dt-bindings/clock/qcom,hawi-gcc.h
 create mode 100644 include/dt-bindings/clock/qcom,hawi-tcsrcc.h
 create mode 100644 include/dt-bindings/clock/qcom,ipq5332-cmn-pll.h
 create mode 100644 include/dt-bindings/clock/qcom,ipq9650-gcc.h
 create mode 100644 include/dt-bindings/clock/qcom,x1p42100-videocc.h
 create mode 100644 include/dt-bindings/reset/qcom,ipq9650-gcc.h


^ permalink raw reply

* [PATCH v4 31/31] [RFC] tools/scmi: Add SCMI Telemetry testing tool
From: Cristian Marussi @ 2026-06-12 22:38 UTC (permalink / raw)
  To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
	linux-doc
  Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
	etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
	elif.topuz, lukasz.luba, philip.radford, brauner,
	souvik.chakravarty, leitao, kas, puranjay, usama.arif,
	kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>

Add a testing tool that exercises the SCMI ioctls UAPI interface: as of
now the tool simply queries the initial state of the SCMI Telemetry
subsystem, tries to enable all the existent Data Events and dumps all
the Telemetry data.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v3 --> v4
 - added generation file support

Basic implementation just to exercise a few IOCTLs: to be refined and
extended to support a more interactive usage.
---
 tools/testing/scmi/Makefile |  25 +++
 tools/testing/scmi/stlm.c   | 434 ++++++++++++++++++++++++++++++++++++
 2 files changed, 459 insertions(+)
 create mode 100644 tools/testing/scmi/Makefile
 create mode 100644 tools/testing/scmi/stlm.c

diff --git a/tools/testing/scmi/Makefile b/tools/testing/scmi/Makefile
new file mode 100644
index 000000000000..a6a101f8398b
--- /dev/null
+++ b/tools/testing/scmi/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CC?=$(CROSS_COMPILE)gcc
+OBJS = stlm.o
+
+CFLAGS=-Wall -static -std=gnu11 -I ../../../include/uapi/
+ifneq ($(DEBUG), )
+	CFLAGS+=-O0 -g -ggdb
+else
+	CFLAGS+=-static
+endif
+
+all: stlm
+
+stlm: $(OBJS)
+	$(CC) $(CFLAGS) $^ -o $@
+
+%.o: %.c
+	$(CC) $(CFLAGS) -c $<
+
+clean:
+	rm -f *.o
+	rm -f stlm
+
+.PHONY: clean
diff --git a/tools/testing/scmi/stlm.c b/tools/testing/scmi/stlm.c
new file mode 100644
index 000000000000..f153b6e6a4cd
--- /dev/null
+++ b/tools/testing/scmi/stlm.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <unistd.h>
+
+#include <linux/scmi.h>
+
+#define SLEEP_MS	3000
+#define DEF_TLM_ROOT	"/sys/fs/arm_telemetry/"
+
+#define IOCTL_ERR_STR(_ioctl)	"IOCTL:" #_ioctl
+
+struct tlm_de {
+	struct scmi_tlm_de_info *info;
+	struct scmi_tlm_de_config cfg;
+	struct scmi_tlm_de_sample sample;
+};
+
+struct tlm_group {
+	int fd;
+	struct scmi_tlm_grp_info *info;
+	struct scmi_tlm_grp_desc *desc;
+	struct scmi_tlm_intervals *ivs;
+};
+
+struct tlm_state {
+	int dfd;
+	int fd;
+	int g_dfd;
+	const char *path;
+	struct scmi_tlm_base_info info;
+	struct scmi_tlm_config cfg;
+	struct scmi_tlm_intervals *ivs;
+	unsigned int num_des;
+	struct tlm_de *des;
+	unsigned int num_groups;
+	struct tlm_group *grps;
+};
+
+static inline void dump_state(struct tlm_state *st)
+{
+	uint32_t *uuid32 = st->info.de_impl_version;
+	uint16_t *uuid16 = (uint16_t *)&st->info.de_impl_version[1];
+
+	fprintf(stdout, "- SYSTEM TELEMETRY @instance: %s\n\n", st->path);
+	fprintf(stdout, "+ Version: 0x%08X\n", st->info.version);
+	fprintf(stdout, "+ DEs#: %d\n", st->info.num_des);
+	fprintf(stdout, "+ GRPS#: %d\n", st->info.num_groups);
+	fprintf(stdout, "+ INTRV#: %d\n", st->info.num_intervals);
+
+	fprintf(stdout, "+ UUID: ");
+	fprintf(stdout, "%X-", uuid32[0]);
+	fprintf(stdout, "%X-", uuid16[0]);
+	fprintf(stdout, "%X-", uuid16[1]);
+	fprintf(stdout, "%X", uuid16[2]);
+	fprintf(stdout, "%X\n", uuid32[3]);
+
+	fprintf(stdout, "\n+ TLM_ENABLED: %d\n", st->cfg.enable);
+	fprintf(stdout, "+ CURRENT_UPDATE_INTERVAL: %d\n",
+		st->cfg.current_update_interval);
+
+	fprintf(stdout, "+ Found #%u Global Update Intervals\n",
+		st->info.num_intervals);
+	for (int i = 0; i < st->ivs->num_intervals; i++)
+		fprintf(stdout, "\t[%d]::%u\n", i, st->ivs->update_intervals[i]);
+
+	if (st->info.num_des != st->num_des) {
+		fprintf(stdout, "\n++++++ DES NOT FULLY_ENUMERATED ++++++\n");
+		fprintf(stdout, "+++ DECLARED:%u  ENUMERATED:%u +++\n",
+			st->info.num_des, st->num_des);
+	}
+
+	fprintf(stdout, "\n+ Found #%d DEs:\n", st->num_des);
+	for (int i = 0; i < st->num_des; i++)
+		fprintf(stdout, "\t0x%08X %s %s -- TS:%16llu %016llX\n",
+			st->des[i].info->id,
+			st->des[i].cfg.enable ? "ON" : "--",
+			st->des[i].cfg.t_enable ? "TS_ON" : "-----",
+			st->des[i].sample.tstamp, st->des[i].sample.val);
+	fprintf(stdout, "\n");
+
+	fprintf(stdout, "+ Found %d GRPs: ", st->num_groups);
+	for (int i = 0; i < st->num_groups; i++) {
+		fprintf(stdout, "\n\tGRP_ID:%d  DES#:%d  INTRVS#:%d\n",
+			st->grps[i].info->id, st->grps[i].info->num_des,
+			st->grps[i].info->num_intervals);
+
+		fprintf(stdout, "\tCOMPOSING_DES:");
+		for (int j = 0; j < st->grps[i].desc->num_des; j++)
+			fprintf(stdout, "0x%08X ",
+				st->grps[i].desc->composing_des[j]);
+		fprintf(stdout, "\n");
+	}
+}
+
+static int discover_base_info(int fd, struct scmi_tlm_base_info *info)
+{
+	int ret;
+
+	ret = ioctl(fd, SCMI_TLM_GET_INFO, info);
+	if (ret) {
+		perror(IOCTL_ERR_STR(SCMI_TLM_GET_INFO));
+		return ret;
+	}
+
+	return ret;
+}
+
+static struct scmi_tlm_des_list *scmi_get_des_list(int fd, int num_des)
+{
+	struct scmi_tlm_des_list *dsl;
+	size_t size = sizeof(*dsl) + num_des * sizeof(dsl->des[0]);
+	int ret;
+
+	dsl = malloc(size);
+	if (!dsl)
+		return NULL;
+
+	bzero(dsl, size);
+	dsl->num_des = num_des;
+	ret = ioctl(fd, SCMI_TLM_GET_DE_LIST, dsl);
+	if (ret) {
+		perror(IOCTL_ERR_STR(SCMI_TLM_GET_DE_LIST));
+		return NULL;
+	}
+
+	return dsl;
+}
+
+static struct tlm_de *enumerate_des(struct tlm_state *st)
+{
+	struct scmi_tlm_des_list *dsl;
+	struct tlm_de *des;
+
+	dsl = scmi_get_des_list(st->fd, st->info.num_des);
+	if (!dsl)
+		return NULL;
+
+	st->num_des = dsl->num_des;
+	des = malloc(sizeof(*des) * st->num_des);
+	if (!des)
+		return NULL;
+
+	bzero(des, sizeof(*des) * st->num_des);
+	for (int i = 0; i < st->num_des; i++) {
+		struct tlm_de *de = &des[i];
+		int ret;
+
+		de->info = &dsl->des[i];
+		de->cfg.id = de->info->id;
+		ret = ioctl(st->fd, SCMI_TLM_GET_DE_CFG, &de->cfg);
+		if (ret) {
+			perror(IOCTL_ERR_STR(SCMI_TLM_GET_DE_CFG));
+			continue;
+		}
+
+		if (!de->cfg.enable)
+			continue;
+
+		/* Collect initial sample */
+		de->sample.id = de->info->id;
+		ret = ioctl(st->fd, SCMI_TLM_GET_DE_VALUE, &de->sample);
+		if (ret) {
+			perror(IOCTL_ERR_STR(SCMI_TLM_GET_DE_VALUE));
+			continue;
+		}
+	}
+
+	return des;
+}
+
+static int get_current_config(int fd, struct scmi_tlm_config *cfg)
+{
+	int ret;
+
+	ret = ioctl(fd, SCMI_TLM_GET_CFG, cfg);
+	if (ret) {
+		perror(IOCTL_ERR_STR(SCMI_TLM_GET_CFG));
+		return ret;
+	}
+
+	return ret;
+}
+
+static struct scmi_tlm_grps_list *scmi_get_grps_list(int fd, int num_groups)
+{
+	struct scmi_tlm_grps_list *gsl;
+	size_t size = sizeof(*gsl) + num_groups * sizeof(gsl->grps[0]);
+	int ret;
+
+	gsl = malloc(size);
+	if (!gsl)
+		return NULL;
+
+	bzero(gsl, size);
+	gsl->num_grps = num_groups;
+	ret = ioctl(fd, SCMI_TLM_GET_GRP_LIST, gsl);
+	if (ret) {
+		perror(IOCTL_ERR_STR(SCMI_TLM_GET_GRP_LIST));
+		return NULL;
+	}
+
+	return gsl;
+}
+
+static struct scmi_tlm_intervals *enumerate_intervals(int fd, int num_intervals)
+{
+	struct scmi_tlm_intervals *ivs;
+	size_t sz;
+	int ret;
+
+	sz = sizeof(*ivs) + sizeof(*ivs->update_intervals) * num_intervals;
+	ivs = malloc(sz);
+	if (!ivs)
+		return NULL;
+
+	memset(ivs, 0, sz);
+
+	ivs->num_intervals = num_intervals;
+	ret = ioctl(fd, SCMI_TLM_GET_INTRVS, ivs);
+	if (ret) {
+		perror(IOCTL_ERR_STR(SCMI_TLM_GET_INTRVS));
+		free(ivs);
+		return NULL;
+	}
+
+	return ivs;
+}
+
+static struct tlm_group *enumerate_groups(struct tlm_state *st)
+{
+	struct scmi_tlm_grps_list *gsl;
+	struct tlm_group *grps;
+
+	gsl = scmi_get_grps_list(st->fd, st->info.num_groups);
+	if (!gsl)
+		return NULL;
+
+	st->g_dfd = openat(st->dfd, "groups", O_RDONLY);
+	if (st->g_dfd < 0)
+		return NULL;
+
+	st->num_groups = gsl->num_grps;
+	grps = malloc(sizeof(*grps) * st->num_groups);
+	if (!grps)
+		return NULL;
+
+	bzero(grps, sizeof(*grps) * st->num_groups);
+	for (int i = 0; i < st->num_groups; i++) {
+		struct tlm_group *grp = &grps[i];
+		char gctrl[32];
+		size_t size;
+		int ret;
+
+		snprintf(gctrl, 32, "%d/control", i);
+		grp->fd = openat(st->g_dfd, gctrl, O_RDWR);
+		if (grp->fd < 0)
+			return NULL;
+
+		grp->info = &gsl->grps[i];
+		size = sizeof(*grp->desc) + sizeof(uint32_t) * grp->info->num_des;
+		grp->desc = malloc(size);
+		if (!grp->desc)
+			return NULL;
+
+		bzero(grp->desc, size);
+		grp->desc->num_des = grp->info->num_des;
+		ret = ioctl(grp->fd, SCMI_TLM_GET_GRP_DESC, grp->desc);
+		if (ret) {
+			perror(IOCTL_ERR_STR(SCMI_TLM_GET_GRP_DESC));
+			continue;
+		}
+
+		grp->ivs = enumerate_intervals(grp->fd, grp->info->num_intervals);
+	}
+
+	return grps;
+}
+
+static int get_tlm_state(const char *path, struct tlm_state *st)
+{
+	int ret;
+
+	st->dfd = open(path, O_RDONLY);
+	if (st->dfd < 0) {
+		perror("open");
+		return st->dfd;
+	}
+
+	st->fd = openat(st->dfd, "control", O_RDWR);
+	if (st->fd < 0) {
+		perror("openat");
+		return st->fd;
+	}
+
+	ret = discover_base_info(st->fd, &st->info);
+	if (ret)
+		return ret;
+
+	st->ivs = enumerate_intervals(st->fd, st->info.num_intervals);
+	if (!st->ivs)
+		return -1;
+
+	ret = get_current_config(st->fd, &st->cfg);
+	if (ret)
+		return ret;
+
+	if (st->info.num_des)
+		st->des = enumerate_des(st);
+
+	if (st->info.num_groups)
+		st->grps = enumerate_groups(st);
+
+	st->path = path;
+
+	return 0;
+}
+
+#define MAX_GENERATIONS		5
+
+static void get_tlm_generation(struct tlm_state *st)
+{
+	int fd, i = 0;
+	struct pollfd pfds[1];
+
+	fd = openat(st->dfd, "generation", O_RDONLY);
+	if (fd < 0) {
+		perror("openat");
+		return ;
+	}
+
+	pfds[0].fd = fd;
+	pfds[0].events = POLLIN;
+
+	do {
+		int ret;
+
+		pfds[0].revents = 0;
+		ret = poll(pfds, 1, -1);
+		if (ret < 0 ) {
+			perror("poll generation");
+			break;;
+		}
+
+		if (!pfds[0].revents)
+			continue;
+
+		if (pfds[0].revents & POLLIN) {
+			int n;
+			char buf[32] = {};
+
+			n = read(fd, buf, 32);
+			if (n < 0) {
+				perror("read generation");
+				break;
+			}
+
+			fprintf(stdout, "Generation[%u]: %s\n", i, buf);
+		}
+	} while (i++ < MAX_GENERATIONS);
+
+	close(fd);
+}
+
+int main(int argc, char **argv)
+{
+	const char *tlm_root_instance = DEF_TLM_ROOT "tlm_0/";
+	struct scmi_tlm_data_read *bulk;
+	struct scmi_tlm_de_config de_cfg = {};
+	struct tlm_state st = {};
+	size_t bulk_sz;
+	int ret;
+
+	ret = get_tlm_state(tlm_root_instance, &st);
+	if (ret)
+		return ret;
+
+	dump_state(&st);
+
+	get_tlm_generation(&st);
+
+	bulk_sz = sizeof(*bulk) + sizeof(bulk->samples[0]) * st.info.num_des;
+	bulk = malloc(bulk_sz);
+	if (!bulk)
+		return -1;
+
+	bzero(bulk, bulk_sz);
+	bulk->num_samples = st.info.num_des;
+	ret = ioctl(st.fd, SCMI_TLM_SINGLE_SAMPLE, bulk);
+	if (ret) {
+		perror(IOCTL_ERR_STR(SCMI_TLM_SINGLE_SAMPLE));
+		return -1;
+	}
+
+	fprintf(stdout, "\n--- Enabling ALL DEs with timestamp...\n");
+	de_cfg.enable = 1;
+	de_cfg.t_enable = 1;
+	ret = ioctl(st.fd, SCMI_TLM_SET_ALL_CFG, &de_cfg);
+	if (ret) {
+		perror(IOCTL_ERR_STR(SCMI_TLM_SET_ALL_CFG));
+		return ret;
+	}
+
+	fprintf(stdout, "\n- Single ASYNC read -\n-------------------\n");
+	for (int i = 0; i < bulk->num_samples; i++)
+		fprintf(stdout, "0x%08X %016llu %016llX\n",
+			bulk->samples[i].id, bulk->samples[i].tstamp,
+			bulk->samples[i].val);
+
+	bzero(bulk, bulk_sz);
+	bulk->num_samples = st.info.num_des;
+	ret = ioctl(st.fd, SCMI_TLM_BULK_READ, bulk);
+	if (ret) {
+		perror(IOCTL_ERR_STR(SCMI_TLM_BULK_READ));
+		return -1;
+	}
+
+	fprintf(stdout, "\n- BULK read -\n-------------------\n");
+	for (int i = 0; i < bulk->num_samples; i++)
+		fprintf(stdout, "0x%08X %016llu %016llX\n",
+			bulk->samples[i].id, bulk->samples[i].tstamp,
+			bulk->samples[i].val);
+
+	return 0;
+}
-- 
2.54.0



^ permalink raw reply related

* [PATCH v4 30/31] fs/stlmfs: Document lazy mode and related mount option
From: Cristian Marussi @ 2026-06-12 22:38 UTC (permalink / raw)
  To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
	linux-doc
  Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
	etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
	elif.topuz, lukasz.luba, philip.radford, brauner,
	souvik.chakravarty, leitao, kas, puranjay, usama.arif,
	kernel-team, Cristian Marussi, Jonathan Corbet, Shuah Khan
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>

Document optional lazy enumeration behaviour and related mount option.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: linux-doc@vger.kernel.org
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
 Documentation/filesystems/stlmfs.rst | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/stlmfs.rst b/Documentation/filesystems/stlmfs.rst
index b5b3cd649775..fe69d40f9249 100644
--- a/Documentation/filesystems/stlmfs.rst
+++ b/Documentation/filesystems/stlmfs.rst
@@ -74,8 +74,12 @@ Design
 STLMFS is a pseudo filesystem used to expose ARM SCMI Telemetry data
 discovered dynamically at run-time via SCMI.
 
-Inodes are all dynamically created at mount-time from a dedicated
-kmem_cache based on the gathered available SCMI Telemetry information.
+Normally all of the top-level files/inodes are dynamically created at
+mount-time from a dedicated kmem_cache based on the gathered available
+SCMI Telemetry information, but it is possible to enable a lazy enumeration
+and FS population mode that delays SCMI Telemetry resource enumeration
+and the related FS population until the moment a user steps into the related
+FS subdirectories: *des/*, *groups/* and *components/*.
 
 Since inodes represent the discovered Telemetry entities, which in turn are
 statically defined at the platform level and immutable throughout the same
@@ -128,6 +132,19 @@ Note that all of the above options are explicitly designed NOT to support
 a remount operation, so as not have surprising effects on permissions of
 already discovered/created telemetry files.
 
+It is possible to mount it in lazy-mode by using the *lazy* mount option::
+
+	mount -t stlmfs -o lazy none /sys/fs/arm_telemetry
+
+In this case, the des/, groups/ and components/ directories will be
+created empty at mount-time and only filled later when 'walked in'.
+
+This allows a user to benefit from a lazy enumeration scheme of the SCMI
+Telemetry resources by delaying such, usually expensive, message exchanges
+to the last possible moment: ideally forever, when using only those
+alternative binary interfaces that do not need any resource enumeration
+at all.
+
 Usage
 =====
 
-- 
2.54.0



^ permalink raw reply related

* [PATCH v4 29/31] firmware: arm_scmi: stlmfs: Add lazy population support
From: Cristian Marussi @ 2026-06-12 22:37 UTC (permalink / raw)
  To: linux-kernel, linux-arm-kernel, arm-scmi, linux-fsdevel,
	linux-doc
  Cc: sudeep.holla, james.quinlan, f.fainelli, vincent.guittot,
	etienne.carriere, peng.fan, michal.simek, d-gole, jic23,
	elif.topuz, lukasz.luba, philip.radford, brauner,
	souvik.chakravarty, leitao, kas, puranjay, usama.arif,
	kernel-team, Cristian Marussi
In-Reply-To: <20260612223802.1337232-1-cristian.marussi@arm.com>

Add a filesystem mount option to the SCMI Telemetry filesystem so as to
delay resource enumeration and the population of the related fs subtrees
until the last possible moment, i.e. when the related fs paths are accessed.

Only basic global fs entries are populated at mount time when the lazy
mount option is used.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
---
v3 --> v4
 - add lazy support to show_options
 - make FS entries world-Readable and user-Writable where applicable
 - added stlmfs tag in $SUBJECT
---
 .../firmware/arm_scmi/scmi_system_telemetry.c | 521 +++++++++++++++---
 1 file changed, 430 insertions(+), 91 deletions(-)

diff --git a/drivers/firmware/arm_scmi/scmi_system_telemetry.c b/drivers/firmware/arm_scmi/scmi_system_telemetry.c
index df45fc212e13..9cb8779d2b59 100644
--- a/drivers/firmware/arm_scmi/scmi_system_telemetry.c
+++ b/drivers/firmware/arm_scmi/scmi_system_telemetry.c
@@ -42,12 +42,14 @@ enum {
 	Opt_uid,
 	Opt_gid,
 	Opt_umask,
+	Opt_lazy,
 };
 
 static const struct fs_parameter_spec stlmfs_param_spec[] = {
 	fsparam_uid("uid", Opt_uid),
 	fsparam_gid("gid", Opt_gid),
 	fsparam_u32oct("umask", Opt_umask),
+	fsparam_flag_no("lazy", Opt_lazy),
 	{}
 };
 
@@ -56,18 +58,29 @@ struct stlmfs_fs_context {
 	kuid_t uid;
 	kgid_t gid;
 	umode_t umask;
+	bool lazy;
+};
+
+struct stlmfs_lazy_tracker {
+	bool des;
+	bool grps;
+	bool topo;
 };
 
 struct stlmfs_sb_info {
 	kuid_t uid;
 	kgid_t gid;
 	umode_t umask;
+	bool lazy;
+	unsigned int num_inst;
+	struct stlmfs_lazy_tracker populated[] __counted_by(num_inst);
 };
 
 static struct kmem_cache *stlmfs_inode_cachep;
 
 static DEFINE_MUTEX(stlmfs_mtx);
 static struct super_block *stlmfs_sb;
+static unsigned int stlmfs_instances;
 
 static atomic_t scmi_tlm_instance_count = ATOMIC_INIT(0);
 
@@ -134,9 +147,11 @@ struct scmi_tlm_class {
 #define	TLM_IS_STATE	BIT(0)
 #define	TLM_IS_GROUP	BIT(1)
 #define	TLM_IS_DYNAMIC	BIT(2)
+#define	TLM_IS_LAZY	BIT(3)
 #define IS_STATE(_f)	((_f) & TLM_IS_STATE)
 #define IS_GROUP(_f)	((_f) & TLM_IS_GROUP)
 #define IS_DYNAMIC(_f)	((_f) & TLM_IS_DYNAMIC)
+#define IS_LAZY(_f)	((_f) & TLM_IS_LAZY)
 	const struct file_operations *f_op;
 	const struct inode_operations *i_op;
 };
@@ -166,6 +181,10 @@ struct scmi_tlm_class {
  * @info: SCMI instance information data reference.
  * @vfs_inode: The embedded VFS inode that will be initialized and plugged
  *	       into the live filesystem at mount time.
+ * @node: List item field.
+ * @children: A list containing all the children of this node.
+ * @num_children: Number of items stored in the @children list.
+ * @mtx: A mutex to protect the @children list.
  *
  * This structure is used to describe each SCMI Telemetry entity discovered
  * at probe time, store its related SCMI data, and link to the proper
@@ -181,6 +200,11 @@ struct scmi_tlm_inode {
 		const struct scmi_telemetry_info *info;
 	};
 	struct inode vfs_inode;
+	struct list_head node;
+	struct list_head children;
+	unsigned int num_children;
+	/* Mutex to protect @children list */
+	struct mutex mtx;
 };
 
 #define to_tlm_inode(t)	container_of(t, struct scmi_tlm_inode, vfs_inode)
@@ -195,8 +219,6 @@ struct scmi_tlm_inode {
  * struct scmi_tlm_instance  - Telemetry instance descriptor
  * @id: Progressive number identifying this probed instance; it will be used
  *	to name the top node at the root of this instance.
- * @res_enumerated: A flag to indicate if full resources enumeration has been
- *		    successfully performed.
  * @name: Name to be used for the top root node of the instance. (tlm_<id>)
  * @node: A node to link this in the list of all instances.
  * @sb: A reference to the current super_block.
@@ -211,7 +233,6 @@ struct scmi_tlm_inode {
  */
 struct scmi_tlm_instance {
 	int id;
-	bool res_enumerated;
 	char name[MAX_INST_NAME];
 	struct list_head node;
 	struct super_block *sb;
@@ -224,6 +245,8 @@ struct scmi_tlm_instance {
 	const struct scmi_telemetry_info *info;
 };
 
+static int scmi_telemetry_groups_initialize(const struct scmi_tlm_instance *ti);
+static int scmi_telemetry_topology_view_initialize(const struct scmi_tlm_instance *ti);
 static int scmi_telemetry_instance_register(struct super_block *sb,
 					    struct scmi_tlm_instance *ti);
 
@@ -778,12 +801,10 @@ stlmfs_create_dentry(struct super_block *sb, struct scmi_tlm_setup *tsp,
 		     struct dentry *parent, const struct scmi_tlm_class *cls,
 		     const void *priv)
 {
-	struct scmi_tlm_inode *tlmi;
+	struct scmi_tlm_inode *tlmi, *tlmi_parent;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
 	struct dentry *dentry;
-	struct inode *inode;
-
-	if (!parent)
-		parent = sb->s_root;
+	struct inode *inode, *i_parent;
 
 	/*
 	 * Bail-out when called on a bad tree, so that there is NO need to
@@ -792,7 +813,15 @@ stlmfs_create_dentry(struct super_block *sb, struct scmi_tlm_setup *tsp,
 	if (IS_ERR(parent))
 		return parent;
 
-	dentry = simple_start_creating(parent, cls->name);
+	i_parent = d_inode(parent);
+	if (!i_parent)
+		return ERR_PTR(-ENOENT);
+
+	if (!sbi->lazy)
+		dentry = simple_start_creating(parent, cls->name);
+	else
+		dentry = d_alloc_name(parent, cls->name);
+
 	if (IS_ERR(dentry))
 		return dentry;
 
@@ -815,14 +844,24 @@ stlmfs_create_dentry(struct super_block *sb, struct scmi_tlm_setup *tsp,
 	inode->i_private = (void *)priv;
 
 	tlmi = to_tlm_inode(inode);
-
 	tlmi->cls = cls;
 	tlmi->tsp = tsp;
 	tlmi->priv = priv;
 
+	tlmi_parent = to_tlm_inode(i_parent);
+	if (sbi->lazy && tlmi_parent->cls && IS_LAZY(tlmi_parent->cls->flags)) {
+		scoped_guard(mutex, &tlmi_parent->mtx) {
+			list_add(&tlmi->node, &tlmi_parent->children);
+			tlmi_parent->num_children++;
+		}
+	}
+
 	d_make_persistent(dentry, inode);
 
-	simple_done_creating(dentry);
+	if (!sbi->lazy)
+		simple_done_creating(dentry);
+	else
+		dput(dentry);
 
 	return dentry;
 }
@@ -1477,8 +1516,6 @@ static const struct scmi_tlm_class tlm_tops[] = {
 
 DEFINE_TLM_CLASS(reset_tlmo, "reset", 0, S_IFREG | 0200, &reset_fops, NULL);
 
-DEFINE_TLM_CLASS(des_dir_cls, "des", 0,
-		 S_IFDIR | 0700, NULL, NULL);
 DEFINE_TLM_CLASS(name_tlmo, "name", 0,
 		 S_IFREG | 0444, &string_ro_fops, NULL);
 DEFINE_TLM_CLASS(ena_tlmo, "enable", TLM_IS_STATE,
@@ -1550,48 +1587,72 @@ static int scmi_telemetry_de_populate(struct super_block *sb,
 	return 0;
 }
 
+static struct dentry *
+scmi_telemetry_subdir_create(struct super_block *sb, struct scmi_tlm_setup *tsp,
+			     const char *dname, struct dentry *parent,
+			     const void *priv)
+{
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
+	struct dentry *dentry;
+
+	struct scmi_tlm_class *tlm_cls __free(kfree) =
+		kzalloc(sizeof(*tlm_cls), GFP_KERNEL);
+	if (!tlm_cls)
+		return ERR_PTR(-ENOMEM);
+
+	tlm_cls->name = dname;
+	tlm_cls->mode = S_IFDIR | 0755;
+	tlm_cls->flags = TLM_IS_DYNAMIC;
+	if (sbi->lazy)
+		tlm_cls->flags |= TLM_IS_LAZY;
+	dentry = stlmfs_create_dentry(sb, tsp, parent, tlm_cls, priv);
+	if (IS_ERR(dentry))
+		return dentry;
+
+	retain_and_null_ptr(tlm_cls);
+
+	return dentry;
+}
+
 static int
-scmi_telemetry_des_lazy_enumerate(struct scmi_tlm_instance *ti,
-				  const struct scmi_telemetry_res_info *rinfo)
+scmi_telemetry_des_enumerate(const struct scmi_tlm_instance *ti,
+			     const struct scmi_telemetry_res_info *rinfo)
 {
 	struct scmi_tlm_setup *tsp = ti->tsp;
 	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
 
 	for (int i = 0; i < rinfo->num_des; i++) {
 		const struct scmi_telemetry_de *de = rinfo->des[i];
 		struct dentry *de_dir_dentry;
 		int ret;
 
-		struct scmi_tlm_class *de_tlm_cls __free(kfree) =
-			kzalloc(sizeof(*de_tlm_cls), GFP_KERNEL);
-		if (!de_tlm_cls)
-			return -ENOMEM;
-
-		de_tlm_cls->name = kasprintf(GFP_KERNEL, "0x%08X", de->info->id);
-		if (!de_tlm_cls->name)
+		const char *dname __free(kfree) =
+			kasprintf(GFP_KERNEL, "0x%08X", de->info->id);
+		if (!dname)
 			return -ENOMEM;
 
-		de_tlm_cls->mode = S_IFDIR | 0700;
-		de_tlm_cls->flags = TLM_IS_DYNAMIC;
-		de_dir_dentry = stlmfs_create_dentry(sb, tsp, ti->des_dentry,
-						     de_tlm_cls, de);
+		de_dir_dentry = scmi_telemetry_subdir_create(sb, tsp, dname,
+							     ti->des_dentry, de);
+		if (IS_ERR(de_dir_dentry))
+			return PTR_ERR(de_dir_dentry);
 
 		ret = scmi_telemetry_de_populate(sb, tsp, de_dir_dentry, de,
 						 rinfo->fully_enumerated);
 		if (ret)
 			return ret;
 
-		retain_and_null_ptr(de_tlm_cls);
+		retain_and_null_ptr(dname);
 	}
 
-	ti->res_enumerated = true;
+	sbi->populated[ti->id].des = true;
 
 	dev_info(tsp->dev, "Found %d Telemetry DE resources.\n", rinfo->num_des);
 
 	return 0;
 }
 
-static int scmi_telemetry_des_initialize(struct scmi_tlm_instance *ti)
+static int scmi_telemetry_des_initialize(const struct scmi_tlm_instance *ti)
 {
 	const struct scmi_telemetry_res_info *rinfo;
 
@@ -1599,9 +1660,196 @@ static int scmi_telemetry_des_initialize(struct scmi_tlm_instance *ti)
 	if (!rinfo)
 		return -ENODEV;
 
-	return scmi_telemetry_des_lazy_enumerate(ti, rinfo);
+	return scmi_telemetry_des_enumerate(ti, rinfo);
+}
+
+static inline struct dentry *
+scmi_telemetry_dentry_lookup(struct inode *dir, struct dentry *dentry,
+			     unsigned int flags)
+{
+	struct dentry *d, *dentry_dir;
+
+	const char *dname __free(kfree) =
+		kmemdup_nul(dentry->d_name.name, dentry->d_name.len, GFP_KERNEL);
+	if (!dname)
+		return ERR_PTR(-ENOMEM);
+
+	dentry_dir = d_find_alias(dir);
+	if (!dentry_dir)
+		return simple_lookup(dir, dentry, flags);
+
+	d = stlmfs_lookup_by_name(dentry_dir, dname);
+	dput(dentry_dir);
+
+	return d;
+}
+
+static struct dentry *
+stlmfs_lazy_des_lookup(struct inode *dir, struct dentry *dentry,
+		       unsigned int flags)
+{
+	struct scmi_tlm_inode *tlmi = to_tlm_inode(dir);
+	struct scmi_tlm_instance *ti = (struct scmi_tlm_instance *)tlmi->priv;
+	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
+	int ret;
+
+	if (sbi->populated[ti->id].des)
+		return simple_lookup(dir, dentry, flags);
+
+	ret = scmi_telemetry_des_initialize(ti);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return scmi_telemetry_dentry_lookup(dir, dentry, flags);
+}
+
+static const struct inode_operations lazy_des_dir_iops = {
+	.lookup = stlmfs_lazy_des_lookup,
+};
+
+static struct dentry *
+stlmfs_lazy_grps_lookup(struct inode *dir, struct dentry *dentry,
+			unsigned int flags)
+{
+	struct scmi_tlm_inode *tlmi = to_tlm_inode(dir);
+	struct scmi_tlm_instance *ti = (struct scmi_tlm_instance *)tlmi->priv;
+	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
+	int ret;
+
+	if (sbi->populated[ti->id].grps)
+		return simple_lookup(dir, dentry, flags);
+
+	ret = scmi_telemetry_groups_initialize(ti);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return scmi_telemetry_dentry_lookup(dir, dentry, flags);
+}
+
+static const struct inode_operations lazy_grps_dir_iops = {
+	.lookup = stlmfs_lazy_grps_lookup,
+};
+
+static struct dentry *
+stlmfs_lazy_compo_lookup(struct inode *dir, struct dentry *dentry,
+			 unsigned int flags)
+{
+	struct scmi_tlm_inode *tlmi = to_tlm_inode(dir);
+	struct scmi_tlm_instance *ti = (struct scmi_tlm_instance *)tlmi->priv;
+	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
+	int ret;
+
+	if (sbi->populated[ti->id].topo)
+		return simple_lookup(dir, dentry, flags);
+
+	ret = scmi_telemetry_topology_view_initialize(ti);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return scmi_telemetry_dentry_lookup(dir, dentry, flags);
 }
 
+static const struct inode_operations lazy_compo_dir_iops = {
+	.lookup = stlmfs_lazy_compo_lookup,
+};
+
+static inline void
+scmi_telemetry_children_dir_emit(struct dir_context *ctx,
+				 struct scmi_tlm_inode *tlmi_parent)
+{
+	struct scmi_tlm_inode *tlmi;
+
+	if (ctx->pos >= tlmi_parent->num_children)
+		return;
+
+	guard(mutex)(&tlmi_parent->mtx);
+	list_for_each_entry(tlmi, &tlmi_parent->children, node) {
+		if (!dir_emit(ctx, tlmi->cls->name, strlen(tlmi->cls->name),
+			      tlmi->vfs_inode.i_ino,
+			      S_ISDIR(tlmi->cls->mode) ? DT_DIR : DT_REG))
+			break;
+		ctx->pos++;
+	}
+}
+
+static int
+stlmfs_lazy_des_iterate_shared(struct file *filp, struct dir_context *ctx)
+{
+	struct scmi_tlm_inode *tlmi_des = to_tlm_inode(file_inode(filp));
+	const struct scmi_tlm_instance *ti = tlmi_des->priv;
+	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
+
+	if (!sbi->populated[ti->id].des) {
+		int ret;
+
+		ret = scmi_telemetry_des_initialize(ti);
+		if (ret)
+			return ret;
+	}
+
+	scmi_telemetry_children_dir_emit(ctx, tlmi_des);
+
+	return 0;
+}
+
+static const struct file_operations lazy_des_fops = {
+	.iterate_shared = stlmfs_lazy_des_iterate_shared,
+};
+
+static int
+stlmfs_lazy_grps_iterate_shared(struct file *filp, struct dir_context *ctx)
+{
+	struct scmi_tlm_inode *tlmi_des = to_tlm_inode(file_inode(filp));
+	const struct scmi_tlm_instance *ti = tlmi_des->priv;
+	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
+
+	if (!sbi->populated[ti->id].grps) {
+		int ret;
+
+		ret = scmi_telemetry_groups_initialize(ti);
+		if (ret)
+			return ret;
+	}
+
+	scmi_telemetry_children_dir_emit(ctx, tlmi_des);
+
+	return 0;
+}
+
+static const struct file_operations lazy_grps_fops = {
+	.iterate_shared = stlmfs_lazy_grps_iterate_shared,
+};
+
+static int
+stlmfs_lazy_compo_iterate_shared(struct file *filp, struct dir_context *ctx)
+{
+	struct scmi_tlm_inode *tlmi_des = to_tlm_inode(file_inode(filp));
+	const struct scmi_tlm_instance *ti = tlmi_des->priv;
+	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
+
+	if (!sbi->populated[ti->id].topo) {
+		int ret;
+
+		ret = scmi_telemetry_topology_view_initialize(ti);
+		if (ret)
+			return ret;
+	}
+
+	scmi_telemetry_children_dir_emit(ctx, tlmi_des);
+
+	return 0;
+}
+
+static const struct file_operations lazy_compo_fops = {
+	.iterate_shared = stlmfs_lazy_compo_iterate_shared,
+};
+
 DEFINE_TLM_CLASS(version_tlmo, "version", 0,
 		 S_IFREG | 0444, &sa_x32_ro_fops, NULL);
 
@@ -1728,8 +1976,6 @@ static const struct scmi_tlm_class tlm_grps[] = {
 DEFINE_TLM_CLASS(grp_data_tlmo, "des_bulk_read", TLM_IS_GROUP,
 		 S_IFREG | 0444, &scmi_tlm_data_fops, NULL);
 
-DEFINE_TLM_CLASS(groups_dir_cls, "groups", 0, S_IFDIR | 0700, NULL, NULL);
-
 DEFINE_TLM_CLASS(grp_single_sample_tlmo, "des_single_sample_read", TLM_IS_GROUP,
 		 S_IFREG | 0444, &scmi_tlm_single_sample_fops, NULL);
 
@@ -2146,67 +2392,85 @@ DEFINE_TLM_CLASS(ctrl_tlmo, "control", 0,
 DEFINE_TLM_CLASS(grp_ctrl_tlmo, "control", TLM_IS_GROUP,
 		 S_IFREG | 0666, &scmi_tlm_ctrl_fops, NULL);
 
-static int scmi_telemetry_groups_initialize(struct scmi_tlm_instance *ti)
+static int
+scmi_telemetry_grp_populate(struct super_block *sb, struct scmi_tlm_setup *tsp,
+			    struct dentry *parent,
+			    const struct scmi_telemetry_group *grp,
+			    bool single_read_support,
+			    bool per_group_config_support)
+{
+	for (const struct scmi_tlm_class *gto = tlm_grps; gto->name; gto++)
+		stlmfs_create_dentry(sb, tsp, parent, gto, grp);
+
+	stlmfs_create_dentry(sb, tsp, parent, &grp_composing_des_tlmo,
+			     grp->des_str);
+
+	stlmfs_create_dentry(sb, tsp, parent, &grp_ctrl_tlmo, grp);
+	stlmfs_create_dentry(sb, tsp, parent, &grp_data_tlmo, grp);
+	if (single_read_support)
+		stlmfs_create_dentry(sb, tsp, parent, &grp_single_sample_tlmo, grp);
+
+	if (per_group_config_support) {
+		stlmfs_create_dentry(sb, tsp, parent,
+				     &grp_current_interval_tlmo, grp);
+		stlmfs_create_dentry(sb, tsp, parent,
+				     &grp_available_interval_tlmo, grp);
+		stlmfs_create_dentry(sb, tsp, parent,
+				     &grp_intervals_discrete_tlmo, grp);
+	}
+
+	return 0;
+}
+
+static int
+scmi_telemetry_groups_enumerate(const struct scmi_tlm_instance *ti,
+				const struct scmi_telemetry_res_info *rinfo)
 {
-	const struct scmi_telemetry_res_info *rinfo;
 	struct scmi_tlm_setup *tsp = ti->tsp;
 	struct super_block *sb = ti->sb;
-	struct device *dev = tsp->dev;
-	struct dentry *grp_dir_dentry;
-
-	if (ti->info->base.num_groups == 0)
-		return 0;
-
-	rinfo = scmi_telemetry_res_info_get(tsp);
-	if (!rinfo)
-		return -ENODEV;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
 
 	for (int i = 0; i < rinfo->num_groups; i++) {
-		const struct scmi_telemetry_group *grp = &rinfo->grps[i];
-
-		struct scmi_tlm_class *grp_tlm_cls __free(kfree) =
-			kzalloc(sizeof(*grp_tlm_cls), GFP_KERNEL);
-		if (!grp_tlm_cls)
-			return -ENOMEM;
+		struct dentry *grp_dentry;
+		int ret;
 
-		grp_tlm_cls->name = kasprintf(GFP_KERNEL, "%u", grp->info->id);
-		if (!grp_tlm_cls->name)
+		const char *dname __free(kfree) =
+			kasprintf(GFP_KERNEL, "%u", rinfo->grps[i].info->id);
+		if (!dname)
 			return -ENOMEM;
 
-		grp_tlm_cls->mode = S_IFDIR | 0700;
-		grp_tlm_cls->flags = TLM_IS_DYNAMIC;
+		grp_dentry = scmi_telemetry_subdir_create(sb, tsp, dname,
+							  ti->grps_dentry,
+							  &rinfo->grps[i]);
+		if (IS_ERR(grp_dentry))
+			return PTR_ERR(grp_dentry);
 
-		grp_dir_dentry = stlmfs_create_dentry(sb, tsp, ti->grps_dentry,
-						      grp_tlm_cls, grp);
+		ret = scmi_telemetry_grp_populate(sb, tsp, grp_dentry,
+						  &rinfo->grps[i],
+						  ti->info->single_read_support,
+						  ti->info->per_group_config_support);
+		if (ret)
+			return ret;
 
-		for (const struct scmi_tlm_class *gto = tlm_grps; gto->name; gto++)
-			stlmfs_create_dentry(sb, tsp, grp_dir_dentry, gto, grp);
+		retain_and_null_ptr(dname);
+	}
 
-		stlmfs_create_dentry(sb, tsp, grp_dir_dentry,
-				     &grp_composing_des_tlmo, grp->des_str);
+	sbi->populated[ti->id].grps = true;
 
-		stlmfs_create_dentry(sb, tsp, grp_dir_dentry, &grp_ctrl_tlmo, grp);
-		stlmfs_create_dentry(sb, tsp, grp_dir_dentry, &grp_data_tlmo, grp);
-		if (ti->info->single_read_support)
-			stlmfs_create_dentry(sb, tsp, grp_dir_dentry,
-					     &grp_single_sample_tlmo, grp);
+	dev_info(tsp->dev, "Found %d Telemetry GROUPS resources.\n", rinfo->num_groups);
 
-		if (ti->info->per_group_config_support) {
-			stlmfs_create_dentry(sb, tsp, grp_dir_dentry,
-					     &grp_current_interval_tlmo, grp);
-			stlmfs_create_dentry(sb, tsp, grp_dir_dentry,
-					     &grp_available_interval_tlmo, grp);
-			stlmfs_create_dentry(sb, tsp, grp_dir_dentry,
-					     &grp_intervals_discrete_tlmo, grp);
-		}
+	return 0;
+}
 
-		retain_and_null_ptr(grp_tlm_cls);
-	}
+static int scmi_telemetry_groups_initialize(const struct scmi_tlm_instance *ti)
+{
+	const struct scmi_telemetry_res_info *rinfo;
 
-	dev_info(dev, "Found %d Telemetry GROUPS resources.\n",
-		 rinfo->num_groups);
+	rinfo = scmi_telemetry_res_info_get(ti->tsp);
+	if (!rinfo || !rinfo->fully_enumerated)
+		return -ENODEV;
 
-	return 0;
+	return scmi_telemetry_groups_enumerate(ti, rinfo);
 }
 
 static struct scmi_tlm_instance *scmi_tlm_init(struct scmi_tlm_setup *tsp,
@@ -2263,6 +2527,7 @@ static int scmi_telemetry_probe(struct scmi_device *sdev)
 
 	mutex_lock(&stlmfs_mtx);
 	list_add(&ti->node, &scmi_telemetry_instances);
+	stlmfs_instances++;
 	sb = stlmfs_sb;
 	mutex_unlock(&stlmfs_mtx);
 
@@ -2320,6 +2585,9 @@ static struct inode *stlmfs_alloc_inode(struct super_block *sb)
 		return NULL;
 
 	tlmi->cls = NULL;
+	mutex_init(&tlmi->mtx);
+	INIT_LIST_HEAD(&tlmi->children);
+	tlmi->num_children = 0;
 
 	return &tlmi->vfs_inode;
 }
@@ -2346,6 +2614,8 @@ static int stlmfs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->gid));
 	if (sbi->umask != SCMI_TLM_DEFAULT_UMASK)
 		seq_printf(seq, ",umask=%04u", sbi->umask);
+	if (sbi->lazy)
+		seq_printf(seq, ",lazy");
 
 	return 0;
 }
@@ -2423,6 +2693,7 @@ scmi_telemetry_topology_path_get(struct super_block *sb,
 				 struct scmi_tlm_setup *tsp,
 				 struct dentry *parent, const char *dname)
 {
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
 	struct dentry *dentry;
 
 	dentry = stlmfs_lookup_by_name(parent, dname);
@@ -2438,6 +2709,8 @@ scmi_telemetry_topology_path_get(struct super_block *sb,
 
 		dir_tlm_cls->mode = S_IFDIR | 0755;
 		dir_tlm_cls->flags = TLM_IS_DYNAMIC;
+		if (sbi->lazy)
+			dir_tlm_cls->flags |= TLM_IS_LAZY;
 
 		dentry = stlmfs_create_dentry(sb, tsp, parent,
 					      dir_tlm_cls, NULL);
@@ -2449,7 +2722,7 @@ scmi_telemetry_topology_path_get(struct super_block *sb,
 }
 
 static int scmi_telemetry_topology_add_node(struct super_block *sb,
-					    struct scmi_tlm_instance *ti,
+					    const struct scmi_tlm_instance *ti,
 					    const struct scmi_telemetry_de *de)
 {
 	struct dentry *ctype, *cinst, *cunit, *dinst;
@@ -2490,21 +2763,19 @@ static int scmi_telemetry_topology_add_node(struct super_block *sb,
 	return ret;
 }
 
-DEFINE_TLM_CLASS(compo_dir_cls, "by-components", 0, S_IFDIR | 0700, NULL, NULL);
-
-static int scmi_telemetry_topology_view_add(struct scmi_tlm_instance *ti)
+static int
+scmi_telemetry_topology_view_initialize(const struct scmi_tlm_instance *ti)
 {
 	const struct scmi_telemetry_res_info *rinfo;
 	struct scmi_tlm_setup *tsp = ti->tsp;
+	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
 	struct device *dev = tsp->dev;
 
 	rinfo = scmi_telemetry_res_info_get(tsp);
 	if (!rinfo || !rinfo->fully_enumerated)
 		return -ENODEV;
 
-	ti->compo_dentry =
-		stlmfs_create_dentry(ti->sb, tsp, ti->top_dentry, &compo_dir_cls, NULL);
-
 	for (int i = 0; i < rinfo->num_des; i++) {
 		int ret;
 
@@ -2514,13 +2785,51 @@ static int scmi_telemetry_topology_view_add(struct scmi_tlm_instance *ti)
 				rinfo->des[i]->info->name);
 	}
 
+	sbi->populated[ti->id].topo = true;
+
+	if (sbi->lazy && !sbi->populated[ti->id].des) {
+		int ret;
+
+		ret = scmi_telemetry_des_initialize(ti);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
+static struct dentry *
+scmi_telemetry_top_dentry_create(struct scmi_tlm_instance *ti, bool lazy,
+				 const char *dname, struct dentry *parent,
+				 const struct file_operations *lazy_fops,
+				 const struct inode_operations *lazy_dir_iops,
+				 void *priv)
+{
+	struct scmi_tlm_setup *tsp = ti->tsp;
+	struct super_block *sb = ti->sb;
+
+	struct scmi_tlm_class *tlm_cls __free(kfree) =
+		kzalloc(sizeof(*tlm_cls), GFP_KERNEL);
+	if (!tlm_cls)
+		return ERR_PTR(-ENOMEM);
+
+	tlm_cls->name = kasprintf(GFP_KERNEL, "%s", dname);
+	tlm_cls->mode = S_IFDIR | 0755;
+	tlm_cls->flags = TLM_IS_DYNAMIC;
+	if (lazy) {
+		tlm_cls->flags |= TLM_IS_LAZY;
+		tlm_cls->f_op = lazy_fops;
+		tlm_cls->i_op = lazy_dir_iops;
+	}
+
+	return stlmfs_create_dentry(sb, tsp, parent, no_free_ptr(tlm_cls), priv);
+}
+
 static int scmi_tlm_root_dentries_initialize(struct scmi_tlm_instance *ti)
 {
 	struct scmi_tlm_setup *tsp = ti->tsp;
 	struct super_block *sb = ti->sb;
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
 
 	scnprintf(ti->name, MAX_INST_NAME, "tlm_%d", ti->id);
 
@@ -2543,10 +2852,25 @@ static int scmi_tlm_root_dentries_initialize(struct scmi_tlm_instance *ti)
 		stlmfs_create_dentry(sb, tsp, ti->top_dentry,
 				     &single_sample_tlmo, ti->info);
 	stlmfs_create_dentry(sb, tsp, ti->top_dentry, &ctrl_tlmo, ti->info);
-	ti->des_dentry =
-		stlmfs_create_dentry(sb, tsp, ti->top_dentry, &des_dir_cls, NULL);
-	ti->grps_dentry =
-		stlmfs_create_dentry(sb, tsp, ti->top_dentry, &groups_dir_cls, NULL);
+
+	ti->des_dentry = scmi_telemetry_top_dentry_create(ti, sbi->lazy, "des",
+							  ti->top_dentry,
+							  &lazy_des_fops,
+							  &lazy_des_dir_iops,
+							  ti);
+
+	ti->grps_dentry = scmi_telemetry_top_dentry_create(ti, sbi->lazy, "groups",
+							   ti->top_dentry,
+							   &lazy_grps_fops,
+							   &lazy_grps_dir_iops,
+							   ti);
+
+	ti->compo_dentry = scmi_telemetry_top_dentry_create(ti, sbi->lazy,
+							    "by-components",
+							    ti->top_dentry,
+							    &lazy_compo_fops,
+							    &lazy_compo_dir_iops,
+							    ti);
 
 	return 0;
 }
@@ -2554,6 +2878,7 @@ static int scmi_tlm_root_dentries_initialize(struct scmi_tlm_instance *ti)
 static int scmi_telemetry_instance_register(struct super_block *sb,
 					    struct scmi_tlm_instance *ti)
 {
+	struct stlmfs_sb_info *sbi = sb->s_fs_info;
 	int ret;
 
 	ti->sb = sb;
@@ -2561,6 +2886,9 @@ static int scmi_telemetry_instance_register(struct super_block *sb,
 	if (ret)
 		return ret;
 
+	if (sbi->lazy)
+		return 0;
+
 	ret = scmi_telemetry_des_initialize(ti);
 	if (ret)
 		return ret;
@@ -2572,11 +2900,12 @@ static int scmi_telemetry_instance_register(struct super_block *sb,
 			 ti->top_cls.name);
 	}
 
-	ret = scmi_telemetry_topology_view_add(ti);
-	if (ret)
+	ret = scmi_telemetry_topology_view_initialize(ti);
+	if (ret) {
 		dev_warn(ti->tsp->dev,
 			 "Failed to create topology view for instance %s.\n",
 			 ti->top_cls.name);
+	}
 
 	return 0;
 }
@@ -2593,11 +2922,13 @@ static int stlmfs_fill_super(struct super_block *sb, struct fs_context *fc)
 		return 0;
 
 	struct stlmfs_sb_info *sbi __free(kfree) =
-		kzalloc(sizeof(*sbi), GFP_KERNEL);
+		kzalloc(struct_size(sbi, populated, stlmfs_instances), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 
 	ctx = fc->fs_private;
+	sbi->num_inst = stlmfs_instances;
+	sbi->lazy = ctx->lazy;
 	sbi->uid = ctx->uid;
 	sbi->gid = ctx->gid;
 	sbi->umask = ctx->umask;
@@ -2673,6 +3004,10 @@ static int stlmfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
 		ctx->umask = result.uint_32 & 07777;
 		ctx->opts |= BIT(Opt_umask);
 		break;
+	case Opt_lazy:
+		ctx->lazy = result.boolean;
+		ctx->opts |= BIT(Opt_lazy);
+		break;
 	default:
 		return -ENOPARAM;
 	}
@@ -2692,6 +3027,8 @@ static int stlmfs_reconfigure(struct fs_context *fc)
 		return invalfc(fc, "gid cannot be changed on remount");
 	if (ctx->opts & BIT(Opt_umask))
 		return invalfc(fc, "umask cannot be changed on remount");
+	if (ctx->opts & BIT(Opt_lazy))
+		return invalfc(fc, "lazy cannot be changed on remount");
 
 	return 0;
 }
@@ -2712,6 +3049,7 @@ static int stlmfs_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	/* defaults */
+	ctx->lazy = false;
 	ctx->uid = GLOBAL_ROOT_UID;
 	ctx->gid = GLOBAL_ROOT_GID;
 	ctx->umask = SCMI_TLM_DEFAULT_UMASK;
@@ -2740,6 +3078,7 @@ static struct file_system_type scmi_telemetry_fs = {
 	.name = TLM_FS_NAME,
 	.kill_sb = stlmfs_kill_sb,
 	.init_fs_context = stlmfs_init_fs_context,
+	.parameters = stlmfs_param_spec,
 	.fs_flags = 0,
 };
 
-- 
2.54.0



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox