Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v14 21/30] KVM: arm64: Initialise hyp_nr_cpus for nVHE hyp
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

Knowing the number of CPUs is necessary for determining the boundaries
of per-cpu variables, which will be used for upcoming hypervisor
tracing. hyp_nr_cpus which stores this value, is only initialised for
the pKVM hypervisor. Make it accessible for the nVHE hypervisor as well.

With the kernel now responsible for initialising hyp_nr_cpus, the
nr_cpus parameter is no longer needed in __pkvm_init.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 76ce2b94bd97..4bf63025061e 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -129,8 +129,7 @@ void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __pkvm_init_switch_pgd(phys_addr_t pgd, unsigned long sp,
 		void (*fn)(void));
-int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
-		unsigned long *per_cpu_base, u32 hyp_va_bits);
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits);
 void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
 #endif
 
@@ -147,5 +146,6 @@ extern u64 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val);
 extern unsigned long kvm_nvhe_sym(__icache_flags);
 extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits);
 extern unsigned int kvm_nvhe_sym(kvm_host_sve_max_vl);
+extern unsigned long kvm_nvhe_sym(hyp_nr_cpus);
 
 #endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 410ffd41fd73..1371f9b3ecea 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -35,6 +35,7 @@
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_nested.h>
 #include <asm/kvm_pkvm.h>
@@ -2465,7 +2466,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits)
 	preempt_disable();
 	cpu_hyp_init_context();
 	ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
-				num_possible_cpus(), kern_hyp_va(per_cpu_base),
+				kern_hyp_va(per_cpu_base),
 				hyp_va_bits);
 	cpu_hyp_init_features();
 
@@ -2674,6 +2675,8 @@ static int __init init_hyp_mode(void)
 		kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
 	}
 
+	kvm_nvhe_sym(hyp_nr_cpus) = num_possible_cpus();
+
 	/*
 	 * Map the Hyp-code called directly from the host
 	 */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 5f9d56754e39..f8a7b8c04c49 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -30,8 +30,6 @@ enum pkvm_component_id {
 	PKVM_ID_FFA,
 };
 
-extern unsigned long hyp_nr_cpus;
-
 int __pkvm_prot_finalize(void);
 int __pkvm_host_share_hyp(u64 pfn);
 int __pkvm_host_unshare_hyp(u64 pfn);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index e7790097db93..8ae3c348ed81 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -486,17 +486,15 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
 	DECLARE_REG(unsigned long, size, host_ctxt, 2);
-	DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3);
-	DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4);
-	DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5);
+	DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 3);
+	DECLARE_REG(u32, hyp_va_bits, host_ctxt, 4);
 
 	/*
 	 * __pkvm_init() will return only if an error occurred, otherwise it
 	 * will tail-call in __pkvm_init_finalise() which will have to deal
 	 * with the host context directly.
 	 */
-	cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base,
-					    hyp_va_bits);
+	cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, per_cpu_base, hyp_va_bits);
 }
 
 static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 90bd014e952f..d8e5b563fd3d 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -341,8 +341,7 @@ void __noreturn __pkvm_init_finalise(void)
 	__host_enter(host_ctxt);
 }
 
-int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
-		unsigned long *per_cpu_base, u32 hyp_va_bits)
+int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long *per_cpu_base, u32 hyp_va_bits)
 {
 	struct kvm_nvhe_init_params *params;
 	void *virt = hyp_phys_to_virt(phys);
@@ -355,7 +354,6 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
 		return -EINVAL;
 
 	hyp_spin_lock_init(&pkvm_pgd_lock);
-	hyp_nr_cpus = nr_cpus;
 
 	ret = divide_memory_pool(virt, size);
 	if (ret)
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 22/30] KVM: arm64: Support unaligned fixmap in the pKVM hyp
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

Return the fixmap VA with the page offset, instead of the page base
address. This allows to use hyp_fixmap_map() seamlessly regardless of
the address alignment. While at it, do the same for hyp_fixblock_map().

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index 218976287d3f..c98cec0c150f 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -244,7 +244,7 @@ static void *fixmap_map_slot(struct hyp_fixmap_slot *slot, phys_addr_t phys)
 
 void *hyp_fixmap_map(phys_addr_t phys)
 {
-	return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys);
+	return fixmap_map_slot(this_cpu_ptr(&fixmap_slots), phys) + offset_in_page(phys);
 }
 
 static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
@@ -366,7 +366,7 @@ void *hyp_fixblock_map(phys_addr_t phys, size_t *size)
 #ifdef HAS_FIXBLOCK
 	*size = PMD_SIZE;
 	hyp_spin_lock(&hyp_fixblock_lock);
-	return fixmap_map_slot(&hyp_fixblock_slot, phys);
+	return fixmap_map_slot(&hyp_fixblock_slot, phys) + offset_in_page(phys);
 #else
 	*size = PAGE_SIZE;
 	return hyp_fixmap_map(phys);
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 23/30] KVM: arm64: Add tracing capability for the nVHE/pKVM hyp
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

There is currently no way to inspect or log what's happening at EL2
when the nVHE or pKVM hypervisor is used. With the growing set of
features for pKVM, the need for tooling is more pressing. And tracefs,
by its reliability, versatility and support for user-space is fit for
purpose.

Add support to write into a tracefs compatible ring-buffer. There's no
way the hypervisor could log events directly into the host tracefs
ring-buffers. So instead let's use our own, where the hypervisor is the
writer and the host the reader.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index a1ad12c72ebf..d46d9196d661 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -89,6 +89,10 @@ enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load,
 	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put,
 	__KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid,
+	__KVM_HOST_SMCCC_FUNC___tracing_load,
+	__KVM_HOST_SMCCC_FUNC___tracing_unload,
+	__KVM_HOST_SMCCC_FUNC___tracing_enable,
+	__KVM_HOST_SMCCC_FUNC___tracing_swap_reader,
 };
 
 #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
diff --git a/arch/arm64/include/asm/kvm_hyptrace.h b/arch/arm64/include/asm/kvm_hyptrace.h
new file mode 100644
index 000000000000..9c30a479bc36
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_hyptrace.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ARM64_KVM_HYPTRACE_H_
+#define __ARM64_KVM_HYPTRACE_H_
+
+#include <linux/ring_buffer.h>
+
+struct hyp_trace_desc {
+	unsigned long			bpages_backing_start;
+	size_t				bpages_backing_size;
+	struct trace_buffer_desc	trace_buffer_desc;
+
+};
+#endif
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 14b2d0b0b831..d215348370fa 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -72,6 +72,11 @@ config NVHE_EL2_DEBUG
 
 if NVHE_EL2_DEBUG
 
+config NVHE_EL2_TRACING
+	bool
+	depends on TRACING
+	default y
+
 config PKVM_DISABLE_STAGE2_ON_PANIC
 	bool "Disable the host stage-2 on panic"
 	default n
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h
new file mode 100644
index 000000000000..7da8788ce527
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ARM64_KVM_HYP_NVHE_TRACE_H
+#define __ARM64_KVM_HYP_NVHE_TRACE_H
+#include <asm/kvm_hyptrace.h>
+
+#ifdef CONFIG_NVHE_EL2_TRACING
+void *tracing_reserve_entry(unsigned long length);
+void tracing_commit_entry(void);
+
+int __tracing_load(unsigned long desc_va, size_t desc_size);
+void __tracing_unload(void);
+int __tracing_enable(bool enable);
+int __tracing_swap_reader(unsigned int cpu);
+#else
+static inline void *tracing_reserve_entry(unsigned long length) { return NULL; }
+static inline void tracing_commit_entry(void) { }
+
+static inline int __tracing_load(unsigned long desc_va, size_t desc_size) { return -ENODEV; }
+static inline void __tracing_unload(void) { }
+static inline int __tracing_enable(bool enable) { return -ENODEV; }
+static inline int __tracing_swap_reader(unsigned int cpu) { return -ENODEV; }
+#endif
+#endif
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 8dc95257c291..f1840628d2d6 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -29,9 +29,12 @@ hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
 	 ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
 hyp-obj-y += ../../../kernel/smccc-call.o
 hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
-hyp-obj-$(CONFIG_NVHE_EL2_TRACING) += clock.o
+hyp-obj-$(CONFIG_NVHE_EL2_TRACING) += clock.o trace.o
 hyp-obj-y += $(lib-objs)
 
+# Path to simple_ring_buffer.c
+CFLAGS_trace.nvhe.o += -I$(objtree)/kernel/trace/
+
 ##
 ## Build rules for compiling nVHE hyp code
 ## Output of this folder is `kvm_nvhe.o`, a partially linked object
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 8ae3c348ed81..ae04204ea81f 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -18,6 +18,7 @@
 #include <nvhe/mem_protect.h>
 #include <nvhe/mm.h>
 #include <nvhe/pkvm.h>
+#include <nvhe/trace.h>
 #include <nvhe/trap_handler.h>
 
 DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -587,6 +588,33 @@ static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
 }
 
+static void handle___tracing_load(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1);
+	DECLARE_REG(size_t, desc_size, host_ctxt, 2);
+
+	cpu_reg(host_ctxt, 1) = __tracing_load(desc_hva, desc_size);
+}
+
+static void handle___tracing_unload(struct kvm_cpu_context *host_ctxt)
+{
+	__tracing_unload();
+}
+
+static void handle___tracing_enable(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(bool, enable, host_ctxt, 1);
+
+	cpu_reg(host_ctxt, 1) = __tracing_enable(enable);
+}
+
+static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
+
+	cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu);
+}
+
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -628,6 +656,10 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__pkvm_vcpu_load),
 	HANDLE_FUNC(__pkvm_vcpu_put),
 	HANDLE_FUNC(__pkvm_tlb_flush_vmid),
+	HANDLE_FUNC(__tracing_load),
+	HANDLE_FUNC(__tracing_unload),
+	HANDLE_FUNC(__tracing_enable),
+	HANDLE_FUNC(__tracing_swap_reader),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c
new file mode 100644
index 000000000000..282cba70ce9b
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/trace.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <nvhe/clock.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
+#include <nvhe/trace.h>
+
+#include <asm/percpu.h>
+#include <asm/kvm_mmu.h>
+#include <asm/local.h>
+
+#include "simple_ring_buffer.c"
+
+static DEFINE_PER_CPU(struct simple_rb_per_cpu, __simple_rbs);
+
+static struct hyp_trace_buffer {
+	struct simple_rb_per_cpu __percpu	*simple_rbs;
+	void					*bpages_backing_start;
+	size_t					bpages_backing_size;
+	hyp_spinlock_t				lock;
+} trace_buffer = {
+	.simple_rbs = &__simple_rbs,
+	.lock = __HYP_SPIN_LOCK_UNLOCKED,
+};
+
+static bool hyp_trace_buffer_loaded(struct hyp_trace_buffer *trace_buffer)
+{
+	return trace_buffer->bpages_backing_size > 0;
+}
+
+void *tracing_reserve_entry(unsigned long length)
+{
+	return simple_ring_buffer_reserve(this_cpu_ptr(trace_buffer.simple_rbs), length,
+					  trace_clock());
+}
+
+void tracing_commit_entry(void)
+{
+	simple_ring_buffer_commit(this_cpu_ptr(trace_buffer.simple_rbs));
+}
+
+static int __admit_host_mem(void *start, u64 size)
+{
+	if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size) || !size)
+		return -EINVAL;
+
+	if (!is_protected_kvm_enabled())
+		return 0;
+
+	return __pkvm_host_donate_hyp(hyp_virt_to_pfn(start), size >> PAGE_SHIFT);
+}
+
+static void __release_host_mem(void *start, u64 size)
+{
+	if (!is_protected_kvm_enabled())
+		return;
+
+	WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(start), size >> PAGE_SHIFT));
+}
+
+static int hyp_trace_buffer_load_bpage_backing(struct hyp_trace_buffer *trace_buffer,
+					       struct hyp_trace_desc *desc)
+{
+	void *start = (void *)kern_hyp_va(desc->bpages_backing_start);
+	size_t size = desc->bpages_backing_size;
+	int ret;
+
+	ret = __admit_host_mem(start, size);
+	if (ret)
+		return ret;
+
+	memset(start, 0, size);
+
+	trace_buffer->bpages_backing_start = start;
+	trace_buffer->bpages_backing_size = size;
+
+	return 0;
+}
+
+static void hyp_trace_buffer_unload_bpage_backing(struct hyp_trace_buffer *trace_buffer)
+{
+	void *start = trace_buffer->bpages_backing_start;
+	size_t size = trace_buffer->bpages_backing_size;
+
+	if (!size)
+		return;
+
+	memset(start, 0, size);
+
+	__release_host_mem(start, size);
+
+	trace_buffer->bpages_backing_start = 0;
+	trace_buffer->bpages_backing_size = 0;
+}
+
+static void *__pin_shared_page(unsigned long kern_va)
+{
+	void *va = kern_hyp_va((void *)kern_va);
+
+	if (!is_protected_kvm_enabled())
+		return va;
+
+	return hyp_pin_shared_mem(va, va + PAGE_SIZE) ? NULL : va;
+}
+
+static void __unpin_shared_page(void *va)
+{
+	if (!is_protected_kvm_enabled())
+		return;
+
+	hyp_unpin_shared_mem(va, va + PAGE_SIZE);
+}
+
+static void hyp_trace_buffer_unload(struct hyp_trace_buffer *trace_buffer)
+{
+	int cpu;
+
+	hyp_assert_lock_held(&trace_buffer->lock);
+
+	if (!hyp_trace_buffer_loaded(trace_buffer))
+		return;
+
+	for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
+		simple_ring_buffer_unload_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
+					     __unpin_shared_page);
+
+	hyp_trace_buffer_unload_bpage_backing(trace_buffer);
+}
+
+static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer,
+				 struct hyp_trace_desc *desc)
+{
+	struct simple_buffer_page *bpages;
+	struct ring_buffer_desc *rb_desc;
+	int ret, cpu;
+
+	hyp_assert_lock_held(&trace_buffer->lock);
+
+	if (hyp_trace_buffer_loaded(trace_buffer))
+		return -EINVAL;
+
+	ret = hyp_trace_buffer_load_bpage_backing(trace_buffer, desc);
+	if (ret)
+		return ret;
+
+	bpages = trace_buffer->bpages_backing_start;
+	for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
+		ret = simple_ring_buffer_init_mm(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
+						 bpages, rb_desc, __pin_shared_page,
+						__unpin_shared_page);
+		if (ret)
+			break;
+
+		bpages += rb_desc->nr_page_va;
+	}
+
+	if (ret)
+		hyp_trace_buffer_unload(trace_buffer);
+
+	return ret;
+}
+
+static bool hyp_trace_desc_validate(struct hyp_trace_desc *desc, size_t desc_size)
+{
+	struct ring_buffer_desc *rb_desc;
+	unsigned int cpu;
+	size_t nr_bpages;
+	void *desc_end;
+
+	/*
+	 * Both desc_size and bpages_backing_size are untrusted host-provided
+	 * values. We rely on __pkvm_host_donate_hyp() to enforce their validity.
+	 */
+	desc_end = (void *)desc + desc_size;
+	nr_bpages = desc->bpages_backing_size / sizeof(struct simple_buffer_page);
+
+	for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
+		/* Can we read nr_page_va? */
+		if ((void *)rb_desc + struct_size(rb_desc, page_va, 0) > desc_end)
+			return false;
+
+		/* Overflow desc? */
+		if ((void *)rb_desc + struct_size(rb_desc, page_va, rb_desc->nr_page_va) > desc_end)
+			return false;
+
+		/* Overflow bpages backing memory? */
+		if (nr_bpages < rb_desc->nr_page_va)
+			return false;
+
+		if (cpu >= hyp_nr_cpus)
+			return false;
+
+		if (cpu != rb_desc->cpu)
+			return false;
+
+		nr_bpages -= rb_desc->nr_page_va;
+	}
+
+	return true;
+}
+
+int __tracing_load(unsigned long desc_hva, size_t desc_size)
+{
+	struct hyp_trace_desc *desc = (struct hyp_trace_desc *)kern_hyp_va(desc_hva);
+	int ret;
+
+	ret = __admit_host_mem(desc, desc_size);
+	if (ret)
+		return ret;
+
+	if (!hyp_trace_desc_validate(desc, desc_size))
+		goto err_release_desc;
+
+	hyp_spin_lock(&trace_buffer.lock);
+
+	ret = hyp_trace_buffer_load(&trace_buffer, desc);
+
+	hyp_spin_unlock(&trace_buffer.lock);
+
+err_release_desc:
+	__release_host_mem(desc, desc_size);
+	return ret;
+}
+
+void __tracing_unload(void)
+{
+	hyp_spin_lock(&trace_buffer.lock);
+	hyp_trace_buffer_unload(&trace_buffer);
+	hyp_spin_unlock(&trace_buffer.lock);
+}
+
+int __tracing_enable(bool enable)
+{
+	int cpu, ret = enable ? -EINVAL : 0;
+
+	hyp_spin_lock(&trace_buffer.lock);
+
+	if (!hyp_trace_buffer_loaded(&trace_buffer))
+		goto unlock;
+
+	for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
+		simple_ring_buffer_enable_tracing(per_cpu_ptr(trace_buffer.simple_rbs, cpu),
+						  enable);
+
+	ret = 0;
+
+unlock:
+	hyp_spin_unlock(&trace_buffer.lock);
+
+	return ret;
+}
+
+int __tracing_swap_reader(unsigned int cpu)
+{
+	int ret = -ENODEV;
+
+	if (cpu >= hyp_nr_cpus)
+		return -EINVAL;
+
+	hyp_spin_lock(&trace_buffer.lock);
+
+	if (hyp_trace_buffer_loaded(&trace_buffer))
+		ret = simple_ring_buffer_swap_reader_page(
+				per_cpu_ptr(trace_buffer.simple_rbs, cpu));
+
+	hyp_spin_unlock(&trace_buffer.lock);
+
+	return ret;
+}
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 24/30] KVM: arm64: Add trace remote for the nVHE/pKVM hyp
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

In both protected and nVHE mode, the hypervisor is capable of writing
events into tracefs compatible ring-buffers. Create a trace remote so
the kernel can read those buffers.

This currently doesn't provide any event support which will come later.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index d215348370fa..17edfe3ae615 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -75,6 +75,7 @@ if NVHE_EL2_DEBUG
 config NVHE_EL2_TRACING
 	bool
 	depends on TRACING
+	select TRACE_REMOTE
 	default y
 
 config PKVM_DISABLE_STAGE2_ON_PANIC
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 3ebc0570345c..59612d2f277c 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -30,6 +30,8 @@ kvm-$(CONFIG_HW_PERF_EVENTS)  += pmu-emul.o pmu.o
 kvm-$(CONFIG_ARM64_PTR_AUTH)  += pauth.o
 kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
 
+kvm-$(CONFIG_NVHE_EL2_TRACING) += hyp_trace.o
+
 always-y := hyp_constants.h hyp-constants.s
 
 define rule_gen_hyp_constants
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1371f9b3ecea..87a3d28d5f0f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -24,6 +24,7 @@
 
 #define CREATE_TRACE_POINTS
 #include "trace_arm.h"
+#include "hyp_trace.h"
 
 #include <linux/uaccess.h>
 #include <asm/ptrace.h>
@@ -2415,6 +2416,10 @@ static int __init init_subsystems(void)
 
 	kvm_register_perf_callbacks();
 
+	err = kvm_hyp_trace_init();
+	if (err)
+		kvm_err("Failed to initialize Hyp tracing\n");
+
 out:
 	if (err)
 		hyp_cpu_pm_exit();
diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c
new file mode 100644
index 000000000000..2866effe28ec
--- /dev/null
+++ b/arch/arm64/kvm/hyp_trace.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <linux/trace_remote.h>
+#include <linux/simple_ring_buffer.h>
+
+#include <asm/kvm_host.h>
+#include <asm/kvm_hyptrace.h>
+#include <asm/kvm_mmu.h>
+
+#include "hyp_trace.h"
+
+/* Access to this struct within the trace_remote_callbacks are protected by the trace_remote lock */
+static struct hyp_trace_buffer {
+	struct hyp_trace_desc	*desc;
+	size_t			desc_size;
+} trace_buffer;
+
+static int __map_hyp(void *start, size_t size)
+{
+	if (is_protected_kvm_enabled())
+		return 0;
+
+	return create_hyp_mappings(start, start + size, PAGE_HYP);
+}
+
+static int __share_page(unsigned long va)
+{
+	return kvm_share_hyp((void *)va, (void *)va + 1);
+}
+
+static void __unshare_page(unsigned long va)
+{
+	kvm_unshare_hyp((void *)va, (void *)va + 1);
+}
+
+static int hyp_trace_buffer_alloc_bpages_backing(struct hyp_trace_buffer *trace_buffer, size_t size)
+{
+	int nr_bpages = (PAGE_ALIGN(size) / PAGE_SIZE) + 1;
+	size_t backing_size;
+	void *start;
+
+	backing_size = PAGE_ALIGN(sizeof(struct simple_buffer_page) * nr_bpages *
+				  num_possible_cpus());
+
+	start = alloc_pages_exact(backing_size, GFP_KERNEL_ACCOUNT);
+	if (!start)
+		return -ENOMEM;
+
+	trace_buffer->desc->bpages_backing_start = (unsigned long)start;
+	trace_buffer->desc->bpages_backing_size = backing_size;
+
+	return __map_hyp(start, backing_size);
+}
+
+static void hyp_trace_buffer_free_bpages_backing(struct hyp_trace_buffer *trace_buffer)
+{
+	free_pages_exact((void *)trace_buffer->desc->bpages_backing_start,
+			 trace_buffer->desc->bpages_backing_size);
+}
+
+static void hyp_trace_buffer_unshare_hyp(struct hyp_trace_buffer *trace_buffer, int last_cpu)
+{
+	struct ring_buffer_desc *rb_desc;
+	int cpu, p;
+
+	for_each_ring_buffer_desc(rb_desc, cpu, &trace_buffer->desc->trace_buffer_desc) {
+		if (cpu > last_cpu)
+			break;
+
+		__share_page(rb_desc->meta_va);
+		for (p = 0; p < rb_desc->nr_page_va; p++)
+			__unshare_page(rb_desc->page_va[p]);
+	}
+}
+
+static int hyp_trace_buffer_share_hyp(struct hyp_trace_buffer *trace_buffer)
+{
+	struct ring_buffer_desc *rb_desc;
+	int cpu, p, ret = 0;
+
+	for_each_ring_buffer_desc(rb_desc, cpu, &trace_buffer->desc->trace_buffer_desc) {
+		ret = __share_page(rb_desc->meta_va);
+		if (ret)
+			break;
+
+		for (p = 0; p < rb_desc->nr_page_va; p++) {
+			ret = __share_page(rb_desc->page_va[p]);
+			if (ret)
+				break;
+		}
+
+		if (ret) {
+			for (p--; p >= 0; p--)
+				__unshare_page(rb_desc->page_va[p]);
+			break;
+		}
+	}
+
+	if (ret)
+		hyp_trace_buffer_unshare_hyp(trace_buffer, cpu--);
+
+	return ret;
+}
+
+static struct trace_buffer_desc *hyp_trace_load(unsigned long size, void *priv)
+{
+	struct hyp_trace_buffer *trace_buffer = priv;
+	struct hyp_trace_desc *desc;
+	size_t desc_size;
+	int ret;
+
+	if (WARN_ON(trace_buffer->desc))
+		return ERR_PTR(-EINVAL);
+
+	desc_size = trace_buffer_desc_size(size, num_possible_cpus());
+	if (desc_size == SIZE_MAX)
+		return ERR_PTR(-E2BIG);
+
+	desc_size = PAGE_ALIGN(desc_size);
+	desc = (struct hyp_trace_desc *)alloc_pages_exact(desc_size, GFP_KERNEL);
+	if (!desc)
+		return ERR_PTR(-ENOMEM);
+
+	ret = __map_hyp(desc, desc_size);
+	if (ret)
+		goto err_free_desc;
+
+	trace_buffer->desc = desc;
+
+	ret = hyp_trace_buffer_alloc_bpages_backing(trace_buffer, size);
+	if (ret)
+		goto err_free_desc;
+
+	ret = trace_remote_alloc_buffer(&desc->trace_buffer_desc, desc_size, size,
+					cpu_possible_mask);
+	if (ret)
+		goto err_free_backing;
+
+	ret = hyp_trace_buffer_share_hyp(trace_buffer);
+	if (ret)
+		goto err_free_buffer;
+
+	ret = kvm_call_hyp_nvhe(__tracing_load, (unsigned long)desc, desc_size);
+	if (ret)
+		goto err_unload_pages;
+
+	return &desc->trace_buffer_desc;
+
+err_unload_pages:
+	hyp_trace_buffer_unshare_hyp(trace_buffer, INT_MAX);
+
+err_free_buffer:
+	trace_remote_free_buffer(&desc->trace_buffer_desc);
+
+err_free_backing:
+	hyp_trace_buffer_free_bpages_backing(trace_buffer);
+
+err_free_desc:
+	free_pages_exact(desc, desc_size);
+	trace_buffer->desc = NULL;
+
+	return ERR_PTR(ret);
+}
+
+static void hyp_trace_unload(struct trace_buffer_desc *desc, void *priv)
+{
+	struct hyp_trace_buffer *trace_buffer = priv;
+
+	if (WARN_ON(desc != &trace_buffer->desc->trace_buffer_desc))
+		return;
+
+	kvm_call_hyp_nvhe(__tracing_unload);
+	hyp_trace_buffer_unshare_hyp(trace_buffer, INT_MAX);
+	trace_remote_free_buffer(desc);
+	hyp_trace_buffer_free_bpages_backing(trace_buffer);
+	free_pages_exact(trace_buffer->desc, trace_buffer->desc_size);
+	trace_buffer->desc = NULL;
+}
+
+static int hyp_trace_enable_tracing(bool enable, void *priv)
+{
+	return kvm_call_hyp_nvhe(__tracing_enable, enable);
+}
+
+static int hyp_trace_swap_reader_page(unsigned int cpu, void *priv)
+{
+	return kvm_call_hyp_nvhe(__tracing_swap_reader, cpu);
+}
+
+static int hyp_trace_reset(unsigned int cpu, void *priv)
+{
+	return 0;
+}
+
+static int hyp_trace_enable_event(unsigned short id, bool enable, void *priv)
+{
+	return 0;
+}
+
+static struct trace_remote_callbacks trace_remote_callbacks = {
+	.load_trace_buffer	= hyp_trace_load,
+	.unload_trace_buffer	= hyp_trace_unload,
+	.enable_tracing		= hyp_trace_enable_tracing,
+	.swap_reader_page	= hyp_trace_swap_reader_page,
+	.reset			= hyp_trace_reset,
+	.enable_event		= hyp_trace_enable_event,
+};
+
+int __init kvm_hyp_trace_init(void)
+{
+	if (is_kernel_in_hyp_mode())
+		return 0;
+
+	return trace_remote_register("hypervisor", &trace_remote_callbacks, &trace_buffer, NULL, 0);
+}
diff --git a/arch/arm64/kvm/hyp_trace.h b/arch/arm64/kvm/hyp_trace.h
new file mode 100644
index 000000000000..c991b1ec65f1
--- /dev/null
+++ b/arch/arm64/kvm/hyp_trace.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ARM64_KVM_HYP_TRACE_H__
+#define __ARM64_KVM_HYP_TRACE_H__
+
+#ifdef CONFIG_NVHE_EL2_TRACING
+int kvm_hyp_trace_init(void);
+#else
+static inline int kvm_hyp_trace_init(void) { return 0; }
+#endif
+#endif
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 25/30] KVM: arm64: Sync boot clock with the nVHE/pKVM hyp
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort, Thomas Gleixner,
	Stephen Boyd, Christopher S. Hall, Richard Cochran
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

Configure the hypervisor tracing clock with the kernel boot clock. For
tracing purposes, the boot clock is interesting: it doesn't stop on
suspend. However, it is corrected on a regular basis, which implies the
need to re-evaluate it every once in a while.

Cc: John Stultz <jstultz@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Boyd <sboyd@kernel.org>
Cc: Christopher S. Hall <christopher.s.hall@intel.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index d46d9196d661..9c4fe3bfbfff 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -93,6 +93,7 @@ enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___tracing_unload,
 	__KVM_HOST_SMCCC_FUNC___tracing_enable,
 	__KVM_HOST_SMCCC_FUNC___tracing_swap_reader,
+	__KVM_HOST_SMCCC_FUNC___tracing_update_clock,
 };
 
 #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h
index 7da8788ce527..fd641e1b1c23 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trace.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h
@@ -11,6 +11,7 @@ int __tracing_load(unsigned long desc_va, size_t desc_size);
 void __tracing_unload(void);
 int __tracing_enable(bool enable);
 int __tracing_swap_reader(unsigned int cpu);
+void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc);
 #else
 static inline void *tracing_reserve_entry(unsigned long length) { return NULL; }
 static inline void tracing_commit_entry(void) { }
@@ -19,5 +20,6 @@ static inline int __tracing_load(unsigned long desc_va, size_t desc_size) { retu
 static inline void __tracing_unload(void) { }
 static inline int __tracing_enable(bool enable) { return -ENODEV; }
 static inline int __tracing_swap_reader(unsigned int cpu) { return -ENODEV; }
+static inline void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { }
 #endif
 #endif
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index ae04204ea81f..02d5271199d5 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -615,6 +615,16 @@ static void handle___tracing_swap_reader(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) = __tracing_swap_reader(cpu);
 }
 
+static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(u32, mult, host_ctxt, 1);
+	DECLARE_REG(u32, shift, host_ctxt, 2);
+	DECLARE_REG(u64, epoch_ns, host_ctxt, 3);
+	DECLARE_REG(u64, epoch_cyc, host_ctxt, 4);
+
+	__tracing_update_clock(mult, shift, epoch_ns, epoch_cyc);
+}
+
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -660,6 +670,7 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__tracing_unload),
 	HANDLE_FUNC(__tracing_enable),
 	HANDLE_FUNC(__tracing_swap_reader),
+	HANDLE_FUNC(__tracing_update_clock),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c
index 282cba70ce9b..2c8e6f49d7de 100644
--- a/arch/arm64/kvm/hyp/nvhe/trace.c
+++ b/arch/arm64/kvm/hyp/nvhe/trace.c
@@ -271,3 +271,19 @@ int __tracing_swap_reader(unsigned int cpu)
 
 	return ret;
 }
+
+void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc)
+{
+	int cpu;
+
+	/* After this loop, all CPUs are observing the new bank... */
+	for (cpu = 0; cpu < hyp_nr_cpus; cpu++) {
+		struct simple_rb_per_cpu *simple_rb = per_cpu_ptr(trace_buffer.simple_rbs, cpu);
+
+		while (READ_ONCE(simple_rb->status) == SIMPLE_RB_WRITING)
+			;
+	}
+
+	/* ...we can now override the old one and swap. */
+	trace_clock_update(mult, shift, epoch_ns, epoch_cyc);
+}
diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c
index 2866effe28ec..1e5fc27f0e9d 100644
--- a/arch/arm64/kvm/hyp_trace.c
+++ b/arch/arm64/kvm/hyp_trace.c
@@ -4,15 +4,133 @@
  * Author: Vincent Donnefort <vdonnefort@google.com>
  */
 
+#include <linux/cpumask.h>
 #include <linux/trace_remote.h>
+#include <linux/tracefs.h>
 #include <linux/simple_ring_buffer.h>
 
+#include <asm/arch_timer.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_hyptrace.h>
 #include <asm/kvm_mmu.h>
 
 #include "hyp_trace.h"
 
+/* Same 10min used by clocksource when width is more than 32-bits */
+#define CLOCK_MAX_CONVERSION_S	600
+/*
+ * Time to give for the clock init. Long enough to get a good mult/shift
+ * estimation. Short enough to not delay the tracing start too much.
+ */
+#define CLOCK_INIT_MS		100
+/*
+ * Time between clock checks. Must be small enough to catch clock deviation when
+ * it is still tiny.
+ */
+#define CLOCK_UPDATE_MS		500
+
+static struct hyp_trace_clock {
+	u64			cycles;
+	u64			cyc_overflow64;
+	u64			boot;
+	u32			mult;
+	u32			shift;
+	struct delayed_work	work;
+	struct completion	ready;
+	struct mutex		lock;
+	bool			running;
+} hyp_clock;
+
+static void __hyp_clock_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct hyp_trace_clock *hyp_clock;
+	struct system_time_snapshot snap;
+	u64 rate, delta_cycles;
+	u64 boot, delta_boot;
+
+	hyp_clock = container_of(dwork, struct hyp_trace_clock, work);
+
+	ktime_get_snapshot(&snap);
+	boot = ktime_to_ns(snap.boot);
+
+	delta_boot = boot - hyp_clock->boot;
+	delta_cycles = snap.cycles - hyp_clock->cycles;
+
+	/* Compare hyp clock with the kernel boot clock */
+	if (hyp_clock->mult) {
+		u64 err, cur = delta_cycles;
+
+		if (WARN_ON_ONCE(cur >= hyp_clock->cyc_overflow64)) {
+			__uint128_t tmp = (__uint128_t)cur * hyp_clock->mult;
+
+			cur = tmp >> hyp_clock->shift;
+		} else {
+			cur *= hyp_clock->mult;
+			cur >>= hyp_clock->shift;
+		}
+		cur += hyp_clock->boot;
+
+		err = abs_diff(cur, boot);
+		/* No deviation, only update epoch if necessary */
+		if (!err) {
+			if (delta_cycles >= (hyp_clock->cyc_overflow64 >> 1))
+				goto fast_forward;
+
+			goto resched;
+		}
+
+		/* Warn if the error is above tracing precision (1us) */
+		if (err > NSEC_PER_USEC)
+			pr_warn_ratelimited("hyp trace clock off by %lluus\n",
+					    err / NSEC_PER_USEC);
+	}
+
+	rate = div64_u64(delta_cycles * NSEC_PER_SEC, delta_boot);
+
+	clocks_calc_mult_shift(&hyp_clock->mult, &hyp_clock->shift,
+			       rate, NSEC_PER_SEC, CLOCK_MAX_CONVERSION_S);
+
+	/* Add a comfortable 50% margin */
+	hyp_clock->cyc_overflow64 = (U64_MAX / hyp_clock->mult) >> 1;
+
+fast_forward:
+	hyp_clock->cycles = snap.cycles;
+	hyp_clock->boot = boot;
+	kvm_call_hyp_nvhe(__tracing_update_clock, hyp_clock->mult,
+			  hyp_clock->shift, hyp_clock->boot, hyp_clock->cycles);
+	complete(&hyp_clock->ready);
+
+resched:
+	schedule_delayed_work(&hyp_clock->work,
+			      msecs_to_jiffies(CLOCK_UPDATE_MS));
+}
+
+static void hyp_trace_clock_enable(struct hyp_trace_clock *hyp_clock, bool enable)
+{
+	struct system_time_snapshot snap;
+
+	if (hyp_clock->running == enable)
+		return;
+
+	if (!enable) {
+		cancel_delayed_work_sync(&hyp_clock->work);
+		hyp_clock->running = false;
+	}
+
+	ktime_get_snapshot(&snap);
+
+	hyp_clock->boot = ktime_to_ns(snap.boot);
+	hyp_clock->cycles = snap.cycles;
+	hyp_clock->mult = 0;
+
+	init_completion(&hyp_clock->ready);
+	INIT_DELAYED_WORK(&hyp_clock->work, __hyp_clock_work);
+	schedule_delayed_work(&hyp_clock->work, msecs_to_jiffies(CLOCK_INIT_MS));
+	wait_for_completion(&hyp_clock->ready);
+	hyp_clock->running = true;
+}
+
 /* Access to this struct within the trace_remote_callbacks are protected by the trace_remote lock */
 static struct hyp_trace_buffer {
 	struct hyp_trace_desc	*desc;
@@ -183,6 +301,8 @@ static void hyp_trace_unload(struct trace_buffer_desc *desc, void *priv)
 
 static int hyp_trace_enable_tracing(bool enable, void *priv)
 {
+	hyp_trace_clock_enable(&hyp_clock, enable);
+
 	return kvm_call_hyp_nvhe(__tracing_enable, enable);
 }
 
@@ -201,7 +321,22 @@ static int hyp_trace_enable_event(unsigned short id, bool enable, void *priv)
 	return 0;
 }
 
+static int hyp_trace_clock_show(struct seq_file *m, void *v)
+{
+	seq_puts(m, "[boot]\n");
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(hyp_trace_clock);
+
+static int hyp_trace_init_tracefs(struct dentry *d, void *priv)
+{
+	return tracefs_create_file("trace_clock", 0440, d, NULL, &hyp_trace_clock_fops) ?
+		0 : -ENOMEM;
+}
+
 static struct trace_remote_callbacks trace_remote_callbacks = {
+	.init			= hyp_trace_init_tracefs,
 	.load_trace_buffer	= hyp_trace_load,
 	.unload_trace_buffer	= hyp_trace_unload,
 	.enable_tracing		= hyp_trace_enable_tracing,
@@ -212,8 +347,22 @@ static struct trace_remote_callbacks trace_remote_callbacks = {
 
 int __init kvm_hyp_trace_init(void)
 {
+	int cpu;
+
 	if (is_kernel_in_hyp_mode())
 		return 0;
 
+#ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
+	for_each_possible_cpu(cpu) {
+		const struct arch_timer_erratum_workaround *wa =
+			per_cpu(timer_unstable_counter_workaround, cpu);
+
+		if (wa && wa->read_cntvct_el0) {
+			pr_warn("hyp trace can't handle CNTVCT workaround '%s'\n", wa->desc);
+			return -EOPNOTSUPP;
+		}
+	}
+#endif
+
 	return trace_remote_register("hypervisor", &trace_remote_callbacks, &trace_buffer, NULL, 0);
 }
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 26/30] KVM: arm64: Add trace reset to the nVHE/pKVM hyp
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

Make the hypervisor reset either the whole tracing buffer or a specific
ring-buffer, on remotes/hypervisor/trace or per_cpu/<cpu>/trace write
access.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 9c4fe3bfbfff..66abf77cf371 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -94,6 +94,7 @@ enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___tracing_enable,
 	__KVM_HOST_SMCCC_FUNC___tracing_swap_reader,
 	__KVM_HOST_SMCCC_FUNC___tracing_update_clock,
+	__KVM_HOST_SMCCC_FUNC___tracing_reset,
 };
 
 #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h
index fd641e1b1c23..44912869d184 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trace.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h
@@ -12,6 +12,7 @@ void __tracing_unload(void);
 int __tracing_enable(bool enable);
 int __tracing_swap_reader(unsigned int cpu);
 void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc);
+int __tracing_reset(unsigned int cpu);
 #else
 static inline void *tracing_reserve_entry(unsigned long length) { return NULL; }
 static inline void tracing_commit_entry(void) { }
@@ -21,5 +22,6 @@ static inline void __tracing_unload(void) { }
 static inline int __tracing_enable(bool enable) { return -ENODEV; }
 static inline int __tracing_swap_reader(unsigned int cpu) { return -ENODEV; }
 static inline void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { }
+static inline int __tracing_reset(unsigned int cpu) { return -ENODEV; }
 #endif
 #endif
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 02d5271199d5..9b05f0c87586 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -625,6 +625,13 @@ static void handle___tracing_update_clock(struct kvm_cpu_context *host_ctxt)
 	__tracing_update_clock(mult, shift, epoch_ns, epoch_cyc);
 }
 
+static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
+
+	cpu_reg(host_ctxt, 1) = __tracing_reset(cpu);
+}
+
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -671,6 +678,7 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__tracing_enable),
 	HANDLE_FUNC(__tracing_swap_reader),
 	HANDLE_FUNC(__tracing_update_clock),
+	HANDLE_FUNC(__tracing_reset),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c
index 2c8e6f49d7de..a6ca27b18e15 100644
--- a/arch/arm64/kvm/hyp/nvhe/trace.c
+++ b/arch/arm64/kvm/hyp/nvhe/trace.c
@@ -287,3 +287,20 @@ void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc)
 	/* ...we can now override the old one and swap. */
 	trace_clock_update(mult, shift, epoch_ns, epoch_cyc);
 }
+
+int __tracing_reset(unsigned int cpu)
+{
+	int ret = -ENODEV;
+
+	if (cpu >= hyp_nr_cpus)
+		return -EINVAL;
+
+	hyp_spin_lock(&trace_buffer.lock);
+
+	if (hyp_trace_buffer_loaded(&trace_buffer))
+		ret = simple_ring_buffer_reset(per_cpu_ptr(trace_buffer.simple_rbs, cpu));
+
+	hyp_spin_unlock(&trace_buffer.lock);
+
+	return ret;
+}
diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c
index 1e5fc27f0e9d..09bc192e3514 100644
--- a/arch/arm64/kvm/hyp_trace.c
+++ b/arch/arm64/kvm/hyp_trace.c
@@ -313,7 +313,7 @@ static int hyp_trace_swap_reader_page(unsigned int cpu, void *priv)
 
 static int hyp_trace_reset(unsigned int cpu, void *priv)
 {
-	return 0;
+	return kvm_call_hyp_nvhe(__tracing_reset, cpu);
 }
 
 static int hyp_trace_enable_event(unsigned short id, bool enable, void *priv)
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 27/30] KVM: arm64: Add event support to the nVHE/pKVM hyp and trace remote
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

Allow the creation of hypervisor and trace remote events with a single
macro HYP_EVENT(). That macro expands in the kernel side to add all
the required declarations (based on REMOTE_EVENT()) as well as in the
hypervisor side to create the trace_<event>() function.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 66abf77cf371..47d250436f8c 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -95,6 +95,7 @@ enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___tracing_swap_reader,
 	__KVM_HOST_SMCCC_FUNC___tracing_update_clock,
 	__KVM_HOST_SMCCC_FUNC___tracing_reset,
+	__KVM_HOST_SMCCC_FUNC___tracing_enable_event,
 };
 
 #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
diff --git a/arch/arm64/include/asm/kvm_define_hypevents.h b/arch/arm64/include/asm/kvm_define_hypevents.h
new file mode 100644
index 000000000000..77d6790252a6
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_define_hypevents.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define REMOTE_EVENT_INCLUDE_FILE arch/arm64/include/asm/kvm_hypevents.h
+
+#define REMOTE_EVENT_SECTION "_hyp_events"
+
+#define HE_STRUCT(__args)		__args
+#define HE_PRINTK(__args...)		__args
+#define he_field			re_field
+
+#define HYP_EVENT(__name, __proto, __struct, __assign, __printk) \
+	REMOTE_EVENT(__name, 0, RE_STRUCT(__struct), RE_PRINTK(__printk))
+
+#define HYP_EVENT_MULTI_READ
+#include <trace/define_remote_events.h>
+#undef HYP_EVENT_MULTI_READ
diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h
new file mode 100644
index 000000000000..d6e033c96c52
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_hypevents.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if !defined(__ARM64_KVM_HYPEVENTS_H_) || defined(HYP_EVENT_MULTI_READ)
+#define __ARM64_KVM_HYPEVENTS_H_
+
+#ifdef __KVM_NVHE_HYPERVISOR__
+#include <nvhe/trace.h>
+#endif
+
+#endif
diff --git a/arch/arm64/include/asm/kvm_hyptrace.h b/arch/arm64/include/asm/kvm_hyptrace.h
index 9c30a479bc36..de133b735f72 100644
--- a/arch/arm64/include/asm/kvm_hyptrace.h
+++ b/arch/arm64/include/asm/kvm_hyptrace.h
@@ -10,4 +10,17 @@ struct hyp_trace_desc {
 	struct trace_buffer_desc	trace_buffer_desc;
 
 };
+
+struct hyp_event_id {
+	unsigned short	id;
+	atomic_t	enabled;
+};
+
+extern struct remote_event __hyp_events_start[];
+extern struct remote_event __hyp_events_end[];
+
+/* hyp_event section used by the hypervisor */
+extern struct hyp_event_id __hyp_event_ids_start[];
+extern struct hyp_event_id __hyp_event_ids_end[];
+
 #endif
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index d7b0d12b1015..d4c7d45ae6bc 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -138,6 +138,10 @@ KVM_NVHE_ALIAS(__hyp_data_start);
 KVM_NVHE_ALIAS(__hyp_data_end);
 KVM_NVHE_ALIAS(__hyp_rodata_start);
 KVM_NVHE_ALIAS(__hyp_rodata_end);
+#ifdef CONFIG_NVHE_EL2_TRACING
+KVM_NVHE_ALIAS(__hyp_event_ids_start);
+KVM_NVHE_ALIAS(__hyp_event_ids_end);
+#endif
 
 /* pKVM static key */
 KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 2964aad0362e..4bf2a83c3448 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -13,12 +13,23 @@
 	*(__kvm_ex_table)					\
 	__stop___kvm_ex_table = .;
 
+#ifdef CONFIG_NVHE_EL2_TRACING
+#define HYPERVISOR_EVENT_IDS 					\
+	. = ALIGN(PAGE_SIZE);					\
+	__hyp_event_ids_start = .;				\
+	*(HYP_SECTION_NAME(.event_ids))				\
+	__hyp_event_ids_end = .;
+#else
+#define HYPERVISOR_EVENT_IDS
+#endif
+
 #define HYPERVISOR_RODATA_SECTIONS				\
 	HYP_SECTION_NAME(.rodata) : {				\
 		. = ALIGN(PAGE_SIZE);				\
 		__hyp_rodata_start = .;				\
 		*(HYP_SECTION_NAME(.data..ro_after_init))	\
 		*(HYP_SECTION_NAME(.rodata))			\
+		HYPERVISOR_EVENT_IDS				\
 		. = ALIGN(PAGE_SIZE);				\
 		__hyp_rodata_end = .;				\
 	}
@@ -307,6 +318,13 @@ SECTIONS
 
 	HYPERVISOR_DATA_SECTION
 
+#ifdef CONFIG_NVHE_EL2_TRACING
+	.data.hyp_events : {
+		__hyp_events_start = .;
+		*(SORT(_hyp_events.*))
+		__hyp_events_end = .;
+	}
+#endif
 	/*
 	 * Data written with the MMU off but read with the MMU on requires
 	 * cache lines to be invalidated, discarding up to a Cache Writeback
diff --git a/arch/arm64/kvm/hyp/include/nvhe/define_events.h b/arch/arm64/kvm/hyp/include/nvhe/define_events.h
new file mode 100644
index 000000000000..776d4c6cb702
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/define_events.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#undef HYP_EVENT
+#define HYP_EVENT(__name, __proto, __struct, __assign, __printk)	\
+	struct hyp_event_id hyp_event_id_##__name			\
+	__section(".hyp.event_ids."#__name) = {				\
+		.enabled = ATOMIC_INIT(0),				\
+	}
+
+#define HYP_EVENT_MULTI_READ
+#include <asm/kvm_hypevents.h>
+#undef HYP_EVENT_MULTI_READ
+
+#undef HYP_EVENT
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h
index 44912869d184..802a18b77c56 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trace.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h
@@ -1,9 +1,36 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 #ifndef __ARM64_KVM_HYP_NVHE_TRACE_H
 #define __ARM64_KVM_HYP_NVHE_TRACE_H
+
+#include <linux/trace_remote_event.h>
+
 #include <asm/kvm_hyptrace.h>
 
+#define HE_PROTO(__args...)	__args
+#define HE_ASSIGN(__args...)	__args
+#define HE_STRUCT		RE_STRUCT
+#define he_field		re_field
+
 #ifdef CONFIG_NVHE_EL2_TRACING
+
+#define HYP_EVENT(__name, __proto, __struct, __assign, __printk)		\
+	REMOTE_EVENT_FORMAT(__name, __struct);					\
+	extern struct hyp_event_id hyp_event_id_##__name;			\
+	static __always_inline void trace_##__name(__proto)			\
+	{									\
+		struct remote_event_format_##__name *__entry;			\
+		size_t length = sizeof(*__entry);				\
+										\
+		if (!atomic_read(&hyp_event_id_##__name.enabled))		\
+			return;							\
+		__entry = tracing_reserve_entry(length);			\
+		if (!__entry)							\
+			return;							\
+		__entry->hdr.id = hyp_event_id_##__name.id;			\
+		__assign							\
+		tracing_commit_entry();						\
+	}
+
 void *tracing_reserve_entry(unsigned long length);
 void tracing_commit_entry(void);
 
@@ -13,9 +40,12 @@ int __tracing_enable(bool enable);
 int __tracing_swap_reader(unsigned int cpu);
 void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc);
 int __tracing_reset(unsigned int cpu);
+int __tracing_enable_event(unsigned short id, bool enable);
 #else
 static inline void *tracing_reserve_entry(unsigned long length) { return NULL; }
 static inline void tracing_commit_entry(void) { }
+#define HYP_EVENT(__name, __proto, __struct, __assign, __printk)      \
+	static inline void trace_##__name(__proto) {}
 
 static inline int __tracing_load(unsigned long desc_va, size_t desc_size) { return -ENODEV; }
 static inline void __tracing_unload(void) { }
@@ -23,5 +53,6 @@ static inline int __tracing_enable(bool enable) { return -ENODEV; }
 static inline int __tracing_swap_reader(unsigned int cpu) { return -ENODEV; }
 static inline void __tracing_update_clock(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) { }
 static inline int __tracing_reset(unsigned int cpu) { return -ENODEV; }
+static inline int __tracing_enable_event(unsigned short id, bool enable)  { return -ENODEV; }
 #endif
 #endif
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index f1840628d2d6..143d55ec7298 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -29,7 +29,7 @@ hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
 	 ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
 hyp-obj-y += ../../../kernel/smccc-call.o
 hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
-hyp-obj-$(CONFIG_NVHE_EL2_TRACING) += clock.o trace.o
+hyp-obj-$(CONFIG_NVHE_EL2_TRACING) += clock.o trace.o events.o
 hyp-obj-y += $(lib-objs)
 
 # Path to simple_ring_buffer.c
diff --git a/arch/arm64/kvm/hyp/nvhe/events.c b/arch/arm64/kvm/hyp/nvhe/events.c
new file mode 100644
index 000000000000..add9383aadb5
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/events.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google LLC
+ * Author: Vincent Donnefort <vdonnefort@google.com>
+ */
+
+#include <nvhe/mm.h>
+#include <nvhe/trace.h>
+
+#include <nvhe/define_events.h>
+
+int __tracing_enable_event(unsigned short id, bool enable)
+{
+	struct hyp_event_id *event_id = &__hyp_event_ids_start[id];
+	atomic_t *enabled;
+
+	if (event_id >= __hyp_event_ids_end)
+		return -EINVAL;
+
+	enabled = hyp_fixmap_map(__hyp_pa(&event_id->enabled));
+	atomic_set(enabled, enable);
+	hyp_fixmap_unmap();
+
+	return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 9b05f0c87586..fc5953f31b4b 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -632,6 +632,14 @@ static void handle___tracing_reset(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) = __tracing_reset(cpu);
 }
 
+static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(unsigned short, id, host_ctxt, 1);
+	DECLARE_REG(bool, enable, host_ctxt, 2);
+
+	cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable);
+}
+
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -679,6 +687,7 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__tracing_swap_reader),
 	HANDLE_FUNC(__tracing_update_clock),
 	HANDLE_FUNC(__tracing_reset),
+	HANDLE_FUNC(__tracing_enable_event),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index d724f6d69302..7a02837203d1 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -16,6 +16,12 @@ SECTIONS {
 	HYP_SECTION(.text)
 	HYP_SECTION(.data..ro_after_init)
 	HYP_SECTION(.rodata)
+#ifdef CONFIG_NVHE_EL2_TRACING
+	. = ALIGN(PAGE_SIZE);
+	BEGIN_HYP_SECTION(.event_ids)
+		*(SORT(.hyp.event_ids.*))
+	END_HYP_SECTION
+#endif
 
 	/*
 	 * .hyp..data..percpu needs to be page aligned to maintain the same
diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c
index 09bc192e3514..0144cd26703e 100644
--- a/arch/arm64/kvm/hyp_trace.c
+++ b/arch/arm64/kvm/hyp_trace.c
@@ -318,6 +318,25 @@ static int hyp_trace_reset(unsigned int cpu, void *priv)
 
 static int hyp_trace_enable_event(unsigned short id, bool enable, void *priv)
 {
+	struct hyp_event_id *event_id = lm_alias(&__hyp_event_ids_start[id]);
+	struct page *page;
+	atomic_t *enabled;
+	void *map;
+
+	if (is_protected_kvm_enabled())
+		return kvm_call_hyp_nvhe(__tracing_enable_event, id, enable);
+
+	enabled = &event_id->enabled;
+	page = virt_to_page(enabled);
+	map = vmap(&page, 1, VM_MAP, PAGE_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	enabled = map + offset_in_page(enabled);
+	atomic_set(enabled, enable);
+
+	vunmap(map);
+
 	return 0;
 }
 
@@ -345,6 +364,19 @@ static struct trace_remote_callbacks trace_remote_callbacks = {
 	.enable_event		= hyp_trace_enable_event,
 };
 
+#include <asm/kvm_define_hypevents.h>
+
+static void __init hyp_trace_init_events(void)
+{
+	struct hyp_event_id *hyp_event_id = __hyp_event_ids_start;
+	struct remote_event *event = __hyp_events_start;
+	int id = 0;
+
+	/* Events on both sides hypervisor are sorted */
+	for (; event < __hyp_events_end; event++, hyp_event_id++, id++)
+		event->id = hyp_event_id->id = id;
+}
+
 int __init kvm_hyp_trace_init(void)
 {
 	int cpu;
@@ -364,5 +396,8 @@ int __init kvm_hyp_trace_init(void)
 	}
 #endif
 
-	return trace_remote_register("hypervisor", &trace_remote_callbacks, &trace_buffer, NULL, 0);
+	hyp_trace_init_events();
+
+	return trace_remote_register("hypervisor", &trace_remote_callbacks, &trace_buffer,
+				     __hyp_events_start, __hyp_events_end - __hyp_events_start);
 }
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 28/30] KVM: arm64: Add hyp_enter/hyp_exit events to nVHE/pKVM hyp
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

The hyp_enter and hyp_exit events are logged by the hypervisor any time
it is entered and exited.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 2ca264b3db5f..b50ac3bb4bc9 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -920,6 +920,9 @@ struct kvm_vcpu_arch {
 
 	/* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */
 	struct vncr_tlb	*vncr_tlb;
+
+	/* Hyp-readable copy of kvm_vcpu::pid */
+	pid_t pid;
 };
 
 /*
diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h
index d6e033c96c52..221a1dacb2f0 100644
--- a/arch/arm64/include/asm/kvm_hypevents.h
+++ b/arch/arm64/include/asm/kvm_hypevents.h
@@ -7,4 +7,43 @@
 #include <nvhe/trace.h>
 #endif
 
+#ifndef __HYP_ENTER_EXIT_REASON
+#define __HYP_ENTER_EXIT_REASON
+enum hyp_enter_exit_reason {
+	HYP_REASON_SMC,
+	HYP_REASON_HVC,
+	HYP_REASON_PSCI,
+	HYP_REASON_HOST_ABORT,
+	HYP_REASON_GUEST_EXIT,
+	HYP_REASON_ERET_HOST,
+	HYP_REASON_ERET_GUEST,
+	HYP_REASON_UNKNOWN	/* Must be last */
+};
+#endif
+
+HYP_EVENT(hyp_enter,
+	HE_PROTO(struct kvm_cpu_context *host_ctxt, u8 reason),
+	HE_STRUCT(
+		he_field(u8, reason)
+		he_field(pid_t, vcpu)
+	),
+	HE_ASSIGN(
+		__entry->reason = reason;
+		__entry->vcpu = __tracing_get_vcpu_pid(host_ctxt);
+	),
+	HE_PRINTK("reason=%s vcpu=%d", __hyp_enter_exit_reason_str(__entry->reason), __entry->vcpu)
+);
+
+HYP_EVENT(hyp_exit,
+	HE_PROTO(struct kvm_cpu_context *host_ctxt, u8 reason),
+	HE_STRUCT(
+		he_field(u8, reason)
+		he_field(pid_t, vcpu)
+	),
+	HE_ASSIGN(
+		__entry->reason = reason;
+		__entry->vcpu = __tracing_get_vcpu_pid(host_ctxt);
+	),
+	HE_PRINTK("reason=%s vcpu=%d", __hyp_enter_exit_reason_str(__entry->reason), __entry->vcpu)
+);
 #endif
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 87a3d28d5f0f..04c43c9eb764 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -707,6 +707,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
 		vcpu_set_on_unsupported_cpu(vcpu);
+
+	vcpu->arch.pid = pid_nr(vcpu->pid);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h b/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h
new file mode 100644
index 000000000000..1258bc84477f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/arm-smccc.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__
+#define __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__
+
+#include <asm/kvm_hypevents.h>
+
+#include <linux/arm-smccc.h>
+
+#define hyp_smccc_1_1_smc(...)					\
+	do {							\
+		trace_hyp_exit(NULL, HYP_REASON_SMC);		\
+		arm_smccc_1_1_smc(__VA_ARGS__);			\
+		trace_hyp_enter(NULL, HYP_REASON_SMC);		\
+	} while (0)
+
+#define hyp_smccc_1_2_smc(...)					\
+	do {							\
+		trace_hyp_exit(NULL, HYP_REASON_SMC);		\
+		arm_smccc_1_2_smc(__VA_ARGS__);			\
+		trace_hyp_enter(NULL, HYP_REASON_SMC);		\
+	} while (0)
+
+#endif /* __ARM64_KVM_HYP_NVHE_ARM_SMCCC_H__ */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h
index 802a18b77c56..8813ff250f8e 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/trace.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h
@@ -6,6 +6,18 @@
 
 #include <asm/kvm_hyptrace.h>
 
+static inline pid_t __tracing_get_vcpu_pid(struct kvm_cpu_context *host_ctxt)
+{
+	struct kvm_vcpu *vcpu;
+
+	if (!host_ctxt)
+		host_ctxt = host_data_ptr(host_ctxt);
+
+	vcpu = host_ctxt->__hyp_running_vcpu;
+
+	return vcpu ? vcpu->arch.pid : 0;
+}
+
 #define HE_PROTO(__args...)	__args
 #define HE_ASSIGN(__args...)	__args
 #define HE_STRUCT		RE_STRUCT
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index 94161ea1cd60..1af722771178 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -26,10 +26,10 @@
  * the duration and are therefore serialised.
  */
 
-#include <linux/arm-smccc.h>
 #include <linux/arm_ffa.h>
 #include <asm/kvm_pkvm.h>
 
+#include <nvhe/arm-smccc.h>
 #include <nvhe/ffa.h>
 #include <nvhe/mem_protect.h>
 #include <nvhe/memory.h>
@@ -147,7 +147,7 @@ static int ffa_map_hyp_buffers(u64 ffa_page_count)
 {
 	struct arm_smccc_1_2_regs res;
 
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_FN64_RXTX_MAP,
 		.a1 = hyp_virt_to_phys(hyp_buffers.tx),
 		.a2 = hyp_virt_to_phys(hyp_buffers.rx),
@@ -161,7 +161,7 @@ static int ffa_unmap_hyp_buffers(void)
 {
 	struct arm_smccc_1_2_regs res;
 
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_RXTX_UNMAP,
 		.a1 = HOST_FFA_ID,
 	}, &res);
@@ -172,7 +172,7 @@ static int ffa_unmap_hyp_buffers(void)
 static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
 			     u32 handle_hi, u32 fraglen, u32 endpoint_id)
 {
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_MEM_FRAG_TX,
 		.a1 = handle_lo,
 		.a2 = handle_hi,
@@ -184,7 +184,7 @@ static void ffa_mem_frag_tx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
 static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
 			     u32 handle_hi, u32 fragoff)
 {
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_MEM_FRAG_RX,
 		.a1 = handle_lo,
 		.a2 = handle_hi,
@@ -196,7 +196,7 @@ static void ffa_mem_frag_rx(struct arm_smccc_1_2_regs *res, u32 handle_lo,
 static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len,
 			  u32 fraglen)
 {
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = func_id,
 		.a1 = len,
 		.a2 = fraglen,
@@ -206,7 +206,7 @@ static void ffa_mem_xfer(struct arm_smccc_1_2_regs *res, u64 func_id, u32 len,
 static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo,
 			     u32 handle_hi, u32 flags)
 {
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_MEM_RECLAIM,
 		.a1 = handle_lo,
 		.a2 = handle_hi,
@@ -216,7 +216,7 @@ static void ffa_mem_reclaim(struct arm_smccc_1_2_regs *res, u32 handle_lo,
 
 static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len)
 {
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_FN64_MEM_RETRIEVE_REQ,
 		.a1 = len,
 		.a2 = len,
@@ -225,7 +225,7 @@ static void ffa_retrieve_req(struct arm_smccc_1_2_regs *res, u32 len)
 
 static void ffa_rx_release(struct arm_smccc_1_2_regs *res)
 {
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_RX_RELEASE,
 	}, res);
 }
@@ -728,7 +728,7 @@ static int hyp_ffa_post_init(void)
 	size_t min_rxtx_sz;
 	struct arm_smccc_1_2_regs res;
 
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
 		.a0 = FFA_ID_GET,
 	}, &res);
 	if (res.a0 != FFA_SUCCESS)
@@ -737,7 +737,7 @@ static int hyp_ffa_post_init(void)
 	if (res.a2 != HOST_FFA_ID)
 		return -EINVAL;
 
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs){
 		.a0 = FFA_FEATURES,
 		.a1 = FFA_FN64_RXTX_MAP,
 	}, &res);
@@ -788,7 +788,7 @@ static void do_ffa_version(struct arm_smccc_1_2_regs *res,
 	 * first if TEE supports it.
 	 */
 	if (FFA_MINOR_VERSION(ffa_req_version) < FFA_MINOR_VERSION(hyp_ffa_version)) {
-		arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+		hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 			.a0 = FFA_VERSION,
 			.a1 = ffa_req_version,
 		}, res);
@@ -824,7 +824,7 @@ static void do_ffa_part_get(struct arm_smccc_1_2_regs *res,
 		goto out_unlock;
 	}
 
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_PARTITION_INFO_GET,
 		.a1 = uuid0,
 		.a2 = uuid1,
@@ -939,7 +939,7 @@ int hyp_ffa_init(void *pages)
 	if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
 		return 0;
 
-	arm_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
+	hyp_smccc_1_2_smc(&(struct arm_smccc_1_2_regs) {
 		.a0 = FFA_VERSION,
 		.a1 = FFA_VERSION_1_2,
 	}, &res);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index fc5953f31b4b..547d63679022 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -12,6 +12,7 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
 #include <asm/kvm_mmu.h>
 
 #include <nvhe/ffa.h>
@@ -137,6 +138,8 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
 	hyp_vcpu->vcpu.arch.vsesr_el2	= host_vcpu->arch.vsesr_el2;
 
 	hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3;
+
+	hyp_vcpu->vcpu.arch.pid = host_vcpu->arch.pid;
 }
 
 static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
@@ -728,7 +731,9 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
 
 static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt)
 {
+	trace_hyp_exit(host_ctxt, HYP_REASON_SMC);
 	__kvm_hyp_host_forward_smc(host_ctxt);
+	trace_hyp_enter(host_ctxt, HYP_REASON_SMC);
 }
 
 static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
@@ -815,15 +820,19 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
 {
 	u64 esr = read_sysreg_el2(SYS_ESR);
 
+
 	switch (ESR_ELx_EC(esr)) {
 	case ESR_ELx_EC_HVC64:
+		trace_hyp_enter(host_ctxt, HYP_REASON_HVC);
 		handle_host_hcall(host_ctxt);
 		break;
 	case ESR_ELx_EC_SMC64:
+		trace_hyp_enter(host_ctxt, HYP_REASON_SMC);
 		handle_host_smc(host_ctxt);
 		break;
 	case ESR_ELx_EC_IABT_LOW:
 	case ESR_ELx_EC_DABT_LOW:
+		trace_hyp_enter(host_ctxt, HYP_REASON_HOST_ABORT);
 		handle_host_mem_abort(host_ctxt);
 		break;
 	case ESR_ELx_EC_SYS64:
@@ -833,4 +842,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
 	default:
 		BUG();
 	}
+
+	trace_hyp_exit(host_ctxt, HYP_REASON_ERET_HOST);
 }
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
index c3e196fb8b18..ab4c7bddb163 100644
--- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c
+++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c
@@ -6,11 +6,12 @@
 
 #include <asm/kvm_asm.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
 #include <asm/kvm_mmu.h>
-#include <linux/arm-smccc.h>
 #include <linux/kvm_host.h>
 #include <uapi/linux/psci.h>
 
+#include <nvhe/arm-smccc.h>
 #include <nvhe/memory.h>
 #include <nvhe/trap_handler.h>
 
@@ -65,7 +66,7 @@ static unsigned long psci_call(unsigned long fn, unsigned long arg0,
 {
 	struct arm_smccc_res res;
 
-	arm_smccc_1_1_smc(fn, arg0, arg1, arg2, &res);
+	hyp_smccc_1_1_smc(fn, arg0, arg1, arg2, &res);
 	return res.a0;
 }
 
@@ -206,6 +207,7 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
 	struct kvm_cpu_context *host_ctxt;
 
 	host_ctxt = host_data_ptr(host_ctxt);
+	trace_hyp_enter(host_ctxt, HYP_REASON_PSCI);
 
 	if (is_cpu_on)
 		boot_args = this_cpu_ptr(&cpu_on_args);
@@ -221,6 +223,7 @@ asmlinkage void __noreturn __kvm_host_psci_cpu_entry(bool is_cpu_on)
 	write_sysreg_el1(INIT_SCTLR_EL1_MMU_OFF, SYS_SCTLR);
 	write_sysreg(INIT_PSTATE_EL1, SPSR_EL2);
 
+	trace_hyp_exit(host_ctxt, HYP_REASON_PSCI);
 	__host_enter(host_ctxt);
 }
 
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 779089e42681..ca60721501d1 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -7,7 +7,6 @@
 #include <hyp/switch.h>
 #include <hyp/sysreg-sr.h>
 
-#include <linux/arm-smccc.h>
 #include <linux/kvm_host.h>
 #include <linux/types.h>
 #include <linux/jump_label.h>
@@ -21,6 +20,7 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_hypevents.h>
 #include <asm/kvm_mmu.h>
 #include <asm/fpsimd.h>
 #include <asm/debug-monitors.h>
@@ -308,10 +308,13 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	__debug_switch_to_guest(vcpu);
 
 	do {
+		trace_hyp_exit(host_ctxt, HYP_REASON_ERET_GUEST);
+
 		/* Jump in the fire! */
 		exit_code = __guest_enter(vcpu);
 
 		/* And we're baaack! */
+		trace_hyp_enter(host_ctxt, HYP_REASON_GUEST_EXIT);
 	} while (fixup_guest_exit(vcpu, &exit_code));
 
 	__sysreg_save_state_nvhe(guest_ctxt);
diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c
index 0144cd26703e..1ad6a55ba95c 100644
--- a/arch/arm64/kvm/hyp_trace.c
+++ b/arch/arm64/kvm/hyp_trace.c
@@ -364,8 +364,26 @@ static struct trace_remote_callbacks trace_remote_callbacks = {
 	.enable_event		= hyp_trace_enable_event,
 };
 
+static const char *__hyp_enter_exit_reason_str(u8 reason);
+
 #include <asm/kvm_define_hypevents.h>
 
+static const char *__hyp_enter_exit_reason_str(u8 reason)
+{
+	static const char strs[][12] = {
+		"smc",
+		"hvc",
+		"psci",
+		"host_abort",
+		"guest_exit",
+		"eret_host",
+		"eret_guest",
+		"unknown",
+	};
+
+	return strs[min(reason, HYP_REASON_UNKNOWN)];
+}
+
 static void __init hyp_trace_init_events(void)
 {
 	struct hyp_event_id *hyp_event_id = __hyp_event_ids_start;
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 29/30] KVM: arm64: Add selftest event support to nVHE/pKVM hyp
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

Add a selftest event that can be triggered from a `write_event` tracefs
file. This intends to be used by trace remote selftests.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 47d250436f8c..c8eb992d3ac8 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -96,6 +96,7 @@ enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___tracing_update_clock,
 	__KVM_HOST_SMCCC_FUNC___tracing_reset,
 	__KVM_HOST_SMCCC_FUNC___tracing_enable_event,
+	__KVM_HOST_SMCCC_FUNC___tracing_write_event,
 };
 
 #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h
index 221a1dacb2f0..743c49bd878f 100644
--- a/arch/arm64/include/asm/kvm_hypevents.h
+++ b/arch/arm64/include/asm/kvm_hypevents.h
@@ -46,4 +46,15 @@ HYP_EVENT(hyp_exit,
 	),
 	HE_PRINTK("reason=%s vcpu=%d", __hyp_enter_exit_reason_str(__entry->reason), __entry->vcpu)
 );
+
+HYP_EVENT(selftest,
+	HE_PROTO(u64 id),
+	HE_STRUCT(
+		he_field(u64, id)
+	),
+	HE_ASSIGN(
+		__entry->id = id;
+	),
+	RE_PRINTK("id=%llu", __entry->id)
+);
 #endif
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 547d63679022..eff9cb208627 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -643,6 +643,13 @@ static void handle___tracing_enable_event(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) = __tracing_enable_event(id, enable);
 }
 
+static void handle___tracing_write_event(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(u64, id, host_ctxt, 1);
+
+	trace_selftest(id);
+}
+
 typedef void (*hcall_t)(struct kvm_cpu_context *);
 
 #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -691,6 +698,7 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__tracing_update_clock),
 	HANDLE_FUNC(__tracing_reset),
 	HANDLE_FUNC(__tracing_enable_event),
+	HANDLE_FUNC(__tracing_write_event),
 };
 
 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c
index 1ad6a55ba95c..c1e28f6581ab 100644
--- a/arch/arm64/kvm/hyp_trace.c
+++ b/arch/arm64/kvm/hyp_trace.c
@@ -348,8 +348,30 @@ static int hyp_trace_clock_show(struct seq_file *m, void *v)
 }
 DEFINE_SHOW_ATTRIBUTE(hyp_trace_clock);
 
+static ssize_t hyp_trace_write_event_write(struct file *f, const char __user *ubuf,
+					   size_t cnt, loff_t *pos)
+{
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	kvm_call_hyp_nvhe(__tracing_write_event, val);
+
+	return cnt;
+}
+
+static const struct file_operations hyp_trace_write_event_fops = {
+	.write	= hyp_trace_write_event_write,
+};
+
 static int hyp_trace_init_tracefs(struct dentry *d, void *priv)
 {
+	if (!tracefs_create_file("write_event", 0200, d, NULL, &hyp_trace_write_event_fops))
+		return -ENOMEM;
+
 	return tracefs_create_file("trace_clock", 0440, d, NULL, &hyp_trace_clock_fops) ?
 		0 : -ENOMEM;
 }
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* [PATCH v14 30/30] tracing: selftests: Add hypervisor trace remote tests
From: Vincent Donnefort @ 2026-03-09 16:25 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, linux-trace-kernel, maz,
	oliver.upton, joey.gouly, suzuki.poulose, yuzenghui
  Cc: kvmarm, linux-arm-kernel, jstultz, qperret, will, aneesh.kumar,
	kernel-team, linux-kernel, Vincent Donnefort, Shuah Khan,
	linux-kselftest
In-Reply-To: <20260309162516.2623589-1-vdonnefort@google.com>

Run the trace remote selftests with the trace remote 'hypervisor', This
trace remote is most likely created when the arm64 KVM nVHE/pKVM
hypervisor is in use.

Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: linux-kselftest@vger.kernel.org
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc
new file mode 100644
index 000000000000..64bf859d6406
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/buffer_size.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor trace buffer size
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/buffer_size.tc
+
+set -e
+setup_remote "hypervisor"
+test_buffer_size
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc
new file mode 100644
index 000000000000..7fe3b09b34e3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/reset.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor trace buffer reset
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/reset.tc
+
+set -e
+setup_remote "hypervisor"
+test_reset
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc
new file mode 100644
index 000000000000..b937c19ca7f9
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor non-consuming trace read
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/trace.tc
+
+set -e
+setup_remote "hypervisor"
+test_trace
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc
new file mode 100644
index 000000000000..66aa1b76c147
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/trace_pipe.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor consuming trace read
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/trace_pipe.tc
+
+set -e
+setup_remote "hypervisor"
+test_trace_pipe
diff --git a/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc
new file mode 100644
index 000000000000..1dafde3414ab
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/remotes/hypervisor/unloading.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test hypervisor trace buffer unloading
+# requires: remotes/hypervisor/write_event
+
+SOURCE_REMOTE_TEST=1
+. $TEST_DIR/remotes/unloading.tc
+
+set -e
+setup_remote "hypervisor"
+test_unloading
-- 
2.53.0.473.g4a7958ca14-goog


^ permalink raw reply related

* Re: [PATCH v3 16/18] rtla/trace: Fix I/O handling in save_trace_to_file()
From: Wander Lairson Costa @ 2026-03-09 16:46 UTC (permalink / raw)
  To: Tomas Glozar
  Cc: Steven Rostedt, Ivan Pravdin, Crystal Wood, Costa Shulyupin,
	John Kacur, Haiyong Sun, Tiezhu Yang, Daniel Wagner,
	Daniel Bristot de Oliveira,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:BPF [MISC]:Keyword:(?:b|_)bpf(?:b|_)
In-Reply-To: <CAP4=nvSBEgfJg8qFYakiSfhE-Cp3RF9OoT2NR2qO+oRk+1BGEg@mail.gmail.com>

On Wed, Mar 04, 2026 at 11:30:13AM +0100, Tomas Glozar wrote:
> čt 15. 1. 2026 v 18:29 odesílatel Wander Lairson Costa
> <wander@redhat.com> napsal:
> > diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c
> > index fed3362527b08..8e93b48d33ef8 100644
> > --- a/tools/tracing/rtla/src/trace.c
> > +++ b/tools/tracing/rtla/src/trace.c
> > @@ -73,6 +73,7 @@ int save_trace_to_file(struct tracefs_instance *inst, const char *filename)
> >         char buffer[4096];
> >         int out_fd, in_fd;
> >         int retval = -1;
> > +       ssize_t n_read;
> >
> >         if (!inst || !filename)
> >                 return 0;
> > @@ -90,15 +91,30 @@ int save_trace_to_file(struct tracefs_instance *inst, const char *filename)
> >                 goto out_close_in;
> >         }
> >
> > -       do {
> > -               retval = read(in_fd, buffer, sizeof(buffer));
> > -               if (retval <= 0)
> > +       for (;;) {
> > +               n_read = read(in_fd, buffer, sizeof(buffer));
> > +               if (n_read < 0) {
> > +                       if (errno == EINTR)
> > +                               continue;
> > +                       err_msg("Error reading trace file: %s\n", strerror(errno));
> >                         goto out_close;
> > +               }
> > +               if (n_read == 0)
> > +                       break;
> >
> > -               retval = write(out_fd, buffer, retval);
> > -               if (retval < 0)
> > -                       goto out_close;
> > -       } while (retval > 0);
> > +               ssize_t n_written = 0;
> 
> Why break the style of declaring all variables at the beginning of the
> function? n_read, added in the same commit, keeps the style.
> 
> This also applies to the previous patch.

In this case `n_written` is harmless and I see no problem in moving it to the
beginning of the function.

For `written` and `w`, declaring them where they are right now brings the
benefit of making them const correct. If we move them to the top of the
function, we lose that. Since the kernel moved to gnu11, the C89 top-of-block
declarations are relaxed. Keeping them at the point of initialization minimizes
their scope and documents the intent. This helps avoid programmer mistakes.

However, I fail to follow my own advice more than I am willing to admit.

> 
> Tomas
> 


^ permalink raw reply

* Re: [PATCH v3 12/18] rtla: Enforce exact match for time unit suffixes
From: Wander Lairson Costa @ 2026-03-09 17:15 UTC (permalink / raw)
  To: Tomas Glozar
  Cc: Steven Rostedt, Crystal Wood, Ivan Pravdin, Costa Shulyupin,
	John Kacur, Tiezhu Yang, Daniel Wagner,
	Daniel Bristot de Oliveira,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:BPF [MISC]:Keyword:(?:b|_)bpf(?:b|_)
In-Reply-To: <CAP4=nvTZ-k0QGsr_oyGmsr+gy6vbjmKv6TDVnUzQw=Av8YkEUQ@mail.gmail.com>

On Wed, Mar 04, 2026 at 02:57:26PM +0100, Tomas Glozar wrote:
> čt 15. 1. 2026 v 18:28 odesílatel Wander Lairson Costa
> <wander@redhat.com> napsal:
> >
> > The parse_ns_duration() function currently uses prefix matching for
> > detecting time units. This approach is problematic as it silently
> > accepts malformed strings such as "100nsx" or "100us_invalid" by
> > ignoring the trailing characters, leading to potential configuration
> > errors.
> >
> > Switch to using strcmp() for suffix comparison to enforce exact matches.
> > This ensures that the parser strictly validates the time unit and
> > rejects any input containing invalid trailing characters, thereby
> > improving the robustness of the configuration parsing.
> 
> This solution is incorrect. We need to be able to parse deadline
> priority correctly, whose format includes two suffixes:
> 
> d:runtime[us|ms|s]:period[us|ms|s]
> (see manpages)
> 
> and is parsed like this:
> 
> int parse_prio(char *arg, struct sched_attr *sched_param)
> {
> ...
>     switch (arg[0]) {
>     case 'd':
>     case 'D':
>         /* d:runtime:period */
>         if (strlen(arg) < 4)
>             return -1;
> 
>         runtime = get_long_ns_after_colon(arg);
>         if (runtime == INVALID_VAL)
>             return -1;
> 
>         period = get_long_ns_after_colon(&arg[2]);
>         if (period == INVALID_VAL)
>             return -1;
> ...
> 
> Your commit breaks that:
> 
> $ rtla timerlat -P d:10ms:100ms
> Invalid -P priority

You're right, I will fix the bug.

> 
> Tomas
> 


^ permalink raw reply

* Re: [PATCH v3 12/18] rtla: Enforce exact match for time unit suffixes
From: Wander Lairson Costa @ 2026-03-09 17:17 UTC (permalink / raw)
  To: Tomas Glozar
  Cc: Steven Rostedt, Crystal Wood, Ivan Pravdin, Costa Shulyupin,
	John Kacur, Tiezhu Yang, Daniel Wagner,
	Daniel Bristot de Oliveira,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:BPF [MISC]:Keyword:(?:b|_)bpf(?:b|_)
In-Reply-To: <CAP4=nvQ6EeecMggeVciOaW_-=sa2QJ2r9DX0Xifzgsv5JBhAAg@mail.gmail.com>

On Tue, Mar 03, 2026 at 03:27:35PM +0100, Tomas Glozar wrote:
> čt 15. 1. 2026 v 18:28 odesílatel Wander Lairson Costa
> <wander@redhat.com> napsal:
> >
> > The parse_ns_duration() function currently uses prefix matching for
> > detecting time units. This approach is problematic as it silently
> > accepts malformed strings such as "100nsx" or "100us_invalid" by
> > ignoring the trailing characters, leading to potential configuration
> > errors.
> >
> > Switch to using strcmp() for suffix comparison to enforce exact matches.
> > This ensures that the parser strictly validates the time unit and
> > rejects any input containing invalid trailing characters, thereby
> > improving the robustness of the configuration parsing.
> >
> 
> Why not use your strncmp_static() helper to protect from overrun?

As I said before, sometime I fail to follow my own rules *facepalm*.
> 
> > Signed-off-by: Wander Lairson Costa <wander@redhat.com>
> > ---
> >  tools/tracing/rtla/src/utils.c | 8 ++++----
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
> > index 486d96e8290fb..b029fe5970c31 100644
> > --- a/tools/tracing/rtla/src/utils.c
> > +++ b/tools/tracing/rtla/src/utils.c
> > @@ -211,15 +211,15 @@ long parse_ns_duration(char *val)
> >         t = strtol(val, &end, 10);
> >
> >         if (end) {
> > -               if (!strncmp(end, "ns", 2)) {
> > +               if (strcmp(end, "ns") == 0) {
> >                         return t;
> > -               } else if (!strncmp(end, "us", 2)) {
> > +               } else if (strcmp(end, "us") == 0) {
> >                         t *= 1000;
> >                         return t;
> > -               } else if (!strncmp(end, "ms", 2)) {
> > +               } else if (strcmp(end, "ms") == 0) {
> >                         t *= 1000 * 1000;
> >                         return t;
> > -               } else if (!strncmp(end, "s", 1)) {
> > +               } else if (strcmp(end, "s") == 0) {
> >                         t *= 1000 * 1000 * 1000;
> >                         return t;
> >                 }
> > --
> > 2.52.0
> >
> 
> Tomas
> 


^ permalink raw reply

* Re: [PATCH v3 00/12] vfs: change inode->i_ino from unsigned long to u64
From: Mimi Zohar @ 2026-03-09 17:47 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-fsdevel, linux-kernel, linux-trace-kernel, nvdimm, fsverity,
	linux-mm, netfs, linux-ext4, linux-f2fs-devel, linux-nfs,
	linux-cifs, samba-technical, linux-nilfs, v9fs, linux-afs, autofs,
	ceph-devel, codalist, ecryptfs, linux-mtd, jfs-discussion, ntfs3,
	ocfs2-devel, devel, linux-unionfs, apparmor,
	linux-security-module, linux-integrity, selinux, amd-gfx,
	dri-devel, linux-media, linaro-mm-sig, netdev, linux-perf-users,
	linux-fscrypt, linux-xfs, linux-hams, linux-x25, audit,
	linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <20260304-iino-u64-v3-0-2257ad83d372@kernel.org>

[ I/O socket time out.  Trimming the To list.]

On Wed, 2026-03-04 at 10:32 -0500, Jeff Layton wrote:
> This version squashes all of the format-string changes and the i_ino
> type change into the same patch. This results in a giant 600+ line patch
> at the end of the series, but it does remain bisectable.  Because the
> patchset was reorganized (again) some of the R-b's and A-b's have been
> dropped.
> 
> The entire pile is in the "iino-u64" branch of my tree, if anyone is
> interested in testing this.
> 
>     https://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git/
> 
> Original cover letter follows:
> 
> ----------------------8<-----------------------
> 
> Christian said [1] to "just do it" when I proposed this, so here we are!
> 
> For historical reasons, the inode->i_ino field is an unsigned long,
> which means that it's 32 bits on 32 bit architectures. This has caused a
> number of filesystems to implement hacks to hash a 64-bit identifier
> into a 32-bit field, and deprives us of a universal identifier field for
> an inode.
> 
> This patchset changes the inode->i_ino field from an unsigned long to a
> u64. This shouldn't make any material difference on 64-bit hosts, but
> 32-bit hosts will see struct inode grow by at least 4 bytes. This could
> have effects on slabcache sizes and field alignment.
> 
> The bulk of the changes are to format strings and tracepoints, since the
> kernel itself doesn't care that much about the i_ino field. The first
> patch changes some vfs function arguments, so check that one out
> carefully.
> 
> With this change, we may be able to shrink some inode structures. For
> instance, struct nfs_inode has a fileid field that holds the 64-bit
> inode number. With this set of changes, that field could be eliminated.
> I'd rather leave that sort of cleanups for later just to keep this
> simple.
> 
> Much of this set was generated by LLM, but I attributed it to myself
> since I consider this to be in the "menial tasks" category of LLM usage.
> 
> [1]: https://lore.kernel.org/linux-fsdevel/20260219-portrait-winkt-959070cee42f@brauner/
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>

Jeff, missing from this patch set is EVM.  In hmac_add_misc() EVM copies the
i_ino and calculates either an HMAC or file meta-data hash, which is then
signed. 

Mimi

^ permalink raw reply

* Re: [PATCH v3 00/12] vfs: change inode->i_ino from unsigned long to u64
From: Jeff Layton @ 2026-03-09 17:59 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: linux-fsdevel, linux-kernel, linux-trace-kernel, nvdimm, fsverity,
	linux-mm, netfs, linux-ext4, linux-f2fs-devel, linux-nfs,
	linux-cifs, samba-technical, linux-nilfs, v9fs, linux-afs, autofs,
	ceph-devel, codalist, ecryptfs, linux-mtd, jfs-discussion, ntfs3,
	ocfs2-devel, devel, linux-unionfs, apparmor,
	linux-security-module, linux-integrity, selinux, amd-gfx,
	dri-devel, linux-media, linaro-mm-sig, netdev, linux-perf-users,
	linux-fscrypt, linux-xfs, linux-hams, linux-x25, audit,
	linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <05b5d55c49b5a1bbc43a5315e3c84872e7e634b3.camel@linux.ibm.com>

On Mon, 2026-03-09 at 13:47 -0400, Mimi Zohar wrote:
> [ I/O socket time out.  Trimming the To list.]
> 
> On Wed, 2026-03-04 at 10:32 -0500, Jeff Layton wrote:
> > This version squashes all of the format-string changes and the i_ino
> > type change into the same patch. This results in a giant 600+ line patch
> > at the end of the series, but it does remain bisectable.  Because the
> > patchset was reorganized (again) some of the R-b's and A-b's have been
> > dropped.
> > 
> > The entire pile is in the "iino-u64" branch of my tree, if anyone is
> > interested in testing this.
> > 
> >     https://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git/
> > 
> > Original cover letter follows:
> > 
> > ----------------------8<-----------------------
> > 
> > Christian said [1] to "just do it" when I proposed this, so here we are!
> > 
> > For historical reasons, the inode->i_ino field is an unsigned long,
> > which means that it's 32 bits on 32 bit architectures. This has caused a
> > number of filesystems to implement hacks to hash a 64-bit identifier
> > into a 32-bit field, and deprives us of a universal identifier field for
> > an inode.
> > 
> > This patchset changes the inode->i_ino field from an unsigned long to a
> > u64. This shouldn't make any material difference on 64-bit hosts, but
> > 32-bit hosts will see struct inode grow by at least 4 bytes. This could
> > have effects on slabcache sizes and field alignment.
> > 
> > The bulk of the changes are to format strings and tracepoints, since the
> > kernel itself doesn't care that much about the i_ino field. The first
> > patch changes some vfs function arguments, so check that one out
> > carefully.
> > 
> > With this change, we may be able to shrink some inode structures. For
> > instance, struct nfs_inode has a fileid field that holds the 64-bit
> > inode number. With this set of changes, that field could be eliminated.
> > I'd rather leave that sort of cleanups for later just to keep this
> > simple.
> > 
> > Much of this set was generated by LLM, but I attributed it to myself
> > since I consider this to be in the "menial tasks" category of LLM usage.
> > 
> > [1]: https://lore.kernel.org/linux-fsdevel/20260219-portrait-winkt-959070cee42f@brauner/
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> 
> Jeff, missing from this patch set is EVM.  In hmac_add_misc() EVM copies the
> i_ino and calculates either an HMAC or file meta-data hash, which is then
> signed. 
> 
> 

Thanks Mimi, good catch.

It looks like we should just be able to change the ino field to a u64
alongside everything else. Something like this:

diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index c0ca4eedb0fe..77b6c2fa345e 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -144,7 +144,7 @@ static void hmac_add_misc(struct shash_desc *desc, struct inode *inode,
                          char type, char *digest)
 {
        struct h_misc {
-               unsigned long ino;
+               u64 ino;
                __u32 generation;
                uid_t uid;
                gid_t gid;



That should make no material difference on 64-bit hosts. What's the
effect on 32-bit? Will they just need to remeasure everything or would
the consequences be more dire? Do we have any clue whether anyone is
using EVM in 32-bit environments?

Thanks,
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply related

* Re: [PATCH v3 00/12] vfs: change inode->i_ino from unsigned long to u64
From: Mimi Zohar @ 2026-03-09 19:00 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-fsdevel, linux-kernel, linux-trace-kernel, nvdimm, fsverity,
	linux-mm, netfs, linux-ext4, linux-f2fs-devel, linux-nfs,
	linux-cifs, samba-technical, linux-nilfs, v9fs, linux-afs, autofs,
	ceph-devel, codalist, ecryptfs, linux-mtd, jfs-discussion, ntfs3,
	ocfs2-devel, devel, linux-unionfs, apparmor,
	linux-security-module, linux-integrity, selinux, amd-gfx,
	dri-devel, linux-media, linaro-mm-sig, netdev, linux-perf-users,
	linux-fscrypt, linux-xfs, linux-hams, linux-x25, audit,
	linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <f22758116dabd3c135a833bcb5cfcd2ea4f6ecf4.camel@kernel.org>

On Mon, 2026-03-09 at 13:59 -0400, Jeff Layton wrote:
> On Mon, 2026-03-09 at 13:47 -0400, Mimi Zohar wrote:
> > [ I/O socket time out.  Trimming the To list.]
> > 
> > On Wed, 2026-03-04 at 10:32 -0500, Jeff Layton wrote:
> > > This version squashes all of the format-string changes and the i_ino
> > > type change into the same patch. This results in a giant 600+ line patch
> > > at the end of the series, but it does remain bisectable.  Because the
> > > patchset was reorganized (again) some of the R-b's and A-b's have been
> > > dropped.
> > > 
> > > The entire pile is in the "iino-u64" branch of my tree, if anyone is
> > > interested in testing this.
> > > 
> > >     https://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git/
> > > 
> > > Original cover letter follows:
> > > 
> > > ----------------------8<-----------------------
> > > 
> > > Christian said [1] to "just do it" when I proposed this, so here we are!
> > > 
> > > For historical reasons, the inode->i_ino field is an unsigned long,
> > > which means that it's 32 bits on 32 bit architectures. This has caused a
> > > number of filesystems to implement hacks to hash a 64-bit identifier
> > > into a 32-bit field, and deprives us of a universal identifier field for
> > > an inode.
> > > 
> > > This patchset changes the inode->i_ino field from an unsigned long to a
> > > u64. This shouldn't make any material difference on 64-bit hosts, but
> > > 32-bit hosts will see struct inode grow by at least 4 bytes. This could
> > > have effects on slabcache sizes and field alignment.
> > > 
> > > The bulk of the changes are to format strings and tracepoints, since the
> > > kernel itself doesn't care that much about the i_ino field. The first
> > > patch changes some vfs function arguments, so check that one out
> > > carefully.
> > > 
> > > With this change, we may be able to shrink some inode structures. For
> > > instance, struct nfs_inode has a fileid field that holds the 64-bit
> > > inode number. With this set of changes, that field could be eliminated.
> > > I'd rather leave that sort of cleanups for later just to keep this
> > > simple.
> > > 
> > > Much of this set was generated by LLM, but I attributed it to myself
> > > since I consider this to be in the "menial tasks" category of LLM usage.
> > > 
> > > [1]: https://lore.kernel.org/linux-fsdevel/20260219-portrait-winkt-959070cee42f@brauner/
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > 
> > Jeff, missing from this patch set is EVM.  In hmac_add_misc() EVM copies the
> > i_ino and calculates either an HMAC or file meta-data hash, which is then
> > signed. 
> > 
> > 
> 
> Thanks Mimi, good catch.
> 
> It looks like we should just be able to change the ino field to a u64
> alongside everything else. Something like this:
> 
> diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
> index c0ca4eedb0fe..77b6c2fa345e 100644
> --- a/security/integrity/evm/evm_crypto.c
> +++ b/security/integrity/evm/evm_crypto.c
> @@ -144,7 +144,7 @@ static void hmac_add_misc(struct shash_desc *desc, struct inode *inode,
>                           char type, char *digest)
>  {
>         struct h_misc {
> -               unsigned long ino;
> +               u64 ino;
>                 __u32 generation;
>                 uid_t uid;
>                 gid_t gid;
> 

Agreed.

> 
> That should make no material difference on 64-bit hosts. What's the
> effect on 32-bit? Will they just need to remeasure everything or would
> the consequences be more dire? Do we have any clue whether anyone is
> using EVM in 32-bit environments?

All good questions. Unfortunately I don't know the answer to most of them. What
we do know: changing the size of the i_ino field would affect EVM file metadata
verification and would require relabeling the filesystem.  Even packages
containing EVM portable signatures, which don't include or verify the i_ino
number, would be affected.

Mimi





^ permalink raw reply

* Re: [PATCH v4 0/5] mm: zone lock tracepoint instrumentation
From: Steven Rostedt @ 2026-03-09 19:13 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Dmitry Ilvokhin, Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Masami Hiramatsu, Mathieu Desnoyers, Rafael J. Wysocki,
	Pavel Machek, Len Brown, Brendan Jackman, Johannes Weiner, Zi Yan,
	Oscar Salvador, Qi Zheng, Shakeel Butt, linux-kernel, linux-mm,
	linux-trace-kernel, linux-pm
In-Reply-To: <aa7dgQDMEz34eadj@casper.infradead.org>

On Mon, 9 Mar 2026 14:47:29 +0000
Matthew Wilcox <willy@infradead.org> wrote:

> > CONFIG_LOCK_STAT provides useful statistics, but it is primarily a
> > debug facility and is generally too heavyweight for the production
> > environments.  
> 
> Yes, agreed.  I think that is what needs to change.

The biggest issue with making a generic light weight LOCK_STAT is that
locks are extremely optimized. Any addition of generic lock encoding will
cause a noticeable overhead when compiled in, even when disabled.

Most of the lock code is inlined for the fast path. Now if we want to add
lock stats, we would need to add hooks into those inlined paths. This will
undoubtedly increase the size of text, which will have an impact on I$.

The other issue is the data we store for the lock. A lock is usually just a
word (or long) in size, embedded in a structure. LOCKDEP and LOCK_STAT adds
a key per lock. This increases the data size of the kernel.

LOCK_STAT was designed on top of LOCKDEP. Perhaps a lighter version of
LOCK_STAT could be designed without LOCKDEP, but it would be a project on
its own.

-- Steve

^ permalink raw reply

* Re: [PATCH v3 00/12] vfs: change inode->i_ino from unsigned long to u64
From: Jeff Layton @ 2026-03-09 19:33 UTC (permalink / raw)
  To: Mimi Zohar
  Cc: linux-fsdevel, linux-kernel, linux-trace-kernel, nvdimm, fsverity,
	linux-mm, netfs, linux-ext4, linux-f2fs-devel, linux-nfs,
	linux-cifs, samba-technical, linux-nilfs, v9fs, linux-afs, autofs,
	ceph-devel, codalist, ecryptfs, linux-mtd, jfs-discussion, ntfs3,
	ocfs2-devel, devel, linux-unionfs, apparmor,
	linux-security-module, linux-integrity, selinux, amd-gfx,
	dri-devel, linux-media, linaro-mm-sig, netdev, linux-perf-users,
	linux-fscrypt, linux-xfs, linux-hams, linux-x25, audit,
	linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <c9500adc562665d44feaca9206f23a5ba07432c1.camel@linux.ibm.com>

On Mon, 2026-03-09 at 15:00 -0400, Mimi Zohar wrote:
> On Mon, 2026-03-09 at 13:59 -0400, Jeff Layton wrote:
> > On Mon, 2026-03-09 at 13:47 -0400, Mimi Zohar wrote:
> > > [ I/O socket time out.  Trimming the To list.]
> > > 
> > > On Wed, 2026-03-04 at 10:32 -0500, Jeff Layton wrote:
> > > > This version squashes all of the format-string changes and the i_ino
> > > > type change into the same patch. This results in a giant 600+ line patch
> > > > at the end of the series, but it does remain bisectable.  Because the
> > > > patchset was reorganized (again) some of the R-b's and A-b's have been
> > > > dropped.
> > > > 
> > > > The entire pile is in the "iino-u64" branch of my tree, if anyone is
> > > > interested in testing this.
> > > > 
> > > >     https://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux.git/
> > > > 
> > > > Original cover letter follows:
> > > > 
> > > > ----------------------8<-----------------------
> > > > 
> > > > Christian said [1] to "just do it" when I proposed this, so here we are!
> > > > 
> > > > For historical reasons, the inode->i_ino field is an unsigned long,
> > > > which means that it's 32 bits on 32 bit architectures. This has caused a
> > > > number of filesystems to implement hacks to hash a 64-bit identifier
> > > > into a 32-bit field, and deprives us of a universal identifier field for
> > > > an inode.
> > > > 
> > > > This patchset changes the inode->i_ino field from an unsigned long to a
> > > > u64. This shouldn't make any material difference on 64-bit hosts, but
> > > > 32-bit hosts will see struct inode grow by at least 4 bytes. This could
> > > > have effects on slabcache sizes and field alignment.
> > > > 
> > > > The bulk of the changes are to format strings and tracepoints, since the
> > > > kernel itself doesn't care that much about the i_ino field. The first
> > > > patch changes some vfs function arguments, so check that one out
> > > > carefully.
> > > > 
> > > > With this change, we may be able to shrink some inode structures. For
> > > > instance, struct nfs_inode has a fileid field that holds the 64-bit
> > > > inode number. With this set of changes, that field could be eliminated.
> > > > I'd rather leave that sort of cleanups for later just to keep this
> > > > simple.
> > > > 
> > > > Much of this set was generated by LLM, but I attributed it to myself
> > > > since I consider this to be in the "menial tasks" category of LLM usage.
> > > > 
> > > > [1]: https://lore.kernel.org/linux-fsdevel/20260219-portrait-winkt-959070cee42f@brauner/
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > 
> > > Jeff, missing from this patch set is EVM.  In hmac_add_misc() EVM copies the
> > > i_ino and calculates either an HMAC or file meta-data hash, which is then
> > > signed. 
> > > 
> > > 
> > 
> > Thanks Mimi, good catch.
> > 
> > It looks like we should just be able to change the ino field to a u64
> > alongside everything else. Something like this:
> > 
> > diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
> > index c0ca4eedb0fe..77b6c2fa345e 100644
> > --- a/security/integrity/evm/evm_crypto.c
> > +++ b/security/integrity/evm/evm_crypto.c
> > @@ -144,7 +144,7 @@ static void hmac_add_misc(struct shash_desc *desc, struct inode *inode,
> >                           char type, char *digest)
> >  {
> >         struct h_misc {
> > -               unsigned long ino;
> > +               u64 ino;
> >                 __u32 generation;
> >                 uid_t uid;
> >                 gid_t gid;
> > 
> 
> Agreed.
>
> > 
> > That should make no material difference on 64-bit hosts. What's the
> > effect on 32-bit? Will they just need to remeasure everything or would
> > the consequences be more dire? Do we have any clue whether anyone is
> > using EVM in 32-bit environments?
> 
> All good questions. Unfortunately I don't know the answer to most of them. What
> we do know: changing the size of the i_ino field would affect EVM file metadata
> verification and would require relabeling the filesystem.  Even packages
> containing EVM portable signatures, which don't include or verify the i_ino
> number, would be affected.
> 

Ouch. Technically, I guess this is ABI...

While converting to u64 seems like the ideal thing to do, the other
option might be to just keep this as an unsigned long for now.

No effect on 64-bit, but that could keep things working 32-bit when the
i_ino casts properly to a u32. ext4 would be fine since they don't
issue inode numbers larger than UINT_MAX. xfs and btrfs are a bit more
iffy, but worst case they'd just need to be relabeled (which is what
they'll need to do anyway).

If we do that, then we should probably add a comment to this function
explaining why it's an unsigned long.

Thoughts?
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply

* [PATCH 1/2] tracing: preserve repeated boot-time tracing parameters
From: Wesley Atwell @ 2026-03-09 19:52 UTC (permalink / raw)
  To: rostedt, mhiramat
  Cc: mark.rutland, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
	Wesley Atwell

Bootconfig expands arrays into repeated param=value entries, and the
kernel command line can repeat the same tracing parameter as well.
Several tracing __setup() handlers still overwrite their boot buffers,
so only the last ftrace filter, graph filter, trace option, kprobe
event, or trace trigger entry survives boot.

Preserve repeated values in the format their existing parsers already
consume: comma-delimited lists for ftrace filters and trace options,
semicolon-delimited lists for kprobe events, and per-chunk parsing for
trace_trigger=. The trace_trigger parser tokenizes its storage in
place, so keep a running length and only parse the newly appended
chunk into bootup_triggers[].

Fixes: 2af15d6a44b8 ("ftrace: add kernel command line function filtering")
Fixes: 7bcfaf54f591 ("tracing: Add trace_options kernel command line parameter")
Fixes: a01fdc897fa5 ("tracing: Add trace_trigger kernel command line option")
Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter")
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 kernel/trace/ftrace.c       | 29 +++++++++++++++++++++++++----
 kernel/trace/trace.c        | 23 ++++++++++++++++++++++-
 kernel/trace/trace_events.c | 23 ++++++++++++++++++++---
 kernel/trace/trace_kprobe.c | 23 ++++++++++++++++++++++-
 4 files changed, 89 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8df69e702706..cdd46f639333 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6835,13 +6835,34 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
 static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
 static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
 
+static void __init append_ftrace_boot_param(char *buf, const char *str,
+					    char sep)
+{
+	size_t len, str_len;
+
+	if (buf[0] == '\0') {
+		strscpy(buf, str, FTRACE_FILTER_SIZE);
+		return;
+	}
+
+	len = strlen(buf);
+	str_len = strlen(str);
+	if (!str_len)
+		return;
+	if (str_len >= FTRACE_FILTER_SIZE - len - 1)
+		return;
+
+	buf[len] = sep;
+	strscpy(buf + len + 1, str, FTRACE_FILTER_SIZE - len - 1);
+}
+
 /* Used by function selftest to not test if filter is set */
 bool ftrace_filter_param __initdata;
 
 static int __init set_ftrace_notrace(char *str)
 {
 	ftrace_filter_param = true;
-	strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	append_ftrace_boot_param(ftrace_notrace_buf, str, ',');
 	return 1;
 }
 __setup("ftrace_notrace=", set_ftrace_notrace);
@@ -6849,7 +6870,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
 static int __init set_ftrace_filter(char *str)
 {
 	ftrace_filter_param = true;
-	strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	append_ftrace_boot_param(ftrace_filter_buf, str, ',');
 	return 1;
 }
 __setup("ftrace_filter=", set_ftrace_filter);
@@ -6861,14 +6882,14 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
 
 static int __init set_graph_function(char *str)
 {
-	strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+	append_ftrace_boot_param(ftrace_graph_buf, str, ',');
 	return 1;
 }
 __setup("ftrace_graph_filter=", set_graph_function);
 
 static int __init set_graph_notrace_function(char *str)
 {
-	strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+	append_ftrace_boot_param(ftrace_graph_notrace_buf, str, ',');
 	return 1;
 }
 __setup("ftrace_graph_notrace=", set_graph_notrace_function);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ebd996f8710e..42d03d36ae39 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -327,9 +327,30 @@ __setup("trace_instance=", boot_instance);
 
 static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
 
+static void __init append_trace_boot_options(const char *str)
+{
+	size_t len, str_len;
+
+	if (trace_boot_options_buf[0] == '\0') {
+		strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+		return;
+	}
+
+	len = strlen(trace_boot_options_buf);
+	str_len = strlen(str);
+	if (!str_len)
+		return;
+	if (str_len >= MAX_TRACER_SIZE - len - 1)
+		return;
+
+	trace_boot_options_buf[len] = ',';
+	strscpy(trace_boot_options_buf + len + 1, str,
+		MAX_TRACER_SIZE - len - 1);
+}
+
 static int __init set_trace_boot_options(char *str)
 {
-	strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+	append_trace_boot_options(str);
 	return 1;
 }
 __setup("trace_options=", set_trace_boot_options);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 249d1cba72c0..c3981f62e4bc 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3679,20 +3679,37 @@ static struct boot_triggers {
 } bootup_triggers[MAX_BOOT_TRIGGERS];
 
 static char bootup_trigger_buf[COMMAND_LINE_SIZE];
+static int bootup_trigger_buf_len;
 static int nr_boot_triggers;
 
 static __init int setup_trace_triggers(char *str)
 {
 	char *trigger;
 	char *buf;
+	ssize_t copied;
 	int i;
+	int start;
 
-	strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
+	if (bootup_trigger_buf_len >= COMMAND_LINE_SIZE)
+		return 1;
+
+	start = bootup_trigger_buf_len;
+	if (start && !*str)
+		return 1;
+
+	copied = strscpy(bootup_trigger_buf + start, str,
+			 COMMAND_LINE_SIZE - start);
+	if (copied < 0) {
+		if (start)
+			return 1;
+		copied = strlen(bootup_trigger_buf + start);
+	}
+	bootup_trigger_buf_len += copied + 1;
 	trace_set_ring_buffer_expanded(NULL);
 	disable_tracing_selftest("running event triggers");
 
-	buf = bootup_trigger_buf;
-	for (i = 0; i < MAX_BOOT_TRIGGERS; i++) {
+	buf = bootup_trigger_buf + start;
+	for (i = nr_boot_triggers; i < MAX_BOOT_TRIGGERS; i++) {
 		trigger = strsep(&buf, ",");
 		if (!trigger)
 			break;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a5dbb72528e0..a63a56b55570 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,9 +29,30 @@
 /* Kprobe early definition from command line */
 static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata;
 
+static void __init append_kprobe_boot_event(const char *str)
+{
+	size_t len, str_len;
+
+	if (kprobe_boot_events_buf[0] == '\0') {
+		strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+		return;
+	}
+
+	len = strlen(kprobe_boot_events_buf);
+	str_len = strlen(str);
+	if (!str_len)
+		return;
+	if (str_len >= COMMAND_LINE_SIZE - len - 1)
+		return;
+
+	kprobe_boot_events_buf[len] = ';';
+	strscpy(kprobe_boot_events_buf + len + 1, str,
+		COMMAND_LINE_SIZE - len - 1);
+}
+
 static int __init set_kprobe_boot_events(char *str)
 {
-	strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
+	append_kprobe_boot_event(str);
 	disable_tracing_selftest("running kprobe events");
 
 	return 1;
-- 
2.34.1


^ permalink raw reply related

* [PATCH 2/2] tracing: drain deferred trigger frees if kthread startup fails
From: Wesley Atwell @ 2026-03-09 19:52 UTC (permalink / raw)
  To: rostedt, mhiramat
  Cc: mark.rutland, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
	Wesley Atwell
In-Reply-To: <20260309195220.2726094-1-atwellwea@gmail.com>

Boot-time trigger registration can fail before the trigger-data cleanup
kthread exists. Deferring those frees until late init is fine, but the
post-boot fallback must still drain the deferred list if kthread
creation never succeeds.

Otherwise, boot-deferred nodes can accumulate on
trigger_data_free_list, later frees fall back to synchronously freeing
only the current object, and the older queued entries are leaked
forever.

Keep the deferred boot-time behavior, but when kthread creation fails,
drain the whole queued list synchronously. Do the same in the late-init
drain path so queued entries are not stranded there either.

Fixes: 61d445af0a7c ("tracing: Add bulk garbage collection of freeing event_trigger_data")
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 kernel/trace/trace_events_trigger.c | 79 ++++++++++++++++++++++++-----
 1 file changed, 65 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d5230b759a2d..3b47a361b867 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread;
 static struct llist_head trigger_data_free_list;
 static DEFINE_MUTEX(trigger_data_kthread_mutex);
 
+static int trigger_kthread_fn(void *ignore);
+
+static void trigger_start_kthread_locked(void)
+{
+	lockdep_assert_held(&trigger_data_kthread_mutex);
+
+	if (!trigger_kthread) {
+		struct task_struct *kthread;
+
+		kthread = kthread_create(trigger_kthread_fn, NULL,
+					 "trigger_data_free");
+		if (!IS_ERR(kthread))
+			WRITE_ONCE(trigger_kthread, kthread);
+	}
+}
+
+static void trigger_data_free_queued_locked(void)
+{
+	struct event_trigger_data *data, *tmp;
+	struct llist_node *llnodes;
+
+	lockdep_assert_held(&trigger_data_kthread_mutex);
+
+	llnodes = llist_del_all(&trigger_data_free_list);
+	if (!llnodes)
+		return;
+
+	tracepoint_synchronize_unregister();
+
+	llist_for_each_entry_safe(data, tmp, llnodes, llist)
+		kfree(data);
+}
+
 /* Bulk garbage collection of event_trigger_data elements */
 static int trigger_kthread_fn(void *ignore)
 {
@@ -56,30 +89,48 @@ void trigger_data_free(struct event_trigger_data *data)
 	if (data->cmd_ops->set_filter)
 		data->cmd_ops->set_filter(NULL, data, NULL);
 
+	/*
+	 * Boot-time trigger registration can fail before kthread creation
+	 * works. Keep the deferred-free semantics during boot and let late
+	 * init start the kthread to drain the list.
+	 */
+	if (system_state == SYSTEM_BOOTING && !trigger_kthread) {
+		llist_add(&data->llist, &trigger_data_free_list);
+		return;
+	}
+
 	if (unlikely(!trigger_kthread)) {
 		guard(mutex)(&trigger_data_kthread_mutex);
-		/* Check again after taking mutex */
-		if (!trigger_kthread) {
-			struct task_struct *kthread;
 
-			kthread = kthread_create(trigger_kthread_fn, NULL,
-						 "trigger_data_free");
-			if (!IS_ERR(kthread))
-				WRITE_ONCE(trigger_kthread, kthread);
+		trigger_start_kthread_locked();
+		if (!trigger_kthread) {
+			llist_add(&data->llist, &trigger_data_free_list);
+			trigger_data_free_queued_locked();
+			return;
 		}
 	}
 
-	if (!trigger_kthread) {
-		/* Do it the slow way */
-		tracepoint_synchronize_unregister();
-		kfree(data);
-		return;
-	}
-
 	llist_add(&data->llist, &trigger_data_free_list);
 	wake_up_process(trigger_kthread);
 }
 
+static int __init trigger_data_free_init(void)
+{
+	guard(mutex)(&trigger_data_kthread_mutex);
+
+	if (llist_empty(&trigger_data_free_list))
+		return 0;
+
+	trigger_start_kthread_locked();
+	if (trigger_kthread)
+		wake_up_process(trigger_kthread);
+	else
+		trigger_data_free_queued_locked();
+
+	return 0;
+}
+late_initcall(trigger_data_free_init);
+
 static inline void data_ops_trigger(struct event_trigger_data *data,
 				    struct trace_buffer *buffer,  void *rec,
 				    struct ring_buffer_event *event)
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 00/18] rtla: Robustness and code quality improvements
From: Wander Lairson Costa @ 2026-03-09 19:46 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar, Wander Lairson Costa, Crystal Wood,
	Ivan Pravdin, Costa Shulyupin, John Kacur, Tiezhu Yang,
	Daniel Bristot de Oliveira, Daniel Wagner,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:BPF [MISC]:Keyword:(?:b|_)bpf(?:b|_)

This series addresses multiple issues in the rtla codebase related to
error handling, string manipulation safety, and code maintainability.
The changes improve the tool's reliability and bring the code more in
line with kernel coding practices.

The series can be broadly divided into three categories:

Bug fixes address several correctness issues: a resource leak where
opendir() was not paired with closedir() on success paths, I/O loops
that failed to handle EINTR and partial writes correctly, a missing
bounds check when indexing the softirq_name array with kernel-provided
data, improper handling of pthread_create() failures, and a loop
condition that checked a pointer instead of the character it points to.

String handling improvements replace unsafe patterns throughout the
codebase. The strncpy() function is replaced with a new strscpy()
implementation that guarantees NUL-termination and provides truncation
detection. A str_has_prefix() helper replaces verbose strncmp/strlen
patterns for prefix matching. String comparisons are tightened to use
exact matching where appropriate, preventing silent acceptance of
malformed input like "100nsx" being parsed as "100ns".

Code quality improvements reduce duplication and improve readability.
A common_threshold_handler() consolidates repeated threshold action
logic. The extract_arg() macro simplifies key=value parsing. Magic
numbers are replaced with named constants (MAX_PATH, ARRAY_SIZE), and
redundant strlen() calls are cached in local variables.

All changes have been tested with the existing rtla test suite.

v4:
- Patch 3: Rename commit title from "rtla: Simplify argument parsing"
  to "rtla/actions: Simplify argument parsing" to clarify scope. Remove
  spurious whitespace change to container_of macro (Tomas Glozar).
- Patch 12: Fix a regression where strcmp() broke SCHED_DEADLINE
  priority parsing in "d:runtime:period" format (e.g., "d:10ms:100ms").
  Introduce match_time_unit() helper that checks the suffix is followed
  by end-of-string or ':' delimiter. Note that strncmp_static() was
  also considered but would fail because ARRAY_SIZE() includes the NUL
  terminator (Tomas Glozar).
- Patch 15: Add Fixes tag for stable backport tracking (Tomas Glozar).

v3:
- Address v2 feedback:
  - Rename common_restart() to common_threshold_handler() to better
    reflect its purpose (Tomas Glozar).
  - Implement a proper strscpy() for safer string handling instead of
    manual buffer sizing (Steven Rostedt).
  - Remove restart_result enum in favor of simpler, direct return
    values (Tomas Glozar).
- Add several new bug fixes, including a softirq vector bounds check,
  pthread_create() failure handling, robust I/O handling for
  EINTR/partial writes, and a resource leak fix.
- Introduce str_has_prefix() helper to replace verbose strncmp/strlen
  patterns.
- Tighten string parsing to enforce exact matching and reject invalid
  suffixes (e.g., "100nsx").
- Drop patches already merged via RTLA v6.20 pull request.

v2:
- exit on memory allocation failure
- remove redundant strlen() calls
- fix possible race on condition on stop_tracing variable access
- ensure null termination on read() calls
- fix checkpatch reports
- make extract_args() an inline function
- add the usage of common_restart() in more places

Wander Lairson Costa (18):
  rtla: Exit on memory allocation failures during initialization
  rtla: Use strdup() to simplify code
  rtla/actions: Simplify argument parsing
  rtla: Introduce common_threshold_handler() helper
  rtla: Replace magic number with MAX_PATH
  rtla: Simplify code by caching string lengths
  rtla: Add strscpy() and replace strncpy() calls
  rtla/timerlat: Add bounds check for softirq vector
  rtla: Handle pthread_create() failure properly
  rtla: Add str_has_prefix() helper function
  rtla: Use str_has_prefix() for prefix checks
  rtla: Enforce exact match for time unit suffixes
  rtla: Use str_has_prefix() for option prefix check
  rtla/timerlat: Simplify RTLA_NO_BPF environment variable check
  rtla/trace: Fix write loop in trace_event_save_hist()
  rtla/trace: Fix I/O handling in save_trace_to_file()
  rtla/utils: Fix resource leak in set_comm_sched_attr()
  rtla/utils: Fix loop condition in PID validation

 tools/tracing/rtla/src/actions.c       | 103 ++++++++++++---------
 tools/tracing/rtla/src/actions.h       |   8 +-
 tools/tracing/rtla/src/common.c        |  65 ++++++++++----
 tools/tracing/rtla/src/common.h        |  18 ++++
 tools/tracing/rtla/src/osnoise.c       |  26 ++----
 tools/tracing/rtla/src/osnoise_hist.c  |  22 ++---
 tools/tracing/rtla/src/osnoise_top.c   |  22 ++---
 tools/tracing/rtla/src/timerlat.c      |   5 +-
 tools/tracing/rtla/src/timerlat_aa.c   |  10 +--
 tools/tracing/rtla/src/timerlat_hist.c |  41 ++++-----
 tools/tracing/rtla/src/timerlat_top.c  |  54 +++++------
 tools/tracing/rtla/src/timerlat_u.c    |   4 +-
 tools/tracing/rtla/src/trace.c         | 102 +++++++++++----------
 tools/tracing/rtla/src/trace.h         |   4 +-
 tools/tracing/rtla/src/utils.c         | 119 +++++++++++++++++++++----
 tools/tracing/rtla/src/utils.h         |  29 +++++-
 16 files changed, 388 insertions(+), 244 deletions(-)

-- 
2.53.0


^ permalink raw reply

* [PATCH v4 01/18] rtla: Exit on memory allocation failures during initialization
From: Wander Lairson Costa @ 2026-03-09 19:46 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar, Wander Lairson Costa, Ivan Pravdin,
	Crystal Wood, Costa Shulyupin, John Kacur, Tiezhu Yang,
	Daniel Bristot de Oliveira, Daniel Wagner,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:BPF [MISC]:Keyword:(?:b|_)bpf(?:b|_)
In-Reply-To: <20260309195040.1019085-1-wander@redhat.com>

Most memory allocations in rtla happen during early initialization
before any resources are acquired that would require cleanup. In these
cases, propagating allocation errors just adds complexity without any
benefit. There's nothing to clean up, and the program must exit anyway.

This patch introduces fatal allocation wrappers (calloc_fatal,
reallocarray_fatal, strdup_fatal) that call fatal() on allocation
failure. These wrappers simplify the code by eliminating unnecessary
error propagation paths.

The patch converts early allocations to use these wrappers in
actions_init() and related action functions, osnoise_context_alloc()
and osnoise_init_tool(), trace_instance_init() and trace event
functions, and parameter structure allocations in main functions.

This simplifies the code while maintaining the same behavior: immediate
exit on allocation failure during initialization. Allocations that
require cleanup, such as those in histogram allocation functions with
goto cleanup paths, are left unchanged and continue to return errors.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
---
 tools/tracing/rtla/src/actions.c       | 50 ++++++++++++--------------
 tools/tracing/rtla/src/actions.h       |  8 ++---
 tools/tracing/rtla/src/osnoise.c       | 22 ++++--------
 tools/tracing/rtla/src/osnoise_hist.c  | 22 ++++--------
 tools/tracing/rtla/src/osnoise_top.c   | 22 ++++--------
 tools/tracing/rtla/src/timerlat_hist.c | 22 ++++--------
 tools/tracing/rtla/src/timerlat_top.c  | 22 ++++--------
 tools/tracing/rtla/src/trace.c         | 30 ++++------------
 tools/tracing/rtla/src/trace.h         |  4 +--
 tools/tracing/rtla/src/utils.c         | 35 ++++++++++++++++++
 tools/tracing/rtla/src/utils.h         |  3 ++
 11 files changed, 108 insertions(+), 132 deletions(-)

diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index a42615011962d..22b8283a183f3 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -15,7 +15,7 @@ void
 actions_init(struct actions *self)
 {
 	self->size = action_default_size;
-	self->list = calloc(self->size, sizeof(struct action));
+	self->list = calloc_fatal(self->size, sizeof(struct action));
 	self->len = 0;
 	self->continue_flag = false;
 
@@ -50,8 +50,10 @@ static struct action *
 actions_new(struct actions *self)
 {
 	if (self->len >= self->size) {
-		self->size *= 2;
-		self->list = realloc(self->list, self->size * sizeof(struct action));
+		const size_t new_size = self->size * 2;
+
+		self->list = reallocarray_fatal(self->list, new_size, sizeof(struct action));
+		self->size = new_size;
 	}
 
 	return &self->list[self->len++];
@@ -60,25 +62,21 @@ actions_new(struct actions *self)
 /*
  * actions_add_trace_output - add an action to output trace
  */
-int
+void
 actions_add_trace_output(struct actions *self, const char *trace_output)
 {
 	struct action *action = actions_new(self);
 
 	self->present[ACTION_TRACE_OUTPUT] = true;
 	action->type = ACTION_TRACE_OUTPUT;
-	action->trace_output = calloc(strlen(trace_output) + 1, sizeof(char));
-	if (!action->trace_output)
-		return -1;
+	action->trace_output = calloc_fatal(strlen(trace_output) + 1, sizeof(char));
 	strcpy(action->trace_output, trace_output);
-
-	return 0;
 }
 
 /*
  * actions_add_trace_output - add an action to send signal to a process
  */
-int
+void
 actions_add_signal(struct actions *self, int signal, int pid)
 {
 	struct action *action = actions_new(self);
@@ -87,40 +85,32 @@ actions_add_signal(struct actions *self, int signal, int pid)
 	action->type = ACTION_SIGNAL;
 	action->signal = signal;
 	action->pid = pid;
-
-	return 0;
 }
 
 /*
  * actions_add_shell - add an action to execute a shell command
  */
-int
+void
 actions_add_shell(struct actions *self, const char *command)
 {
 	struct action *action = actions_new(self);
 
 	self->present[ACTION_SHELL] = true;
 	action->type = ACTION_SHELL;
-	action->command = calloc(strlen(command) + 1, sizeof(char));
-	if (!action->command)
-		return -1;
+	action->command = calloc_fatal(strlen(command) + 1, sizeof(char));
 	strcpy(action->command, command);
-
-	return 0;
 }
 
 /*
  * actions_add_continue - add an action to resume measurement
  */
-int
+void
 actions_add_continue(struct actions *self)
 {
 	struct action *action = actions_new(self);
 
 	self->present[ACTION_CONTINUE] = true;
 	action->type = ACTION_CONTINUE;
-
-	return 0;
 }
 
 /*
@@ -176,7 +166,8 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn)
 				/* Only one argument allowed */
 				return -1;
 		}
-		return actions_add_trace_output(self, trace_output);
+		actions_add_trace_output(self, trace_output);
+		break;
 	case ACTION_SIGNAL:
 		/* Takes two arguments, num (signal) and pid */
 		while (token != NULL) {
@@ -200,21 +191,26 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn)
 			/* Missing argument */
 			return -1;
 
-		return actions_add_signal(self, signal, pid);
+		actions_add_signal(self, signal, pid);
+		break;
 	case ACTION_SHELL:
 		if (token == NULL)
 			return -1;
-		if (strlen(token) > 8 && strncmp(token, "command=", 8) == 0)
-			return actions_add_shell(self, token + 8);
-		return -1;
+		if (strlen(token) > 8 && strncmp(token, "command=", 8))
+			return -1;
+		actions_add_shell(self, token + 8);
+		break;
 	case ACTION_CONTINUE:
 		/* Takes no argument */
 		if (token != NULL)
 			return -1;
-		return actions_add_continue(self);
+		actions_add_continue(self);
+		break;
 	default:
 		return -1;
 	}
+
+	return 0;
 }
 
 /*
diff --git a/tools/tracing/rtla/src/actions.h b/tools/tracing/rtla/src/actions.h
index fb77069c972ba..034048682fefb 100644
--- a/tools/tracing/rtla/src/actions.h
+++ b/tools/tracing/rtla/src/actions.h
@@ -49,9 +49,9 @@ struct actions {
 
 void actions_init(struct actions *self);
 void actions_destroy(struct actions *self);
-int actions_add_trace_output(struct actions *self, const char *trace_output);
-int actions_add_signal(struct actions *self, int signal, int pid);
-int actions_add_shell(struct actions *self, const char *command);
-int actions_add_continue(struct actions *self);
+void actions_add_trace_output(struct actions *self, const char *trace_output);
+void actions_add_signal(struct actions *self, int signal, int pid);
+void actions_add_shell(struct actions *self, const char *command);
+void actions_add_continue(struct actions *self);
 int actions_parse(struct actions *self, const char *trigger, const char *tracefn);
 int actions_perform(struct actions *self);
diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c
index 945eb61efc465..ec074cd53dd84 100644
--- a/tools/tracing/rtla/src/osnoise.c
+++ b/tools/tracing/rtla/src/osnoise.c
@@ -938,9 +938,7 @@ struct osnoise_context *osnoise_context_alloc(void)
 {
 	struct osnoise_context *context;
 
-	context = calloc(1, sizeof(*context));
-	if (!context)
-		return NULL;
+	context = calloc_fatal(1, sizeof(*context));
 
 	context->orig_stop_us		= OSNOISE_OPTION_INIT_VAL;
 	context->stop_us		= OSNOISE_OPTION_INIT_VAL;
@@ -1017,24 +1015,16 @@ void osnoise_destroy_tool(struct osnoise_tool *top)
 struct osnoise_tool *osnoise_init_tool(char *tool_name)
 {
 	struct osnoise_tool *top;
-	int retval;
-
-	top = calloc(1, sizeof(*top));
-	if (!top)
-		return NULL;
 
+	top = calloc_fatal(1, sizeof(*top));
 	top->context = osnoise_context_alloc();
-	if (!top->context)
-		goto out_err;
 
-	retval = trace_instance_init(&top->trace, tool_name);
-	if (retval)
-		goto out_err;
+	if (trace_instance_init(&top->trace, tool_name)) {
+		osnoise_destroy_tool(top);
+		return NULL;
+	}
 
 	return top;
-out_err:
-	osnoise_destroy_tool(top);
-	return NULL;
 }
 
 /*
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 9d70ea34807ff..efbd2e834cf0e 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -466,9 +466,7 @@ static struct common_params
 	int c;
 	char *trace_output = NULL;
 
-	params = calloc(1, sizeof(*params));
-	if (!params)
-		exit(1);
+	params = calloc_fatal(1, sizeof(*params));
 
 	actions_init(&params->common.threshold_actions);
 	actions_init(&params->common.end_actions);
@@ -579,22 +577,16 @@ static struct common_params
 			params->common.hist.with_zeros = 1;
 			break;
 		case '4': /* trigger */
-			if (params->common.events) {
-				retval = trace_event_add_trigger(params->common.events, optarg);
-				if (retval)
-					fatal("Error adding trigger %s", optarg);
-			} else {
+			if (params->common.events)
+				trace_event_add_trigger(params->common.events, optarg);
+			else
 				fatal("--trigger requires a previous -e");
-			}
 			break;
 		case '5': /* filter */
-			if (params->common.events) {
-				retval = trace_event_add_filter(params->common.events, optarg);
-				if (retval)
-					fatal("Error adding filter %s", optarg);
-			} else {
+			if (params->common.events)
+				trace_event_add_filter(params->common.events, optarg);
+			else
 				fatal("--filter requires a previous -e");
-			}
 			break;
 		case '6':
 			params->common.warmup = get_llong_from_str(optarg);
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index d54d47947fb44..d2b4ac64e77b4 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -319,9 +319,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 	int c;
 	char *trace_output = NULL;
 
-	params = calloc(1, sizeof(*params));
-	if (!params)
-		exit(1);
+	params = calloc_fatal(1, sizeof(*params));
 
 	actions_init(&params->common.threshold_actions);
 	actions_init(&params->common.end_actions);
@@ -410,22 +408,16 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			params->threshold = get_llong_from_str(optarg);
 			break;
 		case '0': /* trigger */
-			if (params->common.events) {
-				retval = trace_event_add_trigger(params->common.events, optarg);
-				if (retval)
-					fatal("Error adding trigger %s", optarg);
-			} else {
+			if (params->common.events)
+				trace_event_add_trigger(params->common.events, optarg);
+			else
 				fatal("--trigger requires a previous -e");
-			}
 			break;
 		case '1': /* filter */
-			if (params->common.events) {
-				retval = trace_event_add_filter(params->common.events, optarg);
-				if (retval)
-					fatal("Error adding filter %s", optarg);
-			} else {
+			if (params->common.events)
+				trace_event_add_filter(params->common.events, optarg);
+			else
 				fatal("--filter requires a previous -e");
-			}
 			break;
 		case '2':
 			params->common.warmup = get_llong_from_str(optarg);
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 4e8c38a61197c..6ea397421f1c9 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -766,9 +766,7 @@ static struct common_params
 	int c;
 	char *trace_output = NULL;
 
-	params = calloc(1, sizeof(*params));
-	if (!params)
-		exit(1);
+	params = calloc_fatal(1, sizeof(*params));
 
 	actions_init(&params->common.threshold_actions);
 	actions_init(&params->common.end_actions);
@@ -914,22 +912,16 @@ static struct common_params
 			params->common.hist.with_zeros = 1;
 			break;
 		case '6': /* trigger */
-			if (params->common.events) {
-				retval = trace_event_add_trigger(params->common.events, optarg);
-				if (retval)
-					fatal("Error adding trigger %s", optarg);
-			} else {
+			if (params->common.events)
+				trace_event_add_trigger(params->common.events, optarg);
+			else
 				fatal("--trigger requires a previous -e");
-			}
 			break;
 		case '7': /* filter */
-			if (params->common.events) {
-				retval = trace_event_add_filter(params->common.events, optarg);
-				if (retval)
-					fatal("Error adding filter %s", optarg);
-			} else {
+			if (params->common.events)
+				trace_event_add_filter(params->common.events, optarg);
+			else
 				fatal("--filter requires a previous -e");
-			}
 			break;
 		case '8':
 			params->dma_latency = get_llong_from_str(optarg);
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 284b74773c2b5..dd727cb48b551 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -537,9 +537,7 @@ static struct common_params
 	int c;
 	char *trace_output = NULL;
 
-	params = calloc(1, sizeof(*params));
-	if (!params)
-		exit(1);
+	params = calloc_fatal(1, sizeof(*params));
 
 	actions_init(&params->common.threshold_actions);
 	actions_init(&params->common.end_actions);
@@ -664,22 +662,16 @@ static struct common_params
 			params->common.user_data = true;
 			break;
 		case '0': /* trigger */
-			if (params->common.events) {
-				retval = trace_event_add_trigger(params->common.events, optarg);
-				if (retval)
-					fatal("Error adding trigger %s", optarg);
-			} else {
+			if (params->common.events)
+				trace_event_add_trigger(params->common.events, optarg);
+			else
 				fatal("--trigger requires a previous -e");
-			}
 			break;
 		case '1': /* filter */
-			if (params->common.events) {
-				retval = trace_event_add_filter(params->common.events, optarg);
-				if (retval)
-					fatal("Error adding filter %s", optarg);
-			} else {
+			if (params->common.events)
+				trace_event_add_filter(params->common.events, optarg);
+			else
 				fatal("--filter requires a previous -e");
-			}
 			break;
 		case '2': /* dma-latency */
 			params->dma_latency = get_llong_from_str(optarg);
diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c
index b8be3e28680ee..211ca54b15b0e 100644
--- a/tools/tracing/rtla/src/trace.c
+++ b/tools/tracing/rtla/src/trace.c
@@ -191,9 +191,7 @@ void trace_instance_destroy(struct trace_instance *trace)
  */
 int trace_instance_init(struct trace_instance *trace, char *tool_name)
 {
-	trace->seq = calloc(1, sizeof(*trace->seq));
-	if (!trace->seq)
-		goto out_err;
+	trace->seq = calloc_fatal(1, sizeof(*trace->seq));
 
 	trace_seq_init(trace->seq);
 
@@ -274,15 +272,9 @@ struct trace_events *trace_event_alloc(const char *event_string)
 {
 	struct trace_events *tevent;
 
-	tevent = calloc(1, sizeof(*tevent));
-	if (!tevent)
-		return NULL;
+	tevent = calloc_fatal(1, sizeof(*tevent));
 
-	tevent->system = strdup(event_string);
-	if (!tevent->system) {
-		free(tevent);
-		return NULL;
-	}
+	tevent->system = strdup_fatal(event_string);
 
 	tevent->event = strstr(tevent->system, ":");
 	if (tevent->event) {
@@ -296,31 +288,23 @@ struct trace_events *trace_event_alloc(const char *event_string)
 /*
  * trace_event_add_filter - record an event filter
  */
-int trace_event_add_filter(struct trace_events *event, char *filter)
+void trace_event_add_filter(struct trace_events *event, char *filter)
 {
 	if (event->filter)
 		free(event->filter);
 
-	event->filter = strdup(filter);
-	if (!event->filter)
-		return 1;
-
-	return 0;
+	event->filter = strdup_fatal(filter);
 }
 
 /*
  * trace_event_add_trigger - record an event trigger action
  */
-int trace_event_add_trigger(struct trace_events *event, char *trigger)
+void trace_event_add_trigger(struct trace_events *event, char *trigger)
 {
 	if (event->trigger)
 		free(event->trigger);
 
-	event->trigger = strdup(trigger);
-	if (!event->trigger)
-		return 1;
-
-	return 0;
+	event->trigger = strdup_fatal(trigger);
 }
 
 /*
diff --git a/tools/tracing/rtla/src/trace.h b/tools/tracing/rtla/src/trace.h
index 1e5aee4b828dd..95b911a2228b2 100644
--- a/tools/tracing/rtla/src/trace.h
+++ b/tools/tracing/rtla/src/trace.h
@@ -45,6 +45,6 @@ void trace_events_destroy(struct trace_instance *instance,
 int trace_events_enable(struct trace_instance *instance,
 			  struct trace_events *events);
 
-int trace_event_add_filter(struct trace_events *event, char *filter);
-int trace_event_add_trigger(struct trace_events *event, char *trigger);
+void trace_event_add_filter(struct trace_events *event, char *filter);
+void trace_event_add_trigger(struct trace_events *event, char *trigger);
 int trace_set_buffer_size(struct trace_instance *trace, int size);
diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 18986a5aed3c1..75cdcc63d5a15 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -1032,3 +1032,38 @@ int strtoi(const char *s, int *res)
 	*res = (int) lres;
 	return 0;
 }
+
+static inline void fatal_alloc(void)
+{
+	fatal("Error allocating memory\n");
+}
+
+void *calloc_fatal(size_t n, size_t size)
+{
+	void *p = calloc(n, size);
+
+	if (!p)
+		fatal_alloc();
+
+	return p;
+}
+
+void *reallocarray_fatal(void *p, size_t n, size_t size)
+{
+	p = reallocarray(p, n, size);
+
+	if (!p)
+		fatal_alloc();
+
+	return p;
+}
+
+char *strdup_fatal(const char *s)
+{
+	char *p = strdup(s);
+
+	if (!p)
+		fatal_alloc();
+
+	return p;
+}
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index f7c2a52a0ab54..e29c2eb5d569d 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -69,6 +69,9 @@ int set_comm_sched_attr(const char *comm_prefix, struct sched_attr *attr);
 int set_comm_cgroup(const char *comm_prefix, const char *cgroup);
 int set_pid_cgroup(pid_t pid, const char *cgroup);
 int set_cpu_dma_latency(int32_t latency);
+void *calloc_fatal(size_t n, size_t size);
+void *reallocarray_fatal(void *p, size_t n, size_t size);
+char *strdup_fatal(const char *s);
 #ifdef HAVE_LIBCPUPOWER_SUPPORT
 int save_cpu_idle_disable_state(unsigned int cpu);
 int restore_cpu_idle_disable_state(unsigned int cpu);
-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 02/18] rtla: Use strdup() to simplify code
From: Wander Lairson Costa @ 2026-03-09 19:46 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar, Wander Lairson Costa, Ivan Pravdin,
	Crystal Wood, Costa Shulyupin, John Kacur, Tiezhu Yang,
	Daniel Bristot de Oliveira, Daniel Wagner,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:BPF [MISC]:Keyword:(?:b|_)bpf(?:b|_)
In-Reply-To: <20260309195040.1019085-1-wander@redhat.com>

The actions_add_trace_output() and actions_add_shell() functions were
using calloc() followed by strcpy() to allocate and copy a string.
This can be simplified by using strdup(), which allocates memory and
copies the string in a single step.

Replace the calloc() and strcpy() calls with strdup(), making the
code more concise and readable.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
---
 tools/tracing/rtla/src/actions.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index 22b8283a183f3..0ac42ffd734a3 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -69,8 +69,7 @@ actions_add_trace_output(struct actions *self, const char *trace_output)
 
 	self->present[ACTION_TRACE_OUTPUT] = true;
 	action->type = ACTION_TRACE_OUTPUT;
-	action->trace_output = calloc_fatal(strlen(trace_output) + 1, sizeof(char));
-	strcpy(action->trace_output, trace_output);
+	action->trace_output = strdup_fatal(trace_output);
 }
 
 /*
@@ -97,8 +96,7 @@ actions_add_shell(struct actions *self, const char *command)
 
 	self->present[ACTION_SHELL] = true;
 	action->type = ACTION_SHELL;
-	action->command = calloc_fatal(strlen(command) + 1, sizeof(char));
-	strcpy(action->command, command);
+	action->command = strdup_fatal(command);
 }
 
 /*
-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 03/18] rtla/actions: Simplify argument parsing
From: Wander Lairson Costa @ 2026-03-09 19:46 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar, Wander Lairson Costa, Ivan Pravdin,
	Crystal Wood, Costa Shulyupin, John Kacur, Tiezhu Yang,
	Daniel Bristot de Oliveira, Daniel Wagner,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:BPF [MISC]:Keyword:(?:b|_)bpf(?:b|_)
In-Reply-To: <20260309195040.1019085-1-wander@redhat.com>

The actions_parse() function uses open-coded logic to extract arguments
from a string. This includes manual length checks and strncmp() calls,
which can be verbose and error-prone.

To simplify and improve the robustness of argument parsing, introduce a
new extract_arg() helper macro. This macro extracts the value from a
"key=value" pair, making the code more concise and readable.

Also, introduce STRING_LENGTH() and strncmp_static() macros to
perform compile-time calculations of string lengths and safer string
comparisons.

Refactor actions_parse() to use these new helpers, resulting in
cleaner and more maintainable code.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
---
 tools/tracing/rtla/src/actions.c | 57 +++++++++++++++++++++++---------
 tools/tracing/rtla/src/utils.h   | 10 ++++++
 2 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index 0ac42ffd734a3..b0d68b5de08db 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -111,6 +111,29 @@ actions_add_continue(struct actions *self)
 	action->type = ACTION_CONTINUE;
 }
 
+static inline const char *__extract_arg(const char *token, const char *opt, size_t opt_len)
+{
+	const size_t tok_len = strlen(token);
+
+	if (tok_len <= opt_len)
+		return NULL;
+
+	if (strncmp(token, opt, opt_len))
+		return NULL;
+
+	return token + opt_len;
+}
+
+/*
+ * extract_arg - extract argument value from option token
+ * @token: option token (e.g., "file=trace.txt")
+ * @opt: option name to match (e.g., "file")
+ *
+ * Returns pointer to argument value after "=" if token matches "opt=",
+ * otherwise returns NULL.
+ */
+#define extract_arg(token, opt) __extract_arg(token, opt "=", STRING_LENGTH(opt "="))
+
 /*
  * actions_parse - add an action based on text specification
  */
@@ -120,6 +143,7 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn)
 	enum action_type type = ACTION_NONE;
 	const char *token;
 	char trigger_c[strlen(trigger) + 1];
+	const char *arg_value;
 
 	/* For ACTION_SIGNAL */
 	int signal = 0, pid = 0;
@@ -152,12 +176,10 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn)
 		if (token == NULL)
 			trace_output = tracefn;
 		else {
-			if (strlen(token) > 5 && strncmp(token, "file=", 5) == 0) {
-				trace_output = token + 5;
-			} else {
+			trace_output = extract_arg(token, "file");
+			if (!trace_output)
 				/* Invalid argument */
 				return -1;
-			}
 
 			token = strtok(NULL, ",");
 			if (token != NULL)
@@ -169,17 +191,21 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn)
 	case ACTION_SIGNAL:
 		/* Takes two arguments, num (signal) and pid */
 		while (token != NULL) {
-			if (strlen(token) > 4 && strncmp(token, "num=", 4) == 0) {
-				if (strtoi(token + 4, &signal))
-					return -1;
-			} else if (strlen(token) > 4 && strncmp(token, "pid=", 4) == 0) {
-				if (strncmp(token + 4, "parent", 7) == 0)
-					pid = -1;
-				else if (strtoi(token + 4, &pid))
+			arg_value = extract_arg(token, "num");
+			if (arg_value) {
+				if (strtoi(arg_value, &signal))
 					return -1;
 			} else {
-				/* Invalid argument */
-				return -1;
+				arg_value = extract_arg(token, "pid");
+				if (arg_value) {
+					if (strncmp_static(arg_value, "parent") == 0)
+						pid = -1;
+					else if (strtoi(arg_value, &pid))
+						return -1;
+				} else {
+					/* Invalid argument */
+					return -1;
+				}
 			}
 
 			token = strtok(NULL, ",");
@@ -194,9 +220,10 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn)
 	case ACTION_SHELL:
 		if (token == NULL)
 			return -1;
-		if (strlen(token) > 8 && strncmp(token, "command=", 8))
+		arg_value = extract_arg(token, "command");
+		if (!arg_value)
 			return -1;
-		actions_add_shell(self, token + 8);
+		actions_add_shell(self, arg_value);
 		break;
 	case ACTION_CONTINUE:
 		/* Takes no argument */
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index e29c2eb5d569d..11458676f607d 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -14,6 +14,16 @@
 #define MAX_NICE		20
 #define MIN_NICE		-19
 
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#endif
+
+/* Calculate string length at compile time (excluding null terminator) */
+#define STRING_LENGTH(s) (ARRAY_SIZE(s) - sizeof(*(s)))
+
+/* Compare string with static string, length determined at compile time */
+#define strncmp_static(s1, s2) strncmp(s1, s2, ARRAY_SIZE(s2))
+
 #define container_of(ptr, type, member)({			\
 	const typeof(((type *)0)->member) *__mptr = (ptr);	\
 	(type *)((char *)__mptr - offsetof(type, member)) ; })
-- 
2.53.0


^ permalink raw reply related

* [PATCH v4 04/18] rtla: Introduce common_threshold_handler() helper
From: Wander Lairson Costa @ 2026-03-09 19:46 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar, Wander Lairson Costa, Ivan Pravdin,
	Crystal Wood, Costa Shulyupin, John Kacur, Tiezhu Yang,
	Daniel Bristot de Oliveira, Daniel Wagner,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:Real-time Linux Analysis (RTLA) tools,
	open list:BPF [MISC]:Keyword:(?:b|_)bpf(?:b|_)
In-Reply-To: <20260309195040.1019085-1-wander@redhat.com>

Several functions duplicate the logic for handling threshold actions.
When a threshold is reached, these functions stop the trace, perform
configured actions, and restart the trace if --on-threshold continue
is set.

Create common_threshold_handler() to centralize this shared logic and
avoid code duplication. The function executes the configured threshold
actions and restarts the necessary trace instances when appropriate.

Also add should_continue_tracing() helper to encapsulate the check
for whether tracing should continue after a threshold event, improving
code readability at call sites.

In timerlat_top_bpf_main_loop(), use common_params directly instead
of casting through timerlat_params when only common fields are needed.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 61 ++++++++++++++++++--------
 tools/tracing/rtla/src/common.h        | 18 ++++++++
 tools/tracing/rtla/src/timerlat_hist.c | 19 ++++----
 tools/tracing/rtla/src/timerlat_top.c  | 32 +++++++-------
 4 files changed, 86 insertions(+), 44 deletions(-)

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index ceff76a62a30b..cbc207fa58707 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -175,6 +175,38 @@ common_apply_config(struct osnoise_tool *tool, struct common_params *params)
 }
 
 
+/**
+ * common_threshold_handler - handle latency threshold overflow
+ * @tool: pointer to the osnoise_tool instance containing trace contexts
+ *
+ * Executes the configured threshold actions (e.g., saving trace, printing,
+ * sending signals). If the continue flag is set (--on-threshold continue),
+ * restarts the auxiliary trace instances to continue monitoring.
+ *
+ * Return: 0 for success, -1 for error.
+ */
+int
+common_threshold_handler(const struct osnoise_tool *tool)
+{
+	actions_perform(&tool->params->threshold_actions);
+
+	if (!should_continue_tracing(tool->params))
+		/* continue flag not set, break */
+		return 0;
+
+	/* continue action reached, re-enable tracing */
+	if (tool->record && trace_instance_start(&tool->record->trace))
+		goto err;
+	if (tool->aa && trace_instance_start(&tool->aa->trace))
+		goto err;
+
+	return 0;
+
+err:
+	err_msg("Error restarting trace\n");
+	return -1;
+}
+
 int run_tool(struct tool_ops *ops, int argc, char *argv[])
 {
 	struct common_params *params;
@@ -352,17 +384,14 @@ int top_main_loop(struct osnoise_tool *tool)
 				/* stop tracing requested, do not perform actions */
 				return 0;
 
-			actions_perform(&params->threshold_actions);
+			retval = common_threshold_handler(tool);
+			if (retval)
+				return retval;
+
 
-			if (!params->threshold_actions.continue_flag)
-				/* continue flag not set, break */
+			if (!should_continue_tracing(params))
 				return 0;
 
-			/* continue action reached, re-enable tracing */
-			if (record)
-				trace_instance_start(&record->trace);
-			if (tool->aa)
-				trace_instance_start(&tool->aa->trace);
 			trace_instance_start(trace);
 		}
 
@@ -403,18 +432,14 @@ int hist_main_loop(struct osnoise_tool *tool)
 				/* stop tracing requested, do not perform actions */
 				break;
 
-			actions_perform(&params->threshold_actions);
+			retval = common_threshold_handler(tool);
+			if (retval)
+				return retval;
 
-			if (!params->threshold_actions.continue_flag)
-				/* continue flag not set, break */
-				break;
+			if (!should_continue_tracing(params))
+				return 0;
 
-			/* continue action reached, re-enable tracing */
-			if (tool->record)
-				trace_instance_start(&tool->record->trace);
-			if (tool->aa)
-				trace_instance_start(&tool->aa->trace);
-			trace_instance_start(&tool->trace);
+			trace_instance_start(trace);
 		}
 
 		/* is there still any user-threads ? */
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index 7602c5593ef5d..c548decd3c40f 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -143,6 +143,24 @@ struct tool_ops {
 	void (*free)(struct osnoise_tool *tool);
 };
 
+/**
+ * should_continue_tracing - check if tracing should continue after threshold
+ * @params: pointer to the common parameters structure
+ *
+ * Returns true if the continue action was configured (--on-threshold continue),
+ * indicating that tracing should be restarted after handling the threshold event.
+ *
+ * Return: 1 if tracing should continue, 0 otherwise.
+ */
+static inline int
+should_continue_tracing(const struct common_params *params)
+{
+	return params->threshold_actions.continue_flag;
+}
+
+int
+common_threshold_handler(const struct osnoise_tool *tool);
+
 int osnoise_set_cpus(struct osnoise_context *context, char *cpus);
 void osnoise_restore_cpus(struct osnoise_context *context);
 
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 6ea397421f1c9..6b8eaef8a3a09 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -17,6 +17,7 @@
 #include "timerlat.h"
 #include "timerlat_aa.h"
 #include "timerlat_bpf.h"
+#include "common.h"
 
 struct timerlat_hist_cpu {
 	int			*irq;
@@ -1048,7 +1049,6 @@ static struct osnoise_tool
 
 static int timerlat_hist_bpf_main_loop(struct osnoise_tool *tool)
 {
-	struct timerlat_params *params = to_timerlat_params(tool->params);
 	int retval;
 
 	while (!stop_tracing) {
@@ -1056,18 +1056,17 @@ static int timerlat_hist_bpf_main_loop(struct osnoise_tool *tool)
 
 		if (!stop_tracing) {
 			/* Threshold overflow, perform actions on threshold */
-			actions_perform(&params->common.threshold_actions);
+			retval = common_threshold_handler(tool);
+			if (retval)
+				return retval;
 
-			if (!params->common.threshold_actions.continue_flag)
-				/* continue flag not set, break */
+			if (!should_continue_tracing(tool->params))
 				break;
 
-			/* continue action reached, re-enable tracing */
-			if (tool->record)
-				trace_instance_start(&tool->record->trace);
-			if (tool->aa)
-				trace_instance_start(&tool->aa->trace);
-			timerlat_bpf_restart_tracing();
+			if (timerlat_bpf_restart_tracing()) {
+				err_msg("Error restarting BPF trace\n");
+				return -1;
+			}
 		}
 	}
 	timerlat_bpf_detach();
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index dd727cb48b551..c6f6757c3fb6b 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -17,6 +17,7 @@
 #include "timerlat.h"
 #include "timerlat_aa.h"
 #include "timerlat_bpf.h"
+#include "common.h"
 
 struct timerlat_top_cpu {
 	unsigned long long	irq_count;
@@ -801,10 +802,10 @@ static struct osnoise_tool
 static int
 timerlat_top_bpf_main_loop(struct osnoise_tool *tool)
 {
-	struct timerlat_params *params = to_timerlat_params(tool->params);
+	const struct common_params *params = tool->params;
 	int retval, wait_retval;
 
-	if (params->common.aa_only) {
+	if (params->aa_only) {
 		/* Auto-analysis only, just wait for stop tracing */
 		timerlat_bpf_wait(-1);
 		return 0;
@@ -812,8 +813,8 @@ timerlat_top_bpf_main_loop(struct osnoise_tool *tool)
 
 	/* Pull and display data in a loop */
 	while (!stop_tracing) {
-		wait_retval = timerlat_bpf_wait(params->common.quiet ? -1 :
-						params->common.sleep_time);
+		wait_retval = timerlat_bpf_wait(params->quiet ? -1 :
+						params->sleep_time);
 
 		retval = timerlat_top_bpf_pull_data(tool);
 		if (retval) {
@@ -821,28 +822,27 @@ timerlat_top_bpf_main_loop(struct osnoise_tool *tool)
 			return retval;
 		}
 
-		if (!params->common.quiet)
+		if (!params->quiet)
 			timerlat_print_stats(tool);
 
 		if (wait_retval != 0) {
 			/* Stopping requested by tracer */
-			actions_perform(&params->common.threshold_actions);
+			retval = common_threshold_handler(tool);
+			if (retval)
+				return retval;
 
-			if (!params->common.threshold_actions.continue_flag)
-				/* continue flag not set, break */
+			if (!should_continue_tracing(tool->params))
 				break;
 
-			/* continue action reached, re-enable tracing */
-			if (tool->record)
-				trace_instance_start(&tool->record->trace);
-			if (tool->aa)
-				trace_instance_start(&tool->aa->trace);
-			timerlat_bpf_restart_tracing();
+			if (timerlat_bpf_restart_tracing()) {
+				err_msg("Error restarting BPF trace\n");
+				return -1;
+			}
 		}
 
 		/* is there still any user-threads ? */
-		if (params->common.user_workload) {
-			if (params->common.user.stopped_running) {
+		if (params->user_workload) {
+			if (params->user.stopped_running) {
 				debug_msg("timerlat user space threads stopped!\n");
 				break;
 			}
-- 
2.53.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox