Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v7 36/42] KVM: selftests: Provide function to look up guest_memfd details from gpa
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Introduce a new helper, kvm_gpa_to_guest_memfd(), to find the
guest_memfd-related details of a memory region that contains a given guest
physical address (GPA).

The function returns the file descriptor for the memfd, the offset into
the file that corresponds to the GPA, and the number of bytes remaining
in the region from that GPA.

kvm_gpa_to_guest_memfd() was factored out from vm_guest_mem_fallocate();
refactor vm_guest_mem_fallocate() to use the new helper.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h |  3 +++
 tools/testing/selftests/kvm/lib/kvm_util.c     | 37 ++++++++++++++++----------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index d4c285c6fbe44..e9b4ae9596e05 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -428,6 +428,9 @@ static inline void vm_enable_cap(struct kvm_vm *vm, u32 cap, u64 arg0)
 	vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap);
 }
 
+int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, gpa_t gpa, off_t *fd_offset,
+			   size_t *nr_bytes);
+
 /*
  * KVM_SET_MEMORY_ATTRIBUTES{,2} overwrites _all_ attributes.  These
  * flows need significant enhancements to support multiple attributes.
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index daa0c1e835a71..f8f0cd62f2f17 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1283,27 +1283,20 @@ void vm_guest_mem_fallocate(struct kvm_vm *vm, u64 base, u64 size,
 			    bool punch_hole)
 {
 	const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0);
-	struct userspace_mem_region *region;
 	u64 end = base + size;
-	gpa_t gpa, len;
 	off_t fd_offset;
-	int ret;
+	int fd, ret;
+	size_t len;
+	gpa_t gpa;
 
 	for (gpa = base; gpa < end; gpa += len) {
-		u64 offset;
-
-		region = userspace_mem_region_find(vm, gpa, gpa);
-		TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
-			    "Private memory region not found for GPA 0x%lx", gpa);
+		fd = kvm_gpa_to_guest_memfd(vm, gpa, &fd_offset, &len);
+		len = min(end - gpa, len);
 
-		offset = gpa - region->region.guest_phys_addr;
-		fd_offset = region->region.guest_memfd_offset + offset;
-		len = min_t(u64, end - gpa, region->region.memory_size - offset);
-
-		ret = fallocate(region->region.guest_memfd, mode, fd_offset, len);
+		ret = fallocate(fd, mode, fd_offset, len);
 		TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx",
 			    punch_hole ? "punch hole" : "allocate", gpa, len,
-			    region->region.guest_memfd, mode, fd_offset);
+			    fd, mode, fd_offset);
 	}
 }
 
@@ -1640,6 +1633,22 @@ void *addr_gpa2alias(struct kvm_vm *vm, gpa_t gpa)
 	return (void *) ((uintptr_t) region->host_alias + offset);
 }
 
+int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, gpa_t gpa, off_t *fd_offset,
+			   size_t *nr_bytes)
+{
+	struct userspace_mem_region *region;
+	gpa_t gpa_offset;
+
+	region = userspace_mem_region_find(vm, gpa, gpa);
+	TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
+		    "guest_memfd memory region not found for GPA 0x%lx", gpa);
+
+	gpa_offset = gpa - region->region.guest_phys_addr;
+	*fd_offset = region->region.guest_memfd_offset + gpa_offset;
+	*nr_bytes = region->region.memory_size - gpa_offset;
+	return region->region.guest_memfd;
+}
+
 /* Create an interrupt controller chip for the specified VM. */
 void vm_create_irqchip(struct kvm_vm *vm)
 {

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 37/42] KVM: selftests: Provide common function to set memory attributes
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Sean Christopherson <seanjc@google.com>

Introduce vm_mem_set_memory_attributes(), which handles setting of memory
attributes for a range of guest physical addresses, regardless of whether
the attributes should be set via guest_memfd or via the memory attributes
at the VM level.

Refactor existing vm_mem_set_{shared,private} functions to use the new
function. Opportunistically update the size parameter to use size_t instead
of u64.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h | 46 +++++++++++++++++++-------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index e9b4ae9596e05..a86418cdf5f4f 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -454,18 +454,6 @@ static inline void vm_set_memory_attributes(struct kvm_vm *vm, gpa_t gpa,
 	vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr);
 }
 
-static inline void vm_mem_set_private(struct kvm_vm *vm, gpa_t gpa,
-				      u64 size)
-{
-	vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
-}
-
-static inline void vm_mem_set_shared(struct kvm_vm *vm, gpa_t gpa,
-				     u64 size)
-{
-	vm_set_memory_attributes(vm, gpa, size, 0);
-}
-
 static inline int __gmem_set_memory_attributes(int fd, u64 offset,
 					       size_t size, u64 attributes,
 					       u64 *error_offset)
@@ -532,6 +520,40 @@ static inline void gmem_set_shared(int fd, u64 offset, size_t size)
 	gmem_set_memory_attributes(fd, offset, size, 0);
 }
 
+static inline void vm_mem_set_memory_attributes(struct kvm_vm *vm, gpa_t gpa,
+						size_t size, u64 attrs)
+{
+	if (kvm_has_gmem_attributes) {
+		gpa_t end = gpa + size;
+		off_t fd_offset;
+		gpa_t addr;
+		size_t len;
+		int fd;
+
+		for (addr = gpa; addr < end; addr += len) {
+			fd = kvm_gpa_to_guest_memfd(vm, addr, &fd_offset, &len);
+			len = min(end - addr, len);
+
+			gmem_set_memory_attributes(fd, fd_offset, len, attrs);
+		}
+	} else {
+		vm_set_memory_attributes(vm, gpa, size, attrs);
+	}
+}
+
+static inline void vm_mem_set_private(struct kvm_vm *vm, gpa_t gpa,
+				      size_t size)
+{
+	vm_mem_set_memory_attributes(vm, gpa, size,
+				     KVM_MEMORY_ATTRIBUTE_PRIVATE);
+}
+
+static inline void vm_mem_set_shared(struct kvm_vm *vm, gpa_t gpa,
+				     size_t size)
+{
+	vm_mem_set_memory_attributes(vm, gpa, size, 0);
+}
+
 void vm_guest_mem_fallocate(struct kvm_vm *vm, gpa_t gpa, u64 size,
 			    bool punch_hole);
 

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 38/42] KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Sean Christopherson <seanjc@google.com>

Assert that any valid fd provided to mmap() is accompanied by MAP_SHARED.

With an invalid fd (usually used for anonymous mappings), there are no
constraints on mmap() flags.

Add this check to make sure that when a guest_memfd is used as region->fd,
the flag provided to mmap() will include MAP_SHARED.

Signed-off-by: Sean Christopherson <seanjc@google.com>
[Rephrase assertion message.]
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index f8f0cd62f2f17..21c7e52a2bdac 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1088,6 +1088,9 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 					     src_type == VM_MEM_SRC_SHARED_HUGETLB);
 	}
 
+	TEST_ASSERT(region->fd == -1 || backing_src_is_shared(src_type),
+		    "A valid fd provided to mmap() must be accompanied by MAP_SHARED.");
+
 	region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
 					vm_mem_backing_src_alias(src_type)->flag,
 					region->fd, mmap_offset);

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 39/42] KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Ackerley Tng <ackerleytng@google.com>

The TEST_EXPECT_SIGBUS macro is not thread-safe as it uses a global
sigjmp_buf and installs a global SIGBUS signal handler. If multiple threads
execute the macro concurrently, they will race on installing the signal
handler and stomp on other threads' jump buffers, leading to incorrect test
behavior.

Make TEST_EXPECT_SIGBUS thread-safe with the following changes:

Share the KVM tests' global signal handler. sigaction() applies to all
threads; without sharing a global signal handler, one thread may have
removed the signal handler that another thread added, hence leading to
unexpected signals.

The alternative of layering signal handlers was considered, but calling
sigaction() within TEST_EXPECT_SIGBUS() necessarily creates a race. To
avoid adding new setup and teardown routines to do sigaction() and keep
usage of TEST_EXPECT_SIGBUS() simple, share the KVM tests' global signal
handler.

Opportunistically rename report_unexpected_signal to
catchall_signal_handler.

To continue to only expect SIGBUS within specific regions of code, use a
thread-specific variable, expecting_sigbus, to replace installing and
removing signal handlers.

Make the execution environment for the thread, sigjmp_buf, a
thread-specific variable.

As part of TEST_EXPECT_SIGBUS(), assert the prerequisite for this setup,
that the current signal handler is the catchall_signal_handler.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/include/test_util.h | 32 +++++++++++++------------
 tools/testing/selftests/kvm/lib/kvm_util.c      | 18 ++++++++++----
 tools/testing/selftests/kvm/lib/test_util.c     |  7 ------
 3 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index c280c3233f502..c9ba4e010f0b8 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -82,21 +82,23 @@ do {									\
 	__builtin_unreachable(); \
 } while (0)
 
-extern sigjmp_buf expect_sigbus_jmpbuf;
-void expect_sigbus_handler(int signum);
-
-#define TEST_EXPECT_SIGBUS(action)						\
-do {										\
-	struct sigaction sa_old, sa_new = {					\
-		.sa_handler = expect_sigbus_handler,				\
-	};									\
-										\
-	sigaction(SIGBUS, &sa_new, &sa_old);					\
-	if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) {				\
-		action;								\
-		TEST_FAIL("'%s' should have triggered SIGBUS", #action);	\
-	}									\
-	sigaction(SIGBUS, &sa_old, NULL);					\
+extern __thread sigjmp_buf expect_sigbus_jmpbuf;
+extern __thread volatile sig_atomic_t expecting_sigbus;
+extern void catchall_signal_handler(int signum);
+
+#define TEST_EXPECT_SIGBUS(action)					\
+do {									\
+	struct sigaction __sa = {};					\
+									\
+	TEST_ASSERT_EQ(sigaction(SIGBUS, NULL, &__sa), 0);		\
+	TEST_ASSERT_EQ(__sa.sa_handler, &catchall_signal_handler);	\
+									\
+	expecting_sigbus = true;					\
+	if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) {			\
+		action;							\
+		TEST_FAIL("'%s' should have triggered SIGBUS", #action);\
+	}								\
+	expecting_sigbus = false;					\
 } while (0)
 
 size_t parse_size(const char *size);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 21c7e52a2bdac..a7725fff58b46 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2270,13 +2270,20 @@ __weak void kvm_selftest_arch_init(void)
 {
 }
 
-static void report_unexpected_signal(int signum)
+__thread sigjmp_buf expect_sigbus_jmpbuf;
+__thread volatile sig_atomic_t expecting_sigbus;
+
+void catchall_signal_handler(int signum)
 {
+	switch (signum) {
+	case SIGBUS: {
+		if (expecting_sigbus)
+			siglongjmp(expect_sigbus_jmpbuf, 1);
+
+		TEST_FAIL("Unexpected SIGBUS (%d)\n", signum);
+	}
 #define KVM_CASE_SIGNUM(sig)					\
 	case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum)
-
-	switch (signum) {
-	KVM_CASE_SIGNUM(SIGBUS);
 	KVM_CASE_SIGNUM(SIGSEGV);
 	KVM_CASE_SIGNUM(SIGILL);
 	KVM_CASE_SIGNUM(SIGFPE);
@@ -2288,12 +2295,13 @@ static void report_unexpected_signal(int signum)
 void __attribute((constructor)) kvm_selftest_init(void)
 {
 	struct sigaction sig_sa = {
-		.sa_handler = report_unexpected_signal,
+		.sa_handler = catchall_signal_handler,
 	};
 
 	/* Tell stdout not to buffer its content. */
 	setbuf(stdout, NULL);
 
+	expecting_sigbus = false;
 	sigaction(SIGBUS, &sig_sa, NULL);
 	sigaction(SIGSEGV, &sig_sa, NULL);
 	sigaction(SIGILL, &sig_sa, NULL);
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index bab1bd2b775b6..30eb701e4becd 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -18,13 +18,6 @@
 
 #include "test_util.h"
 
-sigjmp_buf expect_sigbus_jmpbuf;
-
-void __attribute__((used)) expect_sigbus_handler(int signum)
-{
-	siglongjmp(expect_sigbus_jmpbuf, 1);
-}
-
 /*
  * Random number generator that is usable from guest code. This is the
  * Park-Miller LCG using standard constants.

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 40/42] KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Update the private memory conversions selftest to also test conversions
that are done "in-place" via per-guest_memfd memory attributes. In-place
conversions require the host to be able to mmap() the guest_memfd so that
the host and guest can share the same backing physical memory.

This includes several updates that are conditioned on the system
supporting per-guest_memfd attributes (kvm_has_gmem_attributes):

1. Set up guest_memfd requesting MMAP and INIT_SHARED.

2. With in-place conversions, the host's mapping points directly to the
   guest's memory. When the guest converts a region to private, host access
   to that region is blocked. Update the test to expect a SIGBUS when
   attempting to access the host virtual address (HVA) of private memory.

3. Use vm_mem_set_memory_attributes(), which chooses how to set memory
   attributes based on whether kvm_has_gmem_attributes.

Restrict the test to using VM_MEM_SRC_SHMEM because guest_memfd's required
mmap() flags and page sizes happen to align with those of
VM_MEM_SRC_SHMEM. As long as VM_MEM_SRC_SHMEM is used for src_type,
vm_mem_add() works as intended.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../kvm/x86/private_mem_conversions_test.c         | 44 ++++++++++++++++++----
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
index 289ad10063fca..4308c67952310 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
@@ -306,9 +306,12 @@ static void handle_exit_hypercall(struct kvm_vcpu *vcpu)
 	if (do_fallocate)
 		vm_guest_mem_fallocate(vm, gpa, size, map_shared);
 
-	if (set_attributes)
-		vm_set_memory_attributes(vm, gpa, size,
-					 map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE);
+	if (set_attributes) {
+		u64 attrs = map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+		vm_mem_set_memory_attributes(vm, gpa, size, attrs);
+	}
+
 	run->hypercall.ret = 0;
 }
 
@@ -352,8 +355,20 @@ static void *__test_mem_conversions(void *__vcpu)
 				size_t nr_bytes = min_t(size_t, vm->page_size, size - i);
 				u8 *hva = addr_gpa2hva(vm, gpa + i);
 
-				/* In all cases, the host should observe the shared data. */
-				memcmp_h(hva, gpa + i, uc.args[3], nr_bytes);
+				/*
+				 * When using per-guest_memfd memory attributes,
+				 * i.e. in-place conversion, host accesses will
+				 * point at guest memory and should SIGBUS when
+				 * guest memory is private.  When using per-VM
+				 * attributes, i.e. separate backing for shared
+				 * vs. private, the host should always observe
+				 * the shared data.
+				 */
+				if (kvm_has_gmem_attributes &&
+				    uc.args[0] == SYNC_PRIVATE)
+					TEST_EXPECT_SIGBUS(READ_ONCE(*hva));
+				else
+					memcmp_h(hva, gpa + i, uc.args[3], nr_bytes);
 
 				/* For shared, write the new pattern to guest memory. */
 				if (uc.args[0] == SYNC_SHARED)
@@ -382,6 +397,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, u32 nr_v
 	const size_t slot_size = memfd_size / nr_memslots;
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	pthread_t threads[KVM_MAX_VCPUS];
+	u64 gmem_flags;
 	struct kvm_vm *vm;
 	int memfd, i;
 
@@ -397,12 +413,17 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, u32 nr_v
 
 	vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE));
 
-	memfd = vm_create_guest_memfd(vm, memfd_size, 0);
+	if (kvm_has_gmem_attributes)
+		gmem_flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;
+	else
+		gmem_flags = 0;
+
+	memfd = vm_create_guest_memfd(vm, memfd_size, gmem_flags);
 
 	for (i = 0; i < nr_memslots; i++)
 		vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
 			   BASE_DATA_SLOT + i, slot_size / vm->page_size,
-			   KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, 0);
+			   KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, gmem_flags);
 
 	for (i = 0; i < nr_vcpus; i++) {
 		gpa_t gpa =  BASE_DATA_GPA + i * per_cpu_size;
@@ -452,17 +473,24 @@ static void usage(const char *cmd)
 
 int main(int argc, char *argv[])
 {
-	enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC;
+	enum vm_mem_backing_src_type src_type;
 	u32 nr_memslots = 1;
 	u32 nr_vcpus = 1;
 	int opt;
 
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
 
+	src_type = kvm_has_gmem_attributes ? VM_MEM_SRC_SHMEM :
+					     DEFAULT_VM_MEM_SRC;
+
 	while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) {
 		switch (opt) {
 		case 's':
 			src_type = parse_backing_src_type(optarg);
+			TEST_ASSERT(!kvm_has_gmem_attributes ||
+				    src_type == VM_MEM_SRC_SHMEM,
+				    "Testing in-place conversions, only %s mem_type supported\n",
+				    vm_mem_backing_src_alias(VM_MEM_SRC_SHMEM)->name);
 			break;
 		case 'n':
 			nr_vcpus = atoi_positive("nr_vcpus", optarg);

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 41/42] KVM: selftests: Add script to exercise private_mem_conversions_test
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Add a wrapper script to simplify running the private_mem_conversions_test
with a variety of configurations. Manually invoking the test for all
supported memory backing source types is tedious.

The script automatically detects the availability of 2MB and 1GB hugepages
and builds a list of source types to test. It then iterates through the
list, running the test for each type with both a single memslot and
multiple memslots.

This makes it easier to get comprehensive test coverage across different
memory configurations.

Add a small helper program in C, used by the script, that picks up
KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES as defined in the header files and
issues the ioctl to query that KVM capability.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm           |   4 +
 .../selftests/kvm/kvm_has_gmem_attributes.c        |  17 +++
 .../kvm/x86/private_mem_conversions_test.sh        | 128 +++++++++++++++++++++
 3 files changed, 149 insertions(+)

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 6232881be500a..e5769268936a7 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -54,6 +54,7 @@ LIBKVM_loongarch += lib/loongarch/exception.S
 
 # Non-compiled test targets
 TEST_PROGS_x86 += x86/nx_huge_pages_test.sh
+TEST_PROGS_x86 += x86/private_mem_conversions_test.sh
 
 # Compiled test targets valid on all architectures with libkvm support
 TEST_GEN_PROGS_COMMON = demand_paging_test
@@ -67,6 +68,8 @@ TEST_GEN_PROGS_COMMON += set_memory_region_test
 TEST_GEN_PROGS_COMMON += memslot_modification_stress_test
 TEST_GEN_PROGS_COMMON += memslot_perf_test
 
+TEST_GEN_PROGS_EXTENDED_COMMON += kvm_has_gmem_attributes
+
 # Compiled test targets
 TEST_GEN_PROGS_x86 = $(TEST_GEN_PROGS_COMMON)
 TEST_GEN_PROGS_x86 += x86/cpuid_test
@@ -245,6 +248,7 @@ SPLIT_TESTS += get-reg-list
 
 TEST_PROGS += $(TEST_PROGS_$(ARCH))
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH))
+TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_COMMON)
 TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH))
 LIBKVM += $(LIBKVM_$(ARCH))
 
diff --git a/tools/testing/selftests/kvm/kvm_has_gmem_attributes.c b/tools/testing/selftests/kvm/kvm_has_gmem_attributes.c
new file mode 100644
index 0000000000000..4f361349412fb
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_has_gmem_attributes.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Utility to check if KVM supports guest_memfd attributes.
+ *
+ * Copyright (C) 2025, Google LLC.
+ */
+
+#include <stdio.h>
+
+#include "kvm_util.h"
+
+int main(void)
+{
+	printf("%u\n", kvm_check_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES) > 0);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
new file mode 100755
index 0000000000000..7179a4fcdd498
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Wrapper script which runs different test setups of
+# private_mem_conversions_test.
+#
+# Copyright (C) 2025, Google LLC.
+
+NUM_VCPUS_TO_TEST=4
+NUM_MEMSLOTS_TO_TEST=$NUM_VCPUS_TO_TEST
+
+# Required pages are based on the test setup in the C code.
+REQUIRED_NUM_2M_HUGEPAGES=$((1024 * NUM_VCPUS_TO_TEST))
+REQUIRED_NUM_1G_HUGEPAGES=$((2 * NUM_VCPUS_TO_TEST))
+
+get_hugepage_count() {
+    local page_size_kb=$1
+    local path="/sys/kernel/mm/hugepages/hugepages-${page_size_kb}kB/nr_hugepages"
+    if [ -f "$path" ]; then
+        cat "$path"
+    else
+        echo 0
+    fi
+}
+
+get_default_hugepage_size_in_kb() {
+    local size=$(grep "Hugepagesize:" /proc/meminfo | awk '{print $2}')
+    echo "$size"
+}
+
+run_tests() {
+    local executable_path=$1
+    local src_type=$2
+    local num_memslots=$3
+    local num_vcpus=$4
+
+    echo "$executable_path -s $src_type -m $num_memslots -n $num_vcpus"
+    "$executable_path" -s "$src_type" -m "$num_memslots" -n "$num_vcpus"
+}
+
+script_dir=$(dirname "$(realpath "$0")")
+test_executable="${script_dir}/private_mem_conversions_test"
+kvm_has_gmem_attributes_tool="${script_dir}/../kvm_has_gmem_attributes"
+
+if [ ! -f "$test_executable" ]; then
+    echo "Error: Test executable not found at '$test_executable'" >&2
+    exit 1
+fi
+
+if [ ! -f "$kvm_has_gmem_attributes_tool" ]; then
+    echo "Error: kvm_has_gmem_attributes utility not found at '$kvm_has_gmem_attributes_tool'" >&2
+    exit 1
+fi
+
+kvm_has_gmem_attributes=$("$kvm_has_gmem_attributes_tool" | tail -n1)
+
+if [ "$kvm_has_gmem_attributes" -eq 1 ]; then
+    backing_src_types=("shmem")
+else
+    hugepage_2mb_count=$(get_hugepage_count 2048)
+    hugepage_2mb_enabled=$((hugepage_2mb_count >= REQUIRED_NUM_2M_HUGEPAGES))
+    hugepage_1gb_count=$(get_hugepage_count 1048576)
+    hugepage_1gb_enabled=$((hugepage_1gb_count >= REQUIRED_NUM_1G_HUGEPAGES))
+
+    default_hugepage_size_kb=$(get_default_hugepage_size_in_kb)
+    hugepage_default_enabled=0
+    if [ "$default_hugepage_size_kb" -eq 2048 ]; then
+        hugepage_default_enabled=$hugepage_2mb_enabled
+    elif [ "$default_hugepage_size_kb" -eq 1048576 ]; then
+        hugepage_default_enabled=$hugepage_1gb_enabled
+    fi
+
+    backing_src_types=("anonymous" "anonymous_thp")
+
+    if [ "$hugepage_default_enabled" -eq 1 ]; then
+        backing_src_types+=("anonymous_hugetlb")
+    else
+        echo "skipping anonymous_hugetlb backing source type"
+    fi
+
+    if [ "$hugepage_2mb_enabled" -eq 1 ]; then
+        backing_src_types+=("anonymous_hugetlb_2mb")
+    else
+        echo "skipping anonymous_hugetlb_2mb backing source type"
+    fi
+
+    if [ "$hugepage_1gb_enabled" -eq 1 ]; then
+        backing_src_types+=("anonymous_hugetlb_1gb")
+    else
+        echo "skipping anonymous_hugetlb_1gb backing source type"
+    fi
+
+    backing_src_types+=("shmem")
+
+    if [ "$hugepage_default_enabled" -eq 1 ]; then
+        backing_src_types+=("shared_hugetlb")
+    else
+        echo "skipping shared_hugetlb backing source type"
+    fi
+fi
+
+return_code=0
+for i in "${!backing_src_types[@]}"; do
+    src_type=${backing_src_types[$i]}
+    if [ "$i" -gt 0 ]; then
+        echo
+    fi
+    # Capture the status with "||": inside "if ! cmd; then", $? is always 0.
+    run_tests "$test_executable" "$src_type" 1 1 || return_code=$?
+    if [ "$return_code" -ne 0 ]; then
+        echo "Test failed for source type '$src_type'. Arguments: -s $src_type -m 1 -n 1" >&2
+        break
+    fi
+
+    run_tests "$test_executable" "$src_type" 1 "$NUM_VCPUS_TO_TEST" || return_code=$?
+    if [ "$return_code" -ne 0 ]; then
+        echo "Test failed for source type '$src_type'. Arguments: -s $src_type -m 1 -n $NUM_VCPUS_TO_TEST" >&2
+        break
+    fi
+
+    run_tests "$test_executable" "$src_type" "$NUM_MEMSLOTS_TO_TEST" "$NUM_VCPUS_TO_TEST" || return_code=$?
+    if [ "$return_code" -ne 0 ]; then
+        echo "Test failed for source type '$src_type'. Arguments: -s $src_type -m $NUM_MEMSLOTS_TO_TEST -n $NUM_VCPUS_TO_TEST" >&2
+        break
+    fi
+done
+
+exit "$return_code"

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 42/42] KVM: selftests: Update private memory exits test to work with per-gmem attributes
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Sean Christopherson <seanjc@google.com>

Skip setting memory to private in the private memory exits test when using
per-gmem memory attributes, as memory is initialized to private by default
for guest_memfd, and using vm_mem_set_private() on a guest_memfd instance
requires creating guest_memfd with GUEST_MEMFD_FLAG_MMAP (which is totally
doable, but would need to be conditional and is ultimately unnecessary).

Expect an emulated MMIO instead of a memory fault exit when attributes are
per-gmem, as deleting the memslot effectively drops the private status,
i.e. the GPA becomes shared and thus supports emulated MMIO.

Skip the "memslot not private" test entirely, as private vs. shared state
for x86 software-protected VMs comes from the memory attributes themselves,
and so when doing in-place conversions there can never be a disconnect
between the expected and actual states.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 .../selftests/kvm/x86/private_mem_kvm_exits_test.c | 36 ++++++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
index 10db9fe6d9063..70ed16066c63e 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
@@ -62,8 +62,9 @@ static void test_private_access_memslot_deleted(void)
 
 	virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
 
-	/* Request to access page privately */
-	vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
+	/* Request to access page privately. */
+	if (!kvm_has_gmem_attributes)
+		vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
 
 	pthread_create(&vm_thread, NULL,
 		       (void *(*)(void *))run_vcpu_get_exit_reason,
@@ -74,10 +75,26 @@ static void test_private_access_memslot_deleted(void)
 	pthread_join(vm_thread, &thread_return);
 	exit_reason = (u32)(u64)thread_return;
 
-	TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
-	TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
-	TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
-	TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+	/*
+	 * If attributes are tracked per-gmem, deleting the memslot that points
+	 * at the gmem instance effectively makes the memory shared, and so the
+	 * read should trigger emulated MMIO.
+	 *
+	 * If attributes are tracked per-VM, deleting the memslot shouldn't
+	 * affect the private attribute, and so KVM should generate a memory
+	 * fault exit (emulated MMIO on private GPAs is disallowed).
+	 */
+	if (kvm_has_gmem_attributes) {
+		TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MMIO);
+		TEST_ASSERT_EQ(vcpu->run->mmio.phys_addr, EXITS_TEST_GPA);
+		TEST_ASSERT_EQ(vcpu->run->mmio.len, sizeof(u64));
+		TEST_ASSERT_EQ(vcpu->run->mmio.is_write, false);
+	} else {
+		TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
+		TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
+		TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
+		TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+	}
 
 	kvm_vm_free(vm);
 }
@@ -88,6 +105,13 @@ static void test_private_access_memslot_not_private(void)
 	struct kvm_vcpu *vcpu;
 	u32 exit_reason;
 
+	/*
+	 * Accessing non-private memory as private with a software-protected VM
+	 * isn't possible when doing in-place conversions.
+	 */
+	if (kvm_has_gmem_attributes)
+		return;
+
 	vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu,
 					   guest_repeatedly_read);
 

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-05-23 20:09 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	john.g.garry, loberman, neelx, sean, mproche, chjohnst,
	linux-block, linux-kernel, linux-trace-kernel

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices (SSDs) are starved of hardware
tags when sharing the same blk_mq_tag_set.

Currently, diagnosing this specific hardware queue contention is
difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
forces the current thread to block uninterruptible via io_schedule().
While this can be inferred via sched:sched_switch or dynamically
traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
dedicated, out-of-the-box observability for this event.

This patch introduces the block_rq_tag_wait tracepoint in the tag
allocation slow-path. It triggers immediately before the task state
is altered to TASK_UNINTERRUPTIBLE (ensuring safety for PREEMPT_RT
locks). It exposes the exact hardware context (hctx) that is starved,
the specific pool experiencing starvation (driver, software scheduler,
or reserved), and the exact pool depth.

This provides storage engineers with a zero-configuration, low-overhead
mechanism to definitively identify shared-tag bottlenecks. For example,
userspace can trivially replicate tag starvation counters using bpftrace:

    # bpftrace -e 'tracepoint:block:block_rq_tag_wait { @tag_waits[cpu] = count(); }'
    Attaching 1 probe...
    ^C
    @tag_waits[4]: 12
    @tag_waits[12]: 87

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Laurence Oberman <loberman@redhat.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
Changes since v6 [1]:
 - Dropped Patch 2. Observability is now driven entirely by the tracepoint,
   with the commit message updated to demonstrate how userspace (e.g.,
   bpftrace) can safely replicate counting out-of-band (Jens Axboe)

 - Moved tracepoint call above sbitmap_prepare_to_wait(). This prevents
   inadvertently resetting the task state under PREEMPT_RT locks

 - Updated the tracepoint signature and TP_fast_assign block to evaluate
   the allocation flags. If the submitting context is starved of a reserved
   tag (BLK_MQ_REQ_RESERVED), the tracepoint now accurately reports the
   severely constrained nr_reserved_tags depth instead of the total nr_tags
   depth.

Changes since v5 [2]:
 - Replaced this_cpu_inc() with raw_cpu_inc() within
   blk_mq_debugfs_inc_wait_tags(). This resolves a preemption warning
   triggered under CONFIG_DEBUG_PREEMPT=y, as the routine is invoked from a
   preemptible context immediately prior to io_schedule(). This adjustment
   deliberately prioritises the reduction of execution overhead over
   absolute statistical precision for this diagnostic interface.

Changes since v4 [3]:
 - Prevented a NULL pointer dereference in the tracepoint fast-assign for
   disk-less request queues by safely checking q->disk before resolving the
   dev_t

 - Fixed a Use-After-Free (UAF) and permanent memory leak by decoupling
   the per-CPU counter allocation from the volatile debugfs lifecycle and
   tying it directly to the core hctx lifecycle (i.e., blk_mq_init_hctx()
   and blk_mq_exit_hctx())

 - Fixed a potential compiler double-fetch bug by wrapping the per-CPU
   pointer evaluations with READ_ONCE() in blk_mq_debugfs_inc_wait_tags()

 - Passed the appropriate gfp_t flags down to the allocation routines to
   maintain the strict GFP_NOIO context

 - Updated kernel-doc descriptions to clarify that the NULL pointer
   checks guard against memory allocation failures under pressure, rather
   than initialisation race conditions

Changes since v3 [4]:
 - Transitioned tracking architecture from shared atomic_t variables to
   dynamically allocated per-CPU counters to resolve cache line bouncing
   (Bart Van Assche)

Changes since v2 [5]:
 - Added "Reviewed-by:" and "Tested-by:" tags for patch 1

 - Evaluate is_sched_tag directly within TP_fast_assign (Steven Rostedt)

 - Introduced atomic counters via debugfs

Changes since v1 [6]:
 - Improved the description of the trace point (Damien Le Moal)

 - Removed the redundant "active requests" (Laurence Oberman)

 - Introduced pool-specific starvation tracking

[1]: https://lore.kernel.org/lkml/20260517213614.350367-1-atomlin@atomlin.com/
[2]: https://lore.kernel.org/lkml/20260427020142.358912-1-atomlin@atomlin.com/
[3]: https://lore.kernel.org/lkml/20260419023036.1419514-1-atomlin@atomlin.com/
[4]: https://lore.kernel.org/lkml/20260319221956.332770-1-atomlin@atomlin.com/
[5]: https://lore.kernel.org/lkml/20260319015300.287653-1-atomlin@atomlin.com/
[6]: https://lore.kernel.org/lkml/20260317182835.258183-1-atomlin@atomlin.com/
---
 block/blk-mq-tag.c           |  6 +++++
 include/trace/events/block.h | 50 ++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..35deee5bbc73 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/kmemleak.h>
 
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
@@ -181,6 +182,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		/* Log the starvation event before altering task state */
+		trace_block_rq_tag_wait(data->q, data->hctx,
+					data->rq_flags & RQF_SCHED_TAGS,
+					data->flags);
+
 		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 6aa79e2d799c..15b2e0edd2d4 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -226,6 +226,56 @@ DECLARE_EVENT_CLASS(block_rq,
 		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
 );
 
+/**
+ * block_rq_tag_wait - triggered when a request is starved of a tag
+ * @q: request queue of the target device
+ * @hctx: hardware context of the request experiencing starvation
+ * @is_sched_tag: indicates whether the starved pool is the software scheduler
+ * @alloc_flags: allocation flags dictating the specific tag pool
+ *
+ * Called immediately before the submitting context is forced to block due
+ * to the exhaustion of available tags (i.e., physical hardware driver
+ * tags, software scheduler tags, or reserved tags). This trace point
+ * indicates that the context will be placed into an uninterruptible state
+ * via io_schedule() until an active request completes and relinquishes its
+ * assigned tag.
+ */
+TRACE_EVENT(block_rq_tag_wait,
+
+	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+		 bool is_sched_tag, unsigned int alloc_flags),
+
+	TP_ARGS(q, hctx, is_sched_tag, alloc_flags),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( u32,		hctx_id			)
+		__field( u32,		nr_tags			)
+		__field( bool,		is_sched_tag		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= q->disk ? disk_devt(q->disk) : 0;
+		__entry->hctx_id	= hctx->queue_num;
+		__entry->is_sched_tag	= is_sched_tag;
+
+		if (is_sched_tag) {
+			__entry->nr_tags = hctx->sched_tags->nr_tags;
+		} else if (alloc_flags & BLK_MQ_REQ_RESERVED) {
+			__entry->nr_tags = hctx->tags->nr_reserved_tags;
+		} else {
+			__entry->nr_tags = hctx->tags->nr_tags;
+		}
+
+	),
+
+	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->hctx_id,
+		  __entry->is_sched_tag ? "scheduler" : "hardware",
+		  __entry->nr_tags)
+);
+
 /**
  * block_rq_insert - insert block operation request into queue
  * @rq: block IO operation request

base-commit: 6779b50faa562e6cca1aa6a4649a4d764c6c7e28
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH v7] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-05-23 23:55 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	john.g.garry, loberman, neelx, sean, mproche, chjohnst,
	linux-block, linux-kernel, linux-trace-kernel
In-Reply-To: <20260523200942.587199-1-atomlin@atomlin.com>

[-- Attachment #1: Type: text/plain, Size: 2171 bytes --]

On Sat, May 23, 2026 at 04:09:42PM -0400, Aaron Tomlin wrote:
> +/**
> + * block_rq_tag_wait - triggered when a request is starved of a tag
> + * @q: request queue of the target device
> + * @hctx: hardware context of the request experiencing starvation
> + * @is_sched_tag: indicates whether the starved pool is the software scheduler
> + * @alloc_flags: allocation flags dictating the specific tag pool
> + *
> + * Called immediately before the submitting context is forced to block due
> + * to the exhaustion of available tags (i.e., physical hardware driver
> + * tags, software scheduler tags, or reserved tags). This trace point
> + * indicates that the context will be placed into an uninterruptible state
> + * via io_schedule() until an active request completes and relinquishes its
> + * assigned tag.
> + */
> +TRACE_EVENT(block_rq_tag_wait,
> +
> +	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
> +		 bool is_sched_tag, unsigned int alloc_flags),
> +
> +	TP_ARGS(q, hctx, is_sched_tag, alloc_flags),
> +
> +	TP_STRUCT__entry(
> +		__field( dev_t,		dev			)
> +		__field( u32,		hctx_id			)
> +		__field( u32,		nr_tags			)
> +		__field( bool,		is_sched_tag		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= q->disk ? disk_devt(q->disk) : 0;
> +		__entry->hctx_id	= hctx->queue_num;
> +		__entry->is_sched_tag	= is_sched_tag;
> +
> +		if (is_sched_tag) {
> +			__entry->nr_tags = hctx->sched_tags->nr_tags;
> +		} else if (alloc_flags & BLK_MQ_REQ_RESERVED) {
> +			__entry->nr_tags = hctx->tags->nr_reserved_tags;
> +		} else {
> +			__entry->nr_tags = hctx->tags->nr_tags;
> +		}
> +
> +	),
> +
> +	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->hctx_id,
> +		  __entry->is_sched_tag ? "scheduler" : "hardware",
> +		  __entry->nr_tags)
> +);
> +
>  /**
>   * block_rq_insert - insert block operation request into queue
>   * @rq: block IO operation request

I completely overlooked that a request could legitimately have both
RQF_SCHED_TAGS and BLK_MQ_REQ_RESERVED set simultaneously.

-- 
Aaron Tomlin

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [PATCH v8] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-05-24  1:42 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	john.g.garry, loberman, neelx, sean, mproche, chjohnst,
	linux-block, linux-kernel, linux-trace-kernel

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices (SSDs) are starved of hardware
tags when sharing the same blk_mq_tag_set.

Currently, diagnosing this specific hardware queue contention is
difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
forces the current thread to block uninterruptible via io_schedule().
While this can be inferred via sched:sched_switch or dynamically
traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
dedicated, out-of-the-box observability for this event.

This patch introduces the block_rq_tag_wait tracepoint in the tag
allocation slow-path. It triggers immediately before the task state
is altered to TASK_UNINTERRUPTIBLE (ensuring safety for PREEMPT_RT
locks). It exposes the exact hardware context (hctx) that is starved,
the specific pool experiencing starvation (driver, software scheduler,
or reserved), and the exact pool depth.

This provides storage engineers with a zero-configuration, low-overhead
mechanism to definitively identify shared-tag bottlenecks. For example,
userspace can trivially replicate tag starvation counters using bpftrace:

    # bpftrace -e 'tracepoint:block:block_rq_tag_wait { @tag_waits[cpu] = count(); }'
    Attaching 1 probe...
    ^C
    @tag_waits[4]: 12
    @tag_waits[12]: 87

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Laurence Oberman <loberman@redhat.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
Changes since v7 [1]:
 - Added an is_reserved boolean to the trace record to explicitly expose
   reserved pool starvation to userspace

 - Fixed TP_fast_assign to report the correct nr_reserved_tags depth
   when I/O schedulers utilise the reserved pool

Changes since v6 [2]:
 - Dropped Patch 2. Observability is now driven entirely by the tracepoint,
   with the commit message updated to demonstrate how userspace (e.g.,
   bpftrace) can safely replicate counting out-of-band (Jens Axboe)

 - Moved tracepoint call above sbitmap_prepare_to_wait(). This prevents
   inadvertently resetting the task state under PREEMPT_RT locks

 - Updated the tracepoint signature and TP_fast_assign block to evaluate
   the allocation flags. If the submitting context is starved of a reserved
   tag (BLK_MQ_REQ_RESERVED), the tracepoint now accurately reports the
   severely constrained nr_reserved_tags depth instead of the total nr_tags
   depth.

Changes since v5 [3]:
 - Replaced this_cpu_inc() with raw_cpu_inc() within
   blk_mq_debugfs_inc_wait_tags(). This resolves a preemption warning
   triggered under CONFIG_DEBUG_PREEMPT=y, as the routine is invoked from a
   preemptible context immediately prior to io_schedule(). This adjustment
   deliberately prioritises the reduction of execution overhead over
   absolute statistical precision for this diagnostic interface.

Changes since v4 [4]:
 - Prevented a NULL pointer dereference in the tracepoint fast-assign for
   disk-less request queues by safely checking q->disk before resolving the
   dev_t

 - Fixed a Use-After-Free (UAF) and permanent memory leak by decoupling
   the per-CPU counter allocation from the volatile debugfs lifecycle and
   tying it directly to the core hctx lifecycle (i.e., blk_mq_init_hctx()
   and blk_mq_exit_hctx())

 - Fixed a potential compiler double-fetch bug by wrapping the per-CPU
   pointer evaluations with READ_ONCE() in blk_mq_debugfs_inc_wait_tags()

 - Passed the appropriate gfp_t flags down to the allocation routines to
   maintain the strict GFP_NOIO context

 - Updated kernel-doc descriptions to clarify that the NULL pointer
   checks guard against memory allocation failures under pressure, rather
   than initialisation race conditions

Changes since v3 [5]:
 - Transitioned tracking architecture from shared atomic_t variables to
   dynamically allocated per-CPU counters to resolve cache line bouncing
   (Bart Van Assche)

Changes since v2 [6]:
 - Added "Reviewed-by:" and "Tested-by:" tags for patch 1

 - Evaluate is_sched_tag directly within TP_fast_assign (Steven Rostedt)

 - Introduced atomic counters via debugfs

Changes since v1 [7]:
 - Improved the description of the trace point (Damien Le Moal)

 - Removed the redundant "active requests" (Laurence Oberman)

 - Introduced pool-specific starvation tracking

[1]: https://lore.kernel.org/lkml/20260523200942.587199-1-atomlin@atomlin.com/
[2]: https://lore.kernel.org/lkml/20260517213614.350367-1-atomlin@atomlin.com/
[3]: https://lore.kernel.org/lkml/20260427020142.358912-1-atomlin@atomlin.com/
[4]: https://lore.kernel.org/lkml/20260419023036.1419514-1-atomlin@atomlin.com/
[5]: https://lore.kernel.org/lkml/20260319221956.332770-1-atomlin@atomlin.com/
[6]: https://lore.kernel.org/lkml/20260319015300.287653-1-atomlin@atomlin.com/
[7]: https://lore.kernel.org/lkml/20260317182835.258183-1-atomlin@atomlin.com/
---
 block/blk-mq-tag.c           |  6 ++++
 include/trace/events/block.h | 55 ++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..35deee5bbc73 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/kmemleak.h>
 
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
@@ -181,6 +182,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		/* Log the starvation event before altering task state */
+		trace_block_rq_tag_wait(data->q, data->hctx,
+					data->rq_flags & RQF_SCHED_TAGS,
+					data->flags);
+
 		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 6aa79e2d799c..736e176f6d17 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -226,6 +226,61 @@ DECLARE_EVENT_CLASS(block_rq,
 		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
 );
 
+/**
+ * block_rq_tag_wait - triggered when a request is starved of a tag
+ * @q: request queue of the target device
+ * @hctx: hardware context of the request experiencing starvation
+ * @is_sched_tag: indicates whether the starved pool is the software scheduler
+ * @alloc_flags: allocation flags dictating the specific tag pool
+ *
+ * Called immediately before the submitting context is forced to block due
+ * to the exhaustion of available tags (i.e., physical hardware driver
+ * tags, software scheduler tags, or reserved tags). This trace point
+ * indicates that the context will be placed into an uninterruptible state
+ * via io_schedule() until an active request completes and relinquishes its
+ * assigned tag.
+ */
+TRACE_EVENT(block_rq_tag_wait,
+
+	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+		 bool is_sched_tag, unsigned int alloc_flags),
+
+	TP_ARGS(q, hctx, is_sched_tag, alloc_flags),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( u32,		hctx_id			)
+		__field( u32,		nr_tags			)
+		__field( bool,		is_sched_tag		)
+		__field( bool,		is_reserved		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= q->disk ? disk_devt(q->disk) : 0;
+		__entry->hctx_id	= hctx->queue_num;
+		__entry->is_sched_tag	= is_sched_tag;
+		__entry->is_reserved	= alloc_flags & BLK_MQ_REQ_RESERVED;
+
+		if (__entry->is_reserved) {
+			__entry->nr_tags = is_sched_tag ?
+					   hctx->sched_tags->nr_reserved_tags :
+					   hctx->tags->nr_reserved_tags;
+		} else {
+			__entry->nr_tags = is_sched_tag ?
+					   hctx->sched_tags->nr_tags :
+					   hctx->tags->nr_tags;
+		}
+
+	),
+
+	TP_printk("%d,%d hctx=%u starved on %s%s tags (depth=%u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->hctx_id,
+		  __entry->is_sched_tag ? "scheduler" : "hardware",
+		  __entry->is_reserved ? " reserved" : "",
+		  __entry->nr_tags)
+);
+
 /**
  * block_rq_insert - insert block operation request into queue
  * @rq: block IO operation request

base-commit: 6779b50faa562e6cca1aa6a4649a4d764c6c7e28
-- 
2.51.0


^ permalink raw reply related

* Re: [RFC PATCH 2/2] tracing: Record and show boot ID in last_boot_info
From: Masami Hiramatsu @ 2026-05-24  1:44 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Theodore Ts'o, Jason A . Donenfeld, Mathieu Desnoyers,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260521111630.1f558754@gandalf.local.home>

On Thu, 21 May 2026 11:16:30 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 21 May 2026 23:57:16 +0900
> "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> 
> > @@ -4804,6 +4806,7 @@ struct trace_mod_entry {
> >  struct trace_scratch {
> >  	unsigned int		clock_id;
> >  	unsigned long		text_addr;
> > +	u8			boot_id[UUID_SIZE];
> >  	unsigned long		nr_entries;
> >  	struct trace_mod_entry	entries[];
> >  };
> 
> I just don't like wasting scratch space if boot_id isn't defined. But I
> can't figure out a way to optionally have it there without wasting space
> anyway.

Yeah, it needs to be placed in the scratch area or ring-buffer meta page.
In most cases the boot_id is enabled (the random subsystem seems to always
provide this UUID), so it will rarely be a waste of memory, except when
CONFIG_SYSCTL=n.

> 
> If the get_boot_id() is accepted by the random folks, then I'm fine with
> this change.

Yeah. BTW, Sashiko found that this can be initialized before we have enough
entropy for the random seed. Maybe we need one more delay.

Thank you,


> 
> -- Steve


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v6] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Masami Hiramatsu @ 2026-05-24 10:15 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: LKML, Linux trace kernel, Mathieu Desnoyers, Mark Rutland,
	Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
	Tom Zanussi, Andrew Morton, Thomas Gleixner, Ian Rogers,
	Jiri Olsa, sashiko-bot@kernel.org,
	sashiko-reviews@lists.linux.dev
In-Reply-To: <20260522104521.74981686@gandalf.local.home>

On Fri, 22 May 2026 10:45:21 -0400
Steven Rostedt <rostedt@kernel.org> wrote:

> On Fri, 22 May 2026 07:23:22 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > > > @@ -653,6 +686,20 @@ static int parse_btf_arg(char *varname,
> > > >  		return -EOPNOTSUPP;
> > > >  	}
> > > >  
> > > > +	if (ctx->flags & TPARG_FL_TEVENT) {
> > > > +		int ret;
> > > > +
> > > > +		ret = parse_trace_event(varname, code, ctx);
> > > > +		if (ret < 0)
> > > > +			return ret;  
> > 
> > > When parse_trace_event() returns a negative error code (such as -EINVAL or
> > > -ENOENT) because a field name is invalid, the error is propagated back up
> > > the stack. Does this path miss calling trace_probe_log_err()? 
> > > If so, users might receive a generic failure without context or a caret
> > > pointing to the specific syntax error.  
> > 
> > Hmm, there's a comment in the parse_trace_event() that sets ctx->offset for
> > backward compatibility. I'll investigate to see if we can fix that now.
> 
> Masami,
> 
> I looked at the code for parse_trace_event() that has:
> 
> 	/* backward compatibility */
> 	ctx->offset = 0;
> 	return -EINVAL;
> 
> And it was originally introduced by commit 1b8b0cd754cd ("tracing/probes:
> Move event parameter fetching code to common parser"), with:
> 
> +               ret = parse_trace_event_arg(arg, code, ctx);
> +               if (!ret)
> +                       return 0;
> +               if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
> +                       code->op = FETCH_OP_COMM;
> +                       return 0;
> +               }
> +               /* backward compatibility */
> +               ctx->offset = 0;
> +               goto inval;
> +       }
> +
> 
> 
> What was the reason for the "backward compatibility"? Can we make it a real
> error now?

This is because of a wrong error position indicator in the eprobe syntax parser.

In tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc:

check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd'      # BAD_FETCH_ARG
check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo'     # BAD_ATTACH_ARG

BAD_FETCH_ARG points at the fetcharg name correctly, but
BAD_ATTACH_ARG points at the wrong place in the test case.
I think we should fix the test case. (Previously, since it was
a cleanup, I didn't change it.)

Thank you,

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [BUG] tracing/uprobe: oversized dynamic ustring triggers WARN_ON_ONCE panic
From: Yifei Chu @ 2026-05-24 14:44 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel

[-- Attachment #1.1: Type: text/plain, Size: 1802 bytes --]

Hello,

Short version: I can make trace_uprobe hit WARN_ON_ONCE() by creating an
uprobe/uretprobe event with several dynamic ustring fetch args. With
panic_on_warn=1, this becomes a reproducible panic.

The setup is pretty direct. The reproducers mount tracefs, create a trace
event with several ustring arguments pointing at a 4095-byte userspace
string, and then trigger the event. At probe hit time, the dynamic string
sizes are accumulated and prepare_uprobe_buffer() sees a payload larger
than MAX_UCB_BUFFER_SIZE/PAGE_SIZE:

WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)

I reproduced the same class through both uprobe and uretprobe events.

Tested environment:

Linux version 7.0.9, x86_64 QEMU
gcc 12.3.0, GNU ld 2.38
Boot args included: panic_on_warn=1 nokaslr console=ttyS0

Uprobe result:

WARNING: kernel/trace/trace_uprobe.c:982 at
prepare_uprobe_buffer.part.0+0x458/0x5b0
Kernel panic - not syncing: kernel: panic_on_warn set …

Uretprobe result:

triggering uretprobe oversized ustring buffer at offset 0x1db0
WARNING: kernel/trace/trace_uprobe.c:982 at
prepare_uprobe_buffer.part.0+0x458/0x5b0
uretprobe_dispatcher+0x328/0x3e0
Kernel panic - not syncing: kernel: panic_on_warn set …

I checked current mainline source and still see the runtime WARN path in
kernel/trace/trace_uprobe.c. I have reproduced the panic on the 7.0.9 QEMU
build above; I have not yet runtime-tested current mainline.

My expectation is that oversized user-controlled dynamic trace data should
be rejected, capped, or dropped before it reaches a WARN invariant. A
tracefs user should not be able to turn a long string fetch into a kernel
warning/panic.

The attached tarball has README files, both C reproducers, and the full
QEMU logs.

Thanks,
Chuyifei

[-- Attachment #1.2: Type: text/html, Size: 1911 bytes --]

[-- Attachment #2: trace_uprobe_uretprobe_ustring_warn_panic.tar.gz --]
[-- Type: application/x-tar, Size: 26669 bytes --]

^ permalink raw reply

* [BUG] tracing/kprobe: perf dynamic ustring sample can exceed PERF_MAX_TRACE_SIZE and WARN
From: Yifei Chu @ 2026-05-24 14:44 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-kernel

[-- Attachment #1.1: Type: text/plain, Size: 1776 bytes --]

Hello,

Short version: I can make a kprobe/kretprobe trace event with dynamic
ustring fetch args ask perf_trace_buf_alloc() for more than
PERF_MAX_TRACE_SIZE. That hits WARN_ONCE(), and with panic_on_warn=1 it
becomes a reproducible kernel panic.

The reproducers create a kprobe or kretprobe trace event with several
ustring args pointing at a 4095-byte userspace string, open the event
through perf_event_open(PERF_TYPE_TRACEPOINT), and trigger it. The dynamic
payload size is then passed to perf_trace_buf_alloc():

WARN_ONCE(size > PERF_MAX_TRACE_SIZE, …)

I reproduced this through both kprobe and kretprobe events.

Tested environment:

Linux version 7.0.9, x86_64 QEMU
gcc 12.3.0, GNU ld 2.38
Boot args included: panic_on_warn=1 nokaslr console=ttyS0

Kprobe result:

perf buffer not large enough, wanted 16420, have 8192
WARNING: kernel/trace/trace_event_perf.c:405 at
perf_trace_buf_alloc+0x111/0x160
Kernel panic - not syncing: kernel: panic_on_warn set …

Kretprobe result:

perf buffer not large enough, wanted 16428, have 8192
WARNING: kernel/trace/trace_event_perf.c:405 at
perf_trace_buf_alloc+0x111/0x160
kretprobe_perf_func+0x24b/0x750
Kernel panic - not syncing: kernel: panic_on_warn set …

I checked current mainline source and still see PERF_MAX_TRACE_SIZE as 8192
and the WARN_ONCE path in perf_trace_buf_alloc(). I have reproduced the
panic on the 7.0.9 QEMU build above; I have not yet runtime-tested current
mainline.

My expectation is that a user-defined dynamic trace payload that is too
large for the perf trace buffer should be rejected, capped, or dropped
without reaching WARN_ONCE().

The attached tarball has README files, both C reproducers, and the full
QEMU logs.

Thanks,
Chuyifei

[-- Attachment #1.2: Type: text/html, Size: 1888 bytes --]

[-- Attachment #2: trace_kprobe_kretprobe_perf_ustring_warn_panic.tar.gz --]
[-- Type: application/x-tar, Size: 26750 bytes --]

^ permalink raw reply

* [PATCH] tracing: fix CFI violation in probestub helper
From: Eva Kurchatova @ 2026-05-24 15:43 UTC (permalink / raw)
  To: mhiramat, rostedt
  Cc: linux-trace-kernel, linux-kernel, mathieu.desnoyers, peterz,
	jpoimboe, samitolvanen, eva.kurchatova

When multiple callbacks are registered on the same tracepoint, probestub
will be indirectly called via traceiter helper.

The pointer to the probestub callback resides in the __tracepoints section,
which is excluded from ENDBR checks in objtool. The pointers to the
regfunc/unregfunc callbacks, however, reside in the extended structure, which
is not affected.

Registering multiple callbacks will result in a #CP exception due to the
missing ENDBR in the __probestub helper on a CFI-enabled machine.

Fix this by adding CFI_NOSEAL annotation to probestub declaration.

Fixes: d5173f753750 ("objtool: Exclude __tracepoints data from ENDBR checks")
Signed-off-by: Eva Kurchatova <eva.kurchatova@virtuozzo.com>
---
 include/linux/tracepoint.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 583d962abcc3..5a32a709759c 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -19,6 +19,7 @@
 #include <linux/rcupdate.h>
 #include <linux/tracepoint-defs.h>
 #include <linux/static_call.h>
+#include <asm/cfi.h>

 struct module;
 struct tracepoint;
@@ -356,6 +357,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	void __probestub_##_name(void *__data, proto)			\
 	{								\
 	}								\
+	CFI_NOSEAL(__probestub_##_name);				\
 	DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);

 #define DEFINE_TRACE_FN(_name, _reg, _unreg, _proto, _args)		\
-- 
2.54.0

^ permalink raw reply related

* [PATCH RESEND] rtla: Fix output files in source tree
From: Ben Hutchings @ 2026-05-24 16:24 UTC (permalink / raw)
  To: Steven Rostedt, Tomas Glozar; +Cc: linux-trace-kernel, Ian Rogers

[-- Attachment #1: Type: text/plain, Size: 4786 bytes --]

Some output files (src/timerlat.bpf.o, src/timerlat.skel.h,
example/timerlat_bpf_action.o, tests/bpf/bpf_action_map.o) are
currently generated in the source tree, preventing a fully out-of-tree
build.  To fix this:

- Add $(OUTPUT) to their filenames in the relevant Makefile rules, and
  create subdirectories as needed
- Add $(OUTPUT)src to the include path
- Add ${OUTPUT} to the BPF object filename in tests/timerlat.t

Fixes: e34293ddcebd ("rtla/timerlat: Add BPF skeleton to collect samples")
Fixes: 0304a3b7ec9a ("rtla/timerlat: Add example for BPF action program")
Fixes: 5525aebd4e0c ("rtla/tests: Test BPF action program")
Signed-off-by: Ben Hutchings <benh@debian.org>
Reviewed-by: Ian Rogers <irogers@google.com>
---
 tools/tracing/rtla/Makefile         | 31 ++++++++++++++++++-----------
 tools/tracing/rtla/tests/timerlat.t |  4 ++--
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 45690ee14544..f54da7be735d 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -66,30 +66,37 @@ ifeq ($(config),1)
   include Makefile.config
 endif
 
+INCLUDES	= -I$(OUTPUT)src
+
 CFLAGS		+= $(INCLUDES) $(LIB_INCLUDES)
 
 export CFLAGS OUTPUT srctree
 
 ifeq ($(BUILD_BPF_SKEL),1)
-src/timerlat.bpf.o: src/timerlat.bpf.c
+$(OUTPUT)src/timerlat.bpf.o: src/timerlat.bpf.c
+	mkdir -p $(@D)
 	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
 
-src/timerlat.skel.h: src/timerlat.bpf.o
+$(OUTPUT)src/timerlat.skel.h: $(OUTPUT)src/timerlat.bpf.o
+	mkdir -p $(@D)
 	$(QUIET_GENSKEL)$(SYSTEM_BPFTOOL) gen skeleton $< > $@
 
-example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
+$(OUTPUT)example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
+	mkdir -p $(@D)
 	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
 
-tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
+$(OUTPUT)tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
+	mkdir -p $(@D)
 	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
 else
-src/timerlat.skel.h:
-	$(Q)echo '/* BPF skeleton is disabled */' > src/timerlat.skel.h
+$(OUTPUT)src/timerlat.skel.h:
+	mkdir -p $(@D)
+	$(Q)echo '/* BPF skeleton is disabled */' > $@
 
-example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
+$(OUTPUT)example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
 	$(Q)echo "BPF skeleton support is disabled, skipping example/timerlat_bpf_action.o"
 
-tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
+$(OUTPUT)tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
 	$(Q)echo "BPF skeleton support is disabled, skipping tests/bpf/bpf_action_map.o"
 endif
 
@@ -103,7 +110,7 @@ static: $(RTLA_IN)
 rtla.%: fixdep FORCE
 	make -f $(srctree)/tools/build/Makefile.build dir=. $@
 
-$(RTLA_IN): fixdep FORCE src/timerlat.skel.h
+$(RTLA_IN): fixdep FORCE $(OUTPUT)src/timerlat.skel.h
 	make $(build)=rtla
 
 clean: doc_clean fixdep-clean
@@ -111,10 +118,10 @@ clean: doc_clean fixdep-clean
 	$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
 	$(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-*
 	$(Q)rm -rf feature
-	$(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o
+	$(Q)rm -f $(OUTPUT)src/timerlat.bpf.o $(OUTPUT)src/timerlat.skel.h $(OUTPUT)example/timerlat_bpf_action.o
 	$(Q)rm -f $(UNIT_TESTS)
 
-check: $(RTLA) tests/bpf/bpf_action_map.o
+check: $(RTLA) $(OUTPUT)tests/bpf/bpf_action_map.o
 	RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f -v tests/
-examples: example/timerlat_bpf_action.o
+examples: $(OUTPUT)example/timerlat_bpf_action.o
 .PHONY: FORCE clean check
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index fd4935fd7b49..e0f3fc4df655 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -74,12 +74,12 @@ then
 	# Test BPF action program properly in BPF mode
 	[ -z "$BPFTOOL" ] && BPFTOOL=bpftool
 	check "hist with BPF action program (BPF mode)" \
-		"timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o --on-threshold shell,command='$BPFTOOL map dump name rtla_test_map'" \
+		"timerlat hist -T 2 --bpf-action ${OUTPUT}tests/bpf/bpf_action_map.o --on-threshold shell,command='$BPFTOOL map dump name rtla_test_map'" \
 		2 '"value": 42'
 else
 	# Test BPF action program failure in non-BPF mode
 	check "hist with BPF action program (non-BPF mode)" \
-		"timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o" \
+		"timerlat hist -T 2 --bpf-action ${OUTPUT}tests/bpf/bpf_action_map.o" \
 		1 "BPF actions are not supported in tracefs-only mode"
 fi
 done

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* Re: [PATCHv3 03/12] uprobes/x86: Allow to copy uprobe trampolines on fork
From: Jiri Olsa @ 2026-05-24 21:54 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko, bpf, linux-trace-kernel
In-Reply-To: <CAEf4BzYo-8PAXFJt9MHoUn9ux1O2YVxJADC0tGSsacVu_R8Stw@mail.gmail.com>

On Fri, May 22, 2026 at 11:50:54AM -0700, Andrii Nakryiko wrote:
> On Thu, May 21, 2026 at 5:44 AM Jiri Olsa <jolsa@kernel.org> wrote:
> >
> > When we do fork or clone without CLONE_VM the new process won't
> > have uprobe trampoline vma objects and at the same time it will
> > have optimized code calling that trampoline and crash.
> >
> > Fixing this by allowing vma uprobe trampoline objects to be copied
> > on fork to the new process.
> >
> > Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
> > Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> > ---
> >  arch/x86/kernel/uprobes.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> > index 6824376e253d..11ec6b89b135 100644
> > --- a/arch/x86/kernel/uprobes.c
> > +++ b/arch/x86/kernel/uprobes.c
> > @@ -701,7 +701,7 @@ static struct vm_area_struct *get_uprobe_trampoline(unsigned long vaddr)
> >                 return ERR_PTR(vaddr);
> >
> >         return _install_special_mapping(current->mm, vaddr, PAGE_SIZE,
> > -                               VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
> > +                               VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_IO,
> 
> so on fork we'll get sys_uprobe invocations which will go into uprobe
> trampoline and syscall will just keep returning -EPROTO, is that
> right?

so the child gets the inherited optimized call path and now also the
trampoline, which executes the uprobe syscall, so if there's a related
uprobe it will be processed

-EPROTO is returned for uprobe syscall executed out of uprobe trampoline


> 
> what would happen in the similar situation for process with int3
> uprobe being forked/cloned? Will it inherit int3 as well, and then
> will keep hitting interrupts that would just do nothing?
> 
> is there a way to restore original memory page for clones? this
> behavior (unless I'm misunderstanding) seems suboptimal
> performance-wise

the standard uprobes seem to handle this situation by removing the breakpoint
from current->mm, if the uprobe->handler returns UPROBE_HANDLER_REMOVE (for all
uprobe consumers) .. and the uprobe->handler does current->mm filter first in
uprobe_perf_func and returns UPROBE_HANDLER_REMOVE in case it does not match

this works also for optimized uprobes on top of standard uprobes

but we don't do that for uprobe_multi and I think we could do something similar,
the code is there, perhaps we just need to add the filter

jirka

^ permalink raw reply

* Re: [PATCHv3 02/12] uprobes/x86: Remove struct uprobe_trampoline object
From: Jiri Olsa @ 2026-05-24 22:13 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: oleg, peterz, mingo, mhiramat, andrii, bpf, linux-trace-kernel,
	ast, daniel, martin.lau, eddyz87, yonghong.song, clm,
	ihor.solodrai
In-Reply-To: <8861cc7af8e0dfff2674ca4ae6381ea2d82c242dec115774e8e11048bde358c4@mail.kernel.org>

On Thu, May 21, 2026 at 01:26:09PM +0000, bot+bpf-ci@kernel.org wrote:
> > commit 35d93b034b144060a547b507cfb4268692c328c2
> > Author: Jiri Olsa <jolsa@kernel.org>
> > Date:   Thu May 21 14:44:01 2026 +0200
> >
> >     uprobes/x86: Remove struct uprobe_trampoline object
> >
> >     Removing struct uprobe_trampoline object and its tracking code,
> >     because it's not needed. We can do same thing directly on top of
> >     struct vm_area_struct objects.
> >
> >     This makes the code simpler and allows easy propagation of the
> >     trampoline vma object into child process in following change.
> >
> >     Note the original code called destroy_uprobe_trampoline if the
> >     optimization failed, but it only freed the struct uprobe_trampoline
> >     object, not the vma.
> >
> >     Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> 
> This looks like a bug fix for a VMA resource leak. The commit message
> notes that when uprobe optimization failed, the original code only freed
> the struct uprobe_trampoline object but leaked the VMA created by
> _install_special_mapping. Should this include:
> 
>   Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")

no, the logic stays, we just removed one layer.. but we need separate
fix for the possible vma leak, will include it in the next version

jirka

^ permalink raw reply

* Re: [PATCH v8] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-05-24 22:39 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	john.g.garry, loberman, neelx, sean, mproche, chjohnst,
	linux-block, linux-kernel, linux-trace-kernel
In-Reply-To: <20260524014204.622699-1-atomlin@atomlin.com>

On Sat, May 23, 2026 at 09:42:04PM -0400, Aaron Tomlin wrote:
> diff --git a/include/trace/events/block.h b/include/trace/events/block.h
> index 6aa79e2d799c..736e176f6d17 100644
> --- a/include/trace/events/block.h
> +++ b/include/trace/events/block.h
> @@ -226,6 +226,61 @@ DECLARE_EVENT_CLASS(block_rq,
>  		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
>  );
>  
> +/**
> + * block_rq_tag_wait - triggered when a request is starved of a tag
> + * @q: request queue of the target device
> + * @hctx: hardware context of the request experiencing starvation
> + * @is_sched_tag: indicates whether the starved pool is the software scheduler
> + * @alloc_flags: allocation flags dictating the specific tag pool
> + *
> + * Called immediately before the submitting context is forced to block due
> + * to the exhaustion of available tags (i.e., physical hardware driver
> + * tags, software scheduler tags, or reserved tags). This trace point
> + * indicates that the context will be placed into an uninterruptible state
> + * via io_schedule() until an active request completes and relinquishes its
> + * assigned tag.
> + */
> +TRACE_EVENT(block_rq_tag_wait,
> +
> +	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
> +		 bool is_sched_tag, unsigned int alloc_flags),
> +
> +	TP_ARGS(q, hctx, is_sched_tag, alloc_flags),
> +
> +	TP_STRUCT__entry(
> +		__field( dev_t,		dev			)
> +		__field( u32,		hctx_id			)
> +		__field( u32,		nr_tags			)
> +		__field( bool,		is_sched_tag		)
> +		__field( bool,		is_reserved		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= q->disk ? disk_devt(q->disk) : 0;
> +		__entry->hctx_id	= hctx->queue_num;
> +		__entry->is_sched_tag	= is_sched_tag;
> +		__entry->is_reserved	= alloc_flags & BLK_MQ_REQ_RESERVED;
> +
> +		if (__entry->is_reserved) {
> +			__entry->nr_tags = is_sched_tag ?
> +					   hctx->sched_tags->nr_reserved_tags :
> +					   hctx->tags->nr_reserved_tags;
> +		} else {
> +			__entry->nr_tags = is_sched_tag ?
> +					   hctx->sched_tags->nr_tags :
> +					   hctx->tags->nr_tags;
> +		}
> +
> +	),
> +
> +	TP_printk("%d,%d hctx=%u starved on %s%s tags (depth=%u)",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->hctx_id,
> +		  __entry->is_sched_tag ? "scheduler" : "hardware",
> +		  __entry->is_reserved ? " reserved" : "",
> +		  __entry->nr_tags)
> +);

This is wrong.

If __entry->is_reserved is false, the current logic incorrectly reports the
total capacity pool depth (i.e., both reserved and standard tags combined).

I have refactored the TP_fast_assign block to evaluate the reserved status
orthogonally, ensuring nr_reserved_tags is correctly reported for I/O
schedulers. Additionally, the unreserved pool calculation has been fixed to
accurately subtract nr_reserved_tags from nr_tags.

I will include these corrections in the next iteration. Given the extent of
the functional changes to the tracepoint assignment logic, I will drop the
existing "Reviewed-by:" tags.

-- 
Aaron Tomlin

^ permalink raw reply

* [PATCH v9] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-05-25  0:51 UTC (permalink / raw)
  To: axboe, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	john.g.garry, loberman, neelx, sean, mproche, chjohnst,
	linux-block, linux-kernel, linux-trace-kernel

In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices (SSDs) are starved of hardware
tags when sharing the same blk_mq_tag_set.

Currently, diagnosing this specific hardware queue contention is
difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
forces the current thread to block uninterruptible via io_schedule().
While this can be inferred via sched:sched_switch or dynamically
traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
dedicated, out-of-the-box observability for this event.

This patch introduces the block_rq_tag_wait tracepoint in the tag
allocation slow-path. It triggers immediately before the task state
is altered to TASK_UNINTERRUPTIBLE (ensuring safety for PREEMPT_RT
locks). It exposes the exact hardware context (hctx) that is starved,
the specific pool experiencing starvation (driver, software scheduler,
or reserved), and the exact pool depth.

This provides storage engineers with a zero-configuration, low-overhead
mechanism to definitively identify shared-tag bottlenecks. For example,
userspace can trivially replicate tag starvation counters using bpftrace:

    # bpftrace -e 'tracepoint:block:block_rq_tag_wait { @tag_waits[cpu] = count(); }'
    Attaching 1 probe...
    ^C
    @tag_waits[4]: 12
    @tag_waits[12]: 87

Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
Hi Johannes, Damien, Chaitanya, Laurence,

I have dropped the earlier "Reviewed-by:" and "Tested-by:" tags because
of functional logic changes in the tracepoint assignment block. A fresh
review would be highly appreciated. Thank you.

Changes since v8 [1]:
 - Fixed the standard pool depth calculation in TP_fast_assign to
   accurately report the unreserved capacity by mathematically
   subtracting nr_reserved_tags from nr_tags

 - Removed "Reviewed-by:" and "Tested-by:" tags due to the functional
   logic updates in the tracepoint assignment block

Changes since v7 [2]:
 - Added an is_reserved boolean to the trace record to explicitly expose
   reserved pool starvation to userspace

 - Fixed TP_fast_assign to report the correct nr_reserved_tags depth
   when I/O schedulers utilise the reserved pool

Changes since v6 [3]:
 - Dropped Patch 2. Observability is now driven entirely by the tracepoint,
   with the commit message updated to demonstrate how userspace (e.g.,
   bpftrace) can safely replicate counting out-of-band (Jens Axboe)

 - Moved tracepoint call above sbitmap_prepare_to_wait(). This prevents
   inadvertently resetting the task state under PREEMPT_RT locks

 - Updated the tracepoint signature and TP_fast_assign block to evaluate
   the allocation flags. If the submitting context is starved of a reserved
   tag (BLK_MQ_REQ_RESERVED), the tracepoint now accurately reports the
   severely constrained nr_reserved_tags depth instead of the total nr_tags
   depth.

Changes since v5 [4]:
 - Replaced this_cpu_inc() with raw_cpu_inc() within
   blk_mq_debugfs_inc_wait_tags(). This resolves a preemption warning
   triggered under CONFIG_DEBUG_PREEMPT=y, as the routine is invoked from a
   preemptible context immediately prior to io_schedule(). This adjustment
   deliberately prioritises the reduction of execution overhead over
   absolute statistical precision for this diagnostic interface.

Changes since v4 [5]:
 - Prevented a NULL pointer dereference in the tracepoint fast-assign for
   disk-less request queues by safely checking q->disk before resolving the
   dev_t

 - Fixed a Use-After-Free (UAF) and permanent memory leak by decoupling
   the per-CPU counter allocation from the volatile debugfs lifecycle and
   tying it directly to the core hctx lifecycle (i.e., blk_mq_init_hctx()
   and blk_mq_exit_hctx())

 - Fixed a potential compiler double-fetch bug by wrapping the per-CPU
   pointer evaluations with READ_ONCE() in blk_mq_debugfs_inc_wait_tags()

 - Passed the appropriate gfp_t flags down to the allocation routines to
   maintain the strict GFP_NOIO context

 - Updated kernel-doc descriptions to clarify that the NULL pointer
   checks guard against memory allocation failures under pressure, rather
   than initialisation race conditions

Changes since v3 [6]:
 - Transitioned tracking architecture from shared atomic_t variables to
   dynamically allocated per-CPU counters to resolve cache line bouncing
   (Bart Van Assche)

Changes since v2 [7]:
 - Added "Reviewed-by:" and "Tested-by:" tags for patch 1

 - Evaluate is_sched_tag directly within TP_fast_assign (Steven Rostedt)

 - Introduced atomic counters via debugfs

Changes since v1 [8]:
 - Improved the description of the trace point (Damien Le Moal)

 - Removed the redundant "active requests" (Laurence Oberman)

 - Introduced pool-specific starvation tracking

[1]: https://lore.kernel.org/lkml/20260524014204.622699-1-atomlin@atomlin.com/
[2]: https://lore.kernel.org/lkml/20260523200942.587199-1-atomlin@atomlin.com/
[3]: https://lore.kernel.org/lkml/20260517213614.350367-1-atomlin@atomlin.com/
[4]: https://lore.kernel.org/lkml/20260427020142.358912-1-atomlin@atomlin.com/
[5]: https://lore.kernel.org/lkml/20260419023036.1419514-1-atomlin@atomlin.com/
[6]: https://lore.kernel.org/lkml/20260319221956.332770-1-atomlin@atomlin.com/
[7]: https://lore.kernel.org/lkml/20260319015300.287653-1-atomlin@atomlin.com/
[8]: https://lore.kernel.org/lkml/20260317182835.258183-1-atomlin@atomlin.com/
---
 block/blk-mq-tag.c           |  6 ++++
 include/trace/events/block.h | 59 ++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..35deee5bbc73 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
 #include <linux/kmemleak.h>
 
 #include <linux/delay.h>
+#include <trace/events/block.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
@@ -181,6 +182,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != BLK_MQ_NO_TAG)
 			break;
 
+		/* Log the starvation event before altering task state */
+		trace_block_rq_tag_wait(data->q, data->hctx,
+					data->rq_flags & RQF_SCHED_TAGS,
+					data->flags);
+
 		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 6aa79e2d799c..9c97a16850b9 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -226,6 +226,65 @@ DECLARE_EVENT_CLASS(block_rq,
 		  IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm)
 );
 
+/**
+ * block_rq_tag_wait - triggered when a request is starved of a tag
+ * @q: request queue of the target device
+ * @hctx: hardware context of the request experiencing starvation
+ * @is_sched_tag: indicates whether the starved pool is the software scheduler
+ * @alloc_flags: allocation flags dictating the specific tag pool
+ *
+ * Called immediately before the submitting context is forced to block due
+ * to the exhaustion of available tags (i.e., physical hardware driver
+ * tags, software scheduler tags, or reserved tags). This trace point
+ * indicates that the context will be placed into an uninterruptible state
+ * via sbitmap_prepare_to_wait(). If a tag is not acquired in the final
+ * lockless retry, the context will yield the CPU via io_schedule() until
+ * an active request completes and relinquishes its assigned tag.
+ */
+TRACE_EVENT(block_rq_tag_wait,
+
+	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+		 bool is_sched_tag, unsigned int alloc_flags),
+
+	TP_ARGS(q, hctx, is_sched_tag, alloc_flags),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( u32,		hctx_id			)
+		__field( u32,		nr_tags			)
+		__field( bool,		is_sched_tag		)
+		__field( bool,		is_reserved		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= q->disk ? disk_devt(q->disk) : 0;
+		__entry->hctx_id	= hctx->queue_num;
+		__entry->is_sched_tag	= is_sched_tag;
+		__entry->is_reserved	= alloc_flags & BLK_MQ_REQ_RESERVED;
+
+		if (__entry->is_reserved) {
+			__entry->nr_tags = is_sched_tag ?
+					   hctx->sched_tags->nr_reserved_tags :
+					   hctx->tags->nr_reserved_tags;
+		} else {
+			if (is_sched_tag)
+				__entry->nr_tags = hctx->sched_tags->nr_tags -
+						   hctx->sched_tags->nr_reserved_tags;
+			else
+				__entry->nr_tags = hctx->tags->nr_tags -
+						   hctx->tags->nr_reserved_tags;
+		}
+
+	),
+
+	TP_printk("%d,%d hctx=%u starved on %s%s tags (depth=%u)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->hctx_id,
+		  __entry->is_sched_tag ? "scheduler" : "hardware",
+		  __entry->is_reserved ? " reserved" : "",
+		  __entry->nr_tags)
+);
+
 /**
  * block_rq_insert - insert block operation request into queue
  * @rq: block IO operation request

base-commit: 6779b50faa562e6cca1aa6a4649a4d764c6c7e28
-- 
2.51.0


^ permalink raw reply related

* Re: [BUG] tracing/uprobe: oversized dynamic ustring triggers WARN_ON_ONCE panic
From: Masami Hiramatsu @ 2026-05-25  0:56 UTC (permalink / raw)
  To: Yifei Chu
  Cc: linux-trace-kernel, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel
In-Reply-To: <CAPJnbgJ3ayTqZvau6x4c5Y0=xsL_hUeDm1mVTKCYAhfGZCL6bg@mail.gmail.com>

On Sun, 24 May 2026 10:44:09 -0400
Yifei Chu <yifeichu24@gmail.com> wrote:

> Hello,
> 
> Short version: I can make trace_uprobe hit WARN_ON_ONCE() by creating an
> uprobe/uretprobe event with several dynamic ustring fetch args. With
> panic_on_warn=1, this becomes a reproducible panic.
> 
> The setup is pretty direct. The reproducers mount tracefs, create a trace
> event with several ustring arguments pointing at a 4095-byte userspace
> string, and then trigger the event. At probe hit time, the dynamic string
> sizes are accumulated and prepare_uprobe_buffer() sees a payload larger
> than MAX_UCB_BUFFER_SIZE/PAGE_SIZE:
> 
> WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)
> 
> I reproduced the same class through both uprobe and uretprobe events.

This should be fixed by [1]

[1] https://lore.kernel.org/all/20260428122302.706610ba@gandalf.local.home/

Thanks,

> 
> Tested environment:
> 
> Linux version 7.0.9, x86_64 QEMU
> gcc 12.3.0, GNU ld 2.38
> Boot args included: panic_on_warn=1 nokaslr console=ttyS0
> 
> Uprobe result:
> 
> WARNING: kernel/trace/trace_uprobe.c:982 at
> prepare_uprobe_buffer.part.0+0x458/0x5b0
> Kernel panic - not syncing: kernel: panic_on_warn set …
> 
> Uretprobe result:
> 
> triggering uretprobe oversized ustring buffer at offset 0x1db0
> WARNING: kernel/trace/trace_uprobe.c:982 at
> prepare_uprobe_buffer.part.0+0x458/0x5b0
> uretprobe_dispatcher+0x328/0x3e0
> Kernel panic - not syncing: kernel: panic_on_warn set …
> 
> I checked current mainline source and still see the runtime WARN path in
> kernel/trace/trace_uprobe.c. I have reproduced the panic on the 7.0.9 QEMU
> build above; I have not yet runtime-tested current mainline.
> 
> My expectation is that oversized user-controlled dynamic trace data should
> be rejected, capped, or dropped before it reaches a WARN invariant. A
> tracefs user should not be able to turn a long string fetch into a kernel
> warning/panic.
> 
> The attached tarball has README files, both C reproducers, and the full
> QEMU logs.
> 
> Thanks,
> Chuyifei


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [BUG] tracing/kprobe: perf dynamic ustring sample can exceed PERF_MAX_TRACE_SIZE and WARN
From: Masami Hiramatsu @ 2026-05-25  0:58 UTC (permalink / raw)
  To: Yifei Chu
  Cc: linux-trace-kernel, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel
In-Reply-To: <CAPJnbgKc7swb2MdOuRHcLUWNX5iApK=-RVN5r6kE74XF=nPgPg@mail.gmail.com>

Hi

On Sun, 24 May 2026 10:44:20 -0400
Yifei Chu <yifeichu24@gmail.com> wrote:

> Hello,
> 
> Short version: I can make a kprobe/kretprobe trace event with dynamic
> ustring fetch args ask perf_trace_buf_alloc() for more than
> PERF_MAX_TRACE_SIZE. That hits WARN_ONCE(), and with panic_on_warn=1 it
> becomes a reproducible kernel panic.
> 
> The reproducers create a kprobe or kretprobe trace event with several
> ustring args pointing at a 4095-byte userspace string, open the event
> through perf_event_open(PERF_TYPE_TRACEPOINT), and trigger it. The dynamic
> payload size is then passed to perf_trace_buf_alloc():
> 
> WARN_ONCE(size > PERF_MAX_TRACE_SIZE, …)
> 
> I reproduced this through both kprobe and kretprobe events.

This also should be fixed by [1]

[1] https://lore.kernel.org/all/20260428122302.706610ba@gandalf.local.home/

But thank you for reporting.

Thanks,

> 
> Tested environment:
> 
> Linux version 7.0.9, x86_64 QEMU
> gcc 12.3.0, GNU ld 2.38
> Boot args included: panic_on_warn=1 nokaslr console=ttyS0
> 
> Kprobe result:
> 
> perf buffer not large enough, wanted 16420, have 8192
> WARNING: kernel/trace/trace_event_perf.c:405 at
> perf_trace_buf_alloc+0x111/0x160
> Kernel panic - not syncing: kernel: panic_on_warn set …
> 
> Kretprobe result:
> 
> perf buffer not large enough, wanted 16428, have 8192
> WARNING: kernel/trace/trace_event_perf.c:405 at
> perf_trace_buf_alloc+0x111/0x160
> kretprobe_perf_func+0x24b/0x750
> Kernel panic - not syncing: kernel: panic_on_warn set …
> 
> I checked current mainline source and still see PERF_MAX_TRACE_SIZE as 8192
> and the WARN_ONCE path in perf_trace_buf_alloc(). I have reproduced the
> panic on the 7.0.9 QEMU build above; I have not yet runtime-tested current
> mainline.
> 
> My expectation is that a user-defined dynamic trace payload that is too
> large for the perf trace buffer should be rejected, capped, or dropped
> without reaching WARN_ONCE().
> 
> The attached tarball has README files, both C reproducers, and the full
> QEMU logs.
> 
> Thanks,
> Chuyifei


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Gregory Price @ 2026-05-25  1:50 UTC (permalink / raw)
  To: Balbir Singh
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm, david,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <ag6XyvxR-NU5rGn-@parvat>

On Thu, May 21, 2026 at 04:23:28PM +1000, Balbir Singh wrote:
> On Sun, Feb 22, 2026 at 03:48:15AM -0500, Gregory Price wrote:
> > Topic type: MM
> > 
> > Presenter: Gregory Price <gourry@gourry.net>
> > 
> > This series introduces N_MEMORY_PRIVATE, a NUMA node state for memory
> > managed by the buddy allocator but excluded from normal allocations.
> > 
> > I present it with an end-to-end Compressed RAM service (mm/cram.c)
> > that would otherwise not be possible (or would be considerably more
> > difficult, be device-specific, and add to the ZONE_DEVICE boondoggle).
> > 
> 
> Do we have updates/notes from the meeting?
> 

I have been on leave since LSF, but I do have some notes posted:

https://lore.kernel.org/linux-mm/af9i7dkNvGGxPHzu@gourry-fedora-PF4VCD3F/
https://lore.kernel.org/linux-mm/agYJcRgOHho8upVv@gourry-fedora-PF4VCD3F/

I will be trying to post an updated set stripped down without the GFP
flag as a first pass w/o RFC tags and no UAPI implications so that
device folks can play with this upstream.

I'm debating on whether to include OPS_MEMPOLICY in the initial version
if only because it's not intuitive how it interacts with pagecache. That
needs more time to bake.

> > 
> > page = alloc_pages_node(nid, __GFP_PRIVATE, 0);
> 
> Do we want to provide kernel-level control over allocation of private
> pages? I assumed that was only for user space applications. I would assume
> node affinity would be the way to do so, unless we have multiple
> 

alloc_pages_node() is the kernel interface

> > 
> > /* Ok but I want to do something useful with it */
> > static const struct node_private_ops ops = {
> >         .migrate_to     = my_migrate_to,
> >         .folio_migrate  = my_folio_migrate,
> >         .flags = NP_OPS_MIGRATION | NP_OPS_MEMPOLICY,
> > };
> > node_private_set_ops(nid, &ops);
> >
> 
> Could you explain this further? Why do OPS_MIGRATION
> and OPS_MEMPOLICY need to be set explicitly?
>

Both of these have been removed from the upcoming version, but in this
RFC version I was testing OPS_MIGRATION as an explicit flag that meant
"migrate.c can touch the folios" while OPS_MEMPOLICY meant "mempolicy.c
can touch the folios".

As it turns out, OPS_MIGRATION is not a useful filter, as it doesn't
actually filter anything (anything using OPS_MIGRATION would also need
its own filter flag, so better to just drop it and do per-server
opt-ins).

~Gregory

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Gregory Price @ 2026-05-25  2:03 UTC (permalink / raw)
  To: Arun George/Arun George
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, dave.jiang, alison.schofield, vishal.l.verma, ira.weiny,
	longman, akpm, david, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
	surenb, mhocko, osalvador, ziy, matthew.brost, joshua.hahnjy,
	rakie.kim, byungchul, ying.huang, apopple, axelrasmussen, yuanchu,
	weixugc, yury.norov, linux, mhiramat, mathieu.desnoyers, tj,
	hannes, mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman, gost.dev, arungeorge05, cpgs
In-Reply-To: <1891546521.01779449881859.JavaMail.epsvc@epcpadp2new>

On Fri, May 22, 2026 at 02:10:34PM +0530, Arun George/Arun George wrote:
> Thanks.
> 
> On 05-05-2026 01:15 pm, Gregory Price wrote:
> > In the scenario i'm talking about, a "write budget" is defined as a
> > number of pages that are allowed to be mapped writable in the page
> > tables at any given time.
> > Agree. I was also in the same context.
> 
> I am trying to bring the device perspective here, and would like to 
> discuss a few corner cases and possible solutions.
> 
> As I see, solving the compressed memory problem statement has these 
> aspects mainly:
> 
> 1) Allocation control: private/managed memory concept.
> 2) Write control: write-protected PTEs, write-controlled use cases like 
> ZSWAP
> 3) Proactive reclaims: optional methods to ease back-pressure using 
> memory shrinkers, ballooning, kswapd, promotion etc. These methods will 
> be triggered based on notifications/interrupts from the device.
> 
> Maybe they are not enough to cover some corner cases for cram!
> 
>   I believe that this thin-provisioned memory infra is susceptible to 

I'm not understanding the "thin provisioned" terminology you're using
here.  Can you help define what you mean by thin-provision in this case?

> 'writes-above-media-capacity corner cases' (because of not handling 
> device back-pressure notifications in time) whichever methods we use in 
> the kernel. Even if we use write-controlled methods like ZSWAP and 
> pro-active reclaims, there could be corner cases where the communication 
> with the device could be broken and the write path is not aware of it 
> immediately. Note that OCP spec [1] says the device should mark the 
> memory location as 'poisoned' in 'over-capacity' writes.
> 

The intent is to use the low-watermark to prevent new allocations from
occurring, and the write-controls prevent writing to the device without
interposition.

With a sufficient watermark such that the interrupt is delivered within
some number of microseconds, that should be perfectly fine to prevent
poison from ever occurring at all.

Since poison is only delivered *on read*, the system can go a long,
long time before poison is discovered. From the end-user perspective,
this poison is basically unacceptable.

So either we can always prevent poison from occurring, or the hardware
is not viable to support in a scaled production.  

If you think a sufficiently conservative watermark + write-protection is
insufficient to defend against poison, then please let me know why.

> So I have the following proposals / options for this scenario.
> 
>     Option 1: Poisoned data management - This is about accepting that 
> poisoning of memory locations can happen in much more regular frequency 
> here than regular memories and we need to figure out potential recovery 
> mechanisms in host (not recovery of data; but recovery from the poison 
> situation). But I guess folks will not be okay with it in general, and I 
> am not aware of any workloads where data poisoning is tolerated (maybe 
> caching workloads?).
> 

Given option 1, I would never put such a device into my production
environment.  The only reasonable action for handling poison is killing
the software, as the data is functionally corrupted.

>     Option 2 (preferred): Device assisted write budgeting - This is 
> about a device aware / assisted mechanism for the write-controlled 
> use-cases (Ex: ZSWAP) to know the 'safe number of  writes' that can be 
> performed to the device (or allowed to be mapped writable in the page 
> tables). This could be like a 'token bucket' algorithm, where the device 
> provides a 'budget / set of tokens' to the host. And it needs to be 
> replenished periodically in the device communication code path; and if 
> the host does not find the token, writes cannot go ahead.
> 

When I say budgeting, I mean literally a budget of writable pages,
entirely controlled by software (mm/cram.c or zswap.c or whatever).

This has nothing to do with device operation / throttling / bandwidth
budgets etc.  It is simply a proposal of an optimization that allows the
user to say:  X out of Y possible pages may be mapped writable.

I don't think this would be part of an initial MVP for a compressed ram
service (regardless of whether it's cram.c or zswap.c)

> In short, the communication with the device has to be maintained to make 
> pages mapped writable. For MVP, this could be a simple constraint of 
> checking actual device capacity periodically to replenish write-budget 
> for CRAM. For other users of private nodes (GPU memory?), this 
> constraint may not be needed at all.
> 
> We are planning to send an RFC code which will fit into your CRAM infra 
> to discuss this poison management approach further.
> 

I'll try to get a new version out this or next week, apologies for the
lag on this series, I've had a number of disruptions and major movements
on the patch set since I last updated it in February.

~Gregory

^ permalink raw reply

* [PATCH] tracing/probes: Point the error offset correctly for eprobe argument error
From: Masami Hiramatsu (Google) @ 2026-05-25  2:21 UTC (permalink / raw)
  To: Steven Rostedt, Shuah Khan
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, linux-kselftest

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Fix to point the error offset correctly for eprobe argument error.
In the cleanup commit 1b8b0cd754cd ("tracing/probes: Move event parameter
fetching code to common parser"), due to incorrect backward compatibility
aimed at conforming to the test specifications, the error location was set
to 0 when a non-existent formal parameter was specified for Eprobe.
However, this should be corrected in both the test and the implementation
to point to the correct error position.

Fixes: 1b8b0cd754cd ("tracing/probes: Move event parameter fetching code to common parser")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_probe.c                         |    2 --
 .../test.d/dynevent/eprobes_syntax_errors.tc       |    2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 39f040c863e8..695310571b08 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -957,8 +957,6 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
 			code->op = FETCH_OP_COMM;
 			return 0;
 		}
-		/* backward compatibility */
-		ctx->offset = 0;
 		goto inval;
 	}
 
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
index 4f5e8c665156..2a680c086047 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
@@ -20,7 +20,7 @@ check_error 'e:foo/^123456789012345678901234567890123456789012345678901234567890
 check_error 'e:foo/^bar.1 syscalls/sys_enter_openat'	# BAD_EVENT_NAME
 
 check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd'	# BAD_FETCH_ARG
-check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo'	# BAD_ATTACH_ARG
+check_error 'e:foo/bar syscalls/sys_enter_openat arg=^$foo'	# BAD_ATTACH_ARG
 
 if grep -q '<attached-group>\.<attached-event>.*\[if <filter>\]' README; then
   check_error 'e:foo/bar syscalls/sys_enter_openat if ^'	# NO_EP_FILTER


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox