Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH 7/7] i2c: nomadik: add support for I2C_XFER_V2 - detailed fault reporting
From: Dmitry Guzman @ 2026-06-23 16:31 UTC (permalink / raw)
  To: Andi Shyti, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Linus Walleij
  Cc: linux-i2c, linux-kernel, linux-trace-kernel, linux-arm-kernel,
	Dmitry Guzman
In-Reply-To: <20260623-i2c-fault-reporting-v1-0-6db1a8aabf18@mobileye.com>

I2C_XFER_V2 is a new API that allows I2C clients to get a detailed
report in case of transmission failure. Previously, the only information
returned by the I2C bus controller was the error code; there was no way to
find out how many messages, or how many bytes of a given message, had been
sent or received before the fault condition occurred.

This commit adds support for this feature to the i2c-nomadik driver.

Signed-off-by: Dmitry Guzman <Dmitry.Guzman@mobileye.com>
---
 drivers/i2c/busses/i2c-nomadik.c | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c
index 9cff0c2757fafeaf809395e02a5e754570f65e08..1cf03d634fdc856dc335a58597e0fd31ab077078 100644
--- a/drivers/i2c/busses/i2c-nomadik.c
+++ b/drivers/i2c/busses/i2c-nomadik.c
@@ -197,6 +197,7 @@ struct i2c_nmk_client {
  * @stop: stop condition.
  * @xfer_wq: xfer done wait queue.
  * @result: controller propogated result.
+ * @bytes_cplt: number of bytes completed in the message that caused a fault.
  */
 struct nmk_i2c_dev {
 	struct i2c_vendor_data		*vendor;
@@ -216,6 +217,7 @@ struct nmk_i2c_dev {
 	int				stop;
 	struct wait_queue_head		xfer_wq;
 	int				result;
+	int				bytes_cplt;
 };
 
 /* controller's abort causes */
@@ -529,6 +531,8 @@ static int read_i2c(struct nmk_i2c_dev *priv, u16 flags)
 	int status = 0;
 	bool xfer_done;
 
+	priv->cli.xfer_bytes = 0;
+
 	mcr = load_i2c_mcr_reg(priv, flags);
 	writel(mcr, priv->virtbase + I2C_MCR);
 
@@ -653,6 +657,7 @@ static int nmk_i2c_xfer_one(struct nmk_i2c_dev *priv, u16 flags)
 {
 	int status;
 
+	priv->bytes_cplt = 0;
 	if (flags & I2C_M_RD) {
 		/* read operation */
 		priv->cli.operation = I2C_READ;
@@ -678,6 +683,16 @@ static int nmk_i2c_xfer_one(struct nmk_i2c_dev *priv, u16 flags)
 			status = priv->result;
 		}
 
+		if (flags & I2C_M_RD) {
+			/* For READ messages, return the number of bytes read from FIFO */
+			priv->bytes_cplt = priv->cli.xfer_bytes;
+		} else {
+			/* For WRITE messages, return the number of bytes sent on bus */
+			priv->bytes_cplt = FIELD_GET(I2C_SR_LENGTH, i2c_sr);
+			/* LENGTH value includes the last byte that has not been sent or ACKed */
+			if (priv->bytes_cplt > 0)
+				priv->bytes_cplt--;
+		}
 		init_hw(priv);
 
 		status = status ? status : priv->result;
@@ -687,10 +702,11 @@ static int nmk_i2c_xfer_one(struct nmk_i2c_dev *priv, u16 flags)
 }
 
 /**
- * nmk_i2c_xfer() - I2C transfer function used by kernel framework
+ * nmk_i2c_xfer_v2() - I2C transfer function used by kernel framework
  * @i2c_adap: Adapter pointer to the controller
  * @msgs: Pointer to data to be written.
  * @num_msgs: Number of messages to be executed
+ * @report: Pointer to transfer report to be written.
  *
  * This is the function called by the generic kernel i2c_transfer()
  * or i2c_smbus...() API calls. Note that this code is protected by the
@@ -733,14 +749,16 @@ static int nmk_i2c_xfer_one(struct nmk_i2c_dev *priv, u16 flags)
  * please use the i2c_smbus_read_i2c_block_data()
  * or i2c_smbus_write_i2c_block_data() API
  */
-static int nmk_i2c_xfer(struct i2c_adapter *i2c_adap,
-		struct i2c_msg msgs[], int num_msgs)
+static int nmk_i2c_xfer_v2(struct i2c_adapter *i2c_adap,
+		struct i2c_msg msgs[], int num_msgs,
+		struct i2c_transfer_report *report)
 {
 	int status = 0;
 	int i;
 	struct nmk_i2c_dev *priv = i2c_get_adapdata(i2c_adap);
 
 	pm_runtime_get_sync(&priv->adev->dev);
+	priv->bytes_cplt = 0;
 
 	/* setup the i2c controller */
 	setup_i2c_controller(priv);
@@ -760,10 +778,17 @@ static int nmk_i2c_xfer(struct i2c_adapter *i2c_adap,
 	pm_runtime_put_sync(&priv->adev->dev);
 
 	/* return the no. messages processed */
-	if (status)
+	if (status) {
+		report->msgs_cplt = i;
+		report->bytes_cplt = priv->bytes_cplt;
+		report->fault_msg_idx = i;
 		return status;
-	else
+	} else {
+		report->msgs_cplt = num_msgs;
+		report->bytes_cplt = 0;
+		report->fault_msg_idx = num_msgs;
 		return num_msgs;
+	}
 }
 
 /**
@@ -1014,7 +1039,7 @@ static unsigned int nmk_i2c_functionality(struct i2c_adapter *adap)
 }
 
 static const struct i2c_algorithm nmk_i2c_algo = {
-	.xfer = nmk_i2c_xfer,
+	.xfer_v2 = nmk_i2c_xfer_v2,
 	.functionality = nmk_i2c_functionality
 };
 

-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v7 03/10] tracing/probes: Support dumping fetcharg program for debugging dynamic events
From: Julian Braha @ 2026-06-23 18:31 UTC (permalink / raw)
  To: Masami Hiramatsu (Google), Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, linux-kernel, linux-trace-kernel,
	linux-doc, linux-kselftest
In-Reply-To: <178217907822.643090.14693478306190628970.stgit@devnote2>

Hi Masami,

On 6/23/26 02:44, Masami Hiramatsu (Google) wrote:

> +config PROBE_EVENTS_DUMP_FETCHARG
> +	depends on PROBE_EVENTS
> +	bool "Dump of dynamic probe event fetch-arguments"
> +	default n

Sorry, kconfig nitpick: could you match the style used by the rest of
the config options in this file? E.g. the type and prompt come first in
the list of attributes?

- Julian Braha

^ permalink raw reply

* Re: [PATCHv4 00/13] uprobes/x86: Fix red zone issue for optimized uprobes
From: Jiri Olsa @ 2026-06-23 19:11 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra
  Cc: Jiri Olsa, Ingo Molnar, Masami Hiramatsu, Andrii Nakryiko, bpf,
	linux-trace-kernel
In-Reply-To: <aiEiP54zktDqAZpG@krava>

hi, ping, thanks

jirka


On Thu, Jun 04, 2026 at 08:59:11AM +0200, Jiri Olsa wrote:
> On Tue, May 26, 2026 at 10:58:27PM +0200, Jiri Olsa wrote:
> > hi,
> > Andrii reported an issue with optimized uprobes [1] that can clobber
> > redzone area with call instruction storing return address on stack
> > where user code may keep temporary data without adjusting rsp.
> > 
> > Fixing this by moving the optimized uprobes on top of 10-bytes nop
> > instruction, so we can squeeze another instruction to escape the
> > redzone area before doing the call.
> > 
> > Note we need upstream update first for patch 3 (github.com/libbpf/usdt),
> > if we decide to take this change.
> > 
> > thanks,
> > jirka
> > 
> > 
> > v1: https://lore.kernel.org/bpf/20260514135342.22130-1-jolsa@kernel.org/
> > v2: https://lore.kernel.org/bpf/20260518105957.123445-1-jolsa@kernel.org/
> > v3: https://lore.kernel.org/bpf/20260521124411.31133-1-jolsa@kernel.org/
> > 
> > v4 changes:
> > - do not use 2nd int3 (at +5 offset) because the call instruction
> >   is always the same for the given nop10 address [Andrii/Peter]
> > - unmap unused trampoline vma after unsuccessful optimization [sashiko]
> > - small change to patch#2 moved user_64bit_mode earlier in the path
> >   and pass/use mm_struct pointer directly from arch_uprobe_optimize
> >   instead of getting current->mm
> >   Andrii, keeping your ack, please shout otherwise
> 
> hi,
> I think bots did not find anything substantial, I have just small
> selftests changes queued for v5
> 
> any other feedback/review would be great
> 
> thanks,
> jirka
> 
> 
> > 
> > v3 changes:
> > - use nop10 update suggested by Peter in [2]
> > - remove struct uprobe_trampoline object, use vma objects directly instead
> > - selftests fixes [sashiko]
> > - ack from Andrii
> > 
> > v2 changes:
> > - several selftest fixes [sashiko]
> > - consolidate is_lea_insn and is_call_insn into a single check [Jakub Sitnicki]
> > - use proper mm_struct object in __in_uprobe_trampoline check [sashiko]
> > - allow to copy uprobe trampolines vma objects on fork [sashiko]
> > - change uprobe syscall detection error from -ENXIO to -EPROTO [Andrii]
> > - added fork/clone tests
> > - I kept the selftest changes and nop5->nop10 changes in separate
> >   commits for easier review, we can squash them later if we want to keep
> >   bisect working properly
> > 
> > 
> > [1] https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
> > [2] https://lore.kernel.org/bpf/20260518104306.GU3102624@noisy.programming.kicks-ass.net/#t
> > ---
> > Andrii Nakryiko (1):
> >       selftests/bpf: Add tests for uprobe nop10 red zone clobbering
> > 
> > Jiri Olsa (12):
> >       uprobes/x86: Use proper mm_struct in __in_uprobe_trampoline
> >       uprobes/x86: Remove struct uprobe_trampoline object
> >       uprobes/x86: Allow to copy uprobe trampolines on fork
> >       uprobes/x86: Unmap trampoline vma object in case it's unused
> >       uprobes/x86: Move optimized uprobe from nop5 to nop10
> >       libbpf: Change has_nop_combo to work on top of nop10
> >       libbpf: Detect uprobe syscall with new error
> >       selftests/bpf: Emit nop,nop10 instructions combo for x86_64 arch
> >       selftests/bpf: Change uprobe syscall tests to use nop10
> >       selftests/bpf: Change uprobe/usdt trigger bench code to use nop10
> >       selftests/bpf: Add reattach tests for uprobe syscall
> >       selftests/bpf: Add tests for forked/cloned optimized uprobes
> > 
> >  arch/x86/kernel/uprobes.c                               | 379 +++++++++++++++++++++++++++++++++++++++++++-----------------------------
> >  include/linux/uprobes.h                                 |   5 -
> >  kernel/events/uprobes.c                                 |  10 --
> >  kernel/fork.c                                           |   1 -
> >  tools/lib/bpf/features.c                                |   4 +-
> >  tools/lib/bpf/usdt.c                                    |  16 +--
> >  tools/testing/selftests/bpf/bench.c                     |  20 ++--
> >  tools/testing/selftests/bpf/benchs/bench_trigger.c      |  38 ++++----
> >  tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh |   2 +-
> >  tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 307 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
> >  tools/testing/selftests/bpf/prog_tests/usdt.c           |  74 ++++++++++++--
> >  tools/testing/selftests/bpf/progs/test_usdt.c           |  25 +++++
> >  tools/testing/selftests/bpf/usdt.h                      |   2 +-
> >  tools/testing/selftests/bpf/usdt_2.c                    |  15 ++-
> >  14 files changed, 653 insertions(+), 245 deletions(-)

^ permalink raw reply

* Re: [PATCH] Documentation: tracing: fix typo in events documentation
From: Jonathan Corbet @ 2026-06-23 20:34 UTC (permalink / raw)
  To: Yudistira Putra, Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, Shuah Khan, linux-trace-kernel, linux-doc,
	linux-kernel, Yudistira Putra
In-Reply-To: <20260622143735.71778-1-pyudistira519@gmail.com>

Yudistira Putra <pyudistira519@gmail.com> writes:

> Fix a typo in the tracing events documentation: "can by built up"
> should be "can be built up".
>
> Signed-off-by: Yudistira Putra <pyudistira519@gmail.com>
> ---
>  Documentation/trace/events.rst | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
> index 18d112963dec..581f2260614b 100644
> --- a/Documentation/trace/events.rst
> +++ b/Documentation/trace/events.rst
> @@ -1064,7 +1064,7 @@ correct command type, and a pointer to an event-specific run_command()
>  callback that will be called to actually execute the event-specific
>  command function.
>  
> -Once that's done, the command string can by built up by successive
> +Once that's done, the command string can be built up by successive
>  calls to argument-adding functions.

Applied, thanks.

jon

^ permalink raw reply

* Re: [PATCH v8 01/46] KVM: guest_memfd: Introduce per-gmem attributes, use to guard user mappings
From: Ackerley Tng @ 2026-06-24  0:09 UTC (permalink / raw)
  To: Sean Christopherson, Binbin Wu
  Cc: aik, andrew.jones, brauner, chao.p.peng, david, jmattson,
	jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <ajnjTJdQKD1Kz3tf@google.com>

Sean Christopherson <seanjc@google.com> writes:

> On Mon, Jun 22, 2026, Binbin Wu wrote:
>> On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
>>
>> [...]
>>
>> >
>> > +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
>> > +{
>> > +	struct maple_tree *mt = &GMEM_I(inode)->attributes;
>> > +	void *entry = mtree_load(mt, index);
>> > +
>> > +	return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
>>
>> If the entry is unexpectedly missing, returning 0 means the attribute would
>> be treated as shared.  And then in kvm_gmem_fault_user_mapping(), it would
>> allow the userspace to fault in the folio.
>>
>> Should gmem deny such edge case?
>
> After several bugs this year where a WARN_ON_ONCE() fired, but was entirely
> insufficient to prevent true badness, I'm definitely sensitive to making the "bad"
> behavior as harmless as possible.
>

I guess both are indeed awkward.

> However, in this case I think we're just hosed.  If KVM treats the memory as
> private, KVM will incorrectly do prepare(), incorrectly allow populate(), and
> will cause missed invalidations (though I suppose __kvm_gmem_set_attributes()
> "only" lies to userspace in that case).
>
> That said, assuming SHARED is definitely odd for cases where guest_memfd *can't*
> hold shared memory.  Ditto for assuming PRIVATE.  What if we instead fall back to
> the "init" state, e.g.?
>
> static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
> {
> 	struct maple_tree *mt = &GMEM_I(inode)->attributes;
> 	void *entry = mtree_load(mt, index);
>
> 	if (WARN_ON_ONCE(!entry)) {
> 		bool shared = GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED;
>
> 		return shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;

I was wondering if we should not only return the init state but also set
the init state, but that would involve performing a conversion to the
init state... Too complicated for an edge case.

> 	}
>
> 	return xa_to_value(entry);
> }

Thanks Binbin and Sean!

^ permalink raw reply

* Re: [PATCH v8 04/46] KVM: Decouple kvm_has_arch_private_mem from CONFIG_KVM_VM_MEMORY_ATTRIBUTES
From: Ackerley Tng @ 2026-06-24  0:13 UTC (permalink / raw)
  To: Binbin Wu
  Cc: aik, andrew.jones, brauner, chao.p.peng, david, jmattson,
	jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <a21bfc05-787e-4cd8-89af-8579357e6a12@linux.intel.com>

Binbin Wu <binbin.wu@linux.intel.com> writes:

> On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
>> From: Sean Christopherson <seanjc@google.com>
>>
>> When memory attributes become trackable in guest_memfd, the concept of
>> having private memory is no longer dependent on
>> CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
>>
>> With this, on x86, kvm_arch_has_private_mem() is defined if some CoCo
>> platform support (or the testing CONFIG_KVM_SW_PROTECTED_VM) is compiled
>> in.
>>
>> Signed-off-by: Sean Christopherson <seanjc@google.com>
>> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
>
> One nit below.
>
>> ---
>>  arch/x86/include/asm/kvm_host.h | 4 +++-
>>  include/linux/kvm_host.h        | 2 +-
>>  2 files changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 8e8eb8a5e8a6b..1bde67cf6eb0e 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -2394,7 +2394,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>>  		       int tdp_max_root_level, int tdp_huge_page_level);
>>
>>
>> -#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>> +#if defined(CONFIG_KVM_SW_PROTECTED_VM) ||	\
>> +	defined(CONFIG_KVM_INTEL_TDX) ||	\
>> +	defined(CONFIG_KVM_AMD_SEV)
>
> Nit:
> Vertically align the defined(XXX) statements for better readability?
>

Sean had this aligned with spaces, and checkpatch complained about
having no spaces before tabs, so I switched it to tabs instead since I
don't think alignment like that is officially documented either way.

Either way is fine :)

>>  #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
>>  #endif
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 201d0f2143976..d370e834d619e 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -722,7 +722,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
>>  }
>>  #endif
>>
>> -#ifndef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>> +#ifndef kvm_arch_has_private_mem
>>  static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
>>  {
>>  	return false;
>>

^ permalink raw reply

* Re: [PATCH v8 05/46] KVM: Make CONFIG_KVM_VM_MEMORY_ATTRIBUTES selectable
From: Ackerley Tng @ 2026-06-24  0:14 UTC (permalink / raw)
  To: Sean Christopherson, Julian Braha
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <ajnQVkLvFl_lMuGB@google.com>

Sean Christopherson <seanjc@google.com> writes:

> On Fri, Jun 19, 2026, Julian Braha wrote:
>> Hi Ackerley,
>>
>> On 6/19/26 01:31, Ackerley Tng via B4 Relay wrote:
>>
>> >  config KVM_VM_MEMORY_ATTRIBUTES
>> > -	bool
>> > +	depends on KVM_SW_PROTECTED_VM || KVM_INTEL_TDX || KVM_AMD_SEV
>> > +	bool "Enable per-VM PRIVATE vs. SHARED attributes (for CoCo VMs)"
>>
>> Sorry for the style nitpick, but could you keep the type and prompt as
>> the first attribute in the Kconfig option definition (like the other
>> options do)?
>
> No need to be sorry, I've no idea why I put the "depends" first.  I don't even
> know if that qualifies as a nit :-)
>
> Ackerley, if you can provide your SoB (for Fuad's feedback), I can fixup when
> applying (assuming nothing else necessitates v9).

Thanks, didn't notice this when consolidating this revision.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>

^ permalink raw reply

* Re: [PATCH v7 03/10] tracing/probes: Support dumping fetcharg program for debugging dynamic events
From: Masami Hiramatsu @ 2026-06-24  0:18 UTC (permalink / raw)
  To: Julian Braha
  Cc: Steven Rostedt, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <96b043ed-c527-4e5d-8eb7-631805da53fd@gmail.com>

On Tue, 23 Jun 2026 19:31:47 +0100
Julian Braha <julianbraha@gmail.com> wrote:

> Hi Masami,
> 
> On 6/23/26 02:44, Masami Hiramatsu (Google) wrote:
> 
> > +config PROBE_EVENTS_DUMP_FETCHARG
> > +	depends on PROBE_EVENTS
> > +	bool "Dump of dynamic probe event fetch-arguments"
> > +	default n
> 
> Sorry, kconfig nitpick: could you match the style used by the rest of
> the config options in this file? E.g. the type and prompt come first in
> the list of attributes?

Ah, good catch! Let me fix it.

Thanks,

> 
> - Julian Braha


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH] tracing/probes: ignore id update from btf_type_skip_modifiers
From: Masami Hiramatsu @ 2026-06-24  0:24 UTC (permalink / raw)
  To: Martin Kaiser; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <20260623132937.3494895-1-martin@kaiser.cx>

On Tue, 23 Jun 2026 15:29:32 +0200
Martin Kaiser <martin@kaiser.cx> wrote:

> We can pass NULL as id pointer to btf_type_skip_modifiers if we do not
> need the id of the returned btf_type.
> 

Good catch! Let me pick it to probes/core.

Thanks,

> Signed-off-by: Martin Kaiser <martin@kaiser.cx>
> ---
>  kernel/trace/trace_probe.c | 13 +++++--------
>  1 file changed, 5 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> index 9b3219e755cb..78bca283763f 100644
> --- a/kernel/trace/trace_probe.c
> +++ b/kernel/trace/trace_probe.c
> @@ -360,9 +360,8 @@ static bool btf_type_is_char_ptr(struct btf *btf, const struct btf_type *type)
>  {
>  	const struct btf_type *real_type;
>  	u32 intdata;
> -	s32 tid;
>  
> -	real_type = btf_type_skip_modifiers(btf, type->type, &tid);
> +	real_type = btf_type_skip_modifiers(btf, type->type, NULL);
>  	if (!real_type)
>  		return false;
>  
> @@ -379,14 +378,13 @@ static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type)
>  	const struct btf_type *real_type;
>  	const struct btf_array *array;
>  	u32 intdata;
> -	s32 tid;
>  
>  	if (BTF_INFO_KIND(type->info) != BTF_KIND_ARRAY)
>  		return false;
>  
>  	array = (const struct btf_array *)(type + 1);
>  
> -	real_type = btf_type_skip_modifiers(btf, array->type, &tid);
> +	real_type = btf_type_skip_modifiers(btf, array->type, NULL);
>  
>  	intdata = btf_type_int(real_type);
>  	return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED)
> @@ -589,7 +587,6 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
>  	struct btf *btf = ctx_btf(ctx);
>  	char *next;
>  	int is_ptr;
> -	s32 tid;
>  
>  	do {
>  		if (!is_struct) {
> @@ -600,7 +597,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
>  			}
>  
>  			/* Convert a struct pointer type to a struct type */
> -			type = btf_type_skip_modifiers(btf, type->type, &tid);
> +			type = btf_type_skip_modifiers(btf, type->type, NULL);
>  			if (!type) {
>  				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
>  				return -EINVAL;
> @@ -640,7 +637,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
>  				ctx->last_bitsize = 0;
>  			}
>  
> -			type = btf_type_skip_modifiers(btf, field->type, &tid);
> +			type = btf_type_skip_modifiers(btf, field->type, NULL);
>  			if (!type) {
>  				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
>  				return -EINVAL;
> @@ -759,7 +756,7 @@ static int parse_btf_arg(char *varname,
>  	return -ENOENT;
>  
>  found:
> -	type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
> +	type = btf_type_skip_modifiers(ctx->btf, tid, NULL);
>  found_type:
>  	if (!type) {
>  		trace_probe_log_err(ctx->offset, BAD_BTF_TID);
> -- 
> 2.43.7
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH] tracing/fprobe: Fix NULL pointer dereference in fprobe_fgraph_entry()
From: Masami Hiramatsu @ 2026-06-24  0:37 UTC (permalink / raw)
  To: Sechang Lim
  Cc: Steven Rostedt, Mathieu Desnoyers, Heiko Carstens, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260619184425.3824774-1-rhkrqnwk98@gmail.com>

On Fri, 19 Jun 2026 18:44:24 +0000
Sechang Lim <rhkrqnwk98@gmail.com> wrote:

> fprobe_fgraph_entry() sizes a shadow-stack reservation in one walk of
> the per-ip fprobe list and fills it in a second walk, both under
> rcu_read_lock() only. A fprobe registered on an already-live ip can
> become visible between the two walks, so the fill walk processes an
> exit_handler the sizing walk did not count and used runs past
> reserved_words. If the sizing walk counted nothing, fgraph_data is NULL
> and the first write_fprobe_header() faults:
> 
>   Oops: general protection fault, probably for non-canonical address ...
>   KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
>   RIP: 0010:fprobe_fgraph_entry+0xa38/0xf10 kernel/trace/fprobe.c:167
>   Call Trace:
>    <TASK>
>    function_graph_enter_regs+0x44c/0xa10 kernel/trace/fgraph.c:677
>    ftrace_graph_func+0xc5/0x140 arch/x86/kernel/ftrace.c:671
>    __kernel_text_address+0x9/0x40 kernel/extable.c:78
>    arch_stack_walk+0x117/0x170 arch/x86/kernel/stacktrace.c:26
>    kmem_cache_free+0x188/0x580 mm/slub.c:6378
>    tcp_data_queue+0x18d/0x6550 net/ipv4/tcp_input.c:5590
>    [...]
>    </TASK>
> 
> The list cannot be frozen across the two walks, so skip a node that does
> not fit the reservation and count it as missed.
> 

Ah, good catch! Yes, if the list is scanned repeatedly, there is a
possibility that it could be updated even when using RCU guards.
This is a rare case, but hmm, I think we need something like
fgraph_increment_reserved_data() to avoid skipping.
Anyway, this looks good to me. Let me pick it.

Thanks!

> Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer")
> Signed-off-by: Sechang Lim <rhkrqnwk98@gmail.com>
> ---
>  kernel/trace/fprobe.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
> index f378613ad120..f215990b9061 100644
> --- a/kernel/trace/fprobe.c
> +++ b/kernel/trace/fprobe.c
> @@ -613,6 +613,16 @@ static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops
>  			continue;
>  
>  		data_size = fp->entry_data_size;
> +		/*
> +		 * The list may have grown since it was sized, so this node
> +		 * may not fit. Skip it as missed rather than overrun the
> +		 * reservation.
> +		 */
> +		if (fp->exit_handler &&
> +		    used + FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(data_size) > reserved_words) {
> +			fp->nmissed++;
> +			continue;
> +		}
>  		if (data_size && fp->exit_handler)
>  			data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG;
>  		else
> -- 
> 2.43.0
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH v2 1/1] rtla: fix missing unistd include
From: Andreas Ziegler @ 2026-06-24  3:33 UTC (permalink / raw)
  To: Tomas Glozar
  Cc: Steven Rostedt, linux-trace-kernel, linux-kernel, Andreas Ziegler
In-Reply-To: <CAP4=nvRozLS73PooixqWUhDh19eG6oPLTCSV8HvjhibsTLswtw@mail.gmail.com>

Compiling RTLA 7.1.x with GCC 16 and uClibc as standard library fails
with these errors:

src/common.c: In function ‘set_signals’:
src/common.c:40:17: error: implicit declaration of function ‘alarm’ [-Wimplicit-function-declaration]
   40 |                 alarm(params->duration);
      |                 ^~~~~
src/common.c: In function ‘common_apply_config’:
src/common.c:187:44: error: implicit declaration of function ‘getpid’; did you mean ‘getpt’? [-Wimplicit-function-declaration]
  187 |                 retval = sched_setaffinity(getpid(), sizeof(params->hk_cpu_set),
      |                                            ^~~~~~
      |                                            getpt
In file included from src/common.c:9:
src/common.c: In function ‘run_tool’:
src/common.c:262:19: error: implicit declaration of function ‘sysconf’; did you mean ‘sscanf’? [-Wimplicit-function-declaration]
  262 |         nr_cpus = get_nprocs_conf();
      |                   ^~~~~~~~~~~~~~~
src/common.c:262:19: error: ‘_SC_NPROCESSORS_CONF’ undeclared (first use in this function)
  262 |         nr_cpus = get_nprocs_conf();
      |                   ^~~~~~~~~~~~~~~
src/common.c:262:19: note: each undeclared identifier is reported only once for each function it appears in
src/common.c:370:17: error: implicit declaration of function ‘sleep’ [-Wimplicit-function-declaration]
  370 |                 sleep(1);
      |                 ^~~~~

Restore the missing unistd.h include.

Fixes: 115b06a00875 ("tools/rtla: Consolidate nr_cpus usage across all tools")

Signed-off-by: Andreas Ziegler <br025@umbiko.net>
---
Changes v1 -> v2:
  adapt commit message
  correct fixes: formatting
  rebase on current master (502d801f0ab0)

 tools/tracing/rtla/src/common.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index d0a8a6edbf0c..8c7f5e75b2ec 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -5,6 +5,7 @@
 #include <signal.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 #include <sys/sysinfo.h>
 
 #include "common.h"
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3] tracing: Use seq_buf for string concatenation
From: Masami Hiramatsu @ 2026-06-24  5:03 UTC (permalink / raw)
  To: Woradorn Laodhanadhaworn
  Cc: rostedt, mhiramat, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, linux-hardening, linux-kernel-mentees, shuah,
	skhan, me, jkoolstra
In-Reply-To: <20260623145147.12145-1-woradorn.laon@gmail.com>

On Tue, 23 Jun 2026 21:51:47 +0700
Woradorn Laodhanadhaworn <woradorn.laon@gmail.com> wrote:

> In preparation for removing the strlcat API[1],
> replace the string concatenation logic with a struct seq_buf,
> which tracks the current position and the remaining space internally.
> 
> Use seq_buf_str() to NUL-terminate before passing to early_enable_events().
> 
> Link: https://github.com/KSPP/linux/issues/370 [1]
> 

Looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks,

> Signed-off-by: Woradorn Laodhanadhaworn <woradorn.laon@gmail.com>
> ---
> v1 -> v2: 
> 	- Fixed WARN_ON when booting with empty trace_event.
> v2 -> v3: 
> 	- Addressed Sashiko's concern about the compound literal backing buffer.
> 	- Replaced the compound literal with an explicitly declared buffer and pointed
> 	seq_buf.buffer to it. This guarantees the backing storage is placed in 
> 	`.init.data` and reclaimed after boot.
> 
> v1: https://lore.kernel.org/all/20260620175441.223342-1-woradorn.laon@gmail.com
> v2: https://lore.kernel.org/all/20260622094623.18469-1-woradorn.laon@gmail.com
> Sashiko: https://sashiko.dev/#/patchset/20260622094623.18469-1-woradorn.laon%40gmail.com
> 
>  kernel/trace/trace_events.c | 18 +++++++++++++-----
>  1 file changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index c46e623e7e0d..5ab630155ab6 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -22,6 +22,7 @@
>  #include <linux/sort.h>
>  #include <linux/slab.h>
>  #include <linux/delay.h>
> +#include <linux/seq_buf.h>
>  
>  #include <trace/events/sched.h>
>  #include <trace/syscall.h>
> @@ -4501,13 +4502,20 @@ extern struct trace_event_call *__start_ftrace_events[];
>  extern struct trace_event_call *__stop_ftrace_events[];
>  
>  static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
> +static struct seq_buf bootup_event_seq __initdata = {
> +	.buffer = bootup_event_buf,
> +	.size = COMMAND_LINE_SIZE,
> +};
>  
>  static __init int setup_trace_event(char *str)
>  {
> -	if (bootup_event_buf[0] != '\0')
> -		strlcat(bootup_event_buf, ",", COMMAND_LINE_SIZE);
> +	if (seq_buf_used(&bootup_event_seq) > 0)
> +		seq_buf_puts(&bootup_event_seq, ",");
> +
> +	seq_buf_puts(&bootup_event_seq, str);
>  
> -	strlcat(bootup_event_buf, str, COMMAND_LINE_SIZE);
> +	if (seq_buf_has_overflowed(&bootup_event_seq))
> +		return -ENOMEM;
>  
>  	trace_set_ring_buffer_expanded(NULL);
>  	disable_tracing_selftest("running event tracing");
> @@ -4766,7 +4774,7 @@ static __init int event_trace_enable(void)
>  	 */
>  	__trace_early_add_events(tr);
>  
> -	early_enable_events(tr, bootup_event_buf, false);
> +	early_enable_events(tr, (char *)seq_buf_str(&bootup_event_seq), false);
>  
>  	trace_printk_start_comm();
>  
> @@ -4794,7 +4802,7 @@ static __init int event_trace_enable_again(void)
>  	if (!tr)
>  		return -ENODEV;
>  
> -	early_enable_events(tr, bootup_event_buf, true);
> +	early_enable_events(tr, (char *)seq_buf_str(&bootup_event_seq), true);
>  
>  	return 0;
>  }
> -- 
> 2.43.0
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH] tracing: Fix NULL pointer dereference in func_set_flag()
From: Yuanhe Shu @ 2026-06-24  6:17 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel, stable,
	Yuanhe Shu

func_set_flag() dereferences tr->current_trace_flags before verifying
that the current tracer is actually the function tracer. When the active
tracer has been switched away from "function" (e.g., to "wakeup_rt"),
tr->current_trace_flags can be NULL, leading to a NULL pointer
dereference and kernel crash.

The call chain that triggers this is:

  trace_options_write()
    -> __set_tracer_option()
      -> trace->set_flag()          /* func_set_flag */

In func_set_flag(), the first operation is:

  if (!!set == !!(tr->current_trace_flags->val & bit))

This dereferences tr->current_trace_flags unconditionally. The safety
check that guards against a non-function tracer:

  if (tr->current_trace != &function_trace)
      return 0;

is placed *after* the dereference, which is too late.

This was observed with the following crash dump:

  BUG: unable to handle page fault at 0000000000000000
  RIP: func_set_flag+0xd

  Call Trace:
   __set_tracer_option+0x27
   trace_options_write+0x75
   vfs_write+0x12a
   ksys_write+0x66
   do_syscall_64+0x5b

  RIP: ffffffff914c973d  RSP: ff67ec88b01dfdf0  RFLAGS: 00010202
  RAX: 0000000000000000  RBX: ff3a826e80354580  RCX: 0000000000000001
  RDX: 0000000000000001  RSI: 0000000000000000  RDI: ffffffff93918080

The disassembly confirms the fault:

  func_set_flag+0:   mov 0x1f08(%rdi), %rax  ; RAX = tr->current_trace_flags = NULL
  func_set_flag+13:  mov (%rax), %eax        ; page fault: dereference NULL

At the time of the crash:
  tr->current_trace_flags = 0x0 (NULL)
  tr->current_trace = wakeup_rt_tracer (not function_trace)

The scenario is that a process opens a function tracer option file (such
as "func_stack_trace"), then the current tracer is switched to another
tracer (e.g., "wakeup_rt"), which sets current_trace_flags to NULL. When
the process subsequently writes to the option file, func_set_flag() is
invoked and crashes on the NULL dereference.

Fix this by moving the current_trace check before the
current_trace_flags dereference, so that func_set_flag() returns early
when the function tracer is not active.

Cc: stable@vger.kernel.org
Fixes: 76680d0d2825 ("tracing: Have function tracer define options per instance")
Signed-off-by: Yuanhe Shu <xiangzao@linux.alibaba.com>
---
 kernel/trace/trace_functions.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index f283391a4dc8..cd37f2013758 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -458,12 +458,12 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
 	ftrace_func_t func;
 	u32 new_flags;
 
-	/* Do nothing if already set. */
-	if (!!set == !!(tr->current_trace_flags->val & bit))
+	/* We can change this flag only when current tracer is function. */
+	if (tr->current_trace != &function_trace)
 		return 0;
 
-	/* We can change this flag only when not running. */
-	if (tr->current_trace != &function_trace)
+	/* Do nothing if already set. */
+	if (!!set == !!(tr->current_trace_flags->val & bit))
 		return 0;
 
 	new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0);
-- 
2.39.3


^ permalink raw reply related

* Re: [PATCH v3] tracing: Use seq_buf for string concatenation
From: Markus Elfring @ 2026-06-24  7:15 UTC (permalink / raw)
  To: Woradorn Laodhanadhaworn, linux-trace-kernel, linux-hardening,
	linux-kernel-mentees, Steven Rostedt, Masami Hiramatsu
  Cc: LKML, Brigham Campbell, Jori Koolstra, Mathieu Desnoyers,
	Shuah Khan, Shuah Khan
In-Reply-To: <20260623145147.12145-1-woradorn.laon@gmail.com>

…
> +++ b/kernel/trace/trace_events.c
> @@ -4501,13 +4502,20 @@ extern struct trace_event_call *__start_ftrace_events[];
>  static __init int setup_trace_event(char *str)
>  {
> -	if (bootup_event_buf[0] != '\0')
> -		strlcat(bootup_event_buf, ",", COMMAND_LINE_SIZE);
> +	if (seq_buf_used(&bootup_event_seq) > 0)
> +		seq_buf_puts(&bootup_event_seq, ",");
…

I suggest using the function “seq_buf_putc” here instead.
https://elixir.bootlin.com/linux/v7.1.1/source/lib/seq_buf.c#L203-L221

Is there a need for corresponding error detection?

Regards,
Markus

^ permalink raw reply

* Re: [syzbot] [trace?] general protection fault in mtree_load
From: Jiri Olsa @ 2026-06-24  7:49 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: syzbot, bp, dave.hansen, hpa, linux-kernel, linux-trace-kernel,
	mhiramat, mingo, peterz, syzkaller-bugs, tglx, x86
In-Reply-To: <ajky0IbEvV_UDj2a@redhat.com>

On Mon, Jun 22, 2026 at 03:04:16PM +0200, Oleg Nesterov wrote:
> On 06/21, syzbot wrote:
> >
> > Hello,
> >
> > syzbot found the following issue on:
> >
> > HEAD commit:    6b5a2b7d9bc1 Merge tag 'trace-tools-v7.2' of git://git.ker..
> > git tree:       upstream
> > console output: https://syzkaller.appspot.com/x/log.txt?x=16d56986580000
> > kernel config:  https://syzkaller.appspot.com/x/.config?x=ea6584355d75e0cd
> > dashboard link: https://syzkaller.appspot.com/bug?extid=61ce80689253f42e6d80
> > compiler:       gcc (Debian 14.2.0-19) 14.2.0, GNU ld (GNU Binutils for Debian) 2.44
> >
> > Unfortunately, I don't have any reproducer for this issue yet.
> >
> > Downloadable assets:
> > disk image (non-bootable): https://storage.googleapis.com/syzbot-assets/d900f083ada3/non_bootable_disk-6b5a2b7d.raw.xz
> > vmlinux: https://storage.googleapis.com/syzbot-assets/b3cb0499fbe9/vmlinux-6b5a2b7d.xz
> > kernel image: https://storage.googleapis.com/syzbot-assets/47cfbe57f6ea/bzImage-6b5a2b7d.xz
> >
> > IMPORTANT: if you fix the issue, please add the following tag to the commit:
> > Reported-by: syzbot+61ce80689253f42e6d80@syzkaller.appspotmail.com
> >
> > Oops: general protection fault, probably for non-canonical address 0xdffffc0000000011: 0000 [#1] SMP KASAN NOPTI
> > KASAN: null-ptr-deref in range [0x0000000000000088-0x000000000000008f]
> > CPU: 3 UID: 0 PID: 24402 Comm: syz.4.5217 Tainted: G             L      syzkaller #0 PREEMPT(full)
> > Tainted: [L]=SOFTLOCKUP
> > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> > RIP: 0010:mas_root lib/maple_tree.c:759 [inline]
> > RIP: 0010:mas_start lib/maple_tree.c:1179 [inline]
> > RIP: 0010:mtree_load+0x16d/0xa90 lib/maple_tree.c:5657
> > Code: 00 00 00 00 48 c7 44 24 78 ff ff ff ff e8 6b bd 84 f6 48 8b 5c 24 50 c6 84 24 9c 00 00 00 00 48 8d 7b 48 48 89 f8 48 c1 e8 03 <42> 80 3c 20 00 0f 85 d6 08 00 00 48 8b 5b 48 e8 6f 1a 08 00 31 ff
> > RSP: 0018:ffffc900039c76d8 EFLAGS: 00010206
> > RAX: 0000000000000011 RBX: 0000000000000040 RCX: ffffffff8b848746
> > RDX: ffff888041b6a540 RSI: ffffffff8b848775 RDI: 0000000000000088
> > RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000001
> > R10: 0000000000000001 R11: 000000000000751b R12: dffffc0000000000
> > R13: ffff88802693adc0 R14: 00001fff904365a7 R15: dffffc0000000000
> > FS:  0000000000000000(0000) GS:ffff8880d665f000(0000) knlGS:0000000000000000
> > CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > CR2: 00007f44aa04f156 CR3: 00000000364d5000 CR4: 0000000000352ef0
> > Call Trace:
> >  <TASK>
> >  vma_lookup include/linux/mm.h:4204 [inline]
> >  __in_uprobe_trampoline arch/x86/kernel/uprobes.c:766 [inline]
> >  __is_optimized arch/x86/kernel/uprobes.c:1056 [inline]
> >  is_optimized arch/x86/kernel/uprobes.c:1067 [inline]
> >  set_orig_insn+0x1ec/0x2a0 arch/x86/kernel/uprobes.c:1098
> >  remove_breakpoint kernel/events/uprobes.c:1185 [inline]
> >  register_for_each_vma+0xbb7/0xdb0 kernel/events/uprobes.c:1318
> >  uprobe_unregister_nosync+0x12a/0x1c0 kernel/events/uprobes.c:1343
> >  bpf_uprobe_unregister kernel/trace/bpf_trace.c:2936 [inline]
> >  bpf_uprobe_multi_link_release+0xb3/0x1c0 kernel/trace/bpf_trace.c:2947
> >  bpf_link_free+0xec/0x4a0 kernel/bpf/syscall.c:3273
> >  bpf_link_put_direct kernel/bpf/syscall.c:3326 [inline]
> >  bpf_link_release+0x5d/0x80 kernel/bpf/syscall.c:3333
> >  __fput+0x3ff/0xb50 fs/file_table.c:512
> >  task_work_run+0x150/0x240 kernel/task_work.c:233
> >  exit_task_work include/linux/task_work.h:40 [inline]
> 
> current->mm is already NULL, the exiting task has already passed exit_mm().
> 
> Hopefully
> 
> 	[PATCHv4 01/13] uprobes/x86: Use proper mm_struct in __in_uprobe_trampoline
> 	https://lore.kernel.org/all/20260526205840.173790-2-jolsa@kernel.org/
> 
> should help...

yes, that should fix it

thanks,
jirka

^ permalink raw reply

* [PATCH v3 0/2] tracing: Remove trace_printk.h from kernel.h
From: Steven Rostedt @ 2026-06-24  8:18 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Linus Torvalds, Sebastian Andrzej Siewior, John Ogness,
	Thomas Gleixner, Peter Zijlstra, Julia Lawall, Yury Norov

Remove trace_printk.h from kernel.h by creating a trace_controls.h for those
places that need access to tracing prototypes like tracing_off(), and by
including trace_printk.h directly in the places that need trace_printk().

Changes since v2: https://lore.kernel.org/all/20260622130739.375198646@kernel.org/

- Update change log in patch 1

- Remove #ifdef DEBUG and always include trace_printk.h in patch 2.

Steven Rostedt (2):
      tracing: Move non-trace_printk prototypes into trace_controls.h
      tracing: Remove trace_printk.h from kernel.h

----
 arch/powerpc/kvm/book3s_xics.c         |  1 +
 arch/powerpc/xmon/xmon.c               |  1 +
 arch/s390/kernel/ipl.c                 |  1 +
 arch/s390/kernel/machine_kexec.c       |  1 +
 drivers/gpu/drm/i915/gt/intel_gtt.h    |  1 +
 drivers/gpu/drm/i915/i915_gem.h        |  2 ++
 drivers/hwtracing/stm/dummy_stm.c      |  1 +
 drivers/infiniband/hw/hfi1/trace_dbg.h |  1 +
 drivers/tty/sysrq.c                    |  1 +
 drivers/usb/early/xhci-dbc.c           |  1 +
 fs/ext4/inline.c                       |  1 +
 include/linux/ftrace.h                 |  2 ++
 include/linux/kernel.h                 |  1 -
 include/linux/sunrpc/debug.h           |  1 +
 include/linux/trace_controls.h         | 54 ++++++++++++++++++++++++++++++++
 include/linux/trace_printk.h           | 56 ++--------------------------------
 kernel/debug/debug_core.c              |  1 +
 kernel/panic.c                         |  1 +
 kernel/rcu/rcu.h                       |  2 ++
 kernel/rcu/rcutorture.c                |  1 +
 kernel/trace/ring_buffer_benchmark.c   |  1 +
 kernel/trace/trace.h                   |  1 +
 kernel/trace/trace_benchmark.c         |  1 +
 lib/sys_info.c                         |  1 +
 samples/fprobe/fprobe_example.c        |  1 +
 samples/ftrace/ftrace-direct-too.c     |  1 -
 samples/trace_printk/trace-printk.c    |  1 +
 27 files changed, 83 insertions(+), 55 deletions(-)
 create mode 100644 include/linux/trace_controls.h

^ permalink raw reply

* [PATCH v3 2/2] tracing: Remove trace_printk.h from kernel.h
From: Steven Rostedt @ 2026-06-24  8:18 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Linus Torvalds, Sebastian Andrzej Siewior, John Ogness,
	Thomas Gleixner, Peter Zijlstra, Julia Lawall, Yury Norov
In-Reply-To: <20260624081806.120105649@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

There have been complaints about trace_printk.h causing more build time
for being in kernel.h. Move it out of kernel.h and place it in the headers
and C files that use it.

Link: https://lore.kernel.org/all/CAHk-=wikCBeVFjVXiY4o-oepdbjAoir5+TcAgtL12c4u1TpZLQ@mail.gmail.com/

Suggested-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
Changes since v2: https://patch.msgid.link/20260622131029.816825024@kernel.org

- Remove #ifdef DEBUG and just always include trace_printk.h in dummy_stm.c.

 arch/powerpc/kvm/book3s_xics.c         | 1 +
 drivers/gpu/drm/i915/gt/intel_gtt.h    | 1 +
 drivers/gpu/drm/i915/i915_gem.h        | 1 +
 drivers/hwtracing/stm/dummy_stm.c      | 1 +
 drivers/infiniband/hw/hfi1/trace_dbg.h | 1 +
 drivers/usb/early/xhci-dbc.c           | 1 +
 fs/ext4/inline.c                       | 1 +
 include/linux/ftrace.h                 | 2 ++
 include/linux/kernel.h                 | 1 -
 include/linux/sunrpc/debug.h           | 1 +
 include/linux/trace_printk.h           | 5 +++--
 kernel/trace/ring_buffer_benchmark.c   | 1 +
 samples/fprobe/fprobe_example.c        | 1 +
 samples/ftrace/ftrace-direct-too.c     | 1 -
 samples/trace_printk/trace-printk.c    | 1 +
 15 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 74a44fa702b0..ef5eb596a56e 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -26,6 +26,7 @@
 #if 1
 #define XICS_DBG(fmt...) do { } while (0)
 #else
+#include <linux/trace_printk.h>
 #define XICS_DBG(fmt...) trace_printk(fmt)
 #endif
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
index b54ee4f25af1..f6f223090760 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -35,6 +35,7 @@
 #define I915_GFP_ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
 
 #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GTT)
+#include <linux/trace_printk.h>
 #define GTT_TRACE(...) trace_printk(__VA_ARGS__)
 #else
 #define GTT_TRACE(...)
diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index 1da8fb61c09e..f490052e8964 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -117,6 +117,7 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file);
 
 #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)
 #include <linux/trace_controls.h>
+#include <linux/trace_printk.h>
 #define GEM_TRACE(...) trace_printk(__VA_ARGS__)
 #define GEM_TRACE_ERR(...) do {						\
 	pr_err(__VA_ARGS__);						\
diff --git a/drivers/hwtracing/stm/dummy_stm.c b/drivers/hwtracing/stm/dummy_stm.c
index 38528ffdc0b3..7c5e48ebfb9f 100644
--- a/drivers/hwtracing/stm/dummy_stm.c
+++ b/drivers/hwtracing/stm/dummy_stm.c
@@ -8,6 +8,7 @@
  */
 
 #undef DEBUG
+#include <linux/trace_printk.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
index 58304b91380f..30df5e246586 100644
--- a/drivers/infiniband/hw/hfi1/trace_dbg.h
+++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
@@ -103,6 +103,7 @@ __hfi1_trace_def(IOCTL);
  */
 
 #ifdef HFI1_EARLY_DBG
+#include <linux/trace_printk.h>
 #define hfi1_dbg_early(fmt, ...) \
 	trace_printk(fmt, ##__VA_ARGS__)
 #else
diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c
index 41118bba9197..955c73bd601f 100644
--- a/drivers/usb/early/xhci-dbc.c
+++ b/drivers/usb/early/xhci-dbc.c
@@ -30,6 +30,7 @@ static struct xdbc_state xdbc;
 static bool early_console_keep;
 
 #ifdef XDBC_TRACE
+#include <linux/trace_printk.h>
 #define	xdbc_trace	trace_printk
 #else
 static inline void xdbc_trace(const char *fmt, ...) { }
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8045e4ff270c..0eff4a0c6a6c 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -934,6 +934,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
 }
 
 #ifdef INLINE_DIR_DEBUG
+#include <linux/trace_printk.h>
 void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
 			  void *inline_start, int inline_size)
 {
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 02bc5027523a..b5336a81e619 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -8,6 +8,8 @@
 #define _LINUX_FTRACE_H
 
 #include <linux/trace_recursion.h>
+#include <linux/trace_controls.h>
+#include <linux/trace_printk.h>
 #include <linux/trace_clock.h>
 #include <linux/jump_label.h>
 #include <linux/kallsyms.h>
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e5570a16cbb1..e87a40fbd152 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -31,7 +31,6 @@
 #include <linux/build_bug.h>
 #include <linux/sprintf.h>
 #include <linux/static_call_types.h>
-#include <linux/trace_printk.h>
 #include <linux/util_macros.h>
 #include <linux/wordpart.h>
 
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index ab61bed2f7af..7524f5d82fba 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -29,6 +29,7 @@ extern unsigned int		nlm_debug;
 # define ifdebug(fac)		if (unlikely(rpc_debug & RPCDBG_##fac))
 
 # if IS_ENABLED(CONFIG_SUNRPC_DEBUG_TRACE)
+#  include <linux/trace_printk.h>
 #  define __sunrpc_printk(fmt, ...)	trace_printk(fmt, ##__VA_ARGS__)
 # else
 #  define __sunrpc_printk(fmt, ...)	printk(KERN_DEFAULT fmt, ##__VA_ARGS__)
diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
index a488ea9e9f85..74ce4f8995c4 100644
--- a/include/linux/trace_printk.h
+++ b/include/linux/trace_printk.h
@@ -1,11 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_TRACE_PRINTK_H
 #define _LINUX_TRACE_PRINTK_H
+#if !defined(__ASSEMBLY__) && !defined(__GENKSYMS__) && !defined(BUILD_VDSO)
 
-#include <linux/compiler_attributes.h>
 #include <linux/instruction_pointer.h>
 #include <linux/stddef.h>
 #include <linux/stringify.h>
+#include <linux/stdarg.h>
 
 #ifdef CONFIG_TRACING
 static inline __printf(1, 2)
@@ -147,5 +148,5 @@ ftrace_vprintk(const char *fmt, va_list ap)
 	return 0;
 }
 #endif /* CONFIG_TRACING */
-
+#endif /* !defined(__ASSEMBLY__) && !defined(__GENKSYMS__) && !defined(BUILD_VDSO) */
 #endif
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 593e3b59e42e..2bb25caebb75 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/trace_printk.h>
 #include <linux/completion.h>
 #include <linux/kthread.h>
 #include <uapi/linux/sched/types.h>
diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
index bfe98ce826f3..de81b9b4ca7d 100644
--- a/samples/fprobe/fprobe_example.c
+++ b/samples/fprobe/fprobe_example.c
@@ -12,6 +12,7 @@
 
 #define pr_fmt(fmt) "%s: " fmt, __func__
 
+#include <linux/trace_printk.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/fprobe.h>
diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c
index bf2411aa6fd7..159190f4103f 100644
--- a/samples/ftrace/ftrace-direct-too.c
+++ b/samples/ftrace/ftrace-direct-too.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/module.h>
-
 #include <linux/mm.h> /* for handle_mm_fault() */
 #include <linux/ftrace.h>
 #if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c
index cfc159580263..ff37aeb8523e 100644
--- a/samples/trace_printk/trace-printk.c
+++ b/samples/trace_printk/trace-printk.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
+#include <linux/trace_printk.h>
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/irq_work.h>
-- 
2.53.0



^ permalink raw reply related

* [PATCH v3 1/2] tracing: Move non-trace_printk prototypes into trace_controls.h
From: Steven Rostedt @ 2026-06-24  8:18 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Linus Torvalds, Sebastian Andrzej Siewior, John Ogness,
	Thomas Gleixner, Peter Zijlstra, Julia Lawall, Yury Norov
In-Reply-To: <20260624081806.120105649@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

In order to remove the include of trace_printk.h from kernel.h, the tracing
control prototypes need to be separated into their own header file as they
are used in other common header files like rcu.h. There's no point in
removing trace_printk.h from kernel.h if it just gets added back to other
common headers.

Prototypes are very cheap for the compiler and should not be an issue.

ftrace_dump() and trace_dump_stack() are also moved into trace_controls.h,
as they are used in cases where things go wrong. The main use case is to
do a trace_dump_stack(); tracing_off(); ftrace_dump(); in a place that
detected that something went wrong, whereas, trace_printk() is added to
normal code during debugging and removed before committing upstream. The
dump code is fine to keep in production.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
Changes since v2: https://patch.msgid.link/20260622131029.655382134@kernel.org

- Updated the change log

 arch/powerpc/xmon/xmon.c         |  1 +
 arch/s390/kernel/ipl.c           |  1 +
 arch/s390/kernel/machine_kexec.c |  1 +
 drivers/gpu/drm/i915/i915_gem.h  |  1 +
 drivers/tty/sysrq.c              |  1 +
 include/linux/trace_controls.h   | 54 ++++++++++++++++++++++++++++++++
 include/linux/trace_printk.h     | 51 ------------------------------
 kernel/debug/debug_core.c        |  1 +
 kernel/panic.c                   |  1 +
 kernel/rcu/rcu.h                 |  2 ++
 kernel/rcu/rcutorture.c          |  1 +
 kernel/trace/trace.h             |  1 +
 kernel/trace/trace_benchmark.c   |  1 +
 lib/sys_info.c                   |  1 +
 14 files changed, 67 insertions(+), 51 deletions(-)
 create mode 100644 include/linux/trace_controls.h

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index cb3a3244ae6f..2135f319e0dd 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/debugfs.h>
+#include <linux/trace_controls.h>
 
 #include <asm/ptrace.h>
 #include <asm/smp.h>
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 3c346b02ceb9..baac66cc4de4 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -22,6 +22,7 @@
 #include <linux/debug_locks.h>
 #include <linux/vmalloc.h>
 #include <linux/secure_boot.h>
+#include <linux/trace_controls.h>
 #include <asm/asm-extable.h>
 #include <asm/machine.h>
 #include <asm/diag.h>
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index baeb3dcfc1c8..33f9a89eb3ad 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -12,6 +12,7 @@
 #include <linux/delay.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
+#include <linux/trace_controls.h>
 #include <linux/debug_locks.h>
 #include <linux/cpufeature.h>
 #include <asm/guarded_storage.h>
diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index 20b3cb29cfff..1da8fb61c09e 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -116,6 +116,7 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file);
 #endif
 
 #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)
+#include <linux/trace_controls.h>
 #define GEM_TRACE(...) trace_printk(__VA_ARGS__)
 #define GEM_TRACE_ERR(...) do {						\
 	pr_err(__VA_ARGS__);						\
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index c2e4b31b699a..d3f72dc430b8 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -324,6 +324,7 @@ static const struct sysrq_key_op sysrq_showstate_blocked_op = {
 };
 
 #ifdef CONFIG_TRACING
+#include <linux/trace_controls.h>
 #include <linux/ftrace.h>
 
 static void sysrq_ftrace_dump(u8 key)
diff --git a/include/linux/trace_controls.h b/include/linux/trace_controls.h
new file mode 100644
index 000000000000..995b97e963b4
--- /dev/null
+++ b/include/linux/trace_controls.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TRACE_CONTROLS_H
+#define _LINUX_TRACE_CONTROLS_H
+
+
+/*
+ * General tracing related utility functions -
+ * tracing_on/tracing_off and tracing_start()/tracing_stop
+ *
+ * Use tracing_on/tracing_off when you want to quickly turn on or off
+ * tracing. It simply enables or disables the recording of the trace events.
+ * This also corresponds to the user space /sys/kernel/tracing/tracing_on
+ * file, which gives a means for the kernel and userspace to interact.
+ * Place a tracing_off() in the kernel where you want tracing to end.
+ * From user space, examine the trace, and then echo 1 > tracing_on
+ * to continue tracing.
+ *
+ * tracing_stop/tracing_start has slightly more overhead. It is used
+ * by things like suspend to ram where disabling the recording of the
+ * trace is not enough, but tracing must actually stop because things
+ * like calling smp_processor_id() may crash the system.
+ *
+ * Most likely, you want to use tracing_on/tracing_off.
+ */
+enum ftrace_dump_mode {
+	DUMP_NONE,
+	DUMP_ALL,
+	DUMP_ORIG,
+	DUMP_PARAM,
+};
+
+#ifdef CONFIG_TRACING
+void tracing_on(void);
+void tracing_off(void);
+int tracing_is_on(void);
+void tracing_snapshot(void);
+void tracing_snapshot_alloc(void);
+void tracing_start(void);
+void tracing_stop(void);
+void trace_dump_stack(int skip);
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
+#else
+static inline void tracing_start(void) { }
+static inline void tracing_stop(void) { }
+static inline void tracing_on(void) { }
+static inline void tracing_off(void) { }
+static inline int tracing_is_on(void) { return 0; }
+static inline void tracing_snapshot(void) { }
+static inline void tracing_snapshot_alloc(void) { }
+static inline void trace_dump_stack(int skip) { }
+static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
+#endif
+
+#endif /* _LINUX_TRACE_CONTROLS_H */
diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
index 3d54f440dccf..a488ea9e9f85 100644
--- a/include/linux/trace_printk.h
+++ b/include/linux/trace_printk.h
@@ -7,43 +7,7 @@
 #include <linux/stddef.h>
 #include <linux/stringify.h>
 
-/*
- * General tracing related utility functions - trace_printk(),
- * tracing_on/tracing_off and tracing_start()/tracing_stop
- *
- * Use tracing_on/tracing_off when you want to quickly turn on or off
- * tracing. It simply enables or disables the recording of the trace events.
- * This also corresponds to the user space /sys/kernel/tracing/tracing_on
- * file, which gives a means for the kernel and userspace to interact.
- * Place a tracing_off() in the kernel where you want tracing to end.
- * From user space, examine the trace, and then echo 1 > tracing_on
- * to continue tracing.
- *
- * tracing_stop/tracing_start has slightly more overhead. It is used
- * by things like suspend to ram where disabling the recording of the
- * trace is not enough, but tracing must actually stop because things
- * like calling smp_processor_id() may crash the system.
- *
- * Most likely, you want to use tracing_on/tracing_off.
- */
-
-enum ftrace_dump_mode {
-	DUMP_NONE,
-	DUMP_ALL,
-	DUMP_ORIG,
-	DUMP_PARAM,
-};
-
 #ifdef CONFIG_TRACING
-void tracing_on(void);
-void tracing_off(void);
-int tracing_is_on(void);
-void tracing_snapshot(void);
-void tracing_snapshot_alloc(void);
-
-extern void tracing_start(void);
-extern void tracing_stop(void);
-
 static inline __printf(1, 2)
 void ____trace_printk_check_format(const char *fmt, ...)
 {
@@ -149,8 +113,6 @@ int __trace_printk(unsigned long ip, const char *fmt, ...);
 extern int __trace_bputs(unsigned long ip, const char *str);
 extern int __trace_puts(unsigned long ip, const char *str);
 
-extern void trace_dump_stack(int skip);
-
 /*
  * The double __builtin_constant_p is because gcc will give us an error
  * if we try to allocate the static variable to fmt if it is not a
@@ -173,19 +135,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
 
 extern __printf(2, 0) int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
-
-extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
 #else
-static inline void tracing_start(void) { }
-static inline void tracing_stop(void) { }
-static inline void trace_dump_stack(int skip) { }
-
-static inline void tracing_on(void) { }
-static inline void tracing_off(void) { }
-static inline int tracing_is_on(void) { return 0; }
-static inline void tracing_snapshot(void) { }
-static inline void tracing_snapshot_alloc(void) { }
-
 static inline __printf(1, 2)
 int trace_printk(const char *fmt, ...)
 {
@@ -196,7 +146,6 @@ ftrace_vprintk(const char *fmt, va_list ap)
 {
 	return 0;
 }
-static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #endif /* CONFIG_TRACING */
 
 #endif
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index b276504c1c6b..f9c83a470c98 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,7 @@
 
 #define pr_fmt(fmt) "KGDB: " fmt
 
+#include <linux/trace_controls.h>
 #include <linux/pid_namespace.h>
 #include <linux/clocksource.h>
 #include <linux/serial_core.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index 213725b612aa..1415e910371d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -9,6 +9,7 @@
  * This function is used through-out the kernel (including mm and fs)
  * to indicate a major problem.
  */
+#include <linux/trace_controls.h>
 #include <linux/debug_locks.h>
 #include <linux/sched/debug.h>
 #include <linux/interrupt.h>
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index fa6d30ce73d1..b3e2c8f25a4f 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -280,6 +280,8 @@ extern int rcu_cpu_stall_notifiers;
 
 #ifdef CONFIG_RCU_STALL_COMMON
 
+#include <linux/trace_controls.h>
+
 extern int rcu_cpu_stall_ftrace_dump;
 extern int rcu_cpu_stall_suppress;
 extern int rcu_cpu_stall_timeout;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 882a158ada7b..76bf0184b267 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -39,6 +39,7 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/trace_clock.h>
+#include <linux/trace_controls.h>
 #include <asm/byteorder.h>
 #include <linux/torture.h>
 #include <linux/vmalloc.h>
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 80fe152af1dd..2537c33ddd49 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,6 +22,7 @@
 #include <linux/ctype.h>
 #include <linux/once_lite.h>
 #include <linux/ftrace_regs.h>
+#include <linux/trace_controls.h>
 #include <linux/llist.h>
 
 #include "pid_list.h"
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e19c32f2a938..69cc39008c36 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -3,6 +3,7 @@
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/trace_clock.h>
+#include <linux/trace_controls.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace_benchmark.h"
diff --git a/lib/sys_info.c b/lib/sys_info.c
index f32a06ec9ed4..e3c9ca05601b 100644
--- a/lib/sys_info.c
+++ b/lib/sys_info.c
@@ -8,6 +8,7 @@
 #include <linux/ftrace.h>
 #include <linux/nmi.h>
 #include <linux/sched/debug.h>
+#include <linux/trace_controls.h>
 #include <linux/string.h>
 #include <linux/sysctl.h>
 
-- 
2.53.0



^ permalink raw reply related

* Re: [PATCH v6 8/8] x86/setup: prepend embedded bootconfig cmdline before parse_early_param
From: Masami Hiramatsu @ 2026-06-24  8:47 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Andrew Morton, Nathan Chancellor, paulmck, Nicolas Schier,
	Nick Desaulniers, Bill Wendling, Justin Stitt, Jonathan Corbet,
	Shuah Khan, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, llvm, linux-doc,
	kernel-team
In-Reply-To: <20260623-bootconfig_using_tools-v6-8-640c2f587a3c@debian.org>

On Tue, 23 Jun 2026 09:15:35 -0700
Breno Leitao <leitao@debian.org> wrote:

> Call xbc_prepend_embedded_cmdline() in setup_arch() right after the
> CONFIG_CMDLINE merge and before strscpy(command_line, ...) so the
> build-time-rendered embedded bootconfig "kernel" subtree is part of
> boot_command_line by the time parse_early_param() runs. early_param()
> handlers (mem=, earlycon=, loglevel=, ...) now see values supplied via
> CONFIG_BOOT_CONFIG_EMBED_FILE without parsing bootconfig at runtime.
> 
> Gate the prepend on the same opt-in the runtime parser uses: prepend
> when "bootconfig" is present on the command line, or when
> CONFIG_BOOT_CONFIG_FORCE is set. Detect it with parse_args(), exactly
> as setup_boot_config() does, so both agree on what counts as opt-in:
> any "bootconfig" key regardless of value (bare, =0, =1, ...), and only
> before the "--" that separates init arguments. Sharing the parser keeps
> the early and late paths from diverging -- e.g. "bootconfig=0" or a
> "-- bootconfig" meant for init must not apply the embedded keys early
> while the runtime parser skips them.
> 
> The prepend necessarily runs before setup_boot_config() detects an
> initrd bootconfig, so an initrd cannot override the embedded "kernel"
> keys for early_param(). This is intentional: the embedded cmdline acts
> like a build-time CONFIG_CMDLINE. An initrd bootconfig's "kernel" keys
> never reached early_param() anyway (they apply late via
> extra_command_line), so nothing is lost -- the initrd keys still apply
> late, with last-wins keeping the embedded values in effect.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  arch/x86/Kconfig        |  1 +
>  arch/x86/kernel/setup.c | 43 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 44 insertions(+)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 0de23e6471973..8ab11199c16d5 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -127,6 +127,7 @@ config X86
>  	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
>  	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096
>  	select ARCH_SUPPORTS_CFI		if X86_64
> +	select ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG
>  	select ARCH_USES_CFI_TRAPS		if X86_64 && CFI
>  	select ARCH_SUPPORTS_LTO_CLANG
>  	select ARCH_SUPPORTS_LTO_CLANG_THIN
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 46882ce79c3a4..c973a2cebcd04 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -6,6 +6,7 @@
>   * parts of early kernel initialization.
>   */
>  #include <linux/acpi.h>
> +#include <linux/bootconfig.h>
>  #include <linux/console.h>
>  #include <linux/cpu.h>
>  #include <linux/crash_dump.h>
> @@ -881,6 +882,37 @@ static void __init x86_report_nx(void)
>   * Note: On x86_64, fixmaps are ready for use even before this is called.
>   */
>  
> +#ifdef CONFIG_CMDLINE_FROM_BOOTCONFIG
> +static int __init bootconfig_optin(char *param, char *val,
> +				   const char *unused, void *arg)
> +{
> +	if (!strcmp(param, "bootconfig"))
> +		*(bool *)arg = true;
> +	return 0;
> +}
> +
> +/*
> + * Did the user opt in to bootconfig on the kernel command line? Use
> + * parse_args() so this matches setup_boot_config() exactly, including
> + * stopping at the "--" that separates init arguments.
> + */
> +static bool __init bootconfig_cmdline_requested(void)
> +{
> +	static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
> +	bool found = false;
> +
> +	if (IS_ENABLED(CONFIG_BOOT_CONFIG_FORCE))
> +		return true;
> +
> +	strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
> +	if (IS_ERR(parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0,
> +			      &found, bootconfig_optin)))
> +		return false;
> +
> +	return found;
> +}

It seems that this should be placed in a common place because it will
be used from other architectures (and init/main.c too). Maybe we can
introduce something like this?

bool __init bootconfig_cmdline_requested(const char *boot_cmdline, int *end_offset);

Thanks,

> +#endif
> +
>  void __init setup_arch(char **cmdline_p)
>  {
>  #ifdef CONFIG_X86_32
> @@ -924,6 +956,17 @@ void __init setup_arch(char **cmdline_p)
>  	builtin_cmdline_added = true;
>  #endif
>  
> +#ifdef CONFIG_CMDLINE_FROM_BOOTCONFIG
> +	/*
> +	 * Prepend the build-time-rendered embedded "kernel" keys here so
> +	 * parse_early_param() below sees them, gating on the same opt-in
> +	 * as the runtime parser (see bootconfig_cmdline_requested()).
> +	 */
> +	if (bootconfig_cmdline_requested())
> +		xbc_prepend_embedded_cmdline(boot_command_line,
> +					     COMMAND_LINE_SIZE);
> +#endif
> +
>  	strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
>  	*cmdline_p = command_line;
>  
> 
> -- 
> 2.53.0-Meta
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v6 6/8] Documentation: bootconfig: document build-time cmdline rendering
From: Masami Hiramatsu @ 2026-06-24  8:47 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Andrew Morton, Nathan Chancellor, paulmck, Nicolas Schier,
	Nick Desaulniers, Bill Wendling, Justin Stitt, Jonathan Corbet,
	Shuah Khan, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, llvm, linux-doc,
	kernel-team
In-Reply-To: <20260623-bootconfig_using_tools-v6-6-640c2f587a3c@debian.org>

On Tue, 23 Jun 2026 09:15:33 -0700
Breno Leitao <leitao@debian.org> wrote:

> Add a section describing CONFIG_CMDLINE_FROM_BOOTCONFIG: what it
> does (renders the embedded "kernel" subtree to a flat cmdline at
> build time so early_param() handlers see the values), what it
> requires (BOOT_CONFIG_EMBED, a non-empty BOOT_CONFIG_EMBED_FILE,
> and ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG -- currently x86 only),
> the bootconfig opt-in semantics, the initrd-vs-embedded precedence,
> and the soft-error overflow behavior.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  Documentation/admin-guide/bootconfig.rst | 81 ++++++++++++++++++++++++++++++++
>  1 file changed, 81 insertions(+)
> 
> diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
> index f712758472d5c..349cefbb2bbcd 100644
> --- a/Documentation/admin-guide/bootconfig.rst
> +++ b/Documentation/admin-guide/bootconfig.rst
> @@ -234,6 +234,87 @@ Kconfig option selected.
>  Note that even if you set this option, you can override the embedded
>  bootconfig by another bootconfig which attached to the initrd.
>  
> +Rendering Embedded kernel.* Keys at Build Time
> +----------------------------------------------
> +
> +By default, the embedded bootconfig (``CONFIG_BOOT_CONFIG_EMBED=y``) is
> +parsed at runtime, after ``parse_early_param()`` has already run. Early
> +parameter handlers (``mem=``, ``earlycon=``, ``loglevel=``, ...) therefore
> +cannot see values supplied via the embedded ``kernel`` subtree.
> +
> +``CONFIG_CMDLINE_FROM_BOOTCONFIG`` resolves this by rendering the
> +``kernel`` subtree of ``CONFIG_BOOT_CONFIG_EMBED_FILE`` into a flat cmdline
> +string at kernel build time (via ``tools/bootconfig -C``) and prepending
> +it to ``boot_command_line`` during early architecture setup, so the keys
> +are visible to ``parse_early_param()``.
> +
> +The option requires ``CONFIG_BOOT_CONFIG_EMBED=y``, a non-empty
> +``CONFIG_BOOT_CONFIG_EMBED_FILE``, and an architecture that selects
> +``CONFIG_ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG``. Currently only x86
> +selects it; on other architectures the embedded bootconfig still works,
> +but only through the late runtime parser.

As commented by Sashiko, here we need to mention that this option requires
CONFIG_CMDLINE to be empty. This means the user can NOT set both options
at once. (This also means the user doesn't have to worry about configuration
conflicts.)

Thanks,



-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v3 2/2] tracing: Remove trace_printk.h from kernel.h
From: David Laight @ 2026-06-24 10:11 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
	Mathieu Desnoyers, Andrew Morton, Linus Torvalds,
	Sebastian Andrzej Siewior, John Ogness, Thomas Gleixner,
	Peter Zijlstra, Julia Lawall, Yury Norov
In-Reply-To: <20260624081948.301578807@kernel.org>

On Wed, 24 Jun 2026 04:18:08 -0400
Steven Rostedt <rostedt@kernel.org> wrote:

> From: Steven Rostedt <rostedt@goodmis.org>
> 
> There have been complaints about trace_printk.h causing more build time
> for being in kernel.h. Move it out of kernel.h and place it in the headers
> and C files that use it.
> 
> Link: https://lore.kernel.org/all/CAHk-=wikCBeVFjVXiY4o-oepdbjAoir5+TcAgtL12c4u1TpZLQ@mail.gmail.com/

That is all about changes to the file causing everything to be rebuilt,
not the contents of the file slowing down builds.
The two are different.

The part you are moving out of normal builds is just a few #defines.
They won't have a significant effect on build times either.

So there is no point splitting out trace_controls.h.

	David



^ permalink raw reply

* Re: [RFC PATCH 1/3] mm/compaction: skip isolate mlocked folios when compact_unevictable_allowed=0
From: Wandun @ 2026-06-24 11:08 UTC (permalink / raw)
  To: Vlastimil Babka (SUSE), linux-mm, linux-kernel,
	linux-trace-kernel, linux-rt-devel
  Cc: akpm, surenb, mhocko, jackmanb, hannes, ziy, rostedt, mhiramat,
	mathieu.desnoyers, david, ljs, liam, rppt, bigeasy, clrkwllms,
	Alexander.Krabler, Hugh Dickins
In-Reply-To: <c8793c0f-7156-4cb7-9e6e-7909397e2fff@kernel.org>



On 6/22/26 17:55, Vlastimil Babka (SUSE) wrote:
> On 6/18/26 13:43, Wandun wrote:
>>
>>
>> On 6/18/26 02:52, Vlastimil Babka (SUSE) wrote:
>>> On 6/4/26 04:38, Wandun Chen wrote:
>>>> From: Wandun Chen <chenwandun@lixiang.com>
>>>>
>>>> compact_unevictable_allowed is default 0 under PREEMPT_RT,
>>>> isolate_migratepages_block() skips folios with PG_unevictable set.
>>>> However, mlock_folio() sets PG_mlocked immediately but defers
>>>> PG_unevictable to mlock_folio_batch(), result in a folio with
>>>> PG_mlocked=1 but PG_unevictable=0. Compaction will isolate such a
>>>> folio.
>>>>
>>>> Fix by checking folio_test_mlocked() together with the existing
>>>> folio_test_unevictable() check.
>>>>
>>>> A similar issue has been reported by Alexander Krabler on a 6.12-rt
>>>> aarch64 system. Vlastimil suggested to check the mlocked flag [1].
>>>>
>>>> Reported-by: Alexander Krabler <Alexander.Krabler@kuka.com>
>>>> Closes: https://lore.kernel.org/all/DU0PR01MB10385345F7153F334100981888259A@DU0PR01MB10385.eurprd01.prod.exchangelabs.com/
>>>> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>>>> Signed-off-by: Wandun Chen <chenwandun@lixiang.com>
>>>> Link: https://lore.kernel.org/all/33275585-f2db-4779-89f0-3ae24b455a67@suse.cz/ [1]
>>>
>>> Well in that thread, Hugh doubted my suggestion and then it seems we didn't
>>> concluded anything. Did you actually in practice observe the issue that
>>> Alexander had, and that this patch fixed it, or is that theoretical?
>>>
>> Yes, I wrote a test case that can reproduce it in a few second.
>>
>> The test case contains 3 steps:
>> 1. mlockall
>> 2. mmap file(2GB) + trigger file write page fault;
>> 3. during step 1, trigger compact via /proc/sys/vm/compact_memory
>>
>>
>> My reproduction environment is qemu with 4GB ram, 8 core, aarch64,
>> preempt_rt and includes the tracepoint in patch 02.
>> After running the reproduction program for a few seconds, the
>> following output appears.
> 
> Ah, nice.
> 
>> repro-403     [004] ....1   101.270505: mm_compaction_isolate_folio: pfn=0x71e3a mode=0x0 flags=referenced|uptodate|mlocked
>> repro-403     [004] ....1   101.270507: mm_compaction_isolate_folio: pfn=0x71e3b mode=0x0 flags=referenced|uptodate|mlocked
>> repro-403     [004] ....1   101.270513: mm_compaction_isolate_folio: pfn=0x71e3c mode=0x0 flags=referenced|uptodate|mlocked
>> repro-403     [004] ....1   101.270515: mm_compaction_isolate_folio: pfn=0x71e3d mode=0x0 flags=uptodate|mlocked
>> repro-403     [004] ....1   101.270517: mm_compaction_isolate_folio: pfn=0x71e3e mode=0x0 flags=uptodate|mlocked
>> repro-403     [004] ....1   101.270520: mm_compaction_isolate_folio: pfn=0x71e3f mode=0x0 flags=uptodate|mlocked
>>
>>
>> Unfortunately, I recently found that there is still a bug in the
>> fix patch. Setting mlocked in the mlock_folio function could happen
>> even after the page is successfully isolated, so it still cannot
>> prevent migration. Because of this, I need to think more about how
>> to fix it.
>>
>> Perhaps we should double-check whether the page is mlocked during
>> the actual migration phase.
> 
> So IIUC the isolation+migration might be started between the folio is
> allocated, and mlocked? In that case the check during migration could still

Yes, in that case it would still be racy; checking page flags is not a good idea.

> be racy, and if the page is isolated, it's already bad for the RT process.

IIUC, more accurately, it is the migration entry in the page table that is
really bad for the RT process: isolating a page doesn't modify the page table,
so memory access continues as usual. That led me to a new idea.

S1. In the mlock[all] syscall, if mlock_vma_pages_range hits a migration entry,
    then it should wait for the migration to complete.

S2. During the unmap phase of memory migration, prevent a page from being unmapped
    if the page's associated vma is marked with VM_LOCKED, similar to how reclaim is
    disabled for pages in a VM_LOCKED vma (try_to_unmap_one).


For a page handled during the mlock[all] syscall:
  - if migration has already finished, there is nothing to do;
  - if migration is in progress and the migration entry is already filled, we
    wait (S1);
  - if the page is in flight, about to be isolated/migrated, S2 prevents the unmap.

For a page handled during a page fault: VM_LOCKED is already set on the vma,
so S2 guarantees it will not be unmapped, hence no migration entry.


Thanks a lot for the detailed feedback, Vlastimil.

Best regards,
Wandun


> 
> So this would only be a short-term problem after the mlockall, but we don't
> have a way for the RT process to know the moment it's all settled, right?

Yes, some pages may already have been isolated and will still be migrated.

> Probably the proper solution would be for mlock[all]() itself to wait for an
> isolated page, and only continue once it knows it can't be isolated anymore.
> This might howver would go against some of the folio batching optimizations?
> 
>> What do you think of this best-effort approach?
>>
>>
>> Best regards,
>> Wandun
>>
>>
>>
>>
>>
>> The full reproducer is as below:
>>
>> /* gcc repro.c -o repro -lpthread */
>>
>> #define _GNU_SOURCE
>> #include <fcntl.h>
>> #include <pthread.h>
>> #include <stdio.h>
>> #include <stdlib.h>
>> #include <sys/mman.h>
>> #include <unistd.h>
>>
>> #define PAGE_SIZE       4096
>> #define NR_PAGES        32
>> #define FILE_SIZE       (2ULL * 1024 * 1024 * 1024)
>>
>> static void *worker_fn(void *arg)
>> {
>> 	int fd = (long)arg;
>> 	size_t len = (size_t)FILE_SIZE;
>> 	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>> 	if (p == MAP_FAILED)
>> 		return NULL;
>>
>> 	for (size_t off = 0; off + NR_PAGES * PAGE_SIZE <= len;
>> 	     off += NR_PAGES * PAGE_SIZE) {
>> 		for (int i = 0; i < NR_PAGES; i++)
>> 			p[off + i * PAGE_SIZE] = 1;
>> 		usleep(200);
>> 	}
>>
>> 	munmap(p, len);
>> 	return NULL;
>> }
>>
>> static void *compact_fn(void *arg)
>> {
>> 	(void)arg;
>> 	int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);
>> 	if (fd < 0)
>> 		return NULL;
>>
>> 	while (1) {
>> 		if (write(fd, "1", 1) < 0) {}
>> 		usleep(5000);
>> 	}
>> }
>>
>> int main(void)
>> {
>> 	mlockall(MCL_CURRENT | MCL_FUTURE);
>>
>> 	int fd = open("./repro_largefile.dat", O_RDWR | O_CREAT, 0600);
>> 	if (fd < 0)
>> 		return 1;
>> 	unlink("./repro_largefile.dat");
>> 	if (ftruncate(fd, (off_t)FILE_SIZE) < 0)
>> 		return 1;
>>
>> 	printf("repro_largefile: 1 worker, %d pages/batch, Ctrl-C to stop\n",
>> 	       NR_PAGES);
>>
>> 	pthread_t compact, worker;
>> 	pthread_create(&compact, NULL, compact_fn, NULL);
>> 	pthread_create(&worker, NULL, worker_fn, (void *)(long)fd);
>>
>> 	pthread_join(worker, NULL);
>> 	return 0;
>> }
>>
>>>> ---
>>>>  mm/compaction.c | 3 ++-
>>>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/mm/compaction.c b/mm/compaction.c
>>>> index b776f35ad020..7e07b792bcb5 100644
>>>> --- a/mm/compaction.c
>>>> +++ b/mm/compaction.c
>>>> @@ -1116,7 +1116,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
>>>>  		is_unevictable = folio_test_unevictable(folio);
>>>>  
>>>>  		/* Compaction might skip unevictable pages but CMA takes them */
>>>> -		if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
>>>> +		if (!(mode & ISOLATE_UNEVICTABLE) &&
>>>> +		    (is_unevictable || folio_test_mlocked(folio)))
>>>>  			goto isolate_fail_put;
>>>>  
>>>>  		/*
>>>
>>
> 


^ permalink raw reply

* [PATCH 6.6.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-24 12:22 UTC (permalink / raw)
  To: stable
  Cc: Bjoern Doebel, Steven Rostedt, Masami Hiramatsu,
	linux-trace-kernel, linux-kernel, Mathieu Desnoyers,
	David Howells

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally would initialize the
iterator and call an RCU synchronization, but this was for each individual
per CPU buffer where this would get called many times on a machine with
many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start into ring_buffer_read_prepare(),
ring_buffer_read_prepare_sync() and then ring_buffer_read_start() to allow
each of the per CPU buffers to be prepared, call
ring_buffer_read_prepare_sync() once, and then the ring_buffer_read_start()
for each of the CPUs which made things much faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and put back the interface for the ring buffer
iterator back before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Assisted-by: Kiro:claude-opus-4.8
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 67 ++++++-------------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 18 insertions(+), 75 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index ded528d23f85..382fbaa701f9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -129,9 +129,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 508edf1f3f1e..52c7dbccafed 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5084,28 +5084,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -5130,51 +5122,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6b35666a4e0b..f57baf67726d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4792,21 +4792,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..628c25693cef 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* [PATCH 6.1.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-24 12:23 UTC (permalink / raw)
  To: stable
  Cc: Bjoern Doebel, Steven Rostedt, Masami Hiramatsu,
	linux-trace-kernel, linux-kernel, Mathieu Desnoyers,
	David Howells

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally would initialize the
iterator and call an RCU synchronization, but this was for each individual
per CPU buffer where this would get called many times on a machine with
many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start into ring_buffer_read_prepare(),
ring_buffer_read_prepare_sync() and then ring_buffer_read_start() to allow
each of the per CPU buffers to be prepared, call
ring_buffer_read_prepare_sync() once, and then the ring_buffer_read_start()
for each of the CPUs which made things much faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and put back the interface for the ring buffer
iterator back before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Assisted-by: Kiro:claude-opus-4.8
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 67 ++++++-------------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 18 insertions(+), 75 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 3e7bfc0f65ae..b53335ed2d0e 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -130,9 +130,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d3a31ba7c710..5edc4126d0c6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5082,28 +5082,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -5128,51 +5120,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 25f31d7718c6..5ef1c79dc5c9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4819,21 +4819,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..628c25693cef 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* [PATCH 5.15.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-24 12:23 UTC (permalink / raw)
  To: stable
  Cc: Bjoern Doebel, Steven Rostedt, Masami Hiramatsu,
	linux-trace-kernel, linux-kernel, Mathieu Desnoyers,
	David Howells

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally would initialize the
iterator and call an RCU synchronization, but this was done for each
individual per CPU buffer, so it would get called many times on a machine
with many CPUs before the trace file could be read. The commit
72c9ddfd4c5bf ("ring-buffer: Make non-consuming read less expensive with
lots of cpus.") separated ring_buffer_read_start() into
ring_buffer_read_prepare(), ring_buffer_read_prepare_sync() and then
ring_buffer_read_start() to allow each of the per CPU buffers to be
prepared, call ring_buffer_read_prepare_sync() once, and then call
ring_buffer_read_start() for each of the CPUs, which made things much
faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that had been required to wait for all the buffers to
have no more writers. It's now OK for the buffers to have writers, so no
synchronization is needed.

Remove the synchronization and restore the ring buffer iterator interface
to what it was before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Assisted-by: Kiro:claude-opus-4.8
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 67 ++++++-------------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 18 insertions(+), 75 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 3e7bfc0f65ae..b53335ed2d0e 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -130,9 +130,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e44115db0efe..770dc7c60656 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5037,28 +5037,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -5083,51 +5075,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 537360be8e4e..1a29a9d9e868 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4803,21 +4803,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..628c25693cef 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox