Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v3 1/8] dma-debug: Allow multiple invocations of overlapping entries
From: Leon Romanovsky @ 2026-03-16 19:06 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm
In-Reply-To: <20260316-dma-debug-overlap-v3-0-1dde90a7f08b@nvidia.com>

From: Leon Romanovsky <leonro@nvidia.com>

Repeated DMA mappings with DMA_ATTR_CPU_CACHE_CLEAN trigger the
following splat. This prevents using the attribute in cases where a DMA
region is shared and reused more than seven times.

 ------------[ cut here ]------------
 DMA-API: exceeded 7 overlapping mappings of cacheline 0x000000000438c440
 WARNING: kernel/dma/debug.c:467 at add_dma_entry+0x219/0x280, CPU#4: ibv_rc_pingpong/1644
 Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_fwctl zram zsmalloc mlx5_ib fuse rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_core ib_core
 CPU: 4 UID: 2733 PID: 1644 Comm: ibv_rc_pingpong Not tainted 6.19.0+ #129 PREEMPT
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
 RIP: 0010:add_dma_entry+0x221/0x280
 Code: c0 0f 84 f2 fe ff ff 83 e8 01 89 05 6d 99 11 01 e9 e4 fe ff ff 0f 8e 1f ff ff ff 48 8d 3d 07 ef 2d 01 be 07 00 00 00 48 89 e2 <67> 48 0f b9 3a e9 06 ff ff ff 48 c7 c7 98 05 2b 82 c6 05 72 92 28
 RSP: 0018:ff1100010e657970 EFLAGS: 00010002
 RAX: 0000000000000007 RBX: ff1100010234eb00 RCX: 0000000000000000
 RDX: ff1100010e657970 RSI: 0000000000000007 RDI: ffffffff82678660
 RBP: 000000000438c440 R08: 0000000000000228 R09: 0000000000000000
 R10: 00000000000001be R11: 000000000000089d R12: 0000000000000800
 R13: 00000000ffffffef R14: 0000000000000202 R15: ff1100010234eb00
 FS:  00007fb15f3f6740(0000) GS:ff110008dcc19000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007fb15f32d3a0 CR3: 0000000116f59001 CR4: 0000000000373eb0
 Call Trace:
  <TASK>
  debug_dma_map_sg+0x1b4/0x390
  __dma_map_sg_attrs+0x6d/0x1a0
  dma_map_sgtable+0x19/0x30
  ib_umem_get+0x284/0x3b0 [ib_uverbs]
  mlx5_ib_reg_user_mr+0x68/0x2a0 [mlx5_ib]
  ib_uverbs_reg_mr+0x17f/0x2a0 [ib_uverbs]
  ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xc2/0x130 [ib_uverbs]
  ib_uverbs_cmd_verbs+0xa0b/0xae0 [ib_uverbs]
  ? ib_uverbs_handler_UVERBS_METHOD_QUERY_PORT_SPEED+0xe0/0xe0 [ib_uverbs]
  ? mmap_region+0x7a/0xb0
  ? do_mmap+0x3b8/0x5c0
  ib_uverbs_ioctl+0xa7/0x110 [ib_uverbs]
  __x64_sys_ioctl+0x14f/0x8b0
  ? ksys_mmap_pgoff+0xc5/0x190
  do_syscall_64+0x8c/0xbf0
  entry_SYSCALL_64_after_hwframe+0x4b/0x53
 RIP: 0033:0x7fb15f5e4eed
 Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
 RSP: 002b:00007ffe09a5c540 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
 RAX: ffffffffffffffda RBX: 00007ffe09a5c5d0 RCX: 00007fb15f5e4eed
 RDX: 00007ffe09a5c5f0 RSI: 00000000c0181b01 RDI: 0000000000000003
 RBP: 00007ffe09a5c590 R08: 0000000000000028 R09: 00007ffe09a5c794
 R10: 0000000000000001 R11: 0000000000000246 R12: 00007ffe09a5c794
 R13: 000000000000000c R14: 0000000025a49170 R15: 000000000000000c
  </TASK>
 ---[ end trace 0000000000000000 ]---

Fixes: 61868dc55a11 ("dma-mapping: add DMA_ATTR_CPU_CACHE_CLEAN")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 kernel/dma/debug.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 86f87e43438c3..be207be749968 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -453,7 +453,7 @@ static int active_cacheline_set_overlap(phys_addr_t cln, int overlap)
 	return overlap;
 }
 
-static void active_cacheline_inc_overlap(phys_addr_t cln)
+static void active_cacheline_inc_overlap(phys_addr_t cln, bool is_cache_clean)
 {
 	int overlap = active_cacheline_read_overlap(cln);
 
@@ -462,7 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
 	/* If we overflowed the overlap counter then we're potentially
 	 * leaking dma-mappings.
 	 */
-	WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
+	WARN_ONCE(!is_cache_clean && overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
 		  pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
 		  ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
 }
@@ -495,7 +495,7 @@ static int active_cacheline_insert(struct dma_debug_entry *entry,
 	if (rc == -EEXIST) {
 		struct dma_debug_entry *existing;
 
-		active_cacheline_inc_overlap(cln);
+		active_cacheline_inc_overlap(cln, entry->is_cache_clean);
 		existing = radix_tree_lookup(&dma_active_cacheline, cln);
 		/* A lookup failure here after we got -EEXIST is unexpected. */
 		WARN_ON(!existing);

-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 0/8] RDMA: Enable operation with DMA debug enabled
From: Leon Romanovsky @ 2026-03-16 19:06 UTC (permalink / raw)
  To: Marek Szyprowski, Robin Murphy, Michael S. Tsirkin, Petr Tesarik,
	Jonathan Corbet, Shuah Khan, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, Jason Gunthorpe, Leon Romanovsky,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Joerg Roedel,
	Will Deacon, Andrew Morton
  Cc: iommu, linux-kernel, linux-doc, virtualization, linux-rdma,
	linux-trace-kernel, linux-mm

Add a new DMA_ATTR_REQUIRE_COHERENT attribute to the DMA API to mark
mappings that must run on a DMA‑coherent system. Such buffers cannot
use the SWIOTLB path, may overlap with CPU caches, and do not depend on
explicit cache flushing.

Mappings using this attribute are rejected on systems where cache
side‑effects could lead to data corruption, and therefore do not need
the cache‑overlap debugging logic. This series also includes fixes for
DMA_ATTR_CPU_CACHE_CLEAN handling.
Thanks.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
Changes in v3:
- Enriched commit messages and documentation
- Added ROB tags
- Link to v2: https://patch.msgid.link/20260311-dma-debug-overlap-v2-0-e00bc2ca346d@nvidia.com

Changes in v2:
- Added DMA_ATTR_REQUIRE_COHERENT attribute
- Added HMM patch which needs this attribute as well
- Renamed DMA_ATTR_CPU_CACHE_CLEAN to be DMA_ATTR_DEBUGGING_IGNORE_CACHELINES
- Link to v1: https://patch.msgid.link/20260307-dma-debug-overlap-v1-0-c034c38872af@nvidia.com

---
Leon Romanovsky (8):
      dma-debug: Allow multiple invocations of overlapping entries
      dma-mapping: handle DMA_ATTR_CPU_CACHE_CLEAN in trace output
      dma-mapping: Clarify valid conditions for CPU cache line overlap
      dma-mapping: Introduce DMA require coherency attribute
      dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set
      iommu/dma: add support for DMA_ATTR_REQUIRE_COHERENT attribute
      RDMA/umem: Tell DMA mapping that UMEM requires coherency
      mm/hmm: Indicate that HMM requires DMA coherency

 Documentation/core-api/dma-attributes.rst | 38 ++++++++++++++++++++++++-------
 drivers/infiniband/core/umem.c            |  5 ++--
 drivers/iommu/dma-iommu.c                 | 21 +++++++++++++----
 drivers/virtio/virtio_ring.c              | 10 ++++----
 include/linux/dma-mapping.h               | 15 ++++++++----
 include/trace/events/dma.h                |  4 +++-
 kernel/dma/debug.c                        |  9 ++++----
 kernel/dma/direct.h                       |  7 +++---
 kernel/dma/mapping.c                      |  6 +++++
 mm/hmm.c                                  |  4 ++--
 10 files changed, 86 insertions(+), 33 deletions(-)
---
base-commit: 11439c4635edd669ae435eec308f4ab8a0804808
change-id: 20260305-dma-debug-overlap-21487c3fa02c

Best regards,
--  
Leon Romanovsky <leonro@nvidia.com>


^ permalink raw reply

* Re: [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift
From: Wesley Atwell @ 2026-03-16 18:03 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, davem, kuba, edumazet, ncardwell, linux-kernel, linux-api,
	linux-doc, linux-kselftest, linux-trace-kernel, mptcp, dsahern,
	horms, kuniyu, andrew+netdev, willemdebruijn.kernel, jasowang,
	skhan, corbet, matttbe, martineau, geliang, rostedt, mhiramat,
	mathieu.desnoyers, 0x7f454c46
In-Reply-To: <e93ce797-4153-4e6e-89b6-3839a0b8bca2@redhat.com>

Hi Paolo,

The link I meant is the netdev regression discussion that led to:

026dfef287c0 ("tcp: give up on stronger sk_rcvbuf checks (for now)")

The report/discussion thread is here:

https://lore.kernel.org/20260225122355.585fd57b@kernel.org

The revert posting itself is here:

https://patch.msgid.link/20260227003359.2391017-1-kuba@kernel.org

I should not have implied that there was a separate prior
regression report beyond that thread.

Thanks for the note on formatting. I will keep follow-ups in plaintext
and avoid top-posting.

Thanks,
Wesley

^ permalink raw reply

* Re: [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift
From: Paolo Abeni @ 2026-03-16 17:47 UTC (permalink / raw)
  To: Wesley Atwell
  Cc: netdev, davem, kuba, edumazet, ncardwell, linux-kernel, linux-api,
	linux-doc, linux-kselftest, linux-trace-kernel, mptcp, dsahern,
	horms, kuniyu, andrew+netdev, willemdebruijn.kernel, jasowang,
	skhan, corbet, matttbe, martineau, geliang, rostedt, mhiramat,
	mathieu.desnoyers, 0x7f454c46
In-Reply-To: <CAN=sVvyNpkyok_bt8eQSmqc4f7g7QoZBUmRmNRLoFz1HasEzMA@mail.gmail.com>

Hi,

On 3/16/26 6:29 PM, Wesley Atwell wrote:
> The strongest real anchor here is the already documented regression
> around sender-visible rwnd diverging from hard receive-memory backing,
> rather than a general receive-accounting cleanup.

I likely missed some of the prior discussion. Could you please share a
pointer/link to the mentioned regression report?

When posting on netdev please:
- use plaintext only messages
- avoid top-posting

Thanks,

Paolo


^ permalink raw reply

* Re: [PATCH v4 0/5] mm: zone lock tracepoint instrumentation
From: Dmitry Ilvokhin @ 2026-03-16 17:40 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Matthew Wilcox, Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Masami Hiramatsu, Mathieu Desnoyers, Rafael J. Wysocki,
	Pavel Machek, Len Brown, Brendan Jackman, Johannes Weiner, Zi Yan,
	Oscar Salvador, Qi Zheng, Shakeel Butt, linux-kernel, linux-mm,
	linux-trace-kernel, linux-pm
In-Reply-To: <20260309171700.063318b5@gandalf.local.home>

Thanks for the discussion and for the feedback on generic lock
instrumentation.

Following up on the earlier points about lightweight LOCK_STAT and
tracepoints.

While the thread has focused on spin_lock() calls, the real gap is on
the unlock path: there is currently no release-side tracepoint to
correlate holders and waiters or measure contended hold times.

A possible generic solution is a trace_contended_release() for spin
locks, for example:

    if (trace_contended_release_enabled() &&
        atomic_read(&lock->val) & ~_Q_LOCKED_MASK)
        trace_contended_release(lock);

This might work on x86, but could increase code size and regress
performance on arches where spin_unlock() is inlined, such as arm64
under !PREEMPTION.

So even a generic release-side tracepoint has nontrivial downsides (not
to mention lightweight LOCK_STAT, since a disabled tracepoint is about
as lightweight as it gets, the more stats we add, the less lightweight
the mechanism becomes).

The zone lock wrappers provide a practical, production-safe solution to
observe this specific high-contention lock, if the generic approach
proves impractical.

^ permalink raw reply

* [PATCH v2] tracing: Fix failure to read user space from system call trace events
From: Steven Rostedt @ 2026-03-16 17:07 UTC (permalink / raw)
  To: LKML, Linux Trace Kernel; +Cc: Masami Hiramatsu, Mathieu Desnoyers

From: Steven Rostedt <rostedt@goodmis.org>

The system call trace events call trace_user_fault_read() to read the user
space part of some system calls. This is done by grabbing a per-cpu
buffer, disabling migration, enabling preemption, calling
copy_from_user(), disabling preemption, enabling migration and checking if
the task was preempted while preemption was enabled. If it was, the buffer
is considered corrupted and it tries again.

There's a safety mechanism that will fail out of this loop if it fails 100
times (with a warning). That warning message was triggered in some
pi_futex stress tests. Enabling the sched_switch trace event and
traceoff_on_warning, showed the problem:

 pi_mutex_hammer-1375    [006] d..21   138.981648: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981651: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981656: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981659: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981664: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981667: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981671: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981675: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981679: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981682: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981687: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981690: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981695: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981698: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981703: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981706: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981711: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981714: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981719: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981722: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981727: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981730: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981735: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981738: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95

What happened was the task 1375 was flagged to be migrated. When
preemption was enabled, the migration thread woke up to migrate that task,
but failed because migration for that task was disabled. This caused the
loop to fail to exit because the task scheduled out while trying to read
user space.

Every time the task enabled preemption the migration thread would schedule
in, try to migrate the task, fail and let the task continue. But because
the loop would only enable preemption with migration disabled, it would
always fail because each time it enabled preemption to read user space,
the migration thread would try to migrate it.

To solve this, when the loop fails to read user space without being
scheduled out, enabled and disable preemption with migration enabled. This
will allow the migration task to successfully migrate the task and the
next loop should succeed to read user space without being scheduled out.

Cc: stable@vger.kernel.org
Fixes: 64cf7d058a005 ("tracing: Have trace_marker use per-cpu data to read user space")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v1: https://patch.msgid.link/20260303120404.1824b894@gandalf.local.home

- Removed extra whitespace at end of comment line.

 kernel/trace/trace.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ebd996f8710e..bb4a62f4b953 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6783,6 +6783,23 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
 	 */
 
 	do {
+		/*
+		 * It is possible that something is trying to migrate this
+		 * task. What happens then, is when preemption is enabled,
+		 * the migration thread will preempt this task, try to
+		 * migrate it, fail, then let it run again. That will
+		 * cause this to loop again and never succeed.
+		 * On failures, enabled and disable preemption with
+		 * migration enabled, to allow the migration thread to
+		 * migrate this task.
+		 */
+		if (trys) {
+			preempt_enable_notrace();
+			preempt_disable_notrace();
+			cpu = smp_processor_id();
+			buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+		}
+
 		/*
 		 * If for some reason, copy_from_user() always causes a context
 		 * switch, this would then cause an infinite loop.
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH v8 0/3] mm: vmscan: add PID and cgroup ID to vmscan tracepoints
From: Andrew Morton @ 2026-03-16 16:46 UTC (permalink / raw)
  To: Thomas Ballasi
  Cc: axelrasmussen, david, hannes, linux-mm, linux-trace-kernel,
	lorenzo.stoakes, mhiramat, mhocko, rostedt, shakeel.butt, weixugc,
	yuanchu, zhengqi.arch
In-Reply-To: <20260316160908.42727-1-tballasi@linux.microsoft.com>

On Mon, 16 Mar 2026 09:09:05 -0700 Thomas Ballasi <tballasi@linux.microsoft.com> wrote:

> Changes in v8:
> - Removed in_task() mention in PID commit message
> - Moved __entry->pid to __entry->ent.pid
> 

Below is how this update altered mm.git.

It's a minor change, so I'll retain Shakeel's Acked-by and Reviewed-by
on the first two patches.


--- a/include/trace/events/vmscan.h~b
+++ a/include/trace/events/vmscan.h
@@ -132,20 +132,18 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_rec
 		__field(	unsigned long,	gfp_flags	)
 		__field(	u64,	memcg_id	)
 		__field(	int,	order		)
-		__field(	int,	pid		)
 	),
 
 	TP_fast_assign(
 		__entry->gfp_flags	= (__force unsigned long)gfp_flags;
 		__entry->order		= order;
-		__entry->pid		= current->pid;
 		__entry->memcg_id	= mem_cgroup_id(memcg);
 	),
 
 	TP_printk("order=%d gfp_flags=%s pid=%d memcg_id=%llu %s",
 		__entry->order,
 		show_gfp_flags(__entry->gfp_flags),
-		__entry->pid,
+		__entry->ent.pid,
 		__entry->memcg_id,
 		__event_in_irq() ? "(in-irq)" : "")
 );
@@ -182,18 +180,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_rec
 	TP_STRUCT__entry(
 		__field(	unsigned long,	nr_reclaimed	)
 		__field(	u64,	memcg_id	)
-		__field(	int,	pid		)
 	),
 
 	TP_fast_assign(
 		__entry->nr_reclaimed	= nr_reclaimed;
 		__entry->memcg_id	= mem_cgroup_id(memcg);
-		__entry->pid		= current->pid;
 	),
 
 	TP_printk("nr_reclaimed=%lu pid=%d memcg_id=%llu %s",
 		__entry->nr_reclaimed,
-		__entry->pid,
+		__entry->ent.pid,
 		__entry->memcg_id,
 		__event_in_irq() ? "(in-irq)" : "")
 );
@@ -238,10 +234,9 @@ TRACE_EVENT(mm_shrink_slab_start,
 		__field(unsigned long, cache_items)
 		__field(unsigned long long, delta)
 		__field(unsigned long, total_scan)
-		__field(u64, memcg_id)
 		__field(int, priority)
 		__field(int, nid)
-		__field(int, pid)
+		__field(u64, memcg_id)
 	),
 
 	TP_fast_assign(
@@ -255,14 +250,13 @@ TRACE_EVENT(mm_shrink_slab_start,
 		__entry->priority = priority;
 		__entry->nid = sc->nid;
 		__entry->memcg_id = mem_cgroup_id(memcg);
-		__entry->pid = current->pid;
 	),
 
 	TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d %s",
 		__entry->shrink,
 		__entry->shr,
 		__entry->nid,
-		__entry->pid,
+		__entry->ent.pid,
 		__entry->memcg_id,
 		__entry->nr_objects_to_shrink,
 		show_gfp_flags(__entry->gfp_flags),
@@ -288,7 +282,6 @@ TRACE_EVENT(mm_shrink_slab_end,
 		__field(long, total_scan)
 		__field(int, nid)
 		__field(int, retval)
-		__field(int, pid)
 		__field(u64, memcg_id)
 	),
 
@@ -300,7 +293,6 @@ TRACE_EVENT(mm_shrink_slab_end,
 		__entry->total_scan = total_scan;
 		__entry->nid = nid;
 		__entry->retval = shrinker_retval;
-		__entry->pid = current->pid;
 		__entry->memcg_id = mem_cgroup_id(memcg);
 	),
 
@@ -308,7 +300,7 @@ TRACE_EVENT(mm_shrink_slab_end,
 		__entry->shrink,
 		__entry->shr,
 		__entry->nid,
-		__entry->pid,
+		__entry->ent.pid,
 		__entry->memcg_id,
 		__entry->unused_scan,
 		__entry->new_scan,
_


^ permalink raw reply

* [PATCH v8 3/3] mm: vmscan: add PIDs to vmscan tracepoints
From: Thomas Ballasi @ 2026-03-16 16:09 UTC (permalink / raw)
  To: tballasi
  Cc: akpm, axelrasmussen, david, hannes, linux-mm, linux-trace-kernel,
	lorenzo.stoakes, mhiramat, mhocko, rostedt, shakeel.butt, weixugc,
	yuanchu, zhengqi.arch
In-Reply-To: <20260316160908.42727-1-tballasi@linux.microsoft.com>

The changes aims at adding additionnal tracepoints variables to help
debuggers attribute them to specific processes.

Signed-off-by: Thomas Ballasi <tballasi@linux.microsoft.com>
---
 include/trace/events/vmscan.h | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 1212f6a7c223e..645b036572707 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -130,10 +130,12 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
 		__entry->memcg_id	= mem_cgroup_id(memcg);
 	),
 
-	TP_printk("order=%d gfp_flags=%s memcg_id=%llu",
+	TP_printk("order=%d gfp_flags=%s pid=%d memcg_id=%llu %s",
 		__entry->order,
 		show_gfp_flags(__entry->gfp_flags),
-		__entry->memcg_id)
+		__entry->ent.pid,
+		__entry->memcg_id,
+		__event_in_irq() ? "(in-irq)" : "")
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
@@ -175,9 +177,11 @@ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
 		__entry->memcg_id	= mem_cgroup_id(memcg);
 	),
 
-	TP_printk("nr_reclaimed=%lu memcg_id=%llu",
+	TP_printk("nr_reclaimed=%lu pid=%d memcg_id=%llu %s",
 		__entry->nr_reclaimed,
-		__entry->memcg_id)
+		__entry->ent.pid,
+		__entry->memcg_id,
+		__event_in_irq() ? "(in-irq)" : "")
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end,
@@ -238,17 +242,19 @@ TRACE_EVENT(mm_shrink_slab_start,
 		__entry->memcg_id = mem_cgroup_id(memcg);
 	),
 
-	TP_printk("%pS %p: nid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
+	TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d %s",
 		__entry->shrink,
 		__entry->shr,
 		__entry->nid,
+		__entry->ent.pid,
 		__entry->memcg_id,
 		__entry->nr_objects_to_shrink,
 		show_gfp_flags(__entry->gfp_flags),
 		__entry->cache_items,
 		__entry->delta,
 		__entry->total_scan,
-		__entry->priority)
+		__entry->priority,
+		__event_in_irq() ? "(in-irq)" : "")
 );
 
 TRACE_EVENT(mm_shrink_slab_end,
@@ -280,15 +286,17 @@ TRACE_EVENT(mm_shrink_slab_end,
 		__entry->memcg_id = mem_cgroup_id(memcg);
 	),
 
-	TP_printk("%pS %p: nid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
+	TP_printk("%pS %p: nid: %d pid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d %s",
 		__entry->shrink,
 		__entry->shr,
 		__entry->nid,
+		__entry->ent.pid,
 		__entry->memcg_id,
 		__entry->unused_scan,
 		__entry->new_scan,
 		__entry->total_scan,
-		__entry->retval)
+		__entry->retval,
+		__event_in_irq() ? "(in-irq)" : "")
 );
 
 TRACE_EVENT(mm_vmscan_lru_isolate,
-- 
2.45.3


^ permalink raw reply related

* [PATCH v8 2/3] mm: vmscan: add cgroup IDs to vmscan tracepoints
From: Thomas Ballasi @ 2026-03-16 16:09 UTC (permalink / raw)
  To: tballasi
  Cc: akpm, axelrasmussen, david, hannes, linux-mm, linux-trace-kernel,
	lorenzo.stoakes, mhiramat, mhocko, rostedt, shakeel.butt, weixugc,
	yuanchu, zhengqi.arch
In-Reply-To: <20260316160908.42727-1-tballasi@linux.microsoft.com>

Memory reclaim events are currently difficult to attribute to
specific cgroups, making debugging memory pressure issues
challenging.  This patch adds memory cgroup ID (memcg_id) to key
vmscan tracepoints to enable better correlation and analysis.

For operations not associated with a specific cgroup, the field
is defaulted to 0.

Signed-off-by: Thomas Ballasi <tballasi@linux.microsoft.com>
---
 include/trace/events/vmscan.h | 83 ++++++++++++++++++++---------------
 mm/shrinker.c                 |  6 ++-
 mm/vmscan.c                   | 17 +++----
 3 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 490958fa10dee..1212f6a7c223e 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -114,85 +114,92 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 
 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
 
-	TP_PROTO(int order, gfp_t gfp_flags),
+	TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg),
 
-	TP_ARGS(order, gfp_flags),
+	TP_ARGS(gfp_flags, order, memcg),
 
 	TP_STRUCT__entry(
-		__field(	int,	order		)
 		__field(	unsigned long,	gfp_flags	)
+		__field(	u64,	memcg_id	)
+		__field(	int,	order		)
 	),
 
 	TP_fast_assign(
-		__entry->order		= order;
 		__entry->gfp_flags	= (__force unsigned long)gfp_flags;
+		__entry->order		= order;
+		__entry->memcg_id	= mem_cgroup_id(memcg);
 	),
 
-	TP_printk("order=%d gfp_flags=%s",
+	TP_printk("order=%d gfp_flags=%s memcg_id=%llu",
 		__entry->order,
-		show_gfp_flags(__entry->gfp_flags))
+		show_gfp_flags(__entry->gfp_flags),
+		__entry->memcg_id)
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
 
-	TP_PROTO(int order, gfp_t gfp_flags),
+	TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg),
 
-	TP_ARGS(order, gfp_flags)
+	TP_ARGS(gfp_flags, order, memcg)
 );
 
 #ifdef CONFIG_MEMCG
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
 
-	TP_PROTO(int order, gfp_t gfp_flags),
+	TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg),
 
-	TP_ARGS(order, gfp_flags)
+	TP_ARGS(gfp_flags, order, memcg)
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
 
-	TP_PROTO(int order, gfp_t gfp_flags),
+	TP_PROTO(gfp_t gfp_flags, int order, struct mem_cgroup *memcg),
 
-	TP_ARGS(order, gfp_flags)
+	TP_ARGS(gfp_flags, order, memcg)
 );
 #endif /* CONFIG_MEMCG */
 
 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
 
-	TP_PROTO(unsigned long nr_reclaimed),
+	TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg),
 
-	TP_ARGS(nr_reclaimed),
+	TP_ARGS(nr_reclaimed, memcg),
 
 	TP_STRUCT__entry(
 		__field(	unsigned long,	nr_reclaimed	)
+		__field(	u64,	memcg_id	)
 	),
 
 	TP_fast_assign(
 		__entry->nr_reclaimed	= nr_reclaimed;
+		__entry->memcg_id	= mem_cgroup_id(memcg);
 	),
 
-	TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed)
+	TP_printk("nr_reclaimed=%lu memcg_id=%llu",
+		__entry->nr_reclaimed,
+		__entry->memcg_id)
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end,
 
-	TP_PROTO(unsigned long nr_reclaimed),
+	TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg),
 
-	TP_ARGS(nr_reclaimed)
+	TP_ARGS(nr_reclaimed, memcg)
 );
 
 #ifdef CONFIG_MEMCG
 DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end,
 
-	TP_PROTO(unsigned long nr_reclaimed),
+	TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg),
 
-	TP_ARGS(nr_reclaimed)
+	TP_ARGS(nr_reclaimed, memcg)
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end,
 
-	TP_PROTO(unsigned long nr_reclaimed),
+	TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg),
 
-	TP_ARGS(nr_reclaimed)
+	TP_ARGS(nr_reclaimed, memcg)
 );
 #endif /* CONFIG_MEMCG */
 
@@ -200,39 +207,42 @@ TRACE_EVENT(mm_shrink_slab_start,
 	TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
 		long nr_objects_to_shrink, unsigned long cache_items,
 		unsigned long long delta, unsigned long total_scan,
-		int priority),
+		int priority, struct mem_cgroup *memcg),
 
 	TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan,
-		priority),
+		priority, memcg),
 
 	TP_STRUCT__entry(
 		__field(struct shrinker *, shr)
 		__field(void *, shrink)
-		__field(int, nid)
 		__field(long, nr_objects_to_shrink)
 		__field(unsigned long, gfp_flags)
 		__field(unsigned long, cache_items)
 		__field(unsigned long long, delta)
 		__field(unsigned long, total_scan)
 		__field(int, priority)
+		__field(int, nid)
+		__field(u64, memcg_id)
 	),
 
 	TP_fast_assign(
 		__entry->shr = shr;
 		__entry->shrink = shr->scan_objects;
-		__entry->nid = sc->nid;
 		__entry->nr_objects_to_shrink = nr_objects_to_shrink;
 		__entry->gfp_flags = (__force unsigned long)sc->gfp_mask;
 		__entry->cache_items = cache_items;
 		__entry->delta = delta;
 		__entry->total_scan = total_scan;
 		__entry->priority = priority;
+		__entry->nid = sc->nid;
+		__entry->memcg_id = mem_cgroup_id(memcg);
 	),
 
-	TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
+	TP_printk("%pS %p: nid: %d memcg_id: %llu objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
 		__entry->shrink,
 		__entry->shr,
 		__entry->nid,
+		__entry->memcg_id,
 		__entry->nr_objects_to_shrink,
 		show_gfp_flags(__entry->gfp_flags),
 		__entry->cache_items,
@@ -243,35 +253,38 @@ TRACE_EVENT(mm_shrink_slab_start,
 
 TRACE_EVENT(mm_shrink_slab_end,
 	TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval,
-		long unused_scan_cnt, long new_scan_cnt, long total_scan),
+		long unused_scan_cnt, long new_scan_cnt, long total_scan, struct mem_cgroup *memcg),
 
 	TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt,
-		total_scan),
+		total_scan, memcg),
 
 	TP_STRUCT__entry(
 		__field(struct shrinker *, shr)
-		__field(int, nid)
 		__field(void *, shrink)
 		__field(long, unused_scan)
 		__field(long, new_scan)
-		__field(int, retval)
 		__field(long, total_scan)
+		__field(int, nid)
+		__field(int, retval)
+		__field(u64, memcg_id)
 	),
 
 	TP_fast_assign(
 		__entry->shr = shr;
-		__entry->nid = nid;
 		__entry->shrink = shr->scan_objects;
 		__entry->unused_scan = unused_scan_cnt;
 		__entry->new_scan = new_scan_cnt;
-		__entry->retval = shrinker_retval;
 		__entry->total_scan = total_scan;
+		__entry->nid = nid;
+		__entry->retval = shrinker_retval;
+		__entry->memcg_id = mem_cgroup_id(memcg);
 	),
 
-	TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
+	TP_printk("%pS %p: nid: %d memcg_id: %llu unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
 		__entry->shrink,
 		__entry->shr,
 		__entry->nid,
+		__entry->memcg_id,
 		__entry->unused_scan,
 		__entry->new_scan,
 		__entry->total_scan,
@@ -504,9 +517,9 @@ TRACE_EVENT(mm_vmscan_node_reclaim_begin,
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end,
 
-	TP_PROTO(unsigned long nr_reclaimed),
+	TP_PROTO(unsigned long nr_reclaimed, struct mem_cgroup *memcg),
 
-	TP_ARGS(nr_reclaimed)
+	TP_ARGS(nr_reclaimed, memcg)
 );
 
 TRACE_EVENT(mm_vmscan_throttled,
diff --git a/mm/shrinker.c b/mm/shrinker.c
index 4a93fd433689a..ddf784f996a59 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -410,7 +410,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	total_scan = min(total_scan, (2 * freeable));
 
 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-				   freeable, delta, total_scan, priority);
+				   freeable, delta, total_scan, priority,
+				   shrinkctl->memcg);
 
 	/*
 	 * Normally, we should not scan less than batch_size objects in one
@@ -461,7 +462,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	 */
 	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
 
-	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
+	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan,
+				 shrinkctl->memcg);
 	return freed;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 05d9354a59c65..b3117814ec436 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6652,11 +6652,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		return 1;
 
 	set_task_reclaim_state(current, &sc.reclaim_state);
-	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
+	trace_mm_vmscan_direct_reclaim_begin(sc.gfp_mask, order, 0);
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
-	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed, 0);
 	set_task_reclaim_state(current, NULL);
 
 	return nr_reclaimed;
@@ -6685,8 +6685,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
-	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
-						      sc.gfp_mask);
+	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.gfp_mask,
+						      sc.order,
+						      memcg);
 
 	/*
 	 * NOTE: Although we can get the priority field, using it
@@ -6697,7 +6698,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	 */
 	shrink_lruvec(lruvec, &sc);
 
-	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed, memcg);
 
 	*nr_scanned = sc.nr_scanned;
 
@@ -6733,13 +6734,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 
 	set_task_reclaim_state(current, &sc.reclaim_state);
-	trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
+	trace_mm_vmscan_memcg_reclaim_begin(sc.gfp_mask, 0, memcg);
 	noreclaim_flag = memalloc_noreclaim_save();
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	memalloc_noreclaim_restore(noreclaim_flag);
-	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed, memcg);
 	set_task_reclaim_state(current, NULL);
 
 	return nr_reclaimed;
@@ -7685,7 +7686,7 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
 	delayacct_freepages_end();
 	psi_memstall_leave(&pflags);
 
-	trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed);
+	trace_mm_vmscan_node_reclaim_end(sc->nr_reclaimed, 0);
 
 	return sc->nr_reclaimed;
 }
-- 
2.45.3


^ permalink raw reply related

* [PATCH v8 1/3] tracing: Add __event_in_*irq() helpers
From: Thomas Ballasi @ 2026-03-16 16:09 UTC (permalink / raw)
  To: tballasi
  Cc: akpm, axelrasmussen, david, hannes, linux-mm, linux-trace-kernel,
	lorenzo.stoakes, mhiramat, mhocko, rostedt, shakeel.butt, weixugc,
	yuanchu, zhengqi.arch
In-Reply-To: <20260316160908.42727-1-tballasi@linux.microsoft.com>

From: Steven Rostedt <rostedt@goodmis.org>

Some trace events want to expose in their output if they were triggered in
an interrupt or softirq context. Instead of recording this in the event
structure itself, as this information is stored in the flags portion of
the event header, add helper macros that can be used in the print format:

  TP_printk("val=%d %s", __entry->val, __event_in_irq() ? "(in-irq)" : "")

This will output "(in-irq)" for the event in the trace data if the event
was triggered in hard or soft interrupt context.

Link: https://lore.kernel.org/all/20251229132942.31a2b583@gandalf.local.home/

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Thomas Ballasi <tballasi@linux.microsoft.com>
---
 include/trace/stages/stage3_trace_output.h |  8 ++++++++
 include/trace/stages/stage7_class_define.h | 19 +++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h
index 1e7b0bef95f52..53a23988a3b8a 100644
--- a/include/trace/stages/stage3_trace_output.h
+++ b/include/trace/stages/stage3_trace_output.h
@@ -150,3 +150,11 @@
 
 #undef __get_buf
 #define __get_buf(len)		trace_seq_acquire(p, (len))
+
+#undef __event_in_hardirq
+#undef __event_in_softirq
+#undef __event_in_irq
+
+#define __event_in_hardirq()	(__entry->ent.flags & TRACE_FLAG_HARDIRQ)
+#define __event_in_softirq()	(__entry->ent.flags & TRACE_FLAG_SOFTIRQ)
+#define __event_in_irq()	(__entry->ent.flags & (TRACE_FLAG_HARDIRQ | TRACE_FLAG_SOFTIRQ))
diff --git a/include/trace/stages/stage7_class_define.h b/include/trace/stages/stage7_class_define.h
index fcd564a590f43..47008897a7956 100644
--- a/include/trace/stages/stage7_class_define.h
+++ b/include/trace/stages/stage7_class_define.h
@@ -26,6 +26,25 @@
 #undef __print_hex_dump
 #undef __get_buf
 
+#undef __event_in_hardirq
+#undef __event_in_softirq
+#undef __event_in_irq
+
+/*
+ * The TRACE_FLAG_* are enums. Instead of using TRACE_DEFINE_ENUM(),
+ * use their hardcoded values. These values are parsed by user space
+ * tooling elsewhere so they will never change.
+ *
+ * See "enum trace_flag_type" in linux/trace_events.h:
+ *   TRACE_FLAG_HARDIRQ
+ *   TRACE_FLAG_SOFTIRQ
+ */
+
+/* This is what is displayed in the format files */
+#define __event_in_hardirq()	(REC->common_flags & 0x8)
+#define __event_in_softirq()	(REC->common_flags & 0x10)
+#define __event_in_irq()	(REC->common_flags & 0x18)
+
 /*
  * The below is not executed in the kernel. It is only what is
  * displayed in the print format for userspace to parse.
-- 
2.45.3


^ permalink raw reply related

* [PATCH v8 0/3] mm: vmscan: add PID and cgroup ID to vmscan tracepoints
From: Thomas Ballasi @ 2026-03-16 16:09 UTC (permalink / raw)
  To: tballasi
  Cc: akpm, axelrasmussen, david, hannes, linux-mm, linux-trace-kernel,
	lorenzo.stoakes, mhiramat, mhocko, rostedt, shakeel.butt, weixugc,
	yuanchu, zhengqi.arch
In-Reply-To: <20260223171544.4750-1-tballasi@linux.microsoft.com>

Changes in v8:
- Removed in_task() mention in PID commit message
- Moved __entry->pid to __entry->ent.pid

Link to v7:
https://lore.kernel.org/linux-trace-kernel/20260223171544.4750-1-tballasi@linux.microsoft.com/

Signed-off-by: Thomas Ballasi <tballasi@linux.microsoft.com>

Steven Rostedt (1):
  tracing: Add __event_in_*irq() helpers

Thomas Ballasi (2):
  mm: vmscan: add cgroup IDs to vmscan tracepoints
  mm: vmscan: add PIDs to vmscan tracepoints

 include/trace/events/vmscan.h              | 95 +++++++++++++---------
 include/trace/stages/stage3_trace_output.h |  8 ++
 include/trace/stages/stage7_class_define.h | 19 +++++
 mm/shrinker.c                              |  6 +-
 mm/vmscan.c                                | 17 ++--
 5 files changed, 98 insertions(+), 47 deletions(-)

-- 
2.45.3


^ permalink raw reply

* Re: [PATCH v2 2/2] locking: Add contended_release tracepoint
From: Dmitry Ilvokhin @ 2026-03-16 15:32 UTC (permalink / raw)
  To: Usama Arif
  Cc: Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Peter Zijlstra, Ingo Molnar,
	Will Deacon, Boqun Feng, Waiman Long, linux-mm, linux-kernel,
	linux-trace-kernel, kernel-team
In-Reply-To: <20260312113815.2107882-1-usama.arif@linux.dev>

On Thu, Mar 12, 2026 at 04:38:14AM -0700, Usama Arif wrote:
> On Tue, 10 Mar 2026 17:49:39 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:
> 
> > Add the contended_release trace event. This tracepoint fires on the
> > holder side when a contended lock is released, complementing the
> > existing contention_begin/contention_end tracepoints which fire on the
> > waiter side.
> > 
> > This enables correlating lock hold time under contention with waiter
> > events by lock address.
> > 
> > Add trace_contended_release() calls to the slowpath unlock paths of
> > sleepable locks: mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and
> > RT-specific rwbase locks. Each call site fires only when there are
> > blocked waiters being woken, except percpu_up_write() which always wakes
> > via __wake_up().
> > 
> > Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
> > ---
> >  include/trace/events/lock.h   | 17 +++++++++++++++++
> >  kernel/locking/mutex.c        |  1 +
> >  kernel/locking/percpu-rwsem.c |  3 +++
> >  kernel/locking/rtmutex.c      |  1 +
> >  kernel/locking/rwbase_rt.c    |  8 +++++++-
> >  kernel/locking/rwsem.c        |  9 +++++++--
> >  kernel/locking/semaphore.c    |  4 +++-
> >  7 files changed, 39 insertions(+), 4 deletions(-)
> > 

[...]

> > diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
> > index f3ee7a0d6047..1eee51766aaf 100644
> > --- a/kernel/locking/percpu-rwsem.c
> > +++ b/kernel/locking/percpu-rwsem.c
> > @@ -263,6 +263,8 @@ void percpu_up_write(struct percpu_rw_semaphore *sem)
> >  {
> >  	rwsem_release(&sem->dep_map, _RET_IP_);
> >  
> > +	trace_contended_release(sem);
> > +
> 
> Hello!
> 
> I saw that you mentioned in the commmit message that you do this for only
> blocked waiters except for percpu_up_write(). We can use
> waitqueue_active(&sem->waiters) to check for this over here so that
> its consistent with every other call?

Thanks for the feedback, Usama.

I thought about it and even mentioned in the comment, but I forgot what
was the reason. Now, I think you are correct. I added wq_has_sleeper()
locally instead of waitqueue_active() locally, since we are not holding
the lock here and waitqueue_active() requires a barrier based on the
comment. It might be not very important here, but I'd rather make it
correct even for tracepoint.

Note that __percpu_up_read() doesn't need this guard. Maybe I was
thinking at __percpu_up_read() part before and just made it symmetric.

Anyway, thanks for suggestion.

> 
> 
> >  	/*
> >  	 * Signal the writer is done, no fast path yet.
> >  	 *
> > @@ -297,6 +299,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
> >  	 * writer.
> >  	 */
> >  	smp_mb(); /* B matches C */
> > +	trace_contended_release(sem);
> 
> Should we do this after this_cpu_dec(*sem->read_count)?

Good point. I moved it after this_cpu_dec() so the tracepoint fires
after the lock is released but before rcuwait_wake_up(). I also went
through all other call sites and made the placement consistent where
possible: after release, before wake. It should be fixed in v3.

^ permalink raw reply

* Re: [PATCH v3 17/17] tools/bootconfig: fix fd leak in load_xbc_file() on fstat failure
From: Josh Law @ 2026-03-16 15:14 UTC (permalink / raw)
  To: Markus Elfring, linux-trace-kernel, Andrew Morton,
	Masami Hiramatsu; +Cc: LKML
In-Reply-To: <ecb34439-8695-4562-a9e7-9fac74323adb@web.de>



On 16 March 2026 12:15:08 GMT, Markus Elfring <Markus.Elfring@web.de> wrote:
>> If fstat() fails after open() succeeds, load_xbc_file() returns
>> -errno without closing the file descriptor.  Add the missing close()
>> call on the error path.
>
>https://elixir.bootlin.com/linux/v7.0-rc3/source/tools/bootconfig/main.c#L139-L153
>
>How do you think about to use a corresponding goto chain?
>
>
>Would you like to  add any tags (like “Fixes” and “Cc”) accordingly?
>https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v7.0-rc4#n145
>
>Regards,
>Markus


Hello markus, i submitted a V6 (yes a v6) that responds to all that, and includes extra patches to fix some warnings under W123 (maximum GCC warning level)

Thanks.


V/R


Josh Law

^ permalink raw reply

* Re: [PATCH v3] tracing: Generate undef symbols allowlist for simple_ring_buffer
From: Steven Rostedt @ 2026-03-16 14:09 UTC (permalink / raw)
  To: Vincent Donnefort
  Cc: maz, arnd, nathan, linux-trace-kernel, kvmarm, kernel-team
In-Reply-To: <20260316092845.3367411-1-vdonnefort@google.com>

On Mon, 16 Mar 2026 09:28:45 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> Compiler and tooling-generated symbols are difficult to maintain
> across all supported architectures. Make the allowlist more robust by
> replacing the harcoded list with a mechanism that automatically detects
> these symbols.
> 
> This mechanism generates a C function designed to trigger common
> compiler-inserted symbols.
> 
> Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
> Reviewed-by: Nathan Chancellor <nathan@kernel.org>
> Tested-by: Nathan Chancellor <nathan@kernel.org>

I take it that Marc will take this?

Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve

^ permalink raw reply

* Re: [PATCH V2] tracing: Revert "tracing: Remove pid in task_rename tracing output"
From: Steven Rostedt @ 2026-03-16 13:54 UTC (permalink / raw)
  To: Xuewen Yan
  Cc: Xuewen Yan, mhiramat, mathieu.desnoyers, elver, kees,
	lorenzo.stoakes, brauner, schuster.simon, david, linux-kernel,
	linux-trace-kernel, guohua.yan, ke.wang, jing.xia
In-Reply-To: <CAB8ipk8M=Uij65LZCbSAjJPPPsXWYY4kGBQwMm1WqJHHEb33eQ@mail.gmail.com>

On Mon, 16 Mar 2026 10:00:13 +0800
Xuewen Yan <xuewen.yan94@gmail.com> wrote:

> Unless there are any further comments, could you please help to take
> this through the tracing tree?

I guess this file doesn't really have an owner, so yeah, I'll take it.

-- Steve

^ permalink raw reply

* Re: [PATCH 50/61] iommu: Prefer IS_ERR_OR_NULL over manual NULL check
From: Robin Murphy @ 2026-03-16 13:30 UTC (permalink / raw)
  To: Philipp Hahn, amd-gfx, apparmor, bpf, ceph-devel, cocci, dm-devel,
	dri-devel, gfs2, intel-gfx, intel-wired-lan, iommu, kvm,
	linux-arm-kernel, linux-block, linux-bluetooth, linux-btrfs,
	linux-cifs, linux-clk, linux-erofs, linux-ext4, linux-fsdevel,
	linux-gpio, linux-hyperv, linux-input, linux-kernel, linux-leds,
	linux-media, linux-mips, linux-mm, linux-modules, linux-mtd,
	linux-nfs, linux-omap, linux-phy, linux-pm, linux-rockchip,
	linux-s390, linux-scsi, linux-sctp, linux-security-module,
	linux-sh, linux-sound, linux-stm32, linux-trace-kernel, linux-usb,
	linux-wireless, netdev, ntfs3, samba-technical, sched-ext,
	target-devel, tipc-discussion, v9fs
  Cc: Joerg Roedel, Will Deacon
In-Reply-To: <20260310-b4-is_err_or_null-v1-50-bd63b656022d@avm.de>

On 2026-03-10 11:49 am, Philipp Hahn wrote:
> Prefer using IS_ERR_OR_NULL() over using IS_ERR() and a manual NULL
> check.

AFAICS it doesn't look possible for the argument to be anything other 
than valid at both callsites, so *both* conditions here seem in fact to 
be entirely redundant.

> Change generated with coccinelle.

Please use coccinelle responsibly. Mechanical changes are great for 
scripted API updates, but for cleanup, whilst it's ideal for *finding* 
areas of code that are worth looking at, the code then wants actually 
looking at, in its whole context, because meaningful cleanup often goes 
deeper than trivial replacement.

In particular, anywhere IS_ERR_OR_NULL() is genuinely relevant is 
usually a sign of bad interface design, so if you're looking at this 
then you really should be looking first and foremost to remove any 
checks that are already unnecessary, and for the remainder, to see if 
the thing being checked can be improved to not mix the two different 
styles. That would be constructive and (usually) welcome cleanup. Simply 
churning a bunch of code with this ugly macro that's arguably less 
readable than what it replaces, not so much.

Thanks,
Robin.

> To: Joerg Roedel <joro@8bytes.org>
> To: Will Deacon <will@kernel.org>
> To: Robin Murphy <robin.murphy@arm.com>
> Cc: iommu@lists.linux.dev
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Philipp Hahn <phahn-oss@avm.de>
> ---
>   drivers/iommu/omap-iommu.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
> index 8231d7d6bb6a9202025643639a6b28e6faa84659..500a42b57a997696ff37c76f028a717ab71d01f9 100644
> --- a/drivers/iommu/omap-iommu.c
> +++ b/drivers/iommu/omap-iommu.c
> @@ -881,7 +881,7 @@ static int omap_iommu_attach(struct omap_iommu *obj, u32 *iopgd)
>    **/
>   static void omap_iommu_detach(struct omap_iommu *obj)
>   {
> -	if (!obj || IS_ERR(obj))
> +	if (IS_ERR_OR_NULL(obj))
>   		return;
>   
>   	spin_lock(&obj->iommu_lock);
> 

^ permalink raw reply

* Re: [PATCH v3 17/17] tools/bootconfig: fix fd leak in load_xbc_file() on fstat failure
From: Markus Elfring @ 2026-03-16 12:15 UTC (permalink / raw)
  To: Josh Law, linux-trace-kernel, Andrew Morton, Masami Hiramatsu; +Cc: LKML
In-Reply-To: <20260314223425.142966-18-objecting@objecting.org>

> If fstat() fails after open() succeeds, load_xbc_file() returns
> -errno without closing the file descriptor.  Add the missing close()
> call on the error path.

https://elixir.bootlin.com/linux/v7.0-rc3/source/tools/bootconfig/main.c#L139-L153

How do you think about to use a corresponding goto chain?

Would you like to  add any tags (like “Fixes” and “Cc”) accordingly?
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v7.0-rc4#n145

Regards,
Markus

^ permalink raw reply

* Re: [PATCH net-next v2 07/14] tcp: honor the maximum advertised window after live retraction
From: Paolo Abeni @ 2026-03-16 11:44 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-8-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> +/* Sender-visible window rescue does not relax hard receive-memory admission.
> + * If growth did not make room, fall back to the established prune/collapse
> + * path.
> + */
>  static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
>  				 unsigned int size)
>  {
> -	if (!tcp_can_ingest(sk, skb) ||
> -	    !sk_rmem_schedule(sk, skb, size)) {
> +	bool can_ingest = tcp_can_ingest(sk, skb);
> +	bool scheduled = can_ingest && sk_rmem_schedule(sk, skb, size);
> +
> +	if (!scheduled) {
> +		int pruned = tcp_prune_queue(sk, skb);
>  
> -		if (tcp_prune_queue(sk, skb) < 0)
> +		if (pruned < 0)
>  			return -1;
>  
>  		while (!sk_rmem_schedule(sk, skb, size)) {
> -			if (!tcp_prune_ofo_queue(sk, skb))
> +			bool pruned_ofo = tcp_prune_ofo_queue(sk, skb);
> +
> +			if (!pruned_ofo)
>  				return -1;
>  		}
>  	}

The above chunk is AFAICS pure noise. Please have a more careful local
review of this series before any next revision.

/P


^ permalink raw reply

* Re: [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
From: Paolo Abeni @ 2026-03-16 11:31 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-6-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 57a2a6daaad3..53781cf591d2 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk)
>  	 * scaled window will not line up with the MSS boundary anyway.
>  	 */
>  	if (tp->rx_opt.rcv_wscale) {
> +		int rcv_wscale = 1 << tp->rx_opt.rcv_wscale;
> +
>  		window = free_space;
>  
>  		/* Advertise enough space so that it won't get scaled away.
> -		 * Import case: prevent zero window announcement if
> +		 * Important case: prevent zero-window announcement if
>  		 * 1<<rcv_wscale > mss.
>  		 */
> -		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
> +		window = ALIGN(window, rcv_wscale);
> +
> +		/* Back any scale-quantization slack before we expose it.
> +		 * Otherwise tcp_can_ingest() can reject data which is still
> +		 * within the sender-visible window.
> +		 */
> +		if (window > free_space &&
> +		    (!tcp_rcvbuf_grow_allowed(sk) ||
> +		     !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window))))
> +			window = round_down(free_space, rcv_wscale);

It looks like this can cause the advertised window to shrink even if we
are in the 'do not allow window to shrink' branch.

Also why the other branch (shrinking allowed) is not touched?

/P


^ permalink raw reply

* Re: [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
From: Paolo Abeni @ 2026-03-16 11:24 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-6-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> From: Wesley Atwell <atwellwea@gmail.com>
> 
> Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
> more sender-visible window than the current hard receive-memory backing
> can cover.
> 
> The new helper keeps backlog and memory-pressure limits in the same
> units as the rest of the receive path, while __tcp_select_window()
> backs any rounding slack before advertising it.
> 
> Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
> ---
>  include/net/tcp.h     | 12 ++++++++++++
>  net/ipv4/tcp_input.c  | 36 ++++++++++++++++++++++++++++++++++--
>  net/ipv4/tcp_output.c | 15 +++++++++++++--
>  3 files changed, 59 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index fc22ab6b80d5..5b479ad44f89 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
>  enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
>  void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
>  void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
>  void tcp_rcv_space_adjust(struct sock *sk);
>  int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
>  void tcp_twsk_destructor(struct sock *sk);
> @@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
>  	return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
>  }
>  
> +/* Passive children clone the listener's sk_socket until accept() grafts
> + * their own struct socket, 

AFAICS, the above statement is false, see sk_set_socket() in sk_clone()

> so only sockets that point back to themselves
> + * should autotune receive-buffer backing.
> + */
> +static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
> +{
> +	struct socket *sock = READ_ONCE(sk->sk_socket);

You can just check `sk->sk_socket`. Also you could re-use this helper in
tcp_data_queue_ofo().

/P


^ permalink raw reply

* Re: [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift
From: Paolo Abeni @ 2026-03-16 11:09 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> From: Wesley Atwell <atwellwea@gmail.com>
> 
> This series keeps sender-visible TCP receive-window accounting tied to the
> scaling basis that was in force when the window was advertised, even if
> later receive-side truesize inflation lowers scaling_ratio or the live
> receive window retracts below the largest right edge already exposed to the
> sender.
> 
> After the receive-window retraction changes, the receive path needs to keep
> track of two related pieces of sender-visible state:
> 
>   1. the live advertised receive window
>   2. the maximum advertised right edge and the basis it was exposed with
> 
> This repost snapshots both, uses them to repair receive-buffer backing when
> ratio drift would otherwise strand sender-visible space, extends
> TCP_REPAIR_WINDOW so repair/restore can round-trip the new state, and adds
> truesize-drift coverage through TUN packetdrill tests and netdevsim-based
> selftests.

The series is IMHO significantly not trivial. Can the end-user meet the
relevant condition in practice? How? What is the net benefit in
practice? Is that observable under usual conditions or require
exceptional circumstances?

I think we need a strong motivation to merge this kind of changes.

/P


^ permalink raw reply

* Re: [PATCH v3] tracing: Generate undef symbols allowlist for simple_ring_buffer
From: Arnd Bergmann @ 2026-03-16 11:06 UTC (permalink / raw)
  To: Vincent Donnefort, Marc Zyngier
  Cc: Steven Rostedt, Nathan Chancellor, linux-trace-kernel, kvmarm,
	kernel-team
In-Reply-To: <20260316092845.3367411-1-vdonnefort@google.com>

On Mon, Mar 16, 2026, at 10:28, Vincent Donnefort wrote:
> Compiler and tooling-generated symbols are difficult to maintain
> across all supported architectures. Make the allowlist more robust by
> replacing the harcoded list with a mechanism that automatically detects
> these symbols.
>
> This mechanism generates a C function designed to trigger common
> compiler-inserted symbols.
>
> Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
> Reviewed-by: Nathan Chancellor <nathan@kernel.org>
> Tested-by: Nathan Chancellor <nathan@kernel.org>

Added to my randconfig build setup now, I'll let you know if
any regressions remain.

      Arnd

^ permalink raw reply

* Re: [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
From: Paolo Abeni @ 2026-03-16 11:04 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-6-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> From: Wesley Atwell <atwellwea@gmail.com>
> 
> Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
> more sender-visible window than the current hard receive-memory backing
> can cover.
> 
> The new helper keeps backlog and memory-pressure limits in the same
> units as the rest of the receive path, while __tcp_select_window()
> backs any rounding slack before advertising it.
> 
> Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
> ---
>  include/net/tcp.h     | 12 ++++++++++++
>  net/ipv4/tcp_input.c  | 36 ++++++++++++++++++++++++++++++++++--
>  net/ipv4/tcp_output.c | 15 +++++++++++++--
>  3 files changed, 59 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index fc22ab6b80d5..5b479ad44f89 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
>  enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
>  void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
>  void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
>  void tcp_rcv_space_adjust(struct sock *sk);
>  int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
>  void tcp_twsk_destructor(struct sock *sk);
> @@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
>  	return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
>  }
>  
> +/* Passive children clone the listener's sk_socket until accept() grafts
> + * their own struct socket, so only sockets that point back to themselves
> + * should autotune receive-buffer backing.
> + */
> +static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
> +{
> +	struct socket *sock = READ_ONCE(sk->sk_socket);
> +
> +	return sock && READ_ONCE(sock->sk) == sk;

This is executed under the sk socket lock, ONCE annotation not needed.

> +}
> +
>  /* Note: caller must be prepared to deal with negative returns */
>  static inline int tcp_space(const struct sock *sk)
>  {
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 352f814a4ff6..32256519a085 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk)
>  				    (u32)TCP_INIT_CWND * tp->advmss);
>  }
>  
> +/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed
> + * bytes beyond sk_rmem_alloc while preserving sender-visible headroom
> + * already consumed by sk_backlog.len.
> + */
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
> +{
> +	struct net *net = sock_net(sk);
> +	int backlog;
> +	int rmem2;
> +	int target;
> +
> +	needed = max(needed, 0);
> +	backlog = READ_ONCE(sk->sk_backlog.len);
> +	target = tcp_rmem_used(sk) + backlog + needed;
> +
> +	if (target <= READ_ONCE(sk->sk_rcvbuf))
> +		return true;
> +
> +	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
> +	if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
> +	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
> +	    tcp_under_memory_pressure(sk) ||
> +	    sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
> +		return false;
> +
> +	WRITE_ONCE(sk->sk_rcvbuf,
> +		   min_t(int, rmem2,
> +			 max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
> +
> +	return target <= READ_ONCE(sk->sk_rcvbuf);

Same here, and more cases below.

/P


^ permalink raw reply

* Re: [PATCH v2] tracing: Generate undef symbols allowlist for simple_ring_buffer
From: Vincent Donnefort @ 2026-03-16  9:31 UTC (permalink / raw)
  To: Nathan Chancellor
  Cc: maz, rostedt, arnd, linux-trace-kernel, kvmarm, kernel-team
In-Reply-To: <20260313163724.GA2573924@ax162>

On Fri, Mar 13, 2026 at 09:37:24AM -0700, Nathan Chancellor wrote:
> On Fri, Mar 13, 2026 at 10:58:29AM +0000, Vincent Donnefort wrote:
> > Compiler and tooling-generated symbols are difficult to maintain
> > across all supported architectures. Make the allowlist more robust by
> > replacing the harcoded list with a mechanism that automatically detects
> > these symbols.
> > 
> > This mechanism generates a C function designed to trigger common
> > compiler-inserted symbols.
> > 
> > Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
> > 
> > ---
> > 
> > Changes in v2:
> > 
> >   - Use filechk (Nathan)
> >   - Removed deprecated extra-y (Nathan)
> >   - Added simple_ring_buffer in allowlist (Nathan)
> >   - Added memcpy() to generate more symbols (Nathan)
> >   - Added __sancov 
> > 
> > diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> > index beb15936829d..96627a909ecc 100644
> > --- a/kernel/trace/Makefile
> > +++ b/kernel/trace/Makefile
> > @@ -136,17 +136,42 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
> >  # simple_ring_buffer is used by the pKVM hypervisor which does not have access
> >  # to all kernel symbols. Fail the build if forbidden symbols are found.
> >  #
> > -UNDEFINED_ALLOWLIST := memset alt_cb_patch_nops __x86 __ubsan __asan __kasan __gcov __aeabi_unwind
> > -UNDEFINED_ALLOWLIST += __stack_chk_fail stackleak_track_stack __ref_stack __sanitizer llvm_gcda llvm_gcov
> > -UNDEFINED_ALLOWLIST += .TOC\. __clear_pages_unrolled __memmove copy_page warn_slowpath_fmt
> > -UNDEFINED_ALLOWLIST += ftrace_likely_update __hwasan_load __hwasan_store __hwasan_tag_memory
> > -UNDEFINED_ALLOWLIST += warn_bogus_irq_restore __stack_chk_guard
> > -UNDEFINED_ALLOWLIST := $(addprefix -e , $(UNDEFINED_ALLOWLIST))
> > +# undefsyms_base generates a set of compiler and tooling-generated symbols that can
> > +# safely be ignored for simple_ring_buffer.
> > +#
> > +filechk_undefsyms_base = \
> > +	echo '$(pound)include <linux/atomic.h>'; \
> > +	echo '$(pound)include <linux/string.h>'; \
> > +	echo '$(pound)include <asm/page.h>'; \
> > +	echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \
> > +	echo 'void undefsyms_base(void *p, int n);'; \
> > +	echo 'void undefsyms_base(void *p, int n) {'; \
> > +	echo '	char buffer[256] = { 0 };'; \
> > +	echo '	u32 u = 0;'; \
> > +	echo '	memset((char * volatile)page, 8, PAGE_SIZE);'; \
> > +	echo '	memset((char * volatile)buffer, 8, sizeof(buffer));'; \
> > +	echo '	memcpy((void * volatile)p, buffer, sizeof(buffer));'; \
> > +	echo '	cmpxchg((u32 * volatile)&u, 0, 8);'; \
> > +	echo '	WARN_ON(n == 0xdeadbeef);'; \
> > +	echo '}'
> > +
> > +$(obj)/undefsyms_base.c: FORCE
> > +	$(call filechk,undefsyms_base)
> > +
> > +clean-files += undefsyms_base.c
> > +
> > +$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c
> > +
> > +targets += undefsyms_base.o
> > +
> > +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
> > +		      simple_ring_buffer \
> > +		      $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
> >  
> >  quiet_cmd_check_undefined = NM      $<
> > -      cmd_check_undefined = test -z "`$(NM) -u $< | grep -v $(UNDEFINED_ALLOWLIST)`"
> > +      cmd_check_undefined = test -z "`$(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST))`"
> >  
> > -$(obj)/%.o.checked: $(obj)/%.o FORCE
> > +$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
> >  	$(call if_changed,check_undefined)
> >  
> >  always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked
> > 
> > base-commit: 33f2e266515717c4b2df585dadefa0525557726c
> > -- 
> > 2.53.0.851.ga537e3e6e9-goog
> > 
> 
> Thanks! This is almost perfect for my tests, one final thing that I
> noticed as a result of my full overnight builds. For ARCH=riscv (and
> some other architectures from a quick grep), there is some logic in
> their include/asm/string.h files to avoid FORTIFY_SOURCE when KASAN is
> enabled for the entire build but not enabled for the particular file. As
> undefsyms_base.o is not linked into vmlinux or modules, it does not
> automatically have KASAN enabled.
> 
>   $ cat allmod.config
>   CONFIG_GCOV_KERNEL=n
>   CONFIG_LTO_CLANG_THIN=y
>   CONFIG_WERROR=n
> 
>   $ make -skj"$(nproc)" ARCH=riscv KCONFIG_ALLCONFIG=1 LLVM=1 mrproper allmodconfig kernel/trace/
>   Unexpected symbols in kernel/trace/simple_ring_buffer.o:
>                    U __fortify_panic
>                    U __write_overflow_field
>   ...
> 
> This cures that for me.
> 
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index 260382f62dbf..55af887a90e2 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -164,6 +164,11 @@ $(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c
>  
>  targets += undefsyms_base.o
>  
> +# ensure KASAN is enabled to avoid logic that may disable FORTIFY_SOURCE when
> +# KASAN is not enabled. undefsyms_base.o does not automatically get KASAN flags
> +# because it is not linked into vmlinux.
> +KASAN_SANITIZE_undefsyms_base.o := y
> +
>  UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
>  		      simple_ring_buffer \
>  		      $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
> --
> 
> With that addressed:
> 
> Reviewed-by: Nathan Chancellor <nathan@kernel.org>
> Tested-by: Nathan Chancellor <nathan@kernel.org>

I've just sent a v3 with all that. I have tested locally with allmodconfig and
many architectures with both clang and gcc.

Thanks a lot for your help!

--
Vincent

> 
> Cheers,
> Nathan

^ permalink raw reply

* [PATCH v3] tracing: Generate undef symbols allowlist for simple_ring_buffer
From: Vincent Donnefort @ 2026-03-16  9:28 UTC (permalink / raw)
  To: maz
  Cc: rostedt, arnd, nathan, linux-trace-kernel, kvmarm, kernel-team,
	Vincent Donnefort

Compiler and tooling-generated symbols are difficult to maintain
across all supported architectures. Make the allowlist more robust by
replacing the harcoded list with a mechanism that automatically detects
these symbols.

This mechanism generates a C function designed to trigger common
compiler-inserted symbols.

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>

---

Changes in v3:

  - Enforce KASAN to ensure FORTIFY_SOURCE isn't disabled on some arch (Nathan) 

Changes in v2:

  - Use filechk (Nathan)
  - Removed deprecated extra-y (Nathan)
  - Added simple_ring_buffer in allowlist (Nathan)
  - Added memcpy() to generate more symbols (Nathan)
  - Added __sancov 

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index beb15936829d..f4503a001d4c 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -136,17 +136,47 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o
 # simple_ring_buffer is used by the pKVM hypervisor which does not have access
 # to all kernel symbols. Fail the build if forbidden symbols are found.
 #
-UNDEFINED_ALLOWLIST := memset alt_cb_patch_nops __x86 __ubsan __asan __kasan __gcov __aeabi_unwind
-UNDEFINED_ALLOWLIST += __stack_chk_fail stackleak_track_stack __ref_stack __sanitizer llvm_gcda llvm_gcov
-UNDEFINED_ALLOWLIST += .TOC\. __clear_pages_unrolled __memmove copy_page warn_slowpath_fmt
-UNDEFINED_ALLOWLIST += ftrace_likely_update __hwasan_load __hwasan_store __hwasan_tag_memory
-UNDEFINED_ALLOWLIST += warn_bogus_irq_restore __stack_chk_guard
-UNDEFINED_ALLOWLIST := $(addprefix -e , $(UNDEFINED_ALLOWLIST))
+# undefsyms_base generates a set of compiler and tooling-generated symbols that can
+# safely be ignored for simple_ring_buffer.
+#
+filechk_undefsyms_base = \
+	echo '$(pound)include <linux/atomic.h>'; \
+	echo '$(pound)include <linux/string.h>'; \
+	echo '$(pound)include <asm/page.h>'; \
+	echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \
+	echo 'void undefsyms_base(void *p, int n);'; \
+	echo 'void undefsyms_base(void *p, int n) {'; \
+	echo '	char buffer[256] = { 0 };'; \
+	echo '	u32 u = 0;'; \
+	echo '	memset((char * volatile)page, 8, PAGE_SIZE);'; \
+	echo '	memset((char * volatile)buffer, 8, sizeof(buffer));'; \
+	echo '	memcpy((void * volatile)p, buffer, sizeof(buffer));'; \
+	echo '	cmpxchg((u32 * volatile)&u, 0, 8);'; \
+	echo '	WARN_ON(n == 0xdeadbeef);'; \
+	echo '}'
+
+$(obj)/undefsyms_base.c: FORCE
+	$(call filechk,undefsyms_base)
+
+clean-files += undefsyms_base.c
+
+$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c
+
+targets += undefsyms_base.o
+
+# Ensure KASAN is enabled to avoid logic that may disable FORTIFY_SOURCE when
+# KASAN is not enabled. undefsyms_base.o does not automatically get KASAN flags
+# because it is not linked into vmlinux.
+KASAN_SANITIZE_undefsyms_base.o := y
+
+UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \
+		      simple_ring_buffer \
+		      $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}')
 
 quiet_cmd_check_undefined = NM      $<
-      cmd_check_undefined = test -z "`$(NM) -u $< | grep -v $(UNDEFINED_ALLOWLIST)`"
+      cmd_check_undefined = test -z "`$(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST))`"
 
-$(obj)/%.o.checked: $(obj)/%.o FORCE
+$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE
 	$(call if_changed,check_undefined)
 
 always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked

base-commit: 33f2e266515717c4b2df585dadefa0525557726c
-- 
2.53.0.851.ga537e3e6e9-goog


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox