The Linux Kernel Mailing List
From: hawk@kernel.org
To: Andrew Morton <akpm@linux-foundation.org>, linux-mm@kvack.org
Cc: Vlastimil Babka <vbabka@kernel.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Michal Hocko <mhocko@suse.com>, Zi Yan <ziy@nvidia.com>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <ljs@kernel.org>, Shuah Khan <shuah@kernel.org>,
	linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
	kernel-team@cloudflare.com, hawk@kernel.org
Subject: [PATCH 1/2] mm/page_alloc: add tracepoints for zone->lock acquisitions
Date: Fri,  8 May 2026 18:22:06 +0200
Message-ID: <20260508162207.3315781-1-hawk@kernel.org>

From: Jesper Dangaard Brouer <hawk@kernel.org>

Add tracepoints to the page allocator fast paths that acquire
zone->lock, allowing diagnosis of lock contention in production.

Three tracepoints are introduced:
  kmem:mm_zone_lock_contended - fires when trylock fails (lock is held)
  kmem:mm_zone_locked         - fires on every acquisition
  kmem:mm_zone_lock_unlock    - fires on every release

Each event records the NUMA node, zone name, batch count, and caller.
The mm_zone_locked event additionally records wait_ns: the time spent
spinning when contended, measured via local_clock() with IRQs
disabled so that both timestamps are taken on the same CPU.
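
As a sketch of how wait_ns can be consumed, a bpftrace one-liner can
histogram the spin durations (field access via args-> follows the
event definition in this patch):

  bpftrace -e 'tracepoint:kmem:mm_zone_locked /args->wait_ns/ { @ = hist(args->wait_ns); }'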

The lock/unlock paths are wrapped in __zone_lock()/__zone_unlock()
helpers that use trylock-first to separate the contended and
uncontended cases.  Only the fast paths (free_pcppages_bulk,
rmqueue_bulk, free_one_page) are covered.  Other zone->lock holders
such as compaction, page isolation, and memory hotplug are not
instrumented.
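
With all three events enabled, trace buffer output looks roughly
like the following (illustrative values; the formats match the
TP_printk strings below):

  mm_zone_lock_contended: node=0 zone=Normal   count=63    caller=rmqueue_bulk+0x91/0x3f0
  mm_zone_locked: node=0 zone=Normal   count=63    contended=1 caller=rmqueue_bulk+0x91/0x3f0 wait=8421 ns
  mm_zone_lock_unlock: node=0 zone=Normal   count=63    caller=rmqueue_bulk+0x2a4/0x3f0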

For minimal overhead in production, enable only mm_zone_lock_contended,
which fires only on actual contention.  Enable mm_zone_locked for
wait-time analysis, and add mm_zone_lock_unlock for hold-time
measurement.
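
For example, with tracefs mounted at /sys/kernel/tracing, the
low-overhead contention event can be enabled with:

  echo 1 > /sys/kernel/tracing/events/kmem/mm_zone_lock_contended/enable
  cat /sys/kernel/tracing/trace_pipe

and lock/unlock pairs for hold-time analysis can be captured
system-wide with perf, e.g.:

  perf record -e kmem:mm_zone_locked -e kmem:mm_zone_lock_unlock -a -- sleep 10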

Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
 include/trace/events/kmem.h | 101 ++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c             |  50 +++++++++++++++---
 2 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index cd7920c81f85..870c68c70d57 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -458,6 +458,107 @@ TRACE_EVENT(rss_stat,
 		__print_symbolic(__entry->member, TRACE_MM_PAGES),
 		__entry->size)
 	);
+
+/*
+ * Tracepoints for zone->lock on the page allocator fast paths only.
+ * Other code paths that acquire zone->lock (compaction, isolation,
+ * memory hotplug, vmstat, etc.) are not covered here.
+ *
+ * Three events:
+ *   mm_zone_lock_contended - trylock failed, about to spin
+ *   mm_zone_locked         - lock acquired, includes wait_ns when
+ *                            contended (zero otherwise)
+ *   mm_zone_lock_unlock    - lock released
+ *
+ * For production use with minimum overhead, enable only
+ * mm_zone_lock_contended -- it fires only when trylock detects the
+ * lock is already held.
+ *
+ * For wait-time analysis, enable mm_zone_locked -- its wait_ns
+ * field gives the spin duration directly.  Adding unlock allows
+ * hold-time measurement, at the cost of one event per acquisition.
+ */
+TRACE_EVENT(mm_zone_lock_contended,
+
+	TP_PROTO(struct zone *zone, int count, unsigned long caller),
+
+	TP_ARGS(zone, count, caller),
+
+	TP_STRUCT__entry(
+		__field(	int,		node_id		)
+		__string(	name,		zone->name	)
+		__field(	int,		count		)
+		__field(	unsigned long,	caller		)
+	),
+
+	TP_fast_assign(
+		__entry->node_id = zone_to_nid(zone);
+		__assign_str(name);
+		__entry->count = count;
+		__entry->caller = caller;
+	),
+
+	TP_printk("node=%d zone=%-8s count=%-5d caller=%pS",
+		  __entry->node_id, __get_str(name),
+		  __entry->count, (void *)__entry->caller)
+);
+
+TRACE_EVENT(mm_zone_locked,
+
+	TP_PROTO(struct zone *zone, int count, bool contended,
+		 unsigned long caller, u64 wait_ns),
+
+	TP_ARGS(zone, count, contended, caller, wait_ns),
+
+	TP_STRUCT__entry(
+		__field(	int,		node_id		)
+		__string(	name,		zone->name	)
+		__field(	int,		count		)
+		__field(	bool,		contended	)
+		__field(	unsigned long,	caller		)
+		__field(	u64,		wait_ns		)
+	),
+
+	TP_fast_assign(
+		__entry->node_id = zone_to_nid(zone);
+		__assign_str(name);
+		__entry->count = count;
+		__entry->contended = contended;
+		__entry->caller = caller;
+		__entry->wait_ns = wait_ns;
+	),
+
+	TP_printk("node=%d zone=%-8s count=%-5d contended=%d caller=%pS wait=%llu ns",
+		  __entry->node_id, __get_str(name),
+		  __entry->count, __entry->contended,
+		  (void *)__entry->caller, __entry->wait_ns)
+);
+
+TRACE_EVENT(mm_zone_lock_unlock,
+
+	TP_PROTO(struct zone *zone, int count, unsigned long caller),
+
+	TP_ARGS(zone, count, caller),
+
+	TP_STRUCT__entry(
+		__field(	int,		node_id		)
+		__string(	name,		zone->name	)
+		__field(	int,		count		)
+		__field(	unsigned long,	caller		)
+	),
+
+	TP_fast_assign(
+		__entry->node_id = zone_to_nid(zone);
+		__assign_str(name);
+		__entry->count = count;
+		__entry->caller = caller;
+	),
+
+	TP_printk("node=%d zone=%-8s count=%-5d caller=%pS",
+		  __entry->node_id, __get_str(name),
+		  __entry->count, (void *)__entry->caller)
+);
+
 #endif /* _TRACE_KMEM_H */
 
 /* This part must be outside protection */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 227d58dc3de6..08018e9beab4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -19,6 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
+#include <linux/sched/clock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/kasan.h>
@@ -1447,6 +1448,43 @@ bool free_pages_prepare(struct page *page, unsigned int order)
 	return __free_pages_prepare(page, order, FPI_NONE);
 }
 
+/*
+ * Helper functions for locking zone->lock with tracepoints.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments.  The @count parameter indicates the number
+ * of pages being freed or allocated in the batch operation.
+ *
+ * For minimal overhead, attach only to kmem:mm_zone_lock_contended,
+ * which fires only when trylock finds the lock already held.
+ */
+static inline void
+__zone_lock(struct zone *zone, int count, unsigned long *flags)
+	__acquires(&zone->lock)
+{
+	unsigned long caller = _RET_IP_;
+	u64 wait_start, wait_time = 0;
+	bool contended;
+
+	local_irq_save(*flags);
+	contended = !spin_trylock(&zone->lock);
+	if (contended) {
+		wait_start = local_clock();
+		trace_mm_zone_lock_contended(zone, count, caller);
+		spin_lock(&zone->lock);
+		wait_time = local_clock() - wait_start;
+	}
+	trace_mm_zone_locked(zone, count, contended, caller, wait_time);
+}
+
+static inline void
+__zone_unlock(struct zone *zone, int count, unsigned long *flags)
+	__releases(&zone->lock)
+{
+	trace_mm_zone_lock_unlock(zone, count, _RET_IP_);
+	spin_unlock_irqrestore(&zone->lock, *flags);
+}
+
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone.
@@ -1469,7 +1507,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	/* Ensure requested pindex is drained first. */
 	pindex = pindex - 1;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	__zone_lock(zone, count, &flags);
 
 	while (count > 0) {
 		struct list_head *list;
@@ -1502,7 +1540,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 		} while (count > 0 && !list_empty(list));
 	}
 
-	spin_unlock_irqrestore(&zone->lock, flags);
+	__zone_unlock(zone, count, &flags);
 }
 
 /* Split a multi-block free page into its individual pageblocks. */
@@ -1551,7 +1589,7 @@ static void free_one_page(struct zone *zone, struct page *page,
 			return;
 		}
 	} else {
-		spin_lock_irqsave(&zone->lock, flags);
+		__zone_lock(zone, 1 << order, &flags);
 	}
 
 	/* The lock succeeded. Process deferred pages. */
@@ -1569,7 +1607,7 @@ static void free_one_page(struct zone *zone, struct page *page,
 		}
 	}
 	split_large_buddy(zone, page, pfn, order, fpi_flags);
-	spin_unlock_irqrestore(&zone->lock, flags);
+	__zone_unlock(zone, 1 << order, &flags);
 
 	__count_vm_events(PGFREE, 1 << order);
 }
@@ -2525,7 +2563,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		if (!spin_trylock_irqsave(&zone->lock, flags))
 			return 0;
 	} else {
-		spin_lock_irqsave(&zone->lock, flags);
+		__zone_lock(zone, count, &flags);
 	}
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
@@ -2545,7 +2583,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 */
 		list_add_tail(&page->pcp_list, list);
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
+	__zone_unlock(zone, i, &flags);
 
 	return i;
 }
-- 
2.43.0

