linux-mm.kvack.org archive mirror
* [PATCH] alloc_tag: add per-numa node stats
  2025-05-30  0:39 [PATCH 0/1] alloc_tag: add per-numa " Casey Chen
@ 2025-05-30  0:39 ` Casey Chen
  0 siblings, 0 replies; 15+ messages in thread
From: Casey Chen @ 2025-05-30  0:39 UTC (permalink / raw)
  To: linux-mm, surenb, kent.overstreet; +Cc: yzhong, cachen

Add per-NUMA-node stats for each alloc_tag. Previously there was only
one alloc_tag_counters per CPU; now each CPU has one per NUMA node.

The total bytes/calls and the bytes/calls for each NUMA node are now
displayed together in a single row for each alloc_tag in
/proc/allocinfo.

Note that per-NUMA stats don't make sense for percpu allocations.
The NUMA nodes backing a percpu allocation vary: each CPU usually
gets its copy from its local NUMA node, and we have no way to
attribute the stats to a node per CPU, so all stats are stored under
node 0. Also, the 'bytes' field is only the amount needed by a single
CPU; to get the total bytes, multiply it by the number of possible
CPUs. A boolean field 'percpu' is added to mark percpu allocations in
/proc/allocinfo.
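
For illustration (numbers are made up; the layout follows the
seq_buf_printf() calls in alloc_tag_to_text() below), a row on a
two-node machine would look roughly like:

percpu n total       315456     9858 numa0        94912     2966 numa1       220544     6892 mm/dmapool.c:338 func:pool_alloc_page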

To minimize memory usage, the alloc_tag stats counters are dynamically
allocated with the percpu allocator. Increase PERCPU_DYNAMIC_RESERVE
to accommodate the counters for in-kernel alloc_tags.

For in-kernel alloc_tags, pcpu_alloc_noprof() is called to allocate
the stats counters; these allocations are not accounted for in the
profiling stats.

Signed-off-by: Casey Chen <cachen@purestorage.com>
Reviewed-by: Yuanyuan Zhong <yzhong@purestorage.com>
---
 include/linux/alloc_tag.h | 49 ++++++++++++++++++++++++++++-----------
 include/linux/codetag.h   |  4 ++++
 include/linux/percpu.h    |  2 +-
 lib/alloc_tag.c           | 43 ++++++++++++++++++++++++++++------
 mm/page_alloc.c           | 35 ++++++++++++++--------------
 mm/percpu.c               |  8 +++++--
 mm/show_mem.c             | 20 ++++++++++++----
 mm/slub.c                 | 11 ++++++---
 8 files changed, 123 insertions(+), 49 deletions(-)

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 8f7931eb7d16..99d4a1823e51 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -15,6 +15,8 @@
 #include <linux/static_key.h>
 #include <linux/irqflags.h>
 
+extern int num_numa_nodes;
+
 struct alloc_tag_counters {
 	u64 bytes;
 	u64 calls;
@@ -134,16 +136,34 @@ static inline bool mem_alloc_profiling_enabled(void)
 				   &mem_alloc_profiling_key);
 }
 
+static inline struct alloc_tag_counters alloc_tag_read_nid(struct alloc_tag *tag, int nid)
+{
+	struct alloc_tag_counters v = { 0, 0 };
+	struct alloc_tag_counters *counters;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		counters = per_cpu_ptr(tag->counters, cpu);
+		v.bytes += counters[nid].bytes;
+		v.calls += counters[nid].calls;
+	}
+
+	return v;
+}
+
 static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
 {
 	struct alloc_tag_counters v = { 0, 0 };
-	struct alloc_tag_counters *counter;
+	struct alloc_tag_counters *counters;
 	int cpu;
+	int nid;
 
 	for_each_possible_cpu(cpu) {
-		counter = per_cpu_ptr(tag->counters, cpu);
-		v.bytes += counter->bytes;
-		v.calls += counter->calls;
+		counters = per_cpu_ptr(tag->counters, cpu);
+		for (nid = 0; nid < num_numa_nodes; nid++) {
+			v.bytes += counters[nid].bytes;
+			v.calls += counters[nid].calls;
+		}
 	}
 
 	return v;
@@ -179,7 +199,7 @@ static inline bool __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag
 	return true;
 }
 
-static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag, int nid)
 {
 	if (unlikely(!__alloc_tag_ref_set(ref, tag)))
 		return false;
@@ -190,17 +210,18 @@ static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *t
 	 * Each new reference for every sub-allocation needs to increment call
 	 * counter because when we free each part the counter will be decremented.
 	 */
-	this_cpu_inc(tag->counters->calls);
+	this_cpu_inc(tag->counters[nid].calls);
 	return true;
 }
 
-static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
+				 int nid, size_t bytes)
 {
-	if (likely(alloc_tag_ref_set(ref, tag)))
-		this_cpu_add(tag->counters->bytes, bytes);
+	if (likely(alloc_tag_ref_set(ref, tag, nid)))
+		this_cpu_add(tag->counters[nid].bytes, bytes);
 }
 
-static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+static inline void alloc_tag_sub(union codetag_ref *ref, int nid, size_t bytes)
 {
 	struct alloc_tag *tag;
 
@@ -215,8 +236,8 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
 
 	tag = ct_to_alloc_tag(ref->ct);
 
-	this_cpu_sub(tag->counters->bytes, bytes);
-	this_cpu_dec(tag->counters->calls);
+	this_cpu_sub(tag->counters[nid].bytes, bytes);
+	this_cpu_dec(tag->counters[nid].calls);
 
 	ref->ct = NULL;
 }
@@ -228,8 +249,8 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
 #define DEFINE_ALLOC_TAG(_alloc_tag)
 static inline bool mem_alloc_profiling_enabled(void) { return false; }
 static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
-				 size_t bytes) {}
-static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+				 int nid, size_t bytes) {}
+static inline void alloc_tag_sub(union codetag_ref *ref, int nid, size_t bytes) {}
 #define alloc_tag_record(p)	do {} while (0)
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 5f2b9a1f722c..79d6b96c61f6 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -16,6 +16,10 @@ struct module;
 #define CODETAG_SECTION_START_PREFIX	"__start_"
 #define CODETAG_SECTION_STOP_PREFIX	"__stop_"
 
+enum codetag_flags {
+	CODETAG_PERCPU_ALLOC	= (1 << 0), /* codetag tracking percpu allocation */
+};
+
 /*
  * An instance of this structure is created in a special ELF section at every
  * code location being tagged.  At runtime, the special section is treated as
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 85bf8dd9f087..d92c27fbcd0d 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -43,7 +43,7 @@
 # define PERCPU_DYNAMIC_SIZE_SHIFT      12
 #endif /* LOCKDEP and PAGE_SIZE > 4KiB */
 #else
-#define PERCPU_DYNAMIC_SIZE_SHIFT      10
+#define PERCPU_DYNAMIC_SIZE_SHIFT      13
 #endif
 
 /*
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index d48b80f3f007..b4d2d5663c4c 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -42,6 +42,9 @@ struct allocinfo_private {
 	bool print_header;
 };
 
+int num_numa_nodes;
+static unsigned long pcpu_counters_size;
+
 static void *allocinfo_start(struct seq_file *m, loff_t *pos)
 {
 	struct allocinfo_private *priv;
@@ -95,9 +98,16 @@ static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
 {
 	struct alloc_tag *tag = ct_to_alloc_tag(ct);
 	struct alloc_tag_counters counter = alloc_tag_read(tag);
-	s64 bytes = counter.bytes;
+	int nid;
+
+	seq_buf_printf(out, "percpu %c total %12lli %8llu ",
+			ct->flags & CODETAG_PERCPU_ALLOC ? 'y' : 'n',
+			counter.bytes, counter.calls);
+	for (nid = 0; nid < num_numa_nodes; nid++) {
+		counter = alloc_tag_read_nid(tag, nid);
+		seq_buf_printf(out, "numa%d %12lli %8llu ", nid, counter.bytes, counter.calls);
+	}
 
-	seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
 	codetag_to_text(out, ct);
 	seq_buf_putc(out, ' ');
 	seq_buf_putc(out, '\n');
@@ -184,7 +194,7 @@ void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
 
 		if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) {
 			/* Set new reference to point to the original tag */
-			alloc_tag_ref_set(&ref, tag);
+			alloc_tag_ref_set(&ref, tag, folio_nid(folio));
 			update_page_tag_ref(handle, &ref);
 			put_page_tag_ref(handle);
 		}
@@ -247,19 +257,36 @@ static void shutdown_mem_profiling(bool remove_file)
 void __init alloc_tag_sec_init(void)
 {
 	struct alloc_tag *last_codetag;
+	int i;
 
 	if (!mem_profiling_support)
 		return;
 
-	if (!static_key_enabled(&mem_profiling_compressed))
-		return;
-
 	kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name(
 					SECTION_START(ALLOC_TAG_SECTION_NAME));
 	last_codetag = (struct alloc_tag *)kallsyms_lookup_name(
 					SECTION_STOP(ALLOC_TAG_SECTION_NAME));
 	kernel_tags.count = last_codetag - kernel_tags.first_tag;
 
+	num_numa_nodes = num_possible_nodes();
+	pcpu_counters_size = num_numa_nodes * sizeof(struct alloc_tag_counters);
+	for (i = 0; i < kernel_tags.count; i++) {
+		/* Each CPU has one counter per numa node */
+		kernel_tags.first_tag[i].counters =
+			pcpu_alloc_noprof(pcpu_counters_size,
+					  sizeof(struct alloc_tag_counters),
+					  false, GFP_KERNEL | __GFP_ZERO);
+		if (!kernel_tags.first_tag[i].counters) {
+			while (--i >= 0)
+				free_percpu(kernel_tags.first_tag[i].counters);
+			pr_info("Failed to allocate per-cpu alloc_tag counters\n");
+			return;
+		}
+	}
+
+	if (!static_key_enabled(&mem_profiling_compressed))
+		return;
+
 	/* Check if kernel tags fit into page flags */
 	if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) {
 		shutdown_mem_profiling(false); /* allocinfo file does not exist yet */
@@ -622,7 +649,9 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag
 	stop_tag = ct_to_alloc_tag(stop);
 	for (tag = start_tag; tag < stop_tag; tag++) {
 		WARN_ON(tag->counters);
-		tag->counters = alloc_percpu(struct alloc_tag_counters);
+		tag->counters = __alloc_percpu_gfp(pcpu_counters_size,
+						   sizeof(struct alloc_tag_counters),
+						   GFP_KERNEL | __GFP_ZERO);
 		if (!tag->counters) {
 			while (--tag >= start_tag) {
 				free_percpu(tag->counters);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90b06f3d004c..8219d8de6f97 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1107,58 +1107,59 @@ void __clear_page_tag_ref(struct page *page)
 /* Should be called only if mem_alloc_profiling_enabled() */
 static noinline
 void __pgalloc_tag_add(struct page *page, struct task_struct *task,
-		       unsigned int nr)
+		       int nid, unsigned int nr)
 {
 	union pgtag_ref_handle handle;
 	union codetag_ref ref;
 
 	if (get_page_tag_ref(page, &ref, &handle)) {
-		alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+		alloc_tag_add(&ref, task->alloc_tag, nid, PAGE_SIZE * nr);
 		update_page_tag_ref(handle, &ref);
 		put_page_tag_ref(handle);
 	}
 }
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
-				   unsigned int nr)
+				   int nid, unsigned int nr)
 {
 	if (mem_alloc_profiling_enabled())
-		__pgalloc_tag_add(page, task, nr);
+		__pgalloc_tag_add(page, task, nid, nr);
 }
 
 /* Should be called only if mem_alloc_profiling_enabled() */
 static noinline
-void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+void __pgalloc_tag_sub(struct page *page, int nid, unsigned int nr)
 {
 	union pgtag_ref_handle handle;
 	union codetag_ref ref;
 
 	if (get_page_tag_ref(page, &ref, &handle)) {
-		alloc_tag_sub(&ref, PAGE_SIZE * nr);
+		alloc_tag_sub(&ref, nid, PAGE_SIZE * nr);
 		update_page_tag_ref(handle, &ref);
 		put_page_tag_ref(handle);
 	}
 }
 
-static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+static inline void pgalloc_tag_sub(struct page *page, int nid, unsigned int nr)
 {
 	if (mem_alloc_profiling_enabled())
-		__pgalloc_tag_sub(page, nr);
+		__pgalloc_tag_sub(page, nid, nr);
 }
 
 /* When tag is not NULL, assuming mem_alloc_profiling_enabled */
-static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag,
+					 int nid, unsigned int nr)
 {
 	if (tag)
-		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+		this_cpu_sub(tag->counters[nid].bytes, PAGE_SIZE * nr);
 }
 
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
-				   unsigned int nr) {}
-static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+				   int nid, unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, int nid, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, int nid, unsigned int nr) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
@@ -1197,7 +1198,7 @@ __always_inline bool free_pages_prepare(struct page *page,
 		/* Do not let hwpoison pages hit pcplists/buddy */
 		reset_page_owner(page, order);
 		page_table_check_free(page, order);
-		pgalloc_tag_sub(page, 1 << order);
+		pgalloc_tag_sub(page, page_to_nid(page), 1 << order);
 
 		/*
 		 * The page is isolated and accounted for.
@@ -1251,7 +1252,7 @@ __always_inline bool free_pages_prepare(struct page *page,
 	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	reset_page_owner(page, order);
 	page_table_check_free(page, order);
-	pgalloc_tag_sub(page, 1 << order);
+	pgalloc_tag_sub(page, page_to_nid(page), 1 << order);
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),
@@ -1707,7 +1708,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 
 	set_page_owner(page, order, gfp_flags);
 	page_table_check_alloc(page, order);
-	pgalloc_tag_add(page, current, 1 << order);
+	pgalloc_tag_add(page, current, page_to_nid(page), 1 << order);
 }
 
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -5064,7 +5065,7 @@ static void ___free_pages(struct page *page, unsigned int order,
 	if (put_page_testzero(page))
 		__free_frozen_pages(page, order, fpi_flags);
 	else if (!head) {
-		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+		pgalloc_tag_sub_pages(tag, page_to_nid(page), (1 << order) - 1);
 		while (order-- > 0)
 			__free_frozen_pages(page + (1 << order), order,
 					    fpi_flags);
diff --git a/mm/percpu.c b/mm/percpu.c
index b35494c8ede2..130450e9718e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1691,15 +1691,19 @@ static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
 				      size_t size)
 {
 	if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) {
+		/* For percpu allocation, store all alloc_tag stats on numa node 0 */
 		alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
-			      current->alloc_tag, size);
+			      current->alloc_tag, 0, size);
+		if (current->alloc_tag)
+			current->alloc_tag->ct.flags |= CODETAG_PERCPU_ALLOC;
 	}
 }
 
 static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
 {
+	/* percpu alloc_tag stats is stored on numa node 0 so subtract from node 0 */
 	if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts))
-		alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
+		alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, 0, size);
 }
 #else
 static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 03e8d968fd1a..132b3aa82d83 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de>
  */
 
+#include <linux/alloc_tag.h>
 #include <linux/blkdev.h>
 #include <linux/cma.h>
 #include <linux/cpuset.h>
@@ -433,18 +434,27 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 				struct alloc_tag *tag = ct_to_alloc_tag(ct);
 				struct alloc_tag_counters counter = alloc_tag_read(tag);
 				char bytes[10];
+				int nid;
 
 				string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes));
+				pr_notice("percpu %c total %12s %8llu ",
+						ct->flags & CODETAG_PERCPU_ALLOC ? 'y' : 'n',
+						bytes, counter.calls);
+
+				for (nid = 0; nid < num_numa_nodes; nid++) {
+					counter = alloc_tag_read_nid(tag, nid);
+					string_get_size(counter.bytes, 1, STRING_UNITS_2,
+							bytes, sizeof(bytes));
+					pr_notice("numa%d %12s %8llu ", nid, bytes, counter.calls);
+				}
 
 				/* Same as alloc_tag_to_text() but w/o intermediate buffer */
 				if (ct->modname)
-					pr_notice("%12s %8llu %s:%u [%s] func:%s\n",
-						  bytes, counter.calls, ct->filename,
+					pr_notice("%s:%u [%s] func:%s\n", ct->filename,
 						  ct->lineno, ct->modname, ct->function);
 				else
-					pr_notice("%12s %8llu %s:%u func:%s\n",
-						  bytes, counter.calls, ct->filename,
-						  ct->lineno, ct->function);
+					pr_notice("%s:%u func:%s\n",
+						  ct->filename, ct->lineno, ct->function);
 			}
 		}
 	}
diff --git a/mm/slub.c b/mm/slub.c
index be8b09e09d30..068b88b85d80 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2104,8 +2104,12 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
 	 * If other users appear then mem_alloc_profiling_enabled()
 	 * check should be added before alloc_tag_add().
 	 */
-	if (likely(obj_exts))
-		alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
+	if (likely(obj_exts)) {
+		struct page *page = virt_to_page(object);
+
+		alloc_tag_add(&obj_exts->ref, current->alloc_tag,
+				page_to_nid(page), s->size);
+	}
 }
 
 static inline void
@@ -2133,8 +2137,9 @@ __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p
 
 	for (i = 0; i < objects; i++) {
 		unsigned int off = obj_to_index(s, slab, p[i]);
+		struct page *page = virt_to_page(p[i]);
 
-		alloc_tag_sub(&obj_exts[off].ref, s->size);
+		alloc_tag_sub(&obj_exts[off].ref, page_to_nid(page), s->size);
 	}
 }
 
-- 
2.34.1




* [PATCH] alloc_tag: add per-NUMA node stats
@ 2025-06-10 23:30 Casey Chen
  2025-06-11  1:21 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Casey Chen @ 2025-06-10 23:30 UTC (permalink / raw)
  To: akpm, surenb
  Cc: kent.overstreet, corbet, dennis, tj, cl, vbabka, mhocko, jackmanb,
	hannes, ziy, rientjes, roman.gushchin, harry.yoo, linux-mm,
	linux-kernel, linux-doc, yzhong, Casey Chen

Add support for tracking per-NUMA node statistics in /proc/allocinfo.
Previously, each alloc_tag had a single set of counters (bytes and
calls), aggregated across all CPUs. With this change, each CPU can
maintain separate counters for each NUMA node, allowing finer-grained
memory allocation profiling.
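
Internally (see the diff below), tag->counters now points at a per-CPU
array with one struct alloc_tag_counters per possible node, so the hot
paths index the array by node id, e.g.:

	this_cpu_add(tag->counters[nid].bytes, bytes);

and a per-node total is read by summing counters[nid] across all
possible CPUs (alloc_tag_read_nid()).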

This feature is controlled by the new
CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:

* When enabled (=y), the output includes per-node statistics following
  the total bytes/calls:

<size> <calls> <tag info>
...
315456       9858     mm/dmapool.c:338 func:pool_alloc_page
        nid0     94912        2966
        nid1     220544       6892
7680         60       mm/dmapool.c:254 func:dma_pool_create
        nid0     4224         33
        nid1     3456         27

* When disabled (=n), the output remains unchanged:
<size> <calls> <tag info>
...
315456       9858     mm/dmapool.c:338 func:pool_alloc_page
7680         60       mm/dmapool.c:254 func:dma_pool_create

To minimize memory overhead, per-NUMA stats counters are dynamically
allocated using the percpu allocator. PERCPU_DYNAMIC_RESERVE has been
increased to ensure sufficient space for in-kernel alloc_tag counters.

For in-kernel alloc_tag instances, pcpu_alloc_noprof() is used to
allocate counters. These allocations are excluded from the profiling
statistics themselves.

Signed-off-by: Casey Chen <cachen@purestorage.com>
Reviewed-by: Yuanyuan Zhong <yzhong@purestorage.com>
---
 Documentation/mm/allocation-profiling.rst |  3 ++
 include/linux/alloc_tag.h                 | 49 ++++++++++++------
 include/linux/codetag.h                   |  4 ++
 include/linux/percpu.h                    |  2 +-
 lib/Kconfig.debug                         |  7 +++
 lib/alloc_tag.c                           | 61 ++++++++++++++++++++---
 mm/page_alloc.c                           | 35 ++++++-------
 mm/percpu.c                               |  8 ++-
 mm/show_mem.c                             | 27 +++++++---
 mm/slub.c                                 | 11 ++--
 10 files changed, 156 insertions(+), 51 deletions(-)

diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst
index 316311240e6a..13d1d0cb91bf 100644
--- a/Documentation/mm/allocation-profiling.rst
+++ b/Documentation/mm/allocation-profiling.rst
@@ -17,6 +17,9 @@ kconfig options:
   adds warnings for allocations that weren't accounted because of a
   missing annotation
 
+- CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
+  adds memory allocation profiling stats for each numa node, off by default.
+
 Boot parameter:
   sysctl.vm.mem_profiling={0|1|never}[,compressed]
 
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 8f7931eb7d16..04f5beb44ef9 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -15,6 +15,8 @@
 #include <linux/static_key.h>
 #include <linux/irqflags.h>
 
+extern int pcpu_counters_num;
+
 struct alloc_tag_counters {
 	u64 bytes;
 	u64 calls;
@@ -134,16 +136,34 @@ static inline bool mem_alloc_profiling_enabled(void)
 				   &mem_alloc_profiling_key);
 }
 
+static inline struct alloc_tag_counters alloc_tag_read_nid(struct alloc_tag *tag, int nid)
+{
+	struct alloc_tag_counters v = { 0, 0 };
+	struct alloc_tag_counters *counters;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		counters = per_cpu_ptr(tag->counters, cpu);
+		v.bytes += counters[nid].bytes;
+		v.calls += counters[nid].calls;
+	}
+
+	return v;
+}
+
 static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
 {
 	struct alloc_tag_counters v = { 0, 0 };
-	struct alloc_tag_counters *counter;
+	struct alloc_tag_counters *counters;
 	int cpu;
+	int nid;
 
 	for_each_possible_cpu(cpu) {
-		counter = per_cpu_ptr(tag->counters, cpu);
-		v.bytes += counter->bytes;
-		v.calls += counter->calls;
+		counters = per_cpu_ptr(tag->counters, cpu);
+		for (nid = 0; nid < pcpu_counters_num; nid++) {
+			v.bytes += counters[nid].bytes;
+			v.calls += counters[nid].calls;
+		}
 	}
 
 	return v;
@@ -179,7 +199,7 @@ static inline bool __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag
 	return true;
 }
 
-static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag, int nid)
 {
 	if (unlikely(!__alloc_tag_ref_set(ref, tag)))
 		return false;
@@ -190,17 +210,18 @@ static inline bool alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *t
 	 * Each new reference for every sub-allocation needs to increment call
 	 * counter because when we free each part the counter will be decremented.
 	 */
-	this_cpu_inc(tag->counters->calls);
+	this_cpu_inc(tag->counters[nid].calls);
 	return true;
 }
 
-static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
+				 int nid, size_t bytes)
 {
-	if (likely(alloc_tag_ref_set(ref, tag)))
-		this_cpu_add(tag->counters->bytes, bytes);
+	if (likely(alloc_tag_ref_set(ref, tag, nid)))
+		this_cpu_add(tag->counters[nid].bytes, bytes);
 }
 
-static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+static inline void alloc_tag_sub(union codetag_ref *ref, int nid, size_t bytes)
 {
 	struct alloc_tag *tag;
 
@@ -215,8 +236,8 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
 
 	tag = ct_to_alloc_tag(ref->ct);
 
-	this_cpu_sub(tag->counters->bytes, bytes);
-	this_cpu_dec(tag->counters->calls);
+	this_cpu_sub(tag->counters[nid].bytes, bytes);
+	this_cpu_dec(tag->counters[nid].calls);
 
 	ref->ct = NULL;
 }
@@ -228,8 +249,8 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
 #define DEFINE_ALLOC_TAG(_alloc_tag)
 static inline bool mem_alloc_profiling_enabled(void) { return false; }
 static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
-				 size_t bytes) {}
-static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+				 int nid, size_t bytes) {}
+static inline void alloc_tag_sub(union codetag_ref *ref, int nid, size_t bytes) {}
 #define alloc_tag_record(p)	do {} while (0)
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 5f2b9a1f722c..79d6b96c61f6 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -16,6 +16,10 @@ struct module;
 #define CODETAG_SECTION_START_PREFIX	"__start_"
 #define CODETAG_SECTION_STOP_PREFIX	"__stop_"
 
+enum codetag_flags {
+	CODETAG_PERCPU_ALLOC	= (1 << 0), /* codetag tracking percpu allocation */
+};
+
 /*
  * An instance of this structure is created in a special ELF section at every
  * code location being tagged.  At runtime, the special section is treated as
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 85bf8dd9f087..d92c27fbcd0d 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -43,7 +43,7 @@
 # define PERCPU_DYNAMIC_SIZE_SHIFT      12
 #endif /* LOCKDEP and PAGE_SIZE > 4KiB */
 #else
-#define PERCPU_DYNAMIC_SIZE_SHIFT      10
+#define PERCPU_DYNAMIC_SIZE_SHIFT      13
 #endif
 
 /*
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 166b9d830a85..ba2d9c7e050b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1037,6 +1037,13 @@ config MEM_ALLOC_PROFILING_DEBUG
 	  Adds warnings with helpful error messages for memory allocation
 	  profiling.
 
+config MEM_ALLOC_PROFILING_PER_NUMA_STATS
+	bool "Memory allocation profiling per-NUMA stats"
+	default n
+	depends on MEM_ALLOC_PROFILING
+	help
+	  Display allocation stats on every NUMA node.
+
 source "lib/Kconfig.kasan"
 source "lib/Kconfig.kfence"
 source "lib/Kconfig.kmsan"
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index d48b80f3f007..b503685dff73 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -42,6 +42,9 @@ struct allocinfo_private {
 	bool print_header;
 };
 
+int pcpu_counters_num;
+static unsigned long pcpu_counters_size;
+
 static void *allocinfo_start(struct seq_file *m, loff_t *pos)
 {
 	struct allocinfo_private *priv;
@@ -88,7 +91,7 @@ static void print_allocinfo_header(struct seq_buf *buf)
 {
 	/* Output format version, so we can change it. */
 	seq_buf_printf(buf, "allocinfo - version: 1.0\n");
-	seq_buf_printf(buf, "#     <size>  <calls> <tag info>\n");
+	seq_buf_printf(buf, "<size> <calls> <tag info>\n");
 }
 
 static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
@@ -97,12 +100,29 @@ static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
 	struct alloc_tag_counters counter = alloc_tag_read(tag);
 	s64 bytes = counter.bytes;
 
-	seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
+	seq_buf_printf(out, "%-12lli %-8llu ", bytes, counter.calls);
 	codetag_to_text(out, ct);
 	seq_buf_putc(out, ' ');
 	seq_buf_putc(out, '\n');
 }
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
+static void alloc_tag_to_text_all_nids(struct seq_buf *out, struct codetag *ct)
+{
+	struct alloc_tag *tag = ct_to_alloc_tag(ct);
+	struct alloc_tag_counters counter;
+	s64 bytes;
+	int nid;
+
+	for (nid = 0; nid < pcpu_counters_num; nid++) {
+		counter = alloc_tag_read_nid(tag, nid);
+		bytes = counter.bytes;
+		seq_buf_printf(out, "        nid%-5u %-12lli %-8llu\n",
+				nid, bytes, counter.calls);
+	}
+}
+#endif
+
 static int allocinfo_show(struct seq_file *m, void *arg)
 {
 	struct allocinfo_private *priv = (struct allocinfo_private *)arg;
@@ -116,6 +136,9 @@ static int allocinfo_show(struct seq_file *m, void *arg)
 		priv->print_header = false;
 	}
 	alloc_tag_to_text(&buf, priv->iter.ct);
+#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
+	alloc_tag_to_text_all_nids(&buf, priv->iter.ct);
+#endif
 	seq_commit(m, seq_buf_used(&buf));
 	return 0;
 }
@@ -184,7 +207,7 @@ void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
 
 		if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) {
 			/* Set new reference to point to the original tag */
-			alloc_tag_ref_set(&ref, tag);
+			alloc_tag_ref_set(&ref, tag, folio_nid(folio));
 			update_page_tag_ref(handle, &ref);
 			put_page_tag_ref(handle);
 		}
@@ -247,19 +270,41 @@ static void shutdown_mem_profiling(bool remove_file)
 void __init alloc_tag_sec_init(void)
 {
 	struct alloc_tag *last_codetag;
+	int i;
 
 	if (!mem_profiling_support)
 		return;
 
-	if (!static_key_enabled(&mem_profiling_compressed))
-		return;
-
 	kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name(
 					SECTION_START(ALLOC_TAG_SECTION_NAME));
 	last_codetag = (struct alloc_tag *)kallsyms_lookup_name(
 					SECTION_STOP(ALLOC_TAG_SECTION_NAME));
 	kernel_tags.count = last_codetag - kernel_tags.first_tag;
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
+	pcpu_counters_num = num_possible_nodes();
+#else
+	pcpu_counters_num = 1;
+#endif
+
+	pcpu_counters_size = pcpu_counters_num * sizeof(struct alloc_tag_counters);
+	for (i = 0; i < kernel_tags.count; i++) {
+		/* Each CPU has one alloc_tag_counters per numa node */
+		kernel_tags.first_tag[i].counters =
+			pcpu_alloc_noprof(pcpu_counters_size,
+					  sizeof(struct alloc_tag_counters),
+					  false, GFP_KERNEL | __GFP_ZERO);
+		if (!kernel_tags.first_tag[i].counters) {
+			while (--i >= 0)
+				free_percpu(kernel_tags.first_tag[i].counters);
+			pr_info("Failed to allocate per-cpu alloc_tag counters\n");
+			return;
+		}
+	}
+
+	if (!static_key_enabled(&mem_profiling_compressed))
+		return;
+
 	/* Check if kernel tags fit into page flags */
 	if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) {
 		shutdown_mem_profiling(false); /* allocinfo file does not exist yet */
@@ -622,7 +667,9 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag
 	stop_tag = ct_to_alloc_tag(stop);
 	for (tag = start_tag; tag < stop_tag; tag++) {
 		WARN_ON(tag->counters);
-		tag->counters = alloc_percpu(struct alloc_tag_counters);
+		tag->counters = __alloc_percpu_gfp(pcpu_counters_size,
+						   sizeof(struct alloc_tag_counters),
+						   GFP_KERNEL | __GFP_ZERO);
 		if (!tag->counters) {
 			while (--tag >= start_tag) {
 				free_percpu(tag->counters);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90b06f3d004c..8219d8de6f97 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1107,58 +1107,59 @@ void __clear_page_tag_ref(struct page *page)
 /* Should be called only if mem_alloc_profiling_enabled() */
 static noinline
 void __pgalloc_tag_add(struct page *page, struct task_struct *task,
-		       unsigned int nr)
+		       int nid, unsigned int nr)
 {
 	union pgtag_ref_handle handle;
 	union codetag_ref ref;
 
 	if (get_page_tag_ref(page, &ref, &handle)) {
-		alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+		alloc_tag_add(&ref, task->alloc_tag, nid, PAGE_SIZE * nr);
 		update_page_tag_ref(handle, &ref);
 		put_page_tag_ref(handle);
 	}
 }
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
-				   unsigned int nr)
+				   int nid, unsigned int nr)
 {
 	if (mem_alloc_profiling_enabled())
-		__pgalloc_tag_add(page, task, nr);
+		__pgalloc_tag_add(page, task, nid, nr);
 }
 
 /* Should be called only if mem_alloc_profiling_enabled() */
 static noinline
-void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+void __pgalloc_tag_sub(struct page *page, int nid, unsigned int nr)
 {
 	union pgtag_ref_handle handle;
 	union codetag_ref ref;
 
 	if (get_page_tag_ref(page, &ref, &handle)) {
-		alloc_tag_sub(&ref, PAGE_SIZE * nr);
+		alloc_tag_sub(&ref, nid, PAGE_SIZE * nr);
 		update_page_tag_ref(handle, &ref);
 		put_page_tag_ref(handle);
 	}
 }
 
-static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+static inline void pgalloc_tag_sub(struct page *page, int nid, unsigned int nr)
 {
 	if (mem_alloc_profiling_enabled())
-		__pgalloc_tag_sub(page, nr);
+		__pgalloc_tag_sub(page, nid, nr);
 }
 
 /* When tag is not NULL, assuming mem_alloc_profiling_enabled */
-static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag,
+					 int nid, unsigned int nr)
 {
 	if (tag)
-		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+		this_cpu_sub(tag->counters[nid].bytes, PAGE_SIZE * nr);
 }
 
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
-				   unsigned int nr) {}
-static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+				   int nid, unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, int nid, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, int nid, unsigned int nr) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
@@ -1197,7 +1198,7 @@ __always_inline bool free_pages_prepare(struct page *page,
 		/* Do not let hwpoison pages hit pcplists/buddy */
 		reset_page_owner(page, order);
 		page_table_check_free(page, order);
-		pgalloc_tag_sub(page, 1 << order);
+		pgalloc_tag_sub(page, page_to_nid(page), 1 << order);
 
 		/*
 		 * The page is isolated and accounted for.
@@ -1251,7 +1252,7 @@ __always_inline bool free_pages_prepare(struct page *page,
 	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	reset_page_owner(page, order);
 	page_table_check_free(page, order);
-	pgalloc_tag_sub(page, 1 << order);
+	pgalloc_tag_sub(page, page_to_nid(page), 1 << order);
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),
@@ -1707,7 +1708,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 
 	set_page_owner(page, order, gfp_flags);
 	page_table_check_alloc(page, order);
-	pgalloc_tag_add(page, current, 1 << order);
+	pgalloc_tag_add(page, current, page_to_nid(page), 1 << order);
 }
 
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -5064,7 +5065,7 @@ static void ___free_pages(struct page *page, unsigned int order,
 	if (put_page_testzero(page))
 		__free_frozen_pages(page, order, fpi_flags);
 	else if (!head) {
-		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+		pgalloc_tag_sub_pages(tag, page_to_nid(page), (1 << order) - 1);
 		while (order-- > 0)
 			__free_frozen_pages(page + (1 << order), order,
 					    fpi_flags);
diff --git a/mm/percpu.c b/mm/percpu.c
index b35494c8ede2..130450e9718e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1691,15 +1691,19 @@ static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
 				      size_t size)
 {
 	if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) {
+		/* For percpu allocation, store all alloc_tag stats on numa node 0 */
 		alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
-			      current->alloc_tag, size);
+			      current->alloc_tag, 0, size);
+		if (current->alloc_tag)
+			current->alloc_tag->ct.flags |= CODETAG_PERCPU_ALLOC;
 	}
 }
 
 static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
 {
+	/* percpu alloc_tag stats is stored on numa node 0 so subtract from node 0 */
 	if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts))
-		alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
+		alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, 0, size);
 }
 #else
 static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 03e8d968fd1a..b2ff55afb3cc 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de>
  */
 
+#include <linux/alloc_tag.h>
 #include <linux/blkdev.h>
 #include <linux/cma.h>
 #include <linux/cpuset.h>
@@ -428,6 +429,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 		nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false);
 		if (nr) {
 			pr_notice("Memory allocations:\n");
+			pr_notice("<size> <calls> <tag info>\n");
 			for (i = 0; i < nr; i++) {
 				struct codetag *ct = tags[i].ct;
 				struct alloc_tag *tag = ct_to_alloc_tag(ct);
@@ -435,16 +437,27 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 				char bytes[10];
 
 				string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes));
-
 				/* Same as alloc_tag_to_text() but w/o intermediate buffer */
 				if (ct->modname)
-					pr_notice("%12s %8llu %s:%u [%s] func:%s\n",
-						  bytes, counter.calls, ct->filename,
-						  ct->lineno, ct->modname, ct->function);
+					pr_notice("%-12s %-8llu %s:%u [%s] func:%s\n",
+						bytes, counter.calls, ct->filename,
+						ct->lineno, ct->modname, ct->function);
 				else
-					pr_notice("%12s %8llu %s:%u func:%s\n",
-						  bytes, counter.calls, ct->filename,
-						  ct->lineno, ct->function);
+					pr_notice("%-12s %-8llu %s:%u func:%s\n",
+						bytes, counter.calls,
+						ct->filename, ct->lineno, ct->function);
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
+				int nid;
+
+				for (nid = 0; nid < pcpu_counters_num; nid++) {
+					counter = alloc_tag_read_nid(tag, nid);
+					string_get_size(counter.bytes, 1, STRING_UNITS_2,
+							bytes, sizeof(bytes));
+					pr_notice("        nid%-5u %-12lld %-8lld\n",
+						  nid, counter.bytes, counter.calls);
+				}
+#endif
 			}
 		}
 	}
diff --git a/mm/slub.c b/mm/slub.c
index be8b09e09d30..068b88b85d80 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2104,8 +2104,12 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
 	 * If other users appear then mem_alloc_profiling_enabled()
 	 * check should be added before alloc_tag_add().
 	 */
-	if (likely(obj_exts))
-		alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
+	if (likely(obj_exts)) {
+		struct page *page = virt_to_page(object);
+
+		alloc_tag_add(&obj_exts->ref, current->alloc_tag,
+				page_to_nid(page), s->size);
+	}
 }
 
 static inline void
@@ -2133,8 +2137,9 @@ __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p
 
 	for (i = 0; i < objects; i++) {
 		unsigned int off = obj_to_index(s, slab, p[i]);
+		struct page *page = virt_to_page(p[i]);
 
-		alloc_tag_sub(&obj_exts[off].ref, s->size);
+		alloc_tag_sub(&obj_exts[off].ref, page_to_nid(page), s->size);
 	}
 }
 
-- 
2.34.1




* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-06-10 23:30 [PATCH] alloc_tag: add per-NUMA node stats Casey Chen
@ 2025-06-11  1:21 ` Andrew Morton
  2025-06-11  1:33   ` Casey Chen
  2025-06-11  3:41   ` Kent Overstreet
  2025-06-12  5:36 ` David Wang
  2025-06-18 22:16 ` Kent Overstreet
  2 siblings, 2 replies; 15+ messages in thread
From: Andrew Morton @ 2025-06-11  1:21 UTC (permalink / raw)
  To: Casey Chen
  Cc: surenb, kent.overstreet, corbet, dennis, tj, cl, vbabka, mhocko,
	jackmanb, hannes, ziy, rientjes, roman.gushchin, harry.yoo,
	linux-mm, linux-kernel, linux-doc, yzhong

On Tue, 10 Jun 2025 17:30:53 -0600 Casey Chen <cachen@purestorage.com> wrote:

> Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> Previously, each alloc_tag had a single set of counters (bytes and
> calls), aggregated across all CPUs. With this change, each CPU can
> maintain separate counters for each NUMA node, allowing finer-grained
> memory allocation profiling.
> 
> This feature is controlled by the new
> CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> 
> * When enabled (=y), the output includes per-node statistics following
>   the total bytes/calls:
> 
> <size> <calls> <tag info>
> ...
> 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
>         nid0     94912        2966
>         nid1     220544       6892
> 7680         60       mm/dmapool.c:254 func:dma_pool_create
>         nid0     4224         33
>         nid1     3456         27
> 
> * When disabled (=n), the output remains unchanged:
> <size> <calls> <tag info>
> ...
> 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> 7680         60       mm/dmapool.c:254 func:dma_pool_create
> 
> To minimize memory overhead, per-NUMA stats counters are dynamically
> allocated using the percpu allocator. PERCPU_DYNAMIC_RESERVE has been
> increased to ensure sufficient space for in-kernel alloc_tag counters.
> 
> For in-kernel alloc_tag instances, pcpu_alloc_noprof() is used to
> allocate counters. These allocations are excluded from the profiling
> statistics themselves.

What is glaringly missing here is "why".

What is the use case?  Why does Linux want this?  What benefit does
this bring to our users?  This is the most important part of the
changelog because it tells Andrew why he is even looking at this patch.


Probably related to the above omission: why per-nid?  It would be more
flexible to present the per-cpu counts and let userspace aggregate that
into per-node info if that is desirable.

>
> ...
>
> --- a/include/linux/alloc_tag.h
> +++ b/include/linux/alloc_tag.h
> @@ -15,6 +15,8 @@
>  #include <linux/static_key.h>
>  #include <linux/irqflags.h>
>  
> +extern int pcpu_counters_num;

This globally-visible variable's identifier is too generic - the name
should communicate which subsystem the variable belongs to.  Perhaps
alloc_tag_num_pcpu_counters?  It's long, but only used in a few places.

In fact, it's a count-of-nodes so a better name would be alloc_tag_num_nodes.

Also, as it's written to a single time, __read_mostly is appropriate.
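
I.e. something like this (sketch):

	int alloc_tag_num_nodes __read_mostly;

in lib/alloc_tag.c, with a matching extern declaration in alloc_tag.h.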

>  struct alloc_tag_counters {
>  	u64 bytes;
>  	u64 calls;
> @@ -134,16 +136,34 @@ static inline bool mem_alloc_profiling_enabled(void)
>  				   &mem_alloc_profiling_key);
>  }
>  
> +static inline struct alloc_tag_counters alloc_tag_read_nid(struct alloc_tag *tag, int nid)
> +{
> +	struct alloc_tag_counters v = { 0, 0 };
> +	struct alloc_tag_counters *counters;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {

for_each_possible_cpu() is lame - potentially much more expensive than
for_each_online_cpu.  Is it impractical to use for_each_online_cpu()?

Probably doesn't matter for a userspace displaying function but
userspace can do weird and unexpected things.

> +		counters = per_cpu_ptr(tag->counters, cpu);
> +		v.bytes += counters[nid].bytes;
> +		v.calls += counters[nid].calls;
> +	}
> +
> +	return v;
> +}
> +
>
> ...
>
>  static int allocinfo_show(struct seq_file *m, void *arg)
>  {
>  	struct allocinfo_private *priv = (struct allocinfo_private *)arg;
> @@ -116,6 +136,9 @@ static int allocinfo_show(struct seq_file *m, void *arg)
>  		priv->print_header = false;
>  	}
>  	alloc_tag_to_text(&buf, priv->iter.ct);
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
> +	alloc_tag_to_text_all_nids(&buf, priv->iter.ct);
> +#endif

We can eliminate the ifdef by adding

#else
static inline void alloc_tag_to_text_all_nids(struct seq_buf *out, struct codetag *ct)
{
}
#endif

above.

> +static void alloc_tag_to_text_all_nids(struct seq_buf *out, struct codetag *ct)

>  	seq_commit(m, seq_buf_used(&buf));
>  	return 0;
>  }
>
> ...
>
> @@ -247,19 +270,41 @@ static void shutdown_mem_profiling(bool remove_file)
>  void __init alloc_tag_sec_init(void)
>  {
>  	struct alloc_tag *last_codetag;
> +	int i;
>  
>  	if (!mem_profiling_support)
>  		return;
>  
> -	if (!static_key_enabled(&mem_profiling_compressed))
> -		return;
> -
>  	kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name(
>  					SECTION_START(ALLOC_TAG_SECTION_NAME));
>  	last_codetag = (struct alloc_tag *)kallsyms_lookup_name(
>  					SECTION_STOP(ALLOC_TAG_SECTION_NAME));
>  	kernel_tags.count = last_codetag - kernel_tags.first_tag;
>  
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
> +	pcpu_counters_num = num_possible_nodes();
> +#else
> +	pcpu_counters_num = 1;
> +#endif

In the CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS=n case, let's make
pcpu_counters_num a constant "1", visible to all compilation units. 

That way the compiler can optimize away all the

	for (nid = 0; nid < pcpu_counters_num; nid++)

looping.
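
E.g. something along these lines in alloc_tag.h (sketch, untested):

#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
extern int pcpu_counters_num;
#else
#define pcpu_counters_num	1
#endif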

> +	pcpu_counters_size = pcpu_counters_num * sizeof(struct alloc_tag_counters);
> 
> +	for (i = 0; i < kernel_tags.count; i++) {
> +		/* Each CPU has one alloc_tag_counters per numa node */
> +		kernel_tags.first_tag[i].counters =
> +			pcpu_alloc_noprof(pcpu_counters_size,
> +					  sizeof(struct alloc_tag_counters),
> +					  false, GFP_KERNEL | __GFP_ZERO);
> +		if (!kernel_tags.first_tag[i].counters) {
> +			while (--i >= 0)
> +				free_percpu(kernel_tags.first_tag[i].counters);
> +			pr_info("Failed to allocate per-cpu alloc_tag counters\n");

pr_err(), methinks.

> +			return;

And now what happens.  Will the kernel even work?

This code path is untestable unless the developer jumps through hoops
and it will never be tested again.

We assume that __init-time allocations always succeed, so a hearty
panic() here would be OK.

> +		}
> +	}
> +
> +	if (!static_key_enabled(&mem_profiling_compressed))
> +		return;
> +
>  	/* Check if kernel tags fit into page flags */
>  	if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) {
>  		shutdown_mem_profiling(false); /* allocinfo file does not exist yet */
> @@ -622,7 +667,9 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag
>  	stop_tag = ct_to_alloc_tag(stop);
>  	for (tag = start_tag; tag < stop_tag; tag++) {
>  		WARN_ON(tag->counters);
> -		tag->counters = alloc_percpu(struct alloc_tag_counters);
> +		tag->counters = __alloc_percpu_gfp(pcpu_counters_size,
> +						   sizeof(struct alloc_tag_counters),
> +						   GFP_KERNEL | __GFP_ZERO);
>  		if (!tag->counters) {
>  			while (--tag >= start_tag) {
>  				free_percpu(tag->counters);

Ditto here, actually.

Not that it matters much.  It's init.text and gets thrown away, shrug.

> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
>
> ...
>
> @@ -428,6 +429,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>  		nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false);
>  		if (nr) {
>  			pr_notice("Memory allocations:\n");
> +			pr_notice("<size> <calls> <tag info>\n");
>  			for (i = 0; i < nr; i++) {
>  				struct codetag *ct = tags[i].ct;
>  				struct alloc_tag *tag = ct_to_alloc_tag(ct);
> @@ -435,16 +437,27 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>  				char bytes[10];
>  
>  				string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes));
> -
>  				/* Same as alloc_tag_to_text() but w/o intermediate buffer */
>  				if (ct->modname)
> -					pr_notice("%12s %8llu %s:%u [%s] func:%s\n",
> -						  bytes, counter.calls, ct->filename,
> -						  ct->lineno, ct->modname, ct->function);
> +					pr_notice("%-12s %-8llu %s:%u [%s] func:%s\n",
> +						bytes, counter.calls, ct->filename,
> +						ct->lineno, ct->modname, ct->function);
>  				else
> -					pr_notice("%12s %8llu %s:%u func:%s\n",
> -						  bytes, counter.calls, ct->filename,
> -						  ct->lineno, ct->function);
> +					pr_notice("%-12s %-8llu %s:%u func:%s\n",
> +						bytes, counter.calls,
> +						ct->filename, ct->lineno, ct->function);
> +
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
> +				int nid;

C99 definition.

> +				for (nid = 0; nid < pcpu_counters_num; nid++) {

If we're going to use C99 (is OK now) then it's better to go all the
way and give `i' loop scope.  "for (int i..".

> +					counter = alloc_tag_read_nid(tag, nid);
> +					string_get_size(counter.bytes, 1, STRING_UNITS_2,
> +							bytes, sizeof(bytes));
> +					pr_notice("        nid%-5u %-12lld %-8lld\n",
> +						  nid, counter.bytes, counter.calls);
> +				}
> +#endif
>  			}
>  		}
>  	}
>
> ...
>



* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-06-11  1:21 ` Andrew Morton
@ 2025-06-11  1:33   ` Casey Chen
  2025-06-11  3:47     ` Kent Overstreet
  2025-06-11  3:41   ` Kent Overstreet
  1 sibling, 1 reply; 15+ messages in thread
From: Casey Chen @ 2025-06-11  1:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: surenb, kent.overstreet, corbet, dennis, tj, cl, vbabka, mhocko,
	jackmanb, hannes, ziy, rientjes, roman.gushchin, harry.yoo,
	linux-mm, linux-kernel, linux-doc, yzhong

On Tue, Jun 10, 2025 at 6:21 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Tue, 10 Jun 2025 17:30:53 -0600 Casey Chen <cachen@purestorage.com> wrote:
>
> > Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> > Previously, each alloc_tag had a single set of counters (bytes and
> > calls), aggregated across all CPUs. With this change, each CPU can
> > maintain separate counters for each NUMA node, allowing finer-grained
> > memory allocation profiling.
> >
> > This feature is controlled by the new
> > CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> >
> > * When enabled (=y), the output includes per-node statistics following
> >   the total bytes/calls:
> >
> > <size> <calls> <tag info>
> > ...
> > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> >         nid0     94912        2966
> >         nid1     220544       6892
> > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> >         nid0     4224         33
> >         nid1     3456         27
> >
> > * When disabled (=n), the output remains unchanged:
> > <size> <calls> <tag info>
> > ...
> > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> >
> > To minimize memory overhead, per-NUMA stats counters are dynamically
> > allocated using the percpu allocator. PERCPU_DYNAMIC_RESERVE has been
> > increased to ensure sufficient space for in-kernel alloc_tag counters.
> >
> > For in-kernel alloc_tag instances, pcpu_alloc_noprof() is used to
> > allocate counters. These allocations are excluded from the profiling
> > statistics themselves.
>
> What is glaringly missing here is "why".
>
> What is the use case?  Why does Linux want this?  What benefit does
> this bring to our users?  This is the most important part of the
> changelog because it tells Andrew why he is even looking at this patch.
>
>
> Probably related to the above omission: why per-nid?  It would be more
> flexible to present the per-cpu counts and let userspace aggregate that
> into per-node info if that is desirable.
>

Hi Andrew,

Thanks for taking the time to review my patch. Sorry I didn't include
you in the previous conversation. See
https://lore.kernel.org/all/CAJuCfpHhSUhxer-6MP3503w6520YLfgBTGp7Q9Qm9kgN4TNsfw@mail.gmail.com/T/#u
It includes some of the motivation and people's opinions. You can take
a look while I address your comments ASAP.
Basically, we want to know whether there is any NUMA imbalance in
memory allocation. From there we could optimize our system based on
the per-NUMA-node stats.
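
For example, a quick userspace helper (hypothetical, not part of the
patch) could scan /proc/allocinfo as produced with
CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS=y and print the tags whose
per-node byte counts diverge:

/* Hypothetical example, not part of the patch: flag allocation tags
 * whose largest per-node byte count is more than twice the smallest. */
#include <stdio.h>

int main(void)
{
	char line[512], tag[512] = "";
	long long min = -1, max = -1;
	FILE *f = fopen("/proc/allocinfo", "r");

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		long long bytes, calls;
		unsigned int nid;

		/* per-node lines look like "        nid0     94912       2966" */
		if (sscanf(line, " nid%u %lld %lld", &nid, &bytes, &calls) == 3) {
			if (min < 0 || bytes < min)
				min = bytes;
			if (bytes > max)
				max = bytes;
			continue;
		}
		/* anything else starts a new tag; report the previous one */
		if (min >= 0 && max > 2 * min)
			printf("imbalanced (min %lld, max %lld): %s", min, max, tag);
		snprintf(tag, sizeof(tag), "%s", line);
		min = max = -1;
	}
	if (min >= 0 && max > 2 * min)
		printf("imbalanced (min %lld, max %lld): %s", min, max, tag);
	fclose(f);
	return 0;
}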

> >
> > ...
> >
> > --- a/include/linux/alloc_tag.h
> > +++ b/include/linux/alloc_tag.h
> > @@ -15,6 +15,8 @@
> >  #include <linux/static_key.h>
> >  #include <linux/irqflags.h>
> >
> > +extern int pcpu_counters_num;
>
> This globally-visible variable's identifier is too generic - the name
> should communicate which subsystem the variable belongs to.  Perhaps
> alloc_tag_num_pcpu_counters?  It's long, but only used in a few places.
>
> In fact, it's a count-of-nodes so a better name would be alloc_tag_num_nodes.
>
> Also, as it's written to a single time, __read_mostly is appropriate.
>
> >  struct alloc_tag_counters {
> >       u64 bytes;
> >       u64 calls;
> > @@ -134,16 +136,34 @@ static inline bool mem_alloc_profiling_enabled(void)
> >                                  &mem_alloc_profiling_key);
> >  }
> >
> > +static inline struct alloc_tag_counters alloc_tag_read_nid(struct alloc_tag *tag, int nid)
> > +{
> > +     struct alloc_tag_counters v = { 0, 0 };
> > +     struct alloc_tag_counters *counters;
> > +     int cpu;
> > +
> > +     for_each_possible_cpu(cpu) {
>
> for_each_possible_cpu() is lame - potentially much more expensive than
> for_each_online_cpu.  Is it impractical to use for_each_online_cpu()?
>
> Probably doesn't matter for a userspace displaying function but
> userspace can do weird and unexpected things.
>
> > +             counters = per_cpu_ptr(tag->counters, cpu);
> > +             v.bytes += counters[nid].bytes;
> > +             v.calls += counters[nid].calls;
> > +     }
> > +
> > +     return v;
> > +}
> > +
> >
> > ...
> >
> >  static int allocinfo_show(struct seq_file *m, void *arg)
> >  {
> >       struct allocinfo_private *priv = (struct allocinfo_private *)arg;
> > @@ -116,6 +136,9 @@ static int allocinfo_show(struct seq_file *m, void *arg)
> >               priv->print_header = false;
> >       }
> >       alloc_tag_to_text(&buf, priv->iter.ct);
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
> > +     alloc_tag_to_text_all_nids(&buf, priv->iter.ct);
> > +#endif
>
> We can eliminate the ifdef by adding
>
> #else
> static inline void alloc_tag_to_text_all_nids(struct seq_buf *out, struct codetag *ct)
> {
> }
> #endif
>
> above.
>
> > +static void alloc_tag_to_text_all_nids(struct seq_buf *out, struct codetag *ct)
>
> >       seq_commit(m, seq_buf_used(&buf));
> >       return 0;
> >  }
> >
> > ...
> >
> > @@ -247,19 +270,41 @@ static void shutdown_mem_profiling(bool remove_file)
> >  void __init alloc_tag_sec_init(void)
> >  {
> >       struct alloc_tag *last_codetag;
> > +     int i;
> >
> >       if (!mem_profiling_support)
> >               return;
> >
> > -     if (!static_key_enabled(&mem_profiling_compressed))
> > -             return;
> > -
> >       kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name(
> >                                       SECTION_START(ALLOC_TAG_SECTION_NAME));
> >       last_codetag = (struct alloc_tag *)kallsyms_lookup_name(
> >                                       SECTION_STOP(ALLOC_TAG_SECTION_NAME));
> >       kernel_tags.count = last_codetag - kernel_tags.first_tag;
> >
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
> > +     pcpu_counters_num = num_possible_nodes();
> > +#else
> > +     pcpu_counters_num = 1;
> > +#endif
>
> In the CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS=n case, let's make
> pcpu_counters_num a constant "1", visible to all compilation units.
>
> That way the compiler can optimize away all the
>
>         for (nid = 0; nid < pcpu_counters_num; nid++)
>
> looping.
>
> > +     pcpu_counters_size = pcpu_counters_num * sizeof(struct alloc_tag_counters);
> >
> > +     for (i = 0; i < kernel_tags.count; i++) {
> > +             /* Each CPU has one alloc_tag_counters per numa node */
> > +             kernel_tags.first_tag[i].counters =
> > +                     pcpu_alloc_noprof(pcpu_counters_size,
> > +                                       sizeof(struct alloc_tag_counters),
> > +                                       false, GFP_KERNEL | __GFP_ZERO);
> > +             if (!kernel_tags.first_tag[i].counters) {
> > +                     while (--i >= 0)
> > +                             free_percpu(kernel_tags.first_tag[i].counters);
> > +                     pr_info("Failed to allocate per-cpu alloc_tag counters\n");
>
> pr_err(), methinks.
>
> > +                     return;
>
> And now what happens.  Will the kernel even work?
>
> This code path is untestable unless the developer jumps through hoops
> and it will never be tested again.
>
> We assume that __init-time allocations always succeed, so a hearty
> panic() here would be OK.
>
> > +             }
> > +     }
> > +
> > +     if (!static_key_enabled(&mem_profiling_compressed))
> > +             return;
> > +
> >       /* Check if kernel tags fit into page flags */
> >       if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) {
> >               shutdown_mem_profiling(false); /* allocinfo file does not exist yet */
> > @@ -622,7 +667,9 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag
> >       stop_tag = ct_to_alloc_tag(stop);
> >       for (tag = start_tag; tag < stop_tag; tag++) {
> >               WARN_ON(tag->counters);
> > -             tag->counters = alloc_percpu(struct alloc_tag_counters);
> > +             tag->counters = __alloc_percpu_gfp(pcpu_counters_size,
> > +                                                sizeof(struct alloc_tag_counters),
> > +                                                GFP_KERNEL | __GFP_ZERO);
> >               if (!tag->counters) {
> >                       while (--tag >= start_tag) {
> >                               free_percpu(tag->counters);
>
> Ditto here, actually.
>
> Not that it matters much.  It's init.text and gets thrown away, shrug.
>
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> >
> > ...
> >
> > @@ -428,6 +429,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> >               nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false);
> >               if (nr) {
> >                       pr_notice("Memory allocations:\n");
> > +                     pr_notice("<size> <calls> <tag info>\n");
> >                       for (i = 0; i < nr; i++) {
> >                               struct codetag *ct = tags[i].ct;
> >                               struct alloc_tag *tag = ct_to_alloc_tag(ct);
> > @@ -435,16 +437,27 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> >                               char bytes[10];
> >
> >                               string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes));
> > -
> >                               /* Same as alloc_tag_to_text() but w/o intermediate buffer */
> >                               if (ct->modname)
> > -                                     pr_notice("%12s %8llu %s:%u [%s] func:%s\n",
> > -                                               bytes, counter.calls, ct->filename,
> > -                                               ct->lineno, ct->modname, ct->function);
> > +                                     pr_notice("%-12s %-8llu %s:%u [%s] func:%s\n",
> > +                                             bytes, counter.calls, ct->filename,
> > +                                             ct->lineno, ct->modname, ct->function);
> >                               else
> > -                                     pr_notice("%12s %8llu %s:%u func:%s\n",
> > -                                               bytes, counter.calls, ct->filename,
> > -                                               ct->lineno, ct->function);
> > +                                     pr_notice("%-12s %-8llu %s:%u func:%s\n",
> > +                                             bytes, counter.calls,
> > +                                             ct->filename, ct->lineno, ct->function);
> > +
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS
> > +                             int nid;
>
> C99 definition.
>
> > +                             for (nid = 0; nid < pcpu_counters_num; nid++) {
>
> If we're going to use C99 (is OK now) then it's better to go all the
> way and give `i' loop scope.  "for (int i..".
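>
> i.e. declare the loop variable in the loop header (sketch, body taken
> from the hunk below):
>
> 			for (int nid = 0; nid < pcpu_counters_num; nid++) {
> 				counter = alloc_tag_read_nid(tag, nid);
> 				string_get_size(counter.bytes, 1, STRING_UNITS_2,
> 						bytes, sizeof(bytes));
> 				pr_notice("        nid%-5u %-12lld %-8lld\n",
> 					  nid, counter.bytes, counter.calls);
> 			}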
>
> > +                                     counter = alloc_tag_read_nid(tag, nid);
> > +                                     string_get_size(counter.bytes, 1, STRING_UNITS_2,
> > +                                                     bytes, sizeof(bytes));
> > +                                     pr_notice("        nid%-5u %-12lld %-8lld\n",
> > +                                               nid, counter.bytes, counter.calls);
> > +                             }
> > +#endif
> >                       }
> >               }
> >       }
> >
> > ...
> >



* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-06-11  1:21 ` Andrew Morton
  2025-06-11  1:33   ` Casey Chen
@ 2025-06-11  3:41   ` Kent Overstreet
  1 sibling, 0 replies; 15+ messages in thread
From: Kent Overstreet @ 2025-06-11  3:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Casey Chen, surenb, corbet, dennis, tj, cl, vbabka, mhocko,
	jackmanb, hannes, ziy, rientjes, roman.gushchin, harry.yoo,
	linux-mm, linux-kernel, linux-doc, yzhong

On Tue, Jun 10, 2025 at 06:21:55PM -0700, Andrew Morton wrote:
> On Tue, 10 Jun 2025 17:30:53 -0600 Casey Chen <cachen@purestorage.com> wrote:
> 
> > Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> > Previously, each alloc_tag had a single set of counters (bytes and
> > calls), aggregated across all CPUs. With this change, each CPU can
> > maintain separate counters for each NUMA node, allowing finer-grained
> > memory allocation profiling.
> > 
> > This feature is controlled by the new
> > CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> > 
> > * When enabled (=y), the output includes per-node statistics following
> >   the total bytes/calls:
> > 
> > <size> <calls> <tag info>
> > ...
> > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> >         nid0     94912        2966
> >         nid1     220544       6892
> > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> >         nid0     4224         33
> >         nid1     3456         27
> > 
> > * When disabled (=n), the output remains unchanged:
> > <size> <calls> <tag info>
> > ...
> > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> > 
> > To minimize memory overhead, per-NUMA stats counters are dynamically
> > allocated using the percpu allocator. PERCPU_DYNAMIC_RESERVE has been
> > increased to ensure sufficient space for in-kernel alloc_tag counters.
> > 
> > For in-kernel alloc_tag instances, pcpu_alloc_noprof() is used to
> > allocate counters. These allocations are excluded from the profiling
> > statistics themselves.
> 
> What is glaringly missing here is "why".
> 
> What is the use case?  Why does Linux want this?  What benefit does
> this bring to our users?  This is the most important part of the
> changelog because it tells Andrew why he is even looking at this patch.

In the last thread, I was pushing to get some input from other people
working on userspace profiling - it seems like that's where we're going
with this, so we need to make sure people working in the same area are
talking to each other, and if we're building tools to export data we
want to make sure it's relevant and useful.

Steven Rostedt sent out some pings, but I don't think we've heard back
from the relevant people, and I do want that to happen before (if) this
goes in.

And more basic than this, now that we've got the ability to map kernel
address -> code that owns it, we should be seeing if that is relevant
and interesting to the tooling people before we get fancier.

Casey also mentioned that this was being used for some internal stuff at
Pure, and I'd still like to hear more about that. If they already know
interesting and cool stuff we can use this for - great, let's please
hear what it is.

But we definitely need some wider context before merging this.

> Probably related to the above omission: why per-nid?  It would be more
> flexible to present the per-cpu counts and let userspace aggregate that
> into per-node info if that is desirable.

Per nid makes more sense to me if this is for profiling, and since this
is a text interface meant primarily for human consumption we don't want
to bloat that excessively.

We can always add multiple interfaces for viewing the same data, and
there's nothing preventing us from adding that later if it turns out
something else does want per-cpu counts.

Per-cpu counts would, I believe, bloat the data structures quite a bit
since the counters are internally percpu - would we then end up with
nr_cpus^2 counters? I hope not :)

> > +static inline struct alloc_tag_counters alloc_tag_read_nid(struct alloc_tag *tag, int nid)
> > +{
> > +	struct alloc_tag_counters v = { 0, 0 };
> > +	struct alloc_tag_counters *counters;
> > +	int cpu;
> > +
> > +	for_each_possible_cpu(cpu) {
> 
> for_each_possible_cpu() is lame - potentially much more expensive than
> for_each_online_cpu.  Is it impractical to use for_each_online_cpu()?
> 
> Probably doesn't matter for a userspace displaying function but
> userspace can do weird and unexpected things.

for_each_online_cpu() means having to think hard about cpu hotplug and
possibly adding callbacks, no? I think simple and dumb is fine here.
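
Just to illustrate the extra machinery that would imply - a rough,
hypothetical sketch, with the actual fold logic left as the hard part:

	static int alloc_tag_cpu_offline(unsigned int cpu)
	{
		/* fold the departing CPU's counters into a surviving
		 * slot so sums over online CPUs stay stable */
		return 0;
	}

	/* somewhere in init */
	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/alloc_tag:online",
			  NULL, alloc_tag_cpu_offline);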

> > +	for (i = 0; i < kernel_tags.count; i++) {
> > +		/* Each CPU has one alloc_tag_counters per numa node */
> > +		kernel_tags.first_tag[i].counters =
> > +			pcpu_alloc_noprof(pcpu_counters_size,
> > +					  sizeof(struct alloc_tag_counters),
> > +					  false, GFP_KERNEL | __GFP_ZERO);
> > +		if (!kernel_tags.first_tag[i].counters) {
> > +			while (--i >= 0)
> > +				free_percpu(kernel_tags.first_tag[i].counters);
> > +			pr_info("Failed to allocate per-cpu alloc_tag counters\n");
> 
> pr_err(), methinks.
> 
> > +			return;
> 
> And now what happens.  Will the kernel even work?
> 
> This code path is untestable unless the developer jumps through hoops
> and it will never be tested again.
> 
> We assume that __init-time allocations always succeed, so a hearty
> panic() here would be OK.

percpu allocations are more limited than normal allocations, and we did
hit that when developing memory allocation profiling, so - maybe WARN()?
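
Something like this, reusing the cleanup loop from the patch (untested):

	if (WARN(!kernel_tags.first_tag[i].counters,
		 "failed to allocate per-cpu alloc_tag counters\n")) {
		while (--i >= 0)
			free_percpu(kernel_tags.first_tag[i].counters);
		return;
	}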

> > +				int nid;
> 
> C99 definition.
> 
> > +				for (nid = 0; nid < pcpu_counters_num; nid++) {
> 
> If we're going to use C99 (is OK now) then it's better to go all the
> way and give `i' loop scope.  "for (int i..".

Yes please; variable reuse can lead to some nasties.

Going full C99 has its own problems (you can't do it after labels, and
only one of gcc or clang complains and I forget which - perpetual
headache).

But C99 for loops are a total no brainer.



* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-06-11  1:33   ` Casey Chen
@ 2025-06-11  3:47     ` Kent Overstreet
  0 siblings, 0 replies; 15+ messages in thread
From: Kent Overstreet @ 2025-06-11  3:47 UTC (permalink / raw)
  To: Casey Chen
  Cc: Andrew Morton, surenb, corbet, dennis, tj, cl, vbabka, mhocko,
	jackmanb, hannes, ziy, rientjes, roman.gushchin, harry.yoo,
	linux-mm, linux-kernel, linux-doc, yzhong

On Tue, Jun 10, 2025 at 06:33:58PM -0700, Casey Chen wrote:
> On Tue, Jun 10, 2025 at 6:21 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Tue, 10 Jun 2025 17:30:53 -0600 Casey Chen <cachen@purestorage.com> wrote:
> >
> > > Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> > > Previously, each alloc_tag had a single set of counters (bytes and
> > > calls), aggregated across all CPUs. With this change, each CPU can
> > > maintain separate counters for each NUMA node, allowing finer-grained
> > > memory allocation profiling.
> > >
> > > This feature is controlled by the new
> > > CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> > >
> > > * When enabled (=y), the output includes per-node statistics following
> > >   the total bytes/calls:
> > >
> > > <size> <calls> <tag info>
> > > ...
> > > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> > >         nid0     94912        2966
> > >         nid1     220544       6892
> > > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> > >         nid0     4224         33
> > >         nid1     3456         27
> > >
> > > * When disabled (=n), the output remains unchanged:
> > > <size> <calls> <tag info>
> > > ...
> > > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> > > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> > >
> > > To minimize memory overhead, per-NUMA stats counters are dynamically
> > > allocated using the percpu allocator. PERCPU_DYNAMIC_RESERVE has been
> > > increased to ensure sufficient space for in-kernel alloc_tag counters.
> > >
> > > For in-kernel alloc_tag instances, pcpu_alloc_noprof() is used to
> > > allocate counters. These allocations are excluded from the profiling
> > > statistics themselves.
> >
> > What is glaringly missing here is "why".
> >
> > What is the use case?  Why does Linux want this?  What benefit does
> > this bring to our users?  This is the most important part of the
> > changelog because it tells Andrew why he is even looking at this patch.
> >
> >
> > Probably related to the above omission: why per-nid?  It would be more
> > flexible to present the per-cpu counts and let userspace aggregate that
> > into per-node info if that is desirable.
> >
> 
> Hi Andrew,
> 
> Thanks for taking the time to review my patch. Sorry I didn't include you
> in the previous conversation. See
> https://lore.kernel.org/all/CAJuCfpHhSUhxer-6MP3503w6520YLfgBTGp7Q9Qm9kgN4TNsfw@mail.gmail.com/T/#u

It's good practice to add lore links to any and all previous discussion
to the commit message for the latest patch, like so:

Link: https://lore.kernel.org/all/CAJuCfpHhSUhxer-6MP3503w6520YLfgBTGp7Q9Qm9kgN4TNsfw@mail.gmail.com/T/#u

Make sure to give as much context as possible - and your commit
message should always include _rationale_ - none of us can keep up with
everything :)



* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-06-10 23:30 [PATCH] alloc_tag: add per-NUMA node stats Casey Chen
  2025-06-11  1:21 ` Andrew Morton
@ 2025-06-12  5:36 ` David Wang
  2025-06-12 15:37   ` Kent Overstreet
  2025-06-18 22:16 ` Kent Overstreet
  2 siblings, 1 reply; 15+ messages in thread
From: David Wang @ 2025-06-12  5:36 UTC (permalink / raw)
  To: cachen
  Cc: akpm, cl, corbet, dennis, hannes, harry.yoo, jackmanb,
	kent.overstreet, linux-doc, linux-kernel, linux-mm, mhocko,
	rientjes, roman.gushchin, surenb, tj, vbabka, yzhong, ziy

Hi, 

On Tue, 10 Jun 2025 17:30:53 -0600 Casey Chen <cachen@purestorage.com> wrote:
> Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> Previously, each alloc_tag had a single set of counters (bytes and
> calls), aggregated across all CPUs. With this change, each CPU can
> maintain separate counters for each NUMA node, allowing finer-grained
> memory allocation profiling.
> 
> This feature is controlled by the new
> CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> 
> * When enabled (=y), the output includes per-node statistics following
>   the total bytes/calls:
> 
> <size> <calls> <tag info>
> ...
> 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
>         nid0     94912        2966
>         nid1     220544       6892
> 7680         60       mm/dmapool.c:254 func:dma_pool_create
>         nid0     4224         33
>         nid1     3456         27
> 
> * When disabled (=n), the output remains unchanged:
> <size> <calls> <tag info>
> ...
> 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> 7680         60       mm/dmapool.c:254 func:dma_pool_create
> 
> To minimize memory overhead, per-NUMA stats counters are dynamically
> allocated using the percpu allocator. PERCPU_DYNAMIC_RESERVE has been
> increased to ensure sufficient space for in-kernel alloc_tag counters.
> 
> For in-kernel alloc_tag instances, pcpu_alloc_noprof() is used to
> allocate counters. These allocations are excluded from the profiling
> statistics themselves.

Considering NUMA balance, I have two questions:
1. Do we need the granularity of calling sites?
Call-site granularity is what we need to identify a possible memory leak,
or a place where memory usage can be optimized.
But for a NUMA imbalance the calling site is usually *innocent*; the clue
normally lies in the CPU making the allocation, the memory interface, etc.
The point is: when a NUMA imbalance happens, can it be fixed by adjusting the calling sites?
Isn't <cpu, memory interface/slab name, numa id> enough to be used as a key for
NUMA stats analysis?
Is there an example showing why the calling-site information is needed and
how it helps?

2. There are other factors affecting NUMA balance, e.g. cgroup usage.
Could we collect that information at calling sites as well?



Thanks
David




* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-06-12  5:36 ` David Wang
@ 2025-06-12 15:37   ` Kent Overstreet
  0 siblings, 0 replies; 15+ messages in thread
From: Kent Overstreet @ 2025-06-12 15:37 UTC (permalink / raw)
  To: David Wang
  Cc: cachen, akpm, cl, corbet, dennis, hannes, harry.yoo, jackmanb,
	linux-doc, linux-kernel, linux-mm, mhocko, rientjes,
	roman.gushchin, surenb, tj, vbabka, yzhong, ziy

On Thu, Jun 12, 2025 at 01:36:05PM +0800, David Wang wrote:
> Hi, 
> 
> On Tue, 10 Jun 2025 17:30:53 -0600 Casey Chen <cachen@purestorage.com> wrote:
> > Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> > Previously, each alloc_tag had a single set of counters (bytes and
> > calls), aggregated across all CPUs. With this change, each CPU can
> > maintain separate counters for each NUMA node, allowing finer-grained
> > memory allocation profiling.
> > 
> > This feature is controlled by the new
> > CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> > 
> > * When enabled (=y), the output includes per-node statistics following
> >   the total bytes/calls:
> > 
> > <size> <calls> <tag info>
> > ...
> > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> >         nid0     94912        2966
> >         nid1     220544       6892
> > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> >         nid0     4224         33
> >         nid1     3456         27
> > 
> > * When disabled (=n), the output remains unchanged:
> > <size> <calls> <tag info>
> > ...
> > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> > 
> > To minimize memory overhead, per-NUMA stats counters are dynamically
> > allocated using the percpu allocator. PERCPU_DYNAMIC_RESERVE has been
> > increased to ensure sufficient space for in-kernel alloc_tag counters.
> > 
> > For in-kernel alloc_tag instances, pcpu_alloc_noprof() is used to
> > allocate counters. These allocations are excluded from the profiling
> > statistics themselves.
> 
> Considering NUMA balance, I have two questions:
> 1. Do we need the granularity of calling sites?
> Call-site granularity is what we need to identify a possible memory leak,
> or a place where memory usage can be optimized.
> But for a NUMA imbalance the calling site is usually *innocent*; the clue
> normally lies in the CPU making the allocation, the memory interface, etc.
> The point is: when a NUMA imbalance happens, can it be fixed by adjusting the calling sites?
> Isn't <cpu, memory interface/slab name, numa id> enough to be used as a key for
> NUMA stats analysis?

kmalloc_node().

Per callsite is the right granularity.
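
For example, a callsite that deliberately spreads its allocations across
nodes - hypothetical driver snippet, just to show why the callsite is the
interesting key:

	int nid;

	for_each_online_node(nid)
		pool[nid] = kmalloc_node(pool_size, GFP_KERNEL, nid);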

But AFAIK correlating profiling information with the allocation is still
an entirely manual process, so that's the part I'm interested in right
now.

Under the hood memory allocation profiling gives you the ability to map
any specific allocation to the line of code that owns it - that is, map
kernel virtual address to codetag.

But I don't know if perf collects _data_ addresses on cache misses. Does
anyone?



* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-06-10 23:30 [PATCH] alloc_tag: add per-NUMA node stats Casey Chen
  2025-06-11  1:21 ` Andrew Morton
  2025-06-12  5:36 ` David Wang
@ 2025-06-18 22:16 ` Kent Overstreet
  2025-07-08 21:52   ` David Rientjes
  2 siblings, 1 reply; 15+ messages in thread
From: Kent Overstreet @ 2025-06-18 22:16 UTC (permalink / raw)
  To: Casey Chen
  Cc: akpm, surenb, corbet, dennis, tj, cl, vbabka, mhocko, jackmanb,
	hannes, ziy, rientjes, roman.gushchin, harry.yoo, linux-mm,
	linux-kernel, linux-doc, yzhong

On Tue, Jun 10, 2025 at 05:30:53PM -0600, Casey Chen wrote:
> Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> Previously, each alloc_tag had a single set of counters (bytes and
> calls), aggregated across all CPUs. With this change, each CPU can
> maintain separate counters for each NUMA node, allowing finer-grained
> memory allocation profiling.
> 
> This feature is controlled by the new
> CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> 
> * When enabled (=y), the output includes per-node statistics following
>   the total bytes/calls:
> 
> <size> <calls> <tag info>
> ...
> 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
>         nid0     94912        2966
>         nid1     220544       6892
> 7680         60       mm/dmapool.c:254 func:dma_pool_create
>         nid0     4224         33
>         nid1     3456         27

I just received a report of memory reclaim issues where it seems DMA32
is stuffed full.

So naturally, instrumenting to see what's consuming DMA32 is going to be
the first thing to do, which made me think of your patchset.

I wonder if we should think about something a bit more general, so it's
easy to break out accounting different ways depending on what we want to
debug.



* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-06-18 22:16 ` Kent Overstreet
@ 2025-07-08 21:52   ` David Rientjes
  2025-07-08 22:38     ` Christoph Lameter (Ampere)
                       ` (3 more replies)
  0 siblings, 4 replies; 15+ messages in thread
From: David Rientjes @ 2025-07-08 21:52 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: Casey Chen, Andrew Morton, surenb, corbet, dennis, tj, cl,
	Vlastimil Babka, mhocko, jackmanb, hannes, ziy, roman.gushchin,
	harry.yoo, linux-mm, linux-kernel, linux-doc, yzhong,
	Sourav Panda

On Wed, 18 Jun 2025, Kent Overstreet wrote:

> On Tue, Jun 10, 2025 at 05:30:53PM -0600, Casey Chen wrote:
> > Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> > Previously, each alloc_tag had a single set of counters (bytes and
> > calls), aggregated across all CPUs. With this change, each CPU can
> > maintain separate counters for each NUMA node, allowing finer-grained
> > memory allocation profiling.
> > 
> > This feature is controlled by the new
> > CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> > 
> > * When enabled (=y), the output includes per-node statistics following
> >   the total bytes/calls:
> > 
> > <size> <calls> <tag info>
> > ...
> > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> >         nid0     94912        2966
> >         nid1     220544       6892
> > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> >         nid0     4224         33
> >         nid1     3456         27
> 
> I just received a report of memory reclaim issues where it seems DMA32
> is stuffed full.
> 
> So naturally, instrumenting to see what's consuming DMA32 is going to be
> the first thing to do, which made me think of your patchset.
> 
> I wonder if we should think about something a bit more general, so it's
> easy to break out accounting different ways depending on what we want to
> debug.
> 

Right, per-node memory attribution, or per zone, is very useful.

Casey, what's the latest status of your patch?  Using alloc_tag for 
attributing memory overheads has been exceedingly useful for Google Cloud 
and adding per-node breakdown insight to it would be even better.

Our use case is quite simple: we sell guest memory to the customer as 
persistent hugetlb and keep some memory on the host for ourselves (VMM, 
host userspace, host kernel).  We track every page of that overhead memory 
because memory pressure here can cause all sorts of issues like userspace 
unresponsiveness.  We also want to sell as much guest memory as possible 
to avoid stranding cpus.

To do that, per-node breakdown of memory allocations would be a tremendous 
help.  We have memory that is asymmetric for NUMA, even for memory that 
has affinity to the NIC.  Being able to inspect the origins of memory for 
a specific NUMA node that is under memory pressure where other NUMA nodes 
are not under memory pressure would be excellent.

Adding Sourav Panda as well as he may have additional thoughts on this.



* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-07-08 21:52   ` David Rientjes
@ 2025-07-08 22:38     ` Christoph Lameter (Ampere)
  2025-07-09 19:14       ` David Rientjes
  2025-07-08 22:53     ` Casey Chen
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 15+ messages in thread
From: Christoph Lameter (Ampere) @ 2025-07-08 22:38 UTC (permalink / raw)
  To: David Rientjes
  Cc: Kent Overstreet, Casey Chen, Andrew Morton, surenb, corbet,
	dennis, tj, Vlastimil Babka, mhocko, jackmanb, hannes, ziy,
	roman.gushchin, harry.yoo, linux-mm, linux-kernel, linux-doc,
	yzhong, Sourav Panda

On Tue, 8 Jul 2025, David Rientjes wrote:

> Right, per-node memory attribution, or per zone, is very useful.

I thought that was already possible using cgroups? That way you have the
numbers for an application which may be much more useful.




* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-07-08 21:52   ` David Rientjes
  2025-07-08 22:38     ` Christoph Lameter (Ampere)
@ 2025-07-08 22:53     ` Casey Chen
  2025-07-08 23:07     ` Casey Chen
  2025-07-10  5:54     ` Sourav Panda
  3 siblings, 0 replies; 15+ messages in thread
From: Casey Chen @ 2025-07-08 22:53 UTC (permalink / raw)
  To: David Rientjes
  Cc: Kent Overstreet, Andrew Morton, surenb, corbet, dennis, tj, cl,
	Vlastimil Babka, mhocko, jackmanb, hannes, ziy, roman.gushchin,
	harry.yoo, linux-mm, linux-kernel, linux-doc, yzhong,
	Sourav Panda

On Tue, Jul 8, 2025 at 2:52 PM David Rientjes <rientjes@google.com> wrote:

> On Wed, 18 Jun 2025, Kent Overstreet wrote:
>
> > On Tue, Jun 10, 2025 at 05:30:53PM -0600, Casey Chen wrote:
> > > Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> > > Previously, each alloc_tag had a single set of counters (bytes and
> > > calls), aggregated across all CPUs. With this change, each CPU can
> > > maintain separate counters for each NUMA node, allowing finer-grained
> > > memory allocation profiling.
> > >
> > > This feature is controlled by the new
> > > CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> > >
> > > * When enabled (=y), the output includes per-node statistics following
> > >   the total bytes/calls:
> > >
> > > <size> <calls> <tag info>
> > > ...
> > > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> > >         nid0     94912        2966
> > >         nid1     220544       6892
> > > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> > >         nid0     4224         33
> > >         nid1     3456         27
> >
> > I just received a report of memory reclaim issues where it seems DMA32
> > is stuffed full.
> >
> > So naturally, instrumenting to see what's consuming DMA32 is going to be
> > the first thing to do, which made me think of your patchset.
> >
> > I wonder if we should think about something a bit more general, so it's
> > easy to break out accounting different ways depending on what we want to
> > debug.
> >
>
> Right, per-node memory attribution, or per zone, is very useful.
>
> Casey, what's the latest status of your patch?  Using alloc_tag for
> attributing memory overheads has been exceedingly useful for Google Cloud
> and adding per-node breakdown insight to it would be even better.
>
> Our use case is quite simple: we sell guest memory to the customer as
> persistent hugetlb and keep some memory on the host for ourselves (VMM,
> host userspace, host kernel).  We track every page of that overhead memory
> because memory pressure here can cause all sorts of issues like userspace
> unresponsiveness.  We also want to sell as much guest memory as possible
> to avoid stranding cpus.
>
> To do that, per-node breakdown of memory allocations would be a tremendous
> help.  We have memory that is asymmetric for NUMA, even for memory that
> has affinity to the NIC.  Being able to inspect the origins of memory for
> a specific NUMA node that is under memory pressure where other NUMA nodes
> are not under memory pressure would be excellent.
>
> Adding Sourav Panda as well as he may have additional thoughts on this.
>

Thanks for your interest. I have been busy with other team projects and
will get back to this soon. I will address the comments and send out a new
revision for review.




* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-07-08 22:38     ` Christoph Lameter (Ampere)
@ 2025-07-09 19:14       ` David Rientjes
  0 siblings, 0 replies; 15+ messages in thread
From: David Rientjes @ 2025-07-09 19:14 UTC (permalink / raw)
  To: Christoph Lameter (Ampere)
  Cc: Kent Overstreet, Casey Chen, Andrew Morton, surenb, corbet,
	dennis, tj, Vlastimil Babka, mhocko, jackmanb, hannes, ziy,
	roman.gushchin, harry.yoo, linux-mm, linux-kernel, linux-doc,
	yzhong, Sourav Panda

On Tue, 8 Jul 2025, Christoph Lameter (Ampere) wrote:

> > Right, per-node memory attribution, or per zone, is very useful.
> 
> I thought that was already possible using cgroups? That way you have the
> numbers for an application which may be much more useful.
> 
> 

Cgroups can tell us the nature of the memory if it's charged, yes.  Memory 
Allocation Profiling, which this patch series extends, however, provides 
much more detailed insight such as the caller that allocated the memory.  
It's been invaluable to identify memory efficiency savings and now, with 
Casey's patch, can be used to identify where NUMA imbalances exist for the 
same callers.



* Re: [PATCH] alloc_tag: add per-NUMA node stats
  2025-07-08 21:52   ` David Rientjes
                       ` (2 preceding siblings ...)
  2025-07-08 23:07     ` Casey Chen
@ 2025-07-10  5:54     ` Sourav Panda
  3 siblings, 0 replies; 15+ messages in thread
From: Sourav Panda @ 2025-07-10  5:54 UTC (permalink / raw)
  To: David Rientjes
  Cc: Kent Overstreet, Casey Chen, Andrew Morton, surenb, corbet,
	dennis, tj, cl, Vlastimil Babka, mhocko, jackmanb, hannes, ziy,
	roman.gushchin, harry.yoo, linux-mm, linux-kernel, linux-doc,
	yzhong

On Tue, Jul 8, 2025 at 2:52 PM David Rientjes <rientjes@google.com> wrote:
>
> On Wed, 18 Jun 2025, Kent Overstreet wrote:
>
> > On Tue, Jun 10, 2025 at 05:30:53PM -0600, Casey Chen wrote:
> > > Add support for tracking per-NUMA node statistics in /proc/allocinfo.
> > > Previously, each alloc_tag had a single set of counters (bytes and
> > > calls), aggregated across all CPUs. With this change, each CPU can
> > > maintain separate counters for each NUMA node, allowing finer-grained
> > > memory allocation profiling.
> > >
> > > This feature is controlled by the new
> > > CONFIG_MEM_ALLOC_PROFILING_PER_NUMA_STATS option:
> > >
> > > * When enabled (=y), the output includes per-node statistics following
> > >   the total bytes/calls:
> > >
> > > <size> <calls> <tag info>
> > > ...
> > > 315456       9858     mm/dmapool.c:338 func:pool_alloc_page
> > >         nid0     94912        2966
> > >         nid1     220544       6892
> > > 7680         60       mm/dmapool.c:254 func:dma_pool_create
> > >         nid0     4224         33
> > >         nid1     3456         27
> >
> > I just received a report of memory reclaim issues where it seems DMA32
> > is stuffed full.
> >
> > So naturally, instrumenting to see what's consuming DMA32 is going to be
> > the first thing to do, which made me think of your patchset.
> >
> > I wonder if we should think about something a bit more general, so it's
> > easy to break out accounting different ways depending on what we want to
> > debug.
> >
>
> Right, per-node memory attribution, or per zone, is very useful.
>
> Casey, what's the latest status of your patch?  Using alloc_tag for
> attributing memory overheads has been exceedingly useful for Google Cloud
> and adding per-node breakdown insight to it would be even better.
>
> Our use case is quite simple: we sell guest memory to the customer as
> persistent hugetlb and keep some memory on the host for ourselves (VMM,
> host userspace, host kernel).  We track every page of that overhead memory
> because memory pressure here can cause all sorts of issues like userspace
> unresponsiveness.  We also want to sell as much guest memory as possible
> to avoid stranding cpus.
>
> To do that, per-node breakdown of memory allocations would be a tremendous
> help.  We have memory that is asymmetric for NUMA, even for memory that
> has affinity to the NIC.  Being able to inspect the origins of memory for
> a specific NUMA node that is under memory pressure where other NUMA nodes
> are not under memory pressure would be excellent.
>
> Adding Sourav Panda as well as he may have additional thoughts on this.

I agree with David, especially the point regarding NIC affinity. I was
dealing with a similar bug today, pertaining to SSDs, where this
patchset would have helped in the investigation.

That being said, I think pgalloc_tag_swap(), which gets called by
__migrate_folio(), has to be modified as well.
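
Roughly, on migration the per-node accounting would have to follow the
folio to its destination node - conceptual sketch only, written against
the counters layout in this patch rather than the current
pgalloc_tag_swap() internals ('old', 'new' and 'tag' stand in for the
folios being swapped and their tag):

	int old_nid = folio_nid(old);
	int new_nid = folio_nid(new);
	size_t bytes = folio_size(old);

	this_cpu_sub(tag->counters[old_nid].bytes, bytes);
	this_cpu_dec(tag->counters[old_nid].calls);
	this_cpu_add(tag->counters[new_nid].bytes, bytes);
	this_cpu_inc(tag->counters[new_nid].calls);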


