All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
@ 2026-05-06  2:22 Hao Ge
  2026-05-09  0:12 ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: Hao Ge @ 2026-05-06  2:22 UTC (permalink / raw)
  To: Suren Baghdasaryan, Kent Overstreet, Andrew Morton
  Cc: linux-mm, linux-kernel, Hao Ge

Pages allocated before page_ext is available have their codetag left
uninitialized. Track these early PFNs and clear their codetag in
clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
warnings when they are freed later.

Currently a fixed-size array of 8192 entries is used, with a warning if
the limit is exceeded. However, the number of early allocations depends
on the number of CPUs and can be larger than 8192.

Replace the fixed-size array with a dynamically allocated linked list
of pfn_pool structs. Each node is allocated via alloc_page() and mapped
to a pfn_pool containing a next pointer, an atomic slot counter, and a
PFN array that fills the remainder of the page.

The tracking pages themselves are allocated via alloc_page(), which
would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
%__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
so that the early path can skip recording allocations that carry this
flag.

Suggested-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Hao Ge <hao.ge@linux.dev>
---
v5:
- Cache pool->count before the inner loop in clear_early_alloc_pfn_tag_refs(),
  as the value cannot change after synchronize_rcu() (suggested by Suren Baghdasaryan)
- Mask out GFP_ZONEMASK when allocating tracking pages to avoid propagating
  the original allocation's zone constraints to internal bookkeeping memory
  (suggested by Suren Baghdasaryan)

v4:
- Use struct pfn_pool with named fields (next, atomic_t count, pfns[]) mapped
  onto the page body, replacing the page->lru/page->private approach
  (suggested by Suren Baghdasaryan)

v3:
- Simplify linked list: use page->lru for chaining and page->private as
  slot counter, removing the early_pfn_node struct and freelist (suggested
  by Suren Baghdasaryan)
- Pass gfp_flags through alloc_tag_add_early_pfn() but strip
  __GFP_DIRECT_RECLAIM instead of selecting GFP_KERNEL/GFP_ATOMIC,
  because __alloc_tag_add_early_pfn() is invoked under rcu_read_lock().

v2:
- Use cmpxchg to atomically update early_pfn_pages, preventing page leak under concurrent allocation
- Pass gfp_flags through the full call chain and use gfpflags_allow_blocking()
  to select GFP_KERNEL vs GFP_ATOMIC, avoiding unnecessary GFP_ATOMIC in process context
---
 include/linux/alloc_tag.h |   4 +-
 lib/alloc_tag.c           | 147 ++++++++++++++++++++++++--------------
 mm/page_alloc.c           |  12 ++--
 3 files changed, 103 insertions(+), 60 deletions(-)

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 02de2ede560f..068ba2e77c5d 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -163,11 +163,11 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
 {
 	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
 }
-void alloc_tag_add_early_pfn(unsigned long pfn);
+void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
 #else
 static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
 static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
-static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
+static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
 #endif
 
 /* Caller should verify both ref and tag to be valid */
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index ed1bdcf1f8ab..b9ca95d1f506 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -767,60 +767,95 @@ static __init bool need_page_alloc_tagging(void)
  * their codetag uninitialized. Track these early PFNs so we can clear
  * their codetag refs later to avoid warnings when they are freed.
  *
- * Early allocations include:
- *   - Base allocations independent of CPU count
- *   - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
- *     such as trace ring buffers, scheduler per-cpu data)
- *
- * For simplicity, we fix the size to 8192.
- * If insufficient, a warning will be triggered to alert the user.
+ * Each page is cast to a pfn_pool: the first few bytes hold metadata
+ * (next pointer and slot count), the remainder stores PFNs.
+ */
+struct pfn_pool {
+	struct pfn_pool *next;
+	atomic_t count;
+	unsigned long pfns[];
+};
+
+#define PFN_POOL_SIZE			((PAGE_SIZE - offsetof(struct pfn_pool, pfns)) / \
+					 sizeof(unsigned long))
+
+/*
+ * Skip early PFN recording for a page allocation.  Reuses the
+ * %__GFP_NO_OBJ_EXT bit.  Used by __alloc_tag_add_early_pfn() to avoid
+ * recursion when allocating pages for the early PFN tracking list
+ * itself.
  *
- * TODO: Replace fixed-size array with dynamic allocation using
- * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
+ * Codetags of the pages allocated with __GFP_NO_CODETAG should be
+ * cleared (via clear_page_tag_ref()) before freeing the pages to prevent
+ * alloc_tag_sub_check() from triggering a warning.
  */
-#define EARLY_ALLOC_PFN_MAX		8192
+#define __GFP_NO_CODETAG		__GFP_NO_OBJ_EXT
 
-static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
-static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
+static struct pfn_pool *current_pfn_pool __initdata;
 
-static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
+static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
 {
-	int old_idx, new_idx;
+	struct pfn_pool *pool;
+	int idx;
 
 	do {
-		old_idx = atomic_read(&early_pfn_count);
-		if (old_idx >= EARLY_ALLOC_PFN_MAX) {
-			pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
-				      EARLY_ALLOC_PFN_MAX);
-			return;
+		pool = READ_ONCE(current_pfn_pool);
+		if (!pool || atomic_read(&pool->count) >= PFN_POOL_SIZE) {
+			gfp_t gfp = gfp_flags & ~(__GFP_DIRECT_RECLAIM | GFP_ZONEMASK);
+			struct page *new_page = alloc_page(gfp | __GFP_NO_CODETAG);
+			struct pfn_pool *new;
+
+			if (!new_page) {
+				pr_warn_once("early PFN tracking page allocation failed\n");
+				return;
+			}
+			new = page_address(new_page);
+			new->next = pool;
+			atomic_set(&new->count, 0);
+			if (cmpxchg(&current_pfn_pool, pool, new) != pool) {
+				clear_page_tag_ref(new_page);
+				__free_page(new_page);
+				continue;
+			}
+			pool = new;
 		}
-		new_idx = old_idx + 1;
-	} while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
+		idx = atomic_read(&pool->count);
+		if (idx >= PFN_POOL_SIZE)
+			continue;
+		if (atomic_cmpxchg(&pool->count, idx, idx + 1) == idx)
+			break;
+	} while (1);
 
-	early_pfns[old_idx] = pfn;
+	pool->pfns[idx] = pfn;
 }
 
-typedef void alloc_tag_add_func(unsigned long pfn);
+typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
 static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
 	RCU_INITIALIZER(__alloc_tag_add_early_pfn);
 
-void alloc_tag_add_early_pfn(unsigned long pfn)
+void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
 {
 	alloc_tag_add_func *alloc_tag_add;
 
 	if (static_key_enabled(&mem_profiling_compressed))
 		return;
 
+	/* Skip allocations for the tracking list itself to avoid recursion. */
+	if (gfp_flags & __GFP_NO_CODETAG)
+		return;
+
 	rcu_read_lock();
 	alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
 	if (alloc_tag_add)
-		alloc_tag_add(pfn);
+		alloc_tag_add(pfn, gfp_flags);
 	rcu_read_unlock();
 }
 
 static void __init clear_early_alloc_pfn_tag_refs(void)
 {
-	unsigned int i;
+	struct pfn_pool *pool, *next;
+	struct page *page;
+	int i;
 
 	if (static_key_enabled(&mem_profiling_compressed))
 		return;
@@ -829,37 +864,45 @@ static void __init clear_early_alloc_pfn_tag_refs(void)
 	/* Make sure we are not racing with __alloc_tag_add_early_pfn() */
 	synchronize_rcu();
 
-	for (i = 0; i < atomic_read(&early_pfn_count); i++) {
-		unsigned long pfn = early_pfns[i];
-
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-			union pgtag_ref_handle handle;
-			union codetag_ref ref;
-
-			if (get_page_tag_ref(page, &ref, &handle)) {
-				/*
-				 * An early-allocated page could be freed and reallocated
-				 * after its page_ext is initialized but before we clear it.
-				 * In that case, it already has a valid tag set.
-				 * We should not overwrite that valid tag with CODETAG_EMPTY.
-				 *
-				 * Note: there is still a small race window between checking
-				 * ref.ct and calling set_codetag_empty(). We accept this
-				 * race as it's unlikely and the extra complexity of atomic
-				 * cmpxchg is not worth it for this debug-only code path.
-				 */
-				if (ref.ct) {
+	for (pool = current_pfn_pool; pool; pool = next) {
+		int nr_pfns = atomic_read(&pool->count);
+
+		for (i = 0; i < nr_pfns; i++) {
+			unsigned long pfn = pool->pfns[i];
+
+			if (pfn_valid(pfn)) {
+				union pgtag_ref_handle handle;
+				union codetag_ref ref;
+
+				if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
+					/*
+					 * An early-allocated page could be freed and reallocated
+					 * after its page_ext is initialized but before we clear it.
+					 * In that case, it already has a valid tag set.
+					 * We should not overwrite that valid tag
+					 * with CODETAG_EMPTY.
+					 *
+					 * Note: there is still a small race window between checking
+					 * ref.ct and calling set_codetag_empty(). We accept this
+					 * race as it's unlikely and the extra complexity of atomic
+					 * cmpxchg is not worth it for this debug-only code path.
+					 */
+					if (ref.ct) {
+						put_page_tag_ref(handle);
+						continue;
+					}
+
+					set_codetag_empty(&ref);
+					update_page_tag_ref(handle, &ref);
 					put_page_tag_ref(handle);
-					continue;
 				}
-
-				set_codetag_empty(&ref);
-				update_page_tag_ref(handle, &ref);
-				put_page_tag_ref(handle);
 			}
 		}
 
+		next = pool->next;
+		page = virt_to_page(pool);
+		clear_page_tag_ref(page);
+		__free_page(page);
 	}
 }
 #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 04494bc2e46f..819d44ffd470 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1284,7 +1284,7 @@ void __clear_page_tag_ref(struct page *page)
 /* Should be called only if mem_alloc_profiling_enabled() */
 static noinline
 void __pgalloc_tag_add(struct page *page, struct task_struct *task,
-		       unsigned int nr)
+		       unsigned int nr, gfp_t gfp_flags)
 {
 	union pgtag_ref_handle handle;
 	union codetag_ref ref;
@@ -1298,17 +1298,17 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
 		 * page_ext is not available yet, record the pfn so we can
 		 * clear the tag ref later when page_ext is initialized.
 		 */
-		alloc_tag_add_early_pfn(page_to_pfn(page));
+		alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
 		if (task->alloc_tag)
 			alloc_tag_set_inaccurate(task->alloc_tag);
 	}
 }
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
-				   unsigned int nr)
+				   unsigned int nr, gfp_t gfp_flags)
 {
 	if (mem_alloc_profiling_enabled())
-		__pgalloc_tag_add(page, task, nr);
+		__pgalloc_tag_add(page, task, nr, gfp_flags);
 }
 
 /* Should be called only if mem_alloc_profiling_enabled() */
@@ -1341,7 +1341,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
-				   unsigned int nr) {}
+				   unsigned int nr, gfp_t gfp_flags) {}
 static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
 static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
 
@@ -1896,7 +1896,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 
 	set_page_owner(page, order, gfp_flags);
 	page_table_check_alloc(page, order);
-	pgalloc_tag_add(page, current, 1 << order);
+	pgalloc_tag_add(page, current, 1 << order, gfp_flags);
 }
 
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
  2026-05-06  2:22 [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list Hao Ge
@ 2026-05-09  0:12 ` Andrew Morton
  2026-05-27  2:00   ` Andrew Morton
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2026-05-09  0:12 UTC (permalink / raw)
  To: Hao Ge; +Cc: Suren Baghdasaryan, Kent Overstreet, linux-mm, linux-kernel

On Wed,  6 May 2026 10:22:56 +0800 Hao Ge <hao.ge@linux.dev> wrote:

> Pages allocated before page_ext is available have their codetag left
> uninitialized. Track these early PFNs and clear their codetag in
> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> warnings when they are freed later.
> 
> Currently a fixed-size array of 8192 entries is used, with a warning if
> the limit is exceeded. However, the number of early allocations depends
> on the number of CPUs and can be larger than 8192.
> 
> Replace the fixed-size array with a dynamically allocated linked list
> of pfn_pool structs. Each node is allocated via alloc_page() and mapped
> to a pfn_pool containing a next pointer, an atomic slot counter, and a
> PFN array that fills the remainder of the page.
> 
> The tracking pages themselves are allocated via alloc_page(), which
> would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
> recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
> %__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
> so that the early path can skip recording allocations that carry this
> flag.

AI review asked a couple of things.  I have a feeling we saw at least
one of these, so probably already dealt with.
	https://sashiko.dev/#/patchset/20260506022256.32664-1-hao.ge@linux.dev


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
  2026-05-09  0:12 ` Andrew Morton
@ 2026-05-27  2:00   ` Andrew Morton
  2026-05-27  5:22     ` Hao Ge
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2026-05-27  2:00 UTC (permalink / raw)
  To: Hao Ge, Suren Baghdasaryan, Kent Overstreet, linux-mm,
	linux-kernel

On Fri, 8 May 2026 17:12:51 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:

> On Wed,  6 May 2026 10:22:56 +0800 Hao Ge <hao.ge@linux.dev> wrote:
> 
> > Pages allocated before page_ext is available have their codetag left
> > uninitialized. Track these early PFNs and clear their codetag in
> > clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> > warnings when they are freed later.
> > 
> > Currently a fixed-size array of 8192 entries is used, with a warning if
> > the limit is exceeded. However, the number of early allocations depends
> > on the number of CPUs and can be larger than 8192.
> > 
> > Replace the fixed-size array with a dynamically allocated linked list
> > of pfn_pool structs. Each node is allocated via alloc_page() and mapped
> > to a pfn_pool containing a next pointer, an atomic slot counter, and a
> > PFN array that fills the remainder of the page.
> > 
> > The tracking pages themselves are allocated via alloc_page(), which
> > would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
> > recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
> > %__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
> > so that the early path can skip recording allocations that carry this
> > flag.
> 
> AI review asked a couple of things.  I have a feeling we saw at least
> one of these, so probably already dealt with.
> 	https://sashiko.dev/#/patchset/20260506022256.32664-1-hao.ge@linux.dev

Please?

Also, this patch has no evidence of human review.


From: Hao Ge <hao.ge@linux.dev>
Subject: mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
Date: Wed, 6 May 2026 10:22:56 +0800

Pages allocated before page_ext is available have their codetag left
uninitialized.  Track these early PFNs and clear their codetag in
clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set" warnings
when they are freed later.

Currently a fixed-size array of 8192 entries is used, with a warning if
the limit is exceeded.  However, the number of early allocations depends
on the number of CPUs and can be larger than 8192.

Replace the fixed-size array with a dynamically allocated linked list of
pfn_pool structs.  Each node is allocated via alloc_page() and mapped to a
pfn_pool containing a next pointer, an atomic slot counter, and a PFN
array that fills the remainder of the page.

The tracking pages themselves are allocated via alloc_page(), which would
trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
indefinitely.  Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
can skip recording allocations that carry this flag.

Link: https://lore.kernel.org/20260506022256.32664-1-hao.ge@linux.dev
Signed-off-by: Hao Ge <hao.ge@linux.dev>
Suggested-by: Suren Baghdasaryan <surenb@google.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/alloc_tag.h |    4 
 lib/alloc_tag.c           |  145 +++++++++++++++++++++++-------------
 mm/page_alloc.c           |   12 +-
 3 files changed, 102 insertions(+), 59 deletions(-)

--- a/include/linux/alloc_tag.h~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
+++ a/include/linux/alloc_tag.h
@@ -163,11 +163,11 @@ static inline void alloc_tag_sub_check(u
 {
 	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
 }
-void alloc_tag_add_early_pfn(unsigned long pfn);
+void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
 #else
 static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
 static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
-static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
+static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
 #endif
 
 /* Caller should verify both ref and tag to be valid */
--- a/lib/alloc_tag.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
+++ a/lib/alloc_tag.c
@@ -767,60 +767,95 @@ static __init bool need_page_alloc_taggi
  * their codetag uninitialized. Track these early PFNs so we can clear
  * their codetag refs later to avoid warnings when they are freed.
  *
- * Early allocations include:
- *   - Base allocations independent of CPU count
- *   - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
- *     such as trace ring buffers, scheduler per-cpu data)
- *
- * For simplicity, we fix the size to 8192.
- * If insufficient, a warning will be triggered to alert the user.
+ * Each page is cast to a pfn_pool: the first few bytes hold metadata
+ * (next pointer and slot count), the remainder stores PFNs.
+ */
+struct pfn_pool {
+	struct pfn_pool *next;
+	atomic_t count;
+	unsigned long pfns[];
+};
+
+#define PFN_POOL_SIZE			((PAGE_SIZE - offsetof(struct pfn_pool, pfns)) / \
+					 sizeof(unsigned long))
+
+/*
+ * Skip early PFN recording for a page allocation.  Reuses the
+ * %__GFP_NO_OBJ_EXT bit.  Used by __alloc_tag_add_early_pfn() to avoid
+ * recursion when allocating pages for the early PFN tracking list
+ * itself.
  *
- * TODO: Replace fixed-size array with dynamic allocation using
- * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
+ * Codetags of the pages allocated with __GFP_NO_CODETAG should be
+ * cleared (via clear_page_tag_ref()) before freeing the pages to prevent
+ * alloc_tag_sub_check() from triggering a warning.
  */
-#define EARLY_ALLOC_PFN_MAX		8192
+#define __GFP_NO_CODETAG		__GFP_NO_OBJ_EXT
 
-static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
-static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
+static struct pfn_pool *current_pfn_pool __initdata;
 
-static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
+static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
 {
-	int old_idx, new_idx;
+	struct pfn_pool *pool;
+	int idx;
 
 	do {
-		old_idx = atomic_read(&early_pfn_count);
-		if (old_idx >= EARLY_ALLOC_PFN_MAX) {
-			pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
-				      EARLY_ALLOC_PFN_MAX);
-			return;
+		pool = READ_ONCE(current_pfn_pool);
+		if (!pool || atomic_read(&pool->count) >= PFN_POOL_SIZE) {
+			gfp_t gfp = gfp_flags & ~(__GFP_DIRECT_RECLAIM | GFP_ZONEMASK);
+			struct page *new_page = alloc_page(gfp | __GFP_NO_CODETAG);
+			struct pfn_pool *new;
+
+			if (!new_page) {
+				pr_warn_once("early PFN tracking page allocation failed\n");
+				return;
+			}
+			new = page_address(new_page);
+			new->next = pool;
+			atomic_set(&new->count, 0);
+			if (cmpxchg(&current_pfn_pool, pool, new) != pool) {
+				clear_page_tag_ref(new_page);
+				__free_page(new_page);
+				continue;
+			}
+			pool = new;
 		}
-		new_idx = old_idx + 1;
-	} while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
+		idx = atomic_read(&pool->count);
+		if (idx >= PFN_POOL_SIZE)
+			continue;
+		if (atomic_cmpxchg(&pool->count, idx, idx + 1) == idx)
+			break;
+	} while (1);
 
-	early_pfns[old_idx] = pfn;
+	pool->pfns[idx] = pfn;
 }
 
-typedef void alloc_tag_add_func(unsigned long pfn);
+typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
 static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
 	RCU_INITIALIZER(__alloc_tag_add_early_pfn);
 
-void alloc_tag_add_early_pfn(unsigned long pfn)
+void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
 {
 	alloc_tag_add_func *alloc_tag_add;
 
 	if (static_key_enabled(&mem_profiling_compressed))
 		return;
 
+	/* Skip allocations for the tracking list itself to avoid recursion. */
+	if (gfp_flags & __GFP_NO_CODETAG)
+		return;
+
 	rcu_read_lock();
 	alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
 	if (alloc_tag_add)
-		alloc_tag_add(pfn);
+		alloc_tag_add(pfn, gfp_flags);
 	rcu_read_unlock();
 }
 
 static void __init clear_early_alloc_pfn_tag_refs(void)
 {
-	unsigned int i;
+	struct pfn_pool *pool, *next;
+	struct page *page;
+	int i;
 
 	if (static_key_enabled(&mem_profiling_compressed))
 		return;
@@ -829,37 +864,45 @@ static void __init clear_early_alloc_pfn
 	/* Make sure we are not racing with __alloc_tag_add_early_pfn() */
 	synchronize_rcu();
 
-	for (i = 0; i < atomic_read(&early_pfn_count); i++) {
-		unsigned long pfn = early_pfns[i];
+	for (pool = current_pfn_pool; pool; pool = next) {
+		int nr_pfns = atomic_read(&pool->count);
+
+		for (i = 0; i < nr_pfns; i++) {
+			unsigned long pfn = pool->pfns[i];
 
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-			union pgtag_ref_handle handle;
-			union codetag_ref ref;
-
-			if (get_page_tag_ref(page, &ref, &handle)) {
-				/*
-				 * An early-allocated page could be freed and reallocated
-				 * after its page_ext is initialized but before we clear it.
-				 * In that case, it already has a valid tag set.
-				 * We should not overwrite that valid tag with CODETAG_EMPTY.
-				 *
-				 * Note: there is still a small race window between checking
-				 * ref.ct and calling set_codetag_empty(). We accept this
-				 * race as it's unlikely and the extra complexity of atomic
-				 * cmpxchg is not worth it for this debug-only code path.
-				 */
-				if (ref.ct) {
+			if (pfn_valid(pfn)) {
+				union pgtag_ref_handle handle;
+				union codetag_ref ref;
+
+				if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
+					/*
+					 * An early-allocated page could be freed and reallocated
+					 * after its page_ext is initialized but before we clear it.
+					 * In that case, it already has a valid tag set.
+					 * We should not overwrite that valid tag
+					 * with CODETAG_EMPTY.
+					 *
+					 * Note: there is still a small race window between checking
+					 * ref.ct and calling set_codetag_empty(). We accept this
+					 * race as it's unlikely and the extra complexity of atomic
+					 * cmpxchg is not worth it for this debug-only code path.
+					 */
+					if (ref.ct) {
+						put_page_tag_ref(handle);
+						continue;
+					}
+
+					set_codetag_empty(&ref);
+					update_page_tag_ref(handle, &ref);
 					put_page_tag_ref(handle);
-					continue;
 				}
-
-				set_codetag_empty(&ref);
-				update_page_tag_ref(handle, &ref);
-				put_page_tag_ref(handle);
 			}
 		}
 
+		next = pool->next;
+		page = virt_to_page(pool);
+		clear_page_tag_ref(page);
+		__free_page(page);
 	}
 }
 #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
--- a/mm/page_alloc.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
+++ a/mm/page_alloc.c
@@ -1255,7 +1255,7 @@ void __clear_page_tag_ref(struct page *p
 /* Should be called only if mem_alloc_profiling_enabled() */
 static noinline
 void __pgalloc_tag_add(struct page *page, struct task_struct *task,
-		       unsigned int nr)
+		       unsigned int nr, gfp_t gfp_flags)
 {
 	union pgtag_ref_handle handle;
 	union codetag_ref ref;
@@ -1269,17 +1269,17 @@ void __pgalloc_tag_add(struct page *page
 		 * page_ext is not available yet, record the pfn so we can
 		 * clear the tag ref later when page_ext is initialized.
 		 */
-		alloc_tag_add_early_pfn(page_to_pfn(page));
+		alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
 		if (task->alloc_tag)
 			alloc_tag_set_inaccurate(task->alloc_tag);
 	}
 }
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
-				   unsigned int nr)
+				   unsigned int nr, gfp_t gfp_flags)
 {
 	if (mem_alloc_profiling_enabled())
-		__pgalloc_tag_add(page, task, nr);
+		__pgalloc_tag_add(page, task, nr, gfp_flags);
 }
 
 /* Should be called only if mem_alloc_profiling_enabled() */
@@ -1312,7 +1312,7 @@ static inline void pgalloc_tag_sub_pages
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
-				   unsigned int nr) {}
+				   unsigned int nr, gfp_t gfp_flags) {}
 static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
 static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
 
@@ -1867,7 +1867,7 @@ inline void post_alloc_hook(struct page
 
 	set_page_owner(page, order, gfp_flags);
 	page_table_check_alloc(page, order);
-	pgalloc_tag_add(page, current, 1 << order);
+	pgalloc_tag_add(page, current, 1 << order, gfp_flags);
 }
 
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
_



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
  2026-05-27  2:00   ` Andrew Morton
@ 2026-05-27  5:22     ` Hao Ge
  2026-06-02 23:40       ` Suren Baghdasaryan
  2026-06-03 16:54       ` Suren Baghdasaryan
  0 siblings, 2 replies; 9+ messages in thread
From: Hao Ge @ 2026-05-27  5:22 UTC (permalink / raw)
  To: Andrew Morton, Suren Baghdasaryan; +Cc: linux-mm, linux-kernel, Kent Overstreet


On 2026/5/27 10:00, Andrew Morton wrote:
> On Fri, 8 May 2026 17:12:51 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:
>
>> On Wed,  6 May 2026 10:22:56 +0800 Hao Ge <hao.ge@linux.dev> wrote:
>>
>>> Pages allocated before page_ext is available have their codetag left
>>> uninitialized. Track these early PFNs and clear their codetag in
>>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
>>> warnings when they are freed later.
>>>
>>> Currently a fixed-size array of 8192 entries is used, with a warning if
>>> the limit is exceeded. However, the number of early allocations depends
>>> on the number of CPUs and can be larger than 8192.
>>>
>>> Replace the fixed-size array with a dynamically allocated linked list
>>> of pfn_pool structs. Each node is allocated via alloc_page() and mapped
>>> to a pfn_pool containing a next pointer, an atomic slot counter, and a
>>> PFN array that fills the remainder of the page.
>>>
>>> The tracking pages themselves are allocated via alloc_page(), which
>>> would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
>>> recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
>>> %__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
>>> so that the early path can skip recording allocations that carry this
>>> flag.
>> AI review asked a couple of things.  I have a feeling we saw at least
>> one of these, so probably already dealt with.
>> 	https://sashiko.dev/#/patchset/20260506022256.32664-1-hao.ge@linux.dev

Hi Andrew

My apologies. I'm also waiting for Suren's review. He may have been tied
up lately and might not have had time to get to this.


Sashiko raised two issues this time. I've already responded to the first 
one.

See the link below:

https://lore.kernel.org/all/0b9969e2-b208-46c2-a9a5-bf620239275a@linux.dev/

If I haven't missed any details, it should be a false positive.


As for the second point, let me address it.

The early PFN tracking window is entirely within mm_core_init(),

which is called from start_kernel():

start_kernel()
     mm_core_init()
         memblock_free_all();
         mem_init()             // start early PFN tracking
         kmem_cache_init()      // SLUB bootstrap + kmalloc caches
         ...
         page_ext_init()        // clears alloc_tag_add_early_pfn_ptr
     ...
     rest_init()                // spawns kernel_init thread

kernel_init() → kernel_init_freeable()    // separate thread, later
     smp_init()                 // secondary CPUs come online here

Within the early PFN window (mem_init() to page_ext_init()):

1. We are still in start_kernel(), on a single CPU. The buddy allocator
was just initialized from memblock and should have plenty of free
pages, so alloc_page() would likely be satisfied from the fast
path. If so, the check for __GFP_NOFAIL without __GFP_DIRECT_RECLAIM
in the slowpath would not be reached.

2. Since only the boot CPU is running, alloc_page() targets the
boot node, which has memory. So even if __GFP_THISNODE were
inherited, the allocation would not fail on the boot node during
this window.


So Sashiko's analysis applies to the general case, and indeed the issues
he raised could occur there.

However, in the early boot scenario, I believe the current patch is safe,
even though it is not fully generic (after all, no one can predict
future use cases).

Therefore, I agree with his suggestion to use a clean mask such as
GFP_NOWAIT | __GFP_NOWARN.


In any case, I will wait for your and Suren's feedback. You may have 
different opinions on this matter.


Thanks

Best Regards

Hao


> Please?
>
> Also, this patch has no evidence of human review.
>
>
> From: Hao Ge <hao.ge@linux.dev>
> Subject: mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
> Date: Wed, 6 May 2026 10:22:56 +0800
>
> Pages allocated before page_ext is available have their codetag left
> uninitialized.  Track these early PFNs and clear their codetag in
> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set" warnings
> when they are freed later.
>
> Currently a fixed-size array of 8192 entries is used, with a warning if
> the limit is exceeded.  However, the number of early allocations depends
> on the number of CPUs and can be larger than 8192.
>
> Replace the fixed-size array with a dynamically allocated linked list of
> pfn_pool structs.  Each node is allocated via alloc_page() and mapped to a
> pfn_pool containing a next pointer, an atomic slot counter, and a PFN
> array that fills the remainder of the page.
>
> The tracking pages themselves are allocated via alloc_page(), which would
> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
> indefinitely.  Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
> bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
> can skip recording allocations that carry this flag.
>
> Link: https://lore.kernel.org/20260506022256.32664-1-hao.ge@linux.dev
> Signed-off-by: Hao Ge <hao.ge@linux.dev>
> Suggested-by: Suren Baghdasaryan <surenb@google.com>
> Cc: Brendan Jackman <jackmanb@google.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Kent Overstreet <kent.overstreet@linux.dev>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Vlastimil Babka <vbabka@kernel.org>
> Cc: Zi Yan <ziy@nvidia.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>
>   include/linux/alloc_tag.h |    4
>   lib/alloc_tag.c           |  145 +++++++++++++++++++++++-------------
>   mm/page_alloc.c           |   12 +-
>   3 files changed, 102 insertions(+), 59 deletions(-)
>
> --- a/include/linux/alloc_tag.h~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> +++ a/include/linux/alloc_tag.h
> @@ -163,11 +163,11 @@ static inline void alloc_tag_sub_check(u
>   {
>   	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
>   }
> -void alloc_tag_add_early_pfn(unsigned long pfn);
> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
>   #else
>   static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
>   static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
>   #endif
>   
>   /* Caller should verify both ref and tag to be valid */
> --- a/lib/alloc_tag.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> +++ a/lib/alloc_tag.c
> @@ -767,60 +767,95 @@ static __init bool need_page_alloc_taggi
>    * their codetag uninitialized. Track these early PFNs so we can clear
>    * their codetag refs later to avoid warnings when they are freed.
>    *
> - * Early allocations include:
> - *   - Base allocations independent of CPU count
> - *   - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
> - *     such as trace ring buffers, scheduler per-cpu data)
> - *
> - * For simplicity, we fix the size to 8192.
> - * If insufficient, a warning will be triggered to alert the user.
> + * Each page is cast to a pfn_pool: the first few bytes hold metadata
> + * (next pointer and slot count), the remainder stores PFNs.
> + */
> +struct pfn_pool {
> +	struct pfn_pool *next;
> +	atomic_t count;
> +	unsigned long pfns[];
> +};
> +
> +#define PFN_POOL_SIZE			((PAGE_SIZE - offsetof(struct pfn_pool, pfns)) / \
> +					 sizeof(unsigned long))
> +
> +/*
> + * Skip early PFN recording for a page allocation.  Reuses the
> + * %__GFP_NO_OBJ_EXT bit.  Used by __alloc_tag_add_early_pfn() to avoid
> + * recursion when allocating pages for the early PFN tracking list
> + * itself.
>    *
> - * TODO: Replace fixed-size array with dynamic allocation using
> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
> + * Codetags of the pages allocated with __GFP_NO_CODETAG should be
> + * cleared (via clear_page_tag_ref()) before freeing the pages to prevent
> + * alloc_tag_sub_check() from triggering a warning.
>    */
> -#define EARLY_ALLOC_PFN_MAX		8192
> +#define __GFP_NO_CODETAG		__GFP_NO_OBJ_EXT
>   
> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
> +static struct pfn_pool *current_pfn_pool __initdata;
>   
> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>   {
> -	int old_idx, new_idx;
> +	struct pfn_pool *pool;
> +	int idx;
>   
>   	do {
> -		old_idx = atomic_read(&early_pfn_count);
> -		if (old_idx >= EARLY_ALLOC_PFN_MAX) {
> -			pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
> -				      EARLY_ALLOC_PFN_MAX);
> -			return;
> +		pool = READ_ONCE(current_pfn_pool);
> +		if (!pool || atomic_read(&pool->count) >= PFN_POOL_SIZE) {
> +			gfp_t gfp = gfp_flags & ~(__GFP_DIRECT_RECLAIM | GFP_ZONEMASK);
> +			struct page *new_page = alloc_page(gfp | __GFP_NO_CODETAG);
> +			struct pfn_pool *new;
> +
> +			if (!new_page) {
> +				pr_warn_once("early PFN tracking page allocation failed\n");
> +				return;
> +			}
> +			new = page_address(new_page);
> +			new->next = pool;
> +			atomic_set(&new->count, 0);
> +			if (cmpxchg(&current_pfn_pool, pool, new) != pool) {
> +				clear_page_tag_ref(new_page);
> +				__free_page(new_page);
> +				continue;
> +			}
> +			pool = new;
>   		}
> -		new_idx = old_idx + 1;
> -	} while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
> +		idx = atomic_read(&pool->count);
> +		if (idx >= PFN_POOL_SIZE)
> +			continue;
> +		if (atomic_cmpxchg(&pool->count, idx, idx + 1) == idx)
> +			break;
> +	} while (1);
>   
> -	early_pfns[old_idx] = pfn;
> +	pool->pfns[idx] = pfn;
>   }
>   
> -typedef void alloc_tag_add_func(unsigned long pfn);
> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
>   static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
>   	RCU_INITIALIZER(__alloc_tag_add_early_pfn);
>   
> -void alloc_tag_add_early_pfn(unsigned long pfn)
> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>   {
>   	alloc_tag_add_func *alloc_tag_add;
>   
>   	if (static_key_enabled(&mem_profiling_compressed))
>   		return;
>   
> +	/* Skip allocations for the tracking list itself to avoid recursion. */
> +	if (gfp_flags & __GFP_NO_CODETAG)
> +		return;
> +
>   	rcu_read_lock();
>   	alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
>   	if (alloc_tag_add)
> -		alloc_tag_add(pfn);
> +		alloc_tag_add(pfn, gfp_flags);
>   	rcu_read_unlock();
>   }
>   
>   static void __init clear_early_alloc_pfn_tag_refs(void)
>   {
> -	unsigned int i;
> +	struct pfn_pool *pool, *next;
> +	struct page *page;
> +	int i;
>   
>   	if (static_key_enabled(&mem_profiling_compressed))
>   		return;
> @@ -829,37 +864,45 @@ static void __init clear_early_alloc_pfn
>   	/* Make sure we are not racing with __alloc_tag_add_early_pfn() */
>   	synchronize_rcu();
>   
> -	for (i = 0; i < atomic_read(&early_pfn_count); i++) {
> -		unsigned long pfn = early_pfns[i];
> +	for (pool = current_pfn_pool; pool; pool = next) {
> +		int nr_pfns = atomic_read(&pool->count);
> +
> +		for (i = 0; i < nr_pfns; i++) {
> +			unsigned long pfn = pool->pfns[i];
>   
> -		if (pfn_valid(pfn)) {
> -			struct page *page = pfn_to_page(pfn);
> -			union pgtag_ref_handle handle;
> -			union codetag_ref ref;
> -
> -			if (get_page_tag_ref(page, &ref, &handle)) {
> -				/*
> -				 * An early-allocated page could be freed and reallocated
> -				 * after its page_ext is initialized but before we clear it.
> -				 * In that case, it already has a valid tag set.
> -				 * We should not overwrite that valid tag with CODETAG_EMPTY.
> -				 *
> -				 * Note: there is still a small race window between checking
> -				 * ref.ct and calling set_codetag_empty(). We accept this
> -				 * race as it's unlikely and the extra complexity of atomic
> -				 * cmpxchg is not worth it for this debug-only code path.
> -				 */
> -				if (ref.ct) {
> +			if (pfn_valid(pfn)) {
> +				union pgtag_ref_handle handle;
> +				union codetag_ref ref;
> +
> +				if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
> +					/*
> +					 * An early-allocated page could be freed and reallocated
> +					 * after its page_ext is initialized but before we clear it.
> +					 * In that case, it already has a valid tag set.
> +					 * We should not overwrite that valid tag
> +					 * with CODETAG_EMPTY.
> +					 *
> +					 * Note: there is still a small race window between checking
> +					 * ref.ct and calling set_codetag_empty(). We accept this
> +					 * race as it's unlikely and the extra complexity of atomic
> +					 * cmpxchg is not worth it for this debug-only code path.
> +					 */
> +					if (ref.ct) {
> +						put_page_tag_ref(handle);
> +						continue;
> +					}
> +
> +					set_codetag_empty(&ref);
> +					update_page_tag_ref(handle, &ref);
>   					put_page_tag_ref(handle);
> -					continue;
>   				}
> -
> -				set_codetag_empty(&ref);
> -				update_page_tag_ref(handle, &ref);
> -				put_page_tag_ref(handle);
>   			}
>   		}
>   
> +		next = pool->next;
> +		page = virt_to_page(pool);
> +		clear_page_tag_ref(page);
> +		__free_page(page);
>   	}
>   }
>   #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
> --- a/mm/page_alloc.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> +++ a/mm/page_alloc.c
> @@ -1255,7 +1255,7 @@ void __clear_page_tag_ref(struct page *p
>   /* Should be called only if mem_alloc_profiling_enabled() */
>   static noinline
>   void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> -		       unsigned int nr)
> +		       unsigned int nr, gfp_t gfp_flags)
>   {
>   	union pgtag_ref_handle handle;
>   	union codetag_ref ref;
> @@ -1269,17 +1269,17 @@ void __pgalloc_tag_add(struct page *page
>   		 * page_ext is not available yet, record the pfn so we can
>   		 * clear the tag ref later when page_ext is initialized.
>   		 */
> -		alloc_tag_add_early_pfn(page_to_pfn(page));
> +		alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
>   		if (task->alloc_tag)
>   			alloc_tag_set_inaccurate(task->alloc_tag);
>   	}
>   }
>   
>   static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> -				   unsigned int nr)
> +				   unsigned int nr, gfp_t gfp_flags)
>   {
>   	if (mem_alloc_profiling_enabled())
> -		__pgalloc_tag_add(page, task, nr);
> +		__pgalloc_tag_add(page, task, nr, gfp_flags);
>   }
>   
>   /* Should be called only if mem_alloc_profiling_enabled() */
> @@ -1312,7 +1312,7 @@ static inline void pgalloc_tag_sub_pages
>   #else /* CONFIG_MEM_ALLOC_PROFILING */
>   
>   static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> -				   unsigned int nr) {}
> +				   unsigned int nr, gfp_t gfp_flags) {}
>   static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
>   static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
>   
> @@ -1867,7 +1867,7 @@ inline void post_alloc_hook(struct page
>   
>   	set_page_owner(page, order, gfp_flags);
>   	page_table_check_alloc(page, order);
> -	pgalloc_tag_add(page, current, 1 << order);
> +	pgalloc_tag_add(page, current, 1 << order, gfp_flags);
>   }
>   
>   static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
> _
>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
  2026-05-27  5:22     ` Hao Ge
@ 2026-06-02 23:40       ` Suren Baghdasaryan
  2026-06-03 16:54       ` Suren Baghdasaryan
  1 sibling, 0 replies; 9+ messages in thread
From: Suren Baghdasaryan @ 2026-06-02 23:40 UTC (permalink / raw)
  To: Hao Ge; +Cc: Andrew Morton, linux-mm, linux-kernel, Kent Overstreet

On Tue, May 26, 2026 at 10:22 PM Hao Ge <hao.ge@linux.dev> wrote:
>
>
> On 2026/5/27 10:00, Andrew Morton wrote:
> > On Fri, 8 May 2026 17:12:51 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> >> On Wed,  6 May 2026 10:22:56 +0800 Hao Ge <hao.ge@linux.dev> wrote:
> >>
> >>> Pages allocated before page_ext is available have their codetag left
> >>> uninitialized. Track these early PFNs and clear their codetag in
> >>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> >>> warnings when they are freed later.
> >>>
> >>> Currently a fixed-size array of 8192 entries is used, with a warning if
> >>> the limit is exceeded. However, the number of early allocations depends
> >>> on the number of CPUs and can be larger than 8192.
> >>>
> >>> Replace the fixed-size array with a dynamically allocated linked list
> >>> of pfn_pool structs. Each node is allocated via alloc_page() and mapped
> >>> to a pfn_pool containing a next pointer, an atomic slot counter, and a
> >>> PFN array that fills the remainder of the page.
> >>>
> >>> The tracking pages themselves are allocated via alloc_page(), which
> >>> would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
> >>> recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
> >>> %__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
> >>> so that the early path can skip recording allocations that carry this
> >>> flag.
> >> AI review asked a couple of things.  I have a feeling we saw at least
> >> one of these, so probably already dealt with.
> >>      https://sashiko.dev/#/patchset/20260506022256.32664-1-hao.ge@linux.dev
>
> Hi Andrew
>
> My apologies. I'm also waiting for Suren's review. He may have been tied
> up lately
>
> and might not have time to get to this.

Sorry folks, I was on vacation and will be fully back to work
tomorrow. I'll start on these reviews first thing once I'm at my
station.


>
>
> Sashiko raised two issues this time. I've already responded to the first
> one.
>
> See the link below:
>
> https://lore.kernel.org/all/0b9969e2-b208-46c2-a9a5-bf620239275a@linux.dev/
>
> If I haven't missed any details, it should be a false positive.
>
>
> As for the second point, let me address it.
>
> The early PFN tracking window is entirely within mm_core_init(),
>
> which is called from start_kernel():
>
> start_kernel()
>
>      mm_core_init()
>
>          memblock_free_all();
>
>          mem_init() //start early PFN tracking
>
>          kmem_cache_init()                           // SLUB bootstrap +
> kmalloc caches
> ...
>          page_ext_init()                                   // clears
> alloc_tag_add_early_pfn_ptr
>
>      ...
>
>      rest_init() //spawns kernel_init thread
>
>
> kernel_init() → kernel_init_freeable()            // separate thread, later
>
>      smp_init()                                    // secondary CPUs
> come online here
>
> Within the early PFN window (mem_init() to page_ext_init()):
>
>   1. We are still in start_kernel(), single CPU. The buddy allocator
>
> was just initialized from memblock and should have plenty of free
>
> pages, so alloc_page() would likely be satisfied from the fast
>
> path. If so, the __GFP_NOFAIL without __GFP_DIRECT_RECLAIM
>
> check in the slowpath would not be reached.
>
> 2. Since only the boot CPU is running, alloc_page() targets the
>
> boot node, which has memory. So even if __GFP_THISNODE were
>
> inherited, it would not fail on the boot node during this window.
>
>
> So Sashiko's analysis applies to the general case, and indeed the issues
>
> he raised could occur there.
>
> However, in the early boot scenario, I believe the current patch is safe,
>
> even though it is not fully generic (after all, no one can predict
> future use cases).
>
> Therefore, I agree with his suggestion that using a clean mask like
> GFP_NOWAIT | __GFP_NOWARN.
>
>
> In any case, I will wait for your and Suren's feedback. You may have
> different opinions on this matter.
>
>
> Thanks
>
> Best Regards
>
> Hao
>
>
> > Please?
> >
> > Also, this patch has no evidence of human review.
> >
> >
> > From: Hao Ge <hao.ge@linux.dev>
> > Subject: mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
> > Date: Wed, 6 May 2026 10:22:56 +0800
> >
> > Pages allocated before page_ext is available have their codetag left
> > uninitialized.  Track these early PFNs and clear their codetag in
> > clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set" warnings
> > when they are freed later.
> >
> > Currently a fixed-size array of 8192 entries is used, with a warning if
> > the limit is exceeded.  However, the number of early allocations depends
> > on the number of CPUs and can be larger than 8192.
> >
> > Replace the fixed-size array with a dynamically allocated linked list of
> > pfn_pool structs.  Each node is allocated via alloc_page() and mapped to a
> > pfn_pool containing a next pointer, an atomic slot counter, and a PFN
> > array that fills the remainder of the page.
> >
> > The tracking pages themselves are allocated via alloc_page(), which would
> > trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
> > indefinitely.  Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
> > bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
> > can skip recording allocations that carry this flag.
> >
> > Link: https://lore.kernel.org/20260506022256.32664-1-hao.ge@linux.dev
> > Signed-off-by: Hao Ge <hao.ge@linux.dev>
> > Suggested-by: Suren Baghdasaryan <surenb@google.com>
> > Cc: Brendan Jackman <jackmanb@google.com>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Kent Overstreet <kent.overstreet@linux.dev>
> > Cc: Michal Hocko <mhocko@suse.com>
> > Cc: Vlastimil Babka <vbabka@kernel.org>
> > Cc: Zi Yan <ziy@nvidia.com>
> > Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> > ---
> >
> >   include/linux/alloc_tag.h |    4
> >   lib/alloc_tag.c           |  145 +++++++++++++++++++++++-------------
> >   mm/page_alloc.c           |   12 +-
> >   3 files changed, 102 insertions(+), 59 deletions(-)
> >
> > --- a/include/linux/alloc_tag.h~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> > +++ a/include/linux/alloc_tag.h
> > @@ -163,11 +163,11 @@ static inline void alloc_tag_sub_check(u
> >   {
> >       WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> >   }
> > -void alloc_tag_add_early_pfn(unsigned long pfn);
> > +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
> >   #else
> >   static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
> >   static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
> > -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
> > +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
> >   #endif
> >
> >   /* Caller should verify both ref and tag to be valid */
> > --- a/lib/alloc_tag.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> > +++ a/lib/alloc_tag.c
> > @@ -767,60 +767,95 @@ static __init bool need_page_alloc_taggi
> >    * their codetag uninitialized. Track these early PFNs so we can clear
> >    * their codetag refs later to avoid warnings when they are freed.
> >    *
> > - * Early allocations include:
> > - *   - Base allocations independent of CPU count
> > - *   - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
> > - *     such as trace ring buffers, scheduler per-cpu data)
> > - *
> > - * For simplicity, we fix the size to 8192.
> > - * If insufficient, a warning will be triggered to alert the user.
> > + * Each page is cast to a pfn_pool: the first few bytes hold metadata
> > + * (next pointer and slot count), the remainder stores PFNs.
> > + */
> > +struct pfn_pool {
> > +     struct pfn_pool *next;
> > +     atomic_t count;
> > +     unsigned long pfns[];
> > +};
> > +
> > +#define PFN_POOL_SIZE                        ((PAGE_SIZE - offsetof(struct pfn_pool, pfns)) / \
> > +                                      sizeof(unsigned long))
> > +
> > +/*
> > + * Skip early PFN recording for a page allocation.  Reuses the
> > + * %__GFP_NO_OBJ_EXT bit.  Used by __alloc_tag_add_early_pfn() to avoid
> > + * recursion when allocating pages for the early PFN tracking list
> > + * itself.
> >    *
> > - * TODO: Replace fixed-size array with dynamic allocation using
> > - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
> > + * Codetags of the pages allocated with __GFP_NO_CODETAG should be
> > + * cleared (via clear_page_tag_ref()) before freeing the pages to prevent
> > + * alloc_tag_sub_check() from triggering a warning.
> >    */
> > -#define EARLY_ALLOC_PFN_MAX          8192
> > +#define __GFP_NO_CODETAG             __GFP_NO_OBJ_EXT
> >
> > -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
> > -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
> > +static struct pfn_pool *current_pfn_pool __initdata;
> >
> > -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
> > +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >   {
> > -     int old_idx, new_idx;
> > +     struct pfn_pool *pool;
> > +     int idx;
> >
> >       do {
> > -             old_idx = atomic_read(&early_pfn_count);
> > -             if (old_idx >= EARLY_ALLOC_PFN_MAX) {
> > -                     pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
> > -                                   EARLY_ALLOC_PFN_MAX);
> > -                     return;
> > +             pool = READ_ONCE(current_pfn_pool);
> > +             if (!pool || atomic_read(&pool->count) >= PFN_POOL_SIZE) {
> > +                     gfp_t gfp = gfp_flags & ~(__GFP_DIRECT_RECLAIM | GFP_ZONEMASK);
> > +                     struct page *new_page = alloc_page(gfp | __GFP_NO_CODETAG);
> > +                     struct pfn_pool *new;
> > +
> > +                     if (!new_page) {
> > +                             pr_warn_once("early PFN tracking page allocation failed\n");
> > +                             return;
> > +                     }
> > +                     new = page_address(new_page);
> > +                     new->next = pool;
> > +                     atomic_set(&new->count, 0);
> > +                     if (cmpxchg(&current_pfn_pool, pool, new) != pool) {
> > +                             clear_page_tag_ref(new_page);
> > +                             __free_page(new_page);
> > +                             continue;
> > +                     }
> > +                     pool = new;
> >               }
> > -             new_idx = old_idx + 1;
> > -     } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
> > +             idx = atomic_read(&pool->count);
> > +             if (idx >= PFN_POOL_SIZE)
> > +                     continue;
> > +             if (atomic_cmpxchg(&pool->count, idx, idx + 1) == idx)
> > +                     break;
> > +     } while (1);
> >
> > -     early_pfns[old_idx] = pfn;
> > +     pool->pfns[idx] = pfn;
> >   }
> >
> > -typedef void alloc_tag_add_func(unsigned long pfn);
> > +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
> >   static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
> >       RCU_INITIALIZER(__alloc_tag_add_early_pfn);
> >
> > -void alloc_tag_add_early_pfn(unsigned long pfn)
> > +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >   {
> >       alloc_tag_add_func *alloc_tag_add;
> >
> >       if (static_key_enabled(&mem_profiling_compressed))
> >               return;
> >
> > +     /* Skip allocations for the tracking list itself to avoid recursion. */
> > +     if (gfp_flags & __GFP_NO_CODETAG)
> > +             return;
> > +
> >       rcu_read_lock();
> >       alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
> >       if (alloc_tag_add)
> > -             alloc_tag_add(pfn);
> > +             alloc_tag_add(pfn, gfp_flags);
> >       rcu_read_unlock();
> >   }
> >
> >   static void __init clear_early_alloc_pfn_tag_refs(void)
> >   {
> > -     unsigned int i;
> > +     struct pfn_pool *pool, *next;
> > +     struct page *page;
> > +     int i;
> >
> >       if (static_key_enabled(&mem_profiling_compressed))
> >               return;
> > @@ -829,37 +864,45 @@ static void __init clear_early_alloc_pfn
> >       /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
> >       synchronize_rcu();
> >
> > -     for (i = 0; i < atomic_read(&early_pfn_count); i++) {
> > -             unsigned long pfn = early_pfns[i];
> > +     for (pool = current_pfn_pool; pool; pool = next) {
> > +             int nr_pfns = atomic_read(&pool->count);
> > +
> > +             for (i = 0; i < nr_pfns; i++) {
> > +                     unsigned long pfn = pool->pfns[i];
> >
> > -             if (pfn_valid(pfn)) {
> > -                     struct page *page = pfn_to_page(pfn);
> > -                     union pgtag_ref_handle handle;
> > -                     union codetag_ref ref;
> > -
> > -                     if (get_page_tag_ref(page, &ref, &handle)) {
> > -                             /*
> > -                              * An early-allocated page could be freed and reallocated
> > -                              * after its page_ext is initialized but before we clear it.
> > -                              * In that case, it already has a valid tag set.
> > -                              * We should not overwrite that valid tag with CODETAG_EMPTY.
> > -                              *
> > -                              * Note: there is still a small race window between checking
> > -                              * ref.ct and calling set_codetag_empty(). We accept this
> > -                              * race as it's unlikely and the extra complexity of atomic
> > -                              * cmpxchg is not worth it for this debug-only code path.
> > -                              */
> > -                             if (ref.ct) {
> > +                     if (pfn_valid(pfn)) {
> > +                             union pgtag_ref_handle handle;
> > +                             union codetag_ref ref;
> > +
> > +                             if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
> > +                                     /*
> > +                                      * An early-allocated page could be freed and reallocated
> > +                                      * after its page_ext is initialized but before we clear it.
> > +                                      * In that case, it already has a valid tag set.
> > +                                      * We should not overwrite that valid tag
> > +                                      * with CODETAG_EMPTY.
> > +                                      *
> > +                                      * Note: there is still a small race window between checking
> > +                                      * ref.ct and calling set_codetag_empty(). We accept this
> > +                                      * race as it's unlikely and the extra complexity of atomic
> > +                                      * cmpxchg is not worth it for this debug-only code path.
> > +                                      */
> > +                                     if (ref.ct) {
> > +                                             put_page_tag_ref(handle);
> > +                                             continue;
> > +                                     }
> > +
> > +                                     set_codetag_empty(&ref);
> > +                                     update_page_tag_ref(handle, &ref);
> >                                       put_page_tag_ref(handle);
> > -                                     continue;
> >                               }
> > -
> > -                             set_codetag_empty(&ref);
> > -                             update_page_tag_ref(handle, &ref);
> > -                             put_page_tag_ref(handle);
> >                       }
> >               }
> >
> > +             next = pool->next;
> > +             page = virt_to_page(pool);
> > +             clear_page_tag_ref(page);
> > +             __free_page(page);
> >       }
> >   }
> >   #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
> > --- a/mm/page_alloc.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> > +++ a/mm/page_alloc.c
> > @@ -1255,7 +1255,7 @@ void __clear_page_tag_ref(struct page *p
> >   /* Should be called only if mem_alloc_profiling_enabled() */
> >   static noinline
> >   void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> > -                    unsigned int nr)
> > +                    unsigned int nr, gfp_t gfp_flags)
> >   {
> >       union pgtag_ref_handle handle;
> >       union codetag_ref ref;
> > @@ -1269,17 +1269,17 @@ void __pgalloc_tag_add(struct page *page
> >                * page_ext is not available yet, record the pfn so we can
> >                * clear the tag ref later when page_ext is initialized.
> >                */
> > -             alloc_tag_add_early_pfn(page_to_pfn(page));
> > +             alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
> >               if (task->alloc_tag)
> >                       alloc_tag_set_inaccurate(task->alloc_tag);
> >       }
> >   }
> >
> >   static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> > -                                unsigned int nr)
> > +                                unsigned int nr, gfp_t gfp_flags)
> >   {
> >       if (mem_alloc_profiling_enabled())
> > -             __pgalloc_tag_add(page, task, nr);
> > +             __pgalloc_tag_add(page, task, nr, gfp_flags);
> >   }
> >
> >   /* Should be called only if mem_alloc_profiling_enabled() */
> > @@ -1312,7 +1312,7 @@ static inline void pgalloc_tag_sub_pages
> >   #else /* CONFIG_MEM_ALLOC_PROFILING */
> >
> >   static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> > -                                unsigned int nr) {}
> > +                                unsigned int nr, gfp_t gfp_flags) {}
> >   static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
> >   static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
> >
> > @@ -1867,7 +1867,7 @@ inline void post_alloc_hook(struct page
> >
> >       set_page_owner(page, order, gfp_flags);
> >       page_table_check_alloc(page, order);
> > -     pgalloc_tag_add(page, current, 1 << order);
> > +     pgalloc_tag_add(page, current, 1 << order, gfp_flags);
> >   }
> >
> >   static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
> > _
> >

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
  2026-05-27  5:22     ` Hao Ge
  2026-06-02 23:40       ` Suren Baghdasaryan
@ 2026-06-03 16:54       ` Suren Baghdasaryan
  2026-06-04  2:46         ` Hao Ge
  1 sibling, 1 reply; 9+ messages in thread
From: Suren Baghdasaryan @ 2026-06-03 16:54 UTC (permalink / raw)
  To: Hao Ge
  Cc: Andrew Morton, linux-mm, linux-kernel, Kent Overstreet,
	Roman Gushchin

On Tue, May 26, 2026 at 10:22 PM Hao Ge <hao.ge@linux.dev> wrote:
>
>
> On 2026/5/27 10:00, Andrew Morton wrote:
> > On Fri, 8 May 2026 17:12:51 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> >> On Wed,  6 May 2026 10:22:56 +0800 Hao Ge <hao.ge@linux.dev> wrote:
> >>
> >>> Pages allocated before page_ext is available have their codetag left
> >>> uninitialized. Track these early PFNs and clear their codetag in
> >>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> >>> warnings when they are freed later.
> >>>
> >>> Currently a fixed-size array of 8192 entries is used, with a warning if
> >>> the limit is exceeded. However, the number of early allocations depends
> >>> on the number of CPUs and can be larger than 8192.
> >>>
> >>> Replace the fixed-size array with a dynamically allocated linked list
> >>> of pfn_pool structs. Each node is allocated via alloc_page() and mapped
> >>> to a pfn_pool containing a next pointer, an atomic slot counter, and a
> >>> PFN array that fills the remainder of the page.
> >>>
> >>> The tracking pages themselves are allocated via alloc_page(), which
> >>> would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
> >>> recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
> >>> %__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
> >>> so that the early path can skip recording allocations that carry this
> >>> flag.
> >> AI review asked a couple of things.  I have a feeling we saw at least
> >> one of these, so probably already dealt with.
> >>      https://sashiko.dev/#/patchset/20260506022256.32664-1-hao.ge@linux.dev
>
> Hi Andrew
>
> My apologies. I'm also waiting for Suren's review. He may have been tied
> up lately
>
> and might not have time to get to this.
>
>
> Sashiko raised two issues this time. I've already responded to the first
> one.
>
> See the link below:
>
> https://lore.kernel.org/all/0b9969e2-b208-46c2-a9a5-bf620239275a@linux.dev/
>
> If I haven't missed any details, it should be a false positive.

That seems to be the case. I wonder why Sashiko did not consider
that... CC'ing Roman to see if Sashiko can be improved (unless we both
are missing something).

>
>
> As for the second point, let me address it.
>
> The early PFN tracking window is entirely within mm_core_init(),
>
> which is called from start_kernel():
>
> start_kernel()
>
>      mm_core_init()
>
>          memblock_free_all();
>
>          mem_init() //start early PFN tracking
>
>          kmem_cache_init()                           // SLUB bootstrap +
> kmalloc caches
> ...
>          page_ext_init()                                   // clears
> alloc_tag_add_early_pfn_ptr
>
>      ...
>
>      rest_init() //spawns kernel_init thread
>
>
> kernel_init() → kernel_init_freeable()            // separate thread, later
>
>      smp_init()                                    // secondary CPUs
> come online here
>
> Within the early PFN window (mem_init() to page_ext_init()):
>
>   1. We are still in start_kernel(), single CPU. The buddy allocator
>
> was just initialized from memblock and should have plenty of free
>
> pages, so alloc_page() would likely be satisfied from the fast
>
> path. If so, the __GFP_NOFAIL without __GFP_DIRECT_RECLAIM
>
> check in the slowpath would not be reached.
>
> 2. Since only the boot CPU is running, alloc_page() targets the
>
> boot node, which has memory. So even if __GFP_THISNODE were
>
> inherited, it would not fail on the boot node during this window.
>
>
> So Sashiko's analysis applies to the general case, and indeed the issues
>
> he raised could occur there.
>
> However, in the early boot scenario, I believe the current patch is safe,
>
> even though it is not fully generic (after all, no one can predict
> future use cases).
>
> Therefore, I agree with his suggestion that using a clean mask like
> GFP_NOWAIT | __GFP_NOWARN.

This sounds good to me. With that change feel free to add:

Acked-by: Suren Baghdasaryan <surenb@google.com>

>
>
> In any case, I will wait for your and Suren's feedback. You may have
> different opinions on this matter.
>
>
> Thanks
>
> Best Regards
>
> Hao
>
>
> > Please?
> >
> > Also, this patch has no evidence of human review.
> >
> >
> > From: Hao Ge <hao.ge@linux.dev>
> > Subject: mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
> > Date: Wed, 6 May 2026 10:22:56 +0800
> >
> > Pages allocated before page_ext is available have their codetag left
> > uninitialized.  Track these early PFNs and clear their codetag in
> > clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set" warnings
> > when they are freed later.
> >
> > Currently a fixed-size array of 8192 entries is used, with a warning if
> > the limit is exceeded.  However, the number of early allocations depends
> > on the number of CPUs and can be larger than 8192.
> >
> > Replace the fixed-size array with a dynamically allocated linked list of
> > pfn_pool structs.  Each node is allocated via alloc_page() and mapped to a
> > pfn_pool containing a next pointer, an atomic slot counter, and a PFN
> > array that fills the remainder of the page.
> >
> > The tracking pages themselves are allocated via alloc_page(), which would
> > trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
> > indefinitely.  Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
> > bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
> > can skip recording allocations that carry this flag.
> >
> > Link: https://lore.kernel.org/20260506022256.32664-1-hao.ge@linux.dev
> > Signed-off-by: Hao Ge <hao.ge@linux.dev>
> > Suggested-by: Suren Baghdasaryan <surenb@google.com>
> > Cc: Brendan Jackman <jackmanb@google.com>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Kent Overstreet <kent.overstreet@linux.dev>
> > Cc: Michal Hocko <mhocko@suse.com>
> > Cc: Vlastimil Babka <vbabka@kernel.org>
> > Cc: Zi Yan <ziy@nvidia.com>
> > Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> > ---
> >
> >   include/linux/alloc_tag.h |    4
> >   lib/alloc_tag.c           |  145 +++++++++++++++++++++++-------------
> >   mm/page_alloc.c           |   12 +-
> >   3 files changed, 102 insertions(+), 59 deletions(-)
> >
> > --- a/include/linux/alloc_tag.h~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> > +++ a/include/linux/alloc_tag.h
> > @@ -163,11 +163,11 @@ static inline void alloc_tag_sub_check(u
> >   {
> >       WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> >   }
> > -void alloc_tag_add_early_pfn(unsigned long pfn);
> > +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
> >   #else
> >   static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
> >   static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
> > -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
> > +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
> >   #endif
> >
> >   /* Caller should verify both ref and tag to be valid */
> > --- a/lib/alloc_tag.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> > +++ a/lib/alloc_tag.c
> > @@ -767,60 +767,95 @@ static __init bool need_page_alloc_taggi
> >    * their codetag uninitialized. Track these early PFNs so we can clear
> >    * their codetag refs later to avoid warnings when they are freed.
> >    *
> > - * Early allocations include:
> > - *   - Base allocations independent of CPU count
> > - *   - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
> > - *     such as trace ring buffers, scheduler per-cpu data)
> > - *
> > - * For simplicity, we fix the size to 8192.
> > - * If insufficient, a warning will be triggered to alert the user.
> > + * Each page is cast to a pfn_pool: the first few bytes hold metadata
> > + * (next pointer and slot count), the remainder stores PFNs.
> > + */
> > +struct pfn_pool {
> > +     struct pfn_pool *next;
> > +     atomic_t count;
> > +     unsigned long pfns[];
> > +};
> > +
> > +#define PFN_POOL_SIZE                        ((PAGE_SIZE - offsetof(struct pfn_pool, pfns)) / \
> > +                                      sizeof(unsigned long))
> > +
> > +/*
> > + * Skip early PFN recording for a page allocation.  Reuses the
> > + * %__GFP_NO_OBJ_EXT bit.  Used by __alloc_tag_add_early_pfn() to avoid
> > + * recursion when allocating pages for the early PFN tracking list
> > + * itself.
> >    *
> > - * TODO: Replace fixed-size array with dynamic allocation using
> > - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
> > + * Codetags of the pages allocated with __GFP_NO_CODETAG should be
> > + * cleared (via clear_page_tag_ref()) before freeing the pages to prevent
> > + * alloc_tag_sub_check() from triggering a warning.
> >    */
> > -#define EARLY_ALLOC_PFN_MAX          8192
> > +#define __GFP_NO_CODETAG             __GFP_NO_OBJ_EXT
> >
> > -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
> > -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
> > +static struct pfn_pool *current_pfn_pool __initdata;
> >
> > -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
> > +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >   {
> > -     int old_idx, new_idx;
> > +     struct pfn_pool *pool;
> > +     int idx;
> >
> >       do {
> > -             old_idx = atomic_read(&early_pfn_count);
> > -             if (old_idx >= EARLY_ALLOC_PFN_MAX) {
> > -                     pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
> > -                                   EARLY_ALLOC_PFN_MAX);
> > -                     return;
> > +             pool = READ_ONCE(current_pfn_pool);
> > +             if (!pool || atomic_read(&pool->count) >= PFN_POOL_SIZE) {
> > +                     gfp_t gfp = gfp_flags & ~(__GFP_DIRECT_RECLAIM | GFP_ZONEMASK);
> > +                     struct page *new_page = alloc_page(gfp | __GFP_NO_CODETAG);
> > +                     struct pfn_pool *new;
> > +
> > +                     if (!new_page) {
> > +                             pr_warn_once("early PFN tracking page allocation failed\n");
> > +                             return;
> > +                     }
> > +                     new = page_address(new_page);
> > +                     new->next = pool;
> > +                     atomic_set(&new->count, 0);
> > +                     if (cmpxchg(&current_pfn_pool, pool, new) != pool) {
> > +                             clear_page_tag_ref(new_page);
> > +                             __free_page(new_page);
> > +                             continue;
> > +                     }
> > +                     pool = new;
> >               }
> > -             new_idx = old_idx + 1;
> > -     } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
> > +             idx = atomic_read(&pool->count);
> > +             if (idx >= PFN_POOL_SIZE)
> > +                     continue;
> > +             if (atomic_cmpxchg(&pool->count, idx, idx + 1) == idx)
> > +                     break;
> > +     } while (1);
> >
> > -     early_pfns[old_idx] = pfn;
> > +     pool->pfns[idx] = pfn;
> >   }
> >
> > -typedef void alloc_tag_add_func(unsigned long pfn);
> > +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
> >   static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
> >       RCU_INITIALIZER(__alloc_tag_add_early_pfn);
> >
> > -void alloc_tag_add_early_pfn(unsigned long pfn)
> > +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >   {
> >       alloc_tag_add_func *alloc_tag_add;
> >
> >       if (static_key_enabled(&mem_profiling_compressed))
> >               return;
> >
> > +     /* Skip allocations for the tracking list itself to avoid recursion. */
> > +     if (gfp_flags & __GFP_NO_CODETAG)
> > +             return;
> > +
> >       rcu_read_lock();
> >       alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
> >       if (alloc_tag_add)
> > -             alloc_tag_add(pfn);
> > +             alloc_tag_add(pfn, gfp_flags);
> >       rcu_read_unlock();
> >   }
> >
> >   static void __init clear_early_alloc_pfn_tag_refs(void)
> >   {
> > -     unsigned int i;
> > +     struct pfn_pool *pool, *next;
> > +     struct page *page;
> > +     int i;
> >
> >       if (static_key_enabled(&mem_profiling_compressed))
> >               return;
> > @@ -829,37 +864,45 @@ static void __init clear_early_alloc_pfn
> >       /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
> >       synchronize_rcu();
> >
> > -     for (i = 0; i < atomic_read(&early_pfn_count); i++) {
> > -             unsigned long pfn = early_pfns[i];
> > +     for (pool = current_pfn_pool; pool; pool = next) {
> > +             int nr_pfns = atomic_read(&pool->count);
> > +
> > +             for (i = 0; i < nr_pfns; i++) {
> > +                     unsigned long pfn = pool->pfns[i];
> >
> > -             if (pfn_valid(pfn)) {
> > -                     struct page *page = pfn_to_page(pfn);
> > -                     union pgtag_ref_handle handle;
> > -                     union codetag_ref ref;
> > -
> > -                     if (get_page_tag_ref(page, &ref, &handle)) {
> > -                             /*
> > -                              * An early-allocated page could be freed and reallocated
> > -                              * after its page_ext is initialized but before we clear it.
> > -                              * In that case, it already has a valid tag set.
> > -                              * We should not overwrite that valid tag with CODETAG_EMPTY.
> > -                              *
> > -                              * Note: there is still a small race window between checking
> > -                              * ref.ct and calling set_codetag_empty(). We accept this
> > -                              * race as it's unlikely and the extra complexity of atomic
> > -                              * cmpxchg is not worth it for this debug-only code path.
> > -                              */
> > -                             if (ref.ct) {
> > +                     if (pfn_valid(pfn)) {
> > +                             union pgtag_ref_handle handle;
> > +                             union codetag_ref ref;
> > +
> > +                             if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
> > +                                     /*
> > +                                      * An early-allocated page could be freed and reallocated
> > +                                      * after its page_ext is initialized but before we clear it.
> > +                                      * In that case, it already has a valid tag set.
> > +                                      * We should not overwrite that valid tag
> > +                                      * with CODETAG_EMPTY.
> > +                                      *
> > +                                      * Note: there is still a small race window between checking
> > +                                      * ref.ct and calling set_codetag_empty(). We accept this
> > +                                      * race as it's unlikely and the extra complexity of atomic
> > +                                      * cmpxchg is not worth it for this debug-only code path.
> > +                                      */
> > +                                     if (ref.ct) {
> > +                                             put_page_tag_ref(handle);
> > +                                             continue;
> > +                                     }
> > +
> > +                                     set_codetag_empty(&ref);
> > +                                     update_page_tag_ref(handle, &ref);
> >                                       put_page_tag_ref(handle);
> > -                                     continue;
> >                               }
> > -
> > -                             set_codetag_empty(&ref);
> > -                             update_page_tag_ref(handle, &ref);
> > -                             put_page_tag_ref(handle);
> >                       }
> >               }
> >
> > +             next = pool->next;
> > +             page = virt_to_page(pool);
> > +             clear_page_tag_ref(page);
> > +             __free_page(page);
> >       }
> >   }
> >   #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
> > --- a/mm/page_alloc.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> > +++ a/mm/page_alloc.c
> > @@ -1255,7 +1255,7 @@ void __clear_page_tag_ref(struct page *p
> >   /* Should be called only if mem_alloc_profiling_enabled() */
> >   static noinline
> >   void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> > -                    unsigned int nr)
> > +                    unsigned int nr, gfp_t gfp_flags)
> >   {
> >       union pgtag_ref_handle handle;
> >       union codetag_ref ref;
> > @@ -1269,17 +1269,17 @@ void __pgalloc_tag_add(struct page *page
> >                * page_ext is not available yet, record the pfn so we can
> >                * clear the tag ref later when page_ext is initialized.
> >                */
> > -             alloc_tag_add_early_pfn(page_to_pfn(page));
> > +             alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
> >               if (task->alloc_tag)
> >                       alloc_tag_set_inaccurate(task->alloc_tag);
> >       }
> >   }
> >
> >   static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> > -                                unsigned int nr)
> > +                                unsigned int nr, gfp_t gfp_flags)
> >   {
> >       if (mem_alloc_profiling_enabled())
> > -             __pgalloc_tag_add(page, task, nr);
> > +             __pgalloc_tag_add(page, task, nr, gfp_flags);
> >   }
> >
> >   /* Should be called only if mem_alloc_profiling_enabled() */
> > @@ -1312,7 +1312,7 @@ static inline void pgalloc_tag_sub_pages
> >   #else /* CONFIG_MEM_ALLOC_PROFILING */
> >
> >   static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> > -                                unsigned int nr) {}
> > +                                unsigned int nr, gfp_t gfp_flags) {}
> >   static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
> >   static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
> >
> > @@ -1867,7 +1867,7 @@ inline void post_alloc_hook(struct page
> >
> >       set_page_owner(page, order, gfp_flags);
> >       page_table_check_alloc(page, order);
> > -     pgalloc_tag_add(page, current, 1 << order);
> > +     pgalloc_tag_add(page, current, 1 << order, gfp_flags);
> >   }
> >
> >   static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
> > _
> >


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
  2026-06-03 16:54       ` Suren Baghdasaryan
@ 2026-06-04  2:46         ` Hao Ge
  2026-06-04  5:29           ` Hao Ge
  2026-06-04 23:52           ` Suren Baghdasaryan
  0 siblings, 2 replies; 9+ messages in thread
From: Hao Ge @ 2026-06-04  2:46 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Andrew Morton, linux-mm, linux-kernel, Kent Overstreet,
	Roman Gushchin


On 2026/6/4 00:54, Suren Baghdasaryan wrote:
> On Tue, May 26, 2026 at 10:22 PM Hao Ge <hao.ge@linux.dev> wrote:
>>
>> On 2026/5/27 10:00, Andrew Morton wrote:
>>> On Fri, 8 May 2026 17:12:51 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:
>>>
>>>> On Wed,  6 May 2026 10:22:56 +0800 Hao Ge <hao.ge@linux.dev> wrote:
>>>>
>>>>> Pages allocated before page_ext is available have their codetag left
>>>>> uninitialized. Track these early PFNs and clear their codetag in
>>>>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
>>>>> warnings when they are freed later.
>>>>>
>>>>> Currently a fixed-size array of 8192 entries is used, with a warning if
>>>>> the limit is exceeded. However, the number of early allocations depends
>>>>> on the number of CPUs and can be larger than 8192.
>>>>>
>>>>> Replace the fixed-size array with a dynamically allocated linked list
>>>>> of pfn_pool structs. Each node is allocated via alloc_page() and mapped
>>>>> to a pfn_pool containing a next pointer, an atomic slot counter, and a
>>>>> PFN array that fills the remainder of the page.
>>>>>
>>>>> The tracking pages themselves are allocated via alloc_page(), which
>>>>> would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
>>>>> recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
>>>>> %__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
>>>>> so that the early path can skip recording allocations that carry this
>>>>> flag.
>>>> AI review asked a couple of things.  I have a feeling we saw at least
>>>> one of these, so probably already dealt with.
>>>>       https://sashiko.dev/#/patchset/20260506022256.32664-1-hao.ge@linux.dev
>> Hi Andrew
>>
>> My apologies. I'm also waiting for Suren's review. He may have been tied
>> up lately
>>
>> and might not have time to get to this.
>>
>>
>> Sashiko raised two issues this time. I've already responded to the first
>> one.
>>
>> See the link below:
>>
>> https://lore.kernel.org/all/0b9969e2-b208-46c2-a9a5-bf620239275a@linux.dev/
>>
>> If I haven't missed any details, it should be a false positive.
> That seems to be the case. I wonder why Sashiko did not consider
> that... CC'ing Roman to see if Sashiko can be improved (unless we both
> are missing something).
>
>>
>> As for the second point, let me address it.
>>
>> The early PFN tracking window is entirely within mm_core_init(),
>>
>> which is called from start_kernel():
>>
>> start_kernel()
>>
>>       mm_core_init()
>>
>>           memblock_free_all();
>>
>>           mem_init() //start early PFN tracking
>>
>>           kmem_cache_init()                           // SLUB bootstrap +
>> kmalloc caches
>> ...
>>           page_ext_init()                                   // clears
>> alloc_tag_add_early_pfn_ptr
>>
>>       ...
>>
>>       rest_init() //spawns kernel_init thread
>>
>>
>> kernel_init() → kernel_init_freeable()            // separate thread, later
>>
>>       smp_init()                                    // secondary CPUs
>> come online here
>>
>> Within the early PFN window (mem_init() to page_ext_init()):
>>
>>    1. We are still in start_kernel(), single CPU. The buddy allocator
>>
>> was just initialized from memblock and should have plenty of free
>>
>> pages, so alloc_page() would likely be satisfied from the fast
>>
>> path. If so, the __GFP_NOFAIL without __GFP_DIRECT_RECLAIM
>>
>> check in the slowpath would not be reached.
>>
>> 2. Since only the boot CPU is running, alloc_page() targets the
>>
>> boot node, which has memory. So even if __GFP_THISNODE were
>>
>> inherited, it would not fail on the boot node during this window.
>>
>>
>> So Sashiko's analysis applies to the general case, and indeed the issues
>>
>> he raised could occur there.
>>
>> However, in the early boot scenario, I believe the current patch is safe,
>>
>> even though it is not fully generic (after all, no one can predict
>> future use cases).
>>
>> Therefore, I agree with his suggestion that using a clean mask like
>> GFP_NOWAIT | __GFP_NOWARN.

Hi Suren

I've been thinking about the GFP flags issue for the past few days.
There are actually a couple of issues with the suggestion of using
GFP_NOWAIT | __GFP_NOWARN.

First, GFP_NOWAIT already includes __GFP_NOWARN, so specifying
__GFP_NOWARN explicitly is redundant.

Second, GFP_NOWAIT also includes __GFP_KSWAPD_RECLAIM, which raises
exactly the same issue Sashiko flagged previously with GFP_ATOMIC: it can
still trigger wakeup_kswapd() and acquire scheduler locks, leading to a
potential deadlock in the same scenario described there.

So I think __GFP_HIGH | __GFP_NO_CODETAG is the right choice.

Since this runs under rcu_read_lock(), we can't have __GFP_DIRECT_RECLAIM.
And since Sashiko pointed out the scheduler lock concern with
__GFP_KSWAPD_RECLAIM, we can't have that either.

I have posted the v6 revision; could you please review it at your
convenience?

https://lore.kernel.org/all/20260604024008.46592-1-hao.ge@linux.dev/

Thanks

Best Regards

Hao

> This sounds good to me. With that change feel free to add:
>
> Acked-by: Suren Baghdasaryan <surenb@google.com>
>
>>
>> In any case, I will wait for your and Suren's feedback. You may have
>> different opinions on this matter.
>>
>>
>> Thanks
>>
>> Best Regards
>>
>> Hao
>>
>>
>>> Please?
>>>
>>> Also, this patch has no evidence of human review.
>>>
>>>
>>> From: Hao Ge <hao.ge@linux.dev>
>>> Subject: mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
>>> Date: Wed, 6 May 2026 10:22:56 +0800
>>>
>>> Pages allocated before page_ext is available have their codetag left
>>> uninitialized.  Track these early PFNs and clear their codetag in
>>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set" warnings
>>> when they are freed later.
>>>
>>> Currently a fixed-size array of 8192 entries is used, with a warning if
>>> the limit is exceeded.  However, the number of early allocations depends
>>> on the number of CPUs and can be larger than 8192.
>>>
>>> Replace the fixed-size array with a dynamically allocated linked list of
>>> pfn_pool structs.  Each node is allocated via alloc_page() and mapped to a
>>> pfn_pool containing a next pointer, an atomic slot counter, and a PFN
>>> array that fills the remainder of the page.
>>>
>>> The tracking pages themselves are allocated via alloc_page(), which would
>>> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
>>> indefinitely.  Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
>>> bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
>>> can skip recording allocations that carry this flag.
>>>
>>> Link: https://lore.kernel.org/20260506022256.32664-1-hao.ge@linux.dev
>>> Signed-off-by: Hao Ge <hao.ge@linux.dev>
>>> Suggested-by: Suren Baghdasaryan <surenb@google.com>
>>> Cc: Brendan Jackman <jackmanb@google.com>
>>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>>> Cc: Kent Overstreet <kent.overstreet@linux.dev>
>>> Cc: Michal Hocko <mhocko@suse.com>
>>> Cc: Vlastimil Babka <vbabka@kernel.org>
>>> Cc: Zi Yan <ziy@nvidia.com>
>>> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
>>> ---
>>>
>>>    include/linux/alloc_tag.h |    4
>>>    lib/alloc_tag.c           |  145 +++++++++++++++++++++++-------------
>>>    mm/page_alloc.c           |   12 +-
>>>    3 files changed, 102 insertions(+), 59 deletions(-)
>>>
>>> --- a/include/linux/alloc_tag.h~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
>>> +++ a/include/linux/alloc_tag.h
>>> @@ -163,11 +163,11 @@ static inline void alloc_tag_sub_check(u
>>>    {
>>>        WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
>>>    }
>>> -void alloc_tag_add_early_pfn(unsigned long pfn);
>>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
>>>    #else
>>>    static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
>>>    static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
>>> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
>>> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
>>>    #endif
>>>
>>>    /* Caller should verify both ref and tag to be valid */
>>> --- a/lib/alloc_tag.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
>>> +++ a/lib/alloc_tag.c
>>> @@ -767,60 +767,95 @@ static __init bool need_page_alloc_taggi
>>>     * their codetag uninitialized. Track these early PFNs so we can clear
>>>     * their codetag refs later to avoid warnings when they are freed.
>>>     *
>>> - * Early allocations include:
>>> - *   - Base allocations independent of CPU count
>>> - *   - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
>>> - *     such as trace ring buffers, scheduler per-cpu data)
>>> - *
>>> - * For simplicity, we fix the size to 8192.
>>> - * If insufficient, a warning will be triggered to alert the user.
>>> + * Each page is cast to a pfn_pool: the first few bytes hold metadata
>>> + * (next pointer and slot count), the remainder stores PFNs.
>>> + */
>>> +struct pfn_pool {
>>> +     struct pfn_pool *next;
>>> +     atomic_t count;
>>> +     unsigned long pfns[];
>>> +};
>>> +
>>> +#define PFN_POOL_SIZE                        ((PAGE_SIZE - offsetof(struct pfn_pool, pfns)) / \
>>> +                                      sizeof(unsigned long))
>>> +
>>> +/*
>>> + * Skip early PFN recording for a page allocation.  Reuses the
>>> + * %__GFP_NO_OBJ_EXT bit.  Used by __alloc_tag_add_early_pfn() to avoid
>>> + * recursion when allocating pages for the early PFN tracking list
>>> + * itself.
>>>     *
>>> - * TODO: Replace fixed-size array with dynamic allocation using
>>> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
>>> + * Codetags of the pages allocated with __GFP_NO_CODETAG should be
>>> + * cleared (via clear_page_tag_ref()) before freeing the pages to prevent
>>> + * alloc_tag_sub_check() from triggering a warning.
>>>     */
>>> -#define EARLY_ALLOC_PFN_MAX          8192
>>> +#define __GFP_NO_CODETAG             __GFP_NO_OBJ_EXT
>>>
>>> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
>>> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
>>> +static struct pfn_pool *current_pfn_pool __initdata;
>>>
>>> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
>>> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>>>    {
>>> -     int old_idx, new_idx;
>>> +     struct pfn_pool *pool;
>>> +     int idx;
>>>
>>>        do {
>>> -             old_idx = atomic_read(&early_pfn_count);
>>> -             if (old_idx >= EARLY_ALLOC_PFN_MAX) {
>>> -                     pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
>>> -                                   EARLY_ALLOC_PFN_MAX);
>>> -                     return;
>>> +             pool = READ_ONCE(current_pfn_pool);
>>> +             if (!pool || atomic_read(&pool->count) >= PFN_POOL_SIZE) {
>>> +                     gfp_t gfp = gfp_flags & ~(__GFP_DIRECT_RECLAIM | GFP_ZONEMASK);
>>> +                     struct page *new_page = alloc_page(gfp | __GFP_NO_CODETAG);
>>> +                     struct pfn_pool *new;
>>> +
>>> +                     if (!new_page) {
>>> +                             pr_warn_once("early PFN tracking page allocation failed\n");
>>> +                             return;
>>> +                     }
>>> +                     new = page_address(new_page);
>>> +                     new->next = pool;
>>> +                     atomic_set(&new->count, 0);
>>> +                     if (cmpxchg(&current_pfn_pool, pool, new) != pool) {
>>> +                             clear_page_tag_ref(new_page);
>>> +                             __free_page(new_page);
>>> +                             continue;
>>> +                     }
>>> +                     pool = new;
>>>                }
>>> -             new_idx = old_idx + 1;
>>> -     } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
>>> +             idx = atomic_read(&pool->count);
>>> +             if (idx >= PFN_POOL_SIZE)
>>> +                     continue;
>>> +             if (atomic_cmpxchg(&pool->count, idx, idx + 1) == idx)
>>> +                     break;
>>> +     } while (1);
>>>
>>> -     early_pfns[old_idx] = pfn;
>>> +     pool->pfns[idx] = pfn;
>>>    }
>>>
>>> -typedef void alloc_tag_add_func(unsigned long pfn);
>>> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
>>>    static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
>>>        RCU_INITIALIZER(__alloc_tag_add_early_pfn);
>>>
>>> -void alloc_tag_add_early_pfn(unsigned long pfn)
>>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>>>    {
>>>        alloc_tag_add_func *alloc_tag_add;
>>>
>>>        if (static_key_enabled(&mem_profiling_compressed))
>>>                return;
>>>
>>> +     /* Skip allocations for the tracking list itself to avoid recursion. */
>>> +     if (gfp_flags & __GFP_NO_CODETAG)
>>> +             return;
>>> +
>>>        rcu_read_lock();
>>>        alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
>>>        if (alloc_tag_add)
>>> -             alloc_tag_add(pfn);
>>> +             alloc_tag_add(pfn, gfp_flags);
>>>        rcu_read_unlock();
>>>    }
>>>
>>>    static void __init clear_early_alloc_pfn_tag_refs(void)
>>>    {
>>> -     unsigned int i;
>>> +     struct pfn_pool *pool, *next;
>>> +     struct page *page;
>>> +     int i;
>>>
>>>        if (static_key_enabled(&mem_profiling_compressed))
>>>                return;
>>> @@ -829,37 +864,45 @@ static void __init clear_early_alloc_pfn
>>>        /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
>>>        synchronize_rcu();
>>>
>>> -     for (i = 0; i < atomic_read(&early_pfn_count); i++) {
>>> -             unsigned long pfn = early_pfns[i];
>>> +     for (pool = current_pfn_pool; pool; pool = next) {
>>> +             int nr_pfns = atomic_read(&pool->count);
>>> +
>>> +             for (i = 0; i < nr_pfns; i++) {
>>> +                     unsigned long pfn = pool->pfns[i];
>>>
>>> -             if (pfn_valid(pfn)) {
>>> -                     struct page *page = pfn_to_page(pfn);
>>> -                     union pgtag_ref_handle handle;
>>> -                     union codetag_ref ref;
>>> -
>>> -                     if (get_page_tag_ref(page, &ref, &handle)) {
>>> -                             /*
>>> -                              * An early-allocated page could be freed and reallocated
>>> -                              * after its page_ext is initialized but before we clear it.
>>> -                              * In that case, it already has a valid tag set.
>>> -                              * We should not overwrite that valid tag with CODETAG_EMPTY.
>>> -                              *
>>> -                              * Note: there is still a small race window between checking
>>> -                              * ref.ct and calling set_codetag_empty(). We accept this
>>> -                              * race as it's unlikely and the extra complexity of atomic
>>> -                              * cmpxchg is not worth it for this debug-only code path.
>>> -                              */
>>> -                             if (ref.ct) {
>>> +                     if (pfn_valid(pfn)) {
>>> +                             union pgtag_ref_handle handle;
>>> +                             union codetag_ref ref;
>>> +
>>> +                             if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
>>> +                                     /*
>>> +                                      * An early-allocated page could be freed and reallocated
>>> +                                      * after its page_ext is initialized but before we clear it.
>>> +                                      * In that case, it already has a valid tag set.
>>> +                                      * We should not overwrite that valid tag
>>> +                                      * with CODETAG_EMPTY.
>>> +                                      *
>>> +                                      * Note: there is still a small race window between checking
>>> +                                      * ref.ct and calling set_codetag_empty(). We accept this
>>> +                                      * race as it's unlikely and the extra complexity of atomic
>>> +                                      * cmpxchg is not worth it for this debug-only code path.
>>> +                                      */
>>> +                                     if (ref.ct) {
>>> +                                             put_page_tag_ref(handle);
>>> +                                             continue;
>>> +                                     }
>>> +
>>> +                                     set_codetag_empty(&ref);
>>> +                                     update_page_tag_ref(handle, &ref);
>>>                                        put_page_tag_ref(handle);
>>> -                                     continue;
>>>                                }
>>> -
>>> -                             set_codetag_empty(&ref);
>>> -                             update_page_tag_ref(handle, &ref);
>>> -                             put_page_tag_ref(handle);
>>>                        }
>>>                }
>>>
>>> +             next = pool->next;
>>> +             page = virt_to_page(pool);
>>> +             clear_page_tag_ref(page);
>>> +             __free_page(page);
>>>        }
>>>    }
>>>    #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
>>> --- a/mm/page_alloc.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
>>> +++ a/mm/page_alloc.c
>>> @@ -1255,7 +1255,7 @@ void __clear_page_tag_ref(struct page *p
>>>    /* Should be called only if mem_alloc_profiling_enabled() */
>>>    static noinline
>>>    void __pgalloc_tag_add(struct page *page, struct task_struct *task,
>>> -                    unsigned int nr)
>>> +                    unsigned int nr, gfp_t gfp_flags)
>>>    {
>>>        union pgtag_ref_handle handle;
>>>        union codetag_ref ref;
>>> @@ -1269,17 +1269,17 @@ void __pgalloc_tag_add(struct page *page
>>>                 * page_ext is not available yet, record the pfn so we can
>>>                 * clear the tag ref later when page_ext is initialized.
>>>                 */
>>> -             alloc_tag_add_early_pfn(page_to_pfn(page));
>>> +             alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
>>>                if (task->alloc_tag)
>>>                        alloc_tag_set_inaccurate(task->alloc_tag);
>>>        }
>>>    }
>>>
>>>    static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
>>> -                                unsigned int nr)
>>> +                                unsigned int nr, gfp_t gfp_flags)
>>>    {
>>>        if (mem_alloc_profiling_enabled())
>>> -             __pgalloc_tag_add(page, task, nr);
>>> +             __pgalloc_tag_add(page, task, nr, gfp_flags);
>>>    }
>>>
>>>    /* Should be called only if mem_alloc_profiling_enabled() */
>>> @@ -1312,7 +1312,7 @@ static inline void pgalloc_tag_sub_pages
>>>    #else /* CONFIG_MEM_ALLOC_PROFILING */
>>>
>>>    static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
>>> -                                unsigned int nr) {}
>>> +                                unsigned int nr, gfp_t gfp_flags) {}
>>>    static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
>>>    static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
>>>
>>> @@ -1867,7 +1867,7 @@ inline void post_alloc_hook(struct page
>>>
>>>        set_page_owner(page, order, gfp_flags);
>>>        page_table_check_alloc(page, order);
>>> -     pgalloc_tag_add(page, current, 1 << order);
>>> +     pgalloc_tag_add(page, current, 1 << order, gfp_flags);
>>>    }
>>>
>>>    static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
>>> _
>>>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
  2026-06-04  2:46         ` Hao Ge
@ 2026-06-04  5:29           ` Hao Ge
  2026-06-04 23:52           ` Suren Baghdasaryan
  1 sibling, 0 replies; 9+ messages in thread
From: Hao Ge @ 2026-06-04  5:29 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Andrew Morton, linux-mm, linux-kernel, Kent Overstreet,
	Roman Gushchin


On 2026/6/4 10:46, Hao Ge wrote:
>
> On 2026/6/4 00:54, Suren Baghdasaryan wrote:
>> On Tue, May 26, 2026 at 10:22 PM Hao Ge <hao.ge@linux.dev> wrote:
>>>
>>> On 2026/5/27 10:00, Andrew Morton wrote:
>>>> On Fri, 8 May 2026 17:12:51 -0700 Andrew Morton 
>>>> <akpm@linux-foundation.org> wrote:
>>>>
>>>>> On Wed,  6 May 2026 10:22:56 +0800 Hao Ge <hao.ge@linux.dev> wrote:
>>>>>
>>>>>> Pages allocated before page_ext is available have their codetag left
>>>>>> uninitialized. Track these early PFNs and clear their codetag in
>>>>>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
>>>>>> warnings when they are freed later.
>>>>>>
>>>>>> Currently a fixed-size array of 8192 entries is used, with a 
>>>>>> warning if
>>>>>> the limit is exceeded. However, the number of early allocations 
>>>>>> depends
>>>>>> on the number of CPUs and can be larger than 8192.
>>>>>>
>>>>>> Replace the fixed-size array with a dynamically allocated linked 
>>>>>> list
>>>>>> of pfn_pool structs. Each node is allocated via alloc_page() and 
>>>>>> mapped
>>>>>> to a pfn_pool containing a next pointer, an atomic slot counter, 
>>>>>> and a
>>>>>> PFN array that fills the remainder of the page.
>>>>>>
>>>>>> The tracking pages themselves are allocated via alloc_page(), which
>>>>>> would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
>>>>>> recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
>>>>>> %__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
>>>>>> so that the early path can skip recording allocations that carry 
>>>>>> this
>>>>>> flag.
>>>>> AI review asked a couple of things.  I have a feeling we saw at least
>>>>> one of these, so probably already dealt with.
>>>>> https://sashiko.dev/#/patchset/20260506022256.32664-1-hao.ge@linux.dev 
>>>>>
>>> Hi Andrew
>>>
>>> My apologies. I'm also waiting for Suren's review. He may have been 
>>> tied
>>> up lately
>>>
>>> and might not have time to get to this.
>>>
>>>
>>> Sashiko raised two issues this time. I've already responded to the 
>>> first
>>> one.
>>>
>>> See the link below:
>>>
>>> https://lore.kernel.org/all/0b9969e2-b208-46c2-a9a5-bf620239275a@linux.dev/ 
>>>
>>>
>>> If I haven't missed any details, it should be a false positive.
>> That seems to be the case. I wonder why Sashiko did not consider
>> that... CC'ing Roman to see if Sashiko can be improved (unless we both
>> are missing something).
>>
>>>
>>> As for the second point, let me address it.
>>>
>>> The early PFN tracking window is entirely within mm_core_init(),
>>>
>>> which is called from start_kernel():
>>>
>>> start_kernel()
>>>
>>>       mm_core_init()
>>>
>>>           memblock_free_all();
>>>
>>>           mem_init() //start early PFN tracking
>>>
>>>           kmem_cache_init()                           // SLUB 
>>> bootstrap +
>>> kmalloc caches
>>> ...
>>>           page_ext_init()                                   // clears
>>> alloc_tag_add_early_pfn_ptr
>>>
>>>       ...
>>>
>>>       rest_init() //spawns kernel_init thread
>>>
>>>
>>> kernel_init() → kernel_init_freeable()            // separate 
>>> thread, later
>>>
>>>       smp_init()                                    // secondary CPUs
>>> come online here
>>>
>>> Within the early PFN window (mem_init() to page_ext_init()):
>>>
>>>    1. We are still in start_kernel(), single CPU. The buddy allocator
>>>
>>> was just initialized from memblock and should have plenty of free
>>>
>>> pages, so alloc_page() would likely be satisfied from the fast
>>>
>>> path. If so, the __GFP_NOFAIL without __GFP_DIRECT_RECLAIM
>>>
>>> check in the slowpath would not be reached.
>>>
>>> 2. Since only the boot CPU is running, alloc_page() targets the
>>>
>>> boot node, which has memory. So even if __GFP_THISNODE were
>>>
>>> inherited, it would not fail on the boot node during this window.
>>>
>>>
>>> So Sashiko's analysis applies to the general case, and indeed the 
>>> issues
>>>
>>> he raised could occur there.
>>>
>>> However, in the early boot scenario, I believe the current patch is 
>>> safe,
>>>
>>> even though it is not fully generic (after all, no one can predict
>>> future use cases).
>>>
>>> Therefore, I agree with his suggestion of using a clean mask like
>>> GFP_NOWAIT | __GFP_NOWARN.
>
> Hi Suren
>
> I've been thinking about the GFP flags issue for the past few days. 
> There are actually a couple of issues
>
> with the suggestion of using GFP_NOWAIT | __GFP_NOWARN.
>
> First, GFP_NOWAIT already includes __GFP_NOWARN, so it's redundant.
>
> Second, GFP_NOWAIT also includes __GFP_KSWAPD_RECLAIM, which is 
> exactly the same issue he flagged
>
> previously with GFP_ATOMIC — it can still trigger wakeup_kswapd() and 
> acquire scheduler locks, leading to
>
> potential deadlock in the same scenario he described.

Sorry for omitting the relevant link earlier; it is attached below.

https://sashiko.dev/#/patchset/20260420141534.1009462-1-hao.ge%40linux.dev


 > +    page = alloc_page(GFP_ATOMIC | __GFP_NO_CODETAG | __GFP_ZERO);
Can this lead to a deadlock by introducing lock recursion?
alloc_early_pfn_node() is invoked as a post-allocation hook for early boot
pages via pgalloc_tag_add(). GFP_ATOMIC includes __GFP_KSWAPD_RECLAIM,
which triggers wakeup_kswapd() and acquires scheduler locks.
If the original allocation was made under scheduler locks and intentionally
stripped __GFP_KSWAPD_RECLAIM to prevent recursion, does this hardcoded
GFP_ATOMIC force it back on? Should the hook inherit or constrain its flags
based on the caller's gfp_flags instead?


>
> So I think __GFP_HIGH | __GFP_NO_CODETAG is the right choice.
>
> Since this runs under rcu_read_lock(), we can't have 
> __GFP_DIRECT_RECLAIM.
>
> And since Sashiko pointed out the scheduler lock concern with 
> __GFP_KSWAPD_RECLAIM,
>
> we can't have that either.
>
> I have posted the v6 revision, would you please kindly review it at 
> your convenience?
>
> https://lore.kernel.org/all/20260604024008.46592-1-hao.ge@linux.dev/


The details of the Sashiko review for v6 are as follows:

https://sashiko.dev/#/patchset/20260604024008.46592-1-hao.ge%40linux.dev

Apart from the known false positive, Sashiko raised no further review
comments.


Thanks

Best Regards

Hao

>
> Thanks
>
> Best Regards
>
> Hao
>
>> This sounds good to me. With that change feel free to add:
>>
>> Acked-by: Suren Baghdasaryan <surenb@google.com>
>>
>>>
>>> In any case, I will wait for your and Suren's feedback. You may have
>>> different opinions on this matter.
>>>
>>>
>>> Thanks
>>>
>>> Best Regards
>>>
>>> Hao
>>>
>>>
>>>> Please?
>>>>
>>>> Also, this patch has no evidence of human review.
>>>>
>>>>
>>>> From: Hao Ge <hao.ge@linux.dev>
>>>> Subject: mm/alloc_tag: replace fixed-size early PFN array with 
>>>> dynamic linked list
>>>> Date: Wed, 6 May 2026 10:22:56 +0800
>>>>
>>>> Pages allocated before page_ext is available have their codetag left
>>>> uninitialized.  Track these early PFNs and clear their codetag in
>>>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set" 
>>>> warnings
>>>> when they are freed later.
>>>>
>>>> Currently a fixed-size array of 8192 entries is used, with a 
>>>> warning if
>>>> the limit is exceeded.  However, the number of early allocations 
>>>> depends
>>>> on the number of CPUs and can be larger than 8192.
>>>>
>>>> Replace the fixed-size array with a dynamically allocated linked 
>>>> list of
>>>> pfn_pool structs.  Each node is allocated via alloc_page() and 
>>>> mapped to a
>>>> pfn_pool containing a next pointer, an atomic slot counter, and a PFN
>>>> array that fills the remainder of the page.
>>>>
>>>> The tracking pages themselves are allocated via alloc_page(), which 
>>>> would
>>>> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
>>>> indefinitely.  Introduce __GFP_NO_CODETAG (reuses the 
>>>> %__GFP_NO_OBJ_EXT
>>>> bit) and pass gfp_flags through pgalloc_tag_add() so that the early 
>>>> path
>>>> can skip recording allocations that carry this flag.
>>>>
>>>> Link: https://lore.kernel.org/20260506022256.32664-1-hao.ge@linux.dev
>>>> Signed-off-by: Hao Ge <hao.ge@linux.dev>
>>>> Suggested-by: Suren Baghdasaryan <surenb@google.com>
>>>> Cc: Brendan Jackman <jackmanb@google.com>
>>>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>>>> Cc: Kent Overstreet <kent.overstreet@linux.dev>
>>>> Cc: Michal Hocko <mhocko@suse.com>
>>>> Cc: Vlastimil Babka <vbabka@kernel.org>
>>>> Cc: Zi Yan <ziy@nvidia.com>
>>>> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
>>>> ---
>>>>
>>>>    include/linux/alloc_tag.h |    4
>>>>    lib/alloc_tag.c           |  145 
>>>> +++++++++++++++++++++++-------------
>>>>    mm/page_alloc.c           |   12 +-
>>>>    3 files changed, 102 insertions(+), 59 deletions(-)
>>>>
>>>> --- 
>>>> a/include/linux/alloc_tag.h~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
>>>> +++ a/include/linux/alloc_tag.h
>>>> @@ -163,11 +163,11 @@ static inline void alloc_tag_sub_check(u
>>>>    {
>>>>        WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
>>>>    }
>>>> -void alloc_tag_add_early_pfn(unsigned long pfn);
>>>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
>>>>    #else
>>>>    static inline void alloc_tag_add_check(union codetag_ref *ref, 
>>>> struct alloc_tag *tag) {}
>>>>    static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
>>>> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
>>>> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, 
>>>> gfp_t gfp_flags) {}
>>>>    #endif
>>>>
>>>>    /* Caller should verify both ref and tag to be valid */
>>>> --- 
>>>> a/lib/alloc_tag.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
>>>> +++ a/lib/alloc_tag.c
>>>> @@ -767,60 +767,95 @@ static __init bool need_page_alloc_taggi
>>>>     * their codetag uninitialized. Track these early PFNs so we can 
>>>> clear
>>>>     * their codetag refs later to avoid warnings when they are freed.
>>>>     *
>>>> - * Early allocations include:
>>>> - *   - Base allocations independent of CPU count
>>>> - *   - Per-CPU allocations (e.g., CPU hotplug callbacks during 
>>>> smp_init,
>>>> - *     such as trace ring buffers, scheduler per-cpu data)
>>>> - *
>>>> - * For simplicity, we fix the size to 8192.
>>>> - * If insufficient, a warning will be triggered to alert the user.
>>>> + * Each page is cast to a pfn_pool: the first few bytes hold metadata
>>>> + * (next pointer and slot count), the remainder stores PFNs.
>>>> + */
>>>> +struct pfn_pool {
>>>> +     struct pfn_pool *next;
>>>> +     atomic_t count;
>>>> +     unsigned long pfns[];
>>>> +};
>>>> +
>>>> +#define PFN_POOL_SIZE                        ((PAGE_SIZE - 
>>>> offsetof(struct pfn_pool, pfns)) / \
>>>> +                                      sizeof(unsigned long))
>>>> +
>>>> +/*
>>>> + * Skip early PFN recording for a page allocation.  Reuses the
>>>> + * %__GFP_NO_OBJ_EXT bit.  Used by __alloc_tag_add_early_pfn() to 
>>>> avoid
>>>> + * recursion when allocating pages for the early PFN tracking list
>>>> + * itself.
>>>>     *
>>>> - * TODO: Replace fixed-size array with dynamic allocation using
>>>> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
>>>> + * Codetags of the pages allocated with __GFP_NO_CODETAG should be
>>>> + * cleared (via clear_page_tag_ref()) before freeing the pages to 
>>>> prevent
>>>> + * alloc_tag_sub_check() from triggering a warning.
>>>>     */
>>>> -#define EARLY_ALLOC_PFN_MAX          8192
>>>> +#define __GFP_NO_CODETAG             __GFP_NO_OBJ_EXT
>>>>
>>>> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
>>>> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
>>>> +static struct pfn_pool *current_pfn_pool __initdata;
>>>>
>>>> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
>>>> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, 
>>>> gfp_t gfp_flags)
>>>>    {
>>>> -     int old_idx, new_idx;
>>>> +     struct pfn_pool *pool;
>>>> +     int idx;
>>>>
>>>>        do {
>>>> -             old_idx = atomic_read(&early_pfn_count);
>>>> -             if (old_idx >= EARLY_ALLOC_PFN_MAX) {
>>>> -                     pr_warn_once("Early page allocations before 
>>>> page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
>>>> -                                   EARLY_ALLOC_PFN_MAX);
>>>> -                     return;
>>>> +             pool = READ_ONCE(current_pfn_pool);
>>>> +             if (!pool || atomic_read(&pool->count) >= 
>>>> PFN_POOL_SIZE) {
>>>> +                     gfp_t gfp = gfp_flags & 
>>>> ~(__GFP_DIRECT_RECLAIM | GFP_ZONEMASK);
>>>> +                     struct page *new_page = alloc_page(gfp | 
>>>> __GFP_NO_CODETAG);
>>>> +                     struct pfn_pool *new;
>>>> +
>>>> +                     if (!new_page) {
>>>> +                             pr_warn_once("early PFN tracking page 
>>>> allocation failed\n");
>>>> +                             return;
>>>> +                     }
>>>> +                     new = page_address(new_page);
>>>> +                     new->next = pool;
>>>> +                     atomic_set(&new->count, 0);
>>>> +                     if (cmpxchg(&current_pfn_pool, pool, new) != 
>>>> pool) {
>>>> +                             clear_page_tag_ref(new_page);
>>>> +                             __free_page(new_page);
>>>> +                             continue;
>>>> +                     }
>>>> +                     pool = new;
>>>>                }
>>>> -             new_idx = old_idx + 1;
>>>> -     } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, 
>>>> new_idx));
>>>> +             idx = atomic_read(&pool->count);
>>>> +             if (idx >= PFN_POOL_SIZE)
>>>> +                     continue;
>>>> +             if (atomic_cmpxchg(&pool->count, idx, idx + 1) == idx)
>>>> +                     break;
>>>> +     } while (1);
>>>>
>>>> -     early_pfns[old_idx] = pfn;
>>>> +     pool->pfns[idx] = pfn;
>>>>    }
>>>>
>>>> -typedef void alloc_tag_add_func(unsigned long pfn);
>>>> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
>>>>    static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr 
>>>> __refdata =
>>>>        RCU_INITIALIZER(__alloc_tag_add_early_pfn);
>>>>
>>>> -void alloc_tag_add_early_pfn(unsigned long pfn)
>>>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>>>>    {
>>>>        alloc_tag_add_func *alloc_tag_add;
>>>>
>>>>        if (static_key_enabled(&mem_profiling_compressed))
>>>>                return;
>>>>
>>>> +     /* Skip allocations for the tracking list itself to avoid 
>>>> recursion. */
>>>> +     if (gfp_flags & __GFP_NO_CODETAG)
>>>> +             return;
>>>> +
>>>>        rcu_read_lock();
>>>>        alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
>>>>        if (alloc_tag_add)
>>>> -             alloc_tag_add(pfn);
>>>> +             alloc_tag_add(pfn, gfp_flags);
>>>>        rcu_read_unlock();
>>>>    }
>>>>
>>>>    static void __init clear_early_alloc_pfn_tag_refs(void)
>>>>    {
>>>> -     unsigned int i;
>>>> +     struct pfn_pool *pool, *next;
>>>> +     struct page *page;
>>>> +     int i;
>>>>
>>>>        if (static_key_enabled(&mem_profiling_compressed))
>>>>                return;
>>>> @@ -829,37 +864,45 @@ static void __init clear_early_alloc_pfn
>>>>        /* Make sure we are not racing with 
>>>> __alloc_tag_add_early_pfn() */
>>>>        synchronize_rcu();
>>>>
>>>> -     for (i = 0; i < atomic_read(&early_pfn_count); i++) {
>>>> -             unsigned long pfn = early_pfns[i];
>>>> +     for (pool = current_pfn_pool; pool; pool = next) {
>>>> +             int nr_pfns = atomic_read(&pool->count);
>>>> +
>>>> +             for (i = 0; i < nr_pfns; i++) {
>>>> +                     unsigned long pfn = pool->pfns[i];
>>>>
>>>> -             if (pfn_valid(pfn)) {
>>>> -                     struct page *page = pfn_to_page(pfn);
>>>> -                     union pgtag_ref_handle handle;
>>>> -                     union codetag_ref ref;
>>>> -
>>>> -                     if (get_page_tag_ref(page, &ref, &handle)) {
>>>> -                             /*
>>>> -                              * An early-allocated page could be 
>>>> freed and reallocated
>>>> -                              * after its page_ext is initialized 
>>>> but before we clear it.
>>>> -                              * In that case, it already has a 
>>>> valid tag set.
>>>> -                              * We should not overwrite that valid 
>>>> tag with CODETAG_EMPTY.
>>>> -                              *
>>>> -                              * Note: there is still a small race 
>>>> window between checking
>>>> -                              * ref.ct and calling 
>>>> set_codetag_empty(). We accept this
>>>> -                              * race as it's unlikely and the 
>>>> extra complexity of atomic
>>>> -                              * cmpxchg is not worth it for this 
>>>> debug-only code path.
>>>> -                              */
>>>> -                             if (ref.ct) {
>>>> +                     if (pfn_valid(pfn)) {
>>>> +                             union pgtag_ref_handle handle;
>>>> +                             union codetag_ref ref;
>>>> +
>>>> +                             if 
>>>> (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
>>>> +                                     /*
>>>> +                                      * An early-allocated page 
>>>> could be freed and reallocated
>>>> +                                      * after its page_ext is 
>>>> initialized but before we clear it.
>>>> +                                      * In that case, it already 
>>>> has a valid tag set.
>>>> +                                      * We should not overwrite 
>>>> that valid tag
>>>> +                                      * with CODETAG_EMPTY.
>>>> +                                      *
>>>> +                                      * Note: there is still a 
>>>> small race window between checking
>>>> +                                      * ref.ct and calling 
>>>> set_codetag_empty(). We accept this
>>>> +                                      * race as it's unlikely and 
>>>> the extra complexity of atomic
>>>> +                                      * cmpxchg is not worth it 
>>>> for this debug-only code path.
>>>> +                                      */
>>>> +                                     if (ref.ct) {
>>>> + put_page_tag_ref(handle);
>>>> +                                             continue;
>>>> +                                     }
>>>> +
>>>> + set_codetag_empty(&ref);
>>>> + update_page_tag_ref(handle, &ref);
>>>> put_page_tag_ref(handle);
>>>> -                                     continue;
>>>>                                }
>>>> -
>>>> -                             set_codetag_empty(&ref);
>>>> -                             update_page_tag_ref(handle, &ref);
>>>> -                             put_page_tag_ref(handle);
>>>>                        }
>>>>                }
>>>>
>>>> +             next = pool->next;
>>>> +             page = virt_to_page(pool);
>>>> +             clear_page_tag_ref(page);
>>>> +             __free_page(page);
>>>>        }
>>>>    }
>>>>    #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
>>>> --- 
>>>> a/mm/page_alloc.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
>>>> +++ a/mm/page_alloc.c
>>>> @@ -1255,7 +1255,7 @@ void __clear_page_tag_ref(struct page *p
>>>>    /* Should be called only if mem_alloc_profiling_enabled() */
>>>>    static noinline
>>>>    void __pgalloc_tag_add(struct page *page, struct task_struct *task,
>>>> -                    unsigned int nr)
>>>> +                    unsigned int nr, gfp_t gfp_flags)
>>>>    {
>>>>        union pgtag_ref_handle handle;
>>>>        union codetag_ref ref;
>>>> @@ -1269,17 +1269,17 @@ void __pgalloc_tag_add(struct page *page
>>>>                 * page_ext is not available yet, record the pfn so 
>>>> we can
>>>>                 * clear the tag ref later when page_ext is 
>>>> initialized.
>>>>                 */
>>>> -             alloc_tag_add_early_pfn(page_to_pfn(page));
>>>> +             alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
>>>>                if (task->alloc_tag)
>>>> alloc_tag_set_inaccurate(task->alloc_tag);
>>>>        }
>>>>    }
>>>>
>>>>    static inline void pgalloc_tag_add(struct page *page, struct 
>>>> task_struct *task,
>>>> -                                unsigned int nr)
>>>> +                                unsigned int nr, gfp_t gfp_flags)
>>>>    {
>>>>        if (mem_alloc_profiling_enabled())
>>>> -             __pgalloc_tag_add(page, task, nr);
>>>> +             __pgalloc_tag_add(page, task, nr, gfp_flags);
>>>>    }
>>>>
>>>>    /* Should be called only if mem_alloc_profiling_enabled() */
>>>> @@ -1312,7 +1312,7 @@ static inline void pgalloc_tag_sub_pages
>>>>    #else /* CONFIG_MEM_ALLOC_PROFILING */
>>>>
>>>>    static inline void pgalloc_tag_add(struct page *page, struct 
>>>> task_struct *task,
>>>> -                                unsigned int nr) {}
>>>> +                                unsigned int nr, gfp_t gfp_flags) {}
>>>>    static inline void pgalloc_tag_sub(struct page *page, unsigned 
>>>> int nr) {}
>>>>    static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, 
>>>> unsigned int nr) {}
>>>>
>>>> @@ -1867,7 +1867,7 @@ inline void post_alloc_hook(struct page
>>>>
>>>>        set_page_owner(page, order, gfp_flags);
>>>>        page_table_check_alloc(page, order);
>>>> -     pgalloc_tag_add(page, current, 1 << order);
>>>> +     pgalloc_tag_add(page, current, 1 << order, gfp_flags);
>>>>    }
>>>>
>>>>    static void prep_new_page(struct page *page, unsigned int order, 
>>>> gfp_t gfp_flags,
>>>> _
>>>>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
  2026-06-04  2:46         ` Hao Ge
  2026-06-04  5:29           ` Hao Ge
@ 2026-06-04 23:52           ` Suren Baghdasaryan
  1 sibling, 0 replies; 9+ messages in thread
From: Suren Baghdasaryan @ 2026-06-04 23:52 UTC (permalink / raw)
  To: Hao Ge
  Cc: Andrew Morton, linux-mm, linux-kernel, Kent Overstreet,
	Roman Gushchin

On Wed, Jun 3, 2026 at 7:47 PM Hao Ge <hao.ge@linux.dev> wrote:
>
>
> On 2026/6/4 00:54, Suren Baghdasaryan wrote:
> > On Tue, May 26, 2026 at 10:22 PM Hao Ge <hao.ge@linux.dev> wrote:
> >>
> >> On 2026/5/27 10:00, Andrew Morton wrote:
> >>> On Fri, 8 May 2026 17:12:51 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:
> >>>
> >>>> On Wed,  6 May 2026 10:22:56 +0800 Hao Ge <hao.ge@linux.dev> wrote:
> >>>>
> >>>>> Pages allocated before page_ext is available have their codetag left
> >>>>> uninitialized. Track these early PFNs and clear their codetag in
> >>>>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> >>>>> warnings when they are freed later.
> >>>>>
> >>>>> Currently a fixed-size array of 8192 entries is used, with a warning if
> >>>>> the limit is exceeded. However, the number of early allocations depends
> >>>>> on the number of CPUs and can be larger than 8192.
> >>>>>
> >>>>> Replace the fixed-size array with a dynamically allocated linked list
> >>>>> of pfn_pool structs. Each node is allocated via alloc_page() and mapped
> >>>>> to a pfn_pool containing a next pointer, an atomic slot counter, and a
> >>>>> PFN array that fills the remainder of the page.
> >>>>>
> >>>>> The tracking pages themselves are allocated via alloc_page(), which
> >>>>> would trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and
> >>>>> recurse indefinitely. Introduce __GFP_NO_CODETAG (reuses the
> >>>>> %__GFP_NO_OBJ_EXT bit) and pass gfp_flags through pgalloc_tag_add()
> >>>>> so that the early path can skip recording allocations that carry this
> >>>>> flag.
> >>>> AI review asked a couple of things.  I have a feeling we saw at least
> >>>> one of these, so probably already dealt with.
> >>>>       https://sashiko.dev/#/patchset/20260506022256.32664-1-hao.ge@linux.dev
> >> Hi Andrew
> >>
> >> My apologies. I'm also waiting for Suren's review. He may have been tied
> >> up lately
> >>
> >> and might not have had time to get to this.
> >>
> >>
> >> Sashiko raised two issues this time. I've already responded to the first
> >> one.
> >>
> >> See the link below:
> >>
> >> https://lore.kernel.org/all/0b9969e2-b208-46c2-a9a5-bf620239275a@linux.dev/
> >>
> >> If I haven't missed any details, it should be a false positive.
> > That seems to be the case. I wonder why Sashiko did not consider
> > that... CC'ing Roman to see if Sashiko can be improved (unless we both
> > are missing something).
> >
> >>
> >> As for the second point, let me address it.
> >>
> >> The early PFN tracking window is entirely within mm_core_init(),
> >>
> >> which is called from start_kernel():
> >>
> >> start_kernel()
> >>
> >>       mm_core_init()
> >>
> >>           memblock_free_all();
> >>
> >>           mem_init() //start early PFN tracking
> >>
> >>           kmem_cache_init()                           // SLUB bootstrap +
> >> kmalloc caches
> >> ...
> >>           page_ext_init()                                   // clears
> >> alloc_tag_add_early_pfn_ptr
> >>
> >>       ...
> >>
> >>       rest_init() //spawns kernel_init thread
> >>
> >>
> >> kernel_init() → kernel_init_freeable()            // separate thread, later
> >>
> >>       smp_init()                                    // secondary CPUs
> >> come online here
> >>
> >> Within the early PFN window (mem_init() to page_ext_init()):
> >>
> >>    1. We are still in start_kernel(), single CPU. The buddy allocator
> >>
> >> was just initialized from memblock and should have plenty of free
> >>
> >> pages, so alloc_page() would likely be satisfied from the fast
> >>
> >> path. If so, the __GFP_NOFAIL without __GFP_DIRECT_RECLAIM
> >>
> >> check in the slowpath would not be reached.
> >>
> >> 2. Since only the boot CPU is running, alloc_page() targets the
> >>
> >> boot node, which has memory. So even if __GFP_THISNODE were
> >>
> >> inherited, it would not fail on the boot node during this window.
> >>
> >>
> >> So Sashiko's analysis applies to the general case, and indeed the issues
> >>
> >> he raised could occur there.
> >>
> >> However, in the early boot scenario, I believe the current patch is safe,
> >>
> >> even though it is not fully generic (after all, no one can predict
> >> future use cases).
> >>
> >> Therefore, I agree with his suggestion of using a clean mask like
> >> GFP_NOWAIT | __GFP_NOWARN.
>
> Hi Suren
>
> I've been thinking about the GFP flags issue for the past few days.
> There are actually a couple of issues
>
> with the suggestion of using GFP_NOWAIT | __GFP_NOWARN.
>
> First, GFP_NOWAIT already includes __GFP_NOWARN, so it's redundant.
>
> Second, GFP_NOWAIT also includes __GFP_KSWAPD_RECLAIM, which is exactly
> the same issue he flagged
>
> previously with GFP_ATOMIC — it can still trigger wakeup_kswapd() and
> acquire scheduler locks, leading to
>
> potential deadlock in the same scenario he described.
>
> So I think __GFP_HIGH | __GFP_NO_CODETAG is the right choice.

Makes sense. TBH, if we are failing to allocate a 0-order page during
the early init stage then something is seriously wrong with the
system.

>
> Since this runs under rcu_read_lock(), we can't have __GFP_DIRECT_RECLAIM.
>
> And since Sashiko pointed out the scheduler lock concern with
> __GFP_KSWAPD_RECLAIM,
>
> we can't have that either.
>
> I have posted the v6 revision, would you please kindly review it at your
> convenience?
>
> https://lore.kernel.org/all/20260604024008.46592-1-hao.ge@linux.dev/
>
> Thanks
>
> Best Regards
>
> Hao
>
> > This sounds good to me. With that change feel free to add:
> >
> > Acked-by: Suren Baghdasaryan <surenb@google.com>
> >
> >>
> >> In any case, I will wait for your and Suren's feedback. You may have
> >> different opinions on this matter.
> >>
> >>
> >> Thanks
> >>
> >> Best Regards
> >>
> >> Hao
> >>
> >>
> >>> Please?
> >>>
> >>> Also, this patch has no evidence of human review.
> >>>
> >>>
> >>> From: Hao Ge <hao.ge@linux.dev>
> >>> Subject: mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
> >>> Date: Wed, 6 May 2026 10:22:56 +0800
> >>>
> >>> Pages allocated before page_ext is available have their codetag left
> >>> uninitialized.  Track these early PFNs and clear their codetag in
> >>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set" warnings
> >>> when they are freed later.
> >>>
> >>> Currently a fixed-size array of 8192 entries is used, with a warning if
> >>> the limit is exceeded.  However, the number of early allocations depends
> >>> on the number of CPUs and can be larger than 8192.
> >>>
> >>> Replace the fixed-size array with a dynamically allocated linked list of
> >>> pfn_pool structs.  Each node is allocated via alloc_page() and mapped to a
> >>> pfn_pool containing a next pointer, an atomic slot counter, and a PFN
> >>> array that fills the remainder of the page.
> >>>
> >>> The tracking pages themselves are allocated via alloc_page(), which would
> >>> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
> >>> indefinitely.  Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
> >>> bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
> >>> can skip recording allocations that carry this flag.
> >>>
> >>> Link: https://lore.kernel.org/20260506022256.32664-1-hao.ge@linux.dev
> >>> Signed-off-by: Hao Ge <hao.ge@linux.dev>
> >>> Suggested-by: Suren Baghdasaryan <surenb@google.com>
> >>> Cc: Brendan Jackman <jackmanb@google.com>
> >>> Cc: Johannes Weiner <hannes@cmpxchg.org>
> >>> Cc: Kent Overstreet <kent.overstreet@linux.dev>
> >>> Cc: Michal Hocko <mhocko@suse.com>
> >>> Cc: Vlastimil Babka <vbabka@kernel.org>
> >>> Cc: Zi Yan <ziy@nvidia.com>
> >>> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> >>> ---
> >>>
> >>>    include/linux/alloc_tag.h |    4
> >>>    lib/alloc_tag.c           |  145 +++++++++++++++++++++++-------------
> >>>    mm/page_alloc.c           |   12 +-
> >>>    3 files changed, 102 insertions(+), 59 deletions(-)
> >>>
> >>> --- a/include/linux/alloc_tag.h~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> >>> +++ a/include/linux/alloc_tag.h
> >>> @@ -163,11 +163,11 @@ static inline void alloc_tag_sub_check(u
> >>>    {
> >>>        WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> >>>    }
> >>> -void alloc_tag_add_early_pfn(unsigned long pfn);
> >>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
> >>>    #else
> >>>    static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
> >>>    static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
> >>> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
> >>> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
> >>>    #endif
> >>>
> >>>    /* Caller should verify both ref and tag to be valid */
> >>> --- a/lib/alloc_tag.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> >>> +++ a/lib/alloc_tag.c
> >>> @@ -767,60 +767,95 @@ static __init bool need_page_alloc_taggi
> >>>     * their codetag uninitialized. Track these early PFNs so we can clear
> >>>     * their codetag refs later to avoid warnings when they are freed.
> >>>     *
> >>> - * Early allocations include:
> >>> - *   - Base allocations independent of CPU count
> >>> - *   - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
> >>> - *     such as trace ring buffers, scheduler per-cpu data)
> >>> - *
> >>> - * For simplicity, we fix the size to 8192.
> >>> - * If insufficient, a warning will be triggered to alert the user.
> >>> + * Each page is cast to a pfn_pool: the first few bytes hold metadata
> >>> + * (next pointer and slot count), the remainder stores PFNs.
> >>> + */
> >>> +struct pfn_pool {
> >>> +     struct pfn_pool *next;
> >>> +     atomic_t count;
> >>> +     unsigned long pfns[];
> >>> +};
> >>> +
> >>> +#define PFN_POOL_SIZE                        ((PAGE_SIZE - offsetof(struct pfn_pool, pfns)) / \
> >>> +                                      sizeof(unsigned long))
> >>> +
> >>> +/*
> >>> + * Skip early PFN recording for a page allocation.  Reuses the
> >>> + * %__GFP_NO_OBJ_EXT bit.  Used by __alloc_tag_add_early_pfn() to avoid
> >>> + * recursion when allocating pages for the early PFN tracking list
> >>> + * itself.
> >>>     *
> >>> - * TODO: Replace fixed-size array with dynamic allocation using
> >>> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
> >>> + * Codetags of the pages allocated with __GFP_NO_CODETAG should be
> >>> + * cleared (via clear_page_tag_ref()) before freeing the pages to prevent
> >>> + * alloc_tag_sub_check() from triggering a warning.
> >>>     */
> >>> -#define EARLY_ALLOC_PFN_MAX          8192
> >>> +#define __GFP_NO_CODETAG             __GFP_NO_OBJ_EXT
> >>>
> >>> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
> >>> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
> >>> +static struct pfn_pool *current_pfn_pool __initdata;
> >>>
> >>> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
> >>> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >>>    {
> >>> -     int old_idx, new_idx;
> >>> +     struct pfn_pool *pool;
> >>> +     int idx;
> >>>
> >>>        do {
> >>> -             old_idx = atomic_read(&early_pfn_count);
> >>> -             if (old_idx >= EARLY_ALLOC_PFN_MAX) {
> >>> -                     pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
> >>> -                                   EARLY_ALLOC_PFN_MAX);
> >>> -                     return;
> >>> +             pool = READ_ONCE(current_pfn_pool);
> >>> +             if (!pool || atomic_read(&pool->count) >= PFN_POOL_SIZE) {
> >>> +                     gfp_t gfp = gfp_flags & ~(__GFP_DIRECT_RECLAIM | GFP_ZONEMASK);
> >>> +                     struct page *new_page = alloc_page(gfp | __GFP_NO_CODETAG);
> >>> +                     struct pfn_pool *new;
> >>> +
> >>> +                     if (!new_page) {
> >>> +                             pr_warn_once("early PFN tracking page allocation failed\n");
> >>> +                             return;
> >>> +                     }
> >>> +                     new = page_address(new_page);
> >>> +                     new->next = pool;
> >>> +                     atomic_set(&new->count, 0);
> >>> +                     if (cmpxchg(&current_pfn_pool, pool, new) != pool) {
> >>> +                             clear_page_tag_ref(new_page);
> >>> +                             __free_page(new_page);
> >>> +                             continue;
> >>> +                     }
> >>> +                     pool = new;
> >>>                }
> >>> -             new_idx = old_idx + 1;
> >>> -     } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
> >>> +             idx = atomic_read(&pool->count);
> >>> +             if (idx >= PFN_POOL_SIZE)
> >>> +                     continue;
> >>> +             if (atomic_cmpxchg(&pool->count, idx, idx + 1) == idx)
> >>> +                     break;
> >>> +     } while (1);
> >>>
> >>> -     early_pfns[old_idx] = pfn;
> >>> +     pool->pfns[idx] = pfn;
> >>>    }
> >>>
> >>> -typedef void alloc_tag_add_func(unsigned long pfn);
> >>> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
> >>>    static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
> >>>        RCU_INITIALIZER(__alloc_tag_add_early_pfn);
> >>>
> >>> -void alloc_tag_add_early_pfn(unsigned long pfn)
> >>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >>>    {
> >>>        alloc_tag_add_func *alloc_tag_add;
> >>>
> >>>        if (static_key_enabled(&mem_profiling_compressed))
> >>>                return;
> >>>
> >>> +     /* Skip allocations for the tracking list itself to avoid recursion. */
> >>> +     if (gfp_flags & __GFP_NO_CODETAG)
> >>> +             return;
> >>> +
> >>>        rcu_read_lock();
> >>>        alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
> >>>        if (alloc_tag_add)
> >>> -             alloc_tag_add(pfn);
> >>> +             alloc_tag_add(pfn, gfp_flags);
> >>>        rcu_read_unlock();
> >>>    }
> >>>
> >>>    static void __init clear_early_alloc_pfn_tag_refs(void)
> >>>    {
> >>> -     unsigned int i;
> >>> +     struct pfn_pool *pool, *next;
> >>> +     struct page *page;
> >>> +     int i;
> >>>
> >>>        if (static_key_enabled(&mem_profiling_compressed))
> >>>                return;
> >>> @@ -829,37 +864,45 @@ static void __init clear_early_alloc_pfn
> >>>        /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
> >>>        synchronize_rcu();
> >>>
> >>> -     for (i = 0; i < atomic_read(&early_pfn_count); i++) {
> >>> -             unsigned long pfn = early_pfns[i];
> >>> +     for (pool = current_pfn_pool; pool; pool = next) {
> >>> +             int nr_pfns = atomic_read(&pool->count);
> >>> +
> >>> +             for (i = 0; i < nr_pfns; i++) {
> >>> +                     unsigned long pfn = pool->pfns[i];
> >>>
> >>> -             if (pfn_valid(pfn)) {
> >>> -                     struct page *page = pfn_to_page(pfn);
> >>> -                     union pgtag_ref_handle handle;
> >>> -                     union codetag_ref ref;
> >>> -
> >>> -                     if (get_page_tag_ref(page, &ref, &handle)) {
> >>> -                             /*
> >>> -                              * An early-allocated page could be freed and reallocated
> >>> -                              * after its page_ext is initialized but before we clear it.
> >>> -                              * In that case, it already has a valid tag set.
> >>> -                              * We should not overwrite that valid tag with CODETAG_EMPTY.
> >>> -                              *
> >>> -                              * Note: there is still a small race window between checking
> >>> -                              * ref.ct and calling set_codetag_empty(). We accept this
> >>> -                              * race as it's unlikely and the extra complexity of atomic
> >>> -                              * cmpxchg is not worth it for this debug-only code path.
> >>> -                              */
> >>> -                             if (ref.ct) {
> >>> +                     if (pfn_valid(pfn)) {
> >>> +                             union pgtag_ref_handle handle;
> >>> +                             union codetag_ref ref;
> >>> +
> >>> +                             if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
> >>> +                                     /*
> >>> +                                      * An early-allocated page could be freed and reallocated
> >>> +                                      * after its page_ext is initialized but before we clear it.
> >>> +                                      * In that case, it already has a valid tag set.
> >>> +                                      * We should not overwrite that valid tag
> >>> +                                      * with CODETAG_EMPTY.
> >>> +                                      *
> >>> +                                      * Note: there is still a small race window between checking
> >>> +                                      * ref.ct and calling set_codetag_empty(). We accept this
> >>> +                                      * race as it's unlikely and the extra complexity of atomic
> >>> +                                      * cmpxchg is not worth it for this debug-only code path.
> >>> +                                      */
> >>> +                                     if (ref.ct) {
> >>> +                                             put_page_tag_ref(handle);
> >>> +                                             continue;
> >>> +                                     }
> >>> +
> >>> +                                     set_codetag_empty(&ref);
> >>> +                                     update_page_tag_ref(handle, &ref);
> >>>                                        put_page_tag_ref(handle);
> >>> -                                     continue;
> >>>                                }
> >>> -
> >>> -                             set_codetag_empty(&ref);
> >>> -                             update_page_tag_ref(handle, &ref);
> >>> -                             put_page_tag_ref(handle);
> >>>                        }
> >>>                }
> >>>
> >>> +             next = pool->next;
> >>> +             page = virt_to_page(pool);
> >>> +             clear_page_tag_ref(page);
> >>> +             __free_page(page);
> >>>        }
> >>>    }
> >>>    #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
> >>> --- a/mm/page_alloc.c~mm-alloc_tag-replace-fixed-size-early-pfn-array-with-dynamic-linked-list
> >>> +++ a/mm/page_alloc.c
> >>> @@ -1255,7 +1255,7 @@ void __clear_page_tag_ref(struct page *p
> >>>    /* Should be called only if mem_alloc_profiling_enabled() */
> >>>    static noinline
> >>>    void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> >>> -                    unsigned int nr)
> >>> +                    unsigned int nr, gfp_t gfp_flags)
> >>>    {
> >>>        union pgtag_ref_handle handle;
> >>>        union codetag_ref ref;
> >>> @@ -1269,17 +1269,17 @@ void __pgalloc_tag_add(struct page *page
> >>>                 * page_ext is not available yet, record the pfn so we can
> >>>                 * clear the tag ref later when page_ext is initialized.
> >>>                 */
> >>> -             alloc_tag_add_early_pfn(page_to_pfn(page));
> >>> +             alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
> >>>                if (task->alloc_tag)
> >>>                        alloc_tag_set_inaccurate(task->alloc_tag);
> >>>        }
> >>>    }
> >>>
> >>>    static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> >>> -                                unsigned int nr)
> >>> +                                unsigned int nr, gfp_t gfp_flags)
> >>>    {
> >>>        if (mem_alloc_profiling_enabled())
> >>> -             __pgalloc_tag_add(page, task, nr);
> >>> +             __pgalloc_tag_add(page, task, nr, gfp_flags);
> >>>    }
> >>>
> >>>    /* Should be called only if mem_alloc_profiling_enabled() */
> >>> @@ -1312,7 +1312,7 @@ static inline void pgalloc_tag_sub_pages
> >>>    #else /* CONFIG_MEM_ALLOC_PROFILING */
> >>>
> >>>    static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> >>> -                                unsigned int nr) {}
> >>> +                                unsigned int nr, gfp_t gfp_flags) {}
> >>>    static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
> >>>    static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
> >>>
> >>> @@ -1867,7 +1867,7 @@ inline void post_alloc_hook(struct page
> >>>
> >>>        set_page_owner(page, order, gfp_flags);
> >>>        page_table_check_alloc(page, order);
> >>> -     pgalloc_tag_add(page, current, 1 << order);
> >>> +     pgalloc_tag_add(page, current, 1 << order, gfp_flags);
> >>>    }
> >>>
> >>>    static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
> >>> _
> >>>


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2026-06-04 23:52 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-05-06  2:22 [PATCH v5] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list Hao Ge
2026-05-09  0:12 ` Andrew Morton
2026-05-27  2:00   ` Andrew Morton
2026-05-27  5:22     ` Hao Ge
2026-06-02 23:40       ` Suren Baghdasaryan
2026-06-03 16:54       ` Suren Baghdasaryan
2026-06-04  2:46         ` Hao Ge
2026-06-04  5:29           ` Hao Ge
2026-06-04 23:52           ` Suren Baghdasaryan

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.