* [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
@ 2026-04-23 8:37 Hao Ge
2026-04-27 16:02 ` Suren Baghdasaryan
0 siblings, 1 reply; 7+ messages in thread
From: Hao Ge @ 2026-04-23 8:37 UTC (permalink / raw)
To: Suren Baghdasaryan, Kent Overstreet, Andrew Morton
Cc: linux-mm, linux-kernel, Hao Ge
Pages allocated before page_ext is available have their codetag left
uninitialized. Track these early PFNs and clear their codetag in
clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
warnings when they are freed later.
Currently a fixed-size array of 8192 entries is used, with a warning if
the limit is exceeded. However, the number of early allocations depends
on the number of CPUs and can be larger than 8192.
Replace the fixed-size array with a dynamically allocated linked list
of pages. Each page stores PFNs in its page body and uses page->lru
to link to the next page, with page->private tracking the number of
used slots.
The list pages themselves are allocated via alloc_page(), which would
trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
indefinitely. Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
can skip recording allocations that carry this flag.
Signed-off-by: Hao Ge <hao.ge@linux.dev>
---
v3:
- Simplify linked list: use page->lru for chaining and page->private as
slot counter, removing the early_pfn_node struct and freelist (suggested
by Suren Baghdasaryan)
- Pass gfp_flags through alloc_tag_add_early_pfn() but strip
__GFP_DIRECT_RECLAIM instead of selecting GFP_KERNEL/GFP_ATOMIC,
because __alloc_tag_add_early_pfn() is invoked under rcu_read_lock().
v2:
- Use cmpxchg to atomically update early_pfn_pages, preventing page leak under concurrent allocation
- Pass gfp_flags through the full call chain and use gfpflags_allow_blocking()
to select GFP_KERNEL vs GFP_ATOMIC, avoiding unnecessary GFP_ATOMIC in process context
---
include/linux/alloc_tag.h | 22 ++++++-
lib/alloc_tag.c | 118 +++++++++++++++++++++-----------------
mm/page_alloc.c | 23 ++++----
3 files changed, 97 insertions(+), 66 deletions(-)
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 02de2ede560f..ce9b478033bf 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -150,6 +150,23 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
}
#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+/*
+ * Skip early PFN recording for a page allocation. Reuses the
+ * %__GFP_NO_OBJ_EXT bit. Used by __alloc_tag_add_early_pfn() to avoid
+ * recursion when allocating pages for the early PFN tracking list
+ * itself.
+ *
+ * Callers must set the codetag to CODETAG_EMPTY (via
+ * clear_page_tag_ref()) before freeing pages allocated with this
+ * flag once page_ext becomes available, otherwise
+ * alloc_tag_sub_check() will trigger a warning.
+ */
+#define __GFP_NO_CODETAG __GFP_NO_OBJ_EXT
+
+static inline bool should_record_early_pfn(gfp_t gfp_flags)
+{
+ return !(gfp_flags & __GFP_NO_CODETAG);
+}
static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
{
WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref),
@@ -163,11 +180,12 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
{
WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
}
-void alloc_tag_add_early_pfn(unsigned long pfn);
+void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
#else
static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
-static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
+static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
+static inline bool should_record_early_pfn(gfp_t gfp_flags) { return false; }
#endif
/* Caller should verify both ref and tag to be valid */
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index ed1bdcf1f8ab..a00dc5a867e4 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -766,45 +766,49 @@ static __init bool need_page_alloc_tagging(void)
* Some pages are allocated before page_ext becomes available, leaving
* their codetag uninitialized. Track these early PFNs so we can clear
* their codetag refs later to avoid warnings when they are freed.
- *
- * Early allocations include:
- * - Base allocations independent of CPU count
- * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
- * such as trace ring buffers, scheduler per-cpu data)
- *
- * For simplicity, we fix the size to 8192.
- * If insufficient, a warning will be triggered to alert the user.
- *
- * TODO: Replace fixed-size array with dynamic allocation using
- * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
*/
-#define EARLY_ALLOC_PFN_MAX 8192
+#define EARLY_PFNS_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
-static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
+static struct page *early_pfn_current __initdata;
-static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
+static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
{
- int old_idx, new_idx;
+ struct page *page;
+ unsigned long idx;
do {
- old_idx = atomic_read(&early_pfn_count);
- if (old_idx >= EARLY_ALLOC_PFN_MAX) {
- pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
- EARLY_ALLOC_PFN_MAX);
- return;
+ page = READ_ONCE(early_pfn_current);
+ if (!page || READ_ONCE(page->private) >= EARLY_PFNS_PER_PAGE) {
+ gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
+ struct page *new = alloc_page(gfp | __GFP_NO_CODETAG);
+
+ if (!new) {
+ pr_warn_once("early PFN tracking page allocation failed\n");
+ return;
+ }
+ new->lru.next = (struct list_head *)page;
+ if (cmpxchg(&early_pfn_current, page, new) != page) {
+ __free_page(new);
+ continue;
+ }
+ page = new;
}
- new_idx = old_idx + 1;
- } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
+ idx = READ_ONCE(page->private);
+ if (idx >= EARLY_PFNS_PER_PAGE)
+ continue;
+ if (cmpxchg(&page->private, idx, idx + 1) != idx)
+ continue;
+ break;
+ } while (1);
- early_pfns[old_idx] = pfn;
+ ((unsigned long *)page_address(page))[idx] = pfn;
}
-typedef void alloc_tag_add_func(unsigned long pfn);
+typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
RCU_INITIALIZER(__alloc_tag_add_early_pfn);
-void alloc_tag_add_early_pfn(unsigned long pfn)
+void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
{
alloc_tag_add_func *alloc_tag_add;
@@ -814,13 +818,14 @@ void alloc_tag_add_early_pfn(unsigned long pfn)
rcu_read_lock();
alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
if (alloc_tag_add)
- alloc_tag_add(pfn);
+ alloc_tag_add(pfn, gfp_flags);
rcu_read_unlock();
}
static void __init clear_early_alloc_pfn_tag_refs(void)
{
- unsigned int i;
+ struct page *page, *next;
+ unsigned long i;
if (static_key_enabled(&mem_profiling_compressed))
return;
@@ -829,37 +834,42 @@ static void __init clear_early_alloc_pfn_tag_refs(void)
/* Make sure we are not racing with __alloc_tag_add_early_pfn() */
synchronize_rcu();
- for (i = 0; i < atomic_read(&early_pfn_count); i++) {
- unsigned long pfn = early_pfns[i];
-
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- /*
- * An early-allocated page could be freed and reallocated
- * after its page_ext is initialized but before we clear it.
- * In that case, it already has a valid tag set.
- * We should not overwrite that valid tag with CODETAG_EMPTY.
- *
- * Note: there is still a small race window between checking
- * ref.ct and calling set_codetag_empty(). We accept this
- * race as it's unlikely and the extra complexity of atomic
- * cmpxchg is not worth it for this debug-only code path.
- */
- if (ref.ct) {
+ for (page = early_pfn_current; page; page = next) {
+ for (i = 0; i < page->private; i++) {
+ unsigned long pfn = ((unsigned long *)page_address(page))[i];
+
+ if (pfn_valid(pfn)) {
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
+ /*
+ * An early-allocated page could be freed and reallocated
+ * after its page_ext is initialized but before we clear it.
+ * In that case, it already has a valid tag set.
+ * We should not overwrite that valid tag
+ * with CODETAG_EMPTY.
+ *
+ * Note: there is still a small race window between checking
+ * ref.ct and calling set_codetag_empty(). We accept this
+ * race as it's unlikely and the extra complexity of atomic
+ * cmpxchg is not worth it for this debug-only code path.
+ */
+ if (ref.ct) {
+ put_page_tag_ref(handle);
+ continue;
+ }
+
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
put_page_tag_ref(handle);
- continue;
}
-
- set_codetag_empty(&ref);
- update_page_tag_ref(handle, &ref);
- put_page_tag_ref(handle);
}
}
+ next = (struct page *)page->lru.next;
+ clear_page_tag_ref(page);
+ __free_page(page);
}
}
#else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 04494bc2e46f..5b7b234967a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1284,7 +1284,7 @@ void __clear_page_tag_ref(struct page *page)
/* Should be called only if mem_alloc_profiling_enabled() */
static noinline
void __pgalloc_tag_add(struct page *page, struct task_struct *task,
- unsigned int nr)
+ unsigned int nr, gfp_t gfp_flags)
{
union pgtag_ref_handle handle;
union codetag_ref ref;
@@ -1294,21 +1294,24 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
update_page_tag_ref(handle, &ref);
put_page_tag_ref(handle);
} else {
- /*
- * page_ext is not available yet, record the pfn so we can
- * clear the tag ref later when page_ext is initialized.
- */
- alloc_tag_add_early_pfn(page_to_pfn(page));
+
if (task->alloc_tag)
alloc_tag_set_inaccurate(task->alloc_tag);
+
+ /*
+ * page_ext is not available yet, record the pfn so the
+ * tag ref can be cleared later when page_ext is initialized.
+ */
+ if (should_record_early_pfn(gfp_flags))
+ alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
}
}
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
- unsigned int nr)
+ unsigned int nr, gfp_t gfp_flags)
{
if (mem_alloc_profiling_enabled())
- __pgalloc_tag_add(page, task, nr);
+ __pgalloc_tag_add(page, task, nr, gfp_flags);
}
/* Should be called only if mem_alloc_profiling_enabled() */
@@ -1341,7 +1344,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
#else /* CONFIG_MEM_ALLOC_PROFILING */
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
- unsigned int nr) {}
+ unsigned int nr, gfp_t gfp_flags) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
@@ -1896,7 +1899,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
- pgalloc_tag_add(page, current, 1 << order);
+ pgalloc_tag_add(page, current, 1 << order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
--
2.25.1
* Re: [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
2026-04-23 8:37 [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list Hao Ge
@ 2026-04-27 16:02 ` Suren Baghdasaryan
2026-04-27 18:23 ` Suren Baghdasaryan
2026-04-28 3:00 ` Hao Ge
0 siblings, 2 replies; 7+ messages in thread
From: Suren Baghdasaryan @ 2026-04-27 16:02 UTC (permalink / raw)
To: Hao Ge; +Cc: Kent Overstreet, Andrew Morton, linux-mm, linux-kernel
On Thu, Apr 23, 2026 at 1:39 AM Hao Ge <hao.ge@linux.dev> wrote:
>
> Pages allocated before page_ext is available have their codetag left
> uninitialized. Track these early PFNs and clear their codetag in
> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> warnings when they are freed later.
>
> Currently a fixed-size array of 8192 entries is used, with a warning if
> the limit is exceeded. However, the number of early allocations depends
> on the number of CPUs and can be larger than 8192.
>
> Replace the fixed-size array with a dynamically allocated linked list
> of pages. Each page stores PFNs in its page body and uses page->lru
> to link to the next page, with page->private tracking the number of
> used slots.
>
> The list pages themselves are allocated via alloc_page(), which would
> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
> indefinitely. Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
> bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
> can skip recording allocations that carry this flag.
>
> Signed-off-by: Hao Ge <hao.ge@linux.dev>
> ---
> v3:
> - Simplify linked list: use page->lru for chaining and page->private as
> slot counter, removing the early_pfn_node struct and freelist (suggested
> by Suren Baghdasaryan)
> - Pass gfp_flags through alloc_tag_add_early_pfn() but strip
> __GFP_DIRECT_RECLAIM instead of selecting GFP_KERNEL/GFP_ATOMIC,
> because __alloc_tag_add_early_pfn() is invoked under rcu_read_lock().
>
> v2:
> - Use cmpxchg to atomically update early_pfn_pages, preventing page leak under concurrent allocation
> - Pass gfp_flags through the full call chain and use gfpflags_allow_blocking()
> to select GFP_KERNEL vs GFP_ATOMIC, avoiding unnecessary GFP_ATOMIC in process context
> ---
> include/linux/alloc_tag.h | 22 ++++++-
> lib/alloc_tag.c | 118 +++++++++++++++++++++-----------------
> mm/page_alloc.c | 23 ++++----
> 3 files changed, 97 insertions(+), 66 deletions(-)
>
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> index 02de2ede560f..ce9b478033bf 100644
> --- a/include/linux/alloc_tag.h
> +++ b/include/linux/alloc_tag.h
> @@ -150,6 +150,23 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> }
>
> #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +/*
> + * Skip early PFN recording for a page allocation. Reuses the
> + * %__GFP_NO_OBJ_EXT bit. Used by __alloc_tag_add_early_pfn() to avoid
> + * recursion when allocating pages for the early PFN tracking list
> + * itself.
> + *
> + * Callers must set the codetag to CODETAG_EMPTY (via
> + * clear_page_tag_ref()) before freeing pages allocated with this
> + * flag once page_ext becomes available, otherwise
> + * alloc_tag_sub_check() will trigger a warning.
nit: Callers must... sounds like someone who uses
__alloc_tag_add_early_pfn() has to do something extra, however
clear_page_tag_ref() is done by clear_early_alloc_pfn_tag_refs(), so
callers don't really need to do this. I suggest rephrasing as:
Codetags of the pages allocated with __GFP_NO_CODETAG should be
cleared (via clear_page_tag_ref()) before freeing the pages to prevent
alloc_tag_sub_check() from triggering a warning.
> + */
> +#define __GFP_NO_CODETAG __GFP_NO_OBJ_EXT
> +
> +static inline bool should_record_early_pfn(gfp_t gfp_flags)
> +{
> + return !(gfp_flags & __GFP_NO_CODETAG);
> +}
> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
> {
> WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref),
> @@ -163,11 +180,12 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
> {
> WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> }
> -void alloc_tag_add_early_pfn(unsigned long pfn);
> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
> #else
> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
> static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
> +static inline bool should_record_early_pfn(gfp_t gfp_flags) { return false; }
> #endif
>
> /* Caller should verify both ref and tag to be valid */
> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> index ed1bdcf1f8ab..a00dc5a867e4 100644
> --- a/lib/alloc_tag.c
> +++ b/lib/alloc_tag.c
> @@ -766,45 +766,49 @@ static __init bool need_page_alloc_tagging(void)
> * Some pages are allocated before page_ext becomes available, leaving
> * their codetag uninitialized. Track these early PFNs so we can clear
> * their codetag refs later to avoid warnings when they are freed.
> - *
> - * Early allocations include:
> - * - Base allocations independent of CPU count
> - * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
> - * such as trace ring buffers, scheduler per-cpu data)
> - *
> - * For simplicity, we fix the size to 8192.
> - * If insufficient, a warning will be triggered to alert the user.
> - *
> - * TODO: Replace fixed-size array with dynamic allocation using
> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
> */
> -#define EARLY_ALLOC_PFN_MAX 8192
> +#define EARLY_PFNS_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
>
> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
> +static struct page *early_pfn_current __initdata;
So, instead of using early_pfn_current as the head page and then
page->lru.next directly, I would suggest having "struct llist_head
early_pfn_head;" as the list head and either using page->pcp_llist to
link the pages or even better, adding a "struct llist_node llist;"
into the union containing "struct list_head lru;". Note that
llist_add() uses try_cmpxchg(), so the link addition is atomic.
And you should probably use folios rather than pages to follow the new trends.
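For illustration, a rough and untested sketch of that shape (the "llist" field
name, its placement in the union and the surrounding fragments are only meant
as an example, not the exact code):

        /* in the struct page union that currently holds "struct list_head lru;" */
        struct llist_node llist;

        /* lib/alloc_tag.c */
        static LLIST_HEAD(early_pfn_head);

        /* publishing a freshly allocated tracking page "new"; llist_add() loops on try_cmpxchg() */
        llist_add(&new->llist, &early_pfn_head);

        /* draining in clear_early_alloc_pfn_tag_refs() */
        struct page *page, *next;

        llist_for_each_entry_safe(page, next, llist_del_all(&early_pfn_head), llist) {
                /* ... walk the PFNs recorded in this page ... */
                clear_page_tag_ref(page);
                __free_page(page);
        }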
>
> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> {
> - int old_idx, new_idx;
> + struct page *page;
> + unsigned long idx;
>
> do {
> - old_idx = atomic_read(&early_pfn_count);
> - if (old_idx >= EARLY_ALLOC_PFN_MAX) {
> - pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
> - EARLY_ALLOC_PFN_MAX);
> - return;
> + page = READ_ONCE(early_pfn_current);
> + if (!page || READ_ONCE(page->private) >= EARLY_PFNS_PER_PAGE) {
> + gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
> + struct page *new = alloc_page(gfp | __GFP_NO_CODETAG);
> +
> + if (!new) {
> + pr_warn_once("early PFN tracking page allocation failed\n");
> + return;
> + }
> + new->lru.next = (struct list_head *)page;
> + if (cmpxchg(&early_pfn_current, page, new) != page) {
> + __free_page(new);
> + continue;
> + }
> + page = new;
> }
> - new_idx = old_idx + 1;
> - } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
> + idx = READ_ONCE(page->private);
> + if (idx >= EARLY_PFNS_PER_PAGE)
> + continue;
> + if (cmpxchg(&page->private, idx, idx + 1) != idx)
> + continue;
> + break;
nit: This would read more easily:
if (cmpxchg(&page->private, idx, idx + 1) == idx)
break;
> + } while (1);
>
> - early_pfns[old_idx] = pfn;
> + ((unsigned long *)page_address(page))[idx] = pfn;
> }
>
> -typedef void alloc_tag_add_func(unsigned long pfn);
> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
> static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
> RCU_INITIALIZER(__alloc_tag_add_early_pfn);
>
> -void alloc_tag_add_early_pfn(unsigned long pfn)
> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> {
> alloc_tag_add_func *alloc_tag_add;
>
> @@ -814,13 +818,14 @@ void alloc_tag_add_early_pfn(unsigned long pfn)
> rcu_read_lock();
> alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
> if (alloc_tag_add)
> - alloc_tag_add(pfn);
> + alloc_tag_add(pfn, gfp_flags);
> rcu_read_unlock();
> }
>
> static void __init clear_early_alloc_pfn_tag_refs(void)
> {
> - unsigned int i;
> + struct page *page, *next;
> + unsigned long i;
>
> if (static_key_enabled(&mem_profiling_compressed))
> return;
> @@ -829,37 +834,42 @@ static void __init clear_early_alloc_pfn_tag_refs(void)
> /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
> synchronize_rcu();
>
> - for (i = 0; i < atomic_read(&early_pfn_count); i++) {
> - unsigned long pfn = early_pfns[i];
> -
> - if (pfn_valid(pfn)) {
> - struct page *page = pfn_to_page(pfn);
> - union pgtag_ref_handle handle;
> - union codetag_ref ref;
> -
> - if (get_page_tag_ref(page, &ref, &handle)) {
> - /*
> - * An early-allocated page could be freed and reallocated
> - * after its page_ext is initialized but before we clear it.
> - * In that case, it already has a valid tag set.
> - * We should not overwrite that valid tag with CODETAG_EMPTY.
> - *
> - * Note: there is still a small race window between checking
> - * ref.ct and calling set_codetag_empty(). We accept this
> - * race as it's unlikely and the extra complexity of atomic
> - * cmpxchg is not worth it for this debug-only code path.
> - */
> - if (ref.ct) {
> + for (page = early_pfn_current; page; page = next) {
> + for (i = 0; i < page->private; i++) {
> + unsigned long pfn = ((unsigned long *)page_address(page))[i];
This page_address() can be done in the outer loop since the page does
not change inside the inner loop.
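i.e. something along these lines (sketch only, "pfns" is just an illustrative
local variable):

        for (page = early_pfn_current; page; page = next) {
                unsigned long *pfns = page_address(page);

                for (i = 0; i < page->private; i++) {
                        unsigned long pfn = pfns[i];
                        ...
                }
                ...
        }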
> +
> + if (pfn_valid(pfn)) {
> + union pgtag_ref_handle handle;
> + union codetag_ref ref;
> +
> + if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
> + /*
> + * An early-allocated page could be freed and reallocated
> + * after its page_ext is initialized but before we clear it.
> + * In that case, it already has a valid tag set.
> + * We should not overwrite that valid tag
> + * with CODETAG_EMPTY.
> + *
> + * Note: there is still a small race window between checking
> + * ref.ct and calling set_codetag_empty(). We accept this
> + * race as it's unlikely and the extra complexity of atomic
> + * cmpxchg is not worth it for this debug-only code path.
> + */
> + if (ref.ct) {
> + put_page_tag_ref(handle);
> + continue;
> + }
> +
> + set_codetag_empty(&ref);
> + update_page_tag_ref(handle, &ref);
> put_page_tag_ref(handle);
> - continue;
> }
> -
> - set_codetag_empty(&ref);
> - update_page_tag_ref(handle, &ref);
> - put_page_tag_ref(handle);
> }
> }
>
> + next = (struct page *)page->lru.next;
> + clear_page_tag_ref(page);
> + __free_page(page);
> }
> }
> #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 04494bc2e46f..5b7b234967a5 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1284,7 +1284,7 @@ void __clear_page_tag_ref(struct page *page)
> /* Should be called only if mem_alloc_profiling_enabled() */
> static noinline
> void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> - unsigned int nr)
> + unsigned int nr, gfp_t gfp_flags)
> {
> union pgtag_ref_handle handle;
> union codetag_ref ref;
> @@ -1294,21 +1294,24 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> update_page_tag_ref(handle, &ref);
> put_page_tag_ref(handle);
> } else {
> - /*
> - * page_ext is not available yet, record the pfn so we can
> - * clear the tag ref later when page_ext is initialized.
> - */
> - alloc_tag_add_early_pfn(page_to_pfn(page));
> +
> if (task->alloc_tag)
> alloc_tag_set_inaccurate(task->alloc_tag);
> +
> + /*
> + * page_ext is not available yet, record the pfn so the
> + * tag ref can be cleared later when page_ext is initialized.
> + */
> + if (should_record_early_pfn(gfp_flags))
> + alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
Any reason for changing the order of alloc_tag_add_early_pfn() and
alloc_tag_set_inaccurate()? I liked the previous version where we
explain why this is done at the very beginning of the block. You could
also call should_record_early_pfn() from inside
alloc_tag_add_early_pfn(), which would keep both
should_record_early_pfn() and __GFP_NO_CODETAG definitions local to
alloc_tag.c.
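For example, a rough sketch of that shape (untested, and the exact placement of
the early return is only an illustration):

        /* lib/alloc_tag.c */
        #define __GFP_NO_CODETAG __GFP_NO_OBJ_EXT

        static inline bool should_record_early_pfn(gfp_t gfp_flags)
        {
                return !(gfp_flags & __GFP_NO_CODETAG);
        }

        void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
        {
                alloc_tag_add_func *alloc_tag_add;

                /* pages backing the tracking list itself are not recorded */
                if (!should_record_early_pfn(gfp_flags))
                        return;

                rcu_read_lock();
                alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
                if (alloc_tag_add)
                        alloc_tag_add(pfn, gfp_flags);
                rcu_read_unlock();
        }

__pgalloc_tag_add() could then keep calling alloc_tag_add_early_pfn()
unconditionally, as it did before this patch.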
> }
> }
>
> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> - unsigned int nr)
> + unsigned int nr, gfp_t gfp_flags)
> {
> if (mem_alloc_profiling_enabled())
> - __pgalloc_tag_add(page, task, nr);
> + __pgalloc_tag_add(page, task, nr, gfp_flags);
> }
>
> /* Should be called only if mem_alloc_profiling_enabled() */
> @@ -1341,7 +1344,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
> #else /* CONFIG_MEM_ALLOC_PROFILING */
>
> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> - unsigned int nr) {}
> + unsigned int nr, gfp_t gfp_flags) {}
> static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
> static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
>
> @@ -1896,7 +1899,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>
> set_page_owner(page, order, gfp_flags);
> page_table_check_alloc(page, order);
> - pgalloc_tag_add(page, current, 1 << order);
> + pgalloc_tag_add(page, current, 1 << order, gfp_flags);
> }
>
> static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
> --
> 2.25.1
>
* Re: [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
2026-04-27 16:02 ` Suren Baghdasaryan
@ 2026-04-27 18:23 ` Suren Baghdasaryan
2026-04-28 3:00 ` Hao Ge
1 sibling, 0 replies; 7+ messages in thread
From: Suren Baghdasaryan @ 2026-04-27 18:23 UTC (permalink / raw)
To: Hao Ge; +Cc: Kent Overstreet, Andrew Morton, linux-mm, linux-kernel
On Mon, Apr 27, 2026 at 9:02 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Apr 23, 2026 at 1:39 AM Hao Ge <hao.ge@linux.dev> wrote:
> >
> > Pages allocated before page_ext is available have their codetag left
> > uninitialized. Track these early PFNs and clear their codetag in
> > clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> > warnings when they are freed later.
> >
> > Currently a fixed-size array of 8192 entries is used, with a warning if
> > the limit is exceeded. However, the number of early allocations depends
> > on the number of CPUs and can be larger than 8192.
> >
> > Replace the fixed-size array with a dynamically allocated linked list
> > of pages. Each page stores PFNs in its page body and uses page->lru
> > to link to the next page, with page->private tracking the number of
> > used slots.
> >
> > The list pages themselves are allocated via alloc_page(), which would
> > trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
> > indefinitely. Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
> > bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
> > can skip recording allocations that carry this flag.
> >
> > Signed-off-by: Hao Ge <hao.ge@linux.dev>
> > ---
> > v3:
> > - Simplify linked list: use page->lru for chaining and page->private as
> > slot counter, removing the early_pfn_node struct and freelist (suggested
> > by Suren Baghdasaryan)
> > - Pass gfp_flags through alloc_tag_add_early_pfn() but strip
> > __GFP_DIRECT_RECLAIM instead of selecting GFP_KERNEL/GFP_ATOMIC,
> > because __alloc_tag_add_early_pfn() is invoked under rcu_read_lock().
> >
> > v2:
> > - Use cmpxchg to atomically update early_pfn_pages, preventing page leak under concurrent allocation
> > - Pass gfp_flags through the full call chain and use gfpflags_allow_blocking()
> > to select GFP_KERNEL vs GFP_ATOMIC, avoiding unnecessary GFP_ATOMIC in process context
> > ---
> > include/linux/alloc_tag.h | 22 ++++++-
> > lib/alloc_tag.c | 118 +++++++++++++++++++++-----------------
> > mm/page_alloc.c | 23 ++++----
> > 3 files changed, 97 insertions(+), 66 deletions(-)
> >
> > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> > index 02de2ede560f..ce9b478033bf 100644
> > --- a/include/linux/alloc_tag.h
> > +++ b/include/linux/alloc_tag.h
> > @@ -150,6 +150,23 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> > }
> >
> > #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > +/*
> > + * Skip early PFN recording for a page allocation. Reuses the
> > + * %__GFP_NO_OBJ_EXT bit. Used by __alloc_tag_add_early_pfn() to avoid
> > + * recursion when allocating pages for the early PFN tracking list
> > + * itself.
> > + *
> > + * Callers must set the codetag to CODETAG_EMPTY (via
> > + * clear_page_tag_ref()) before freeing pages allocated with this
> > + * flag once page_ext becomes available, otherwise
> > + * alloc_tag_sub_check() will trigger a warning.
>
> nit: Callers must... sounds like someone who uses
> __alloc_tag_add_early_pfn() has to do something extra, however
> clear_page_tag_ref() is done by clear_early_alloc_pfn_tag_refs(), so
> callers don't really need to do this. I suggest rephrasing as:
>
> Codetags of the pages allocated with __GFP_NO_CODETAG should be
> cleared (via clear_page_tag_ref()) before freeing the pages to prevent
> alloc_tag_sub_check() from triggering a warning.
>
> > + */
> > +#define __GFP_NO_CODETAG __GFP_NO_OBJ_EXT
> > +
> > +static inline bool should_record_early_pfn(gfp_t gfp_flags)
> > +{
> > + return !(gfp_flags & __GFP_NO_CODETAG);
> > +}
> > static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
> > {
> > WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref),
> > @@ -163,11 +180,12 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
> > {
> > WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> > }
> > -void alloc_tag_add_early_pfn(unsigned long pfn);
> > +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
> > #else
> > static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
> > static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
> > -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
> > +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
> > +static inline bool should_record_early_pfn(gfp_t gfp_flags) { return false; }
> > #endif
> >
> > /* Caller should verify both ref and tag to be valid */
> > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > index ed1bdcf1f8ab..a00dc5a867e4 100644
> > --- a/lib/alloc_tag.c
> > +++ b/lib/alloc_tag.c
> > @@ -766,45 +766,49 @@ static __init bool need_page_alloc_tagging(void)
> > * Some pages are allocated before page_ext becomes available, leaving
> > * their codetag uninitialized. Track these early PFNs so we can clear
> > * their codetag refs later to avoid warnings when they are freed.
> > - *
> > - * Early allocations include:
> > - * - Base allocations independent of CPU count
> > - * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
> > - * such as trace ring buffers, scheduler per-cpu data)
> > - *
> > - * For simplicity, we fix the size to 8192.
> > - * If insufficient, a warning will be triggered to alert the user.
> > - *
> > - * TODO: Replace fixed-size array with dynamic allocation using
> > - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
> > */
> > -#define EARLY_ALLOC_PFN_MAX 8192
> > +#define EARLY_PFNS_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
> >
> > -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
> > -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
> > +static struct page *early_pfn_current __initdata;
>
> So, instead of using early_pfn_current as the head page and then
> page->lru.next directly, I would suggest having "struct llist_head
> early_pfn_head;" as the list head and either using page->pcp_llist to
> link the pages or even better, adding a "struct llist_node llist;"
> into the union containing "struct list_head lru;". Note that
> llist_add() uses try_cmpxchg(), so the link addition is atomic.
> And you should probably use folios rather than pages to follow the new trends.
Please ignore that last suggestion about using folios. I checked with
Matthew and he says to use folios when needed, not whenever we can.
So, using pages here seems fine.
>
> >
> > -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
> > +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> > {
> > - int old_idx, new_idx;
> > + struct page *page;
> > + unsigned long idx;
> >
> > do {
> > - old_idx = atomic_read(&early_pfn_count);
> > - if (old_idx >= EARLY_ALLOC_PFN_MAX) {
> > - pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
> > - EARLY_ALLOC_PFN_MAX);
> > - return;
> > + page = READ_ONCE(early_pfn_current);
> > + if (!page || READ_ONCE(page->private) >= EARLY_PFNS_PER_PAGE) {
> > + gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
> > + struct page *new = alloc_page(gfp | __GFP_NO_CODETAG);
> > +
> > + if (!new) {
> > + pr_warn_once("early PFN tracking page allocation failed\n");
> > + return;
> > + }
> > + new->lru.next = (struct list_head *)page;
> > + if (cmpxchg(&early_pfn_current, page, new) != page) {
> > + __free_page(new);
> > + continue;
> > + }
> > + page = new;
> > }
> > - new_idx = old_idx + 1;
> > - } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
> > + idx = READ_ONCE(page->private);
> > + if (idx >= EARLY_PFNS_PER_PAGE)
> > + continue;
> > + if (cmpxchg(&page->private, idx, idx + 1) != idx)
> > + continue;
> > + break;
>
> nit: This would read more easily:
>
> if (cmpxchg(&page->private, idx, idx + 1) == idx)
> break;
>
>
> > + } while (1);
> >
> > - early_pfns[old_idx] = pfn;
> > + ((unsigned long *)page_address(page))[idx] = pfn;
> > }
> >
> > -typedef void alloc_tag_add_func(unsigned long pfn);
> > +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
> > static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
> > RCU_INITIALIZER(__alloc_tag_add_early_pfn);
> >
> > -void alloc_tag_add_early_pfn(unsigned long pfn)
> > +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> > {
> > alloc_tag_add_func *alloc_tag_add;
> >
> > @@ -814,13 +818,14 @@ void alloc_tag_add_early_pfn(unsigned long pfn)
> > rcu_read_lock();
> > alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
> > if (alloc_tag_add)
> > - alloc_tag_add(pfn);
> > + alloc_tag_add(pfn, gfp_flags);
> > rcu_read_unlock();
> > }
> >
> > static void __init clear_early_alloc_pfn_tag_refs(void)
> > {
> > - unsigned int i;
> > + struct page *page, *next;
> > + unsigned long i;
> >
> > if (static_key_enabled(&mem_profiling_compressed))
> > return;
> > @@ -829,37 +834,42 @@ static void __init clear_early_alloc_pfn_tag_refs(void)
> > /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
> > synchronize_rcu();
> >
> > - for (i = 0; i < atomic_read(&early_pfn_count); i++) {
> > - unsigned long pfn = early_pfns[i];
> > -
> > - if (pfn_valid(pfn)) {
> > - struct page *page = pfn_to_page(pfn);
> > - union pgtag_ref_handle handle;
> > - union codetag_ref ref;
> > -
> > - if (get_page_tag_ref(page, &ref, &handle)) {
> > - /*
> > - * An early-allocated page could be freed and reallocated
> > - * after its page_ext is initialized but before we clear it.
> > - * In that case, it already has a valid tag set.
> > - * We should not overwrite that valid tag with CODETAG_EMPTY.
> > - *
> > - * Note: there is still a small race window between checking
> > - * ref.ct and calling set_codetag_empty(). We accept this
> > - * race as it's unlikely and the extra complexity of atomic
> > - * cmpxchg is not worth it for this debug-only code path.
> > - */
> > - if (ref.ct) {
> > + for (page = early_pfn_current; page; page = next) {
> > + for (i = 0; i < page->private; i++) {
> > + unsigned long pfn = ((unsigned long *)page_address(page))[i];
>
> This page_address() can be done in the outer loop since the page does
> not change inside the inner loop.
>
> > +
> > + if (pfn_valid(pfn)) {
> > + union pgtag_ref_handle handle;
> > + union codetag_ref ref;
> > +
> > + if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
> > + /*
> > + * An early-allocated page could be freed and reallocated
> > + * after its page_ext is initialized but before we clear it.
> > + * In that case, it already has a valid tag set.
> > + * We should not overwrite that valid tag
> > + * with CODETAG_EMPTY.
> > + *
> > + * Note: there is still a small race window between checking
> > + * ref.ct and calling set_codetag_empty(). We accept this
> > + * race as it's unlikely and the extra complexity of atomic
> > + * cmpxchg is not worth it for this debug-only code path.
> > + */
> > + if (ref.ct) {
> > + put_page_tag_ref(handle);
> > + continue;
> > + }
> > +
> > + set_codetag_empty(&ref);
> > + update_page_tag_ref(handle, &ref);
> > put_page_tag_ref(handle);
> > - continue;
> > }
> > -
> > - set_codetag_empty(&ref);
> > - update_page_tag_ref(handle, &ref);
> > - put_page_tag_ref(handle);
> > }
> > }
> >
> > + next = (struct page *)page->lru.next;
> > + clear_page_tag_ref(page);
> > + __free_page(page);
> > }
> > }
> > #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 04494bc2e46f..5b7b234967a5 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1284,7 +1284,7 @@ void __clear_page_tag_ref(struct page *page)
> > /* Should be called only if mem_alloc_profiling_enabled() */
> > static noinline
> > void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> > - unsigned int nr)
> > + unsigned int nr, gfp_t gfp_flags)
> > {
> > union pgtag_ref_handle handle;
> > union codetag_ref ref;
> > @@ -1294,21 +1294,24 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> > update_page_tag_ref(handle, &ref);
> > put_page_tag_ref(handle);
> > } else {
> > - /*
> > - * page_ext is not available yet, record the pfn so we can
> > - * clear the tag ref later when page_ext is initialized.
> > - */
> > - alloc_tag_add_early_pfn(page_to_pfn(page));
> > +
> > if (task->alloc_tag)
> > alloc_tag_set_inaccurate(task->alloc_tag);
> > +
> > + /*
> > + * page_ext is not available yet, record the pfn so the
> > + * tag ref can be cleared later when page_ext is initialized.
> > + */
> > + if (should_record_early_pfn(gfp_flags))
> > + alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
>
> Any reason for changing the order of alloc_tag_add_early_pfn() and
> alloc_tag_set_inaccurate()? I liked the previous version where we
> explain why this is done at the very beginning of the block. You could
> also call should_record_early_pfn() from inside
> alloc_tag_add_early_pfn(), which would keep both
> should_record_early_pfn() and __GFP_NO_CODETAG definitions local to
> alloc_tag.c.
>
> > }
> > }
> >
> > static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> > - unsigned int nr)
> > + unsigned int nr, gfp_t gfp_flags)
> > {
> > if (mem_alloc_profiling_enabled())
> > - __pgalloc_tag_add(page, task, nr);
> > + __pgalloc_tag_add(page, task, nr, gfp_flags);
> > }
> >
> > /* Should be called only if mem_alloc_profiling_enabled() */
> > @@ -1341,7 +1344,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
> > #else /* CONFIG_MEM_ALLOC_PROFILING */
> >
> > static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> > - unsigned int nr) {}
> > + unsigned int nr, gfp_t gfp_flags) {}
> > static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
> > static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
> >
> > @@ -1896,7 +1899,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
> >
> > set_page_owner(page, order, gfp_flags);
> > page_table_check_alloc(page, order);
> > - pgalloc_tag_add(page, current, 1 << order);
> > + pgalloc_tag_add(page, current, 1 << order, gfp_flags);
> > }
> >
> > static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
> > --
> > 2.25.1
> >
* Re: [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
2026-04-27 16:02 ` Suren Baghdasaryan
2026-04-27 18:23 ` Suren Baghdasaryan
@ 2026-04-28 3:00 ` Hao Ge
2026-04-29 5:47 ` Suren Baghdasaryan
1 sibling, 1 reply; 7+ messages in thread
From: Hao Ge @ 2026-04-28 3:00 UTC (permalink / raw)
To: Suren Baghdasaryan; +Cc: Kent Overstreet, Andrew Morton, linux-mm, linux-kernel
Hi Suren
Thanks a lot for your review and suggestions.
On 2026/4/28 00:02, Suren Baghdasaryan wrote:
> On Thu, Apr 23, 2026 at 1:39 AM Hao Ge <hao.ge@linux.dev> wrote:
>> Pages allocated before page_ext is available have their codetag left
>> uninitialized. Track these early PFNs and clear their codetag in
>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
>> warnings when they are freed later.
>>
>> Currently a fixed-size array of 8192 entries is used, with a warning if
>> the limit is exceeded. However, the number of early allocations depends
>> on the number of CPUs and can be larger than 8192.
>>
>> Replace the fixed-size array with a dynamically allocated linked list
>> of pages. Each page stores PFNs in its page body and uses page->lru
>> to link to the next page, with page->private tracking the number of
>> used slots.
>>
>> The list pages themselves are allocated via alloc_page(), which would
>> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
>> indefinitely. Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
>> bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
>> can skip recording allocations that carry this flag.
>>
>> Signed-off-by: Hao Ge <hao.ge@linux.dev>
>> ---
>> v3:
>> - Simplify linked list: use page->lru for chaining and page->private as
>> slot counter, removing the early_pfn_node struct and freelist (suggested
>> by Suren Baghdasaryan)
>> - Pass gfp_flags through alloc_tag_add_early_pfn() but strip
>> __GFP_DIRECT_RECLAIM instead of selecting GFP_KERNEL/GFP_ATOMIC,
>> because __alloc_tag_add_early_pfn() is invoked under rcu_read_lock().
>>
>> v2:
>> - Use cmpxchg to atomically update early_pfn_pages, preventing page leak under concurrent allocation
>> - Pass gfp_flags through the full call chain and use gfpflags_allow_blocking()
>> to select GFP_KERNEL vs GFP_ATOMIC, avoiding unnecessary GFP_ATOMIC in process context
>> ---
>> include/linux/alloc_tag.h | 22 ++++++-
>> lib/alloc_tag.c | 118 +++++++++++++++++++++-----------------
>> mm/page_alloc.c | 23 ++++----
>> 3 files changed, 97 insertions(+), 66 deletions(-)
>>
>> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
>> index 02de2ede560f..ce9b478033bf 100644
>> --- a/include/linux/alloc_tag.h
>> +++ b/include/linux/alloc_tag.h
>> @@ -150,6 +150,23 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
>> }
>>
>> #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
>> +/*
>> + * Skip early PFN recording for a page allocation. Reuses the
>> + * %__GFP_NO_OBJ_EXT bit. Used by __alloc_tag_add_early_pfn() to avoid
>> + * recursion when allocating pages for the early PFN tracking list
>> + * itself.
>> + *
>> + * Callers must set the codetag to CODETAG_EMPTY (via
>> + * clear_page_tag_ref()) before freeing pages allocated with this
>> + * flag once page_ext becomes available, otherwise
>> + * alloc_tag_sub_check() will trigger a warning.
> nit: Callers must... sounds like someone who uses
> __alloc_tag_add_early_pfn() has to do something extra, however
> clear_page_tag_ref() is done by clear_early_alloc_pfn_tag_refs(), so
> callers don't really need to do this. I suggest rephrasing as:
>
> Codetags of the pages allocated with __GFP_NO_CODETAG should be
> cleared (via clear_page_tag_ref()) before freeing the pages to prevent
> alloc_tag_sub_check() from triggering a warning.
That's reasonable.
I will adjust the wording in the next version.
>> + */
>> +#define __GFP_NO_CODETAG __GFP_NO_OBJ_EXT
>> +
>> +static inline bool should_record_early_pfn(gfp_t gfp_flags)
>> +{
>> + return !(gfp_flags & __GFP_NO_CODETAG);
>> +}
>> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
>> {
>> WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref),
>> @@ -163,11 +180,12 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
>> {
>> WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
>> }
>> -void alloc_tag_add_early_pfn(unsigned long pfn);
>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
>> #else
>> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
>> static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
>> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
>> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
>> +static inline bool should_record_early_pfn(gfp_t gfp_flags) { return false; }
>> #endif
>>
>> /* Caller should verify both ref and tag to be valid */
>> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
>> index ed1bdcf1f8ab..a00dc5a867e4 100644
>> --- a/lib/alloc_tag.c
>> +++ b/lib/alloc_tag.c
>> @@ -766,45 +766,49 @@ static __init bool need_page_alloc_tagging(void)
>> * Some pages are allocated before page_ext becomes available, leaving
>> * their codetag uninitialized. Track these early PFNs so we can clear
>> * their codetag refs later to avoid warnings when they are freed.
>> - *
>> - * Early allocations include:
>> - * - Base allocations independent of CPU count
>> - * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
>> - * such as trace ring buffers, scheduler per-cpu data)
>> - *
>> - * For simplicity, we fix the size to 8192.
>> - * If insufficient, a warning will be triggered to alert the user.
>> - *
>> - * TODO: Replace fixed-size array with dynamic allocation using
>> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
>> */
>> -#define EARLY_ALLOC_PFN_MAX 8192
>> +#define EARLY_PFNS_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
>>
>> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
>> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
>> +static struct page *early_pfn_current __initdata;
> So, instead of using early_pfn_current as the head page and then
> page->lru.next directly, I would suggest having "struct llist_head
> early_pfn_head;" as the list head and either using page->pcp_llist to
> link the pages or even better, adding a "struct llist_node llist;"
> into the union containing "struct list_head lru;". Note that
> llist_add() uses try_cmpxchg(), so the link addition is atomic.
Adding a struct llist_node to the page union only for the
CONFIG_MEM_ALLOC_PROFILING_DEBUG feature may not be quite appropriate
from a design perspective.
> And you should probably use folios rather than pages to follow the new trends.
>
>> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
>> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>> {
>> - int old_idx, new_idx;
>> + struct page *page;
>> + unsigned long idx;
>>
>> do {
>> - old_idx = atomic_read(&early_pfn_count);
>> - if (old_idx >= EARLY_ALLOC_PFN_MAX) {
>> - pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
>> - EARLY_ALLOC_PFN_MAX);
>> - return;
>> + page = READ_ONCE(early_pfn_current);
>> + if (!page || READ_ONCE(page->private) >= EARLY_PFNS_PER_PAGE) {
>> + gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
>> + struct page *new = alloc_page(gfp | __GFP_NO_CODETAG);
>> +
>> + if (!new) {
>> + pr_warn_once("early PFN tracking page allocation failed\n");
>> + return;
>> + }
>> + new->lru.next = (struct list_head *)page;
>> + if (cmpxchg(&early_pfn_current, page, new) != page) {
>> + __free_page(new);
>> + continue;
>> + }
I also considered using llist_head initially, like this:
static LLIST_HEAD(early_pfn_pages);

static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
{
        struct page *page;
        unsigned long idx;

        do {
                page = llist_first(&early_pfn_pages);
                if (!page || READ_ONCE(page->private) >= EARLY_PFNS_PER_PAGE) {
                        gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
                        struct page *new = alloc_page(gfp | __GFP_NO_CODETAG);

                        if (!new) {
                                pr_warn_once("early PFN tracking page allocation failed\n");
                                return;
                        }
                        new->private = 0;
                        llist_add(&new->pcp_llist, &early_pfn_pages);
                        page = new;
                }
                idx = READ_ONCE(page->private);
                if (idx >= EARLY_PFNS_PER_PAGE)
                        continue;
                if (cmpxchg(&page->private, idx, idx + 1) == idx)
                        break;
        } while (1);

        ((unsigned long *)page_address(page))[idx] = pfn;
}
The problem is that llist_add() retries internally until it succeeds.
When multiple CPUs concurrently see that the head page is NULL or full,
they each allocate a new tracking page and all insertions succeed:
CPU0: head page A is full or NULL, allocates B, llist_add(B) → head=B→A
CPU1: head page A is full or NULL, allocates C, llist_add(C) → head=C→B→A
CPU0: claims slot 0 in B
CPU1: claims slot 0 in C
Now the head is C. Once C fills up, a new page D is allocated and
becomes the new head. But B still has ~511 unused slots and will never
be the head again — we only ever look at llist_first(). Those slots are
permanently wasted.
That's why I used struct page *early_pfn_current with direct cmpxchg —
the loser frees its page and retries, avoiding the waste.
That said, I may be missing something. Is there a way to use llist that
avoids this? I'd appreciate your thoughts.
>> + page = new;
>> }
>> - new_idx = old_idx + 1;
>> - } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
>> + idx = READ_ONCE(page->private);
>> + if (idx >= EARLY_PFNS_PER_PAGE)
>> + continue;
>> + if (cmpxchg(&page->private, idx, idx + 1) != idx)
>> + continue;
>> + break;
> nit: This would read more easily:
>
> if (cmpxchg(&page->private, idx, idx + 1) == idx)
> break;
will fix in the next version.
>
>> + } while (1);
>>
>> - early_pfns[old_idx] = pfn;
>> + ((unsigned long *)page_address(page))[idx] = pfn;
>> }
>>
>> -typedef void alloc_tag_add_func(unsigned long pfn);
>> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
>> static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
>> RCU_INITIALIZER(__alloc_tag_add_early_pfn);
>>
>> -void alloc_tag_add_early_pfn(unsigned long pfn)
>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>> {
>> alloc_tag_add_func *alloc_tag_add;
>>
>> @@ -814,13 +818,14 @@ void alloc_tag_add_early_pfn(unsigned long pfn)
>> rcu_read_lock();
>> alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
>> if (alloc_tag_add)
>> - alloc_tag_add(pfn);
>> + alloc_tag_add(pfn, gfp_flags);
>> rcu_read_unlock();
>> }
>>
>> static void __init clear_early_alloc_pfn_tag_refs(void)
>> {
>> - unsigned int i;
>> + struct page *page, *next;
>> + unsigned long i;
>>
>> if (static_key_enabled(&mem_profiling_compressed))
>> return;
>> @@ -829,37 +834,42 @@ static void __init clear_early_alloc_pfn_tag_refs(void)
>> /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
>> synchronize_rcu();
>>
>> - for (i = 0; i < atomic_read(&early_pfn_count); i++) {
>> - unsigned long pfn = early_pfns[i];
>> -
>> - if (pfn_valid(pfn)) {
>> - struct page *page = pfn_to_page(pfn);
>> - union pgtag_ref_handle handle;
>> - union codetag_ref ref;
>> -
>> - if (get_page_tag_ref(page, &ref, &handle)) {
>> - /*
>> - * An early-allocated page could be freed and reallocated
>> - * after its page_ext is initialized but before we clear it.
>> - * In that case, it already has a valid tag set.
>> - * We should not overwrite that valid tag with CODETAG_EMPTY.
>> - *
>> - * Note: there is still a small race window between checking
>> - * ref.ct and calling set_codetag_empty(). We accept this
>> - * race as it's unlikely and the extra complexity of atomic
>> - * cmpxchg is not worth it for this debug-only code path.
>> - */
>> - if (ref.ct) {
>> + for (page = early_pfn_current; page; page = next) {
>> + for (i = 0; i < page->private; i++) {
>> + unsigned long pfn = ((unsigned long *)page_address(page))[i];
> This page_address() can be done in the outer loop since the page does
> not change inside the inner loop.
will fix in the next version.
>> +
>> + if (pfn_valid(pfn)) {
>> + union pgtag_ref_handle handle;
>> + union codetag_ref ref;
>> +
>> + if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
>> + /*
>> + * An early-allocated page could be freed and reallocated
>> + * after its page_ext is initialized but before we clear it.
>> + * In that case, it already has a valid tag set.
>> + * We should not overwrite that valid tag
>> + * with CODETAG_EMPTY.
>> + *
>> + * Note: there is still a small race window between checking
>> + * ref.ct and calling set_codetag_empty(). We accept this
>> + * race as it's unlikely and the extra complexity of atomic
>> + * cmpxchg is not worth it for this debug-only code path.
>> + */
>> + if (ref.ct) {
>> + put_page_tag_ref(handle);
>> + continue;
>> + }
>> +
>> + set_codetag_empty(&ref);
>> + update_page_tag_ref(handle, &ref);
>> put_page_tag_ref(handle);
>> - continue;
>> }
>> -
>> - set_codetag_empty(&ref);
>> - update_page_tag_ref(handle, &ref);
>> - put_page_tag_ref(handle);
>> }
>> }
>>
>> + next = (struct page *)page->lru.next;
>> + clear_page_tag_ref(page);
>> + __free_page(page);
>> }
>> }
>> #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 04494bc2e46f..5b7b234967a5 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1284,7 +1284,7 @@ void __clear_page_tag_ref(struct page *page)
>> /* Should be called only if mem_alloc_profiling_enabled() */
>> static noinline
>> void __pgalloc_tag_add(struct page *page, struct task_struct *task,
>> - unsigned int nr)
>> + unsigned int nr, gfp_t gfp_flags)
>> {
>> union pgtag_ref_handle handle;
>> union codetag_ref ref;
>> @@ -1294,21 +1294,24 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
>> update_page_tag_ref(handle, &ref);
>> put_page_tag_ref(handle);
>> } else {
>> - /*
>> - * page_ext is not available yet, record the pfn so we can
>> - * clear the tag ref later when page_ext is initialized.
>> - */
>> - alloc_tag_add_early_pfn(page_to_pfn(page));
>> +
>> if (task->alloc_tag)
>> alloc_tag_set_inaccurate(task->alloc_tag);
>> +
>> + /*
>> + * page_ext is not available yet, record the pfn so the
>> + * tag ref can be cleared later when page_ext is initialized.
>> + */
>> + if (should_record_early_pfn(gfp_flags))
>> + alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
> Any reason for changing the order of alloc_tag_add_early_pfn() and
> alloc_tag_set_inaccurate()? I liked the previous version where we
> explain why this is done at the very beginning of the block. You could
> also call should_record_early_pfn() from inside
> alloc_tag_add_early_pfn(), which would keep both
> should_record_early_pfn() and __GFP_NO_CODETAG definitions local to
> alloc_tag.c.
>
will fix in the next version.
Thanks
Best regards.
Hao
>> }
>> }
>>
>> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
>> - unsigned int nr)
>> + unsigned int nr, gfp_t gfp_flags)
>> {
>> if (mem_alloc_profiling_enabled())
>> - __pgalloc_tag_add(page, task, nr);
>> + __pgalloc_tag_add(page, task, nr, gfp_flags);
>> }
>>
>> /* Should be called only if mem_alloc_profiling_enabled() */
>> @@ -1341,7 +1344,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
>> #else /* CONFIG_MEM_ALLOC_PROFILING */
>>
>> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
>> - unsigned int nr) {}
>> + unsigned int nr, gfp_t gfp_flags) {}
>> static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
>> static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
>>
>> @@ -1896,7 +1899,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>>
>> set_page_owner(page, order, gfp_flags);
>> page_table_check_alloc(page, order);
>> - pgalloc_tag_add(page, current, 1 << order);
>> + pgalloc_tag_add(page, current, 1 << order, gfp_flags);
>> }
>>
>> static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
>> --
>> 2.25.1
>>
* Re: [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
2026-04-28 3:00 ` Hao Ge
@ 2026-04-29 5:47 ` Suren Baghdasaryan
2026-04-29 7:03 ` Hao Ge
0 siblings, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2026-04-29 5:47 UTC (permalink / raw)
To: Hao Ge; +Cc: Kent Overstreet, Andrew Morton, linux-mm, linux-kernel
On Mon, Apr 27, 2026 at 8:01 PM Hao Ge <hao.ge@linux.dev> wrote:
>
> Hi Suren
>
>
> Thanks a lot for your review and suggestions.
>
>
> On 2026/4/28 00:02, Suren Baghdasaryan wrote:
> > On Thu, Apr 23, 2026 at 1:39 AM Hao Ge <hao.ge@linux.dev> wrote:
> >> Pages allocated before page_ext is available have their codetag left
> >> uninitialized. Track these early PFNs and clear their codetag in
> >> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> >> warnings when they are freed later.
> >>
> >> Currently a fixed-size array of 8192 entries is used, with a warning if
> >> the limit is exceeded. However, the number of early allocations depends
> >> on the number of CPUs and can be larger than 8192.
> >>
> >> Replace the fixed-size array with a dynamically allocated linked list
> >> of pages. Each page stores PFNs in its page body and uses page->lru
> >> to link to the next page, with page->private tracking the number of
> >> used slots.
> >>
> >> The list pages themselves are allocated via alloc_page(), which would
> >> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
> >> indefinitely. Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
> >> bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
> >> can skip recording allocations that carry this flag.
> >>
> >> Signed-off-by: Hao Ge <hao.ge@linux.dev>
> >> ---
> >> v3:
> >> - Simplify linked list: use page->lru for chaining and page->private as
> >> slot counter, removing the early_pfn_node struct and freelist (suggested
> >> by Suren Baghdasaryan)
> >> - Pass gfp_flags through alloc_tag_add_early_pfn() but strip
> >> __GFP_DIRECT_RECLAIM instead of selecting GFP_KERNEL/GFP_ATOMIC,
> >> because __alloc_tag_add_early_pfn() is invoked under rcu_read_lock().
> >>
> >> v2:
> >> - Use cmpxchg to atomically update early_pfn_pages, preventing page leak under concurrent allocation
> >> - Pass gfp_flags through the full call chain and use gfpflags_allow_blocking()
> >> to select GFP_KERNEL vs GFP_ATOMIC, avoiding unnecessary GFP_ATOMIC in process context
> >> ---
> >> include/linux/alloc_tag.h | 22 ++++++-
> >> lib/alloc_tag.c | 118 +++++++++++++++++++++-----------------
> >> mm/page_alloc.c | 23 ++++----
> >> 3 files changed, 97 insertions(+), 66 deletions(-)
> >>
> >> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> >> index 02de2ede560f..ce9b478033bf 100644
> >> --- a/include/linux/alloc_tag.h
> >> +++ b/include/linux/alloc_tag.h
> >> @@ -150,6 +150,23 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> >> }
> >>
> >> #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> >> +/*
> >> + * Skip early PFN recording for a page allocation. Reuses the
> >> + * %__GFP_NO_OBJ_EXT bit. Used by __alloc_tag_add_early_pfn() to avoid
> >> + * recursion when allocating pages for the early PFN tracking list
> >> + * itself.
> >> + *
> >> + * Callers must set the codetag to CODETAG_EMPTY (via
> >> + * clear_page_tag_ref()) before freeing pages allocated with this
> >> + * flag once page_ext becomes available, otherwise
> >> + * alloc_tag_sub_check() will trigger a warning.
> > nit: Callers must... sounds like someone who uses
> > __alloc_tag_add_early_pfn() has to do something extra, however
> > clear_page_tag_ref() is done by clear_early_alloc_pfn_tag_refs(), so
> > callers don't really need to do this. I suggest rephrasing as:
> >
> > Codetags of the pages allocated with __GFP_NO_CODETAG should be
> > cleared (via clear_page_tag_ref()) before freeing the pages to prevent
> > alloc_tag_sub_check() from triggering a warning.
>
> That's reasonable.
>
> I will adjust the wording in v2.
> >> + */
> >> +#define __GFP_NO_CODETAG __GFP_NO_OBJ_EXT
> >> +
> >> +static inline bool should_record_early_pfn(gfp_t gfp_flags)
> >> +{
> >> + return !(gfp_flags & __GFP_NO_CODETAG);
> >> +}
> >> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
> >> {
> >> WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref),
> >> @@ -163,11 +180,12 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
> >> {
> >> WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> >> }
> >> -void alloc_tag_add_early_pfn(unsigned long pfn);
> >> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
> >> #else
> >> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
> >> static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
> >> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
> >> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
> >> +static inline bool should_record_early_pfn(gfp_t gfp_flags) { return false; }
> >> #endif
> >>
> >> /* Caller should verify both ref and tag to be valid */
> >> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> >> index ed1bdcf1f8ab..a00dc5a867e4 100644
> >> --- a/lib/alloc_tag.c
> >> +++ b/lib/alloc_tag.c
> >> @@ -766,45 +766,49 @@ static __init bool need_page_alloc_tagging(void)
> >> * Some pages are allocated before page_ext becomes available, leaving
> >> * their codetag uninitialized. Track these early PFNs so we can clear
> >> * their codetag refs later to avoid warnings when they are freed.
> >> - *
> >> - * Early allocations include:
> >> - * - Base allocations independent of CPU count
> >> - * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
> >> - * such as trace ring buffers, scheduler per-cpu data)
> >> - *
> >> - * For simplicity, we fix the size to 8192.
> >> - * If insufficient, a warning will be triggered to alert the user.
> >> - *
> >> - * TODO: Replace fixed-size array with dynamic allocation using
> >> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
> >> */
> >> -#define EARLY_ALLOC_PFN_MAX 8192
> >> +#define EARLY_PFNS_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
> >>
> >> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
> >> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
> >> +static struct page *early_pfn_current __initdata;
> > So, instead of using early_pfn_current as the head page and then
> > page->lru.next directly, I would suggest having "struct llist_head
> > early_pfn_head;" as the list head and either using page->pcp_llist to
> > link the pages or even better, adding a "struct llist_node llist;"
> > into the union containing "struct list_head lru;". Note that
> > llist_add() uses try_cmpxchg(), so the link addition is atomic.
>
> Adding a struct llist_node into the page union only for the
> CONFIG_MEM_ALLOC_PROFILING_DEBUG feature,
>
> which may not be quite appropriate from a design perspective.
>
> > And you should probably use folios rather than pages to follow the new trends.
> >
> >> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
> >> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >> {
> >> - int old_idx, new_idx;
> >> + struct page *page;
> >> + unsigned long idx;
> >>
> >> do {
> >> - old_idx = atomic_read(&early_pfn_count);
> >> - if (old_idx >= EARLY_ALLOC_PFN_MAX) {
> >> - pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
> >> - EARLY_ALLOC_PFN_MAX);
> >> - return;
> >> + page = READ_ONCE(early_pfn_current);
> >> + if (!page || READ_ONCE(page->private) >= EARLY_PFNS_PER_PAGE) {
> >> + gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
> >> + struct page *new = alloc_page(gfp | __GFP_NO_CODETAG);
> >> +
> >> + if (!new) {
> >> + pr_warn_once("early PFN tracking page allocation failed\n");
> >> + return;
> >> + }
> >> + new->lru.next = (struct list_head *)page;
> >> + if (cmpxchg(&early_pfn_current, page, new) != page) {
> >> + __free_page(new);
> >> + continue;
> >> + }
>
>
> I also considered using llist_head initially. like this
>
> static LLIST_HEAD(early_pfn_pages);
>
> static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t
> gfp_flags)
> {
> struct page *page;
> unsigned long idx;
>
> do {
> page = llist_first(&early_pfn_pages);
> if (!page || READ_ONCE(page->private) >=
> EARLY_PFNS_PER_PAGE) {
> gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
> struct page *new = alloc_page(gfp |
> __GFP_NO_CODETAG);
>
> if (!new) {
> pr_warn_once("early PFN tracking page
> allocation failed\n");
> return;
> }
> new->private = 0;
> llist_add(&new->pcp_llist, &early_pfn_pages);
> page = new;
> }
> idx = READ_ONCE(page->private);
> if (idx >= EARLY_PFNS_PER_PAGE)
> continue;
> if (cmpxchg(&page->private, idx, idx + 1) == idx)
> break;
> } while (1);
>
> ((unsigned long *)page_address(page))[idx] = pfn;
> }
>
>
> The problem is that llist_add() retries internally until it succeeds.
>
> When multiple CPUs concurrently see that the head page is NULL or full ,
>
> they each allocate a new tracking page and all insertions succeed:
>
> CPU0: head page A is full or NULL, allocates B, llist_add(B) → head=B→A
>
> CPU1: head page A is full or NULL, allocates C, llist_add(C) → head=C→B→A
>
> CPU0: claims slot 0 in B
>
> CPU1: claims slot 0 in C
>
> Now the head is C. Once C fills up, a new page D is allocated and
> becomes the new head. But B still has ~511 unused slots and will never
>
> be the head again — we only ever look at llist_first(). Those slots are
> permanently wasted.
>
> That's why I used struct page *early_pfn_current with direct cmpxchg —
> the loser frees its page and retries, avoiding the waste.
>
> That said, I may be missing something. Is there a way to use llist that
> avoids this? I'd appreciate your thoughts.
No, I think you are right, that retry logic in llist_add() does not
work for our case. I think the code you have works, but the way it
uses page attributes is kinda hacky and confusing. If we have to open
code the linked list handling then maybe just do this:
struct pfn_arr_hdr {
	struct pfn_arr *next;
	atomic_t count;
};

#define PFN_ARR_SIZE \
	((PAGE_SIZE - sizeof(struct pfn_arr_hdr)) / sizeof(unsigned long))

struct pfn_arr {
	struct pfn_arr_hdr hdr;
	unsigned long pfns[PFN_ARR_SIZE];
};
So, instead of allocating pages, you use kmalloc() to allocate struct
pfn_arr and fill pfn_arr.pfns until it's full using pfn_arr.hdr.count
to track the array size. When current pfn_arr is full, you allocate a
new pfn_arr and link them via pfn_arr.hdr.next atomically, the way you
are doing with pages. That seems cleaner to me. WDYT?
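For concreteness, a rough and untested sketch of how the record path could
look with that layout, keeping the cmpxchg pattern from your v3 (this assumes
__GFP_NO_CODETAG would also suppress recording for the slab's backing page
allocation):

static struct pfn_arr *early_pfn_current __initdata;

static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
{
	struct pfn_arr *arr;
	int idx;

	do {
		arr = READ_ONCE(early_pfn_current);
		if (!arr || atomic_read(&arr->hdr.count) >= PFN_ARR_SIZE) {
			gfp_t gfp = (gfp_flags & ~__GFP_DIRECT_RECLAIM) | __GFP_NO_CODETAG;
			struct pfn_arr *new = kmalloc(sizeof(*new), gfp);

			if (!new) {
				pr_warn_once("early PFN tracking allocation failed\n");
				return;
			}
			new->hdr.next = arr;
			atomic_set(&new->hdr.count, 0);
			/* Loser frees its copy and retries, as in v3. */
			if (cmpxchg(&early_pfn_current, arr, new) != arr) {
				kfree(new);
				continue;
			}
			arr = new;
		}
		/* Claim slot idx atomically. */
		idx = atomic_read(&arr->hdr.count);
		if (idx >= PFN_ARR_SIZE)
			continue;
		if (atomic_cmpxchg(&arr->hdr.count, idx, idx + 1) == idx)
			break;
	} while (1);

	arr->pfns[idx] = pfn;
}

The cleanup pass would then walk hdr.next and kfree() each array instead of
freeing pages.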
>
>
> >> + page = new;
> >> }
> >> - new_idx = old_idx + 1;
> >> - } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
> >> + idx = READ_ONCE(page->private);
> >> + if (idx >= EARLY_PFNS_PER_PAGE)
> >> + continue;
> >> + if (cmpxchg(&page->private, idx, idx + 1) != idx)
> >> + continue;
> >> + break;
> > nit: This would read more easily:
> >
> > if (cmpxchg(&page->private, idx, idx + 1) == idx)
> > break;
> will fix in the next version.
> >
> >> + } while (1);
> >>
> >> - early_pfns[old_idx] = pfn;
> >> + ((unsigned long *)page_address(page))[idx] = pfn;
> >> }
> >>
> >> -typedef void alloc_tag_add_func(unsigned long pfn);
> >> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
> >> static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
> >> RCU_INITIALIZER(__alloc_tag_add_early_pfn);
> >>
> >> -void alloc_tag_add_early_pfn(unsigned long pfn)
> >> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >> {
> >> alloc_tag_add_func *alloc_tag_add;
> >>
> >> @@ -814,13 +818,14 @@ void alloc_tag_add_early_pfn(unsigned long pfn)
> >> rcu_read_lock();
> >> alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
> >> if (alloc_tag_add)
> >> - alloc_tag_add(pfn);
> >> + alloc_tag_add(pfn, gfp_flags);
> >> rcu_read_unlock();
> >> }
> >>
> >> static void __init clear_early_alloc_pfn_tag_refs(void)
> >> {
> >> - unsigned int i;
> >> + struct page *page, *next;
> >> + unsigned long i;
> >>
> >> if (static_key_enabled(&mem_profiling_compressed))
> >> return;
> >> @@ -829,37 +834,42 @@ static void __init clear_early_alloc_pfn_tag_refs(void)
> >> /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
> >> synchronize_rcu();
> >>
> >> - for (i = 0; i < atomic_read(&early_pfn_count); i++) {
> >> - unsigned long pfn = early_pfns[i];
> >> -
> >> - if (pfn_valid(pfn)) {
> >> - struct page *page = pfn_to_page(pfn);
> >> - union pgtag_ref_handle handle;
> >> - union codetag_ref ref;
> >> -
> >> - if (get_page_tag_ref(page, &ref, &handle)) {
> >> - /*
> >> - * An early-allocated page could be freed and reallocated
> >> - * after its page_ext is initialized but before we clear it.
> >> - * In that case, it already has a valid tag set.
> >> - * We should not overwrite that valid tag with CODETAG_EMPTY.
> >> - *
> >> - * Note: there is still a small race window between checking
> >> - * ref.ct and calling set_codetag_empty(). We accept this
> >> - * race as it's unlikely and the extra complexity of atomic
> >> - * cmpxchg is not worth it for this debug-only code path.
> >> - */
> >> - if (ref.ct) {
> >> + for (page = early_pfn_current; page; page = next) {
> >> + for (i = 0; i < page->private; i++) {
> >> + unsigned long pfn = ((unsigned long *)page_address(page))[i];
> > This page_address() can be done in the outer loop since the page does
> > not change inside the inner loop.
> will fix in the next version.
> >> +
> >> + if (pfn_valid(pfn)) {
> >> + union pgtag_ref_handle handle;
> >> + union codetag_ref ref;
> >> +
> >> + if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
> >> + /*
> >> + * An early-allocated page could be freed and reallocated
> >> + * after its page_ext is initialized but before we clear it.
> >> + * In that case, it already has a valid tag set.
> >> + * We should not overwrite that valid tag
> >> + * with CODETAG_EMPTY.
> >> + *
> >> + * Note: there is still a small race window between checking
> >> + * ref.ct and calling set_codetag_empty(). We accept this
> >> + * race as it's unlikely and the extra complexity of atomic
> >> + * cmpxchg is not worth it for this debug-only code path.
> >> + */
> >> + if (ref.ct) {
> >> + put_page_tag_ref(handle);
> >> + continue;
> >> + }
> >> +
> >> + set_codetag_empty(&ref);
> >> + update_page_tag_ref(handle, &ref);
> >> put_page_tag_ref(handle);
> >> - continue;
> >> }
> >> -
> >> - set_codetag_empty(&ref);
> >> - update_page_tag_ref(handle, &ref);
> >> - put_page_tag_ref(handle);
> >> }
> >> }
> >>
> >> + next = (struct page *)page->lru.next;
> >> + clear_page_tag_ref(page);
> >> + __free_page(page);
> >> }
> >> }
> >> #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index 04494bc2e46f..5b7b234967a5 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -1284,7 +1284,7 @@ void __clear_page_tag_ref(struct page *page)
> >> /* Should be called only if mem_alloc_profiling_enabled() */
> >> static noinline
> >> void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> >> - unsigned int nr)
> >> + unsigned int nr, gfp_t gfp_flags)
> >> {
> >> union pgtag_ref_handle handle;
> >> union codetag_ref ref;
> >> @@ -1294,21 +1294,24 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> >> update_page_tag_ref(handle, &ref);
> >> put_page_tag_ref(handle);
> >> } else {
> >> - /*
> >> - * page_ext is not available yet, record the pfn so we can
> >> - * clear the tag ref later when page_ext is initialized.
> >> - */
> >> - alloc_tag_add_early_pfn(page_to_pfn(page));
> >> +
> >> if (task->alloc_tag)
> >> alloc_tag_set_inaccurate(task->alloc_tag);
> >> +
> >> + /*
> >> + * page_ext is not available yet, record the pfn so the
> >> + * tag ref can be cleared later when page_ext is initialized.
> >> + */
> >> + if (should_record_early_pfn(gfp_flags))
> >> + alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
> > Any reason for changing the order of alloc_tag_add_early_pfn() and
> > alloc_tag_set_inaccurate()? I liked the previous version where we
> > explain why this is done at the very beginning of the block. You could
> > also call should_record_early_pfn() from inside
> > alloc_tag_add_early_pfn(), which would keep both
> > should_record_early_pfn() and __GFP_NO_CODETAG definitions local to
> > alloc_tag.c.
> >
> will fix in the next version.
>
> Thanks
>
> Best regards.
>
> Hao
>
> >> }
> >> }
> >>
> >> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> >> - unsigned int nr)
> >> + unsigned int nr, gfp_t gfp_flags)
> >> {
> >> if (mem_alloc_profiling_enabled())
> >> - __pgalloc_tag_add(page, task, nr);
> >> + __pgalloc_tag_add(page, task, nr, gfp_flags);
> >> }
> >>
> >> /* Should be called only if mem_alloc_profiling_enabled() */
> >> @@ -1341,7 +1344,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
> >> #else /* CONFIG_MEM_ALLOC_PROFILING */
> >>
> >> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> >> - unsigned int nr) {}
> >> + unsigned int nr, gfp_t gfp_flags) {}
> >> static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
> >> static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
> >>
> >> @@ -1896,7 +1899,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
> >>
> >> set_page_owner(page, order, gfp_flags);
> >> page_table_check_alloc(page, order);
> >> - pgalloc_tag_add(page, current, 1 << order);
> >> + pgalloc_tag_add(page, current, 1 << order, gfp_flags);
> >> }
> >>
> >> static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
> >> --
> >> 2.25.1
> >>
* Re: [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
2026-04-29 5:47 ` Suren Baghdasaryan
@ 2026-04-29 7:03 ` Hao Ge
2026-04-29 18:20 ` Suren Baghdasaryan
0 siblings, 1 reply; 7+ messages in thread
From: Hao Ge @ 2026-04-29 7:03 UTC (permalink / raw)
To: Suren Baghdasaryan; +Cc: Kent Overstreet, Andrew Morton, linux-mm, linux-kernel
Hi Suren
On 2026/4/29 13:47, Suren Baghdasaryan wrote:
> On Mon, Apr 27, 2026 at 8:01 PM Hao Ge <hao.ge@linux.dev> wrote:
>> Hi Suren
>>
>>
>> Thanks a lot for your review and suggestions.
>>
>>
>> On 2026/4/28 00:02, Suren Baghdasaryan wrote:
>>> On Thu, Apr 23, 2026 at 1:39 AM Hao Ge <hao.ge@linux.dev> wrote:
>>>> Pages allocated before page_ext is available have their codetag left
>>>> uninitialized. Track these early PFNs and clear their codetag in
>>>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
>>>> warnings when they are freed later.
>>>>
>>>> Currently a fixed-size array of 8192 entries is used, with a warning if
>>>> the limit is exceeded. However, the number of early allocations depends
>>>> on the number of CPUs and can be larger than 8192.
>>>>
>>>> Replace the fixed-size array with a dynamically allocated linked list
>>>> of pages. Each page stores PFNs in its page body and uses page->lru
>>>> to link to the next page, with page->private tracking the number of
>>>> used slots.
>>>>
>>>> The list pages themselves are allocated via alloc_page(), which would
>>>> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
>>>> indefinitely. Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
>>>> bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
>>>> can skip recording allocations that carry this flag.
>>>>
>>>> Signed-off-by: Hao Ge <hao.ge@linux.dev>
>>>> ---
>>>> v3:
>>>> - Simplify linked list: use page->lru for chaining and page->private as
>>>> slot counter, removing the early_pfn_node struct and freelist (suggested
>>>> by Suren Baghdasaryan)
>>>> - Pass gfp_flags through alloc_tag_add_early_pfn() but strip
>>>> __GFP_DIRECT_RECLAIM instead of selecting GFP_KERNEL/GFP_ATOMIC,
>>>> because __alloc_tag_add_early_pfn() is invoked under rcu_read_lock().
>>>>
>>>> v2:
>>>> - Use cmpxchg to atomically update early_pfn_pages, preventing page leak under concurrent allocation
>>>> - Pass gfp_flags through the full call chain and use gfpflags_allow_blocking()
>>>> to select GFP_KERNEL vs GFP_ATOMIC, avoiding unnecessary GFP_ATOMIC in process context
>>>> ---
>>>> include/linux/alloc_tag.h | 22 ++++++-
>>>> lib/alloc_tag.c | 118 +++++++++++++++++++++-----------------
>>>> mm/page_alloc.c | 23 ++++----
>>>> 3 files changed, 97 insertions(+), 66 deletions(-)
>>>>
>>>> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
>>>> index 02de2ede560f..ce9b478033bf 100644
>>>> --- a/include/linux/alloc_tag.h
>>>> +++ b/include/linux/alloc_tag.h
>>>> @@ -150,6 +150,23 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
>>>> }
>>>>
>>>> #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
>>>> +/*
>>>> + * Skip early PFN recording for a page allocation. Reuses the
>>>> + * %__GFP_NO_OBJ_EXT bit. Used by __alloc_tag_add_early_pfn() to avoid
>>>> + * recursion when allocating pages for the early PFN tracking list
>>>> + * itself.
>>>> + *
>>>> + * Callers must set the codetag to CODETAG_EMPTY (via
>>>> + * clear_page_tag_ref()) before freeing pages allocated with this
>>>> + * flag once page_ext becomes available, otherwise
>>>> + * alloc_tag_sub_check() will trigger a warning.
>>> nit: Callers must... sounds like someone who uses
>>> __alloc_tag_add_early_pfn() has to do something extra, however
>>> clear_page_tag_ref() is done by clear_early_alloc_pfn_tag_refs(), so
>>> callers don't really need to do this. I suggest rephrasing as:
>>>
>>> Codetags of the pages allocated with __GFP_NO_CODETAG should be
>>> cleared (via clear_page_tag_ref()) before freeing the pages to prevent
>>> alloc_tag_sub_check() from triggering a warning.
>> That's reasonable.
>>
>> I will adjust the wording in v2.
>>>> + */
>>>> +#define __GFP_NO_CODETAG __GFP_NO_OBJ_EXT
>>>> +
>>>> +static inline bool should_record_early_pfn(gfp_t gfp_flags)
>>>> +{
>>>> + return !(gfp_flags & __GFP_NO_CODETAG);
>>>> +}
>>>> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
>>>> {
>>>> WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref),
>>>> @@ -163,11 +180,12 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
>>>> {
>>>> WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
>>>> }
>>>> -void alloc_tag_add_early_pfn(unsigned long pfn);
>>>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
>>>> #else
>>>> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
>>>> static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
>>>> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
>>>> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
>>>> +static inline bool should_record_early_pfn(gfp_t gfp_flags) { return false; }
>>>> #endif
>>>>
>>>> /* Caller should verify both ref and tag to be valid */
>>>> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
>>>> index ed1bdcf1f8ab..a00dc5a867e4 100644
>>>> --- a/lib/alloc_tag.c
>>>> +++ b/lib/alloc_tag.c
>>>> @@ -766,45 +766,49 @@ static __init bool need_page_alloc_tagging(void)
>>>> * Some pages are allocated before page_ext becomes available, leaving
>>>> * their codetag uninitialized. Track these early PFNs so we can clear
>>>> * their codetag refs later to avoid warnings when they are freed.
>>>> - *
>>>> - * Early allocations include:
>>>> - * - Base allocations independent of CPU count
>>>> - * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
>>>> - * such as trace ring buffers, scheduler per-cpu data)
>>>> - *
>>>> - * For simplicity, we fix the size to 8192.
>>>> - * If insufficient, a warning will be triggered to alert the user.
>>>> - *
>>>> - * TODO: Replace fixed-size array with dynamic allocation using
>>>> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
>>>> */
>>>> -#define EARLY_ALLOC_PFN_MAX 8192
>>>> +#define EARLY_PFNS_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
>>>>
>>>> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
>>>> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
>>>> +static struct page *early_pfn_current __initdata;
>>> So, instead of using early_pfn_current as the head page and then
>>> page->lru.next directly, I would suggest having "struct llist_head
>>> early_pfn_head;" as the list head and either using page->pcp_llist to
>>> link the pages or even better, adding a "struct llist_node llist;"
>>> into the union containing "struct list_head lru;". Note that
>>> llist_add() uses try_cmpxchg(), so the link addition is atomic.
>> Adding a struct llist_node into the page union only for the
>> CONFIG_MEM_ALLOC_PROFILING_DEBUG feature,
>>
>> which may not be quite appropriate from a design perspective.
>>
>>> And you should probably use folios rather than pages to follow the new trends.
>>>
>>>> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
>>>> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>>>> {
>>>> - int old_idx, new_idx;
>>>> + struct page *page;
>>>> + unsigned long idx;
>>>>
>>>> do {
>>>> - old_idx = atomic_read(&early_pfn_count);
>>>> - if (old_idx >= EARLY_ALLOC_PFN_MAX) {
>>>> - pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
>>>> - EARLY_ALLOC_PFN_MAX);
>>>> - return;
>>>> + page = READ_ONCE(early_pfn_current);
>>>> + if (!page || READ_ONCE(page->private) >= EARLY_PFNS_PER_PAGE) {
>>>> + gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
>>>> + struct page *new = alloc_page(gfp | __GFP_NO_CODETAG);
>>>> +
>>>> + if (!new) {
>>>> + pr_warn_once("early PFN tracking page allocation failed\n");
>>>> + return;
>>>> + }
>>>> + new->lru.next = (struct list_head *)page;
>>>> + if (cmpxchg(&early_pfn_current, page, new) != page) {
>>>> + __free_page(new);
>>>> + continue;
>>>> + }
>>
>> I also considered using llist_head initially. like this
>>
>> static LLIST_HEAD(early_pfn_pages);
>>
>> static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t
>> gfp_flags)
>> {
>> struct page *page;
>> unsigned long idx;
>>
>> do {
>> page = llist_first(&early_pfn_pages);
>> if (!page || READ_ONCE(page->private) >=
>> EARLY_PFNS_PER_PAGE) {
>> gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
>> struct page *new = alloc_page(gfp |
>> __GFP_NO_CODETAG);
>>
>> if (!new) {
>> pr_warn_once("early PFN tracking page
>> allocation failed\n");
>> return;
>> }
>> new->private = 0;
>> llist_add(&new->pcp_llist, &early_pfn_pages);
>> page = new;
>> }
>> idx = READ_ONCE(page->private);
>> if (idx >= EARLY_PFNS_PER_PAGE)
>> continue;
>> if (cmpxchg(&page->private, idx, idx + 1) == idx)
>> break;
>> } while (1);
>>
>> ((unsigned long *)page_address(page))[idx] = pfn;
>> }
>>
>>
>> The problem is that llist_add() retries internally until it succeeds.
>>
>> When multiple CPUs concurrently see that the head page is NULL or full ,
>>
>> they each allocate a new tracking page and all insertions succeed:
>>
>> CPU0: head page A is full or NULL, allocates B, llist_add(B) → head=B→A
>>
>> CPU1: head page A is full or NULL, allocates C, llist_add(C) → head=C→B→A
>>
>> CPU0: claims slot 0 in B
>>
>> CPU1: claims slot 0 in C
>>
>> Now the head is C. Once C fills up, a new page D is allocated and
>> becomes the new head. But B still has ~511 unused slots and will never
>>
>> be the head again — we only ever look at llist_first(). Those slots are
>> permanently wasted.
>>
>> That's why I used struct page *early_pfn_current with direct cmpxchg —
>> the loser frees its page and retries, avoiding the waste.
>>
>> That said, I may be missing something. Is there a way to use llist that
>> avoids this? I'd appreciate your thoughts.
> No, I think you are right, that retry logic in llist_add() does not
> work for our case. I think the code you have works, but the way it
> uses page attributes is kinda hacky and confusing. If we have to open
> code the linked list handling then maybe just do this:
>
> struct pfn_arr_hdr {
> struct pfn_arr *next;
> atomic_t count;
> };
>
> #define PFN_ARR_SIZE ((PAGE_SIZE - sizeof(struct pfn_arr_hdr)) /
> sizeof(unsigned long))
>
> struct pfn_arr {
> struct pfn_arr_hdr hdr;
> unsigned long pfns[PFN_ARR_SIZE];
> };
>
> So, instead of allocating pages, you use kmalloc() to allocate struct
> pfn_arr and fill pfn_arr.pfns until it's full using pfn_arr.hdr.count
> to track the array size. When current pfn_arr is full, you allocate a
> new pfn_arr and link them via pfn_arr.hdr.next atomically, the way you
> are doing with pages. That seems cleaner to me. WDYT?
Actually, using kmalloc() would not be safe either, as page allocations
can occur before kmalloc() becomes available.
For example:
memblock_free_all() ← buddy allocator ready
mem_init()
preallocate_vmalloc_pages()
p4d_alloc() / pud_alloc()
alloc_pages() -> triggers alloc_tag_add_early_pfn()
kmem_cache_init() -> kmalloc becomes available
Using alloc_page() is the safest choice — the very fact that we reach
alloc_tag_add_early_pfn() means a page allocation via the buddy
allocator has already succeeded, so the buddy allocator is guaranteed to
be ready.
How about keeping alloc_page() but mapping the page body to a proper struct?
struct pfn_pool {
	struct pfn_pool *next;
	atomic_t count;
	unsigned long pfns[];
};
The first few bytes of the page hold the metadata (next pointer and slot
count), and the remainder stores PFNs.
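A rough, untested sketch of how that could look (EARLY_PFNS_PER_POOL and
alloc_pfn_pool() are just illustrative names):

#define EARLY_PFNS_PER_POOL \
	((PAGE_SIZE - offsetof(struct pfn_pool, pfns)) / sizeof(unsigned long))

static struct pfn_pool *early_pfn_pool __initdata;

/* Allocate a tracking page and map its body onto struct pfn_pool. */
static struct pfn_pool * __init alloc_pfn_pool(gfp_t gfp_flags)
{
	gfp_t gfp = (gfp_flags & ~__GFP_DIRECT_RECLAIM) | __GFP_NO_CODETAG;
	struct page *page = alloc_page(gfp);
	struct pfn_pool *pool;

	if (!page)
		return NULL;

	pool = page_address(page);
	pool->next = NULL;
	atomic_set(&pool->count, 0);
	return pool;
}

clear_early_alloc_pfn_tag_refs() would then walk pool->next and free each pool
with clear_page_tag_ref(virt_to_page(pool)) followed by
__free_page(virt_to_page(pool)), as v3 already does for the raw pages.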
WDYT?
Thanks.
Best regards.
Hao
>
>>
>>>> + page = new;
>>>> }
>>>> - new_idx = old_idx + 1;
>>>> - } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
>>>> + idx = READ_ONCE(page->private);
>>>> + if (idx >= EARLY_PFNS_PER_PAGE)
>>>> + continue;
>>>> + if (cmpxchg(&page->private, idx, idx + 1) != idx)
>>>> + continue;
>>>> + break;
>>> nit: This would read more easily:
>>>
>>> if (cmpxchg(&page->private, idx, idx + 1) == idx)
>>> break;
>> will fix in the next version.
>>>> + } while (1);
>>>>
>>>> - early_pfns[old_idx] = pfn;
>>>> + ((unsigned long *)page_address(page))[idx] = pfn;
>>>> }
>>>>
>>>> -typedef void alloc_tag_add_func(unsigned long pfn);
>>>> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
>>>> static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
>>>> RCU_INITIALIZER(__alloc_tag_add_early_pfn);
>>>>
>>>> -void alloc_tag_add_early_pfn(unsigned long pfn)
>>>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
>>>> {
>>>> alloc_tag_add_func *alloc_tag_add;
>>>>
>>>> @@ -814,13 +818,14 @@ void alloc_tag_add_early_pfn(unsigned long pfn)
>>>> rcu_read_lock();
>>>> alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
>>>> if (alloc_tag_add)
>>>> - alloc_tag_add(pfn);
>>>> + alloc_tag_add(pfn, gfp_flags);
>>>> rcu_read_unlock();
>>>> }
>>>>
>>>> static void __init clear_early_alloc_pfn_tag_refs(void)
>>>> {
>>>> - unsigned int i;
>>>> + struct page *page, *next;
>>>> + unsigned long i;
>>>>
>>>> if (static_key_enabled(&mem_profiling_compressed))
>>>> return;
>>>> @@ -829,37 +834,42 @@ static void __init clear_early_alloc_pfn_tag_refs(void)
>>>> /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
>>>> synchronize_rcu();
>>>>
>>>> - for (i = 0; i < atomic_read(&early_pfn_count); i++) {
>>>> - unsigned long pfn = early_pfns[i];
>>>> -
>>>> - if (pfn_valid(pfn)) {
>>>> - struct page *page = pfn_to_page(pfn);
>>>> - union pgtag_ref_handle handle;
>>>> - union codetag_ref ref;
>>>> -
>>>> - if (get_page_tag_ref(page, &ref, &handle)) {
>>>> - /*
>>>> - * An early-allocated page could be freed and reallocated
>>>> - * after its page_ext is initialized but before we clear it.
>>>> - * In that case, it already has a valid tag set.
>>>> - * We should not overwrite that valid tag with CODETAG_EMPTY.
>>>> - *
>>>> - * Note: there is still a small race window between checking
>>>> - * ref.ct and calling set_codetag_empty(). We accept this
>>>> - * race as it's unlikely and the extra complexity of atomic
>>>> - * cmpxchg is not worth it for this debug-only code path.
>>>> - */
>>>> - if (ref.ct) {
>>>> + for (page = early_pfn_current; page; page = next) {
>>>> + for (i = 0; i < page->private; i++) {
>>>> + unsigned long pfn = ((unsigned long *)page_address(page))[i];
>>> This page_address() can be done in the outer loop since the page does
>>> not change inside the inner loop.
>> will fix in the next version.
>>>> +
>>>> + if (pfn_valid(pfn)) {
>>>> + union pgtag_ref_handle handle;
>>>> + union codetag_ref ref;
>>>> +
>>>> + if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
>>>> + /*
>>>> + * An early-allocated page could be freed and reallocated
>>>> + * after its page_ext is initialized but before we clear it.
>>>> + * In that case, it already has a valid tag set.
>>>> + * We should not overwrite that valid tag
>>>> + * with CODETAG_EMPTY.
>>>> + *
>>>> + * Note: there is still a small race window between checking
>>>> + * ref.ct and calling set_codetag_empty(). We accept this
>>>> + * race as it's unlikely and the extra complexity of atomic
>>>> + * cmpxchg is not worth it for this debug-only code path.
>>>> + */
>>>> + if (ref.ct) {
>>>> + put_page_tag_ref(handle);
>>>> + continue;
>>>> + }
>>>> +
>>>> + set_codetag_empty(&ref);
>>>> + update_page_tag_ref(handle, &ref);
>>>> put_page_tag_ref(handle);
>>>> - continue;
>>>> }
>>>> -
>>>> - set_codetag_empty(&ref);
>>>> - update_page_tag_ref(handle, &ref);
>>>> - put_page_tag_ref(handle);
>>>> }
>>>> }
>>>>
>>>> + next = (struct page *)page->lru.next;
>>>> + clear_page_tag_ref(page);
>>>> + __free_page(page);
>>>> }
>>>> }
>>>> #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>> index 04494bc2e46f..5b7b234967a5 100644
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -1284,7 +1284,7 @@ void __clear_page_tag_ref(struct page *page)
>>>> /* Should be called only if mem_alloc_profiling_enabled() */
>>>> static noinline
>>>> void __pgalloc_tag_add(struct page *page, struct task_struct *task,
>>>> - unsigned int nr)
>>>> + unsigned int nr, gfp_t gfp_flags)
>>>> {
>>>> union pgtag_ref_handle handle;
>>>> union codetag_ref ref;
>>>> @@ -1294,21 +1294,24 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
>>>> update_page_tag_ref(handle, &ref);
>>>> put_page_tag_ref(handle);
>>>> } else {
>>>> - /*
>>>> - * page_ext is not available yet, record the pfn so we can
>>>> - * clear the tag ref later when page_ext is initialized.
>>>> - */
>>>> - alloc_tag_add_early_pfn(page_to_pfn(page));
>>>> +
>>>> if (task->alloc_tag)
>>>> alloc_tag_set_inaccurate(task->alloc_tag);
>>>> +
>>>> + /*
>>>> + * page_ext is not available yet, record the pfn so the
>>>> + * tag ref can be cleared later when page_ext is initialized.
>>>> + */
>>>> + if (should_record_early_pfn(gfp_flags))
>>>> + alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
>>> Any reason for changing the order of alloc_tag_add_early_pfn() and
>>> alloc_tag_set_inaccurate()? I liked the previous version where we
>>> explain why this is done at the very beginning of the block. You could
>>> also call should_record_early_pfn() from inside
>>> alloc_tag_add_early_pfn(), which would keep both
>>> should_record_early_pfn() and __GFP_NO_CODETAG definitions local to
>>> alloc_tag.c.
>>>
>> will fix in the next version.
>>
>> Thanks
>>
>> Best regards.
>>
>> Hao
>>
>>>> }
>>>> }
>>>>
>>>> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
>>>> - unsigned int nr)
>>>> + unsigned int nr, gfp_t gfp_flags)
>>>> {
>>>> if (mem_alloc_profiling_enabled())
>>>> - __pgalloc_tag_add(page, task, nr);
>>>> + __pgalloc_tag_add(page, task, nr, gfp_flags);
>>>> }
>>>>
>>>> /* Should be called only if mem_alloc_profiling_enabled() */
>>>> @@ -1341,7 +1344,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
>>>> #else /* CONFIG_MEM_ALLOC_PROFILING */
>>>>
>>>> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
>>>> - unsigned int nr) {}
>>>> + unsigned int nr, gfp_t gfp_flags) {}
>>>> static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
>>>> static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
>>>>
>>>> @@ -1896,7 +1899,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>>>>
>>>> set_page_owner(page, order, gfp_flags);
>>>> page_table_check_alloc(page, order);
>>>> - pgalloc_tag_add(page, current, 1 << order);
>>>> + pgalloc_tag_add(page, current, 1 << order, gfp_flags);
>>>> }
>>>>
>>>> static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
>>>> --
>>>> 2.25.1
>>>>
* Re: [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list
2026-04-29 7:03 ` Hao Ge
@ 2026-04-29 18:20 ` Suren Baghdasaryan
0 siblings, 0 replies; 7+ messages in thread
From: Suren Baghdasaryan @ 2026-04-29 18:20 UTC (permalink / raw)
To: Hao Ge; +Cc: Kent Overstreet, Andrew Morton, linux-mm, linux-kernel
On Wed, Apr 29, 2026 at 12:04 AM Hao Ge <hao.ge@linux.dev> wrote:
>
> Hi Suren
>
>
> On 2026/4/29 13:47, Suren Baghdasaryan wrote:
> > On Mon, Apr 27, 2026 at 8:01 PM Hao Ge <hao.ge@linux.dev> wrote:
> >> Hi Suren
> >>
> >>
> >> Thanks a lot for your review and suggestions.
> >>
> >>
> >> On 2026/4/28 00:02, Suren Baghdasaryan wrote:
> >>> On Thu, Apr 23, 2026 at 1:39 AM Hao Ge <hao.ge@linux.dev> wrote:
> >>>> Pages allocated before page_ext is available have their codetag left
> >>>> uninitialized. Track these early PFNs and clear their codetag in
> >>>> clear_early_alloc_pfn_tag_refs() to avoid "alloc_tag was not set"
> >>>> warnings when they are freed later.
> >>>>
> >>>> Currently a fixed-size array of 8192 entries is used, with a warning if
> >>>> the limit is exceeded. However, the number of early allocations depends
> >>>> on the number of CPUs and can be larger than 8192.
> >>>>
> >>>> Replace the fixed-size array with a dynamically allocated linked list
> >>>> of pages. Each page stores PFNs in its page body and uses page->lru
> >>>> to link to the next page, with page->private tracking the number of
> >>>> used slots.
> >>>>
> >>>> The list pages themselves are allocated via alloc_page(), which would
> >>>> trigger __pgalloc_tag_add() -> alloc_tag_add_early_pfn() and recurse
> >>>> indefinitely. Introduce __GFP_NO_CODETAG (reuses the %__GFP_NO_OBJ_EXT
> >>>> bit) and pass gfp_flags through pgalloc_tag_add() so that the early path
> >>>> can skip recording allocations that carry this flag.
> >>>>
> >>>> Signed-off-by: Hao Ge <hao.ge@linux.dev>
> >>>> ---
> >>>> v3:
> >>>> - Simplify linked list: use page->lru for chaining and page->private as
> >>>> slot counter, removing the early_pfn_node struct and freelist (suggested
> >>>> by Suren Baghdasaryan)
> >>>> - Pass gfp_flags through alloc_tag_add_early_pfn() but strip
> >>>> __GFP_DIRECT_RECLAIM instead of selecting GFP_KERNEL/GFP_ATOMIC,
> >>>> because __alloc_tag_add_early_pfn() is invoked under rcu_read_lock().
> >>>>
> >>>> v2:
> >>>> - Use cmpxchg to atomically update early_pfn_pages, preventing page leak under concurrent allocation
> >>>> - Pass gfp_flags through the full call chain and use gfpflags_allow_blocking()
> >>>> to select GFP_KERNEL vs GFP_ATOMIC, avoiding unnecessary GFP_ATOMIC in process context
> >>>> ---
> >>>> include/linux/alloc_tag.h | 22 ++++++-
> >>>> lib/alloc_tag.c | 118 +++++++++++++++++++++-----------------
> >>>> mm/page_alloc.c | 23 ++++----
> >>>> 3 files changed, 97 insertions(+), 66 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> >>>> index 02de2ede560f..ce9b478033bf 100644
> >>>> --- a/include/linux/alloc_tag.h
> >>>> +++ b/include/linux/alloc_tag.h
> >>>> @@ -150,6 +150,23 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> >>>> }
> >>>>
> >>>> #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> >>>> +/*
> >>>> + * Skip early PFN recording for a page allocation. Reuses the
> >>>> + * %__GFP_NO_OBJ_EXT bit. Used by __alloc_tag_add_early_pfn() to avoid
> >>>> + * recursion when allocating pages for the early PFN tracking list
> >>>> + * itself.
> >>>> + *
> >>>> + * Callers must set the codetag to CODETAG_EMPTY (via
> >>>> + * clear_page_tag_ref()) before freeing pages allocated with this
> >>>> + * flag once page_ext becomes available, otherwise
> >>>> + * alloc_tag_sub_check() will trigger a warning.
> >>> nit: Callers must... sounds like someone who uses
> >>> __alloc_tag_add_early_pfn() has to do something extra, however
> >>> clear_page_tag_ref() is done by clear_early_alloc_pfn_tag_refs(), so
> >>> callers don't really need to do this. I suggest rephrasing as:
> >>>
> >>> Codetags of the pages allocated with __GFP_NO_CODETAG should be
> >>> cleared (via clear_page_tag_ref()) before freeing the pages to prevent
> >>> alloc_tag_sub_check() from triggering a warning.
> >> That's reasonable.
> >>
> >> I will adjust the wording in v2.
> >>>> + */
> >>>> +#define __GFP_NO_CODETAG __GFP_NO_OBJ_EXT
> >>>> +
> >>>> +static inline bool should_record_early_pfn(gfp_t gfp_flags)
> >>>> +{
> >>>> + return !(gfp_flags & __GFP_NO_CODETAG);
> >>>> +}
> >>>> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
> >>>> {
> >>>> WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref),
> >>>> @@ -163,11 +180,12 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref)
> >>>> {
> >>>> WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> >>>> }
> >>>> -void alloc_tag_add_early_pfn(unsigned long pfn);
> >>>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags);
> >>>> #else
> >>>> static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
> >>>> static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
> >>>> -static inline void alloc_tag_add_early_pfn(unsigned long pfn) {}
> >>>> +static inline void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags) {}
> >>>> +static inline bool should_record_early_pfn(gfp_t gfp_flags) { return false; }
> >>>> #endif
> >>>>
> >>>> /* Caller should verify both ref and tag to be valid */
> >>>> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> >>>> index ed1bdcf1f8ab..a00dc5a867e4 100644
> >>>> --- a/lib/alloc_tag.c
> >>>> +++ b/lib/alloc_tag.c
> >>>> @@ -766,45 +766,49 @@ static __init bool need_page_alloc_tagging(void)
> >>>> * Some pages are allocated before page_ext becomes available, leaving
> >>>> * their codetag uninitialized. Track these early PFNs so we can clear
> >>>> * their codetag refs later to avoid warnings when they are freed.
> >>>> - *
> >>>> - * Early allocations include:
> >>>> - * - Base allocations independent of CPU count
> >>>> - * - Per-CPU allocations (e.g., CPU hotplug callbacks during smp_init,
> >>>> - * such as trace ring buffers, scheduler per-cpu data)
> >>>> - *
> >>>> - * For simplicity, we fix the size to 8192.
> >>>> - * If insufficient, a warning will be triggered to alert the user.
> >>>> - *
> >>>> - * TODO: Replace fixed-size array with dynamic allocation using
> >>>> - * a GFP flag similar to ___GFP_NO_OBJ_EXT to avoid recursion.
> >>>> */
> >>>> -#define EARLY_ALLOC_PFN_MAX 8192
> >>>> +#define EARLY_PFNS_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
> >>>>
> >>>> -static unsigned long early_pfns[EARLY_ALLOC_PFN_MAX] __initdata;
> >>>> -static atomic_t early_pfn_count __initdata = ATOMIC_INIT(0);
> >>>> +static struct page *early_pfn_current __initdata;
> >>> So, instead of using early_pfn_current as the head page and then
> >>> page->lru.next directly, I would suggest having "struct llist_head
> >>> early_pfn_head;" as the list head and either using page->pcp_llist to
> >>> link the pages or even better, adding a "struct llist_node llist;"
> >>> into the union containing "struct list_head lru;". Note that
> >>> llist_add() uses try_cmpxchg(), so the link addition is atomic.
> >> Adding a struct llist_node into the page union only for the
> >> CONFIG_MEM_ALLOC_PROFILING_DEBUG feature,
> >>
> >> which may not be quite appropriate from a design perspective.
> >>
> >>> And you should probably use folios rather than pages to follow the new trends.
> >>>
> >>>> -static void __init __alloc_tag_add_early_pfn(unsigned long pfn)
> >>>> +static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >>>> {
> >>>> - int old_idx, new_idx;
> >>>> + struct page *page;
> >>>> + unsigned long idx;
> >>>>
> >>>> do {
> >>>> - old_idx = atomic_read(&early_pfn_count);
> >>>> - if (old_idx >= EARLY_ALLOC_PFN_MAX) {
> >>>> - pr_warn_once("Early page allocations before page_ext init exceeded EARLY_ALLOC_PFN_MAX (%d)\n",
> >>>> - EARLY_ALLOC_PFN_MAX);
> >>>> - return;
> >>>> + page = READ_ONCE(early_pfn_current);
> >>>> + if (!page || READ_ONCE(page->private) >= EARLY_PFNS_PER_PAGE) {
> >>>> + gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
> >>>> + struct page *new = alloc_page(gfp | __GFP_NO_CODETAG);
> >>>> +
> >>>> + if (!new) {
> >>>> + pr_warn_once("early PFN tracking page allocation failed\n");
> >>>> + return;
> >>>> + }
> >>>> + new->lru.next = (struct list_head *)page;
> >>>> + if (cmpxchg(&early_pfn_current, page, new) != page) {
> >>>> + __free_page(new);
> >>>> + continue;
> >>>> + }
> >>
> >> I also considered using llist_head initially. like this
> >>
> >> static LLIST_HEAD(early_pfn_pages);
> >>
> >> static void __init __alloc_tag_add_early_pfn(unsigned long pfn, gfp_t
> >> gfp_flags)
> >> {
> >> struct page *page;
> >> unsigned long idx;
> >>
> >> do {
> >> page = llist_first(&early_pfn_pages);
> >> if (!page || READ_ONCE(page->private) >=
> >> EARLY_PFNS_PER_PAGE) {
> >> gfp_t gfp = gfp_flags & ~__GFP_DIRECT_RECLAIM;
> >> struct page *new = alloc_page(gfp |
> >> __GFP_NO_CODETAG);
> >>
> >> if (!new) {
> >> pr_warn_once("early PFN tracking page
> >> allocation failed\n");
> >> return;
> >> }
> >> new->private = 0;
> >> llist_add(&new->pcp_llist, &early_pfn_pages);
> >> page = new;
> >> }
> >> idx = READ_ONCE(page->private);
> >> if (idx >= EARLY_PFNS_PER_PAGE)
> >> continue;
> >> if (cmpxchg(&page->private, idx, idx + 1) == idx)
> >> break;
> >> } while (1);
> >>
> >> ((unsigned long *)page_address(page))[idx] = pfn;
> >> }
> >>
> >>
> >> The problem is that llist_add() retries internally until it succeeds.
> >>
> >> When multiple CPUs concurrently see that the head page is NULL or full ,
> >>
> >> they each allocate a new tracking page and all insertions succeed:
> >>
> >> CPU0: head page A is full or NULL, allocates B, llist_add(B) → head=B→A
> >>
> >> CPU1: head page A is full or NULL, allocates C, llist_add(C) → head=C→B→A
> >>
> >> CPU0: claims slot 0 in B
> >>
> >> CPU1: claims slot 0 in C
> >>
> >> Now the head is C. Once C fills up, a new page D is allocated and
> >> becomes the new head. But B still has ~511 unused slots and will never
> >>
> >> be the head again — we only ever look at llist_first(). Those slots are
> >> permanently wasted.
> >>
> >> That's why I used struct page *early_pfn_current with direct cmpxchg —
> >> the loser frees its page and retries, avoiding the waste.
> >>
> >> That said, I may be missing something. Is there a way to use llist that
> >> avoids this? I'd appreciate your thoughts.
> > No, I think you are right, that retry logic in llist_add() does not
> > work for our case. I think the code you have works, but the way it
> > uses page attributes is kinda hacky and confusing. If we have to open
> > code the linked list handling then maybe just do this:
> >
> > struct pfn_arr_hdr {
> > struct pfn_arr *next;
> > atomic_t count;
> > };
> >
> > #define PFN_ARR_SIZE ((PAGE_SIZE - sizeof(struct pfn_arr_hdr)) /
> > sizeof(unsigned long))
> >
> > struct pfn_arr {
> > struct pfn_arr_hdr hdr;
> > unsigned long pfns[PFN_ARR_SIZE];
> > };
> >
> > So, instead of allocating pages, you use kmalloc() to allocate struct
> > pfn_arr and fill pfn_arr.pfns until it's full using pfn_arr.hdr.count
> > to track the array size. When current pfn_arr is full, you allocate a
> > new pfn_arr and link them via pfn_arr.hdr.next atomically, the way you
> > are doing with pages. That seems cleaner to me. WDYT?
>
> Actually, using kmalloc() would not be safe either, as page allocations
> can occur before kmalloc() becomes available.
I see.
>
> For example:
>
> memblock_free_all() ← buddy allocator ready
>
> mem_init()
>
> preallocate_vmalloc_pages()
>
> p4d_alloc() / pud_alloc()
>
> alloc_pages() -> triggers alloc_tag_add_early_pfn()
>
> kmem_cache_init() -> kmalloc becomes available
>
> Using alloc_page() is the safest choice — the very fact that we reach
> alloc_tag_add_early_pfn() means a page allocation via the buddy
>
> allocator has already succeeded, so the buddy allocator is guaranteed to
> be ready.
>
> How about keeping alloc_page() but mapping the page body to a proper struct?
>
> struct pfn_pool {
>
> struct pfn_pool *next;
>
> atomic_t count;
>
> unsigned long pfns[];
>
> };
>
> The first few bytes of the page hold metadata (next pointer and slot
> count), the remainder stores PFNs.
>
> WDYT?
Yeah, that sounds good to me.
>
> Thanks.
>
> Best regards.
>
> Hao
>
> >
> >>
> >>>> + page = new;
> >>>> }
> >>>> - new_idx = old_idx + 1;
> >>>> - } while (!atomic_try_cmpxchg(&early_pfn_count, &old_idx, new_idx));
> >>>> + idx = READ_ONCE(page->private);
> >>>> + if (idx >= EARLY_PFNS_PER_PAGE)
> >>>> + continue;
> >>>> + if (cmpxchg(&page->private, idx, idx + 1) != idx)
> >>>> + continue;
> >>>> + break;
> >>> nit: This would read more easily:
> >>>
> >>> if (cmpxchg(&page->private, idx, idx + 1) == idx)
> >>> break;
> >> will fix in the next version.
> >>>> + } while (1);
> >>>>
> >>>> - early_pfns[old_idx] = pfn;
> >>>> + ((unsigned long *)page_address(page))[idx] = pfn;
> >>>> }
> >>>>
> >>>> -typedef void alloc_tag_add_func(unsigned long pfn);
> >>>> +typedef void alloc_tag_add_func(unsigned long pfn, gfp_t gfp_flags);
> >>>> static alloc_tag_add_func __rcu *alloc_tag_add_early_pfn_ptr __refdata =
> >>>> RCU_INITIALIZER(__alloc_tag_add_early_pfn);
> >>>>
> >>>> -void alloc_tag_add_early_pfn(unsigned long pfn)
> >>>> +void alloc_tag_add_early_pfn(unsigned long pfn, gfp_t gfp_flags)
> >>>> {
> >>>> alloc_tag_add_func *alloc_tag_add;
> >>>>
> >>>> @@ -814,13 +818,14 @@ void alloc_tag_add_early_pfn(unsigned long pfn)
> >>>> rcu_read_lock();
> >>>> alloc_tag_add = rcu_dereference(alloc_tag_add_early_pfn_ptr);
> >>>> if (alloc_tag_add)
> >>>> - alloc_tag_add(pfn);
> >>>> + alloc_tag_add(pfn, gfp_flags);
> >>>> rcu_read_unlock();
> >>>> }
> >>>>
> >>>> static void __init clear_early_alloc_pfn_tag_refs(void)
> >>>> {
> >>>> - unsigned int i;
> >>>> + struct page *page, *next;
> >>>> + unsigned long i;
> >>>>
> >>>> if (static_key_enabled(&mem_profiling_compressed))
> >>>> return;
> >>>> @@ -829,37 +834,42 @@ static void __init clear_early_alloc_pfn_tag_refs(void)
> >>>> /* Make sure we are not racing with __alloc_tag_add_early_pfn() */
> >>>> synchronize_rcu();
> >>>>
> >>>> - for (i = 0; i < atomic_read(&early_pfn_count); i++) {
> >>>> - unsigned long pfn = early_pfns[i];
> >>>> -
> >>>> - if (pfn_valid(pfn)) {
> >>>> - struct page *page = pfn_to_page(pfn);
> >>>> - union pgtag_ref_handle handle;
> >>>> - union codetag_ref ref;
> >>>> -
> >>>> - if (get_page_tag_ref(page, &ref, &handle)) {
> >>>> - /*
> >>>> - * An early-allocated page could be freed and reallocated
> >>>> - * after its page_ext is initialized but before we clear it.
> >>>> - * In that case, it already has a valid tag set.
> >>>> - * We should not overwrite that valid tag with CODETAG_EMPTY.
> >>>> - *
> >>>> - * Note: there is still a small race window between checking
> >>>> - * ref.ct and calling set_codetag_empty(). We accept this
> >>>> - * race as it's unlikely and the extra complexity of atomic
> >>>> - * cmpxchg is not worth it for this debug-only code path.
> >>>> - */
> >>>> - if (ref.ct) {
> >>>> + for (page = early_pfn_current; page; page = next) {
> >>>> + for (i = 0; i < page->private; i++) {
> >>>> + unsigned long pfn = ((unsigned long *)page_address(page))[i];
> >>> This page_address() can be done in the outer loop since the page does
> >>> not change inside the inner loop.
> >> will fix in the next version.
> >>>> +
> >>>> + if (pfn_valid(pfn)) {
> >>>> + union pgtag_ref_handle handle;
> >>>> + union codetag_ref ref;
> >>>> +
> >>>> + if (get_page_tag_ref(pfn_to_page(pfn), &ref, &handle)) {
> >>>> + /*
> >>>> + * An early-allocated page could be freed and reallocated
> >>>> + * after its page_ext is initialized but before we clear it.
> >>>> + * In that case, it already has a valid tag set.
> >>>> + * We should not overwrite that valid tag
> >>>> + * with CODETAG_EMPTY.
> >>>> + *
> >>>> + * Note: there is still a small race window between checking
> >>>> + * ref.ct and calling set_codetag_empty(). We accept this
> >>>> + * race as it's unlikely and the extra complexity of atomic
> >>>> + * cmpxchg is not worth it for this debug-only code path.
> >>>> + */
> >>>> + if (ref.ct) {
> >>>> + put_page_tag_ref(handle);
> >>>> + continue;
> >>>> + }
> >>>> +
> >>>> + set_codetag_empty(&ref);
> >>>> + update_page_tag_ref(handle, &ref);
> >>>> put_page_tag_ref(handle);
> >>>> - continue;
> >>>> }
> >>>> -
> >>>> - set_codetag_empty(&ref);
> >>>> - update_page_tag_ref(handle, &ref);
> >>>> - put_page_tag_ref(handle);
> >>>> }
> >>>> }
> >>>>
> >>>> + next = (struct page *)page->lru.next;
> >>>> + clear_page_tag_ref(page);
> >>>> + __free_page(page);
> >>>> }
> >>>> }
> >>>> #else /* !CONFIG_MEM_ALLOC_PROFILING_DEBUG */
> >>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >>>> index 04494bc2e46f..5b7b234967a5 100644
> >>>> --- a/mm/page_alloc.c
> >>>> +++ b/mm/page_alloc.c
> >>>> @@ -1284,7 +1284,7 @@ void __clear_page_tag_ref(struct page *page)
> >>>> /* Should be called only if mem_alloc_profiling_enabled() */
> >>>> static noinline
> >>>> void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> >>>> - unsigned int nr)
> >>>> + unsigned int nr, gfp_t gfp_flags)
> >>>> {
> >>>> union pgtag_ref_handle handle;
> >>>> union codetag_ref ref;
> >>>> @@ -1294,21 +1294,24 @@ void __pgalloc_tag_add(struct page *page, struct task_struct *task,
> >>>> update_page_tag_ref(handle, &ref);
> >>>> put_page_tag_ref(handle);
> >>>> } else {
> >>>> - /*
> >>>> - * page_ext is not available yet, record the pfn so we can
> >>>> - * clear the tag ref later when page_ext is initialized.
> >>>> - */
> >>>> - alloc_tag_add_early_pfn(page_to_pfn(page));
> >>>> +
> >>>> if (task->alloc_tag)
> >>>> alloc_tag_set_inaccurate(task->alloc_tag);
> >>>> +
> >>>> + /*
> >>>> + * page_ext is not available yet, record the pfn so the
> >>>> + * tag ref can be cleared later when page_ext is initialized.
> >>>> + */
> >>>> + if (should_record_early_pfn(gfp_flags))
> >>>> + alloc_tag_add_early_pfn(page_to_pfn(page), gfp_flags);
> >>> Any reason for changing the order of alloc_tag_add_early_pfn() and
> >>> alloc_tag_set_inaccurate()? I liked the previous version where we
> >>> explain why this is done at the very beginning of the block. You could
> >>> also call should_record_early_pfn() from inside
> >>> alloc_tag_add_early_pfn(), which would keep both
> >>> should_record_early_pfn() and __GFP_NO_CODETAG definitions local to
> >>> alloc_tag.c.
> >>>
> >> will fix in the next version.
> >>
> >> Thanks
> >>
> >> Best regards.
> >>
> >> Hao
> >>
> >>>> }
> >>>> }
> >>>>
> >>>> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> >>>> - unsigned int nr)
> >>>> + unsigned int nr, gfp_t gfp_flags)
> >>>> {
> >>>> if (mem_alloc_profiling_enabled())
> >>>> - __pgalloc_tag_add(page, task, nr);
> >>>> + __pgalloc_tag_add(page, task, nr, gfp_flags);
> >>>> }
> >>>>
> >>>> /* Should be called only if mem_alloc_profiling_enabled() */
> >>>> @@ -1341,7 +1344,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
> >>>> #else /* CONFIG_MEM_ALLOC_PROFILING */
> >>>>
> >>>> static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
> >>>> - unsigned int nr) {}
> >>>> + unsigned int nr, gfp_t gfp_flags) {}
> >>>> static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
> >>>> static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
> >>>>
> >>>> @@ -1896,7 +1899,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
> >>>>
> >>>> set_page_owner(page, order, gfp_flags);
> >>>> page_table_check_alloc(page, order);
> >>>> - pgalloc_tag_add(page, current, 1 << order);
> >>>> + pgalloc_tag_add(page, current, 1 << order, gfp_flags);
> >>>> }
> >>>>
> >>>> static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
> >>>> --
> >>>> 2.25.1
> >>>>
Thread overview: 7+ messages
2026-04-23 8:37 [PATCH v3] mm/alloc_tag: replace fixed-size early PFN array with dynamic linked list Hao Ge
2026-04-27 16:02 ` Suren Baghdasaryan
2026-04-27 18:23 ` Suren Baghdasaryan
2026-04-28 3:00 ` Hao Ge
2026-04-29 5:47 ` Suren Baghdasaryan
2026-04-29 7:03 ` Hao Ge
2026-04-29 18:20 ` Suren Baghdasaryan