From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com, linux-mm@kvack.org, david@kernel.org,
willy@infradead.org, surenb@google.com, hannes@cmpxchg.org,
ljs@kernel.org, ziy@nvidia.com, usama.arif@linux.dev,
Rik van Riel <riel@meta.com>, Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 27/45] mm: trigger deferred SPB evacuation when atomic allocs would taint a clean SPB
Date: Thu, 30 Apr 2026 16:20:56 -0400
Message-ID: <20260430202233.111010-28-riel@surriel.com>
In-Reply-To: <20260430202233.111010-1-riel@surriel.com>
From: Rik van Riel <riel@meta.com>
Hook queue_spb_evacuate() into __rmqueue_claim() so that whenever a
non-movable allocation is about to claim a pageblock from an empty or
clean superpageblock as a fallback (i.e. cat_search[c] is not
SB_SEARCH_PREFERRED), a deferred spb_evacuate_for_order() is scheduled
on the zone's pgdat workqueue.
The current allocation still proceeds and taints the clean SPB this
time, but the deferred evacuation creates free pageblocks inside
existing tainted SPBs so the next caller hitting the same trigger can
claim from the tainted pool instead of tainting another clean SPB.
Movable allocations are excluded because their preferred category is
SB_CLEAN; falling back from clean to tainted does not taint anything
new and so does not need the hint.
The trigger is gated by single-flight tracking, a 10ms per-migratetype
throttle, and a tainted-pool precheck inside queue_spb_evacuate(), so
it is safe to fire from this hot path without storming the workqueue.
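In outline, the gate sequence is (a condensed sketch of the real
queue_spb_evacuate() in the diff below, details elided):

	if (time_before(jiffies, spb_evac_last[mt] + HZ / 100))
		return;		/* 10ms throttle per (zone, migratetype) */
	if (test_and_set_bit(mt * NR_PAGE_ORDERS + order, in_flight))
		return;		/* single-flight per (migratetype, order) */
	if (tainted_pool_has_free(zone, order, mt)) {
		clear_bit(mt * NR_PAGE_ORDERS + order, in_flight);
		return;		/* a free page exists; evacuation cannot help */
	}
	/* allocate request, stamp throttle, llist_add + irq_work_queue */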
Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
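[Reviewer note, not part of the commit message: a tiny standalone
sketch of the in-flight bit indexing used by zone->spb_evac_in_flight.
The constants here are illustrative; NR_PAGE_ORDERS and
MIGRATE_PCPTYPES come from the kernel headers and can differ by
config.]

	#include <stdbool.h>
	#include <stdio.h>

	#define NR_PAGE_ORDERS   11	/* assumption: MAX_PAGE_ORDER == 10 */
	#define MIGRATE_PCPTYPES  3	/* UNMOVABLE, MOVABLE, RECLAIMABLE */

	static bool in_flight[MIGRATE_PCPTYPES * NR_PAGE_ORDERS];

	static unsigned int evac_bit(int migratetype, unsigned int order)
	{
		return migratetype * NR_PAGE_ORDERS + order;
	}

	int main(void)
	{
		/* e.g. MIGRATE_RECLAIMABLE (2 in current kernels), order 3 */
		unsigned int bit = evac_bit(2, 3);

		printf("bit %u of %d\n", bit,
		       MIGRATE_PCPTYPES * NR_PAGE_ORDERS);	/* bit 25 of 33 */
		in_flight[bit] = true;	/* set on enqueue ... */
		in_flight[bit] = false;	/* ... cleared by the worker */
		return 0;
	}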
include/linux/mmzone.h | 18 ++++
mm/page_alloc.c | 189 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 206 insertions(+), 1 deletion(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 765e1c5dc365..195a80e2f0ee 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1139,6 +1139,22 @@ struct zone {
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
+
+ /*
+ * Atomic-context SPB evacuation deferral state.
+ *
+ * spb_evac_in_flight: bitmap indexed by
+ * migratetype * NR_PAGE_ORDERS + order, set on enqueue and
+ * cleared by the worker after spb_evacuate_for_order returns.
+ * Provides single-flight gating per (migratetype, order).
+ *
+ * spb_evac_last: jiffies of the last enqueue per migratetype,
+ * used as a 10ms throttle to prevent wakeup storms from
+ * concurrent atomic allocations.
+ */
+ DECLARE_BITMAP(spb_evac_in_flight,
+ MIGRATE_PCPTYPES * NR_PAGE_ORDERS);
+ unsigned long spb_evac_last[MIGRATE_PCPTYPES];
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -1552,6 +1568,8 @@ typedef struct pglist_data {
struct task_struct *kcompactd;
bool proactive_compact_trigger;
struct workqueue_struct *evacuate_wq;
+ struct llist_head spb_evac_pending;
+ struct irq_work spb_evac_irq_work;
#endif
/*
* This is a per-node reserve of pages that are not available
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ceb1284a63ed..f0fdfe8c9a45 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -788,6 +788,8 @@ static struct page *spb_try_alloc_contig(struct zone *zone,
gfp_t gfp_mask);
static bool spb_evacuate_for_order(struct zone *zone, unsigned int order,
int migratetype);
+static void queue_spb_evacuate(struct zone *zone, unsigned int order,
+ int migratetype);
#else
static inline void spb_maybe_start_defrag(struct superpageblock *sb) {}
static inline bool spb_needs_defrag(struct superpageblock *sb) { return false; }
@@ -802,6 +804,8 @@ static inline bool spb_evacuate_for_order(struct zone *zone, unsigned int order,
{
return false;
}
+static inline void queue_spb_evacuate(struct zone *zone, unsigned int order,
+ int migratetype) {}
#endif
static void spb_update_list(struct superpageblock *sb)
@@ -3784,6 +3788,18 @@ __rmqueue_claim(struct zone *zone, int order, int start_migratetype,
if (!page)
continue;
+ /*
+ * About to claim from an empty or clean superpageblock
+ * for a non-movable allocation -- this taints a fresh
+ * SPB. Defer an evacuation pass over the tainted pool
+ * so subsequent allocations can reclaim freed
+ * pageblocks instead of repeating this fallback.
+ */
+ if (cat_search[c] != SB_SEARCH_PREFERRED &&
+ start_migratetype != MIGRATE_MOVABLE)
+ queue_spb_evacuate(zone, order,
+ start_migratetype);
+
page = try_to_claim_block(zone, page, current_order,
order, start_migratetype,
fallback_mt, alloc_flags,
@@ -8728,6 +8744,168 @@ static void evacuate_pageblock(struct zone *zone, unsigned long start_pfn,
putback_movable_pages(&cc.migratepages);
}
+/*
+ * Atomic-context SPB evacuation deferral.
+ *
+ * When an atomic allocation in __rmqueue_claim is about to taint a
+ * clean superpageblock because the tainted pool has no free page at
+ * the requested (order, migratetype), schedule a deferred call to
+ * spb_evacuate_for_order. That frees pageblocks inside tainted SPBs so
+ * subsequent allocations can claim them instead of tainting more clean
+ * SPBs.
+ *
+ * Two-step deferral mirrors the pageblock-evacuate path: irq_work to
+ * leave allocator lock context, then queue_work to reach process
+ * context where spb_evacuate_for_order can sleep in migrate_pages.
+ */
+
+struct spb_evac_request {
+ struct work_struct work;
+ struct zone *zone;
+ unsigned int order;
+ int migratetype;
+ struct llist_node free_node;
+};
+
+#define NR_SPB_EVAC_REQUESTS 64
+static struct spb_evac_request spb_evac_pool[NR_SPB_EVAC_REQUESTS];
+static struct llist_head spb_evac_freelist;
+
+static struct spb_evac_request *spb_evac_request_alloc(void)
+{
+ struct llist_node *node;
+
+ node = llist_del_first(&spb_evac_freelist);
+ if (!node)
+ return NULL;
+ return container_of(node, struct spb_evac_request, free_node);
+}
+
+static void spb_evac_request_free(struct spb_evac_request *req)
+{
+ llist_add(&req->free_node, &spb_evac_freelist);
+}
+
+static void spb_evac_work_fn(struct work_struct *work)
+{
+ struct spb_evac_request *req = container_of(work,
+ struct spb_evac_request,
+ work);
+ struct zone *zone = req->zone;
+ unsigned int order = req->order;
+ int mt = req->migratetype;
+
+ spb_evacuate_for_order(zone, order, mt);
+
+ /*
+ * Clearing the in-flight bit lets a future caller hitting the
+ * same (mt, order) re-enqueue evacuation. Ordering between this
+ * worker's SPB state changes and the future caller's
+ * tainted_pool_has_free walk is provided by zone->lock taken
+ * inside spb_evacuate_for_order and by the future caller.
+ */
+ clear_bit(mt * NR_PAGE_ORDERS + order, zone->spb_evac_in_flight);
+ spb_evac_request_free(req);
+}
+
+static void spb_evac_irq_work_fn(struct irq_work *work)
+{
+ pg_data_t *pgdat = container_of(work, pg_data_t,
+ spb_evac_irq_work);
+ struct llist_node *pending;
+ struct spb_evac_request *req, *next;
+
+ if (!pgdat->evacuate_wq)
+ return;
+
+ pending = llist_del_all(&pgdat->spb_evac_pending);
+ llist_for_each_entry_safe(req, next, pending, free_node) {
+ INIT_WORK(&req->work, spb_evac_work_fn);
+ queue_work(pgdat->evacuate_wq, &req->work);
+ }
+}
+
+/*
+ * Walk tainted SPBs to check whether any has a free page at the given
+ * order and migratetype. When this returns true, a clean-SPB claim is
+ * not pool depletion but a try_to_claim_block over-rejection: skip the
+ * deferred evacuation since it cannot help.
+ */
+static bool tainted_pool_has_free(struct zone *zone, unsigned int order,
+ int migratetype)
+{
+ struct superpageblock *sb;
+ int full;
+
+ lockdep_assert_held(&zone->lock);
+
+ for (full = 0; full < __NR_SB_FULLNESS; full++) {
+ list_for_each_entry(sb, &zone->spb_lists[SB_TAINTED][full],
+ list) {
+ struct free_area *fa = &sb->free_area[order];
+
+ if (fa->nr_free &&
+ !list_empty(&fa->free_list[migratetype]))
+ return true;
+ }
+ }
+ return false;
+}
+
+/**
+ * queue_spb_evacuate - schedule deferred SPB evacuation from atomic context
+ * @zone: zone that just failed to find a free page in the tainted pool
+ * @order: requested allocation order
+ * @migratetype: requested migratetype (UNMOVABLE or RECLAIMABLE only)
+ *
+ * Caller must hold zone->lock; the tainted-pool walk asserts it.
+ *
+ * Single-flight gated per (zone, migratetype, order) and throttled to
+ * one enqueue per 10ms per (zone, migratetype). Pool exhaustion
+ * silently drops the request; the next caller hitting the same trigger
+ * will retry.
+ */
+static void queue_spb_evacuate(struct zone *zone, unsigned int order,
+ int migratetype)
+{
+ pg_data_t *pgdat = zone->zone_pgdat;
+ struct spb_evac_request *req;
+ unsigned int bit;
+
+ lockdep_assert_held(&zone->lock);
+
+ if (!pgdat->spb_evac_irq_work.func)
+ return;
+ if (order >= NR_PAGE_ORDERS || migratetype >= MIGRATE_PCPTYPES)
+ return;
+
+ if (time_before(jiffies,
+ zone->spb_evac_last[migratetype] + HZ / 100))
+ return;
+
+ bit = migratetype * NR_PAGE_ORDERS + order;
+ if (test_and_set_bit(bit, zone->spb_evac_in_flight))
+ return;
+
+ if (tainted_pool_has_free(zone, order, migratetype)) {
+ clear_bit(bit, zone->spb_evac_in_flight);
+ return;
+ }
+
+ req = spb_evac_request_alloc();
+ if (!req) {
+ clear_bit(bit, zone->spb_evac_in_flight);
+ return;
+ }
+
+ zone->spb_evac_last[migratetype] = jiffies;
+ req->zone = zone;
+ req->order = order;
+ req->migratetype = migratetype;
+ llist_add(&req->free_node, &pgdat->spb_evac_pending);
+ irq_work_queue(&pgdat->spb_evac_irq_work);
+}
+
/*
* Background superpageblock defragmentation.
*
@@ -9202,7 +9380,12 @@ static void spb_maybe_start_defrag(struct superpageblock *sb)
static int __init pageblock_evacuate_init(void)
{
- int nid;
+ int nid, i;
+
+ /* Initialize the global freelist of SPB evacuate requests */
+ init_llist_head(&spb_evac_freelist);
+ for (i = 0; i < NR_SPB_EVAC_REQUESTS; i++)
+ llist_add(&spb_evac_pool[i].free_node, &spb_evac_freelist);
/* Create a per-pgdat workqueue */
for_each_online_node(nid) {
@@ -9217,6 +9400,10 @@ static int __init pageblock_evacuate_init(void)
continue;
}
+ init_llist_head(&pgdat->spb_evac_pending);
+ init_irq_work(&pgdat->spb_evac_irq_work,
+ spb_evac_irq_work_fn);
+
/* Initialize per-superpageblock defrag work structs */
for (z = 0; z < MAX_NR_ZONES; z++) {
struct zone *zone = &pgdat->node_zones[z];
--
2.52.0