From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com, linux-mm@kvack.org, david@kernel.org,
willy@infradead.org, surenb@google.com, hannes@cmpxchg.org,
ljs@kernel.org, ziy@nvidia.com, usama.arif@linux.dev,
fvdl@google.com, Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 06/40] mm: page_alloc: async evacuation of stolen movable pageblocks
Date: Wed, 20 May 2026 10:59:12 -0400 [thread overview]
Message-ID: <20260520150018.2491267-7-riel@surriel.com> (raw)
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
When the page allocator steals a movable pageblock for unmovable or
reclaimable allocations (via try_to_claim_block), the remaining movable
pages in that block can prevent future unmovable/reclaimable allocations
from being concentrated in fewer pageblocks, leading to long-term memory
fragmentation.
Add a lightweight asynchronous evacuation mechanism: when a movable
pageblock is claimed for unmovable/reclaimable use, queue a work item to
migrate the remaining movable pages out. This allows future
unmovable/reclaimable allocations to be satisfied from the now-evacuated
block, keeping those allocation types concentrated and reducing
fragmentation.
Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
include/linux/mmzone.h | 4 +
mm/page_alloc.c | 223 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 227 insertions(+)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 13e29b2ebb86..90498bbbf60b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -22,6 +22,7 @@
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
+#include <linux/irq_work_types.h>
#include <linux/zswap.h>
#include <linux/sizes.h>
#include <asm/page.h>
@@ -1540,6 +1541,9 @@ typedef struct pglist_data {
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
bool proactive_compact_trigger;
+ struct workqueue_struct *evacuate_wq;
+ struct llist_head evacuate_pending;
+ struct irq_work evacuate_irq_work;
#endif
/*
* This is a per-node reserve of pages that are not available
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e01e58aca54..0f3d734bd296 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/interrupt.h>
+#include <linux/irq_work.h>
#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
@@ -51,6 +52,7 @@
#include <linux/lockdep.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
+#include <linux/workqueue.h>
#include <linux/delayacct.h>
#include <linux/cacheinfo.h>
#include <linux/pgalloc_tag.h>
@@ -59,6 +61,10 @@
#include "shuffle.h"
#include "page_reporting.h"
+#ifdef CONFIG_COMPACTION
+static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn);
+#endif
+
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
@@ -2428,6 +2434,13 @@ try_to_claim_block(struct zone *zone, struct page *page,
int free_pages, movable_pages, alike_pages;
unsigned long start_pfn;
+ /*
+ * Don't steal from pageblocks that are isolated for
+ * evacuation -- that would undo the work in progress.
+ */
+ if (get_pageblock_isolate(page))
+ return NULL;
+
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
unsigned int nr_added;
@@ -2473,6 +2486,18 @@ try_to_claim_block(struct zone *zone, struct page *page,
page_group_by_mobility_disabled) {
__move_freepages_block(zone, start_pfn, block_type, start_type);
set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
+#ifdef CONFIG_COMPACTION
+ /*
+ * A movable pageblock was just claimed for unmovable or
+ * reclaimable use. Queue async evacuation of the remaining
+ * movable pages so future unmovable/reclaimable allocations
+ * can stay concentrated in fewer pageblocks.
+ */
+ if (block_type == MIGRATE_MOVABLE &&
+ (start_type == MIGRATE_UNMOVABLE ||
+ start_type == MIGRATE_RECLAIMABLE))
+ queue_pageblock_evacuate(zone, start_pfn);
+#endif
return __rmqueue_smallest(zone, order, start_type);
}
@@ -7184,6 +7209,204 @@ void __init page_alloc_sysctl_init(void)
register_sysctl_init("vm", page_alloc_sysctl_table);
}
+#ifdef CONFIG_COMPACTION
+/*
+ * Pageblock evacuation: asynchronously migrate movable pages out of
+ * pageblocks that were stolen for unmovable/reclaimable allocations.
+ * This keeps unmovable/reclaimable allocations concentrated in fewer
+ * pageblocks, reducing long-term fragmentation.
+ *
+ * Uses a global pool of 64 pre-allocated work items (~3.5KB total)
+ * and a per-pgdat workqueue to keep migration node-local.
+ */
+
+struct evacuate_item { /* one queued request to evacuate a single pageblock */
+ struct work_struct work; /* set up with evacuate_work_fn() and queued on pgdat->evacuate_wq */
+ struct zone *zone; /* zone passed to queue_pageblock_evacuate() */
+ unsigned long start_pfn; /* start PFN of the pageblock to evacuate */
+ struct llist_node free_node; /* links the item on evacuate_freelist or on pgdat->evacuate_pending */
+};
+
+#define NR_EVACUATE_ITEMS 64 /* number of items in the static pool below */
+static struct evacuate_item evacuate_pool[NR_EVACUATE_ITEMS]; /* statically allocated backing storage for all items */
+static struct llist_head evacuate_freelist; /* pool items that are currently unused */
+
+/* Serializes llist_del_first() callers on evacuate_freelist. */
+static DEFINE_RAW_SPINLOCK(evacuate_freelist_lock);
+
+/*
+ * Take one unused item from the global pool.
+ *
+ * The pool is shared by every zone and node, so nothing outside this
+ * function serializes concurrent callers. llist_del_first() may run
+ * concurrently with llist_add(), but several concurrent llist_del_first()
+ * callers are exposed to an ABA problem and need a lock (see the locking
+ * table in <linux/llist.h>). evacuate_item_free() only does llist_add()
+ * and therefore does not take the lock.
+ *
+ * Evacuation is best effort, so the lock is only tried: on contention the
+ * caller is told "no item", exactly as if the pool were empty. This also
+ * keeps the function safe to call from contexts that must not spin.
+ *
+ * Returns an item, or NULL if the pool is empty or the lock is contended.
+ */
+static struct evacuate_item *evacuate_item_alloc(void)
+{
+	struct llist_node *node;
+	unsigned long flags;
+
+	if (!raw_spin_trylock_irqsave(&evacuate_freelist_lock, flags))
+		return NULL;
+	node = llist_del_first(&evacuate_freelist);
+	raw_spin_unlock_irqrestore(&evacuate_freelist_lock, flags);
+
+	if (!node)
+		return NULL;
+	return container_of(node, struct evacuate_item, free_node);
+}
+
+/* Put @item back on the global freelist so it can be handed out again. */
+static void evacuate_item_free(struct evacuate_item *item)
+{
+	struct llist_node *node = &item->free_node;
+
+	llist_add(node, &evacuate_freelist);
+}
+
+/*
+ * Best-effort evacuation of the pageblock starting at @start_pfn in @zone.
+ *
+ * Pages are isolated in batches; each batch is first offered to
+ * reclaim_clean_pages_from_list() and whatever remains is handed to
+ * migrate_pages() with a target on the zone's own node. The walk gives up
+ * on the first isolation error other than -EAGAIN or on the first
+ * non-zero migrate_pages() result. Pages still on the list at that point
+ * are put back.
+ */
+static void evacuate_pageblock(struct zone *zone, unsigned long start_pfn)
+{
+	struct migration_target_control mtc = {
+		.nid = zone_to_nid(zone),
+		.gfp_mask = GFP_HIGHUSER_MOVABLE,
+	};
+	struct compact_control cc = {
+		.nr_migratepages = 0,
+		.order = -1,
+		.zone = zone,
+		.mode = MIGRATE_ASYNC,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE,
+	};
+	unsigned long block_end = start_pfn + pageblock_nr_pages;
+	unsigned long cursor = start_pfn;
+	int nr_dropped;
+	int err = 0;
+
+	/* A block that is marked movable (again) has nothing to evacuate. */
+	if (get_pageblock_migratetype(pfn_to_page(start_pfn)) == MIGRATE_MOVABLE)
+		return;
+
+	INIT_LIST_HEAD(&cc.migratepages);
+
+	/*
+	 * isolate_migratepages_range() stops once it has gathered
+	 * COMPACT_CLUSTER_MAX pages, so keep going batch by batch until
+	 * the cursor has reached the end of the block.
+	 */
+	while (cursor < block_end || !list_empty(&cc.migratepages)) {
+		if (list_empty(&cc.migratepages)) {
+			/* Start a new batch where the previous one stopped. */
+			cc.nr_migratepages = 0;
+			cc.migrate_pfn = cursor;
+			err = isolate_migratepages_range(&cc, cursor, block_end);
+			if (err != 0 && err != -EAGAIN)
+				break;
+			cursor = cc.migrate_pfn;
+			if (list_empty(&cc.migratepages))
+				break;
+		}
+
+		nr_dropped = reclaim_clean_pages_from_list(zone,
+							   &cc.migratepages);
+		cc.nr_migratepages -= nr_dropped;
+
+		if (!list_empty(&cc.migratepages)) {
+			err = migrate_pages(&cc.migratepages,
+					    alloc_migration_target, NULL,
+					    (unsigned long)&mtc, cc.mode,
+					    MR_COMPACTION, NULL);
+			if (err) {
+				putback_movable_pages(&cc.migratepages);
+				break;
+			}
+		}
+
+		cond_resched();
+	}
+
+	/* Covers the early exit taken when isolation itself failed. */
+	if (!list_empty(&cc.migratepages))
+		putback_movable_pages(&cc.migratepages);
+}
+
+/* Workqueue callback: evacuate the pageblock named by the item, then recycle the item. */
+static void evacuate_work_fn(struct work_struct *work)
+{
+	struct evacuate_item *req;
+
+	req = container_of(work, struct evacuate_item, work);
+	evacuate_pageblock(req->zone, req->start_pfn);
+
+	/* The request is complete; return the item to the pool. */
+	evacuate_item_free(req);
+}
+
+/**
+ * evacuate_irq_work_fn - hand pending evacuation requests to the workqueue
+ * @work: the irq_work embedded in the node's pg_data_t
+ *
+ * Calling queue_work() from inside the page allocator can deadlock,
+ * because it may try to allocate memory while locks are already held.
+ * Requests are therefore parked on pgdat->evacuate_pending and the
+ * queue_work() calls are issued from this irq_work callback instead.
+ */
+static void evacuate_irq_work_fn(struct irq_work *work)
+{
+	pg_data_t *pgdat = container_of(work, pg_data_t, evacuate_irq_work);
+	struct evacuate_item *req, *tmp;
+	struct llist_node *batch;
+
+	if (!pgdat->evacuate_wq)
+		return;
+
+	/*
+	 * Detach the whole pending list at once, then queue each entry.
+	 * The _safe iterator is needed because evacuate_work_fn() may run
+	 * right away on another CPU and free the item before this loop
+	 * has followed its next pointer.
+	 */
+	batch = llist_del_all(&pgdat->evacuate_pending);
+	llist_for_each_entry_safe(req, tmp, batch, free_node) {
+		INIT_WORK(&req->work, evacuate_work_fn);
+		queue_work(pgdat->evacuate_wq, &req->work);
+	}
+}
+
+/**
+ * queue_pageblock_evacuate - request background evacuation of a pageblock
+ * @zone: the zone containing the pageblock
+ * @pfn: start PFN of the pageblock (must be pageblock-aligned)
+ *
+ * Called by the page allocator once a movable pageblock has been claimed
+ * for unmovable or reclaimable allocations. The request is placed on the
+ * node's pending list and an irq_work is raised, so that the actual
+ * queue_work() call happens outside the allocator's lock context.
+ *
+ * Best effort: nothing is queued if the node's irq_work has no callback
+ * installed or if no work item is available.
+ */
+static void queue_pageblock_evacuate(struct zone *zone, unsigned long pfn)
+{
+	pg_data_t *pgdat = zone->zone_pgdat;
+	struct evacuate_item *req;
+
+	/* No callback means evacuation was never set up for this node. */
+	if (!pgdat->evacuate_irq_work.func)
+		return;
+
+	req = evacuate_item_alloc();
+	if (!req)
+		return;
+
+	req->start_pfn = pfn;
+	req->zone = zone;
+
+	llist_add(&req->free_node, &pgdat->evacuate_pending);
+	irq_work_queue(&pgdat->evacuate_irq_work);
+}
+
+/*
+ * Boot-time setup for pageblock evacuation.
+ *
+ * Puts every entry of evacuate_pool on the global freelist, then creates
+ * one workqueue per online node. A node whose workqueue cannot be created
+ * is left without an irq_work callback, which queue_pageblock_evacuate()
+ * treats as "evacuation disabled for this node".
+ *
+ * Always returns 0; a failed workqueue allocation is only warned about.
+ */
+static int __init pageblock_evacuate_init(void)
+{
+	int nid, i;
+
+	/* Initialize the global freelist of work items */
+	init_llist_head(&evacuate_freelist);
+	for (i = 0; i < NR_EVACUATE_ITEMS; i++)
+		llist_add(&evacuate_pool[i].free_node, &evacuate_freelist);
+
+	/* Create a per-pgdat workqueue */
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		/*
+		 * alloc_workqueue() takes a printf-style format itself, so
+		 * pass the format and argument directly instead of
+		 * pre-formatting the name and using it as a format string.
+		 */
+		pgdat->evacuate_wq = alloc_workqueue("kevacuate/%d",
+						     WQ_MEM_RECLAIM, 1, nid);
+		if (!pgdat->evacuate_wq) {
+			pr_warn("Failed to create evacuate workqueue for node %d\n", nid);
+			continue;
+		}
+
+		/* Installing the callback last is what enables queueing. */
+		init_llist_head(&pgdat->evacuate_pending);
+		init_irq_work(&pgdat->evacuate_irq_work,
+			      evacuate_irq_work_fn);
+	}
+
+	return 0;
+}
+late_initcall(pageblock_evacuate_init);
+#endif /* CONFIG_COMPACTION */
+
#ifdef CONFIG_CONTIG_ALLOC
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
--
2.54.0
next prev parent reply other threads:[~2026-05-20 15:00 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-20 14:59 [RFC PATCH 00/40] mm: reliable 1GB page allocation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 01/40] mm: page_alloc: replace pageblock_flags bitmap with struct pageblock_data Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 02/40] mm: page_alloc: per-cpu pageblock buddy allocator Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 03/40] mm: page_alloc: split-path PCP free with local-trylock + remote-llist Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 04/40] mm: mm_init: fix zone assignment for pages in unavailable ranges Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 05/40] mm: page_alloc: remove watermark boost mechanism Rik van Riel
2026-05-26 14:02 ` Usama Arif
2026-05-27 15:41 ` Rik van Riel
2026-05-20 14:59 ` Rik van Riel [this message]
2026-05-20 14:59 ` [RFC PATCH 07/40] mm: page_alloc: track actual page contents in pageblock flags Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 08/40] mm: page_alloc: superpageblock metadata for 1GB anti-fragmentation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 09/40] mm: page_alloc: support superpageblock resize for memory hotplug Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 10/40] mm: page_alloc: add superpageblock fullness lists for allocation steering Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 11/40] mm: page_alloc: steer pageblock stealing to tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 12/40] mm: page_alloc: steer movable allocations to fullest clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 13/40] mm: page_alloc: extract claim_whole_block from try_to_claim_block Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 14/40] mm: page_alloc: add per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 15/40] mm: page_alloc: add background superpageblock defragmentation worker Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 16/40] mm: compaction: walk per-superpageblock free lists for migration targets Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 17/40] mm: page_alloc: superpageblock-aware contiguous and higher order allocation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 18/40] mm: page_alloc: prevent atomic allocations from tainting clean SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 19/40] mm: page_alloc: aggressively pack non-movable allocs in tainted SPBs on large systems Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 20/40] mm: page_alloc: prefer reclaim over tainting clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 21/40] mm: page_alloc: adopt partial pageblocks from tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 22/40] mm: page_alloc: add CONFIG_DEBUG_VM sanity checks for SPB counters Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 23/40] mm: page_alloc: targeted evacuation and dynamic reserves for tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 24/40] mm: page_alloc: prevent UNMOVABLE/RECLAIMABLE mixing in pageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 25/40] mm: trigger deferred SPB evac when atomic allocs would taint a clean SPB Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 26/40] mm: page_alloc: refuse fragmenting fallback for callers with cheap fallback Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 27/40] mm: page_alloc: cross-migratetype buddy borrow within tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 28/40] mm: page_alloc: drive slab shrink from SPB anti-fragmentation pressure Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 29/40] mm: page_reporting: walk per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 30/40] mm: show_mem: collect migratetype letters from per-superpageblock lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 31/40] mm: page_alloc: per-(zone, order, mt) PASS_1 hint cache Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 32/40] mm: debug: prevent infinite recursion in dump_page() with CMA Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 33/40] PM: hibernate: walk per-superpageblock free lists in mark_free_pages Rik van Riel
2026-05-20 18:19 ` Rafael J. Wysocki
2026-05-20 14:59 ` [RFC PATCH 34/40] btrfs: allocate eb-attached btree pages as movable Rik van Riel
2026-05-20 17:47 ` Boris Burkov
2026-05-23 15:58 ` David Sterba
2026-05-24 1:43 ` Rik van Riel
2026-05-24 19:59 ` Matthew Wilcox
2026-05-25 6:57 ` Christoph Hellwig
2026-05-20 14:59 ` [RFC PATCH 35/40] mm: page_alloc: refuse best-effort high-order allocs servable at lower orders Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 36/40] mm: page_alloc: set ALLOC_NOFRAGMENT on alloc_frozen_pages_nolock_noprof Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 37/40] mm: page_alloc: move spb_get_category and spb_tainted_reserve to mmzone.h Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 38/40] mm: compaction: skip empty tainted superpageblocks as migration source Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 39/40] mm: compaction: respect tainted SPB reserve in destination selection Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 40/40] mm: page_alloc: SPB tracepoint instrumentation [DO-NOT-MERGE] Rik van Riel
2026-05-21 5:09 ` kernel test robot
2026-05-21 7:39 ` [syzbot ci] Re: mm: reliable 1GB page allocation syzbot ci
2026-05-22 11:02 ` [RFC PATCH 00/40] " Usama Arif
2026-05-22 13:55 ` Rik van Riel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260520150018.2491267-7-riel@surriel.com \
--to=riel@surriel.com \
--cc=david@kernel.org \
--cc=fvdl@google.com \
--cc=hannes@cmpxchg.org \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=surenb@google.com \
--cc=usama.arif@linux.dev \
--cc=willy@infradead.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.