All of lore.kernel.org
 help / color / mirror / Atom feed
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com, linux-mm@kvack.org, david@kernel.org,
	willy@infradead.org, surenb@google.com, hannes@cmpxchg.org,
	ljs@kernel.org, ziy@nvidia.com, usama.arif@linux.dev,
	fvdl@google.com, Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 08/40] mm: page_alloc: superpageblock metadata for 1GB anti-fragmentation
Date: Wed, 20 May 2026 10:59:14 -0400	[thread overview]
Message-ID: <20260520150018.2491267-9-riel@surriel.com> (raw)
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>

Introduce a 1GB (PUD-sized) "superpageblock" data structure to track
pageblock composition at a coarser granularity, enabling future steering of
unmovable/reclaimable allocations into already-tainted superpageblocks and
preserving clean superpageblocks for 1GB hugepage allocation.

Each superpageblock groups SUPERPAGEBLOCK_NR_PAGEBLOCKS pageblocks (512 on
x86_64 with 2MB pageblocks) and maintains:
- Counts of pageblocks by migratetype (nr_free, nr_unmovable,
  nr_reclaimable, nr_movable, nr_reserved)
- A list_head for future organization by fullness category
- Identity (start_pfn, zone pointer)

Superpageblock counters are maintained by hooking into
init_pageblock_migratetype(). Memory holes and firmware-reserved regions
are tracked as reserved pageblocks by initializing all slots as reserved
during setup and decrementing nr_reserved as init_pageblock_migratetype()
claims them.

The superpageblock array is allocated per-zone during boot via memblock. At
~48 bytes per superpageblock (~12KB for a 256GB system), the overhead is
negligible.

This is pure bookkeeping with no allocation behavior change.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/mmzone.h | 57 ++++++++++++++++++++++++++
 mm/mm_init.c           | 90 ++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c        | 65 ++++++++++++++++++++++++++++++
 3 files changed, 212 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 90498bbbf60b..e3eac971a76a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -974,6 +974,43 @@ enum zone_type {
 
 #define ASYNC_AND_SYNC 2
 
+/*
+ * Superpageblock: PUD-sized (1GB on x86_64) anti-fragmentation tracking unit.
+ *
+ * Groups pageblocks to steer unmovable/reclaimable allocations into
+ * already-tainted superpageblocks, preserving clean superpageblocks for 1GB
+ * hugepage allocation.
+ *
+ * SUPERPAGEBLOCK_ORDER is derived from the PUD geometry:
+ *   x86_64: PUD_SHIFT=30, PAGE_SHIFT=12 → order 18 → 1GB
+ *   Each superpageblock contains SUPERPAGEBLOCK_NR_PAGEBLOCKS pageblocks
+ *   (512 on x86_64 with 2MB pageblocks).
+ */
+#define SUPERPAGEBLOCK_ORDER	(PUD_SHIFT - PAGE_SHIFT)
+#define SUPERPAGEBLOCK_NR_PAGES	(1UL << SUPERPAGEBLOCK_ORDER)
+
+/*
+ * SUPERPAGEBLOCK_NR_PAGEBLOCKS depends on pageblock_order, which may be
+ * variable (CONFIG_HUGETLB_PAGE_SIZE_VARIABLE).
+ */
+#define SUPERPAGEBLOCK_NR_PAGEBLOCKS (1UL << (SUPERPAGEBLOCK_ORDER - pageblock_order))
+
+struct superpageblock {
+	/* Pageblock counts by current migratetype */
+	u16			nr_free;
+	u16			nr_unmovable;
+	u16			nr_reclaimable;
+	u16			nr_movable;
+	u16			nr_reserved;	/* holes, firmware, etc. */
+
+	/* For organizing superpageblocks by fullness category */
+	struct list_head	list;
+
+	/* Identity */
+	unsigned long		start_pfn;	/* first PFN of this region */
+	struct zone		*zone;		/* owning zone */
+};
+
 struct zone {
 	/* Read-mostly fields */
 
@@ -1016,6 +1053,11 @@ struct zone {
 	struct pageblock_data	*pageblock_data;
 #endif /* CONFIG_SPARSEMEM */
 
+	/* Superpageblock array for 1GB anti-fragmentation tracking */
+	struct superpageblock	*superpageblocks;
+	unsigned long		nr_superpageblocks;
+	unsigned long		superpageblock_base_pfn; /* 1GB-aligned base */
+
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
@@ -1159,6 +1201,21 @@ struct zone {
 #endif
 } ____cacheline_internodealigned_in_smp;
 
+static inline struct superpageblock *pfn_to_superpageblock(struct zone *zone,
+						   unsigned long pfn)
+{
+	unsigned long idx;
+
+	if (!zone->superpageblocks)	/* no array set up for this zone */
+		return NULL;
+
+	idx = (pfn - zone->superpageblock_base_pfn) >> SUPERPAGEBLOCK_ORDER;
+	if (idx >= zone->nr_superpageblocks)	/* incl. pfn < base (wraps) */
+		return NULL;
+
+	return &zone->superpageblocks[idx];
+}
+
 enum pgdat_flags {
 	PGDAT_WRITEBACK,		/* reclaim scanning has recently found
 					 * many pages under writeback
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 47a222e49fc9..de02a6087c21 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1503,6 +1503,95 @@ static void __ref setup_usemap(struct zone *zone)
 static inline void setup_usemap(struct zone *zone) {}
 #endif /* CONFIG_SPARSEMEM */
 
+/**
+ * init_one_superpageblock - initialize a single superpageblock
+ * @sb: superpageblock to initialize
+ * @zone: owning zone
+ * @start_pfn: start PFN for this superpageblock
+ * @zone_start: zone start PFN (for clipping)
+ * @zone_end: zone end PFN, exclusive (for clipping)
+ *
+ * Zero the type counters and set nr_reserved to the zone-clipped pageblock
+ * count. Used by both boot-time setup and memory hotplug resize.
+ */
+static void __meminit init_one_superpageblock(struct superpageblock *sb,
+					      struct zone *zone,
+					      unsigned long start_pfn,
+					      unsigned long zone_start,
+					      unsigned long zone_end)
+{
+	unsigned long sb_end = start_pfn + SUPERPAGEBLOCK_NR_PAGES;
+	unsigned long pb_start = max(start_pfn, zone_start);
+	unsigned long pb_end = min(sb_end, zone_end);
+	u16 actual_pbs;	/* NOTE(review): assumes the count fits in 16 bits */
+
+	sb->nr_unmovable = 0;
+	sb->nr_reclaimable = 0;
+	sb->nr_movable = 0;
+	sb->nr_free = 0;
+	INIT_LIST_HEAD(&sb->list);
+	sb->start_pfn = start_pfn;
+	sb->zone = zone;
+
+	/*
+	 * Start with all pageblock slots as reserved.
+	 * init_pageblock_migratetype() will decrement nr_reserved and
+	 * increment the appropriate counter for each real pageblock.
+	 * Holes and firmware-reserved regions stay counted as reserved.
+	 *
+	 * Only count pageblocks that fall within the zone's span.
+	 * The first and last superpageblocks may extend beyond the
+	 * zone boundaries.  Use round-up division because a partial
+	 * pageblock at the zone boundary still gets initialized by
+	 * init_pageblock_migratetype().
+	 */
+	actual_pbs = (pb_end > pb_start) ?
+		     ((pb_end - pb_start + pageblock_nr_pages - 1) >>
+		      pageblock_order) : 0;
+	sb->nr_reserved = actual_pbs;
+}
+
+static void __init setup_superpageblocks(struct zone *zone)
+{
+	unsigned long zone_start = zone->zone_start_pfn;
+	unsigned long zone_end = zone_start + zone->spanned_pages;
+	unsigned long sb_base, nr_superpageblocks;
+	size_t alloc_size;
+	unsigned long i;
+
+	zone->superpageblocks = NULL;
+	zone->nr_superpageblocks = 0;
+	zone->superpageblock_base_pfn = 0;
+
+	if (!zone->spanned_pages)	/* empty zone: leave tracking off */
+		return;
+
+	/*
+	 * Superpageblocks must be 1GB (PUD) aligned. Align the base down
+	 * and the end up to cover all 1GB regions the zone spans.
+	 */
+	sb_base = ALIGN_DOWN(zone_start, SUPERPAGEBLOCK_NR_PAGES);
+	nr_superpageblocks = (ALIGN(zone_end, SUPERPAGEBLOCK_NR_PAGES) - sb_base) >>
+			 SUPERPAGEBLOCK_ORDER;
+
+	alloc_size = nr_superpageblocks * sizeof(struct superpageblock);
+	zone->superpageblocks = memblock_alloc_node(alloc_size, SMP_CACHE_BYTES,
+						zone_to_nid(zone));
+	if (!zone->superpageblocks) {
+		pr_warn("Failed to allocate %zu bytes for zone %s superpageblocks\n",
+			alloc_size, zone->name);
+		return;	/* non-fatal: pfn_to_superpageblock() returns NULL */
+	}
+
+	zone->nr_superpageblocks = nr_superpageblocks;
+	zone->superpageblock_base_pfn = sb_base;
+
+	for (i = 0; i < nr_superpageblocks; i++)
+		init_one_superpageblock(&zone->superpageblocks[i], zone,
+					sb_base + (i << SUPERPAGEBLOCK_ORDER),
+					zone_start, zone_end);
+}
+
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
@@ -1611,6 +1700,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 			continue;
 
 		setup_usemap(zone);
+		setup_superpageblocks(zone);
 		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23108cdcbbec..b9b7d54a869c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -457,6 +457,62 @@ void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
 	clear_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
 
+/*
+ * Map migratetype to PB_has_* bit index (HIGHATOMIC counts as unmovable,
+ * CMA as movable). Returns -1 for any other type (e.g. MIGRATE_ISOLATE).
+ */
+static inline int migratetype_to_has_bit(int migratetype)
+{
+	switch (migratetype) {
+	case MIGRATE_UNMOVABLE:
+	case MIGRATE_HIGHATOMIC:
+		return PB_has_unmovable;
+	case MIGRATE_RECLAIMABLE:
+		return PB_has_reclaimable;
+	case MIGRATE_MOVABLE:
+#ifdef CONFIG_CMA
+	case MIGRATE_CMA:
+#endif
+		return PB_has_movable;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * __spb_set_has_type - set pageblock PB_has_* bit, bump superpageblock count
+ *
+ * Idempotent: only increments the counter on the 0→1 bit transition.
+ */
+static void __spb_set_has_type(struct page *page, int migratetype)
+{
+	unsigned long pfn = page_to_pfn(page);
+	struct superpageblock *sb = pfn_to_superpageblock(page_zone(page), pfn);
+	int bit;
+
+	if (!sb)
+		return;	/* no superpageblock tracks this pfn */
+
+	bit = migratetype_to_has_bit(migratetype);
+	if (bit < 0)
+		return;	/* migratetype has no PB_has_* bit */
+
+	if (!get_pfnblock_bit(page, pfn, bit)) {
+		set_pfnblock_bit(page, pfn, bit);
+		switch (bit) {
+		case PB_has_unmovable:
+			sb->nr_unmovable++;
+			break;
+		case PB_has_reclaimable:
+			sb->nr_reclaimable++;
+			break;
+		case PB_has_movable:
+			sb->nr_movable++;
+			break;
+		}
+	}
+}
+
 /**
  * set_pageblock_migratetype - Set the migratetype of a pageblock
  * @page: The page within the block of interest
@@ -490,6 +546,7 @@ void __meminit init_pageblock_migratetype(struct page *page,
 {
 	unsigned long pfn = page_to_pfn(page);
 	struct pageblock_data *pbd;
+	struct superpageblock *sb;
 	unsigned long flags;
 
 	if (unlikely(page_group_by_mobility_disabled &&
@@ -513,6 +570,14 @@ void __meminit init_pageblock_migratetype(struct page *page,
 	pbd = pfn_to_pageblock(page, pfn);
 	pbd->block_pfn = pfn;
 	INIT_LIST_HEAD(&pbd->cpu_node);
+
+	/* Transition from reserved (boot default) to initial migratetype */
+	sb = pfn_to_superpageblock(page_zone(page), pfn);
+	if (sb) {
+		if (sb->nr_reserved)
+			sb->nr_reserved--;
+		__spb_set_has_type(page, migratetype);
+	}
 }
 
 #ifdef CONFIG_DEBUG_VM
-- 
2.54.0



  parent reply	other threads:[~2026-05-20 15:01 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-20 14:59 [RFC PATCH 00/40] mm: reliable 1GB page allocation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 01/40] mm: page_alloc: replace pageblock_flags bitmap with struct pageblock_data Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 02/40] mm: page_alloc: per-cpu pageblock buddy allocator Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 03/40] mm: page_alloc: split-path PCP free with local-trylock + remote-llist Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 04/40] mm: mm_init: fix zone assignment for pages in unavailable ranges Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 05/40] mm: page_alloc: remove watermark boost mechanism Rik van Riel
2026-05-26 14:02   ` Usama Arif
2026-05-27 15:41     ` Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 06/40] mm: page_alloc: async evacuation of stolen movable pageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 07/40] mm: page_alloc: track actual page contents in pageblock flags Rik van Riel
2026-05-20 14:59 ` Rik van Riel [this message]
2026-05-20 14:59 ` [RFC PATCH 09/40] mm: page_alloc: support superpageblock resize for memory hotplug Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 10/40] mm: page_alloc: add superpageblock fullness lists for allocation steering Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 11/40] mm: page_alloc: steer pageblock stealing to tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 12/40] mm: page_alloc: steer movable allocations to fullest clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 13/40] mm: page_alloc: extract claim_whole_block from try_to_claim_block Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 14/40] mm: page_alloc: add per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 15/40] mm: page_alloc: add background superpageblock defragmentation worker Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 16/40] mm: compaction: walk per-superpageblock free lists for migration targets Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 17/40] mm: page_alloc: superpageblock-aware contiguous and higher order allocation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 18/40] mm: page_alloc: prevent atomic allocations from tainting clean SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 19/40] mm: page_alloc: aggressively pack non-movable allocs in tainted SPBs on large systems Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 20/40] mm: page_alloc: prefer reclaim over tainting clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 21/40] mm: page_alloc: adopt partial pageblocks from tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 22/40] mm: page_alloc: add CONFIG_DEBUG_VM sanity checks for SPB counters Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 23/40] mm: page_alloc: targeted evacuation and dynamic reserves for tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 24/40] mm: page_alloc: prevent UNMOVABLE/RECLAIMABLE mixing in pageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 25/40] mm: trigger deferred SPB evac when atomic allocs would taint a clean SPB Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 26/40] mm: page_alloc: refuse fragmenting fallback for callers with cheap fallback Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 27/40] mm: page_alloc: cross-migratetype buddy borrow within tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 28/40] mm: page_alloc: drive slab shrink from SPB anti-fragmentation pressure Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 29/40] mm: page_reporting: walk per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 30/40] mm: show_mem: collect migratetype letters from per-superpageblock lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 31/40] mm: page_alloc: per-(zone, order, mt) PASS_1 hint cache Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 32/40] mm: debug: prevent infinite recursion in dump_page() with CMA Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 33/40] PM: hibernate: walk per-superpageblock free lists in mark_free_pages Rik van Riel
2026-05-20 18:19   ` Rafael J. Wysocki
2026-05-20 14:59 ` [RFC PATCH 34/40] btrfs: allocate eb-attached btree pages as movable Rik van Riel
2026-05-20 17:47   ` Boris Burkov
2026-05-23 15:58     ` David Sterba
2026-05-24  1:43       ` Rik van Riel
2026-05-24 19:59         ` Matthew Wilcox
2026-05-25  6:57           ` Christoph Hellwig
2026-05-20 14:59 ` [RFC PATCH 35/40] mm: page_alloc: refuse best-effort high-order allocs servable at lower orders Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 36/40] mm: page_alloc: set ALLOC_NOFRAGMENT on alloc_frozen_pages_nolock_noprof Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 37/40] mm: page_alloc: move spb_get_category and spb_tainted_reserve to mmzone.h Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 38/40] mm: compaction: skip empty tainted superpageblocks as migration source Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 39/40] mm: compaction: respect tainted SPB reserve in destination selection Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 40/40] mm: page_alloc: SPB tracepoint instrumentation [DO-NOT-MERGE] Rik van Riel
2026-05-21  5:09   ` kernel test robot
2026-05-21  7:39 ` [syzbot ci] Re: mm: reliable 1GB page allocation syzbot ci
2026-05-22 11:02 ` [RFC PATCH 00/40] " Usama Arif
2026-05-22 13:55   ` Rik van Riel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260520150018.2491267-9-riel@surriel.com \
    --to=riel@surriel.com \
    --cc=david@kernel.org \
    --cc=fvdl@google.com \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=surenb@google.com \
    --cc=usama.arif@linux.dev \
    --cc=willy@infradead.org \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.