From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com, linux-mm@kvack.org, david@kernel.org,
willy@infradead.org, surenb@google.com, hannes@cmpxchg.org,
ljs@kernel.org, ziy@nvidia.com, usama.arif@linux.dev,
fvdl@google.com, Johannes Weiner <hnaz@cmpxchg.org>,
Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 01/40] mm: page_alloc: replace pageblock_flags bitmap with struct pageblock_data
Date: Wed, 20 May 2026 10:59:07 -0400 [thread overview]
Message-ID: <20260520150018.2491267-2-riel@surriel.com> (raw)
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
From: Johannes Weiner <hannes@cmpxchg.org>
Replace the packed pageblock_flags bitmap with a per-pageblock struct
containing its own flags word. This changes the storage from
NR_PAGEBLOCK_BITS bits per pageblock packed into shared unsigned longs,
to a dedicated unsigned long per pageblock.
The free path looks up migratetype (from pageblock flags) immediately
followed by looking up pageblock ownership. Colocating them in a struct
means this hot path touches one cache line instead of two.
The per-pageblock struct also eliminates all the bit-packing indexing
(pfn_to_bitidx, word selection, intra-word shifts), simplifying the
accessor code.
Memory overhead: 8 bytes per pageblock (one unsigned long). With 2MB
pageblocks on x86_64, that's 4KB per GB -- up from ~0.5-1 bytes per
pageblock with the packed bitmap, but still negligible in absolute terms.
No functional change.
Signed-off-by: Johannes Weiner <hnaz@cmpxchg.org>
Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
include/linux/mmzone.h | 15 ++++----
mm/internal.h | 17 +++++++++
mm/mm_init.c | 25 +++++--------
mm/page_alloc.c | 84 +++++++-----------------------------------
mm/sparse.c | 3 +-
5 files changed, 50 insertions(+), 94 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9adb2ad21da5..935ddc78f636 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1004,7 +1004,7 @@ struct zone {
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
- unsigned long *pageblock_flags;
+ struct pageblock_data *pageblock_data;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
@@ -1957,9 +1957,6 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
-#define SECTION_BLOCKFLAGS_BITS \
- ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
-
#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
#endif
@@ -1992,13 +1989,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
+struct pageblock_data {
+ unsigned long flags;
+};
+
struct mem_section_usage {
struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
/* See declaration of similar field in struct zone */
- unsigned long pageblock_flags[0];
+ struct pageblock_data pageblock_data[];
};
struct page;
@@ -2049,9 +2050,9 @@ extern struct mem_section **mem_section;
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif
-static inline unsigned long *section_to_usemap(struct mem_section *ms)
+static inline struct pageblock_data *section_to_usemap(struct mem_section *ms)
{
- return ms->usage->pageblock_flags;
+ return ms->usage->pageblock_data;
}
static inline struct mem_section *__nr_to_section(unsigned long nr)
diff --git a/mm/internal.h b/mm/internal.h
index 5a2ddcf68e0b..c8404cb00b08 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -808,6 +808,23 @@ static inline struct page *find_buddy_page_pfn(struct page *page,
return NULL;
}
+static inline struct pageblock_data *pfn_to_pageblock(const struct page *page,
+ unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ struct mem_section *ms = __pfn_to_section(pfn);
+ unsigned long idx = (pfn & (PAGES_PER_SECTION - 1)) >> pageblock_order;
+
+ return &section_to_usemap(ms)[idx];
+#else
+ struct zone *zone = page_zone(page);
+ unsigned long idx;
+
+ idx = (pfn - pageblock_start_pfn(zone->zone_start_pfn)) >> pageblock_order;
+ return &zone->pageblock_data[idx];
+#endif
+}
+
extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f9f8e1af921c..1bc909da9c13 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1453,36 +1453,31 @@ void __meminit init_currently_empty_zone(struct zone *zone,
#ifndef CONFIG_SPARSEMEM
/*
- * Calculate the size of the zone->pageblock_flags rounded to an unsigned long
- * Start by making sure zonesize is a multiple of pageblock_order by rounding
- * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
- * round what is now in bits to nearest long in bits, then return it in
- * bytes.
+ * Calculate the size of the zone->pageblock_data array.
+ * Round up the zone size to a pageblock boundary to get the
+ * number of pageblocks, then multiply by the struct size.
*/
static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
- unsigned long usemapsize;
+ unsigned long nr_pageblocks;
zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- usemapsize = round_up(zonesize, pageblock_nr_pages);
- usemapsize = usemapsize >> pageblock_order;
- usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = round_up(usemapsize, BITS_PER_LONG);
+ nr_pageblocks = round_up(zonesize, pageblock_nr_pages) >> pageblock_order;
- return usemapsize / BITS_PER_BYTE;
+ return nr_pageblocks * sizeof(struct pageblock_data);
}
static void __ref setup_usemap(struct zone *zone)
{
unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
zone->spanned_pages);
- zone->pageblock_flags = NULL;
+ zone->pageblock_data = NULL;
if (usemapsize) {
- zone->pageblock_flags =
+ zone->pageblock_data =
memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
zone_to_nid(zone));
- if (!zone->pageblock_flags)
- panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ if (!zone->pageblock_data)
+ panic("Failed to allocate %ld bytes for zone %s pageblock data on node %d\n",
usemapsize, zone->name, zone_to_nid(zone));
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 227d58dc3de6..fcff0083d5d4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -315,52 +315,18 @@ static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(const struct page *page,
- unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- return section_to_usemap(__pfn_to_section(pfn));
-#else
- return page_zone(page)->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- pfn &= (PAGES_PER_SECTION-1);
-#else
- pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
-#endif /* CONFIG_SPARSEMEM */
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-}
-
static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
{
return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
}
-static __always_inline void
-get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
- unsigned long **bitmap_word, unsigned long *bitidx)
+static __always_inline unsigned long *
+get_pfnblock_flags_word(const struct page *page, unsigned long pfn)
{
- unsigned long *bitmap;
- unsigned long word_bitidx;
-
-#ifdef CONFIG_MEMORY_ISOLATION
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
-#else
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-#endif
BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
- bitmap = get_pageblock_bitmap(page, pfn);
- *bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = *bitidx / BITS_PER_LONG;
- *bitidx &= (BITS_PER_LONG - 1);
- *bitmap_word = &bitmap[word_bitidx];
+ return &pfn_to_pageblock(page, pfn)->flags;
}
@@ -377,18 +343,14 @@ static unsigned long __get_pfnblock_flags_mask(const struct page *page,
unsigned long pfn,
unsigned long mask)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
- unsigned long word;
+ unsigned long *flags_word = get_pfnblock_flags_word(page, pfn);
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
/*
* This races, without locks, with set_pfnblock_migratetype(). Ensure
* a consistent read of the memory array, so that results, even though
* racy, are not corrupted.
*/
- word = READ_ONCE(*bitmap_word);
- return (word >> bitidx) & mask;
+ return READ_ONCE(*flags_word) & mask;
}
/**
@@ -402,15 +364,10 @@ static unsigned long __get_pfnblock_flags_mask(const struct page *page,
bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
enum pageblock_bits pb_bit)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
-
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
return false;
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
- return test_bit(bitidx + pb_bit, bitmap_word);
+ return test_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
}
/**
@@ -449,18 +406,13 @@ get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
unsigned long flags, unsigned long mask)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
- unsigned long word;
-
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+ unsigned long *flags_word = get_pfnblock_flags_word(page, pfn);
+ unsigned long word, new_word;
- mask <<= bitidx;
- flags <<= bitidx;
-
- word = READ_ONCE(*bitmap_word);
+ word = READ_ONCE(*flags_word);
do {
- } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
+ new_word = (word & ~mask) | flags;
+ } while (!try_cmpxchg(flags_word, &word, new_word));
}
/**
@@ -472,15 +424,10 @@ static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
void set_pfnblock_bit(const struct page *page, unsigned long pfn,
enum pageblock_bits pb_bit)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
-
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
return;
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
- set_bit(bitidx + pb_bit, bitmap_word);
+ set_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
}
/**
@@ -492,15 +439,10 @@ void set_pfnblock_bit(const struct page *page, unsigned long pfn,
void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
enum pageblock_bits pb_bit)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
-
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
return;
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
- clear_bit(bitidx + pb_bit, bitmap_word);
+ clear_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
}
/**
diff --git a/mm/sparse.c b/mm/sparse.c
index effdac6b0ab1..f77d6d9fa62f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -216,7 +216,8 @@ static void __init memblocks_present(void)
static unsigned long usemap_size(void)
{
- return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
+ return (1UL << (PFN_SECTION_SHIFT - pageblock_order)) *
+ sizeof(struct pageblock_data);
}
size_t mem_section_usage_size(void)
--
2.54.0
next prev parent reply other threads:[~2026-05-20 15:02 UTC|newest]
Thread overview: 51+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-20 14:59 [RFC PATCH 00/40] mm: reliable 1GB page allocation Rik van Riel
2026-05-20 14:59 ` Rik van Riel [this message]
2026-05-20 14:59 ` [RFC PATCH 02/40] mm: page_alloc: per-cpu pageblock buddy allocator Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 03/40] mm: page_alloc: split-path PCP free with local-trylock + remote-llist Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 04/40] mm: mm_init: fix zone assignment for pages in unavailable ranges Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 05/40] mm: page_alloc: remove watermark boost mechanism Rik van Riel
2026-05-26 14:02 ` Usama Arif
2026-05-20 14:59 ` [RFC PATCH 06/40] mm: page_alloc: async evacuation of stolen movable pageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 07/40] mm: page_alloc: track actual page contents in pageblock flags Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 08/40] mm: page_alloc: superpageblock metadata for 1GB anti-fragmentation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 09/40] mm: page_alloc: support superpageblock resize for memory hotplug Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 10/40] mm: page_alloc: add superpageblock fullness lists for allocation steering Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 11/40] mm: page_alloc: steer pageblock stealing to tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 12/40] mm: page_alloc: steer movable allocations to fullest clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 13/40] mm: page_alloc: extract claim_whole_block from try_to_claim_block Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 14/40] mm: page_alloc: add per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 15/40] mm: page_alloc: add background superpageblock defragmentation worker Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 16/40] mm: compaction: walk per-superpageblock free lists for migration targets Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 17/40] mm: page_alloc: superpageblock-aware contiguous and higher order allocation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 18/40] mm: page_alloc: prevent atomic allocations from tainting clean SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 19/40] mm: page_alloc: aggressively pack non-movable allocs in tainted SPBs on large systems Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 20/40] mm: page_alloc: prefer reclaim over tainting clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 21/40] mm: page_alloc: adopt partial pageblocks from tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 22/40] mm: page_alloc: add CONFIG_DEBUG_VM sanity checks for SPB counters Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 23/40] mm: page_alloc: targeted evacuation and dynamic reserves for tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 24/40] mm: page_alloc: prevent UNMOVABLE/RECLAIMABLE mixing in pageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 25/40] mm: trigger deferred SPB evac when atomic allocs would taint a clean SPB Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 26/40] mm: page_alloc: refuse fragmenting fallback for callers with cheap fallback Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 27/40] mm: page_alloc: cross-migratetype buddy borrow within tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 28/40] mm: page_alloc: drive slab shrink from SPB anti-fragmentation pressure Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 29/40] mm: page_reporting: walk per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 30/40] mm: show_mem: collect migratetype letters from per-superpageblock lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 31/40] mm: page_alloc: per-(zone, order, mt) PASS_1 hint cache Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 32/40] mm: debug: prevent infinite recursion in dump_page() with CMA Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 33/40] PM: hibernate: walk per-superpageblock free lists in mark_free_pages Rik van Riel
2026-05-20 18:19 ` Rafael J. Wysocki
2026-05-20 14:59 ` [RFC PATCH 34/40] btrfs: allocate eb-attached btree pages as movable Rik van Riel
2026-05-20 17:47 ` Boris Burkov
2026-05-23 15:58 ` David Sterba
2026-05-24 1:43 ` Rik van Riel
2026-05-24 19:59 ` Matthew Wilcox
2026-05-25 6:57 ` Christoph Hellwig
2026-05-20 14:59 ` [RFC PATCH 35/40] mm: page_alloc: refuse best-effort high-order allocs servable at lower orders Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 36/40] mm: page_alloc: set ALLOC_NOFRAGMENT on alloc_frozen_pages_nolock_noprof Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 37/40] mm: page_alloc: move spb_get_category and spb_tainted_reserve to mmzone.h Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 38/40] mm: compaction: skip empty tainted superpageblocks as migration source Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 39/40] mm: compaction: respect tainted SPB reserve in destination selection Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 40/40] mm: page_alloc: SPB tracepoint instrumentation [DO-NOT-MERGE] Rik van Riel
2026-05-21 7:39 ` [syzbot ci] Re: mm: reliable 1GB page allocation syzbot ci
2026-05-22 11:02 ` [RFC PATCH 00/40] " Usama Arif
2026-05-22 13:55 ` Rik van Riel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260520150018.2491267-2-riel@surriel.com \
--to=riel@surriel.com \
--cc=david@kernel.org \
--cc=fvdl@google.com \
--cc=hannes@cmpxchg.org \
--cc=hnaz@cmpxchg.org \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=surenb@google.com \
--cc=usama.arif@linux.dev \
--cc=willy@infradead.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox