From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com, linux-mm@kvack.org, david@kernel.org,
willy@infradead.org, surenb@google.com, hannes@cmpxchg.org,
ljs@kernel.org, ziy@nvidia.com, usama.arif@linux.dev,
fvdl@google.com, Johannes Weiner <hnaz@cmpxchg.org>,
Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 01/40] mm: page_alloc: replace pageblock_flags bitmap with struct pageblock_data
Date: Wed, 20 May 2026 10:59:07 -0400 [thread overview]
Message-ID: <20260520150018.2491267-2-riel@surriel.com> (raw)
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>
From: Johannes Weiner <hannes@cmpxchg.org>
Replace the packed pageblock_flags bitmap with a per-pageblock struct
containing its own flags word. This changes the storage from
NR_PAGEBLOCK_BITS bits per pageblock packed into shared unsigned longs,
to a dedicated unsigned long per pageblock.
The free path looks up migratetype (from pageblock flags) immediately
followed by looking up pageblock ownership. Colocating them in a struct
means this hot path touches one cache line instead of two.
The per-pageblock struct also eliminates all the bit-packing indexing
(pfn_to_bitidx, word selection, intra-word shifts), simplifying the
accessor code.
Memory overhead: 8 bytes per pageblock (one unsigned long). With 2MB
pageblocks on x86_64, that's 4KB per GB -- up from ~0.5-1 bytes per
pageblock with the packed bitmap, but still negligible in absolute terms.
No functional change.
Signed-off-by: Johannes Weiner <hnaz@cmpxchg.org>
Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
include/linux/mmzone.h | 15 ++++----
mm/internal.h | 17 +++++++++
mm/mm_init.c | 25 +++++--------
mm/page_alloc.c | 84 +++++++-----------------------------------
mm/sparse.c | 3 +-
5 files changed, 50 insertions(+), 94 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9adb2ad21da5..935ddc78f636 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1004,7 +1004,7 @@ struct zone {
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
- unsigned long *pageblock_flags;
+ struct pageblock_data *pageblock_data;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
@@ -1957,9 +1957,6 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
-#define SECTION_BLOCKFLAGS_BITS \
- ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
-
#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
#endif
@@ -1992,13 +1989,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
+struct pageblock_data {
+ unsigned long flags;
+};
+
struct mem_section_usage {
struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
/* See declaration of similar field in struct zone */
- unsigned long pageblock_flags[0];
+ struct pageblock_data pageblock_data[];
};
struct page;
@@ -2049,9 +2050,9 @@ extern struct mem_section **mem_section;
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif
-static inline unsigned long *section_to_usemap(struct mem_section *ms)
+static inline struct pageblock_data *section_to_usemap(struct mem_section *ms)
{
- return ms->usage->pageblock_flags;
+ return ms->usage->pageblock_data;
}
static inline struct mem_section *__nr_to_section(unsigned long nr)
diff --git a/mm/internal.h b/mm/internal.h
index 5a2ddcf68e0b..c8404cb00b08 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -808,6 +808,23 @@ static inline struct page *find_buddy_page_pfn(struct page *page,
return NULL;
}
+static inline struct pageblock_data *pfn_to_pageblock(const struct page *page,
+ unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ struct mem_section *ms = __pfn_to_section(pfn);
+ unsigned long idx = (pfn & (PAGES_PER_SECTION - 1)) >> pageblock_order;
+
+ return &section_to_usemap(ms)[idx];
+#else
+ struct zone *zone = page_zone(page);
+ unsigned long idx;
+
+ idx = (pfn - pageblock_start_pfn(zone->zone_start_pfn)) >> pageblock_order;
+ return &zone->pageblock_data[idx];
+#endif
+}
+
extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f9f8e1af921c..1bc909da9c13 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1453,36 +1453,31 @@ void __meminit init_currently_empty_zone(struct zone *zone,
#ifndef CONFIG_SPARSEMEM
/*
- * Calculate the size of the zone->pageblock_flags rounded to an unsigned long
- * Start by making sure zonesize is a multiple of pageblock_order by rounding
- * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
- * round what is now in bits to nearest long in bits, then return it in
- * bytes.
+ * Calculate the size of the zone->pageblock_data array.
+ * Round up the zone size to a pageblock boundary to get the
+ * number of pageblocks, then multiply by the struct size.
*/
static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
- unsigned long usemapsize;
+ unsigned long nr_pageblocks;
zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- usemapsize = round_up(zonesize, pageblock_nr_pages);
- usemapsize = usemapsize >> pageblock_order;
- usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = round_up(usemapsize, BITS_PER_LONG);
+ nr_pageblocks = round_up(zonesize, pageblock_nr_pages) >> pageblock_order;
- return usemapsize / BITS_PER_BYTE;
+ return nr_pageblocks * sizeof(struct pageblock_data);
}
static void __ref setup_usemap(struct zone *zone)
{
unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
zone->spanned_pages);
- zone->pageblock_flags = NULL;
+ zone->pageblock_data = NULL;
if (usemapsize) {
- zone->pageblock_flags =
+ zone->pageblock_data =
memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
zone_to_nid(zone));
- if (!zone->pageblock_flags)
- panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ if (!zone->pageblock_data)
+ panic("Failed to allocate %ld bytes for zone %s pageblock data on node %d\n",
usemapsize, zone->name, zone_to_nid(zone));
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 227d58dc3de6..fcff0083d5d4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -315,52 +315,18 @@ static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(const struct page *page,
- unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- return section_to_usemap(__pfn_to_section(pfn));
-#else
- return page_zone(page)->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
- pfn &= (PAGES_PER_SECTION-1);
-#else
- pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
-#endif /* CONFIG_SPARSEMEM */
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-}
-
static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
{
return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
}
-static __always_inline void
-get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
- unsigned long **bitmap_word, unsigned long *bitidx)
+static __always_inline unsigned long *
+get_pfnblock_flags_word(const struct page *page, unsigned long pfn)
{
- unsigned long *bitmap;
- unsigned long word_bitidx;
-
-#ifdef CONFIG_MEMORY_ISOLATION
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
-#else
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-#endif
BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
- bitmap = get_pageblock_bitmap(page, pfn);
- *bitidx = pfn_to_bitidx(page, pfn);
- word_bitidx = *bitidx / BITS_PER_LONG;
- *bitidx &= (BITS_PER_LONG - 1);
- *bitmap_word = &bitmap[word_bitidx];
+ return &pfn_to_pageblock(page, pfn)->flags;
}
@@ -377,18 +343,14 @@ static unsigned long __get_pfnblock_flags_mask(const struct page *page,
unsigned long pfn,
unsigned long mask)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
- unsigned long word;
+ unsigned long *flags_word = get_pfnblock_flags_word(page, pfn);
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
/*
* This races, without locks, with set_pfnblock_migratetype(). Ensure
* a consistent read of the memory array, so that results, even though
* racy, are not corrupted.
*/
- word = READ_ONCE(*bitmap_word);
- return (word >> bitidx) & mask;
+ return READ_ONCE(*flags_word) & mask;
}
/**
@@ -402,15 +364,10 @@ static unsigned long __get_pfnblock_flags_mask(const struct page *page,
bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
enum pageblock_bits pb_bit)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
-
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
return false;
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
- return test_bit(bitidx + pb_bit, bitmap_word);
+ return test_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
}
/**
@@ -449,18 +406,13 @@ get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
unsigned long flags, unsigned long mask)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
- unsigned long word;
-
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+ unsigned long *flags_word = get_pfnblock_flags_word(page, pfn);
+ unsigned long word, new_word;
- mask <<= bitidx;
- flags <<= bitidx;
-
- word = READ_ONCE(*bitmap_word);
+ word = READ_ONCE(*flags_word);
do {
- } while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
+ new_word = (word & ~mask) | flags;
+ } while (!try_cmpxchg(flags_word, &word, new_word));
}
/**
@@ -472,15 +424,10 @@ static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
void set_pfnblock_bit(const struct page *page, unsigned long pfn,
enum pageblock_bits pb_bit)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
-
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
return;
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
- set_bit(bitidx + pb_bit, bitmap_word);
+ set_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
}
/**
@@ -492,15 +439,10 @@ void set_pfnblock_bit(const struct page *page, unsigned long pfn,
void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
enum pageblock_bits pb_bit)
{
- unsigned long *bitmap_word;
- unsigned long bitidx;
-
if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
return;
- get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
- clear_bit(bitidx + pb_bit, bitmap_word);
+ clear_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
}
/**
diff --git a/mm/sparse.c b/mm/sparse.c
index effdac6b0ab1..f77d6d9fa62f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -216,7 +216,8 @@ static void __init memblocks_present(void)
static unsigned long usemap_size(void)
{
- return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
+ return (1UL << (PFN_SECTION_SHIFT - pageblock_order)) *
+ sizeof(struct pageblock_data);
}
size_t mem_section_usage_size(void)
--
2.54.0
next prev parent reply other threads:[~2026-05-20 15:00 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-20 14:59 [RFC PATCH 00/40] mm: reliable 1GB page allocation Rik van Riel
2026-05-20 14:59 ` Rik van Riel [this message]
2026-05-20 14:59 ` [RFC PATCH 02/40] mm: page_alloc: per-cpu pageblock buddy allocator Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 03/40] mm: page_alloc: split-path PCP free with local-trylock + remote-llist Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 04/40] mm: mm_init: fix zone assignment for pages in unavailable ranges Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 05/40] mm: page_alloc: remove watermark boost mechanism Rik van Riel
2026-05-26 14:02 ` Usama Arif
2026-05-27 15:41 ` Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 06/40] mm: page_alloc: async evacuation of stolen movable pageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 07/40] mm: page_alloc: track actual page contents in pageblock flags Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 08/40] mm: page_alloc: superpageblock metadata for 1GB anti-fragmentation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 09/40] mm: page_alloc: support superpageblock resize for memory hotplug Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 10/40] mm: page_alloc: add superpageblock fullness lists for allocation steering Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 11/40] mm: page_alloc: steer pageblock stealing to tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 12/40] mm: page_alloc: steer movable allocations to fullest clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 13/40] mm: page_alloc: extract claim_whole_block from try_to_claim_block Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 14/40] mm: page_alloc: add per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 15/40] mm: page_alloc: add background superpageblock defragmentation worker Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 16/40] mm: compaction: walk per-superpageblock free lists for migration targets Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 17/40] mm: page_alloc: superpageblock-aware contiguous and higher order allocation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 18/40] mm: page_alloc: prevent atomic allocations from tainting clean SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 19/40] mm: page_alloc: aggressively pack non-movable allocs in tainted SPBs on large systems Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 20/40] mm: page_alloc: prefer reclaim over tainting clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 21/40] mm: page_alloc: adopt partial pageblocks from tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 22/40] mm: page_alloc: add CONFIG_DEBUG_VM sanity checks for SPB counters Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 23/40] mm: page_alloc: targeted evacuation and dynamic reserves for tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 24/40] mm: page_alloc: prevent UNMOVABLE/RECLAIMABLE mixing in pageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 25/40] mm: trigger deferred SPB evac when atomic allocs would taint a clean SPB Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 26/40] mm: page_alloc: refuse fragmenting fallback for callers with cheap fallback Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 27/40] mm: page_alloc: cross-migratetype buddy borrow within tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 28/40] mm: page_alloc: drive slab shrink from SPB anti-fragmentation pressure Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 29/40] mm: page_reporting: walk per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 30/40] mm: show_mem: collect migratetype letters from per-superpageblock lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 31/40] mm: page_alloc: per-(zone, order, mt) PASS_1 hint cache Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 32/40] mm: debug: prevent infinite recursion in dump_page() with CMA Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 33/40] PM: hibernate: walk per-superpageblock free lists in mark_free_pages Rik van Riel
2026-05-20 18:19 ` Rafael J. Wysocki
2026-05-20 14:59 ` [RFC PATCH 34/40] btrfs: allocate eb-attached btree pages as movable Rik van Riel
2026-05-20 17:47 ` Boris Burkov
2026-05-23 15:58 ` David Sterba
2026-05-24 1:43 ` Rik van Riel
2026-05-24 19:59 ` Matthew Wilcox
2026-05-25 6:57 ` Christoph Hellwig
2026-05-20 14:59 ` [RFC PATCH 35/40] mm: page_alloc: refuse best-effort high-order allocs servable at lower orders Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 36/40] mm: page_alloc: set ALLOC_NOFRAGMENT on alloc_frozen_pages_nolock_noprof Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 37/40] mm: page_alloc: move spb_get_category and spb_tainted_reserve to mmzone.h Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 38/40] mm: compaction: skip empty tainted superpageblocks as migration source Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 39/40] mm: compaction: respect tainted SPB reserve in destination selection Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 40/40] mm: page_alloc: SPB tracepoint instrumentation [DO-NOT-MERGE] Rik van Riel
2026-05-21 5:09 ` kernel test robot
2026-05-21 7:39 ` [syzbot ci] Re: mm: reliable 1GB page allocation syzbot ci
2026-05-22 11:02 ` [RFC PATCH 00/40] " Usama Arif
2026-05-22 13:55 ` Rik van Riel
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260520150018.2491267-2-riel@surriel.com \
--to=riel@surriel.com \
--cc=david@kernel.org \
--cc=fvdl@google.com \
--cc=hannes@cmpxchg.org \
--cc=hnaz@cmpxchg.org \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=surenb@google.com \
--cc=usama.arif@linux.dev \
--cc=willy@infradead.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.