All of lore.kernel.org
 help / color / mirror / Atom feed
From: Rik van Riel <riel@surriel.com>
To: linux-kernel@vger.kernel.org
Cc: kernel-team@meta.com, linux-mm@kvack.org, david@kernel.org,
	willy@infradead.org, surenb@google.com, hannes@cmpxchg.org,
	ljs@kernel.org, ziy@nvidia.com, usama.arif@linux.dev,
	fvdl@google.com, Rik van Riel <riel@surriel.com>
Subject: [RFC PATCH 07/40] mm: page_alloc: track actual page contents in pageblock flags
Date: Wed, 20 May 2026 10:59:13 -0400	[thread overview]
Message-ID: <20260520150018.2491267-8-riel@surriel.com> (raw)
In-Reply-To: <20260520150018.2491267-1-riel@surriel.com>

Extend pageblock_data flags with PB_has_unmovable, PB_has_reclaimable, and
PB_has_movable bits to track the actual types of pages allocated within a
pageblock, independent of its intended migratetype.

The flags are set at steal time in try_to_claim_block(), avoiding overhead
on every allocation in __rmqueue_smallest(). Their lifecycle is:

1. Allocation / steal time: when try_to_claim_block() claims a pageblock,
set the PB_has_* flag corresponding to the allocation's migratetype. If
unmovable or reclaimable pages are being placed into a pageblock that
already has PB_has_movable set, queue async evacuation of the remaining
movable pages.

2. Full pageblock free: when buddy merging reconstructs a complete
pageblock in __free_one_page(), clear all PB_has_* flags since the block is
now empty.

3. Migration scan: when isolate_migratepages_block() completes a full
pageblock scan and sees no movable pages at all (none isolated, and none
skipped because they could not be isolated), clear PB_has_movable.
This consolidates the clearing for all callers: evacuate_pageblock(),
compaction, and alloc_contig_range().

This provides the foundation for superpageblock-level steering decisions:
knowing which pageblocks actually contain unmovable/reclaimable pages
allows directing future allocations to already-tainted regions, keeping
clean regions available for large contiguous allocations.

Signed-off-by: Rik van Riel <riel@surriel.com>
Assisted-by: Claude:claude-opus-4.7 syzkaller
---
 include/linux/pageblock-flags.h |  9 ++++
 mm/compaction.c                 | 17 ++++++
 mm/page_alloc.c                 | 93 +++++++++++++++++++++++++--------
 3 files changed, 98 insertions(+), 21 deletions(-)

diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index e046278a01fa..21bfcdf80b2e 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -20,6 +20,15 @@ enum pageblock_bits {
 	PB_migrate_2,
 	PB_compact_skip,/* If set the block is skipped by compaction */
 
+	/*
+	 * Track actual page contents independent of the intended migratetype.
+	 * Set at steal time in try_to_claim_block(); cleared on full pageblock
+	 * free, and (PB_has_movable only) when a scan finds no movable pages.
+	 */
+	PB_has_unmovable,
+	PB_has_reclaimable,
+	PB_has_movable,
+
 #ifdef CONFIG_MEMORY_ISOLATION
 	/*
 	 * Pageblock isolation is represented with a separate bit, so that
diff --git a/mm/compaction.c b/mm/compaction.c
index 3648ce22c807..e8ca651e2b07 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -867,6 +867,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 	bool skip_on_failure = false;
 	unsigned long next_skip_pfn = 0;
 	bool skip_updated = false;
+	bool movable_skipped = false;
 	int ret = 0;
 
 	cc->migrate_pfn = low_pfn;
@@ -1079,6 +1080,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 					folio = page_folio(page);
 					goto isolate_success;
 				}
+				movable_skipped = true;
 			}
 
 			goto isolate_fail;
@@ -1246,6 +1248,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			lruvec_unlock_irqrestore(locked, flags);
 			locked = NULL;
 		}
+		movable_skipped = true;
 		folio_put(folio);
 
 isolate_fail:
@@ -1309,6 +1312,20 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (!cc->no_set_skip_hint && valid_page && !skip_updated)
 			set_pageblock_skip(valid_page);
 		update_cached_migrate(cc, low_pfn);
+
+		/*
+		 * Full pageblock scanned with no movable pages isolated.
+		 * Only clear PB_has_movable if no movable pages were
+		 * seen at all. If movable pages exist but could not be
+		 * isolated (pinned, writeback, dirty, etc.), leave the
+		 * flag set so a future migration attempt can try again.
+		 */
+		if (!nr_isolated && !movable_skipped && valid_page &&
+		    get_pfnblock_bit(valid_page, pageblock_start_pfn(start_pfn),
+				     PB_has_movable))
+			clear_pfnblock_bit(valid_page,
+					   pageblock_start_pfn(start_pfn),
+					   PB_has_movable);
 	}
 
 	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0f3d734bd296..23108cdcbbec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -928,6 +928,30 @@ static void change_pageblock_range(struct page *pageblock_page,
 	}
 }
 
+/*
+ * mark_pageblock_free - handle a pageblock becoming fully free
+ * @page: page at the start of the pageblock
+ * @pfn: page frame number (presumably that of @page -- confirm at callers)
+ *
+ * Called from __free_one_page() when a whole pageblock is freed directly
+ * or buddy merging reconstructs one. Drops the stale PCP ownership entry
+ * and the PB_has_* contents bits. No PCP can still hold pages from this
+ * block (otherwise the buddy merge couldn't have completed), so a stale
+ * ownership entry would just cause misrouted frees.
+ */
+static void mark_pageblock_free(struct page *page, unsigned long pfn)
+{
+	clear_pcpblock_owner(page);
+
+	/*
+	 * The entire block is now free, so no allocated pages of any
+	 * type remain: clear all three actual-contents tracking flags.
+	 */
+	clear_pfnblock_bit(page, pfn, PB_has_unmovable);
+	clear_pfnblock_bit(page, pfn, PB_has_reclaimable);
+	clear_pfnblock_bit(page, pfn, PB_has_movable);
+}
+
 /*
  * Freeing function for a buddy system allocator.
  *
@@ -973,19 +997,14 @@ static inline void __free_one_page(struct page *page,
 	account_freepages(zone, 1 << order, migratetype);
 
 	/*
-	 * For whole blocks, ownership returns to the zone. There are
-	 * no more outstanding frees to route through that CPU's PCP,
-	 * and we don't want to confuse any future users of the pages
-	 * in this block. E.g. rmqueue_buddy().
-	 *
-	 * Check here if a whole block came in directly: pre-merged in
-	 * the PCP, or PCP contended and bypassed.
-	 *
-	 * There is another check in the loop below if a block merges
-	 * up with pages already on the zone buddy.
+	 * When freeing a whole pageblock, clear stale PCP ownership
+	 * and actual-contents tracking flags up front.  The in-loop
+	 * check only fires when sub-pageblock pages merge *up to*
+	 * pageblock_order, not when entering at pageblock_order
+	 * directly.
 	 */
 	if (order == pageblock_order)
-		clear_pcpblock_owner(page);
+		mark_pageblock_free(page, pfn);
 
 	while (order < MAX_PAGE_ORDER) {
 		int buddy_mt = migratetype;
@@ -1037,9 +1056,13 @@ static inline void __free_one_page(struct page *page,
 		pfn = combined_pfn;
 		order++;
 
-		/* Clear owner also when we merge up. See above */
+		/*
+		 * If merging has reconstructed a full pageblock,
+		 * clear any stale PCP ownership and actual-contents
+		 * tracking flags.
+		 */
 		if (order == pageblock_order)
-			clear_pcpblock_owner(page);
+			mark_pageblock_free(page, pfn);
 	}
 
 done_merging:
@@ -2433,6 +2456,9 @@ try_to_claim_block(struct zone *zone, struct page *page,
 {
 	int free_pages, movable_pages, alike_pages;
 	unsigned long start_pfn;
+#ifdef CONFIG_COMPACTION
+	struct page *start_page;
+#endif
 
 	/*
 	 * Don't steal from pageblocks that are isolated for
@@ -2488,15 +2514,29 @@ try_to_claim_block(struct zone *zone, struct page *page,
 		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
 #ifdef CONFIG_COMPACTION
 		/*
-		 * A movable pageblock was just claimed for unmovable or
-		 * reclaimable use. Queue async evacuation of the remaining
-		 * movable pages so future unmovable/reclaimable allocations
-		 * can stay concentrated in fewer pageblocks.
+		 * Track actual page contents in pageblock flags.
+		 * Mark the pageblock with the type being allocated, and
+		 * if unmovable/reclaimable pages are being placed into a
+		 * pageblock that already has movable pages, queue async
+		 * evacuation of the movable pages.
 		 */
-		if (block_type == MIGRATE_MOVABLE &&
-		    (start_type == MIGRATE_UNMOVABLE ||
-		     start_type == MIGRATE_RECLAIMABLE))
-			queue_pageblock_evacuate(zone, start_pfn);
+		start_page = pfn_to_page(start_pfn);
+		if (start_type == MIGRATE_UNMOVABLE) {
+			set_pfnblock_bit(start_page, start_pfn,
+					 PB_has_unmovable);
+			if (get_pfnblock_bit(start_page, start_pfn,
+					     PB_has_movable))
+				queue_pageblock_evacuate(zone, start_pfn);
+		} else if (start_type == MIGRATE_RECLAIMABLE) {
+			set_pfnblock_bit(start_page, start_pfn,
+					 PB_has_reclaimable);
+			if (get_pfnblock_bit(start_page, start_pfn,
+					     PB_has_movable))
+				queue_pageblock_evacuate(zone, start_pfn);
+		} else if (start_type == MIGRATE_MOVABLE) {
+			set_pfnblock_bit(start_page, start_pfn,
+					 PB_has_movable);
+		}
 #endif
 		return __rmqueue_smallest(zone, order, start_type);
 	}
@@ -7307,6 +7347,17 @@ static void evacuate_pageblock(struct zone *zone, unsigned long start_pfn)
 
 	if (!list_empty(&cc.migratepages))
 		putback_movable_pages(&cc.migratepages);
+
+	/*
+	 * Re-scan to let isolate_migratepages_block clear PB_has_movable
+	 * if no movable pages remain after evacuation.
+	 */
+	cc.migrate_pfn = start_pfn;
+	cc.nr_migratepages = 0;
+	INIT_LIST_HEAD(&cc.migratepages);
+	isolate_migratepages_range(&cc, start_pfn, end_pfn);
+	if (!list_empty(&cc.migratepages))
+		putback_movable_pages(&cc.migratepages);
 }
 
 static void evacuate_work_fn(struct work_struct *work)
-- 
2.54.0



  parent reply	other threads:[~2026-05-20 15:01 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-20 14:59 [RFC PATCH 00/40] mm: reliable 1GB page allocation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 01/40] mm: page_alloc: replace pageblock_flags bitmap with struct pageblock_data Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 02/40] mm: page_alloc: per-cpu pageblock buddy allocator Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 03/40] mm: page_alloc: split-path PCP free with local-trylock + remote-llist Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 04/40] mm: mm_init: fix zone assignment for pages in unavailable ranges Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 05/40] mm: page_alloc: remove watermark boost mechanism Rik van Riel
2026-05-26 14:02   ` Usama Arif
2026-05-27 15:41     ` Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 06/40] mm: page_alloc: async evacuation of stolen movable pageblocks Rik van Riel
2026-05-20 14:59 ` Rik van Riel [this message]
2026-05-20 14:59 ` [RFC PATCH 08/40] mm: page_alloc: superpageblock metadata for 1GB anti-fragmentation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 09/40] mm: page_alloc: support superpageblock resize for memory hotplug Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 10/40] mm: page_alloc: add superpageblock fullness lists for allocation steering Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 11/40] mm: page_alloc: steer pageblock stealing to tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 12/40] mm: page_alloc: steer movable allocations to fullest clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 13/40] mm: page_alloc: extract claim_whole_block from try_to_claim_block Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 14/40] mm: page_alloc: add per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 15/40] mm: page_alloc: add background superpageblock defragmentation worker Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 16/40] mm: compaction: walk per-superpageblock free lists for migration targets Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 17/40] mm: page_alloc: superpageblock-aware contiguous and higher order allocation Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 18/40] mm: page_alloc: prevent atomic allocations from tainting clean SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 19/40] mm: page_alloc: aggressively pack non-movable allocs in tainted SPBs on large systems Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 20/40] mm: page_alloc: prefer reclaim over tainting clean superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 21/40] mm: page_alloc: adopt partial pageblocks from tainted superpageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 22/40] mm: page_alloc: add CONFIG_DEBUG_VM sanity checks for SPB counters Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 23/40] mm: page_alloc: targeted evacuation and dynamic reserves for tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 24/40] mm: page_alloc: prevent UNMOVABLE/RECLAIMABLE mixing in pageblocks Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 25/40] mm: trigger deferred SPB evac when atomic allocs would taint a clean SPB Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 26/40] mm: page_alloc: refuse fragmenting fallback for callers with cheap fallback Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 27/40] mm: page_alloc: cross-migratetype buddy borrow within tainted SPBs Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 28/40] mm: page_alloc: drive slab shrink from SPB anti-fragmentation pressure Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 29/40] mm: page_reporting: walk per-superpageblock free lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 30/40] mm: show_mem: collect migratetype letters from per-superpageblock lists Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 31/40] mm: page_alloc: per-(zone, order, mt) PASS_1 hint cache Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 32/40] mm: debug: prevent infinite recursion in dump_page() with CMA Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 33/40] PM: hibernate: walk per-superpageblock free lists in mark_free_pages Rik van Riel
2026-05-20 18:19   ` Rafael J. Wysocki
2026-05-20 14:59 ` [RFC PATCH 34/40] btrfs: allocate eb-attached btree pages as movable Rik van Riel
2026-05-20 17:47   ` Boris Burkov
2026-05-23 15:58     ` David Sterba
2026-05-24  1:43       ` Rik van Riel
2026-05-24 19:59         ` Matthew Wilcox
2026-05-25  6:57           ` Christoph Hellwig
2026-05-20 14:59 ` [RFC PATCH 35/40] mm: page_alloc: refuse best-effort high-order allocs servable at lower orders Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 36/40] mm: page_alloc: set ALLOC_NOFRAGMENT on alloc_frozen_pages_nolock_noprof Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 37/40] mm: page_alloc: move spb_get_category and spb_tainted_reserve to mmzone.h Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 38/40] mm: compaction: skip empty tainted superpageblocks as migration source Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 39/40] mm: compaction: respect tainted SPB reserve in destination selection Rik van Riel
2026-05-20 14:59 ` [RFC PATCH 40/40] mm: page_alloc: SPB tracepoint instrumentation [DO-NOT-MERGE] Rik van Riel
2026-05-21  5:09   ` kernel test robot
2026-05-21  7:39 ` [syzbot ci] Re: mm: reliable 1GB page allocation syzbot ci
2026-05-22 11:02 ` [RFC PATCH 00/40] " Usama Arif
2026-05-22 13:55   ` Rik van Riel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260520150018.2491267-8-riel@surriel.com \
    --to=riel@surriel.com \
    --cc=david@kernel.org \
    --cc=fvdl@google.com \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=surenb@google.com \
    --cc=usama.arif@linux.dev \
    --cc=willy@infradead.org \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.