* [RFC 0/3] mm: introduce snapshot_page()
@ 2025-06-17 14:27 Luiz Capitulino
2025-06-17 14:27 ` [RFC 1/3] " Luiz Capitulino
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Luiz Capitulino @ 2025-06-17 14:27 UTC (permalink / raw)
To: david, willy; +Cc: akpm, linux-kernel, linux-mm, lcapitulino, shivankg
Hi,
This series introduces snapshot_page(), a helper function that can be used
to create a snapshot of a struct page and its associated struct folio.
This function is intended to help callers with a consistent view of a
a folio while reducing the chance of encountering partially updated or
inconsistent state, such as during folio splitting which could lead to
crashes and BUG_ON()s being triggered.
This series is on top of latest Linus tree (9afe652958c3).
Luiz Capitulino (3):
mm: introduce snapshot_page()
proc: kpagecount: use snapshot_page()
fs: stable_page_flags(): use snapshot_page()
fs/proc/page.c | 46 ++++++++++++++++++++----------
include/linux/mm.h | 20 +++++++++++++
mm/debug.c | 42 +++------------------------
mm/util.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 126 insertions(+), 53 deletions(-)
--
2.49.0
^ permalink raw reply [flat|nested] 4+ messages in thread
* [RFC 1/3] mm: introduce snapshot_page()
2025-06-17 14:27 [RFC 0/3] mm: introduce snapshot_page() Luiz Capitulino
@ 2025-06-17 14:27 ` Luiz Capitulino
2025-06-17 14:27 ` [RFC 2/3] proc: kpagecount: use snapshot_page() Luiz Capitulino
2025-06-17 14:27 ` [RFC 3/3] fs: stable_page_flags(): " Luiz Capitulino
2 siblings, 0 replies; 4+ messages in thread
From: Luiz Capitulino @ 2025-06-17 14:27 UTC (permalink / raw)
To: david, willy; +Cc: akpm, linux-kernel, linux-mm, lcapitulino, shivankg
This commit refactors __dump_page() into snapshot_page().
snapshot_page() tries to take a faithful snapshot of a page and its
folio representation. The snapshot is returned in the struct
page_snapshot parameter along with additional flags that are best
retrieved at snapshot creation time to reduce race windows.
This function is intended to be used by callers that need a stable
representation of a struct page and struct folio so that pointers
or page information doesn't change while working on a page.
The idea and original implemenetation of snapshot_page() comes from
Matthew Wilcox with suggestions for improvements from David Hildenbrand.
All bugs and misconceptions are mine.
Signed-off-by: Luiz Capitulino <luizcap@redhat.com>
---
include/linux/mm.h | 20 +++++++++++++
mm/debug.c | 42 +++------------------------
mm/util.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 95 insertions(+), 38 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ef2ba0c667a..4d83674e32a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4184,4 +4184,24 @@ static inline bool page_pool_page_is_pp(struct page *page)
}
#endif
+#define PAGE_SNAPSHOT_FAITHFUL (1 << 0)
+#define PAGE_SNAPSHOT_PG_HUGE_ZERO (1 << 1)
+#define PAGE_SNAPSHOT_PG_FREE (1 << 2)
+#define PAGE_SNAPSHOT_PG_IDLE (1 << 3)
+
+struct page_snapshot {
+ struct folio folio_snapshot;
+ struct page page_snapshot;
+ unsigned long pfn;
+ unsigned long idx;
+ unsigned long flags;
+};
+
+static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps)
+{
+ return ps->flags & 0x1;
+}
+
+void snapshot_page(struct page_snapshot *ps, const struct page *page);
+
#endif /* _LINUX_MM_H */
diff --git a/mm/debug.c b/mm/debug.c
index 907382257062..7349330ea506 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -129,47 +129,13 @@ static void __dump_folio(struct folio *folio, struct page *page,
static void __dump_page(const struct page *page)
{
- struct folio *foliop, folio;
- struct page precise;
- unsigned long head;
- unsigned long pfn = page_to_pfn(page);
- unsigned long idx, nr_pages = 1;
- int loops = 5;
-
-again:
- memcpy(&precise, page, sizeof(*page));
- head = precise.compound_head;
- if ((head & 1) == 0) {
- foliop = (struct folio *)&precise;
- idx = 0;
- if (!folio_test_large(foliop))
- goto dump;
- foliop = (struct folio *)page;
- } else {
- foliop = (struct folio *)(head - 1);
- idx = folio_page_idx(foliop, page);
- }
+ struct page_snapshot ps;
- if (idx < MAX_FOLIO_NR_PAGES) {
- memcpy(&folio, foliop, 2 * sizeof(struct page));
- nr_pages = folio_nr_pages(&folio);
- if (nr_pages > 1)
- memcpy(&folio.__page_2, &foliop->__page_2,
- sizeof(struct page));
- foliop = &folio;
- }
-
- if (idx > nr_pages) {
- if (loops-- > 0)
- goto again;
+ snapshot_page(&ps, page);
+ if (!snapshot_page_is_faithful(&ps))
pr_warn("page does not match folio\n");
- precise.compound_head &= ~1UL;
- foliop = (struct folio *)&precise;
- idx = 0;
- }
-dump:
- __dump_folio(foliop, &precise, pfn, idx);
+ __dump_folio(&ps.folio_snapshot, &ps.page_snapshot, ps.pfn, ps.idx);
}
void dump_page(const struct page *page, const char *reason)
diff --git a/mm/util.c b/mm/util.c
index 0b270c43d7d1..8c56e10aca5b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1171,3 +1171,74 @@ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
return 0;
}
EXPORT_SYMBOL(compat_vma_mmap_prepare);
+
+static void set_flags(struct page_snapshot *ps, const struct folio *folio,
+ const struct page *page)
+{
+ if (is_huge_zero_folio(folio))
+ ps->flags |= PAGE_SNAPSHOT_PG_HUGE_ZERO;
+ if (folio_ref_count(folio) == 0 && is_free_buddy_page(page))
+ ps->flags |= PAGE_SNAPSHOT_PG_FREE;
+ if (folio_test_idle(folio))
+ ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
+}
+
+/*
+ * Create a snapshot of a page and store its struct page and struct folio
+ * representations in a struct page_snapshot.
+ *
+ * @ps: struct page_snapshot to store the page snapshot
+ * @page: the page we want to snapshot
+ *
+ * Note that creating a faithful snapshot of a page may fail if the page
+ * compound keeps changing (eg. due to folio split). In this case we set
+ * ps->faithful to false and the snapshot will assume that @page refers
+ * to a single page.
+ */
+void snapshot_page(struct page_snapshot *ps, const struct page *page)
+{
+ unsigned long head, nr_pages = 1;
+ struct folio *foliop, folio;
+ int loops = 5;
+
+ ps->pfn = page_to_pfn(page);
+ ps->flags = PAGE_SNAPSHOT_FAITHFUL;
+
+again:
+ memcpy(&ps->page_snapshot, page, sizeof(*page));
+ head = ps->page_snapshot.compound_head;
+ if ((head & 1) == 0) {
+ foliop = (struct folio *)&ps->page_snapshot;
+ ps->idx = 0;
+ if (!folio_test_large(foliop)) {
+ set_flags(ps, page_folio(page), page);
+ goto out;
+ }
+ foliop = (struct folio *)page;
+ } else {
+ foliop = (struct folio *)(page->compound_head - 1);
+ ps->idx = folio_page_idx(foliop, page);
+ }
+
+ if (ps->idx < MAX_FOLIO_NR_PAGES) {
+ memcpy(&folio, foliop, 2 * sizeof(struct page));
+ nr_pages = folio_nr_pages(&folio);
+ if (nr_pages > 1)
+ memcpy(&folio.__page_2, &foliop->__page_2,
+ sizeof(struct page));
+ set_flags(ps, foliop, page);
+ foliop = &folio;
+ }
+
+ if (ps->idx > nr_pages) {
+ if (loops-- > 0)
+ goto again;
+ ps->page_snapshot.compound_head &= ~1UL;
+ foliop = (struct folio *)&ps->page_snapshot;
+ ps->flags = 0;
+ ps->idx = 0;
+ }
+
+out:
+ memcpy(&ps->folio_snapshot, foliop, sizeof(struct folio));
+}
--
2.49.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [RFC 2/3] proc: kpagecount: use snapshot_page()
2025-06-17 14:27 [RFC 0/3] mm: introduce snapshot_page() Luiz Capitulino
2025-06-17 14:27 ` [RFC 1/3] " Luiz Capitulino
@ 2025-06-17 14:27 ` Luiz Capitulino
2025-06-17 14:27 ` [RFC 3/3] fs: stable_page_flags(): " Luiz Capitulino
2 siblings, 0 replies; 4+ messages in thread
From: Luiz Capitulino @ 2025-06-17 14:27 UTC (permalink / raw)
To: david, willy; +Cc: akpm, linux-kernel, linux-mm, lcapitulino, shivankg
Currently, the call to folio_precise_page_mapcount() from kpage_read()
can race with a folio split. When the race happens we trigger a
VM_BUG_ON_FOLIO() in folio_entire_mapcount() (see splat below).
This commit fixes this race by using snapshot_page() so that we
retreive the folio mapcount using a folio snapshot.
Splat:
[ 2356.558576] page: refcount:1 mapcount:1 mapping:0000000000000000 index:0xffff85200 pfn:0x6f7c00
[ 2356.558748] memcg:ffff000651775780
[ 2356.558763] anon flags: 0xafffff60020838(uptodate|dirty|lru|owner_2|swapbacked|node=1|zone=2|lastcpupid=0xfffff)
[ 2356.558796] raw: 00afffff60020838 fffffdffdb5d0048 fffffdffdadf7fc8 ffff00064c1629c1
[ 2356.558817] raw: 0000000ffff85200 0000000000000000 0000000100000000 ffff000651775780
[ 2356.558839] page dumped because: VM_BUG_ON_FOLIO(!folio_test_large(folio))
[ 2356.558882] ------------[ cut here ]------------
[ 2356.558897] kernel BUG at ./include/linux/mm.h:1103!
[ 2356.558982] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
[ 2356.564729] CPU: 8 UID: 0 PID: 1864 Comm: folio-split-rac Tainted: G S W 6.15.0+ #3 PREEMPT(voluntary)
[ 2356.566196] Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN
[ 2356.566814] Hardware name: Red Hat KVM, BIOS edk2-20241117-3.el9 11/17/2024
[ 2356.567684] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 2356.568563] pc : kpage_read.constprop.0+0x26c/0x290
[ 2356.569605] lr : kpage_read.constprop.0+0x26c/0x290
[ 2356.569992] sp : ffff80008fb739b0
[ 2356.570263] x29: ffff80008fb739b0 x28: ffff00064aa69580 x27: 00000000ff000000
[ 2356.570842] x26: 0000fffffffffff8 x25: ffff00064aa69580 x24: ffff80008fb73ae0
[ 2356.571411] x23: 0000000000000001 x22: 0000ffff86c6e8b8 x21: 0000000000000008
[ 2356.571978] x20: 00000000006f7c00 x19: 0000ffff86c6e8b8 x18: 0000000000000000
[ 2356.572581] x17: 3630303066666666 x16: 0000000000000003 x15: 0000000000001000
[ 2356.573217] x14: 00000000ffffffff x13: 0000000000000004 x12: 00aaaaaa00aaaaaa
[ 2356.577674] x11: 0000000000000000 x10: 00aaaaaa00aaaaaa x9 : ffffbf3afca6c300
[ 2356.578332] x8 : 0000000000000002 x7 : 0000000000000001 x6 : 0000000000000001
[ 2356.578984] x5 : ffff000c79812408 x4 : 0000000000000000 x3 : 0000000000000000
[ 2356.579635] x2 : 0000000000000000 x1 : ffff00064aa69580 x0 : 000000000000003e
[ 2356.580286] Call trace:
[ 2356.580524] kpage_read.constprop.0+0x26c/0x290 (P)
[ 2356.580982] kpagecount_read+0x28/0x40
[ 2356.581336] proc_reg_read+0x38/0x100
[ 2356.581681] vfs_read+0xcc/0x320
[ 2356.581992] ksys_read+0x74/0x118
[ 2356.582306] __arm64_sys_read+0x24/0x38
[ 2356.582668] invoke_syscall+0x70/0x100
[ 2356.583022] el0_svc_common.constprop.0+0x48/0xf8
[ 2356.583456] do_el0_svc+0x28/0x40
[ 2356.583930] el0_svc+0x38/0x118
[ 2356.584328] el0t_64_sync_handler+0x144/0x168
[ 2356.584883] el0t_64_sync+0x19c/0x1a0
[ 2356.585350] Code: aa0103e0 9003a541 91082021 97f813fc (d4210000)
[ 2356.586130] ---[ end trace 0000000000000000 ]---
[ 2356.587377] note: folio-split-rac[1864] exited with irqs disabled
[ 2356.588050] note: folio-split-rac[1864] exited with preempt_count 1
Signed-off-by: Luiz Capitulino <luizcap@redhat.com>
---
fs/proc/page.c | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 999af26c7298..936f8bbe5a6f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -43,6 +43,22 @@ static inline unsigned long get_max_dump_pfn(void)
#endif
}
+static u64 get_kpage_count(const struct page *page)
+{
+ struct page_snapshot ps;
+ u64 ret;
+
+ snapshot_page(&ps, page);
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ ret = folio_precise_page_mapcount(&ps.folio_snapshot,
+ &ps.page_snapshot);
+ else
+ ret = folio_average_page_mapcount(&ps.folio_snapshot);
+
+ return ret;
+}
+
static ssize_t kpage_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos,
enum kpage_operation op)
@@ -75,10 +91,7 @@ static ssize_t kpage_read(struct file *file, char __user *buf,
info = stable_page_flags(page);
break;
case KPAGE_COUNT:
- if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
- info = folio_precise_page_mapcount(page_folio(page), page);
- else
- info = folio_average_page_mapcount(page_folio(page));
+ info = get_kpage_count(page);
break;
case KPAGE_CGROUP:
info = page_cgroup_ino(page);
--
2.49.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [RFC 3/3] fs: stable_page_flags(): use snapshot_page()
2025-06-17 14:27 [RFC 0/3] mm: introduce snapshot_page() Luiz Capitulino
2025-06-17 14:27 ` [RFC 1/3] " Luiz Capitulino
2025-06-17 14:27 ` [RFC 2/3] proc: kpagecount: use snapshot_page() Luiz Capitulino
@ 2025-06-17 14:27 ` Luiz Capitulino
2 siblings, 0 replies; 4+ messages in thread
From: Luiz Capitulino @ 2025-06-17 14:27 UTC (permalink / raw)
To: david, willy; +Cc: akpm, linux-kernel, linux-mm, lcapitulino, shivankg
A race condition is possible in stable_page_flags() where user-space is
reading /proc/kpageflags concurrently to a folio split. This may lead to
oopses or BUG_ON()s being triggered.
To fix this, this commit uses snapshot_page() in stable_page_flags() so
that stable_page_flags() works with a stable page and folio snapshots
instead.
Note that stable_page_flags() makes use of some functions that require
the original page or folio pointer to work properly (eg.
is_free_budy_page() and folio_test_idle()). Since those functions can't
be used on the page snapshot, we replace their usage with flags that
were set by snapshot_page() for this purpose.
Signed-off-by: Luiz Capitulino <luizcap@redhat.com>
---
fs/proc/page.c | 25 ++++++++++++++-----------
1 file changed, 14 insertions(+), 11 deletions(-)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 936f8bbe5a6f..a2ee95f727f0 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -147,6 +147,7 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
u64 stable_page_flags(const struct page *page)
{
const struct folio *folio;
+ struct page_snapshot ps;
unsigned long k;
unsigned long mapping;
bool is_anon;
@@ -158,7 +159,9 @@ u64 stable_page_flags(const struct page *page)
*/
if (!page)
return 1 << KPF_NOPAGE;
- folio = page_folio(page);
+
+ snapshot_page(&ps, page);
+ folio = &ps.folio_snapshot;
k = folio->flags;
mapping = (unsigned long)folio->mapping;
@@ -167,7 +170,7 @@ u64 stable_page_flags(const struct page *page)
/*
* pseudo flags for the well known (anonymous) memory mapped pages
*/
- if (page_mapped(page))
+ if (folio_mapped(folio))
u |= 1 << KPF_MMAP;
if (is_anon) {
u |= 1 << KPF_ANON;
@@ -179,7 +182,7 @@ u64 stable_page_flags(const struct page *page)
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
- if (page == &folio->page)
+ if (ps.idx == 0)
u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
else
u |= 1 << KPF_COMPOUND_TAIL;
@@ -189,10 +192,10 @@ u64 stable_page_flags(const struct page *page)
folio_test_large_rmappable(folio)) {
/* Note: we indicate any THPs here, not just PMD-sized ones */
u |= 1 << KPF_THP;
- } else if (is_huge_zero_folio(folio)) {
+ } else if (ps.flags & PAGE_SNAPSHOT_PG_HUGE_ZERO) {
u |= 1 << KPF_ZERO_PAGE;
u |= 1 << KPF_THP;
- } else if (is_zero_folio(folio)) {
+ } else if (is_zero_pfn(ps.pfn)) {
u |= 1 << KPF_ZERO_PAGE;
}
@@ -200,14 +203,14 @@ u64 stable_page_flags(const struct page *page)
* Caveats on high order pages: PG_buddy and PG_slab will only be set
* on the head page.
*/
- if (PageBuddy(page))
+ if (PageBuddy(&ps.page_snapshot))
u |= 1 << KPF_BUDDY;
- else if (page_count(page) == 0 && is_free_buddy_page(page))
+ else if (ps.flags & PAGE_SNAPSHOT_PG_FREE)
u |= 1 << KPF_BUDDY;
- if (PageOffline(page))
+ if (folio_test_offline(folio))
u |= 1 << KPF_OFFLINE;
- if (PageTable(page))
+ if (folio_test_pgtable(folio))
u |= 1 << KPF_PGTABLE;
if (folio_test_slab(folio))
u |= 1 << KPF_SLAB;
@@ -215,7 +218,7 @@ u64 stable_page_flags(const struct page *page)
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
u |= kpf_copy_bit(k, KPF_IDLE, PG_idle);
#else
- if (folio_test_idle(folio))
+ if (ps.flags & PAGE_SNAPSHOT_PG_IDLE)
u |= 1 << KPF_IDLE;
#endif
@@ -241,7 +244,7 @@ u64 stable_page_flags(const struct page *page)
if (u & (1 << KPF_HUGE))
u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
else
- u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison);
+ u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison);
#endif
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
--
2.49.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2025-06-17 14:27 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-06-17 14:27 [RFC 0/3] mm: introduce snapshot_page() Luiz Capitulino
2025-06-17 14:27 ` [RFC 1/3] " Luiz Capitulino
2025-06-17 14:27 ` [RFC 2/3] proc: kpagecount: use snapshot_page() Luiz Capitulino
2025-06-17 14:27 ` [RFC 3/3] fs: stable_page_flags(): " Luiz Capitulino
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).