* FAILED: patch "[PATCH] ksm: use range-walk function to jump over holes in" failed to apply to 5.10-stable tree
@ 2025-11-20 15:54 gregkh
2026-01-17 23:38 ` [PATCH 5.10.y 1/2] mm/pagewalk: add walk_page_range_vma() Pedro Demarchi Gomes
0 siblings, 1 reply; 3+ messages in thread
From: gregkh @ 2025-11-20 15:54 UTC (permalink / raw)
To: pedrodemargomes, akpm, chengming.zhou, craftfever, david, stable,
xu.xin16
Cc: stable
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f5548c318d6520d4fa3c5ed6003eeb710763cbc5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2025112005-turmoil-elsewhere-e83d@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f5548c318d6520d4fa3c5ed6003eeb710763cbc5 Mon Sep 17 00:00:00 2001
From: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Date: Wed, 22 Oct 2025 12:30:59 -0300
Subject: [PATCH] ksm: use range-walk function to jump over holes in
scan_get_next_rmap_item
Currently, scan_get_next_rmap_item() walks every page address in a VMA to
locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions, causing
ksmd to use large amount of cpu without deduplicating much pages.
This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups. This problem was
previously discussed in [1].
Consider the following test program which creates a 32 TiB mapping in the
virtual address space but only populates a single page:
#include <unistd.h>
#include <stdio.h>
#include <sys/mman.h>
/* 32 TiB */
const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;
int main() {
char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
if (area == MAP_FAILED) {
perror("mmap() failed\n");
return -1;
}
/* Populate a single page such that we get an anon_vma. */
*area = 0;
/* Enable KSM. */
madvise(area, size, MADV_MERGEABLE);
pause();
return 0;
}
$ ./ksm-sparse &
$ echo 1 > /sys/kernel/mm/ksm/run
Without this patch ksmd uses 100% of the cpu for a long time (more then 1
hour in my test machine) scanning all the 32 TiB virtual address space
that contain only one mapped page. This makes ksmd essentially deadlocked
not able to deduplicate anything of value. With this patch ksmd walks
only the one mapped page and skips the rest of the 32 TiB virtual address
space, making the scan fast using little cpu.
Link: https://lkml.kernel.org/r/20251023035841.41406-1-pedrodemargomes@gmail.com
Link: https://lkml.kernel.org/r/20251022153059.22763-1-pedrodemargomes@gmail.com
Link: https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/ [1]
Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
diff --git a/mm/ksm.c b/mm/ksm.c
index 7bc726b50b2f..c4e730409949 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
return true;
}
+struct ksm_next_page_arg {
+ struct folio *folio;
+ struct page *page;
+ unsigned long addr;
+};
+
+static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct ksm_next_page_arg *private = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *start_ptep = NULL, *ptep, pte;
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
+ struct page *page;
+ spinlock_t *ptl;
+ pmd_t pmd;
+
+ if (ksm_test_exit(mm))
+ return 0;
+
+ cond_resched();
+
+ pmd = pmdp_get_lockless(pmdp);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
+ ptl = pmd_lock(mm, pmdp);
+ pmd = pmdp_get(pmdp);
+
+ if (!pmd_present(pmd)) {
+ goto not_found_unlock;
+ } else if (pmd_leaf(pmd)) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (!page)
+ goto not_found_unlock;
+ folio = page_folio(page);
+
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+ goto not_found_unlock;
+
+ page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+ goto found_unlock;
+ }
+ spin_unlock(ptl);
+ }
+
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+ pte = ptep_get(ptep);
+
+ if (!pte_present(pte))
+ continue;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ continue;
+ folio = page_folio(page);
+
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+ continue;
+ goto found_unlock;
+ }
+
+not_found_unlock:
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ return 0;
+found_unlock:
+ folio_get(folio);
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ private->page = page;
+ private->folio = folio;
+ private->addr = addr;
+ return 1;
+}
+
+static struct mm_walk_ops ksm_next_page_ops = {
+ .pmd_entry = ksm_next_page_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
@@ -2542,21 +2631,27 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
+ struct ksm_next_page_arg ksm_next_page_arg;
struct page *tmp_page = NULL;
- struct folio_walk fw;
struct folio *folio;
if (ksm_test_exit(mm))
break;
- folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
- if (folio) {
- if (!folio_is_zone_device(folio) &&
- folio_test_anon(folio)) {
- folio_get(folio);
- tmp_page = fw.page;
- }
- folio_walk_end(&fw, vma);
+ int found;
+
+ found = walk_page_range_vma(vma, ksm_scan.address,
+ vma->vm_end,
+ &ksm_next_page_ops,
+ &ksm_next_page_arg);
+
+ if (found > 0) {
+ folio = ksm_next_page_arg.folio;
+ tmp_page = ksm_next_page_arg.page;
+ ksm_scan.address = ksm_next_page_arg.addr;
+ } else {
+ VM_WARN_ON_ONCE(found < 0);
+ ksm_scan.address = vma->vm_end - PAGE_SIZE;
}
if (tmp_page) {
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH 5.10.y 1/2] mm/pagewalk: add walk_page_range_vma()
2025-11-20 15:54 FAILED: patch "[PATCH] ksm: use range-walk function to jump over holes in" failed to apply to 5.10-stable tree gregkh
@ 2026-01-17 23:38 ` Pedro Demarchi Gomes
2026-01-17 23:38 ` [PATCH 5.10.y 2/2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
0 siblings, 1 reply; 3+ messages in thread
From: Pedro Demarchi Gomes @ 2026-01-17 23:38 UTC (permalink / raw)
To: stable
Cc: Zhi.Yang, David Hildenbrand, Andrea Arcangeli, Hugh Dickins,
Jason Gunthorpe, John Hubbard, Matthew Wilcox (Oracle), Peter Xu,
Shuah Khan, Vlastimil Babka, Andrew Morton, Pedro Demarchi Gomes
From: David Hildenbrand <david@redhat.com>
[ Upstream commit e07cda5f232fac4de0925d8a4c92e51e41fa2f6e ]
Let's add walk_page_range_vma(), which is similar to walk_page_vma(),
however, is only interested in a subset of the VMA range.
To be used in KSM code to stop using follow_page() next.
Link: https://lkml.kernel.org/r/20221021101141.84170-8-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Stable-dep-of: f5548c318d6 ("ksm: use range-walk function to jump over holes in scan_get_next_rmap_item")
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
include/linux/pagewalk.h | 3 +++
mm/pagewalk.c | 20 ++++++++++++++++++++
2 files changed, 23 insertions(+)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index b1cb6b753abb..3670b46cedb1 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -99,6 +99,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
pgd_t *pgd,
void *private);
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private);
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 371ec21a1989..3f5cbd6eb4e1 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -461,6 +461,26 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
return walk_pgd_range(start, end, &walk);
}
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
+
+ if (start >= end || !walk.mm)
+ return -EINVAL;
+ if (start < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ mmap_assert_locked(walk.mm);
+ return __walk_page_range(start, end, &walk);
+}
+
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private)
{
--
2.47.3
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH 5.10.y 2/2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2026-01-17 23:38 ` [PATCH 5.10.y 1/2] mm/pagewalk: add walk_page_range_vma() Pedro Demarchi Gomes
@ 2026-01-17 23:38 ` Pedro Demarchi Gomes
0 siblings, 0 replies; 3+ messages in thread
From: Pedro Demarchi Gomes @ 2026-01-17 23:38 UTC (permalink / raw)
To: stable
Cc: Zhi.Yang, Pedro Demarchi Gomes, David Hildenbrand, craftfever,
Chengming Zhou, xu xin, Andrew Morton
[ Upstream commit f5548c318d6520d4fa3c5ed6003eeb710763cbc5 ]
Currently, scan_get_next_rmap_item() walks every page address in a VMA to
locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions, causing
ksmd to use large amount of cpu without deduplicating much pages.
This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups. This problem was
previously discussed in [1].
Consider the following test program which creates a 32 TiB mapping in the
virtual address space but only populates a single page:
/* 32 TiB */
const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;
int main() {
char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
if (area == MAP_FAILED) {
perror("mmap() failed\n");
return -1;
}
/* Populate a single page such that we get an anon_vma. */
*area = 0;
/* Enable KSM. */
madvise(area, size, MADV_MERGEABLE);
pause();
return 0;
}
$ ./ksm-sparse &
$ echo 1 > /sys/kernel/mm/ksm/run
Without this patch ksmd uses 100% of the cpu for a long time (more then 1
hour in my test machine) scanning all the 32 TiB virtual address space
that contain only one mapped page. This makes ksmd essentially deadlocked
not able to deduplicate anything of value. With this patch ksmd walks
only the one mapped page and skips the rest of the 32 TiB virtual address
space, making the scan fast using little cpu.
Link: https://lkml.kernel.org/r/20251023035841.41406-1-pedrodemargomes@gmail.com
Link: https://lkml.kernel.org/r/20251022153059.22763-1-pedrodemargomes@gmail.com
Link: https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/ [1]
Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ change folio to page, replace pmdp_get_lockless with pmd_read_atomic and pmdp_get with
READ_ONCE(*pmdp) ]
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
mm/ksm.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 105 insertions(+), 10 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 25b8362a4f89..94d17fb3f184 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -38,6 +38,7 @@
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/pagewalk.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -2223,6 +2224,89 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
return rmap_item;
}
+struct ksm_next_page_arg {
+ struct page *page;
+ unsigned long addr;
+};
+
+static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct ksm_next_page_arg *private = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *start_ptep = NULL, *ptep, pte;
+ struct mm_struct *mm = walk->mm;
+ struct page *page;
+ spinlock_t *ptl;
+ pmd_t pmd;
+
+ if (ksm_test_exit(mm))
+ return 0;
+
+ cond_resched();
+
+ pmd = pmd_read_atomic(pmdp);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
+ ptl = pmd_lock(mm, pmdp);
+ pmd = READ_ONCE(*pmdp);
+
+ if (!pmd_present(pmd)) {
+ goto not_found_unlock;
+ } else if (pmd_leaf(pmd)) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (!page)
+ goto not_found_unlock;
+
+ if (is_zone_device_page(page) || !PageAnon(page))
+ goto not_found_unlock;
+
+ page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+ goto found_unlock;
+ }
+ spin_unlock(ptl);
+ }
+
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+ pte = ptep_get(ptep);
+
+ if (!pte_present(pte))
+ continue;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ continue;
+
+ if (is_zone_device_page(page) || !PageAnon(page))
+ continue;
+ goto found_unlock;
+ }
+
+not_found_unlock:
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ return 0;
+found_unlock:
+ get_page(page);
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ private->page = page;
+ private->addr = addr;
+ return 1;
+}
+
+static struct mm_walk_ops ksm_next_page_ops = {
+ .pmd_entry = ksm_next_page_pmd_entry,
+};
+
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
@@ -2302,29 +2386,40 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
+ struct ksm_next_page_arg ksm_next_page_arg;
+ struct page *tmp_page = NULL;
+ int found;
+
if (ksm_test_exit(mm))
break;
- *page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (IS_ERR_OR_NULL(*page)) {
- ksm_scan.address += PAGE_SIZE;
- cond_resched();
- continue;
+
+ found = walk_page_range_vma(vma, ksm_scan.address,
+ vma->vm_end,
+ &ksm_next_page_ops,
+ &ksm_next_page_arg);
+
+ if (found > 0) {
+ tmp_page = ksm_next_page_arg.page;
+ ksm_scan.address = ksm_next_page_arg.addr;
+ } else {
+ VM_WARN_ON_ONCE(found < 0);
+ ksm_scan.address = vma->vm_end - PAGE_SIZE;
}
- if (PageAnon(*page)) {
- flush_anon_page(vma, *page, ksm_scan.address);
- flush_dcache_page(*page);
+ if (tmp_page) {
+ flush_anon_page(vma, tmp_page, ksm_scan.address);
+ flush_dcache_page(tmp_page);
rmap_item = get_next_rmap_item(slot,
ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
ksm_scan.rmap_list =
&rmap_item->rmap_list;
ksm_scan.address += PAGE_SIZE;
+ *page = tmp_page;
} else
- put_page(*page);
+ put_page(tmp_page);
mmap_read_unlock(mm);
return rmap_item;
}
- put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
--
2.47.3
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-01-17 23:38 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-20 15:54 FAILED: patch "[PATCH] ksm: use range-walk function to jump over holes in" failed to apply to 5.10-stable tree gregkh
2026-01-17 23:38 ` [PATCH 5.10.y 1/2] mm/pagewalk: add walk_page_range_vma() Pedro Demarchi Gomes
2026-01-17 23:38 ` [PATCH 5.10.y 2/2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox