From: Nico Pache <npache@redhat.com>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: aarcange@redhat.com, akpm@linux-foundation.org,
anshuman.khandual@arm.com, apopple@nvidia.com, baohua@kernel.org,
baolin.wang@linux.alibaba.com, byungchul@sk.com,
catalin.marinas@arm.com, cl@gentwo.org, corbet@lwn.net,
dave.hansen@linux.intel.com, david@kernel.org, dev.jain@arm.com,
gourry@gourry.net, hannes@cmpxchg.org, hughd@google.com,
jackmanb@google.com, jack@suse.cz, jannh@google.com,
jglisse@google.com, joshua.hahnjy@gmail.com, kas@kernel.org,
lance.yang@linux.dev, Liam.Howlett@oracle.com,
lorenzo.stoakes@oracle.com, mathieu.desnoyers@efficios.com,
matthew.brost@intel.com, mhiramat@kernel.org, mhocko@suse.com,
npache@redhat.com, peterx@redhat.com, pfalcato@suse.de,
rakie.kim@sk.com, raquini@redhat.com, rdunlap@infradead.org,
richard.weiyang@gmail.com, rientjes@google.com,
rostedt@goodmis.org, rppt@kernel.org, ryan.roberts@arm.com,
shivankg@amd.com, sunnanyong@huawei.com, surenb@google.com,
thomas.hellstrom@linux.intel.com, tiwai@suse.de,
usamaarif642@gmail.com, vbabka@suse.cz, vishal.moola@gmail.com,
wangkefeng.wang@huawei.com, will@kernel.org, willy@infradead.org,
yang@os.amperecomputing.com, ying.huang@linux.alibaba.com,
ziy@nvidia.com, zokeefe@google.com,
"Lorenzo Stoakes (Oracle)" <ljs@kernel.org>
Subject: [PATCH mm-unstable v4 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd()
Date: Wed, 25 Mar 2026 05:40:22 -0600 [thread overview]
Message-ID: <20260325114022.444081-6-npache@redhat.com> (raw)
In-Reply-To: <20260325114022.444081-1-npache@redhat.com>
The khugepaged daemon and madvise_collapse have two different
implementations that do almost the same thing. Create collapse_single_pmd
to increase code reuse and create an entry point to these two users.
Refactor madvise_collapse and collapse_scan_mm_slot to use the new
collapse_single_pmd function. To help reduce confusion around the
mmap_locked variable, we rename mmap_locked to lock_dropped in the
collapse_scan_mm_slot() function, and remove the redundant mmap_locked
in madvise_collapse(); this further unifies the code readiblity. the
SCAN_PTE_MAPPED_HUGEPAGE enum is no longer reachable in the
madvise_collapse() function, so we drop it from the list of "continuing"
enums.
This introduces a minor behavioral change that is most likely an
undiscovered bug. The current implementation of khugepaged tests
collapse_test_exit_or_disable() before calling collapse_pte_mapped_thp,
but we weren't doing it in the madvise_collapse case. By unifying these
two callers madvise_collapse now also performs this check. We also modify
the return value to be SCAN_ANY_PROCESS which properly indicates that this
process is no longer valid to operate on.
By moving the madvise_collapse writeback-retry logic into the helper
function we can also avoid having to revalidate the VMA.
We guard the khugepaged_pages_collapsed variable to ensure its only
incremented for khugepaged.
As requested we also convert a VM_BUG_ON to a VM_WARN_ON.
Reviewed-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 142 ++++++++++++++++++++++++------------------------
1 file changed, 72 insertions(+), 70 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3728a2cf133c..d06d84219e1b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1257,7 +1257,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
- bool *mmap_locked, struct collapse_control *cc)
+ bool *lock_dropped, struct collapse_control *cc)
{
pmd_t *pmd;
pte_t *pte, *_pte;
@@ -1432,7 +1432,7 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
result = collapse_huge_page(mm, start_addr, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
- *mmap_locked = false;
+ *lock_dropped = true;
}
out:
trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
@@ -2424,6 +2424,67 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
return result;
}
+/*
+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
+ * the results.
+ */
+static enum scan_result collapse_single_pmd(unsigned long addr,
+ struct vm_area_struct *vma, bool *lock_dropped,
+ struct collapse_control *cc)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ bool triggered_wb = false;
+ enum scan_result result;
+ struct file *file;
+ pgoff_t pgoff;
+
+ mmap_assert_locked(mm);
+
+ if (vma_is_anonymous(vma)) {
+ result = collapse_scan_pmd(mm, vma, addr, lock_dropped, cc);
+ goto end;
+ }
+
+ file = get_file(vma->vm_file);
+ pgoff = linear_page_index(vma, addr);
+
+ mmap_read_unlock(mm);
+ *lock_dropped = true;
+retry:
+ result = collapse_scan_file(mm, addr, file, pgoff, cc);
+
+ /*
+ * For MADV_COLLAPSE, when encountering dirty pages, try to writeback,
+ * then retry the collapse one time.
+ */
+ if (!cc->is_khugepaged && result == SCAN_PAGE_DIRTY_OR_WRITEBACK &&
+ !triggered_wb && mapping_can_writeback(file->f_mapping)) {
+ const loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+ const loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+ filemap_write_and_wait_range(file->f_mapping, lstart, lend);
+ triggered_wb = true;
+ goto retry;
+ }
+ fput(file);
+
+ if (result == SCAN_PTE_MAPPED_HUGEPAGE) {
+ mmap_read_lock(mm);
+ if (collapse_test_exit_or_disable(mm))
+ result = SCAN_ANY_PROCESS;
+ else
+ result = try_collapse_pte_mapped_thp(mm, addr,
+ !cc->is_khugepaged);
+ if (result == SCAN_PMD_MAPPED)
+ result = SCAN_SUCCEED;
+ mmap_read_unlock(mm);
+ }
+end:
+ if (cc->is_khugepaged && result == SCAN_SUCCEED)
+ ++khugepaged_pages_collapsed;
+ return result;
+}
+
static void collapse_scan_mm_slot(unsigned int progress_max,
enum scan_result *result, struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
@@ -2485,46 +2546,21 @@ static void collapse_scan_mm_slot(unsigned int progress_max,
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
while (khugepaged_scan.address < hend) {
- bool mmap_locked = true;
+ bool lock_dropped = false;
cond_resched();
if (unlikely(collapse_test_exit_or_disable(mm)))
goto breakouterloop;
- VM_BUG_ON(khugepaged_scan.address < hstart ||
+ VM_WARN_ON_ONCE(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma,
- khugepaged_scan.address);
-
- mmap_read_unlock(mm);
- mmap_locked = false;
- *result = collapse_scan_file(mm,
- khugepaged_scan.address, file, pgoff, cc);
- fput(file);
- if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
- mmap_read_lock(mm);
- if (collapse_test_exit_or_disable(mm))
- goto breakouterloop;
- *result = try_collapse_pte_mapped_thp(mm,
- khugepaged_scan.address, false);
- if (*result == SCAN_PMD_MAPPED)
- *result = SCAN_SUCCEED;
- mmap_read_unlock(mm);
- }
- } else {
- *result = collapse_scan_pmd(mm, vma,
- khugepaged_scan.address, &mmap_locked, cc);
- }
-
- if (*result == SCAN_SUCCEED)
- ++khugepaged_pages_collapsed;
+ *result = collapse_single_pmd(khugepaged_scan.address,
+ vma, &lock_dropped, cc);
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
- if (!mmap_locked)
+ if (lock_dropped)
/*
* We released mmap_lock so break loop. Note
* that we drop mmap_lock before all hugepage
@@ -2799,7 +2835,6 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
unsigned long hstart, hend, addr;
enum scan_result last_fail = SCAN_FAIL;
int thps = 0;
- bool mmap_locked = true;
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
@@ -2821,13 +2856,11 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
enum scan_result result = SCAN_FAIL;
- bool triggered_wb = false;
-retry:
- if (!mmap_locked) {
+ if (*lock_dropped) {
cond_resched();
mmap_read_lock(mm);
- mmap_locked = true;
+ *lock_dropped = false;
result = hugepage_vma_revalidate(mm, addr, false, &vma,
cc);
if (result != SCAN_SUCCEED) {
@@ -2837,45 +2870,14 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
- mmap_assert_locked(mm);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma, addr);
- mmap_read_unlock(mm);
- mmap_locked = false;
- *lock_dropped = true;
- result = collapse_scan_file(mm, addr, file, pgoff, cc);
-
- if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
- mapping_can_writeback(file->f_mapping)) {
- loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
- loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
-
- filemap_write_and_wait_range(file->f_mapping, lstart, lend);
- triggered_wb = true;
- fput(file);
- goto retry;
- }
- fput(file);
- } else {
- result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc);
- }
- if (!mmap_locked)
- *lock_dropped = true;
+ result = collapse_single_pmd(addr, vma, lock_dropped, cc);
-handle_result:
switch (result) {
case SCAN_SUCCEED:
case SCAN_PMD_MAPPED:
++thps;
break;
- case SCAN_PTE_MAPPED_HUGEPAGE:
- BUG_ON(mmap_locked);
- mmap_read_lock(mm);
- result = try_collapse_pte_mapped_thp(mm, addr, true);
- mmap_read_unlock(mm);
- goto handle_result;
/* Whitelisted set of results where continuing OK */
case SCAN_NO_PTE_TABLE:
case SCAN_PTE_NON_PRESENT:
@@ -2898,7 +2900,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
out_maybelock:
/* Caller expects us to hold mmap_lock on return */
- if (!mmap_locked)
+ if (*lock_dropped)
mmap_read_lock(mm);
out_nolock:
mmap_assert_locked(mm);
--
2.53.0
next prev parent reply other threads:[~2026-03-25 11:42 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-25 11:40 [PATCH mm-unstable v4 0/5] mm: khugepaged cleanups and mTHP prerequisites Nico Pache
2026-03-25 11:40 ` [PATCH mm-unstable v4 1/5] mm: consolidate anonymous folio PTE mapping into helpers Nico Pache
2026-03-25 11:40 ` [PATCH mm-unstable v4 2/5] mm: introduce is_pmd_order helper Nico Pache
2026-03-25 12:11 ` Lorenzo Stoakes (Oracle)
2026-03-25 14:45 ` Andrew Morton
2026-03-25 14:49 ` Lorenzo Stoakes (Oracle)
2026-03-25 16:05 ` Andrew Morton
2026-03-25 11:40 ` [PATCH mm-unstable v4 3/5] mm/khugepaged: define KHUGEPAGED_MAX_PTES_LIMIT as HPAGE_PMD_NR - 1 Nico Pache
2026-03-25 11:40 ` [PATCH mm-unstable v4 4/5] mm/khugepaged: rename hpage_collapse_* to collapse_* Nico Pache
2026-03-25 12:08 ` Lorenzo Stoakes (Oracle)
2026-03-25 11:40 ` Nico Pache [this message]
2026-03-31 14:01 ` [PATCH mm-unstable v4 5/5] mm/khugepaged: unify khugepaged and madv_collapse with collapse_single_pmd() Lorenzo Stoakes (Oracle)
2026-03-31 14:13 ` David Hildenbrand (Arm)
2026-03-31 14:15 ` Lorenzo Stoakes (Oracle)
2026-03-31 14:46 ` David Hildenbrand (Arm)
2026-03-31 20:00 ` David Hildenbrand (Arm)
2026-03-31 20:06 ` Lorenzo Stoakes (Oracle)
2026-03-31 20:50 ` David Hildenbrand (Arm)
2026-03-31 21:03 ` David Hildenbrand (Arm)
2026-03-31 21:09 ` Nico Pache
2026-04-01 8:14 ` Lorenzo Stoakes (Oracle)
2026-04-01 20:31 ` Andrew Morton
2026-04-07 8:38 ` Lorenzo Stoakes (Oracle)
2026-04-07 21:42 ` Andrew Morton
2026-04-08 6:42 ` Lorenzo Stoakes
2026-03-31 21:35 ` Andrew Morton
2026-03-31 21:49 ` Nico Pache
2026-04-01 7:05 ` David Hildenbrand (Arm)
2026-04-01 8:17 ` Lorenzo Stoakes (Oracle)
2026-03-31 19:46 ` Nico Pache
2026-03-31 19:59 ` Lorenzo Stoakes (Oracle)
2026-03-31 16:29 ` Lance Yang
2026-03-31 19:59 ` Nico Pache
2026-03-25 11:44 ` [PATCH mm-unstable v4 0/5] mm: khugepaged cleanups and mTHP prerequisites Lorenzo Stoakes (Oracle)
2026-03-26 0:25 ` Andrew Morton
2026-03-26 4:44 ` Roman Gushchin
2026-03-26 16:48 ` Nico Pache
2026-03-26 17:35 ` Andrew Morton
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260325114022.444081-6-npache@redhat.com \
--to=npache@redhat.com \
--cc=Liam.Howlett@oracle.com \
--cc=aarcange@redhat.com \
--cc=akpm@linux-foundation.org \
--cc=anshuman.khandual@arm.com \
--cc=apopple@nvidia.com \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=byungchul@sk.com \
--cc=catalin.marinas@arm.com \
--cc=cl@gentwo.org \
--cc=corbet@lwn.net \
--cc=dave.hansen@linux.intel.com \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=gourry@gourry.net \
--cc=hannes@cmpxchg.org \
--cc=hughd@google.com \
--cc=jack@suse.cz \
--cc=jackmanb@google.com \
--cc=jannh@google.com \
--cc=jglisse@google.com \
--cc=joshua.hahnjy@gmail.com \
--cc=kas@kernel.org \
--cc=lance.yang@linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mathieu.desnoyers@efficios.com \
--cc=matthew.brost@intel.com \
--cc=mhiramat@kernel.org \
--cc=mhocko@suse.com \
--cc=peterx@redhat.com \
--cc=pfalcato@suse.de \
--cc=rakie.kim@sk.com \
--cc=raquini@redhat.com \
--cc=rdunlap@infradead.org \
--cc=richard.weiyang@gmail.com \
--cc=rientjes@google.com \
--cc=rostedt@goodmis.org \
--cc=rppt@kernel.org \
--cc=ryan.roberts@arm.com \
--cc=shivankg@amd.com \
--cc=sunnanyong@huawei.com \
--cc=surenb@google.com \
--cc=thomas.hellstrom@linux.intel.com \
--cc=tiwai@suse.de \
--cc=usamaarif642@gmail.com \
--cc=vbabka@suse.cz \
--cc=vishal.moola@gmail.com \
--cc=wangkefeng.wang@huawei.com \
--cc=will@kernel.org \
--cc=willy@infradead.org \
--cc=yang@os.amperecomputing.com \
--cc=ying.huang@linux.alibaba.com \
--cc=ziy@nvidia.com \
--cc=zokeefe@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.