Re: [PATCH 7.2 v16 05/13] mm/khugepaged: generalize collapse_huge_page for mTHP collapse

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Usama Arif <usama.arif@linux.dev>
To: Nico Pache <npache@redhat.com>
Cc: Usama Arif <usama.arif@linux.dev>,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-trace-kernel@vger.kernel.org,
	akpm@linux-foundation.org, anshuman.khandual@arm.com,
	apopple@nvidia.com, baohua@kernel.org,
	baolin.wang@linux.alibaba.com, byungchul@sk.com,
	catalin.marinas@arm.com, cl@gentwo.org, corbet@lwn.net,
	dave.hansen@linux.intel.com, david@kernel.org, dev.jain@arm.com,
	gourry@gourry.net, hannes@cmpxchg.org, hughd@google.com,
	jack@suse.cz, jackmanb@google.com, jannh@google.com,
	jglisse@google.com, joshua.hahnjy@gmail.com, kas@kernel.org,
	lance.yang@linux.dev, Liam.Howlett@oracle.com, ljs@kernel.org,
	mathieu.desnoyers@efficios.com, matthew.brost@intel.com,
	mhiramat@kernel.org, mhocko@suse.com, peterx@redhat.com,
	pfalcato@suse.de, rakie.kim@sk.com, raquini@redhat.com,
	rdunlap@infradead.org, richard.weiyang@gmail.com,
	rientjes@google.com, rostedt@goodmis.org, rppt@kernel.org,
	ryan.roberts@arm.com, shivankg@amd.com, sunnanyong@huawei.com,
	surenb@google.com, thomas.hellstrom@linux.intel.com,
	tiwai@suse.de, usamaarif642@gmail.com, vbabka@suse.cz,
	vishal.moola@gmail.com, wangkefeng.wang@huawei.com,
	will@kernel.org, willy@infradead.org,
	yang@os.amperecomputing.com, ying.huang@linux.alibaba.com,
	ziy@nvidia.com, zokeefe@google.com
Subject: Re: [PATCH 7.2 v16 05/13] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
Date: Mon, 20 Apr 2026 07:20:54 -0700	[thread overview]
Message-ID: <20260420142057.392263-1-usama.arif@linux.dev> (raw)
In-Reply-To: <20260419185750.260784-6-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:42 -0600 Nico Pache <npache@redhat.com> wrote:

> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates were in the PMD to
> start the collapse attempt.
> 
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> access/changes to the page tables. This can happen if the rmap walkers hit
> a pmd_none while the PMD entry is currently unavailable due to being
> temporarily removed during the collapse phase.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 103 +++++++++++++++++++++++++++---------------------
>  1 file changed, 57 insertions(+), 46 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 283bb63854a5..ff6f9f1883ed 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1198,42 +1198,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>  	return SCAN_SUCCEED;
>  }
>  
> -static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
> -		int referenced, int unmapped, struct collapse_control *cc)
> +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
> +		int referenced, int unmapped, struct collapse_control *cc,
> +		unsigned int order)
>  {
>  	LIST_HEAD(compound_pagelist);
>  	pmd_t *pmd, _pmd;
> -	pte_t *pte;
> +	pte_t *pte = NULL;
>  	pgtable_t pgtable;
>  	struct folio *folio;
>  	spinlock_t *pmd_ptl, *pte_ptl;
>  	enum scan_result result = SCAN_FAIL;
>  	struct vm_area_struct *vma;
>  	struct mmu_notifier_range range;
> +	bool anon_vma_locked = false;
> +	const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
> +	const unsigned long end_addr = start_addr + (PAGE_SIZE << order);
>  
> -	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -
> -	/*
> -	 * Before allocating the hugepage, release the mmap_lock read lock.
> -	 * The allocation can take potentially a long time if it involves
> -	 * sync compaction, and we do not need to hold the mmap_lock during
> -	 * that. We will recheck the vma after taking it again in write mode.
> -	 */
> -	mmap_read_unlock(mm);
> -

My understanding now is that the caller will need to drop the mmap_read lock?

This needs explicit documentation at the start of the function IMO. I think
there are callers in later patches and its very easy to miss this.


> -	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> +	result = alloc_charge_folio(&folio, mm, cc, order);
>  	if (result != SCAN_SUCCEED)
>  		goto out_nolock;
>  
>  	mmap_read_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> -					 HPAGE_PMD_ORDER);
> +	result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
> +					 &vma, cc, order);
>  	if (result != SCAN_SUCCEED) {
>  		mmap_read_unlock(mm);
>  		goto out_nolock;
>  	}
>  
> -	result = find_pmd_or_thp_or_none(mm, address, &pmd);
> +	result = find_pmd_or_thp_or_none(mm, pmd_addr, &pmd);
>  	if (result != SCAN_SUCCEED) {
>  		mmap_read_unlock(mm);
>  		goto out_nolock;
> @@ -1245,8 +1239,8 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  		 * released when it fails. So we jump out_nolock directly in
>  		 * that case.  Continuing to collapse causes inconsistency.
>  		 */
> -		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> -						     referenced, HPAGE_PMD_ORDER);
> +		result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
> +						     referenced, order);
>  		if (result != SCAN_SUCCEED)
>  			goto out_nolock;
>  	}
> @@ -1261,20 +1255,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * mmap_lock.
>  	 */
>  	mmap_write_lock(mm);
> -	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> -					 HPAGE_PMD_ORDER);
> +	result = hugepage_vma_revalidate(mm, pmd_addr, /*expect_anon=*/ true,
> +					 &vma, cc, order);
>  	if (result != SCAN_SUCCEED)
>  		goto out_up_write;
>  	/* check if the pmd is still valid */
>  	vma_start_write(vma);
> -	result = check_pmd_still_valid(mm, address, pmd);
> +	result = check_pmd_still_valid(mm, pmd_addr, pmd);
>  	if (result != SCAN_SUCCEED)
>  		goto out_up_write;
>  
>  	anon_vma_lock_write(vma->anon_vma);
> +	anon_vma_locked = true;
>  
> -	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> -				address + HPAGE_PMD_SIZE);
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
> +				end_addr);
>  	mmu_notifier_invalidate_range_start(&range);
>  
>  	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> @@ -1286,26 +1281,23 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * Parallel GUP-fast is fine since GUP-fast will back off when
>  	 * it detects PMD is changed.
>  	 */
> -	_pmd = pmdp_collapse_flush(vma, address, pmd);
> +	_pmd = pmdp_collapse_flush(vma, pmd_addr, pmd);


For an mTHP collapse covering, say, 64KiB of a 2MiB PMD, the patch still
flushes the entire PMD via pmdp_collapse_flush and tlb_remove_table_sync_one.
That triggers cross-CPU TLB shootdowns for ~1.94MiB of unrelated mappings on
every successful sub-PMD collapse. Probably acceptable as a first cut?


>  	spin_unlock(pmd_ptl);
>  	mmu_notifier_invalidate_range_end(&range);
>  	tlb_remove_table_sync_one();
>  
> -	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> +	pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
>  	if (pte) {
> -		result = __collapse_huge_page_isolate(vma, address, pte, cc,
> -						      HPAGE_PMD_ORDER,
> -						      &compound_pagelist);
> +		result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
> +						      order, &compound_pagelist);
>  		spin_unlock(pte_ptl);
>  	} else {
>  		result = SCAN_NO_PTE_TABLE;
>  	}
>  
>  	if (unlikely(result != SCAN_SUCCEED)) {
> -		if (pte)
> -			pte_unmap(pte);
>  		spin_lock(pmd_ptl);
> -		BUG_ON(!pmd_none(*pmd));
> +		WARN_ON_ONCE(!pmd_none(*pmd));

Why was this turned into WARN_ON_ONCE? Would be good to add to commit message
what the reason is if it has been discussed earlier.

The next line writes a PMD entry over an existing one — that leaks the
previous page table or PMD-mapped folio and can corrupt VA mappings. BUG_ON
failed loudly and safely; WARN_ON_ONCE continues into the corruption and falls
silent after the first hit.


>  		/*
>  		 * We can only use set_pmd_at when establishing
>  		 * hugepmds and never for establishing regular pmds that
> @@ -1313,21 +1305,24 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  		 */
>  		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>  		spin_unlock(pmd_ptl);
> -		anon_vma_unlock_write(vma->anon_vma);
>  		goto out_up_write;
>  	}
>  
>  	/*
> -	 * All pages are isolated and locked so anon_vma rmap
> -	 * can't run anymore.
> +	 * For PMD collapse all pages are isolated and locked so anon_vma
> +	 * rmap can't run anymore. For mTHP collapse the PMD entry has been
> +	 * removed and not all pages are isolated and locked, so we must hold
> +	 * the lock to prevent neighboring folios from attempting to access
> +	 * this PMD until its reinstalled.
>  	 */
> -	anon_vma_unlock_write(vma->anon_vma);
> +	if (is_pmd_order(order)) {
> +		anon_vma_unlock_write(vma->anon_vma);
> +		anon_vma_locked = false;
> +	}
>  
>  	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> -					   vma, address, pte_ptl,
> -					   HPAGE_PMD_ORDER,
> -					   &compound_pagelist);
> -	pte_unmap(pte);
> +					   vma, start_addr, pte_ptl,
> +					   order, &compound_pagelist);
>  	if (unlikely(result != SCAN_SUCCEED))
>  		goto out_up_write;
>  
> @@ -1337,18 +1332,27 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 * write.
>  	 */
>  	__folio_mark_uptodate(folio);
> -	pgtable = pmd_pgtable(_pmd);
> -
>  	spin_lock(pmd_ptl);
> -	BUG_ON(!pmd_none(*pmd));
> -	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> -	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> +	WARN_ON_ONCE(!pmd_none(*pmd));
> +	if (is_pmd_order(order)) { /* PMD collapse */
> +		pgtable = pmd_pgtable(_pmd);
> +		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
> +	} else { /* mTHP collapse */
> +		map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);

map_anon_folio_pte_nopf calls set_ptes and modifies pagetable, while holding
pmd_ptl only. It should be safe as we expect pmd_none. But I think you should
put a comment about this?


> +		smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
> +		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> +	}
>  	spin_unlock(pmd_ptl);
>  
>  	folio = NULL;
>  
>  	result = SCAN_SUCCEED;
>  out_up_write:
> +	if (anon_vma_locked)
> +		anon_vma_unlock_write(vma->anon_vma);
> +	if (pte)
> +		pte_unmap(pte);
>  	mmap_write_unlock(mm);
>  out_nolock:
>  	if (folio)
> @@ -1525,8 +1529,15 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  out_unmap:
>  	pte_unmap_unlock(pte, ptl);
>  	if (result == SCAN_SUCCEED) {
> +		/*
> +		 * Before allocating the hugepage, release the mmap_lock read lock.
> +		 * The allocation can take potentially a long time if it involves
> +		 * sync compaction, and we do not need to hold the mmap_lock during
> +		 * that. We will recheck the vma after taking it again in write mode.
> +		 */
> +		mmap_read_unlock(mm);
>  		result = collapse_huge_page(mm, start_addr, referenced,
> -					    unmapped, cc);
> +					    unmapped, cc, HPAGE_PMD_ORDER);
>  		/* collapse_huge_page will return with the mmap_lock released */
>  		*lock_dropped = true;
>  	}
> -- 
> 2.53.0
> 
>

next prev parent reply	other threads:[~2026-04-20 14:21 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-19 18:57 [PATCH 7.2 v16 00/13] khugepaged: mTHP support Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 01/13] mm/khugepaged: generalize hugepage_vma_revalidate for " Nico Pache
2026-04-20 12:59   ` Usama Arif
2026-04-19 18:57 ` [PATCH 7.2 v16 02/13] mm/khugepaged: generalize alloc_charge_folio() Nico Pache
2026-04-20 13:05   ` Usama Arif
2026-04-27 19:41   ` David Hildenbrand (Arm)
2026-04-29 14:36     ` Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 03/13] mm/khugepaged: rework max_ptes_* handling with helper functions Nico Pache
2026-04-20 13:15   ` Usama Arif
2026-04-29 14:43     ` Nico Pache
2026-04-27 19:52   ` David Hildenbrand (Arm)
2026-04-29 14:48     ` Nico Pache
2026-05-08 11:11   ` Lance Yang
2026-04-19 18:57 ` [PATCH 7.2 v16 04/13] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
2026-04-20 13:55   ` Usama Arif
2026-04-27 20:06     ` David Hildenbrand (Arm)
2026-04-29 15:05     ` Nico Pache
2026-04-27 20:07   ` David Hildenbrand (Arm)
2026-04-29 15:12     ` Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 05/13] mm/khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
2026-04-20 14:20   ` Usama Arif [this message]
2026-04-29 15:34     ` Nico Pache
2026-04-27 20:13   ` David Hildenbrand (Arm)
2026-04-29 15:34     ` Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 06/13] mm/khugepaged: skip collapsing mTHP to smaller orders Nico Pache
2026-04-20 15:36   ` Usama Arif
2026-04-19 18:57 ` [PATCH 7.2 v16 07/13] mm/khugepaged: add per-order mTHP collapse failure statistics Nico Pache
2026-04-27 20:21   ` David Hildenbrand (Arm)
2026-04-29 18:21     ` Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 08/13] mm/khugepaged: improve tracepoints for mTHP orders Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 09/13] mm/khugepaged: introduce collapse_allowable_orders helper function Nico Pache
2026-04-27 20:24   ` David Hildenbrand (Arm)
2026-04-29 18:22     ` Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 10/13] mm/khugepaged: Introduce mTHP collapse support Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 11/13] mm/khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 12/13] mm/khugepaged: run khugepaged for all orders Nico Pache
2026-04-19 18:57 ` [PATCH 7.2 v16 13/13] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
2026-04-23 20:30 ` [PATCH 7.2 v16 00/13] khugepaged: mTHP support Andrew Morton
2026-04-24 13:58 ` Andrew Morton
2026-04-24 14:05   ` Matthew Brost
2026-04-24 14:19     ` Andrew Morton
2026-04-24 14:24       ` Matthew Brost
2026-05-03 12:23   ` Nico Pache
2026-05-03 13:21     ` Andrew Morton
2026-05-03 13:32       ` Andrew Morton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260420142057.392263-1-usama.arif@linux.dev \
    --to=usama.arif@linux.dev \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=anshuman.khandual@arm.com \
    --cc=apopple@nvidia.com \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=byungchul@sk.com \
    --cc=catalin.marinas@arm.com \
    --cc=cl@gentwo.org \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=gourry@gourry.net \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=jack@suse.cz \
    --cc=jackmanb@google.com \
    --cc=jannh@google.com \
    --cc=jglisse@google.com \
    --cc=joshua.hahnjy@gmail.com \
    --cc=kas@kernel.org \
    --cc=lance.yang@linux.dev \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=ljs@kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=matthew.brost@intel.com \
    --cc=mhiramat@kernel.org \
    --cc=mhocko@suse.com \
    --cc=npache@redhat.com \
    --cc=peterx@redhat.com \
    --cc=pfalcato@suse.de \
    --cc=rakie.kim@sk.com \
    --cc=raquini@redhat.com \
    --cc=rdunlap@infradead.org \
    --cc=richard.weiyang@gmail.com \
    --cc=rientjes@google.com \
    --cc=rostedt@goodmis.org \
    --cc=rppt@kernel.org \
    --cc=ryan.roberts@arm.com \
    --cc=shivankg@amd.com \
    --cc=sunnanyong@huawei.com \
    --cc=surenb@google.com \
    --cc=thomas.hellstrom@linux.intel.com \
    --cc=tiwai@suse.de \
    --cc=usamaarif642@gmail.com \
    --cc=vbabka@suse.cz \
    --cc=vishal.moola@gmail.com \
    --cc=wangkefeng.wang@huawei.com \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=yang@os.amperecomputing.com \
    --cc=ying.huang@linux.alibaba.com \
    --cc=ziy@nvidia.com \
    --cc=zokeefe@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.