From: Wei Yang <richard.weiyang@gmail.com>
To: "David Hildenbrand (Arm)" <david@kernel.org>
Cc: Wei Yang <richard.weiyang@gmail.com>,
	akpm@linux-foundation.org, ljs@kernel.org, ziy@nvidia.com,
	baolin.wang@linux.alibaba.com, Liam.Howlett@oracle.com,
	npache@redhat.com, ryan.roberts@arm.com, dev.jain@arm.com,
	baohua@kernel.org, lance.yang@linux.dev, riel@surriel.com,
	vbabka@kernel.org, harry@kernel.org, jannh@google.com,
	rppt@kernel.org, surenb@google.com, mhocko@suse.com,
	shuah@kernel.org, linux-mm@kvack.org,
	Gavin Guo <gavinguo@igalia.com>
Subject: Re: [PATCH 1/2] mm/huge_memory: return true if split_huge_pmd_locked() split PMD to migration entry
Date: Wed, 29 Apr 2026 02:49:13 +0000
Message-ID: <20260429024913.iepoi7cit3xnwca2@master>
In-Reply-To: <c71930ae-19d9-4b3b-a74d-3de3261c4d43@kernel.org>

On Tue, Apr 28, 2026 at 10:24:42AM +0200, David Hildenbrand (Arm) wrote:
>On 4/26/26 11:19, Wei Yang wrote:
>> On Fri, Apr 24, 2026 at 09:29:18PM +0200, David Hildenbrand (Arm) wrote:
>>> On 4/15/26 03:08, Wei Yang wrote:
>>>> When @freeze is set to true, split_huge_pmd_locked() is intended to
>>>> split the PMD into migration entries. But if it doesn't manage to
>>>> clear PageAnonExclusive(), it just splits the PMD and leaves the
>>>> folio mapped through PTEs.
>>>>
>>>> This patch lets split_huge_pmd_locked() return true to indicate that
>>>> it did split the PMD into migration entries. With this knowledge, we
>>>> can return directly from try_to_migrate_one() if it did.
>>>>
>>>> Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
>>>> Cc: Gavin Guo <gavinguo@igalia.com>
>>>> Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
>>>> Cc: Zi Yan <ziy@nvidia.com>
>>>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>> Cc: Lance Yang <lance.yang@linux.dev>
>>>> ---
>>>
>>> [...]
>>>
>>>>  static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
>>>>  					 unsigned long addr, pmd_t *pmdp,
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 970e077019b7..ec84bb4a0cc3 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3087,7 +3087,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
>>>>  	pmd_populate(mm, pmd, pgtable);
>>>>  }
>>>>  
>>>> -static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>> +static bool __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  		unsigned long haddr, bool freeze)
>>>>  {
>>>>  	struct mm_struct *mm = vma->vm_mm;
>>>> @@ -3096,7 +3096,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  	pgtable_t pgtable;
>>>>  	pmd_t old_pmd, _pmd;
>>>>  	bool soft_dirty, uffd_wp = false, young = false, write = false;
>>>> -	bool anon_exclusive = false, dirty = false;
>>>> +	bool anon_exclusive = false, dirty = false, ret = false;
>>>>  	unsigned long addr;
>>>>  	pte_t *pte;
>>>>  	int i;
>>>> @@ -3118,13 +3118,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  		if (arch_needs_pgtable_deposit())
>>>>  			zap_deposited_table(mm, pmd);
>>>>  		if (vma_is_special_huge(vma))
>>>> -			return;
>>>> +			return ret;
>>>
>>> Why not "return false" in these cases where it really can only ever be false?
>>>
>> 
>> Will adjust related places.
>> 
>>>>  		if (unlikely(pmd_is_migration_entry(old_pmd))) {
>>>>  			const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
>>>>  
>>>>  			folio = softleaf_to_folio(old_entry);
>>>>  		} else if (is_huge_zero_pmd(old_pmd)) {
>>>> -			return;
>>>> +			return ret;
>>>>  		} else {
>>>>  			page = pmd_page(old_pmd);
>>>>  			folio = page_folio(page);
>>>> @@ -3136,7 +3136,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  			folio_put(folio);
>>>>  		}
>>>>  		add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
>>>> -		return;
>>>> +		return ret;
>>>>  	}
>>>>  
>>>>  	if (is_huge_zero_pmd(*pmd)) {
>>>> @@ -3149,7 +3149,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  		 * small page also write protected so it does not seems useful
>>>>  		 * to invalidate secondary mmu at this time.
>>>>  		 */
>>>> -		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>>>> +		__split_huge_zero_page_pmd(vma, haddr, pmd);
>>>> +		return ret;
>>>>  	}
>>>>  
>>>>  	if (pmd_is_migration_entry(*pmd)) {
>>>> @@ -3309,6 +3310,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>>>>  			set_pte_at(mm, addr, pte + i, entry);
>>>>  		}
>>>> +		ret = true;
>>>>  	} else if (pmd_is_device_private_entry(old_pmd)) {
>>>>  		pte_t entry;
>>>>  		swp_entry_t swp_entry;
>>>> @@ -3366,14 +3368,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>  
>>>>  	smp_wmb(); /* make pte visible before pmd */
>>>>  	pmd_populate(mm, pmd, pgtable);
>>>> +	return ret;
>>>>  }
>>>>  
>>>> -void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>>> +bool split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>>>  			   pmd_t *pmd, bool freeze)
>>>>  {
>>>>  	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
>>>>  	if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd))
>>>> -		__split_huge_pmd_locked(vma, pmd, address, freeze);
>>>> +		return __split_huge_pmd_locked(vma, pmd, address, freeze);
>>>> +	else
>>>> +		return false;
>>>
>>> No need for the "else".
>>>
>> 
>> Got it.
>> 
>>>>  }
>>>>  
>>>>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index 78b7fb5f367c..91fb495bebbe 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -2464,13 +2464,18 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>>>>  
>>>>  			if (flags & TTU_SPLIT_HUGE_PMD) {
>>>>  				/*
>>>> -				 * split_huge_pmd_locked() might leave the
>>>> +				 * If split_huge_pmd_locked() did split the
>>>> +				 * PMD into migration entries, we are done.
>>>> +				 * Otherwise it may leave the
>>>>  				 * folio mapped through PTEs. Retry the walk
>>>>  				 * so we can detect this scenario and properly
>>>>  				 * abort the walk.
>>>
>>> Couldn't we just abort right away, based on the return value?
>>>
>> 
>> Here is my understanding.
>> 
>> We get here when page_vma_mapped_walk() touches a pmd entry, in one of
>> three cases:
>> 
>>   * pmd_trans_huge()
>>   * pmd_is_migration_entry()
>>   * pmd_is_device_private_entry()
>> 
>> For the first two cases, we grab pmd_lock() and then check that the
>> condition is still valid before returning. But for case 3, after
>> grabbing pmd_lock(), it returns directly.
>> 
>> This may give another thread a chance to split the device-private PMD
>> into PTE mappings, IIUC. In that case, we should restart the walk here.
>
>
>So what you are saying is that we should re-validate in page_vma_mapped_walk()
>that we indeed still have a device-private entry after grabbing the lock?
>
>That's what we do in map_pte() through the pmd_same() check.
>
>Likely we should apply the same model here!
>

Below is my proposed change:

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index a4d52fdb3056..6e915d35ae54 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -273,17 +273,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 
 			if (softleaf_is_device_private(entry)) {
 				pvmw->ptl = pmd_lock(mm, pvmw->pmd);
-				return true;
+				if (pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))
+					return true;
+				/* THP pmd was split under us: handle on pte level */
+				spin_unlock(pvmw->ptl);
+				pvmw->ptl = NULL;
+			} else {
+				if ((pvmw->flags & PVMW_SYNC) &&
+				    thp_vma_suitable_order(vma, pvmw->address,
+							   PMD_ORDER) &&
+				    (pvmw->nr_pages >= HPAGE_PMD_NR))
+					sync_with_folio_pmd_zap(mm, pvmw->pmd);
+
+				step_forward(pvmw, PMD_SIZE);
+				continue;
 			}
-
-			if ((pvmw->flags & PVMW_SYNC) &&
-			    thp_vma_suitable_order(vma, pvmw->address,
-						   PMD_ORDER) &&
-			    (pvmw->nr_pages >= HPAGE_PMD_NR))
-				sync_with_folio_pmd_zap(mm, pvmw->pmd);
-
-			step_forward(pvmw, PMD_SIZE);
-			continue;
 		}
 		if (!map_pte(pvmw, &pmde, &ptl)) {
 			if (!pvmw->pte)
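
To make the lock-then-revalidate model explicit, here is a minimal,
self-contained userspace C sketch of the pattern the hunk applies. All
names below (pmd_t as a plain integer, a pthread mutex standing in for
the pmd lock, the *_sketch helpers) are made-up stand-ins, not the real
kernel API:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long pmd_t;	/* stand-in: a PMD modeled as an integer */

static pmd_t pmd_entry;		/* the "PMD" under inspection */
static pthread_mutex_t pmd_lock_sketch = PTHREAD_MUTEX_INITIALIZER;

static pmd_t pmdp_get_lockless_sketch(pmd_t *pmdp)
{
	/* the real pmdp_get_lockless() guards against torn reads */
	return __atomic_load_n(pmdp, __ATOMIC_RELAXED);
}

static bool pmd_same_sketch(pmd_t a, pmd_t b)
{
	return a == b;
}

/*
 * Snapshot the entry locklessly, take the lock, then re-validate the
 * snapshot.  If another thread changed the entry in between (e.g. split
 * the PMD), drop the lock and report failure so the caller falls back
 * to the pte level -- the same shape as the hunk above.
 */
static bool pmd_level_still_valid(pmd_t *pmdp, pmd_t pmde)
{
	pthread_mutex_lock(&pmd_lock_sketch);
	if (pmd_same_sketch(pmde, pmdp_get_lockless_sketch(pmdp)))
		return true;	/* still valid, lock stays held */
	pthread_mutex_unlock(&pmd_lock_sketch);
	return false;		/* changed under us: handle on pte level */
}

int main(void)
{
	pmd_entry = 0x1000;
	pmd_t pmde = pmdp_get_lockless_sketch(&pmd_entry);

	if (pmd_level_still_valid(&pmd_entry, pmde)) {
		printf("entry still there, done at pmd level\n");
		pthread_mutex_unlock(&pmd_lock_sketch);
	} else {
		printf("pmd changed under us, retry on pte level\n");
	}
	return 0;
}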

After the page_vma_mapped_walk() change above, we could simplify the
logic in try_to_migrate_one() as:

@@ -2471,14 +2471,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                                 * so we can detect this scenario and properly
                                 * abort the walk.
                                 */
-                               if (split_huge_pmd_locked(vma, pvmw.address,
-                                                     pvmw.pmd, true)) {
-                                       page_vma_mapped_walk_done(&pvmw);
-                                       break;
-                               }
-                               flags &= ~TTU_SPLIT_HUGE_PMD;
-                               page_vma_mapped_walk_restart(&pvmw);
-                               continue;
+                               ret = split_huge_pmd_locked(vma, pvmw.address,
+                                                     pvmw.pmd, true);
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
                        }
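
For completeness, a tiny standalone sketch of how the return value would
feed back from the simplified branch. The *_sketch functions are
hypothetical stand-ins for the mm/ code, not the real implementations:

#include <stdbool.h>
#include <stdio.h>

/*
 * Stand-in for split_huge_pmd_locked(): per the patch, it returns true
 * only when the PMD was split into migration entries, i.e. when
 * PageAnonExclusive could be cleared.
 */
static bool split_huge_pmd_locked_sketch(bool cleared_anon_exclusive)
{
	return cleared_anon_exclusive;
}

/*
 * Shape of the simplified TTU_SPLIT_HUGE_PMD branch: with the pmd_same()
 * re-validation moved into page_vma_mapped_walk(), the branch can end
 * the walk either way and simply propagate the split's result as the
 * walk's result.
 */
static bool try_to_migrate_one_sketch(bool cleared_anon_exclusive)
{
	bool ret = split_huge_pmd_locked_sketch(cleared_anon_exclusive);

	/* page_vma_mapped_walk_done() + break in the real code */
	return ret;
}

int main(void)
{
	printf("split to migration entries: %d\n",
	       try_to_migrate_one_sketch(true));
	printf("left mapped through PTEs:   %d\n",
	       try_to_migrate_one_sketch(false));
	return 0;
}
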
-- 
Wei Yang
Help you, Help me

