From: "David Hildenbrand (Arm)" <david@kernel.org>
To: Wei Yang <richard.weiyang@gmail.com>
Cc: akpm@linux-foundation.org, ljs@kernel.org, ziy@nvidia.com,
baolin.wang@linux.alibaba.com, Liam.Howlett@oracle.com,
npache@redhat.com, ryan.roberts@arm.com, dev.jain@arm.com,
baohua@kernel.org, lance.yang@linux.dev, riel@surriel.com,
vbabka@kernel.org, harry@kernel.org, jannh@google.com,
rppt@kernel.org, surenb@google.com, mhocko@suse.com,
shuah@kernel.org, linux-mm@kvack.org,
Gavin Guo <gavinguo@igalia.com>
Subject: Re: [PATCH 1/2] mm/huge_memory: return true if split_huge_pmd_locked() split PMD to migration entry
Date: Tue, 28 Apr 2026 10:24:42 +0200 [thread overview]
Message-ID: <c71930ae-19d9-4b3b-a74d-3de3261c4d43@kernel.org> (raw)
In-Reply-To: <20260426091957.a227zxgkqapibtud@master>
On 4/26/26 11:19, Wei Yang wrote:
> On Fri, Apr 24, 2026 at 09:29:18PM +0200, David Hildenbrand (Arm) wrote:
>> On 4/15/26 03:08, Wei Yang wrote:
>>> When @freeze is set to true, split_huge_pmd_locked() is intended to
>>> split the PMD to migration entry. But if it doesn't manage to clear
>>> PageAnonExclusive(), it just split PMD and leave the folio mapped
>>> through PTE.
>>>
>>> This patch let split_huge_pmd_locked() return true to indicate it does
>>> split PMD to migration entry. With this knowledge, we can return
>>> directly in try_to_migrate_one() if it does.
>>>
>>> Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
>>> Cc: Gavin Guo <gavinguo@igalia.com>
>>> Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
>>> Cc: Zi Yan <ziy@nvidia.com>
>>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> Cc: Lance Yang <lance.yang@linux.dev>
>>> ---
>>
>> [...]
>>
>>> static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
>>> unsigned long addr, pmd_t *pmdp,
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 970e077019b7..ec84bb4a0cc3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3087,7 +3087,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
>>> pmd_populate(mm, pmd, pgtable);
>>> }
>>>
>>> -static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> +static bool __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> unsigned long haddr, bool freeze)
>>> {
>>> struct mm_struct *mm = vma->vm_mm;
>>> @@ -3096,7 +3096,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> pgtable_t pgtable;
>>> pmd_t old_pmd, _pmd;
>>> bool soft_dirty, uffd_wp = false, young = false, write = false;
>>> - bool anon_exclusive = false, dirty = false;
>>> + bool anon_exclusive = false, dirty = false, ret = false;
>>> unsigned long addr;
>>> pte_t *pte;
>>> int i;
>>> @@ -3118,13 +3118,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> if (arch_needs_pgtable_deposit())
>>> zap_deposited_table(mm, pmd);
>>> if (vma_is_special_huge(vma))
>>> - return;
>>> + return ret;
>>
>> Why not "return false" in these cases where it really can always only false?
>>
>
> Will adjust related places.
>
>>> if (unlikely(pmd_is_migration_entry(old_pmd))) {
>>> const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
>>>
>>> folio = softleaf_to_folio(old_entry);
>>> } else if (is_huge_zero_pmd(old_pmd)) {
>>> - return;
>>> + return ret;
>>> } else {
>>> page = pmd_page(old_pmd);
>>> folio = page_folio(page);
>>> @@ -3136,7 +3136,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> folio_put(folio);
>>> }
>>> add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
>>> - return;
>>> + return ret;
>>> }
>>>
>>> if (is_huge_zero_pmd(*pmd)) {
>>> @@ -3149,7 +3149,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> * small page also write protected so it does not seems useful
>>> * to invalidate secondary mmu at this time.
>>> */
>>> - return __split_huge_zero_page_pmd(vma, haddr, pmd);
>>> + __split_huge_zero_page_pmd(vma, haddr, pmd);
>>> + return ret;
>>> }
>>>
>>> if (pmd_is_migration_entry(*pmd)) {
>>> @@ -3309,6 +3310,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>>> set_pte_at(mm, addr, pte + i, entry);
>>> }
>>> + ret = true;
>>> } else if (pmd_is_device_private_entry(old_pmd)) {
>>> pte_t entry;
>>> swp_entry_t swp_entry;
>>> @@ -3366,14 +3368,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>
>>> smp_wmb(); /* make pte visible before pmd */
>>> pmd_populate(mm, pmd, pgtable);
>>> + return ret;
>>> }
>>>
>>> -void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>> +bool split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>> pmd_t *pmd, bool freeze)
>>> {
>>> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
>>> if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd))
>>> - __split_huge_pmd_locked(vma, pmd, address, freeze);
>>> + return __split_huge_pmd_locked(vma, pmd, address, freeze);
>>> + else
>>> + return false;
>>
>> No need for the "else".
>>
>
> Got it.
>
>>> }
>>>
>>> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 78b7fb5f367c..91fb495bebbe 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -2464,13 +2464,18 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>>>
>>> if (flags & TTU_SPLIT_HUGE_PMD) {
>>> /*
>>> - * split_huge_pmd_locked() might leave the
>>> + * If split_huge_pmd_locked() does split PMD
>>> + * to migration entry, we are done.
>>> + * If split_huge_pmd_locked() leave the
>>> * folio mapped through PTEs. Retry the walk
>>> * so we can detect this scenario and properly
>>> * abort the walk.
>>
>> Couldn't we just abort right away, based on the return value?
>>
>
> Here is my understanding.
>
> We get here when page_vma_mapped_walk() touch a pmd entry, with three cases:
>
> * pmd_trans_huge()
> * pmd_is_migration_entry()
> * pmd_is_device_private_entry()
>
> For the first two cases, we grab pmd_lock() and then check the condition is
> still valid before return. But for case 3, after grab pmd_lock(), it return
> directly.
>
> This may give chance for another thread to split pmd_is_device_private_entry()
> to pte mapped, IIUC. For this case, we should restart the walk here.
So what you are saying is that we should re-validate in page_vma_mapped_walk()
that we indeed still have a device-private entry after grabbing the lock?
That's what we do in map_pte() through the pmd_same() check.
Likely we should apply the same model here!
--
Cheers,
David
next prev parent reply other threads:[~2026-04-28 8:24 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-15 1:08 [PATCH 0/2] mm/huge_memory: optimize migration when huge PMD needs split Wei Yang
2026-04-15 1:08 ` [PATCH 1/2] mm/huge_memory: return true if split_huge_pmd_locked() split PMD to migration entry Wei Yang
2026-04-24 19:29 ` David Hildenbrand (Arm)
2026-04-26 9:19 ` Wei Yang
2026-04-28 8:24 ` David Hildenbrand (Arm) [this message]
2026-04-29 2:49 ` Wei Yang
2026-04-29 6:55 ` David Hildenbrand (Arm)
2026-05-03 0:38 ` Wei Yang
2026-05-04 12:44 ` David Hildenbrand (Arm)
2026-05-05 3:15 ` Wei Yang
2026-04-15 1:08 ` [PATCH 2/2] mm/selftests: add split_shared_pmd() Wei Yang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=c71930ae-19d9-4b3b-a74d-3de3261c4d43@kernel.org \
--to=david@kernel.org \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=dev.jain@arm.com \
--cc=gavinguo@igalia.com \
--cc=harry@kernel.org \
--cc=jannh@google.com \
--cc=lance.yang@linux.dev \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=mhocko@suse.com \
--cc=npache@redhat.com \
--cc=richard.weiyang@gmail.com \
--cc=riel@surriel.com \
--cc=rppt@kernel.org \
--cc=ryan.roberts@arm.com \
--cc=shuah@kernel.org \
--cc=surenb@google.com \
--cc=vbabka@kernel.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox