From: David Hildenbrand <david@redhat.com>
To: Lance Yang <lance.yang@linux.dev>
Cc: 21cnbao@gmail.com, akpm@linux-foundation.org,
baolin.wang@linux.alibaba.com, chrisl@kernel.org,
kasong@tencent.com, linux-arm-kernel@lists.infradead.org,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
linux-riscv@lists.infradead.org, lorenzo.stoakes@oracle.com,
ryan.roberts@arm.com, v-songbaohua@oppo.com, x86@kernel.org,
ying.huang@intel.com, zhengtangquan@oppo.com,
Lance Yang <ioworker0@gmail.com>
Subject: Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Date: Thu, 26 Jun 2025 16:39:04 +0200 [thread overview]
Message-ID: <6fbcf806-eb3c-4bcd-8daf-8d87fd759d2b@redhat.com> (raw)
In-Reply-To: <ce78181f-b8f0-4710-be22-eff123760a51@linux.dev>
On 26.06.25 15:52, Lance Yang wrote:
>
>
> On 2025/6/26 21:16, David Hildenbrand wrote:
>> On 26.06.25 14:44, Lance Yang wrote:
>>>
>>> On 2025/6/26 17:29, Lance Yang wrote:
>>>> Before I send out the real patch, I'd like to get some quick feedback to
>>>> ensure I've understood the discussion correctly ;)
>>>>
>>>> Does this look like the right direction?
>>>>
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index fb63d9256f09..5ebffe2137e4 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1845,23 +1845,37 @@ void folio_remove_rmap_pud(struct folio
>>>> *folio, struct page *page,
>>>> #endif
>>>> }
>>>> -/* We support batch unmapping of PTEs for lazyfree large folios */
>>>> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>> - struct folio *folio, pte_t *ptep)
>>>> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>>>> + struct page_vma_mapped_walk *pvmw,
>>>> + enum ttu_flags flags, pte_t pte)
>>>> {
>>>> const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>> - int max_nr = folio_nr_pages(folio);
>>>> - pte_t pte = ptep_get(ptep);
>>>> + unsigned long end_addr, addr = pvmw->address;
>>>> + struct vm_area_struct *vma = pvmw->vma;
>>>> + unsigned int max_nr;
>>>> +
>>>> + if (flags & TTU_HWPOISON)
>>>> + return 1;
>>>> + if (!folio_test_large(folio))
>>>> + return 1;
>>>> + /* We may only batch within a single VMA and a single page
>>>> table. */
>>>> + end_addr = pmd_addr_end(addr, vma->vm_end);
>>>> + max_nr = (end_addr - addr) >> PAGE_SHIFT;
>>>> +
>>>> + /* We only support lazyfree batching for now ... */
>>>> if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
>>>> - return false;
>>>> + return 1;
>>>> if (pte_unused(pte))
>>>> - return false;
>>>> - if (pte_pfn(pte) != folio_pfn(folio))
>>>> - return false;
>>>> + return 1;
>>>> +
>>>> + /* ... where we must be able to batch the whole folio. */
>>>> + if (pte_pfn(pte) != folio_pfn(folio) || max_nr !=
>>>> folio_nr_pages(folio))
>>>> + return 1;
>>>> + max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr,
>>>> fpb_flags,
>>>> + NULL, NULL, NULL);
>>>> - return folio_pte_batch(folio, addr, ptep, pte, max_nr,
>>>> fpb_flags, NULL,
>>>> - NULL, NULL) == max_nr;
>>>> + return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>>> }
>>>> /*
>>>> @@ -2024,9 +2038,7 @@ static bool try_to_unmap_one(struct folio
>>>> *folio, struct vm_area_struct *vma,
>>>> if (pte_dirty(pteval))
>>>> folio_mark_dirty(folio);
>>>> } else if (likely(pte_present(pteval))) {
>>>> - if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
>>>> - can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
>>>> - nr_pages = folio_nr_pages(folio);
>>>> + nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags,
>>>> pteval);
>>>> end_addr = address + nr_pages * PAGE_SIZE;
>>>> flush_cache_range(vma, address, end_addr);
>>>> @@ -2206,13 +2218,16 @@ static bool try_to_unmap_one(struct folio
>>>> *folio, struct vm_area_struct *vma,
>>>> hugetlb_remove_rmap(folio);
>>>> } else {
>>>> folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
>>>> - folio_ref_sub(folio, nr_pages - 1);
>>>> }
>>>> if (vma->vm_flags & VM_LOCKED)
>>>> mlock_drain_local();
>>>> - folio_put(folio);
>>>> - /* We have already batched the entire folio */
>>>> - if (nr_pages > 1)
>>>> + folio_put_refs(folio, nr_pages);
>>>> +
>>>> + /*
>>>> + * If we are sure that we batched the entire folio and cleared
>>>> + * all PTEs, we can just optimize and stop right here.
>>>> + */
>>>> + if (nr_pages == folio_nr_pages(folio))
>>>> goto walk_done;
>>>> continue;
>>>> walk_abort:
>>>> --
>>>
>>> Oops ... Through testing on my machine, I found that the logic doesn't
>>> behave as expected because I mixed up the meaning of max_nr (the
>>> available scan room in the page table) with folio_nr_pages(folio) :(
>>>
>>> With the following change:
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 5ebffe2137e4..b1407348e14e 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1850,9 +1850,9 @@ static inline unsigned int
>>> folio_unmap_pte_batch(struct folio *folio,
>>> enum ttu_flags flags, pte_t pte)
>>> {
>>> const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>> + unsigned int max_nr, nr_pages = folio_nr_pages(folio);
>>> unsigned long end_addr, addr = pvmw->address;
>>> struct vm_area_struct *vma = pvmw->vma;
>>> - unsigned int max_nr;
>>> if (flags & TTU_HWPOISON)
>>> return 1;
>>> @@ -1870,12 +1870,13 @@ static inline unsigned int
>>> folio_unmap_pte_batch(struct folio *folio,
>>> return 1;
>>> /* ... where we must be able to batch the whole folio. */
>>
>> Why is that still required? :)
>
> Sorry ... I was still stuck in the "all-or-nothing" mindset ...
>
> So, IIUC, you mean we should completely remove the "max_nr < nr_pages"
> check and just let folio_pte_batch handle whatever partial batch it
> safely can.
>
>>
>>> - if (pte_pfn(pte) != folio_pfn(folio) || max_nr !=
>>> folio_nr_pages(folio))
>>> + if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
>>> return 1;
>>> - max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr,
>>> fpb_flags,
>>> - NULL, NULL, NULL);
>>> - return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>> + max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
>>> + fpb_flags, NULL, NULL, NULL);
>>> +
>>> + return (max_nr != nr_pages) ? 1 : max_nr;
>>
>> Why is that still required? :)
>
> Then simply return the number of PTEs that consecutively map to the
> large folio. Right?
Yes, any part of the large folio. Just return what folio_pte_batch() returns ...
--
Cheers,
David / dhildenb
WARNING: multiple messages have this Message-ID (diff)
From: David Hildenbrand <david@redhat.com>
To: Lance Yang <lance.yang@linux.dev>
Cc: 21cnbao@gmail.com, akpm@linux-foundation.org,
baolin.wang@linux.alibaba.com, chrisl@kernel.org,
kasong@tencent.com, linux-arm-kernel@lists.infradead.org,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
linux-riscv@lists.infradead.org, lorenzo.stoakes@oracle.com,
ryan.roberts@arm.com, v-songbaohua@oppo.com, x86@kernel.org,
ying.huang@intel.com, zhengtangquan@oppo.com,
Lance Yang <ioworker0@gmail.com>
Subject: Re: [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation
Date: Thu, 26 Jun 2025 16:39:04 +0200 [thread overview]
Message-ID: <6fbcf806-eb3c-4bcd-8daf-8d87fd759d2b@redhat.com> (raw)
In-Reply-To: <ce78181f-b8f0-4710-be22-eff123760a51@linux.dev>
On 26.06.25 15:52, Lance Yang wrote:
>
>
> On 2025/6/26 21:16, David Hildenbrand wrote:
>> On 26.06.25 14:44, Lance Yang wrote:
>>>
>>> On 2025/6/26 17:29, Lance Yang wrote:
>>>> Before I send out the real patch, I'd like to get some quick feedback to
>>>> ensure I've understood the discussion correctly ;)
>>>>
>>>> Does this look like the right direction?
>>>>
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index fb63d9256f09..5ebffe2137e4 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1845,23 +1845,37 @@ void folio_remove_rmap_pud(struct folio
>>>> *folio, struct page *page,
>>>> #endif
>>>> }
>>>> -/* We support batch unmapping of PTEs for lazyfree large folios */
>>>> -static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
>>>> - struct folio *folio, pte_t *ptep)
>>>> +static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
>>>> + struct page_vma_mapped_walk *pvmw,
>>>> + enum ttu_flags flags, pte_t pte)
>>>> {
>>>> const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>>> - int max_nr = folio_nr_pages(folio);
>>>> - pte_t pte = ptep_get(ptep);
>>>> + unsigned long end_addr, addr = pvmw->address;
>>>> + struct vm_area_struct *vma = pvmw->vma;
>>>> + unsigned int max_nr;
>>>> +
>>>> + if (flags & TTU_HWPOISON)
>>>> + return 1;
>>>> + if (!folio_test_large(folio))
>>>> + return 1;
>>>> + /* We may only batch within a single VMA and a single page
>>>> table. */
>>>> + end_addr = pmd_addr_end(addr, vma->vm_end);
>>>> + max_nr = (end_addr - addr) >> PAGE_SHIFT;
>>>> +
>>>> + /* We only support lazyfree batching for now ... */
>>>> if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
>>>> - return false;
>>>> + return 1;
>>>> if (pte_unused(pte))
>>>> - return false;
>>>> - if (pte_pfn(pte) != folio_pfn(folio))
>>>> - return false;
>>>> + return 1;
>>>> +
>>>> + /* ... where we must be able to batch the whole folio. */
>>>> + if (pte_pfn(pte) != folio_pfn(folio) || max_nr !=
>>>> folio_nr_pages(folio))
>>>> + return 1;
>>>> + max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr,
>>>> fpb_flags,
>>>> + NULL, NULL, NULL);
>>>> - return folio_pte_batch(folio, addr, ptep, pte, max_nr,
>>>> fpb_flags, NULL,
>>>> - NULL, NULL) == max_nr;
>>>> + return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>>> }
>>>> /*
>>>> @@ -2024,9 +2038,7 @@ static bool try_to_unmap_one(struct folio
>>>> *folio, struct vm_area_struct *vma,
>>>> if (pte_dirty(pteval))
>>>> folio_mark_dirty(folio);
>>>> } else if (likely(pte_present(pteval))) {
>>>> - if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
>>>> - can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
>>>> - nr_pages = folio_nr_pages(folio);
>>>> + nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags,
>>>> pteval);
>>>> end_addr = address + nr_pages * PAGE_SIZE;
>>>> flush_cache_range(vma, address, end_addr);
>>>> @@ -2206,13 +2218,16 @@ static bool try_to_unmap_one(struct folio
>>>> *folio, struct vm_area_struct *vma,
>>>> hugetlb_remove_rmap(folio);
>>>> } else {
>>>> folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
>>>> - folio_ref_sub(folio, nr_pages - 1);
>>>> }
>>>> if (vma->vm_flags & VM_LOCKED)
>>>> mlock_drain_local();
>>>> - folio_put(folio);
>>>> - /* We have already batched the entire folio */
>>>> - if (nr_pages > 1)
>>>> + folio_put_refs(folio, nr_pages);
>>>> +
>>>> + /*
>>>> + * If we are sure that we batched the entire folio and cleared
>>>> + * all PTEs, we can just optimize and stop right here.
>>>> + */
>>>> + if (nr_pages == folio_nr_pages(folio))
>>>> goto walk_done;
>>>> continue;
>>>> walk_abort:
>>>> --
>>>
>>> Oops ... Through testing on my machine, I found that the logic doesn't
>>> behave as expected because I mixed up the meaning of max_nr (the
>>> available scan room in the page table) with folio_nr_pages(folio) :(
>>>
>>> With the following change:
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 5ebffe2137e4..b1407348e14e 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1850,9 +1850,9 @@ static inline unsigned int
>>> folio_unmap_pte_batch(struct folio *folio,
>>> enum ttu_flags flags, pte_t pte)
>>> {
>>> const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
>>> + unsigned int max_nr, nr_pages = folio_nr_pages(folio);
>>> unsigned long end_addr, addr = pvmw->address;
>>> struct vm_area_struct *vma = pvmw->vma;
>>> - unsigned int max_nr;
>>> if (flags & TTU_HWPOISON)
>>> return 1;
>>> @@ -1870,12 +1870,13 @@ static inline unsigned int
>>> folio_unmap_pte_batch(struct folio *folio,
>>> return 1;
>>> /* ... where we must be able to batch the whole folio. */
>>
>> Why is that still required? :)
>
> Sorry ... I was still stuck in the "all-or-nothing" mindset ...
>
> So, IIUC, you mean we should completely remove the "max_nr < nr_pages"
> check and just let folio_pte_batch handle whatever partial batch it
> safely can.
>
>>
>>> - if (pte_pfn(pte) != folio_pfn(folio) || max_nr !=
>>> folio_nr_pages(folio))
>>> + if (pte_pfn(pte) != folio_pfn(folio) || max_nr < nr_pages)
>>> return 1;
>>> - max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr,
>>> fpb_flags,
>>> - NULL, NULL, NULL);
>>> - return (max_nr != folio_nr_pages(folio)) ? 1 : max_nr;
>>> + max_nr = folio_pte_batch(folio, addr, pvmw->pte, pte, nr_pages,
>>> + fpb_flags, NULL, NULL, NULL);
>>> +
>>> + return (max_nr != nr_pages) ? 1 : max_nr;
>>
>> Why is that still required? :)
>
> Then simply return the number of PTEs that consecutively map to the
> large folio. Right?
Yes, any part of the large folio. Just return what folio_pte_batch() returns ...
--
Cheers,
David / dhildenb
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
next prev parent reply other threads:[~2025-06-26 14:42 UTC|newest]
Thread overview: 90+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-14 9:30 [PATCH v4 0/4] mm: batched unmap lazyfree large folios during reclamation Barry Song
2025-02-14 9:30 ` Barry Song
2025-02-14 9:30 ` [PATCH v4 1/4] mm: Set folio swapbacked iff folios are dirty in try_to_unmap_one Barry Song
2025-02-14 9:30 ` Barry Song
2025-02-14 9:30 ` [PATCH v4 2/4] mm: Support tlbbatch flush for a range of PTEs Barry Song
2025-02-14 9:30 ` Barry Song
2025-02-14 9:30 ` [PATCH v4 3/4] mm: Support batched unmap for lazyfree large folios during reclamation Barry Song
2025-02-14 9:30 ` Barry Song
2025-06-24 12:55 ` David Hildenbrand
2025-06-24 12:55 ` David Hildenbrand
2025-06-24 15:26 ` Lance Yang
2025-06-24 15:26 ` Lance Yang
2025-06-24 15:34 ` David Hildenbrand
2025-06-24 15:34 ` David Hildenbrand
2025-06-24 16:25 ` Lance Yang
2025-06-24 16:25 ` Lance Yang
2025-06-25 9:38 ` Barry Song
2025-06-25 9:38 ` Barry Song
2025-06-25 10:00 ` David Hildenbrand
2025-06-25 10:00 ` David Hildenbrand
2025-06-25 10:38 ` Barry Song
2025-06-25 10:38 ` Barry Song
2025-06-25 10:43 ` David Hildenbrand
2025-06-25 10:43 ` David Hildenbrand
2025-06-25 10:49 ` Barry Song
2025-06-25 10:49 ` Barry Song
2025-06-25 10:59 ` David Hildenbrand
2025-06-25 10:59 ` David Hildenbrand
2025-06-25 10:47 ` Lance Yang
2025-06-25 10:47 ` Lance Yang
2025-06-25 10:49 ` David Hildenbrand
2025-06-25 10:49 ` David Hildenbrand
2025-06-25 10:57 ` Barry Song
2025-06-25 10:57 ` Barry Song
2025-06-25 11:01 ` David Hildenbrand
2025-06-25 11:01 ` David Hildenbrand
2025-06-25 11:15 ` Barry Song
2025-06-25 11:15 ` Barry Song
2025-06-25 11:27 ` David Hildenbrand
2025-06-25 11:27 ` David Hildenbrand
2025-06-25 11:42 ` Barry Song
2025-06-25 11:42 ` Barry Song
2025-06-25 12:09 ` David Hildenbrand
2025-06-25 12:09 ` David Hildenbrand
2025-06-25 12:20 ` Lance Yang
2025-06-25 12:20 ` Lance Yang
2025-06-25 12:25 ` David Hildenbrand
2025-06-25 12:25 ` David Hildenbrand
2025-06-25 12:35 ` Lance Yang
2025-06-25 12:35 ` Lance Yang
2025-06-25 21:03 ` Barry Song
2025-06-25 21:03 ` Barry Song
2025-06-26 1:17 ` Lance Yang
2025-06-26 1:17 ` Lance Yang
2025-06-26 8:17 ` David Hildenbrand
2025-06-26 8:17 ` David Hildenbrand
2025-06-26 9:29 ` Lance Yang
2025-06-26 9:29 ` Lance Yang
2025-06-26 12:44 ` Lance Yang
2025-06-26 12:44 ` Lance Yang
2025-06-26 13:16 ` David Hildenbrand
2025-06-26 13:16 ` David Hildenbrand
2025-06-26 13:52 ` Lance Yang
2025-06-26 13:52 ` Lance Yang
2025-06-26 14:39 ` David Hildenbrand [this message]
2025-06-26 14:39 ` David Hildenbrand
2025-06-26 15:06 ` Lance Yang
2025-06-26 15:06 ` Lance Yang
2025-06-26 21:46 ` Barry Song
2025-06-26 21:46 ` Barry Song
2025-06-26 21:52 ` David Hildenbrand
2025-06-26 21:52 ` David Hildenbrand
2025-06-25 12:58 ` Lance Yang
2025-06-25 12:58 ` Lance Yang
2025-06-25 13:02 ` David Hildenbrand
2025-06-25 13:02 ` David Hildenbrand
2025-06-25 8:44 ` Lance Yang
2025-06-25 8:44 ` Lance Yang
2025-06-25 9:29 ` Lance Yang
2025-06-25 9:29 ` Lance Yang
2025-07-01 10:03 ` Harry Yoo
2025-07-01 10:03 ` Harry Yoo
2025-07-01 13:27 ` Harry Yoo
2025-07-01 13:27 ` Harry Yoo
2025-07-01 16:17 ` David Hildenbrand
2025-07-01 16:17 ` David Hildenbrand
2025-02-14 9:30 ` [PATCH v4 4/4] mm: Avoid splitting pmd for lazyfree pmd-mapped THP in try_to_unmap Barry Song
2025-02-14 9:30 ` Barry Song
2025-06-25 13:49 ` [PATCH v4 0/4] mm: batched unmap lazyfree large folios during reclamation Lorenzo Stoakes
2025-06-25 13:49 ` Lorenzo Stoakes
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=6fbcf806-eb3c-4bcd-8daf-8d87fd759d2b@redhat.com \
--to=david@redhat.com \
--cc=21cnbao@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=chrisl@kernel.org \
--cc=ioworker0@gmail.com \
--cc=kasong@tencent.com \
--cc=lance.yang@linux.dev \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-riscv@lists.infradead.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=ryan.roberts@arm.com \
--cc=v-songbaohua@oppo.com \
--cc=x86@kernel.org \
--cc=ying.huang@intel.com \
--cc=zhengtangquan@oppo.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.