* [PATCH RESEND v5 1/4] mm/rmap: remove duplicated exit code in pagewalk loop
From: Lance Yang @ 2024-05-13 7:47 UTC
To: akpm
Cc: willy, sj, baolin.wang, maskray, ziy, ryan.roberts, david,
21cnbao, mhocko, fengwei.yin, zokeefe, shy828301, xiehuan09,
libang.li, wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel, Lance Yang
Introduce the labels walk_done and walk_done_err as exit points to
eliminate duplicated exit code in the pagewalk loop.
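For illustration, the consolidated exit path gives the pagewalk loop
roughly the following shape (a simplified sketch, not the exact code in
try_to_unmap_one(); the condition names are placeholders):

	while (page_vma_mapped_walk(&pvmw)) {
		if (failed_for_this_mapping)	/* placeholder condition */
			goto walk_done_err;	/* stop the walk, return false */
		if (finished_early)		/* placeholder condition */
			goto walk_done;		/* stop the walk, keep ret */
		/* ... unmap the current entry ... */
		continue;
walk_done_err:
		ret = false;
		/* fall through */
walk_done:
		page_vma_mapped_walk_done(&pvmw);
		break;
	}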
Reviewed-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
---
mm/rmap.c | 40 +++++++++++++++-------------------------
1 file changed, 15 insertions(+), 25 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index e8fc5ecb59b2..ddffa30c79fb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1679,9 +1679,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Restore the mlock which got missed */
if (!folio_test_large(folio))
mlock_vma_folio(folio, vma);
- page_vma_mapped_walk_done(&pvmw);
- ret = false;
- break;
+ goto walk_done_err;
}
pfn = pte_pfn(ptep_get(pvmw.pte));
@@ -1719,11 +1717,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
if (!anon) {
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (!hugetlb_vma_trylock_write(vma)) {
- page_vma_mapped_walk_done(&pvmw);
- ret = false;
- break;
- }
+ if (!hugetlb_vma_trylock_write(vma))
+ goto walk_done_err;
if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
flush_tlb_range(vma,
@@ -1738,8 +1733,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* actual page and drop map count
* to zero.
*/
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_done;
}
hugetlb_vma_unlock_write(vma);
}
@@ -1811,9 +1805,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (unlikely(folio_test_swapbacked(folio) !=
folio_test_swapcache(folio))) {
WARN_ON_ONCE(1);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_done_err;
}
/* MADV_FREE page check */
@@ -1852,23 +1844,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
set_pte_at(mm, address, pvmw.pte, pteval);
folio_set_swapbacked(folio);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_done_err;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_done_err;
}
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_done_err;
}
/* See folio_try_share_anon_rmap(): clear PTE first. */
@@ -1876,9 +1862,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
folio_try_share_anon_rmap_pte(folio, subpage)) {
swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_done_err;
}
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
@@ -1918,6 +1902,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
folio_put(folio);
+ continue;
+walk_done_err:
+ ret = false;
+walk_done:
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
mmu_notifier_invalidate_range_end(&range);
--
2.33.1
* Re: [PATCH RESEND v5 1/4] mm/rmap: remove duplicated exit code in pagewalk loop
From: Baolin Wang @ 2024-05-14 6:26 UTC
To: Lance Yang, akpm
Cc: willy, sj, maskray, ziy, ryan.roberts, david, 21cnbao, mhocko,
fengwei.yin, zokeefe, shy828301, xiehuan09, libang.li,
wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel
On 2024/5/13 15:47, Lance Yang wrote:
> Introduce the labels walk_done and walk_done_err as exit points to
> eliminate duplicated exit code in the pagewalk loop.
>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
LGTM.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
> mm/rmap.c | 40 +++++++++++++++-------------------------
> 1 file changed, 15 insertions(+), 25 deletions(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index e8fc5ecb59b2..ddffa30c79fb 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1679,9 +1679,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> /* Restore the mlock which got missed */
> if (!folio_test_large(folio))
> mlock_vma_folio(folio, vma);
> - page_vma_mapped_walk_done(&pvmw);
> - ret = false;
> - break;
> + goto walk_done_err;
> }
>
> pfn = pte_pfn(ptep_get(pvmw.pte));
> @@ -1719,11 +1717,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> */
> if (!anon) {
> VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> - if (!hugetlb_vma_trylock_write(vma)) {
> - page_vma_mapped_walk_done(&pvmw);
> - ret = false;
> - break;
> - }
> + if (!hugetlb_vma_trylock_write(vma))
> + goto walk_done_err;
> if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> hugetlb_vma_unlock_write(vma);
> flush_tlb_range(vma,
> @@ -1738,8 +1733,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> * actual page and drop map count
> * to zero.
> */
> - page_vma_mapped_walk_done(&pvmw);
> - break;
> + goto walk_done;
> }
> hugetlb_vma_unlock_write(vma);
> }
> @@ -1811,9 +1805,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> if (unlikely(folio_test_swapbacked(folio) !=
> folio_test_swapcache(folio))) {
> WARN_ON_ONCE(1);
> - ret = false;
> - page_vma_mapped_walk_done(&pvmw);
> - break;
> + goto walk_done_err;
> }
>
> /* MADV_FREE page check */
> @@ -1852,23 +1844,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> */
> set_pte_at(mm, address, pvmw.pte, pteval);
> folio_set_swapbacked(folio);
> - ret = false;
> - page_vma_mapped_walk_done(&pvmw);
> - break;
> + goto walk_done_err;
> }
>
> if (swap_duplicate(entry) < 0) {
> set_pte_at(mm, address, pvmw.pte, pteval);
> - ret = false;
> - page_vma_mapped_walk_done(&pvmw);
> - break;
> + goto walk_done_err;
> }
> if (arch_unmap_one(mm, vma, address, pteval) < 0) {
> swap_free(entry);
> set_pte_at(mm, address, pvmw.pte, pteval);
> - ret = false;
> - page_vma_mapped_walk_done(&pvmw);
> - break;
> + goto walk_done_err;
> }
>
> /* See folio_try_share_anon_rmap(): clear PTE first. */
> @@ -1876,9 +1862,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> folio_try_share_anon_rmap_pte(folio, subpage)) {
> swap_free(entry);
> set_pte_at(mm, address, pvmw.pte, pteval);
> - ret = false;
> - page_vma_mapped_walk_done(&pvmw);
> - break;
> + goto walk_done_err;
> }
> if (list_empty(&mm->mmlist)) {
> spin_lock(&mmlist_lock);
> @@ -1918,6 +1902,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> if (vma->vm_flags & VM_LOCKED)
> mlock_drain_local();
> folio_put(folio);
> + continue;
> +walk_done_err:
> + ret = false;
> +walk_done:
> + page_vma_mapped_walk_done(&pvmw);
> + break;
> }
>
> mmu_notifier_invalidate_range_end(&range);
* Re: [PATCH RESEND v5 1/4] mm/rmap: remove duplicated exit code in pagewalk loop
From: Lance Yang @ 2024-05-14 7:43 UTC
To: Baolin Wang
Cc: akpm, willy, sj, maskray, ziy, ryan.roberts, david, 21cnbao,
mhocko, fengwei.yin, zokeefe, shy828301, xiehuan09, libang.li,
wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel
Hi Baolin,
Thanks for taking the time to review!
Best,
Lance
On Tue, May 14, 2024 at 2:26 PM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2024/5/13 15:47, Lance Yang wrote:
> > Introduce the labels walk_done and walk_done_err as exit points to
> > eliminate duplicated exit code in the pagewalk loop.
> >
> > Reviewed-by: Zi Yan <ziy@nvidia.com>
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
>
> LGTM.
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>
> > ---
> > mm/rmap.c | 40 +++++++++++++++-------------------------
> > 1 file changed, 15 insertions(+), 25 deletions(-)
> >
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index e8fc5ecb59b2..ddffa30c79fb 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1679,9 +1679,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > /* Restore the mlock which got missed */
> > if (!folio_test_large(folio))
> > mlock_vma_folio(folio, vma);
> > - page_vma_mapped_walk_done(&pvmw);
> > - ret = false;
> > - break;
> > + goto walk_done_err;
> > }
> >
> > pfn = pte_pfn(ptep_get(pvmw.pte));
> > @@ -1719,11 +1717,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > */
> > if (!anon) {
> > VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
> > - if (!hugetlb_vma_trylock_write(vma)) {
> > - page_vma_mapped_walk_done(&pvmw);
> > - ret = false;
> > - break;
> > - }
> > + if (!hugetlb_vma_trylock_write(vma))
> > + goto walk_done_err;
> > if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
> > hugetlb_vma_unlock_write(vma);
> > flush_tlb_range(vma,
> > @@ -1738,8 +1733,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > * actual page and drop map count
> > * to zero.
> > */
> > - page_vma_mapped_walk_done(&pvmw);
> > - break;
> > + goto walk_done;
> > }
> > hugetlb_vma_unlock_write(vma);
> > }
> > @@ -1811,9 +1805,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > if (unlikely(folio_test_swapbacked(folio) !=
> > folio_test_swapcache(folio))) {
> > WARN_ON_ONCE(1);
> > - ret = false;
> > - page_vma_mapped_walk_done(&pvmw);
> > - break;
> > + goto walk_done_err;
> > }
> >
> > /* MADV_FREE page check */
> > @@ -1852,23 +1844,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > */
> > set_pte_at(mm, address, pvmw.pte, pteval);
> > folio_set_swapbacked(folio);
> > - ret = false;
> > - page_vma_mapped_walk_done(&pvmw);
> > - break;
> > + goto walk_done_err;
> > }
> >
> > if (swap_duplicate(entry) < 0) {
> > set_pte_at(mm, address, pvmw.pte, pteval);
> > - ret = false;
> > - page_vma_mapped_walk_done(&pvmw);
> > - break;
> > + goto walk_done_err;
> > }
> > if (arch_unmap_one(mm, vma, address, pteval) < 0) {
> > swap_free(entry);
> > set_pte_at(mm, address, pvmw.pte, pteval);
> > - ret = false;
> > - page_vma_mapped_walk_done(&pvmw);
> > - break;
> > + goto walk_done_err;
> > }
> >
> > /* See folio_try_share_anon_rmap(): clear PTE first. */
> > @@ -1876,9 +1862,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > folio_try_share_anon_rmap_pte(folio, subpage)) {
> > swap_free(entry);
> > set_pte_at(mm, address, pvmw.pte, pteval);
> > - ret = false;
> > - page_vma_mapped_walk_done(&pvmw);
> > - break;
> > + goto walk_done_err;
> > }
> > if (list_empty(&mm->mmlist)) {
> > spin_lock(&mmlist_lock);
> > @@ -1918,6 +1902,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > if (vma->vm_flags & VM_LOCKED)
> > mlock_drain_local();
> > folio_put(folio);
> > + continue;
> > +walk_done_err:
> > + ret = false;
> > +walk_done:
> > + page_vma_mapped_walk_done(&pvmw);
> > + break;
> > }
> >
> > mmu_notifier_invalidate_range_end(&range);
* [PATCH RESEND v5 2/4] mm/rmap: integrate PMD-mapped folio splitting into pagewalk loop
From: Lance Yang @ 2024-05-13 7:47 UTC
To: akpm
Cc: willy, sj, baolin.wang, maskray, ziy, ryan.roberts, david,
21cnbao, mhocko, fengwei.yin, zokeefe, shy828301, xiehuan09,
libang.li, wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel, Lance Yang
In preparation for supporting try_to_unmap_one() to unmap PMD-mapped
folios, start the pagewalk first, then split the PMD mapping in place
via the new split_huge_pmd_locked() helper while the PMD lock is held.
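Conceptually, the split now happens inside the pagewalk once a PMD-level
mapping is observed, along these lines (a simplified sketch of the new
flow; the real code is in the diff below):

	while (page_vma_mapped_walk(&pvmw)) {
		if (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)) {
			/* PMD-mapped: split under the PMD lock we already hold */
			split_huge_pmd_locked(vma, range.start, pvmw.pmd,
					      false, folio);
			/* drop the PTL and restart from the now-PTE-mapped table */
			pvmw.pmd = NULL;
			spin_unlock(pvmw.ptl);
			pvmw.ptl = NULL;
			flags &= ~TTU_SPLIT_HUGE_PMD;
			continue;
		}
		/* ... PTE-level unmap as before ... */
	}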
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
---
include/linux/huge_mm.h | 6 ++++++
mm/huge_memory.c | 42 +++++++++++++++++++++--------------------
mm/rmap.c | 23 ++++++++++++++++------
3 files changed, 45 insertions(+), 26 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c8d3ec116e29..9fcb0b0b6ed1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -409,6 +409,9 @@ static inline bool thp_migration_supported(void)
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}
+void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, bool freeze, struct folio *folio);
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline bool folio_test_pmd_mappable(struct folio *folio)
@@ -471,6 +474,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct folio *folio) {}
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
unsigned long address, bool freeze, struct folio *folio) {}
+static inline void split_huge_pmd_locked(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ bool freeze, struct folio *folio) {}
#define split_huge_pud(__vma, __pmd, __address) \
do { } while (0)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 317de2afd371..425272c6c50b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2581,6 +2581,27 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pmd_populate(mm, pmd, pgtable);
}
+void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, bool freeze, struct folio *folio)
+{
+ VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
+ VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
+ VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
+ VM_BUG_ON(freeze && !folio);
+
+ /*
+ * When the caller requests to set up a migration entry, we
+ * require a folio to check the PMD against. Otherwise, there
+ * is a risk of replacing the wrong folio.
+ */
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+ is_pmd_migration_entry(*pmd)) {
+ if (folio && folio != pmd_folio(*pmd))
+ return;
+ __split_huge_pmd_locked(vma, pmd, address, freeze);
+ }
+}
+
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct folio *folio)
{
@@ -2592,26 +2613,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
-
- /*
- * If caller asks to setup a migration entry, we need a folio to check
- * pmd against. Otherwise we can end up replacing wrong folio.
- */
- VM_BUG_ON(freeze && !folio);
- VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
-
- if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
- is_pmd_migration_entry(*pmd)) {
- /*
- * It's safe to call pmd_page when folio is set because it's
- * guaranteed that pmd is present.
- */
- if (folio && folio != pmd_folio(*pmd))
- goto out;
- __split_huge_pmd_locked(vma, pmd, range.start, freeze);
- }
-
-out:
+ split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index ddffa30c79fb..4c4d14325f2e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1640,9 +1640,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
- if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, false, folio);
-
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
@@ -1668,9 +1665,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_FOLIO(!pvmw.pte, folio);
-
/*
* If the folio is in an mlock()d vma, we must not swap it out.
*/
@@ -1682,6 +1676,23 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto walk_done_err;
}
+ if (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)) {
+ /*
+ * We temporarily have to drop the PTL and start once
+ * again from that now-PTE-mapped page table.
+ */
+ split_huge_pmd_locked(vma, range.start, pvmw.pmd, false,
+ folio);
+ pvmw.pmd = NULL;
+ spin_unlock(pvmw.ptl);
+ pvmw.ptl = NULL;
+ flags &= ~TTU_SPLIT_HUGE_PMD;
+ continue;
+ }
+
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
+
pfn = pte_pfn(ptep_get(pvmw.pte));
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
--
2.33.1
* [PATCH RESEND v5 3/4] mm/mlock: check for THP missing the mlock in try_to_unmap_one()
From: Lance Yang @ 2024-05-13 7:47 UTC
To: akpm
Cc: willy, sj, baolin.wang, maskray, ziy, ryan.roberts, david,
21cnbao, mhocko, fengwei.yin, zokeefe, shy828301, xiehuan09,
libang.li, wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel, Lance Yang
TTU_SPLIT_HUGE_PMD is no longer performed up front, so during the
pagewalk we might encounter a PMD-mapped THP that missed the mlock in a
VM_LOCKED range. Mlock this THP to prevent it from being picked up
again during page reclaim.
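In merged form, the resulting check in try_to_unmap_one() reads roughly
as follows (sketch only; it matches the one-line change in the diff
below):

	if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Restore the mlock which got missed */
		if (!folio_test_large(folio) ||
		    (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)))
			mlock_vma_folio(folio, vma);
		goto walk_done_err;
	}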
Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
---
mm/rmap.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 4c4d14325f2e..08a93347f283 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1671,7 +1671,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
/* Restore the mlock which got missed */
- if (!folio_test_large(folio))
+ if (!folio_test_large(folio) ||
+ (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)))
mlock_vma_folio(folio, vma);
goto walk_done_err;
}
--
2.33.1
* Re: [PATCH RESEND v5 3/4] mm/mlock: check for THP missing the mlock in try_to_unmap_one()
From: Baolin Wang @ 2024-05-14 6:41 UTC
To: Lance Yang, akpm
Cc: willy, sj, maskray, ziy, ryan.roberts, david, 21cnbao, mhocko,
fengwei.yin, zokeefe, shy828301, xiehuan09, libang.li,
wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel
On 2024/5/13 15:47, Lance Yang wrote:
> The TTU_SPLIT_HUGE_PMD will no longer perform immediately, so we might
> encounter a PMD-mapped THP missing the mlock in the VM_LOCKED range
> during the pagewalk. It's likely necessary to mlock this THP to prevent
> it from being picked up during page reclaim.
>
> Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
IMO, this patch should be merged into patch 2, otherwise patch 2 is
buggy. To quote the description in the 'submitting-patches.rst' document:
"When dividing your change into a series of patches, take special care
to ensure that the kernel builds and runs properly after each patch in
the series. Developers using ``git bisect`` to track down a problem can
end up splitting your patch series at any point; they will not thank you
if you introduce bugs in the middle."
> ---
> mm/rmap.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 4c4d14325f2e..08a93347f283 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1671,7 +1671,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> if (!(flags & TTU_IGNORE_MLOCK) &&
> (vma->vm_flags & VM_LOCKED)) {
> /* Restore the mlock which got missed */
> - if (!folio_test_large(folio))
> + if (!folio_test_large(folio) ||
> + (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)))
> mlock_vma_folio(folio, vma);
> goto walk_done_err;
> }
* Re: [PATCH RESEND v5 3/4] mm/mlock: check for THP missing the mlock in try_to_unmap_one()
From: Lance Yang @ 2024-05-14 7:46 UTC
To: Baolin Wang
Cc: akpm, willy, sj, maskray, ziy, ryan.roberts, david, 21cnbao,
mhocko, fengwei.yin, zokeefe, shy828301, xiehuan09, libang.li,
wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel
On Tue, May 14, 2024 at 2:41 PM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2024/5/13 15:47, Lance Yang wrote:
> > The TTU_SPLIT_HUGE_PMD will no longer perform immediately, so we might
> > encounter a PMD-mapped THP missing the mlock in the VM_LOCKED range
> > during the pagewalk. It's likely necessary to mlock this THP to prevent
> > it from being picked up during page reclaim.
> >
> > Suggested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
>
> IMO, this patch should be merged into patch 2, otherwise patch 2 is
> buggy. Quote the description in the 'submission patches.rst' document:
>
> "When dividing your change into a series of patches, take special care
> to ensure that the kernel builds and runs properly after each patch in
> the series. Developers using ``git bisect`` to track down a problem can
> end up splitting your patch series at any point; they will not thank you
> if you introduce bugs in the middle."
Thanks for bringing this up!
I completely agree that this patch should be merged into patch 2.
Thanks,
Lance
>
> > ---
> > mm/rmap.c | 3 ++-
> > 1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 4c4d14325f2e..08a93347f283 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1671,7 +1671,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > if (!(flags & TTU_IGNORE_MLOCK) &&
> > (vma->vm_flags & VM_LOCKED)) {
> > /* Restore the mlock which got missed */
> > - if (!folio_test_large(folio))
> > + if (!folio_test_large(folio) ||
> > + (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)))
> > mlock_vma_folio(folio, vma);
> > goto walk_done_err;
> > }
* [PATCH RESEND v5 4/4] mm/vmscan: avoid split lazyfree THP during shrink_folio_list()
From: Lance Yang @ 2024-05-13 7:47 UTC
To: akpm
Cc: willy, sj, baolin.wang, maskray, ziy, ryan.roberts, david,
21cnbao, mhocko, fengwei.yin, zokeefe, shy828301, xiehuan09,
libang.li, wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel, Lance Yang
When users no longer require the pages, they use madvise(MADV_FREE) to
mark the pages as lazy free, and typically do not write to that memory
again.
During memory reclaim, if we detect that the large folio and its PMD
are both still clean and there are no unexpected references (such as
from GUP), we can simply discard the memory lazily, improving the
efficiency of memory reclamation in this case.
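In outline, the PMD-level discard clears the PMD, re-checks for
concurrent GUP references and redirtying, and only then drops the
mapping, roughly as below (a simplified sketch; the complete version,
including the TLB handling, is in the diff below):

	orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmdp);
	smp_mb();	/* sync against concurrent GUP-fast */
	if (folio_ref_count(folio) != folio_mapcount(folio) + 1 ||
	    folio_test_dirty(folio) || pmd_dirty(orig_pmd)) {
		/* unexpected references or redirtied: give up and remap */
		set_pmd_at(mm, addr, pmdp, orig_pmd);
		return false;
	}
	folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
	zap_deposited_table(mm, pmdp);
	add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
	folio_put(folio);
	return true;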
On an Intel i5 CPU, reclaiming 1GiB of lazyfree THPs using
mem_cgroup_force_empty() results in the following runtimes in seconds
(shorter is better):
--------------------------------------------
| Old | New | Change |
--------------------------------------------
| 0.683426 | 0.049197 | -92.80% |
--------------------------------------------
Suggested-by: Zi Yan <ziy@nvidia.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Lance Yang <ioworker0@gmail.com>
---
include/linux/huge_mm.h | 9 +++++
mm/huge_memory.c | 75 +++++++++++++++++++++++++++++++++++++++++
mm/rmap.c | 31 ++++++++++-------
3 files changed, 103 insertions(+), 12 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9fcb0b0b6ed1..cfd7ec2b6d0a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -411,6 +411,8 @@ static inline bool thp_migration_supported(void)
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze, struct folio *folio);
+bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmdp, struct folio *folio);
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -478,6 +480,13 @@ static inline void split_huge_pmd_locked(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
bool freeze, struct folio *folio) {}
+static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp,
+ struct folio *folio)
+{
+ return false;
+}
+
#define split_huge_pud(__vma, __pmd, __address) \
do { } while (0)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 425272c6c50b..3ceeeb2f42d4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2687,6 +2687,81 @@ static void unmap_folio(struct folio *folio)
try_to_unmap_flush();
}
+static bool __discard_trans_pmd_locked(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp,
+ struct folio *folio)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int ref_count, map_count;
+ pmd_t orig_pmd = *pmdp;
+ struct mmu_gather tlb;
+ struct page *page;
+
+ if (pmd_dirty(orig_pmd) || folio_test_dirty(folio))
+ return false;
+ if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
+ return false;
+
+ page = pmd_page(orig_pmd);
+ if (unlikely(page_folio(page) != folio))
+ return false;
+
+ tlb_gather_mmu(&tlb, mm);
+ orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmdp);
+ tlb_remove_pmd_tlb_entry(&tlb, pmdp, addr);
+
+ /*
+ * Syncing against concurrent GUP-fast:
+ * - clear PMD; barrier; read refcount
+ * - inc refcount; barrier; read PMD
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for folio refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * If the PMD or folio is redirtied at this point, or if there are
+ * unexpected references, we will give up to discard this folio
+ * and remap it.
+ *
+ * The only folio refs must be one from isolation plus the rmap(s).
+ */
+ if (ref_count != map_count + 1 || folio_test_dirty(folio) ||
+ pmd_dirty(orig_pmd)) {
+ set_pmd_at(mm, addr, pmdp, orig_pmd);
+ return false;
+ }
+
+ folio_remove_rmap_pmd(folio, page, vma);
+ zap_deposited_table(mm, pmdp);
+ add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_drain_local();
+ folio_put(folio);
+
+ return true;
+}
+
+bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmdp, struct folio *folio)
+{
+ VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
+
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
+ return __discard_trans_pmd_locked(vma, addr, pmdp, folio);
+
+ return false;
+}
+
static void remap_page(struct folio *folio, unsigned long nr)
{
int i = 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index 08a93347f283..e09f2141b8dc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1677,18 +1677,25 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto walk_done_err;
}
- if (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)) {
- /*
- * We temporarily have to drop the PTL and start once
- * again from that now-PTE-mapped page table.
- */
- split_huge_pmd_locked(vma, range.start, pvmw.pmd, false,
- folio);
- pvmw.pmd = NULL;
- spin_unlock(pvmw.ptl);
- pvmw.ptl = NULL;
- flags &= ~TTU_SPLIT_HUGE_PMD;
- continue;
+ if (!pvmw.pte) {
+ if (unmap_huge_pmd_locked(vma, range.start, pvmw.pmd,
+ folio))
+ goto walk_done;
+
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ /*
+ * We temporarily have to drop the PTL and start
+ * once again from that now-PTE-mapped page
+ * table.
+ */
+ split_huge_pmd_locked(vma, range.start,
+ pvmw.pmd, false, folio);
+ pvmw.pmd = NULL;
+ spin_unlock(pvmw.ptl);
+ pvmw.ptl = NULL;
+ flags &= ~TTU_SPLIT_HUGE_PMD;
+ continue;
+ }
}
/* Unexpected PMD-mapped THP? */
--
2.33.1
* Re: [PATCH RESEND v5 4/4] mm/vmscan: avoid split lazyfree THP during shrink_folio_list()
From: Baolin Wang @ 2024-05-14 7:09 UTC
To: Lance Yang, akpm
Cc: willy, sj, maskray, ziy, ryan.roberts, david, 21cnbao, mhocko,
fengwei.yin, zokeefe, shy828301, xiehuan09, libang.li,
wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel
On 2024/5/13 15:47, Lance Yang wrote:
> When the user no longer requires the pages, they would use
> madvise(MADV_FREE) to mark the pages as lazy free. Subsequently, they
> typically would not re-write to that memory again.
>
> During memory reclaim, if we detect that the large folio and its PMD are
> both still marked as clean and there are no unexpected references
> (such as GUP), so we can just discard the memory lazily, improving the
> efficiency of memory reclamation in this case.
>
> On an Intel i5 CPU, reclaiming 1GiB of lazyfree THPs using
> mem_cgroup_force_empty() results in the following runtimes in seconds
> (shorter is better):
>
> --------------------------------------------
> | Old | New | Change |
> --------------------------------------------
> | 0.683426 | 0.049197 | -92.80% |
> --------------------------------------------
>
> Suggested-by: Zi Yan <ziy@nvidia.com>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Lance Yang <ioworker0@gmail.com>
> ---
> include/linux/huge_mm.h | 9 +++++
> mm/huge_memory.c | 75 +++++++++++++++++++++++++++++++++++++++++
> mm/rmap.c | 31 ++++++++++-------
> 3 files changed, 103 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 9fcb0b0b6ed1..cfd7ec2b6d0a 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -411,6 +411,8 @@ static inline bool thp_migration_supported(void)
>
> void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
> pmd_t *pmd, bool freeze, struct folio *folio);
> +bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> + pmd_t *pmdp, struct folio *folio);
>
> #else /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> @@ -478,6 +480,13 @@ static inline void split_huge_pmd_locked(struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmd,
> bool freeze, struct folio *folio) {}
>
> +static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
> + unsigned long addr, pmd_t *pmdp,
> + struct folio *folio)
> +{
> + return false;
> +}
> +
> #define split_huge_pud(__vma, __pmd, __address) \
> do { } while (0)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 425272c6c50b..3ceeeb2f42d4 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2687,6 +2687,81 @@ static void unmap_folio(struct folio *folio)
> try_to_unmap_flush();
> }
>
> +static bool __discard_trans_pmd_locked(struct vm_area_struct *vma,
> + unsigned long addr, pmd_t *pmdp,
> + struct folio *folio)
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + int ref_count, map_count;
> + pmd_t orig_pmd = *pmdp;
> + struct mmu_gather tlb;
> + struct page *page;
> +
> + if (pmd_dirty(orig_pmd) || folio_test_dirty(folio))
> + return false;
> + if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
> + return false;
> +
> + page = pmd_page(orig_pmd);
> + if (unlikely(page_folio(page) != folio))
> + return false;
The function is called under the ptl lock, so I have no idea how the
pmd value could change; the validation above seems useless.
> +
> + tlb_gather_mmu(&tlb, mm);
You missed tlb_finish_mmu() to do tlb flushing, and ...
> + orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmdp);
> + tlb_remove_pmd_tlb_entry(&tlb, pmdp, addr);
I don't think tlb gather is helpful here, since you just flush one PMD
entry. Just using pmdp_huge_clear_flush() seems enough.
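E.g. something like the following should be enough (untested, just to
illustrate the idea):

	orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);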
> +
> + /*
> + * Syncing against concurrent GUP-fast:
> + * - clear PMD; barrier; read refcount
> + * - inc refcount; barrier; read PMD
> + */
> + smp_mb();
> +
> + ref_count = folio_ref_count(folio);
> + map_count = folio_mapcount(folio);
> +
> + /*
> + * Order reads for folio refcount and dirty flag
> + * (see comments in __remove_mapping()).
> + */
> + smp_rmb();
> +
> + /*
> + * If the PMD or folio is redirtied at this point, or if there are
> + * unexpected references, we will give up to discard this folio
> + * and remap it.
> + *
> + * The only folio refs must be one from isolation plus the rmap(s).
> + */
> + if (ref_count != map_count + 1 || folio_test_dirty(folio) ||
> + pmd_dirty(orig_pmd)) {
> + set_pmd_at(mm, addr, pmdp, orig_pmd);
Should we also call 'folio_set_swapbacked()' if the folio was redirtied?
> + return false;
> + }
> +
> + folio_remove_rmap_pmd(folio, page, vma);
> + zap_deposited_table(mm, pmdp);
> + add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> + if (vma->vm_flags & VM_LOCKED)
> + mlock_drain_local();
> + folio_put(folio);
> +
> + return true;
> +}
> +
> +bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> + pmd_t *pmdp, struct folio *folio)
> +{
> + VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> + VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
> +
> + if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
> + return __discard_trans_pmd_locked(vma, addr, pmdp, folio);
> +
> + return false;
> +}
> +
> static void remap_page(struct folio *folio, unsigned long nr)
> {
> int i = 0;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 08a93347f283..e09f2141b8dc 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1677,18 +1677,25 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> goto walk_done_err;
> }
>
> - if (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)) {
> - /*
> - * We temporarily have to drop the PTL and start once
> - * again from that now-PTE-mapped page table.
> - */
> - split_huge_pmd_locked(vma, range.start, pvmw.pmd, false,
> - folio);
> - pvmw.pmd = NULL;
> - spin_unlock(pvmw.ptl);
> - pvmw.ptl = NULL;
> - flags &= ~TTU_SPLIT_HUGE_PMD;
> - continue;
> + if (!pvmw.pte) {
> + if (unmap_huge_pmd_locked(vma, range.start, pvmw.pmd,
> + folio))
> + goto walk_done;
> +
> + if (flags & TTU_SPLIT_HUGE_PMD) {
> + /*
> + * We temporarily have to drop the PTL and start
> + * once again from that now-PTE-mapped page
> + * table.
> + */
> + split_huge_pmd_locked(vma, range.start,
> + pvmw.pmd, false, folio);
> + pvmw.pmd = NULL;
> + spin_unlock(pvmw.ptl);
> + pvmw.ptl = NULL;
> + flags &= ~TTU_SPLIT_HUGE_PMD;
> + continue;
> + }
> }
>
> /* Unexpected PMD-mapped THP? */
* Re: [PATCH RESEND v5 4/4] mm/vmscan: avoid split lazyfree THP during shrink_folio_list()
From: Lance Yang @ 2024-05-14 8:36 UTC
To: Baolin Wang
Cc: akpm, willy, sj, maskray, ziy, ryan.roberts, david, 21cnbao,
mhocko, fengwei.yin, zokeefe, shy828301, xiehuan09, libang.li,
wangkefeng.wang, songmuchun, peterx, minchan, linux-mm,
linux-kernel
Hi Baolin,
Thanks for taking the time to review!
On Tue, May 14, 2024 at 3:09 PM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2024/5/13 15:47, Lance Yang wrote:
> > When the user no longer requires the pages, they would use
> > madvise(MADV_FREE) to mark the pages as lazy free. Subsequently, they
> > typically would not re-write to that memory again.
> >
> > During memory reclaim, if we detect that the large folio and its PMD are
> > both still marked as clean and there are no unexpected references
> > (such as GUP), so we can just discard the memory lazily, improving the
> > efficiency of memory reclamation in this case.
> >
> > On an Intel i5 CPU, reclaiming 1GiB of lazyfree THPs using
> > mem_cgroup_force_empty() results in the following runtimes in seconds
> > (shorter is better):
> >
> > --------------------------------------------
> > | Old | New | Change |
> > --------------------------------------------
> > | 0.683426 | 0.049197 | -92.80% |
> > --------------------------------------------
> >
> > Suggested-by: Zi Yan <ziy@nvidia.com>
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Lance Yang <ioworker0@gmail.com>
> > ---
> > include/linux/huge_mm.h | 9 +++++
> > mm/huge_memory.c | 75 +++++++++++++++++++++++++++++++++++++++++
> > mm/rmap.c | 31 ++++++++++-------
> > 3 files changed, 103 insertions(+), 12 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 9fcb0b0b6ed1..cfd7ec2b6d0a 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -411,6 +411,8 @@ static inline bool thp_migration_supported(void)
> >
> > void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
> > pmd_t *pmd, bool freeze, struct folio *folio);
> > +bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> > + pmd_t *pmdp, struct folio *folio);
> >
> > #else /* CONFIG_TRANSPARENT_HUGEPAGE */
> >
> > @@ -478,6 +480,13 @@ static inline void split_huge_pmd_locked(struct vm_area_struct *vma,
> > unsigned long address, pmd_t *pmd,
> > bool freeze, struct folio *folio) {}
> >
> > +static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
> > + unsigned long addr, pmd_t *pmdp,
> > + struct folio *folio)
> > +{
> > + return false;
> > +}
> > +
> > #define split_huge_pud(__vma, __pmd, __address) \
> > do { } while (0)
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 425272c6c50b..3ceeeb2f42d4 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -2687,6 +2687,81 @@ static void unmap_folio(struct folio *folio)
> > try_to_unmap_flush();
> > }
> >
> > +static bool __discard_trans_pmd_locked(struct vm_area_struct *vma,
> > + unsigned long addr, pmd_t *pmdp,
> > + struct folio *folio)
> > +{
> > + struct mm_struct *mm = vma->vm_mm;
> > + int ref_count, map_count;
> > + pmd_t orig_pmd = *pmdp;
> > + struct mmu_gather tlb;
> > + struct page *page;
> > +
> > + if (pmd_dirty(orig_pmd) || folio_test_dirty(folio))
> > + return false;
> > + if (unlikely(!pmd_present(orig_pmd) || !pmd_trans_huge(orig_pmd)))
> > + return false;
> > +
> > + page = pmd_page(orig_pmd);
> > + if (unlikely(page_folio(page) != folio))
> > + return false;
>
> The function is called under the ptl lock, so I have no idea why the pmd
> value can be changed, seems above validation is useless.
Yep. For the page reclaim path, page_vma_mapped_walk() takes the PTL
and makes sure the PMD mapping is unchanged via check_pmd(). But, IMO,
we cannot assume that all callers always do the PMD mapping check. I
suggest keeping it to prepare for corner cases :)
>
> > +
> > + tlb_gather_mmu(&tlb, mm);
>
> You missed tlb_finish_mmu() to do tlb flushing, and ...
Good spot!
>
> > + orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmdp);
> > + tlb_remove_pmd_tlb_entry(&tlb, pmdp, addr);
>
> I don't think tlb gather is helpful here, since you just flush one PMD
> entry. Just using pmdp_huge_clear_flush() seems enough.
Nice, thanks for the suggestion!
I completely agree that using pmdp_huge_clear_flush() is sufficient here :)
>
> > +
> > + /*
> > + * Syncing against concurrent GUP-fast:
> > + * - clear PMD; barrier; read refcount
> > + * - inc refcount; barrier; read PMD
> > + */
> > + smp_mb();
> > +
> > + ref_count = folio_ref_count(folio);
> > + map_count = folio_mapcount(folio);
> > +
> > + /*
> > + * Order reads for folio refcount and dirty flag
> > + * (see comments in __remove_mapping()).
> > + */
> > + smp_rmb();
> > +
> > + /*
> > + * If the PMD or folio is redirtied at this point, or if there are
> > + * unexpected references, we will give up to discard this folio
> > + * and remap it.
> > + *
> > + * The only folio refs must be one from isolation plus the rmap(s).
> > + */
> > + if (ref_count != map_count + 1 || folio_test_dirty(folio) ||
> > + pmd_dirty(orig_pmd)) {
> > + set_pmd_at(mm, addr, pmdp, orig_pmd);
>
> Should we also call 'folio_set_swapbacked()' if the folio was redirtied?
Yes. I forgot about that :)
Thanks again for the review!
Lance
>
> > + return false;
> > + }
> > +
> > + folio_remove_rmap_pmd(folio, page, vma);
> > + zap_deposited_table(mm, pmdp);
> > + add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> > + if (vma->vm_flags & VM_LOCKED)
> > + mlock_drain_local();
> > + folio_put(folio);
> > +
> > + return true;
> > +}
> > +
> > +bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
> > + pmd_t *pmdp, struct folio *folio)
> > +{
> > + VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
> > + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
> > + VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
> > +
> > + if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
> > + return __discard_trans_pmd_locked(vma, addr, pmdp, folio);
> > +
> > + return false;
> > +}
> > +
> > static void remap_page(struct folio *folio, unsigned long nr)
> > {
> > int i = 0;
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 08a93347f283..e09f2141b8dc 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1677,18 +1677,25 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > goto walk_done_err;
> > }
> >
> > - if (!pvmw.pte && (flags & TTU_SPLIT_HUGE_PMD)) {
> > - /*
> > - * We temporarily have to drop the PTL and start once
> > - * again from that now-PTE-mapped page table.
> > - */
> > - split_huge_pmd_locked(vma, range.start, pvmw.pmd, false,
> > - folio);
> > - pvmw.pmd = NULL;
> > - spin_unlock(pvmw.ptl);
> > - pvmw.ptl = NULL;
> > - flags &= ~TTU_SPLIT_HUGE_PMD;
> > - continue;
> > + if (!pvmw.pte) {
> > + if (unmap_huge_pmd_locked(vma, range.start, pvmw.pmd,
> > + folio))
> > + goto walk_done;
> > +
> > + if (flags & TTU_SPLIT_HUGE_PMD) {
> > + /*
> > + * We temporarily have to drop the PTL and start
> > + * once again from that now-PTE-mapped page
> > + * table.
> > + */
> > + split_huge_pmd_locked(vma, range.start,
> > + pvmw.pmd, false, folio);
> > + pvmw.pmd = NULL;
> > + spin_unlock(pvmw.ptl);
> > + pvmw.ptl = NULL;
> > + flags &= ~TTU_SPLIT_HUGE_PMD;
> > + continue;
> > + }
> > }
> >
> > /* Unexpected PMD-mapped THP? */