linux-mm.kvack.org archive mirror
* [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault
@ 2024-09-04 10:09 Dev Jain
  2024-09-04 10:09 ` [PATCH v2 1/2] mm: Abstract THP allocation Dev Jain
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Dev Jain @ 2024-09-04 10:09 UTC (permalink / raw)
  To: akpm, david, willy, kirill.shutemov
  Cc: ryan.roberts, anshuman.khandual, catalin.marinas, cl, vbabka,
	mhocko, apopple, dave.hansen, will, baohua, jack, mark.rutland,
	hughd, aneesh.kumar, yang, peterx, ioworker0, jglisse,
	linux-arm-kernel, linux-kernel, linux-mm, Dev Jain

It was observed at [1] and [2] that the current kernel behaviour of
shattering a hugezeropage is inconsistent and suboptimal. For a VMA with
a THP allowable order, when we write-fault on it, the kernel installs a
PMD-mapped THP. On the other hand, if we first get a read fault, we get
a PMD pointing to the hugezeropage; a subsequent write then triggers a
write-protection fault, shattering the hugezeropage into one writable
page with all the other PTEs write-protected. The conclusion is that,
compared to the case of a single write-fault, applications have to suffer
512 extra page faults if they were to use the VMA as such, plus we get
the overhead of khugepaged trying to replace that area with a THP anyway.

Instead, replace the hugezeropage with a THP on wp-fault.
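
For reference, the access pattern described above can be reproduced with
something like the following (a minimal, hypothetical test, not part of
this series; it assumes a 2MiB PMD size, THP enabled for the mapping and
the huge zero page enabled via use_zero_page):

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

#define LEN (2UL << 20)	/* one PMD worth, assuming a 2MiB PMD size */

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	madvise(p, LEN, MADV_HUGEPAGE);

	volatile char c = p[0];	/* read fault: PMD maps the hugezeropage */
	(void)c;
	p[0] = 1;		/* wp-fault: currently shatters the PMD */

	/* every further 4K page written costs one more page fault */
	for (size_t i = 4096; i < LEN; i += 4096)
		p[i] = 1;

	return 0;
}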

v1->v2:
 - Wrap the call to do_huge_zero_wp_pmd_locked() within the PMD lock and
   unlock
 - Call thp_fault_alloc() before do_huge_zero_wp_pmd_locked() to avoid
   calling a sleeping function from spinlock context

[1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/
[2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/

Dev Jain (2):
  mm: Abstract THP allocation
  mm: Allocate THP on hugezeropage wp-fault

 include/linux/huge_mm.h |   6 ++
 mm/huge_memory.c        | 171 +++++++++++++++++++++++++++++-----------
 mm/memory.c             |   5 +-
 3 files changed, 136 insertions(+), 46 deletions(-)

-- 
2.30.2




* [PATCH v2 1/2] mm: Abstract THP allocation
  2024-09-04 10:09 [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault Dev Jain
@ 2024-09-04 10:09 ` Dev Jain
  2024-09-05  8:20   ` Kirill A. Shutemov
  2024-09-05 11:08   ` Ryan Roberts
  2024-09-04 10:09 ` [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault Dev Jain
  2024-09-04 11:36 ` [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault Ryan Roberts
  2 siblings, 2 replies; 22+ messages in thread
From: Dev Jain @ 2024-09-04 10:09 UTC (permalink / raw)
  To: akpm, david, willy, kirill.shutemov
  Cc: ryan.roberts, anshuman.khandual, catalin.marinas, cl, vbabka,
	mhocko, apopple, dave.hansen, will, baohua, jack, mark.rutland,
	hughd, aneesh.kumar, yang, peterx, ioworker0, jglisse,
	linux-arm-kernel, linux-kernel, linux-mm, Dev Jain

In preparation for the second patch, abstract away the THP allocation
logic present in the create_huge_pmd() path, which corresponds to the
faulting case when no page is present.

There should be no functional change as a result of applying
this patch.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 43 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 67c86a5d64a6..58125fbcc532 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -943,47 +943,89 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
-static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
-			struct page *page, gfp_t gfp)
+static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+				  unsigned long haddr, struct folio **foliop,
+				  unsigned long addr)
 {
-	struct vm_area_struct *vma = vmf->vma;
-	struct folio *folio = page_folio(page);
-	pgtable_t pgtable;
-	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	vm_fault_t ret = 0;
+	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
 
-	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+	*foliop = folio;
+	if (unlikely(!folio)) {
+		count_vm_event(THP_FAULT_FALLBACK);
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
 
+	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
 	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
 		folio_put(folio);
 		count_vm_event(THP_FAULT_FALLBACK);
 		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
-		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
-		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 		return VM_FAULT_FALLBACK;
 	}
 	folio_throttle_swaprate(folio, gfp);
 
-	pgtable = pte_alloc_one(vma->vm_mm);
-	if (unlikely(!pgtable)) {
-		ret = VM_FAULT_OOM;
-		goto release;
-	}
-
-	folio_zero_user(folio, vmf->address);
+	folio_zero_user(folio, addr);
 	/*
 	 * The memory barrier inside __folio_mark_uptodate makes sure that
 	 * folio_zero_user writes become visible before the set_pmd_at()
 	 * write.
 	 */
 	__folio_mark_uptodate(folio);
+	return 0;
+}
+
+static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
+{
+	count_vm_event(THP_FAULT_ALLOC);
+	count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);
+	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+}
+
+static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+			struct vm_area_struct *vma, unsigned long haddr,
+			pgtable_t pgtable)
+{
+	pmd_t entry;
+
+	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
+	folio_add_lru_vma(folio, vma);
+	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	mm_inc_nr_ptes(vma->vm_mm);
+}
+
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct folio *folio = NULL;
+	pgtable_t pgtable;
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	vm_fault_t ret = 0;
+	gfp_t gfp = vma_thp_gfp_mask(vma);
+
+	pgtable = pte_alloc_one(vma->vm_mm);
+	if (unlikely(!pgtable)) {
+		ret = VM_FAULT_OOM;
+		goto release;
+	}
+
+	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
+			      vmf->address);
+	if (ret)
+		goto release;
 
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+
 	if (unlikely(!pmd_none(*vmf->pmd))) {
 		goto unlock_release;
 	} else {
-		pmd_t entry;
-
 		ret = check_stable_address_space(vma->vm_mm);
 		if (ret)
 			goto unlock_release;
@@ -997,20 +1039,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
 			return ret;
 		}
-
-		entry = mk_huge_pmd(page, vma->vm_page_prot);
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
-		folio_add_lru_vma(folio, vma);
-		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
-		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
-		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		mm_inc_nr_ptes(vma->vm_mm);
+		map_pmd_thp(folio, vmf, vma, haddr, pgtable);
 		spin_unlock(vmf->ptl);
-		count_vm_event(THP_FAULT_ALLOC);
-		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
-		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
 	}
 
 	return 0;
@@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 release:
 	if (pgtable)
 		pte_free(vma->vm_mm, pgtable);
-	folio_put(folio);
+	if (folio)
+		folio_put(folio);
 	return ret;
 
 }
@@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	gfp_t gfp;
-	struct folio *folio;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 	vm_fault_t ret;
 
@@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		}
 		return ret;
 	}
-	gfp = vma_thp_gfp_mask(vma);
-	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
-	if (unlikely(!folio)) {
-		count_vm_event(THP_FAULT_FALLBACK);
-		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
-		return VM_FAULT_FALLBACK;
-	}
-	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
+
+	return __do_huge_pmd_anonymous_page(vmf);
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-- 
2.30.2




* [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-04 10:09 [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault Dev Jain
  2024-09-04 10:09 ` [PATCH v2 1/2] mm: Abstract THP allocation Dev Jain
@ 2024-09-04 10:09 ` Dev Jain
  2024-09-05  8:26   ` Kirill A. Shutemov
  2024-09-05 13:14   ` Ryan Roberts
  2024-09-04 11:36 ` [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault Ryan Roberts
  2 siblings, 2 replies; 22+ messages in thread
From: Dev Jain @ 2024-09-04 10:09 UTC (permalink / raw)
  To: akpm, david, willy, kirill.shutemov
  Cc: ryan.roberts, anshuman.khandual, catalin.marinas, cl, vbabka,
	mhocko, apopple, dave.hansen, will, baohua, jack, mark.rutland,
	hughd, aneesh.kumar, yang, peterx, ioworker0, jglisse,
	linux-arm-kernel, linux-kernel, linux-mm, Dev Jain

Introduce do_huge_zero_wp_pmd() to handle a wp-fault on a hugezeropage
and replace it with a PMD-mapped THP. Change the helpers introduced in
the previous patch to flush the TLB entry corresponding to the
hugezeropage and to preserve the PMD uffd-wp marker. In case of failure,
fall back to splitting the PMD.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 include/linux/huge_mm.h |  6 ++++
 mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
 mm/memory.c             |  5 +--
 3 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..fdd2cf473a3c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -9,6 +9,12 @@
 #include <linux/kobject.h>
 
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
+vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+			   unsigned long haddr, struct folio **foliop,
+			   unsigned long addr);
+void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+		 struct vm_area_struct *vma, unsigned long haddr,
+		 pgtable_t pgtable);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 58125fbcc532..150163ad77d3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
-static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
-				  unsigned long haddr, struct folio **foliop,
-				  unsigned long addr)
+vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+			   unsigned long haddr, struct folio **foliop,
+			   unsigned long addr)
 {
 	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
 
@@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
 	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
 }
 
-static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
-			struct vm_area_struct *vma, unsigned long haddr,
-			pgtable_t pgtable)
+void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+		 struct vm_area_struct *vma, unsigned long haddr,
+		 pgtable_t pgtable)
 {
-	pmd_t entry;
+	pmd_t entry, old_pmd;
+	bool is_pmd_none = pmd_none(*vmf->pmd);
 
 	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
 	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
 	folio_add_lru_vma(folio, vma);
-	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+	if (!is_pmd_none) {
+		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+		if (pmd_uffd_wp(old_pmd))
+			entry = pmd_mkuffd_wp(entry);
+	}
+	if (pgtable)
+		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-	mm_inc_nr_ptes(vma->vm_mm);
+	if (is_pmd_none)
+		mm_inc_nr_ptes(vma->vm_mm);
 }
 
 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
@@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
 	spin_unlock(vmf->ptl);
 }
 
+static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
+					     unsigned long haddr,
+					     struct folio *folio)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = 0;
+
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto out;
+	map_pmd_thp(folio, vmf, vma, haddr, NULL);
+out:
+	return ret;
+}
+
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	gfp_t gfp = vma_thp_gfp_mask(vma);
+	struct mmu_notifier_range range;
+	struct folio *folio = NULL;
+	vm_fault_t ret = 0;
+
+	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
+			      vmf->address);
+	if (ret)
+		goto out;
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+				haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+		goto unlock;
+	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
+	if (!ret)
+		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
+unlock:
+	spin_unlock(vmf->ptl);
+	mmu_notifier_invalidate_range_end(&range);
+out:
+	return ret;
+}
+
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 {
 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
@@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
 
-	if (is_huge_zero_pmd(orig_pmd))
+	if (is_huge_zero_pmd(orig_pmd)) {
+		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
+
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+
+		/* Fallback to splitting PMD if THP cannot be allocated */
 		goto fallback;
+	}
 
 	spin_lock(vmf->ptl);
 
diff --git a/mm/memory.c b/mm/memory.c
index 3c01d68065be..c081a25f5173 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 	if (vma_is_anonymous(vma)) {
 		if (likely(!unshare) &&
 		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
-			if (userfaultfd_wp_async(vmf->vma))
+			if (!userfaultfd_wp_async(vmf->vma))
+				return handle_userfault(vmf, VM_UFFD_WP);
+			if (!is_huge_zero_pmd(vmf->orig_pmd))
 				goto split;
-			return handle_userfault(vmf, VM_UFFD_WP);
 		}
 		return do_huge_pmd_wp_page(vmf);
 	}
-- 
2.30.2




* Re: [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault
  2024-09-04 10:09 [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault Dev Jain
  2024-09-04 10:09 ` [PATCH v2 1/2] mm: Abstract THP allocation Dev Jain
  2024-09-04 10:09 ` [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault Dev Jain
@ 2024-09-04 11:36 ` Ryan Roberts
  2024-09-04 15:41   ` Dev Jain
  2 siblings, 1 reply; 22+ messages in thread
From: Ryan Roberts @ 2024-09-04 11:36 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm

Hi Dev,

On 04/09/2024 11:09, Dev Jain wrote:
> It was observed at [1] and [2] that the current kernel behaviour of
> shattering a hugezeropage is inconsistent and suboptimal. For a VMA with
> a THP allowable order, when we write-fault on it, the kernel installs a
> PMD-mapped THP. On the other hand, if we first get a read fault, we get
> a PMD pointing to the hugezeropage; subsequent write will trigger a
> write-protection fault, shattering the hugezeropage into one writable
> page, and all the other PTEs write-protected. The conclusion being, as
> compared to the case of a single write-fault, applications have to suffer
> 512 extra page faults if they were to use the VMA as such, plus we get
> the overhead of khugepaged trying to replace that area with a THP anyway.
> 
> Instead, replace the hugezeropage with a THP on wp-fault.
> 
> v1->v2:
>  - Wrap do_huge_zero_wp_pmd_locked() around lock and unlock
>  - Call thp_fault_alloc() before do_huge_zero_wp_pmd_locked() to avoid
>  - calling sleeping function from spinlock context
> 
> [1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/
> [2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/
> 
> Dev Jain (2):
>   mm: Abstract THP allocation
>   mm: Allocate THP on hugezeropage wp-fault
> 
>  include/linux/huge_mm.h |   6 ++
>  mm/huge_memory.c        | 171 +++++++++++++++++++++++++++++-----------
>  mm/memory.c             |   5 +-
>  3 files changed, 136 insertions(+), 46 deletions(-)
> 

What is the base for this? It doesn't apply on top of mm-unstable.

Thanks,
Ryan




* Re: [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault
  2024-09-04 11:36 ` [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault Ryan Roberts
@ 2024-09-04 15:41   ` Dev Jain
  2024-09-04 16:01     ` Ryan Roberts
  0 siblings, 1 reply; 22+ messages in thread
From: Dev Jain @ 2024-09-04 15:41 UTC (permalink / raw)
  To: Ryan Roberts, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm


On 9/4/24 17:06, Ryan Roberts wrote:
> Hi Dev,
>
> On 04/09/2024 11:09, Dev Jain wrote:
>> It was observed at [1] and [2] that the current kernel behaviour of
>> shattering a hugezeropage is inconsistent and suboptimal. For a VMA with
>> a THP allowable order, when we write-fault on it, the kernel installs a
>> PMD-mapped THP. On the other hand, if we first get a read fault, we get
>> a PMD pointing to the hugezeropage; subsequent write will trigger a
>> write-protection fault, shattering the hugezeropage into one writable
>> page, and all the other PTEs write-protected. The conclusion being, as
>> compared to the case of a single write-fault, applications have to suffer
>> 512 extra page faults if they were to use the VMA as such, plus we get
>> the overhead of khugepaged trying to replace that area with a THP anyway.
>>
>> Instead, replace the hugezeropage with a THP on wp-fault.
>>
>> v1->v2:
>>   - Wrap do_huge_zero_wp_pmd_locked() around lock and unlock
>>   - Call thp_fault_alloc() before do_huge_zero_wp_pmd_locked() to avoid
>>   - calling sleeping function from spinlock context
>>
>> [1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/
>> [2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/
>>
>> Dev Jain (2):
>>    mm: Abstract THP allocation
>>    mm: Allocate THP on hugezeropage wp-fault
>>
>>   include/linux/huge_mm.h |   6 ++
>>   mm/huge_memory.c        | 171 +++++++++++++++++++++++++++++-----------
>>   mm/memory.c             |   5 +-
>>   3 files changed, 136 insertions(+), 46 deletions(-)
>>
> What is the base for this? It doesn't apply on top of mm-unstable.

Sorry, forgot to mention, it applies on v6.11-rc5.

>
> Thanks,
> Ryan
>



* Re: [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault
  2024-09-04 15:41   ` Dev Jain
@ 2024-09-04 16:01     ` Ryan Roberts
  0 siblings, 0 replies; 22+ messages in thread
From: Ryan Roberts @ 2024-09-04 16:01 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm

On 04/09/2024 16:41, Dev Jain wrote:
> 
> On 9/4/24 17:06, Ryan Roberts wrote:
>> Hi Dev,
>>
>> On 04/09/2024 11:09, Dev Jain wrote:
>>> It was observed at [1] and [2] that the current kernel behaviour of
>>> shattering a hugezeropage is inconsistent and suboptimal. For a VMA with
>>> a THP allowable order, when we write-fault on it, the kernel installs a
>>> PMD-mapped THP. On the other hand, if we first get a read fault, we get
>>> a PMD pointing to the hugezeropage; subsequent write will trigger a
>>> write-protection fault, shattering the hugezeropage into one writable
>>> page, and all the other PTEs write-protected. The conclusion being, as
>>> compared to the case of a single write-fault, applications have to suffer
>>> 512 extra page faults if they were to use the VMA as such, plus we get
>>> the overhead of khugepaged trying to replace that area with a THP anyway.
>>>
>>> Instead, replace the hugezeropage with a THP on wp-fault.
>>>
>>> v1->v2:
>>>   - Wrap do_huge_zero_wp_pmd_locked() around lock and unlock
>>>   - Call thp_fault_alloc() before do_huge_zero_wp_pmd_locked() to avoid
>>>   - calling sleeping function from spinlock context
>>>
>>> [1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/
>>> [2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/
>>>
>>> Dev Jain (2):
>>>    mm: Abstract THP allocation
>>>    mm: Allocate THP on hugezeropage wp-fault
>>>
>>>   include/linux/huge_mm.h |   6 ++
>>>   mm/huge_memory.c        | 171 +++++++++++++++++++++++++++++-----------
>>>   mm/memory.c             |   5 +-
>>>   3 files changed, 136 insertions(+), 46 deletions(-)
>>>
>> What is the base for this? It doesn't apply on top of mm-unstable.
> 
> Sorry, forgot to mention, it applies on v6.11-rc5.

Thanks, I'll give it a review tomorrow, although I suspect that Andrew
will want it rebased onto mm-unstable once it gets into shape for
merging.

> 
>>
>> Thanks,
>> Ryan
>>




* Re: [PATCH v2 1/2] mm: Abstract THP allocation
  2024-09-04 10:09 ` [PATCH v2 1/2] mm: Abstract THP allocation Dev Jain
@ 2024-09-05  8:20   ` Kirill A. Shutemov
  2024-09-05  8:45     ` Dev Jain
  2024-09-05 11:08   ` Ryan Roberts
  1 sibling, 1 reply; 22+ messages in thread
From: Kirill A. Shutemov @ 2024-09-05  8:20 UTC (permalink / raw)
  To: Dev Jain
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual,
	catalin.marinas, cl, vbabka, mhocko, apopple, dave.hansen, will,
	baohua, jack, mark.rutland, hughd, aneesh.kumar, yang, peterx,
	ioworker0, jglisse, linux-arm-kernel, linux-kernel, linux-mm

On Wed, Sep 04, 2024 at 03:39:22PM +0530, Dev Jain wrote:
> In preparation for the second patch, abstract away the THP allocation
> logic present in the create_huge_pmd() path, which corresponds to the
> faulting case when no page is present.
> 
> There should be no functional change as a result of applying
> this patch.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
>  1 file changed, 67 insertions(+), 43 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 67c86a5d64a6..58125fbcc532 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -943,47 +943,89 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>  }
>  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>  
> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> -			struct page *page, gfp_t gfp)
> +static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +				  unsigned long haddr, struct folio **foliop,
> +				  unsigned long addr)

foliop is awkward.

Why not return the folio? NULL would indicate to the caller to fall back.
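
Something along these lines, perhaps (untested sketch of the shape I
mean, reusing the calls already in this patch):

static struct folio *thp_fault_alloc(gfp_t gfp, int order,
				     struct vm_area_struct *vma,
				     unsigned long haddr, unsigned long addr)
{
	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);

	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		return NULL;
	}

	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return NULL;
	}
	folio_throttle_swaprate(folio, gfp);
	folio_zero_user(folio, addr);
	__folio_mark_uptodate(folio);

	/* NULL tells the caller to return VM_FAULT_FALLBACK */
	return folio;
}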

>  {
> -	struct vm_area_struct *vma = vmf->vma;
> -	struct folio *folio = page_folio(page);
> -	pgtable_t pgtable;
> -	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -	vm_fault_t ret = 0;
> +	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>  
> -	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> +	*foliop = folio;
> +	if (unlikely(!folio)) {
> +		count_vm_event(THP_FAULT_FALLBACK);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		return VM_FAULT_FALLBACK;
> +	}
>  
> +	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>  	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>  		folio_put(folio);
>  		count_vm_event(THP_FAULT_FALLBACK);
>  		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>  		return VM_FAULT_FALLBACK;
>  	}
>  	folio_throttle_swaprate(folio, gfp);
>  
> -	pgtable = pte_alloc_one(vma->vm_mm);
> -	if (unlikely(!pgtable)) {
> -		ret = VM_FAULT_OOM;
> -		goto release;
> -	}
> -
> -	folio_zero_user(folio, vmf->address);
> +	folio_zero_user(folio, addr);
>  	/*
>  	 * The memory barrier inside __folio_mark_uptodate makes sure that
>  	 * folio_zero_user writes become visible before the set_pmd_at()
>  	 * write.
>  	 */
>  	__folio_mark_uptodate(folio);
> +	return 0;
> +}
> +
> +static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
> +{
> +	count_vm_event(THP_FAULT_ALLOC);
> +	count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);
> +	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
> +}
> +
> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +			struct vm_area_struct *vma, unsigned long haddr,
> +			pgtable_t pgtable)
> +{
> +	pmd_t entry;
> +
> +	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
> +	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> +	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
> +	folio_add_lru_vma(folio, vma);
> +	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> +	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> +	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> +	mm_inc_nr_ptes(vma->vm_mm);
> +}
> +
> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct folio *folio = NULL;
> +	pgtable_t pgtable;
> +	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> +	vm_fault_t ret = 0;
> +	gfp_t gfp = vma_thp_gfp_mask(vma);
> +
> +	pgtable = pte_alloc_one(vma->vm_mm);
> +	if (unlikely(!pgtable)) {
> +		ret = VM_FAULT_OOM;
> +		goto release;
> +	}
> +
> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
> +			      vmf->address);
> +	if (ret)
> +		goto release;

THP page allocation has a higher probability of failing than pgtable
allocation. It is better to allocate it first, before the pgtable, and do
less work on the error path.
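
I.e. something like this (sketch only):

	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
			      vmf->address);
	if (ret)
		return ret;		/* nothing else allocated yet */

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;		/* only the folio to put */
	}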

>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> +
>  	if (unlikely(!pmd_none(*vmf->pmd))) {
>  		goto unlock_release;
>  	} else {
> -		pmd_t entry;
> -
>  		ret = check_stable_address_space(vma->vm_mm);
>  		if (ret)
>  			goto unlock_release;
> @@ -997,20 +1039,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>  			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
>  			return ret;
>  		}
> -
> -		entry = mk_huge_pmd(page, vma->vm_page_prot);
> -		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
> -		folio_add_lru_vma(folio, vma);
> -		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> -		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> -		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -		mm_inc_nr_ptes(vma->vm_mm);
> +		map_pmd_thp(folio, vmf, vma, haddr, pgtable);
>  		spin_unlock(vmf->ptl);
> -		count_vm_event(THP_FAULT_ALLOC);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
> -		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>  	}
>  
>  	return 0;
> @@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>  release:
>  	if (pgtable)
>  		pte_free(vma->vm_mm, pgtable);
> -	folio_put(folio);
> +	if (folio)
> +		folio_put(folio);
>  	return ret;
>  
>  }
> @@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
>  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> -	gfp_t gfp;
> -	struct folio *folio;
>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>  	vm_fault_t ret;
>  
> @@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>  		}
>  		return ret;
>  	}
> -	gfp = vma_thp_gfp_mask(vma);
> -	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
> -	if (unlikely(!folio)) {
> -		count_vm_event(THP_FAULT_FALLBACK);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> -		return VM_FAULT_FALLBACK;
> -	}
> -	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
> +
> +	return __do_huge_pmd_anonymous_page(vmf);
>  }
>  
>  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
> -- 
> 2.30.2
> 

-- 
  Kiryl Shutsemau / Kirill A. Shutemov



* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-04 10:09 ` [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault Dev Jain
@ 2024-09-05  8:26   ` Kirill A. Shutemov
  2024-09-05  8:52     ` Dev Jain
  2024-09-05  9:41     ` Kefeng Wang
  2024-09-05 13:14   ` Ryan Roberts
  1 sibling, 2 replies; 22+ messages in thread
From: Kirill A. Shutemov @ 2024-09-05  8:26 UTC (permalink / raw)
  To: Dev Jain
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual,
	catalin.marinas, cl, vbabka, mhocko, apopple, dave.hansen, will,
	baohua, jack, mark.rutland, hughd, aneesh.kumar, yang, peterx,
	ioworker0, jglisse, linux-arm-kernel, linux-kernel, linux-mm

On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
> replace it with a PMD-mapped THP. Change the helpers introduced in the
> previous patch to flush TLB entry corresponding to the hugezeropage,
> and preserve PMD uffd-wp marker. In case of failure, fallback to
> splitting the PMD.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  include/linux/huge_mm.h |  6 ++++
>  mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>  mm/memory.c             |  5 +--
>  3 files changed, 78 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e25d9ebfdf89..fdd2cf473a3c 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -9,6 +9,12 @@
>  #include <linux/kobject.h>
>  
>  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +			   unsigned long haddr, struct folio **foliop,
> +			   unsigned long addr);
> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +		 struct vm_area_struct *vma, unsigned long haddr,
> +		 pgtable_t pgtable);

Why? I don't see users outside huge_memory.c.

>  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>  		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 58125fbcc532..150163ad77d3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>  }
>  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>  
> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> -				  unsigned long haddr, struct folio **foliop,
> -				  unsigned long addr)
> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +			   unsigned long haddr, struct folio **foliop,
> +			   unsigned long addr)
>  {
>  	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>  
> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>  	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>  }
>  
> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> -			struct vm_area_struct *vma, unsigned long haddr,
> -			pgtable_t pgtable)
> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +		 struct vm_area_struct *vma, unsigned long haddr,
> +		 pgtable_t pgtable)
>  {
> -	pmd_t entry;
> +	pmd_t entry, old_pmd;
> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>  
>  	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>  	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>  	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>  	folio_add_lru_vma(folio, vma);
> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> +	if (!is_pmd_none) {
> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
> +		if (pmd_uffd_wp(old_pmd))
> +			entry = pmd_mkuffd_wp(entry);
> +	}
> +	if (pgtable)
> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>  	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>  	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>  	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -	mm_inc_nr_ptes(vma->vm_mm);
> +	if (is_pmd_none)
> +		mm_inc_nr_ptes(vma->vm_mm);
>  }
>  
>  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>  	spin_unlock(vmf->ptl);
>  }
>  
> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
> +					     unsigned long haddr,
> +					     struct folio *folio)

Why is the helper needed? Cannot it just be open-coded in
do_huge_zero_wp_pmd()?

> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	vm_fault_t ret = 0;
> +
> +	ret = check_stable_address_space(vma->vm_mm);
> +	if (ret)
> +		goto out;
> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
> +out:
> +	return ret;
> +}
> +
> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	gfp_t gfp = vma_thp_gfp_mask(vma);
> +	struct mmu_notifier_range range;
> +	struct folio *folio = NULL;
> +	vm_fault_t ret = 0;
> +
> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
> +			      vmf->address);
> +	if (ret)
> +		goto out;
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
> +				haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
> +		goto unlock;
> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
> +	if (!ret)
> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
> +unlock:
> +	spin_unlock(vmf->ptl);
> +	mmu_notifier_invalidate_range_end(&range);
> +out:
> +	return ret;
> +}
> +
>  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>  {
>  	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>  	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>  
> -	if (is_huge_zero_pmd(orig_pmd))
> +	if (is_huge_zero_pmd(orig_pmd)) {
> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
> +
> +		if (!(ret & VM_FAULT_FALLBACK))
> +			return ret;
> +
> +		/* Fallback to splitting PMD if THP cannot be allocated */
>  		goto fallback;
> +	}
>  
>  	spin_lock(vmf->ptl);
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 3c01d68065be..c081a25f5173 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>  	if (vma_is_anonymous(vma)) {
>  		if (likely(!unshare) &&
>  		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
> -			if (userfaultfd_wp_async(vmf->vma))
> +			if (!userfaultfd_wp_async(vmf->vma))
> +				return handle_userfault(vmf, VM_UFFD_WP);
> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>  				goto split;
> -			return handle_userfault(vmf, VM_UFFD_WP);
>  		}
>  		return do_huge_pmd_wp_page(vmf);
>  	}
> -- 
> 2.30.2
> 

-- 
  Kiryl Shutsemau / Kirill A. Shutemov



* Re: [PATCH v2 1/2] mm: Abstract THP allocation
  2024-09-05  8:20   ` Kirill A. Shutemov
@ 2024-09-05  8:45     ` Dev Jain
  0 siblings, 0 replies; 22+ messages in thread
From: Dev Jain @ 2024-09-05  8:45 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual,
	catalin.marinas, cl, vbabka, mhocko, apopple, dave.hansen, will,
	baohua, jack, mark.rutland, hughd, aneesh.kumar, yang, peterx,
	ioworker0, jglisse, linux-arm-kernel, linux-kernel, linux-mm


On 9/5/24 13:50, Kirill A. Shutemov wrote:
> On Wed, Sep 04, 2024 at 03:39:22PM +0530, Dev Jain wrote:
>> In preparation for the second patch, abstract away the THP allocation
>> logic present in the create_huge_pmd() path, which corresponds to the
>> faulting case when no page is present.
>>
>> There should be no functional change as a result of applying
>> this patch.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
>>   1 file changed, 67 insertions(+), 43 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 67c86a5d64a6..58125fbcc532 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -943,47 +943,89 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   
>> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>> -			struct page *page, gfp_t gfp)
>> +static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +				  unsigned long haddr, struct folio **foliop,
>> +				  unsigned long addr)
> foliop is awkward.
>
> Why not return folio? NULL would indicate to the caller to fallback.

I took inspiration from other call sites like mfill_atomic_pte_copy(),
which take a double pointer to the folio. If we do as you say, then
thp_fault_alloc() would do all the fallback stat accounting and return
the folio; the caller would then have to check for NULL, set
ret = VM_FAULT_FALLBACK, and goto out/release. Basically, the fallback
stat accounting and the actual setting of the fallback return value
would get separated. Currently, I can set the folio pointer and the
return value simultaneously.
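
I.e. on the caller side I would then need something like (sketch):

	folio = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, vmf->address);
	if (unlikely(!folio)) {
		/* fallback stats already counted inside the helper */
		ret = VM_FAULT_FALLBACK;
		goto release;
	}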

>
>>   {
>> -	struct vm_area_struct *vma = vmf->vma;
>> -	struct folio *folio = page_folio(page);
>> -	pgtable_t pgtable;
>> -	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> -	vm_fault_t ret = 0;
>> +	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>   
>> -	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>> +	*foliop = folio;
>> +	if (unlikely(!folio)) {
>> +		count_vm_event(THP_FAULT_FALLBACK);
>> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>> +		return VM_FAULT_FALLBACK;
>> +	}
>>   
>> +	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>   	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>   		folio_put(folio);
>>   		count_vm_event(THP_FAULT_FALLBACK);
>>   		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>   		return VM_FAULT_FALLBACK;
>>   	}
>>   	folio_throttle_swaprate(folio, gfp);
>>   
>> -	pgtable = pte_alloc_one(vma->vm_mm);
>> -	if (unlikely(!pgtable)) {
>> -		ret = VM_FAULT_OOM;
>> -		goto release;
>> -	}
>> -
>> -	folio_zero_user(folio, vmf->address);
>> +	folio_zero_user(folio, addr);
>>   	/*
>>   	 * The memory barrier inside __folio_mark_uptodate makes sure that
>>   	 * folio_zero_user writes become visible before the set_pmd_at()
>>   	 * write.
>>   	 */
>>   	__folio_mark_uptodate(folio);
>> +	return 0;
>> +}
>> +
>> +static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>> +{
>> +	count_vm_event(THP_FAULT_ALLOC);
>> +	count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);
>> +	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>> +}
>> +
>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +			struct vm_area_struct *vma, unsigned long haddr,
>> +			pgtable_t pgtable)
>> +{
>> +	pmd_t entry;
>> +
>> +	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>> +	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>> +	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>> +	folio_add_lru_vma(folio, vma);
>> +	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> +	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>> +	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> +	mm_inc_nr_ptes(vma->vm_mm);
>> +}
>> +
>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	struct folio *folio = NULL;
>> +	pgtable_t pgtable;
>> +	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> +	vm_fault_t ret = 0;
>> +	gfp_t gfp = vma_thp_gfp_mask(vma);
>> +
>> +	pgtable = pte_alloc_one(vma->vm_mm);
>> +	if (unlikely(!pgtable)) {
>> +		ret = VM_FAULT_OOM;
>> +		goto release;
>> +	}
>> +
>> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>> +			      vmf->address);
>> +	if (ret)
>> +		goto release;
> THP page allocation has higher probability to fail than pgtable allocation. It
> is better to allocate it first, before pgtable and do less work on error
> path.

Thanks, makes sense.




* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-05  8:26   ` Kirill A. Shutemov
@ 2024-09-05  8:52     ` Dev Jain
  2024-09-05  9:41     ` Kefeng Wang
  1 sibling, 0 replies; 22+ messages in thread
From: Dev Jain @ 2024-09-05  8:52 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual,
	catalin.marinas, cl, vbabka, mhocko, apopple, dave.hansen, will,
	baohua, jack, mark.rutland, hughd, aneesh.kumar, yang, peterx,
	ioworker0, jglisse, linux-arm-kernel, linux-kernel, linux-mm


On 9/5/24 13:56, Kirill A. Shutemov wrote:
> On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>> previous patch to flush TLB entry corresponding to the hugezeropage,
>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>> splitting the PMD.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   include/linux/huge_mm.h |  6 ++++
>>   mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>   mm/memory.c             |  5 +--
>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..fdd2cf473a3c 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -9,6 +9,12 @@
>>   #include <linux/kobject.h>
>>   
>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr);
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable);
> Why? I don't see users outside huge_memory.c.


Ah sorry! When I started out on this I may have planned to use
it in mm/memory.c, but then completely forgot to drop this.

>
>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>   		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>   		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 58125fbcc532..150163ad77d3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   
>> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> -				  unsigned long haddr, struct folio **foliop,
>> -				  unsigned long addr)
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr)
>>   {
>>   	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>   
>> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>   	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>   }
>>   
>> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> -			struct vm_area_struct *vma, unsigned long haddr,
>> -			pgtable_t pgtable)
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable)
>>   {
>> -	pmd_t entry;
>> +	pmd_t entry, old_pmd;
>> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>>   
>>   	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>   	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>   	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>   	folio_add_lru_vma(folio, vma);
>> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> +	if (!is_pmd_none) {
>> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>> +		if (pmd_uffd_wp(old_pmd))
>> +			entry = pmd_mkuffd_wp(entry);
>> +	}
>> +	if (pgtable)
>> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>   	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>   	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>   	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> -	mm_inc_nr_ptes(vma->vm_mm);
>> +	if (is_pmd_none)
>> +		mm_inc_nr_ptes(vma->vm_mm);
>>   }
>>   
>>   static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>   	spin_unlock(vmf->ptl);
>>   }
>>   
>> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>> +					     unsigned long haddr,
>> +					     struct folio *folio)
> Why the helper is needed? Cannot it be just opencodded in
> do_huge_zero_wp_pmd()?

I was taking inspiration from split_huge_pud_locked() and the like,
although I guess I should open-code it, since do_huge_zero_wp_pmd_locked()
is small anyway.
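
Open-coded, it would look roughly like this (sketch):

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
		goto unlock;

	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto unlock;

	map_pmd_thp(folio, vmf, vma, haddr, NULL);
	__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
unlock:
	spin_unlock(vmf->ptl);
	mmu_notifier_invalidate_range_end(&range);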

>
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = check_stable_address_space(vma->vm_mm);
>> +	if (ret)
>> +		goto out;
>> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
>> +out:
>> +	return ret;
>> +}
>> +
>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	gfp_t gfp = vma_thp_gfp_mask(vma);
>> +	struct mmu_notifier_range range;
>> +	struct folio *folio = NULL;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>> +			      vmf->address);
>> +	if (ret)
>> +		goto out;
>> +
>> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>> +				haddr + HPAGE_PMD_SIZE);
>> +	mmu_notifier_invalidate_range_start(&range);
>> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>> +		goto unlock;
>> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>> +	if (!ret)
>> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>> +unlock:
>> +	spin_unlock(vmf->ptl);
>> +	mmu_notifier_invalidate_range_end(&range);
>> +out:
>> +	return ret;
>> +}
>> +
>>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   {
>>   	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>   	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>   
>> -	if (is_huge_zero_pmd(orig_pmd))
>> +	if (is_huge_zero_pmd(orig_pmd)) {
>> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>> +
>> +		if (!(ret & VM_FAULT_FALLBACK))
>> +			return ret;
>> +
>> +		/* Fallback to splitting PMD if THP cannot be allocated */
>>   		goto fallback;
>> +	}
>>   
>>   	spin_lock(vmf->ptl);
>>   
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 3c01d68065be..c081a25f5173 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>>   	if (vma_is_anonymous(vma)) {
>>   		if (likely(!unshare) &&
>>   		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>> -			if (userfaultfd_wp_async(vmf->vma))
>> +			if (!userfaultfd_wp_async(vmf->vma))
>> +				return handle_userfault(vmf, VM_UFFD_WP);
>> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>>   				goto split;
>> -			return handle_userfault(vmf, VM_UFFD_WP);
>>   		}
>>   		return do_huge_pmd_wp_page(vmf);
>>   	}
>> -- 
>> 2.30.2
>>



* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-05  8:26   ` Kirill A. Shutemov
  2024-09-05  8:52     ` Dev Jain
@ 2024-09-05  9:41     ` Kefeng Wang
  2024-09-05  9:53       ` Dev Jain
  1 sibling, 1 reply; 22+ messages in thread
From: Kefeng Wang @ 2024-09-05  9:41 UTC (permalink / raw)
  To: Kirill A. Shutemov, Dev Jain
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual,
	catalin.marinas, cl, vbabka, mhocko, apopple, dave.hansen, will,
	baohua, jack, mark.rutland, hughd, aneesh.kumar, yang, peterx,
	ioworker0, jglisse, linux-arm-kernel, linux-kernel, linux-mm



On 2024/9/5 16:26, Kirill A. Shutemov wrote:
> On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>> previous patch to flush TLB entry corresponding to the hugezeropage,
>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>> splitting the PMD.

We hit the same issue, and have a very similar function in our kernel.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   include/linux/huge_mm.h |  6 ++++
>>   mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>   mm/memory.c             |  5 +--
>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..fdd2cf473a3c 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -9,6 +9,12 @@
>>   #include <linux/kobject.h>
>>   
>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr);
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable);
> 
> Why? I don't see users outside huge_memory.c.
> 
>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>   		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>   		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 58125fbcc532..150163ad77d3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   
>> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> -				  unsigned long haddr, struct folio **foliop,
>> -				  unsigned long addr)
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr)
>>   {
>>   	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>   
>> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>   	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>   }
>>   
>> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> -			struct vm_area_struct *vma, unsigned long haddr,
>> -			pgtable_t pgtable)
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable)
>>   {
>> -	pmd_t entry;
>> +	pmd_t entry, old_pmd;
>> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>>   
>>   	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>   	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>   	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>   	folio_add_lru_vma(folio, vma);
>> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> +	if (!is_pmd_none) {
>> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>> +		if (pmd_uffd_wp(old_pmd))
>> +			entry = pmd_mkuffd_wp(entry);
>> +	}
>> +	if (pgtable)
>> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>   	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>   	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>   	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> -	mm_inc_nr_ptes(vma->vm_mm);
>> +	if (is_pmd_none)
>> +		mm_inc_nr_ptes(vma->vm_mm);
>>   }
>>   
>>   static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>   	spin_unlock(vmf->ptl);
>>   }
>>   
>> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>> +					     unsigned long haddr,
>> +					     struct folio *folio)
> 
> Why the helper is needed? Cannot it be just opencodded in
> do_huge_zero_wp_pmd()?
> 
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = check_stable_address_space(vma->vm_mm);
>> +	if (ret)
>> +		goto out;
>> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
>> +out:
>> +	return ret;
>> +}
>> +
>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	gfp_t gfp = vma_thp_gfp_mask(vma);
>> +	struct mmu_notifier_range range;
>> +	struct folio *folio = NULL;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>> +			      vmf->address);
>> +	if (ret)
>> +		goto out;
>> +
>> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>> +				haddr + HPAGE_PMD_SIZE);
>> +	mmu_notifier_invalidate_range_start(&range);
>> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>> +		goto unlock;
>> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>> +	if (!ret)
>> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>> +unlock:
>> +	spin_unlock(vmf->ptl);
>> +	mmu_notifier_invalidate_range_end(&range);

The folio needs to be released when the pmd_same() check fails and when
check_stable_address_space() returns VM_FAULT_SIGBUS.
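
E.g. something like this could plug the leak (untested sketch; "mapped"
is just an illustrative local):

	bool mapped = false;
	...
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
		goto unlock;
	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
	if (!ret) {
		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
		mapped = true;
	}
unlock:
	spin_unlock(vmf->ptl);
	mmu_notifier_invalidate_range_end(&range);
	if (!mapped)
		folio_put(folio);
out:
	return ret;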

>> +out:
>> +	return ret;
>> +}
>> +
>>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   {
>>   	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>   	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>   
>> -	if (is_huge_zero_pmd(orig_pmd))
>> +	if (is_huge_zero_pmd(orig_pmd)) {
>> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>> +
>> +		if (!(ret & VM_FAULT_FALLBACK))
>> +			return ret;
>> +
>> +		/* Fallback to splitting PMD if THP cannot be allocated */
>>   		goto fallback;
>> +	}
>>   
>>   	spin_lock(vmf->ptl);
>>   
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 3c01d68065be..c081a25f5173 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>>   	if (vma_is_anonymous(vma)) {
>>   		if (likely(!unshare) &&
>>   		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>> -			if (userfaultfd_wp_async(vmf->vma))
>> +			if (!userfaultfd_wp_async(vmf->vma))
>> +				return handle_userfault(vmf, VM_UFFD_WP);
>> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>>   				goto split;
>> -			return handle_userfault(vmf, VM_UFFD_WP);
>>   		}
>>   		return do_huge_pmd_wp_page(vmf);
>>   	}
>> -- 
>> 2.30.2
>>
> 


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-05  9:41     ` Kefeng Wang
@ 2024-09-05  9:53       ` Dev Jain
  0 siblings, 0 replies; 22+ messages in thread
From: Dev Jain @ 2024-09-05  9:53 UTC (permalink / raw)
  To: Kefeng Wang, Kirill A. Shutemov
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual,
	catalin.marinas, cl, vbabka, mhocko, apopple, dave.hansen, will,
	baohua, jack, mark.rutland, hughd, aneesh.kumar, yang, peterx,
	ioworker0, jglisse, linux-arm-kernel, linux-kernel, linux-mm


On 9/5/24 15:11, Kefeng Wang wrote:
>
>
> On 2024/9/5 16:26, Kirill A. Shutemov wrote:
>> On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
>>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage 
>>> and
>>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>>> previous patch to flush TLB entry corresponding to the hugezeropage,
>>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>>> splitting the PMD.
>
> We met the same issue, and have a very similar function in our kernel.
>>>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> ---
>>>   include/linux/huge_mm.h |  6 ++++
>>>   mm/huge_memory.c        | 79 
>>> +++++++++++++++++++++++++++++++++++------
>>>   mm/memory.c             |  5 +--
>>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e25d9ebfdf89..fdd2cf473a3c 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -9,6 +9,12 @@
>>>   #include <linux/kobject.h>
>>>     vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct 
>>> vm_area_struct *vma,
>>> +               unsigned long haddr, struct folio **foliop,
>>> +               unsigned long addr);
>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>> +         pgtable_t pgtable);
>>
>> Why? I don't see users outside huge_memory.c.
>>
>>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>             pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>>             struct vm_area_struct *dst_vma, struct vm_area_struct 
>>> *src_vma);
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 58125fbcc532..150163ad77d3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file 
>>> *filp, unsigned long addr,
>>>   }
>>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>   -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct 
>>> vm_area_struct *vma,
>>> -                  unsigned long haddr, struct folio **foliop,
>>> -                  unsigned long addr)
>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct 
>>> vm_area_struct *vma,
>>> +               unsigned long haddr, struct folio **foliop,
>>> +               unsigned long addr)
>>>   {
>>>       struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, 
>>> true);
>>>   @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct 
>>> vm_area_struct *vma, int order)
>>>       count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>   }
>>>   -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> -            struct vm_area_struct *vma, unsigned long haddr,
>>> -            pgtable_t pgtable)
>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>> +         pgtable_t pgtable)
>>>   {
>>> -    pmd_t entry;
>>> +    pmd_t entry, old_pmd;
>>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>>         entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>       folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>       folio_add_lru_vma(folio, vma);
>>> -    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>> +    if (!is_pmd_none) {
>>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>>> +        if (pmd_uffd_wp(old_pmd))
>>> +            entry = pmd_mkuffd_wp(entry);
>>> +    }
>>> +    if (pgtable)
>>> +        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>       update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>> -    mm_inc_nr_ptes(vma->vm_mm);
>>> +    if (is_pmd_none)
>>> +        mm_inc_nr_ptes(vma->vm_mm);
>>>   }
>>>     static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault 
>>> *vmf)
>>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>       spin_unlock(vmf->ptl);
>>>   }
>>>   +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>>> +                         unsigned long haddr,
>>> +                         struct folio *folio)
>>
>> Why is the helper needed? Can't it just be open-coded in
>> do_huge_zero_wp_pmd()?
>>
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    vm_fault_t ret = 0;
>>> +
>>> +    ret = check_stable_address_space(vma->vm_mm);
>>> +    if (ret)
>>> +        goto out;
>>> +    map_pmd_thp(folio, vmf, vma, haddr, NULL);
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, 
>>> unsigned long haddr)
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>> +    struct mmu_notifier_range range;
>>> +    struct folio *folio = NULL;
>>> +    vm_fault_t ret = 0;
>>> +
>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>> +                  vmf->address);
>>> +    if (ret)
>>> +        goto out;
>>> +
>>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, 
>>> vma->vm_mm, haddr,
>>> +                haddr + HPAGE_PMD_SIZE);
>>> +    mmu_notifier_invalidate_range_start(&range);
>>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>>> +        goto unlock;
>>> +    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>>> +    if (!ret)
>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>> +unlock:
>>> +    spin_unlock(vmf->ptl);
>>> +    mmu_notifier_invalidate_range_end(&range);
>
> The folio needs to be released when !pmd_same() and when
> check_stable_address_space() returns VM_FAULT_SIGBUS.

Yes, thanks.
>
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>>
>>


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] mm: Abstract THP allocation
  2024-09-04 10:09 ` [PATCH v2 1/2] mm: Abstract THP allocation Dev Jain
  2024-09-05  8:20   ` Kirill A. Shutemov
@ 2024-09-05 11:08   ` Ryan Roberts
  2024-09-06  5:42     ` Dev Jain
  1 sibling, 1 reply; 22+ messages in thread
From: Ryan Roberts @ 2024-09-05 11:08 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm

On 04/09/2024 11:09, Dev Jain wrote:
> In preparation for the second patch, abstract away the THP allocation
> logic present in the create_huge_pmd() path, which corresponds to the
> faulting case when no page is present.
> 
> There should be no functional change as a result of applying
> this patch.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
>  1 file changed, 67 insertions(+), 43 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 67c86a5d64a6..58125fbcc532 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -943,47 +943,89 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>  }
>  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>  
> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> -			struct page *page, gfp_t gfp)
> +static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,

Is there a reason for specifying order as a parameter? Previously it was
hardcoded to HPAGE_PMD_ORDER. But now, thp_fault_alloc() and
__thp_fault_success_stats() both take order and map_pmd_thp() is still
implicitly mapping a PMD-sized block. Unless there is a reason you need this
parameter in the next patch (I don't think there is?) I suggest simplifying.

> +				  unsigned long haddr, struct folio **foliop,

FWIW, I agree with Kirill's suggestion to just return folio* and drop the output
param.

Thanks,
Ryan

> +				  unsigned long addr)
>  {
> -	struct vm_area_struct *vma = vmf->vma;
> -	struct folio *folio = page_folio(page);
> -	pgtable_t pgtable;
> -	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -	vm_fault_t ret = 0;
> +	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>  
> -	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> +	*foliop = folio;
> +	if (unlikely(!folio)) {
> +		count_vm_event(THP_FAULT_FALLBACK);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		return VM_FAULT_FALLBACK;
> +	}
>  
> +	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>  	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>  		folio_put(folio);
>  		count_vm_event(THP_FAULT_FALLBACK);
>  		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>  		return VM_FAULT_FALLBACK;
>  	}
>  	folio_throttle_swaprate(folio, gfp);
>  
> -	pgtable = pte_alloc_one(vma->vm_mm);
> -	if (unlikely(!pgtable)) {
> -		ret = VM_FAULT_OOM;
> -		goto release;
> -	}
> -
> -	folio_zero_user(folio, vmf->address);
> +	folio_zero_user(folio, addr);
>  	/*
>  	 * The memory barrier inside __folio_mark_uptodate makes sure that
>  	 * folio_zero_user writes become visible before the set_pmd_at()
>  	 * write.
>  	 */
>  	__folio_mark_uptodate(folio);
> +	return 0;
> +}
> +
> +static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
> +{
> +	count_vm_event(THP_FAULT_ALLOC);
> +	count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);
> +	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
> +}
> +
> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +			struct vm_area_struct *vma, unsigned long haddr,
> +			pgtable_t pgtable)
> +{
> +	pmd_t entry;
> +
> +	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
> +	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> +	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
> +	folio_add_lru_vma(folio, vma);
> +	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> +	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> +	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> +	mm_inc_nr_ptes(vma->vm_mm);
> +}
> +
> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct folio *folio = NULL;
> +	pgtable_t pgtable;
> +	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> +	vm_fault_t ret = 0;
> +	gfp_t gfp = vma_thp_gfp_mask(vma);
> +
> +	pgtable = pte_alloc_one(vma->vm_mm);
> +	if (unlikely(!pgtable)) {
> +		ret = VM_FAULT_OOM;
> +		goto release;
> +	}
> +
> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
> +			      vmf->address);
> +	if (ret)
> +		goto release;
>  
>  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> +
>  	if (unlikely(!pmd_none(*vmf->pmd))) {
>  		goto unlock_release;
>  	} else {
> -		pmd_t entry;
> -
>  		ret = check_stable_address_space(vma->vm_mm);
>  		if (ret)
>  			goto unlock_release;
> @@ -997,20 +1039,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>  			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
>  			return ret;
>  		}
> -
> -		entry = mk_huge_pmd(page, vma->vm_page_prot);
> -		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
> -		folio_add_lru_vma(folio, vma);
> -		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> -		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> -		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -		mm_inc_nr_ptes(vma->vm_mm);
> +		map_pmd_thp(folio, vmf, vma, haddr, pgtable);
>  		spin_unlock(vmf->ptl);
> -		count_vm_event(THP_FAULT_ALLOC);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
> -		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>  	}
>  
>  	return 0;
> @@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>  release:
>  	if (pgtable)
>  		pte_free(vma->vm_mm, pgtable);
> -	folio_put(folio);
> +	if (folio)
> +		folio_put(folio);
>  	return ret;
>  
>  }
> @@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
>  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> -	gfp_t gfp;
> -	struct folio *folio;
>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>  	vm_fault_t ret;
>  
> @@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>  		}
>  		return ret;
>  	}
> -	gfp = vma_thp_gfp_mask(vma);
> -	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
> -	if (unlikely(!folio)) {
> -		count_vm_event(THP_FAULT_FALLBACK);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> -		return VM_FAULT_FALLBACK;
> -	}
> -	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
> +
> +	return __do_huge_pmd_anonymous_page(vmf);
>  }
>  
>  static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-04 10:09 ` [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault Dev Jain
  2024-09-05  8:26   ` Kirill A. Shutemov
@ 2024-09-05 13:14   ` Ryan Roberts
  2024-09-06  7:05     ` Dev Jain
  1 sibling, 1 reply; 22+ messages in thread
From: Ryan Roberts @ 2024-09-05 13:14 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm

On 04/09/2024 11:09, Dev Jain wrote:
> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
> replace it with a PMD-mapped THP. Change the helpers introduced in the
> previous patch to flush TLB entry corresponding to the hugezeropage,
> and preserve PMD uffd-wp marker. In case of failure, fallback to
> splitting the PMD.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  include/linux/huge_mm.h |  6 ++++
>  mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>  mm/memory.c             |  5 +--
>  3 files changed, 78 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e25d9ebfdf89..fdd2cf473a3c 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -9,6 +9,12 @@
>  #include <linux/kobject.h>
>  
>  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +			   unsigned long haddr, struct folio **foliop,
> +			   unsigned long addr);
> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +		 struct vm_area_struct *vma, unsigned long haddr,
> +		 pgtable_t pgtable);

I don't think you are using either of these outside of huge_memory.c, so not
sure you need to declare them here or make them non-static?

>  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>  		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 58125fbcc532..150163ad77d3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>  }
>  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>  
> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> -				  unsigned long haddr, struct folio **foliop,
> -				  unsigned long addr)
> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +			   unsigned long haddr, struct folio **foliop,
> +			   unsigned long addr)
>  {
>  	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>  
> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>  	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>  }
>  
> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> -			struct vm_area_struct *vma, unsigned long haddr,
> -			pgtable_t pgtable)
> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +		 struct vm_area_struct *vma, unsigned long haddr,
> +		 pgtable_t pgtable)
>  {
> -	pmd_t entry;
> +	pmd_t entry, old_pmd;
> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>  
>  	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>  	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>  	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>  	folio_add_lru_vma(folio, vma);
> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> +	if (!is_pmd_none) {
> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
> +		if (pmd_uffd_wp(old_pmd))
> +			entry = pmd_mkuffd_wp(entry);

I don't really get this; entry is writable, so I wouldn't expect to also be
setting uffd-wp here? That combination is not allowed and is checked for in
page_table_check_pte_flags().

It looks like you expect to get here in the uffd-wp-async case, which used to
cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
would cause the uffd-wp bit to be set in each of the new ptes, then during
fallback to handling the wp fault on the pte, uffd would handle it?
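
If memory serves, the split path (__split_huge_pmd_locked()) propagates the bit
roughly like this, paraphrased rather than verbatim kernel code:

	uffd_wp = pmd_uffd_wp(old_pmd);
	...
	entry = mk_pte(page + i, vma->vm_page_prot);
	if (uffd_wp)
		entry = pte_mkuffd_wp(entry);

so each new pte would carry the marker and the per-pte wp fault path would then
hand it to uffd.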

> +	}
> +	if (pgtable)
> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);

Should this call be moved outside of here? It doesn't really feel like it
belongs. Could it be called before calling map_pmd_thp() for the site that has a
pgtable?
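
Something like this at the call site, i.e. a rough, untested sketch (with
map_pmd_thp() then dropping the pgtable parameter):

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	...
	/* the caller that allocated a pgtable deposits it itself, under the PTL */
	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
	map_pmd_thp(folio, vmf, vma, haddr);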

>  	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>  	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>  	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -	mm_inc_nr_ptes(vma->vm_mm);
> +	if (is_pmd_none)
> +		mm_inc_nr_ptes(vma->vm_mm);
>  }
>  
>  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>  	spin_unlock(vmf->ptl);
>  }
>  
> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
> +					     unsigned long haddr,
> +					     struct folio *folio)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	vm_fault_t ret = 0;
> +
> +	ret = check_stable_address_space(vma->vm_mm);
> +	if (ret)
> +		goto out;
> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
> +out:
> +	return ret;
> +}
> +
> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	gfp_t gfp = vma_thp_gfp_mask(vma);
> +	struct mmu_notifier_range range;
> +	struct folio *folio = NULL;
> +	vm_fault_t ret = 0;
> +
> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
> +			      vmf->address);

Just checking: the PTE table was already allocated during the read fault, right?
So we don't have to allocate it here.

> +	if (ret)
> +		goto out;
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
> +				haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
> +		goto unlock;
> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
> +	if (!ret)
> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
> +unlock:
> +	spin_unlock(vmf->ptl);
> +	mmu_notifier_invalidate_range_end(&range);

I'll confess I don't understand all the mmu notifier rules. But the doc at
Documentation/mm/mmu_notifier.rst implies that the notification must be done
while holding the PTL, although that's not how wp_page_copy() does it. Are you
confident that what you have done is correct?

Thanks,
Ryan

> +out:
> +	return ret;
> +}
> +
>  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>  {
>  	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>  	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>  
> -	if (is_huge_zero_pmd(orig_pmd))
> +	if (is_huge_zero_pmd(orig_pmd)) {
> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
> +
> +		if (!(ret & VM_FAULT_FALLBACK))
> +			return ret;
> +
> +		/* Fallback to splitting PMD if THP cannot be allocated */
>  		goto fallback;
> +	}
>  
>  	spin_lock(vmf->ptl);
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 3c01d68065be..c081a25f5173 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>  	if (vma_is_anonymous(vma)) {
>  		if (likely(!unshare) &&
>  		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
> -			if (userfaultfd_wp_async(vmf->vma))
> +			if (!userfaultfd_wp_async(vmf->vma))
> +				return handle_userfault(vmf, VM_UFFD_WP);
> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>  				goto split;
> -			return handle_userfault(vmf, VM_UFFD_WP);
>  		}
>  		return do_huge_pmd_wp_page(vmf);
>  	}



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] mm: Abstract THP allocation
  2024-09-05 11:08   ` Ryan Roberts
@ 2024-09-06  5:42     ` Dev Jain
  2024-09-06  8:28       ` Ryan Roberts
  0 siblings, 1 reply; 22+ messages in thread
From: Dev Jain @ 2024-09-06  5:42 UTC (permalink / raw)
  To: Ryan Roberts, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm


On 9/5/24 16:38, Ryan Roberts wrote:
> On 04/09/2024 11:09, Dev Jain wrote:
>> In preparation for the second patch, abstract away the THP allocation
>> logic present in the create_huge_pmd() path, which corresponds to the
>> faulting case when no page is present.
>>
>> There should be no functional change as a result of applying
>> this patch.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
>>   1 file changed, 67 insertions(+), 43 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 67c86a5d64a6..58125fbcc532 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -943,47 +943,89 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   
>> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>> -			struct page *page, gfp_t gfp)
>> +static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> Is there a reason for specifying order as a parameter? Previously it was
> hardcoded to HPAGE_PMD_ORDER. But now, thp_fault_alloc() and
> __thp_fault_success_stats() both take order and map_pmd_thp() is still
> implicitly mapping a PMD-sized block. Unless there is a reason you need this
> parameter in the next patch (I don't think there is?) I suggest simplifying.

If I am not wrong, thp_fault_alloc() and __thp_fault_success_stats()
will remain the same in case of mTHP? I chose to pass order so that these
functions can be used by others in the future. Therefore, these two functions
can be generically used in the future while map_pmd_thp() (as the name suggests)
maps only a PMD-mappable THP.

>
>> +				  unsigned long haddr, struct folio **foliop,
> FWIW, I agree with Kirill's suggestion to just return folio* and drop the output
> param.

As I replied to Kirill, I do not think that is a good idea. If you do a git grep on
the tree for "foliop", you will find several places where that is being used, for
example, check out alloc_charge_folio() in mm/khugepaged.c. The author intends to
do the stat computation and setting *foliop in the function itself, so that the
caller is only concerned with the return value.
Also, if we return the folio instead of VM_FAULT_FALLBACK from thp_fault_alloc(),
then you will have two "if (unlikely(!folio))" branches, the first to do the stat
computation in the function, and the second branch in the caller to set ret = VM_FAULT_FALLBACK
and then goto out/release.
And, it is already inconsistent to break away the stat computation and the return value
setting, when the stat computation name is of the form "count_vm_event(THP_FAULT_FALLBACK)"
and friends, at which point it would just be better to open code the function.

If I am missing something and your suggestion can be implemented neatly, please guide.

>
> Thanks,
> Ryan
>
>> +				  unsigned long addr)
>>   {
>> -	struct vm_area_struct *vma = vmf->vma;
>> -	struct folio *folio = page_folio(page);
>> -	pgtable_t pgtable;
>> -	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> -	vm_fault_t ret = 0;
>> +	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>   
>> -	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>> +	*foliop = folio;
>> +	if (unlikely(!folio)) {
>> +		count_vm_event(THP_FAULT_FALLBACK);
>> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>> +		return VM_FAULT_FALLBACK;
>> +	}
>>   
>> +	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>   	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>   		folio_put(folio);
>>   		count_vm_event(THP_FAULT_FALLBACK);
>>   		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>   		return VM_FAULT_FALLBACK;
>>   	}
>>   	folio_throttle_swaprate(folio, gfp);
>>   
>> -	pgtable = pte_alloc_one(vma->vm_mm);
>> -	if (unlikely(!pgtable)) {
>> -		ret = VM_FAULT_OOM;
>> -		goto release;
>> -	}
>> -
>> -	folio_zero_user(folio, vmf->address);
>> +	folio_zero_user(folio, addr);
>>   	/*
>>   	 * The memory barrier inside __folio_mark_uptodate makes sure that
>>   	 * folio_zero_user writes become visible before the set_pmd_at()
>>   	 * write.
>>   	 */
>>   	__folio_mark_uptodate(folio);
>> +	return 0;
>> +}
>> +
>> +static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>> +{
>> +	count_vm_event(THP_FAULT_ALLOC);
>> +	count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);
>> +	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>> +}
>> +
>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +			struct vm_area_struct *vma, unsigned long haddr,
>> +			pgtable_t pgtable)
>> +{
>> +	pmd_t entry;
>> +
>> +	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>> +	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>> +	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>> +	folio_add_lru_vma(folio, vma);
>> +	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> +	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>> +	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> +	mm_inc_nr_ptes(vma->vm_mm);
>> +}
>> +
>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	struct folio *folio = NULL;
>> +	pgtable_t pgtable;
>> +	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> +	vm_fault_t ret = 0;
>> +	gfp_t gfp = vma_thp_gfp_mask(vma);
>> +
>> +	pgtable = pte_alloc_one(vma->vm_mm);
>> +	if (unlikely(!pgtable)) {
>> +		ret = VM_FAULT_OOM;
>> +		goto release;
>> +	}
>> +
>> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>> +			      vmf->address);
>> +	if (ret)
>> +		goto release;
>>   
>>   	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>> +
>>   	if (unlikely(!pmd_none(*vmf->pmd))) {
>>   		goto unlock_release;
>>   	} else {
>> -		pmd_t entry;
>> -
>>   		ret = check_stable_address_space(vma->vm_mm);
>>   		if (ret)
>>   			goto unlock_release;
>> @@ -997,20 +1039,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>   			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
>>   			return ret;
>>   		}
>> -
>> -		entry = mk_huge_pmd(page, vma->vm_page_prot);
>> -		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>> -		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>> -		folio_add_lru_vma(folio, vma);
>> -		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> -		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>> -		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>> -		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> -		mm_inc_nr_ptes(vma->vm_mm);
>> +		map_pmd_thp(folio, vmf, vma, haddr, pgtable);
>>   		spin_unlock(vmf->ptl);
>> -		count_vm_event(THP_FAULT_ALLOC);
>> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>> -		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>   	}
>>   
>>   	return 0;
>> @@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>   release:
>>   	if (pgtable)
>>   		pte_free(vma->vm_mm, pgtable);
>> -	folio_put(folio);
>> +	if (folio)
>> +		folio_put(folio);
>>   	return ret;
>>   
>>   }
>> @@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>   {
>>   	struct vm_area_struct *vma = vmf->vma;
>> -	gfp_t gfp;
>> -	struct folio *folio;
>>   	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>   	vm_fault_t ret;
>>   
>> @@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>   		}
>>   		return ret;
>>   	}
>> -	gfp = vma_thp_gfp_mask(vma);
>> -	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
>> -	if (unlikely(!folio)) {
>> -		count_vm_event(THP_FAULT_FALLBACK);
>> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>> -		return VM_FAULT_FALLBACK;
>> -	}
>> -	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
>> +
>> +	return __do_huge_pmd_anonymous_page(vmf);
>>   }
>>   
>>   static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-05 13:14   ` Ryan Roberts
@ 2024-09-06  7:05     ` Dev Jain
  2024-09-06  8:43       ` Ryan Roberts
  0 siblings, 1 reply; 22+ messages in thread
From: Dev Jain @ 2024-09-06  7:05 UTC (permalink / raw)
  To: Ryan Roberts, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm


On 9/5/24 18:44, Ryan Roberts wrote:
> On 04/09/2024 11:09, Dev Jain wrote:
>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>> previous patch to flush TLB entry corresponding to the hugezeropage,
>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>> splitting the PMD.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   include/linux/huge_mm.h |  6 ++++
>>   mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>   mm/memory.c             |  5 +--
>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..fdd2cf473a3c 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -9,6 +9,12 @@
>>   #include <linux/kobject.h>
>>   
>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr);
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable);
> I don't think you are using either of these outside of huge_memory.c, so not
> sure you need to declare them here or make them non-static?

As pointed out by Kirill, you are right, I forgot to drop these from my previous
approach.

>
>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>   		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>   		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 58125fbcc532..150163ad77d3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   
>> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> -				  unsigned long haddr, struct folio **foliop,
>> -				  unsigned long addr)
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr)
>>   {
>>   	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>   
>> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>   	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>   }
>>   
>> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> -			struct vm_area_struct *vma, unsigned long haddr,
>> -			pgtable_t pgtable)
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable)
>>   {
>> -	pmd_t entry;
>> +	pmd_t entry, old_pmd;
>> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>>   
>>   	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>   	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>   	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>   	folio_add_lru_vma(folio, vma);
>> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> +	if (!is_pmd_none) {
>> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>> +		if (pmd_uffd_wp(old_pmd))
>> +			entry = pmd_mkuffd_wp(entry);
> I don't really get this; entry is writable, so I wouldn't expect to also be
> setting uffd-wp here? That combination is not allowed and is checked for in
> page_table_check_pte_flags().
>
> It looks like you expect to get here in the uffd-wp-async case, which used to
> cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
> would cause the uffd-wp bit to be set in each of the new ptes, then during
> fallback to handling the wp fault on the pte, uffd would handle it?

I guess you are correct; I missed the WARN_ON() in page_table_check_pmd_flags(),
but I did see, if I read the uffd code correctly, that mfill_atomic() will just
return in case of pmd_trans_huge(*dst_pmd) while doing a uffd_copy to the destination
location. So preserving pmd_uffd_wp is useless in case a THP is mapped, but I did not
know that in fact it is supposed to be an invalid combination. So, I will drop it,
unless someone has some other objection.

>
>> +	}
>> +	if (pgtable)
>> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> Should this call be moved outside of here? It doesn't really feel like it
> belongs. Could it be called before calling map_pmd_thp() for the site that has a
> pgtable?

Every other place I checked, they are doing this: make the entry -> deposit pgtable ->
set_pmd_at(). I guess the general flow is to do the deposit based on the old pmd, before
setting the new one. Which implies: I should at least move this check before I call
pmdp_huge_clear_flush(). And, since vmf->pmd and the creation of the new entry are unrelated,
I am inclined to do what you are saying.

>
>>   	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>   	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>   	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> -	mm_inc_nr_ptes(vma->vm_mm);
>> +	if (is_pmd_none)
>> +		mm_inc_nr_ptes(vma->vm_mm);
>>   }
>>   
>>   static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>   	spin_unlock(vmf->ptl);
>>   }
>>   
>> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>> +					     unsigned long haddr,
>> +					     struct folio *folio)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = check_stable_address_space(vma->vm_mm);
>> +	if (ret)
>> +		goto out;
>> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
>> +out:
>> +	return ret;
>> +}
>> +
>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	gfp_t gfp = vma_thp_gfp_mask(vma);
>> +	struct mmu_notifier_range range;
>> +	struct folio *folio = NULL;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>> +			      vmf->address);
> Just checking: the PTE table was already allocated during the read fault, right?
> So we don't have to allocate it here.

Correct, that happens in set_huge_zero_folio(). Thanks for checking.

>
>> +	if (ret)
>> +		goto out;
>> +
>> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>> +				haddr + HPAGE_PMD_SIZE);
>> +	mmu_notifier_invalidate_range_start(&range);
>> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>> +		goto unlock;
>> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>> +	if (!ret)
>> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>> +unlock:
>> +	spin_unlock(vmf->ptl);
>> +	mmu_notifier_invalidate_range_end(&range);
> I'll confess I don't understand all the mmu notifier rules.

I confess the same :)

> But the doc at
> Documentation/mm/mmu_notifier.rst implies that the notification must be done
> while holding the PTL, although that's not how wp_page_copy() does it. Are you
> confident that what you have done is correct?

Everywhere else, invalidate_range_end() is called after dropping the lock; one
reason is that it contains a might_sleep(), so we cannot call it while holding
a spinlock. I still don't know exactly what these calls mean... but I think
what I have done is correct.
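
The usual shape elsewhere in mm is roughly (minimal sketch, not exact kernel
code):

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
				haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);	/* may sleep */
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	/* ... modify the page table ... */
	spin_unlock(vmf->ptl);
	mmu_notifier_invalidate_range_end(&range);	/* may sleep, hence after unlock */

which is what the patch does.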

>
> Thanks,
> Ryan
>
>> +out:
>> +	return ret;
>> +}
>> +
>>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   {
>>   	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>   	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>   
>> -	if (is_huge_zero_pmd(orig_pmd))
>> +	if (is_huge_zero_pmd(orig_pmd)) {
>> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>> +
>> +		if (!(ret & VM_FAULT_FALLBACK))
>> +			return ret;
>> +
>> +		/* Fallback to splitting PMD if THP cannot be allocated */
>>   		goto fallback;
>> +	}
>>   
>>   	spin_lock(vmf->ptl);
>>   
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 3c01d68065be..c081a25f5173 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>>   	if (vma_is_anonymous(vma)) {
>>   		if (likely(!unshare) &&
>>   		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>> -			if (userfaultfd_wp_async(vmf->vma))
>> +			if (!userfaultfd_wp_async(vmf->vma))
>> +				return handle_userfault(vmf, VM_UFFD_WP);
>> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>>   				goto split;
>> -			return handle_userfault(vmf, VM_UFFD_WP);
>>   		}
>>   		return do_huge_pmd_wp_page(vmf);
>>   	}


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] mm: Abstract THP allocation
  2024-09-06  5:42     ` Dev Jain
@ 2024-09-06  8:28       ` Ryan Roberts
  2024-09-06  8:45         ` Dev Jain
  0 siblings, 1 reply; 22+ messages in thread
From: Ryan Roberts @ 2024-09-06  8:28 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm

On 06/09/2024 06:42, Dev Jain wrote:
> 
> On 9/5/24 16:38, Ryan Roberts wrote:
>> On 04/09/2024 11:09, Dev Jain wrote:
>>> In preparation for the second patch, abstract away the THP allocation
>>> logic present in the create_huge_pmd() path, which corresponds to the
>>> faulting case when no page is present.
>>>
>>> There should be no functional change as a result of applying
>>> this patch.
>>>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> ---
>>>   mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
>>>   1 file changed, 67 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 67c86a5d64a6..58125fbcc532 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -943,47 +943,89 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>> unsigned long addr,
>>>   }
>>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>   -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>> -            struct page *page, gfp_t gfp)
>>> +static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>> vm_area_struct *vma,
>> Is there a reason for specifying order as a parameter? Previously it was
>> hardcoded to HPAGE_PMD_ORDER. But now, thp_fault_alloc() and
>> __thp_fault_success_stats() both take order and map_pmd_thp() is still
>> implicitly mapping a PMD-sized block. Unless there is a reason you need this
>> parameter in the next patch (I don't think there is?) I suggest simplifying.
> 
> If I am not wrong, thp_fault_alloc() and __thp_fault_success_stats()
> will remain the same in case of mTHP? 

No, it's a bit different for smaller-than-pmd THP - that's handled in
alloc_anon_folio() in memory.c, which deals with fallback to smaller orders
(including order-0). Potentially alloc_anon_folio() could be refactored to use
your new functions in future, but I'm sure there would be a number of
subtleties to consider, and in any case you would want to make that a separate
change. The order param would be added as part of that future change.

> I chose to pass order so that these
> functions can be used by others in the future. 

It's not valuable to have an internal interface with no users. My advice is to
keep it simple and only extend it when a user pops up.

> Therefore, these two functions
> can be generically used in the future while map_pmd_thp() (as the name suggests)
> maps only a PMD-mappable THP.
> 
>>
>>> +                  unsigned long haddr, struct folio **foliop,
>> FWIW, I agree with Kirill's suggestion to just return folio* and drop the output
>> param.
> 
> As I replied to Kirill, I do not think that is a good idea. If you do a git grep on
> the tree for "foliop", you will find several places where that is being used, for
> example, check out alloc_charge_folio() in mm/khugepaged.c. The author intends to
> do the stat computation and setting *foliop in the function itself, so that the
> caller is only concerned with the return value.

By having 2 return params, you open up the possibility of returning an invalid
combination (e.g. ret==FALLBACK, folio!=NULL). You avoid that by having a single
return param, which is either NULL (failure) or a valid pointer.

> Also, if we return the folio instead of VM_FAULT_FALLBACK from thp_fault_alloc(),
> then you will have two "if (unlikely(!folio))" branches, the first to do the stat
> computation in the function, and the second branch in the caller to set ret =
> VM_FAULT_FALLBACK
> and then goto out/release.

There are already 2 conditionals; that doesn't change.

> And, it is already inconsistent to break away the stat computation and the
> return value
> setting, when the stat computation name is of the form
> "count_vm_event(THP_FAULT_FALLBACK)"
> and friends, at which point it would just be better to open code the function.

I don't understand this.

> 
> If I am missing something and your suggestion can be implemented neatly, please
> guide.

This looks cleaner/simpler to my eye (also removing the order parameter):

static struct folio *thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma,
			      unsigned long haddr, unsigned long addr)
{
	const int order = HPAGE_PMD_ORDER;
	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);

	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		goto out;
	}

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		folio = NULL;	/* don't return a pointer we've already dropped */
		goto out;
	}
	folio_throttle_swaprate(folio, gfp);

	folio_zero_user(folio, addr);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
out:
	return folio;
}


static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	vm_fault_t ret = VM_FAULT_FALLBACK;
	...

	folio = thp_fault_alloc(gfp, vma, haddr, vmf->address);
	if (unlikely(!folio))
		goto release;
	...
}

Thanks,
Ryan

> 
>>
>> Thanks,
>> Ryan
>>
>>> +                  unsigned long addr)
>>>   {
>>> -    struct vm_area_struct *vma = vmf->vma;
>>> -    struct folio *folio = page_folio(page);
>>> -    pgtable_t pgtable;
>>> -    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>> -    vm_fault_t ret = 0;
>>> +    struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>   -    VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>> +    *foliop = folio;
>>> +    if (unlikely(!folio)) {
>>> +        count_vm_event(THP_FAULT_FALLBACK);
>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> +        return VM_FAULT_FALLBACK;
>>> +    }
>>>   +    VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>>       if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>>           folio_put(folio);
>>>           count_vm_event(THP_FAULT_FALLBACK);
>>>           count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>           return VM_FAULT_FALLBACK;
>>>       }
>>>       folio_throttle_swaprate(folio, gfp);
>>>   -    pgtable = pte_alloc_one(vma->vm_mm);
>>> -    if (unlikely(!pgtable)) {
>>> -        ret = VM_FAULT_OOM;
>>> -        goto release;
>>> -    }
>>> -
>>> -    folio_zero_user(folio, vmf->address);
>>> +    folio_zero_user(folio, addr);
>>>       /*
>>>        * The memory barrier inside __folio_mark_uptodate makes sure that
>>>        * folio_zero_user writes become visible before the set_pmd_at()
>>>        * write.
>>>        */
>>>       __folio_mark_uptodate(folio);
>>> +    return 0;
>>> +}
>>> +
>>> +static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>> +{
>>> +    count_vm_event(THP_FAULT_ALLOC);
>>> +    count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);
>>> +    count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>> +}
>>> +
>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +            struct vm_area_struct *vma, unsigned long haddr,
>>> +            pgtable_t pgtable)
>>> +{
>>> +    pmd_t entry;
>>> +
>>> +    entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>> +    entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>> +    folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>> +    folio_add_lru_vma(folio, vma);
>>> +    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>> +    set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>> +    update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>> +    add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>> +    mm_inc_nr_ptes(vma->vm_mm);
>>> +}
>>> +
>>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    struct folio *folio = NULL;
>>> +    pgtable_t pgtable;
>>> +    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>> +    vm_fault_t ret = 0;
>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>> +
>>> +    pgtable = pte_alloc_one(vma->vm_mm);
>>> +    if (unlikely(!pgtable)) {
>>> +        ret = VM_FAULT_OOM;
>>> +        goto release;
>>> +    }
>>> +
>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>> +                  vmf->address);
>>> +    if (ret)
>>> +        goto release;
>>>         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>> +
>>>       if (unlikely(!pmd_none(*vmf->pmd))) {
>>>           goto unlock_release;
>>>       } else {
>>> -        pmd_t entry;
>>> -
>>>           ret = check_stable_address_space(vma->vm_mm);
>>>           if (ret)
>>>               goto unlock_release;
>>> @@ -997,20 +1039,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct
>>> vm_fault *vmf,
>>>               VM_BUG_ON(ret & VM_FAULT_FALLBACK);
>>>               return ret;
>>>           }
>>> -
>>> -        entry = mk_huge_pmd(page, vma->vm_page_prot);
>>> -        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>> -        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>> -        folio_add_lru_vma(folio, vma);
>>> -        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>> -        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>> -        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>> -        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>> -        mm_inc_nr_ptes(vma->vm_mm);
>>> +        map_pmd_thp(folio, vmf, vma, haddr, pgtable);
>>>           spin_unlock(vmf->ptl);
>>> -        count_vm_event(THP_FAULT_ALLOC);
>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>>> -        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>>       }
>>>         return 0;
>>> @@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct
>>> vm_fault *vmf,
>>>   release:
>>>       if (pgtable)
>>>           pte_free(vma->vm_mm, pgtable);
>>> -    folio_put(folio);
>>> +    if (folio)
>>> +        folio_put(folio);
>>>       return ret;
>>>     }
>>> @@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable,
>>> struct mm_struct *mm,
>>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>   {
>>>       struct vm_area_struct *vma = vmf->vma;
>>> -    gfp_t gfp;
>>> -    struct folio *folio;
>>>       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>       vm_fault_t ret;
>>>   @@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct
>>> vm_fault *vmf)
>>>           }
>>>           return ret;
>>>       }
>>> -    gfp = vma_thp_gfp_mask(vma);
>>> -    folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
>>> -    if (unlikely(!folio)) {
>>> -        count_vm_event(THP_FAULT_FALLBACK);
>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> -        return VM_FAULT_FALLBACK;
>>> -    }
>>> -    return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
>>> +
>>> +    return __do_huge_pmd_anonymous_page(vmf);
>>>   }
>>>     static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-06  7:05     ` Dev Jain
@ 2024-09-06  8:43       ` Ryan Roberts
  2024-09-06  9:00         ` Dev Jain
  0 siblings, 1 reply; 22+ messages in thread
From: Ryan Roberts @ 2024-09-06  8:43 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm

On 06/09/2024 08:05, Dev Jain wrote:
> 
> On 9/5/24 18:44, Ryan Roberts wrote:
>> On 04/09/2024 11:09, Dev Jain wrote:
>>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>>> previous patch to flush TLB entry corresponding to the hugezeropage,
>>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>>> splitting the PMD.
>>>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> ---
>>>   include/linux/huge_mm.h |  6 ++++
>>>   mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>>   mm/memory.c             |  5 +--
>>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e25d9ebfdf89..fdd2cf473a3c 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -9,6 +9,12 @@
>>>   #include <linux/kobject.h>
>>>     vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>> +               unsigned long haddr, struct folio **foliop,
>>> +               unsigned long addr);
>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>> +         pgtable_t pgtable);
>> I don't think you are using either of these outside of huge_memory.c, so not
>> sure you need to declare them here or make them non-static?
> 
> As pointed out by Kirill, you are right, I forgot to drop these from my previous
> approach.
> 
>>
>>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>             pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>>             struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 58125fbcc532..150163ad77d3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>> unsigned long addr,
>>>   }
>>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>   -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>> vm_area_struct *vma,
>>> -                  unsigned long haddr, struct folio **foliop,
>>> -                  unsigned long addr)
>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>> +               unsigned long haddr, struct folio **foliop,
>>> +               unsigned long addr)
>>>   {
>>>       struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>   @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct
>>> vm_area_struct *vma, int order)
>>>       count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>   }
>>>   -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> -            struct vm_area_struct *vma, unsigned long haddr,
>>> -            pgtable_t pgtable)
>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>> +         pgtable_t pgtable)
>>>   {
>>> -    pmd_t entry;
>>> +    pmd_t entry, old_pmd;
>>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>>         entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>       folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>       folio_add_lru_vma(folio, vma);
>>> -    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>> +    if (!is_pmd_none) {
>>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>>> +        if (pmd_uffd_wp(old_pmd))
>>> +            entry = pmd_mkuffd_wp(entry);
>> I don't really get this; entry is writable, so I wouldn't expect to also be
>> setting uffd-wp here? That combination is not allowed and is checked for in
>> page_table_check_pte_flags().
>>
>> It looks like you expect to get here in the uffd-wp-async case, which used to
>> cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
>> would cause the uffd-wp bit to be set in each of the new ptes, then during
>> fallback to handling the wp fault on the pte, uffd would handle it?
> 
> I guess you are correct; I missed the WARN_ON() in page_table_check_pmd_flags(),
> but I did see, if I read the uffd code correctly, that mfill_atomic() will just
> return in case of pmd_trans_huge(*dst_pmd) while doing a uffd_copy to the
> destination location. So preserving pmd_uffd_wp is useless in case a THP is
> mapped, but I did not know that in fact it is supposed to be an invalid
> combination. So, I will drop it, unless someone has some other objection.

So what's the correct way to handle uffd-wp-async in wp_huge_pmd()? Just split
it? If so, you can revert your changes in memory.c.
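(A minimal sketch of that "just split it" option, assuming the mm/memory.c hunk
further down is simply reverted; the function name is made up for illustration
and only the anonymous-VMA branch is shown:)

/*
 * Illustrative sketch, not the actual patch: the anonymous-VMA branch of
 * wp_huge_pmd() with the uffd-wp-async special case reverted, matching the
 * '-' lines in the mm/memory.c hunk quoted below. The async case always
 * falls back to splitting the PMD, hugezeropage or not; uffd-wp is then
 * handled per-pte after the split.
 */
static vm_fault_t wp_huge_pmd_anon_sketch(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;

	if (likely(!unshare) &&
	    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
		if (userfaultfd_wp_async(vma)) {
			/* Split to ptes and let the pte path handle uffd-wp. */
			__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
			return VM_FAULT_FALLBACK;
		}
		return handle_userfault(vmf, VM_UFFD_WP);
	}
	return do_huge_pmd_wp_page(vmf);
}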

> 
>>
>>> +    }
>>> +    if (pgtable)
>>> +        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> Should this call be moved outside of here? It doesn't really feel like it
>> belongs. Could it be called before calling map_pmd_thp() for the site that has a
>> pgtable?
> 
> Every other place I checked, they are doing this: make the entry -> deposit
> pgtable -> set_pmd_at(). I guess the general flow is to do the deposit based
> on the old pmd, before setting the new one. Which implies: I should at least
> move this check before I call pmdp_huge_clear_flush(). And, since vmf->pmd
> and creating the new entry has no relation, I am inclined to do what you are
> saying.

pgtable_trans_huge_deposit() is just adding the pgtable to a list so that if the
THP needs to be split in future, then we have preallocated the pte pgtable so
the operation can't fail. And enqueuing it is just under the protection of the
PTL as far as I can tell. So I think the only ordering requirement is that you
both set the pmd and deposit the pgtable under the lock (without dropping it in
between). So you can deposit either before or after map_pmd_thp(). And
pmdp_huge_clear_flush() is irrelevant, I think?
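(To make the constraint concrete, a rough sketch of the ordering being
described -- everything inside one PTL critical section, with the deposit on
either side of the pmd update; an illustrative fragment only, not the patch:)

	/*
	 * Sketch: deposit the preallocated pte table and install the new
	 * pmd without dropping the lock in between. Whether the deposit
	 * happens before or after set_pmd_at() does not matter.
	 */
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (pgtable)
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	spin_unlock(vmf->ptl);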

> 
>>
>>>       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>       update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>> -    mm_inc_nr_ptes(vma->vm_mm);
>>> +    if (is_pmd_none)
>>> +        mm_inc_nr_ptes(vma->vm_mm);
>>>   }
>>>     static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>       spin_unlock(vmf->ptl);
>>>   }
>>>   +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>>> +                         unsigned long haddr,
>>> +                         struct folio *folio)
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    vm_fault_t ret = 0;
>>> +
>>> +    ret = check_stable_address_space(vma->vm_mm);
>>> +    if (ret)
>>> +        goto out;
>>> +    map_pmd_thp(folio, vmf, vma, haddr, NULL);
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long
>>> haddr)
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>> +    struct mmu_notifier_range range;
>>> +    struct folio *folio = NULL;
>>> +    vm_fault_t ret = 0;
>>> +
>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>> +                  vmf->address);
>> Just checking: the PTE table was already allocated during the read fault, right?
>> So we don't have to allocate it here.
> 
> Correct, that happens in set_huge_zero_folio(). Thanks for checking.
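(For context, set_huge_zero_folio() is where that pte table gets deposited on
the earlier read fault -- roughly the following, paraphrased from memory rather
than quoted verbatim:)

static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct folio *zero_folio)
{
	pmd_t entry;

	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	/* The pte table allocated by the caller is stashed here... */
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	/* ...so the later wp-fault path does not need pte_alloc_one(). */
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}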
> 
>>
>>> +    if (ret)
>>> +        goto out;
>>> +
>>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>>> +                haddr + HPAGE_PMD_SIZE);
>>> +    mmu_notifier_invalidate_range_start(&range);
>>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>>> +        goto unlock;
>>> +    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>>> +    if (!ret)
>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>> +unlock:
>>> +    spin_unlock(vmf->ptl);
>>> +    mmu_notifier_invalidate_range_end(&range);
>> I'll confess I don't understand all the mmu notifier rules.
> 
> I confess the same :)
> 
>> But the doc at
>> Documentation/mm/mmu_notifier.rst implies that the notification must be done
>> while holding the PTL. Although that's not what wp_page_copy() does. Are you confident
>> what you have done is correct?
> 
> Everywhere else, invalidate_range_end() is getting called after dropping the lock,
> one reason is that it has a might_sleep(), and therefore we cannot call it while
> holding a spinlock. I still don't know what exactly these calls mean...but I think
> what I have done is correct.

High level; they are notifying secondary MMUs (e.g. IOMMU) of a change so the
tables of those secondary MMUs can be kept in sync. I don't understand all the
ordering requirement details though.
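(The pattern under discussion, with the ordering spelled out -- an illustrative
fragment only, mirroring the hunk quoted above: _start() before taking the PTL,
_end() after dropping it, since both notifier calls may sleep:)

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);	/* may sleep: call before taking PTL */

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	/* ... clear/replace the pmd entry here ... */
	spin_unlock(vmf->ptl);

	mmu_notifier_invalidate_range_end(&range);	/* may sleep: call after dropping PTL */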

I think what you have is probably correct, would be good for someone that knows
what they are talking about to confirm though :)

> 
>>
>> Thanks,
>> Ryan
>>
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>   {
>>>       const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>       vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>>       VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>>   -    if (is_huge_zero_pmd(orig_pmd))
>>> +    if (is_huge_zero_pmd(orig_pmd)) {
>>> +        vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>>> +
>>> +        if (!(ret & VM_FAULT_FALLBACK))
>>> +            return ret;
>>> +
>>> +        /* Fallback to splitting PMD if THP cannot be allocated */
>>>           goto fallback;
>>> +    }
>>>         spin_lock(vmf->ptl);
>>>   diff --git a/mm/memory.c b/mm/memory.c
>>> index 3c01d68065be..c081a25f5173 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault
>>> *vmf)
>>>       if (vma_is_anonymous(vma)) {
>>>           if (likely(!unshare) &&
>>>               userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>>> -            if (userfaultfd_wp_async(vmf->vma))
>>> +            if (!userfaultfd_wp_async(vmf->vma))
>>> +                return handle_userfault(vmf, VM_UFFD_WP);
>>> +            if (!is_huge_zero_pmd(vmf->orig_pmd))
>>>                   goto split;
>>> -            return handle_userfault(vmf, VM_UFFD_WP);
>>>           }
>>>           return do_huge_pmd_wp_page(vmf);
>>>       }



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] mm: Abstract THP allocation
  2024-09-06  8:28       ` Ryan Roberts
@ 2024-09-06  8:45         ` Dev Jain
  2024-09-06  9:00           ` Ryan Roberts
  0 siblings, 1 reply; 22+ messages in thread
From: Dev Jain @ 2024-09-06  8:45 UTC (permalink / raw)
  To: Ryan Roberts, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm


On 9/6/24 13:58, Ryan Roberts wrote:
> On 06/09/2024 06:42, Dev Jain wrote:
>> On 9/5/24 16:38, Ryan Roberts wrote:
>>> On 04/09/2024 11:09, Dev Jain wrote:
>>>> In preparation for the second patch, abstract away the THP allocation
>>>> logic present in the create_huge_pmd() path, which corresponds to the
>>>> faulting case when no page is present.
>>>>
>>>> There should be no functional change as a result of applying
>>>> this patch.
>>>>
>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>> ---
>>>>    mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
>>>>    1 file changed, 67 insertions(+), 43 deletions(-)
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 67c86a5d64a6..58125fbcc532 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -943,47 +943,89 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>>> unsigned long addr,
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>>    -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>>> -            struct page *page, gfp_t gfp)
>>>> +static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>>> vm_area_struct *vma,
>>> Is there a reason for specifying order as a parameter? Previously it was
>>> hardcoded to HPAGE_PMD_ORDER. But now, thp_fault_alloc() and
>>> __thp_fault_success_stats() both take order and map_pmd_thp() is still
>>> implicitly mapping a PMD-sized block. Unless there is a reason you need this
>>> parameter in the next patch (I don't think there is?) I suggest simplifying.
>> If I am not wrong, thp_fault_alloc() and __thp_fault_success_stats()
>> will remain the same in case of mTHP?
> No, it's a bit different for smaller-than-pmd THP - that's handled in
> alloc_anon_folio() in memory.c. It deals with fallback to smaller orders (inc
> order-0). Potentially alloc_anon_folio() could be refactored to use your new
> functions in future but I'm sure there would be a number of subtleties to
> consider and in any case you would want to make that a separate change. The
> order param would be added as part of that future change.
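(For the curious, the order-fallback idea being referred to looks roughly like
the sketch below -- illustrative only, not the actual alloc_anon_folio(); the
helper names highest_order()/next_order() are assumed from huge_mm.h and the
function name is made up:)

static struct folio *anon_folio_fallback_sketch(struct vm_fault *vmf,
						unsigned long orders)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp = vma_thp_gfp_mask(vma);
	struct folio *folio;
	unsigned long addr;
	int order;

	/* Try the largest enabled order first, stepping down on failure. */
	order = highest_order(orders);
	while (orders) {
		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		folio = vma_alloc_folio(gfp, order, vma, addr, true);
		if (folio)
			return folio;
		order = next_order(&orders, order);
	}
	return NULL;	/* caller falls back to an order-0 page */
}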

Okay.

>
>> I chose to pass order so that these
>> functions can be used by others in the future.
> It's not valuable to have an internal interface with no users. My advice is to
> keep it simple and only extend it when a user pops up.

Makes sense. In that case, should I call it pmd_thp_fault_alloc() to enforce that
this is not a non-PMD THP?

>
>> Therefore, these two functions
>> can be generically used in the future while map_pmd_thp() (as the name suggests)
>> maps only a PMD-mappable THP.
>>
>>>> +                  unsigned long haddr, struct folio **foliop,
>>> FWIW, I agree with Kirill's suggestion to just return folio* and drop the output
>>> param.
>> As I replied to Kirill, I do not think that is a good idea. If you do a git grep on
>> the tree for "foliop", you will find several places where that is being used, for
>> example, check out alloc_charge_folio() in mm/khugepaged.c. The author intends to
>> do the stat computation and setting *foliop in the function itself, so that the
>> caller is only concerned with the return value.
> By having 2 return params, you open up the possibility of returning an invalid
> combination (e.g. ret==FALLBACK, folio!=NULL). You avoid that by having a single
> return param, which is either NULL (failure) or a valid pointer.

Okay, I agree with this in a generic sense.

>
>> Also, if we return the folio instead of VM_FAULT_FALLBACK from thp_fault_alloc(),
>> then you will have two "if (unlikely(!folio))" branches, the first to do the stat
>> computation in the function, and the second branch in the caller to set ret =
>> VM_FAULT_FALLBACK
>> and then goto out/release.
> There are already 2 conditionals, that doesn't change.
>
>> And, it is already inconsistent to break away the stat computation and the
>> return value
>> setting, when the stat computation name is of the form
>> "count_vm_event(THP_FAULT_FALLBACK)"
>> and friends, at which point it would just be better to open code the function.
> I don't understand this.
>
>> If I am missing something and your suggestion can be implemented neatly, please
>> guide.
> This looks cleaner/simpler to my eye (also removing the order parameter):
>
> static struct folio *thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma,
> 			      unsigned long haddr, unsigned long addr)
> {
> 	const int order = HPAGE_PMD_ORDER;
> 	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>
> 	if (unlikely(!folio)) {
> 		count_vm_event(THP_FAULT_FALLBACK);
> 		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> 		goto out;
> 	}
>
> 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> 	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> 		folio_put(folio);
> 		count_vm_event(THP_FAULT_FALLBACK);
> 		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
> 		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> 		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> 		goto out;
> 	}
> 	folio_throttle_swaprate(folio, gfp);
>
> 	folio_zero_user(folio, addr);
> 	/*
> 	 * The memory barrier inside __folio_mark_uptodate makes sure that
> 	 * folio_zero_user writes become visible before the set_pmd_at()
> 	 * write.
> 	 */
> 	__folio_mark_uptodate(folio);
> out:
> 	return folio;
> }
>
>
> static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> {
> 	vm_fault_t ret = VM_FAULT_FALLBACK;
> 	...
>
> 	folio = thp_fault_alloc(gfp, vma, haddr, vmf->address);
> 	if (unlikely(!folio))
> 		goto release;
> 	...
> }
>

Thanks for your help. I will switch to this; just that ret should still
be initialized to zero in case we spot that the pmd has changed after taking
the lock. So I'll set it in the case of !folio.

>
>>> Thanks,
>>> Ryan
>>>
>>>> +                  unsigned long addr)
>>>>    {
>>>> -    struct vm_area_struct *vma = vmf->vma;
>>>> -    struct folio *folio = page_folio(page);
>>>> -    pgtable_t pgtable;
>>>> -    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>> -    vm_fault_t ret = 0;
>>>> +    struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>>    -    VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>>> +    *foliop = folio;
>>>> +    if (unlikely(!folio)) {
>>>> +        count_vm_event(THP_FAULT_FALLBACK);
>>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>> +        return VM_FAULT_FALLBACK;
>>>> +    }
>>>>    +    VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>>>        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>>>            folio_put(folio);
>>>>            count_vm_event(THP_FAULT_FALLBACK);
>>>>            count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>            return VM_FAULT_FALLBACK;
>>>>        }
>>>>        folio_throttle_swaprate(folio, gfp);
>>>>    -    pgtable = pte_alloc_one(vma->vm_mm);
>>>> -    if (unlikely(!pgtable)) {
>>>> -        ret = VM_FAULT_OOM;
>>>> -        goto release;
>>>> -    }
>>>> -
>>>> -    folio_zero_user(folio, vmf->address);
>>>> +    folio_zero_user(folio, addr);
>>>>        /*
>>>>         * The memory barrier inside __folio_mark_uptodate makes sure that
>>>>         * folio_zero_user writes become visible before the set_pmd_at()
>>>>         * write.
>>>>         */
>>>>        __folio_mark_uptodate(folio);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>>> +{
>>>> +    count_vm_event(THP_FAULT_ALLOC);
>>>> +    count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);
>>>> +    count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>> +}
>>>> +
>>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>> +            struct vm_area_struct *vma, unsigned long haddr,
>>>> +            pgtable_t pgtable)
>>>> +{
>>>> +    pmd_t entry;
>>>> +
>>>> +    entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>> +    entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>> +    folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>> +    folio_add_lru_vma(folio, vma);
>>>> +    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>> +    set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>> +    update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>> +    add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>>> +    mm_inc_nr_ptes(vma->vm_mm);
>>>> +}
>>>> +
>>>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>> +{
>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>> +    struct folio *folio = NULL;
>>>> +    pgtable_t pgtable;
>>>> +    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>> +    vm_fault_t ret = 0;
>>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>>> +
>>>> +    pgtable = pte_alloc_one(vma->vm_mm);
>>>> +    if (unlikely(!pgtable)) {
>>>> +        ret = VM_FAULT_OOM;
>>>> +        goto release;
>>>> +    }
>>>> +
>>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>>> +                  vmf->address);
>>>> +    if (ret)
>>>> +        goto release;
>>>>          vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>> +
>>>>        if (unlikely(!pmd_none(*vmf->pmd))) {
>>>>            goto unlock_release;
>>>>        } else {
>>>> -        pmd_t entry;
>>>> -
>>>>            ret = check_stable_address_space(vma->vm_mm);
>>>>            if (ret)
>>>>                goto unlock_release;
>>>> @@ -997,20 +1039,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct
>>>> vm_fault *vmf,
>>>>                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
>>>>                return ret;
>>>>            }
>>>> -
>>>> -        entry = mk_huge_pmd(page, vma->vm_page_prot);
>>>> -        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>> -        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>> -        folio_add_lru_vma(folio, vma);
>>>> -        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>> -        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>> -        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>> -        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>>> -        mm_inc_nr_ptes(vma->vm_mm);
>>>> +        map_pmd_thp(folio, vmf, vma, haddr, pgtable);
>>>>            spin_unlock(vmf->ptl);
>>>> -        count_vm_event(THP_FAULT_ALLOC);
>>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>>>> -        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>>>        }
>>>>          return 0;
>>>> @@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct
>>>> vm_fault *vmf,
>>>>    release:
>>>>        if (pgtable)
>>>>            pte_free(vma->vm_mm, pgtable);
>>>> -    folio_put(folio);
>>>> +    if (folio)
>>>> +        folio_put(folio);
>>>>        return ret;
>>>>      }
>>>> @@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable,
>>>> struct mm_struct *mm,
>>>>    vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>>    {
>>>>        struct vm_area_struct *vma = vmf->vma;
>>>> -    gfp_t gfp;
>>>> -    struct folio *folio;
>>>>        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>>        vm_fault_t ret;
>>>>    @@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct
>>>> vm_fault *vmf)
>>>>            }
>>>>            return ret;
>>>>        }
>>>> -    gfp = vma_thp_gfp_mask(vma);
>>>> -    folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
>>>> -    if (unlikely(!folio)) {
>>>> -        count_vm_event(THP_FAULT_FALLBACK);
>>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>> -        return VM_FAULT_FALLBACK;
>>>> -    }
>>>> -    return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
>>>> +
>>>> +    return __do_huge_pmd_anonymous_page(vmf);
>>>>    }
>>>>      static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-06  8:43       ` Ryan Roberts
@ 2024-09-06  9:00         ` Dev Jain
  2024-09-06  9:31           ` Ryan Roberts
  0 siblings, 1 reply; 22+ messages in thread
From: Dev Jain @ 2024-09-06  9:00 UTC (permalink / raw)
  To: Ryan Roberts, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm


On 9/6/24 14:13, Ryan Roberts wrote:
> On 06/09/2024 08:05, Dev Jain wrote:
>> On 9/5/24 18:44, Ryan Roberts wrote:
>>> On 04/09/2024 11:09, Dev Jain wrote:
>>>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>>>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>>>> previous patch to flush TLB entry corresponding to the hugezeropage,
>>>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>>>> splitting the PMD.
>>>>
>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>> ---
>>>>    include/linux/huge_mm.h |  6 ++++
>>>>    mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>>>    mm/memory.c             |  5 +--
>>>>    3 files changed, 78 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index e25d9ebfdf89..fdd2cf473a3c 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -9,6 +9,12 @@
>>>>    #include <linux/kobject.h>
>>>>      vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>>> +               unsigned long haddr, struct folio **foliop,
>>>> +               unsigned long addr);
>>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>>> +         pgtable_t pgtable);
>>> I don't think you are using either of these outside of huge_memory.c, so not
>>> sure you need to declare them here or make them non-static?
>> As pointed out by Kirill, you are right, I forgot to drop these from my previous
>> approach.
>>
>>>>    int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>>              pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>>>              struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 58125fbcc532..150163ad77d3 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>>> unsigned long addr,
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>>    -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>>> vm_area_struct *vma,
>>>> -                  unsigned long haddr, struct folio **foliop,
>>>> -                  unsigned long addr)
>>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>>> +               unsigned long haddr, struct folio **foliop,
>>>> +               unsigned long addr)
>>>>    {
>>>>        struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>>    @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct
>>>> vm_area_struct *vma, int order)
>>>>        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>>    }
>>>>    -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>> -            struct vm_area_struct *vma, unsigned long haddr,
>>>> -            pgtable_t pgtable)
>>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>>> +         pgtable_t pgtable)
>>>>    {
>>>> -    pmd_t entry;
>>>> +    pmd_t entry, old_pmd;
>>>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>>>          entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>>        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>>        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>>        folio_add_lru_vma(folio, vma);
>>>> -    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>> +    if (!is_pmd_none) {
>>>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>>>> +        if (pmd_uffd_wp(old_pmd))
>>>> +            entry = pmd_mkuffd_wp(entry);
>>> I don't really get this; entry is writable, so I wouldn't expect to also be
>>> setting uffd-wp here? That combination is not allowed and is checked for in
>>> page_table_check_pte_flags().
>>>
>>> It looks like you expect to get here in the uffd-wp-async case, which used to
>>> cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
>>> would cause the uffd-wp bit to be set in each of the new ptes, then during
>>> fallback to handling the wp fault on the pte, uffd would handle it?
>> I guess you are correct; I missed the WARN_ON() in page_table_check_pmd_flags(),
>> but I did see, if I read the uffd code correctly, that mfill_atomic() will just
>> return in case of pmd_trans_huge(*dst_pmd) while doing a uffd_copy to the
>> destination
>> location. So preserving pmd_uffd_wp is useless in case a THP is mapped, but I
>> did not
>> know that in fact it is supposed to be an invalid combination. So, I will drop it,
>> unless someone has some other objection.
> So what's the correct way to handle uffd-wp-async in wp_huge_pmd()? Just split
> it? If so, you can revert your changes in memory.c.

I think so.

>
>>>> +    }
>>>> +    if (pgtable)
>>>> +        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>> Should this call be moved outside of here? It doesn't really feel like it
>>> belongs. Could it be called before calling map_pmd_thp() for the site that has a
>>> pgtable?
>> Every other place I checked, they are doing this: make the entry -> deposit
>> pgtable ->
>> set_pmd_at(). I guess the general flow is to do the deposit based on the old
>> pmd, before
>> setting the new one. Which implies: I should at least move this check before I call
>> pmdp_huge_clear_flush(). And, since vmf->pmd and creating the new entry has no
>> relation,
>> I am inclined to do what you are saying.
> pgtable_trans_huge_deposit() is just adding the pgtable to a list so that if the
> THP needs to be split in future, then we have preallocated the pte pgtable so
> the operation can't fail.

Yes.

> And enqueuing it is just under the protection of the
> PTL as far as I can tell. So I think the only ordering requirement is that you
> both set the pmd and deposit the pgtable under the lock (without dropping it in
> between). So you can deposit either before or after map_pmd_thp().

Yes I'll do that before.

> And
> pmdp_huge_clear_flush() is irrelevant, I think?

You mean, in this context? Everywhere, the pgtable deposit takes as its input
the pmd slot whose old value is about to be replaced, that is, it is called
before set_pmd_at(). So calling the pgtable deposit after clear_flush() would
violate this ordering. I do not think this ordering is really required, but
I'd rather be safe :)

>
>>>>        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>>        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>>        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>>> -    mm_inc_nr_ptes(vma->vm_mm);
>>>> +    if (is_pmd_none)
>>>> +        mm_inc_nr_ptes(vma->vm_mm);
>>>>    }
>>>>      static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>>        spin_unlock(vmf->ptl);
>>>>    }
>>>>    +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>>>> +                         unsigned long haddr,
>>>> +                         struct folio *folio)
>>>> +{
>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>> +    vm_fault_t ret = 0;
>>>> +
>>>> +    ret = check_stable_address_space(vma->vm_mm);
>>>> +    if (ret)
>>>> +        goto out;
>>>> +    map_pmd_thp(folio, vmf, vma, haddr, NULL);
>>>> +out:
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long
>>>> haddr)
>>>> +{
>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>>> +    struct mmu_notifier_range range;
>>>> +    struct folio *folio = NULL;
>>>> +    vm_fault_t ret = 0;
>>>> +
>>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>>> +                  vmf->address);
>>> Just checking: the PTE table was already allocated during the read fault, right?
>>> So we don't have to allocate it here.
>> Correct, that happens in set_huge_zero_folio(). Thanks for checking.
>>
>>>> +    if (ret)
>>>> +        goto out;
>>>> +
>>>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>>>> +                haddr + HPAGE_PMD_SIZE);
>>>> +    mmu_notifier_invalidate_range_start(&range);
>>>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>>>> +        goto unlock;
>>>> +    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>>>> +    if (!ret)
>>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>>> +unlock:
>>>> +    spin_unlock(vmf->ptl);
>>>> +    mmu_notifier_invalidate_range_end(&range);
>>> I'll confess I don't understand all the mmu notifier rules.
>> I confess the same :)
>>
>>> But the doc at
>>> Documentation/mm/mmu_notifier.rst implies that the notification must be done
>>> while holding the PTL. Although that's not what wp_page_copy() does. Are you confident
>>> what you have done is correct?
>> Everywhere else, invalidate_range_end() is getting called after dropping the lock,
>> one reason is that it has a might_sleep(), and therefore we cannot call it while
>> holding a spinlock. I still don't know what exactly these calls mean...but I think
>> what I have done is correct.
> High level; they are notifying secondary MMUs (e.g. IOMMU) of a change so the
> tables of those secondary MMUs can be kept in sync. I don't understand all the
> ordering requirement details though.
>
> I think what you have is probably correct, would be good for someone that knows
> what they are talking about to confirm though :)

Exactly.

>
>>> Thanks,
>>> Ryan
>>>
>>>> +out:
>>>> +    return ret;
>>>> +}
>>>> +
>>>>    vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>>    {
>>>>        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>>>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>>        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>>>        VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>>>    -    if (is_huge_zero_pmd(orig_pmd))
>>>> +    if (is_huge_zero_pmd(orig_pmd)) {
>>>> +        vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>>>> +
>>>> +        if (!(ret & VM_FAULT_FALLBACK))
>>>> +            return ret;
>>>> +
>>>> +        /* Fallback to splitting PMD if THP cannot be allocated */
>>>>            goto fallback;
>>>> +    }
>>>>          spin_lock(vmf->ptl);
>>>>    diff --git a/mm/memory.c b/mm/memory.c
>>>> index 3c01d68065be..c081a25f5173 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault
>>>> *vmf)
>>>>        if (vma_is_anonymous(vma)) {
>>>>            if (likely(!unshare) &&
>>>>                userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>>>> -            if (userfaultfd_wp_async(vmf->vma))
>>>> +            if (!userfaultfd_wp_async(vmf->vma))
>>>> +                return handle_userfault(vmf, VM_UFFD_WP);
>>>> +            if (!is_huge_zero_pmd(vmf->orig_pmd))
>>>>                    goto split;
>>>> -            return handle_userfault(vmf, VM_UFFD_WP);
>>>>            }
>>>>            return do_huge_pmd_wp_page(vmf);
>>>>        }


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] mm: Abstract THP allocation
  2024-09-06  8:45         ` Dev Jain
@ 2024-09-06  9:00           ` Ryan Roberts
  0 siblings, 0 replies; 22+ messages in thread
From: Ryan Roberts @ 2024-09-06  9:00 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm

On 06/09/2024 09:45, Dev Jain wrote:
> 
> On 9/6/24 13:58, Ryan Roberts wrote:
>> On 06/09/2024 06:42, Dev Jain wrote:
>>> On 9/5/24 16:38, Ryan Roberts wrote:
>>>> On 04/09/2024 11:09, Dev Jain wrote:
>>>>> In preparation for the second patch, abstract away the THP allocation
>>>>> logic present in the create_huge_pmd() path, which corresponds to the
>>>>> faulting case when no page is present.
>>>>>
>>>>> There should be no functional change as a result of applying
>>>>> this patch.
>>>>>
>>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>>> ---
>>>>>    mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
>>>>>    1 file changed, 67 insertions(+), 43 deletions(-)
>>>>>
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 67c86a5d64a6..58125fbcc532 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -943,47 +943,89 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>>>> unsigned long addr,
>>>>>    }
>>>>>    EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>>>    -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>>>> -            struct page *page, gfp_t gfp)
>>>>> +static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>>>> vm_area_struct *vma,
>>>> Is there a reason for specifying order as a parameter? Previously it was
>>>> hardcoded to HPAGE_PMD_ORDER. But now, thp_fault_alloc() and
>>>> __thp_fault_success_stats() both take order and map_pmd_thp() is still
>>>> implicitly mapping a PMD-sized block. Unless there is a reason you need this
>>>> parameter in the next patch (I don't think there is?) I suggest simplifying.
>>> If I am not wrong, thp_fault_alloc() and __thp_fault_success_stats()
>>> will remain the same in case of mTHP?
>> No, it's a bit different for smaller-than-pmd THP - that's handled in
>> alloc_anon_folio() in memory.c. It deals with fallback to smaller orders (inc
>> order-0). Potentially alloc_anon_folio() could be refactored to use your new
>> functions in future but I'm sure there would be a number of subtleties to
>> consider and in any case you would want to make that a separate change. The
>> order param would be added as part of that future change.
> 
> Okay.
> 
>>
>>> I chose to pass order so that these
>>> functions can be used by others in the future.
>> It's not valuable to have an internal interface with no users. My advice is to
>> keep it simple and only extend it when a user pops up.
> 
> Makes sense. In that case, should I call it pmd_thp_fault_alloc() to enforce that
> this is not a non-PMD THP?

Yeah, that works for me.

> 
>>
>>> Therefore, these two functions
>>> can be generically used in the future while map_pmd_thp() (as the name suggests)
>>> maps only a PMD-mappable THP.
>>>
>>>>> +                  unsigned long haddr, struct folio **foliop,
>>>> FWIW, I agree with Kirill's suggestion to just return folio* and drop the
>>>> output
>>>> param.
>>> As I replied to Kirill, I do not think that is a good idea. If you do a git
>>> grep on
>>> the tree for "foliop", you will find several places where that is being used,
>>> for
>>> example, check out alloc_charge_folio() in mm/khugepaged.c. The author
>>> intends to
>>> do the stat computation and setting *foliop in the function itself, so that the
>>> caller is only concerned with the return value.
>> By having 2 return params, you open up the possibility of returning an invalid
>> combination (e.g. ret==FALLBACK, folio!=NULL). You avoid that by having a single
>> return param, which is either NULL (failure) or a valid pointer.
> 
> Okay, I agree with this in a generic sense.
> 
>>
>>> Also, if we return the folio instead of VM_FAULT_FALLBACK from
>>> thp_fault_alloc(),
>>> then you will have two "if (unlikely(!folio))" branches, the first to do the
>>> stat
>>> computation in the function, and the second branch in the caller to set ret =
>>> VM_FAULT_FALLBACK
>>> and then goto out/release.
>> There are already 2 conditionals, that doesn't change.
>>
>>> And, it is already inconsistent to break away the stat computation and the
>>> return value
>>> setting, when the stat computation name is of the form
>>> "count_vm_event(THP_FAULT_FALLBACK)"
>>> and friends, at which point it would just be better to open code the function.
>> I don't understand this.
>>
>>> If I am missing something and your suggestion can be implemented neatly, please
>>> guide.
>> This looks cleaner/simpler to my eye (also removing the order parameter):
>>
>> static struct folio *thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma,
>>                   unsigned long haddr, unsigned long addr)
>> {
>>     const int order = HPAGE_PMD_ORDER;
>>     struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>
>>     if (unlikely(!folio)) {
>>         count_vm_event(THP_FAULT_FALLBACK);
>>         count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>         goto out;
>>     }
>>
>>     VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>     if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>         folio_put(folio);
>>         count_vm_event(THP_FAULT_FALLBACK);
>>         count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>>         count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>         count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>         goto out;
>>     }
>>     folio_throttle_swaprate(folio, gfp);
>>
>>     folio_zero_user(folio, addr);
>>     /*
>>      * The memory barrier inside __folio_mark_uptodate makes sure that
>>      * folio_zero_user writes become visible before the set_pmd_at()
>>      * write.
>>      */
>>     __folio_mark_uptodate(folio);
>> out:
>>     return folio;
>> }
>>
>>
>> static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> {
>>     vm_fault_t ret = VM_FAULT_FALLBACK;
>>     ...
>>
>>     folio = thp_fault_alloc(gfp, vma, haddr, vmf->address);
>>     if (unlikely(!folio))
>>         goto release;
>>     ...
>> }
>>
> 
> Thanks for your help. I will switch to this; just that ret should still
> be initialized to zero in case we spot that the pmd has changed after taking
> the lock. So I'll set it in the case of !folio.

ret = thp_fault_alloc() will have already set it back to zero so there isn't a
bug. But if you prefer to explicitly set it in the "if" to make the code clearer
to read, that's fine by me.

Thanks,
Ryan

> 
>>
>>>> Thanks,
>>>> Ryan
>>>>
>>>>> +                  unsigned long addr)
>>>>>    {
>>>>> -    struct vm_area_struct *vma = vmf->vma;
>>>>> -    struct folio *folio = page_folio(page);
>>>>> -    pgtable_t pgtable;
>>>>> -    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>>> -    vm_fault_t ret = 0;
>>>>> +    struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>>>    -    VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>>>> +    *foliop = folio;
>>>>> +    if (unlikely(!folio)) {
>>>>> +        count_vm_event(THP_FAULT_FALLBACK);
>>>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>> +        return VM_FAULT_FALLBACK;
>>>>> +    }
>>>>>    +    VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>>>>        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>>>>            folio_put(folio);
>>>>>            count_vm_event(THP_FAULT_FALLBACK);
>>>>>            count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>>>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>> -        count_mthp_stat(HPAGE_PMD_ORDER,
>>>>> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>>            return VM_FAULT_FALLBACK;
>>>>>        }
>>>>>        folio_throttle_swaprate(folio, gfp);
>>>>>    -    pgtable = pte_alloc_one(vma->vm_mm);
>>>>> -    if (unlikely(!pgtable)) {
>>>>> -        ret = VM_FAULT_OOM;
>>>>> -        goto release;
>>>>> -    }
>>>>> -
>>>>> -    folio_zero_user(folio, vmf->address);
>>>>> +    folio_zero_user(folio, addr);
>>>>>        /*
>>>>>         * The memory barrier inside __folio_mark_uptodate makes sure that
>>>>>         * folio_zero_user writes become visible before the set_pmd_at()
>>>>>         * write.
>>>>>         */
>>>>>        __folio_mark_uptodate(folio);
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>>>> +{
>>>>> +    count_vm_event(THP_FAULT_ALLOC);
>>>>> +    count_mthp_stat(order, MTHP_STAT_ANON_FAULT_ALLOC);
>>>>> +    count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>>> +}
>>>>> +
>>>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>>> +            struct vm_area_struct *vma, unsigned long haddr,
>>>>> +            pgtable_t pgtable)
>>>>> +{
>>>>> +    pmd_t entry;
>>>>> +
>>>>> +    entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>>> +    entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>>> +    folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>>> +    folio_add_lru_vma(folio, vma);
>>>>> +    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>>> +    set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>>> +    update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>>> +    add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>>>> +    mm_inc_nr_ptes(vma->vm_mm);
>>>>> +}
>>>>> +
>>>>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>>> +{
>>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>>> +    struct folio *folio = NULL;
>>>>> +    pgtable_t pgtable;
>>>>> +    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>>> +    vm_fault_t ret = 0;
>>>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>>>> +
>>>>> +    pgtable = pte_alloc_one(vma->vm_mm);
>>>>> +    if (unlikely(!pgtable)) {
>>>>> +        ret = VM_FAULT_OOM;
>>>>> +        goto release;
>>>>> +    }
>>>>> +
>>>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>>>> +                  vmf->address);
>>>>> +    if (ret)
>>>>> +        goto release;
>>>>>          vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>> +
>>>>>        if (unlikely(!pmd_none(*vmf->pmd))) {
>>>>>            goto unlock_release;
>>>>>        } else {
>>>>> -        pmd_t entry;
>>>>> -
>>>>>            ret = check_stable_address_space(vma->vm_mm);
>>>>>            if (ret)
>>>>>                goto unlock_release;
>>>>> @@ -997,20 +1039,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct
>>>>> vm_fault *vmf,
>>>>>                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
>>>>>                return ret;
>>>>>            }
>>>>> -
>>>>> -        entry = mk_huge_pmd(page, vma->vm_page_prot);
>>>>> -        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>>> -        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>>> -        folio_add_lru_vma(folio, vma);
>>>>> -        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>>> -        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>>> -        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>>> -        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>>>> -        mm_inc_nr_ptes(vma->vm_mm);
>>>>> +        map_pmd_thp(folio, vmf, vma, haddr, pgtable);
>>>>>            spin_unlock(vmf->ptl);
>>>>> -        count_vm_event(THP_FAULT_ALLOC);
>>>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>>>>> -        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>>>>        }
>>>>>          return 0;
>>>>> @@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct
>>>>> vm_fault *vmf,
>>>>>    release:
>>>>>        if (pgtable)
>>>>>            pte_free(vma->vm_mm, pgtable);
>>>>> -    folio_put(folio);
>>>>> +    if (folio)
>>>>> +        folio_put(folio);
>>>>>        return ret;
>>>>>      }
>>>>> @@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable,
>>>>> struct mm_struct *mm,
>>>>>    vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>>>    {
>>>>>        struct vm_area_struct *vma = vmf->vma;
>>>>> -    gfp_t gfp;
>>>>> -    struct folio *folio;
>>>>>        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>>>        vm_fault_t ret;
>>>>>    @@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct
>>>>> vm_fault *vmf)
>>>>>            }
>>>>>            return ret;
>>>>>        }
>>>>> -    gfp = vma_thp_gfp_mask(vma);
>>>>> -    folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
>>>>> -    if (unlikely(!folio)) {
>>>>> -        count_vm_event(THP_FAULT_FALLBACK);
>>>>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>> -        return VM_FAULT_FALLBACK;
>>>>> -    }
>>>>> -    return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
>>>>> +
>>>>> +    return __do_huge_pmd_anonymous_page(vmf);
>>>>>    }
>>>>>      static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long
>>>>> addr,



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault
  2024-09-06  9:00         ` Dev Jain
@ 2024-09-06  9:31           ` Ryan Roberts
  0 siblings, 0 replies; 22+ messages in thread
From: Ryan Roberts @ 2024-09-06  9:31 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy, kirill.shutemov
  Cc: anshuman.khandual, catalin.marinas, cl, vbabka, mhocko, apopple,
	dave.hansen, will, baohua, jack, mark.rutland, hughd,
	aneesh.kumar, yang, peterx, ioworker0, jglisse, linux-arm-kernel,
	linux-kernel, linux-mm

On 06/09/2024 10:00, Dev Jain wrote:
> 
> On 9/6/24 14:13, Ryan Roberts wrote:
>> On 06/09/2024 08:05, Dev Jain wrote:
>>> On 9/5/24 18:44, Ryan Roberts wrote:
>>>> On 04/09/2024 11:09, Dev Jain wrote:
>>>>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>>>>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>>>>> previous patch to flush TLB entry corresponding to the hugezeropage,
>>>>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>>>>> splitting the PMD.
>>>>>
>>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>>> ---
>>>>>    include/linux/huge_mm.h |  6 ++++
>>>>>    mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>>>>    mm/memory.c             |  5 +--
>>>>>    3 files changed, 78 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index e25d9ebfdf89..fdd2cf473a3c 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -9,6 +9,12 @@
>>>>>    #include <linux/kobject.h>
>>>>>      vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>>>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>>>> +               unsigned long haddr, struct folio **foliop,
>>>>> +               unsigned long addr);
>>>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>>>> +         pgtable_t pgtable);
>>>> I don't think you are using either of these outside of huge_memory.c, so not
>>>> sure you need to declare them here or make them non-static?
>>> As pointed out by Kirill, you are right, I forgot to drop these from my previous
>>> approach.
>>>
>>>>>    int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>>>              pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>>>>              struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 58125fbcc532..150163ad77d3 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>>>> unsigned long addr,
>>>>>    }
>>>>>    EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>>>    -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>>>> vm_area_struct *vma,
>>>>> -                  unsigned long haddr, struct folio **foliop,
>>>>> -                  unsigned long addr)
>>>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>>>> +               unsigned long haddr, struct folio **foliop,
>>>>> +               unsigned long addr)
>>>>>    {
>>>>>        struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>>>    @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct
>>>>> vm_area_struct *vma, int order)
>>>>>        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>>>    }
>>>>>    -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>>> -            struct vm_area_struct *vma, unsigned long haddr,
>>>>> -            pgtable_t pgtable)
>>>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>>>> +         pgtable_t pgtable)
>>>>>    {
>>>>> -    pmd_t entry;
>>>>> +    pmd_t entry, old_pmd;
>>>>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>>>>          entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>>>        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>>>        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>>>        folio_add_lru_vma(folio, vma);
>>>>> -    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>>> +    if (!is_pmd_none) {
>>>>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>>>>> +        if (pmd_uffd_wp(old_pmd))
>>>>> +            entry = pmd_mkuffd_wp(entry);
>>>> I don't really get this; entry is writable, so I wouldn't expect to also be
>>>> setting uffd-wp here? That combination is not allowed and is checked for in
>>>> page_table_check_pte_flags().
>>>>
>>>> It looks like you expect to get here in the uffd-wp-async case, which used to
>>>> cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
>>>> would cause the uffd-wp bit to be set in each of the new ptes, then during
>>>> fallback to handling the wp fault on the pte, uffd would handle it?
>>> I guess you are correct; I missed the WARN_ON() in page_table_check_pmd_flags(),
>>> but I did see, if I read the uffd code correctly, that mfill_atomic() will just
>>> return in case of pmd_trans_huge(*dst_pmd) while doing a uffd_copy to the
>>> destination
>>> location. So preserving pmd_uffd_wp is useless in case a THP is mapped, but I
>>> did not
>>> know that in fact it is supposed to be an invalid combination. So, I will
>>> drop it,
>>> unless someone has some other objection.
>> So what's the correct way to handle uffd-wp-async in wp_huge_pmd()? Just split
>> it? If so, you can revert your changes in memory.c.
> 
> I think so.
> 
>>
>>>>> +    }
>>>>> +    if (pgtable)
>>>>> +        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>> Should this call be moved outside of here? It doesn't really feel like it
>>>> belongs. Could it be called before calling map_pmd_thp() for the site that
>>>> has a
>>>> pgtable?
>>> Every other place I checked, they are doing this: make the entry -> deposit
>>> pgtable ->
>>> set_pmd_at(). I guess the general flow is to do the deposit based on the old
>>> pmd, before
>>> setting the new one. Which implies: I should at least move this check before
>>> I call
>>> pmdp_huge_clear_flush(). And, since vmf->pmd and creating the new entry has no
>>> relation,
>>> I am inclined to do what you are saying.
>> pgtable_trans_huge_deposit() is just adding the pgtable to a list so that if the
>> THP needs to be split in future, then we have preallocated the pte pgtable so
>> the operation can't fail.
> 
> Yes.
> 
>> And enqueuing it is just under the protection of the
>> PTL as far as I can tell. So I think the only ordering requirement is that you
>> both set the pmd and deposit the pgtable under the lock (without dropping it in
>> between). So you can deposit either before or after map_pmd_thp().
> 
> Yes I'll do that before.
> 
>> And
>> pmdp_huge_clear_flush() is irrelevant, I think?
> 
> You mean, in this context? Everywhere, the pgtable deposit takes as its input
> the pmd slot whose old value is about to be replaced, that is, it is called
> before set_pmd_at(). So calling the pgtable deposit after clear_flush() would
> violate this ordering. I do not think this ordering is really required, but
> I'd rather be safe :)

The pmd pointer is just used to get the pmd table (the pointer points to an
entry inside the table so it's just a case of backwards aligning the pointer).
The pointer is never dereferenced, so the value of the entry is irrelevant.
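(For reference, the generic implementation is roughly the following -- a
paraphrased sketch of mm/pgtable-generic.c from memory, assuming the usual
split-PMD-ptlock configuration; arch overrides differ:)

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/*
	 * pmdp is only used to locate the pmd page via pmd_huge_pte();
	 * the entry it points at is never read. A FIFO list of deposited
	 * pte tables hangs off that page.
	 */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
	pmd_huge_pte(mm, pmdp) = pgtable;
}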

> 
>>
>>>>>        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>>>        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>>>        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>>>> -    mm_inc_nr_ptes(vma->vm_mm);
>>>>> +    if (is_pmd_none)
>>>>> +        mm_inc_nr_ptes(vma->vm_mm);
>>>>>    }
>>>>>      static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>>>        spin_unlock(vmf->ptl);
>>>>>    }
>>>>>    +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>>>>> +                         unsigned long haddr,
>>>>> +                         struct folio *folio)
>>>>> +{
>>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>>> +    vm_fault_t ret = 0;
>>>>> +
>>>>> +    ret = check_stable_address_space(vma->vm_mm);
>>>>> +    if (ret)
>>>>> +        goto out;
>>>>> +    map_pmd_thp(folio, vmf, vma, haddr, NULL);
>>>>> +out:
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long
>>>>> haddr)
>>>>> +{
>>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>>>> +    struct mmu_notifier_range range;
>>>>> +    struct folio *folio = NULL;
>>>>> +    vm_fault_t ret = 0;
>>>>> +
>>>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>>>> +                  vmf->address);
>>>> Just checking: the PTE table was already allocated during the read fault,
>>>> right?
>>>> So we don't have to allocate it here.
>>> Correct, that happens in set_huge_zero_folio(). Thanks for checking.
>>>
>>>>> +    if (ret)
>>>>> +        goto out;
>>>>> +
>>>>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>>>>> +                haddr + HPAGE_PMD_SIZE);
>>>>> +    mmu_notifier_invalidate_range_start(&range);
>>>>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>>>>> +        goto unlock;
>>>>> +    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>>>>> +    if (!ret)
>>>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>>>> +unlock:
>>>>> +    spin_unlock(vmf->ptl);
>>>>> +    mmu_notifier_invalidate_range_end(&range);
>>>> I'll confess I don't understand all the mmu notifier rules.
>>> I confess the same :)
>>>
>>>> But the doc at
>>>> Documentation/mm/mmu_notifier.rst implies that the notification must be done
>>>> while holding the PTL. Although that's not how wp_page_copy() does it. Are
>>>> you confident that what you have done is correct?
>>> Everywhere else, invalidate_range_end() is called after dropping the lock;
>>> one reason is that it contains a might_sleep(), so we cannot call it while
>>> holding a spinlock. I still don't know exactly what these calls mean... but
>>> I think what I have done is correct.
>> At a high level, they notify secondary MMUs (e.g. the IOMMU) of a change so
>> that the page tables of those secondary MMUs can be kept in sync. I don't
>> understand all the ordering requirement details, though.
>>
>> I think what you have is probably correct; it would be good for someone who
>> knows what they are talking about to confirm, though :)
> 
> Exactly.
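
For reference, the ordering being agreed on here, reduced to a skeleton (an
illustrative condensation of the hunk above, with the error paths and stats
elided):

	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
				haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);	/* outside the PTL */

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);	/* spinlock */
	if (pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)) {
		/* ... install the new PMD under the lock ... */
	}
	spin_unlock(vmf->ptl);

	/* Only after the unlock: invalidate_range_end() may sleep. */
	mmu_notifier_invalidate_range_end(&range);
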
> 
>>
>>>> Thanks,
>>>> Ryan
>>>>
>>>>> +out:
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>>    vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>>>    {
>>>>>        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>>>>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>>>        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>>>>        VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>>>>    -    if (is_huge_zero_pmd(orig_pmd))
>>>>> +    if (is_huge_zero_pmd(orig_pmd)) {
>>>>> +        vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>>>>> +
>>>>> +        if (!(ret & VM_FAULT_FALLBACK))
>>>>> +            return ret;
>>>>> +
>>>>> +        /* Fall back to splitting the PMD if THP cannot be allocated */
>>>>>            goto fallback;
>>>>> +    }
>>>>>          spin_lock(vmf->ptl);
>>>>>    diff --git a/mm/memory.c b/mm/memory.c
>>>>> index 3c01d68065be..c081a25f5173 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault
>>>>> *vmf)
>>>>>        if (vma_is_anonymous(vma)) {
>>>>>            if (likely(!unshare) &&
>>>>>                userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>>>>> -            if (userfaultfd_wp_async(vmf->vma))
>>>>> +            if (!userfaultfd_wp_async(vmf->vma))
>>>>> +                return handle_userfault(vmf, VM_UFFD_WP);
>>>>> +            if (!is_huge_zero_pmd(vmf->orig_pmd))
>>>>>                    goto split;
>>>>> -            return handle_userfault(vmf, VM_UFFD_WP);
>>>>>            }
>>>>>            return do_huge_pmd_wp_page(vmf);
>>>>>        }
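
Putting that last hunk in context, the resulting control flow in
wp_huge_pmd() reads roughly as follows (a paraphrase of the diff above, not a
verbatim copy of mm/memory.c):

	if (vma_is_anonymous(vma)) {
		if (likely(!unshare) &&
		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
			/* Synchronous uffd-wp: hand the fault to userspace. */
			if (!userfaultfd_wp_async(vmf->vma))
				return handle_userfault(vmf, VM_UFFD_WP);
			/*
			 * Async uffd-wp: only split a real THP; the huge zero
			 * PMD now falls through to do_huge_pmd_wp_page().
			 */
			if (!is_huge_zero_pmd(vmf->orig_pmd))
				goto split;
		}
		return do_huge_pmd_wp_page(vmf);
	}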




Thread overview: 22+ messages
2024-09-04 10:09 [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault Dev Jain
2024-09-04 10:09 ` [PATCH v2 1/2] mm: Abstract THP allocation Dev Jain
2024-09-05  8:20   ` Kirill A. Shutemov
2024-09-05  8:45     ` Dev Jain
2024-09-05 11:08   ` Ryan Roberts
2024-09-06  5:42     ` Dev Jain
2024-09-06  8:28       ` Ryan Roberts
2024-09-06  8:45         ` Dev Jain
2024-09-06  9:00           ` Ryan Roberts
2024-09-04 10:09 ` [PATCH v2 2/2] mm: Allocate THP on hugezeropage wp-fault Dev Jain
2024-09-05  8:26   ` Kirill A. Shutemov
2024-09-05  8:52     ` Dev Jain
2024-09-05  9:41     ` Kefeng Wang
2024-09-05  9:53       ` Dev Jain
2024-09-05 13:14   ` Ryan Roberts
2024-09-06  7:05     ` Dev Jain
2024-09-06  8:43       ` Ryan Roberts
2024-09-06  9:00         ` Dev Jain
2024-09-06  9:31           ` Ryan Roberts
2024-09-04 11:36 ` [PATCH v2 0/2] Do not shatter hugezeropage on wp-fault Ryan Roberts
2024-09-04 15:41   ` Dev Jain
2024-09-04 16:01     ` Ryan Roberts
