LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Yin Tirui <yintirui@huawei.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	Matthew Wilcox <willy@infradead.org>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <ljs@kernel.org>, Juergen Gross <jgross@suse.com>,
	Jonathan Cameron <jic23@kernel.org>,
	Will Deacon <will@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>,
	Peter Xu <peterx@redhat.com>,
	Luiz Capitulino <luizcap@redhat.com>,
	Thomas Gleixner <tglx@kernel.org>, Ingo Molnar <mingo@redhat.com>,
	Borislav Petkov <bp@alien8.de>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	"H . Peter Anvin" <hpa@zytor.com>,
	Andy Lutomirski <luto@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Madhavan Srinivasan <maddy@linux.ibm.com>,
	Michael Ellerman <mpe@ellerman.id.au>,
	Nicholas Piggin <npiggin@gmail.com>,
	Christophe Leroy <chleroy@kernel.org>,
	"Liam R . Howlett" <liam@infradead.org>, Zi Yan <ziy@nvidia.com>,
	Baolin Wang <baolin.wang@linux.alibaba.com>,
	Nico Pache <npache@redhat.com>,
	Ryan Roberts <ryan.roberts@arm.com>, Dev Jain <dev.jain@arm.com>,
	Barry Song <baohua@kernel.org>, Lance Yang <lance.yang@linux.dev>,
	Vlastimil Babka <vbabka@kernel.org>,
	Mike Rapoport <rppt@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Michal Hocko <mhocko@suse.com>,
	Anshuman Khandual <anshuman.khandual@arm.com>,
	Rohan McLure <rmclure@linux.ibm.com>,
	Kevin Brodsky <kevin.brodsky@arm.com>,
	Alistair Popple <apopple@nvidia.com>,
	Andrew Donnellan <andrew+kernel@donnellan.id.au>,
	Pasha Tatashin <pasha.tatashin@soleen.com>,
	Baoquan He <bhe@redhat.com>, Thomas Huth <thuth@redhat.com>,
	Coiby Xu <coxu@redhat.com>, Dan Williams <djbw@kernel.org>,
	Yu-cheng Yu <yu-cheng.yu@intel.com>,
	Lu Baolu <baolu.lu@linux.intel.com>,
	Conor Dooley <conor.dooley@microchip.com>,
	Rik van Riel <riel@surriel.com>, <wangkefeng.wang@huawei.com>,
	<chenjun102@huawei.com>, <yintirui@huawei.com>,
	<linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>,
	<x86@kernel.org>, <linux-arm-kernel@lists.infradead.org>,
	<linuxppc-dev@lists.ozlabs.org>, <linux-pm@vger.kernel.org>
Subject: [PATCH mm-unstable RFC v4 4/7] mm/huge_memory: refactor copy_huge_pmd()
Date: Tue, 26 May 2026 22:50:00 +0800	[thread overview]
Message-ID: <20260526145003.88445-5-yintirui@huawei.com> (raw)
In-Reply-To: <20260526145003.88445-1-yintirui@huawei.com>

Classify the source PMD via pmd_present() and vm_normal_folio_pmd(),
matching the way the PTE path uses pte_present() and vm_normal_page().
This moves the present-PMD decision from VMA identity checks to the
actual PMD/folio state.

Drop the defensive "if (!pmd_trans_huge(pmd)) goto out_unlock" branch:
with mmap_write_lock held during fork, it should not occur.

Extract the present-PMD side of copy_huge_pmd() into
copy_present_huge_pmd(). The helper owns the child pgtable passed by the
caller: it either deposits the pgtable when installing a copied PMD, or
frees it on paths that do not install one.

The child pgtable is now allocated once up front and freed on every skip
path. This makes file/shmem and PFNMAP/special skip paths take the PMD
locks and free the preallocated pgtable before returning. These are not
expected to be hot paths, and the PFNMAP case is reused by the follow-up
PMD PFNMAP copy support.

Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
 mm/huge_memory.c | 175 +++++++++++++++++++++++++----------------------
 1 file changed, 95 insertions(+), 80 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9832ee910d5e..3964258ff91d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1879,6 +1879,82 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return false;
 }
 
+static int copy_present_huge_pmd(
+		struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+		pmd_t pmd, pgtable_t pgtable, bool *need_split)
+{
+	struct folio *src_folio;
+	bool wrprotect = true;
+
+	src_folio = vm_normal_folio_pmd(src_vma, addr, pmd);
+	if (!src_folio) {
+		/*
+		 * When page table lock is held, the huge zero pmd should not be
+		 * under splitting since we don't split the page itself, only pmd to
+		 * a page table.
+		 */
+		if (is_huge_zero_pmd(pmd)) {
+			/*
+			 * mm_get_huge_zero_folio() will never allocate a new
+			 * folio here, since we already have a zero page to
+			 * copy. It just takes a reference.
+			 */
+			mm_get_huge_zero_folio(dst_mm);
+			goto set_pmd;
+		}
+
+		/*
+		 * Making sure it's not a CoW VMA with writable
+		 * mapping, otherwise it means either the anon page wrongly
+		 * applied special bit, or we made the PRIVATE mapping be
+		 * able to wrongly write to the backend MMIO.
+		 */
+		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
+		pte_free(dst_mm, pgtable);
+		pgtable = NULL;
+		wrprotect = false;
+		goto set_pmd;
+	}
+
+	/* File THPs are copied lazily by refaulting. */
+	if (!folio_test_anon(src_folio)) {
+		pte_free(dst_mm, pgtable);
+		return 0;
+	}
+
+	folio_get(src_folio);
+	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio,
+							&src_folio->page,
+							dst_vma, src_vma))) {
+		/* Page maybe pinned: split and retry the fault on PTEs. */
+		folio_put(src_folio);
+		pte_free(dst_mm, pgtable);
+		*need_split = true;
+		return -EAGAIN;
+	}
+	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+set_pmd:
+	if (pgtable) {
+		mm_inc_nr_ptes(dst_mm);
+		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+	}
+
+	if (wrprotect) {
+		pmdp_set_wrprotect(src_mm, addr, src_pmd);
+		if (!userfaultfd_wp(dst_vma))
+			pmd = pmd_clear_uffd_wp(pmd);
+		pmd = pmd_wrprotect(pmd);
+	}
+
+	pmd = pmd_mkold(pmd);
+	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+
+	return 0;
+}
+
 static void copy_huge_non_present_pmd(
 		struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
@@ -1940,104 +2016,43 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
 	spinlock_t *dst_ptl, *src_ptl;
-	struct page *src_page;
-	struct folio *src_folio;
-	pmd_t pmd;
 	pgtable_t pgtable = NULL;
-	int ret = -ENOMEM;
-
-	pmd = pmdp_get_lockless(src_pmd);
-	if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
-		     !is_huge_zero_pmd(pmd))) {
-		dst_ptl = pmd_lock(dst_mm, dst_pmd);
-		src_ptl = pmd_lockptr(src_mm, src_pmd);
-		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
-		/*
-		 * No need to recheck the pmd, it can't change with write
-		 * mmap lock held here.
-		 *
-		 * Meanwhile, making sure it's not a CoW VMA with writable
-		 * mapping, otherwise it means either the anon page wrongly
-		 * applied special bit, or we made the PRIVATE mapping be
-		 * able to wrongly write to the backend MMIO.
-		 */
-		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
-		goto set_pmd;
-	}
-
-	/* Skip if can be re-fill on fault */
-	if (!vma_is_anonymous(dst_vma))
-		return 0;
+	bool need_split = false;
+	int ret = 0;
+	pmd_t pmd;
 
 	pgtable = pte_alloc_one(dst_mm);
 	if (unlikely(!pgtable))
-		goto out;
+		return -ENOMEM;
 
 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
 	src_ptl = pmd_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 
-	ret = -EAGAIN;
 	pmd = *src_pmd;
 
-	if (unlikely(thp_migration_supported() &&
-		     pmd_is_valid_softleaf(pmd))) {
-		copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
+	if (likely(pmd_present(pmd))) {
+		ret = copy_present_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
+					  dst_vma, src_vma, pmd, pgtable, &need_split);
+	} else if (unlikely(thp_migration_supported() && pmd_is_valid_softleaf(pmd))) {
+		if (unlikely(!vma_is_anonymous(dst_vma)))
+			pte_free(dst_mm, pgtable);
+		else
+			copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
 					  dst_vma, src_vma, pmd, pgtable);
-		ret = 0;
-		goto out_unlock;
-	}
-
-	if (unlikely(!pmd_trans_huge(pmd))) {
+	} else {
+		VM_WARN_ONCE(1, "unexpected non-present PMD %llx\n",
+				(unsigned long long)pmd_val(pmd));
 		pte_free(dst_mm, pgtable);
-		goto out_unlock;
-	}
-	/*
-	 * When page table lock is held, the huge zero pmd should not be
-	 * under splitting since we don't split the page itself, only pmd to
-	 * a page table.
-	 */
-	if (is_huge_zero_pmd(pmd)) {
-		/*
-		 * mm_get_huge_zero_folio() will never allocate a new
-		 * folio here, since we already have a zero page to
-		 * copy. It just takes a reference.
-		 */
-		mm_get_huge_zero_folio(dst_mm);
-		goto out_zero_page;
+		ret = -EAGAIN;
 	}
 
-	src_page = pmd_page(pmd);
-	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
-	src_folio = page_folio(src_page);
+	spin_unlock(src_ptl);
+	spin_unlock(dst_ptl);
 
-	folio_get(src_folio);
-	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
-		/* Page maybe pinned: split and retry the fault on PTEs. */
-		folio_put(src_folio);
-		pte_free(dst_mm, pgtable);
-		spin_unlock(src_ptl);
-		spin_unlock(dst_ptl);
+	if (unlikely(need_split))
 		__split_huge_pmd(src_vma, src_pmd, addr, false);
-		return -EAGAIN;
-	}
-	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-out_zero_page:
-	mm_inc_nr_ptes(dst_mm);
-	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-	pmdp_set_wrprotect(src_mm, addr, src_pmd);
-	if (!userfaultfd_wp(dst_vma))
-		pmd = pmd_clear_uffd_wp(pmd);
-	pmd = pmd_wrprotect(pmd);
-set_pmd:
-	pmd = pmd_mkold(pmd);
-	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 
-	ret = 0;
-out_unlock:
-	spin_unlock(src_ptl);
-	spin_unlock(dst_ptl);
-out:
 	return ret;
 }
 
-- 
2.43.0



  parent reply	other threads:[~2026-05-26 22:39 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-26 14:49 [PATCH mm-unstable RFC v4 0/7] mm: add huge pfnmap support for remap_pfn_range() Yin Tirui
2026-05-26 14:49 ` [PATCH mm-unstable RFC v4 1/7] x86/mm: use PTE-level pgprot for huge PFN helpers Yin Tirui
2026-05-26 14:49 ` [PATCH mm-unstable RFC v4 2/7] arm64/mm: " Yin Tirui
2026-05-26 14:49 ` [PATCH mm-unstable RFC v4 3/7] powerpc/mm: " Yin Tirui
2026-05-26 14:50 ` Yin Tirui [this message]
2026-05-27 12:24   ` [PATCH mm-unstable RFC v4 4/7] mm/huge_memory: refactor copy_huge_pmd() Dev Jain
2026-05-26 14:50 ` [PATCH mm-unstable RFC v4 5/7] mm/huge_memory: refactor __split_huge_pmd_locked() Yin Tirui
2026-05-26 14:50 ` [PATCH mm-unstable RFC v4 6/7] mm/huge_memory: make move_huge_pmd() use has_deposited_pgtable() Yin Tirui
2026-05-26 14:50 ` [PATCH mm-unstable RFC v4 7/7] mm: add PMD-level PFNMAP support for remap_pfn_range() Yin Tirui
2026-05-26 15:33 ` [PATCH mm-unstable RFC v4 0/7] mm: add huge pfnmap " Lorenzo Stoakes
2026-05-27  2:57   ` Yin Tirui

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260526145003.88445-5-yintirui@huawei.com \
    --to=yintirui@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=andrew+kernel@donnellan.id.au \
    --cc=anshuman.khandual@arm.com \
    --cc=apopple@nvidia.com \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=baolu.lu@linux.intel.com \
    --cc=bhe@redhat.com \
    --cc=bp@alien8.de \
    --cc=catalin.marinas@arm.com \
    --cc=chenjun102@huawei.com \
    --cc=chleroy@kernel.org \
    --cc=conor.dooley@microchip.com \
    --cc=coxu@redhat.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=djbw@kernel.org \
    --cc=hpa@zytor.com \
    --cc=jgross@suse.com \
    --cc=jic23@kernel.org \
    --cc=kevin.brodsky@arm.com \
    --cc=lance.yang@linux.dev \
    --cc=liam@infradead.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-pm@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=ljs@kernel.org \
    --cc=luizcap@redhat.com \
    --cc=luto@kernel.org \
    --cc=maddy@linux.ibm.com \
    --cc=mhocko@suse.com \
    --cc=mingo@redhat.com \
    --cc=mpe@ellerman.id.au \
    --cc=npache@redhat.com \
    --cc=npiggin@gmail.com \
    --cc=pasha.tatashin@soleen.com \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=riel@surriel.com \
    --cc=rmclure@linux.ibm.com \
    --cc=rppt@kernel.org \
    --cc=ryan.roberts@arm.com \
    --cc=surenb@google.com \
    --cc=tglx@kernel.org \
    --cc=thuth@redhat.com \
    --cc=vbabka@kernel.org \
    --cc=wangkefeng.wang@huawei.com \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=x86@kernel.org \
    --cc=yu-cheng.yu@intel.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox