From: Alistair Popple <apopple@nvidia.com>
To: dan.j.williams@intel.com, vishal.l.verma@intel.com,
dave.jiang@intel.com, logang@deltatee.com, bhelgaas@google.com,
jack@suse.cz, jgg@ziepe.ca
Cc: catalin.marinas@arm.com, will@kernel.org, mpe@ellerman.id.au,
npiggin@gmail.com, dave.hansen@linux.intel.com,
ira.weiny@intel.com, willy@infradead.org, djwong@kernel.org,
tytso@mit.edu, linmiaohe@huawei.com, david@redhat.com,
peterx@redhat.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org,
linux-arm-kernel@lists.infradead.org,
linuxppc-dev@lists.ozlabs.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-mm@kvack.org, linux-ext4@vger.kernel.org,
linux-xfs@vger.kernel.org, jhubbard@nvidia.com, hch@lst.de,
david@fromorbit.com, Alistair Popple <apopple@nvidia.com>
Subject: [PATCH 07/13] huge_memory: Allow mappings of PUD sized pages
Date: Thu, 27 Jun 2024 10:54:22 +1000 [thread overview]
Message-ID: <bd332b0d3971b03152b3541f97470817c5147b51.1719386613.git-series.apopple@nvidia.com> (raw)
In-Reply-To: <cover.66009f59a7fe77320d413011386c3ae5c2ee82eb.1719386613.git-series.apopple@nvidia.com>
Currently DAX folio/page reference counts are managed differently to
normal pages. To allow these to be managed the same as normal pages
introduce dax_insert_pfn_pud. This will map the entire PUD-sized folio
and take references as it would for a normally mapped page.
This is distinct from the current mechanism, vmf_insert_pfn_pud, which
simply inserts a special devmap PUD entry into the page table without
holding a reference to the page for the mapping.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
include/linux/huge_mm.h | 4 ++-
include/linux/rmap.h | 14 +++++-
mm/huge_memory.c | 108 ++++++++++++++++++++++++++++++++++++++---
mm/rmap.c | 48 ++++++++++++++++++-
4 files changed, 168 insertions(+), 6 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2aa986a..b98a3cc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_UNSUPPORTED,
@@ -106,6 +107,9 @@ extern struct kobj_attribute shmem_enabled_attr;
#define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1))
#define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT)
+#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long transparent_hugepage_flags;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 7229b9b..c5a0205 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -192,6 +192,7 @@ typedef int __bitwise rmap_t;
enum rmap_level {
RMAP_LEVEL_PTE = 0,
RMAP_LEVEL_PMD,
+ RMAP_LEVEL_PUD,
};
static inline void __folio_rmap_sanity_checks(struct folio *folio,
@@ -225,6 +226,13 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio,
VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
break;
+ case RMAP_LEVEL_PUD:
+ /*
+ * Asume that we are creating * a single "entire" mapping of the folio.
+ */
+ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
+ VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
+ break;
default:
VM_WARN_ON_ONCE(true);
}
@@ -248,12 +256,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
struct vm_area_struct *);
+void folio_add_file_rmap_pud(struct folio *, struct page *,
+ struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
struct vm_area_struct *);
+void folio_remove_rmap_pud(struct folio *, struct page *,
+ struct vm_area_struct *);
void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
@@ -338,6 +350,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
atomic_add(orig_nr_pages, &folio->_large_mapcount);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
atomic_inc(&folio->_entire_mapcount);
atomic_inc(&folio->_large_mapcount);
break;
@@ -434,6 +447,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
atomic_add(orig_nr_pages, &folio->_large_mapcount);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
if (PageAnonExclusive(page)) {
if (unlikely(maybe_pinned))
return -EBUSY;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index db7946a..e1f053e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1283,6 +1283,70 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+
+/**
+ * dax_insert_pfn_pud - insert a pud size pfn backed by a normal page
+ * @vmf: Structure describing the fault
+ * @pfn: pfn of the page to insert
+ * @write: whether it's a write fault
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PUD_MASK;
+ pud_t *pud = vmf->pud;
+ pgprot_t prot = vma->vm_page_prot;
+ struct mm_struct *mm = vma->vm_mm;
+ pud_t entry;
+ spinlock_t *ptl;
+ struct folio *folio;
+ struct page *page;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &prot, pfn);
+
+ ptl = pud_lock(mm, pud);
+ if (!pud_none(*pud)) {
+ if (write) {
+ if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+ goto out_unlock;
+ }
+ entry = pud_mkyoung(*pud);
+ entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+ if (pudp_set_access_flags(vma, addr, pud, entry, 1))
+ update_mmu_cache_pud(vma, addr, pud);
+ }
+ goto out_unlock;
+ }
+
+ entry = pud_mkhuge(pfn_t_pud(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pud_mkdevmap(entry);
+ if (write) {
+ entry = pud_mkyoung(pud_mkdirty(entry));
+ entry = maybe_pud_mkwrite(entry, vma);
+ }
+
+ page = pfn_t_to_page(pfn);
+ folio = page_folio(page);
+ folio_get(folio);
+ folio_add_file_rmap_pud(folio, page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
+
+ set_pud_at(mm, addr, pud, entry);
+ update_mmu_cache_pud(vma, addr, pud);
+
+out_unlock:
+ spin_unlock(ptl);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(dax_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1836,7 +1900,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else if (is_huge_zero_pmd(orig_pmd)) {
- zap_deposited_table(tlb->mm, pmd);
+ if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
struct folio *folio = NULL;
@@ -2268,20 +2333,34 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
pud_t *pud, unsigned long addr)
{
+ pud_t orig_pud;
spinlock_t *ptl;
ptl = __pud_trans_huge_lock(pud, vma);
if (!ptl)
return 0;
- pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
+ orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
- /* No support for anonymous PUD pages yet */
- BUG();
+ struct page *page = NULL;
+ struct folio *folio;
+
+ /* No support for anonymous PUD pages or migration yet */
+ BUG_ON(vma_is_anonymous(vma) || !pud_present(orig_pud));
+
+ page = pud_page(orig_pud);
+ folio = page_folio(page);
+ folio_remove_rmap_pud(folio, page, vma);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+
+ spin_unlock(ptl);
+ tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
}
return 1;
}
@@ -2289,6 +2368,8 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
unsigned long haddr)
{
+ pud_t old_pud;
+
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
@@ -2296,7 +2377,22 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
count_vm_event(THP_SPLIT_PUD);
- pudp_huge_clear_flush(vma, haddr, pud);
+ old_pud = pudp_huge_clear_flush(vma, haddr, pud);
+ if (is_huge_zero_pud(old_pud))
+ return;
+
+ if (vma_is_dax(vma)) {
+ struct page *page = pud_page(old_pud);
+ struct folio *folio = page_folio(page);
+
+ if (!folio_test_dirty(folio) && pud_dirty(old_pud))
+ folio_mark_dirty(folio);
+ if (!folio_test_referenced(folio) && pud_young(old_pud))
+ folio_set_referenced(folio);
+ folio_remove_rmap_pud(folio, page, vma);
+ folio_put(folio);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+ }
}
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
diff --git a/mm/rmap.c b/mm/rmap.c
index e8fc5ec..e949e4f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1165,6 +1165,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
atomic_add(orig_nr_pages, &folio->_large_mapcount);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
if (first) {
nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
@@ -1306,6 +1307,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
case RMAP_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
+ case RMAP_LEVEL_PUD:
+ /*
+ * Keep the compiler happy, we don't support anonymous PUD mappings.
+ */
+ WARN_ON_ONCE(1);
+ break;
}
}
for (i = 0; i < nr_pages; i++) {
@@ -1489,6 +1496,26 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
+ * @folio: The folio to add the mapping to
+ * @page: The first page to add
+ * @vma: The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum rmap_level level)
@@ -1521,6 +1548,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
atomic_dec(&folio->_large_mapcount);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
@@ -1615,6 +1643,26 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
+ * @folio: The folio to remove the mapping from
+ * @page: The first page to remove
+ * @vma: The vm area from which the mapping is removed
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
/*
* @arg: enum ttu_flags will be passed to this argument
*/
--
git-series 0.9.1
next prev parent reply other threads:[~2024-06-27 0:55 UTC|newest]
Thread overview: 54+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-06-27 0:54 [PATCH 00/13] fs/dax: Fix FS DAX page reference counts Alistair Popple
2024-06-27 0:54 ` [PATCH 01/13] mm/gup.c: Remove redundant check for PCI P2PDMA page Alistair Popple
2024-06-27 6:36 ` Dan Williams
2024-06-27 0:54 ` [PATCH 02/13] pci/p2pdma: Don't initialise page refcount to one Alistair Popple
2024-06-27 5:30 ` Christoph Hellwig
2024-06-29 21:28 ` Bjorn Helgaas
2024-06-27 0:54 ` [PATCH 03/13] fs/dax: Refactor wait for dax idle page Alistair Popple
2024-06-27 5:31 ` Christoph Hellwig
2024-06-27 0:54 ` [PATCH 04/13] fs/dax: Add dax_page_free callback Alistair Popple
2024-06-27 5:33 ` Christoph Hellwig
2024-06-27 23:48 ` Alistair Popple
2024-06-27 0:54 ` [PATCH 05/13] mm: Allow compound zone device pages Alistair Popple
2024-06-27 5:35 ` Christoph Hellwig
2024-06-27 0:54 ` [PATCH 06/13] mm/memory: Add dax_insert_pfn Alistair Popple
2024-06-27 5:22 ` Christoph Hellwig
2024-06-27 11:33 ` Jan Kara
2024-09-06 6:21 ` Alistair Popple
2024-07-02 7:18 ` David Hildenbrand
2024-07-02 10:47 ` Alistair Popple
2024-07-02 11:46 ` Christoph Hellwig
2024-07-02 11:53 ` David Hildenbrand
2024-06-27 0:54 ` Alistair Popple [this message]
2024-06-27 22:26 ` [PATCH 07/13] huge_memory: Allow mappings of PUD sized pages kernel test robot
2024-07-02 7:16 ` David Hildenbrand
2024-07-02 10:19 ` Alistair Popple
2024-07-02 11:02 ` David Hildenbrand
2024-07-02 11:30 ` Alistair Popple
2024-07-02 13:01 ` David Hildenbrand
2024-07-02 11:51 ` Christoph Hellwig
2024-06-27 0:54 ` [PATCH 08/13] huge_memory: Allow mappings of PMD " Alistair Popple
2024-06-27 0:54 ` [PATCH 09/13] gup: Don't allow FOLL_LONGTERM pinning of FS DAX pages Alistair Popple
2024-07-01 8:59 ` David Hildenbrand
2024-07-01 23:47 ` Alistair Popple
2024-07-02 10:48 ` David Hildenbrand
2024-06-27 0:54 ` [PATCH 10/13] fs/dax: Properly refcount fs dax pages Alistair Popple
2024-06-27 5:44 ` Christoph Hellwig
2024-09-06 6:00 ` Alistair Popple
2024-06-27 0:54 ` [PATCH 11/13] huge_memory: Remove dead vmf_insert_pXd code Alistair Popple
2024-07-05 14:24 ` Peter Xu
2024-07-09 4:07 ` Alistair Popple
2024-07-09 15:56 ` Peter Xu
2024-07-12 2:40 ` Alistair Popple
2024-07-12 15:52 ` Peter Xu
2024-06-27 0:54 ` [PATCH 12/13] mm: Remove pXX_devmap callers Alistair Popple
2024-06-27 0:54 ` [PATCH 13/13] mm: Remove devmap related functions and page table bits Alistair Popple
2024-06-27 23:04 ` kernel test robot
2024-06-28 2:12 ` kernel test robot
2024-07-08 11:35 ` Will Deacon
2024-06-27 6:58 ` [PATCH 00/13] fs/dax: Fix FS DAX page reference counts Dan Williams
2024-06-27 7:15 ` Alistair Popple
2024-06-27 20:24 ` Dan Williams
2024-06-28 0:06 ` Alistair Popple
2024-07-01 4:24 ` Dave Chinner
2024-07-01 8:33 ` Alistair Popple
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=bd332b0d3971b03152b3541f97470817c5147b51.1719386613.git-series.apopple@nvidia.com \
--to=apopple@nvidia.com \
--cc=bhelgaas@google.com \
--cc=catalin.marinas@arm.com \
--cc=dan.j.williams@intel.com \
--cc=dave.hansen@linux.intel.com \
--cc=dave.jiang@intel.com \
--cc=david@fromorbit.com \
--cc=david@redhat.com \
--cc=djwong@kernel.org \
--cc=hch@lst.de \
--cc=ira.weiny@intel.com \
--cc=jack@suse.cz \
--cc=jgg@ziepe.ca \
--cc=jhubbard@nvidia.com \
--cc=linmiaohe@huawei.com \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-cxl@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-xfs@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=logang@deltatee.com \
--cc=mpe@ellerman.id.au \
--cc=npiggin@gmail.com \
--cc=nvdimm@lists.linux.dev \
--cc=peterx@redhat.com \
--cc=tytso@mit.edu \
--cc=vishal.l.verma@intel.com \
--cc=will@kernel.org \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).