All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrew Morton <akpm@linux-foundation.org>
To: mm-commits@vger.kernel.org, will@kernel.org, osalvador@suse.de,
	npiggin@gmail.com, muchun.song@linux.dev, mpe@ellerman.id.au,
	mike.kravetz@oracle.com, joao.m.martins@oracle.com,
	dan.j.williams@intel.com, christophe.leroy@csgroup.eu,
	catalin.marinas@arm.com, aneesh.kumar@linux.ibm.com,
	akpm@linux-foundation.org
Subject: [merged mm-stable] powerpc-book3s64-radix-add-support-for-vmemmap-optimization-for-radix.patch removed from -mm tree
Date: Fri, 11 Aug 2023 16:03:23 -0700	[thread overview]
Message-ID: <20230811230323.796CDC433C7@smtp.kernel.org> (raw)


The quilt patch titled
     Subject: powerpc/book3s64/radix: add support for vmemmap optimization for radix
has been removed from the -mm tree.  Its filename was
     powerpc-book3s64-radix-add-support-for-vmemmap-optimization-for-radix.patch

This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

------------------------------------------------------
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Subject: powerpc/book3s64/radix: add support for vmemmap optimization for radix
Date: Tue, 25 Jul 2023 00:37:57 +0530

With 2M PMD-level mapping, we require 32 struct pages and a single vmemmap
page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)).  Hence
with 64K page size, we don't use vmemmap deduplication for PMD-level
mapping.

[aneesh.kumar@linux.ibm.com: ppc64: don't include radix headers if CONFIG_PPC_RADIX_MMU=n]
  Link: https://lkml.kernel.org/r/87zg3jw8km.fsf@linux.ibm.com
Link: https://lkml.kernel.org/r/20230724190759.483013-12-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 Documentation/mm/vmemmap_dedup.rst         |    1 
 Documentation/powerpc/index.rst            |    1 
 Documentation/powerpc/vmemmap_dedup.rst    |  101 +++++++++
 arch/powerpc/Kconfig                       |    1 
 arch/powerpc/include/asm/book3s/64/radix.h |   11 +
 arch/powerpc/mm/book3s64/radix_pgtable.c   |  203 +++++++++++++++++++
 6 files changed, 318 insertions(+)

--- a/arch/powerpc/include/asm/book3s/64/radix.h~powerpc-book3s64-radix-add-support-for-vmemmap-optimization-for-radix
+++ a/arch/powerpc/include/asm/book3s/64/radix.h
@@ -326,6 +326,7 @@ static inline pud_t radix__pud_mkdevmap(
 }
 
 struct vmem_altmap;
+struct dev_pagemap;
 extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
 					     unsigned long page_size,
 					     unsigned long phys);
@@ -363,5 +364,15 @@ int radix__remove_section_mapping(unsign
 
 void radix__kernel_map_pages(struct page *page, int numpages, int enable);
 
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+#define vmemmap_can_optimize vmemmap_can_optimize
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
+#endif
+
+#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+					      unsigned long start,
+					      unsigned long end, int node,
+					      struct dev_pagemap *pgmap);
 #endif /* __ASSEMBLY__ */
 #endif
--- a/arch/powerpc/Kconfig~powerpc-book3s64-radix-add-support-for-vmemmap-optimization-for-radix
+++ a/arch/powerpc/Kconfig
@@ -174,6 +174,7 @@ config PPC
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select ARCH_WANT_LD_ORPHAN_WARN
+	select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP	if PPC_RADIX_MMU
 	select ARCH_WANTS_MODULES_DATA_IN_VMALLOC	if PPC_BOOK3S_32 || PPC_8xx
 	select ARCH_WEAK_RELEASE_ACQUIRE
 	select BINFMT_ELF
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c~powerpc-book3s64-radix-add-support-for-vmemmap-optimization-for-radix
+++ a/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -986,6 +986,15 @@ int __meminit radix__vmemmap_create_mapp
 	return 0;
 }
 
+/* With radix enabled, defer to __vmemmap_can_optimize(); otherwise refuse. */
+bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+{
+	if (radix_enabled())
+		return __vmemmap_can_optimize(altmap, pgmap);
+
+	return false;
+}
+
 int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
 				unsigned long addr, unsigned long next)
 {
@@ -1193,6 +1202,200 @@ base_mapping:
 	return 0;
 }
 
+static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
+							 struct vmem_altmap *altmap,
+							 struct page *reuse)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = pgd_offset_k(addr);
+	p4d = p4d_offset(pgd, addr);
+	pud = vmemmap_pud_alloc(p4d, node, addr);
+	if (!pud)
+		return NULL;
+	pmd = vmemmap_pmd_alloc(pud, node, addr);
+	if (!pmd)
+		return NULL;
+	if (pmd_leaf(*pmd))
+		/*
+		 * addr is already covered by a huge (leaf pmd) mapping, so there is
+		 * no pte to populate here. Let the caller decide how to proceed.
+		 */
+		return NULL;
+	pte = vmemmap_pte_alloc(pmd, node, addr);
+	if (!pte)
+		return NULL;
+	radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+	return pte;
+}
+
+static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
+						    unsigned long pfn_offset, int node)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long map_addr;
+
+	/* the second vmemmap page, which we reuse for deduplication */
+	map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
+	pgd = pgd_offset_k(map_addr);
+	p4d = p4d_offset(pgd, map_addr);
+	pud = vmemmap_pud_alloc(p4d, node, map_addr);
+	if (!pud)
+		return NULL;
+	pmd = vmemmap_pmd_alloc(pud, node, map_addr);
+	if (!pmd)
+		return NULL;
+	if (pmd_leaf(*pmd))
+		/*
+		 * The second page is mapped as a hugepage due to a nearby request.
+		 * Force our mapping to page size without deduplication.
+		 */
+		return NULL;
+	pte = vmemmap_pte_alloc(pmd, node, map_addr);
+	if (!pte)
+		return NULL;
+	/*
+	 * Check whether the second vmemmap page is already mapped.
+	 */
+	if (pte_none(*pte)) {
+		/*
+		 * Populate the vmemmap page for the head page.
+		 * It can fall in a different pmd, hence
+		 * radix__vmemmap_populate_address().
+		 */
+		pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
+		if (!pte)
+			return NULL;
+		/*
+		 * Populate the vmemmap page for the tail pages.
+		 */
+		pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
+		if (!pte)
+			return NULL;
+		vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
+		return pte;
+	}
+	return pte;
+}
+
+int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+					      unsigned long start,
+					      unsigned long end, int node,
+					      struct dev_pagemap *pgmap)
+{
+	/*
+	 * We want to map things with base page size mappings so that
+	 * we can save space in vmemmap. Huge mappings may already
+	 * cover either edge of the range.
+	 */
+	unsigned long addr;
+	unsigned long addr_pfn = start_pfn;
+	unsigned long next;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	for (addr = start; addr < end; addr = next) {
+		/* NOTE(review): addr_pfn advances by 1 per PAGE_SIZE of addr below; confirm intended */
+		pgd = pgd_offset_k(addr);
+		p4d = p4d_offset(pgd, addr);
+		pud = vmemmap_pud_alloc(p4d, node, addr);
+		if (!pud)
+			return -ENOMEM;
+		pmd = vmemmap_pmd_alloc(pud, node, addr);
+		if (!pmd)
+			return -ENOMEM;
+
+		if (pmd_leaf(READ_ONCE(*pmd))) {
+			/* Existing huge mapping: skip to the end of this pmd range */
+			addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
+			next = pmd_addr_end(addr, end);
+			continue;
+		}
+		pte = vmemmap_pte_alloc(pmd, node, addr);
+		if (!pte)
+			return -ENOMEM;
+		if (!pte_none(*pte)) {
+			/*
+			 * This could be because we already have a compound
+			 * page whose VMEMMAP_RESERVE_NR pages were mapped and
+			 * this request falls in those pages.
+			 */
+			addr_pfn += 1;
+			next = addr + PAGE_SIZE;
+			continue;
+		} else {
+			unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+			unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
+			pte_t *tail_page_pte;
+
+			/*
+			 * If addr_pfn is aligned to nr_pages (pfn_offset == 0),
+			 * this is the head mapping.
+			 */
+			if (pfn_offset == 0) {
+				/* Populate the vmemmap page for the head page */
+				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+				if (!pte)
+					return -ENOMEM;
+				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+				/*
+				 * Populate the vmemmap page for the tail pages.
+				 * It can fall in a different pmd, hence
+				 * radix__vmemmap_populate_address().
+				 */
+				pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
+				if (!pte)
+					return -ENOMEM;
+
+				addr_pfn += 2;
+				next = addr + 2 * PAGE_SIZE;
+				continue;
+			}
+			/*
+			 * Get the mapping of the second vmemmap page,
+			 * creating it if it doesn't exist yet.
+			 */
+			tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
+			if (!tail_page_pte) {
+				/* No shared tail page available: map addr without deduplication */
+				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
+				if (!pte)
+					return -ENOMEM;
+				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+				addr_pfn += 1;
+				next = addr + PAGE_SIZE;
+				continue;
+			}
+
+			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
+			if (!pte)
+				return -ENOMEM;
+			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+			addr_pfn += 1;
+			next = addr + PAGE_SIZE;
+			continue;
+		}
+	}
+	return 0;
+}
+
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
 {
--- a/Documentation/mm/vmemmap_dedup.rst~powerpc-book3s64-radix-add-support-for-vmemmap-optimization-for-radix
+++ a/Documentation/mm/vmemmap_dedup.rst
@@ -210,6 +210,7 @@ the device (altmap).
 
 The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
 PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+For powerpc equivalent details see Documentation/powerpc/vmemmap_dedup.rst
 
 The differences with HugeTLB are relatively minor.
 
--- a/Documentation/powerpc/index.rst~powerpc-book3s64-radix-add-support-for-vmemmap-optimization-for-radix
+++ a/Documentation/powerpc/index.rst
@@ -36,6 +36,7 @@ powerpc
     ultravisor
     vas-api
     vcpudispatch_stats
+    vmemmap_dedup
 
     features
 
--- /dev/null
+++ a/Documentation/powerpc/vmemmap_dedup.rst
@@ -0,0 +1,101 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Device DAX
+==========
+
+The device-dax interface uses the tail deduplication technique explained in
+Documentation/mm/vmemmap_dedup.rst
+
+On powerpc, vmemmap deduplication is only used with radix MMU translation. Also
+with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap
+deduplication.
+
+With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap
+page can contain 1024 struct pages (64K/sizeof(struct page)). They all fit in a
+single vmemmap page, hence there is no vmemmap deduplication possible.
+
+With 1G PUD level mapping, we require 16384 struct pages and a single 64K
+vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we
+require 16 64K pages in vmemmap to map the struct page for 1G PUD level mapping.
+Here's how things look on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                   | | | | |
+ |           |                     |     3     | ------------------+ | | | |
+ |           |                     +-----------+                     | | | |
+ |           |                     |     4     | --------------------+ | | |
+ |    PUD    |                     +-----------+                       | | |
+ |   level   |                     |     .     | ----------------------+ | |
+ |  mapping  |                     +-----------+                         | |
+ |           |                     |     .     | ------------------------+ |
+ |           |                     +-----------+                           |
+ |           |                     |     15    | --------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
+
+
+With 4K page size, 2M PMD level mapping requires 512 struct pages and a single
+4K vmemmap page contains 64 struct pages (4K/sizeof(struct page)). Hence we
+require 8 4K pages in vmemmap to map the struct pages for 2M PMD level mapping.
+
+Here's how things look on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                   | | | | |
+ |           |                     |     3     | ------------------+ | | | |
+ |           |                     +-----------+                     | | | |
+ |           |                     |     4     | --------------------+ | | |
+ |    PMD    |                     +-----------+                       | | |
+ |   level   |                     |     5     | ----------------------+ | |
+ |  mapping  |                     +-----------+                         | |
+ |           |                     |     6     | ------------------------+ |
+ |           |                     +-----------+                           |
+ |           |                     |     7     | --------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
+
+With 1G PUD level mapping, we require 262144 struct pages and a single 4K
+vmemmap page can contain 64 struct pages (4K/sizeof(struct page)). Hence we
+require 4096 4K pages in vmemmap to map the struct pages for 1G PUD level
+mapping.
+
+Here's how things look on device-dax after the sections are populated::
+
+ +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
+ |           |                     |     0     | -------------> |     0     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     1     | -------------> |     1     |
+ |           |                     +-----------+                +-----------+
+ |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
+ |           |                     +-----------+                   | | | | |
+ |           |                     |     3     | ------------------+ | | | |
+ |           |                     +-----------+                     | | | |
+ |           |                     |     4     | --------------------+ | | |
+ |    PUD    |                     +-----------+                       | | |
+ |   level   |                     |     .     | ----------------------+ | |
+ |  mapping  |                     +-----------+                         | |
+ |           |                     |     .     | ------------------------+ |
+ |           |                     +-----------+                           |
+ |           |                     |   4095    | --------------------------+
+ |           |                     +-----------+
+ |           |
+ |           |
+ |           |
+ +-----------+
_

Patches currently in -mm which might be from aneesh.kumar@linux.ibm.com are

mm-memory_hotplug-simplify-arch_mhp_memmap_on_memory_enable-kconfig.patch
mm-memory_hotplug-allow-memmap-on-memory-hotplug-request-to-fallback.patch
mm-memory_hotplug-allow-architecture-to-override-memmap-on-memory-support-check.patch
mm-memory_hotplug-support-memmap_on_memory-when-memmap-is-not-aligned-to-pageblocks.patch
powerpc-book3s64-memhotplug-enable-memmap-on-memory-for-radix.patch
mm-memory_hotplug-embed-vmem_altmap-details-in-memory-block.patch


                 reply	other threads:[~2023-08-11 23:06 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230811230323.796CDC433C7@smtp.kernel.org \
    --to=akpm@linux-foundation.org \
    --cc=aneesh.kumar@linux.ibm.com \
    --cc=catalin.marinas@arm.com \
    --cc=christophe.leroy@csgroup.eu \
    --cc=dan.j.williams@intel.com \
    --cc=joao.m.martins@oracle.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mike.kravetz@oracle.com \
    --cc=mm-commits@vger.kernel.org \
    --cc=mpe@ellerman.id.au \
    --cc=muchun.song@linux.dev \
    --cc=npiggin@gmail.com \
    --cc=osalvador@suse.de \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.