public inbox for linux-arm-kernel@lists.infradead.org
 help / color / mirror / Atom feed
* [PATCH] arm64: Implement clear_pages()
@ 2026-03-03 10:06 Linus Walleij
  2026-03-03 14:46 ` Will Deacon
  0 siblings, 1 reply; 6+ messages in thread
From: Linus Walleij @ 2026-03-03 10:06 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton,
	Joey Gouly, Suzuki K Poulose, Zenghui Yu, Ryan Roberts,
	Ankur Arora, David Hildenbrand
  Cc: linux-arm-kernel, kvmarm, James Clark, Linus Walleij

A recent patch introduced clear_pages() and made it possible to
provide assembly optimizations for it, as is done for clear_page().

This augments the existing clear_page() optimization in arm64
to accept any number of pages in the following way:

- Make clear_page() a static inline special case of clear_pages()

- Implement clear_pages() as a static inline that just calculates
  the total number of bytes in the page set and passes this number
  to the assembly routine clear_pages_asm.

- The old clear_page assembly is rewritten as clear_pages_asm,
  which takes a page-aligned start address and a number of bytes
  (a multiple of PAGE_SIZE) to clear from that address.

This is similar to the optimization provided for x86.

Performance improvements:

The baseline is the current v7.0-rc1 which calls the existing
clear_page() assembly optimization in a loop, see <linux/mm.h>.
Any improvements come from avoiding the outer loop. In most cases
the clearing will be linear and the savings will be small and
only noticeable on really big clearing operations.

We boot the kernel with a command line like this:
"default_hugepagesz=1G hugepagesz=1G hugepages=32" to make sure
we have ample hugepages. This was then tested with the same
perf command as the original series:

perf bench mem mmap -p 1GB -f demand -s 32GB -l 5

The first run was discarded as the memory hierarchy is cold at
that point. I then ran the above command 5 times and averaged
the throughput, which shows a small but consistent improvement:

On QEMU:

Before this patch:     After this patch:
2.38 GB/s              2.41 GB/s

On Radxa Orion O6 hardware we see this on *some* cores and no
change on others:

Before this patch:     After this patch:
43.3 GB/s              45.3 GB/s

There is a small but consistent improvement in throughput, as
expected.

Tested-by: James Clark <james.clark2@arm.com>
Signed-off-by: Linus Walleij <linusw@kernel.org>
---
 arch/arm64/include/asm/page.h                  | 13 ++++++++++++-
 arch/arm64/kernel/image-vars.h                 |  2 +-
 arch/arm64/kvm/hyp/nvhe/Makefile               |  2 +-
 arch/arm64/lib/Makefile                        |  2 +-
 arch/arm64/lib/{clear_page.S => clear_pages.S} | 18 +++++++++---------
 5 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index b39cc1127e1f..916a3e7c9a19 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -20,7 +20,18 @@ struct page;
 struct vm_area_struct;
 
 extern void copy_page(void *to, const void *from);
-extern void clear_page(void *to);
+extern void clear_pages_asm(void *addr, unsigned int nbytes);
+
+static inline void clear_pages(void *addr, unsigned int npages)
+{
+	clear_pages_asm(addr, npages * PAGE_SIZE);
+}
+#define clear_pages clear_pages
+
+static inline void clear_page(void *addr)
+{
+	clear_pages(addr, 1);
+}
 
 void copy_user_highpage(struct page *to, struct page *from,
 			unsigned long vaddr, struct vm_area_struct *vma);
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index d7b0d12b1015..61232f9e1e68 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -117,7 +117,7 @@ KVM_NVHE_ALIAS(__start___kvm_ex_table);
 KVM_NVHE_ALIAS(__stop___kvm_ex_table);
 
 /* Position-independent library routines */
-KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page);
+KVM_NVHE_ALIAS_HYP(clear_pages, __pi_clear_pages);
 KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page);
 KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy);
 KVM_NVHE_ALIAS_HYP(memset, __pi_memset);
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index a244ec25f8c5..f857dac82a88 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -17,7 +17,7 @@ ccflags-y += -fno-stack-protector	\
 hostprogs := gen-hyprel
 HOST_EXTRACFLAGS += -I$(objtree)/include
 
-lib-objs := clear_page.o copy_page.o memcpy.o memset.o
+lib-objs := clear_pages.o copy_page.o memcpy.o memset.o
 lib-objs := $(addprefix ../../../lib/, $(lib-objs))
 
 CFLAGS_switch.nvhe.o += -Wno-override-init
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 633e5223d944..86995e2e0807 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   copy_to_user.o copy_page.o				\
-		   clear_page.o csum.o insn.o memchr.o memcpy.o		\
+		   clear_pages.o csum.o insn.o memchr.o memcpy.o	\
 		   memset.o memcmp.o strcmp.o strncmp.o strlen.o	\
 		   strnlen.o strchr.o strrchr.o tishift.o
 
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_pages.S
similarity index 70%
rename from arch/arm64/lib/clear_page.S
rename to arch/arm64/lib/clear_pages.S
index bd6f7d5eb6eb..2d3043c13791 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_pages.S
@@ -12,22 +12,22 @@
  * Clear page @dest
  *
  * Parameters:
- *	x0 - dest
+ *	x0 - dest - should be start of a page
+ *	x1 - number of bytes to clear, should be a multiple of PAGE_SIZE
  */
-SYM_FUNC_START(__pi_clear_page)
+SYM_FUNC_START(__pi_clear_pages)
 #ifdef CONFIG_AS_HAS_MOPS
 	.arch_extension mops
 alternative_if_not ARM64_HAS_MOPS
 	b	.Lno_mops
 alternative_else_nop_endif
-
-	mov	x1, #PAGE_SIZE
 	setpn	[x0]!, x1!, xzr
 	setmn	[x0]!, x1!, xzr
 	seten	[x0]!, x1!, xzr
 	ret
 .Lno_mops:
 #endif
+	add	x4, x0, x1	/* Find the end */
 	mrs	x1, dczid_el0
 	tbnz	x1, #4, 2f	/* Branch if DC ZVA is prohibited */
 	and	w1, w1, #0xf
@@ -36,7 +36,7 @@ alternative_else_nop_endif
 
 1:	dc	zva, x0
 	add	x0, x0, x1
-	tst	x0, #(PAGE_SIZE - 1)
+	cmp	x0, x4
 	b.ne	1b
 	ret
 
@@ -45,9 +45,9 @@ alternative_else_nop_endif
 	stnp	xzr, xzr, [x0, #32]
 	stnp	xzr, xzr, [x0, #48]
 	add	x0, x0, #64
-	tst	x0, #(PAGE_SIZE - 1)
+	cmp	x0, x4
 	b.ne	2b
 	ret
-SYM_FUNC_END(__pi_clear_page)
-SYM_FUNC_ALIAS(clear_page, __pi_clear_page)
-EXPORT_SYMBOL(clear_page)
+SYM_FUNC_END(__pi_clear_pages)
+SYM_FUNC_ALIAS(clear_pages_asm, __pi_clear_pages)
+EXPORT_SYMBOL(clear_pages_asm)

---
base-commit: dbe60c40b86ec4a1168552398b3b64c14c38b2d7
change-id: 20260212-aarch64-clear-pages-a439c2c552bb

Best regards,
-- 
Linus Walleij <linusw@kernel.org>



^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2026-03-04  8:49 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-03 10:06 [PATCH] arm64: Implement clear_pages() Linus Walleij
2026-03-03 14:46 ` Will Deacon
2026-03-03 15:45   ` Catalin Marinas
2026-03-04  0:39   ` Linus Walleij
2026-03-04  8:05     ` Ankur Arora
2026-03-04  8:49       ` Catalin Marinas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox