All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] Faster Arm64 __arch_copy_from_user and __arch_copy_to_user
@ 2026-03-16 12:31 Qi Xi
  2026-03-24  1:52 ` Qi Xi
  2026-05-15 17:38 ` Catalin Marinas
  0 siblings, 2 replies; 3+ messages in thread
From: Qi Xi @ 2026-03-16 12:31 UTC (permalink / raw)
  To: catalin.marinas, will
  Cc: sunnanyong, xiqi2, wangkefeng.wang, benniu, linux-arm-kernel,
	linux-kernel

Based on Ben Niu's "Faster Arm64 __arch_copy_from_user and
__arch_copy_to_user" patch [1], this implementation further optimizes
and simplifies user space copies by:

1. Limiting optimization scope to >=128-byte copies, where PAN state matters.
   For <128-byte copies, the implementation uses non-privileged
   instructions uniformly, simplifying the code and reducing maintenance
   cost.
2. Adding "arm64.nopan" cmdline support using the standard idreg-override
   framework, allowing runtime PAN disable without building separate
   CONFIG_ARM64_PAN=y/n kernels as required by Ben Niu's version.
   The implementation maintains separate paths for PAN-enabled (using
   unprivileged ldtr/sttr) and PAN-disabled (using standard ldp/stp), with
   runtime selection via ALTERNATIVE() at the large copy loop entry.
3. Retaining the critical path optimization from the original patch:
   reducing pointer update instructions through manual batch updates,
   processing 64 bytes per iteration with only one pair of add instructions.

Performance improvements measured on Kunpeng 920 with PAN disabled:

The ku_copy microbenchmark [2] (a kernel module that measures
copy_to/from_user throughput across various sizes by copying 1GB of
data in each test):
copy_to_user throughput change (positive = improvement):
128B: +0.9%   256B: +10.3%  512B: +23.3%  1024B: +38.1%
2048B: +56.2% 4096B: +68.5% 8192B: +74.8% 16384B: +79.7%
32768B: +80.7% 65536B: +81.3% 131072B: +77.3% 262144B: +77.9%
copy_from_user throughput change:
128B: +2.0%   256B: +7.5%   512B: +20.3%  1024B: +28.4%
2048B: +38.1% 4096B: +39.6% 8192B: +41.5% 16384B: +42.3%
32768B: +42.2% 65536B: +44.8% 131072B: +70.3% 262144B: +71.0%

Real-world workloads:
- RocksDB read-write mixed workload:
  Overall throughput improved by 2%.
  copy_to_user hotspot reduced from 3.3% to 2.7% of total CPU cycles.
  copy_from_user hotspot reduced from 2.25% to 0.85% of total CPU cycles.

- BRPC rdma_performance (server side, baidu_std protocol over TCP):
  copy_to_user accounts for ~11.5% of total CPU cycles.
  After optimization, server CPU utilization reduced from 64% to 62%
  (2% absolute improvement, equivalent to ~17% reduction in
  copy_to_user overhead).

[1] https://lore.kernel.org/all/20251018052237.1368504-2-benniu@meta.com/
[2] https://github.com/mcfi/benchmark/tree/main/ku_copy

Co-developed-by: Ben Niu <benniu@meta.com>
Signed-off-by: Ben Niu <benniu@meta.com>
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
Signed-off-by: Qi Xi <xiqi2@huawei.com>
---
Changes in v3:
- Limiting optimization scope to >=128-byte copies.
- Use idreg-override for PAN runtime selection with "arm64.nopan" cmdline.
---
 arch/arm64/include/asm/asm-uaccess.h  |  22 ++----
 arch/arm64/kernel/pi/idreg-override.c |   2 +
 arch/arm64/lib/copy_from_user.S       |  17 +++-
 arch/arm64/lib/copy_template.S        | 108 +++++++++++++++++++-------
 arch/arm64/lib/copy_to_user.S         |  17 +++-
 5 files changed, 114 insertions(+), 52 deletions(-)

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 9148f5a31968..198a05d478fc 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -70,27 +70,21 @@ alternative_else_nop_endif
  * This is complicated as there is no post-increment or pair versions of the
  * unprivileged instructions, and USER() only works for single instructions.
  */
-	.macro user_ldp l, reg1, reg2, addr, post_inc
-8888:		ldtr	\reg1, [\addr];
-8889:		ldtr	\reg2, [\addr, #8];
-		add	\addr, \addr, \post_inc;
+	.macro user_ldst l, inst, reg, addr, post_inc
+8888:		\inst		\reg, [\addr];
+		add		\addr, \addr, \post_inc;
 
 		_asm_extable_uaccess	8888b, \l;
-		_asm_extable_uaccess	8889b, \l;
 	.endm
 
-	.macro user_stp l, reg1, reg2, addr, post_inc
-8888:		sttr	\reg1, [\addr];
-8889:		sttr	\reg2, [\addr, #8];
-		add	\addr, \addr, \post_inc;
+	.macro user_ldst_index l, inst, reg, addr, val
+8888:		\inst		\reg, [\addr, \val];
 
-		_asm_extable_uaccess	8888b,\l;
-		_asm_extable_uaccess	8889b,\l;
+		_asm_extable_uaccess	8888b, \l;
 	.endm
 
-	.macro user_ldst l, inst, reg, addr, post_inc
-8888:		\inst		\reg, [\addr];
-		add		\addr, \addr, \post_inc;
+	.macro user_ldst_pair_index l, inst, reg1, reg2, addr, val
+8888:		\inst		\reg1, \reg2, [\addr, \val];
 
 		_asm_extable_uaccess	8888b, \l;
 	.endm
diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c
index bc57b290e5e7..ac26f1f3aad4 100644
--- a/arch/arm64/kernel/pi/idreg-override.c
+++ b/arch/arm64/kernel/pi/idreg-override.c
@@ -64,6 +64,7 @@ static const struct ftr_set_desc mmfr1 __prel64_initconst = {
 	.override	= &id_aa64mmfr1_override,
 	.fields		= {
 		FIELD("vh", ID_AA64MMFR1_EL1_VH_SHIFT, mmfr1_vh_filter),
+		FIELD("pan", ID_AA64MMFR1_EL1_PAN_SHIFT, NULL),
 		{}
 	},
 };
@@ -249,6 +250,7 @@ static const struct {
 	{ "arm64.nolva",		"id_aa64mmfr2.varange=0" },
 	{ "arm64.no32bit_el0",		"id_aa64pfr0.el0=1" },
 	{ "arm64.nompam",		"id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" },
+	{ "arm64.nopan",		"id_aa64mmfr1.pan=0" },
 };
 
 static int __init parse_hexdigit(const char *p, u64 *v)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 400057d607ec..1f578c4d0ae6 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -44,12 +44,21 @@
 	str \reg, [\ptr], \val
 	.endm
 
-	.macro ldp1 reg1, reg2, ptr, val
-	user_ldp 9997f, \reg1, \reg2, \ptr, \val
+	.macro ldp_unpriv reg1, reg2, ptr, val
+	user_ldst_index 9997f, ldtr, \reg1, \ptr, \val
+	user_ldst_index 9997f, ldtr, \reg2, \ptr, \val + 8
 	.endm
 
-	.macro stp1 reg1, reg2, ptr, val
-	stp \reg1, \reg2, [\ptr], \val
+	.macro stp_unpriv reg1, reg2, ptr, val
+	stp \reg1, \reg2, [\ptr, \val]
+	.endm
+
+	.macro ldp_priv reg1, reg2, ptr, val
+	user_ldst_pair_index 9997f, ldp, \reg1, \reg2, \ptr, \val
+	.endm
+
+	.macro stp_priv reg1, reg2, ptr, val
+	stp \reg1, \reg2, [\ptr, \val]
 	.endm
 
 	.macro cpy1 dst, src, count
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 7f2f5a0e2fb9..5ef6dc9bf7d8 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -97,14 +97,20 @@ alternative_else_nop_endif
 	cmp	tmp1w, #0x20
 	b.eq	1f
 	b.lt	2f
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
+	ldp_unpriv	A_l, A_h, src, #0
+	stp_unpriv	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
 1:
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
+	ldp_unpriv	A_l, A_h, src, #0
+	stp_unpriv	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
 2:
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
+	ldp_unpriv	A_l, A_h, src, #0
+	stp_unpriv	A_l, A_h, dst, #0
+	add	src, src, #16
+	add	dst, dst, #16
 .Ltiny15:
 	/*
 	* Prefer to break one ldp/stp into several load/store to access
@@ -142,14 +148,16 @@ alternative_else_nop_endif
 	* Less than 128 bytes to copy, so handle 64 here and then jump
 	* to the tail.
 	*/
-	ldp1	A_l, A_h, src, #16
-	stp1	A_l, A_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	D_l, D_h, src, #16
-	stp1	D_l, D_h, dst, #16
+	ldp_unpriv	A_l, A_h, src, #0
+	stp_unpriv	A_l, A_h, dst, #0
+	ldp_unpriv	B_l, B_h, src, #16
+	ldp_unpriv	C_l, C_h, src, #32
+	stp_unpriv	B_l, B_h, dst, #16
+	stp_unpriv	C_l, C_h, dst, #32
+	ldp_unpriv	D_l, D_h, src, #48
+	stp_unpriv	D_l, D_h, dst, #48
+	add	src, src, #64
+	add	dst, dst, #64
 
 	tst	count, #0x3f
 	b.ne	.Ltail63
@@ -161,30 +169,70 @@ alternative_else_nop_endif
 	*/
 	.p2align	L1_CACHE_SHIFT
 .Lcpy_body_large:
+	/* Runtime PAN decision for large copies */
+	ALTERNATIVE("b .Llarge_pan_disabled", "b .Llarge_pan_enabled", ARM64_HAS_PAN)
+
+.Llarge_pan_enabled:
+	/* PAN enabled version - use unprivileged loads (ldp_unpriv) */
 	/* pre-get 64 bytes data. */
-	ldp1	A_l, A_h, src, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	ldp1	D_l, D_h, src, #16
+	ldp_unpriv	A_l, A_h, src, #0
+	ldp_unpriv	B_l, B_h, src, #16
+	ldp_unpriv	C_l, C_h, src, #32
+	ldp_unpriv	D_l, D_h, src, #48
+	add	src, src, #64
+1:
+	/*
+	* interlace the load of next 64 bytes data block with store of the last
+	* loaded 64 bytes data.
+	*/
+	stp_unpriv	A_l, A_h, dst, #0
+	ldp_unpriv	A_l, A_h, src, #0
+	stp_unpriv	B_l, B_h, dst, #16
+	ldp_unpriv	B_l, B_h, src, #16
+	stp_unpriv	C_l, C_h, dst, #32
+	ldp_unpriv	C_l, C_h, src, #32
+	stp_unpriv	D_l, D_h, dst, #48
+	ldp_unpriv	D_l, D_h, src, #48
+	add	dst, dst, #64
+	add	src, src, #64
+	subs	count, count, #64
+	b.ge	1b
+	b	.Llarge_done
+
+.Llarge_pan_disabled:
+	/* PAN disabled version - use normal loads without post-increment */
+	/* pre-get 64 bytes data using normal loads */
+	ldp_priv	A_l, A_h, src, #0
+	ldp_priv	B_l, B_h, src, #16
+	ldp_priv	C_l, C_h, src, #32
+	ldp_priv	D_l, D_h, src, #48
+	add	src, src, #64
 1:
 	/*
 	* interlace the load of next 64 bytes data block with store of the last
 	* loaded 64 bytes data.
 	*/
-	stp1	A_l, A_h, dst, #16
-	ldp1	A_l, A_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	D_l, D_h, dst, #16
-	ldp1	D_l, D_h, src, #16
+	stp_priv	A_l, A_h, dst, #0
+	ldp_priv	A_l, A_h, src, #0
+	stp_priv	B_l, B_h, dst, #16
+	ldp_priv	B_l, B_h, src, #16
+	stp_priv	C_l, C_h, dst, #32
+	ldp_priv	C_l, C_h, src, #32
+	stp_priv	D_l, D_h, dst, #48
+	ldp_priv	D_l, D_h, src, #48
+	add	dst, dst, #64
+	add	src, src, #64
 	subs	count, count, #64
 	b.ge	1b
-	stp1	A_l, A_h, dst, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	stp1	D_l, D_h, dst, #16
+
+.Llarge_done:
+	/* Post-loop: store the last block of data using stp_unpriv */
+	/* (without post-increment) */
+	stp_unpriv	A_l, A_h, dst, #0
+	stp_unpriv	B_l, B_h, dst, #16
+	stp_unpriv	C_l, C_h, dst, #32
+	stp_unpriv	D_l, D_h, dst, #48
+	add	dst, dst, #64
 
 	tst	count, #0x3f
 	b.ne	.Ltail63
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 819f2e3fc7a9..9738ae96c823 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -43,12 +43,21 @@
 	user_ldst 9997f, sttr, \reg, \ptr, \val
 	.endm
 
-	.macro ldp1 reg1, reg2, ptr, val
-	ldp \reg1, \reg2, [\ptr], \val
+	.macro ldp_unpriv reg1, reg2, ptr, val
+	ldp \reg1, \reg2, [\ptr, \val]
 	.endm
 
-	.macro stp1 reg1, reg2, ptr, val
-	user_stp 9997f, \reg1, \reg2, \ptr, \val
+	.macro stp_unpriv reg1, reg2, ptr, val
+	user_ldst_index 9997f, sttr, \reg1, \ptr, \val
+	user_ldst_index 9997f, sttr, \reg2, \ptr, \val + 8
+	.endm
+
+	.macro ldp_priv reg1, reg2, ptr, val
+	ldp \reg1, \reg2, [\ptr, \val]
+	.endm
+
+	.macro stp_priv reg1, reg2, ptr, val
+	user_ldst_pair_index 9997f, stp, \reg1, \reg2, \ptr, \val
 	.endm
 
 	.macro cpy1 dst, src, count
-- 
2.33.0



^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-05-15 17:38 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-16 12:31 [PATCH v3] Faster Arm64 __arch_copy_from_user and __arch_copy_to_user Qi Xi
2026-03-24  1:52 ` Qi Xi
2026-05-15 17:38 ` Catalin Marinas

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.