[RFC PATCH 1/3] x86/lib: Refactor csum_partial_copy_generic() into a macro

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Chang S. Bae" <chang.seok.bae@intel.com>
To: linux-kernel@vger.kernel.org
Cc: x86@kernel.org, tglx@linutronix.de, mingo@redhat.com,
	bp@alien8.de, dave.hansen@linux.intel.com,
	chang.seok.bae@intel.com
Subject: [RFC PATCH 1/3] x86/lib: Refactor csum_partial_copy_generic() into a macro
Date: Mon, 24 Nov 2025 21:32:24 +0000	[thread overview]
Message-ID: <20251124213227.123779-2-chang.seok.bae@intel.com> (raw)
In-Reply-To: <20251124213227.123779-1-chang.seok.bae@intel.com>

The current assembly implementation is too rigid to support new
variants that would share most of its logic. Refactor the function body
into a reusable macro and introduce register aliases to improve
readability.

No functional change.

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
---
This series is not intended for upstream; it is only an example of how
extended GPRs can be used within the kernel.
---
 arch/x86/lib/csum-copy_64.S | 187 ++++++++++++++++++++----------------
 1 file changed, 103 insertions(+), 84 deletions(-)

diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index d9e16a2cf285..66ed849090b7 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -26,17 +26,27 @@
  * They also should align source or destination to 8 bytes.
  */
 
-	.macro source
+.macro source
 10:
 	_ASM_EXTABLE_UA(10b, .Lfault)
-	.endm
+.endm
 
-	.macro dest
+.macro dest
 20:
 	_ASM_EXTABLE_UA(20b, .Lfault)
-	.endm
+.endm
 
-SYM_FUNC_START(csum_partial_copy_generic)
+.macro restore_regs_and_ret
+	movq 0*8(%rsp), %rbx
+	movq 1*8(%rsp), %r12
+	movq 2*8(%rsp), %r14
+	movq 3*8(%rsp), %r13
+	movq 4*8(%rsp), %r15
+	addq $5*8, %rsp
+	RET
+.endm
+
+.macro	_csum_partial_copy
 	subq  $5*8, %rsp
 	movq  %rbx, 0*8(%rsp)
 	movq  %r12, 1*8(%rsp)
@@ -48,41 +58,52 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	xorl  %r9d, %r9d
 	movl  %edx, %ecx
 	cmpl  $8, %ecx
-	jb    .Lshort
+	jb    .Lshort\@
 
 	testb  $7, %sil
-	jne   .Lunaligned
-.Laligned:
-	movl  %ecx, %r12d
+	jne   .Lunaligned\@
+.Laligned\@:
+	.set  INP, %rdi		/* input pointer */
+	.set  OUTP, %rsi	/* output pointer */
+	.set  SUM, %rax		/* checksum accumulator */
+	.set  ZERO, %r9		/* zero register */
+	.set  LEN, %ecx		/* byte count */
+	.set  LEN64B, %r12d	/* 64-byte block count */
+	.set  TMP1, %rbx
+	.set  TMP2, %r8
+	.set  TMP3, %r11
+	.set  TMP4, %rdx
+	.set  TMP5, %r10
+	.set  TMP6, %r15
+	.set  TMP7, %r14
+	.set  TMP8, %r13
 
-	shrq  $6, %r12
-	jz	.Lhandle_tail       /* < 64 */
+	movl  LEN, LEN64B
+
+	shrl  $6, LEN64B
+	jz	.Lhandle_tail\@     /* < 64 */
 
 	clc
 
-	/* main loop. clear in 64 byte blocks */
-	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
-	/* r11:	temp3, rdx: temp4, r12 loopcnt */
-	/* r10:	temp5, r15: temp6, r14 temp7, r13 temp8 */
 	.p2align 4
-.Lloop:
+.Lloop\@:
 	source
-	movq  (%rdi), %rbx
+	movq  (INP), TMP1
 	source
-	movq  8(%rdi), %r8
+	movq  8(INP), TMP2
 	source
-	movq  16(%rdi), %r11
+	movq  16(INP), TMP3
 	source
-	movq  24(%rdi), %rdx
+	movq  24(INP), TMP4
 
 	source
-	movq  32(%rdi), %r10
+	movq  32(INP), TMP5
 	source
-	movq  40(%rdi), %r15
+	movq  40(INP), TMP6
 	source
-	movq  48(%rdi), %r14
+	movq  48(INP), TMP7
 	source
-	movq  56(%rdi), %r13
+	movq  56(INP), TMP8
 
 30:
 	/*
@@ -92,64 +113,64 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	_ASM_EXTABLE(30b, 2f)
 	prefetcht0 5*64(%rdi)
 2:
-	adcq  %rbx, %rax
-	adcq  %r8, %rax
-	adcq  %r11, %rax
-	adcq  %rdx, %rax
-	adcq  %r10, %rax
-	adcq  %r15, %rax
-	adcq  %r14, %rax
-	adcq  %r13, %rax
+	adcq  TMP1, SUM
+	adcq  TMP2, SUM
+	adcq  TMP3, SUM
+	adcq  TMP4, SUM
+	adcq  TMP5, SUM
+	adcq  TMP6, SUM
+	adcq  TMP7, SUM
+	adcq  TMP8, SUM
 
-	decl %r12d
+	decl LEN64B
 
 	dest
-	movq %rbx, (%rsi)
+	movq TMP1, (OUTP)
 	dest
-	movq %r8, 8(%rsi)
+	movq TMP2, 8(OUTP)
 	dest
-	movq %r11, 16(%rsi)
+	movq TMP3, 16(OUTP)
 	dest
-	movq %rdx, 24(%rsi)
+	movq TMP4, 24(OUTP)
 
 	dest
-	movq %r10, 32(%rsi)
+	movq TMP5, 32(OUTP)
 	dest
-	movq %r15, 40(%rsi)
+	movq TMP6, 40(OUTP)
 	dest
-	movq %r14, 48(%rsi)
+	movq TMP7, 48(OUTP)
 	dest
-	movq %r13, 56(%rsi)
+	movq TMP8, 56(OUTP)
 
-	leaq 64(%rdi), %rdi
-	leaq 64(%rsi), %rsi
+	leaq 64(INP), INP
+	leaq 64(OUTP), OUTP
 
-	jnz	.Lloop
+	jnz	.Lloop\@
 
-	adcq  %r9, %rax
+	adcq  ZERO, SUM
 
 	/* do last up to 56 bytes */
-.Lhandle_tail:
+.Lhandle_tail\@:
 	/* ecx:	count, rcx.63: the end result needs to be rol8 */
 	movq %rcx, %r10
 	andl $63, %ecx
 	shrl $3, %ecx
-	jz	.Lfold
+	jz	.Lfold\@
 	clc
 	.p2align 4
-.Lloop_8:
+.Lloop_8\@:
 	source
-	movq (%rdi), %rbx
-	adcq %rbx, %rax
-	decl %ecx
+	movq (INP), TMP1
+	adcq TMP1, SUM
+	decl LEN
 	dest
-	movq %rbx, (%rsi)
-	leaq 8(%rsi), %rsi /* preserve carry */
-	leaq 8(%rdi), %rdi
-	jnz	.Lloop_8
-	adcq %r9, %rax	/* add in carry */
+	movq TMP1, (OUTP)
+	leaq 8(INP), INP /* preserve carry */
+	leaq 8(OUTP), OUTP
+	jnz	.Lloop_8\@
+	adcq ZERO, SUM	/* add in carry */
 
-.Lfold:
+.Lfold\@:
 	/* reduce checksum to 32bits */
 	movl %eax, %ebx
 	shrq $32, %rax
@@ -157,17 +178,17 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	adcl %r9d, %eax
 
 	/* do last up to 6 bytes */
-.Lhandle_7:
+.Lhandle_7\@:
 	movl %r10d, %ecx
 	andl $7, %ecx
-.L1:				/* .Lshort rejoins the common path here */
+.L1\@:				/* .Lshort\@ rejoins the common path here */
 	shrl $1, %ecx
-	jz   .Lhandle_1
+	jz   .Lhandle_1\@
 	movl $2, %edx
 	xorl %ebx, %ebx
 	clc
 	.p2align 4
-.Lloop_1:
+.Lloop_1\@:
 	source
 	movw (%rdi), %bx
 	adcl %ebx, %eax
@@ -176,13 +197,13 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	movw %bx, (%rsi)
 	leaq 2(%rdi), %rdi
 	leaq 2(%rsi), %rsi
-	jnz .Lloop_1
+	jnz .Lloop_1\@
 	adcl %r9d, %eax	/* add in carry */
 
 	/* handle last odd byte */
-.Lhandle_1:
+.Lhandle_1\@:
 	testb $1, %r10b
-	jz    .Lende
+	jz    .Lende\@
 	xorl  %ebx, %ebx
 	source
 	movb (%rdi), %bl
@@ -191,24 +212,18 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	addl %ebx, %eax
 	adcl %r9d, %eax		/* carry */
 
-.Lende:
+.Lende\@:
 	testq %r10, %r10
-	js  .Lwas_odd
-.Lout:
-	movq 0*8(%rsp), %rbx
-	movq 1*8(%rsp), %r12
-	movq 2*8(%rsp), %r14
-	movq 3*8(%rsp), %r13
-	movq 4*8(%rsp), %r15
-	addq $5*8, %rsp
-	RET
-.Lshort:
+	js  .Lwas_odd\@
+.Lout\@:
+	restore_regs_and_ret
+.Lshort\@:
 	movl %ecx, %r10d
-	jmp  .L1
-.Lunaligned:
+	jmp  .L1\@
+.Lunaligned\@:
 	xorl %ebx, %ebx
 	testb $1, %sil
-	jne  .Lodd
+	jne  .Lodd\@
 1:	testb $2, %sil
 	je   2f
 	source
@@ -220,7 +235,7 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	leaq 2(%rsi), %rsi
 	addq %rbx, %rax
 2:	testb $4, %sil
-	je .Laligned
+	je .Laligned\@
 	source
 	movl (%rdi), %ebx
 	dest
@@ -229,9 +244,9 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	subq $4, %rcx
 	leaq 4(%rsi), %rsi
 	addq %rbx, %rax
-	jmp .Laligned
+	jmp .Laligned\@
 
-.Lodd:
+.Lodd\@:
 	source
 	movb (%rdi), %bl
 	dest
@@ -245,12 +260,16 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	addq %rbx, %rax
 	jmp 1b
 
-.Lwas_odd:
+.Lwas_odd\@:
 	roll $8, %eax
-	jmp .Lout
+	jmp .Lout\@
+.endm
 
 	/* Exception: just return 0 */
 .Lfault:
 	xorl %eax, %eax
-	jmp  .Lout
+	restore_regs_and_ret
+
+SYM_FUNC_START(csum_partial_copy_generic)
+	_csum_partial_copy
 SYM_FUNC_END(csum_partial_copy_generic)
-- 
2.51.0

next prev parent reply	other threads:[~2025-11-24 21:55 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-24 21:32 [DISCUSSION] x86: In-Kernel Use of Extended General-Purpose Registers Chang S. Bae
2025-11-24 21:32 ` Chang S. Bae [this message]
2025-11-24 21:32 ` [RFC PATCH 2/3] x86/lib: Convert repeated asm sequences in checksum copy into macros Chang S. Bae
2025-11-24 21:32 ` [RFC PATCH 3/3] x86/lib: Use EGPRs in 64-bit checksum copy loop Chang S. Bae
2025-11-25 10:37   ` david laight
2025-12-01 21:39     ` Chang S. Bae
2025-11-26 16:30 ` [DISCUSSION] x86: In-Kernel Use of Extended General-Purpose Registers Peter Zijlstra
2025-12-01 21:40   ` Chang S. Bae

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:d9e16a2cf28 dfblob:66ed849090b )
 OR (
bs:"[RFC PATCH 1/3] x86/lib: Refactor csum_partial_copy_generic() into a macro" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251124213227.123779-2-chang.seok.bae@intel.com \
    --to=chang.seok.bae@intel.com \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.