[RFC PATCH 2/3] x86/lib: Convert repeated asm sequences in checksum copy into macros

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Chang S. Bae" <chang.seok.bae@intel.com>
To: linux-kernel@vger.kernel.org
Cc: x86@kernel.org, tglx@linutronix.de, mingo@redhat.com,
	bp@alien8.de, dave.hansen@linux.intel.com,
	chang.seok.bae@intel.com
Subject: [RFC PATCH 2/3] x86/lib: Convert repeated asm sequences in checksum copy into macros
Date: Mon, 24 Nov 2025 21:32:25 +0000	[thread overview]
Message-ID: <20251124213227.123779-3-chang.seok.bae@intel.com> (raw)
In-Reply-To: <20251124213227.123779-1-chang.seok.bae@intel.com>

Several instruction patterns are repeated in the checksum-copy function.
Replace them with small macros to make the code more concise and readable.

No functional change.

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
---
These repetitions are related to the loop unrolling, which will be
further extended using EGPRs in the next patch.
---
 arch/x86/lib/csum-copy_64.S | 106 ++++++++++++++++--------------------
 1 file changed, 48 insertions(+), 58 deletions(-)

diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 66ed849090b7..5526bdfac041 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -46,6 +46,70 @@
 	RET
 .endm
 
+/*
+ * Prefetch the data 5*64 bytes ahead of %rdi. The exception table entry
+ * pairs the prefetch at 30b with 2f, the label right after it, so a
+ * faulting prefetch is simply skipped.
+ */
+.macro prefetch
+30:
+	/*
+	 * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
+	 * potentially unmapped kernel address.
+	 */
+	_ASM_EXTABLE(30b, 2f)
+	prefetcht0 5*64(%rdi)
+2:
+.endm
+
+/*
+ * Load consecutive quadwords from \src into the registers listed in \regs,
+ * starting at quadword index \offset. 'source' is emitted before every load,
+ * matching the open-coded sequence this replaces, and the assembly-time
+ * counter i steps through the 8-byte slots.
+ */
+.macro loadregs offset, src, regs:vararg
+	i = 0
+.irp  r, \regs
+	source
+	movq  8*(\offset + i)(\src), \r
+	i = i + 1
+.endr
+.endm
+
+/*
+ * Store the registers listed in \regs to consecutive quadwords at \dst,
+ * starting at quadword index \offset. 'dest' is emitted before every store,
+ * matching the open-coded sequence this replaces, and the assembly-time
+ * counter i steps through the 8-byte slots.
+ */
+.macro storeregs offset, dst, regs:vararg
+	i = 0
+.irp  r, \regs
+	dest
+	movq  \r, 8*(\offset + i)(\dst)
+	i = i + 1
+.endr
+.endm
+
+/*
+ * Add each register in \regs into \sum with adcq, so the carry flag chains
+ * from one addition into the next.
+ */
+.macro sumregs sum, regs:vararg
+.irp  r, \regs
+	adcq  \r, \sum
+.endr
+.endm
+
+/*
+ * Advance \ptr by \count quadwords. leaq is used because it does not modify
+ * the flags, so a pending carry from an adcq chain survives the increment.
+ */
+.macro incr ptr, count
+	leaq  8*(\count)(\ptr), \ptr
+.endm
+
 .macro	_csum_partial_copy
 	subq  $5*8, %rsp
 	movq  %rbx, 0*8(%rsp)
@@ -87,63 +124,18 @@
 
 	.p2align 4
 .Lloop\@:
-	source
-	movq  (INP), TMP1
-	source
-	movq  8(INP), TMP2
-	source
-	movq  16(INP), TMP3
-	source
-	movq  24(INP), TMP4
+	loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
 
-	source
-	movq  32(INP), TMP5
-	source
-	movq  40(INP), TMP6
-	source
-	movq  48(INP), TMP7
-	source
-	movq  56(INP), TMP8
+	prefetch
 
-30:
-	/*
-	 * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
-	 * potentially unmapped kernel address.
-	 */
-	_ASM_EXTABLE(30b, 2f)
-	prefetcht0 5*64(%rdi)
-2:
-	adcq  TMP1, SUM
-	adcq  TMP2, SUM
-	adcq  TMP3, SUM
-	adcq  TMP4, SUM
-	adcq  TMP5, SUM
-	adcq  TMP6, SUM
-	adcq  TMP7, SUM
-	adcq  TMP8, SUM
+	sumregs SUM, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
 
 	decl LEN64B
 
-	dest
-	movq TMP1, (OUTP)
-	dest
-	movq TMP2, 8(OUTP)
-	dest
-	movq TMP3, 16(OUTP)
-	dest
-	movq TMP4, 24(OUTP)
+	storeregs 0, OUTP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
 
-	dest
-	movq TMP5, 32(OUTP)
-	dest
-	movq TMP6, 40(OUTP)
-	dest
-	movq TMP7, 48(OUTP)
-	dest
-	movq TMP8, 56(OUTP)
-
-	leaq 64(INP), INP
-	leaq 64(OUTP), OUTP
+	incr INP, 8
+	incr OUTP, 8
 
 	jnz	.Lloop\@
 
@@ -159,14 +151,12 @@
 	clc
 	.p2align 4
 .Lloop_8\@:
-	source
-	movq (INP), TMP1
-	adcq TMP1, SUM
+	loadregs 0, INP, TMP1
+	sumregs SUM, TMP1
 	decl LEN
-	dest
-	movq TMP1, (OUTP)
-	leaq 8(INP), INP /* preserve carry */
-	leaq 8(OUTP), OUTP
+	storeregs 0, OUTP, TMP1
+	incr INP, 1 /* preserve carry */
+	incr OUTP, 1
 	jnz	.Lloop_8\@
 	adcq ZERO, SUM	/* add in carry */
 
-- 
2.51.0

next prev parent reply	other threads:[~2025-11-24 21:55 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-24 21:32 [DISCUSSION] x86: In-Kernel Use of Extended General-Purpose Registers Chang S. Bae
2025-11-24 21:32 ` [RFC PATCH 1/3] x86/lib: Refactor csum_partial_copy_generic() into a macro Chang S. Bae
2025-11-24 21:32 ` Chang S. Bae [this message]
2025-11-24 21:32 ` [RFC PATCH 3/3] x86/lib: Use EGPRs in 64-bit checksum copy loop Chang S. Bae
2025-11-25 10:37   ` david laight
2025-12-01 21:39     ` Chang S. Bae
2025-11-26 16:30 ` [DISCUSSION] x86: In-Kernel Use of Extended General-Purpose Registers Peter Zijlstra
2025-12-01 21:40   ` Chang S. Bae

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:66ed849090b dfblob:5526bdfac04 )
 OR (
bs:"[RFC PATCH 2/3] x86/lib: Convert repeated asm sequences in checksum copy into macros" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251124213227.123779-3-chang.seok.bae@intel.com \
    --to=chang.seok.bae@intel.com \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.