From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CBD92FD7D8
	for <linux-kernel@vger.kernel.org>; Mon, 24 Nov 2025 21:55:35 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1764021337; cv=none; b=LBYBvwvcFL3065N4MMlOi7djAk3nEWccqx3stR+cFOoWm6Zda+CgHyIohtXmPpKbn5EkfGAR864W8s5dap8XS+yqJfFxJFROrMYY9ur6SpcnKjJzK8vUPrwE5QnxOaXimTTSFe+fd6lvqpiRjH8wRyIE1dtkNjngkupKGHVqjQc=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1764021337; c=relaxed/simple;
	bh=mP88kNlNQz94ZT8UHZ3MO6Bg/wDFm4P/C9KI6XufkyA=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version; b=H0zysNTSyNH3u4OGTKiE8o44CmX/LS666iSwqm4KacZOTTtIBZzZgd1Uv8r+uNMey0gO+MS/ySusKyNTZEohp0NlCHqNrBIU3B9NwB986rHpda2DnNRm9T72FvSjnY7alnwqdU/8GHfZZag+U21Cl9Kakfg/+1nnn+rG04eRDqI=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=DDSldcn/; arc=none smtp.client-ip=198.175.65.17
Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com
Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="DDSldcn/"
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple;
  d=intel.com; i=@intel.com; q=dns/txt; s=Intel;
  t=1764021335; x=1795557335;
  h=from:to:cc:subject:date:message-id:in-reply-to:
   references:mime-version:content-transfer-encoding;
  bh=mP88kNlNQz94ZT8UHZ3MO6Bg/wDFm4P/C9KI6XufkyA=;
  b=DDSldcn/vc8dR1Cqsi0qtbf7DgxMIgs6AMtqlT9wpurYuGLiKXMYAv/T
   3E6l3v91XlJh8YMdMuFXJ9/K0Xd2kF9JNyL+Pg29Xykpv4cDniMgsKI4R
   zAQWr2xUXxcZDV8k/FQhQMGw++I+zgEiAFEZVhCq7TMOAmqwEDEMfdpYr
   k6rSq/XCqH2waqjMp55V99Jfq0Y9EL7zmf2lSmr8xFP7SMd38ud09wp7y
   xyL9pberWRZFesWCxEuSlq/qavAotC8Qys3LT27HEs+hrEJ75A/sAAiKm
   xwF3n6L5+I4KJyEZ8hW1caK/LCTmnVPbjlYJZH1rG20AboNjdIetRCnro
   Q==;
X-CSE-ConnectionGUID: jDZ76dMoQpmUlZI1pEqghw==
X-CSE-MsgGUID: dCOVk1UGRVGXa6I8QJp3wQ==
X-IronPort-AV: E=McAfee;i="6800,10657,11623"; a="65985343"
X-IronPort-AV: E=Sophos;i="6.20,223,1758610800"; 
   d="scan'208";a="65985343"
Received: from fmviesa003.fm.intel.com ([10.60.135.143])
  by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 24 Nov 2025 13:55:35 -0800
X-CSE-ConnectionGUID: pgUMWdvXROuIXqnMOhGxMw==
X-CSE-MsgGUID: SVwuYcrxSDCZuN82qjB1/Q==
X-ExtLoop1: 1
Received: from chang-linux-3.sc.intel.com (HELO chang-linux-3) ([172.25.66.172])
  by fmviesa003.fm.intel.com with ESMTP; 24 Nov 2025 13:55:34 -0800
From: "Chang S. Bae" <chang.seok.bae@intel.com>
To: linux-kernel@vger.kernel.org
Cc: x86@kernel.org,
	tglx@linutronix.de,
	mingo@redhat.com,
	bp@alien8.de,
	dave.hansen@linux.intel.com,
	chang.seok.bae@intel.com
Subject: [RFC PATCH 1/3] x86/lib: Refactor csum_partial_copy_generic() into a macro
Date: Mon, 24 Nov 2025 21:32:24 +0000
Message-ID: <20251124213227.123779-2-chang.seok.bae@intel.com>
X-Mailer: git-send-email 2.51.0
In-Reply-To: <20251124213227.123779-1-chang.seok.bae@intel.com>
References: <20251124213227.123779-1-chang.seok.bae@intel.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

The current assembly implementation is too rigid to support new
variants that share most of the logic. Refactor the function body into a
reusable macro, with register aliasing to improve readability.

No functional change.

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
---
This series is not intended for upstream; it is just an example of how
extended GPRs can be used within the kernel.
---
 arch/x86/lib/csum-copy_64.S | 187 ++++++++++++++++++++----------------
 1 file changed, 103 insertions(+), 84 deletions(-)

diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index d9e16a2cf285..66ed849090b7 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -26,17 +26,27 @@
  * They also should align source or destination to 8 bytes.
  */
 
-	.macro source
+.macro source
 10:
 	_ASM_EXTABLE_UA(10b, .Lfault)
-	.endm
+.endm
 
-	.macro dest
+.macro dest
 20:
 	_ASM_EXTABLE_UA(20b, .Lfault)
-	.endm
+.endm
 
-SYM_FUNC_START(csum_partial_copy_generic)
+.macro restore_regs_and_ret
+	movq 0*8(%rsp), %rbx
+	movq 1*8(%rsp), %r12
+	movq 2*8(%rsp), %r14
+	movq 3*8(%rsp), %r13
+	movq 4*8(%rsp), %r15
+	addq $5*8, %rsp
+	RET
+.endm
+
+.macro	_csum_partial_copy
 	subq  $5*8, %rsp
 	movq  %rbx, 0*8(%rsp)
 	movq  %r12, 1*8(%rsp)
@@ -48,41 +58,52 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	xorl  %r9d, %r9d
 	movl  %edx, %ecx
 	cmpl  $8, %ecx
-	jb    .Lshort
+	jb    .Lshort\@
 
 	testb  $7, %sil
-	jne   .Lunaligned
-.Laligned:
-	movl  %ecx, %r12d
+	jne   .Lunaligned\@
+.Laligned\@:
+	.set  INP, %rdi		/* input pointer */
+	.set  OUTP, %rsi	/* output pointer */
+	.set  SUM, %rax		/* checksum accumulator */
+	.set  ZERO, %r9		/* zero register */
+	.set  LEN, %ecx		/* byte count */
+	.set  LEN64B, %r12d	/* 64-byte block count */
+	.set  TMP1, %rbx
+	.set  TMP2, %r8
+	.set  TMP3, %r11
+	.set  TMP4, %rdx
+	.set  TMP5, %r10
+	.set  TMP6, %r15
+	.set  TMP7, %r14
+	.set  TMP8, %r13
 
-	shrq  $6, %r12
-	jz	.Lhandle_tail       /* < 64 */
+	movl  LEN, LEN64B
+
+	shrl  $6, LEN64B
+	jz	.Lhandle_tail\@     /* < 64 */
 
 	clc
 
-	/* main loop. clear in 64 byte blocks */
-	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
-	/* r11:	temp3, rdx: temp4, r12 loopcnt */
-	/* r10:	temp5, r15: temp6, r14 temp7, r13 temp8 */
 	.p2align 4
-.Lloop:
+.Lloop\@:
 	source
-	movq  (%rdi), %rbx
+	movq  (INP), TMP1
 	source
-	movq  8(%rdi), %r8
+	movq  8(INP), TMP2
 	source
-	movq  16(%rdi), %r11
+	movq  16(INP), TMP3
 	source
-	movq  24(%rdi), %rdx
+	movq  24(INP), TMP4
 
 	source
-	movq  32(%rdi), %r10
+	movq  32(INP), TMP5
 	source
-	movq  40(%rdi), %r15
+	movq  40(INP), TMP6
 	source
-	movq  48(%rdi), %r14
+	movq  48(INP), TMP7
 	source
-	movq  56(%rdi), %r13
+	movq  56(INP), TMP8
 
 30:
 	/*
@@ -92,64 +113,64 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	_ASM_EXTABLE(30b, 2f)
 	prefetcht0 5*64(%rdi)
 2:
-	adcq  %rbx, %rax
-	adcq  %r8, %rax
-	adcq  %r11, %rax
-	adcq  %rdx, %rax
-	adcq  %r10, %rax
-	adcq  %r15, %rax
-	adcq  %r14, %rax
-	adcq  %r13, %rax
+	adcq  TMP1, SUM
+	adcq  TMP2, SUM
+	adcq  TMP3, SUM
+	adcq  TMP4, SUM
+	adcq  TMP5, SUM
+	adcq  TMP6, SUM
+	adcq  TMP7, SUM
+	adcq  TMP8, SUM
 
-	decl %r12d
+	decl LEN64B
 
 	dest
-	movq %rbx, (%rsi)
+	movq TMP1, (OUTP)
 	dest
-	movq %r8, 8(%rsi)
+	movq TMP2, 8(OUTP)
 	dest
-	movq %r11, 16(%rsi)
+	movq TMP3, 16(OUTP)
 	dest
-	movq %rdx, 24(%rsi)
+	movq TMP4, 24(OUTP)
 
 	dest
-	movq %r10, 32(%rsi)
+	movq TMP5, 32(OUTP)
 	dest
-	movq %r15, 40(%rsi)
+	movq TMP6, 40(OUTP)
 	dest
-	movq %r14, 48(%rsi)
+	movq TMP7, 48(OUTP)
 	dest
-	movq %r13, 56(%rsi)
+	movq TMP8, 56(OUTP)
 
-	leaq 64(%rdi), %rdi
-	leaq 64(%rsi), %rsi
+	leaq 64(INP), INP
+	leaq 64(OUTP), OUTP
 
-	jnz	.Lloop
+	jnz	.Lloop\@
 
-	adcq  %r9, %rax
+	adcq  ZERO, SUM
 
 	/* do last up to 56 bytes */
-.Lhandle_tail:
+.Lhandle_tail\@:
 	/* ecx:	count, rcx.63: the end result needs to be rol8 */
 	movq %rcx, %r10
 	andl $63, %ecx
 	shrl $3, %ecx
-	jz	.Lfold
+	jz	.Lfold\@
 	clc
 	.p2align 4
-.Lloop_8:
+.Lloop_8\@:
 	source
-	movq (%rdi), %rbx
-	adcq %rbx, %rax
-	decl %ecx
+	movq (INP), TMP1
+	adcq TMP1, SUM
+	decl LEN
 	dest
-	movq %rbx, (%rsi)
-	leaq 8(%rsi), %rsi /* preserve carry */
-	leaq 8(%rdi), %rdi
-	jnz	.Lloop_8
-	adcq %r9, %rax	/* add in carry */
+	movq TMP1, (OUTP)
+	leaq 8(INP), INP /* preserve carry */
+	leaq 8(OUTP), OUTP
+	jnz	.Lloop_8\@
+	adcq ZERO, SUM	/* add in carry */
 
-.Lfold:
+.Lfold\@:
 	/* reduce checksum to 32bits */
 	movl %eax, %ebx
 	shrq $32, %rax
@@ -157,17 +178,17 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	adcl %r9d, %eax
 
 	/* do last up to 6 bytes */
-.Lhandle_7:
+.Lhandle_7\@:
 	movl %r10d, %ecx
 	andl $7, %ecx
-.L1:				/* .Lshort rejoins the common path here */
+.L1\@:				/* .Lshort\@ rejoins the common path here */
 	shrl $1, %ecx
-	jz   .Lhandle_1
+	jz   .Lhandle_1\@
 	movl $2, %edx
 	xorl %ebx, %ebx
 	clc
 	.p2align 4
-.Lloop_1:
+.Lloop_1\@:
 	source
 	movw (%rdi), %bx
 	adcl %ebx, %eax
@@ -176,13 +197,13 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	movw %bx, (%rsi)
 	leaq 2(%rdi), %rdi
 	leaq 2(%rsi), %rsi
-	jnz .Lloop_1
+	jnz .Lloop_1\@
 	adcl %r9d, %eax	/* add in carry */
 
 	/* handle last odd byte */
-.Lhandle_1:
+.Lhandle_1\@:
 	testb $1, %r10b
-	jz    .Lende
+	jz    .Lende\@
 	xorl  %ebx, %ebx
 	source
 	movb (%rdi), %bl
@@ -191,24 +212,18 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	addl %ebx, %eax
 	adcl %r9d, %eax		/* carry */
 
-.Lende:
+.Lende\@:
 	testq %r10, %r10
-	js  .Lwas_odd
-.Lout:
-	movq 0*8(%rsp), %rbx
-	movq 1*8(%rsp), %r12
-	movq 2*8(%rsp), %r14
-	movq 3*8(%rsp), %r13
-	movq 4*8(%rsp), %r15
-	addq $5*8, %rsp
-	RET
-.Lshort:
+	js  .Lwas_odd\@
+.Lout\@:
+	restore_regs_and_ret
+.Lshort\@:
 	movl %ecx, %r10d
-	jmp  .L1
-.Lunaligned:
+	jmp  .L1\@
+.Lunaligned\@:
 	xorl %ebx, %ebx
 	testb $1, %sil
-	jne  .Lodd
+	jne  .Lodd\@
 1:	testb $2, %sil
 	je   2f
 	source
@@ -220,7 +235,7 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	leaq 2(%rsi), %rsi
 	addq %rbx, %rax
 2:	testb $4, %sil
-	je .Laligned
+	je .Laligned\@
 	source
 	movl (%rdi), %ebx
 	dest
@@ -229,9 +244,9 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	subq $4, %rcx
 	leaq 4(%rsi), %rsi
 	addq %rbx, %rax
-	jmp .Laligned
+	jmp .Laligned\@
 
-.Lodd:
+.Lodd\@:
 	source
 	movb (%rdi), %bl
 	dest
@@ -245,12 +260,16 @@ SYM_FUNC_START(csum_partial_copy_generic)
 	addq %rbx, %rax
 	jmp 1b
 
-.Lwas_odd:
+.Lwas_odd\@:
 	roll $8, %eax
-	jmp .Lout
+	jmp .Lout\@
+.endm
 
 	/* Exception: just return 0 */
 .Lfault:
 	xorl %eax, %eax
-	jmp  .Lout
+	restore_regs_and_ret
+
+SYM_FUNC_START(csum_partial_copy_generic)
+	_csum_partial_copy
 SYM_FUNC_END(csum_partial_copy_generic)
-- 
2.51.0