From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CBD92FD7D8 for ; Mon, 24 Nov 2025 21:55:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1764021337; cv=none; b=LBYBvwvcFL3065N4MMlOi7djAk3nEWccqx3stR+cFOoWm6Zda+CgHyIohtXmPpKbn5EkfGAR864W8s5dap8XS+yqJfFxJFROrMYY9ur6SpcnKjJzK8vUPrwE5QnxOaXimTTSFe+fd6lvqpiRjH8wRyIE1dtkNjngkupKGHVqjQc= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1764021337; c=relaxed/simple; bh=mP88kNlNQz94ZT8UHZ3MO6Bg/wDFm4P/C9KI6XufkyA=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=H0zysNTSyNH3u4OGTKiE8o44CmX/LS666iSwqm4KacZOTTtIBZzZgd1Uv8r+uNMey0gO+MS/ySusKyNTZEohp0NlCHqNrBIU3B9NwB986rHpda2DnNRm9T72FvSjnY7alnwqdU/8GHfZZag+U21Cl9Kakfg/+1nnn+rG04eRDqI= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=DDSldcn/; arc=none smtp.client-ip=198.175.65.17 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="DDSldcn/" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1764021335; x=1795557335; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=mP88kNlNQz94ZT8UHZ3MO6Bg/wDFm4P/C9KI6XufkyA=; 
b=DDSldcn/vc8dR1Cqsi0qtbf7DgxMIgs6AMtqlT9wpurYuGLiKXMYAv/T 3E6l3v91XlJh8YMdMuFXJ9/K0Xd2kF9JNyL+Pg29Xykpv4cDniMgsKI4R zAQWr2xUXxcZDV8k/FQhQMGw++I+zgEiAFEZVhCq7TMOAmqwEDEMfdpYr k6rSq/XCqH2waqjMp55V99Jfq0Y9EL7zmf2lSmr8xFP7SMd38ud09wp7y xyL9pberWRZFesWCxEuSlq/qavAotC8Qys3LT27HEs+hrEJ75A/sAAiKm xwF3n6L5+I4KJyEZ8hW1caK/LCTmnVPbjlYJZH1rG20AboNjdIetRCnro Q==; X-CSE-ConnectionGUID: jDZ76dMoQpmUlZI1pEqghw== X-CSE-MsgGUID: dCOVk1UGRVGXa6I8QJp3wQ== X-IronPort-AV: E=McAfee;i="6800,10657,11623"; a="65985343" X-IronPort-AV: E=Sophos;i="6.20,223,1758610800"; d="scan'208";a="65985343" Received: from fmviesa003.fm.intel.com ([10.60.135.143]) by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 24 Nov 2025 13:55:35 -0800 X-CSE-ConnectionGUID: pgUMWdvXROuIXqnMOhGxMw== X-CSE-MsgGUID: SVwuYcrxSDCZuN82qjB1/Q== X-ExtLoop1: 1 Received: from chang-linux-3.sc.intel.com (HELO chang-linux-3) ([172.25.66.172]) by fmviesa003.fm.intel.com with ESMTP; 24 Nov 2025 13:55:34 -0800 From: "Chang S. Bae" To: linux-kernel@vger.kernel.org Cc: x86@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, chang.seok.bae@intel.com Subject: [RFC PATCH 1/3] x86/lib: Refactor csum_partial_copy_generic() into a macro Date: Mon, 24 Nov 2025 21:32:24 +0000 Message-ID: <20251124213227.123779-2-chang.seok.bae@intel.com> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20251124213227.123779-1-chang.seok.bae@intel.com> References: <20251124213227.123779-1-chang.seok.bae@intel.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit The current assembly implementation is too rigid to support new variants that share most of the logic. Refactor the function body into a reusable macro, with register aliasing to improve readability. No functional change. Signed-off-by: Chang S. 
Bae --- Not intended for upstream; this series is just an example of how extended GPRs can be used within the kernel. --- arch/x86/lib/csum-copy_64.S | 187 ++++++++++++++++++++---------------- 1 file changed, 103 insertions(+), 84 deletions(-) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index d9e16a2cf285..66ed849090b7 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -26,17 +26,27 @@ * They also should align source or destination to 8 bytes. */ - .macro source +.macro source 10: _ASM_EXTABLE_UA(10b, .Lfault) - .endm +.endm - .macro dest +.macro dest 20: _ASM_EXTABLE_UA(20b, .Lfault) - .endm +.endm -SYM_FUNC_START(csum_partial_copy_generic) +.macro restore_regs_and_ret + movq 0*8(%rsp), %rbx + movq 1*8(%rsp), %r12 + movq 2*8(%rsp), %r14 + movq 3*8(%rsp), %r13 + movq 4*8(%rsp), %r15 + addq $5*8, %rsp + RET +.endm + +.macro _csum_partial_copy subq $5*8, %rsp movq %rbx, 0*8(%rsp) movq %r12, 1*8(%rsp) @@ -48,41 +58,52 @@ SYM_FUNC_START(csum_partial_copy_generic) xorl %r9d, %r9d movl %edx, %ecx cmpl $8, %ecx - jb .Lshort + jb .Lshort\@ testb $7, %sil - jne .Lunaligned -.Laligned: - movl %ecx, %r12d + jne .Lunaligned\@ +.Laligned\@: + .set INP, %rdi /* input pointer */ + .set OUTP, %rsi /* output pointer */ + .set SUM, %rax /* checksum accumulator */ + .set ZERO, %r9 /* zero register */ + .set LEN, %ecx /* byte count */ + .set LEN64B, %r12d /* 64-byte block count */ + .set TMP1, %rbx + .set TMP2, %r8 + .set TMP3, %r11 + .set TMP4, %rdx + .set TMP5, %r10 + .set TMP6, %r15 + .set TMP7, %r14 + .set TMP8, %r13 - shrq $6, %r12 - jz .Lhandle_tail /* < 64 */ + movl LEN, LEN64B + + shrl $6, LEN64B + jz .Lhandle_tail\@ /* < 64 */ clc - /* main loop. 
clear in 64 byte blocks */ - /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ - /* r11: temp3, rdx: temp4, r12 loopcnt */ - /* r10: temp5, r15: temp6, r14 temp7, r13 temp8 */ .p2align 4 -.Lloop: +.Lloop\@: source - movq (%rdi), %rbx + movq (INP), TMP1 source - movq 8(%rdi), %r8 + movq 8(INP), TMP2 source - movq 16(%rdi), %r11 + movq 16(INP), TMP3 source - movq 24(%rdi), %rdx + movq 24(INP), TMP4 source - movq 32(%rdi), %r10 + movq 32(INP), TMP5 source - movq 40(%rdi), %r15 + movq 40(INP), TMP6 source - movq 48(%rdi), %r14 + movq 48(INP), TMP7 source - movq 56(%rdi), %r13 + movq 56(INP), TMP8 30: /* @@ -92,64 +113,64 @@ SYM_FUNC_START(csum_partial_copy_generic) _ASM_EXTABLE(30b, 2f) prefetcht0 5*64(%rdi) 2: - adcq %rbx, %rax - adcq %r8, %rax - adcq %r11, %rax - adcq %rdx, %rax - adcq %r10, %rax - adcq %r15, %rax - adcq %r14, %rax - adcq %r13, %rax + adcq TMP1, SUM + adcq TMP2, SUM + adcq TMP3, SUM + adcq TMP4, SUM + adcq TMP5, SUM + adcq TMP6, SUM + adcq TMP7, SUM + adcq TMP8, SUM - decl %r12d + decl LEN64B dest - movq %rbx, (%rsi) + movq TMP1, (OUTP) dest - movq %r8, 8(%rsi) + movq TMP2, 8(OUTP) dest - movq %r11, 16(%rsi) + movq TMP3, 16(OUTP) dest - movq %rdx, 24(%rsi) + movq TMP4, 24(OUTP) dest - movq %r10, 32(%rsi) + movq TMP5, 32(OUTP) dest - movq %r15, 40(%rsi) + movq TMP6, 40(OUTP) dest - movq %r14, 48(%rsi) + movq TMP7, 48(OUTP) dest - movq %r13, 56(%rsi) + movq TMP8, 56(OUTP) - leaq 64(%rdi), %rdi - leaq 64(%rsi), %rsi + leaq 64(INP), INP + leaq 64(OUTP), OUTP - jnz .Lloop + jnz .Lloop\@ - adcq %r9, %rax + adcq ZERO, SUM /* do last up to 56 bytes */ -.Lhandle_tail: +.Lhandle_tail\@: /* ecx: count, rcx.63: the end result needs to be rol8 */ movq %rcx, %r10 andl $63, %ecx shrl $3, %ecx - jz .Lfold + jz .Lfold\@ clc .p2align 4 -.Lloop_8: +.Lloop_8\@: source - movq (%rdi), %rbx - adcq %rbx, %rax - decl %ecx + movq (INP), TMP1 + adcq TMP1, SUM + decl LEN dest - movq %rbx, (%rsi) - leaq 8(%rsi), %rsi /* preserve carry */ - leaq 8(%rdi), %rdi - 
jnz .Lloop_8 - adcq %r9, %rax /* add in carry */ + movq TMP1, (OUTP) + leaq 8(INP), INP /* preserve carry */ + leaq 8(OUTP), OUTP + jnz .Lloop_8\@ + adcq ZERO, SUM /* add in carry */ -.Lfold: +.Lfold\@: /* reduce checksum to 32bits */ movl %eax, %ebx shrq $32, %rax @@ -157,17 +178,17 @@ SYM_FUNC_START(csum_partial_copy_generic) adcl %r9d, %eax /* do last up to 6 bytes */ -.Lhandle_7: +.Lhandle_7\@: movl %r10d, %ecx andl $7, %ecx -.L1: /* .Lshort rejoins the common path here */ +.L1\@: /* .Lshort\@ rejoins the common path here */ shrl $1, %ecx - jz .Lhandle_1 + jz .Lhandle_1\@ movl $2, %edx xorl %ebx, %ebx clc .p2align 4 -.Lloop_1: +.Lloop_1\@: source movw (%rdi), %bx adcl %ebx, %eax @@ -176,13 +197,13 @@ SYM_FUNC_START(csum_partial_copy_generic) movw %bx, (%rsi) leaq 2(%rdi), %rdi leaq 2(%rsi), %rsi - jnz .Lloop_1 + jnz .Lloop_1\@ adcl %r9d, %eax /* add in carry */ /* handle last odd byte */ -.Lhandle_1: +.Lhandle_1\@: testb $1, %r10b - jz .Lende + jz .Lende\@ xorl %ebx, %ebx source movb (%rdi), %bl @@ -191,24 +212,18 @@ SYM_FUNC_START(csum_partial_copy_generic) addl %ebx, %eax adcl %r9d, %eax /* carry */ -.Lende: +.Lende\@: testq %r10, %r10 - js .Lwas_odd -.Lout: - movq 0*8(%rsp), %rbx - movq 1*8(%rsp), %r12 - movq 2*8(%rsp), %r14 - movq 3*8(%rsp), %r13 - movq 4*8(%rsp), %r15 - addq $5*8, %rsp - RET -.Lshort: + js .Lwas_odd\@ +.Lout\@: + restore_regs_and_ret +.Lshort\@: movl %ecx, %r10d - jmp .L1 -.Lunaligned: + jmp .L1\@ +.Lunaligned\@: xorl %ebx, %ebx testb $1, %sil - jne .Lodd + jne .Lodd\@ 1: testb $2, %sil je 2f source @@ -220,7 +235,7 @@ SYM_FUNC_START(csum_partial_copy_generic) leaq 2(%rsi), %rsi addq %rbx, %rax 2: testb $4, %sil - je .Laligned + je .Laligned\@ source movl (%rdi), %ebx dest @@ -229,9 +244,9 @@ SYM_FUNC_START(csum_partial_copy_generic) subq $4, %rcx leaq 4(%rsi), %rsi addq %rbx, %rax - jmp .Laligned + jmp .Laligned\@ -.Lodd: +.Lodd\@: source movb (%rdi), %bl dest @@ -245,12 +260,16 @@ SYM_FUNC_START(csum_partial_copy_generic) addq %rbx, %rax 
jmp 1b -.Lwas_odd: +.Lwas_odd\@: roll $8, %eax - jmp .Lout + jmp .Lout\@ +.endm /* Exception: just return 0 */ .Lfault: xorl %eax, %eax - jmp .Lout + restore_regs_and_ret + +SYM_FUNC_START(csum_partial_copy_generic) + _csum_partial_copy SYM_FUNC_END(csum_partial_copy_generic) -- 2.51.0