From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4BB9F32FA11 for ; Mon, 24 Nov 2025 21:55:37 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1764021339; cv=none; b=m4nq9orOctEUhHXWjGgr/iiW2miiCFT9EA3uDwNUbP0C7RZpcRlvS7BSjHdfQxj+RAy0hYb1Oc8ntpRYn4wotnTPVWGFV2aWEOYW7DdcyendeMyDGScKGX9fsnbPiEHJYOcY37PEthaM9u+M+0Y8A65J2RHGIOJXH0OpaxxFNLk= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1764021339; c=relaxed/simple; bh=VTIV9XRcXOtCUAH1+SA7E5lC8LCXhrBXiGVpFvsjCaY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=O3doet6y6Kd7TtdWynkpI0rHT18ctcjMh8tnTe84KqfqAGZXp9B8oXwrwio1cVvXl2Gf5N11/vu/xWoZLawuDUg2e4mgZ3gA22owLyJtSa3WSneqHyAjS1audKInFRXZa/Gk6nbBBv8Cg2zonupKCp8gDU6xbeZX/syBQABNyyM= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=AchQQyQ0; arc=none smtp.client-ip=198.175.65.17 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="AchQQyQ0" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1764021337; x=1795557337; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=VTIV9XRcXOtCUAH1+SA7E5lC8LCXhrBXiGVpFvsjCaY=; 
b=AchQQyQ0XnjX9bkQiftW8ZBealdEint/mQds7HgYMuANZ7nqU4zbrKrN 5H3ynmsSFEZ9JxVOOXGBnlgFyt8ClsS0BM+jHRYTAA3uTuZGZuWsk+EoC 6Z4wwVKtdy1YiNNDW3ZXjnsaw3RoB7mKtl7Cm+UtOp+8n9ymwZq9d7a9N TXR5ecZjgU9V82mq3tgQzfQTUFGi5u/HmDpoLOotK75OoKDSrzqWfcv8a jmP0z3oaxmfBmQwryk8sEoiuJuiWi43qs+g+7m94UIHrAn0FVwPEDYOlH VyvH1tZ7YlZKVIxOPd/eDfHoG1cB9qORq/6Er0av0xYGjbXM6V6cKjTpB g==; X-CSE-ConnectionGUID: kDAwwYiWSGurZv5MoscjzA== X-CSE-MsgGUID: KgRxogE0SDaPK0RCjhV67w== X-IronPort-AV: E=McAfee;i="6800,10657,11623"; a="65985353" X-IronPort-AV: E=Sophos;i="6.20,223,1758610800"; d="scan'208";a="65985353" Received: from fmviesa003.fm.intel.com ([10.60.135.143]) by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 24 Nov 2025 13:55:37 -0800 X-CSE-ConnectionGUID: +0Gaqjn9SkS6dkdAmWOllQ== X-CSE-MsgGUID: GiMPQyqoTNuhS6UKu0N7mw== X-ExtLoop1: 1 Received: from chang-linux-3.sc.intel.com (HELO chang-linux-3) ([172.25.66.172]) by fmviesa003.fm.intel.com with ESMTP; 24 Nov 2025 13:55:36 -0800 From: "Chang S. Bae" To: linux-kernel@vger.kernel.org Cc: x86@kernel.org, tglx@linutronix.de, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, chang.seok.bae@intel.com Subject: [RFC PATCH 3/3] x86/lib: Use EGPRs in 64-bit checksum copy loop Date: Mon, 24 Nov 2025 21:32:26 +0000 Message-ID: <20251124213227.123779-4-chang.seok.bae@intel.com> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20251124213227.123779-1-chang.seok.bae@intel.com> References: <20251124213227.123779-1-chang.seok.bae@intel.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit The current checksum copy routine already uses all legacy GPRs for loop unrolling. APX introduces additional GPRs. Use them to extend the unrolling further. Signed-off-by: Chang S. Bae --- Caveat: This is primarily an illustrative example. I have not fully audited all call sites or large-buffer use cases (yet). 
The goal is to demonstrate the potential of the extended register set. --- arch/x86/Kconfig | 6 +++ arch/x86/Kconfig.assembler | 6 +++ arch/x86/include/asm/checksum_64.h | 24 +++++++++++- arch/x86/lib/csum-copy_64.S | 59 ++++++++++++++++++++++++++++-- 4 files changed, 90 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fa3b616af03a..e6d969376bf2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1890,6 +1890,12 @@ config X86_USER_SHADOW_STACK If unsure, say N. +config X86_APX + bool "In-kernel APX use" + depends on AS_APX + help + Experimental: enable in-kernel use of APX + config INTEL_TDX_HOST bool "Intel Trust Domain Extensions (TDX) host support" depends on CPU_SUP_INTEL diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index b1c59fb0a4c9..d208ac540609 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -5,3 +5,9 @@ config AS_WRUSS def_bool $(as-instr64,wrussq %rax$(comma)(%rbx)) help Supported by binutils >= 2.31 and LLVM integrated assembler + +config AS_APX + def_bool $(as-instr64,mov %r16$(comma)%r17) + help + Assembler supports extended registers. + Supported by binutils >= 2.43 (LLVM version TBD) diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 4d4a47a3a8ab..4cbd9e71f8c3 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -10,6 +10,7 @@ #include <linux/compiler.h> #include <asm/byteorder.h> +#include <asm/fpu/api.h> /** * csum_fold - Fold and invert a 32bit checksum. @@ -129,7 +130,28 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, extern __wsum csum_partial(const void *buff, int len, __wsum sum); /* Do not call this directly. 
Use the wrappers below */ -extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len); +extern __visible __wsum csum_partial_copy(const void *src, void *dst, int len); +#ifndef CONFIG_X86_APX +static inline __wsum csum_partial_copy_generic(const void *src, void *dst, int len) +{ + return csum_partial_copy(src, dst, len); +} +#else +extern __visible __wsum csum_partial_copy_apx(const void *src, void *dst, int len); +static inline __wsum csum_partial_copy_generic(const void *src, void *dst, int len) +{ + __wsum sum; + + if (!cpu_has_xfeatures(XFEATURE_MASK_APX, NULL) || !irq_fpu_usable()) + return csum_partial_copy(src, dst, len); + + kernel_fpu_begin(); + sum = csum_partial_copy_apx(src, dst, len); + kernel_fpu_end(); + + return sum; +} +#endif extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len); extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len); diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 5526bdfac041..dc99227af94f 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -119,11 +119,54 @@ shrl $6, LEN64B jz .Lhandle_tail\@ /* < 64 */ +.if USE_APX + cmpl $3, LEN64B + jb .Lloop_64\@ /* < 192 */ + clc + .p2align 4 +.Lloop_192\@: + .set TMP9, %r16 + .set TMP10, %r17 + .set TMP11, %r18 + .set TMP12, %r19 + .set TMP13, %r20 + .set TMP14, %r21 + .set TMP15, %r22 + .set TMP16, %r23 + .set TMP17, %r24 + .set TMP18, %r25 + .set TMP19, %r26 + .set TMP20, %r27 + .set TMP21, %r28 + .set TMP22, %r29 + .set TMP23, %r30 + .set TMP24, %r31 + + .p2align 4 + loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8 + loadregs 8, INP, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16 + loadregs 16, INP, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24 + + sumregs SUM, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8 + sumregs SUM, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16 + sumregs SUM, TMP17, TMP18, TMP19, TMP20, 
TMP21, TMP22, TMP23, TMP24 + + storeregs 0, OUTP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8 + storeregs 8, OUTP, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16 + storeregs 16, OUTP, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24 + + incr INP, 24 + incr OUTP, 24 + sub $3, LEN64B + cmp $3, LEN64B + jnb .Lloop_192\@ +.else clc .p2align 4 -.Lloop\@: +.endif +.Lloop_64\@: loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8 prefetch @@ -137,7 +180,7 @@ incr INP, 8 incr OUTP, 8 - jnz .Lloop\@ + jnz .Lloop_64\@ adcq ZERO, SUM @@ -260,6 +303,14 @@ xorl %eax, %eax restore_regs_and_ret -SYM_FUNC_START(csum_partial_copy_generic) +.set USE_APX, 0 +SYM_FUNC_START(csum_partial_copy) _csum_partial_copy -SYM_FUNC_END(csum_partial_copy_generic) +SYM_FUNC_END(csum_partial_copy) + +#ifdef CONFIG_X86_APX +.set USE_APX, 1 +SYM_FUNC_START(csum_partial_copy_apx) + _csum_partial_copy +SYM_FUNC_END(csum_partial_copy_apx) +#endif -- 2.51.0