netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] x86_64: inline csum_ipv6_magic()
@ 2025-11-13 15:45 Eric Dumazet
  2025-11-13 16:26 ` Dave Hansen
  0 siblings, 1 reply; 7+ messages in thread
From: Eric Dumazet @ 2025-11-13 15:45 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: linux-kernel, Simon Horman, Kuniyuki Iwashima, netdev,
	Eric Dumazet, Eric Dumazet

Inline this small helper.

This reduces register pressure, as saddr and daddr are often
back to back in memory.

For instance code inlined in tcp6_gro_receive() will look like:

 55a:	48 03 73 28          	add    0x28(%rbx),%rsi
 55e:	8b 43 70             	mov    0x70(%rbx),%eax
 561:	29 f8                	sub    %edi,%eax
 563:	0f c8                	bswap  %eax
 565:	89 c0                	mov    %eax,%eax
 567:	48 05 00 06 00 00    	add    $0x600,%rax
 56d:	48 03 46 08          	add    0x8(%rsi),%rax
 571:	48 13 46 10          	adc    0x10(%rsi),%rax
 575:	48 13 46 18          	adc    0x18(%rsi),%rax
 579:	48 13 46 20          	adc    0x20(%rsi),%rax
 57d:	48 83 d0 00          	adc    $0x0,%rax
 581:	48 89 c6             	mov    %rax,%rsi
 584:	48 c1 ee 20          	shr    $0x20,%rsi
 588:	01 f0                	add    %esi,%eax
 58a:	83 d0 00             	adc    $0x0,%eax
 58d:	89 c6                	mov    %eax,%esi
 58f:	66 31 c0             	xor    %ax,%ax

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 arch/x86/include/asm/checksum_64.h | 45 ++++++++++++++++++++++--------
 arch/x86/lib/csum-wrappers_64.c    | 22 ---------------
 2 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 4d4a47a3a8ab2310d279f7e465032b1463200393..5bdfd2db2b5a573ff8193a4878d372c97b158f47 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -9,6 +9,7 @@
  */
 
 #include <linux/compiler.h>
+#include <linux/in6.h>
 #include <asm/byteorder.h>
 
 /**
@@ -145,6 +146,17 @@ extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);
  */
 extern __sum16 ip_compute_csum(const void *buff, int len);
 
+static inline unsigned add32_with_carry(unsigned a, unsigned b)
+{
+	asm("addl %2,%0\n\t"
+	    "adcl $0,%0"
+	    : "=r" (a)
+	    : "0" (a), "rm" (b));
+	return a;
+}
+
+#define _HAVE_ARCH_IPV6_CSUM 1
+
 /**
  * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
  * @saddr: source address
@@ -158,20 +170,29 @@ extern __sum16 ip_compute_csum(const void *buff, int len);
  * Returns the unfolded 32bit checksum.
  */
 
-struct in6_addr;
+static inline __sum16 csum_ipv6_magic(
+	const struct in6_addr *_saddr, const struct in6_addr *_daddr,
+	__u32 len, __u8 proto, __wsum sum)
+{
+	const unsigned long *saddr = (const unsigned long *)_saddr;
+	const unsigned long *daddr = (const unsigned long *)_daddr;
+	__u64 sum64;
 
-#define _HAVE_ARCH_IPV6_CSUM 1
-extern __sum16
-csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
-		__u32 len, __u8 proto, __wsum sum);
+	sum64 = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
+		(__force __u64)sum;
 
-static inline unsigned add32_with_carry(unsigned a, unsigned b)
-{
-	asm("addl %2,%0\n\t"
-	    "adcl $0,%0"
-	    : "=r" (a)
-	    : "0" (a), "rm" (b));
-	return a;
+	asm("	addq %1,%[sum64]\n"
+	    "	adcq %2,%[sum64]\n"
+	    "	adcq %3,%[sum64]\n"
+	    "	adcq %4,%[sum64]\n"
+	    "	adcq $0,%[sum64]\n"
+
+	    : [sum64] "+r" (sum64)
+	    : "m" (saddr[0]), "m" (saddr[1]),
+	      "m" (daddr[0]), "m" (daddr[1]));
+
+	return csum_fold(
+	       (__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
 }
 
 #define HAVE_ARCH_CSUM_ADD
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index f4df4d241526c64a5ad2eabdcbf5f0d8d56d6fd8..831b7110b041598b9764a6647fa259e1058efef2 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -68,25 +68,3 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len)
 }
 EXPORT_SYMBOL(csum_partial_copy_nocheck);
 
-__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-			const struct in6_addr *daddr,
-			__u32 len, __u8 proto, __wsum sum)
-{
-	__u64 rest, sum64;
-
-	rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
-		(__force __u64)sum;
-
-	asm("	addq (%[saddr]),%[sum]\n"
-	    "	adcq 8(%[saddr]),%[sum]\n"
-	    "	adcq (%[daddr]),%[sum]\n"
-	    "	adcq 8(%[daddr]),%[sum]\n"
-	    "	adcq $0,%[sum]\n"
-
-	    : [sum] "=r" (sum64)
-	    : "[sum]" (rest), [saddr] "r" (saddr), [daddr] "r" (daddr));
-
-	return csum_fold(
-	       (__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
-}
-EXPORT_SYMBOL(csum_ipv6_magic);
-- 
2.51.2.1041.gc1ab5b90ca-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: inline csum_ipv6_magic()
  2025-11-13 15:45 [PATCH] x86_64: inline csum_ipv6_magic() Eric Dumazet
@ 2025-11-13 16:26 ` Dave Hansen
  2025-11-13 18:18   ` Eric Dumazet
  0 siblings, 1 reply; 7+ messages in thread
From: Dave Hansen @ 2025-11-13 16:26 UTC (permalink / raw)
  To: Eric Dumazet, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, David S . Miller,
	Jakub Kicinski, Paolo Abeni
  Cc: linux-kernel, Simon Horman, Kuniyuki Iwashima, netdev,
	Eric Dumazet

On 11/13/25 07:45, Eric Dumazet wrote:
> Inline this small helper.
> 
> This reduces register pressure, as saddr and daddr are often
> back to back in memory.
> 
> For instance code inlined in tcp6_gro_receive() will look like:

Could you please double check what the code growth is for this across
the tree? There are 80-ish users of csum_ipv6_magic().

Or, is there a discrete, measurable performance gain from doing this?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: inline csum_ipv6_magic()
  2025-11-13 16:26 ` Dave Hansen
@ 2025-11-13 18:18   ` Eric Dumazet
  2025-11-13 18:40     ` Dave Hansen
  2025-11-13 20:03     ` David Laight
  0 siblings, 2 replies; 7+ messages in thread
From: Eric Dumazet @ 2025-11-13 18:18 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, David S . Miller, Jakub Kicinski, Paolo Abeni,
	linux-kernel, Simon Horman, Kuniyuki Iwashima, netdev,
	Eric Dumazet

On Thu, Nov 13, 2025 at 8:26 AM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 11/13/25 07:45, Eric Dumazet wrote:
> > Inline this small helper.
> >
> > This reduces register pressure, as saddr and daddr are often
> > back to back in memory.
> >
> > For instance code inlined in tcp6_gro_receive() will look like:
>
> Could you please double check what the code growth is for this across
> the tree? There are 80-ish users of csum_ipv6_magic().

Hi Dave

Sure (allyesconfig build)

Before patch:

size vmlinux
   text    data     bss     dec     hex filename
886947242 245613190 40211540 1172771972 45e71484 vmlinux

After patch:
 size vmlinux
   text    data     bss     dec     hex filename
886947242 245613190 40211540 1172771972 45e71484 vmlinux

I found this a bit surprising, so I did a regular build (our Google
production kernel default config)

Before:

size vmlinux
   text    data     bss     dec     hex filename
34812872 22177397 5685248 62675517 3bc5a3d vmlinux

After:

 size vmlinux
   text    data     bss     dec     hex filename
34812501 22177365 5685248 62675114 3bc58aa vmlinux

So it would seem the patch saves 371 bytes for this config.

>
> Or, is there a discrete, measurable performance gain from doing this?

IPv6 incoming TCP/UDP paths call this function twice per packet, which is sad...
One call per TX packet.

Depending on the cpus I can see csum_ipv6_magic() using up to 0.75 %
of cpu cycles.
Then there is the cost in the callers, harder to measure...

Thank you.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: inline csum_ipv6_magic()
  2025-11-13 18:18   ` Eric Dumazet
@ 2025-11-13 18:40     ` Dave Hansen
  2025-12-23  5:03       ` Eric Dumazet
  2025-11-13 20:03     ` David Laight
  1 sibling, 1 reply; 7+ messages in thread
From: Dave Hansen @ 2025-11-13 18:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, David S . Miller, Jakub Kicinski, Paolo Abeni,
	linux-kernel, Simon Horman, Kuniyuki Iwashima, netdev,
	Eric Dumazet

On 11/13/25 10:18, Eric Dumazet wrote:
> So it would seem the patch saves 371 bytes for this config.
> 
>> Or, is there a discrete, measurable performance gain from doing this?
> IPv6 incoming TCP/UDP paths call this function twice per packet, which is sad...
> One call per TX packet.
> 
> Depending on the cpus I can see csum_ipv6_magic() using up to 0.75 %
> of cpu cycles.
> Then there is the cost in the callers, harder to measure...

Oh, wow. That's more than I was expecting. But it does make sense.
Thanks for the info. I'll stick this in the queue to apply in a month or
so after the next -rc1, unless it needs more urgency.

Acked-by: Dave Hansen <dave.hansen@linux.intel.com>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: inline csum_ipv6_magic()
  2025-11-13 18:18   ` Eric Dumazet
  2025-11-13 18:40     ` Dave Hansen
@ 2025-11-13 20:03     ` David Laight
  2025-11-13 20:12       ` Eric Dumazet
  1 sibling, 1 reply; 7+ messages in thread
From: David Laight @ 2025-11-13 20:03 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Dave Hansen, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, David S . Miller,
	Jakub Kicinski, Paolo Abeni, linux-kernel, Simon Horman,
	Kuniyuki Iwashima, netdev, Eric Dumazet

On Thu, 13 Nov 2025 10:18:08 -0800
Eric Dumazet <edumazet@google.com> wrote:

> On Thu, Nov 13, 2025 at 8:26 AM Dave Hansen <dave.hansen@intel.com> wrote:
> >
> > On 11/13/25 07:45, Eric Dumazet wrote:  
> > > Inline this small helper.
> > >
> > > This reduces register pressure, as saddr and daddr are often
> > > back to back in memory.
> > >
> > > For instance code inlined in tcp6_gro_receive() will look like:  
> >
> > Could you please double check what the code growth is for this across
> > the tree? There are 80-ish users of csum_ipv6_magic().  
> 
> Hi Dave
> 
> Sure (allyesconfig build)

Doesn't allyesconfig pull in all the KASAN stuff as well?
Which makes it fairly useless for normal build tests.

	David

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: inline csum_ipv6_magic()
  2025-11-13 20:03     ` David Laight
@ 2025-11-13 20:12       ` Eric Dumazet
  0 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2025-11-13 20:12 UTC (permalink / raw)
  To: David Laight
  Cc: Dave Hansen, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H . Peter Anvin, David S . Miller,
	Jakub Kicinski, Paolo Abeni, linux-kernel, Simon Horman,
	Kuniyuki Iwashima, netdev, Eric Dumazet

On Thu, Nov 13, 2025 at 12:03 PM David Laight
<david.laight.linux@gmail.com> wrote:
>
> On Thu, 13 Nov 2025 10:18:08 -0800
> Eric Dumazet <edumazet@google.com> wrote:
>
> > On Thu, Nov 13, 2025 at 8:26 AM Dave Hansen <dave.hansen@intel.com> wrote:
> > >
> > > On 11/13/25 07:45, Eric Dumazet wrote:
> > > > Inline this small helper.
> > > >
> > > > This reduces register pressure, as saddr and daddr are often
> > > > back to back in memory.
> > > >
> > > > For instance code inlined in tcp6_gro_receive() will look like:
> > >
> > > Could you please double check what the code growth is for this across
> > > the tree? There are 80-ish users of csum_ipv6_magic().
> >
> > Hi Dave
> >
> > Sure (allyesconfig build)
>
> Does't allyesconfig pull in all the KASAN stuff as well.
> Which makes it fairly useless for normal build tests.

This is why I added a more standard build.

I do not think we have a "make allyesconfig_but_no_debug_stuff"

BTW, inlining rb_first() also saves 744 bytes.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: inline csum_ipv6_magic()
  2025-11-13 18:40     ` Dave Hansen
@ 2025-12-23  5:03       ` Eric Dumazet
  0 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2025-12-23  5:03 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, David S . Miller, Jakub Kicinski, Paolo Abeni,
	linux-kernel, Simon Horman, Kuniyuki Iwashima, netdev,
	Eric Dumazet

On Thu, Nov 13, 2025 at 7:40 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 11/13/25 10:18, Eric Dumazet wrote:
> > So it would seem the patch saves 371 bytes for this config.
> >
> >> Or, is there a discrete, measurable performance gain from doing this?
> > IPv6 incoming TCP/UDP paths call this function twice per packet, which is sad...
> > One call per TX packet.
> >
> > Depending on the cpus I can see csum_ipv6_magic() using up to 0.75 %
> > of cpu cycles.
> > Then there is the cost in the callers, harder to measure...
>
> Oh, wow. That's more than I was expecting. But it does make sense.
> Thanks for the info. I'll stick this in the queue to apply in a month or
> so after the next -rc1, unless it needs more urgency.
>
> Acked-by: Dave Hansen <dave.hansen@linux.intel.com>

Gentle ping, I have not seen this patch reaching the tip tree.

Thanks a lot !

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-12-23  5:03 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-13 15:45 [PATCH] x86_64: inline csum_ipv6_magic() Eric Dumazet
2025-11-13 16:26 ` Dave Hansen
2025-11-13 18:18   ` Eric Dumazet
2025-11-13 18:40     ` Dave Hansen
2025-12-23  5:03       ` Eric Dumazet
2025-11-13 20:03     ` David Laight
2025-11-13 20:12       ` Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).