[PATCH v2 net-next] net: Implement fast csum_partial for x86_64

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Tom Herbert <tom@herbertland.com>
To: <davem@davemloft.net>, <netdev@vger.kernel.org>
Cc: <kernel-team@fb.com>, <tglx@linutronix.de>, <mingo@redhat.com>,
	<hpa@zytor.com>, <x86@kernel.org>
Subject: [PATCH v2 net-next] net: Implement fast csum_partial for x86_64
Date: Tue, 5 Jan 2016 10:41:01 -0800	[thread overview]
Message-ID: <1452019261-449449-1-git-send-email-tom@herbertland.com> (raw)

Implement an assembly routine for csum_partial on 64-bit x86. This
primarily speeds up checksum calculation for smaller lengths, such as
those seen when doing skb_postpull_rcsum after getting
CHECKSUM_COMPLETE from the device or after CHECKSUM_UNNECESSARY
conversion.

This implementation is similar to the csum_partial implemented in
checksum_32.S; however, since we are dealing with 8 bytes at a time
there are more cases for small lengths, and for those we employ a jump
table. Also, we don't do anything special for alignment, since unaligned
accesses on x86 do not appear to be a performance issue.

Testing:

Verified correctness by testing arbitrary-length buffers filled with
random data. For each buffer I compared the checksum computed by the
new routine against that of the original algorithm for each possible
alignment (0-7 bytes).

Checksum performance:

Isolating old and new implementation for some common cases:

                        Old      New
Case                    nsecs    nsecs     Improvement
---------------------+--------+--------+-----------------------------
1400 bytes (0 align)    194.5    174.3     10%    (Big packet)
40 bytes (0 align)      13.8     5.8       57%    (IPv6 hdr common case)
8 bytes (4 align)       8.4      2.9       65%    (UDP, VXLAN in IPv4)
14 bytes (0 align)      10.6     5.8       45%    (Eth hdr)
14 bytes (4 align)      10.8     5.8       46%    (Eth hdr in IPv4)

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 arch/x86/include/asm/checksum_64.h |   5 ++
 arch/x86/lib/csum-partial_64.S     | 147 ++++++++++++++++++++++++++++++++++++
 arch/x86/lib/csum-partial_64.c     | 148 -------------------------------------
 3 files changed, 152 insertions(+), 148 deletions(-)
 create mode 100644 arch/x86/lib/csum-partial_64.S
 delete mode 100644 arch/x86/lib/csum-partial_64.c

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index cd00e17..a888f65 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -128,6 +128,11 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
  */
 extern __wsum csum_partial(const void *buff, int len, __wsum sum);
 
+static inline __sum16 ip_compute_csum(const void *buff, int len)
+{
+	return csum_fold(csum_partial(buff, len, 0)); /* initial sum is 0 */
+}
+
 #define  _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
 #define HAVE_CSUM_COPY_USER 1
 
diff --git a/arch/x86/lib/csum-partial_64.S b/arch/x86/lib/csum-partial_64.S
new file mode 100644
index 0000000..8e387bb
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.S
@@ -0,0 +1,147 @@
+/* Copyright 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * Checksum partial calculation
+ *
+ * __wsum csum_partial(const void *buff, int len, __wsum sum)
+ *
+ * Computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * Returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * Register usage:
+ *   %rdi: argument 1, buff
+ *   %rsi: argument 2, length
+ *   %rdx: argument 3, add in value
+ *   %rax,%eax: accumulator and return value
+ *   %rcx,%ecx: counter and tmp
+ *   %r11: tmp
+ *
+ * Basic algorithm:
+ *   1) Sum 8 bytes at a time using adcq (unroll main loop
+ *      to do 64 bytes at a time)
+ *   2) Sum remaining length (less than 8 bytes)
+ *
+ * Note that buffer alignment is not considered; unaligned accesses on x86 don't
+ * seem to be a performance hit (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set).
+ */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/asm.h>
+
+#define branch_tbl_len		.L_branch_tbl_len
+
+ENTRY(csum_partial)		/* in: %rdi=buff %esi=len %edx=sum; out: %eax */
+	movl	%edx, %eax	/* Seed accumulator; also zeroes %rax[63:32] */
+
+	/* Three-way dispatch on len (signed compare): >8, <8, or exactly 8 */
+	cmpl	$8, %esi
+	jg	10f
+	jl	20f
+
+	/* Exactly 8 bytes: two 32-bit adds, then fold the final carry */
+	addl	(%rdi), %eax
+	adcl	4(%rdi), %eax
+	adcl	$0, %eax
+	ret
+
+	/* len < 8: andl clears CF and zero-extends %rsi for the table index */
+20:	andl	$0x7, %esi	/* no-op on 0..7; bounds a negative len */
+	jmpq *branch_tbl_len(, %rsi, 8)
+
+	/* Greater than 8 bytes length. Let n = len / 8 (whole quads). Sum the
+	 * first n % 8 quads by jumping backwards from 20f into the adcq run.
+	 */
+10:	movl	%esi, %ecx
+	shrl	$3, %ecx		/* n = len / 8 */
+	andl	$0x7, %ecx		/* n % 8 */
+	negq	%rcx			/* -(n % 8) */
+	lea	20f(, %rcx, 4), %r11	/* steps back 4 code bytes per quad */
+	clc
+	jmp	*%r11
+
+.align 8
+	adcq	6*8(%rdi),%rax
+	adcq	5*8(%rdi),%rax
+	adcq	4*8(%rdi),%rax
+	adcq	3*8(%rdi),%rax
+	adcq	2*8(%rdi),%rax
+	adcq	1*8(%rdi),%rax
+	adcq	0*8(%rdi),%rax
+	nop			/* pad so the 0*8 entry starts at 20f - 4 */
+20:	/* #quads % 8 jump table base */
+
+	adcq	$0, %rax		/* fold the carry left by the adcq run */
+	shlq	$3, %rcx		/* -(n % 8) * 8 bytes */
+	subq	%rcx, %rdi		/* %rcx <= 0, so this advances buff */
+
+	/* Now determine number of blocks of 8 quads. Sum 64 bytes at a time
+	 * using unrolled loop.
+	 */
+	movl	%esi, %ecx
+	shrl	$6, %ecx		/* block count = len / 64 */
+	jz	30f			/* no full 64-byte block */
+	clc
+
+	/* Main loop: lea and loop do not modify CF, so the chain carries on */
+40:	adcq	0*8(%rdi),%rax
+	adcq	1*8(%rdi),%rax
+	adcq	2*8(%rdi),%rax
+	adcq	3*8(%rdi),%rax
+	adcq	4*8(%rdi),%rax
+	adcq	5*8(%rdi),%rax
+	adcq	6*8(%rdi),%rax
+	adcq	7*8(%rdi),%rax
+	lea	64(%rdi), %rdi
+	loop	40b
+
+	adcq	$0, %rax		/* fold the carry of the last block */
+
+	/* Handle remaining length which is < 8 bytes */
+30:	andl	$0x7, %esi		/* tail byte count; %rsi[63:32] = 0 */
+
+	/* Fold 64 bit sum to 32 bits; the carry of addl feeds the tail adc */
+	movq	%rax, %rcx
+	shrq	$32, %rcx
+	addl	%ecx, %eax
+
+	jmpq *branch_tbl_len(, %rsi, 8)
+
+/* Length table targets: %rsi = tail length, CF = carry still to be added */
+
+107:	/* Length 7 */
+	adcw	4(%rdi), %ax
+105:	/* Length 5 */
+	adcw	2(%rdi), %ax
+103:	/* Length 3 */
+	adcw	(%rdi), %ax
+101:	/* Length 1, grab the odd byte */
+	adcb	-1(%rdi, %rsi), %al	/* last byte of the tail */
+	adcb	$0, %ah			/* carry out of %al into %ah */
+	adcl	$0, %eax
+	ret
+106:	/* Length 6 */
+	adcw	4(%rdi), %ax
+104:	/* Length 4 */
+	adcl	(%rdi), %eax
+	adcl	$0, %eax
+	ret
+102:	/* Length 2 */
+	adcw	(%rdi), %ax
+100:	/* Length 0 */
+	adcl	$0, %eax
+	ret
+
+.section .rodata
+.align 64
+.L_branch_tbl_len:		/* 8-byte code addresses used by the jmpq sites */
+	.quad	100b		/* index 0: "Length 0" target */
+	.quad	101b		/* index 1: "Length 1" target */
+	.quad	102b		/* index 2: "Length 2" target */
+	.quad	103b		/* index 3: "Length 3" target */
+	.quad	104b		/* index 4: "Length 4" target */
+	.quad	105b		/* index 5: "Length 5" target */
+	.quad	106b		/* index 6: "Length 6" target */
+	.quad	107b		/* index 7: "Length 7" target */
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
deleted file mode 100644
index 9845371..0000000
--- a/arch/x86/lib/csum-partial_64.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * arch/x86_64/lib/csum-partial.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed.
- */
- 
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <asm/checksum.h>
-
-static inline unsigned short from32to16(unsigned a) 
-{
-	unsigned short b = a >> 16; 
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n" 
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
-}
-
-/*
- * Do a 64-bit checksum on an arbitrary memory area.
- * Returns a 32bit checksum.
- *
- * This isn't as time critical as it used to be because many NICs
- * do hardware checksumming these days.
- * 
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
- */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
-{
-	unsigned odd, count;
-	unsigned long result = 0;
-
-	if (unlikely(len == 0))
-		return result; 
-	odd = 1 & (unsigned long) buff;
-	if (unlikely(odd)) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) { 
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				    "[res]" (result));
-				buff += 64;
-				count64--;
-			}
-
-			/* last up to 7 8byte blocks */
-			count %= 8; 
-			while (count) { 
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n" 
-					    : "=r" (result)
-				    : "m" (*(unsigned long *)buff), 
-				    "r" (zero),  "0" (result));
-				--count; 
-					buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff); 
-
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff); 
-	if (unlikely(odd)) { 
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
-	return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
-}
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum(const void *buff, int len)
-{
-	return csum_fold(csum_partial(buff,len,0));
-}
-EXPORT_SYMBOL(ip_compute_csum);
-
-- 
2.4.6

next             reply	other threads:[~2016-01-05 18:42 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-01-05 18:41 Tom Herbert [this message]
2016-01-05 22:18 ` [PATCH v2 net-next] net: Implement fast csum_partial for x86_64 Eric Dumazet
2016-01-06  1:10   ` H. Peter Anvin
2016-01-06  3:02     ` Eric Dumazet
2016-01-06 10:16   ` David Laight
2016-01-06 14:25     ` Eric Dumazet
2016-01-06 14:49       ` David Laight
2016-01-06 15:03         ` Eric Dumazet
2016-01-05 23:35 ` Hannes Frederic Sowa
2016-01-06  3:21   ` Eric Dumazet
2016-01-06 20:05 ` Andi Kleen
2016-01-07  1:52 ` Hannes Frederic Sowa
2016-01-07  2:36   ` Tom Herbert
2016-01-07  2:43     ` Hannes Frederic Sowa

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:cd00e17 dfblob:a888f65 dfblob:8e387bb dfblob:9845371 )
 OR (
bs:"[PATCH v2 net-next] net: Implement fast csum_partial for x86_64" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1452019261-449449-1-git-send-email-tom@herbertland.com \
    --to=tom@herbertland.com \
    --cc=davem@davemloft.net \
    --cc=hpa@zytor.com \
    --cc=kernel-team@fb.com \
    --cc=mingo@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.