[PATCH 1/3] powerpc: Optimise 64bit csum

linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/3] powerpc: Optimise 64bit csum_partial
@ 2010-08-03  6:08 Anton Blanchard
  2010-08-03  6:09 ` [PATCH 2/3] powerpc: Optimise 64bit csum_partial_copy_generic and add csum_and_copy_from_user Anton Blanchard
  2010-08-03 15:09 ` [PATCH 1/3] powerpc: Optimise 64bit csum_partial Segher Boessenkool
  0 siblings, 2 replies; 6+ messages in thread
From: Anton Blanchard @ 2010-08-03  6:08 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev


The main loop of csum_partial runs very slowly on recent POWER CPUs. After some
analysis on both POWER6 and POWER7 I came up with routine below. First we get
the source aligned to a double word, ignoring any odd alignment to keep things
simple. Then we do 64 bytes at a time, with an entry and exit limb of a further
64 bytes. On both POWER6 and POWER7 this should be as fast as we can go since
we are limited by the latency of the adde instructions.

To test this I forced checksumming on over loopback and ran socklib (a
simple TCP benchmark). On a POWER6 575 throughput improved by 11% with
this patch.

Signed-off-by: Anton Blanchard <anton@samba.org>
--

Index: powerpc.git/arch/powerpc/lib/checksum_64.S
===================================================================
--- powerpc.git.orig/arch/powerpc/lib/checksum_64.S	2010-08-03 13:32:45.291991557 +1000
+++ powerpc.git/arch/powerpc/lib/checksum_64.S	2010-08-03 15:16:59.600753385 +1000
@@ -65,55 +65,168 @@ _GLOBAL(csum_tcpudp_magic)
 	srwi	r3,r3,16
 	blr
 
+#define STACKFRAMESIZE 256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
 /*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
  *
- * This code assumes at least halfword alignment, though the length
- * can be any number of bytes.  The sum is accumulated in r5.
- *
  * csum_partial(r3=buff, r4=len, r5=sum)
  */
 _GLOBAL(csum_partial)
-        subi	r3,r3,8		/* we'll offset by 8 for the loads */
-        srdi.	r6,r4,3         /* divide by 8 for doubleword count */
-        addic   r5,r5,0         /* clear carry */
-        beq	3f              /* if we're doing < 8 bytes */
-        andi.	r0,r3,2         /* aligned on a word boundary already? */
-        beq+	1f
-        lhz     r6,8(r3)        /* do 2 bytes to get aligned */
-        addi    r3,r3,2
-        subi    r4,r4,2
-        addc    r5,r5,r6
-        srdi.   r6,r4,3         /* recompute number of doublewords */
-        beq     3f              /* any left? */
-1:      mtctr   r6
-2:      ldu     r6,8(r3)        /* main sum loop */
-        adde    r5,r5,r6
-        bdnz    2b
-        andi.	r4,r4,7         /* compute bytes left to sum after doublewords */
-3:	cmpwi	0,r4,4		/* is at least a full word left? */
-	blt	4f
-	lwz	r6,8(r3)	/* sum this word */
+	addic	r0,r5,0			/* clear carry */
+
+	srdi.	r6,r4,3			/* less than 8 bytes? */
+	beq	.Lcsum_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcsum_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+	lhz	r6,0(r3)		/* align to doubleword */
+	subi	r4,r4,2
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	bdnz	1b
+
+.Lcsum_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r4,7
+	beq	.Lcsum_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r4,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+
+	adde	r0,r0,r11
+
+	adde	r0,r0,r12
+
+	adde	r0,r0,r14
+
+	adde	r0,r0,r15
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+	bdnz	2b
+
+
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+	adde	r0,r0,r11
+	adde	r0,r0,r12
+	adde	r0,r0,r14
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r4,r4,63
+
+.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r4,3
+	beq	.Lcsum_tail_word
+
+	mtctr	r6
+3:
+	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+	bdnz	3b
+
+	andi.	r4,r4,7
+
+.Lcsum_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r4,2
+	beq	.Lcsum_tail_halfword
+
+	lwz	r6,0(r3)
 	addi	r3,r3,4
+	adde	r0,r0,r6
 	subi	r4,r4,4
-	adde	r5,r5,r6
-4:	cmpwi	0,r4,2		/* is at least a halfword left? */
-        blt+	5f
-        lhz     r6,8(r3)        /* sum this halfword */
-        addi    r3,r3,2
-        subi    r4,r4,2
-        adde    r5,r5,r6
-5:	cmpwi	0,r4,1		/* is at least a byte left? */
-        bne+    6f
-        lbz     r6,8(r3)        /* sum this byte */
-        slwi    r6,r6,8         /* this byte is assumed to be the upper byte of a halfword */
-        adde    r5,r5,r6
-6:      addze	r5,r5		/* add in final carry */
-	rldicl  r4,r5,32,0      /* fold two 32-bit halves together */
-        add     r3,r4,r5
-        srdi    r3,r3,32
-        blr
+
+.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r4,1
+	beq	.Lcsum_tail_byte
+
+	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	subi	r4,r4,2
+
+.Lcsum_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r4,1
+	beq	.Lcsum_finish
+
+	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+
+.Lcsum_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
 
 /*
  * Computes the checksum of a memory block at src, length len,

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 2/3] powerpc: Optimise 64bit csum_partial_copy_generic and add csum_and_copy_from_user
  2010-08-03  6:08 [PATCH 1/3] powerpc: Optimise 64bit csum_partial Anton Blanchard
@ 2010-08-03  6:09 ` Anton Blanchard
  2010-08-03  6:11   ` [PATCH 3/3] powerpc: Add 64bit csum_and_copy_to_user Anton Blanchard
  2010-08-03 15:09 ` [PATCH 1/3] powerpc: Optimise 64bit csum_partial Segher Boessenkool
  1 sibling, 1 reply; 6+ messages in thread
From: Anton Blanchard @ 2010-08-03  6:09 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev


We use the same core loop as the new csum_partial, adding in the
stores and exception handling code. To keep things simple we do all the
exception fixup in csum_and_copy_from_user. This wrapper function is
modelled on the generic checksum code and is careful to always calculate
a complete checksum even if we only copied part of the data to userspace.

To test this I forced checksumming on over loopback and ran socklib (a
simple TCP benchmark). On a POWER6 575 throughput improved by 19% with
this patch. If I forced both the sender and receiver onto the same cpu
(with the hope of shifting the benchmark from being cache bandwidth limited
to cpu limited), adding this patch improved performance by 55%

Signed-off-by: Anton Blanchard <anton@samba.org>
--

Index: powerpc.git/arch/powerpc/lib/checksum_64.S
===================================================================
--- powerpc.git.orig/arch/powerpc/lib/checksum_64.S	2010-08-03 15:16:59.600753385 +1000
+++ powerpc.git/arch/powerpc/lib/checksum_64.S	2010-08-03 16:00:32.030741261 +1000
@@ -228,115 +228,230 @@ _GLOBAL(csum_partial)
 	srdi	r3,r3,32
 	blr
 
+
+	.macro source
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Lsrc_error
+	.previous
+	.endm
+
+	.macro dest
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldest_error
+	.previous
+	.endm
+
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
  * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively, and (for an error on
- * src) zeroes the rest of dst.
- *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky.  For now we'll leave it until we instrument it somehow.
+ * to *src_err or *dst_err respectively. The caller must take any action
+ * required in this case (zeroing memory, recalculating partial checksum etc).
  *
  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
  */
 _GLOBAL(csum_partial_copy_generic)
-	addic	r0,r6,0
-	subi	r3,r3,4
-	subi	r4,r4,4
-	srwi.	r6,r5,2
-	beq	3f		/* if we're doing < 4 bytes */
-	andi.	r9,r4,2		/* Align dst to longword boundary */
-	beq+	1f
-81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
+	addic	r0,r6,0			/* clear carry */
+
+	srdi.	r6,r5,3			/* less than 8 bytes? */
+	beq	.Lcopy_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 *
+	 * If the source and destination are relatively unaligned we only
+	 * align the source. This keeps things simple.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcopy_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+source;	lhz	r6,0(r3)		/* align to doubleword */
 	subi	r5,r5,2
-91:	sth	r6,4(r4)
-	addi	r4,r4,2
-	addc	r0,r0,r6
-	srwi.	r6,r5,2		/* # words to do */
-	beq	3f
-1:	mtctr	r6
-82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
-92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
-	adde	r0,r0,r6
-	bdnz	82b
-	andi.	r5,r5,3
-3:	cmpwi	0,r5,2
-	blt+	4f
-83:	lhz	r6,4(r3)
 	addi	r3,r3,2
-	subi	r5,r5,2
-93:	sth	r6,4(r4)
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
 	addi	r4,r4,2
+	bdnz	1b
+
+.Lcopy_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r5,7
+	beq	.Lcopy_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r5,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
 	adde	r0,r0,r6
-4:	cmpwi	0,r5,1
-	bne+	5f
-84:	lbz	r6,4(r3)
-94:	stb	r6,4(r4)
-	slwi	r6,r6,8		/* Upper byte of word */
-	adde	r0,r0,r6
-5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
-        rldicl  r4,r3,32,0      /* fold 64 bit value */
-        add     r3,r4,r3
-        srdi    r3,r3,32
-	blr
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
 
-/* These shouldn't go in the fixup section, since that would
-   cause the ex_table addresses to get out of order. */
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+	bdnz	2b
+
+
+	adde	r0,r0,r6
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r5,r5,63
+
+.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r5,3
+	beq	.Lcopy_tail_word
 
-	.globl src_error_1
-src_error_1:
-	li	r6,0
-	subi	r5,r5,2
-95:	sth	r6,4(r4)
-	addi	r4,r4,2
-	srwi.	r6,r5,2
-	beq	3f
 	mtctr	r6
-	.globl src_error_2
-src_error_2:
-	li	r6,0
-96:	stwu	r6,4(r4)
-	bdnz	96b
-3:	andi.	r5,r5,3
-	beq	src_error
-	.globl src_error_3
-src_error_3:
-	li	r6,0
-	mtctr	r5
-	addi	r4,r4,3
-97:	stbu	r6,1(r4)
-	bdnz	97b
-	.globl src_error
-src_error:
+3:
+source;	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+dest;	std	r6,0(r4)
+	addi	r4,r4,8
+	bdnz	3b
+
+	andi.	r5,r5,7
+
+.Lcopy_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r5,2
+	beq	.Lcopy_tail_halfword
+
+source;	lwz	r6,0(r3)
+	addi	r3,r3,4
+	adde	r0,r0,r6
+dest;	stw	r6,0(r4)
+	addi	r4,r4,4
+	subi	r5,r5,4
+
+.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r5,1
+	beq	.Lcopy_tail_byte
+
+source;	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
+	addi	r4,r4,2
+	subi	r5,r5,2
+
+.Lcopy_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r5,1
+	beq	.Lcopy_finish
+
+source;	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+dest;	stb	r6,0(r4)
+
+.Lcopy_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
+
+.Lsrc_error:
 	cmpdi	0,r7,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r7)
-1:	addze	r3,r0
 	blr
 
-	.globl dst_error
-dst_error:
+.Ldest_error:
 	cmpdi	0,r8,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r8)
-1:	addze	r3,r0
 	blr
-
-.section __ex_table,"a"
-	.align  3
-	.llong	81b,src_error_1
-	.llong	91b,dst_error
-	.llong	82b,src_error_2
-	.llong	92b,dst_error
-	.llong	83b,src_error_3
-	.llong	93b,dst_error
-	.llong	84b,src_error_3
-	.llong	94b,dst_error
-	.llong	95b,dst_error
-	.llong	96b,dst_error
-	.llong	97b,dst_error
Index: powerpc.git/arch/powerpc/lib/checksum_wrappers_64.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ powerpc.git/arch/powerpc/lib/checksum_wrappers_64.c	2010-08-03 15:58:43.214492712 +1000
@@ -0,0 +1,65 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2010
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/checksum.h>
+#include <asm/uaccess.h>
+
+__wsum csum_and_copy_from_user(const void __user *src, void *dst,
+			       int len, __wsum sum, int *err_ptr)
+{
+	unsigned int csum;
+
+	might_sleep();
+
+	*err_ptr = 0;
+
+	if (!len) {
+		csum = 0;
+		goto out;
+	}
+
+	if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) {
+		*err_ptr = -EFAULT;
+		csum = (__force unsigned int)sum;
+		goto out;
+	}
+
+	csum = csum_partial_copy_generic((void __force *)src, dst,
+					 len, sum, err_ptr, NULL);
+
+	if (unlikely(*err_ptr)) {
+		int missing = __copy_from_user(dst, src, len);
+
+		if (missing) {
+			memset(dst + len - missing, 0, missing);
+			*err_ptr = -EFAULT;
+		} else {
+			*err_ptr = 0;
+		}
+
+		csum = csum_partial(dst, len, sum);
+	}
+
+out:
+	return (__force __wsum)csum;
+}
+EXPORT_SYMBOL(csum_and_copy_from_user);
Index: powerpc.git/arch/powerpc/lib/Makefile
===================================================================
--- powerpc.git.orig/arch/powerpc/lib/Makefile	2010-08-03 15:16:35.310741991 +1000
+++ powerpc.git/arch/powerpc/lib/Makefile	2010-08-03 15:17:58.240742747 +1000
@@ -17,7 +17,8 @@ obj-$(CONFIG_PPC32)	+= div64.o copy_32.o
 obj-$(CONFIG_HAS_IOMEM)	+= devres.o
 
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
-			   memcpy_64.o usercopy_64.o mem_64.o string.o
+			   memcpy_64.o usercopy_64.o mem_64.o string.o \
+			   checksum_wrappers_64.o
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
Index: powerpc.git/arch/powerpc/include/asm/checksum.h
===================================================================
--- powerpc.git.orig/arch/powerpc/include/asm/checksum.h	2010-08-03 15:16:35.320741299 +1000
+++ powerpc.git/arch/powerpc/include/asm/checksum.h	2010-08-03 15:58:43.234490654 +1000
@@ -52,12 +52,19 @@ extern __wsum csum_partial(const void *b
 extern __wsum csum_partial_copy_generic(const void *src, void *dst,
 					      int len, __wsum sum,
 					      int *src_err, int *dst_err);
+
+#ifdef __powerpc64__
+#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER
+extern __wsum csum_and_copy_from_user(const void __user *src, void *dst,
+				      int len, __wsum sum, int *err_ptr);
+#else
 /*
  * the same as csum_partial, but copies from src to dst while it
  * checksums.
  */
 #define csum_partial_copy_from_user(src, dst, len, sum, errp)   \
         csum_partial_copy_generic((__force const void *)(src), (dst), (len), (sum), (errp), NULL)
+#endif
 
 #define csum_partial_copy_nocheck(src, dst, len, sum)   \
         csum_partial_copy_generic((src), (dst), (len), (sum), NULL, NULL)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 3/3] powerpc: Add 64bit csum_and_copy_to_user
  2010-08-03  6:09 ` [PATCH 2/3] powerpc: Optimise 64bit csum_partial_copy_generic and add csum_and_copy_from_user Anton Blanchard
@ 2010-08-03  6:11   ` Anton Blanchard
  0 siblings, 0 replies; 6+ messages in thread
From: Anton Blanchard @ 2010-08-03  6:11 UTC (permalink / raw)
  To: benh, paulus; +Cc: linuxppc-dev


This adds the equivalent of csum_and_copy_from_user for the receive side so we
can copy and checksum in one pass. It is modelled on the generic checksum
routine.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Index: powerpc.git/arch/powerpc/include/asm/checksum.h
===================================================================
--- powerpc.git.orig/arch/powerpc/include/asm/checksum.h	2010-08-03 16:02:02.234491674 +1000
+++ powerpc.git/arch/powerpc/include/asm/checksum.h	2010-08-03 16:04:10.733241094 +1000
@@ -57,6 +57,9 @@ extern __wsum csum_partial_copy_generic(
 #define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER
 extern __wsum csum_and_copy_from_user(const void __user *src, void *dst,
 				      int len, __wsum sum, int *err_ptr);
+#define HAVE_CSUM_COPY_USER
+extern __wsum csum_and_copy_to_user(const void *src, void __user *dst,
+				    int len, __wsum sum, int *err_ptr);
 #else
 /*
  * the same as csum_partial, but copies from src to dst while it
Index: powerpc.git/arch/powerpc/lib/checksum_wrappers_64.c
===================================================================
--- powerpc.git.orig/arch/powerpc/lib/checksum_wrappers_64.c	2010-08-03 16:02:02.234491674 +1000
+++ powerpc.git/arch/powerpc/lib/checksum_wrappers_64.c	2010-08-03 16:02:03.063242741 +1000
@@ -63,3 +63,40 @@ out:
 	return (__force __wsum)csum;
 }
 EXPORT_SYMBOL(csum_and_copy_from_user);
+
+__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len,
+			     __wsum sum, int *err_ptr)
+{
+	unsigned int csum;
+
+	might_sleep();
+
+	*err_ptr = 0;
+
+	if (!len) {
+		csum = 0;
+		goto out;
+	}
+
+	if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) {
+		*err_ptr = -EFAULT;
+		csum = -1; /* invalid checksum */
+		goto out;
+	}
+
+	csum = csum_partial_copy_generic(src, (void __force *)dst,
+					 len, sum, NULL, err_ptr);
+
+	if (unlikely(*err_ptr)) {
+		csum = csum_partial(src, len, sum);
+
+		if (copy_to_user(dst, src, len)) {
+			*err_ptr = -EFAULT;
+			csum = -1; /* invalid checksum */
+		}
+	}
+
+out:
+	return (__force __wsum)csum;
+}
+EXPORT_SYMBOL(csum_and_copy_to_user);

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/3] powerpc: Optimise 64bit csum_partial
  2010-08-03  6:08 [PATCH 1/3] powerpc: Optimise 64bit csum_partial Anton Blanchard
  2010-08-03  6:09 ` [PATCH 2/3] powerpc: Optimise 64bit csum_partial_copy_generic and add csum_and_copy_from_user Anton Blanchard
@ 2010-08-03 15:09 ` Segher Boessenkool
  2010-08-03 23:11   ` Anton Blanchard
  1 sibling, 1 reply; 6+ messages in thread
From: Segher Boessenkool @ 2010-08-03 15:09 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: paulus, linuxppc-dev

> On both POWER6 and POWER7 this should be as fast as we can go since
> we are limited by the latency of the adde instructions.

Not really.  Do you know how many 16/32-bit words you can add before a
64-bit register can overflow? :-)
If you ever have to call this with more than 16GB of data to sum, that's
easy to handle as well of course (just break it into pieces).


Segher

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/3] powerpc: Optimise 64bit csum_partial
  2010-08-03 15:09 ` [PATCH 1/3] powerpc: Optimise 64bit csum_partial Segher Boessenkool
@ 2010-08-03 23:11   ` Anton Blanchard
  2010-08-03 23:39     ` Segher Boessenkool
  0 siblings, 1 reply; 6+ messages in thread
From: Anton Blanchard @ 2010-08-03 23:11 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: paulus, linuxppc-dev

Hi Segher,

> Not really.  Do you know how many 16/32-bit words you can add before a
> 64-bit register can overflow? :-)

Thats a very good point. I thought about using 32bit adds when writing
the copy and checksum routine, but came to the conclusion that it wouldn't go
any faster than one using addes. The checksum only routine was the same loop
without the stores.

We rarely use csum_partial now we have copy and checksum to and from user
now, but I'll take a look at speeding it up in a follow on patch. Thanks!

Anton

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/3] powerpc: Optimise 64bit csum_partial
  2010-08-03 23:11   ` Anton Blanchard
@ 2010-08-03 23:39     ` Segher Boessenkool
  0 siblings, 0 replies; 6+ messages in thread
From: Segher Boessenkool @ 2010-08-03 23:39 UTC (permalink / raw)
  To: Anton Blanchard; +Cc: paulus, linuxppc-dev

>
> Hi Segher,
>
>> Not really.  Do you know how many 16/32-bit words you can add before a
>> 64-bit register can overflow? :-)
>
> Thats a very good point. I thought about using 32bit adds when writing
> the copy and checksum routine, but came to the conclusion that it wouldn't
> go
> any faster than one using addes.

Well, you now have one 64-bit word in two cycles, using one load and
an adde.

You can do 64-bits with two loads and two integer insns instead, or
one load and three integer insns.  It depends on your pipeline structure
what is best, I don't remember what POWER6/7 have exactly, but I bet
you do :-)

If you don't have to deal with the carry, you don't have to care about
the latency of your insns either, since you can just software pipeline it.

> The checksum only routine was the same
> loop
> without the stores.

The stores are just to copy, right?  So two loads/two stores/two integer
(per 64-bit), which probably works out to two cycles; or one load/
one store/ three integer, which is one or one and a half cycle.

Segher

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2010-08-03 23:40 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-08-03  6:08 [PATCH 1/3] powerpc: Optimise 64bit csum_partial Anton Blanchard
2010-08-03  6:09 ` [PATCH 2/3] powerpc: Optimise 64bit csum_partial_copy_generic and add csum_and_copy_from_user Anton Blanchard
2010-08-03  6:11   ` [PATCH 3/3] powerpc: Add 64bit csum_and_copy_to_user Anton Blanchard
2010-08-03 15:09 ` [PATCH 1/3] powerpc: Optimise 64bit csum_partial Segher Boessenkool
2010-08-03 23:11   ` Anton Blanchard
2010-08-03 23:39     ` Segher Boessenkool

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).