[PATCH 0 of 2] Add memcpy_uncached_read, a memcpy that doesn't cache reads

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0 of 2] Add memcpy_uncached_read, a memcpy that doesn't cache reads
@ 2006-12-13 16:57 Bryan O'Sullivan
  2006-12-13 16:57 ` [PATCH 1 of 2] Add memcpy_uncached_read, a memcpy that tries to reduce cache pressure Bryan O'Sullivan
  2006-12-13 16:57 ` [PATCH 2 of 2] IB/ipath - use memcpy_uncached_read in RDMA interrupt handler to reduce packet loss Bryan O'Sullivan
  0 siblings, 2 replies; 3+ messages in thread
From: Bryan O'Sullivan @ 2006-12-13 16:57 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, openib-general

Hi, Andrew -

Here's a suitably renamed uncached-read memcpy.  I hope the name is now
self-explanatory.

	<b

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1 of 2] Add memcpy_uncached_read, a memcpy that tries to reduce cache pressure
  2006-12-13 16:57 [PATCH 0 of 2] Add memcpy_uncached_read, a memcpy that doesn't cache reads Bryan O'Sullivan
@ 2006-12-13 16:57 ` Bryan O'Sullivan
  2006-12-13 16:57 ` [PATCH 2 of 2] IB/ipath - use memcpy_uncached_read in RDMA interrupt handler to reduce packet loss Bryan O'Sullivan
  1 sibling, 0 replies; 3+ messages in thread
From: Bryan O'Sullivan @ 2006-12-13 16:57 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, openib-general

This copy routine is memcpy-compatible, but on some architectures will use
cache-bypassing loads to avoid bringing the source data into the cache.

One case where this is useful is when a device issues a DMA to a memory
region, and the CPU must copy the DMAed data elsewhere before doing any
work with it.  Since the source data is read-once, write-never from the
CPU's perspective, caching the data at those addresses can only evict
potentially useful data.

We provide an x86_64 implementation that issues SSE non-temporal prefetches
(prefetchnta) ahead of otherwise ordinary loads, and a generic version that
falls back to plain memcpy.

Implementors for other arches should not use cache-bypassing stores to
the destination, as in most cases, the destination is accessed almost
immediately after a copy finishes.

Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>

diff -r 4a0c3ede5076 -r e7c3b265254b arch/x86_64/lib/Makefile
--- a/arch/x86_64/lib/Makefile	Tue Dec 12 10:43:21 2006 -0800
+++ b/arch/x86_64/lib/Makefile	Wed Dec 13 09:51:09 2006 -0800
@@ -9,4 +9,5 @@ lib-y := csum-partial.o csum-copy.o csum
 lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
 	usercopy.o getuser.o putuser.o  \
 	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o \
+	memcpy_uncached_read.o
diff -r 4a0c3ede5076 -r e7c3b265254b arch/x86_64/lib/memcpy_uncached_read.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/arch/x86_64/lib/memcpy_uncached_read.S	Wed Dec 13 09:51:09 2006 -0800
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 QLogic Corporation.  All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * memcpy_uncached_read - memcpy-compatible copy routine, using streaming loads
+ * @dest: destination address
+ * @src: source address (will not be cached)
+ * @count: number of bytes to copy
+ *
+ * Use streaming loads and normal stores for a special-case copy where
+ * we know we won't be reading the source again, but will be reading the
+ * destination again soon.
+ */
+	.text
+	.p2align 4,,15	/* align the entry point to 16 bytes */
+	/* rdi  destination, rsi source, rdx count */
+	.globl	memcpy_uncached_read
+	.type	memcpy_uncached_read, @function
+memcpy_uncached_read:
+	movq	%rdi, %rax	/* return value: the original destination */
+.L5:	/* dispatch on the number of bytes still to copy (rdx) */
+	cmpq	$15, %rdx
+	ja	.L34	/* 16 or more bytes left: use one of the block loops */
+.L3:
+	cmpl	$8, %edx	/* rdx is 0..15 */
+	jbe	.L9	/* 0..8 bytes: go through the jump table */
+.L6:
+	testb	$8, %dxl	/* rdx is 3,5,6,7,9..15 */
+	je	.L13	/* bit 3 of count clear: no 8-byte piece */
+	movq	(%rsi), %rcx
+	addq	$8, %rsi
+	movq	%rcx, (%rdi)
+	addq	$8, %rdi
+.L13:
+	testb	$4, %dxl
+	je	.L15	/* bit 2 of count clear: no 4-byte piece */
+	movl	(%rsi), %ecx
+	addq	$4, %rsi
+	movl	%ecx, (%rdi)
+	addq	$4, %rdi
+.L15:
+	testb	$2, %dxl
+	je	.L17	/* bit 1 of count clear: no 2-byte piece */
+	movzwl	(%rsi), %ecx
+	addq	$2, %rsi
+	movw	%cx, (%rdi)
+	addq	$2, %rdi
+.L17:
+	testb	$1, %dxl
+	je	.L33	/* bit 0 of count clear: nothing left */
+.L1:	/* copy one byte; also the jump-table target for count == 1 */
+	movzbl	(%rsi), %ecx
+	movb	%cl, (%rdi)
+.L33:	/* common exit; also the jump-table target for count == 0 */
+	ret
+.L34:
+	cmpq	$63, %rdx	/* rdx is > 15 */
+	ja	.L64	/* 64 or more bytes: use the 64-byte loop */
+	movl	$16, %ecx	/* rdx is 16..63 */
+.L25:	/* copy 16 bytes per iteration; rcx holds the step (16) */
+	movq	8(%rsi), %r8
+	movq	(%rsi), %r9
+	addq	%rcx, %rsi
+	movq	%r8, 8(%rdi)
+	movq	%r9, (%rdi)
+	addq	%rcx, %rdi
+	subq	%rcx, %rdx
+	cmpl	%edx, %ecx	/* is rdx >= 16? */
+	jbe	.L25
+	jmp	.L3		/* rdx is 0..15 */
+	.p2align 4,,7
+.L64:
+	movl	$64, %ecx	/* rcx = step for the 64-byte loop */
+.L42:	/* copy 64 bytes per iteration, loads and stores interleaved */
+	prefetchnta	128(%rsi)	/* non-temporal prefetch hint, 128 bytes ahead of src */
+	movq	(%rsi), %r8
+	movq	8(%rsi), %r9
+	movq	16(%rsi), %r10
+	movq	24(%rsi), %r11
+	subq	%rcx, %rdx	/* count -= 64, done early in the iteration */
+	movq	%r8, (%rdi)
+	movq	32(%rsi), %r8	/* r8..r11 are reused for the second 32 bytes */
+	movq	%r9, 8(%rdi)
+	movq	40(%rsi), %r9
+	movq	%r10, 16(%rdi)
+	movq	48(%rsi), %r10
+	movq	%r11, 24(%rdi)
+	movq	56(%rsi), %r11
+	addq	%rcx, %rsi
+	movq	%r8, 32(%rdi)
+	movq	%r9, 40(%rdi)
+	movq	%r10, 48(%rdi)
+	movq	%r11, 56(%rdi)
+	addq	%rcx, %rdi
+	cmpq	%rdx, %rcx	/* is rdx >= 64? */
+	jbe	.L42
+	sfence	/* NOTE(review): the loop above uses only ordinary stores - confirm this fence is needed */
+	orl	%edx, %edx	/* sets ZF when no bytes remain (rdx is 0..63 here) */
+	je	.L33
+	jmp	.L5	/* 1..63 bytes left: dispatch again on the remainder */
+.L9:
+	jmp	*.L12(,%rdx,8)	/* rdx is 0..8 */
+	.section	.rodata
+	.align 8
+	.align 4	/* NOTE(review): looks redundant after the .align 8 above */
+.L12:	/* jump table indexed by count, one 8-byte entry per value 0..8 */
+	.quad	.L33	/* 0: nothing to copy */
+	.quad	.L1	/* 1: single byte */
+	.quad	.L2	/* 2: single 16-bit move */
+	.quad	.L6	/* 3: bit-tested tail */
+	.quad	.L4	/* 4: single 32-bit move */
+	.quad	.L6	/* 5: bit-tested tail */
+	.quad	.L6	/* 6: bit-tested tail */
+	.quad	.L6	/* 7: bit-tested tail */
+	.quad	.L8	/* 8: single 64-bit move */
+	.text
+.L2:	/* count == 2 */
+	movzwl	(%rsi), %ecx
+	movw	%cx, (%rdi)
+	ret
+.L4:	/* count == 4 */
+	movl	(%rsi), %ecx
+	movl	%ecx, (%rdi)
+	ret
+.L8:	/* count == 8 */
+	movq	(%rsi), %rcx
+	movq	%rcx, (%rdi)
+	ret
diff -r 4a0c3ede5076 -r e7c3b265254b include/asm-x86_64/string.h
--- a/include/asm-x86_64/string.h	Tue Dec 12 10:43:21 2006 -0800
+++ b/include/asm-x86_64/string.h	Wed Dec 13 09:51:09 2006 -0800
@@ -39,6 +39,8 @@ extern void *__memcpy(void *to, const vo
 		 __ret = __builtin_memcpy((dst),(src),__len);	\
 	   __ret; }) 
 
+#define __HAVE_ARCH_MEMCPY_UNCACHED_READ
+extern void *memcpy_uncached_read(void *to, const void *from, size_t len); 
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
diff -r 4a0c3ede5076 -r e7c3b265254b include/linux/string.h
--- a/include/linux/string.h	Tue Dec 12 10:43:21 2006 -0800
+++ b/include/linux/string.h	Wed Dec 13 09:51:09 2006 -0800
@@ -85,6 +85,9 @@ extern void * memset(void *,int,__kernel
 #ifndef __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *,const void *,__kernel_size_t);
 #endif
+#ifndef __HAVE_ARCH_MEMCPY_UNCACHED_READ
+#define memcpy_uncached_read(dest, src, count) memcpy((dest), (src), (count))
+#endif
 #ifndef __HAVE_ARCH_MEMMOVE
 extern void * memmove(void *,const void *,__kernel_size_t);
 #endif

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 2 of 2] IB/ipath - use memcpy_uncached_read in RDMA interrupt handler to reduce packet loss
  2006-12-13 16:57 [PATCH 0 of 2] Add memcpy_uncached_read, a memcpy that doesn't cache reads Bryan O'Sullivan
  2006-12-13 16:57 ` [PATCH 1 of 2] Add memcpy_uncached_read, a memcpy that tries to reduce cache pressure Bryan O'Sullivan
@ 2006-12-13 16:57 ` Bryan O'Sullivan
  1 sibling, 0 replies; 3+ messages in thread
From: Bryan O'Sullivan @ 2006-12-13 16:57 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, openib-general

In cases where a large incoming RDMA is being received, we have to
copy data inside the interrupt handler before we can ACK each packet.
The source is DMAed to by the hardware, which means that the CPU won't
have it cached.  We only read the source this one time; using normal load
instructions pollutes the dcache with useless data, reducing performance
to the point where we can lose a significant number of packets.

We use memcpy_uncached_read to try to not fill the dcache with useless data.
Avoiding the cache refill penalty lets us keep up better with the sender,
resulting in many fewer dropped packets.

Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>

diff -r e7c3b265254b -r f25d77f76998 drivers/infiniband/hw/ipath/ipath_verbs.c
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c	Wed Dec 13 09:51:09 2006 -0800
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c	Wed Dec 13 09:51:09 2006 -0800
@@ -167,7 +167,7 @@ void ipath_copy_sge(struct ipath_sge_sta
 		BUG_ON(len == 0);
 		if (len > length)
 			len = length;
-		memcpy(sge->vaddr, data, len);
+		memcpy_uncached_read(sge->vaddr, data, len);
 		sge->vaddr += len;
 		sge->length -= len;
 		sge->sge_length -= len;

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2006-12-13 18:31 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-12-13 16:57 [PATCH 0 of 2] Add memcpy_uncached_read, a memcpy that doesn't cache reads Bryan O'Sullivan
2006-12-13 16:57 ` [PATCH 1 of 2] Add memcpy_uncached_read, a memcpy that tries to reduce cache pressure Bryan O'Sullivan
2006-12-13 16:57 ` [PATCH 2 of 2] IB/ipath - use memcpy_uncached_read in RDMA interrupt handler to reduce packet loss Bryan O'Sullivan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox