linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
@ 2010-02-10 20:37 adharmap at codeaurora.org
  2010-02-10 20:37 ` [PATCH 1/2] dma: define barrierless versions of map and unmap area adharmap at codeaurora.org
                   ` (3 more replies)
  0 siblings, 4 replies; 18+ messages in thread
From: adharmap at codeaurora.org @ 2010-02-10 20:37 UTC (permalink / raw)
  To: linux-arm-kernel

From: Abhijeet Dharmapurikar <adharmap@quicinc.com>

Please refer to the post here
http://lkml.org/lkml/2010/1/4/347

These changes are to introduce barrierless dma_map_area and dma_unmap_area and
use them to map the buffers in the scatterlist. For the last buffer, call
the normal dma_map_area(aka with barriers) effectively executing the barrier
at the end of the operation.

Note that the barrierless operations are implemented for few arm
architectures only and I would implement for others once these are okayed by the
community.

Abhijeet Dharmapurikar (2):
  dma: define barrierless versions of map and unmap area
  dma: fix scatter-gather api to use barrierless map/unmap functions

 arch/arm/include/asm/cacheflush.h  |    9 +++
 arch/arm/include/asm/dma-mapping.h |   82 +++++++++++++++++++++
 arch/arm/mm/cache-v3.S             |    6 ++
 arch/arm/mm/cache-v4.S             |    6 ++
 arch/arm/mm/cache-v4wb.S           |   94 +++++++++++++++++-------
 arch/arm/mm/cache-v4wt.S           |    6 ++
 arch/arm/mm/cache-v6.S             |  139 +++++++++++++++++++++++++----------
 arch/arm/mm/cache-v7.S             |  120 +++++++++++++++++++++++--------
 arch/arm/mm/dma-mapping.c          |   55 +++++++++++++--
 9 files changed, 414 insertions(+), 103 deletions(-)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/2] dma: define barrierless versions of map and unmap area
  2010-02-10 20:37 [RFC 0/2] fix dma_map_sg not to do barriers for each buffer adharmap at codeaurora.org
@ 2010-02-10 20:37 ` adharmap at codeaurora.org
  2010-02-10 20:37 ` [PATCH 2/2] dma: fix scatter-gather api to use barrierless map/unmap functions adharmap at codeaurora.org
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 18+ messages in thread
From: adharmap at codeaurora.org @ 2010-02-10 20:37 UTC (permalink / raw)
  To: linux-arm-kernel

From: Abhijeet Dharmapurikar <adharmap@quicinc.com>

Barrierless versions of dma_map_area and dma_unmap_area will be used in
the scatter-gather mapping and unmapping functions.

Signed-off-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
---
 arch/arm/include/asm/cacheflush.h |    9 +++
 arch/arm/mm/cache-v3.S            |    6 ++
 arch/arm/mm/cache-v4.S            |    6 ++
 arch/arm/mm/cache-v4wb.S          |   94 +++++++++++++++++--------
 arch/arm/mm/cache-v4wt.S          |    6 ++
 arch/arm/mm/cache-v6.S            |  139 ++++++++++++++++++++++++++-----------
 arch/arm/mm/cache-v7.S            |  120 ++++++++++++++++++++++++--------
 7 files changed, 283 insertions(+), 97 deletions(-)

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 8148a00..e91e014 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -215,6 +215,9 @@ struct cpu_cache_fns {
 	void (*dma_map_area)(const void *, size_t, int);
 	void (*dma_unmap_area)(const void *, size_t, int);
 
+	void (*dma_map_area_nobarrier)(const void *, size_t, int);
+	void (*dma_unmap_area_nobarrier)(const void *, size_t, int);
+
 	void (*dma_flush_range)(const void *, const void *);
 };
 
@@ -246,6 +249,8 @@ extern struct cpu_cache_fns cpu_cache;
  */
 #define dmac_map_area			cpu_cache.dma_map_area
 #define dmac_unmap_area		cpu_cache.dma_unmap_area
+#define dmac_map_area_nobarrier			cpu_cache.dma_map_area_nobarrier
+#define dmac_unmap_area_nobarrier	cpu_cache.dma_unmap_area_nobarrier
 #define dmac_flush_range		cpu_cache.dma_flush_range
 
 #else
@@ -272,10 +277,14 @@ extern void __cpuc_flush_dcache_area(void *, size_t);
  */
 #define dmac_map_area			__glue(_CACHE,_dma_map_area)
 #define dmac_unmap_area		__glue(_CACHE,_dma_unmap_area)
+#define dmac_map_area_nobarrier			__glue(_CACHE,_dma_map_area_nobarrier)
+#define dmac_unmap_area_nobarrier		__glue(_CACHE,_dma_unmap_area_nobarrier)
 #define dmac_flush_range		__glue(_CACHE,_dma_flush_range)
 
 extern void dmac_map_area(const void *, size_t, int);
 extern void dmac_unmap_area(const void *, size_t, int);
+extern void dmac_map_area_nobarrier(const void *, size_t, int);
+extern void dmac_unmap_area_nobarrier(const void *, size_t, int);
 extern void dmac_flush_range(const void *, const void *);
 
 #endif
diff --git a/arch/arm/mm/cache-v3.S b/arch/arm/mm/cache-v3.S
index c2ff3c5..5ba5b9b 100644
--- a/arch/arm/mm/cache-v3.S
+++ b/arch/arm/mm/cache-v3.S
@@ -103,6 +103,7 @@ ENTRY(v3_dma_flush_range)
  *	- dir	- DMA direction
  */
 ENTRY(v3_dma_unmap_area)
+ENTRY(v3_dma_unmap_area_nobarrier)
 	teq	r2, #DMA_TO_DEVICE
 	bne	v3_dma_flush_range
 	/* FALLTHROUGH */
@@ -114,9 +115,12 @@ ENTRY(v3_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v3_dma_map_area)
+ENTRY(v3_dma_map_area_nobarrier)
 	mov	pc, lr
 ENDPROC(v3_dma_unmap_area)
+ENDPROC(v3_dma_unmap_area_nobarrier)
 ENDPROC(v3_dma_map_area)
+ENDPROC(v3_dma_map_area_nobarrier)
 
 	__INITDATA
 
@@ -130,5 +134,7 @@ ENTRY(v3_cache_fns)
 	.long	v3_flush_kern_dcache_area
 	.long	v3_dma_map_area
 	.long	v3_dma_unmap_area
+	.long	v3_dma_map_area_nobarrier
+	.long	v3_dma_unmap_area_nobarrier
 	.long	v3_dma_flush_range
 	.size	v3_cache_fns, . - v3_cache_fns
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 4810f7e..a914c5f 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -115,6 +115,7 @@ ENTRY(v4_dma_flush_range)
  *	- dir	- DMA direction
  */
 ENTRY(v4_dma_unmap_area)
+ENTRY(v4_dma_unmap_area_nobarrier)
 	teq	r2, #DMA_TO_DEVICE
 	bne	v4_dma_flush_range
 	/* FALLTHROUGH */
@@ -126,9 +127,12 @@ ENTRY(v4_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v4_dma_map_area)
+ENTRY(v4_dma_map_area_nobarrier)
 	mov	pc, lr
 ENDPROC(v4_dma_unmap_area)
+ENDPROC(v4_dma_unmap_area_nobarrier)
 ENDPROC(v4_dma_map_area)
+ENDPROC(v4_dma_map_area_nobarrier)
 
 	__INITDATA
 
@@ -142,5 +146,7 @@ ENTRY(v4_cache_fns)
 	.long	v4_flush_kern_dcache_area
 	.long	v4_dma_map_area
 	.long	v4_dma_unmap_area
+	.long	v4_dma_map_area_nobarrier
+	.long	v4_dma_unmap_area_nobarrier
 	.long	v4_dma_flush_range
 	.size	v4_cache_fns, . - v4_cache_fns
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index df8368a..dff8248 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -113,6 +113,37 @@ ENTRY(v4wb_flush_user_cache_range)
 	mcrne	p15, 0, ip, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
+	.macro v4wb_dma_flush_range_macro, start, end
+	bic	\start, \start, #CACHE_DLINESIZE - 1
+1:	mcr	p15, 0, \start, c7, c10, 1		@ clean D entry
+	mcr	p15, 0, \start, c7, c6, 1		@ invalidate D entry
+	add	\start, \start, #CACHE_DLINESIZE
+	cmp	\start, \end
+	blo	1b
+	mov	ip, #0
+	mcr	p15, 0, ip, c7, c5, 0		@ invalidate I cache
+	.endm
+
+	.macro v4wb_dma_inv_range, start, end
+	tst	\start, #CACHE_DLINESIZE - 1
+	bic	\start, \start, #CACHE_DLINESIZE - 1
+	mcrne	p15, 0, \start, c7, c10, 1		@ clean D entry
+	tst	\end, #CACHE_DLINESIZE - 1
+	mcrne	p15, 0, \end, c7, c10, 1		@ clean D entry
+1:	mcr	p15, 0, \start, c7, c6, 1		@ invalidate D entry
+	add	\start, \start, #CACHE_DLINESIZE
+	cmp	\start, \end
+	blo	1b
+	.endm
+
+	.macro v4wb_dma_clean_range, start, end
+	bic	\start, \start, #CACHE_DLINESIZE - 1
+1:	mcr	p15, 0, \start, c7, c10, 1		@ clean D entry
+	add	\start, \start, #CACHE_DLINESIZE
+	cmp	\start, \end
+	blo	1b
+	.endm
+
 /*
  *	flush_kern_dcache_area(void *addr, size_t size)
  *
@@ -150,20 +181,12 @@ ENTRY(v4wb_coherent_kern_range)
  *	- end	 - virtual end address
  */
 ENTRY(v4wb_coherent_user_range)
-	bic	r0, r0, #CACHE_DLINESIZE - 1
-1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
-	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
-	add	r0, r0, #CACHE_DLINESIZE
-	cmp	r0, r1
-	blo	1b
-	mov	ip, #0
-	mcr	p15, 0, ip, c7, c5, 0		@ invalidate I cache
+	v4wb_dma_flush_range_macro r0, r1
 	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
-
 /*
- *	dma_inv_range(start, end)
+ *	dma_inv_range_barrier(start, end)
  *
  *	Invalidate (discard) the specified virtual address range.
  *	May not write back any entries.  If 'start' or 'end'
@@ -173,16 +196,8 @@ ENTRY(v4wb_coherent_user_range)
  *	- start  - virtual start address
  *	- end	 - virtual end address
  */
-v4wb_dma_inv_range:
-	tst	r0, #CACHE_DLINESIZE - 1
-	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
-	add	r0, r0, #CACHE_DLINESIZE
-	cmp	r0, r1
-	blo	1b
+v4wb_dma_inv_range_barrier:
+	v4wb_dma_inv_range r0, r1
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
@@ -194,12 +209,8 @@ v4wb_dma_inv_range:
  *	- start  - virtual start address
  *	- end	 - virtual end address
  */
-v4wb_dma_clean_range:
-	bic	r0, r0, #CACHE_DLINESIZE - 1
-1:	mcr	p15, 0, r0, c7, c10, 1		@ clean D entry
-	add	r0, r0, #CACHE_DLINESIZE
-	cmp	r0, r1
-	blo	1b
+v4wb_dma_clean_range_barrier:
+	v4wb_dma_clean_range r0, r1
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
@@ -216,17 +227,32 @@ v4wb_dma_clean_range:
 	.globl	v4wb_dma_flush_range
 	.set	v4wb_dma_flush_range, v4wb_coherent_kern_range
 
+
+v4wb_dma_inv_range_nobarrier:
+	v4wb_dma_inv_range r0, r1
+	mov	pc, lr
+
+v4wb_dma_clean_range_nobarrier:
+	v4wb_dma_clean_range r0, r1
+	mov	pc, lr
+
+v4wb_dma_flush_range_nobarrier:
+	v4wb_dma_flush_range_macro r0, r1
+	mov	pc, lr
+
+
 /*
  *	dma_map_area(start, size, dir)
  *	- start	- kernel virtual start address
  *	- size	- size of region
  *	- dir	- DMA direction
  */
+
 ENTRY(v4wb_dma_map_area)
 	add	r1, r1, r0
 	cmp	r2, #DMA_TO_DEVICE
-	beq	v4wb_dma_clean_range
-	bcs	v4wb_dma_inv_range
+	beq	v4wb_dma_clean_range_barrier
+	bcs	v4wb_dma_inv_range_barrier
 	b	v4wb_dma_flush_range
 ENDPROC(v4wb_dma_map_area)
 
@@ -237,8 +263,18 @@ ENDPROC(v4wb_dma_map_area)
  *	- dir	- DMA direction
  */
 ENTRY(v4wb_dma_unmap_area)
+ENTRY(v4wb_dma_unmap_area_nobarrier)
 	mov	pc, lr
 ENDPROC(v4wb_dma_unmap_area)
+ENDPROC(v4wb_dma_unmap_area_nobarrier)
+
+ENTRY(v4wb_dma_map_area_nobarrier)
+	add	r1, r1, r0
+	cmp	r2, #DMA_TO_DEVICE
+	beq	v4wb_dma_clean_range_nobarrier
+	bcs	v4wb_dma_inv_range_nobarrier
+	b	v4wb_dma_flush_range_nobarrier
+ENDPROC(v4wb_dma_map_area_nobarrier)
 
 	__INITDATA
 
@@ -252,5 +288,7 @@ ENTRY(v4wb_cache_fns)
 	.long	v4wb_flush_kern_dcache_area
 	.long	v4wb_dma_map_area
 	.long	v4wb_dma_unmap_area
+	.long	v4wb_dma_map_area_nobarrier
+	.long	v4wb_dma_unmap_area_nobarrier
 	.long	v4wb_dma_flush_range
 	.size	v4wb_cache_fns, . - v4wb_cache_fns
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index 45c7031..df587b6 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -168,6 +168,7 @@ v4wt_dma_inv_range:
  *	- dir	- DMA direction
  */
 ENTRY(v4wt_dma_unmap_area)
+ENTRY(v4wt_dma_unmap_area_nobarrier)
 	add	r1, r1, r0
 	teq	r2, #DMA_TO_DEVICE
 	bne	v4wt_dma_inv_range
@@ -180,9 +181,12 @@ ENTRY(v4wt_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v4wt_dma_map_area)
+ENTRY(v4wt_dma_map_area_nobarrier)
 	mov	pc, lr
 ENDPROC(v4wt_dma_unmap_area)
+ENDPROC(v4wt_dma_unmap_area_nobarrier)
 ENDPROC(v4wt_dma_map_area)
+ENDPROC(v4wt_dma_map_area_nobarrier)
 
 	__INITDATA
 
@@ -196,5 +200,7 @@ ENTRY(v4wt_cache_fns)
 	.long	v4wt_flush_kern_dcache_area
 	.long	v4wt_dma_map_area
 	.long	v4wt_dma_unmap_area
+	.long	v4wt_dma_map_area_nobarrier
+	.long	v4wt_dma_unmap_area_nobarrier
 	.long	v4wt_dma_flush_range
 	.size	v4wt_cache_fns, . - v4wt_cache_fns
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 9d89c67..0e3f9b9 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -185,65 +185,96 @@ ENTRY(v6_flush_kern_dcache_area)
 	mov	pc, lr
 
 
-/*
- *	v6_dma_inv_range(start,end)
- *
- *	Invalidate the data cache within the specified region; we will
- *	be performing a DMA operation in this region and we want to
- *	purge old data in the cache.
- *
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
-v6_dma_inv_range:
-	tst	r0, #D_CACHE_LINE_SIZE - 1
-	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
+	.macro v6_dma_inv_range, start,end
+	tst	\start, #D_CACHE_LINE_SIZE - 1
+	bic	\start, \start, #D_CACHE_LINE_SIZE - 1
 #ifdef HARVARD_CACHE
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D line
+	mcrne	p15, 0, \start, c7, c10, 1	@ clean D line
 #else
-	mcrne	p15, 0, r0, c7, c11, 1		@ clean unified line
+	mcrne	p15, 0, \start, c7, c11, 1	@ clean unified line
 #endif
-	tst	r1, #D_CACHE_LINE_SIZE - 1
-	bic	r1, r1, #D_CACHE_LINE_SIZE - 1
+	tst	\end, #D_CACHE_LINE_SIZE - 1
+	bic	\end, \end, #D_CACHE_LINE_SIZE - 1
 #ifdef HARVARD_CACHE
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D line
+	mcrne	p15, 0, \end, c7, c14, 1	@ clean & invalidate D line
 #else
-	mcrne	p15, 0, r1, c7, c15, 1		@ clean & invalidate unified line
+	mcrne	p15, 0, \end, c7, c15, 1	@ clean & invalidate unifiedline
 #endif
 1:
 #ifdef HARVARD_CACHE
-	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D line
+	mcr	p15, 0, \start, c7, c6, 1	@ invalidate D line
 #else
-	mcr	p15, 0, r0, c7, c7, 1		@ invalidate unified line
+	mcr	p15, 0, \start, c7, c7, 1	@ invalidate unified line
 #endif
-	add	r0, r0, #D_CACHE_LINE_SIZE
-	cmp	r0, r1
+	add	\start, \start, #D_CACHE_LINE_SIZE
+	cmp	\start, \end
 	blo	1b
-	mov	r0, #0
+	mov	\start, #0
+	.endm
+
+	.macro v6_dma_clean_range, start, end
+	bic	\start, \start, #D_CACHE_LINE_SIZE - 1
+1:
+#ifdef HARVARD_CACHE
+	mcr	p15, 0, \start, c7, c10, 1	@ clean D line
+#else
+	mcr	p15, 0, \start, c7, c11, 1	@ clean unified line
+#endif
+	add	\start, \start, #D_CACHE_LINE_SIZE
+	cmp	\start, \end
+	blo	1b
+	mov	\start, #0
+	.endm
+
+/*
+ *	v6_dma_inv_range_barrier(start,end)
+ *
+ *	Invalidate the data cache within the specified region; we will
+ *	be performing a DMA operation in this region and we want to
+ *	purge old data in the cache.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+v6_dma_inv_range_barrier:
+	v6_dma_inv_range r0, r1
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
- *	v6_dma_clean_range(start,end)
+ *	v6_dma_clean_range_barrier(start,end)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-v6_dma_clean_range:
-	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
-1:
-#ifdef HARVARD_CACHE
-	mcr	p15, 0, r0, c7, c10, 1		@ clean D line
-#else
-	mcr	p15, 0, r0, c7, c11, 1		@ clean unified line
-#endif
-	add	r0, r0, #D_CACHE_LINE_SIZE
-	cmp	r0, r1
-	blo	1b
-	mov	r0, #0
+v6_dma_clean_range_barrier:
+	v6_dma_clean_range r0, r1
 	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
+ *	v6_dma_inv_range_nobarrier(start,end)
+ *
+ *	Invalidate the data cache within the specified region; we will
+ *	be performing a DMA operation in this region and we want to
+ *	purge old data in the cache.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+v6_dma_inv_range_nobarrier:
+	v6_dma_inv_range r0, r1
+	mov	pc, lr
+
+/*
+ *	v6_dma_clean_range_nobarrier(start,end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+v6_dma_clean_range_nobarrier:
+	v6_dma_clean_range r0, r1
+	mov	pc, lr
+
+/*
  *	v6_dma_flush_range(start,end)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
@@ -272,8 +303,8 @@ ENTRY(v6_dma_flush_range)
 ENTRY(v6_dma_map_area)
 	add	r1, r1, r0
 	teq	r2, #DMA_FROM_DEVICE
-	beq	v6_dma_inv_range
-	b	v6_dma_clean_range
+	beq	v6_dma_inv_range_barrier
+	b	v6_dma_clean_range_barrier
 ENDPROC(v6_dma_map_area)
 
 /*
@@ -285,10 +316,36 @@ ENDPROC(v6_dma_map_area)
 ENTRY(v6_dma_unmap_area)
 	add	r1, r1, r0
 	teq	r2, #DMA_TO_DEVICE
-	bne	v6_dma_inv_range
+	bne	v6_dma_inv_range_barrier
 	mov	pc, lr
 ENDPROC(v6_dma_unmap_area)
 
+/*
+ *	dma_map_area_nobarrier(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v6_dma_map_area_nobarrier)
+	add	r1, r1, r0
+	teq	r2, #DMA_FROM_DEVICE
+	beq	v6_dma_inv_range_nobarrier
+	b	v6_dma_clean_range_nobarrier
+ENDPROC(v6_dma_map_area_nobarrier)
+
+/*
+ *	dma_unmap_area_nobarrier(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v6_dma_unmap_area_nobarrier)
+	add	r1, r1, r0
+	teq	r2, #DMA_TO_DEVICE
+	bne	v6_dma_inv_range_nobarrier
+	mov	pc, lr
+ENDPROC(v6_dma_unmap_area_nobarrier)
+
 	__INITDATA
 
 	.type	v6_cache_fns, #object
@@ -301,5 +358,7 @@ ENTRY(v6_cache_fns)
 	.long	v6_flush_kern_dcache_area
 	.long	v6_dma_map_area
 	.long	v6_dma_unmap_area
+	.long	v6_dma_map_area_nobarrier
+	.long	v6_dma_unmap_area_nobarrier
 	.long	v6_dma_flush_range
 	.size	v6_cache_fns, . - v6_cache_fns
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index bcd64f2..d748137 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -206,8 +206,33 @@ ENTRY(v7_flush_kern_dcache_area)
 	mov	pc, lr
 ENDPROC(v7_flush_kern_dcache_area)
 
+	.macro	v7_dma_inv_range, start, end, line_size, tmp
+	sub	\tmp, \line_size, #1
+	tst	\start, \line_size
+	bic	\start, \start, \tmp
+	mcrne	p15, 0, \start, c7, c14, 1	@ clean & invalidate D / U line
+
+	tst	\end, \tmp
+	bic	\end, \end, \tmp
+	mcrne	p15, 0, \end, c7, c14, 1	@ clean & invalidate D / U line
+1:
+	mcr	p15, 0, \start, c7, c6, 1	@ invalidate D / U line
+	add	\start, \start, \line_size
+	cmp	\start, \end
+	blo	1b
+	.endm
+
+	.macro	v7_dma_clean_range, start, end, line_size, tmp
+	sub	\tmp, \line_size, #1
+	bic	\start, \start, \tmp
+1:
+	mcr	p15, 0, \start, c7, c10, 1	@ clean D / U line
+	add	\start, \start, \line_size
+	cmp	\start, \end
+	blo	1b
+	.endm
 /*
- *	v7_dma_inv_range(start,end)
+ *	v7_dma_inv_range_barrier(start,end)
  *
  *	Invalidate the data cache within the specified region; we will
  *	be performing a DMA operation in this region and we want to
@@ -216,42 +241,51 @@ ENDPROC(v7_flush_kern_dcache_area)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-v7_dma_inv_range:
+v7_dma_inv_range_barrier:
 	dcache_line_size r2, r3
-	sub	r3, r2, #1
-	tst	r0, r3
-	bic	r0, r0, r3
-	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
-
-	tst	r1, r3
-	bic	r1, r1, r3
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D / U line
-1:
-	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D / U line
-	add	r0, r0, r2
-	cmp	r0, r1
-	blo	1b
+	v7_dma_inv_range r0, r1, r2, r3
 	dsb
 	mov	pc, lr
-ENDPROC(v7_dma_inv_range)
+ENDPROC(v7_dma_inv_range_barrier)
 
 /*
- *	v7_dma_clean_range(start,end)
+ *	v7_dma_clean_range_barrier(start,end)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-v7_dma_clean_range:
+v7_dma_clean_range_barrier:
 	dcache_line_size r2, r3
-	sub	r3, r2, #1
-	bic	r0, r0, r3
-1:
-	mcr	p15, 0, r0, c7, c10, 1		@ clean D / U line
-	add	r0, r0, r2
-	cmp	r0, r1
-	blo	1b
+	v7_dma_clean_range r0, r1, r2 ,r3
 	dsb
 	mov	pc, lr
-ENDPROC(v7_dma_clean_range)
+ENDPROC(v7_dma_clean_range_barrier)
+
+/*
+ *	v7_dma_inv_range_nobarrier(start,end)
+ *
+ *	Invalidate the data cache within the specified region; we will
+ *	be performing a DMA operation in this region and we want to
+ *	purge old data in the cache.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+v7_dma_inv_range_nobarrier:
+	dcache_line_size r2, r3
+	v7_dma_inv_range r0, r1, r2, r3
+	mov	pc, lr
+ENDPROC(v7_dma_inv_range_nobarrier)
+
+/*
+ *	v7_dma_clean_range_nobarrier(start,end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+v7_dma_clean_range_nobarrier:
+	dcache_line_size r2, r3
+	v7_dma_clean_range r0, r1, r2 ,r3
+	mov	pc, lr
+ENDPROC(v7_dma_clean_range_nobarrier)
 
 /*
  *	v7_dma_flush_range(start,end)
@@ -280,8 +314,8 @@ ENDPROC(v7_dma_flush_range)
 ENTRY(v7_dma_map_area)
 	add	r1, r1, r0
 	teq	r2, #DMA_FROM_DEVICE
-	beq	v7_dma_inv_range
-	b	v7_dma_clean_range
+	beq	v7_dma_inv_range_barrier
+	b	v7_dma_clean_range_barrier
 ENDPROC(v7_dma_map_area)
 
 /*
@@ -293,10 +327,36 @@ ENDPROC(v7_dma_map_area)
 ENTRY(v7_dma_unmap_area)
 	add	r1, r1, r0
 	teq	r2, #DMA_TO_DEVICE
-	bne	v7_dma_inv_range
+	bne	v7_dma_inv_range_barrier
 	mov	pc, lr
 ENDPROC(v7_dma_unmap_area)
 
+/*
+ *	dma_map_area_nobarrier(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v7_dma_map_area_nobarrier)
+	add	r1, r1, r0
+	teq	r2, #DMA_FROM_DEVICE
+	beq	v7_dma_inv_range_nobarrier
+	b	v7_dma_clean_range_nobarrier
+ENDPROC(v7_dma_map_area_nobarrier)
+
+/*
+ *	dma_unmap_area_nobarrier(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(v7_dma_unmap_area_nobarrier)
+	add	r1, r1, r0
+	teq	r2, #DMA_TO_DEVICE
+	bne	v7_dma_inv_range_nobarrier
+	mov	pc, lr
+ENDPROC(v7_dma_unmap_area_nobarrier)
+
 	__INITDATA
 
 	.type	v7_cache_fns, #object
@@ -309,5 +369,7 @@ ENTRY(v7_cache_fns)
 	.long	v7_flush_kern_dcache_area
 	.long	v7_dma_map_area
 	.long	v7_dma_unmap_area
+	.long	v7_dma_map_area_nobarrier
+	.long	v7_dma_unmap_area_nobarrier
 	.long	v7_dma_flush_range
 	.size	v7_cache_fns, . - v7_cache_fns
-- 
1.5.6.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/2] dma: fix scatter-gather api to use barrierless map/unmap functions
  2010-02-10 20:37 [RFC 0/2] fix dma_map_sg not to do barriers for each buffer adharmap at codeaurora.org
  2010-02-10 20:37 ` [PATCH 1/2] dma: define barrierless versions of map and unmap area adharmap at codeaurora.org
@ 2010-02-10 20:37 ` adharmap at codeaurora.org
  2010-02-10 21:21 ` [RFC 0/2] fix dma_map_sg not to do barriers for each buffer Russell King - ARM Linux
  2010-02-10 21:27 ` Randy Dunlap
  3 siblings, 0 replies; 18+ messages in thread
From: adharmap at codeaurora.org @ 2010-02-10 20:37 UTC (permalink / raw)
  To: linux-arm-kernel

From: Abhijeet Dharmapurikar <adharmap@quicinc.com>

dma_map/unmap_sg need to execute barrier only after the last buffer has been
mapped/unmapped. This imporves performance in situations where multiple
buffers need to be mapped for a single DMA operation.

Signed-off-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
---
 arch/arm/include/asm/dma-mapping.h |   87 ++++++++++++++++++++++++++++++++++++
 arch/arm/mm/dma-mapping.c          |   59 +++++++++++++++++++++---
 2 files changed, 139 insertions(+), 7 deletions(-)

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 256ee1c..06b528d 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -110,6 +110,26 @@ static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 		___dma_page_dev_to_cpu(page, off, size, dir);
 }
 
+static inline void __dma_page_cpu_to_dev_nobarrier(struct page *page,
+	unsigned long off, size_t size, enum dma_data_direction dir)
+{
+	extern void ___dma_page_cpu_to_dev_nobarrier(struct page *,
+		unsigned long, size_t, enum dma_data_direction);
+
+	if (!arch_is_coherent())
+		___dma_page_cpu_to_dev_nobarrier(page, off, size, dir);
+}
+
+static inline void __dma_page_dev_to_cpu_nobarrier(struct page *page,
+	unsigned long off, size_t size, enum dma_data_direction dir)
+{
+	extern void ___dma_page_dev_to_cpu_nobarrier(struct page *,
+		unsigned long, size_t, enum dma_data_direction);
+
+	if (!arch_is_coherent())
+		___dma_page_dev_to_cpu_nobarrier(page, off, size, dir);
+}
+
 /*
  * Return whether the given device DMA address mask can be supported
  * properly.  For example, if your device can only drive the low 24-bits
@@ -305,6 +325,23 @@ extern void dma_unmap_page(struct device *, dma_addr_t, size_t,
 		enum dma_data_direction);
 
 /*
+ * for DMA_BOUNCE we keep the nobarrier version same as their barriered
+ * counterpart
+ */
+static inline dma_addr_t dma_map_page_nobarrier(struct device *dev,
+		struct page *page, unsigned long offset, size_t size,
+		enum dma_data_direction dir)
+{
+	return dma_map_page(dev, page, offset, size, dir);
+}
+
+static inline void dma_unmap_page_nobarrier(struct device *dev,
+		dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+	return dma_unmap_page(dev, handle, size, dir);
+}
+
+/*
  * Private functions
  */
 int dmabounce_sync_for_cpu(struct device *, dma_addr_t, unsigned long,
@@ -374,6 +411,34 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 }
 
 /**
+ * dma_map_page_nobarrier - map a portion of a page for streaming DMA without a
+ * barrier
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @page: page that buffer resides in
+ * @offset: offset into page for start of buffer
+ * @size: size of buffer to map
+ * @dir: DMA transfer direction
+ *
+ * Once this call is followed by a barrier it is ensured that any data held
+ * in the cache is appropriately  discarded or written back.
+ *
+ * The device owns this memory once this call has completed and a barrier is
+ * executed.  The CPU
+ * can regain ownership by calling dma_unmap_page() or
+ * dma_unmap_page_nobarrier() followed by a barrier.
+ */
+static inline dma_addr_t dma_map_page_nobarrier(struct device *dev,
+		struct page *page, unsigned long offset, size_t size,
+		enum dma_data_direction dir)
+{
+	BUG_ON(!valid_dma_direction(dir));
+
+	__dma_page_cpu_to_dev_nobarrier(page, offset, size, dir);
+
+	return page_to_dma(dev, page) + offset;
+}
+
+/**
  * dma_unmap_single - unmap a single buffer previously mapped
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
  * @handle: DMA address of buffer
@@ -413,6 +478,28 @@ static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
 	__dma_page_dev_to_cpu(dma_to_page(dev, handle), handle & ~PAGE_MASK,
 		size, dir);
 }
+
+/**
+ * dma_unmap_page_nobarrier - unmap a buffer previously mapped through dma_map_page()
+ * or dma_map_page_nobarrier() followed by a barrier
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @handle: DMA address of buffer
+ * @size: size of buffer (same as passed to dma_map_page)
+ * @dir: DMA transfer direction (same as passed to dma_map_page)
+ *
+ * Unmap a page streaming mode DMA translation.  The handle and size
+ * must match what was provided in the previous dma_map_page() call.
+ * All other usages are undefined.
+ *
+ * After this call, followed by a barrier(dsb/dmb), reads by the CPU to the
+ * buffer are guaranteed to see whatever the device wrote there.
+ */
+static inline void dma_unmap_page_nobarrier(struct device *dev,
+		dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+	__dma_page_dev_to_cpu_nobarrier(dma_to_page(dev, handle),
+		handle & ~PAGE_MASK, size, dir);
+}
 #endif /* CONFIG_DMABOUNCE */
 
 /**
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 64daef2..23556ab 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -509,6 +509,37 @@ void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
 }
 EXPORT_SYMBOL(___dma_page_dev_to_cpu);
 
+
+void ___dma_page_cpu_to_dev_nobarrier(struct page *page, unsigned long off,
+	size_t size, enum dma_data_direction dir)
+{
+	unsigned long paddr;
+
+	dma_cache_maint_page(page, off, size, dir, dmac_map_area_nobarrier);
+
+	paddr = page_to_phys(page) + off;
+	if (dir == DMA_FROM_DEVICE) {
+		outer_inv_range(paddr, paddr + size);
+	} else {
+		outer_clean_range(paddr, paddr + size);
+	}
+	/* FIXME: non-speculating: flush on bidirectional mappings? */
+}
+EXPORT_SYMBOL(___dma_page_cpu_to_dev_nobarrier);
+
+void ___dma_page_dev_to_cpu_nobarrier(struct page *page, unsigned long off,
+	size_t size, enum dma_data_direction dir)
+{
+	unsigned long paddr = page_to_phys(page) + off;
+
+	/* FIXME: non-speculating: not required */
+	/* don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE)
+		outer_inv_range(paddr, paddr + size);
+
+	dma_cache_maint_page(page, off, size, dir, dmac_unmap_area_nobarrier);
+}
+EXPORT_SYMBOL(___dma_page_dev_to_cpu_nobarrier);
 /**
  * dma_map_sg - map a set of SG buffers for streaming mode DMA
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -531,17 +562,28 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 	struct scatterlist *s;
 	int i, j;
 
-	for_each_sg(sg, s, nents, i) {
-		s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
-						s->length, dir);
+	for_each_sg(sg, s, nents - 1 , i) {
+		s->dma_address = dma_map_page_nobarrier(dev, sg_page(s),
+					s->offset, s->length, dir);
 		if (dma_mapping_error(dev, s->dma_address))
 			goto bad_mapping;
 	}
+
+	s = sg_next(s);
+	i++;
+	s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
+					s->length, dir);
+	if (dma_mapping_error(dev, s->dma_address))
+		goto bad_mapping;
+
 	return nents;
 
  bad_mapping:
-	for_each_sg(sg, s, i, j)
-		dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+	for_each_sg(sg, s, i - 1, j)
+		dma_unmap_page_nobarrier(dev, sg_dma_address(s),
+						sg_dma_len(s), dir);
+	s = sg_next(s);
+	dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
 	return 0;
 }
 EXPORT_SYMBOL(dma_map_sg);
@@ -562,8 +604,11 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
 	struct scatterlist *s;
 	int i;
 
-	for_each_sg(sg, s, nents, i)
-		dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+	for_each_sg(sg, s, nents - 1, i)
+		dma_unmap_page_nobarrier(dev, sg_dma_address(s),
+						sg_dma_len(s), dir);
+	s = sg_next(s);
+	dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
 }
 EXPORT_SYMBOL(dma_unmap_sg);
 
-- 
1.5.6.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 20:37 [RFC 0/2] fix dma_map_sg not to do barriers for each buffer adharmap at codeaurora.org
  2010-02-10 20:37 ` [PATCH 1/2] dma: define barrierless versions of map and unmap area adharmap at codeaurora.org
  2010-02-10 20:37 ` [PATCH 2/2] dma: fix scatter-gather api to use barrierless map/unmap functions adharmap at codeaurora.org
@ 2010-02-10 21:21 ` Russell King - ARM Linux
  2010-02-10 23:28   ` Abhijeet Dharmapurikar
  2010-02-11 10:45   ` Catalin Marinas
  2010-02-10 21:27 ` Randy Dunlap
  3 siblings, 2 replies; 18+ messages in thread
From: Russell King - ARM Linux @ 2010-02-10 21:21 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Feb 10, 2010 at 12:37:28PM -0800, adharmap at codeaurora.org wrote:
> From: Abhijeet Dharmapurikar <adharmap@quicinc.com>
> 
> Please refer to the post here
> http://lkml.org/lkml/2010/1/4/347
> 
> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> use them to map the buffers in the scatterlist. For the last buffer, call
> the normal dma_map_area(aka with barriers) effectively executing the barrier
> at the end of the operation.

What if we make dma_map_area and dma_unmap_area both be barrier-less,
and instead have a separate dma_barrier method - eg, something like the
attached?

This might allow for better I-cache usage by not having to duplicate the
DMA cache coherence functions.

PS, you haven't sorted out all the processor support files for your change.

 arch/arm/include/asm/cacheflush.h  |    4 ++++
 arch/arm/include/asm/dma-mapping.h |    8 ++++++++
 arch/arm/mm/cache-fa.S             |   13 +++++++------
 arch/arm/mm/cache-v3.S             |    3 +++
 arch/arm/mm/cache-v4.S             |    3 +++
 arch/arm/mm/cache-v4wb.S           |    9 +++++++--
 arch/arm/mm/cache-v4wt.S           |    3 +++
 arch/arm/mm/cache-v6.S             |   13 +++++++------
 arch/arm/mm/cache-v7.S             |    9 ++++++---
 arch/arm/mm/dma-mapping.c          |   16 ++++++++++++++++
 arch/arm/mm/proc-arm1020e.S        |   10 +++++++---
 arch/arm/mm/proc-arm1022.S         |   10 +++++++---
 arch/arm/mm/proc-arm1026.S         |   10 +++++++---
 arch/arm/mm/proc-arm920.S          |   10 +++++++---
 arch/arm/mm/proc-arm922.S          |   10 +++++++---
 arch/arm/mm/proc-arm925.S          |   10 +++++++---
 arch/arm/mm/proc-arm926.S          |   10 +++++++---
 arch/arm/mm/proc-arm940.S          |   10 +++++++---
 arch/arm/mm/proc-arm946.S          |   10 +++++++---
 arch/arm/mm/proc-feroceon.S        |   13 ++++++++-----
 arch/arm/mm/proc-mohawk.S          |   10 +++++++---
 arch/arm/mm/proc-xsc3.S            |   10 +++++++---
 arch/arm/mm/proc-xscale.S          |   10 +++++++---
 23 files changed, 156 insertions(+), 58 deletions(-)

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index e290885..5928e78 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -200,6 +200,7 @@ struct cpu_cache_fns {
 
 	void (*dma_map_area)(const void *, size_t, int);
 	void (*dma_unmap_area)(const void *, size_t, int);
+	void (*dma_barrier)(void);
 
 	void (*dma_flush_range)(const void *, const void *);
 };
@@ -232,6 +233,7 @@ extern struct cpu_cache_fns cpu_cache;
  */
 #define dmac_map_area			cpu_cache.dma_map_area
 #define dmac_unmap_area		cpu_cache.dma_unmap_area
+#define dmac_barrier			cpu_cache.dma_barrier
 #define dmac_flush_range		cpu_cache.dma_flush_range
 
 #else
@@ -258,10 +260,12 @@ extern void __cpuc_flush_dcache_area(void *, size_t);
  */
 #define dmac_map_area			__glue(_CACHE,_dma_map_area)
 #define dmac_unmap_area		__glue(_CACHE,_dma_unmap_area)
+#define dmac_barrier			__glue(_CACHE,_dma_barrier)
 #define dmac_flush_range		__glue(_CACHE,_dma_flush_range)
 
 extern void dmac_map_area(const void *, size_t, int);
 extern void dmac_unmap_area(const void *, size_t, int);
+extern void dmac_barrier(void);
 extern void dmac_flush_range(const void *, const void *);
 
 #endif
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 256ee1c..4a0824c 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -110,6 +110,8 @@ static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 		___dma_page_dev_to_cpu(page, off, size, dir);
 }
 
+extern void __dma_barrier(enum dma_data_direction);
+
 /*
  * Return whether the given device DMA address mask can be supported
  * properly.  For example, if your device can only drive the low 24-bits
@@ -345,6 +347,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
 	BUG_ON(!valid_dma_direction(dir));
 
 	__dma_single_cpu_to_dev(cpu_addr, size, dir);
+	__dma_barrier(dir);
 
 	return virt_to_dma(dev, cpu_addr);
 }
@@ -369,6 +372,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 	BUG_ON(!valid_dma_direction(dir));
 
 	__dma_page_cpu_to_dev(page, offset, size, dir);
+	__dma_barrier(dir);
 
 	return page_to_dma(dev, page) + offset;
 }
@@ -391,6 +395,7 @@ static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir)
 {
 	__dma_single_dev_to_cpu(dma_to_virt(dev, handle), size, dir);
+	__dma_barrier(dir);
 }
 
 /**
@@ -412,6 +417,7 @@ static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
 {
 	__dma_page_dev_to_cpu(dma_to_page(dev, handle), handle & ~PAGE_MASK,
 		size, dir);
+	__dma_barrier(dir);
 }
 #endif /* CONFIG_DMABOUNCE */
 
@@ -443,6 +449,7 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
 		return;
 
 	__dma_single_dev_to_cpu(dma_to_virt(dev, handle) + offset, size, dir);
+	__dma_barrier(dir);
 }
 
 static inline void dma_sync_single_range_for_device(struct device *dev,
@@ -455,6 +462,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
 		return;
 
 	__dma_single_cpu_to_dev(dma_to_virt(dev, handle) + offset, size, dir);
+	__dma_barrier(dir);
 }
 
 static inline void dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index 7148e53..cdcfae2 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -168,8 +168,6 @@ fa_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -186,8 +184,6 @@ fa_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0	
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -201,8 +197,6 @@ ENTRY(fa_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0	
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -229,6 +223,12 @@ ENTRY(fa_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(fa_dma_unmap_area)
 
+ENTRY(fa_dma_barrier)
+	mov	r0, #0	
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+ENDPROC(fa_dma_barrier)
+
 	__INITDATA
 
 	.type	fa_cache_fns, #object
@@ -241,5 +241,6 @@ ENTRY(fa_cache_fns)
 	.long	fa_flush_kern_dcache_area
 	.long	fa_dma_map_area
 	.long	fa_dma_unmap_area
+	.long	fa_dma_barrier
 	.long	fa_dma_flush_range
 	.size	fa_cache_fns, . - fa_cache_fns
diff --git a/arch/arm/mm/cache-v3.S b/arch/arm/mm/cache-v3.S
index c2ff3c5..df34458 100644
--- a/arch/arm/mm/cache-v3.S
+++ b/arch/arm/mm/cache-v3.S
@@ -114,9 +114,11 @@ ENTRY(v3_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v3_dma_map_area)
+ENTRY(v3_dma_barrier)
 	mov	pc, lr
 ENDPROC(v3_dma_unmap_area)
 ENDPROC(v3_dma_map_area)
+ENDPROC(v3_dma_barrier)
 
 	__INITDATA
 
@@ -130,5 +132,6 @@ ENTRY(v3_cache_fns)
 	.long	v3_flush_kern_dcache_area
 	.long	v3_dma_map_area
 	.long	v3_dma_unmap_area
+	.long	v3_dma_barrier
 	.long	v3_dma_flush_range
 	.size	v3_cache_fns, . - v3_cache_fns
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 4810f7e..20260b1 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -126,9 +126,11 @@ ENTRY(v4_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v4_dma_map_area)
+ENTRY(v4_dma_barrier)
 	mov	pc, lr
 ENDPROC(v4_dma_unmap_area)
 ENDPROC(v4_dma_map_area)
+ENDPROC(v4_dma_barrier)
 
 	__INITDATA
 
@@ -142,5 +144,6 @@ ENTRY(v4_cache_fns)
 	.long	v4_flush_kern_dcache_area
 	.long	v4_dma_map_area
 	.long	v4_dma_unmap_area
+	.long	v4_dma_barrier
 	.long	v4_dma_flush_range
 	.size	v4_cache_fns, . - v4_cache_fns
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index df8368a..9c9c875 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -183,7 +183,6 @@ v4wb_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -200,7 +199,6 @@ v4wb_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -240,6 +238,12 @@ ENTRY(v4wb_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(v4wb_dma_unmap_area)
 
+ENTRY(v4wb_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+ENDPROC(v4wb_dma_barrier)
+
 	__INITDATA
 
 	.type	v4wb_cache_fns, #object
@@ -252,5 +256,6 @@ ENTRY(v4wb_cache_fns)
 	.long	v4wb_flush_kern_dcache_area
 	.long	v4wb_dma_map_area
 	.long	v4wb_dma_unmap_area
+	.long	v4wb_dma_barrier
 	.long	v4wb_dma_flush_range
 	.size	v4wb_cache_fns, . - v4wb_cache_fns
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index 45c7031..223eea4 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -180,9 +180,11 @@ ENTRY(v4wt_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v4wt_dma_map_area)
+ENTRY(v4wt_dma_barrier)
 	mov	pc, lr
 ENDPROC(v4wt_dma_unmap_area)
 ENDPROC(v4wt_dma_map_area)
+ENDPROC(v4wt_dma_barrier)
 
 	__INITDATA
 
@@ -196,5 +198,6 @@ ENTRY(v4wt_cache_fns)
 	.long	v4wt_flush_kern_dcache_area
 	.long	v4wt_dma_map_area
 	.long	v4wt_dma_unmap_area
+	.long	v4wt_dma_barrier
 	.long	v4wt_dma_flush_range
 	.size	v4wt_cache_fns, . - v4wt_cache_fns
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 9d89c67..b294854 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -219,8 +219,6 @@ v6_dma_inv_range:
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -239,8 +237,6 @@ v6_dma_clean_range:
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -259,8 +255,6 @@ ENTRY(v6_dma_flush_range)
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -289,6 +283,12 @@ ENTRY(v6_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(v6_dma_unmap_area)
 
+ENTRY(v6_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+ENDPROC(v6_dma_barrier)
+
 	__INITDATA
 
 	.type	v6_cache_fns, #object
@@ -301,5 +301,6 @@ ENTRY(v6_cache_fns)
 	.long	v6_flush_kern_dcache_area
 	.long	v6_dma_map_area
 	.long	v6_dma_unmap_area
+	.long	v6_dma_barrier
 	.long	v6_dma_flush_range
 	.size	v6_cache_fns, . - v6_cache_fns
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index bcd64f2..d89d55a 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -231,7 +231,6 @@ v7_dma_inv_range:
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
 	mov	pc, lr
 ENDPROC(v7_dma_inv_range)
 
@@ -249,7 +248,6 @@ v7_dma_clean_range:
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
 	mov	pc, lr
 ENDPROC(v7_dma_clean_range)
 
@@ -267,7 +265,6 @@ ENTRY(v7_dma_flush_range)
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
 	mov	pc, lr
 ENDPROC(v7_dma_flush_range)
 
@@ -297,6 +294,11 @@ ENTRY(v7_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(v7_dma_unmap_area)
 
+ENTRY(v7_dma_barrier)
+	dsb
+	mov	pc, lr
+ENDPROC(v7_dma_barrier)
+
 	__INITDATA
 
 	.type	v7_cache_fns, #object
@@ -309,5 +311,6 @@ ENTRY(v7_cache_fns)
 	.long	v7_flush_kern_dcache_area
 	.long	v7_dma_map_area
 	.long	v7_dma_unmap_area
+	.long	v6_dma_barrier
 	.long	v7_dma_flush_range
 	.size	v7_cache_fns, . - v7_cache_fns
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 64daef2..debe7cb 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -108,6 +108,7 @@ static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gf
 	memset(ptr, 0, size);
 	dmac_flush_range(ptr, ptr + size);
 	outer_flush_range(__pa(ptr), __pa(ptr) + size);
+	dmac_barrier();
 
 	return page;
 }
@@ -509,6 +510,12 @@ void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
 }
 EXPORT_SYMBOL(___dma_page_dev_to_cpu);
 
+void __dma_barrier(enum dma_data_direction dir)
+{
+	dmac_barrier();
+}
+EXPORT_SYMBOL(__dma_barrier);
+
 /**
  * dma_map_sg - map a set of SG buffers for streaming mode DMA
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -537,6 +544,9 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 		if (dma_mapping_error(dev, s->dma_address))
 			goto bad_mapping;
 	}
+
+	__dma_barrier(dir);
+
 	return nents;
 
  bad_mapping:
@@ -564,6 +574,8 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
 
 	for_each_sg(sg, s, nents, i)
 		dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+
+	__dma_barrier(dir);
 }
 EXPORT_SYMBOL(dma_unmap_sg);
 
@@ -588,6 +600,8 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		__dma_page_dev_to_cpu(sg_page(s), s->offset,
 				      s->length, dir);
 	}
+
+	__dma_barrier(dir);
 }
 EXPORT_SYMBOL(dma_sync_sg_for_cpu);
 
@@ -612,5 +626,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		__dma_page_cpu_to_dev(sg_page(s), s->offset,
 				      s->length, dir);
 	}
+
+	__dma_barrier(dir);
 }
 EXPORT_SYMBOL(dma_sync_sg_for_device);
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index d278298..fea33c9 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -271,7 +271,6 @@ arm1020e_dma_inv_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -293,7 +292,6 @@ arm1020e_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -313,7 +311,6 @@ ENTRY(arm1020e_dma_flush_range)
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -340,6 +337,12 @@ ENTRY(arm1020e_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm1020e_dma_unmap_area)
 
+ENTRY(arm1020e_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm1020e_dma_barrier)
+
 ENTRY(arm1020e_cache_fns)
 	.long	arm1020e_flush_kern_cache_all
 	.long	arm1020e_flush_user_cache_all
@@ -349,6 +352,7 @@ ENTRY(arm1020e_cache_fns)
 	.long	arm1020e_flush_kern_dcache_area
 	.long	arm1020e_dma_map_area
 	.long	arm1020e_dma_unmap_area
+	.long	arm1020e_dma_barrier
 	.long	arm1020e_dma_flush_range
 
 	.align	5
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index ce13e4a..ba1a7df 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -260,7 +260,6 @@ arm1022_dma_inv_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -282,7 +281,6 @@ arm1022_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -302,7 +300,6 @@ ENTRY(arm1022_dma_flush_range)
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -329,6 +326,12 @@ ENTRY(arm1022_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm1022_dma_unmap_area)
 
+ENTRY(arm1022_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm1022_dma_barrier)
+
 ENTRY(arm1022_cache_fns)
 	.long	arm1022_flush_kern_cache_all
 	.long	arm1022_flush_user_cache_all
@@ -338,6 +341,7 @@ ENTRY(arm1022_cache_fns)
 	.long	arm1022_flush_kern_dcache_area
 	.long	arm1022_dma_map_area
 	.long	arm1022_dma_unmap_area
+	.long	arm1022_dma_barrier
 	.long	arm1022_dma_flush_range
 
 	.align	5
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index 636672a..de648f1 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -254,7 +254,6 @@ arm1026_dma_inv_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -276,7 +275,6 @@ arm1026_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -296,7 +294,6 @@ ENTRY(arm1026_dma_flush_range)
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -323,6 +320,12 @@ ENTRY(arm1026_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm1026_dma_unmap_area)
 
+ENTRY(arm1026_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm1026_dma_barrier)
+
 ENTRY(arm1026_cache_fns)
 	.long	arm1026_flush_kern_cache_all
 	.long	arm1026_flush_user_cache_all
@@ -332,6 +335,7 @@ ENTRY(arm1026_cache_fns)
 	.long	arm1026_flush_kern_dcache_area
 	.long	arm1026_dma_map_area
 	.long	arm1026_dma_unmap_area
+	.long	arm1026_dma_barrier
 	.long	arm1026_dma_flush_range
 
 	.align	5
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 8be8199..ec74093 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -249,7 +249,6 @@ arm920_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -268,7 +267,6 @@ arm920_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -285,7 +283,6 @@ ENTRY(arm920_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -312,6 +309,12 @@ ENTRY(arm920_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm920_dma_unmap_area)
 
+ENTRY(arm920_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm920_dma_barrier)
+
 ENTRY(arm920_cache_fns)
 	.long	arm920_flush_kern_cache_all
 	.long	arm920_flush_user_cache_all
@@ -321,6 +324,7 @@ ENTRY(arm920_cache_fns)
 	.long	arm920_flush_kern_dcache_area
 	.long	arm920_dma_map_area
 	.long	arm920_dma_unmap_area
+	.long	arm920_dma_barrier
 	.long	arm920_dma_flush_range
 
 #endif
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index c0ff8e4..474d4c6 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -251,7 +251,6 @@ arm922_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -270,7 +269,6 @@ arm922_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -287,7 +285,6 @@ ENTRY(arm922_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -314,6 +311,12 @@ ENTRY(arm922_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm922_dma_unmap_area)
 
+ENTRY(arm922_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm922_dma_barrier)
+
 ENTRY(arm922_cache_fns)
 	.long	arm922_flush_kern_cache_all
 	.long	arm922_flush_user_cache_all
@@ -323,6 +326,7 @@ ENTRY(arm922_cache_fns)
 	.long	arm922_flush_kern_dcache_area
 	.long	arm922_dma_map_area
 	.long	arm922_dma_unmap_area
+	.long	arm922_dma_barrier
 	.long	arm922_dma_flush_range
 
 #endif
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index 3c6cffe..0336ae3 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -295,7 +295,6 @@ arm925_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -316,7 +315,6 @@ arm925_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -338,7 +336,6 @@ ENTRY(arm925_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -365,6 +362,12 @@ ENTRY(arm925_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm925_dma_unmap_area)
 
+ENTRY(arm925_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm925_dma_barrier)
+
 ENTRY(arm925_cache_fns)
 	.long	arm925_flush_kern_cache_all
 	.long	arm925_flush_user_cache_all
@@ -374,6 +377,7 @@ ENTRY(arm925_cache_fns)
 	.long	arm925_flush_kern_dcache_area
 	.long	arm925_dma_map_area
 	.long	arm925_dma_unmap_area
+	.long	arm925_dma_barrier
 	.long	arm925_dma_flush_range
 
 ENTRY(cpu_arm925_dcache_clean_area)
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 75b707c..473bbe6 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -258,7 +258,6 @@ arm926_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -279,7 +278,6 @@ arm926_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -301,7 +299,6 @@ ENTRY(arm926_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -328,6 +325,12 @@ ENTRY(arm926_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm926_dma_unmap_area)
 
+ENTRY(arm926_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm926_dma_barrier)
+
 ENTRY(arm926_cache_fns)
 	.long	arm926_flush_kern_cache_all
 	.long	arm926_flush_user_cache_all
@@ -337,6 +340,7 @@ ENTRY(arm926_cache_fns)
 	.long	arm926_flush_kern_dcache_area
 	.long	arm926_dma_map_area
 	.long	arm926_dma_unmap_area
+	.long	arm926_dma_barrier
 	.long	arm926_dma_flush_range
 
 ENTRY(cpu_arm926_dcache_clean_area)
diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S
index 1af1657..c44c963 100644
--- a/arch/arm/mm/proc-arm940.S
+++ b/arch/arm/mm/proc-arm940.S
@@ -180,7 +180,6 @@ arm940_dma_inv_range:
 	bcs	2b				@ entries 63 to 0
 	subs	r1, r1, #1 << 4
 	bcs	1b				@ segments 7 to 0
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -204,7 +203,6 @@ ENTRY(cpu_arm940_dcache_clean_area)
 	subs	r1, r1, #1 << 4
 	bcs	1b				@ segments 7 to 0
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -230,7 +228,6 @@ ENTRY(arm940_dma_flush_range)
 	bcs	2b				@ entries 63 to 0
 	subs	r1, r1, #1 << 4
 	bcs	1b				@ segments 7 to 0
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -257,6 +254,12 @@ ENTRY(arm940_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm940_dma_unmap_area)
 
+ENTRY(arm940_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm940_dma_barrier)
+
 ENTRY(arm940_cache_fns)
 	.long	arm940_flush_kern_cache_all
 	.long	arm940_flush_user_cache_all
@@ -266,6 +269,7 @@ ENTRY(arm940_cache_fns)
 	.long	arm940_flush_kern_dcache_area
 	.long	arm940_dma_map_area
 	.long	arm940_dma_unmap_area
+	.long	arm940_dma_barrier
 	.long	arm940_dma_flush_range
 
 	__INIT
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index 1664b6a..11e9ad7 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -227,7 +227,6 @@ arm946_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -248,7 +247,6 @@ arm946_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -272,7 +270,6 @@ ENTRY(arm946_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -299,6 +296,12 @@ ENTRY(arm946_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm946_dma_unmap_area)
 
+ENTRY(arm946_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm946_dma_barrier)
+
 ENTRY(arm946_cache_fns)
 	.long	arm946_flush_kern_cache_all
 	.long	arm946_flush_user_cache_all
@@ -308,6 +311,7 @@ ENTRY(arm946_cache_fns)
 	.long	arm946_flush_kern_dcache_area
 	.long	arm946_dma_map_area
 	.long	arm946_dma_unmap_area
+	.long	arm946_dma_barrier
 	.long	arm946_dma_flush_range
 
 
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index 53e6323..50a309e 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -284,7 +284,6 @@ feroceon_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 	.align	5
@@ -320,7 +319,6 @@ feroceon_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 	.align	5
@@ -333,7 +331,6 @@ feroceon_range_dma_clean_range:
 	mcr	p15, 5, r0, c15, c13, 0		@ D clean range start
 	mcr	p15, 5, r1, c15, c13, 1		@ D clean range top
 	msr	cpsr_c, r2			@ restore interrupts
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -351,7 +348,6 @@ ENTRY(feroceon_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 	.align	5
@@ -364,7 +360,6 @@ ENTRY(feroceon_range_dma_flush_range)
 	mcr	p15, 5, r0, c15, c15, 0		@ D clean/inv range start
 	mcr	p15, 5, r1, c15, c15, 1		@ D clean/inv range top
 	msr	cpsr_c, r2			@ restore interrupts
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -405,6 +400,12 @@ ENTRY(feroceon_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(feroceon_dma_unmap_area)
 
+ENTRY(feroceon_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(feroceon_dma_barrier)
+
 ENTRY(feroceon_cache_fns)
 	.long	feroceon_flush_kern_cache_all
 	.long	feroceon_flush_user_cache_all
@@ -414,6 +415,7 @@ ENTRY(feroceon_cache_fns)
 	.long	feroceon_flush_kern_dcache_area
 	.long	feroceon_dma_map_area
 	.long	feroceon_dma_unmap_area
+	.long	feroceon_dma_barrier
 	.long	feroceon_dma_flush_range
 
 ENTRY(feroceon_range_cache_fns)
@@ -425,6 +427,7 @@ ENTRY(feroceon_range_cache_fns)
 	.long	feroceon_range_flush_kern_dcache_area
 	.long	feroceon_range_dma_map_area
 	.long	feroceon_dma_unmap_area
+	.long	feroceon_dma_barrier
 	.long	feroceon_range_dma_flush_range
 
 	.align	5
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index caa3115..09e8883 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -228,7 +228,6 @@ mohawk_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -247,7 +246,6 @@ mohawk_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -265,7 +263,6 @@ ENTRY(mohawk_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -292,6 +289,12 @@ ENTRY(mohawk_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(mohawk_dma_unmap_area)
 
+ENTRY(mohawk_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(mohawk_dma_barrier)
+
 ENTRY(mohawk_cache_fns)
 	.long	mohawk_flush_kern_cache_all
 	.long	mohawk_flush_user_cache_all
@@ -301,6 +304,7 @@ ENTRY(mohawk_cache_fns)
 	.long	mohawk_flush_kern_dcache_area
 	.long	mohawk_dma_map_area
 	.long	mohawk_dma_unmap_area
+	.long	mohawk_dma_barrier
 	.long	mohawk_dma_flush_range
 
 ENTRY(cpu_mohawk_dcache_clean_area)
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index 046b3d8..d033ed4 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -267,7 +267,6 @@ xsc3_dma_inv_range:
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mov	pc, lr
 
 /*
@@ -284,7 +283,6 @@ xsc3_dma_clean_range:
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mov	pc, lr
 
 /*
@@ -301,7 +299,6 @@ ENTRY(xsc3_dma_flush_range)
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mov	pc, lr
 
 /*
@@ -328,6 +325,12 @@ ENTRY(xsc3_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(xsc3_dma_unmap_area)
 
+ENTRY(xsc3_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
+	mov	pc, lr
+ENDPROC(xsc3_dma_barrier)
+
 ENTRY(xsc3_cache_fns)
 	.long	xsc3_flush_kern_cache_all
 	.long	xsc3_flush_user_cache_all
@@ -337,6 +340,7 @@ ENTRY(xsc3_cache_fns)
 	.long	xsc3_flush_kern_dcache_area
 	.long	xsc3_dma_map_area
 	.long	xsc3_dma_unmap_area
+	.long	xsc3_dma_barrier
 	.long	xsc3_dma_flush_range
 
 ENTRY(cpu_xsc3_dcache_clean_area)
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index 63037e2..e390ae6 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -325,7 +325,6 @@ xscale_dma_inv_range:
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ Drain Write (& Fill) Buffer
 	mov	pc, lr
 
 /*
@@ -342,7 +341,6 @@ xscale_dma_clean_range:
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ Drain Write (& Fill) Buffer
 	mov	pc, lr
 
 /*
@@ -360,7 +358,6 @@ ENTRY(xscale_dma_flush_range)
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ Drain Write (& Fill) Buffer
 	mov	pc, lr
 
 /*
@@ -400,6 +397,12 @@ ENTRY(xscale_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(xscale_dma_unmap_area)
 
+ENTRY(xscale_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ Drain Write (& Fill) Buffer
+	mov	pc, lr
+ENDPROC(xscsale_dma_barrier)
+
 ENTRY(xscale_cache_fns)
 	.long	xscale_flush_kern_cache_all
 	.long	xscale_flush_user_cache_all
@@ -409,6 +412,7 @@ ENTRY(xscale_cache_fns)
 	.long	xscale_flush_kern_dcache_area
 	.long	xscale_dma_map_area
 	.long	xscale_dma_unmap_area
+	.long	xscale_dma_barrier
 	.long	xscale_dma_flush_range
 
 /*

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 20:37 [RFC 0/2] fix dma_map_sg not to do barriers for each buffer adharmap at codeaurora.org
                   ` (2 preceding siblings ...)
  2010-02-10 21:21 ` [RFC 0/2] fix dma_map_sg not to do barriers for each buffer Russell King - ARM Linux
@ 2010-02-10 21:27 ` Randy Dunlap
  2010-02-10 22:40   ` Russell King - ARM Linux
  2010-02-11  0:39   ` FUJITA Tomonori
  3 siblings, 2 replies; 18+ messages in thread
From: Randy Dunlap @ 2010-02-10 21:27 UTC (permalink / raw)
  To: linux-arm-kernel

On 02/10/10 12:37, adharmap at codeaurora.org wrote:
> From: Abhijeet Dharmapurikar <adharmap@quicinc.com>
> 
> Please refer to the post here
> http://lkml.org/lkml/2010/1/4/347
> 
> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> use them to map the buffers in the scatterlist. For the last buffer, call
> the normal dma_map_area(aka with barriers) effectively executing the barrier
> at the end of the operation.
> 
> Note that the barrierless operations are implemented for few arm
> architectures only and I would implement for others once these are okayed by the
> community.

So when you add these interfaces for other architectures, you will also
update Documentation/DMA-API.txt,  right??


> Abhijeet Dharmapurikar (2):
>   dma: define barrierless versions of map and unmap area
>   dma: fix scatter-gather api to use barrierless map/unmap functions
> 
>  arch/arm/include/asm/cacheflush.h  |    9 +++
>  arch/arm/include/asm/dma-mapping.h |   82 +++++++++++++++++++++
>  arch/arm/mm/cache-v3.S             |    6 ++
>  arch/arm/mm/cache-v4.S             |    6 ++
>  arch/arm/mm/cache-v4wb.S           |   94 +++++++++++++++++-------
>  arch/arm/mm/cache-v4wt.S           |    6 ++
>  arch/arm/mm/cache-v6.S             |  139 +++++++++++++++++++++++++----------
>  arch/arm/mm/cache-v7.S             |  120 +++++++++++++++++++++++--------
>  arch/arm/mm/dma-mapping.c          |   55 +++++++++++++--
>  9 files changed, 414 insertions(+), 103 deletions(-)


-- 
~Randy

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 21:27 ` Randy Dunlap
@ 2010-02-10 22:40   ` Russell King - ARM Linux
  2010-02-10 23:10     ` Abhijeet Dharmapurikar
  2010-02-11  0:39   ` FUJITA Tomonori
  1 sibling, 1 reply; 18+ messages in thread
From: Russell King - ARM Linux @ 2010-02-10 22:40 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Feb 10, 2010 at 01:27:47PM -0800, Randy Dunlap wrote:
> On 02/10/10 12:37, adharmap at codeaurora.org wrote:
> > From: Abhijeet Dharmapurikar <adharmap@quicinc.com>
> > 
> > Please refer to the post here
> > http://lkml.org/lkml/2010/1/4/347
> > 
> > These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> > use them to map the buffers in the scatterlist. For the last buffer, call
> > the normal dma_map_area(aka with barriers) effectively executing the barrier
> > at the end of the operation.
> > 
> > Note that the barrierless operations are implemented for few arm
> > architectures only and I would implement for others once these are okayed by the
> > community.
> 
> So when you add these interfaces for other architectures, you will also
> update Documentation/DMA-API.txt,  right??

Do we need barrier-less interfaces for anything other than the dma_*_sg
functions?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 22:40   ` Russell King - ARM Linux
@ 2010-02-10 23:10     ` Abhijeet Dharmapurikar
  0 siblings, 0 replies; 18+ messages in thread
From: Abhijeet Dharmapurikar @ 2010-02-10 23:10 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King - ARM Linux wrote:
> On Wed, Feb 10, 2010 at 01:27:47PM -0800, Randy Dunlap wrote:
>> On 02/10/10 12:37, adharmap at codeaurora.org wrote:
>>> From: Abhijeet Dharmapurikar <adharmap@quicinc.com>
>>>
>>> Please refer to the post here
>>> http://lkml.org/lkml/2010/1/4/347
>>>
>>> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
>>> use them to map the buffers in the scatterlist. For the last buffer, call
>>> the normal dma_map_area(aka with barriers) effectively executing the barrier
>>> at the end of the operation.
>>>
>>> Note that the barrierless operations are implemented for few arm
>>> architectures only and I would implement for others once these are okayed by the
>>> community.
>> So when you add these interfaces for other architectures, you will also
>> update Documentation/DMA-API.txt,  right??
> 
> Do we need barrier-less interfaces for anything other than the dma_*_sg
> functions?

I think, dma_*_sg are the only ones that could benefit from barrier-less 
interfaces.

> --
> To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 21:21 ` [RFC 0/2] fix dma_map_sg not to do barriers for each buffer Russell King - ARM Linux
@ 2010-02-10 23:28   ` Abhijeet Dharmapurikar
  2010-02-10 23:57     ` Russell King - ARM Linux
  2010-02-11 10:45   ` Catalin Marinas
  1 sibling, 1 reply; 18+ messages in thread
From: Abhijeet Dharmapurikar @ 2010-02-10 23:28 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King - ARM Linux wrote:
> On Wed, Feb 10, 2010 at 12:37:28PM -0800, adharmap at codeaurora.org wrote:
>> From: Abhijeet Dharmapurikar <adharmap@quicinc.com>
>>
>> Please refer to the post here
>> http://lkml.org/lkml/2010/1/4/347
>>
>> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
>> use them to map the buffers in the scatterlist. For the last buffer, call
>> the normal dma_map_area(aka with barriers) effectively executing the barrier
>> at the end of the operation.
> 
> What if we make dma_map_area and dma_unmap_area both be barrier-less,
> and instead have a separate dma_barrier method - eg, something like the
> attached?
> 
> This might allow for better I-cache usage by not having to duplicate the
> DMA cache coherence functions.

Agree, thanks for pointing this and for the patch.


> 
> @@ -369,6 +372,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
>  	BUG_ON(!valid_dma_direction(dir));
>  
>  	__dma_page_cpu_to_dev(page, offset, size, dir);
> +	__dma_barrier(dir);
>  
>  	return page_to_dma(dev, page) + offset;
>  }

dma_map_page is going to execute the barrier here.


>  /**
>   * dma_map_sg - map a set of SG buffers for streaming mode DMA
>   * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
> @@ -537,6 +544,9 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
>  		if (dma_mapping_error(dev, s->dma_address))
>  			goto bad_mapping;
>  	}
> +
> +	__dma_barrier(dir);
> +
>  	return nents;

This would call the barrier in addition to the ones executed by 
dma_map_page.

We would need to call __dma_page_cpu_to_dev instead of dma_map_page and 
do the barrier before returning.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 23:28   ` Abhijeet Dharmapurikar
@ 2010-02-10 23:57     ` Russell King - ARM Linux
  2010-02-11 21:36       ` Abhijeet Dharmapurikar
  0 siblings, 1 reply; 18+ messages in thread
From: Russell King - ARM Linux @ 2010-02-10 23:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Feb 10, 2010 at 03:28:17PM -0800, Abhijeet Dharmapurikar wrote:
> This would call the barrier in addition to the ones executed by  
> dma_map_page.
>
> We would need to call __dma_page_cpu_to_dev instead of dma_map_page and  
> do the barrier before returning.

It's not that simple because of the dmabounce crap.  Ho hum, let's add
yet another layer of indirection for it.

 arch/arm/include/asm/cacheflush.h  |    4 ++++
 arch/arm/include/asm/dma-mapping.h |   19 +++++++++++++++++--
 arch/arm/mm/cache-fa.S             |   13 +++++++------
 arch/arm/mm/cache-v3.S             |    3 +++
 arch/arm/mm/cache-v4.S             |    3 +++
 arch/arm/mm/cache-v4wb.S           |    9 +++++++--
 arch/arm/mm/cache-v4wt.S           |    3 +++
 arch/arm/mm/cache-v6.S             |   13 +++++++------
 arch/arm/mm/cache-v7.S             |    9 ++++++---
 arch/arm/mm/dma-mapping.c          |   18 +++++++++++++++++-
 arch/arm/mm/proc-arm1020e.S        |   10 +++++++---
 arch/arm/mm/proc-arm1022.S         |   10 +++++++---
 arch/arm/mm/proc-arm1026.S         |   10 +++++++---
 arch/arm/mm/proc-arm920.S          |   10 +++++++---
 arch/arm/mm/proc-arm922.S          |   10 +++++++---
 arch/arm/mm/proc-arm925.S          |   10 +++++++---
 arch/arm/mm/proc-arm926.S          |   10 +++++++---
 arch/arm/mm/proc-arm940.S          |   10 +++++++---
 arch/arm/mm/proc-arm946.S          |   10 +++++++---
 arch/arm/mm/proc-feroceon.S        |   13 ++++++++-----
 arch/arm/mm/proc-mohawk.S          |   10 +++++++---
 arch/arm/mm/proc-xsc3.S            |   10 +++++++---
 arch/arm/mm/proc-xscale.S          |   10 +++++++---
 23 files changed, 166 insertions(+), 61 deletions(-)

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index e290885..5928e78 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -200,6 +200,7 @@ struct cpu_cache_fns {
 
 	void (*dma_map_area)(const void *, size_t, int);
 	void (*dma_unmap_area)(const void *, size_t, int);
+	void (*dma_barrier)(void);
 
 	void (*dma_flush_range)(const void *, const void *);
 };
@@ -232,6 +233,7 @@ extern struct cpu_cache_fns cpu_cache;
  */
 #define dmac_map_area			cpu_cache.dma_map_area
 #define dmac_unmap_area		cpu_cache.dma_unmap_area
+#define dmac_barrier			cpu_cache.dma_barrier
 #define dmac_flush_range		cpu_cache.dma_flush_range
 
 #else
@@ -258,10 +260,12 @@ extern void __cpuc_flush_dcache_area(void *, size_t);
  */
 #define dmac_map_area			__glue(_CACHE,_dma_map_area)
 #define dmac_unmap_area		__glue(_CACHE,_dma_unmap_area)
+#define dmac_barrier			__glue(_CACHE,_dma_barrier)
 #define dmac_flush_range		__glue(_CACHE,_dma_flush_range)
 
 extern void dmac_map_area(const void *, size_t, int);
 extern void dmac_unmap_area(const void *, size_t, int);
+extern void dmac_barrier(void);
 extern void dmac_flush_range(const void *, const void *);
 
 #endif
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 256ee1c..1371db7 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -110,6 +110,8 @@ static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 		___dma_page_dev_to_cpu(page, off, size, dir);
 }
 
+extern void __dma_barrier(enum dma_data_direction);
+
 /*
  * Return whether the given device DMA address mask can be supported
  * properly.  For example, if your device can only drive the low 24-bits
@@ -299,7 +301,7 @@ extern dma_addr_t dma_map_single(struct device *, void *, size_t,
 		enum dma_data_direction);
 extern void dma_unmap_single(struct device *, dma_addr_t, size_t,
 		enum dma_data_direction);
-extern dma_addr_t dma_map_page(struct device *, struct page *,
+extern dma_addr_t __dma_map_page(struct device *, struct page *,
 		unsigned long, size_t, enum dma_data_direction);
 extern void dma_unmap_page(struct device *, dma_addr_t, size_t,
 		enum dma_data_direction);
@@ -345,6 +347,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
 	BUG_ON(!valid_dma_direction(dir));
 
 	__dma_single_cpu_to_dev(cpu_addr, size, dir);
+	__dma_barrier(dir);
 
 	return virt_to_dma(dev, cpu_addr);
 }
@@ -363,7 +366,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
  * The device owns this memory once this call has completed.  The CPU
  * can regain ownership by calling dma_unmap_page().
  */
-static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
+static inline dma_addr_t __dma_map_page(struct device *dev, struct page *page,
 	     unsigned long offset, size_t size, enum dma_data_direction dir)
 {
 	BUG_ON(!valid_dma_direction(dir));
@@ -373,6 +376,14 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 	return page_to_dma(dev, page) + offset;
 }
 
+static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
+	     unsigned long offset, size_t size, enum dma_data_direction dir)
+{
+	dma_addr_t addr = __dma_map_page(page, offset, size, dir);
+	__dma_barrier(dir);
+	return addr;
+}
+
 /**
  * dma_unmap_single - unmap a single buffer previously mapped
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -391,6 +402,7 @@ static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir)
 {
 	__dma_single_dev_to_cpu(dma_to_virt(dev, handle), size, dir);
+	__dma_barrier(dir);
 }
 
 /**
@@ -412,6 +424,7 @@ static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
 {
 	__dma_page_dev_to_cpu(dma_to_page(dev, handle), handle & ~PAGE_MASK,
 		size, dir);
+	__dma_barrier(dir);
 }
 #endif /* CONFIG_DMABOUNCE */
 
@@ -443,6 +456,7 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
 		return;
 
 	__dma_single_dev_to_cpu(dma_to_virt(dev, handle) + offset, size, dir);
+	__dma_barrier(dir);
 }
 
 static inline void dma_sync_single_range_for_device(struct device *dev,
@@ -455,6 +469,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
 		return;
 
 	__dma_single_cpu_to_dev(dma_to_virt(dev, handle) + offset, size, dir);
+	__dma_barrier(dir);
 }
 
 static inline void dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index 7148e53..cdcfae2 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -168,8 +168,6 @@ fa_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -186,8 +184,6 @@ fa_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0	
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -201,8 +197,6 @@ ENTRY(fa_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0	
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -229,6 +223,12 @@ ENTRY(fa_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(fa_dma_unmap_area)
 
+ENTRY(fa_dma_barrier)
+	mov	r0, #0	
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+ENDPROC(fa_dma_barrier)
+
 	__INITDATA
 
 	.type	fa_cache_fns, #object
@@ -241,5 +241,6 @@ ENTRY(fa_cache_fns)
 	.long	fa_flush_kern_dcache_area
 	.long	fa_dma_map_area
 	.long	fa_dma_unmap_area
+	.long	fa_dma_barrier
 	.long	fa_dma_flush_range
 	.size	fa_cache_fns, . - fa_cache_fns
diff --git a/arch/arm/mm/cache-v3.S b/arch/arm/mm/cache-v3.S
index c2ff3c5..df34458 100644
--- a/arch/arm/mm/cache-v3.S
+++ b/arch/arm/mm/cache-v3.S
@@ -114,9 +114,11 @@ ENTRY(v3_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v3_dma_map_area)
+ENTRY(v3_dma_barrier)
 	mov	pc, lr
 ENDPROC(v3_dma_unmap_area)
 ENDPROC(v3_dma_map_area)
+ENDPROC(v3_dma_barrier)
 
 	__INITDATA
 
@@ -130,5 +132,6 @@ ENTRY(v3_cache_fns)
 	.long	v3_flush_kern_dcache_area
 	.long	v3_dma_map_area
 	.long	v3_dma_unmap_area
+	.long	v3_dma_barrier
 	.long	v3_dma_flush_range
 	.size	v3_cache_fns, . - v3_cache_fns
diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S
index 4810f7e..20260b1 100644
--- a/arch/arm/mm/cache-v4.S
+++ b/arch/arm/mm/cache-v4.S
@@ -126,9 +126,11 @@ ENTRY(v4_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v4_dma_map_area)
+ENTRY(v4_dma_barrier)
 	mov	pc, lr
 ENDPROC(v4_dma_unmap_area)
 ENDPROC(v4_dma_map_area)
+ENDPROC(v4_dma_barrier)
 
 	__INITDATA
 
@@ -142,5 +144,6 @@ ENTRY(v4_cache_fns)
 	.long	v4_flush_kern_dcache_area
 	.long	v4_dma_map_area
 	.long	v4_dma_unmap_area
+	.long	v4_dma_barrier
 	.long	v4_dma_flush_range
 	.size	v4_cache_fns, . - v4_cache_fns
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index df8368a..9c9c875 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -183,7 +183,6 @@ v4wb_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -200,7 +199,6 @@ v4wb_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -240,6 +238,12 @@ ENTRY(v4wb_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(v4wb_dma_unmap_area)
 
+ENTRY(v4wb_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+ENDPROC(v4wb_dma_barrier)
+
 	__INITDATA
 
 	.type	v4wb_cache_fns, #object
@@ -252,5 +256,6 @@ ENTRY(v4wb_cache_fns)
 	.long	v4wb_flush_kern_dcache_area
 	.long	v4wb_dma_map_area
 	.long	v4wb_dma_unmap_area
+	.long	v4wb_dma_barrier
 	.long	v4wb_dma_flush_range
 	.size	v4wb_cache_fns, . - v4wb_cache_fns
diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S
index 45c7031..223eea4 100644
--- a/arch/arm/mm/cache-v4wt.S
+++ b/arch/arm/mm/cache-v4wt.S
@@ -180,9 +180,11 @@ ENTRY(v4wt_dma_unmap_area)
  *	- dir	- DMA direction
  */
 ENTRY(v4wt_dma_map_area)
+ENTRY(v4wt_dma_barrier)
 	mov	pc, lr
 ENDPROC(v4wt_dma_unmap_area)
 ENDPROC(v4wt_dma_map_area)
+ENDPROC(v4wt_dma_barrier)
 
 	__INITDATA
 
@@ -196,5 +198,6 @@ ENTRY(v4wt_cache_fns)
 	.long	v4wt_flush_kern_dcache_area
 	.long	v4wt_dma_map_area
 	.long	v4wt_dma_unmap_area
+	.long	v4wt_dma_barrier
 	.long	v4wt_dma_flush_range
 	.size	v4wt_cache_fns, . - v4wt_cache_fns
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 9d89c67..b294854 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -219,8 +219,6 @@ v6_dma_inv_range:
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -239,8 +237,6 @@ v6_dma_clean_range:
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -259,8 +255,6 @@ ENTRY(v6_dma_flush_range)
 	add	r0, r0, #D_CACHE_LINE_SIZE
 	cmp	r0, r1
 	blo	1b
-	mov	r0, #0
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
@@ -289,6 +283,12 @@ ENTRY(v6_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(v6_dma_unmap_area)
 
+ENTRY(v6_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
+	mov	pc, lr
+ENDPROC(v6_dma_barrier)
+
 	__INITDATA
 
 	.type	v6_cache_fns, #object
@@ -301,5 +301,6 @@ ENTRY(v6_cache_fns)
 	.long	v6_flush_kern_dcache_area
 	.long	v6_dma_map_area
 	.long	v6_dma_unmap_area
+	.long	v6_dma_barrier
 	.long	v6_dma_flush_range
 	.size	v6_cache_fns, . - v6_cache_fns
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index bcd64f2..d89d55a 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -231,7 +231,6 @@ v7_dma_inv_range:
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
 	mov	pc, lr
 ENDPROC(v7_dma_inv_range)
 
@@ -249,7 +248,6 @@ v7_dma_clean_range:
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
 	mov	pc, lr
 ENDPROC(v7_dma_clean_range)
 
@@ -267,7 +265,6 @@ ENTRY(v7_dma_flush_range)
 	add	r0, r0, r2
 	cmp	r0, r1
 	blo	1b
-	dsb
 	mov	pc, lr
 ENDPROC(v7_dma_flush_range)
 
@@ -297,6 +294,11 @@ ENTRY(v7_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(v7_dma_unmap_area)
 
+ENTRY(v7_dma_barrier)
+	dsb
+	mov	pc, lr
+ENDPROC(v7_dma_barrier)
+
 	__INITDATA
 
 	.type	v7_cache_fns, #object
@@ -309,5 +311,6 @@ ENTRY(v7_cache_fns)
 	.long	v7_flush_kern_dcache_area
 	.long	v7_dma_map_area
 	.long	v7_dma_unmap_area
+	.long	v6_dma_barrier
 	.long	v7_dma_flush_range
 	.size	v7_cache_fns, . - v7_cache_fns
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 64daef2..d807f38 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -108,6 +108,7 @@ static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gf
 	memset(ptr, 0, size);
 	dmac_flush_range(ptr, ptr + size);
 	outer_flush_range(__pa(ptr), __pa(ptr) + size);
+	dmac_barrier();
 
 	return page;
 }
@@ -509,6 +510,12 @@ void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
 }
 EXPORT_SYMBOL(___dma_page_dev_to_cpu);
 
+void __dma_barrier(enum dma_data_direction dir)
+{
+	dmac_barrier();
+}
+EXPORT_SYMBOL(__dma_barrier);
+
 /**
  * dma_map_sg - map a set of SG buffers for streaming mode DMA
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -532,11 +539,14 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 	int i, j;
 
 	for_each_sg(sg, s, nents, i) {
-		s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
+		s->dma_address = __dma_map_page(dev, sg_page(s), s->offset,
 						s->length, dir);
 		if (dma_mapping_error(dev, s->dma_address))
 			goto bad_mapping;
 	}
+
+	__dma_barrier(dir);
+
 	return nents;
 
  bad_mapping:
@@ -564,6 +574,8 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
 
 	for_each_sg(sg, s, nents, i)
 		dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
+
+	__dma_barrier(dir);
 }
 EXPORT_SYMBOL(dma_unmap_sg);
 
@@ -588,6 +600,8 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		__dma_page_dev_to_cpu(sg_page(s), s->offset,
 				      s->length, dir);
 	}
+
+	__dma_barrier(dir);
 }
 EXPORT_SYMBOL(dma_sync_sg_for_cpu);
 
@@ -612,5 +626,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		__dma_page_cpu_to_dev(sg_page(s), s->offset,
 				      s->length, dir);
 	}
+
+	__dma_barrier(dir);
 }
 EXPORT_SYMBOL(dma_sync_sg_for_device);
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index d278298..fea33c9 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -271,7 +271,6 @@ arm1020e_dma_inv_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -293,7 +292,6 @@ arm1020e_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -313,7 +311,6 @@ ENTRY(arm1020e_dma_flush_range)
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -340,6 +337,12 @@ ENTRY(arm1020e_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm1020e_dma_unmap_area)
 
+ENTRY(arm1020e_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm1020e_dma_barrier)
+
 ENTRY(arm1020e_cache_fns)
 	.long	arm1020e_flush_kern_cache_all
 	.long	arm1020e_flush_user_cache_all
@@ -349,6 +352,7 @@ ENTRY(arm1020e_cache_fns)
 	.long	arm1020e_flush_kern_dcache_area
 	.long	arm1020e_dma_map_area
 	.long	arm1020e_dma_unmap_area
+	.long	arm1020e_dma_barrier
 	.long	arm1020e_dma_flush_range
 
 	.align	5
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index ce13e4a..ba1a7df 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -260,7 +260,6 @@ arm1022_dma_inv_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -282,7 +281,6 @@ arm1022_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -302,7 +300,6 @@ ENTRY(arm1022_dma_flush_range)
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -329,6 +326,12 @@ ENTRY(arm1022_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm1022_dma_unmap_area)
 
+ENTRY(arm1022_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm1022_dma_barrier)
+
 ENTRY(arm1022_cache_fns)
 	.long	arm1022_flush_kern_cache_all
 	.long	arm1022_flush_user_cache_all
@@ -338,6 +341,7 @@ ENTRY(arm1022_cache_fns)
 	.long	arm1022_flush_kern_dcache_area
 	.long	arm1022_dma_map_area
 	.long	arm1022_dma_unmap_area
+	.long	arm1022_dma_barrier
 	.long	arm1022_dma_flush_range
 
 	.align	5
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index 636672a..de648f1 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -254,7 +254,6 @@ arm1026_dma_inv_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -276,7 +275,6 @@ arm1026_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -296,7 +294,6 @@ ENTRY(arm1026_dma_flush_range)
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -323,6 +320,12 @@ ENTRY(arm1026_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm1026_dma_unmap_area)
 
+ENTRY(arm1026_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm1026_dma_barrier)
+
 ENTRY(arm1026_cache_fns)
 	.long	arm1026_flush_kern_cache_all
 	.long	arm1026_flush_user_cache_all
@@ -332,6 +335,7 @@ ENTRY(arm1026_cache_fns)
 	.long	arm1026_flush_kern_dcache_area
 	.long	arm1026_dma_map_area
 	.long	arm1026_dma_unmap_area
+	.long	arm1026_dma_barrier
 	.long	arm1026_dma_flush_range
 
 	.align	5
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 8be8199..ec74093 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -249,7 +249,6 @@ arm920_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -268,7 +267,6 @@ arm920_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -285,7 +283,6 @@ ENTRY(arm920_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -312,6 +309,12 @@ ENTRY(arm920_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm920_dma_unmap_area)
 
+ENTRY(arm920_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm920_dma_barrier)
+
 ENTRY(arm920_cache_fns)
 	.long	arm920_flush_kern_cache_all
 	.long	arm920_flush_user_cache_all
@@ -321,6 +324,7 @@ ENTRY(arm920_cache_fns)
 	.long	arm920_flush_kern_dcache_area
 	.long	arm920_dma_map_area
 	.long	arm920_dma_unmap_area
+	.long	arm920_dma_barrier
 	.long	arm920_dma_flush_range
 
 #endif
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index c0ff8e4..474d4c6 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -251,7 +251,6 @@ arm922_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -270,7 +269,6 @@ arm922_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -287,7 +285,6 @@ ENTRY(arm922_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -314,6 +311,12 @@ ENTRY(arm922_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm922_dma_unmap_area)
 
+ENTRY(arm922_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm922_dma_barrier)
+
 ENTRY(arm922_cache_fns)
 	.long	arm922_flush_kern_cache_all
 	.long	arm922_flush_user_cache_all
@@ -323,6 +326,7 @@ ENTRY(arm922_cache_fns)
 	.long	arm922_flush_kern_dcache_area
 	.long	arm922_dma_map_area
 	.long	arm922_dma_unmap_area
+	.long	arm922_dma_barrier
 	.long	arm922_dma_flush_range
 
 #endif
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index 3c6cffe..0336ae3 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -295,7 +295,6 @@ arm925_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -316,7 +315,6 @@ arm925_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -338,7 +336,6 @@ ENTRY(arm925_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -365,6 +362,12 @@ ENTRY(arm925_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm925_dma_unmap_area)
 
+ENTRY(arm925_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm925_dma_barrier)
+
 ENTRY(arm925_cache_fns)
 	.long	arm925_flush_kern_cache_all
 	.long	arm925_flush_user_cache_all
@@ -374,6 +377,7 @@ ENTRY(arm925_cache_fns)
 	.long	arm925_flush_kern_dcache_area
 	.long	arm925_dma_map_area
 	.long	arm925_dma_unmap_area
+	.long	arm925_dma_barrier
 	.long	arm925_dma_flush_range
 
 ENTRY(cpu_arm925_dcache_clean_area)
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 75b707c..473bbe6 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -258,7 +258,6 @@ arm926_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -279,7 +278,6 @@ arm926_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -301,7 +299,6 @@ ENTRY(arm926_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -328,6 +325,12 @@ ENTRY(arm926_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm926_dma_unmap_area)
 
+ENTRY(arm926_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm926_dma_barrier)
+
 ENTRY(arm926_cache_fns)
 	.long	arm926_flush_kern_cache_all
 	.long	arm926_flush_user_cache_all
@@ -337,6 +340,7 @@ ENTRY(arm926_cache_fns)
 	.long	arm926_flush_kern_dcache_area
 	.long	arm926_dma_map_area
 	.long	arm926_dma_unmap_area
+	.long	arm926_dma_barrier
 	.long	arm926_dma_flush_range
 
 ENTRY(cpu_arm926_dcache_clean_area)
diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S
index 1af1657..c44c963 100644
--- a/arch/arm/mm/proc-arm940.S
+++ b/arch/arm/mm/proc-arm940.S
@@ -180,7 +180,6 @@ arm940_dma_inv_range:
 	bcs	2b				@ entries 63 to 0
 	subs	r1, r1, #1 << 4
 	bcs	1b				@ segments 7 to 0
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -204,7 +203,6 @@ ENTRY(cpu_arm940_dcache_clean_area)
 	subs	r1, r1, #1 << 4
 	bcs	1b				@ segments 7 to 0
 #endif
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -230,7 +228,6 @@ ENTRY(arm940_dma_flush_range)
 	bcs	2b				@ entries 63 to 0
 	subs	r1, r1, #1 << 4
 	bcs	1b				@ segments 7 to 0
-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -257,6 +254,12 @@ ENTRY(arm940_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm940_dma_unmap_area)
 
+ENTRY(arm940_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm940_dma_barrier)
+
 ENTRY(arm940_cache_fns)
 	.long	arm940_flush_kern_cache_all
 	.long	arm940_flush_user_cache_all
@@ -266,6 +269,7 @@ ENTRY(arm940_cache_fns)
 	.long	arm940_flush_kern_dcache_area
 	.long	arm940_dma_map_area
 	.long	arm940_dma_unmap_area
+	.long	arm940_dma_barrier
 	.long	arm940_dma_flush_range
 
 	__INIT
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index 1664b6a..11e9ad7 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -227,7 +227,6 @@ arm946_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -248,7 +247,6 @@ arm946_dma_clean_range:
 	cmp	r0, r1
 	blo	1b
 #endif
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -272,7 +270,6 @@ ENTRY(arm946_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -299,6 +296,12 @@ ENTRY(arm946_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(arm946_dma_unmap_area)
 
+ENTRY(arm946_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(arm946_dma_barrier)
+
 ENTRY(arm946_cache_fns)
 	.long	arm946_flush_kern_cache_all
 	.long	arm946_flush_user_cache_all
@@ -308,6 +311,7 @@ ENTRY(arm946_cache_fns)
 	.long	arm946_flush_kern_dcache_area
 	.long	arm946_dma_map_area
 	.long	arm946_dma_unmap_area
+	.long	arm946_dma_barrier
 	.long	arm946_dma_flush_range
 
 
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index 53e6323..50a309e 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -284,7 +284,6 @@ feroceon_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 	.align	5
@@ -320,7 +319,6 @@ feroceon_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 	.align	5
@@ -333,7 +331,6 @@ feroceon_range_dma_clean_range:
 	mcr	p15, 5, r0, c15, c13, 0		@ D clean range start
 	mcr	p15, 5, r1, c15, c13, 1		@ D clean range top
 	msr	cpsr_c, r2			@ restore interrupts
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -351,7 +348,6 @@ ENTRY(feroceon_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 	.align	5
@@ -364,7 +360,6 @@ ENTRY(feroceon_range_dma_flush_range)
 	mcr	p15, 5, r0, c15, c15, 0		@ D clean/inv range start
 	mcr	p15, 5, r1, c15, c15, 1		@ D clean/inv range top
 	msr	cpsr_c, r2			@ restore interrupts
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -405,6 +400,12 @@ ENTRY(feroceon_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(feroceon_dma_unmap_area)
 
+ENTRY(feroceon_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(feroceon_dma_barrier)
+
 ENTRY(feroceon_cache_fns)
 	.long	feroceon_flush_kern_cache_all
 	.long	feroceon_flush_user_cache_all
@@ -414,6 +415,7 @@ ENTRY(feroceon_cache_fns)
 	.long	feroceon_flush_kern_dcache_area
 	.long	feroceon_dma_map_area
 	.long	feroceon_dma_unmap_area
+	.long	feroceon_dma_barrier
 	.long	feroceon_dma_flush_range
 
 ENTRY(feroceon_range_cache_fns)
@@ -425,6 +427,7 @@ ENTRY(feroceon_range_cache_fns)
 	.long	feroceon_range_flush_kern_dcache_area
 	.long	feroceon_range_dma_map_area
 	.long	feroceon_dma_unmap_area
+	.long	feroceon_dma_barrier
 	.long	feroceon_range_dma_flush_range
 
 	.align	5
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index caa3115..09e8883 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -228,7 +228,6 @@ mohawk_dma_inv_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -247,7 +246,6 @@ mohawk_dma_clean_range:
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -265,7 +263,6 @@ ENTRY(mohawk_dma_flush_range)
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
@@ -292,6 +289,12 @@ ENTRY(mohawk_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(mohawk_dma_unmap_area)
 
+ENTRY(mohawk_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
+	mov	pc, lr
+ENDPROC(mohawk_dma_barrier)
+
 ENTRY(mohawk_cache_fns)
 	.long	mohawk_flush_kern_cache_all
 	.long	mohawk_flush_user_cache_all
@@ -301,6 +304,7 @@ ENTRY(mohawk_cache_fns)
 	.long	mohawk_flush_kern_dcache_area
 	.long	mohawk_dma_map_area
 	.long	mohawk_dma_unmap_area
+	.long	mohawk_dma_barrier
 	.long	mohawk_dma_flush_range
 
 ENTRY(cpu_mohawk_dcache_clean_area)
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index 046b3d8..d033ed4 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -267,7 +267,6 @@ xsc3_dma_inv_range:
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mov	pc, lr
 
 /*
@@ -284,7 +283,6 @@ xsc3_dma_clean_range:
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mov	pc, lr
 
 /*
@@ -301,7 +299,6 @@ ENTRY(xsc3_dma_flush_range)
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
 	mov	pc, lr
 
 /*
@@ -328,6 +325,12 @@ ENTRY(xsc3_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(xsc3_dma_unmap_area)
 
+ENTRY(xsc3_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ data write barrier
+	mov	pc, lr
+ENDPROC(xsc3_dma_barrier)
+
 ENTRY(xsc3_cache_fns)
 	.long	xsc3_flush_kern_cache_all
 	.long	xsc3_flush_user_cache_all
@@ -337,6 +340,7 @@ ENTRY(xsc3_cache_fns)
 	.long	xsc3_flush_kern_dcache_area
 	.long	xsc3_dma_map_area
 	.long	xsc3_dma_unmap_area
+	.long	xsc3_dma_barrier
 	.long	xsc3_dma_flush_range
 
 ENTRY(cpu_xsc3_dcache_clean_area)
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index 63037e2..e390ae6 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -325,7 +325,6 @@ xscale_dma_inv_range:
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ Drain Write (& Fill) Buffer
 	mov	pc, lr
 
 /*
@@ -342,7 +341,6 @@ xscale_dma_clean_range:
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ Drain Write (& Fill) Buffer
 	mov	pc, lr
 
 /*
@@ -360,7 +358,6 @@ ENTRY(xscale_dma_flush_range)
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ Drain Write (& Fill) Buffer
 	mov	pc, lr
 
 /*
@@ -400,6 +397,12 @@ ENTRY(xscale_dma_unmap_area)
 	mov	pc, lr
 ENDPROC(xscale_dma_unmap_area)
 
+ENTRY(xscale_dma_barrier)
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c10, 4		@ Drain Write (& Fill) Buffer
+	mov	pc, lr
+ENDPROC(xscsale_dma_barrier)
+
 ENTRY(xscale_cache_fns)
 	.long	xscale_flush_kern_cache_all
 	.long	xscale_flush_user_cache_all
@@ -409,6 +412,7 @@ ENTRY(xscale_cache_fns)
 	.long	xscale_flush_kern_dcache_area
 	.long	xscale_dma_map_area
 	.long	xscale_dma_unmap_area
+	.long	xscale_dma_barrier
 	.long	xscale_dma_flush_range
 
 /*

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 21:27 ` Randy Dunlap
  2010-02-10 22:40   ` Russell King - ARM Linux
@ 2010-02-11  0:39   ` FUJITA Tomonori
  2010-02-11  0:41     ` Randy Dunlap
  1 sibling, 1 reply; 18+ messages in thread
From: FUJITA Tomonori @ 2010-02-11  0:39 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 10 Feb 2010 13:27:47 -0800
Randy Dunlap <rdunlap@xenotime.net> wrote:

> On 02/10/10 12:37, adharmap at codeaurora.org wrote:
> > From: Abhijeet Dharmapurikar <adharmap@quicinc.com>
> > 
> > Please refer to the post here
> > http://lkml.org/lkml/2010/1/4/347
> > 
> > These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> > use them to map the buffers in the scatterlist. For the last buffer, call
> > the normal dma_map_area(aka with barriers) effectively executing the barrier
> > at the end of the operation.
> > 
> > Note that the barrierless operations are implemented for few arm
> > architectures only and I would implement for others once these are okayed by the
> > community.
> 
> So when you add these interfaces for other architectures, you will also
> update Documentation/DMA-API.txt,  right??

Seems that you misunderstand him.

He is talking about other "arm" architectures. His patchset improves
arm's internal implementation (dma_map_area and dma_unmap_area are not
the DMA API; not exported for driver writers). He meant that the
patchset doesn't cover all arm architectures.

This is about arm's implementation details and not related with other
non arm architectures. So no need to update Documentation/DMA-API.txt.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-11  0:39   ` FUJITA Tomonori
@ 2010-02-11  0:41     ` Randy Dunlap
  0 siblings, 0 replies; 18+ messages in thread
From: Randy Dunlap @ 2010-02-11  0:41 UTC (permalink / raw)
  To: linux-arm-kernel

On 02/10/10 16:39, FUJITA Tomonori wrote:
> On Wed, 10 Feb 2010 13:27:47 -0800
> Randy Dunlap <rdunlap@xenotime.net> wrote:
> 
>> On 02/10/10 12:37, adharmap at codeaurora.org wrote:
>>> From: Abhijeet Dharmapurikar <adharmap@quicinc.com>
>>>
>>> Please refer to the post here
>>> http://lkml.org/lkml/2010/1/4/347
>>>
>>> These changes are to introduce barrierless dma_map_area and dma_unmap_area and
>>> use them to map the buffers in the scatterlist. For the last buffer, call
>>> the normal dma_map_area(aka with barriers) effectively executing the barrier
>>> at the end of the operation.
>>>
>>> Note that the barrierless operations are implemented for few arm
>>> architectures only and I would implement for others once these are okayed by the
>>> community.
>>
>> So when you add these interfaces for other architectures, you will also
>> update Documentation/DMA-API.txt,  right??
> 
> Seems that you misunderstand him.
> 
> He is talking about other "arm" architectures. His patchset improves
> arm's internal implementation (dma_map_area and dma_unmap_area are not
> the DMA API; not exported for driver writers). He meant that the
> patchset doesn't cover all arm architectures.
> 
> This is about arm's implementation details and not related with other
> non arm architectures. So no need to update Documentation/DMA-API.txt.

OK, in that case I did misunderstand.  Thanks for the info.

-- 
~Randy

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 21:21 ` [RFC 0/2] fix dma_map_sg not to do barriers for each buffer Russell King - ARM Linux
  2010-02-10 23:28   ` Abhijeet Dharmapurikar
@ 2010-02-11 10:45   ` Catalin Marinas
  2010-02-11 10:53     ` Catalin Marinas
  2010-02-11 10:56     ` Russell King - ARM Linux
  1 sibling, 2 replies; 18+ messages in thread
From: Catalin Marinas @ 2010-02-11 10:45 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 2010-02-10 at 21:21 +0000, Russell King - ARM Linux wrote:
> On Wed, Feb 10, 2010 at 12:37:28PM -0800, adharmap at codeaurora.org wrote:
> > From: Abhijeet Dharmapurikar <adharmap@quicinc.com>
> >
> > Please refer to the post here
> > http://lkml.org/lkml/2010/1/4/347
> >
> > These changes are to introduce barrierless dma_map_area and dma_unmap_area and
> > use them to map the buffers in the scatterlist. For the last buffer, call
> > the normal dma_map_area(aka with barriers) effectively executing the barrier
> > at the end of the operation.
> 
> What if we make dma_map_area and dma_unmap_area both be barrier-less,
> and instead have a separate dma_barrier method - eg, something like the
> attached?

I was just writing the reply when I noticed yours :). Yes, that's a
better approach.

> diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
> index e290885..5928e78 100644
> --- a/arch/arm/include/asm/cacheflush.h
> +++ b/arch/arm/include/asm/cacheflush.h
> @@ -200,6 +200,7 @@ struct cpu_cache_fns {
> 
>         void (*dma_map_area)(const void *, size_t, int);
>         void (*dma_unmap_area)(const void *, size_t, int);
> +       void (*dma_barrier)(void);

Alternatively we could use the dsb() macro. I don't think we need more
than this and we would not (well, not easily) compile ARMv5 and ARMv6 in
the same kernel.

Anyway, an additional branch and return would probably be negligible
compared to the cache flushing operation.

> @@ -345,6 +347,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
>         BUG_ON(!valid_dma_direction(dir));
> 
>         __dma_single_cpu_to_dev(cpu_addr, size, dir);
> +       __dma_barrier(dir);
> 
>         return virt_to_dma(dev, cpu_addr);
>  }

The ___dma_single_cpu_to_dev() covers both inner and outer caches but I
haven't seen it touched by this patch (nor the other you posted). When
you clean the L1 cache, you need to make sure that there is a barrier
(DSB) so that it completes before cleaning the L2, otherwise you clean
the L2 but data keeps coming from L1.

For the *_sg functions, you either use barrier between L1 and L2 for
each page or you do the for_each_sg() loop twice, once for L1 and
another for L2.

-- 
Catalin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-11 10:45   ` Catalin Marinas
@ 2010-02-11 10:53     ` Catalin Marinas
  2010-02-11 11:01       ` Russell King - ARM Linux
  2010-02-11 10:56     ` Russell King - ARM Linux
  1 sibling, 1 reply; 18+ messages in thread
From: Catalin Marinas @ 2010-02-11 10:53 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, 2010-02-11 at 10:45 +0000, Catalin Marinas wrote:
> On Wed, 2010-02-10 at 21:21 +0000, Russell King - ARM Linux wrote:
> > @@ -345,6 +347,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
> >         BUG_ON(!valid_dma_direction(dir));
> >
> >         __dma_single_cpu_to_dev(cpu_addr, size, dir);
> > +       __dma_barrier(dir);
> >
> >         return virt_to_dma(dev, cpu_addr);
> >  }
> 
> The ___dma_single_cpu_to_dev() covers both inner and outer caches but I
> haven't seen it touched by this patch (nor the other you posted). When
> you clean the L1 cache, you need to make sure that there is a barrier
> (DSB) so that it completes before cleaning the L2, otherwise you clean
> the L2 but data keeps coming from L1.

Actually after L2 maintenance we don't even need the __dma_barrier(), we
need an outer_cache.sync() function.

I can do the outer cache optimisations together with a few others for
PL310 (which does not require the cache_wait() call for line
operations).

-- 
Catalin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-11 10:45   ` Catalin Marinas
  2010-02-11 10:53     ` Catalin Marinas
@ 2010-02-11 10:56     ` Russell King - ARM Linux
  2010-02-11 19:13       ` Abhijeet Dharmapurikar
  1 sibling, 1 reply; 18+ messages in thread
From: Russell King - ARM Linux @ 2010-02-11 10:56 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 11, 2010 at 10:45:01AM +0000, Catalin Marinas wrote:
> Alternatively we could use the dsb() macro. I don't think we need more
> than this and we would not (well, not easily) compile ARMv5 and ARMv6 in
> the same kernel.

That doesn't work - ARMv3 and some ARMv4 don't have a 'drain write
buffer' instruction but others do - executing that instruction on
older CPUs which don't have a write buffer causes an illegal
instruction fault.

> The ___dma_single_cpu_to_dev() covers both inner and outer caches but I
> haven't seen it touched by this patch (nor the other you posted). When
> you clean the L1 cache, you need to make sure that there is a barrier
> (DSB) so that it completes before cleaning the L2, otherwise you clean
> the L2 but data keeps coming from L1.
> 
> For the *_sg functions, you either use barrier between L1 and L2 for
> each page or you do the for_each_sg() loop twice, once for L1 and
> another for L2.

Okay, that's a fundamental problem with this approach.  Spanner in the
works kind of thing.  I think that's a problem for Abhijeet's patch
as well - since the same comment appears to apply there too.

Sounds like it needs a totally different approach then.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-11 10:53     ` Catalin Marinas
@ 2010-02-11 11:01       ` Russell King - ARM Linux
  2010-02-11 11:03         ` Catalin Marinas
  0 siblings, 1 reply; 18+ messages in thread
From: Russell King - ARM Linux @ 2010-02-11 11:01 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Feb 11, 2010 at 10:53:05AM +0000, Catalin Marinas wrote:
> Actually after L2 maintenance we don't even need the __dma_barrier(), we
> need an outer_cache.sync() function.
> 
> I can do the outer cache optimisations together with a few others for
> PL310 (which does not require the cache_wait() call for line
> operations).

I'm in half a mind to say "stop everything for the DMA API and wait
until the next merge window" - what we have at the moment is a big
shake up of how the API is implemented, which has had very little
attributable testing.

Let's get the current code (which missed the last merge window) tested,
acked and merged, and only then sort out these kinds of optimizations
after that.  As it is, these DMA patches have had very little in the
way of attributable feedback so far.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-11 11:01       ` Russell King - ARM Linux
@ 2010-02-11 11:03         ` Catalin Marinas
  0 siblings, 0 replies; 18+ messages in thread
From: Catalin Marinas @ 2010-02-11 11:03 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, 2010-02-11 at 11:01 +0000, Russell King - ARM Linux wrote:
> On Thu, Feb 11, 2010 at 10:53:05AM +0000, Catalin Marinas wrote:
> > Actually after L2 maintenance we don't even need the __dma_barrier(), we
> > need an outer_cache.sync() function.
> >
> > I can do the outer cache optimisations together with a few others for
> > PL310 (which does not require the cache_wait() call for line
> > operations).
> 
> I'm in half a mind to say "stop everything for the DMA API and wait
> until the next merge window" - what we have at the moment is a big
> shake up of how the API is implemented, which has had very little
> attributable testing.
> 
> Let's get the current code (which missed the last merge window) tested,
> acked and merged, and only then sort out these kinds of optimizations
> after that.  As it is, these DMA patches have had very little in the
> way of attributable feedback so far.

I agree, I wasn't planning to submit anything for 2.6.34. These
optimisations should probably get in 2.6.35.

-- 
Catalin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-11 10:56     ` Russell King - ARM Linux
@ 2010-02-11 19:13       ` Abhijeet Dharmapurikar
  0 siblings, 0 replies; 18+ messages in thread
From: Abhijeet Dharmapurikar @ 2010-02-11 19:13 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King - ARM Linux wrote:
> On Thu, Feb 11, 2010 at 10:45:01AM +0000, Catalin Marinas wrote:
>> Alternatively we could use the dsb() macro. I don't think we need more
>> than this and we would not (well, not easily) compile ARMv5 and ARMv6 in
>> the same kernel.
> 
> That doesn't work - ARMv3 and some ARMv4 don't have a 'drain write
> buffer' instruction but others do - executing that instruction on
> older CPUs which don't have a write buffer causes an illegal
> instruction fault.
> 
>> The ___dma_single_cpu_to_dev() covers both inner and outer caches but I
>> haven't seen it touched by this patch (nor the other you posted). When
>> you clean the L1 cache, you need to make sure that there is a barrier
>> (DSB) so that it completes before cleaning the L2, otherwise you clean
>> the L2 but data keeps coming from L1.
>>
>> For the *_sg functions, you either use barrier between L1 and L2 for
>> each page or you do the for_each_sg() loop twice, once for L1 and
>> another for L2.
> 
> Okay, that's a fundamental problem with this approach.  Spanner in the
> works kind of thing.  I think that's a problem for Abhijeet's patch
> as well - since the same comment appears to apply there too.

The problem applies to my patch as well, however my board has a unified 
cache and I didn't think about ordering operations on the outer caches.

> Sounds like it needs a totally different approach then.
how about the following ?


 From ea746d981f6f7291fd0f8b3f51bdd3747ca976c5 Mon Sep 17 00:00:00 2001
From: Abhijeet Dharmapurikar <adharmap@codeaurora.org>
Date: Thu, 11 Feb 2010 10:29:19 -0800
Subject: [PATCH] dma: define map/unmap functions for outer cache

Define map and unmap functions for outer cache and execute barriers
at appropriate places within them. For architectures without outer caches
these functions are nil.

Signed-off-by: Abhijeet Dharmapurikar <adharmap@codeaurora.org>
---
  arch/arm/include/asm/cacheflush.h |   39 
+++++++++++++++++++++++++++++++++++++
  arch/arm/mm/dma-mapping.c         |   17 +--------------
  2 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/arch/arm/include/asm/cacheflush.h 
b/arch/arm/include/asm/cacheflush.h
index 8148a00..3474a54 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -11,6 +11,7 @@
  #define _ASMARM_CACHEFLUSH_H

  #include <linux/mm.h>
+#include <linux/dma-mapping.h>

  #include <asm/glue.h>
  #include <asm/shmparam.h>
@@ -300,6 +301,38 @@ static inline void outer_flush_range(unsigned long 
start, unsigned long end)
  		outer_cache.flush_range(start, end);
  }

+static inline void dmac_outer_map_area(const void *kaddr, size_t size,
+	enum dma_data_direction dir)
+{
+	unsigned long paddr;
+
+	/* complete all the prior L1 operations */
+	dma_barrier();
+	paddr = __pa(kaddr);
+	if (dir == DMA_FROM_DEVICE) {
+		outer_inv_range(paddr, paddr + size);
+	} else {
+		outer_clean_range(paddr, paddr + size);
+	}
+	/* FIXME: non-speculating: flush on bidirectional mappings? */
+}
+
+static inline void dmac_outer_unmap_area(const void *kaddr, size_t size,
+	enum dma_data_direction dir)
+{
+
+	/* FIXME: non-speculating: not required */
+	/* don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE) {
+		unsigned long paddr = __pa(kaddr);
+		outer_inv_range(paddr, paddr + size);
+	}
+
+	/* complete all the outer cache operations operations */
+	dma_barrier();
+}
+
+
  #else

  static inline void outer_inv_range(unsigned long start, unsigned long end)
@@ -308,6 +341,12 @@ static inline void outer_clean_range(unsigned long 
start, unsigned long end)
  { }
  static inline void outer_flush_range(unsigned long start, unsigned 
long end)
  { }
+static inline void dmac_outer_map_area(const void *kaddr, size_t size,
+	enum dma_data_direction dir)
+{ }
+static inline void dmac_outer_unmap_area(const void *kaddr, size_t size,
+	enum dma_data_direction dir)
+{ }

  #endif

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 64daef2..6fff111 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -407,19 +407,11 @@ EXPORT_SYMBOL(dma_free_coherent);
  void ___dma_single_cpu_to_dev(const void *kaddr, size_t size,
  	enum dma_data_direction dir)
  {
-	unsigned long paddr;
-
  	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));

  	dmac_map_area(kaddr, size, dir);

-	paddr = __pa(kaddr);
-	if (dir == DMA_FROM_DEVICE) {
-		outer_inv_range(paddr, paddr + size);
-	} else {
-		outer_clean_range(paddr, paddr + size);
-	}
-	/* FIXME: non-speculating: flush on bidirectional mappings? */
+	dmac_outer_map_area(kaddr, size, dir);
  }
  EXPORT_SYMBOL(___dma_single_cpu_to_dev);

@@ -428,12 +420,7 @@ void ___dma_single_dev_to_cpu(const void *kaddr, 
size_t size,
  {
  	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));

-	/* FIXME: non-speculating: not required */
-	/* don't bother invalidating if DMA to device */
-	if (dir != DMA_TO_DEVICE) {
-		unsigned long paddr = __pa(kaddr);
-		outer_inv_range(paddr, paddr + size);
-	}
+	dmac_outer_unmap_area(kaddr, size, dir);

  	dmac_unmap_area(kaddr, size, dir);
  }
-- 
1.5.6.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC 0/2] fix dma_map_sg not to do barriers for each buffer
  2010-02-10 23:57     ` Russell King - ARM Linux
@ 2010-02-11 21:36       ` Abhijeet Dharmapurikar
  0 siblings, 0 replies; 18+ messages in thread
From: Abhijeet Dharmapurikar @ 2010-02-11 21:36 UTC (permalink / raw)
  To: linux-arm-kernel


>  }
> @@ -363,7 +366,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
>   * The device owns this memory once this call has completed.  The CPU
>   * can regain ownership by calling dma_unmap_page().
>   */
> -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
> +static inline dma_addr_t __dma_map_page(struct device *dev, struct page *page,
>  	     unsigned long offset, size_t size, enum dma_data_direction dir)
>  {
>  	BUG_ON(!valid_dma_direction(dir));
> @@ -373,6 +376,14 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
>  	return page_to_dma(dev, page) + offset;
>  }
>  
> +static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
> +	     unsigned long offset, size_t size, enum dma_data_direction dir)
> +{
> +	dma_addr_t addr = __dma_map_page(page, offset, size, dir);
> +	__dma_barrier(dir);
> +	return addr;
> +}
> +
>  /**
.
.
.
.
>  /**
>   * dma_map_sg - map a set of SG buffers for streaming mode DMA
>   * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
> @@ -532,11 +539,14 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
>  	int i, j;
>  
>  	for_each_sg(sg, s, nents, i) {
> -		s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
> +		s->dma_address = __dma_map_page(dev, sg_page(s), s->offset,
>  						s->length, dir);
>  		if (dma_mapping_error(dev, s->dma_address))
>  			goto bad_mapping;
>  	}
> +
> +	__dma_barrier(dir);
> +
>  	return nents;
>  
>   bad_mapping:
> @@ -564,6 +574,8 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
>  
>  	for_each_sg(sg, s, nents, i)
>  		dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir);
> +
> +	__dma_barrier(dir);
>  }
>  EXPORT_SYMBOL(dma_unmap_sg);

dma_unmap_sg too could use indirection like dma_map_sg.

Thanks for the patch.
Abhijeet

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2010-02-11 21:36 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-02-10 20:37 [RFC 0/2] fix dma_map_sg not to do barriers for each buffer adharmap at codeaurora.org
2010-02-10 20:37 ` [PATCH 1/2] dma: define barrierless versions of map and unmap area adharmap at codeaurora.org
2010-02-10 20:37 ` [PATCH 2/2] dma: fix scatter-gather api to use barrierless map/unmap functions adharmap at codeaurora.org
2010-02-10 21:21 ` [RFC 0/2] fix dma_map_sg not to do barriers for each buffer Russell King - ARM Linux
2010-02-10 23:28   ` Abhijeet Dharmapurikar
2010-02-10 23:57     ` Russell King - ARM Linux
2010-02-11 21:36       ` Abhijeet Dharmapurikar
2010-02-11 10:45   ` Catalin Marinas
2010-02-11 10:53     ` Catalin Marinas
2010-02-11 11:01       ` Russell King - ARM Linux
2010-02-11 11:03         ` Catalin Marinas
2010-02-11 10:56     ` Russell King - ARM Linux
2010-02-11 19:13       ` Abhijeet Dharmapurikar
2010-02-10 21:27 ` Randy Dunlap
2010-02-10 22:40   ` Russell King - ARM Linux
2010-02-10 23:10     ` Abhijeet Dharmapurikar
2010-02-11  0:39   ` FUJITA Tomonori
2010-02-11  0:41     ` Randy Dunlap

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).