linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ARM: dma-mapping: Don't use outer_flush_range when the L2C is coherent
@ 2015-08-13 13:59 Gregory CLEMENT
  2015-08-13 15:16 ` Catalin Marinas
  0 siblings, 1 reply; 5+ messages in thread
From: Gregory CLEMENT @ 2015-08-13 13:59 UTC (permalink / raw)
  To: linux-arm-kernel

When an L2 cache controller is used in a system that provides hardware
coherency, the entire outer cache operations are useless, and can be
skipped.  Moreover, on some systems, it is harmful as it causes
deadlocks between the Marvell coherency mechanism, the Marvell PCIe
controller and the Cortex-A9.

In the current kernel implementation, the outer cache flush range
operation is triggered by the dma_alloc function.
This operation can take place at runtime and in some
circumstances may lead to the PCIe/PL310 deadlock on Armada 375/38x
SoCs.

This patch extends the __dma_clear_buffer() function to receive a
boolean argument related to the coherency of the system. The same
thing is done for the calling functions. In order to be consistent,
the iommu variants of dma_alloc are also updated.

Reported-by: Nadav Haklai <nadavh@marvell.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Cc: <stable@vger.kernel.org> # v3.16+
---
 arch/arm/mm/dma-mapping.c | 58 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 16 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 1ced8a0f7a52..ccd792af2c51 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -224,7 +224,7 @@ static u64 get_coherent_dma_mask(struct device *dev)
 	return mask;
 }
 
-static void __dma_clear_buffer(struct page *page, size_t size)
+static void __dma_clear_buffer(struct page *page, size_t size, bool is_coherent)
 {
 	/*
 	 * Ensure that the allocated pages are zeroed, and that any data
@@ -241,12 +241,15 @@ static void __dma_clear_buffer(struct page *page, size_t size)
 			page++;
 			size -= PAGE_SIZE;
 		}
-		outer_flush_range(base, end);
+		if (!is_coherent)
+			outer_flush_range(base, end);
 	} else {
 		void *ptr = page_address(page);
 		memset(ptr, 0, size);
-		dmac_flush_range(ptr, ptr + size);
-		outer_flush_range(__pa(ptr), __pa(ptr) + size);
+		if (!is_coherent) {
+			dmac_flush_range(ptr, ptr + size);
+			outer_flush_range(__pa(ptr), __pa(ptr) + size);
+		}
 	}
 }
 
@@ -254,7 +257,8 @@ static void __dma_clear_buffer(struct page *page, size_t size)
  * Allocate a DMA buffer for 'dev' of size 'size' using the
  * specified gfp mask.  Note that 'size' must be page aligned.
  */
-static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gfp)
+static struct page *__dma_alloc_buffer(struct device *dev, size_t size,
+				gfp_t gfp, bool is_coherent)
 {
 	unsigned long order = get_order(size);
 	struct page *page, *p, *e;
@@ -270,7 +274,7 @@ static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gf
 	for (p = page + (size >> PAGE_SHIFT), e = page + (1 << order); p < e; p++)
 		__free_page(p);
 
-	__dma_clear_buffer(page, size);
+	__dma_clear_buffer(page, size, is_coherent);
 
 	return page;
 }
@@ -474,7 +478,8 @@ static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp,
 {
 	struct page *page;
 	void *ptr = NULL;
-	page = __dma_alloc_buffer(dev, size, gfp);
+	/* This function is only called when the arch is non-coherent */
+	page = __dma_alloc_buffer(dev, size, gfp, false);
 	if (!page)
 		return NULL;
 	if (!want_vaddr)
@@ -540,7 +545,13 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size,
 	if (!page)
 		return NULL;
 
-	__dma_clear_buffer(page, size);
+	/*
+	 * We are either called from __dma_alloc on the non-coherent
+	 * case or only once from atomic_pool_init. It is called
+	 * during boot time so it is harmless to use the non-coherent
+	 * case even if the L2C is coherent.
+	 */
+	__dma_clear_buffer(page, size, false);
 
 	if (!want_vaddr)
 		goto out;
@@ -601,7 +612,8 @@ static void *__alloc_simple_buffer(struct device *dev, size_t size, gfp_t gfp,
 				   struct page **ret_page)
 {
 	struct page *page;
-	page = __dma_alloc_buffer(dev, size, gfp);
+	/* This function is only called when the arch is coherent */
+	page = __dma_alloc_buffer(dev, size, gfp, true);
 	if (!page)
 		return NULL;
 
@@ -1131,7 +1143,8 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping,
 }
 
 static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
-					  gfp_t gfp, struct dma_attrs *attrs)
+					gfp_t gfp, struct dma_attrs *attrs,
+					bool is_coherent)
 {
 	struct page **pages;
 	int count = size >> PAGE_SHIFT;
@@ -1154,7 +1167,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 		if (!page)
 			goto error;
 
-		__dma_clear_buffer(page, size);
+		__dma_clear_buffer(page, size, is_coherent);
 
 		for (i = 0; i < count; i++)
 			pages[i] = page + i;
@@ -1198,7 +1211,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
 				pages[i + j] = pages[i] + j;
 		}
 
-		__dma_clear_buffer(pages[i], PAGE_SIZE << order);
+		__dma_clear_buffer(pages[i], PAGE_SIZE << order, is_coherent);
 		i += 1 << order;
 		count -= 1 << order;
 	}
@@ -1359,8 +1372,9 @@ static void __iommu_free_atomic(struct device *dev, void *cpu_addr,
 	__free_from_pool(cpu_addr, size);
 }
 
-static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
-	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
+static void *__arm_iommu_alloc_attrs(struct device *dev, size_t size,
+	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs,
+	    bool is_coherent)
 {
 	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
 	struct page **pages;
@@ -1381,7 +1395,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
 	 */
 	gfp &= ~(__GFP_COMP);
 
-	pages = __iommu_alloc_buffer(dev, size, gfp, attrs);
+	pages = __iommu_alloc_buffer(dev, size, gfp, attrs, is_coherent);
 	if (!pages)
 		return NULL;
 
@@ -1406,6 +1420,18 @@ err_buffer:
 	return NULL;
 }
 
+static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
+	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
+{
+	__arm_iommu_alloc_attrs(dev, size, handle, gfp, attrs, false);
+}
+
+static void *arm_coherent_iommu_alloc_attrs(struct device *dev, size_t size,
+	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
+{
+	__arm_iommu_alloc_attrs(dev, size, handle, gfp, attrs, true);
+}
+
 static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 		    void *cpu_addr, dma_addr_t dma_addr, size_t size,
 		    struct dma_attrs *attrs)
@@ -1868,7 +1894,7 @@ struct dma_map_ops iommu_ops = {
 };
 
 struct dma_map_ops iommu_coherent_ops = {
-	.alloc		= arm_iommu_alloc_attrs,
+	.alloc		= arm_coherent_iommu_alloc_attrs,
 	.free		= arm_iommu_free_attrs,
 	.mmap		= arm_iommu_mmap_attrs,
 	.get_sgtable	= arm_iommu_get_sgtable,
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH] ARM: dma-mapping: Don't use outer_flush_range when the L2C is coherent
  2015-08-13 13:59 [PATCH] ARM: dma-mapping: Don't use outer_flush_range when the L2C is coherent Gregory CLEMENT
@ 2015-08-13 15:16 ` Catalin Marinas
  2015-08-14 12:49   ` Gregory CLEMENT
  0 siblings, 1 reply; 5+ messages in thread
From: Catalin Marinas @ 2015-08-13 15:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Aug 13, 2015 at 03:59:25PM +0200, Gregory CLEMENT wrote:
> @@ -241,12 +241,15 @@ static void __dma_clear_buffer(struct page *page, size_t size)
>  			page++;
>  			size -= PAGE_SIZE;
>  		}
> -		outer_flush_range(base, end);
> +		if (!is_coherent)
> +			outer_flush_range(base, end);

There is a dmac_flush_range() call above this (in the "while" block),
please add a check for is_coherent in there as well.

> @@ -540,7 +545,13 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size,
>  	if (!page)
>  		return NULL;
>  
> -	__dma_clear_buffer(page, size);
> +	/*
> +	 * We are either called from __dma_alloc on the non-coherent
> +	 * case or only once from atomic_pool_init. It is called
> +	 * during boot time so it is harmless to use the non-coherent
> +	 * case even if the L2C is coherent.
> +	 */
> +	__dma_clear_buffer(page, size, false);

This is no longer the case with commit 21caf3a765b0 ("ARM: 8398/1: arm
DMA: Fix allocation from CMA for coherent DMA") in -next. So you need
another argument for __alloc_from_contiguous() that can be passed down
to __dma_clear_buffer().

> @@ -1131,7 +1143,8 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping,
>  }
>  
>  static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
> -					  gfp_t gfp, struct dma_attrs *attrs)
> +					gfp_t gfp, struct dma_attrs *attrs,
> +					bool is_coherent)
>  {
>  	struct page **pages;
>  	int count = size >> PAGE_SHIFT;
> @@ -1154,7 +1167,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
>  		if (!page)
>  			goto error;
>  
> -		__dma_clear_buffer(page, size);
> +		__dma_clear_buffer(page, size, is_coherent);
>  
>  		for (i = 0; i < count; i++)
>  			pages[i] = page + i;
> @@ -1198,7 +1211,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
>  				pages[i + j] = pages[i] + j;
>  		}
>  
> -		__dma_clear_buffer(pages[i], PAGE_SIZE << order);
> +		__dma_clear_buffer(pages[i], PAGE_SIZE << order, is_coherent);
>  		i += 1 << order;
>  		count -= 1 << order;
>  	}
> @@ -1359,8 +1372,9 @@ static void __iommu_free_atomic(struct device *dev, void *cpu_addr,
>  	__free_from_pool(cpu_addr, size);
>  }
>  
> -static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
> -	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
> +static void *__arm_iommu_alloc_attrs(struct device *dev, size_t size,
> +	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs,
> +	    bool is_coherent)
>  {
>  	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
>  	struct page **pages;
> @@ -1381,7 +1395,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
>  	 */
>  	gfp &= ~(__GFP_COMP);
>  
> -	pages = __iommu_alloc_buffer(dev, size, gfp, attrs);
> +	pages = __iommu_alloc_buffer(dev, size, gfp, attrs, is_coherent);
>  	if (!pages)
>  		return NULL;
>  

I can see you are trying to fix the iommu_alloc code to cope with
coherent devices. I think it's currently broken and if you want to fix
it, you'd better add a separate patch. Otherwise, if no-one needs it,
just pass false to __dma_clear_buffer() in __iommu_alloc_buffer()
(though it's better if we fixed it).

In arm_iommu_alloc_attrs(), there is an __iommu_alloc_atomic() call
which goes to the non-cacheable pool, it should use
__alloc_simple_buffer() instead if is_coherent.

-- 
Catalin

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH] ARM: dma-mapping: Don't use outer_flush_range when the L2C is coherent
  2015-08-13 15:16 ` Catalin Marinas
@ 2015-08-14 12:49   ` Gregory CLEMENT
  2015-08-14 14:30     ` Catalin Marinas
  0 siblings, 1 reply; 5+ messages in thread
From: Gregory CLEMENT @ 2015-08-14 12:49 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Catalin,

On 13/08/2015 17:16, Catalin Marinas wrote:
> On Thu, Aug 13, 2015 at 03:59:25PM +0200, Gregory CLEMENT wrote:
>> @@ -241,12 +241,15 @@ static void __dma_clear_buffer(struct page *page, size_t size)
>>  			page++;
>>  			size -= PAGE_SIZE;
>>  		}
>> -		outer_flush_range(base, end);
>> +		if (!is_coherent)
>> +			outer_flush_range(base, end);
> 
> There is a dmac_flush_range() call above this (in the "while" block),
> please add a check for is_coherent in there as well.

OK

> 
>> @@ -540,7 +545,13 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size,
>>  	if (!page)
>>  		return NULL;
>>  
>> -	__dma_clear_buffer(page, size);
>> +	/*
>> +	 * We are either called from __dma_alloc on the non-coherent
>> +	 * case or only once from atomic_pool_init. It is called
>> +	 * during boot time so it is harmless to use the non-coherent
>> +	 * case even if the L2C is coherent.
>> +	 */
>> +	__dma_clear_buffer(page, size, false);
> 
> This is no longer the case with commit 21caf3a765b0 ("ARM: 8398/1: arm
> DMA: Fix allocation from CMA for coherent DMA") in -next. So you need
> another argument for __alloc_from_contiguous() that can be passed down
> to __dma_clear_buffer().

OK

> 
>> @@ -1131,7 +1143,8 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping,
>>  }
>>  
>>  static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
>> -					  gfp_t gfp, struct dma_attrs *attrs)
>> +					gfp_t gfp, struct dma_attrs *attrs,
>> +					bool is_coherent)
>>  {
>>  	struct page **pages;
>>  	int count = size >> PAGE_SHIFT;
>> @@ -1154,7 +1167,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
>>  		if (!page)
>>  			goto error;
>>  
>> -		__dma_clear_buffer(page, size);
>> +		__dma_clear_buffer(page, size, is_coherent);
>>  
>>  		for (i = 0; i < count; i++)
>>  			pages[i] = page + i;
>> @@ -1198,7 +1211,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
>>  				pages[i + j] = pages[i] + j;
>>  		}
>>  
>> -		__dma_clear_buffer(pages[i], PAGE_SIZE << order);
>> +		__dma_clear_buffer(pages[i], PAGE_SIZE << order, is_coherent);
>>  		i += 1 << order;
>>  		count -= 1 << order;
>>  	}
>> @@ -1359,8 +1372,9 @@ static void __iommu_free_atomic(struct device *dev, void *cpu_addr,
>>  	__free_from_pool(cpu_addr, size);
>>  }
>>  
>> -static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
>> -	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
>> +static void *__arm_iommu_alloc_attrs(struct device *dev, size_t size,
>> +	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs,
>> +	    bool is_coherent)
>>  {
>>  	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
>>  	struct page **pages;
>> @@ -1381,7 +1395,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
>>  	 */
>>  	gfp &= ~(__GFP_COMP);
>>  
>> -	pages = __iommu_alloc_buffer(dev, size, gfp, attrs);
>> +	pages = __iommu_alloc_buffer(dev, size, gfp, attrs, is_coherent);
>>  	if (!pages)
>>  		return NULL;
>>  
> 
> I can see you are trying to fix the iommu_alloc code to cope with
> coherent devices. I think it's currently broken and if you want to fix

Could you explain what is broken exactly?
The way I tried to deal with coherency or something more low level?

> it, you'd better add a separate patch. Otherwise, if no-one needs it,
> just pass false to __dma_clear_buffer() in __iommu_alloc_buffer()
> (though it's better if we fixed it).

I will keep it apart for now. I agree trying to do a separate patch for
this but I am not sure that I familiar enough with this part of the kernel
to achieve it.

> 
> In arm_iommu_alloc_attrs(), there is an __iommu_alloc_atomic() call
> which goes to the non-cacheable pool, it should use
> __alloc_simple_buffer() instead if is_coherent.

__iommu_alloc_atomic is called when GFP_WAIT is not set meaning that
we can't sleep, but _alloc_simple_buffer can sleep. I follow the calls
inside this function and at a point we found a call to the might_sleep_if
macro:
http://lxr.free-electrons.com/source/mm/page_alloc.c#L2859

The _alloc_simple_buffer function ended by calling gen_pool_alloc
which won't sleep so from my point of view it is still the right
function to call. Did I miss something?


Thanks,

Gregory


-- 
Gregory Clement, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH] ARM: dma-mapping: Don't use outer_flush_range when the L2C is coherent
  2015-08-14 12:49   ` Gregory CLEMENT
@ 2015-08-14 14:30     ` Catalin Marinas
  2015-08-14 15:06       ` Gregory CLEMENT
  0 siblings, 1 reply; 5+ messages in thread
From: Catalin Marinas @ 2015-08-14 14:30 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Aug 14, 2015 at 02:49:58PM +0200, Gregory CLEMENT wrote:
> On 13/08/2015 17:16, Catalin Marinas wrote:
> > On Thu, Aug 13, 2015 at 03:59:25PM +0200, Gregory CLEMENT wrote:
> >> @@ -1131,7 +1143,8 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping,
> >>  }
> >>  
> >>  static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
> >> -					  gfp_t gfp, struct dma_attrs *attrs)
> >> +					gfp_t gfp, struct dma_attrs *attrs,
> >> +					bool is_coherent)
> >>  {
> >>  	struct page **pages;
> >>  	int count = size >> PAGE_SHIFT;
> >> @@ -1154,7 +1167,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
> >>  		if (!page)
> >>  			goto error;
> >>  
> >> -		__dma_clear_buffer(page, size);
> >> +		__dma_clear_buffer(page, size, is_coherent);
> >>  
> >>  		for (i = 0; i < count; i++)
> >>  			pages[i] = page + i;
> >> @@ -1198,7 +1211,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
> >>  				pages[i + j] = pages[i] + j;
> >>  		}
> >>  
> >> -		__dma_clear_buffer(pages[i], PAGE_SIZE << order);
> >> +		__dma_clear_buffer(pages[i], PAGE_SIZE << order, is_coherent);
> >>  		i += 1 << order;
> >>  		count -= 1 << order;
> >>  	}
> >> @@ -1359,8 +1372,9 @@ static void __iommu_free_atomic(struct device *dev, void *cpu_addr,
> >>  	__free_from_pool(cpu_addr, size);
> >>  }
> >>  
> >> -static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
> >> -	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
> >> +static void *__arm_iommu_alloc_attrs(struct device *dev, size_t size,
> >> +	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs,
> >> +	    bool is_coherent)
> >>  {
> >>  	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
> >>  	struct page **pages;
> >> @@ -1381,7 +1395,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
> >>  	 */
> >>  	gfp &= ~(__GFP_COMP);
> >>  
> >> -	pages = __iommu_alloc_buffer(dev, size, gfp, attrs);
> >> +	pages = __iommu_alloc_buffer(dev, size, gfp, attrs, is_coherent);
> >>  	if (!pages)
> >>  		return NULL;
> >>  
> > 
> > I can see you are trying to fix the iommu_alloc code to cope with
> > coherent devices. I think it's currently broken and if you want to fix
> 
> Could you explain what is broken exactly?
> The way I tried to deal with coherency or something more low level?

The problem is that __iommu_alloc_atomic() allocates from a
non-cacheable pool, which is fine when the device is not cache coherent
but won't work as expected in the is_coherent case (the CPU and device
must access the memory using the same cacheability attributes).

> > In arm_iommu_alloc_attrs(), there is an __iommu_alloc_atomic() call
> > which goes to the non-cacheable pool, it should use
> > __alloc_simple_buffer() instead if is_coherent.
> 
> __iommu_alloc_atomic is called when GFP_WAIT is not set meaning that
> we can't sleep, but _alloc_simple_buffer can sleep. I follow the calls
> inside this function and at a point we found a call to the might_sleep_if
> macro:
> http://lxr.free-electrons.com/source/mm/page_alloc.c#L2859

This might sleep thing is:

	might_sleep_if(gfp_mask & __GFP_WAIT);

So it only sleeps if the __GFP_WAIT is set, but we would only call
__alloc_simple_buffer() in the !__GFP_WAIT case, so there is no
sleeping.

The reason to call __alloc_simple_buffer() (in the atomic alloc case) is
to get cacheable memory, unlike __alloc_from_pool() which only returns
memory from the non-cacheable pool.

> The _alloc_simple_buffer function ended by calling gen_pool_alloc
> which won't sleep so from my point of view it is still the right
> function to call. Did I miss something?

It's not just about sleep vs atomic but about cacheable vs non-cacheable
as well.

As a quick fix, below is what I meant. It needs extra snippets from your
patch to add arm_iommu_alloc_attrs_(coherent|noncoherent) and it's not
tested, but just to give you an idea:

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 9f509b264346..e20a31d45ff3 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1335,13 +1335,16 @@ static struct page **__iommu_get_pages(void *cpu_addr, struct dma_attrs *attrs)
 	return NULL;
 }
 
-static void *__iommu_alloc_atomic(struct device *dev, size_t size,
-				  dma_addr_t *handle)
+static void *__iommu_alloc_simple(struct device *dev, size_t size, gfp_t gfp,
+				  dma_addr_t *handle, bool is_coherent)
 {
 	struct page *page;
 	void *addr;
 
-	addr = __alloc_from_pool(size, &page);
+	if (is_coherent)
+		addr = __alloc_simple_buffer(dev, size, gfp, &page);
+	else
+		addr = __alloc_from_pool(size, &page);
 	if (!addr)
 		return NULL;
 
@@ -1356,15 +1359,18 @@ err_mapping:
 	return NULL;
 }
 
-static void __iommu_free_atomic(struct device *dev, void *cpu_addr,
-				dma_addr_t handle, size_t size)
+static void __iommu_free_simple(struct device *dev, void *cpu_addr,
+				dma_addr_t handle, size_t size, bool is_coherent)
 {
 	__iommu_remove_mapping(dev, handle, size);
-	__free_from_pool(cpu_addr, size);
+	if (is_coherent)
+		__dma_free_buffer(virt_to_page(cpu_addr), size);
+	else
+		__free_from_pool(cpu_addr, size);
 }
 
-static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
-	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
+static void *__arm_iommu_alloc_attrs(struct device *dev, size_t size,
+	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs, bool is_coherent)
 {
 	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
 	struct page **pages;
@@ -1373,8 +1379,8 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
 	*handle = DMA_ERROR_CODE;
 	size = PAGE_ALIGN(size);
 
-	if (!(gfp & __GFP_WAIT))
-		return __iommu_alloc_atomic(dev, size, handle);
+	if (is_coherent || !(gfp & __GFP_WAIT))
+		return __iommu_alloc_simple(dev, size, gfp, handle, is_coherent);
 
 	/*
 	 * Following is a work-around (a.k.a. hack) to prevent pages
@@ -1440,14 +1446,14 @@ static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
  * free a page as defined by the above mapping.
  * Must not be called with IRQs disabled.
  */
-void arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
-			  dma_addr_t handle, struct dma_attrs *attrs)
+void __arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+			  dma_addr_t handle, struct dma_attrs *attrs, bool is_coherent)
 {
 	struct page **pages;
 	size = PAGE_ALIGN(size);
 
-	if (__in_atomic_pool(cpu_addr, size)) {
-		__iommu_free_atomic(dev, cpu_addr, handle, size);
+	if (is_coherent || __in_atomic_pool(cpu_addr, size)) {
+		__iommu_free_simple(dev, cpu_addr, handle, size, is_coherent);
 		return;
 	}
 

-- 
Catalin

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH] ARM: dma-mapping: Don't use outer_flush_range when the L2C is coherent
  2015-08-14 14:30     ` Catalin Marinas
@ 2015-08-14 15:06       ` Gregory CLEMENT
  0 siblings, 0 replies; 5+ messages in thread
From: Gregory CLEMENT @ 2015-08-14 15:06 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Catalin,

On 14/08/2015 16:30, Catalin Marinas wrote:
>>>> -static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
>>>> -	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
>>>> +static void *__arm_iommu_alloc_attrs(struct device *dev, size_t size,
>>>> +	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs,
>>>> +	    bool is_coherent)
>>>>  {
>>>>  	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
>>>>  	struct page **pages;
>>>> @@ -1381,7 +1395,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
>>>>  	 */
>>>>  	gfp &= ~(__GFP_COMP);
>>>>  
>>>> -	pages = __iommu_alloc_buffer(dev, size, gfp, attrs);
>>>> +	pages = __iommu_alloc_buffer(dev, size, gfp, attrs, is_coherent);
>>>>  	if (!pages)
>>>>  		return NULL;
>>>>  
>>>
>>> I can see you are trying to fix the iommu_alloc code to cope with
>>> coherent devices. I think it's currently broken and if you want to fix
>>
>> Could you explain what is broken exactly?
>> The way I tried to deal with coherency or something more low level?
> 
> The problem is that __iommu_alloc_atomic() allocates from a
> non-cacheable pool, which is fine when the device is not cache coherent
> but won't work as expected in the is_coherent case (the CPU and device
> must access the memory using the same cacheability attributes).

Thanks for the explanation.

> 
>>> In arm_iommu_alloc_attrs(), there is an __iommu_alloc_atomic() call
>>> which goes to the non-cacheable pool, it should use
>>> __alloc_simple_buffer() instead if is_coherent.
>>
>> __iommu_alloc_atomic is called when GFP_WAIT is not set meaning that
>> we can't sleep, but _alloc_simple_buffer can sleep. I follow the calls
>> inside this function and at a point we found a call to the might_sleep_if
>> macro:
>> http://lxr.free-electrons.com/source/mm/page_alloc.c#L2859
> 
> This might sleep thing is:
> 
> 	might_sleep_if(gfp_mask & __GFP_WAIT);
> 
> So it only sleeps if the __GFP_WAIT is set, but we would only call
> __alloc_simple_buffer() in the !__GFP_WAIT case, so there is no
> sleeping.

Yes my bad I inverted the logic at a point!

> 
> The reason to call __alloc_simple_buffer() (in the atomic alloc case) is
> to get cacheable memory, unlike __alloc_from_pool() which only returns
> memory from the non-cacheable pool.
> 
>> The _alloc_simple_buffer function ended by calling gen_pool_alloc
>> which won't sleep so from my point of view it is still the right
>> function to call. Did I miss something?
> 
> It's not just about sleep vs atomic but about cacheable vs non-cacheable
> as well.
> 
> As a quick fix, below is what I meant. It needs extra snippets from your
> patch to add arm_iommu_alloc_attrs_(coherent|noncoherent) and it's not
> tested, but just to give you an idea:

I am fine to make it a proper patch following my 1st patch. However I don't have
a platform with IOMMU so the test would have to be done by people having such
platforms.

Gregory
> 
> diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
> index 9f509b264346..e20a31d45ff3 100644
> --- a/arch/arm/mm/dma-mapping.c
> +++ b/arch/arm/mm/dma-mapping.c
> @@ -1335,13 +1335,16 @@ static struct page **__iommu_get_pages(void *cpu_addr, struct dma_attrs *attrs)
>  	return NULL;
>  }
>  
> -static void *__iommu_alloc_atomic(struct device *dev, size_t size,
> -				  dma_addr_t *handle)
> +static void *__iommu_alloc_simple(struct device *dev, size_t size, gfp_t gfp,
> +				  dma_addr_t *handle, bool is_coherent)
>  {
>  	struct page *page;
>  	void *addr;
>  
> -	addr = __alloc_from_pool(size, &page);
> +	if (is_coherent)
> +		addr = __alloc_simple_buffer(dev, size, gfp, &page);
> +	else
> +		addr = __alloc_from_pool(size, &page);
>  	if (!addr)
>  		return NULL;
>  
> @@ -1356,15 +1359,18 @@ err_mapping:
>  	return NULL;
>  }
>  
> -static void __iommu_free_atomic(struct device *dev, void *cpu_addr,
> -				dma_addr_t handle, size_t size)
> +static void __iommu_free_simple(struct device *dev, void *cpu_addr,
> +				dma_addr_t handle, size_t size, bool is_coherent)
>  {
>  	__iommu_remove_mapping(dev, handle, size);
> -	__free_from_pool(cpu_addr, size);
> +	if (is_coherent)
> +		__dma_free_buffer(virt_to_page(cpu_addr), size);
> +	else
> +		__free_from_pool(cpu_addr, size);
>  }
>  
> -static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
> -	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
> +static void *__arm_iommu_alloc_attrs(struct device *dev, size_t size,
> +	    dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs, bool is_coherent)
>  {
>  	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
>  	struct page **pages;
> @@ -1373,8 +1379,8 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
>  	*handle = DMA_ERROR_CODE;
>  	size = PAGE_ALIGN(size);
>  
> -	if (!(gfp & __GFP_WAIT))
> -		return __iommu_alloc_atomic(dev, size, handle);
> +	if (is_coherent || !(gfp & __GFP_WAIT))
> +		return __iommu_alloc_simple(dev, size, gfp, handle, is_coherent);
>  
>  	/*
>  	 * Following is a work-around (a.k.a. hack) to prevent pages
> @@ -1440,14 +1446,14 @@ static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
>   * free a page as defined by the above mapping.
>   * Must not be called with IRQs disabled.
>   */
> -void arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
> -			  dma_addr_t handle, struct dma_attrs *attrs)
> +void __arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
> +			  dma_addr_t handle, struct dma_attrs *attrs, bool is_coherent)
>  {
>  	struct page **pages;
>  	size = PAGE_ALIGN(size);
>  
> -	if (__in_atomic_pool(cpu_addr, size)) {
> -		__iommu_free_atomic(dev, cpu_addr, handle, size);
> +	if (is_coherent || __in_atomic_pool(cpu_addr, size)) {
> +		__iommu_free_simple(dev, cpu_addr, handle, size, is_coherent);
>  		return;
>  	}
>  
> 


-- 
Gregory Clement, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-08-14 15:06 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-08-13 13:59 [PATCH] ARM: dma-mapping: Don't use outer_flush_range when the L2C is coherent Gregory CLEMENT
2015-08-13 15:16 ` Catalin Marinas
2015-08-14 12:49   ` Gregory CLEMENT
2015-08-14 14:30     ` Catalin Marinas
2015-08-14 15:06       ` Gregory CLEMENT

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).