LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH -V5 11/13] arch/powerpc: properly isolate kernel and user proto-VSID
From: Paul Mackerras @ 2012-08-01  4:31 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <1343647339-25576-12-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Mon, Jul 30, 2012 at 04:52:17PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> The proto-VSID space is divided into two class
> User:   0 to 2^(CONTEXT_BITS + USER_ESID_BITS) -1
> kernel: 2^(CONTEXT_BITS + USER_ESID_BITS) to 2^(VSID_BITS) - 1
> 
> With KERNEL_START at 0xc000000000000000, the proto vsid for
> the kernel ends up with 0xc00000000 (36 bits). With 64TB
> patchset we need to have kernel proto-VSID in the
> [2^37 to 2^38 - 1] range due to the increased USER_ESID_BITS.

This needs to be rolled in with the previous patch, otherwise you'll
break bisection.

> diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
> index db2cb3f..405d380 100644
> --- a/arch/powerpc/mm/slb_low.S
> +++ b/arch/powerpc/mm/slb_low.S
> @@ -57,8 +57,16 @@ _GLOBAL(slb_allocate_realmode)
>  _GLOBAL(slb_miss_kernel_load_linear)
>  	li	r11,0
>  BEGIN_FTR_SECTION
> +	li	r9,0x1
> +	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>  	b	slb_finish_load
>  END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
> +	li	r9,0x1
> +	/*
> +	 * shift 12 bits less here, slb_finish_load_1T will do
> +	 * the necessary shits
> +	 */
> +	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>  	b	slb_finish_load_1T

Since you're actually doing exactly the same instructions in the 256M
and 1T segment cases, why not do the li; rldimi before the
BEGIN_FTR_SECTION?

> @@ -86,8 +94,16 @@ _GLOBAL(slb_miss_kernel_load_vmemmap)
>  	li	r11,0
>  6:
>  BEGIN_FTR_SECTION
> +	li	r9,0x1
> +	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>  	b	slb_finish_load
>  END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
> +	li	r9,0x1
> +	/*
> +	 * shift 12 bits less here, slb_finish_load_1T will do
> +	 * the necessary shits
> +	 */
> +	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>  	b	slb_finish_load_1T

And similarly here.

Paul.

^ permalink raw reply

* Re: [PATCH -V5 03/13] arch/powerpc: Convert virtual address to vpn
From: Paul Mackerras @ 2012-08-01  4:33 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <1343647339-25576-4-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Mon, Jul 30, 2012 at 04:52:09PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> This patch convert different functions to take virtual page number
> instead of virtual address. Virtual page number is virtual address
> shifted right by VPN_SHIFT (12) bits. This enable us to have an
> address range of upto 76 bits.

Mostly looks good.  I don't think it's necessary to add the
BUILD_BUG_ON and BUG_ON conditions.  I would think a comment next to
the definition saying that VPN_SHIFT can be at most 12 would be
sufficient.

Paul.

^ permalink raw reply

* Re: [PATCH -V5 06/13] arch/powerpc: Increase the slice range to 64TB
From: Paul Mackerras @ 2012-08-01  5:16 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <1343647339-25576-7-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Mon, Jul 30, 2012 at 04:52:12PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> This patch makes the high psizes mask as an unsigned char array
> so that we can have more than 16TB. Currently we support upto
> 64TB

Comments below...

> diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
> index b9ee79ce..c355af6 100644
> --- a/arch/powerpc/mm/slb_low.S
> +++ b/arch/powerpc/mm/slb_low.S
> @@ -108,17 +108,34 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
>  	 * between 4k and 64k standard page size
>  	 */
>  #ifdef CONFIG_PPC_MM_SLICES
> +	/* r10 have esid */
>  	cmpldi	r10,16
> -
> -	/* Get the slice index * 4 in r11 and matching slice size mask in r9 */
> -	ld	r9,PACALOWSLICESPSIZE(r13)
> -	sldi	r11,r10,2
> +	/* below SLICE_LOW_TOP */
>  	blt	5f
> -	ld	r9,PACAHIGHSLICEPSIZE(r13)
> -	srdi	r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2)
> -	andi.	r11,r11,0x3c
> -
> -5:	/* Extract the psize and multiply to get an array offset */
> +	/*
> +	 * Handle hpsizes,
> +	 * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
> +	 * We use r10 here, later we restore it to esid.
> +	 * Can we use other register instead of r10 ?

Only r9, r10 and r11 are available here, and you're using them all.
Restoring r10 with one integer instruction is going to be quicker than
saving and restoring another register to/from memory.

> +	 */
> +	srdi    r10,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT) /* index */
> +	srdi	r11,r10,1			/* r11 is array index */
> +	addi	r9,r11,PACAHIGHSLICEPSIZE
> +	lbzx	r9,r9,r13			/* r9 is hpsizes[r11] */
> +	sldi    r11,r11,1
> +	subf	r11,r11,r10	/* mask_index = index - (array_index << 1) */
> +	srdi	r10,r3,28	/* restore r10 with esid */
> +	b	6f

How about (untested):

	srdi    r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
	addi	r9,r11,PACAHIGHSLICEPSIZE
	lbzx	r9,r13,r9			/* r9 is hpsizes[r11] */
	/* r11 = (r10 >> 12) & 1, i.e. grab lowest bit of 1T ESID */
	rldicl	r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
	b	6f

Note that I swapped the RA and RB arguments for the lbzx.  Our recent
processors process indexed mode instructions more quickly if the value
in RB is small.

>  static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
>  {
> +	unsigned char *hpsizes;
> +	int index, mask_index;
>  	struct slice_mask ret = { 0, 0 };
>  	unsigned long i;
> -	u64 psizes;
> +	u64 lpsizes;
>  
> -	psizes = mm->context.low_slices_psize;
> +	lpsizes = mm->context.low_slices_psize;
>  	for (i = 0; i < SLICE_NUM_LOW; i++)
> -		if (((psizes >> (i * 4)) & 0xf) == psize)
> +		if (((lpsizes >> (i * 4)) & 0xf) == psize)
>  			ret.low_slices |= 1u << i;
>  
> -	psizes = mm->context.high_slices_psize;
> -	for (i = 0; i < SLICE_NUM_HIGH; i++)
> -		if (((psizes >> (i * 4)) & 0xf) == psize)
> +	hpsizes = mm->context.high_slices_psize;
> +	for (i = 0; i < SLICE_NUM_HIGH; i++) {
> +		mask_index = i & 0x1;
> +		index = i >> 1;
> +		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
>  			ret.high_slices |= 1u << i;

This needs to be 1ul not 1u, since we are creating a 64-bit mask.

>  static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
>  {
> +	int index, mask_index;
>  	/* Write the new slice psize bits */
> -	u64 lpsizes, hpsizes;
> +	unsigned char *hpsizes;
> +	u64 lpsizes;
>  	unsigned long i, flags;
>  
>  	slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
> @@ -201,14 +208,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
>  			lpsizes = (lpsizes & ~(0xful << (i * 4))) |
>  				(((unsigned long)psize) << (i * 4));
>  
> +	/* Assign the value back */
> +	mm->context.low_slices_psize = lpsizes;
> +
>  	hpsizes = mm->context.high_slices_psize;
> -	for (i = 0; i < SLICE_NUM_HIGH; i++)
> +	for (i = 0; i < SLICE_NUM_HIGH; i++) {
> +		mask_index = i & 0x1;
> +		index = i >> 1;
>  		if (mask.high_slices & (1u << i))

Again, 1ul now.  Check all the other places where we manipulate a
slice mask to see if there are any other instances of 1u that need to
be changed.

Paul.

^ permalink raw reply

* Re: [PATCH -V5 07/13] arch/powerpc: Make some of the PGTABLE_RANGE dependency explicit
From: Paul Mackerras @ 2012-08-01  5:18 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <1343647339-25576-8-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Mon, Jul 30, 2012 at 04:52:13PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> slice array size and slice mask size depend on PGTABLE_RANGE. We
> can't directly include pgtable.h in these header because there is
> a circular dependency. So add compile time check for these values.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Reviewed-by: Paul Mackerras <paulus@samba.org>

but you might like to change:

> +#define SLICE_MASK_SIZE (PGTABLE_RANG >> 43)

to have PGTABLE_RANGE rather than PGTABLE_RANG.

^ permalink raw reply

* Re: [PATCH -V5 08/13] arch/powerpc: Use the rquired number of VSID bits in slbmte
From: Paul Mackerras @ 2012-08-01  5:19 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <1343647339-25576-9-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Mon, Jul 30, 2012 at 04:52:14PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> ASM_VSID_SCRAMBLE can leave non-zero bits in the high 28 bits of the result
> for 256MB segment (40 bits for 1T segment). Properly mask them before using
> the values in slbmte
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Reviewed-by: Paul Mackerras <paulus@samba.org>

but change the subject line to say "required" not "rquired".

^ permalink raw reply

* Re: [PATCH -V5 12/13] arch/powerpc: Replace open coded CONTEXT_BITS value
From: Paul Mackerras @ 2012-08-01  5:29 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <1343647339-25576-13-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Mon, Jul 30, 2012 at 04:52:18PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> To clarify the meaning for future readers, replace the open coded
> 19 with CONTEXT_BITS
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/mm/mmu_context_hash64.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
> index 40677aa..daa076c 100644
> --- a/arch/powerpc/mm/mmu_context_hash64.c
> +++ b/arch/powerpc/mm/mmu_context_hash64.c
> @@ -34,7 +34,7 @@ static DEFINE_IDA(mmu_context_ida);
>   * Each segment contains 2^28 bytes.  Each context maps 2^44 bytes,
>   * so we can support 2^19-1 contexts (19 == 35 + 28 - 44).
>   */
> -#define MAX_CONTEXT	((1UL << 19) - 1)
> +#define MAX_CONTEXT	((1UL << CONTEXT_BITS) - 1)

This is a good thing to do, but you should also update the comment.
Maybe you should put this patch before your number 10/13 and then
change the comment in the same patch where you add the 64TB support.

Paul.

^ permalink raw reply

* Re: [PATCH -V5 13/13] arch/powerpc: Update VSID allocation documentation
From: Paul Mackerras @ 2012-08-01  5:35 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <1343647339-25576-14-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Mon, Jul 30, 2012 at 04:52:19PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> This update the proto-VSID and VSID scramble related information
> to be more generic by using names instead of current values.

Comments below...

> - * VSID allocation
> + * VSID allocation (256MB segment)
>   *
> - * We first generate a 36-bit "proto-VSID".  For kernel addresses this
> - * is equal to the ESID, for user addresses it is:
> - *	(context << 15) | (esid & 0x7fff)
> + * We first generate a 38-bit "proto-VSID".  For kernel addresses this
> + * is equal to the ESID | 1 << 37, for user addresses it is:
> + *	(context << USER_ESID_BITS) | (esid & (1U << USER_ESID_BITS))
					      ^^^^^^^^^^^^^^^^^^^^^^
should be ((1U << USER_ESID_BITS) - 1)

>   *
> - * The two forms are distinguishable because the top bit is 0 for user
> - * addresses, whereas the top two bits are 1 for kernel addresses.
> - * Proto-VSIDs with the top two bits equal to 0b10 are reserved for
> - * now.
> + * This splits the proto-VSID into the below range
> + *  0 - (2^(CONTEXT_BITS + USER_ESID_BITS) - 1) : User proto-VSID range
> + *  2^(CONTEXT_BITS + USER_ESID_BITS) - 2^(VSID_BITS) : Kernel proto-VSID range

Perhaps point out also that CONTEXT_BITS + USER_ESID_BITS == VSID_BITS - 1,
that is, you have assigned half of the space to user processes and half
to the kernel.

> -/*
> - * WARNING - If you change these you must make sure the asm
> - * implementations in slb_allocate (slb_low.S), do_stab_bolted
> - * (head.S) and ASM_VSID_SCRAMBLE (below) are changed accordingly.
> - */

Are you absolutely sure that nothing in the assembly code would need
to be changed if someone changed these definitions?

Paul.

^ permalink raw reply

* Re: [RFC PATCH v5 12/19] memory-hotplug: introduce new function arch_remove_memory()
From: Wen Congyang @ 2012-08-01  6:06 UTC (permalink / raw)
  To: jencce zhou
  Cc: linux-s390, linux-ia64, linux-acpi, len.brown, linux-sh,
	linux-kernel, cmetcalf, linux-mm, Yasuaki ISIMATU, paulus,
	minchan.kim, kosaki.motohiro, rientjes, cl, linuxppc-dev, akpm,
	liuj97
In-Reply-To: <CAN6t85SiG62QXdSpSmGQFeG4f3JnOicDx9H3jSvwg0t2Ly-q+w@mail.gmail.com>

At 08/01/2012 10:44 AM, jencce zhou Wrote:
> 2012/7/27 Wen Congyang <wency@cn.fujitsu.com>:
>> We don't call __add_pages() directly in the function add_memory()
>> because some other architecture related things need to be done
>> before or after calling __add_pages(). So we should introduce
>> a new function arch_remove_memory() to revert the things
>> done in arch_add_memory().
>>
>> Note: the function for s390 is not implemented(I don't know how to
>> implement it for s390).
>>
>> CC: David Rientjes <rientjes@google.com>
>> CC: Jiang Liu <liuj97@gmail.com>
>> CC: Len Brown <len.brown@intel.com>
>> CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> CC: Paul Mackerras <paulus@samba.org>
>> CC: Christoph Lameter <cl@linux.com>
>> Cc: Minchan Kim <minchan.kim@gmail.com>
>> CC: Andrew Morton <akpm@linux-foundation.org>
>> CC: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
>> CC: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> ---
>>  arch/ia64/mm/init.c                  |   16 ++++
>>  arch/powerpc/mm/mem.c                |   14 +++
>>  arch/s390/mm/init.c                  |    8 ++
>>  arch/sh/mm/init.c                    |   15 +++
>>  arch/tile/mm/init.c                  |    8 ++
>>  arch/x86/include/asm/pgtable_types.h |    1 +
>>  arch/x86/mm/init_32.c                |   10 ++
>>  arch/x86/mm/init_64.c                |  160 ++++++++++++++++++++++++++++++++++
>>  arch/x86/mm/pageattr.c               |   47 +++++-----
>>  include/linux/memory_hotplug.h       |    1 +
>>  mm/memory_hotplug.c                  |    1 +
>>  11 files changed, 259 insertions(+), 22 deletions(-)
>>
>> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
>> index 0eab454..1e345ed 100644
>> --- a/arch/ia64/mm/init.c
>> +++ b/arch/ia64/mm/init.c
>> @@ -688,6 +688,22 @@ int arch_add_memory(int nid, u64 start, u64 size)
>>
>>         return ret;
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +int arch_remove_memory(u64 start, u64 size)
>> +{
>> +       unsigned long start_pfn = start >> PAGE_SHIFT;
>> +       unsigned long nr_pages = size >> PAGE_SHIFT;
>> +       int ret;
>> +
>> +       ret = __remove_pages(start_pfn, nr_pages);
>> +       if (ret)
>> +               pr_warn("%s: Problem encountered in __remove_pages() as"
>> +                       " ret=%d\n", __func__,  ret);
>> +
>> +       return ret;
>> +}
>> +#endif
>>  #endif
>>
> 
> in 3.5 ia64 implementation did not call __remove_pages at all. so why?

This function only reverts the things done in arch_add_memory(), and it will
be called when a memory device is removed.

When adding a memory device, __add_pages() is called in arch_add_memory(),
so call __remove_pages() in arch_remove_memory().

Thanks
Wen Congyang

> 
> 
>>  /*
>> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
>> index baaafde..249cef4 100644
>> --- a/arch/powerpc/mm/mem.c
>> +++ b/arch/powerpc/mm/mem.c
>> @@ -133,6 +133,20 @@ int arch_add_memory(int nid, u64 start, u64 size)
>>
>>         return __add_pages(nid, zone, start_pfn, nr_pages);
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +int arch_remove_memory(u64 start, u64 size)
>> +{
>> +       unsigned long start_pfn = start >> PAGE_SHIFT;
>> +       unsigned long nr_pages = size >> PAGE_SHIFT;
>> +
>> +       start = (unsigned long)__va(start);
>> +       if (remove_section_mapping(start, start + size))
>> +               return -EINVAL;
>> +
>> +       return __remove_pages(start_pfn, nr_pages);
>> +}
>> +#endif
>>  #endif /* CONFIG_MEMORY_HOTPLUG */
>>
>>  /*
>> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
>> index 6adbc08..ca4bc46 100644
>> --- a/arch/s390/mm/init.c
>> +++ b/arch/s390/mm/init.c
>> @@ -257,4 +257,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
>>                 vmem_remove_mapping(start, size);
>>         return rc;
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +int arch_remove_memory(u64 start, u64 size)
>> +{
>> +       /* TODO */
>> +       return -EBUSY;
>> +}
>> +#endif
>>  #endif /* CONFIG_MEMORY_HOTPLUG */
>> diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
>> index 82cc576..fc84491 100644
>> --- a/arch/sh/mm/init.c
>> +++ b/arch/sh/mm/init.c
>> @@ -558,4 +558,19 @@ int memory_add_physaddr_to_nid(u64 addr)
>>  EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
>>  #endif
>>
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +int arch_remove_memory(u64 start, u64 size)
>> +{
>> +       unsigned long start_pfn = start >> PAGE_SHIFT;
>> +       unsigned long nr_pages = size >> PAGE_SHIFT;
>> +       int ret;
>> +
>> +       ret = __remove_pages(start_pfn, nr_pages);
>> +       if (unlikely(ret))
>> +               pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
>> +                       ret);
>> +
>> +       return ret;
>> +}
>> +#endif
>>  #endif /* CONFIG_MEMORY_HOTPLUG */
>> diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
>> index ef29d6c..2749515 100644
>> --- a/arch/tile/mm/init.c
>> +++ b/arch/tile/mm/init.c
>> @@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
>>  {
>>         return -EINVAL;
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +int arch_remove_memory(u64 start, u64 size)
>> +{
>> +       /* TODO */
>> +       return -EBUSY;
>> +}
>> +#endif
>>  #endif
>>
>>  struct kmem_cache *pgd_cache;
>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>> index 013286a..b725af2 100644
>> --- a/arch/x86/include/asm/pgtable_types.h
>> +++ b/arch/x86/include/asm/pgtable_types.h
>> @@ -334,6 +334,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
>>   * as a pte too.
>>   */
>>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
>> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
>>
>>  #endif /* !__ASSEMBLY__ */
>>
>> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
>> index 575d86f..a690153 100644
>> --- a/arch/x86/mm/init_32.c
>> +++ b/arch/x86/mm/init_32.c
>> @@ -842,6 +842,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
>>
>>         return __add_pages(nid, zone, start_pfn, nr_pages);
>>  }
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +int arch_remove_memory(unsigned long start, unsigned long size)
>> +{
>> +       unsigned long start_pfn = start >> PAGE_SHIFT;
>> +       unsigned long nr_pages = size >> PAGE_SHIFT;
>> +
>> +       return __remove_pages(start_pfn, nr_pages);
>> +}
>> +#endif
>>  #endif
>>
>>  /*
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index 2b6b4a3..f1554a9 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -675,6 +675,166 @@ int arch_add_memory(int nid, u64 start, u64 size)
>>  }
>>  EXPORT_SYMBOL_GPL(arch_add_memory);
>>
>> +static void __meminit
>> +phys_pte_remove(pte_t *pte_page, unsigned long addr, unsigned long end)
>> +{
>> +       unsigned pages = 0;
>> +       int i = pte_index(addr);
>> +
>> +       pte_t *pte = pte_page + pte_index(addr);
>> +
>> +       for (; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
>> +
>> +               if (addr >= end)
>> +                       break;
>> +
>> +               if (!pte_present(*pte))
>> +                       continue;
>> +
>> +               pages++;
>> +               set_pte(pte, __pte(0));
>> +       }
>> +
>> +       update_page_count(PG_LEVEL_4K, -pages);
>> +}
>> +
>> +static void __meminit
>> +phys_pmd_remove(pmd_t *pmd_page, unsigned long addr, unsigned long end)
>> +{
>> +       unsigned long pages = 0, next;
>> +       int i = pmd_index(addr);
>> +
>> +       for (; i < PTRS_PER_PMD; i++, addr = next) {
>> +               unsigned long pte_phys;
>> +               pmd_t *pmd = pmd_page + pmd_index(addr);
>> +               pte_t *pte;
>> +
>> +               if (addr >= end)
>> +                       break;
>> +
>> +               next = (addr & PMD_MASK) + PMD_SIZE;
>> +
>> +               if (!pmd_present(*pmd))
>> +                       continue;
>> +
>> +               if (pmd_large(*pmd)) {
>> +                       if ((addr & ~PMD_MASK) == 0 && next <= end) {
>> +                               set_pmd(pmd, __pmd(0));
>> +                               pages++;
>> +                               continue;
>> +                       }
>> +
>> +                       /*
>> +                        * We use 2M page, but we need to remove part of them,
>> +                        * so split 2M page to 4K page.
>> +                        */
>> +                       pte = alloc_low_page(&pte_phys);
>> +                       __split_large_page((pte_t *)pmd, addr, pte);
>> +
>> +                       spin_lock(&init_mm.page_table_lock);
>> +                       pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
>> +                       spin_unlock(&init_mm.page_table_lock);
>> +               }
>> +
>> +               spin_lock(&init_mm.page_table_lock);
>> +               pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
>> +               phys_pte_remove(pte, addr, end);
>> +               unmap_low_page(pte);
>> +               spin_unlock(&init_mm.page_table_lock);
>> +       }
>> +       update_page_count(PG_LEVEL_2M, -pages);
>> +}
>> +
>> +static void __meminit
>> +phys_pud_remove(pud_t *pud_page, unsigned long addr, unsigned long end)
>> +{
>> +       unsigned long pages = 0, next;
>> +       int i = pud_index(addr);
>> +
>> +       for (; i < PTRS_PER_PUD; i++, addr = next) {
>> +               unsigned long pmd_phys;
>> +               pud_t *pud = pud_page + pud_index(addr);
>> +               pmd_t *pmd;
>> +
>> +               if (addr >= end)
>> +                       break;
>> +
>> +               next = (addr & PUD_MASK) + PUD_SIZE;
>> +
>> +               if (!pud_present(*pud))
>> +                       continue;
>> +
>> +               if (pud_large(*pud)) {
>> +                       if ((addr & ~PUD_MASK) == 0 && next <= end) {
>> +                               set_pud(pud, __pud(0));
>> +                               pages++;
>> +                               continue;
>> +                       }
>> +
>> +                       /*
>> +                        * We use 1G page, but we need to remove part of them,
>> +                        * so split 1G page to 2M page.
>> +                        */
>> +                       pmd = alloc_low_page(&pmd_phys);
>> +                       __split_large_page((pte_t *)pud, addr, (pte_t *)pmd);
>> +
>> +                       spin_lock(&init_mm.page_table_lock);
>> +                       pud_populate(&init_mm, pud, __va(pmd_phys));
>> +                       spin_unlock(&init_mm.page_table_lock);
>> +               }
>> +
>> +               pmd = map_low_page(pmd_offset(pud, 0));
>> +               phys_pmd_remove(pmd, addr, end);
>> +               unmap_low_page(pmd);
>> +               __flush_tlb_all();
>> +       }
>> +       __flush_tlb_all();
>> +
>> +       update_page_count(PG_LEVEL_1G, -pages);
>> +}
>> +
>> +void __meminit
>> +kernel_physical_mapping_remove(unsigned long start, unsigned long end)
>> +{
>> +       unsigned long next;
>> +
>> +       start = (unsigned long)__va(start);
>> +       end = (unsigned long)__va(end);
>> +
>> +       for (; start < end; start = next) {
>> +               pgd_t *pgd = pgd_offset_k(start);
>> +               pud_t *pud;
>> +
>> +               next = (start + PGDIR_SIZE) & PGDIR_MASK;
>> +               if (next > end)
>> +                       next = end;
>> +
>> +               if (!pgd_present(*pgd))
>> +                       continue;
>> +
>> +               pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
>> +               phys_pud_remove(pud, __pa(start), __pa(end));
>> +               unmap_low_page(pud);
>> +       }
>> +
>> +       __flush_tlb_all();
>> +}
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +int __ref arch_remove_memory(unsigned long start, unsigned long size)
>> +{
>> +       unsigned long start_pfn = start >> PAGE_SHIFT;
>> +       unsigned long nr_pages = size >> PAGE_SHIFT;
>> +       int ret;
>> +
>> +       ret = __remove_pages(start_pfn, nr_pages);
>> +       WARN_ON_ONCE(ret);
>> +
>> +       kernel_physical_mapping_remove(start, start + size);
>> +
>> +       return ret;
>> +}
>> +#endif
>>  #endif /* CONFIG_MEMORY_HOTPLUG */
>>
>>  static struct kcore_list kcore_vsyscall;
>> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
>> index 931930a..c22963d 100644
>> --- a/arch/x86/mm/pageattr.c
>> +++ b/arch/x86/mm/pageattr.c
>> @@ -501,21 +501,13 @@ out_unlock:
>>         return do_split;
>>  }
>>
>> -static int split_large_page(pte_t *kpte, unsigned long address)
>> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
>>  {
>>         unsigned long pfn, pfninc = 1;
>>         unsigned int i, level;
>> -       pte_t *pbase, *tmp;
>> +       pte_t *tmp;
>>         pgprot_t ref_prot;
>> -       struct page *base;
>> -
>> -       if (!debug_pagealloc)
>> -               spin_unlock(&cpa_lock);
>> -       base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
>> -       if (!debug_pagealloc)
>> -               spin_lock(&cpa_lock);
>> -       if (!base)
>> -               return -ENOMEM;
>> +       struct page *base = virt_to_page(pbase);
>>
>>         spin_lock(&pgd_lock);
>>         /*
>> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
>>          * up for us already:
>>          */
>>         tmp = lookup_address(address, &level);
>> -       if (tmp != kpte)
>> -               goto out_unlock;
>> +       if (tmp != kpte) {
>> +               spin_unlock(&pgd_lock);
>> +               return 1;
>> +       }
>>
>> -       pbase = (pte_t *)page_address(base);
>>         paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>>         ref_prot = pte_pgprot(pte_clrhuge(*kpte));
>>         /*
>> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
>>          * going on.
>>          */
>>         __flush_tlb_all();
>> +       spin_unlock(&pgd_lock);
>>
>> -       base = NULL;
>> +       return 0;
>> +}
>>
>> -out_unlock:
>> -       /*
>> -        * If we dropped out via the lookup_address check under
>> -        * pgd_lock then stick the page back into the pool:
>> -        */
>> -       if (base)
>> +static int split_large_page(pte_t *kpte, unsigned long address)
>> +{
>> +       pte_t *pbase;
>> +       struct page *base;
>> +
>> +       if (!debug_pagealloc)
>> +               spin_unlock(&cpa_lock);
>> +       base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
>> +       if (!debug_pagealloc)
>> +               spin_lock(&cpa_lock);
>> +       if (!base)
>> +               return -ENOMEM;
>> +
>> +       pbase = (pte_t *)page_address(base);
>> +       if (__split_large_page(kpte, address, pbase))
>>                 __free_page(base);
>> -       spin_unlock(&pgd_lock);
>>
>>         return 0;
>>  }
>> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>> index 8bf820d..0d500be 100644
>> --- a/include/linux/memory_hotplug.h
>> +++ b/include/linux/memory_hotplug.h
>> @@ -85,6 +85,7 @@ extern void __online_page_free(struct page *page);
>>
>>  #ifdef CONFIG_MEMORY_HOTREMOVE
>>  extern bool is_pageblock_removable_nolock(struct page *page);
>> +extern int arch_remove_memory(unsigned long start, unsigned long size);
>>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>>
>>  /* reasonably generic interface to expand the physical pages in a zone  */
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index a9e1579..0c932e1 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1071,6 +1071,7 @@ int __ref remove_memory(int nid, u64 start, u64 size)
> 
> line 1071?  which version does this patch base on?  thanks a lot.
> 
> 
>>         /* remove memmap entry */
>>         firmware_map_remove(start, start + size, "System RAM");
>>
>> +       arch_remove_memory(start, size);
>>  out:
>>         unlock_memory_hotplug();
>>         return ret;
>> --
>> 1.7.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply

* RE: [PATCH 5/7] fsl-dma: fix support for async_tx API
From: Liu Qiang-B32616 @ 2012-08-01  6:03 UTC (permalink / raw)
  To: Ira W. Snyder, linux-crypto@vger.kernel.org
  Cc: Vinod Koul, dan.j.williams@gmail.com, Dan Williams,
	linuxppc-dev@lists.ozlabs.org, Li Yang-R58472
In-Reply-To: <1343778352-5549-6-git-send-email-iws@ovro.caltech.edu>

> -----Original Message-----
> From: Ira W. Snyder [mailto:iws@ovro.caltech.edu]
> Sent: Wednesday, August 01, 2012 7:46 AM
> To: linux-crypto@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org; Liu Qiang-B32616; Ira W. Snyder; Dan
> Williams; Vinod Koul; Li Yang-R58472; Liu Qiang-B32616
> Subject: [PATCH 5/7] fsl-dma: fix support for async_tx API
>=20
> From: "Ira W. Snyder" <iws@ovro.caltech.edu>
>=20
> The current fsldma driver does not support the async_tx API. This is due
> to a bug: all descriptors are released when the hardware is finished
> with them. The async_tx API requires that the ACK bit is set by software
> before descriptors are freed.
>=20
> This bug is easiest to reproduce by enabling both CONFIG_NET_DMA and
> CONFIG_ASYNC_TX, and using the hardware offload support in MD RAID. The
> network stack will force operations on shared DMA channels, and will
> free descriptors which are being used by the MD RAID code.
>=20
> The BUG_ON(async_tx_test_ack(depend_tx)) test in async_tx_submit() will
> be hit, and panic the machine.
>=20
> TASK =3D ee1a94a0[1390] 'md0_raid5' THREAD: ecf40000 CPU: 0
> GPR00: 00000001 ecf41ca0 ee44/921a94a0 0000003f 00000001 c00593e4
> 00000000 00000001
> GPR08: 00000000 a7a7a7a7 00000001 045/920000002 42028042 100a38d4
> ed576d98 00000000
> GPR16: ed5a11b0 00000000 2b162000 00000200 046/920000000 2d555000
> ed3015e8 c15a7aa0
> GPR24: 00000000 c155fc40 00000000 ecb63220 ecf41d28 e47/92f640bb0
> ef640c30 ecf41ca0
> NIP [c02b048c] async_tx_submit+0x6c/0x2b4
> LR [c02b068c] async_tx_submit+0x26c/0x2b4
> Call Trace:
> [ecf41ca0] [c02b068c] async_tx_submit+0x26c/0x2b448/92 (unreliable)
> [ecf41cd0] [c02b0a4c] async_memcpy+0x240/0x25c
> [ecf41d20] [c0421064] async_copy_data+0xa0/0x17c
> [ecf41d70] [c0421cf4] __raid_run_ops+0x874/0xe10
> [ecf41df0] [c0426ee4] handle_stripe+0x820/0x25e8
> [ecf41e90] [c0429080] raid5d+0x3d4/0x5b4
> [ecf41f40] [c04329b8] md_thread+0x138/0x16c
> [ecf41f90] [c008277c] kthread+0x8c/0x90
> [ecf41ff0] [c0011630] kernel_thread+0x4c/0x68
>=20
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Vinod Koul <vinod.koul@intel.com>
> Cc: Li Yang <leoli@freescale.com>
> Cc: Qiang Liu <qiang.liu@freescale.com>
I have no idea where the log is from, if the log is from my patch, please n=
ote it.

> Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
> ---
>  drivers/dma/fsldma.c |  156 +++++++++++++++++++++++++++++++-------------
> ------
>  drivers/dma/fsldma.h |    1 +
>  2 files changed, 97 insertions(+), 60 deletions(-)
>=20
> diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
> index 80edc63..380c1b7 100644
> --- a/drivers/dma/fsldma.c
> +++ b/drivers/dma/fsldma.c
> @@ -410,16 +410,15 @@ static void fsl_dma_free_descriptor(struct
> fsldma_chan *chan, struct fsl_desc_sw
>  }
>=20
>  /**
> - * fsldma_cleanup_descriptor - cleanup and free a single link descriptor
> + * fsldma_run_tx_complete_actions - run callback and unmap descriptor
>   * @chan: Freescale DMA channel
>   * @desc: descriptor to cleanup and free
>   *
>   * This function is used on a descriptor which has been executed by the
> DMA
> - * controller. It will run any callbacks, submit any dependencies, and
> then
> - * free the descriptor.
> + * controller. It will run the callback and unmap the descriptor, if
> requested.
>   */
> -static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
> -				      struct fsl_desc_sw *desc)
> +static void fsldma_run_tx_complete_actions(struct fsldma_chan *chan,
> +					   struct fsl_desc_sw *desc)
>  {
Ah.. I am the original author of this design. Especially some important int=
erfaces,
at least you should add my name or note the idea before you modify it?

>  	struct dma_async_tx_descriptor *txd =3D &desc->async_tx;
>  	struct device *dev =3D chan->common.device->dev;
> @@ -427,6 +426,10 @@ static void fsldma_cleanup_descriptor(struct
> fsldma_chan *chan,
>  	dma_addr_t dst =3D get_desc_dst(chan, desc);
>  	u32 len =3D get_desc_cnt(chan, desc);
>=20
> +	/* Cookies with value zero are already cleaned up */
> +	if (txd->cookie =3D=3D 0)
> +		return;
> +
>  	/* Run the link descriptor callback function */
>  	if (txd->callback) {
>  #ifdef FSL_DMA_LD_DEBUG
> @@ -435,9 +438,6 @@ static void fsldma_cleanup_descriptor(struct
> fsldma_chan *chan,
>  		txd->callback(txd->callback_param);
>  	}
>=20
> -	/* Run any dependencies */
> -	dma_run_dependencies(txd);
> -
>  	/* Unmap the dst buffer, if requested */
>  	if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
>  		if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
> @@ -454,7 +454,68 @@ static void fsldma_cleanup_descriptor(struct
> fsldma_chan *chan,
>  			dma_unmap_page(dev, src, len, DMA_TO_DEVICE);
>  	}
>=20
> -	fsl_dma_free_descriptor(chan, desc);
> +	/*
> +	 * A zeroed cookie indicates that cleanup actions have been
> +	 * run, but the descriptor has not yet been ACKed.
> +	 */
> +	txd->cookie =3D 0;
> +}
> +
> +/**
> + * fsldma_cleanup_descriptors - cleanup and free link descriptors
> + * @chan: Freescale DMA channel
> + *
> + * This function is used to clean up all descriptors which have been
> executed
> + * by the DMA controller. It will run any callbacks, submit any
> dependencies,
> + * and free any descriptors which have been ACKed.
> + *
> + * LOCKING: must NOT hold chan->desc_lock
> + * CONTEXT: any
> + */
> +static void fsldma_cleanup_descriptors(struct fsldma_chan *chan)
> +{
> +	struct fsl_desc_sw *desc, *_desc;
> +	dma_cookie_t cookie =3D 0;
> +	LIST_HEAD(ld_cleanup);
> +	unsigned long flags;
> +
> +	/*
> +	 * Move all descriptors onto a temporary list so that hardware
> +	 * interrupts can be enabled during cleanup
> +	 */
> +	spin_lock_irqsave(&chan->desc_lock, flags);
> +	list_splice_tail_init(&chan->ld_completed, &ld_cleanup);
NACK.
Here is bug, you cannot assume all descriptors in s/w queue have been compl=
eted,
e.g. client submit some descriptors with zero length, and mixed with normal=
 descriptors,
an interrupt is raised, some are finished but some are not completed, but y=
ou will free
all descriptors at all. So, I want to ask what is the standard of completio=
n of a descriptor?
Hardware is very fast is not enough:(

> +	spin_unlock_irqrestore(&chan->desc_lock, flags);
> +
> +	/* Run TX completion actions on completed descriptors */
> +	list_for_each_entry_safe(desc, _desc, &ld_cleanup, node) {
> +		struct dma_async_tx_descriptor *txd =3D &desc->async_tx;
> +		cookie =3D txd->cookie;
> +
> +		/* Clean up this descriptor */
> +		fsldma_run_tx_complete_actions(chan, desc);
> +
> +		/* Run any dependencies */
> +		dma_run_dependencies(txd);
> +
> +		/* Test for ACK and free */
> +		if (async_tx_test_ack(txd))
> +			fsl_dma_free_descriptor(chan, desc);
> +	}
> +
> +	spin_lock_irqsave(&chan->desc_lock, flags);
> +
> +	/*
> +	 * Replace any non-ACKed descriptors on the front of the
> ld_completed
> +	 * list so they will be freed in the order they were submitted
> +	 */
> +	list_splice(&ld_cleanup, &chan->ld_completed);
> +
> +	/* Update the cookie, if it changed */
> +	if (cookie > 0)
> +		chan->common.completed_cookie =3D cookie;
> +
> +	spin_unlock_irqrestore(&chan->desc_lock, flags);
>  }
>=20
>  static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx=
)
> @@ -578,9 +639,11 @@ static void fsl_dma_free_chan_resources(struct
> dma_chan *dchan)
>  	struct fsldma_chan *chan =3D to_fsl_chan(dchan);
>=20
>  	chan_dbg(chan, "free all channel resources\n");
> +	fsldma_cleanup_descriptors(chan);
>  	spin_lock_irq(&chan->desc_lock);
>  	fsldma_free_desc_list(chan, &chan->ld_pending);
>  	fsldma_free_desc_list(chan, &chan->ld_running);
> +	fsldma_free_desc_list(chan, &chan->ld_completed);
>  	spin_unlock_irq(&chan->desc_lock);
>=20
>  	dma_pool_destroy(chan->desc_pool);
> @@ -945,11 +1008,13 @@ static enum dma_status fsl_tx_status(struct
> dma_chan *dchan,
>  	struct fsldma_chan *chan =3D to_fsl_chan(dchan);
>  	enum dma_status ret;
>=20
> -	spin_lock_irq(&chan->desc_lock);
>  	ret =3D dma_cookie_status(dchan, cookie, txstate);
> -	spin_unlock_irq(&chan->desc_lock);
> +	if (ret =3D=3D DMA_SUCCESS)
> +		return ret;
>=20
> -	return ret;
> +	tasklet_schedule(&chan->tasklet);
> +
> +	return dma_cookie_status(dchan, cookie, txstate);
>  }
>=20
>  /*----------------------------------------------------------------------
> ------*/
> @@ -1013,10 +1078,24 @@ static irqreturn_t fsldma_chan_irq(int irq, void
> *data)
>  	if (stat)
>  		chan_err(chan, "irq: unhandled sr 0x%08x\n", stat);
>=20
> +	spin_lock(&chan->desc_lock);
> +	chan->idle =3D true;
> +
> +	/* Move all running descriptors onto the completed list */
> +	list_splice_tail_init(&chan->ld_running, &chan->ld_completed);
> +
> +	/*
> +	 * Start any pending transactions automatically
> +	 *
> +	 * In the ideal case, we keep the DMA controller busy while we go
> +	 * ahead and free the descriptors in the tasklet.
> +	 */
> +	fsl_chan_xfer_ld_queue(chan);
> +	spin_unlock(&chan->desc_lock);
> +
>  	/*
> -	 * Schedule the tasklet to handle all cleanup of the current
> -	 * transaction. It will start a new transaction if there is
> -	 * one pending.
> +	 * Schedule the tasklet to handle all cleanup of the completed
> +	 * descriptors.
>  	 */
>  	tasklet_schedule(&chan->tasklet);
>  	chan_dbg(chan, "irq: Exit\n");
> @@ -1026,53 +1105,9 @@ static irqreturn_t fsldma_chan_irq(int irq, void
> *data)
>  static void dma_do_tasklet(unsigned long data)
>  {
>  	struct fsldma_chan *chan =3D (struct fsldma_chan *)data;
> -	struct fsl_desc_sw *desc, *_desc;
> -	LIST_HEAD(ld_cleanup);
> -	unsigned long flags;
>=20
>  	chan_dbg(chan, "tasklet entry\n");
> -
> -	spin_lock_irqsave(&chan->desc_lock, flags);
> -
> -	/* update the cookie if we have some descriptors to cleanup */
> -	if (!list_empty(&chan->ld_running)) {
> -		dma_cookie_t cookie;
> -
> -		desc =3D to_fsl_desc(chan->ld_running.prev);
> -		cookie =3D desc->async_tx.cookie;
> -		dma_cookie_complete(&desc->async_tx);
> -
> -		chan_dbg(chan, "completed_cookie=3D%d\n", cookie);
> -	}
> -
> -	/*
> -	 * move the descriptors to a temporary list so we can drop the lock
> -	 * during the entire cleanup operation
> -	 */
> -	list_splice_tail_init(&chan->ld_running, &ld_cleanup);
> -
> -	/* the hardware is now idle and ready for more */
> -	chan->idle =3D true;
> -
> -	/*
> -	 * Start any pending transactions automatically
> -	 *
> -	 * In the ideal case, we keep the DMA controller busy while we go
> -	 * ahead and free the descriptors below.
> -	 */
> -	fsl_chan_xfer_ld_queue(chan);
> -	spin_unlock_irqrestore(&chan->desc_lock, flags);
> -
> -	/* Run the callback for each descriptor, in order */
> -	list_for_each_entry_safe(desc, _desc, &ld_cleanup, node) {
> -
> -		/* Remove from the list of transactions */
> -		list_del(&desc->node);
> -
> -		/* Run all cleanup for this descriptor */
> -		fsldma_cleanup_descriptor(chan, desc);
> -	}
> -
> +	fsldma_cleanup_descriptors(chan);
>  	chan_dbg(chan, "tasklet exit\n");
>  }
>=20
> @@ -1253,6 +1288,7 @@ static int __devinit fsl_dma_chan_probe(struct
> fsldma_device *fdev,
>  	spin_lock_init(&chan->desc_lock);
>  	INIT_LIST_HEAD(&chan->ld_pending);
>  	INIT_LIST_HEAD(&chan->ld_running);
> +	INIT_LIST_HEAD(&chan->ld_completed);
>  	chan->idle =3D true;
>=20
>  	chan->common.device =3D &fdev->common;
> diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
> index f5c3879..7ede908 100644
> --- a/drivers/dma/fsldma.h
> +++ b/drivers/dma/fsldma.h
> @@ -140,6 +140,7 @@ struct fsldma_chan {
>  	spinlock_t desc_lock;		/* Descriptor operation lock */
>  	struct list_head ld_pending;	/* Link descriptors queue */
>  	struct list_head ld_running;	/* Link descriptors queue */
> +	struct list_head ld_completed;	/* Link descriptors queue */
>  	struct dma_chan common;		/* DMA common channel */
>  	struct dma_pool *desc_pool;	/* Descriptors pool */
>  	struct device *dev;		/* Channel device */
> --
> 1.7.8.6
>=20

^ permalink raw reply

* Re: [RFC PATCH v5 16/19] memory-hotplug: free memmap of sparse-vmemmap
From: Wen Congyang @ 2012-08-01  6:09 UTC (permalink / raw)
  To: gerald.schaefer
  Cc: linux-s390, linux-ia64, linux-acpi, len.brown, linux-sh,
	linux-kernel, cmetcalf, linux-mm, Yasuaki ISIMATU, paulus,
	minchan.kim, kosaki.motohiro, rientjes, cl, linuxppc-dev, akpm,
	liuj97
In-Reply-To: <20120731142251.5b2cae37@thinkpad>

At 07/31/2012 08:22 PM, Gerald Schaefer Wrote:
> On Fri, 27 Jul 2012 18:34:38 +0800
> Wen Congyang <wency@cn.fujitsu.com> wrote:
> 
>> From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>>
>> All pages of virtual mapping in removed memory cannot be freed, since
>> some pages used as PGD/PUD includes not only removed memory but also
>> other memory. So the patch checks whether page can be freed or not.
>>
>> How to check whether page can be freed or not?
>>  1. When removing memory, the page structs of the revmoved memory are
>> filled with 0FD.
>>  2. All page structs are filled with 0xFD on PT/PMD, PT/PMD can be
>> cleared. In this case, the page used as PT/PMD can be freed.
>>
>> Applying patch, __remove_section() of CONFIG_SPARSEMEM_VMEMMAP is
>> integrated into one. So __remove_section() of
>> CONFIG_SPARSEMEM_VMEMMAP is deleted.
> 
> There should also be generic or dummy versions of the functions
> vmemmap_free_bootmem(), vmemmap_kfree() and
> register_page_bootmem_memmap(). It doesn't compile on other
> archtitectures than x86 as it is now:
> 
> mm/built-in.o: In function `sparse_remove_one_section':
> (.text+0x49fa6): undefined reference to `vmemmap_free_bootmem'
> mm/built-in.o: In function `sparse_remove_one_section':
> (.text+0x49fcc): undefined reference to `vmemmap_kfree'
> mm/built-in.o: In function `register_page_bootmem_info_node':
> (.text+0x57c06): undefined reference to `register_page_bootmem_memmap'
> mm/built-in.o: In function `sparse_add_one_section':
> (.meminit.text+0x2506): undefined reference to `vmemmap_kfree'
> mm/built-in.o: In function `sparse_add_one_section':
> (.meminit.text+0x2528): undefined reference to `vmemmap_kfree'
> make: *** [vmlinux] Error 1
> 
> 

Thanks for testing. I will fix it.

Wen Congyang

^ permalink raw reply

* [PATCH] vphn: fix arch_update_cpu_topology() return value
From: Jesse Larrew @ 2012-08-01  7:22 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel@vger.kernel.org

>From 5dfd547532fca61462dc17fd0bb8e533002c4bc5 Mon Sep 17 00:00:00 2001
From: Jesse Larrew <jlarrew@linux.vnet.ibm.com>
Date: Thu, 7 Jun 2012 16:04:34 -0500

arch_update_cpu_topology() should only return 1 when the topology has
actually changed, and should return 0 otherwise.

This patch fixes a potential bug where rebuild_sched_domains() would
reinitialize the sched domains even when the topology hasn't changed.

Signed-off-by: Jesse Larrew <jlarrew@linux.vnet.ibm.com>
---
 arch/powerpc/mm/numa.c |    7 ++++---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 39b1597..59213cf 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1436,11 +1436,11 @@ static long vphn_get_associativity(unsigned long cpu,
 
 /*
  * Update the node maps and sysfs entries for each cpu whose home node
- * has changed.
+ * has changed. Returns 1 when the topology has changed, and 0 otherwise.
  */
 int arch_update_cpu_topology(void)
 {
-	int cpu, nid, old_nid;
+	int cpu, nid, old_nid, changed = 0;
 	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
 	struct device *dev;
 
@@ -1466,9 +1466,10 @@ int arch_update_cpu_topology(void)
 		dev = get_cpu_device(cpu);
 		if (dev)
 			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
+		changed = 1;
 	}
 
-	return 1;
+	return changed;
 }
 
 static void topology_work_fn(struct work_struct *work)
-- 
1.7.7.6


Jesse Larrew
Software Engineer, Linux on Power Kernel Team
IBM Linux Technology Center
Phone: (512) 973-2052 (T/L: 363-2052)
jlarrew@linux.vnet.ibm.com

^ permalink raw reply related

* Re: [PATCH -V5 03/13] arch/powerpc: Convert virtual address to vpn
From: Aneesh Kumar K.V @ 2012-08-01  7:23 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev
In-Reply-To: <20120801043339.GB24014@drongo>

Paul Mackerras <paulus@samba.org> writes:

> On Mon, Jul 30, 2012 at 04:52:09PM +0530, Aneesh Kumar K.V wrote:
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 
>> This patch convert different functions to take virtual page number
>> instead of virtual address. Virtual page number is virtual address
>> shifted right by VPN_SHIFT (12) bits. This enable us to have an
>> address range of upto 76 bits.
>
> Mostly looks good.  I don't think it's necessary to add the
> BUILD_BUG_ON and BUG_ON conditions.  I would think a comment next to
> the definition saying that VPN_SHIFT can be at most 12 would be
> sufficient.

Updated.

Thanks
-aneesh

^ permalink raw reply

* Re: 3.5+: yaboot, Invalid memory access
From: Christian Kujau @ 2012-08-01  7:30 UTC (permalink / raw)
  To: Tony Breeds; +Cc: linuxppc-dev
In-Reply-To: <20120801020245.GC6467@thor.bakeyournoodle.com>

On Wed, 1 Aug 2012 at 12:02, Tony Breeds wrote:
> 1) can you also upload you vmlinux so we can poke at it.

I've uploaded the vmlinx (and System.map) to the same location:

  http://nerdbynature.de/bits/3.5.0/yaboot/

> 2) What is the dmseg.txt file there?  If you're unable to boot 3.5 how
> did you get the dmesg?

This is a dmesg from the working 3.5.0 (final), i.e. the release version 
which is working. The version that is not booting is "yesterday's git 
tree" or 37cd9600a9e20359b0283983c9e3a55d84347168 to be specific.

Sorry for the confusion. I only put the dmesg there so that people can 
have a look and see what type of machine this is.

Christian.
-- 
BOFH excuse #54:

Evil dogs hypnotised the night shift

^ permalink raw reply

* Re: [PATCH -V5 06/13] arch/powerpc: Increase the slice range to 64TB
From: Aneesh Kumar K.V @ 2012-08-01  7:43 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev
In-Reply-To: <20120801051617.GC24014@drongo>

Paul Mackerras <paulus@samba.org> writes:

> On Mon, Jul 30, 2012 at 04:52:12PM +0530, Aneesh Kumar K.V wrote:
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 
>> This patch makes the high psizes mask as an unsigned char array
>> so that we can have more than 16TB. Currently we support upto
>> 64TB
>
> Comments below...
>
>> diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
>> index b9ee79ce..c355af6 100644
>> --- a/arch/powerpc/mm/slb_low.S
>> +++ b/arch/powerpc/mm/slb_low.S
>> @@ -108,17 +108,34 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
>>  	 * between 4k and 64k standard page size
>>  	 */
>>  #ifdef CONFIG_PPC_MM_SLICES
>> +	/* r10 have esid */
>>  	cmpldi	r10,16
>> -
>> -	/* Get the slice index * 4 in r11 and matching slice size mask in r9 */
>> -	ld	r9,PACALOWSLICESPSIZE(r13)
>> -	sldi	r11,r10,2
>> +	/* below SLICE_LOW_TOP */
>>  	blt	5f
>> -	ld	r9,PACAHIGHSLICEPSIZE(r13)
>> -	srdi	r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT - 2)
>> -	andi.	r11,r11,0x3c
>> -
>> -5:	/* Extract the psize and multiply to get an array offset */
>> +	/*
>> +	 * Handle hpsizes,
>> +	 * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
>> +	 * We use r10 here, later we restore it to esid.
>> +	 * Can we use other register instead of r10 ?
>
> Only r9, r10 and r11 are available here, and you're using them all.
> Restoring r10 with one integer instruction is going to be quicker than
> saving and restoring another register to/from memory.
>
>> +	 */
>> +	srdi    r10,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT) /* index */
>> +	srdi	r11,r10,1			/* r11 is array index */
>> +	addi	r9,r11,PACAHIGHSLICEPSIZE
>> +	lbzx	r9,r9,r13			/* r9 is hpsizes[r11] */
>> +	sldi    r11,r11,1
>> +	subf	r11,r11,r10	/* mask_index = index - (array_index << 1) */
>> +	srdi	r10,r3,28	/* restore r10 with esid */
>> +	b	6f
>
> How about (untested):
>
> 	srdi    r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
> 	addi	r9,r11,PACAHIGHSLICEPSIZE
> 	lbzx	r9,r13,r9			/* r9 is hpsizes[r11] */
> 	/* r11 = (r10 >> 12) & 1, i.e. grab lowest bit of 1T ESID */
> 	rldicl	r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
> 	b	6f
>

nice, I missed the assembly part when you asked to update the c code
in previous review. This change also bring it closer to the c code.

> Note that I swapped the RA and RB arguments for the lbzx.  Our recent
> processors process indexed mode instructions more quickly if the value
> in RB is small.
>
>>  static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
>>  {
>> +	unsigned char *hpsizes;
>> +	int index, mask_index;
>>  	struct slice_mask ret = { 0, 0 };
>>  	unsigned long i;
>> -	u64 psizes;
>> +	u64 lpsizes;
>>  
>> -	psizes = mm->context.low_slices_psize;
>> +	lpsizes = mm->context.low_slices_psize;
>>  	for (i = 0; i < SLICE_NUM_LOW; i++)
>> -		if (((psizes >> (i * 4)) & 0xf) == psize)
>> +		if (((lpsizes >> (i * 4)) & 0xf) == psize)
>>  			ret.low_slices |= 1u << i;
>>  
>> -	psizes = mm->context.high_slices_psize;
>> -	for (i = 0; i < SLICE_NUM_HIGH; i++)
>> -		if (((psizes >> (i * 4)) & 0xf) == psize)
>> +	hpsizes = mm->context.high_slices_psize;
>> +	for (i = 0; i < SLICE_NUM_HIGH; i++) {
>> +		mask_index = i & 0x1;
>> +		index = i >> 1;
>> +		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
>>  			ret.high_slices |= 1u << i;
>
> This needs to be 1ul not 1u, since we are creating a 64-bit mask.
>
>>  static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize)
>>  {
>> +	int index, mask_index;
>>  	/* Write the new slice psize bits */
>> -	u64 lpsizes, hpsizes;
>> +	unsigned char *hpsizes;
>> +	u64 lpsizes;
>>  	unsigned long i, flags;
>>  
>>  	slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
>> @@ -201,14 +208,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
>>  			lpsizes = (lpsizes & ~(0xful << (i * 4))) |
>>  				(((unsigned long)psize) << (i * 4));
>>  
>> +	/* Assign the value back */
>> +	mm->context.low_slices_psize = lpsizes;
>> +
>>  	hpsizes = mm->context.high_slices_psize;
>> -	for (i = 0; i < SLICE_NUM_HIGH; i++)
>> +	for (i = 0; i < SLICE_NUM_HIGH; i++) {
>> +		mask_index = i & 0x1;
>> +		index = i >> 1;
>>  		if (mask.high_slices & (1u << i))
>
> Again, 1ul now.  Check all the other places where we manipulate a
> slice mask to see if there are any other instances of 1u that need to
> be changed.

I ended up with this. 

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 0136040..b4e996a 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -54,7 +54,7 @@ static void slice_print_mask(const char *label, struct slice_mask mask)
 	*(p++) = '-';
 	*(p++) = ' ';
 	for (i = 0; i < SLICE_NUM_HIGH; i++)
-		*(p++) = (mask.high_slices & (1 << i)) ? '1' : '0';
+		*(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
 	*(p++) = 0;
 
 	printk(KERN_DEBUG "%s:%s\n", label, buf);
@@ -84,8 +84,8 @@ static struct slice_mask slice_range_to_mask(unsigned long start,
 	}
 
 	if ((start + len) > SLICE_LOW_TOP)
-		ret.high_slices = (1u << (GET_HIGH_SLICE_INDEX(end) + 1))
-			- (1u << GET_HIGH_SLICE_INDEX(start));
+		ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
+			- (1ul << GET_HIGH_SLICE_INDEX(start));
 
 	return ret;
 }
@@ -135,7 +135,7 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
 
 	for (i = 0; i < SLICE_NUM_HIGH; i++)
 		if (!slice_high_has_vma(mm, i))
-			ret.high_slices |= 1u << i;
+			ret.high_slices |= 1ul << i;
 
 	return ret;
 }
@@ -158,7 +158,7 @@ static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
-			ret.high_slices |= 1u << i;
+			ret.high_slices |= 1ul << i;
 	}
 
 	return ret;
@@ -215,7 +215,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 	for (i = 0; i < SLICE_NUM_HIGH; i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
-		if (mask.high_slices & (1u << i))
+		if (mask.high_slices & (1ul << i))
 			hpsizes[index] = (hpsizes[index] &
 					  ~(0xf << (mask_index * 4))) |
 				(((unsigned long)psize) << (mask_index * 4));


-aneesh

^ permalink raw reply related

* Re: [PATCH -V5 07/13] arch/powerpc: Make some of the PGTABLE_RANGE dependency explicit
From: Aneesh Kumar K.V @ 2012-08-01  7:45 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev
In-Reply-To: <20120801051808.GD24014@drongo>

Paul Mackerras <paulus@samba.org> writes:

> On Mon, Jul 30, 2012 at 04:52:13PM +0530, Aneesh Kumar K.V wrote:
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 
>> slice array size and slice mask size depend on PGTABLE_RANGE. We
>> can't directly include pgtable.h in these header because there is
>> a circular dependency. So add compile time check for these values.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>
> Reviewed-by: Paul Mackerras <paulus@samba.org>
>
> but you might like to change:
>
>> +#define SLICE_MASK_SIZE (PGTABLE_RANG >> 43)
>
> to have PGTABLE_RANGE rather than PGTABLE_RANG.

Updated

Thanks,
-aneesh

^ permalink raw reply

* Re: [PATCH 1/8] ppc/pnv: create bus sensitive PEs
From: Richard Yang @ 2012-08-01  7:49 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev
In-Reply-To: <1340639001-28152-2-git-send-email-shangw@linux.vnet.ibm.com>

On Mon, Jun 25, 2012 at 11:43:14PM +0800, Gavin Shan wrote:
>Basically, there're 2 types of PCI bus sensitive PEs: (A) The PE
>includes single PCI bus. (B) The PE includes the PCI bus and all
>the subordinate PCI buses. At present, we'd like to put PCI bus
>originated by PCI-e link to form PE that contains single PCI bus,
>and the PCIe-to-PCI bridge will form the 2nd type of PE. We don't
>figure out to detect PLX bridge yet. Once we can detect PLX bridge
>some day, we have to put PCI buses originated from the downstream
>port of PLX bridge to the 2nd type of PE.
>
>The patch changes the original implementation for a little bit
>to support 2 types of PCI bus sensitive PEs described as above.
>Also, the function used to retrieve the corresponding PE according
>to the given PCI device has been changed based on that because each
>PCI device should trace the directly associated PE.
>
>Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
>Reviewed-by: Ram Pai <linuxram@us.ibm.com>
>Reviewed-by: Richard Yang <weiyang@linux.vnet.ibm.com>
>---
> arch/powerpc/platforms/powernv/pci-ioda.c |   97 +++++++++++++++++------------
> arch/powerpc/platforms/powernv/pci.h      |   10 +--
> 2 files changed, 63 insertions(+), 44 deletions(-)
>
>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>index fbdd74d..1504795 100644
>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>@@ -548,7 +548,7 @@ static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>  * but in the meantime, we need to protect them to avoid warnings
>  */
> #ifdef CONFIG_PCI_MSI
>-static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev)
>+static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev)
> {
> 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
> 	struct pnv_phb *phb = hose->private_data;
>@@ -560,19 +560,6 @@ static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev)
> 		return NULL;
> 	return &phb->ioda.pe_array[pdn->pe_number];
> }
>-
>-static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev)
>-{
>-	struct pnv_ioda_pe *pe = __pnv_ioda_get_one_pe(dev);
>-
>-	while (!pe && dev->bus->self) {
>-		dev = dev->bus->self;
>-		pe = __pnv_ioda_get_one_pe(dev);
>-		if (pe)
>-			pe = pe->bus_pe;
>-	}
>-	return pe;
>-}
> #endif /* CONFIG_PCI_MSI */
>
> static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb,
>@@ -589,7 +576,11 @@ static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb,
> 		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
> 		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
> 		parent = pe->pbus->self;
>-		count = pe->pbus->subordinate - pe->pbus->secondary + 1;
>+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>+			count = pe->pbus->subordinate - pe->pbus->secondary + 1;
>+		else
>+			count = 1;
>+
> 		switch(count) {
> 		case  1: bcomp = OpalPciBusAll;		break;
> 		case  2: bcomp = OpalPciBus7Bits;	break;
>@@ -699,6 +690,7 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
> 	return 10;
> }
>
>+#if 0
> static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev)
> {
> 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>@@ -767,6 +759,7 @@ static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>
> 	return pe;
> }
>+#endif /* Useful for SRIOV case */
>
> static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
> {
>@@ -784,43 +777,47 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
> 		pdn->pcidev = dev;
> 		pdn->pe_number = pe->pe_number;
> 		pe->dma_weight += pnv_ioda_dma_weight(dev);
>-		if (dev->subordinate)
>+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> 			pnv_ioda_setup_same_PE(dev->subordinate, pe);
> 	}
> }
>
>-static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev,
>-					    struct pnv_ioda_pe *ppe)
>+/*
>+ * There're 2 types of PCI bus sensitive PEs: One that is compromised of
>+ * single PCI bus. Another one that contains the primary PCI bus and its
>+ * subordinate PCI devices and buses. The second type of PE is normally
>+ * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
>+ */
>+static void __devinit pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
> {
>-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>+	struct pci_controller *hose = pci_bus_to_host(bus);
> 	struct pnv_phb *phb = hose->private_data;
>-	struct pci_bus *bus = dev->subordinate;
> 	struct pnv_ioda_pe *pe;
> 	int pe_num;
>
>-	if (!bus) {
>-		pr_warning("%s: Bridge without a subordinate bus !\n",
>-			   pci_name(dev));
>-		return;
>-	}
> 	pe_num = pnv_ioda_alloc_pe(phb);
> 	if (pe_num == IODA_INVALID_PE) {
>-		pr_warning("%s: Not enough PE# available, disabling bus\n",
>-			   pci_name(dev));
>+		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
>+			__func__, pci_domain_nr(bus), bus->number);
> 		return;
> 	}
>
> 	pe = &phb->ioda.pe_array[pe_num];
>-	ppe->bus_pe = pe;
>+	pe->flags = (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
> 	pe->pbus = bus;
>+	pe->pe_number = pe_num;

Gavin, 

Sorry for the late reply. I am not sure I a replying on the latest code. If
not, please point me out. 

I think we don't need to add this line. the pe->pe_number is already set in
pnv_ioda_alloc_pe().

> 	pe->pdev = NULL;
> 	pe->tce32_seg = -1;
> 	pe->mve_number = -1;
> 	pe->rid = bus->secondary << 8;
> 	pe->dma_weight = 0;
>
>-	pe_info(pe, "Secondary busses %d..%d associated with PE\n",
>-		bus->secondary, bus->subordinate);
>+	if (all)
>+		pe_info(pe, "Secondary busses %d..%d associated with PE#%d\n",
>+			bus->secondary, bus->subordinate, pe_num);
>+	else
>+		pe_info(pe, "Secondary busses %d associated with PE#%d\n",
>+			bus->secondary, pe_num);
>
> 	if (pnv_ioda_configure_pe(phb, pe)) {
> 		/* XXX What do we do here ? */
>@@ -848,17 +845,33 @@ static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev,
> static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus)
> {
> 	struct pci_dev *dev;
>-	struct pnv_ioda_pe *pe;
>+
>+	pnv_ioda_setup_bus_PE(bus, 0);
>
> 	list_for_each_entry(dev, &bus->devices, bus_list) {
>-		pe = pnv_ioda_setup_dev_PE(dev);
>-		if (pe == NULL)
>-			continue;
>-		/* Leaving the PCIe domain ... single PE# */
>-		if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
>-			pnv_ioda_setup_bus_PE(dev, pe);
>-		else if (dev->subordinate)
>-			pnv_ioda_setup_PEs(dev->subordinate);
>+		if (dev->subordinate) {
>+			if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
>+				pnv_ioda_setup_bus_PE(dev->subordinate, 1);
>+			else
>+				pnv_ioda_setup_PEs(dev->subordinate);
>+		}
>+	}
>+}
>+
>+/*
>+ * Configure PEs so that the downstream PCI buses and devices
>+ * could have their associated PE#. Unfortunately, we didn't
>+ * figure out the way to identify the PLX bridge yet. So we
>+ * simply put the PCI bus and the subordinate behind the root
>+ * port to PE# here. The game rule here is expected to be changed
>+ * as soon as we can detected PLX bridge correctly.
>+ */
>+static void __devinit pnv_pci_ioda_setup_PEs(void)
>+{
>+	struct pci_controller *hose, *tmp;
>+
>+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
>+		pnv_ioda_setup_PEs(hose->bus);
> 	}
> }
>
>@@ -1139,6 +1152,11 @@ static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose)
> 	}
> }
>
>+static void __devinit pnv_pci_ioda_fixup(void)
>+{
>+	pnv_pci_ioda_setup_PEs();
>+}
>+
> /* Prevent enabling devices for which we couldn't properly
>  * assign a PE
>  */
>@@ -1305,6 +1323,7 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np)
> 	 * ourselves here
> 	 */
> 	ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb;
>+	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
> 	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
> 	pci_add_flags(PCI_PROBE_ONLY | PCI_REASSIGN_ALL_RSRC);
>
>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>index 8bc4796..0cb760c 100644
>--- a/arch/powerpc/platforms/powernv/pci.h
>+++ b/arch/powerpc/platforms/powernv/pci.h
>@@ -17,9 +17,14 @@ enum pnv_phb_model {
> };
>
> #define PNV_PCI_DIAG_BUF_SIZE	4096
>+#define PNV_IODA_PE_DEV		(1 << 0)	/* PE has single PCI device	*/
>+#define PNV_IODA_PE_BUS		(1 << 1)	/* PE has primary PCI bus	*/
>+#define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/
>
> /* Data associated with a PE, including IOMMU tracking etc.. */
> struct pnv_ioda_pe {
>+	unsigned long		flags;
>+
> 	/* A PE can be associated with a single device or an
> 	 * entire bus (& children). In the former case, pdev
> 	 * is populated, in the later case, pbus is.
>@@ -40,11 +45,6 @@ struct pnv_ioda_pe {
> 	 */
> 	unsigned int		dma_weight;
>
>-	/* This is a PCI-E -> PCI-X bridge, this points to the
>-	 * corresponding bus PE
>-	 */
>-	struct pnv_ioda_pe	*bus_pe;
>-
> 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
> 	int			tce32_seg;
> 	int			tce32_segcount;
>-- 
>1.7.9.5
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@lists.ozlabs.org
>https://lists.ozlabs.org/listinfo/linuxppc-dev

-- 
Richard Yang
Help you, Help me

^ permalink raw reply

* Re: [PATCH -V5 11/13] arch/powerpc: properly isolate kernel and user proto-VSID
From: Aneesh Kumar K.V @ 2012-08-01  7:58 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev
In-Reply-To: <20120801043147.GA24014@drongo>

Paul Mackerras <paulus@samba.org> writes:

> On Mon, Jul 30, 2012 at 04:52:17PM +0530, Aneesh Kumar K.V wrote:
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 
>> The proto-VSID space is divided into two class
>> User:   0 to 2^(CONTEXT_BITS + USER_ESID_BITS) -1
>> kernel: 2^(CONTEXT_BITS + USER_ESID_BITS) to 2^(VSID_BITS) - 1
>> 
>> With KERNEL_START at 0xc000000000000000, the proto vsid for
>> the kernel ends up with 0xc00000000 (36 bits). With 64TB
>> patchset we need to have kernel proto-VSID in the
>> [2^37 to 2^38 - 1] range due to the increased USER_ESID_BITS.
>
> This needs to be rolled in with the previous patch, otherwise you'll
> break bisection.
>
>> diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
>> index db2cb3f..405d380 100644
>> --- a/arch/powerpc/mm/slb_low.S
>> +++ b/arch/powerpc/mm/slb_low.S
>> @@ -57,8 +57,16 @@ _GLOBAL(slb_allocate_realmode)
>>  _GLOBAL(slb_miss_kernel_load_linear)
>>  	li	r11,0
>>  BEGIN_FTR_SECTION
>> +	li	r9,0x1
>> +	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>>  	b	slb_finish_load
>>  END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
>> +	li	r9,0x1
>> +	/*
>> +	 * shift 12 bits less here, slb_finish_load_1T will do
>> +	 * the necessary shits
>> +	 */
>> +	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>>  	b	slb_finish_load_1T
>
> Since you're actually doing exactly the same instructions in the 256M
> and 1T segment cases, why not do the li; rldimi before the
> BEGIN_FTR_SECTION?
>
>> @@ -86,8 +94,16 @@ _GLOBAL(slb_miss_kernel_load_vmemmap)
>>  	li	r11,0
>>  6:
>>  BEGIN_FTR_SECTION
>> +	li	r9,0x1
>> +	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>>  	b	slb_finish_load
>>  END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
>> +	li	r9,0x1
>> +	/*
>> +	 * shift 12 bits less here, slb_finish_load_1T will do
>> +	 * the necessary shits
>> +	 */
>> +	rldimi  r10,r9,(CONTEXT_BITS + USER_ESID_BITS),0
>>  	b	slb_finish_load_1T
>
> And similarly here.
>

Folded to the previous patch and updated

-aneesh

^ permalink raw reply

* RE: [PATCH] powerpc/fsl: mpic timer driver
From: Wang Dongsheng-B40534 @ 2012-08-01  8:19 UTC (permalink / raw)
  To: Kumar Gala
  Cc: paulus@samba.org, linuxppc-dev@lists.ozlabs.org,
	Wood Scott-B07421
In-Reply-To: <267909D0-5FDC-4864-AA4B-189A651889A6@kernel.crashing.org>



> -----Original Message-----
> From: Kumar Gala [mailto:galak@kernel.crashing.org]
> Sent: Tuesday, July 31, 2012 10:31 PM
> To: Wang Dongsheng-B40534
> Cc: benh@kernel.crashing.org; paulus@samba.org; Wood Scott-B07421;
> linuxppc-dev@lists.ozlabs.org
> Subject: Re: [PATCH] powerpc/fsl: mpic timer driver
>=20
>=20
> On Jul 31, 2012, at 2:58 AM, Wang Dongsheng-B40534 wrote:
>=20
> >
> >
> >> -----Original Message-----
> >> From: Kumar Gala [mailto:galak@kernel.crashing.org]
> >> Sent: Friday, July 27, 2012 9:14 PM
> >> To: Wang Dongsheng-B40534
> >> Cc: benh@kernel.crashing.org; paulus@samba.org; Wood Scott-B07421;
> >> linuxppc-dev@lists.ozlabs.org
> >> Subject: Re: [PATCH] powerpc/fsl: mpic timer driver
> >>
> >>
> >> On Jul 27, 2012, at 1:20 AM, <Dongsheng.wang@freescale.com>
> >> <Dongsheng.wang@freescale.com> wrote:
> >>
> >>> From: Wang Dongsheng <Dongsheng.Wang@freescale.com>
> >>>
> >>> Global timers A and B internal to the PIC. The two independent
> >>> groups of global timer, group A and group B, are identical in their
> >> functionality.
> >>> The hardware timer generates an interrupt on every timer cycle.
> >>> e.g
> >>> Power management can use the hardware timer to wake up the machine.
> >>>
> >>> Signed-off-by: Wang Dongsheng <Dongsheng.Wang@freescale.com>
> >>> Signed-off-by: Li Yang <leoli@freescale.com>
> >>
> >> How much of this is FSL specific vs openpic?  OpenPIC spec's timer
> >> support (only a single group).
> >>
> > [Wang Dongsheng] Yes, OpenPIC only a single group timer.
> > FSL: add more register, features and group.
> > This patch only to support FSL chip.
> > "mpic_timer.c" -> "fsl_mpic_timer.c"
> > I will modify the description of the patch. how about?
>=20
> I'd rather we support both, can we not use the MPIC_FSL flag to deal with
> FSL specific behavior?
>=20
[Wang Dongsheng] This patch, most of them are done processing cascade mode.
Cascade is a FSL chip function. I think this patch is very targeted.
I think that the patch only supports FSL chip.

> - k
>=20

^ permalink raw reply

* Re: [PATCH 1/8] ppc/pnv: create bus sensitive PEs
From: Gavin Shan @ 2012-08-01  8:26 UTC (permalink / raw)
  To: Richard Yang; +Cc: linuxppc-dev, Gavin Shan
In-Reply-To: <20120801074941.GA10239@richard.(null)>

On Wed, Aug 01, 2012 at 03:49:41AM -0400, Richard Yang wrote:
>On Mon, Jun 25, 2012 at 11:43:14PM +0800, Gavin Shan wrote:
>>Basically, there're 2 types of PCI bus sensitive PEs: (A) The PE
>>includes single PCI bus. (B) The PE includes the PCI bus and all
>>the subordinate PCI buses. At present, we'd like to put PCI bus
>>originated by PCI-e link to form PE that contains single PCI bus,
>>and the PCIe-to-PCI bridge will form the 2nd type of PE. We don't
>>figure out to detect PLX bridge yet. Once we can detect PLX bridge
>>some day, we have to put PCI buses originated from the downstream
>>port of PLX bridge to the 2nd type of PE.
>>
>>The patch changes the original implementation for a little bit
>>to support 2 types of PCI bus sensitive PEs described as above.
>>Also, the function used to retrieve the corresponding PE according
>>to the given PCI device has been changed based on that because each
>>PCI device should trace the directly associated PE.
>>
>>Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
>>Reviewed-by: Ram Pai <linuxram@us.ibm.com>
>>Reviewed-by: Richard Yang <weiyang@linux.vnet.ibm.com>
>>---
>> arch/powerpc/platforms/powernv/pci-ioda.c |   97 +++++++++++++++++------------
>> arch/powerpc/platforms/powernv/pci.h      |   10 +--
>> 2 files changed, 63 insertions(+), 44 deletions(-)
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>index fbdd74d..1504795 100644
>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>@@ -548,7 +548,7 @@ static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>  * but in the meantime, we need to protect them to avoid warnings
>>  */
>> #ifdef CONFIG_PCI_MSI
>>-static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev)
>>+static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev)
>> {
>> 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>> 	struct pnv_phb *phb = hose->private_data;
>>@@ -560,19 +560,6 @@ static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev)
>> 		return NULL;
>> 	return &phb->ioda.pe_array[pdn->pe_number];
>> }
>>-
>>-static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev)
>>-{
>>-	struct pnv_ioda_pe *pe = __pnv_ioda_get_one_pe(dev);
>>-
>>-	while (!pe && dev->bus->self) {
>>-		dev = dev->bus->self;
>>-		pe = __pnv_ioda_get_one_pe(dev);
>>-		if (pe)
>>-			pe = pe->bus_pe;
>>-	}
>>-	return pe;
>>-}
>> #endif /* CONFIG_PCI_MSI */
>>
>> static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb,
>>@@ -589,7 +576,11 @@ static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb,
>> 		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>> 		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>> 		parent = pe->pbus->self;
>>-		count = pe->pbus->subordinate - pe->pbus->secondary + 1;
>>+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>+			count = pe->pbus->subordinate - pe->pbus->secondary + 1;
>>+		else
>>+			count = 1;
>>+
>> 		switch(count) {
>> 		case  1: bcomp = OpalPciBusAll;		break;
>> 		case  2: bcomp = OpalPciBus7Bits;	break;
>>@@ -699,6 +690,7 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
>> 	return 10;
>> }
>>
>>+#if 0
>> static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>> {
>> 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>@@ -767,6 +759,7 @@ static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>>
>> 	return pe;
>> }
>>+#endif /* Useful for SRIOV case */
>>
>> static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>> {
>>@@ -784,43 +777,47 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>> 		pdn->pcidev = dev;
>> 		pdn->pe_number = pe->pe_number;
>> 		pe->dma_weight += pnv_ioda_dma_weight(dev);
>>-		if (dev->subordinate)
>>+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>> 			pnv_ioda_setup_same_PE(dev->subordinate, pe);
>> 	}
>> }
>>
>>-static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev,
>>-					    struct pnv_ioda_pe *ppe)
>>+/*
>>+ * There're 2 types of PCI bus sensitive PEs: One that is compromised of
>>+ * single PCI bus. Another one that contains the primary PCI bus and its
>>+ * subordinate PCI devices and buses. The second type of PE is normally
>>+ * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
>>+ */
>>+static void __devinit pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>> {
>>-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>+	struct pci_controller *hose = pci_bus_to_host(bus);
>> 	struct pnv_phb *phb = hose->private_data;
>>-	struct pci_bus *bus = dev->subordinate;
>> 	struct pnv_ioda_pe *pe;
>> 	int pe_num;
>>
>>-	if (!bus) {
>>-		pr_warning("%s: Bridge without a subordinate bus !\n",
>>-			   pci_name(dev));
>>-		return;
>>-	}
>> 	pe_num = pnv_ioda_alloc_pe(phb);
>> 	if (pe_num == IODA_INVALID_PE) {
>>-		pr_warning("%s: Not enough PE# available, disabling bus\n",
>>-			   pci_name(dev));
>>+		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
>>+			__func__, pci_domain_nr(bus), bus->number);
>> 		return;
>> 	}
>>
>> 	pe = &phb->ioda.pe_array[pe_num];
>>-	ppe->bus_pe = pe;
>>+	pe->flags = (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>> 	pe->pbus = bus;
>>+	pe->pe_number = pe_num;
>
>Gavin, 
>
>Sorry for the late reply. I am not sure I a replying on the latest code. If
>not, please point me out. 
>
>I think we don't need to add this line. the pe->pe_number is already set in
>pnv_ioda_alloc_pe().
>

Thanks, Richard. I think we probablly need remove the following line in pnv_ioda_alloc_pe()
instead of the line you pointed because pnv_ioda_alloc_pe() might return invalid
PE number (-1). That will eventually cause data corruption while using "-1" to
referring phb->ioda.pe_array[], even the situation shouldn't happen for now :-)

	phb->ioda.pe_array[pe].pe_number = pe;

Let me change it accordingly in next version. The series of patches is pending
for the patches against PCI core change. The later one is waiting for Bjorn's
confirm.

Thanks,
Gavin

>> 	pe->pdev = NULL;
>> 	pe->tce32_seg = -1;
>> 	pe->mve_number = -1;
>> 	pe->rid = bus->secondary << 8;
>> 	pe->dma_weight = 0;
>>
>>-	pe_info(pe, "Secondary busses %d..%d associated with PE\n",
>>-		bus->secondary, bus->subordinate);
>>+	if (all)
>>+		pe_info(pe, "Secondary busses %d..%d associated with PE#%d\n",
>>+			bus->secondary, bus->subordinate, pe_num);
>>+	else
>>+		pe_info(pe, "Secondary busses %d associated with PE#%d\n",
>>+			bus->secondary, pe_num);
>>
>> 	if (pnv_ioda_configure_pe(phb, pe)) {
>> 		/* XXX What do we do here ? */
>>@@ -848,17 +845,33 @@ static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev,
>> static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus)
>> {
>> 	struct pci_dev *dev;
>>-	struct pnv_ioda_pe *pe;
>>+
>>+	pnv_ioda_setup_bus_PE(bus, 0);
>>
>> 	list_for_each_entry(dev, &bus->devices, bus_list) {
>>-		pe = pnv_ioda_setup_dev_PE(dev);
>>-		if (pe == NULL)
>>-			continue;
>>-		/* Leaving the PCIe domain ... single PE# */
>>-		if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
>>-			pnv_ioda_setup_bus_PE(dev, pe);
>>-		else if (dev->subordinate)
>>-			pnv_ioda_setup_PEs(dev->subordinate);
>>+		if (dev->subordinate) {
>>+			if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
>>+				pnv_ioda_setup_bus_PE(dev->subordinate, 1);
>>+			else
>>+				pnv_ioda_setup_PEs(dev->subordinate);
>>+		}
>>+	}
>>+}
>>+
>>+/*
>>+ * Configure PEs so that the downstream PCI buses and devices
>>+ * could have their associated PE#. Unfortunately, we didn't
>>+ * figure out the way to identify the PLX bridge yet. So we
>>+ * simply put the PCI bus and the subordinate behind the root
>>+ * port to PE# here. The game rule here is expected to be changed
>>+ * as soon as we can detected PLX bridge correctly.
>>+ */
>>+static void __devinit pnv_pci_ioda_setup_PEs(void)
>>+{
>>+	struct pci_controller *hose, *tmp;
>>+
>>+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
>>+		pnv_ioda_setup_PEs(hose->bus);
>> 	}
>> }
>>
>>@@ -1139,6 +1152,11 @@ static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose)
>> 	}
>> }
>>
>>+static void __devinit pnv_pci_ioda_fixup(void)
>>+{
>>+	pnv_pci_ioda_setup_PEs();
>>+}
>>+
>> /* Prevent enabling devices for which we couldn't properly
>>  * assign a PE
>>  */
>>@@ -1305,6 +1323,7 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np)
>> 	 * ourselves here
>> 	 */
>> 	ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb;
>>+	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
>> 	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
>> 	pci_add_flags(PCI_PROBE_ONLY | PCI_REASSIGN_ALL_RSRC);
>>
>>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>index 8bc4796..0cb760c 100644
>>--- a/arch/powerpc/platforms/powernv/pci.h
>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>@@ -17,9 +17,14 @@ enum pnv_phb_model {
>> };
>>
>> #define PNV_PCI_DIAG_BUF_SIZE	4096
>>+#define PNV_IODA_PE_DEV		(1 << 0)	/* PE has single PCI device	*/
>>+#define PNV_IODA_PE_BUS		(1 << 1)	/* PE has primary PCI bus	*/
>>+#define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/
>>
>> /* Data associated with a PE, including IOMMU tracking etc.. */
>> struct pnv_ioda_pe {
>>+	unsigned long		flags;
>>+
>> 	/* A PE can be associated with a single device or an
>> 	 * entire bus (& children). In the former case, pdev
>> 	 * is populated, in the later case, pbus is.
>>@@ -40,11 +45,6 @@ struct pnv_ioda_pe {
>> 	 */
>> 	unsigned int		dma_weight;
>>
>>-	/* This is a PCI-E -> PCI-X bridge, this points to the
>>-	 * corresponding bus PE
>>-	 */
>>-	struct pnv_ioda_pe	*bus_pe;
>>-
>> 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>> 	int			tce32_seg;
>> 	int			tce32_segcount;
>>-- 
>>1.7.9.5
>>
>>_______________________________________________
>>Linuxppc-dev mailing list
>>Linuxppc-dev@lists.ozlabs.org
>>https://lists.ozlabs.org/listinfo/linuxppc-dev
>
>-- 
>Richard Yang
>Help you, Help me

^ permalink raw reply

* Re: [PATCH 1/8] ppc/pnv: create bus sensitive PEs
From: Richard Yang @ 2012-08-01  9:04 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev, Richard Yang
In-Reply-To: <20120801082654.GA29143@shangw.(null)>

On Wed, Aug 01, 2012 at 04:26:54PM +0800, Gavin Shan wrote:
>On Wed, Aug 01, 2012 at 03:49:41AM -0400, Richard Yang wrote:
>>On Mon, Jun 25, 2012 at 11:43:14PM +0800, Gavin Shan wrote:
>>>Basically, there're 2 types of PCI bus sensitive PEs: (A) The PE
>>>includes single PCI bus. (B) The PE includes the PCI bus and all
>>>the subordinate PCI buses. At present, we'd like to put PCI bus
>>>originated by PCI-e link to form PE that contains single PCI bus,
>>>and the PCIe-to-PCI bridge will form the 2nd type of PE. We don't
>>>figure out to detect PLX bridge yet. Once we can detect PLX bridge
>>>some day, we have to put PCI buses originated from the downstream
>>>port of PLX bridge to the 2nd type of PE.
>>>
>>>The patch changes the original implementation for a little bit
>>>to support 2 types of PCI bus sensitive PEs described as above.
>>>Also, the function used to retrieve the corresponding PE according
>>>to the given PCI device has been changed based on that because each
>>>PCI device should trace the directly associated PE.
>>>
>>>Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
>>>Reviewed-by: Ram Pai <linuxram@us.ibm.com>
>>>Reviewed-by: Richard Yang <weiyang@linux.vnet.ibm.com>
>>>---
>>> arch/powerpc/platforms/powernv/pci-ioda.c |   97 +++++++++++++++++------------
>>> arch/powerpc/platforms/powernv/pci.h      |   10 +--
>>> 2 files changed, 63 insertions(+), 44 deletions(-)
>>>
>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>index fbdd74d..1504795 100644
>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>@@ -548,7 +548,7 @@ static void __devinit pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>>  * but in the meantime, we need to protect them to avoid warnings
>>>  */
>>> #ifdef CONFIG_PCI_MSI
>>>-static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev)
>>>+static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev)
>>> {
>>> 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>> 	struct pnv_phb *phb = hose->private_data;
>>>@@ -560,19 +560,6 @@ static struct pnv_ioda_pe * __devinit __pnv_ioda_get_one_pe(struct pci_dev *dev)
>>> 		return NULL;
>>> 	return &phb->ioda.pe_array[pdn->pe_number];
>>> }
>>>-
>>>-static struct pnv_ioda_pe * __devinit pnv_ioda_get_pe(struct pci_dev *dev)
>>>-{
>>>-	struct pnv_ioda_pe *pe = __pnv_ioda_get_one_pe(dev);
>>>-
>>>-	while (!pe && dev->bus->self) {
>>>-		dev = dev->bus->self;
>>>-		pe = __pnv_ioda_get_one_pe(dev);
>>>-		if (pe)
>>>-			pe = pe->bus_pe;
>>>-	}
>>>-	return pe;
>>>-}
>>> #endif /* CONFIG_PCI_MSI */
>>>
>>> static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb,
>>>@@ -589,7 +576,11 @@ static int __devinit pnv_ioda_configure_pe(struct pnv_phb *phb,
>>> 		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>> 		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>> 		parent = pe->pbus->self;
>>>-		count = pe->pbus->subordinate - pe->pbus->secondary + 1;
>>>+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>>+			count = pe->pbus->subordinate - pe->pbus->secondary + 1;
>>>+		else
>>>+			count = 1;
>>>+
>>> 		switch(count) {
>>> 		case  1: bcomp = OpalPciBusAll;		break;
>>> 		case  2: bcomp = OpalPciBus7Bits;	break;
>>>@@ -699,6 +690,7 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
>>> 	return 10;
>>> }
>>>
>>>+#if 0
>>> static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>>> {
>>> 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>@@ -767,6 +759,7 @@ static struct pnv_ioda_pe * __devinit pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>>>
>>> 	return pe;
>>> }
>>>+#endif /* Useful for SRIOV case */
>>>
>>> static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>>> {
>>>@@ -784,43 +777,47 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
>>> 		pdn->pcidev = dev;
>>> 		pdn->pe_number = pe->pe_number;
>>> 		pe->dma_weight += pnv_ioda_dma_weight(dev);
>>>-		if (dev->subordinate)
>>>+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>> 			pnv_ioda_setup_same_PE(dev->subordinate, pe);
>>> 	}
>>> }
>>>
>>>-static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev,
>>>-					    struct pnv_ioda_pe *ppe)
>>>+/*
>>>+ * There're 2 types of PCI bus sensitive PEs: One that is compromised of
>>>+ * single PCI bus. Another one that contains the primary PCI bus and its
>>>+ * subordinate PCI devices and buses. The second type of PE is normally
>>>+ * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
>>>+ */
>>>+static void __devinit pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>> {
>>>-	struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>+	struct pci_controller *hose = pci_bus_to_host(bus);
>>> 	struct pnv_phb *phb = hose->private_data;
>>>-	struct pci_bus *bus = dev->subordinate;
>>> 	struct pnv_ioda_pe *pe;
>>> 	int pe_num;
>>>
>>>-	if (!bus) {
>>>-		pr_warning("%s: Bridge without a subordinate bus !\n",
>>>-			   pci_name(dev));
>>>-		return;
>>>-	}
>>> 	pe_num = pnv_ioda_alloc_pe(phb);
>>> 	if (pe_num == IODA_INVALID_PE) {
>>>-		pr_warning("%s: Not enough PE# available, disabling bus\n",
>>>-			   pci_name(dev));
>>>+		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
>>>+			__func__, pci_domain_nr(bus), bus->number);
>>> 		return;
>>> 	}
>>>
>>> 	pe = &phb->ioda.pe_array[pe_num];
>>>-	ppe->bus_pe = pe;
>>>+	pe->flags = (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>>> 	pe->pbus = bus;
>>>+	pe->pe_number = pe_num;
>>
>>Gavin, 
>>
>>Sorry for the late reply. I am not sure I a replying on the latest code. If
>>not, please point me out. 
>>
>>I think we don't need to add this line. the pe->pe_number is already set in
>>pnv_ioda_alloc_pe().
>>
>
>Thanks, Richard. I think we probablly need remove the following line in pnv_ioda_alloc_pe()
>instead of the line you pointed because pnv_ioda_alloc_pe() might return invalid
>PE number (-1). That will eventually cause data corruption while using "-1" to
>referring phb->ioda.pe_array[], even the situation shouldn't happen for now :-)
>
>	phb->ioda.pe_array[pe].pe_number = pe;

oh, so it is not proper to set pe_number = -1 in the pe_array, right?

>
>Let me change it accordingly in next version. The series of patches is pending
>for the patches against PCI core change. The later one is waiting for Bjorn's
>confirm.
>
>Thanks,
>Gavin
>
>>> 	pe->pdev = NULL;
>>> 	pe->tce32_seg = -1;
>>> 	pe->mve_number = -1;
>>> 	pe->rid = bus->secondary << 8;
>>> 	pe->dma_weight = 0;
>>>
>>>-	pe_info(pe, "Secondary busses %d..%d associated with PE\n",
>>>-		bus->secondary, bus->subordinate);
>>>+	if (all)
>>>+		pe_info(pe, "Secondary busses %d..%d associated with PE#%d\n",
>>>+			bus->secondary, bus->subordinate, pe_num);
>>>+	else
>>>+		pe_info(pe, "Secondary busses %d associated with PE#%d\n",
>>>+			bus->secondary, pe_num);
>>>
>>> 	if (pnv_ioda_configure_pe(phb, pe)) {
>>> 		/* XXX What do we do here ? */
>>>@@ -848,17 +845,33 @@ static void __devinit pnv_ioda_setup_bus_PE(struct pci_dev *dev,
>>> static void __devinit pnv_ioda_setup_PEs(struct pci_bus *bus)
>>> {
>>> 	struct pci_dev *dev;
>>>-	struct pnv_ioda_pe *pe;
>>>+
>>>+	pnv_ioda_setup_bus_PE(bus, 0);
>>>
>>> 	list_for_each_entry(dev, &bus->devices, bus_list) {
>>>-		pe = pnv_ioda_setup_dev_PE(dev);
>>>-		if (pe == NULL)
>>>-			continue;
>>>-		/* Leaving the PCIe domain ... single PE# */
>>>-		if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
>>>-			pnv_ioda_setup_bus_PE(dev, pe);
>>>-		else if (dev->subordinate)
>>>-			pnv_ioda_setup_PEs(dev->subordinate);
>>>+		if (dev->subordinate) {
>>>+			if (dev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
>>>+				pnv_ioda_setup_bus_PE(dev->subordinate, 1);
>>>+			else
>>>+				pnv_ioda_setup_PEs(dev->subordinate);
>>>+		}
>>>+	}
>>>+}
>>>+
>>>+/*
>>>+ * Configure PEs so that the downstream PCI buses and devices
>>>+ * could have their associated PE#. Unfortunately, we didn't
>>>+ * figure out the way to identify the PLX bridge yet. So we
>>>+ * simply put the PCI bus and the subordinate behind the root
>>>+ * port to PE# here. The game rule here is expected to be changed
>>>+ * as soon as we can detected PLX bridge correctly.
>>>+ */
>>>+static void __devinit pnv_pci_ioda_setup_PEs(void)
>>>+{
>>>+	struct pci_controller *hose, *tmp;
>>>+
>>>+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
>>>+		pnv_ioda_setup_PEs(hose->bus);
>>> 	}
>>> }
>>>
>>>@@ -1139,6 +1152,11 @@ static void __devinit pnv_pci_ioda_fixup_phb(struct pci_controller *hose)
>>> 	}
>>> }
>>>
>>>+static void __devinit pnv_pci_ioda_fixup(void)
>>>+{
>>>+	pnv_pci_ioda_setup_PEs();
>>>+}
>>>+
>>> /* Prevent enabling devices for which we couldn't properly
>>>  * assign a PE
>>>  */
>>>@@ -1305,6 +1323,7 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np)
>>> 	 * ourselves here
>>> 	 */
>>> 	ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb;
>>>+	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
>>> 	ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
>>> 	pci_add_flags(PCI_PROBE_ONLY | PCI_REASSIGN_ALL_RSRC);
>>>
>>>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>>>index 8bc4796..0cb760c 100644
>>>--- a/arch/powerpc/platforms/powernv/pci.h
>>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>>@@ -17,9 +17,14 @@ enum pnv_phb_model {
>>> };
>>>
>>> #define PNV_PCI_DIAG_BUF_SIZE	4096
>>>+#define PNV_IODA_PE_DEV		(1 << 0)	/* PE has single PCI device	*/
>>>+#define PNV_IODA_PE_BUS		(1 << 1)	/* PE has primary PCI bus	*/
>>>+#define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/
>>>
>>> /* Data associated with a PE, including IOMMU tracking etc.. */
>>> struct pnv_ioda_pe {
>>>+	unsigned long		flags;
>>>+
>>> 	/* A PE can be associated with a single device or an
>>> 	 * entire bus (& children). In the former case, pdev
>>> 	 * is populated, in the later case, pbus is.
>>>@@ -40,11 +45,6 @@ struct pnv_ioda_pe {
>>> 	 */
>>> 	unsigned int		dma_weight;
>>>
>>>-	/* This is a PCI-E -> PCI-X bridge, this points to the
>>>-	 * corresponding bus PE
>>>-	 */
>>>-	struct pnv_ioda_pe	*bus_pe;
>>>-
>>> 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>>> 	int			tce32_seg;
>>> 	int			tce32_segcount;
>>>-- 
>>>1.7.9.5
>>>
>>>_______________________________________________
>>>Linuxppc-dev mailing list
>>>Linuxppc-dev@lists.ozlabs.org
>>>https://lists.ozlabs.org/listinfo/linuxppc-dev
>>
>>-- 
>>Richard Yang
>>Help you, Help me
>
>_______________________________________________
>Linuxppc-dev mailing list
>Linuxppc-dev@lists.ozlabs.org
>https://lists.ozlabs.org/listinfo/linuxppc-dev

-- 
Richard Yang
Help you, Help me

^ permalink raw reply

* [PATCH v5 0/6] Raid: enable talitos xor offload for improving performance
From: qiang.liu @ 2012-08-01  8:46 UTC (permalink / raw)
  To: linux-crypto, vinod.koul, dan.j.williams, herbert, linuxppc-dev,
	linux-kernel, dan.j.williams

Hi,

The following 6 patches enabling fsl-dma and talitos offload raid
operations for improving raid performance and balancing CPU load.

Write performance will be improved by 25-30% tested by iozone.
Write performance is improved about 2% after using spin_lock_bh replace
spin_lock_irqsave.
CPU load will be reduced by 8%.

Changes in v5:
	- add detail description in patch 3/6 about the process of completed
	descriptor, the process is in align with fsl-dma Reference Manual,
	illustrate the potential risk and how to reproduce it;
	- drop the patch 7/7 in v4 according to Timur's comments;

Changes in v4:
	- fix an error in talitos when dest addr is same with src addr, dest
	should be freed only one time if src is same with dest addr;
	- correct coding style in fsl-dma according to Ira's comments;
	- fix a race condition in fsl-dma fsl_tx_status(), remove the interface
	which is used to free descriptors in queue ld_completed, this interface
	has been included in fsldma_cleanup_descriptor(), in v3, there is one
	place missed spin_lock protect;
	- split the original patch 3/4 up to 2 patches 3/7 and 4/7 according to
	Li Yang's comments;
	- fix a warning of unitialized cookie;
	- add memory copy self test in fsl-dma;
	- add more detail description about use spin_lock_bh() to instead of
	spin_lock_irqsave() according to Timur's comments.

Changes in v3:
	- change release process of fsl-dma descriptor for resolve the
	potential race condition;
	- add test result when use spin_lock_bh replace spin_lock_irqsave;
	- modify the benchmark results according to the latest patch.

Changes in v2:
	- rebase onto cryptodev tree;
	- split the patch 3/4 up to 3 independent patches;
	- remove the patch 4/4, the fix is not for cryptodev tree;

Qiang Liu (6):
      Talitos: Support for async_tx XOR offload
      fsl-dma: remove attribute DMA_INTERRUPT of dmaengine
      fsl-dma: change release process of dma descriptor for supporting async_tx
      fsl-dma: move the function ahead of its invoke function
      fsl-dma: use spin_lock_bh to instead of spin_lock_irqsave
      fsl-dma: fix a warning of unitialized cookie

 drivers/crypto/Kconfig   |    9 +
 drivers/crypto/talitos.c |  413 ++++++++++++++++++++++++++++++++++++++++
 drivers/crypto/talitos.h |   53 ++++++
 drivers/dma/fsldma.c     |  467 ++++++++++++++++++++++++++--------------------
 drivers/dma/fsldma.h     |    1 +
 5 files changed, 739 insertions(+), 204 deletions(-)

^ permalink raw reply

* [PATCH v5 1/6] Talitos: Support for async_tx XOR offload
From: qiang.liu @ 2012-08-01  8:48 UTC (permalink / raw)
  To: linux-crypto, herbert, davem, linux-kernel, linuxppc-dev
  Cc: Qiang Liu, vinod.koul, dan.j.williams, dan.j.williams

From: Qiang Liu <qiang.liu@freescale.com>

Expose Talitos's XOR functionality to be used for RAID parity
calculation via the Async_tx layer.

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Dipen Dudhat <Dipen.Dudhat@freescale.com>
Signed-off-by: Maneesh Gupta <Maneesh.Gupta@freescale.com>
Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
Signed-off-by: Vishnu Suresh <Vishnu@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
---
 drivers/crypto/Kconfig   |    9 +
 drivers/crypto/talitos.c |  413 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/crypto/talitos.h |   53 ++++++
 3 files changed, 475 insertions(+), 0 deletions(-)

diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index be6b2ba..f0a7c29 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -222,6 +222,15 @@ config CRYPTO_DEV_TALITOS
 	  To compile this driver as a module, choose M here: the module
 	  will be called talitos.

+config CRYPTO_DEV_TALITOS_RAIDXOR
+	bool "Talitos RAID5 XOR Calculation Offload"
+	default y
+	select DMA_ENGINE
+	depends on CRYPTO_DEV_TALITOS
+	help
+	  Say 'Y' here to use the Freescale Security Engine (SEC) to
+	  offload RAID XOR parity Calculation
+
 config CRYPTO_DEV_IXP4XX
 	tristate "Driver for IXP4xx crypto hardware acceleration"
 	depends on ARCH_IXP4XX
diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index efff788..b34264e 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -619,6 +619,399 @@ static void talitos_unregister_rng(struct device *dev)
 	hwrng_unregister(&priv->rng);
 }

+#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
+static void talitos_release_xor(struct device *dev, struct talitos_desc *hwdesc,
+				void *context, int error);
+
+static enum dma_status talitos_is_tx_complete(struct dma_chan *chan,
+					      dma_cookie_t cookie,
+					      struct dma_tx_state *state)
+{
+	struct talitos_xor_chan *xor_chan;
+	dma_cookie_t last_used;
+	dma_cookie_t last_complete;
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+
+	last_used = chan->cookie;
+	last_complete = xor_chan->completed_cookie;
+
+	if (state->last)
+		state->last = last_complete;
+
+	if (state->used)
+		state->used = last_used;
+
+	return dma_async_is_complete(cookie, last_complete, last_used);
+}
+
+static void talitos_process_pending(struct talitos_xor_chan *xor_chan)
+{
+	struct talitos_xor_desc *desc, *_desc;
+	unsigned long flags;
+	int status;
+	struct talitos_private *priv;
+	int ch;
+
+	priv = dev_get_drvdata(xor_chan->dev);
+	ch = atomic_inc_return(&priv->last_chan) &
+				  (priv->num_channels - 1);
+	spin_lock_irqsave(&xor_chan->desc_lock, flags);
+
+	list_for_each_entry_safe(desc, _desc, &xor_chan->pending_q, node) {
+		status = talitos_submit(xor_chan->dev, ch, &desc->hwdesc,
+					talitos_release_xor, desc);
+		if (status != -EINPROGRESS)
+			break;
+
+		list_del(&desc->node);
+		list_add_tail(&desc->node, &xor_chan->in_progress_q);
+	}
+
+	spin_unlock_irqrestore(&xor_chan->desc_lock, flags);
+}
+
+static void talitos_xor_run_tx_complete_actions(struct talitos_xor_desc *desc,
+		struct talitos_xor_chan *xor_chan)
+{
+	struct device *dev = xor_chan->dev;
+	dma_addr_t dest, addr;
+	unsigned int src_cnt = desc->unmap_src_cnt;
+	unsigned int len = desc->unmap_len;
+	enum dma_ctrl_flags flags = desc->async_tx.flags;
+	struct dma_async_tx_descriptor *tx = &desc->async_tx;
+
+	/* unmap dma addresses */
+	dest = desc->hwdesc.ptr[6].ptr;
+	if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
+		dma_unmap_page(dev, dest, len, DMA_BIDIRECTIONAL);
+
+	desc->idx = 6 - src_cnt;
+	if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
+		while(desc->idx < 6) {
+			addr = desc->hwdesc.ptr[desc->idx++].ptr;
+			if (addr == dest)
+				continue;
+			dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);
+		}
+	}
+
+	/* run dependent operations */
+	dma_run_dependencies(tx);
+}
+
+static void talitos_release_xor(struct device *dev, struct talitos_desc *hwdesc,
+				void *context, int error)
+{
+	struct talitos_xor_desc *desc = context;
+	struct talitos_xor_chan *xor_chan;
+	dma_async_tx_callback callback;
+	void *callback_param;
+
+	if (unlikely(error))
+		dev_err(dev, "xor operation: talitos error %d\n", error);
+
+	xor_chan = container_of(desc->async_tx.chan, struct talitos_xor_chan,
+				common);
+	spin_lock_bh(&xor_chan->desc_lock);
+	if (xor_chan->completed_cookie < desc->async_tx.cookie)
+		xor_chan->completed_cookie = desc->async_tx.cookie;
+
+	callback = desc->async_tx.callback;
+	callback_param = desc->async_tx.callback_param;
+
+	if (callback) {
+		spin_unlock_bh(&xor_chan->desc_lock);
+		callback(callback_param);
+		spin_lock_bh(&xor_chan->desc_lock);
+	}
+
+	talitos_xor_run_tx_complete_actions(desc, xor_chan);
+
+	list_del(&desc->node);
+	list_add_tail(&desc->node, &xor_chan->free_desc);
+	spin_unlock_bh(&xor_chan->desc_lock);
+	if (!list_empty(&xor_chan->pending_q))
+		talitos_process_pending(xor_chan);
+}
+
+/**
+ * talitos_issue_pending - move the descriptors in submit
+ * queue to pending queue and submit them for processing
+ * @chan: DMA channel
+ */
+static void talitos_issue_pending(struct dma_chan *chan)
+{
+	struct talitos_xor_chan *xor_chan;
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+	spin_lock_bh(&xor_chan->desc_lock);
+	list_splice_tail_init(&xor_chan->submit_q,
+				 &xor_chan->pending_q);
+	spin_unlock_bh(&xor_chan->desc_lock);
+	talitos_process_pending(xor_chan);
+}
+
+static dma_cookie_t talitos_async_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct talitos_xor_desc *desc;
+	struct talitos_xor_chan *xor_chan;
+	dma_cookie_t cookie;
+
+	desc = container_of(tx, struct talitos_xor_desc, async_tx);
+	xor_chan = container_of(tx->chan, struct talitos_xor_chan, common);
+
+	spin_lock_bh(&xor_chan->desc_lock);
+
+	cookie = xor_chan->common.cookie + 1;
+	if (cookie < 0)
+		cookie = 1;
+
+	desc->async_tx.cookie = cookie;
+	xor_chan->common.cookie = desc->async_tx.cookie;
+
+	list_splice_tail_init(&desc->tx_list,
+				 &xor_chan->submit_q);
+
+	spin_unlock_bh(&xor_chan->desc_lock);
+
+	return cookie;
+}
+
+static struct talitos_xor_desc *talitos_xor_alloc_descriptor(
+				struct talitos_xor_chan *xor_chan, gfp_t flags)
+{
+	struct talitos_xor_desc *desc;
+
+	desc = kmalloc(sizeof(*desc), flags);
+	if (desc) {
+		xor_chan->total_desc++;
+		desc->async_tx.tx_submit = talitos_async_tx_submit;
+	}
+
+	return desc;
+}
+
+static void talitos_free_chan_resources(struct dma_chan *chan)
+{
+	struct talitos_xor_chan *xor_chan;
+	struct talitos_xor_desc *desc, *_desc;
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+
+	spin_lock_bh(&xor_chan->desc_lock);
+
+	list_for_each_entry_safe(desc, _desc, &xor_chan->submit_q, node) {
+		list_del(&desc->node);
+		xor_chan->total_desc--;
+		kfree(desc);
+	}
+	list_for_each_entry_safe(desc, _desc, &xor_chan->pending_q, node) {
+		list_del(&desc->node);
+		xor_chan->total_desc--;
+		kfree(desc);
+	}
+	list_for_each_entry_safe(desc, _desc, &xor_chan->in_progress_q, node) {
+		list_del(&desc->node);
+		xor_chan->total_desc--;
+		kfree(desc);
+	}
+	list_for_each_entry_safe(desc, _desc, &xor_chan->free_desc, node) {
+		list_del(&desc->node);
+		xor_chan->total_desc--;
+		kfree(desc);
+	}
+
+	/* Some descriptor not freed? */
+	if (unlikely(xor_chan->total_desc))
+		dev_warn(chan->device->dev, "Failed to free xor channel resource\n");
+
+	spin_unlock_bh(&xor_chan->desc_lock);
+}
+
+static int talitos_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct talitos_xor_chan *xor_chan;
+	struct talitos_xor_desc *desc;
+	LIST_HEAD(tmp_list);
+	int i;
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+
+	if (!list_empty(&xor_chan->free_desc))
+		return xor_chan->total_desc;
+
+	for (i = 0; i < TALITOS_MAX_DESCRIPTOR_NR; i++) {
+		desc = talitos_xor_alloc_descriptor(xor_chan,
+				GFP_KERNEL | GFP_DMA);
+		if (!desc) {
+			dev_err(xor_chan->common.device->dev,
+				"Only %d initial descriptors\n", i);
+			break;
+		}
+		list_add_tail(&desc->node, &tmp_list);
+	}
+
+	if (!i)
+		return -ENOMEM;
+
+	/* At least one desc is allocated */
+	spin_lock_bh(&xor_chan->desc_lock);
+	list_splice_init(&tmp_list, &xor_chan->free_desc);
+	spin_unlock_bh(&xor_chan->desc_lock);
+
+	return xor_chan->total_desc;
+}
+
+static struct dma_async_tx_descriptor *talitos_prep_dma_xor(
+			struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+			unsigned int src_cnt, size_t len, unsigned long flags)
+{
+	struct talitos_xor_chan *xor_chan;
+	struct talitos_xor_desc *new;
+	struct talitos_desc *desc;
+	int i, j;
+
+	BUG_ON(len > TALITOS_MAX_DATA_LEN);
+
+	xor_chan = container_of(chan, struct talitos_xor_chan, common);
+
+	spin_lock_bh(&xor_chan->desc_lock);
+	if (!list_empty(&xor_chan->free_desc)) {
+		new = container_of(xor_chan->free_desc.next,
+				   struct talitos_xor_desc, node);
+		list_del(&new->node);
+	} else {
+		 new = talitos_xor_alloc_descriptor(xor_chan, GFP_KERNEL | GFP_DMA);
+	}
+	spin_unlock_bh(&xor_chan->desc_lock);
+
+	if (!new) {
+		dev_err(xor_chan->common.device->dev,
+			"No free memory for XOR DMA descriptor\n");
+		return NULL;
+	}
+	dma_async_tx_descriptor_init(&new->async_tx, &xor_chan->common);
+
+	INIT_LIST_HEAD(&new->node);
+	INIT_LIST_HEAD(&new->tx_list);
+
+	desc = &new->hwdesc;
+	/* Set destination: Last pointer pair */
+	to_talitos_ptr(&desc->ptr[6], dest);
+	desc->ptr[6].len = cpu_to_be16(len);
+	desc->ptr[6].j_extent = 0;
+	new->unmap_src_cnt = src_cnt;
+	new->unmap_len = len;
+
+	/* Set Sources: End loading from second-last pointer pair */
+	for (i = 5, j = 0; j < src_cnt && i >= 0; i--, j++) {
+		to_talitos_ptr(&desc->ptr[i], src[j]);
+		desc->ptr[i].len = cpu_to_be16(len);
+		desc->ptr[i].j_extent = 0;
+	}
+
+	/*
+	 * documentation states first 0 ptr/len combo marks end of sources
+	 * yet device produces scatter boundary error unless all subsequent
+	 * sources are zeroed out
+	 */
+	for (; i >= 0; i--) {
+		to_talitos_ptr(&desc->ptr[i], 0);
+		desc->ptr[i].len = 0;
+		desc->ptr[i].j_extent = 0;
+	}
+
+	desc->hdr = DESC_HDR_SEL0_AESU | DESC_HDR_MODE0_AESU_XOR |
+		DESC_HDR_TYPE_RAID_XOR;
+
+	new->async_tx.parent = NULL;
+	new->async_tx.next = NULL;
+	new->async_tx.cookie = 0;
+	async_tx_ack(&new->async_tx);
+
+	list_add_tail(&new->node, &new->tx_list);
+
+	new->async_tx.flags = flags;
+	new->async_tx.cookie = -EBUSY;
+
+	return &new->async_tx;
+}
+
+static void talitos_unregister_async_xor(struct device *dev)
+{
+	struct talitos_private *priv = dev_get_drvdata(dev);
+	struct talitos_xor_chan *xor_chan;
+	struct dma_chan *chan, *_chan;
+
+	if (priv->dma_dev_common.chancnt)
+		dma_async_device_unregister(&priv->dma_dev_common);
+
+	list_for_each_entry_safe(chan, _chan, &priv->dma_dev_common.channels,
+				device_node) {
+		xor_chan = container_of(chan, struct talitos_xor_chan,
+					common);
+		list_del(&chan->device_node);
+		priv->dma_dev_common.chancnt--;
+		kfree(xor_chan);
+	}
+}
+
+/**
+ * talitos_register_dma_async - Initialize the Freescale XOR ADMA device
+ * It is registered as a DMA device with the capability to perform
+ * XOR operation with the Async_tx layer.
+ * The various queues and channel resources are also allocated.
+ */
+static int talitos_register_async_tx(struct device *dev, int max_xor_srcs)
+{
+	struct talitos_private *priv = dev_get_drvdata(dev);
+	struct dma_device *dma_dev = &priv->dma_dev_common;
+	struct talitos_xor_chan *xor_chan;
+	int err;
+
+	xor_chan = kzalloc(sizeof(struct talitos_xor_chan), GFP_KERNEL);
+	if (!xor_chan) {
+		dev_err(dev, "unable to allocate xor channel\n");
+		return -ENOMEM;
+	}
+
+	dma_dev->dev = dev;
+	dma_dev->device_alloc_chan_resources = talitos_alloc_chan_resources;
+	dma_dev->device_free_chan_resources = talitos_free_chan_resources;
+	dma_dev->device_prep_dma_xor = talitos_prep_dma_xor;
+	dma_dev->max_xor = max_xor_srcs;
+	dma_dev->device_tx_status = talitos_is_tx_complete;
+	dma_dev->device_issue_pending = talitos_issue_pending;
+	INIT_LIST_HEAD(&dma_dev->channels);
+	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
+
+	xor_chan->dev = dev;
+	xor_chan->common.device = dma_dev;
+	xor_chan->total_desc = 0;
+	INIT_LIST_HEAD(&xor_chan->submit_q);
+	INIT_LIST_HEAD(&xor_chan->pending_q);
+	INIT_LIST_HEAD(&xor_chan->in_progress_q);
+	INIT_LIST_HEAD(&xor_chan->free_desc);
+	spin_lock_init(&xor_chan->desc_lock);
+
+	list_add_tail(&xor_chan->common.device_node, &dma_dev->channels);
+	dma_dev->chancnt++;
+
+	err = dma_async_device_register(dma_dev);
+	if (err) {
+		dev_err(dev, "Unable to register XOR with Async_tx\n");
+		goto err_out;
+	}
+
+	return err;
+
+err_out:
+	talitos_unregister_async_xor(dev);
+	return err;
+}
+#endif
+
 /*
  * crypto alg
  */
@@ -2891,6 +3284,26 @@ static int talitos_probe(struct platform_device *ofdev)
 			dev_info(dev, "hwrng\n");
 	}

+#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
+	/*
+	 * register with async_tx xor, if capable
+	 * SEC 2.x support up to 3 RAID sources,
+	 * SEC 3.x support up to 6
+	 */
+	if (hw_supports(dev, DESC_HDR_SEL0_AESU | DESC_HDR_TYPE_RAID_XOR)) {
+		int max_xor_srcs = 3;
+		if (of_device_is_compatible(np, "fsl,sec3.0"))
+			max_xor_srcs = 6;
+		err = talitos_register_async_tx(dev, max_xor_srcs);
+		if (err) {
+			dev_err(dev, "failed to register async_tx xor: %d\n",
+					err);
+			goto err_out;
+		}
+		dev_info(dev, "max_xor_srcs %d\n", max_xor_srcs);
+	}
+#endif
+
 	/* register crypto algorithms the device supports */
 	for (i = 0; i < ARRAY_SIZE(driver_algs); i++) {
 		if (hw_supports(dev, driver_algs[i].desc_hdr_template)) {
diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
index 61a1405..fc9d125 100644
--- a/drivers/crypto/talitos.h
+++ b/drivers/crypto/talitos.h
@@ -30,6 +30,7 @@

 #define TALITOS_TIMEOUT 100000
 #define TALITOS_MAX_DATA_LEN 65535
+#define TALITOS_MAX_DESCRIPTOR_NR 256

 #define DESC_TYPE(desc_hdr) ((be32_to_cpu(desc_hdr) >> 3) & 0x1f)
 #define PRIMARY_EU(desc_hdr) ((be32_to_cpu(desc_hdr) >> 28) & 0xf)
@@ -131,7 +132,57 @@ struct talitos_private {

 	/* hwrng device */
 	struct hwrng rng;
+
+#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
+	/* XOR Device */
+	struct dma_device dma_dev_common;
+#endif
+};
+
+#ifdef CONFIG_CRYPTO_DEV_TALITOS_RAIDXOR
+/**
+ * talitos_xor_chan - context management for the async_tx channel
+ * @completed_cookie: the last completed cookie
+ * @desc_lock: lock for tx queue
+ * @total_desc: number of descriptors allocated
+ * @submit_q: queue of submitted descriptors
+ * @pending_q: queue of pending descriptors
+ * @in_progress_q: queue of descriptors in progress
+ * @free_desc: queue of unused descriptors
+ * @dev: talitos device implementing this channel
+ * @common: the corresponding xor channel in async_tx
+ */
+struct talitos_xor_chan {
+	dma_cookie_t completed_cookie;
+	spinlock_t desc_lock;
+	unsigned int total_desc;
+	struct list_head submit_q;
+	struct list_head pending_q;
+	struct list_head in_progress_q;
+	struct list_head free_desc;
+	struct device *dev;
+	struct dma_chan common;
+};
+
+/**
+ * talitos_xor_desc - software xor descriptor
+ * @async_tx: the referring async_tx descriptor
+ * @node:
+ * @hwdesc: h/w descriptor
+ * @unmap_src_cnt: number of xor sources
+ * @unmap_len: transaction byte count
+ * @idx: index of xor sources
+ */
+struct talitos_xor_desc {
+	struct dma_async_tx_descriptor async_tx;
+	struct list_head tx_list;
+	struct list_head node;
+	struct talitos_desc hwdesc;
+	unsigned int unmap_src_cnt;
+	unsigned int unmap_len;
+	unsigned int idx;
 };
+#endif

 extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
 			  void (*callback)(struct device *dev,
@@ -284,6 +335,7 @@ extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
 /* primary execution unit mode (MODE0) and derivatives */
 #define	DESC_HDR_MODE0_ENCRYPT		cpu_to_be32(0x00100000)
 #define	DESC_HDR_MODE0_AESU_CBC		cpu_to_be32(0x00200000)
+#define	DESC_HDR_MODE0_AESU_XOR         cpu_to_be32(0x0c600000)
 #define	DESC_HDR_MODE0_DEU_CBC		cpu_to_be32(0x00400000)
 #define	DESC_HDR_MODE0_DEU_3DES		cpu_to_be32(0x00200000)
 #define	DESC_HDR_MODE0_MDEU_CONT	cpu_to_be32(0x08000000)
@@ -344,6 +396,7 @@ extern int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
 #define DESC_HDR_TYPE_IPSEC_ESP			cpu_to_be32(1 << 3)
 #define DESC_HDR_TYPE_COMMON_NONSNOOP_NO_AFEU	cpu_to_be32(2 << 3)
 #define DESC_HDR_TYPE_HMAC_SNOOP_NO_AFEU	cpu_to_be32(4 << 3)
+#define DESC_HDR_TYPE_RAID_XOR                  cpu_to_be32(21 << 3)

 /* link table extent field bits */
 #define DESC_PTR_LNKTBL_JUMP			0x80
--
1.7.5.1

^ permalink raw reply related

* [PATCH v5 2/6] fsl-dma: remove attribute DMA_INTERRUPT of dmaengine
From: qiang.liu @ 2012-08-01  8:49 UTC (permalink / raw)
  To: linux-crypto, linuxppc-dev, linux-kernel, dan.j.williams
  Cc: Vinod Koul, Qiang Liu, herbert, Dan Williams, davem

From: Qiang Liu <qiang.liu@freescale.com>

Delete attribute DMA_INTERRUPT because fsl-dma doesn't support this function,
exception will be thrown if talitos is used to offload xor at the same time.

Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Li Yang <leoli@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
Acked-by: Ira W. Snyder <iws@ovro.caltech.edu>
---
 drivers/dma/fsldma.c |   31 -------------------------------
 1 files changed, 0 insertions(+), 31 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 8f84761..4f2f212 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -543,35 +543,6 @@ static void fsl_dma_free_chan_resources(struct dma_chan *dchan)
 }

 static struct dma_async_tx_descriptor *
-fsl_dma_prep_interrupt(struct dma_chan *dchan, unsigned long flags)
-{
-	struct fsldma_chan *chan;
-	struct fsl_desc_sw *new;
-
-	if (!dchan)
-		return NULL;
-
-	chan = to_fsl_chan(dchan);
-
-	new = fsl_dma_alloc_descriptor(chan);
-	if (!new) {
-		chan_err(chan, "%s\n", msg_ld_oom);
-		return NULL;
-	}
-
-	new->async_tx.cookie = -EBUSY;
-	new->async_tx.flags = flags;
-
-	/* Insert the link descriptor to the LD ring */
-	list_add_tail(&new->node, &new->tx_list);
-
-	/* Set End-of-link to the last link descriptor of new list */
-	set_ld_eol(chan, new);
-
-	return &new->async_tx;
-}
-
-static struct dma_async_tx_descriptor *
 fsl_dma_prep_memcpy(struct dma_chan *dchan,
 	dma_addr_t dma_dst, dma_addr_t dma_src,
 	size_t len, unsigned long flags)
@@ -1352,12 +1323,10 @@ static int __devinit fsldma_of_probe(struct platform_device *op)
 	fdev->irq = irq_of_parse_and_map(op->dev.of_node, 0);

 	dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask);
-	dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask);
 	dma_cap_set(DMA_SG, fdev->common.cap_mask);
 	dma_cap_set(DMA_SLAVE, fdev->common.cap_mask);
 	fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources;
 	fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources;
-	fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt;
 	fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy;
 	fdev->common.device_prep_dma_sg = fsl_dma_prep_sg;
 	fdev->common.device_tx_status = fsl_tx_status;
--
1.7.5.1

^ permalink raw reply related

* [PATCH v5 3/6] fsl-dma: change release process of dma descriptor for supporting async_tx
From: qiang.liu @ 2012-08-01  8:49 UTC (permalink / raw)
  To: linux-crypto, linuxppc-dev, linux-kernel, dan.j.williams
  Cc: Ira W. Snyder, Vinod Koul, Qiang Liu, herbert, Dan Williams,
	davem

From: Qiang Liu <qiang.liu@freescale.com>

Fix the potential risk when enable config NET_DMA and ASYNC_TX.
Async_tx is lack of support in current release process of dma descriptor,
all descriptors will be released whatever is acked or no-acked by async_tx,
so there is a potential race condition when dma engine is uesd by others
clients (e.g. when enable NET_DMA to offload TCP).

In our case, a race condition which is raised when use both of talitos
and dmaengine to offload xor is because napi scheduler will sync all
pending requests in dma channels, it affects the process of raid operations
due to ack_tx is not checked in fsl dma. The no-acked descriptor is freed
which is submitted just now, as a dependent tx, this freed descriptor trigger
BUG_ON(async_tx_test_ack(depend_tx)) in async_tx_submit().

TASK = ee1a94a0[1390] 'md0_raid5' THREAD: ecf40000 CPU: 0
GPR00: 00000001 ecf41ca0 ee44/921a94a0 0000003f 00000001 c00593e4 00000000 00000001
GPR08: 00000000 a7a7a7a7 00000001 045/920000002 42028042 100a38d4 ed576d98 00000000
GPR16: ed5a11b0 00000000 2b162000 00000200 046/920000000 2d555000 ed3015e8 c15a7aa0
GPR24: 00000000 c155fc40 00000000 ecb63220 ecf41d28 e47/92f640bb0 ef640c30 ecf41ca0
NIP [c02b048c] async_tx_submit+0x6c/0x2b4
LR [c02b068c] async_tx_submit+0x26c/0x2b4
Call Trace:
[ecf41ca0] [c02b068c] async_tx_submit+0x26c/0x2b448/92 (unreliable)
[ecf41cd0] [c02b0a4c] async_memcpy+0x240/0x25c
[ecf41d20] [c0421064] async_copy_data+0xa0/0x17c
[ecf41d70] [c0421cf4] __raid_run_ops+0x874/0xe10
[ecf41df0] [c0426ee4] handle_stripe+0x820/0x25e8
[ecf41e90] [c0429080] raid5d+0x3d4/0x5b4
[ecf41f40] [c04329b8] md_thread+0x138/0x16c
[ecf41f90] [c008277c] kthread+0x8c/0x90
[ecf41ff0] [c0011630] kernel_thread+0x4c/0x68

Another major modification in this patch is the change to completed descriptors,
there is a potential risk which caused by exception interrupt, all descriptors
in ld_running list are seemed completed when an interrupt raised, it works fine
under normal condition, but if there is an exception occured, it cannot work
as our excepted. Hardware should not depend on s/w list, the right way is
to read current descriptor address register to find the last completed
descriptor. If an interrupt is raised by an error, all descriptors in ld_running
should not be seemed finished, or these unfinished descriptors in ld_running
will be released wrongly.

A simple way to reproduce,
Enable dmatest first, then insert some bad descriptors which can trigger
Programming Error interrupts before the good descriptors. Last, the good
descriptors will be freed before they are processsed because of the exception
intrerrupt.

Note: the bad descriptors are only for simulating an exception interrupt.
This case can illustrate the potential risk in current fsl-dma very well.

Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dan Williams <dan.j.williams@gmail.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Li Yang <leoli@freescale.com>
Cc: Ira W. Snyder <iws@ovro.caltech.edu>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
---
 drivers/dma/fsldma.c |  242 +++++++++++++++++++++++++++++++++++---------------
 drivers/dma/fsldma.h |    1 +
 2 files changed, 172 insertions(+), 71 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 4f2f212..87f52c0 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -400,6 +400,125 @@ out_splice:
 	list_splice_tail_init(&desc->tx_list, &chan->ld_pending);
 }

+static void fsldma_cleanup_descriptor(struct fsldma_chan *chan);
+static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan);
+
+/**
+ * fsldma_clean_completed_descriptor - free all descriptors which
+ * has been completed and acked
+ * @chan: Freescale DMA channel
+ *
+ * This function is used on all completed and acked descriptors.
+ * All descriptors should only be freed in this function.
+ */
+static int
+fsldma_clean_completed_descriptor(struct fsldma_chan *chan)
+{
+	struct fsl_desc_sw *desc, *_desc;
+
+	/* Run the callback for each descriptor, in order */
+	list_for_each_entry_safe(desc, _desc, &chan->ld_completed, node) {
+
+		if (async_tx_test_ack(&desc->async_tx)) {
+			/* Remove from the list of transactions */
+			list_del(&desc->node);
+#ifdef FSL_DMA_LD_DEBUG
+			chan_dbg(chan, "LD %p free\n", desc);
+#endif
+			dma_pool_free(chan->desc_pool, desc,
+					desc->async_tx.phys);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * fsldma_run_tx_complete_actions - cleanup and free a single link descriptor
+ * @chan: Freescale DMA channel
+ * @desc: descriptor to cleanup and free
+ * @cookie: Freescale DMA transaction identifier
+ *
+ * This function is used on a descriptor which has been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies.
+ */
+static dma_cookie_t fsldma_run_tx_complete_actions(struct fsl_desc_sw *desc,
+		struct fsldma_chan *chan, dma_cookie_t cookie)
+{
+	struct dma_async_tx_descriptor *txd = &desc->async_tx;
+	struct device *dev = chan->common.device->dev;
+	dma_addr_t src = get_desc_src(chan, desc);
+	dma_addr_t dst = get_desc_dst(chan, desc);
+	u32 len = get_desc_cnt(chan, desc);
+
+	BUG_ON(txd->cookie < 0);
+
+	if (txd->cookie > 0) {
+		cookie = txd->cookie;
+
+		/* Run the link descriptor callback function */
+		if (txd->callback) {
+#ifdef FSL_DMA_LD_DEBUG
+			chan_dbg(chan, "LD %p callback\n", desc);
+#endif
+			txd->callback(txd->callback_param);
+		}
+
+		/* Unmap the dst buffer, if requested */
+		if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
+			if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
+				dma_unmap_single(dev, dst, len, DMA_FROM_DEVICE);
+			else
+				dma_unmap_page(dev, dst, len, DMA_FROM_DEVICE);
+		}
+
+		/* Unmap the src buffer, if requested */
+		if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
+			if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
+				dma_unmap_single(dev, src, len, DMA_TO_DEVICE);
+			else
+				dma_unmap_page(dev, src, len, DMA_TO_DEVICE);
+		}
+	}
+
+	/* Run any dependencies */
+	dma_run_dependencies(txd);
+
+	return cookie;
+}
+
+/**
+ * fsldma_clean_running_descriptor - move the completed descriptor from
+ * ld_running to ld_completed
+ * @chan: Freescale DMA channel
+ * @desc: the descriptor which is completed
+ *
+ * Free the descriptor directly if acked by async_tx api, or move it to
+ * queue ld_completed.
+ */
+static int
+fsldma_clean_running_descriptor(struct fsldma_chan *chan,
+		struct fsl_desc_sw *desc)
+{
+	/* Remove from the list of transactions */
+	list_del(&desc->node);
+	/*
+	 * the client is allowed to attach dependent operations
+	 * until 'ack' is set
+	 */
+	if (!async_tx_test_ack(&desc->async_tx)) {
+		/*
+		 * Move this descriptor to the list of descriptors which is
+		 * completed, but still awaiting the 'ack' bit to be set.
+		 */
+		list_add_tail(&desc->node, &chan->ld_completed);
+		return 0;
+	}
+
+	dma_pool_free(chan->desc_pool, desc, desc->async_tx.phys);
+	return 0;
+}
+
 static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 {
 	struct fsldma_chan *chan = to_fsl_chan(tx->chan);
@@ -534,8 +653,10 @@ static void fsl_dma_free_chan_resources(struct dma_chan *dchan)

 	chan_dbg(chan, "free all channel resources\n");
 	spin_lock_irqsave(&chan->desc_lock, flags);
+	fsldma_cleanup_descriptor(chan);
 	fsldma_free_desc_list(chan, &chan->ld_pending);
 	fsldma_free_desc_list(chan, &chan->ld_running);
+	fsldma_free_desc_list(chan, &chan->ld_completed);
 	spin_unlock_irqrestore(&chan->desc_lock, flags);

 	dma_pool_destroy(chan->desc_pool);
@@ -819,46 +940,53 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
  * controller. It will run any callbacks, submit any dependencies, and then
  * free the descriptor.
  */
-static void fsldma_cleanup_descriptor(struct fsldma_chan *chan,
-				      struct fsl_desc_sw *desc)
+static void fsldma_cleanup_descriptor(struct fsldma_chan *chan)
 {
-	struct dma_async_tx_descriptor *txd = &desc->async_tx;
-	struct device *dev = chan->common.device->dev;
-	dma_addr_t src = get_desc_src(chan, desc);
-	dma_addr_t dst = get_desc_dst(chan, desc);
-	u32 len = get_desc_cnt(chan, desc);
+	struct fsl_desc_sw *desc, *_desc;
+	dma_cookie_t cookie = 0;
+	dma_addr_t curr_phys = get_cdar(chan);
+	int idle = dma_is_idle(chan);
+	int seen_current = 0;

-	/* Run the link descriptor callback function */
-	if (txd->callback) {
-#ifdef FSL_DMA_LD_DEBUG
-		chan_dbg(chan, "LD %p callback\n", desc);
-#endif
-		txd->callback(txd->callback_param);
-	}
+	fsldma_clean_completed_descriptor(chan);

-	/* Run any dependencies */
-	dma_run_dependencies(txd);
+	/* Run the callback for each descriptor, in order */
+	list_for_each_entry_safe(desc, _desc, &chan->ld_running, node) {
+		/*
+		 * do not advance past the current descriptor loaded into the
+		 * hardware channel, subsequent descriptors are either in
+		 * process or have not been submitted
+		 */
+		if (seen_current)
+			break;

-	/* Unmap the dst buffer, if requested */
-	if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
-		if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
-			dma_unmap_single(dev, dst, len, DMA_FROM_DEVICE);
-		else
-			dma_unmap_page(dev, dst, len, DMA_FROM_DEVICE);
-	}
+		/*
+		 * stop the search if we reach the current descriptor and the
+		 * channel is busy
+		 */
+		if (desc->async_tx.phys == curr_phys) {
+			seen_current = 1;
+			if (!idle)
+				break;
+		}
+
+		cookie = fsldma_run_tx_complete_actions(desc, chan, cookie);
+
+		if (fsldma_clean_running_descriptor(chan, desc))
+			break;

-	/* Unmap the src buffer, if requested */
-	if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
-		if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
-			dma_unmap_single(dev, src, len, DMA_TO_DEVICE);
-		else
-			dma_unmap_page(dev, src, len, DMA_TO_DEVICE);
 	}

-#ifdef FSL_DMA_LD_DEBUG
-	chan_dbg(chan, "LD %p free\n", desc);
-#endif
-	dma_pool_free(chan->desc_pool, desc, txd->phys);
+	/*
+	 * Start any pending transactions automatically
+	 *
+	 * In the ideal case, we keep the DMA controller busy while we go
+	 * ahead and free the descriptors below.
+	 */
+	fsl_chan_xfer_ld_queue(chan);
+
+	if (cookie > 0)
+		chan->common.completed_cookie = cookie;
 }

 /**
@@ -954,11 +1082,15 @@ static enum dma_status fsl_tx_status(struct dma_chan *dchan,
 	enum dma_status ret;
 	unsigned long flags;

-	spin_lock_irqsave(&chan->desc_lock, flags);
 	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (ret == DMA_SUCCESS)
+		return ret;
+
+	spin_lock_irqsave(&chan->desc_lock, flags);
+	fsldma_cleanup_descriptor(chan);
 	spin_unlock_irqrestore(&chan->desc_lock, flags);

-	return ret;
+	return dma_cookie_status(dchan, cookie, txstate);
 }

 /*----------------------------------------------------------------------------*/
@@ -1035,52 +1167,19 @@ static irqreturn_t fsldma_chan_irq(int irq, void *data)
 static void dma_do_tasklet(unsigned long data)
 {
 	struct fsldma_chan *chan = (struct fsldma_chan *)data;
-	struct fsl_desc_sw *desc, *_desc;
-	LIST_HEAD(ld_cleanup);
 	unsigned long flags;

 	chan_dbg(chan, "tasklet entry\n");

 	spin_lock_irqsave(&chan->desc_lock, flags);

-	/* update the cookie if we have some descriptors to cleanup */
-	if (!list_empty(&chan->ld_running)) {
-		dma_cookie_t cookie;
-
-		desc = to_fsl_desc(chan->ld_running.prev);
-		cookie = desc->async_tx.cookie;
-		dma_cookie_complete(&desc->async_tx);
-
-		chan_dbg(chan, "completed_cookie=%d\n", cookie);
-	}
-
-	/*
-	 * move the descriptors to a temporary list so we can drop the lock
-	 * during the entire cleanup operation
-	 */
-	list_splice_tail_init(&chan->ld_running, &ld_cleanup);
-
 	/* the hardware is now idle and ready for more */
 	chan->idle = true;

-	/*
-	 * Start any pending transactions automatically
-	 *
-	 * In the ideal case, we keep the DMA controller busy while we go
-	 * ahead and free the descriptors below.
-	 */
-	fsl_chan_xfer_ld_queue(chan);
-	spin_unlock_irqrestore(&chan->desc_lock, flags);
-
-	/* Run the callback for each descriptor, in order */
-	list_for_each_entry_safe(desc, _desc, &ld_cleanup, node) {
+	/* Run all cleanup for this descriptor */
+	fsldma_cleanup_descriptor(chan);

-		/* Remove from the list of transactions */
-		list_del(&desc->node);
-
-		/* Run all cleanup for this descriptor */
-		fsldma_cleanup_descriptor(chan, desc);
-	}
+	spin_unlock_irqrestore(&chan->desc_lock, flags);

 	chan_dbg(chan, "tasklet exit\n");
 }
@@ -1262,6 +1361,7 @@ static int __devinit fsl_dma_chan_probe(struct fsldma_device *fdev,
 	spin_lock_init(&chan->desc_lock);
 	INIT_LIST_HEAD(&chan->ld_pending);
 	INIT_LIST_HEAD(&chan->ld_running);
+	INIT_LIST_HEAD(&chan->ld_completed);
 	chan->idle = true;

 	chan->common.device = &fdev->common;
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index f5c3879..7ede908 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -140,6 +140,7 @@ struct fsldma_chan {
 	spinlock_t desc_lock;		/* Descriptor operation lock */
 	struct list_head ld_pending;	/* Link descriptors queue */
 	struct list_head ld_running;	/* Link descriptors queue */
+	struct list_head ld_completed;	/* Link descriptors queue */
 	struct dma_chan common;		/* DMA common channel */
 	struct dma_pool *desc_pool;	/* Descriptors pool */
 	struct device *dev;		/* Channel device */
--
1.7.5.1

^ permalink raw reply related

* [PATCH v5 4/6] fsl-dma: move the function ahead of its invoke function
From: qiang.liu @ 2012-08-01  8:49 UTC (permalink / raw)
  To: linux-crypto, linuxppc-dev, linux-kernel, dan.j.williams
  Cc: Vinod Koul, Qiang Liu, herbert, Dan Williams, davem

From: Qiang Liu <qiang.liu@freescale.com>

Move the function fsldma_cleanup_descriptor() and fsl_chan_xfer_ld_queue()
ahead of its invoke function for avoiding redundant definition.

Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Li Yang <leoli@freescale.com>
Signed-off-by: Qiang Liu <qiang.liu@freescale.com>
---
 drivers/dma/fsldma.c |  252 +++++++++++++++++++++++++-------------------------
 1 files changed, 124 insertions(+), 128 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 87f52c0..bb883c0 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -400,9 +400,6 @@ out_splice:
 	list_splice_tail_init(&desc->tx_list, &chan->ld_pending);
 }

-static void fsldma_cleanup_descriptor(struct fsldma_chan *chan);
-static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan);
-
 /**
  * fsldma_clean_completed_descriptor - free all descriptors which
  * has been completed and acked
@@ -519,6 +516,130 @@ fsldma_clean_running_descriptor(struct fsldma_chan *chan,
 	return 0;
 }

+/**
+ * fsl_chan_xfer_ld_queue - transfer any pending transactions
+ * @chan : Freescale DMA channel
+ *
+ * HARDWARE STATE: idle
+ * LOCKING: must hold chan->desc_lock
+ */
+static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan)
+{
+	struct fsl_desc_sw *desc;
+
+	/*
+	 * If the list of pending descriptors is empty, then we
+	 * don't need to do any work at all
+	 */
+	if (list_empty(&chan->ld_pending)) {
+		chan_dbg(chan, "no pending LDs\n");
+		return;
+	}
+
+	/*
+	 * The DMA controller is not idle, which means that the interrupt
+	 * handler will start any queued transactions when it runs after
+	 * this transaction finishes
+	 */
+	if (!chan->idle) {
+		chan_dbg(chan, "DMA controller still busy\n");
+		return;
+	}
+
+	/*
+	 * If there are some link descriptors which have not been
+	 * transferred, we need to start the controller
+	 */
+
+	/*
+	 * Move all elements from the queue of pending transactions
+	 * onto the list of running transactions
+	 */
+	chan_dbg(chan, "idle, starting controller\n");
+	desc = list_first_entry(&chan->ld_pending, struct fsl_desc_sw, node);
+	list_splice_tail_init(&chan->ld_pending, &chan->ld_running);
+
+	/*
+	 * The 85xx DMA controller doesn't clear the channel start bit
+	 * automatically at the end of a transfer. Therefore we must clear
+	 * it in software before starting the transfer.
+	 */
+	if ((chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) {
+		u32 mode;
+
+		mode = DMA_IN(chan, &chan->regs->mr, 32);
+		mode &= ~FSL_DMA_MR_CS;
+		DMA_OUT(chan, &chan->regs->mr, mode, 32);
+	}
+
+	/*
+	 * Program the descriptor's address into the DMA controller,
+	 * then start the DMA transaction
+	 */
+	set_cdar(chan, desc->async_tx.phys);
+	get_cdar(chan);
+
+	dma_start(chan);
+	chan->idle = false;
+}
+
+/**
+ * fsldma_cleanup_descriptor - cleanup and free a single link descriptor
+ * @chan: Freescale DMA channel
+ * @desc: descriptor to cleanup and free
+ *
+ * This function is used on a descriptor which has been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies, and then
+ * free the descriptor.
+ */
+static void fsldma_cleanup_descriptor(struct fsldma_chan *chan)
+{
+	struct fsl_desc_sw *desc, *_desc;
+	dma_cookie_t cookie = 0;
+	dma_addr_t curr_phys = get_cdar(chan);
+	int idle = dma_is_idle(chan);
+	int seen_current = 0;
+
+	fsldma_clean_completed_descriptor(chan);
+
+	/* Run the callback for each descriptor, in order */
+	list_for_each_entry_safe(desc, _desc, &chan->ld_running, node) {
+		/*
+		 * do not advance past the current descriptor loaded into the
+		 * hardware channel, subsequent descriptors are either in
+		 * process or have not been submitted
+		 */
+		if (seen_current)
+			break;
+
+		/*
+		 * stop the search if we reach the current descriptor and the
+		 * channel is busy
+		 */
+		if (desc->async_tx.phys == curr_phys) {
+			seen_current = 1;
+			if (!idle)
+				break;
+		}
+
+		cookie = fsldma_run_tx_complete_actions(desc, chan, cookie);
+
+		if (fsldma_clean_running_descriptor(chan, desc))
+			break;
+	}
+
+	/*
+	 * Start any pending transactions automatically
+	 *
+	 * In the ideal case, we keep the DMA controller busy while we go
+	 * ahead and free the descriptors below.
+	 */
+	fsl_chan_xfer_ld_queue(chan);
+
+	if (cookie > 0)
+		chan->common.completed_cookie = cookie;
+}
+
 static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 {
 	struct fsldma_chan *chan = to_fsl_chan(tx->chan);
@@ -932,131 +1053,6 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
 }

 /**
- * fsldma_cleanup_descriptor - cleanup and free a single link descriptor
- * @chan: Freescale DMA channel
- * @desc: descriptor to cleanup and free
- *
- * This function is used on a descriptor which has been executed by the DMA
- * controller. It will run any callbacks, submit any dependencies, and then
- * free the descriptor.
- */
-static void fsldma_cleanup_descriptor(struct fsldma_chan *chan)
-{
-	struct fsl_desc_sw *desc, *_desc;
-	dma_cookie_t cookie = 0;
-	dma_addr_t curr_phys = get_cdar(chan);
-	int idle = dma_is_idle(chan);
-	int seen_current = 0;
-
-	fsldma_clean_completed_descriptor(chan);
-
-	/* Run the callback for each descriptor, in order */
-	list_for_each_entry_safe(desc, _desc, &chan->ld_running, node) {
-		/*
-		 * do not advance past the current descriptor loaded into the
-		 * hardware channel, subsequent descriptors are either in
-		 * process or have not been submitted
-		 */
-		if (seen_current)
-			break;
-
-		/*
-		 * stop the search if we reach the current descriptor and the
-		 * channel is busy
-		 */
-		if (desc->async_tx.phys == curr_phys) {
-			seen_current = 1;
-			if (!idle)
-				break;
-		}
-
-		cookie = fsldma_run_tx_complete_actions(desc, chan, cookie);
-
-		if (fsldma_clean_running_descriptor(chan, desc))
-			break;
-
-	}
-
-	/*
-	 * Start any pending transactions automatically
-	 *
-	 * In the ideal case, we keep the DMA controller busy while we go
-	 * ahead and free the descriptors below.
-	 */
-	fsl_chan_xfer_ld_queue(chan);
-
-	if (cookie > 0)
-		chan->common.completed_cookie = cookie;
-}
-
-/**
- * fsl_chan_xfer_ld_queue - transfer any pending transactions
- * @chan : Freescale DMA channel
- *
- * HARDWARE STATE: idle
- * LOCKING: must hold chan->desc_lock
- */
-static void fsl_chan_xfer_ld_queue(struct fsldma_chan *chan)
-{
-	struct fsl_desc_sw *desc;
-
-	/*
-	 * If the list of pending descriptors is empty, then we
-	 * don't need to do any work at all
-	 */
-	if (list_empty(&chan->ld_pending)) {
-		chan_dbg(chan, "no pending LDs\n");
-		return;
-	}
-
-	/*
-	 * The DMA controller is not idle, which means that the interrupt
-	 * handler will start any queued transactions when it runs after
-	 * this transaction finishes
-	 */
-	if (!chan->idle) {
-		chan_dbg(chan, "DMA controller still busy\n");
-		return;
-	}
-
-	/*
-	 * If there are some link descriptors which have not been
-	 * transferred, we need to start the controller
-	 */
-
-	/*
-	 * Move all elements from the queue of pending transactions
-	 * onto the list of running transactions
-	 */
-	chan_dbg(chan, "idle, starting controller\n");
-	desc = list_first_entry(&chan->ld_pending, struct fsl_desc_sw, node);
-	list_splice_tail_init(&chan->ld_pending, &chan->ld_running);
-
-	/*
-	 * The 85xx DMA controller doesn't clear the channel start bit
-	 * automatically at the end of a transfer. Therefore we must clear
-	 * it in software before starting the transfer.
-	 */
-	if ((chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) {
-		u32 mode;
-
-		mode = DMA_IN(chan, &chan->regs->mr, 32);
-		mode &= ~FSL_DMA_MR_CS;
-		DMA_OUT(chan, &chan->regs->mr, mode, 32);
-	}
-
-	/*
-	 * Program the descriptor's address into the DMA controller,
-	 * then start the DMA transaction
-	 */
-	set_cdar(chan, desc->async_tx.phys);
-	get_cdar(chan);
-
-	dma_start(chan);
-	chan->idle = false;
-}
-
-/**
  * fsl_dma_memcpy_issue_pending - Issue the DMA start command
  * @chan : Freescale DMA channel
  */
--
1.7.5.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox