LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH v2 1/4] powerpc: Move the setting of rflags out of loop in __hash_page_huge
From: Li Zhong @ 2013-04-12  2:16 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: paulus, Li Zhong
In-Reply-To: <1365733021-28912-1-git-send-email-zhong@linux.vnet.ibm.com>

It seems that rflags don't get changed in  the repeating loop, so move
it out of the loop.

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
---
 arch/powerpc/mm/hugetlbpage-hash64.c |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index cecad34..edb4129 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -87,10 +87,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 
 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 
-repeat:
-		hpte_group = ((hash & htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
-
 		/* clear HPTE slot informations in new PTE */
 #ifdef CONFIG_PPC_64K_PAGES
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
@@ -101,6 +97,10 @@ repeat:
 		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 				      _PAGE_COHERENT | _PAGE_GUARDED));
 
+repeat:
+		hpte_group = ((hash & htab_hash_mask) *
+			      HPTES_PER_GROUP) & ~0x7UL;
+
 		/* Insert into the hash table, primary slot */
 		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
 					  mmu_psize, ssize);
-- 
1.7.9.5

^ permalink raw reply related

* [RFC PATCH v2 2/4] powerpc: Split the code trying to insert hpte repeatedly as an helper function
From: Li Zhong @ 2013-04-12  2:16 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: paulus, Li Zhong
In-Reply-To: <1365733021-28912-1-git-send-email-zhong@linux.vnet.ibm.com>

Move the logical trying to insert hpte in __hash_page_huge() to an helper
function, so it could also be used by others.

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
---
 arch/powerpc/mm/hash_utils_64.c      |   35 ++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/hugetlbpage-hash64.c |   31 ++++++------------------------
 2 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index f410c3e..716f42b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1230,6 +1230,41 @@ void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc)
 		bad_page_fault(regs, address, SIGBUS);
 }
 
+long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+			   unsigned long pa, unsigned long rflags,
+			   int psize, int ssize)
+{
+	unsigned long hpte_group;
+	long slot;
+
+repeat:
+	hpte_group = ((hash & htab_hash_mask) *
+		       HPTES_PER_GROUP) & ~0x7UL;
+
+	/* Insert into the hash table, primary slot */
+	slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
+				  psize, ssize);
+
+	/* Primary is full, try the secondary */
+	if (unlikely(slot == -1)) {
+		hpte_group = ((~hash & htab_hash_mask) *
+			      HPTES_PER_GROUP) & ~0x7UL;
+		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
+					  HPTE_V_SECONDARY,
+					  psize, ssize);
+		if (slot == -1) {
+			if (mftb() & 0x1)
+				hpte_group = ((hash & htab_hash_mask) *
+					      HPTES_PER_GROUP)&~0x7UL;
+
+			ppc_md.hpte_remove(hpte_group);
+			goto repeat;
+		}
+	}
+
+	return slot;
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
 {
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index edb4129..bd7d38b 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -14,6 +14,10 @@
 #include <asm/cacheflush.h>
 #include <asm/machdep.h>
 
+extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
+			   unsigned long pa, unsigned long rlags,
+			   int psize, int ssize);
+
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		     pte_t *ptep, unsigned long trap, int local, int ssize,
 		     unsigned int shift, unsigned int mmu_psize)
@@ -83,7 +87,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 
 	if (likely(!(old_pte & _PAGE_HASHPTE))) {
 		unsigned long hash = hpt_hash(vpn, shift, ssize);
-		unsigned long hpte_group;
 
 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 
@@ -97,30 +100,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 				      _PAGE_COHERENT | _PAGE_GUARDED));
 
-repeat:
-		hpte_group = ((hash & htab_hash_mask) *
-			      HPTES_PER_GROUP) & ~0x7UL;
-
-		/* Insert into the hash table, primary slot */
-		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
-					  mmu_psize, ssize);
-
-		/* Primary is full, try the secondary */
-		if (unlikely(slot == -1)) {
-			hpte_group = ((~hash & htab_hash_mask) *
-				      HPTES_PER_GROUP) & ~0x7UL;
-			slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags,
-						  HPTE_V_SECONDARY,
-						  mmu_psize, ssize);
-			if (slot == -1) {
-				if (mftb() & 0x1)
-					hpte_group = ((hash & htab_hash_mask) *
-						      HPTES_PER_GROUP)&~0x7UL;
-
-				ppc_md.hpte_remove(hpte_group);
-				goto repeat;
-                        }
-		}
+		slot = hpte_insert_repeating(hash, vpn, pa,
+					     rflags, mmu_psize, ssize);
 
 		/*
 		 * Hypervisor failure. Restore old pte and return -1
-- 
1.7.9.5

^ permalink raw reply related

* [RFC PATCH v2 0/4] try secondary hash before BUG in kernel_map_linear_page()
From: Li Zhong @ 2013-04-12  2:16 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: paulus, Li Zhong

Hi Michael, 

Here is the updated version,  could you please help to review it again? 

As you suggested, this version didn't copy the code, but splitted
the logic into a helper function, so both kernel_map_linear_page() and
__hash_page_huge() can use. 

Also patch #1 moves some unnecessary code out of the repeating loop, so the 
splitting is easier. Patch #3 removes the HPTE_V_BOLTED flag in 
kernel_map_linear_page(), it seems not needed based on my understanding. 

Changes are split into smaller ones, so each one did only one thing. 

Thanks, Zhong

Li Zhong (4):
  powerpc: Move the setting of rflags out of loop in __hash_page_huge
  powerpc: Split the code trying to insert hpte repeatedly as an helper
    function
  powerpc: Don't bolt the hpte in kernel_map_linear_page()
  powerpc: Try to insert the hptes repeatedly in
    kernel_map_linear_page()

 arch/powerpc/mm/hash_utils_64.c      |   45 +++++++++++++++++++++++++++++++---
 arch/powerpc/mm/hugetlbpage-hash64.c |   31 +++++------------------
 2 files changed, 47 insertions(+), 29 deletions(-)

-- 
1.7.9.5

^ permalink raw reply

* Re: [PATCH -V5 22/25] powerpc/THP: get_user_pages_fast changes
From: David Gibson @ 2013-04-12  1:41 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev, linux-mm
In-Reply-To: <1365055083-31956-23-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 3791 bytes --]

On Thu, Apr 04, 2013 at 11:28:00AM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> handle large pages for get_user_pages_fast. Also take care of large page splitting.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/mm/gup.c |   84 +++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 82 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
> index d7efdbf..835c1ae 100644
> --- a/arch/powerpc/mm/gup.c
> +++ b/arch/powerpc/mm/gup.c
> @@ -55,6 +55,72 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
>  	return 1;
>  }
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static inline int gup_huge_pmd(pmd_t *pmdp, unsigned long addr,
> +			       unsigned long end, int write,
> +			       struct page **pages, int *nr)
> +{
> +	int refs;
> +	pmd_t pmd;
> +	unsigned long mask;
> +	struct page *head, *page, *tail;
> +
> +	pmd = *pmdp;
> +	mask = PMD_HUGE_PRESENT | PMD_HUGE_USER;
> +	if (write)
> +		mask |= PMD_HUGE_RW;
> +
> +	if ((pmd_val(pmd) & mask) != mask)
> +		return 0;
> +
> +	/* large pages are never "special" */
> +	VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
> +
> +	refs = 0;
> +	head = pmd_page(pmd);
> +	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> +	tail = page;
> +	do {
> +		VM_BUG_ON(compound_head(page) != head);
> +		pages[*nr] = page;
> +		(*nr)++;
> +		page++;
> +		refs++;
> +	} while (addr += PAGE_SIZE, addr != end);
> +
> +	if (!page_cache_add_speculative(head, refs)) {
> +		*nr -= refs;
> +		return 0;
> +	}
> +
> +	if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) {
> +		*nr -= refs;
> +		while (refs--)
> +			put_page(head);
> +		return 0;
> +	}
> +	/*
> +	 * Any tail page need their mapcount reference taken before we
> +	 * return.
> +	 */
> +	while (refs--) {
> +		if (PageTail(tail))
> +			get_huge_page_tail(tail);
> +		tail++;

Is it safe to do this accounting this late?

> +	}
> +
> +	return 1;
> +}
> +#else
> +
> +static inline int gup_huge_pmd(pmd_t *pmdp, unsigned long addr,
> +			       unsigned long end, int write,
> +			       struct page **pages, int *nr)
> +{
> +	return 1;

Should be a BUG() here, since this should never be called if
!CONFIG_TRANSPARENT_HUGEPAGE.

> +}
> +#endif
> +
>  static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>  		int write, struct page **pages, int *nr)
>  {
> @@ -66,9 +132,23 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>  		pmd_t pmd = *pmdp;
>  
>  		next = pmd_addr_end(addr, end);
> -		if (pmd_none(pmd))
> +		/*
> +		 * The pmd_trans_splitting() check below explains why
> +		 * pmdp_splitting_flush has to flush the tlb, to stop
> +		 * this gup-fast code from running while we set the
> +		 * splitting bit in the pmd. Returning zero will take
> +		 * the slow path that will call wait_split_huge_page()
> +		 * if the pmd is still in splitting state. gup-fast
> +		 * can't because it has irq disabled and
> +		 * wait_split_huge_page() would never return as the
> +		 * tlb flush IPI wouldn't run.
> +		 */
> +		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
>  			return 0;
> -		if (is_hugepd(pmdp)) {
> +		if (unlikely(pmd_large(pmd))) {
> +			if (!gup_huge_pmd(pmdp, addr, next, write, pages, nr))
> +				return 0;
> +		} else if (is_hugepd(pmdp)) {
>  			if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
>  					addr, next, write, pages, nr))
>  				return 0;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [PATCH -V5 19/25] powerpc/THP: Differentiate THP PMD entries from HUGETLB PMD entries
From: David Gibson @ 2013-04-12  1:28 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev, linux-mm
In-Reply-To: <1365055083-31956-20-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 4436 bytes --]

On Thu, Apr 04, 2013 at 11:27:57AM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> HUGETLB clear the top bit of PMD entries and use that to indicate
> a HUGETLB page directory. Since we store pfns in PMDs for THP,
> we would have the top bit cleared by default. Add the top bit mask
> for THP PMD entries and clear that when we are looking for pmd_pfn.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/pgtable.h |   16 +++++++++++++---
>  arch/powerpc/mm/pgtable.c          |    5 ++++-
>  arch/powerpc/mm/pgtable_64.c       |    2 +-
>  3 files changed, 18 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> index 9fbe2a7..9681de4 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -31,7 +31,7 @@ struct mm_struct;
>  #define PMD_HUGE_SPLITTING	0x008
>  #define PMD_HUGE_SAO		0x010 /* strong Access order */
>  #define PMD_HUGE_HASHPTE	0x020
> -#define PMD_ISHUGE		0x040
> +#define _PMD_ISHUGE		0x040
>  #define PMD_HUGE_DIRTY		0x080 /* C: page changed */
>  #define PMD_HUGE_ACCESSED	0x100 /* R: page referenced */
>  #define PMD_HUGE_RW		0x200 /* software: user write access allowed */
> @@ -44,6 +44,14 @@ struct mm_struct;
>  #define PMD_HUGE_RPN_SHIFT	PTE_RPN_SHIFT
>  #define HUGE_PAGE_SIZE		(ASM_CONST(1) << 24)
>  #define HUGE_PAGE_MASK		(~(HUGE_PAGE_SIZE - 1))
> +/*
> + * HugeTLB looks at the top bit of the Linux page table entries to
> + * decide whether it is a huge page directory or not. Mark HUGE
> + * PMD to differentiate
> + */
> +#define PMD_HUGE_NOT_HUGETLB	(ASM_CONST(1) << 63)
> +#define PMD_ISHUGE		(_PMD_ISHUGE | PMD_HUGE_NOT_HUGETLB)

Having a define which looks like the name of a boolean flag, but is
two bits strikes me as a really bad idea.

This is one of the many confusions that comes with different pagetable
encodings for transparent and non-transparent hugepages.

Hrm.  So your original patch was horribly broken in that your hugepage
PMDs didn't have the top bit set, and so would be confused with hugepd
pointers.  Now you're patching it up by forcing the top bit to 1 for
hugepage PMDs.  Confusing way of going about it.

> +#define PMD_HUGE_PROTBITS	(0xfff | PMD_HUGE_NOT_HUGETLB)
>  
>  #ifndef __ASSEMBLY__
>  extern void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
> @@ -70,8 +78,9 @@ static inline int pmd_trans_splitting(pmd_t pmd)
>  
>  static inline int pmd_trans_huge(pmd_t pmd)
>  {
> -	return pmd_val(pmd) & PMD_ISHUGE;
> +	return ((pmd_val(pmd) & PMD_ISHUGE) ==  PMD_ISHUGE);



>  }
> +
>  /* We will enable it in the last patch */
>  #define has_transparent_hugepage() 0
>  #else
> @@ -84,7 +93,8 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
>  	/*
>  	 * Only called for hugepage pmd
>  	 */
> -	return pmd_val(pmd) >> PMD_HUGE_RPN_SHIFT;
> +	unsigned long val = pmd_val(pmd) & ~PMD_HUGE_PROTBITS;
> +	return val  >> PMD_HUGE_RPN_SHIFT;
>  }
>  
>  static inline int pmd_young(pmd_t pmd)
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index 9f33780..cf3ca8e 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c
> @@ -517,7 +517,10 @@ static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
>  pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
>  {
>  	pmd_t pmd;
> -
> +	/*
> +	 * We cannot support that many PFNs
> +	 */
> +	VM_BUG_ON(pfn & PMD_HUGE_NOT_HUGETLB);
>  	pmd_val(pmd) = pfn << PMD_HUGE_RPN_SHIFT;
>  	pmd_val(pmd) |= PMD_ISHUGE;
>  	pmd = pmd_set_protbits(pmd, pgprot);
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index 6fc3488..cd53020 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -345,7 +345,7 @@ EXPORT_SYMBOL(__iounmap_at);
>  struct page *pmd_page(pmd_t pmd)
>  {
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	if (pmd_val(pmd) & PMD_ISHUGE)
> +	if ((pmd_val(pmd) & PMD_ISHUGE) == PMD_ISHUGE)
>  		return pfn_to_page(pmd_pfn(pmd));
>  #endif
>  	return virt_to_page(pmd_page_vaddr(pmd));

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [PATCH -V5 21/25] powerpc: Handle hugepage in perf callchain
From: David Gibson @ 2013-04-12  1:34 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev, linux-mm
In-Reply-To: <1365055083-31956-22-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 2722 bytes --]

On Thu, Apr 04, 2013 at 11:27:59AM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/perf/callchain.c |   32 +++++++++++++++++++++-----------
>  1 file changed, 21 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
> index 578cac7..99262ce 100644
> --- a/arch/powerpc/perf/callchain.c
> +++ b/arch/powerpc/perf/callchain.c
> @@ -115,7 +115,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
>  {
>  	pgd_t *pgdir;
>  	pte_t *ptep, pte;
> -	unsigned shift;
> +	unsigned shift, hugepage;
>  	unsigned long addr = (unsigned long) ptr;
>  	unsigned long offset;
>  	unsigned long pfn;
> @@ -125,20 +125,30 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
>  	if (!pgdir)
>  		return -EFAULT;
>  
> -	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift, NULL);
> +	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift, &hugepage);

So, this patch pretty much demonstrates that your earlier patch adding
the optional hugepage argument and making the existing callers pass
NULL was broken.

Any code which calls this function and doesn't use and handle the
hugepage return value is horribly broken, so permitting the hugepage
parameter to be optional is itself broken.

I think instead you need to have an early patch that replaces
find_linux_pte_or_hugepte with a new, more abstracted interface, so
that code using it will remain correct when hugepage PMDs become
possible.

>  	if (!shift)
>  		shift = PAGE_SHIFT;
>  
> -	/* align address to page boundary */
> -	offset = addr & ((1UL << shift) - 1);
> -	addr -= offset;
> -
> -	if (ptep == NULL)
> -		return -EFAULT;
> -	pte = *ptep;
> -	if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
> +	if (!ptep)
>  		return -EFAULT;
> -	pfn = pte_pfn(pte);
> +
> +	if (hugepage) {
> +		pmd_t pmd = *(pmd_t *)ptep;
> +		shift = mmu_psize_defs[MMU_PAGE_16M].shift;
> +		offset = addr & ((1UL << shift) - 1);
> +
> +		if (!pmd_large(pmd) || !(pmd_val(pmd) & PMD_HUGE_USER))
> +			return -EFAULT;
> +		pfn = pmd_pfn(pmd);
> +	} else {
> +		offset = addr & ((1UL << shift) - 1);
> +
> +		pte = *ptep;
> +		if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
> +			return -EFAULT;
> +		pfn = pte_pfn(pte);
> +	}
> +
>  	if (!page_is_ram(pfn))
>  		return -EFAULT;
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [PATCH -V5 17/25] powerpc/THP: Implement transparent hugepages for ppc64
From: David Gibson @ 2013-04-12  0:51 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, paulus, linux-mm
In-Reply-To: <87sj2xms5u.fsf@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 20883 bytes --]

On Thu, Apr 11, 2013 at 01:10:29PM +0530, Aneesh Kumar K.V wrote:
> David Gibson <dwg@au1.ibm.com> writes:
> 
> > On Thu, Apr 04, 2013 at 11:27:55AM +0530, Aneesh Kumar K.V wrote:
> >> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> >> 
> >> We now have pmd entries covering to 16MB range. To implement THP on powerpc,
> >> we double the size of PMD. The second half is used to deposit the pgtable (PTE page).
> >> We also use the depoisted PTE page for tracking the HPTE information. The information
> >> include [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry.
> >> With 16MB hugepage and 64K HPTE we need 256 entries and with 4K HPTE we need
> >> 4096 entries. Both will fit in a 4K PTE page.
> >> 
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> >> ---
> >>  arch/powerpc/include/asm/page.h              |    2 +-
> >>  arch/powerpc/include/asm/pgtable-ppc64-64k.h |    3 +-
> >>  arch/powerpc/include/asm/pgtable-ppc64.h     |    2 +-
> >>  arch/powerpc/include/asm/pgtable.h           |  240 ++++++++++++++++++++
> >>  arch/powerpc/mm/pgtable.c                    |  314 ++++++++++++++++++++++++++
> >>  arch/powerpc/mm/pgtable_64.c                 |   13 ++
> >>  arch/powerpc/platforms/Kconfig.cputype       |    1 +
> >>  7 files changed, 572 insertions(+), 3 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
> >> index 38e7ff6..b927447 100644
> >> --- a/arch/powerpc/include/asm/page.h
> >> +++ b/arch/powerpc/include/asm/page.h
> >> @@ -40,7 +40,7 @@
> >>  #ifdef CONFIG_HUGETLB_PAGE
> >>  extern unsigned int HPAGE_SHIFT;
> >>  #else
> >> -#define HPAGE_SHIFT PAGE_SHIFT
> >> +#define HPAGE_SHIFT PMD_SHIFT
> >
> > That looks like it could break everything except the 64k page size
> > 64-bit base.
> 
> How about 

It seems very dubious to me to have transparent hugepages enabled
without explicit hugepages in the first place.

> 
> diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
> index b927447..fadb1ce 100644
> --- a/arch/powerpc/include/asm/page.h
> +++ b/arch/powerpc/include/asm/page.h
> @@ -37,10 +37,19 @@
>  #define PAGE_SIZE		(ASM_CONST(1) << PAGE_SHIFT)
>  
>  #ifndef __ASSEMBLY__
> -#ifdef CONFIG_HUGETLB_PAGE
> +/*
> + * With hugetlbfs enabled we allow the HPAGE_SHIFT to run time
> + * configurable. But we enable THP only with 16MB hugepage.
> + * With only THP configured, we force hugepage size to 16MB.
> + * This should ensure that all subarchs that doesn't support
> + * THP continue to work fine with HPAGE_SHIFT usage.
> + */
> +#if defined(CONFIG_HUGETLB_PAGE)
>  extern unsigned int HPAGE_SHIFT;
> -#else
> +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
>  #define HPAGE_SHIFT PMD_SHIFT
> +#else
> +#define HPAGE_SHIFT PAGE_SHIFT
>  #endif
>  #define HPAGE_SIZE		((1UL) << HPAGE_SHIFT)
>  #define HPAGE_MASK		(~(HPAGE_SIZE - 1))
> 
> 
> >
> >>  #endif
> >>  #define HPAGE_SIZE		((1UL) << HPAGE_SHIFT)
> >>  #define HPAGE_MASK		(~(HPAGE_SIZE - 1))
> >> diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
> >> index 3c529b4..5c5541a 100644
> >> --- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
> >> +++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
> >> @@ -33,7 +33,8 @@
> >>  #define PGDIR_MASK	(~(PGDIR_SIZE-1))
> >>  
> >>  /* Bits to mask out from a PMD to get to the PTE page */
> >> -#define PMD_MASKED_BITS		0x1ff
> >> +/* PMDs point to PTE table fragments which are 4K aligned.  */
> >> +#define PMD_MASKED_BITS		0xfff
> >>  /* Bits to mask out from a PGD/PUD to get to the PMD page */
> >>  #define PUD_MASKED_BITS		0x1ff
> >>  
> >> diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
> >> index 0182c20..c0747c7 100644
> >> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
> >> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
> >> @@ -150,7 +150,7 @@
> >>  #define	pmd_present(pmd)	(pmd_val(pmd) != 0)
> >>  #define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
> >>  #define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
> >> -#define pmd_page(pmd)		virt_to_page(pmd_page_vaddr(pmd))
> >> +extern struct page *pmd_page(pmd_t pmd);
> >
> > Does unconditionally changing pmd_page() from a macro to an external
> > function have a noticeable performance impact?
> 
> I did measure performance impact with THP enabled. Didn't do a micro
> benchmark to measure impact of this change. Any suggestion what test
> results you would like to see ?
> 
> >
> >>  #define pud_set(pudp, pudval)	(pud_val(*(pudp)) = (pudval))
> >>  #define pud_none(pud)		(!pud_val(pud))
> >> diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
> >> index 4b52726..9fbe2a7 100644
> >> --- a/arch/powerpc/include/asm/pgtable.h
> >> +++ b/arch/powerpc/include/asm/pgtable.h
> >> @@ -23,7 +23,247 @@ struct mm_struct;
> >>   */
> >>  #define PTE_PAGE_HIDX_OFFSET (PTRS_PER_PTE * 8)
> >>  
> >> +/* A large part matches with pte bits */
> >> +#define PMD_HUGE_PRESENT	0x001 /* software: pte contains a translation */
> >> +#define PMD_HUGE_USER		0x002 /* matches one of the PP bits */
> >> +#define PMD_HUGE_FILE		0x002 /* (!present only) software: pte holds file offset */
> >
> > Can we actually get hugepage PMDs that are in this state?
> 
> Currently we can't, but we would start supporting THP for page cache later.
> 
> >
> >> +#define PMD_HUGE_EXEC		0x004 /* No execute on POWER4 and newer (we invert) */
> >> +#define PMD_HUGE_SPLITTING	0x008
> >> +#define PMD_HUGE_SAO		0x010 /* strong Access order */
> >> +#define PMD_HUGE_HASHPTE	0x020
> >> +#define PMD_ISHUGE		0x040
> >> +#define PMD_HUGE_DIRTY		0x080 /* C: page changed */
> >> +#define PMD_HUGE_ACCESSED	0x100 /* R: page referenced */
> >> +#define PMD_HUGE_RW		0x200 /* software: user write access allowed */
> >> +#define PMD_HUGE_BUSY		0x800 /* software: PTE & hash are busy */
> >> +#define PMD_HUGE_HPTEFLAGS	(PMD_HUGE_BUSY | PMD_HUGE_HASHPTE)
> >> +/*
> >> + * We keep both the pmd and pte rpn shift same, eventhough we use only
> >> + * lower 12 bits for hugepage flags at pmd level
> >
> > Why?
> >
> 
> I was trying to keep PTE level access code to as much similar to huge
> PMD level code. hence retained the same RPN SHIFT, since that would work.
> 
> >> + */
> >> +#define PMD_HUGE_RPN_SHIFT	PTE_RPN_SHIFT
> >> +#define HUGE_PAGE_SIZE		(ASM_CONST(1) << 24)
> >> +#define HUGE_PAGE_MASK		(~(HUGE_PAGE_SIZE - 1))
> >> +
> 
> .....
> 
> >> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> >> index 214130a..9f33780 100644
> >> --- a/arch/powerpc/mm/pgtable.c
> >> +++ b/arch/powerpc/mm/pgtable.c
> >> @@ -31,6 +31,7 @@
> >>  #include <asm/pgalloc.h>
> >>  #include <asm/tlbflush.h>
> >>  #include <asm/tlb.h>
> >> +#include <asm/machdep.h>
> >>  
> >>  #include "mmu_decl.h"
> >>  
> >> @@ -240,3 +241,316 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
> >>  }
> >>  #endif /* CONFIG_DEBUG_VM */
> >>  
> >> +
> >> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >> +static pmd_t set_hugepage_access_flags_filter(pmd_t pmd,
> >> +					      struct vm_area_struct *vma,
> >> +					      int dirty)
> >> +{
> >> +	return pmd;
> >> +}
> >
> > I don't really see why you're splitting out these trivial ...filter()
> > functions, rather than just doing it inline in the (single) caller.
> 
> 
> No specific reason other than keeping the hugepmd related code similar
> to PTE related access code. This should enable us to easily enable THP
> support for subarchs.
> 
> >
> >> +
> >> +/*
> >> + * This is called when relaxing access to a hugepage. It's also called in the page
> >> + * fault path when we don't hit any of the major fault cases, ie, a minor
> >> + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
> >> + * handled those two for us, we additionally deal with missing execute
> >> + * permission here on some processors
> >> + */
> >> +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
> >> +			  pmd_t *pmdp, pmd_t entry, int dirty)
> >> +{
> >> +	int changed;
> >> +	entry = set_hugepage_access_flags_filter(entry, vma, dirty);
> >> +	changed = !pmd_same(*(pmdp), entry);
> >> +	if (changed) {
> >> +		__pmdp_set_access_flags(pmdp, entry);
> >> +		/*
> >> +		 * Since we are not supporting SW TLB systems, we don't
> >> +		 * have any thing similar to flush_tlb_page_nohash()
> >> +		 */
> >> +	}
> >> +	return changed;
> >> +}
> >> +
> >> +int pmdp_test_and_clear_young(struct vm_area_struct *vma,
> >> +			      unsigned long address, pmd_t *pmdp)
> >> +{
> >> +	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
> >> +}
> >> +
> >> +/*
> >> + * We currently remove entries from the hashtable regardless of whether
> >> + * the entry was young or dirty. The generic routines only flush if the
> >> + * entry was young or dirty which is not good enough.
> >> + *
> >> + * We should be more intelligent about this but for the moment we override
> >> + * these functions and force a tlb flush unconditionally
> >> + */
> >> +int pmdp_clear_flush_young(struct vm_area_struct *vma,
> >> +				  unsigned long address, pmd_t *pmdp)
> >> +{
> >> +	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
> >> +}
> >> +
> >> +/*
> >> + * We mark the pmd splitting and invalidate all the hpte
> >> + * entries for this hugepage.
> >> + */
> >> +void pmdp_splitting_flush(struct vm_area_struct *vma,
> >> +			  unsigned long address, pmd_t *pmdp)
> >> +{
> >> +	unsigned long old, tmp;
> >> +
> >> +	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> >> +#ifdef PTE_ATOMIC_UPDATES
> >> +
> >> +	__asm__ __volatile__(
> >> +	"1:	ldarx	%0,0,%3\n\
> >> +		andi.	%1,%0,%6\n\
> >> +		bne-	1b \n\
> >> +		ori	%1,%0,%4 \n\
> >> +		stdcx.	%1,0,%3 \n\
> >> +		bne-	1b"
> >> +	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
> >> +	: "r" (pmdp), "i" (PMD_HUGE_SPLITTING), "m" (*pmdp), "i" (PMD_HUGE_BUSY)
> >> +	: "cc" );
> >> +#else
> >> +	old = pmd_val(*pmdp);
> >> +	*pmdp = __pmd(old | PMD_HUGE_SPLITTING);
> >> +#endif
> >> +	/*
> >> +	 * If we didn't had the splitting flag set, go and flush the
> >> +	 * HPTE entries and serialize against gup fast.
> >> +	 */
> >> +	if (!(old & PMD_HUGE_SPLITTING)) {
> >> +#ifdef CONFIG_PPC_STD_MMU_64
> >> +		/* We need to flush the hpte */
> >> +		if (old & PMD_HUGE_HASHPTE)
> >> +			hpte_need_hugepage_flush(vma->vm_mm, address, pmdp);
> >> +#endif
> >> +		/* need tlb flush only to serialize against gup-fast */
> >> +		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> >> +	}
> >> +}
> >> +
> >> +/*
> >> + * We want to put the pgtable in pmd and use pgtable for tracking
> >> + * the base page size hptes
> >> + */
> >> +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
> >> +				pgtable_t pgtable)
> >> +{
> >> +	unsigned long *pgtable_slot;
> >> +	assert_spin_locked(&mm->page_table_lock);
> >> +	/*
> >> +	 * we store the pgtable in the second half of PMD
> >> +	 */
> >> +	pgtable_slot = pmdp + PTRS_PER_PMD;
> >> +	*pgtable_slot = (unsigned long)pgtable;
> >> +}
> >> +
> >> +#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
> >
> > Another example of why this define should be moved to a header.
> 
> will drop. 
> 
> >
> >> +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
> >> +{
> >> +	pgtable_t pgtable;
> >> +	unsigned long *pgtable_slot;
> >> +
> >> +	assert_spin_locked(&mm->page_table_lock);
> >> +	pgtable_slot = pmdp + PTRS_PER_PMD;
> >> +	pgtable = (pgtable_t) *pgtable_slot;
> >> +	/*
> >> +	 * We store HPTE information in the deposited PTE fragment.
> >> +	 * zero out the content on withdraw.
> >> +	 */
> >> +	memset(pgtable, 0, PTE_FRAG_SIZE);
> >> +	return pgtable;
> >> +}
> >> +
> >> +/*
> >> + * Since we are looking at latest ppc64, we don't need to worry about
> >> + * i/d cache coherency on exec fault
> >> + */
> >> +static pmd_t set_pmd_filter(pmd_t pmd, unsigned long addr)
> >> +{
> >> +	pmd = __pmd(pmd_val(pmd) & ~PMD_HUGE_HPTEFLAGS);
> >> +	return pmd;
> >> +}
> >> +
> >> +/*
> >> + * We can make it less convoluted than __set_pte_at, because
> >> + * we can ignore lot of hardware here, because this is only for
> >> + * MPSS
> >> + */
> >> +static inline void __set_pmd_at(struct mm_struct *mm, unsigned long addr,
> >> +				pmd_t *pmdp, pmd_t pmd, int percpu)
> >> +{
> >> +	/*
> >> +	 * There is nothing in hash page table now, so nothing to
> >> +	 * invalidate, set_pte_at is used for adding new entry.
> >> +	 * For updating we should use update_hugepage_pmd()
> >> +	 */
> >> +	*pmdp = pmd;
> >> +}
> >> +
> >> +/*
> >> + * set a new huge pmd. We should not be called for updating
> >> + * an existing pmd entry. That should go via pmd_hugepage_update.
> >> + */
> >> +void set_pmd_at(struct mm_struct *mm, unsigned long addr,
> >> +		pmd_t *pmdp, pmd_t pmd)
> >> +{
> >> +	/*
> >> +	 * Note: mm->context.id might not yet have been assigned as
> >> +	 * this context might not have been activated yet when this
> >> +	 * is called.
> >> +	 */
> >> +	pmd = set_pmd_filter(pmd, addr);
> >> +
> >> +	__set_pmd_at(mm, addr, pmdp, pmd, 0);
> >> +
> >> +}
> >> +
> >> +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
> >> +		     pmd_t *pmdp)
> >> +{
> >> +	pmd_hugepage_update(vma->vm_mm, address, pmdp, PMD_HUGE_PRESENT);
> >> +	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> >> +}
> >> +
> >> +/*
> >> + * A linux hugepage PMD was changed and the corresponding hash table entry
> >> + * neesd to be flushed.
> >> + *
> >> + * The linux hugepage PMD now include the pmd entries followed by the address
> >> + * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
> >> + * [ secondary group | 3 bit hidx | valid ]. We use one byte per each HPTE entry.
> >> + * With 16MB hugepage and 64K HPTE we need 256 entries and with 4K HPTE we need
> >> + * 4096 entries. Both will fit in a 4K pgtable_t.
> >> + */
> >> +void hpte_need_hugepage_flush(struct mm_struct *mm, unsigned long addr,
> >> +			      pmd_t *pmdp)
> >> +{
> >> +	int ssize, i;
> >> +	unsigned long s_addr;
> >> +	unsigned int psize, valid;
> >> +	unsigned char *hpte_slot_array;
> >> +	unsigned long hidx, vpn, vsid, hash, shift, slot;
> >> +
> >> +	/*
> >> +	 * Flush all the hptes mapping this hugepage
> >> +	 */
> >> +	s_addr = addr & HUGE_PAGE_MASK;
> >> +	/*
> >> +	 * The hpte hindex are stored in the pgtable whose address is in the
> >> +	 * second half of the PMD
> >> +	 */
> >> +	hpte_slot_array = *(char **)(pmdp + PTRS_PER_PMD);
> >> +
> >> +	/* get the base page size */
> >> +	psize = get_slice_psize(mm, s_addr);
> >> +	shift = mmu_psize_defs[psize].shift;
> >> +
> >> +	for (i = 0; i < HUGE_PAGE_SIZE/(1ul << shift); i++) {
> >
> > HUGE_PAGE_SIZE >> shift would be a simpler way to do this calculation.
> 
> Wonder how I missed that
> 
> 
> >
> >> +		/*
> >> +		 * 8 bits per each hpte entries
> >> +		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
> >> +		 */
> >> +		valid = hpte_slot_array[i] & 0x1;
> >> +		if (!valid)
> >> +			continue;
> >> +		hidx =  hpte_slot_array[i]  >> 1;
> >> +
> >> +		/* get the vpn */
> >> +		addr = s_addr + (i * (1ul << shift));
> >> +		if (!is_kernel_addr(addr)) {
> >> +			ssize = user_segment_size(addr);
> >> +			vsid = get_vsid(mm->context.id, addr, ssize);
> >> +			WARN_ON(vsid == 0);
> >> +		} else {
> >> +			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
> >> +			ssize = mmu_kernel_ssize;
> >> +		}
> >> +
> >> +		vpn = hpt_vpn(addr, vsid, ssize);
> >> +		hash = hpt_hash(vpn, shift, ssize);
> >> +		if (hidx & _PTEIDX_SECONDARY)
> >> +			hash = ~hash;
> >> +
> >> +		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> >> +		slot += hidx & _PTEIDX_GROUP_IX;
> >> +		ppc_md.hpte_invalidate(slot, vpn, psize, ssize, 0);
> >> +	}
> >> +}
> >> +
> >> +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
> >> +{
> >> +	unsigned long pmd_prot = 0;
> >> +	unsigned long prot = pgprot_val(pgprot);
> >> +
> >> +	if (prot & _PAGE_PRESENT)
> >> +		pmd_prot |= PMD_HUGE_PRESENT;
> >> +	if (prot & _PAGE_USER)
> >> +		pmd_prot |= PMD_HUGE_USER;
> >> +	if (prot & _PAGE_FILE)
> >> +		pmd_prot |= PMD_HUGE_FILE;
> >> +	if (prot & _PAGE_EXEC)
> >> +		pmd_prot |= PMD_HUGE_EXEC;
> >> +	/*
> >> +	 * _PAGE_COHERENT should always be set
> >> +	 */
> >> +	VM_BUG_ON(!(prot & _PAGE_COHERENT));
> >> +
> >> +	if (prot & _PAGE_SAO)
> >> +		pmd_prot |= PMD_HUGE_SAO;
> >
> > This looks dubious because _PAGE_SAO is not a single bit.  What
> > happens if WRITETHRU or NO_CACHE is set without the other?
> 
> yes that should be 
>     if ((prot & _PAGE_SAO) == _PAGE_SAO )
> 
> 
> >
> >> +	if (prot & _PAGE_DIRTY)
> >> +		pmd_prot |= PMD_HUGE_DIRTY;
> >> +	if (prot & _PAGE_ACCESSED)
> >> +		pmd_prot |= PMD_HUGE_ACCESSED;
> >> +	if (prot & _PAGE_RW)
> >> +		pmd_prot |= PMD_HUGE_RW;
> >> +
> >> +	pmd_val(pmd) |= pmd_prot;
> >> +	return pmd;
> >> +}
> >> +
> >> +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
> >> +{
> >> +	pmd_t pmd;
> >> +
> >> +	pmd_val(pmd) = pfn << PMD_HUGE_RPN_SHIFT;
> >> +	pmd_val(pmd) |= PMD_ISHUGE;
> >> +	pmd = pmd_set_protbits(pmd, pgprot);
> >> +	return pmd;
> >> +}
> >> +
> >> +pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
> >> +{
> >> +	return pfn_pmd(page_to_pfn(page), pgprot);
> >> +}
> >> +
> >> +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
> >> +{
> >> +	/* FIXME!! why are this bits cleared ? */
> >
> > You really need to answer this question...
> 
> will check. 
> 
> >
> >> +	pmd_val(pmd) &= ~(PMD_HUGE_PRESENT |
> >> +			  PMD_HUGE_RW |
> >> +			  PMD_HUGE_EXEC);
> >> +	pmd = pmd_set_protbits(pmd, newprot);
> >> +	return pmd;
> >> +}
> >> +
> >> +/*
> >> + * This is called at the end of handling a user page fault, when the
> >> + * fault has been handled by updating a HUGE PMD entry in the linux page tables.
> >> + * We use it to preload an HPTE into the hash table corresponding to
> >> + * the updated linux HUGE PMD entry.
> >> + */
> >> +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
> >> +			  pmd_t *pmd)
> >> +{
> >> +	/* FIXME!!
> >> +	 * Will be done in a later patch
> >> +	 */
> >
> > If you need another patch to make the code in this patch work, they
> > should probably be folded together.
> >
> 
> I have that as TODO, we can do a hash_preload for hugepage here. But I
> don't see we doing that for HugeTLB. So I haven't yet done that for
> hugepage. Do you know why we don't do hash_preload for HugeTLB page ?
> 
> 
> >> +}
> >> +
> >> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> >> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> >> index e79840b..6fc3488 100644
> >> --- a/arch/powerpc/mm/pgtable_64.c
> >> +++ b/arch/powerpc/mm/pgtable_64.c
> >> @@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
> >>  EXPORT_SYMBOL(__iounmap);
> >>  EXPORT_SYMBOL(__iounmap_at);
> >>  
> >> +/*
> >> + * For hugepage we have pfn in the pmd, we use PMD_HUGE_RPN_SHIFT bits for flags
> >> + * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
> >> + */
> >> +struct page *pmd_page(pmd_t pmd)
> >> +{
> >> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >> +	if (pmd_val(pmd) & PMD_ISHUGE)
> >> +		return pfn_to_page(pmd_pfn(pmd));
> >> +#endif
> >> +	return virt_to_page(pmd_page_vaddr(pmd));
> >> +}
> >> +
> >>  #ifdef CONFIG_PPC_64K_PAGES
> >>  /*
> >>   * we support 16 fragments per PTE page. This is limited by how many
> >> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
> >> index 72afd28..90ee19b 100644
> >> --- a/arch/powerpc/platforms/Kconfig.cputype
> >> +++ b/arch/powerpc/platforms/Kconfig.cputype
> >> @@ -71,6 +71,7 @@ config PPC_BOOK3S_64
> >>  	select PPC_FPU
> >>  	select PPC_HAVE_PMU_SUPPORT
> >>  	select SYS_SUPPORTS_HUGETLBFS
> >> +	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
> >>  
> >>  config PPC_BOOK3E_64
> >>  	bool "Embedded processors"
> 
> -aneesh
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* questions on CONFIG_PPC_ADV_DEBUG_REGS, DBCR0_BRT, and DBCR0_ACTIVE_EVENTS
From: Chris Friesen @ 2013-04-11 23:21 UTC (permalink / raw)
  To: linuxppc-dev

Hi all,

Sorry for the bunch of emails, I'm working on some new stuff and running 
into issues.

For CONFIG_BOOKE it appears that DBCR0_ACTIVE_EVENTS includes DBCR0_ICMP 
but not DBCR0_BRT.  Is that just because none of the code paths 
currently using DBCR0_ACTIVE_EVENTS need to check DBCR0_BT?

Also, in sys_debug_setcontext() why does SIG_DBG_BRANCH_TRACING return 
-EINVAL if CONFIG_PPC_ADV_DEBUG_REGS is set?  Would it not be possible 
to use DBCR0_BRT?

Thanks,
Chris

-- 

Chris Friesen
Software Designer

500 Palladium Drive, Suite 2100
Ottawa, Ontario K2N 1C2, Canada
www.genband.com
office:+1.343.883.2717
chris.friesen@genband.com

^ permalink raw reply

* [BUG?]  possible bug in user_enable_block_step()
From: Chris Friesen @ 2013-04-11 22:38 UTC (permalink / raw)
  To: linuxppc-dev


Hi,

I'm looking at user_enable_block_step() in current kernels and it has 
the following:

#ifdef CONFIG_PPC_ADV_DEBUG_REGS
		task->thread.dbcr0 &= ~DBCR0_IC;
		task->thread.dbcr0 = DBCR0_IDM | DBCR0_BT;
		regs->msr |= MSR_DE;


Should it be as follows?

		task->thread.dbcr0 |= DBCR0_IDM | DBCR0_BT;


If not, then what's the point of clearing the DBCR0_IC bit in the 
previous line?

Chris


-- 

Chris Friesen
Software Designer

500 Palladium Drive, Suite 2100
Ottawa, Ontario K2N 1C2, Canada
www.genband.com
office:+1.343.883.2717
chris.friesen@genband.com

^ permalink raw reply

* Re: [PATCH 1/9] powerpc,kvm: fix imbalance srcu_read_[un]lock()
From: Paul E. McKenney @ 2013-04-11 21:51 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Lai Jiangshan, Gleb Natapov, Marcelo Tosatti, Alexander Graf,
	kvm-ppc, linux-kernel, kvm, Andrew Morton, linuxppc-dev
In-Reply-To: <20130317212648.GA4259@iris.ozlabs.ibm.com>

On Mon, Mar 18, 2013 at 08:26:48AM +1100, Paul Mackerras wrote:
> On Sat, Mar 16, 2013 at 12:50:49AM +0800, Lai Jiangshan wrote:
> > At the point of up_out label in kvmppc_hv_setup_htab_rma(),
> > srcu read lock is still held.
> > 
> > We have to release it before return.
> > 
> > Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> > Cc: Marcelo Tosatti <mtosatti@redhat.com>
> > Cc: Gleb Natapov <gleb@redhat.com>
> > Cc: Alexander Graf <agraf@suse.de>
> > Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> > Cc: Paul Mackerras <paulus@samba.org>
> > Cc: kvm@vger.kernel.org
> > Cc: kvm-ppc@vger.kernel.org
> > ---
> >  arch/powerpc/kvm/book3s_hv.c |    2 +-
> >  1 files changed, 1 insertions(+), 1 deletions(-)
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index 80dcc53..c26740e 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -1799,7 +1799,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
> >  
> >   up_out:
> >  	up_read(&current->mm->mmap_sem);
> > -	goto out;
> > +	goto out_srcu;
> 
> Acked-by: Paul Mackerras <paulus@samba.org>

Thank you both, queued for 3.11 (assuming no one has beat me to it).

							Thanx, Paul

^ permalink raw reply

* Re: [PATCH 0/2] enable the coreint for the mpc85xx 64bit boards
From: Scott Wood @ 2013-04-11 20:45 UTC (permalink / raw)
  To: Kumar Gala; +Cc: Wood Scott-B07421, Kevin Hao, linuxppc
In-Reply-To: <6DEE94C2-19EB-487D-99B5-58789C9230BF@kernel.crashing.org>

On 04/11/2013 08:21:24 AM, Kumar Gala wrote:
>=20
> On Apr 10, 2013, at 8:32 PM, Kevin Hao wrote:
>=20
> > Hi,
> >
> > With the rework of the lazy EE, it seems that 64bit kernel works =20
> pretty
> > well on mpc85xx 64bit boards with lazy EE enabled. So this patch =20
> series
> > tries to enable the coreint for these boards by default. This passed
> > the ltp test on a t4240qds board and is based on Kumar's next =20
> branch.
> >
> > ---
> > Kevin Hao (2):
> >  powerpc/irq: remove the unneeded flag PACA_IRQ_EE_EDGE
> >  powerpc/85xx: enable coreint for all the 64bit boards
> >
> > arch/powerpc/include/asm/hw_irq.h       | 1 -
> > arch/powerpc/kernel/exceptions-64e.S    | 1 -
> > arch/powerpc/kernel/irq.c               | 8 --------
> > arch/powerpc/platforms/85xx/p5020_ds.c  | 5 -----
> > arch/powerpc/platforms/85xx/p5040_ds.c  | 5 -----
> > arch/powerpc/platforms/85xx/t4240_qds.c | 5 -----
> > 6 files changed, 25 deletions(-)
>=20
> Scott,
>=20
> I'd appreciate your review and ack on this change.

ACK

-Scott=

^ permalink raw reply

* Re: [PATCH 9/9] powerpc: cpufreq: move cpufreq driver to drivers/cpufreq
From: Rafael J. Wysocki @ 2013-04-11 20:33 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: robin.randhawa, linux-pm, Liviu.Dudau, linux-kernel, cpufreq,
	Steve.Bannister, Paul Mackerras, Olof Johansson, Arnd Bergmann,
	arvind.chauhan, linuxppc-dev, linaro-kernel, charles.garcia-tobin
In-Reply-To: <CAKohpomHFgMBAHax=jC_LNd8Gay9aWja+64An7QxnNpaPuuSQQ@mail.gmail.com>

On Thursday, April 11, 2013 12:32:50 PM Viresh Kumar wrote:
> On 11 April 2013 12:29, Olof Johansson <olof@lixom.net> wrote:
> > That's not very nice this late in the staging cycle. Give the powerpc
> > guys a little more time than a single day to come back with test results,
> > please.
> 
> Yes we are waiting indefinitely for an Ack from powerpc guys.. If we can
> get a Ack soon, we will get it in 3.10 otherwise next cycle.

Well, I'll be traveling from Saturday morning onwards until the end of the
next week and my window for taking stuff into v3.10 closes in 16 hours.
Whatever is not ready (i.e. ACKed in this particular case) before that time,
won't be taken.

Thanks,
Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* Re: [PATCH] net: mv643xx_eth: Add GRO support
From: David Miller @ 2013-04-11 20:22 UTC (permalink / raw)
  To: sebastian.hesselbarth
  Cc: andrew, jason, linux-kernel, smoch, paulus, linux-arm-kernel,
	dale, netdev, linuxppc-dev, florian, buytenh
In-Reply-To: <1365684023-9967-1-git-send-email-sebastian.hesselbarth@gmail.com>

From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Thu, 11 Apr 2013 14:40:23 +0200

> This patch adds GRO support to mv643xx_eth by making it invoke
> napi_gro_receive instead of netif_receive_skb.
> 
> Signed-off-by: Soeren Moch <smoch@web.de>
> Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH 0/2] net: mv643xx_eth: use managed clk and allocation
From: David Miller @ 2013-04-11 20:20 UTC (permalink / raw)
  To: sebastian.hesselbarth
  Cc: andrew, jason, sergei.shtylyov, linux-doc, devicetree-discuss,
	linux-kernel, rob.herring, paulus, rob, netdev, linuxppc-dev,
	florian, buytenh
In-Reply-To: <1365672574-31123-1-git-send-email-sebastian.hesselbarth@gmail.com>

From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Thu, 11 Apr 2013 11:29:32 +0200

> With introduction of common clock framework and the ability to provide
> gated clocks, mv643xx_eth required calls to get and enable these clock
> gates on some platforms. Back then, common clock framework api wasn't
> safe for architectures without support for common clocks. This has
> changed now and there are also managed (devm_) counterparts for clock
> related functions.
> 
> The second patch in this series, also converts kzalloc to devm_kzalloc
> where applicable.
> 
> Both patches have been sent to the corresponding mailing lists as
> individual patches before. To get the order required to apply them right,
> this patch set combines both patches into one set.

Both applied to net-next, thanks.

^ permalink raw reply

* Re: recommended method of netbooting kernel/dtb in u-boot?
From: Chris Friesen @ 2013-04-11 19:59 UTC (permalink / raw)
  To: Ira W. Snyder; +Cc: linuxppc-dev
In-Reply-To: <20130411195019.GA31025@ovro.caltech.edu>

On 04/11/2013 01:50 PM, Ira W. Snyder wrote:

> I use a hardware setup which sounds similar to yours. The DHCP server
> controls which file is sent to each card.
>
> I use the FIT image format to combine a kernel, dtb, and initrd in one
> package.
>
> <snip>
>
> I used the U-Boot doc/uImage.FIT/*.its examples to get started, and
> wrote my own custom .its file for my board. I don't use anything other
> than the vmlinux.bin.gz provided by the kernel build.

Okay, that's a good data point, thanks.

Chris

^ permalink raw reply

* Re: recommended method of netbooting kernel/dtb in u-boot?
From: Ira W. Snyder @ 2013-04-11 19:50 UTC (permalink / raw)
  To: Chris Friesen; +Cc: linuxppc-dev
In-Reply-To: <51670344.2090101@genband.com>

On Thu, Apr 11, 2013 at 12:39:00PM -0600, Chris Friesen wrote:
> On 04/11/2013 12:12 PM, Kumar Gala wrote:
> >
> > On Apr 11, 2013, at 10:44 AM, Chris Friesen wrote:
> >
> >>
> >> Hi all,
> >>
> >> We've got a powerpc system that uses u-boot.  In our environment on
> >> bootup u-boot does a DHCP to get networking info, then uses TFTP to
> >> get the kernel, which then does DHCP again and NFS-mounts the
> >> initial root filesystem.
> >>
> >> What's the standard practice for this sort of thing when using
> >> device tree blobs?  Do most people use multi-file images or do they
> >> TFTP scripts to load and execute separate kernel/dtb files?
> >
> > We've normally just done multiple tftp fetches and one grabs dtb and
> > one grabs kernel.
> 
> Do you hardcode the path to the file in the firmware?  Or do you upload 
> a script that knows the path to the file?
> 
> In our case the path to the boot file(s) depends on which slot the card 
> being booted has been inserted in.  The DHCP server knows what the path 
> is, so it can set dhcpd.conf appropriately, but we need to get that 
> information to the firmware on the card being booted.
> 

Hello Chris,

I use a hardware setup which sounds similar to yours. The DHCP server
controls which file is sent to each card.

I use the FIT image format to combine a kernel, dtb, and initrd in one
package. Then the U-Boot command "dhcp $address" sets up the network and
tftp's the filename sent by the DHCP server. You don't need to invoke
the U-Boot command "tftp" if you only have one image. "dhcp" is enough.

I used the U-Boot doc/uImage.FIT/*.its examples to get started, and
wrote my own custom .its file for my board. I don't use anything other
than the vmlinux.bin.gz provided by the kernel build.

Hope it helps,
Ira

^ permalink raw reply

* Re: [PATCH] net: mv643xx_eth: remove deprecated inet_lro support
From: Sebastian Hesselbarth @ 2013-04-11 19:52 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andrew Lunn, Jason Cooper, linux-kernel, David S. Miller,
	Soeren Moch, Paul Mackerras, linux-arm-kernel, Dale Farnsworth,
	Ben Hutchings, netdev, linuxppc-dev, Florian Fainelli,
	Lennert Buytenhek, Willy Tarreau
In-Reply-To: <1365709617.3887.185.camel@edumazet-glaptop>

On 04/11/2013 09:46 PM, Eric Dumazet wrote:
> On Thu, 2013-04-11 at 21:11 +0200, Sebastian Hesselbarth wrote:
>> With recent support for GRO, there is no need to keep both LRO and
>> GRO. This patch therefore removes the deprecated inet_lro support
>> from mv643xx_eth. This is work is based on an experimental patch
>> provided by Eric Dumazet and Willy Tarreau.
>>
>> Signed-off-by: Sebastian Hesselbarth<sebastian.hesselbarth@gmail.com>
>> Based-on-patch-by: Eric Dumazet<eric.dumazet@gmail.com>
>> Based-on-patch-by: Willy Tarreau<w@1wt.eu>
>> ---
>> Note: This patch is based upon recent cleanup patches and GRO support
>> patch for mv643xx_eth.
>>
>> Cc: "David S. Miller"<davem@davemloft.net>
>> Cc: Lennert Buytenhek<buytenh@wantstofly.org>
>> Cc: Andrew Lunn<andrew@lunn.ch>
>> Cc: Jason Cooper<jason@lakedaemon.net>
>> Cc: Florian Fainelli<florian@openwrt.org>
>> Cc: Benjamin Herrenschmidt<benh@kernel.crashing.org>
>> Cc: Paul Mackerras<paulus@samba.org>
>> Cc: Dale Farnsworth<dale@farnsworth.org>
>> Cc: Ben Hutchings<bhutchings@solarflare.com>
>> Cc: Soeren Moch<smoch@web.de>
>> Cc: Eric Dumazet<eric.dumazet@gmail.com>
>> Cc: Willy Tarreau<w@1wt.eu>
>> Cc: netdev@vger.kernel.org
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linuxppc-dev@lists.ozlabs.org
>> Cc: linux-kernel@vger.kernel.org
>> ---
>>   drivers/net/ethernet/marvell/mv643xx_eth.c |   97 +---------------------------
>>   1 file changed, 3 insertions(+), 94 deletions(-)
>
> Seems fine to me, but you also could remove "select INET_LRO"
> from drivers/net/ethernet/marvell/Kconfig

Ok, I will wait for tomorrow to see if there are more objections and
respin a v2.

Sebastian

^ permalink raw reply

* Re: [PATCH] net: mv643xx_eth: remove deprecated inet_lro support
From: Eric Dumazet @ 2013-04-11 19:46 UTC (permalink / raw)
  To: Sebastian Hesselbarth
  Cc: Andrew Lunn, Jason Cooper, linux-kernel, David S. Miller,
	Soeren Moch, Paul Mackerras, linux-arm-kernel, Dale Farnsworth,
	Ben Hutchings, netdev, linuxppc-dev, Florian Fainelli,
	Lennert Buytenhek, Willy Tarreau
In-Reply-To: <1365707488-28819-1-git-send-email-sebastian.hesselbarth@gmail.com>

On Thu, 2013-04-11 at 21:11 +0200, Sebastian Hesselbarth wrote:
> With recent support for GRO, there is no need to keep both LRO and
> GRO. This patch therefore removes the deprecated inet_lro support
> from mv643xx_eth. This is work is based on an experimental patch
> provided by Eric Dumazet and Willy Tarreau.
> 
> Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
> Based-on-patch-by: Eric Dumazet <eric.dumazet@gmail.com>
> Based-on-patch-by: Willy Tarreau <w@1wt.eu>
> ---
> Note: This patch is based upon recent cleanup patches and GRO support
> patch for mv643xx_eth.
> 
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Lennert Buytenhek <buytenh@wantstofly.org>
> Cc: Andrew Lunn <andrew@lunn.ch>
> Cc: Jason Cooper <jason@lakedaemon.net>
> Cc: Florian Fainelli <florian@openwrt.org>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Dale Farnsworth <dale@farnsworth.org>
> Cc: Ben Hutchings <bhutchings@solarflare.com>
> Cc: Soeren Moch <smoch@web.de>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Willy Tarreau <w@1wt.eu>
> Cc: netdev@vger.kernel.org
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  drivers/net/ethernet/marvell/mv643xx_eth.c |   97 +---------------------------
>  1 file changed, 3 insertions(+), 94 deletions(-)

Seems fine to me, but you also could remove "select INET_LRO"
from drivers/net/ethernet/marvell/Kconfig

^ permalink raw reply

* [PATCH] net: mv643xx_eth: remove deprecated inet_lro support
From: Sebastian Hesselbarth @ 2013-04-11 19:11 UTC (permalink / raw)
  To: Sebastian Hesselbarth
  Cc: Andrew Lunn, Jason Cooper, Eric Dumazet, linux-kernel,
	David S. Miller, Soeren Moch, Paul Mackerras, linux-arm-kernel,
	Dale Farnsworth, Ben Hutchings, netdev, linuxppc-dev,
	Florian Fainelli, Lennert Buytenhek, Willy Tarreau

With recent support for GRO, there is no need to keep both LRO and
GRO. This patch therefore removes the deprecated inet_lro support
from mv643xx_eth. This is work is based on an experimental patch
provided by Eric Dumazet and Willy Tarreau.

Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Based-on-patch-by: Eric Dumazet <eric.dumazet@gmail.com>
Based-on-patch-by: Willy Tarreau <w@1wt.eu>
---
Note: This patch is based upon recent cleanup patches and GRO support
patch for mv643xx_eth.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Lennert Buytenhek <buytenh@wantstofly.org>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Florian Fainelli <florian@openwrt.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Dale Farnsworth <dale@farnsworth.org>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Soeren Moch <smoch@web.de>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Willy Tarreau <w@1wt.eu>
Cc: netdev@vger.kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/net/ethernet/marvell/mv643xx_eth.c |   97 +---------------------------
 1 file changed, 3 insertions(+), 94 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index c850d04..d0afeea 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -56,8 +56,8 @@
 #include <linux/phy.h>
 #include <linux/mv643xx_eth.h>
 #include <linux/io.h>
+#include <linux/interrupt.h>
 #include <linux/types.h>
-#include <linux/inet_lro.h>
 #include <linux/slab.h>
 #include <linux/clk.h>
 
@@ -316,12 +316,6 @@ struct mib_counters {
 	u32 rx_overrun;
 };
 
-struct lro_counters {
-	u32 lro_aggregated;
-	u32 lro_flushed;
-	u32 lro_no_desc;
-};
-
 struct rx_queue {
 	int index;
 
@@ -335,9 +329,6 @@ struct rx_queue {
 	dma_addr_t rx_desc_dma;
 	int rx_desc_area_size;
 	struct sk_buff **rx_skb;
-
-	struct net_lro_mgr lro_mgr;
-	struct net_lro_desc lro_arr[8];
 };
 
 struct tx_queue {
@@ -373,8 +364,6 @@ struct mv643xx_eth_private {
 	spinlock_t mib_counters_lock;
 	struct mib_counters mib_counters;
 
-	struct lro_counters lro_counters;
-
 	struct work_struct tx_timeout_task;
 
 	struct napi_struct napi;
@@ -503,42 +492,12 @@ static void txq_maybe_wake(struct tx_queue *txq)
 	}
 }
 
-
-/* rx napi ******************************************************************/
-static int
-mv643xx_get_skb_header(struct sk_buff *skb, void **iphdr, void **tcph,
-		       u64 *hdr_flags, void *priv)
-{
-	unsigned long cmd_sts = (unsigned long)priv;
-
-	/*
-	 * Make sure that this packet is Ethernet II, is not VLAN
-	 * tagged, is IPv4, has a valid IP header, and is TCP.
-	 */
-	if ((cmd_sts & (RX_IP_HDR_OK | RX_PKT_IS_IPV4 |
-		       RX_PKT_IS_ETHERNETV2 | RX_PKT_LAYER4_TYPE_MASK |
-		       RX_PKT_IS_VLAN_TAGGED)) !=
-	    (RX_IP_HDR_OK | RX_PKT_IS_IPV4 |
-	     RX_PKT_IS_ETHERNETV2 | RX_PKT_LAYER4_TYPE_TCP_IPV4))
-		return -1;
-
-	skb_reset_network_header(skb);
-	skb_set_transport_header(skb, ip_hdrlen(skb));
-	*iphdr = ip_hdr(skb);
-	*tcph = tcp_hdr(skb);
-	*hdr_flags = LRO_IPV4 | LRO_TCP;
-
-	return 0;
-}
-
 static int rxq_process(struct rx_queue *rxq, int budget)
 {
 	struct mv643xx_eth_private *mp = rxq_to_mp(rxq);
 	struct net_device_stats *stats = &mp->dev->stats;
-	int lro_flush_needed;
 	int rx;
 
-	lro_flush_needed = 0;
 	rx = 0;
 	while (rx < budget && rxq->rx_desc_count) {
 		struct rx_desc *rx_desc;
@@ -599,12 +558,7 @@ static int rxq_process(struct rx_queue *rxq, int budget)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 		skb->protocol = eth_type_trans(skb, mp->dev);
 
-		if (skb->dev->features & NETIF_F_LRO &&
-		    skb->ip_summed == CHECKSUM_UNNECESSARY) {
-			lro_receive_skb(&rxq->lro_mgr, skb, (void *)cmd_sts);
-			lro_flush_needed = 1;
-		} else
-			napi_gro_receive(&mp->napi, skb);
+		napi_gro_receive(&mp->napi, skb);
 
 		continue;
 
@@ -624,9 +578,6 @@ err:
 		dev_kfree_skb(skb);
 	}
 
-	if (lro_flush_needed)
-		lro_flush_all(&rxq->lro_mgr);
-
 	if (rx < budget)
 		mp->work_rx &= ~(1 << rxq->index);
 
@@ -1118,26 +1069,6 @@ static struct net_device_stats *mv643xx_eth_get_stats(struct net_device *dev)
 	return stats;
 }
 
-static void mv643xx_eth_grab_lro_stats(struct mv643xx_eth_private *mp)
-{
-	u32 lro_aggregated = 0;
-	u32 lro_flushed = 0;
-	u32 lro_no_desc = 0;
-	int i;
-
-	for (i = 0; i < mp->rxq_count; i++) {
-		struct rx_queue *rxq = mp->rxq + i;
-
-		lro_aggregated += rxq->lro_mgr.stats.aggregated;
-		lro_flushed += rxq->lro_mgr.stats.flushed;
-		lro_no_desc += rxq->lro_mgr.stats.no_desc;
-	}
-
-	mp->lro_counters.lro_aggregated = lro_aggregated;
-	mp->lro_counters.lro_flushed = lro_flushed;
-	mp->lro_counters.lro_no_desc = lro_no_desc;
-}
-
 static inline u32 mib_read(struct mv643xx_eth_private *mp, int offset)
 {
 	return rdl(mp, MIB_COUNTERS(mp->port_num) + offset);
@@ -1301,10 +1232,6 @@ struct mv643xx_eth_stats {
 	{ #m, FIELD_SIZEOF(struct mib_counters, m),		\
 	  -1, offsetof(struct mv643xx_eth_private, mib_counters.m) }
 
-#define LROSTAT(m)						\
-	{ #m, FIELD_SIZEOF(struct lro_counters, m),		\
-	  -1, offsetof(struct mv643xx_eth_private, lro_counters.m) }
-
 static const struct mv643xx_eth_stats mv643xx_eth_stats[] = {
 	SSTAT(rx_packets),
 	SSTAT(tx_packets),
@@ -1346,9 +1273,6 @@ static const struct mv643xx_eth_stats mv643xx_eth_stats[] = {
 	MIBSTAT(late_collision),
 	MIBSTAT(rx_discard),
 	MIBSTAT(rx_overrun),
-	LROSTAT(lro_aggregated),
-	LROSTAT(lro_flushed),
-	LROSTAT(lro_no_desc),
 };
 
 static int
@@ -1578,7 +1502,6 @@ static void mv643xx_eth_get_ethtool_stats(struct net_device *dev,
 
 	mv643xx_eth_get_stats(dev);
 	mib_counters_update(mp);
-	mv643xx_eth_grab_lro_stats(mp);
 
 	for (i = 0; i < ARRAY_SIZE(mv643xx_eth_stats); i++) {
 		const struct mv643xx_eth_stats *stat;
@@ -1851,19 +1774,6 @@ static int rxq_init(struct mv643xx_eth_private *mp, int index)
 					nexti * sizeof(struct rx_desc);
 	}
 
-	rxq->lro_mgr.dev = mp->dev;
-	memset(&rxq->lro_mgr.stats, 0, sizeof(rxq->lro_mgr.stats));
-	rxq->lro_mgr.features = LRO_F_NAPI;
-	rxq->lro_mgr.ip_summed = CHECKSUM_UNNECESSARY;
-	rxq->lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
-	rxq->lro_mgr.max_desc = ARRAY_SIZE(rxq->lro_arr);
-	rxq->lro_mgr.max_aggr = 32;
-	rxq->lro_mgr.frag_align_pad = 0;
-	rxq->lro_mgr.lro_arr = rxq->lro_arr;
-	rxq->lro_mgr.get_skb_header = mv643xx_get_skb_header;
-
-	memset(&rxq->lro_arr, 0, sizeof(rxq->lro_arr));
-
 	return 0;
 
 
@@ -2851,8 +2761,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 	dev->watchdog_timeo = 2 * HZ;
 	dev->base_addr = 0;
 
-	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM |
-		NETIF_F_RXCSUM | NETIF_F_LRO;
+	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
 	dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
 	dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM;
 
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH] bookehv: Handle debug exception on guest exit
From: Stuart Yoder @ 2013-04-11 18:44 UTC (permalink / raw)
  To: Kumar Gala
  Cc: Wood Scott-B07421, KVM list, Alexander Graf, kvm-ppc,
	Yoder Stuart-B08248, Bhushan Bharat-R65777, linuxppc list
In-Reply-To: <CALRxmdD7ZWj4PM5VTZz_RimmgQAJCX=5yUx=H+7-W30r-PdTtA@mail.gmail.com>

So the patch should look something like this (on a 3.8 kernel):

diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 5f051ee..92b675a 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -286,13 +286,13 @@ label:
        andis.  r10,r10,(DBSR_IC|DBSR_BT)@h;                                  \
        beq+    2f;                                                           \
                                                                              \
-       lis     r10,KERNELBASE@h;       /* check if exception in vectors */   \
-       ori     r10,r10,KERNELBASE@l;                                         \
+       lis     r10,interrupt_base@h;   /* check if exception in vectors */   \
+       ori     r10,r10,interrupt_base@l;
        cmplw   r12,r10;                                                      \
        blt+    2f;                     /* addr below exception vectors */    \
                                                                              \
-       lis     r10,DebugDebug@h;                                             \
-       ori     r10,r10,DebugDebug@l;                                         \
+       lis     r10,interrupt_end@h;                                          \
+       ori     r10,r10,interrupt_end@l;
        cmplw   r12,r10;                                                      \
        bgt+    2f;                     /* addr above exception vectors */    \
                                                                              \
@@ -339,13 +339,13 @@ label:
        andis.  r10,r10,(DBSR_IC|DBSR_BT)@h;                                  \
        beq+    2f;                                                           \
                                                                              \
-       lis     r10,KERNELBASE@h;       /* check if exception in vectors */   \
-       ori     r10,r10,KERNELBASE@l;                                         \
+       lis     r10,interrupt_base@h;   /* check if exception in vectors */   \
+       ori     r10,r10,interrupt_base@l;
        cmplw   r12,r10;                                                      \
        blt+    2f;                     /* addr below exception vectors */    \
                                                                              \
-       lis     r10,DebugCrit@h;                                              \
-       ori     r10,r10,DebugCrit@l;                                          \
+       lis     r10,interrupt_end@h;                                          \
+       ori     r10,r10,interrupt_end@l;
        cmplw   r12,r10;                                                      \
        bgt+    2f;                     /* addr above exception vectors */    \
                                                                              \


diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 7a2e5e4..97e2671 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -769,6 +769,8 @@ finish_tlb_load_47x:
         */
        DEBUG_CRIT_EXCEPTION

+interrupt_end:
+
 /*
  * Global functions
  */


diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl
index 58925b6..2c3e31d 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -605,6 +605,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
        /* Embedded Hypervisor Privilege */
        EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE)

+interrupt_end:
+
 /*
  * Local functions
  */

^ permalink raw reply related

* Re: recommended method of netbooting kernel/dtb in u-boot?
From: Chris Friesen @ 2013-04-11 18:39 UTC (permalink / raw)
  To: Kumar Gala; +Cc: linuxppc-dev
In-Reply-To: <3B81A930-A4AB-4AA7-BC36-FCCE55D33C66@kernel.crashing.org>

On 04/11/2013 12:12 PM, Kumar Gala wrote:
>
> On Apr 11, 2013, at 10:44 AM, Chris Friesen wrote:
>
>>
>> Hi all,
>>
>> We've got a powerpc system that uses u-boot.  In our environment on
>> bootup u-boot does a DHCP to get networking info, then uses TFTP to
>> get the kernel, which then does DHCP again and NFS-mounts the
>> initial root filesystem.
>>
>> What's the standard practice for this sort of thing when using
>> device tree blobs?  Do most people use multi-file images or do they
>> TFTP scripts to load and execute separate kernel/dtb files?
>
> We've normally just done multiple tftp fetches and one grabs dtb and
> one grabs kernel.

Do you hardcode the path to the file in the firmware?  Or do you upload 
a script that knows the path to the file?

In our case the path to the boot file(s) depends on which slot the card 
being booted has been inserted in.  The DHCP server knows what the path 
is, so it can set dhcpd.conf appropriately, but we need to get that 
information to the firmware on the card being booted.

Chris

^ permalink raw reply

* Re: [PATCH] bookehv: Handle debug exception on guest exit
From: Stuart Yoder @ 2013-04-11 18:37 UTC (permalink / raw)
  To: Kumar Gala
  Cc: Wood Scott-B07421, KVM list, Alexander Graf, kvm-ppc,
	Yoder Stuart-B08248, Bhushan Bharat-R65777, linuxppc list
In-Reply-To: <5A04ECFC-E828-4EBF-895B-043FC038ABB5@kernel.crashing.org>

On Thu, Apr 11, 2013 at 1:33 PM, Kumar Gala <galak@kernel.crashing.org> wrote:
>
> On Apr 5, 2013, at 2:53 AM, Bhushan Bharat-R65777 wrote:
>
>> Hi Kumar/Benh,
>>
>> After further looking into the code I think that if we correct the vector range below in DebugDebug handler then we do not need the change I provided in this patch.
>>
>> Here is the snapshot for 32 bit (head_booke.h, same will be true for 64 bit):
>>
>> #define DEBUG_DEBUG_EXCEPTION                                                 \
>>        START_EXCEPTION(DebugDebug);                                          \
>>        DEBUG_EXCEPTION_PROLOG;                                               \
>>                                                                              \
>>        /*                                                                    \
>>         * If there is a single step or branch-taken exception in an          \
>>         * exception entry sequence, it was probably meant to apply to        \
>>         * the code where the exception occurred (since exception entry       \
>>         * doesn't turn off DE automatically).  We simulate the effect        \
>>         * of turning off DE on entry to an exception handler by turning      \
>>         * off DE in the DSRR1 value and clearing the debug status.           \
>>         */                                                                   \
>>        mfspr   r10,SPRN_DBSR;          /* check single-step/branch taken */  \
>>        andis.  r10,r10,(DBSR_IC|DBSR_BT)@h;                                  \
>>        beq+    2f;                                                           \
>>                                                                              \
>>        lis     r10,KERNELBASE@h;       /* check if exception in vectors */   \
>>        ori     r10,r10,KERNELBASE@l;                                         \
>>        cmplw   r12,r10;                                                      \
>>        blt+    2f;                     /* addr below exception vectors */    \
>>                                                                              \
>>        lis     r10,DebugDebug@h;                                        \
>>        ori     r10,r10,DebugDebug@l;                                            \
>>
>> ^^^^
>>       Here we assume all exception vector ends at DebugDebug, which is not correct.
>>       We probably should get proper end by using some start_vector and end_vector lebels
>>       or at least use end at Ehvpriv (which is last defined in head_fsl_booke.S for PowerPC. Is that correct?
>>
>>
>>        cmplw   r12,r10;                                                      \
>>        bgt+    2f;                     /* addr above exception vectors */    \
>>
>> Thanks
>> -Bharat
>
> I talked to Stuart and this general approach is good.  Just make sure to update both head_44x.S and head_fsl_booke.S.  Plus do this for both DEBUG_CRIT_EXCEPTION & DEBUG_DEBUG_EXCEPTION

Also, it looks like 64-bit already handles this properly with symbols
identifying the
start/end of the vectors (exceptions-64e.S).

Stuart

^ permalink raw reply

* Re: [PATCH] bookehv: Handle debug exception on guest exit
From: Kumar Gala @ 2013-04-11 18:33 UTC (permalink / raw)
  To: Bhushan Bharat-R65777
  Cc: Wood Scott-B07421, KVM list, Alexander Graf, kvm-ppc,
	Yoder Stuart-B08248, linuxppc list
In-Reply-To: <6A3DF150A5B70D4F9B66A25E3F7C888D06FC1773@039-SN2MPN1-013.039d.mgd.msft.net>


On Apr 5, 2013, at 2:53 AM, Bhushan Bharat-R65777 wrote:

> Hi Kumar/Benh,
>=20
> After further looking into the code I think that if we correct the =
vector range below in DebugDebug handler then we do not need the change =
I provided in this patch.
>=20
> Here is the snapshot for 32 bit (head_booke.h, same will be true for =
64 bit):
>=20
> #define DEBUG_DEBUG_EXCEPTION                                          =
       \
>        START_EXCEPTION(DebugDebug);                                    =
      \
>        DEBUG_EXCEPTION_PROLOG;                                         =
      \
>                                                                        =
      \
>        /*                                                              =
      \
>         * If there is a single step or branch-taken exception in an    =
      \
>         * exception entry sequence, it was probably meant to apply to  =
      \
>         * the code where the exception occurred (since exception entry =
      \
>         * doesn't turn off DE automatically).  We simulate the effect  =
      \
>         * of turning off DE on entry to an exception handler by =
turning      \
>         * off DE in the DSRR1 value and clearing the debug status.     =
      \
>         */                                                             =
      \
>        mfspr   r10,SPRN_DBSR;          /* check single-step/branch =
taken */  \
>        andis.  r10,r10,(DBSR_IC|DBSR_BT)@h;                            =
      \
>        beq+    2f;                                                     =
      \
>                                                                        =
      \
>        lis     r10,KERNELBASE@h;       /* check if exception in =
vectors */   \
>        ori     r10,r10,KERNELBASE@l;                                   =
      \
>        cmplw   r12,r10;                                                =
      \
>        blt+    2f;                     /* addr below exception vectors =
*/    \
>                                                                        =
      \
>        lis     r10,DebugDebug@h;                                       =
 \
>        ori     r10,r10,DebugDebug@l;                                   =
         \
>=20
> ^^^^
> 	Here we assume all exception vector ends at DebugDebug, which is =
not correct.
> 	We probably should get proper end by using some start_vector and =
end_vector lebels
> 	or at least use end at Ehvpriv (which is last defined in =
head_fsl_booke.S for PowerPC. Is that correct?
>=20
> =09
>        cmplw   r12,r10;                                                =
      \
>        bgt+    2f;                     /* addr above exception vectors =
*/    \
>=20
> Thanks
> -Bharat

I talked to Stuart and this general approach is good.  Just make sure to =
update both head_44x.S and head_fsl_booke.S.  Plus do this for both =
DEBUG_CRIT_EXCEPTION & DEBUG_DEBUG_EXCEPTION

- k=

^ permalink raw reply

* Re: [PATCH 2/5 v11] powerpc: Add iommu domain pointer to device archdata
From: Kumar Gala @ 2013-04-11 18:16 UTC (permalink / raw)
  To: Varun Sethi
  Cc: joro, stuart.yoder, linux-kernel, iommu, scottwood, linuxppc-dev
In-Reply-To: <1364500442-20927-3-git-send-email-Varun.Sethi@freescale.com>


On Mar 28, 2013, at 2:53 PM, Varun Sethi wrote:

> Add an iommu domain pointer to device (powerpc) archdata.  Devices
> are attached to iommu domains and this pointer provides a mechanism
> to correlate between a device and the associated iommu domain.  This
> field is set when a device is attached to a domain.
> 
> Signed-off-by: Varun Sethi <Varun.Sethi@freescale.com>
> ---
> - no change in v11.
> - no change in v10.
> - Added CONFIG_IOMMU_API in v9.
> arch/powerpc/include/asm/device.h |    6 ++++++
> 1 files changed, 6 insertions(+), 0 deletions(-)

Acked-by: Kumar Gala <galak@kernel.crashing.org>

- k

^ permalink raw reply

* Re: recommended method of netbooting kernel/dtb in u-boot?
From: Kumar Gala @ 2013-04-11 18:12 UTC (permalink / raw)
  To: Chris Friesen; +Cc: linuxppc-dev
In-Reply-To: <5166DA49.5000907@genband.com>


On Apr 11, 2013, at 10:44 AM, Chris Friesen wrote:

>=20
> Hi all,
>=20
> We've got a powerpc system that uses u-boot.  In our environment on =
bootup u-boot does a DHCP to get networking info, then uses TFTP to get =
the kernel, which then does DHCP again and NFS-mounts the initial root =
filesystem.
>=20
> What's the standard practice for this sort of thing when using device =
tree blobs?  Do most people use multi-file images or do they TFTP =
scripts to load and execute separate kernel/dtb files?
>=20
> For multi-part images is there an in-kernel way of generating a file =
containing a kernel and dtb combined?  I saw a patch proposed in 2008 to =
add a uImage.<dt> build target but it was shot down because it used the =
legacy multi-file format.  Is there an equivalent build target that uses =
the FIT format?


We've normally just done multiple tftp fetches and one grabs dtb and one =
grabs kernel.

- k=

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox