LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [patch 2/2] mm: use vm_unmapped_area() on powerpc architecture
From: akpm @ 2013-02-21 23:05 UTC (permalink / raw)
  To: benh; +Cc: paulus, akpm, walken, linuxppc-dev

From: Michel Lespinasse <walken@google.com>
Subject: mm: use vm_unmapped_area() on powerpc architecture

Update the powerpc slice_get_unmapped_area function to make use of
vm_unmapped_area() instead of implementing a brute force search.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/powerpc/mm/slice.c |  123 ++++++++++++++++++++++++--------------
 1 file changed, 78 insertions(+), 45 deletions(-)

diff -puN arch/powerpc/mm/slice.c~mm-use-vm_unmapped_area-on-powerpc-architecture arch/powerpc/mm/slice.c
--- a/arch/powerpc/mm/slice.c~mm-use-vm_unmapped_area-on-powerpc-architecture
+++ a/arch/powerpc/mm/slice.c
@@ -237,36 +237,69 @@ static void slice_convert(struct mm_stru
 #endif
 }
 
+/*
+ * Compute which slice addr is part of;
+ * set *boundary_addr to the start or end boundary of that slice
+ * (depending on 'end' parameter);
+ * return boolean indicating if the slice is marked as available in the
+ * 'available' slice_mark.
+ */
+static bool slice_scan_available(unsigned long addr,
+				 struct slice_mask available,
+				 int end,
+				 unsigned long *boundary_addr)
+{
+	unsigned long slice;
+	if (addr < SLICE_LOW_TOP) {
+		slice = GET_LOW_SLICE_INDEX(addr);
+		*boundary_addr = (slice + end) << SLICE_LOW_SHIFT;
+		return !!(available.low_slices & (1u << slice));
+	} else {
+		slice = GET_HIGH_SLICE_INDEX(addr);
+		*boundary_addr = (slice + end) ?
+			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
+		return !!(available.high_slices & (1u << slice));
+	}
+}
+
 static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 					      unsigned long len,
 					      struct slice_mask available,
 					      int psize)
 {
-	struct vm_area_struct *vma;
-	unsigned long addr;
-	struct slice_mask mask;
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+	unsigned long addr, found, next_end;
+	struct vm_unmapped_area_info info;
 
-	addr = TASK_UNMAPPED_BASE;
-
-	for (;;) {
-		addr = _ALIGN_UP(addr, 1ul << pshift);
-		if ((TASK_SIZE - len) < addr)
-			break;
-		vma = find_vma(mm, addr);
-		BUG_ON(vma && (addr >= vma->vm_end));
+	info.flags = 0;
+	info.length = len;
+	info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+	info.align_offset = 0;
 
-		mask = slice_range_to_mask(addr, len);
-		if (!slice_check_fit(mask, available)) {
-			if (addr < SLICE_LOW_TOP)
-				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_LOW_SHIFT);
-			else
-				addr = _ALIGN_UP(addr + 1,  1ul << SLICE_HIGH_SHIFT);
+	addr = TASK_UNMAPPED_BASE;
+	while (addr < TASK_SIZE) {
+		info.low_limit = addr;
+		if (!slice_scan_available(addr, available, 1, &addr))
 			continue;
+
+ next_slice:
+		/*
+		 * At this point [info.low_limit; addr) covers
+		 * available slices only and ends at a slice boundary.
+		 * Check if we need to reduce the range, or if we can
+		 * extend it to cover the next available slice.
+		 */
+		if (addr >= TASK_SIZE)
+			addr = TASK_SIZE;
+		else if (slice_scan_available(addr, available, 1, &next_end)) {
+			addr = next_end;
+			goto next_slice;
 		}
-		if (!vma || addr + len <= vma->vm_start)
-			return addr;
-		addr = vma->vm_end;
+		info.high_limit = addr;
+
+		found = vm_unmapped_area(&info);
+		if (!(found & ~PAGE_MASK))
+			return found;
 	}
 
 	return -ENOMEM;
@@ -277,39 +310,39 @@ static unsigned long slice_find_area_top
 					     struct slice_mask available,
 					     int psize)
 {
-	struct vm_area_struct *vma;
-	unsigned long addr;
-	struct slice_mask mask;
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+	unsigned long addr, found, prev;
+	struct vm_unmapped_area_info info;
 
-	addr = mm->mmap_base;
-	while (addr > len) {
-		/* Go down by chunk size */
-		addr = _ALIGN_DOWN(addr - len, 1ul << pshift);
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	info.length = len;
+	info.align_mask = PAGE_MASK & ((1ul << pshift) - 1);
+	info.align_offset = 0;
 
-		/* Check for hit with different page size */
-		mask = slice_range_to_mask(addr, len);
-		if (!slice_check_fit(mask, available)) {
-			if (addr < SLICE_LOW_TOP)
-				addr = _ALIGN_DOWN(addr, 1ul << SLICE_LOW_SHIFT);
-			else if (addr < (1ul << SLICE_HIGH_SHIFT))
-				addr = SLICE_LOW_TOP;
-			else
-				addr = _ALIGN_DOWN(addr, 1ul << SLICE_HIGH_SHIFT);
+	addr = mm->mmap_base;
+	while (addr > PAGE_SIZE) {
+		info.high_limit = addr;
+		if (!slice_scan_available(addr - 1, available, 0, &addr))
 			continue;
-		}
 
+ prev_slice:
 		/*
-		 * Lookup failure means no vma is above this address,
-		 * else if new region fits below vma->vm_start,
-		 * return with success:
+		 * At this point [addr; info.high_limit) covers
+		 * available slices only and starts at a slice boundary.
+		 * Check if we need to reduce the range, or if we can
+		 * extend it to cover the previous available slice.
 		 */
-		vma = find_vma(mm, addr);
-		if (!vma || (addr + len) <= vma->vm_start)
-			return addr;
+		if (addr < PAGE_SIZE)
+			addr = PAGE_SIZE;
+		else if (slice_scan_available(addr - 1, available, 0, &prev)) {
+			addr = prev;
+			goto prev_slice;
+		}
+		info.low_limit = addr;
 
-		/* try just below the current vma->vm_start */
-		addr = vma->vm_start;
+		found = vm_unmapped_area(&info);
+		if (!(found & ~PAGE_MASK))
+			return found;
 	}
 
 	/*
_

^ permalink raw reply

* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Benjamin Herrenschmidt @ 2013-02-21 23:46 UTC (permalink / raw)
  To: Phileas Fogg; +Cc: linuxppc-dev
In-Reply-To: <5126955B.9070808@mail.ru>

On Thu, 2013-02-21 at 22:44 +0100, Phileas Fogg wrote:
> Stripped OpenWRT image:
> ------------------------
> 
> c00000000001a474:       48 00 00 05     bl      0xc00000000001a478
> c00000000001a478:       7c a8 02 a6     mflr    r5
> c00000000001a47c:       38 a5 00 1c     addi    r5,r5,28
> c00000000001a480:       7c 21 0b 78     mr      r1,r1
> c00000000001a484:       80 85 00 00     lwz     r4,0(r5)
> c00000000001a488:       2c 04 00 00     cmpwi   r4,0
> c00000000001a48c:       40 82 00 62     bnea-   0x60
> c00000000001a490:       4b ff ff f0     b       0xc00000000001a480
> c00000000001a494:       00 00 00 00     .long 0x0
> c00000000001a498:       a0 6d 00 48     lhz     r3,72(r13)
> c00000000001a49c:       48 00 00 11     bl      0xc00000000001a4ac


Smell like a bad stack pointer to me...

One thing I noticed is that kexec doesn't seem to hard disable
interrupts, which is ... fishy at best. It should do that
before it switches stacks around. Dunno if that's the cause
of the problem but it might be worth adding a hard_irq_disable()
after all the local_irq_disable(), making sure we are hard
disabled before going into asm.

Cheers,
Ben.

^ permalink raw reply

* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Benjamin Herrenschmidt @ 2013-02-21 23:47 UTC (permalink / raw)
  To: Phileas Fogg; +Cc: linuxppc-dev
In-Reply-To: <51269A4B.1020501@mail.ru>

On Thu, 2013-02-21 at 23:06 +0100, Phileas Fogg wrote:
> Does it look like the new data at offset 0x80 and 0x88 in DT are MSR
> flags 
> MSR_DR, MSR_IR and MSR_EE ?

Yes, that looks plausible though I would have expected ME to be set as
well ... Or it could be a CCR value. But it does look like something
splattered the DT as if it was a stack... ie, bad r1 value.

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH v6 00/46] CPU hotplug: stop_machine()-free CPU hotplug
From: Rusty Russell @ 2013-02-22  0:31 UTC (permalink / raw)
  To: Srivatsa S. Bhat, tglx, peterz, tj, oleg, paulmck, mingo, akpm,
	namhyung
  Cc: linux-arch, linux, nikunj, linux-pm, fweisbec, linux-doc,
	linux-kernel, rostedt, xiaoguangrong, rjw, sbw, wangyun,
	srivatsa.bhat, netdev, vincent.guittot, walken, linuxppc-dev,
	linux-arm-kernel
In-Reply-To: <20130218123714.26245.61816.stgit@srivatsabhat.in.ibm.com>

"Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com> writes:
> Hi,
>
> This patchset removes CPU hotplug's dependence on stop_machine() from the CPU
> offline path and provides an alternative (set of APIs) to preempt_disable() to
> prevent CPUs from going offline, which can be invoked from atomic context.
> The motivation behind the removal of stop_machine() is to avoid its ill-effects
> and thus improve the design of CPU hotplug. (More description regarding this
> is available in the patches).

If you're doing a v7, please put your benchmark results somewhere!

The obvious place is in the 44/46.

Thanks,
Rusty.

^ permalink raw reply

* Re: [RFC PATCH -V2 05/21] powerpc: Reduce PTE table memory wastage
From: David Gibson @ 2013-02-22  0:32 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-6-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 1013 bytes --]

On Thu, Feb 21, 2013 at 10:17:12PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> We now have PTE page consuming only 2K of the 64K page.This is in order to
> facilitate transparent huge page support, which works much better if our PMDs
> cover 16MB instead of 256MB.
> 
> Inorder to reduce the wastage, we now have multiple PTE page fragment
> from the same PTE page.

This needs a much better description of what you're doing here to
manage the allocations.  It's certainly not easy to figure out from
the code.


[snip]
> +#ifdef CONFIG_PPC_64K_PAGES
> +typedef pte_t *pgtable_t;
> +#else
>  typedef struct page *pgtable_t;
> +#endif

This looks really bogus.  A pgtable_t is a pointer to PTEs on 64K, but
a pointer to a struct page on 4k.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* [PATCH] powerpc: Remove unused postfix parameter to DEFINE_BITOP()
From: Michael Ellerman @ 2013-02-22  3:25 UTC (permalink / raw)
  To: linuxppc-dev

None of the users of DEFINE_BITOP pass a postfix, and as far as I can
tell none ever did, so drop it.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
---
 arch/powerpc/include/asm/bitops.h |   11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index ef918a2..810c5fc 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -55,7 +55,7 @@
 #define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
 
 /* Macro for generating the ***_bits() functions */
-#define DEFINE_BITOP(fn, op, prefix, postfix)	\
+#define DEFINE_BITOP(fn, op, prefix)		\
 static __inline__ void fn(unsigned long mask,	\
 		volatile unsigned long *_p)	\
 {						\
@@ -68,16 +68,15 @@ static __inline__ void fn(unsigned long mask,	\
 	PPC405_ERR77(0,%3)			\
 	PPC_STLCX "%0,0,%3\n"			\
 	"bne- 1b\n"				\
-	postfix					\
 	: "=&r" (old), "+m" (*p)		\
 	: "r" (mask), "r" (p)			\
 	: "cc", "memory");			\
 }
 
-DEFINE_BITOP(set_bits, or, "", "")
-DEFINE_BITOP(clear_bits, andc, "", "")
-DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER, "")
-DEFINE_BITOP(change_bits, xor, "", "")
+DEFINE_BITOP(set_bits, or, "")
+DEFINE_BITOP(clear_bits, andc, "")
+DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER)
+DEFINE_BITOP(change_bits, xor, "")
 
 static __inline__ void set_bit(int nr, volatile unsigned long *addr)
 {
-- 
1.7.10.4

^ permalink raw reply related

* Re: [RFC PATCH -V2 01/21] powerpc: Use signed formatting when printing error
From: Paul Mackerras @ 2013-02-22  5:00 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Thu, Feb 21, 2013 at 10:17:08PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> PAPR define these errors as negative values. So print them accordingly
       ^ defines

> for easy debugging.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Acked-by: Paul Mackerras <paulus@samba.org>

^ permalink raw reply

* Re: [RFC PATCH -V2 02/21] powerpc: Save DAR and DSISR in pt_regs on MCE
From: Paul Mackerras @ 2013-02-22  5:03 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-3-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Thu, Feb 21, 2013 at 10:17:09PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> We were not saving DAR and DSISR on MCE. Save then and also print the values
> along with exception details in xmon.

The one reservation I have about this is that xmon will now be
printing bogus values on 32-bit and embedded processors.  However, it
seems 32-bit doesn't set regs->dar on a DSI (300) interrupt either.
So:

Acked-by: Paul Mackerras <paulus@samba.org>

^ permalink raw reply

* Re: [RFC PATCH -V2 03/21] powerpc: Don't hard code the size of pte page
From: Paul Mackerras @ 2013-02-22  5:06 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-4-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Thu, Feb 21, 2013 at 10:17:10PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> USE PTRS_PER_PTE to indicate the size of pte page.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> powerpc: Don't hard code the size of pte page
> 
> USE PTRS_PER_PTE to indicate the size of pte page.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Description and signoff are duplicated.  Description could be more
informative, for example - why would we want to do this?

> +/*
> + * hidx is in the second half of the page table. We use the
> + * 8 bytes per each pte entry.

The casual reader probably wouldn't know what "hidx" is.  The comment
needs at least to use a better name than "hidx".

Paul.

^ permalink raw reply

* Re: [RFC PATCH -V2 04/21] powerpc: Reduce the PTE_INDEX_SIZE
From: Paul Mackerras @ 2013-02-22  5:07 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-5-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Thu, Feb 21, 2013 at 10:17:11PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> This make one PMD cover 16MB range. That helps in easier implementation of THP
> on power. THP core code make use of one pmd entry to track the huge page and
> the range mapped by a single pmd entry should be equal to the huge page size
> supported by the hardware.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Acked-by: Paul Mackerras <paulus@samba.org>

^ permalink raw reply

* Re: [RFC PATCH -V2 05/21] powerpc: Reduce PTE table memory wastage
From: Aneesh Kumar K.V @ 2013-02-22  5:14 UTC (permalink / raw)
  To: David Gibson; +Cc: paulus, linuxppc-dev, linux-mm
In-Reply-To: <20130222003235.GJ21011@truffula.fritz.box>

David Gibson <david@gibson.dropbear.id.au> writes:

> On Thu, Feb 21, 2013 at 10:17:12PM +0530, Aneesh Kumar K.V wrote:
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 
>> We now have PTE page consuming only 2K of the 64K page.This is in order to
>> facilitate transparent huge page support, which works much better if our PMDs
>> cover 16MB instead of 256MB.
>> 
>> Inorder to reduce the wastage, we now have multiple PTE page fragment
>> from the same PTE page.
>
> This needs a much better description of what you're doing here to
> manage the allocations.  It's certainly not easy to figure out from
> the code.


I will add more detailed description in the commit message.

We allocate one page for the last level of linux page table. With THP and
large page size of 16MB, that would mean we are be wasting large part
of that page. To map 16MB area, we only need a PTE space of 2K with 64K
Page size. This patch reduce the space wastage by sharing the page
allocated for the last level of linux page table with multiple pmd
entries. We call these smaller chunks PTE page fragments and allocated
page, PTE page. We use the page->_mapcount as bitmap to indicate which
PTE fragments are free.


>
>
> [snip]
>> +#ifdef CONFIG_PPC_64K_PAGES
>> +typedef pte_t *pgtable_t;
>> +#else
>>  typedef struct page *pgtable_t;
>> +#endif
>
> This looks really bogus.  A pgtable_t is a pointer to PTEs on 64K, but
> a pointer to a struct page on 4k.
>

We enable all the above only with 64K Pages. 

-aneesh

^ permalink raw reply

* Re: [RFC PATCH -V2 05/21] powerpc: Reduce PTE table memory wastage
From: Paul Mackerras @ 2013-02-22  5:23 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-6-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Thu, Feb 21, 2013 at 10:17:12PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> We now have PTE page consuming only 2K of the 64K page.This is in order to

In fact the PTE page together with the hash table indexes occupies 4k,
doesn't it?  The comments in the code are similarly confusing since
they talk about 2k but actually allocate 4k.

> facilitate transparent huge page support, which works much better if our PMDs
> cover 16MB instead of 256MB.
> 
> Inorder to reduce the wastage, we now have multiple PTE page fragment
  ^ In order (two words)

> from the same PTE page.

A patch like this needs a more complete description and explanation
than you have given.  For instance, you could mention that the code
that you're adding for the 32-bit and non-64k cases are just copies of
the previously generic code from pgalloc.h (actually, this movement
might be something that could be split out as a separate patch).
Also, you should describe in outline how you keep a list of pages that
aren't fully allocated and have a bitmap of which 4k sections are in
use, and also how your scheme interacts with RCU.

[snip]

> +#ifdef CONFIG_PPC_64K_PAGES
> +/*
> + * we support 15 fragments per PTE page. This is limited by how many

Why only 15?  Don't we get 16 fragments per page?

> + * bits we can pack in page->_mapcount. We use the first half for
> + * tracking the usage for rcu page table free.

What does "first" mean?  The high half or the low half?

> +unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
> +{
> +	struct page *page;
> +	unsigned int mask, bit;
> +	unsigned long *table;
> +
> +	/* Allocate fragments of a 4K page as 1K/2K page table */

A 4k page?  Do you mean a 64k page?  And what is 1K to do with
anything?

> +#ifdef CONFIG_SMP
> +static void __page_table_free_rcu(void *table)
> +{
> +	unsigned int bit;
> +	struct page *page;
> +	/*
> +	 * this is a PTE page free 2K page table
> +	 * fragment of a 64K page.
> +	 */
> +	page = virt_to_page(table);
> +	bit = 1 << ((__pa(table) & ~PAGE_MASK) / PTE_FRAG_SIZE);
> +	bit <<= FRAG_MASK_BITS;
> +	/*
> +	 * clear the higher half and if nobody used the page in
> +	 * between, even lower half would be zero.
> +	 */
> +	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
> +		pgtable_page_dtor(page);
> +		atomic_set(&page->_mapcount, -1);
> +		__free_page(page);
> +	}
> +}
> +
> +static void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
> +{
> +	struct page *page;
> +	struct mm_struct *mm;
> +	unsigned int bit, mask;
> +
> +	mm = tlb->mm;
> +	/* Free 2K page table fragment of a 64K page */
> +	page = virt_to_page(table);
> +	bit = 1 << ((__pa(table) & ~PAGE_MASK) / PTE_FRAG_SIZE);
> +	spin_lock(&mm->page_table_lock);
> +	/*
> +	 * stash the actual mask in higher half, and clear the lower half
> +	 * and selectively, add remove from pgtable list
> +	 */
> +	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << FRAG_MASK_BITS));
> +	if (!(mask & FRAG_MASK))
> +		list_del(&page->lru);
> +	else {
> +		/*
> +		 * Add the page table page to pgtable_list so that
> +		 * the free fragment can be used by the next alloc
> +		 */
> +		list_del_init(&page->lru);
> +		list_add_tail(&page->lru, &mm->context.pgtable_list);
> +	}
> +	spin_unlock(&mm->page_table_lock);
> +	tlb_remove_table(tlb, table);
> +}

This looks like you're allowing a fragment that is being freed to be
reallocated and used again during the grace period when we are waiting
for any references to the fragment to disappear.  Doesn't that allow a
race where one CPU traversing the page table and using the fragment in
its old location in the tree could see a PTE created after the
fragment was reallocated?  In other words, why is it safe to allow the
fragment to be used during the grace period?  If it is safe, it at
least needs a comment explaining why.

Paul.

^ permalink raw reply

* Re: [RFC PATCH -V2 06/21] powerpc: Add size argument to pgtable_cache_add
From: Paul Mackerras @ 2013-02-22  5:27 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-7-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Thu, Feb 21, 2013 at 10:17:13PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> We will use this later with THP changes. With THP we want to create PMD with
> twice the size. The second half will be used to depoist pgtable, which will
                                                  ^ deposit?
> carry the hpte hash index value

I'm not familiar with what "deposit" and "withdraw" mean in the THP
context.  If you can find a way to make the patch description more
informative for people who are not completely familiar with THP
(without adding a full-blown description of THP, of course) that would
be good.

Paul.

^ permalink raw reply

* Re: [RFC PATCH -V2 07/21] powerpc: Use encode avpn where we need only avpn values
From: Paul Mackerras @ 2013-02-22  5:28 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-8-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Thu, Feb 21, 2013 at 10:17:14PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Needs a patch description.  What is the motivation for doing this?  Is
the new code completely equivalent to the old, or if not, what are the
differences?  Etc.

Paul.

^ permalink raw reply

* Re: [RFC PATCH -V2 08/21] powerpc: Decode the pte-lp-encoding bits correctly.
From: Paul Mackerras @ 2013-02-22  5:37 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <1361465248-10867-9-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Thu, Feb 21, 2013 at 10:17:15PM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> We look at both the segment base page size and actual page size and store
> the pte-lp-encodings in an array per base page size.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

This needs more than 2 lines of patch description.  In fact what
you're doing is adding general mixed page-size segment (MPSS)
support.  Doing this should mean that you can also get rid of the
MMU_PAGE_64K_AP value from the list in asm/mmu.h.

>  struct mmu_psize_def
>  {
>  	unsigned int	shift;	/* number of bits */
> -	unsigned int	penc;	/* HPTE encoding */
> +	unsigned int	penc[MMU_PAGE_COUNT];	/* HPTE encoding */

I guess this is reasonable, though adding space for 14 page size
encodings seems a little bit over the top.  Also, you don't seem to
have any way to indicate which encodings are valid, since 0 is a valid
encoding.  Maybe you need to add a valid bit higher up to indicate
which page sizes are valid.

> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 71d0c90..d2c9932 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1515,7 +1515,12 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
>  	(*sps)->page_shift = def->shift;
>  	(*sps)->slb_enc = def->sllp;
>  	(*sps)->enc[0].page_shift = def->shift;
> -	(*sps)->enc[0].pte_enc = def->penc;
> +	/*
> +	 * FIXME!!
> +	 * This is returned to user space. Do we need to
> +	 * return details of MPSS here ?

Yes, we do, probably a separate entry for each valid base/actual page
size pair.

> +static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
> +{
> +	unsigned int mask;
> +	int i, penc, shift;
> +	/* Look at the 8 bit LP value */
> +	unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << (LP_BITS + 1)) - 1);

Why LP_BITS + 1 here?  You seem to be extracting and comparing 9 bits
rather than 8.  Why is that?

> @@ -395,12 +422,13 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
>  			/* valid entries have a shift value */
>  			if (!mmu_psize_defs[size].shift)
>  				continue;
> -
> -			if (penc == mmu_psize_defs[size].penc)
> -				break;
> +			for (a_size = 0; a_size < MMU_PAGE_COUNT; a_size++)
> +				if (penc == mmu_psize_defs[size].penc[a_size])
> +					goto out;

I think this will get false matches due to unused/invalid entries
in mmu_psize_defs[size].penc[] containing 0.

Paul.

^ permalink raw reply

* RE: [RFC][PATCH] powerpc: add Book E support to 64-bit hibernation
From: Wang Dongsheng-B40534 @ 2013-02-22  7:35 UTC (permalink / raw)
  To: benh@kernel.crashing.org, johannes@sipsolutions.net,
	linuxppc-dev@lists.ozlabs.org
  Cc: Wood Scott-B07421, Li Yang-R58472, Zhao Chenhui-B35336
In-Reply-To: <1360203915-22112-1-git-send-email-dongsheng.wang@freescale.com>

Hi Benjamin & Johannes,

Any thoughts about this patch?

> -----Original Message-----
> From: Wang Dongsheng-B40534
> Sent: Thursday, February 07, 2013 10:25 AM
> To: linuxppc-dev@lists.ozlabs.org
> Cc: Wood Scott-B07421; Li Yang-R58472; Zhao Chenhui-B35336; Wang
> Dongsheng-B40534
> Subject: [RFC][PATCH] powerpc: add Book E support to 64-bit hibernation
>=20
> Update the 64-bit hibernation code to support Book E CPUs.
> Some registers and instructions are not defined for Book3e
> (SDR reg, tlbia instruction).
> SDR: Storage Description Register. Book3S and Book3E have different
> address translation mode, we do not need HTABORG & HTABSIZE to
> translate virtual address to real address.
> More registers are saved in BookE-64bit.(TCR, SPRGx)
>=20
> Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
> ---
>=20
> Hopefully someone can give me some advice about cache flush. It
> confused me that why only 1 MiB is flushed from KERNEL_START on
> Book3S. Is there a need to flush L2 cache if I have already flushed
> L1 cache? If yes, how about L3 cache? Cache levels are different
> in cores, the instruction sets may also be different.
>=20
>  1 files changed, 62 insertions(+), 2 deletions(-)
>=20
> diff --git a/arch/powerpc/kernel/swsusp_asm64.S
> b/arch/powerpc/kernel/swsusp_asm64.S
> index 86ac1d9..608e4ceb 100644
> --- a/arch/powerpc/kernel/swsusp_asm64.S
> +++ b/arch/powerpc/kernel/swsusp_asm64.S
> @@ -46,10 +46,29 @@
>  #define SL_r29		0xe8
>  #define SL_r30		0xf0
>  #define SL_r31		0xf8
> -#define SL_SIZE		SL_r31+8
> +#define SL_SPRG0	0x100
> +#define SL_SPRG1	0x108
> +#define SL_SPRG2	0x110
> +#define SL_SPRG3	0x118
> +#define SL_SPRG4	0x120
> +#define SL_SPRG5	0x128
> +#define SL_SPRG6	0x130
> +#define SL_SPRG7	0x138
> +#define SL_TCR		0x140
> +#define SL_SIZE		SL_TCR+8
>=20
>  /* these macros rely on the save area being
>   * pointed to by r11 */
> +
> +#define SAVE_SPR(register)		\
> +	mfspr	r0,SPRN_##register	;\
> +	std	r0,SL_##register(r11)
> +#define RESTORE_SPR(register)		\
> +	ld	r0,SL_##register(r11)	;\
> +	mtspr	SPRN_##register,r0
> +#define RESTORE_SPRG(n)			\
> +	ld	r0,SL_SPRG##n(r11)	;\
> +	mtsprg	n,r0
>  #define SAVE_SPECIAL(special)		\
>  	mf##special	r0		;\
>  	std	r0, SL_##special(r11)
> @@ -103,8 +122,21 @@ _GLOBAL(swsusp_arch_suspend)
>  	SAVE_REGISTER(r30)
>  	SAVE_REGISTER(r31)
>  	SAVE_SPECIAL(MSR)
> -	SAVE_SPECIAL(SDR1)
>  	SAVE_SPECIAL(XER)
> +#ifdef CONFIG_PPC_BOOK3S_64
> +	SAVE_SPECIAL(SDR1)
> +#else
> +	SAVE_SPR(TCR)
> +	/* Save SPRGs */
> +	SAVE_SPR(SPRG0)
> +	SAVE_SPR(SPRG1)
> +	SAVE_SPR(SPRG2)
> +	SAVE_SPR(SPRG3)
> +	SAVE_SPR(SPRG4)
> +	SAVE_SPR(SPRG5)
> +	SAVE_SPR(SPRG6)
> +	SAVE_SPR(SPRG7)
> +#endif
>=20
>  	/* we push the stack up 128 bytes but don't store the
>  	 * stack pointer on the stack like a real stackframe */
> @@ -151,6 +183,7 @@ copy_page_loop:
>  	bne+	copyloop
>  nothing_to_copy:
>=20
> +#ifdef CONFIG_PPC_BOOK3S_64
>  	/* flush caches */
>  	lis	r3, 0x10
>  	mtctr	r3
> @@ -167,6 +200,7 @@ nothing_to_copy:
>  	sync
>=20
>  	tlbia
> +#endif
>=20
>  	ld	r11,swsusp_save_area_ptr@toc(r2)
>=20
> @@ -208,16 +242,42 @@ nothing_to_copy:
>  	RESTORE_REGISTER(r29)
>  	RESTORE_REGISTER(r30)
>  	RESTORE_REGISTER(r31)
> +
> +#ifdef CONFIG_PPC_BOOK3S_64
>  	/* can't use RESTORE_SPECIAL(MSR) */
>  	ld	r0, SL_MSR(r11)
>  	mtmsrd	r0, 0
>  	RESTORE_SPECIAL(SDR1)
> +#else
> +	/* Save SPRGs */
> +	RESTORE_SPRG(0)
> +	RESTORE_SPRG(1)
> +	RESTORE_SPRG(2)
> +	RESTORE_SPRG(3)
> +	RESTORE_SPRG(4)
> +	RESTORE_SPRG(5)
> +	RESTORE_SPRG(6)
> +	RESTORE_SPRG(7)
> +
> +	RESTORE_SPECIAL(MSR)
> +
> +	/* Restore TCR and clear any pending bits in TSR. */
> +	RESTORE_SPR(TCR)
> +	lis	r0, (TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS)@h
> +	mtspr	SPRN_TSR,r0
> +
> +	/* Kick decrementer */
> +	li	r0,1
> +	mtdec	r0
> +#endif
>  	RESTORE_SPECIAL(XER)
>=20
>  	sync
>=20
>  	addi	r1,r1,-128
> +#ifdef CONFIG_PPC_BOOK3S_64
>  	bl	slb_flush_and_rebolt
> +#endif
>  	bl	do_after_copyback
>  	addi	r1,r1,128
>=20
> --
> 1.7.5.1

^ permalink raw reply

* Re: [RFC][PATCH] powerpc: add Book E support to 64-bit hibernation
From: Johannes Berg @ 2013-02-22  9:08 UTC (permalink / raw)
  To: Wang Dongsheng-B40534
  Cc: Zhao Chenhui-B35336, linuxppc-dev@lists.ozlabs.org,
	Li Yang-R58472, Wood Scott-B07421
In-Reply-To: <ABB05CD9C9F68C46A5CEDC7F15439259E728F2@039-SN2MPN1-021.039d.mgd.msft.net>

Hi,

> > Subject: [RFC][PATCH] powerpc: add Book E support to 64-bit hibernation

> Any thoughts about this patch?

Heh, honestly, no. I know nothing about Book E, and it's too long ago
that I wrote (some of) the 64-bit hibernation code.

johannes

^ permalink raw reply

* RE: [RFC][PATCH] powerpc: add Book E support to 64-bit hibernation
From: Wang Dongsheng-B40534 @ 2013-02-22  9:50 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Zhao Chenhui-B35336, linuxppc-dev@lists.ozlabs.org,
	Li Yang-R58472, Wood Scott-B07421
In-Reply-To: <1361524129.8146.1.camel@jlt4.sipsolutions.net>

PiAtLS0tLU9yaWdpbmFsIE1lc3NhZ2UtLS0tLQ0KPiBGcm9tOiBKb2hhbm5lcyBCZXJnIFttYWls
dG86am9oYW5uZXNAc2lwc29sdXRpb25zLm5ldF0NCj4gU2VudDogRnJpZGF5LCBGZWJydWFyeSAy
MiwgMjAxMyA1OjA5IFBNDQo+IFRvOiBXYW5nIERvbmdzaGVuZy1CNDA1MzQNCj4gQ2M6IGJlbmhA
a2VybmVsLmNyYXNoaW5nLm9yZzsgbGludXhwcGMtZGV2QGxpc3RzLm96bGFicy5vcmc7IFdvb2Qg
U2NvdHQtDQo+IEIwNzQyMTsgTGkgWWFuZy1SNTg0NzI7IFpoYW8gQ2hlbmh1aS1CMzUzMzYNCj4g
U3ViamVjdDogUmU6IFtSRkNdW1BBVENIXSBwb3dlcnBjOiBhZGQgQm9vayBFIHN1cHBvcnQgdG8g
NjQtYml0DQo+IGhpYmVybmF0aW9uDQo+IA0KPiBIaSwNCj4gDQo+ID4gPiBTdWJqZWN0OiBbUkZD
XVtQQVRDSF0gcG93ZXJwYzogYWRkIEJvb2sgRSBzdXBwb3J0IHRvIDY0LWJpdA0KPiA+ID4gaGli
ZXJuYXRpb24NCj4gDQo+ID4gQW55IHRob3VnaHRzIGFib3V0IHRoaXMgcGF0Y2g/DQo+IA0KPiBI
ZWgsIGhvbmVzdGx5LCBuby4gSSBrbm93IG5vdGhpbmcgYWJvdXQgQm9vayBFLCBhbmQgaXQncyB0
b28gbG9uZyBhZ28NCj4gdGhhdCBJIHdyb3RlIChzb21lIG9mKSB0aGUgNjQtYml0IGhpYmVybmF0
aW9uIGNvZGUuDQo+IA0KVGhhbmtzIGZvciB5b3VyIGZlZWRiYWNrLg0KDQpDb3VsZCB5b3UgZ2l2
ZSBtZSBzb21lIGFkdmljZSBhYm91dCBjYWNoZSBmbHVzaC4gSXQgY29uZnVzZWQgbWUgdGhhdCB3
aHkNCm9ubHkgMSBNaUIgaXMgZmx1c2hlZCBmcm9tIEtFUk5FTF9TVEFSVCBvbiBQb3dlcm1hYyBH
NS4NCg0KT24gRlNMIEJvb2sgRShlNTAwIGZhbWlseSksIEV2ZW4gaWYgSSBkb24ndCBmbGFzaCBj
YWNoZSwgdGhlIHdvcmsgaXMNCmFsc28gbm9ybWFsKERjYWNoZXMgYXJlIGtlcHQgY29oZXJlbnQg
YnkgaGFyZHdhcmUsIGFuZCBpY2FjaGVzIGFyZSBrZXB0DQpjb2hlcmVudCBieSBzb2Z0d2FyZSku
IEJ1dCBJIHN0aWxsIHdvcnJ5IGFib3V0IGNhY2hlcy4gSXMgdGhlcmUgYSBuZWVkDQp0byBmbHVz
aCBMMiBjYWNoZSBpZiBJIGhhdmUgYWxyZWFkeSBmbHVzaGVkIEwxIGNhY2hlPyANCk9yIG9uIFBv
d2VybWFjIEc1LCBmbHVzaCBjYWNoZXMgaXMgYSBnZW5lcmFsIHdheT8NCg0KUG93ZXJtYWMgRzUg
Q29kZXM6DQogICAgICAgIC8qIGZsdXNoIGNhY2hlcyAqLw0KICAgICAgICBsaXMgICAgIHIzLCAw
eDEwDQogICAgICAgIG10Y3RyICAgcjMNCiAgICAgICAgbGkgICAgICByMywgMA0KICAgICAgICBv
cmkgICAgIHIzLCByMywgQ09ORklHX0tFUk5FTF9TVEFSVD4+NDgNCiAgICAgICAgbGkgICAgICBy
MCwgNDgNCiAgICAgICAgc2xkICAgICByMywgcjMsIHIwDQogICAgICAgIGxpICAgICAgcjAsIDAN
CjE6DQogICAgICAgIGRjYmYgICAgcjAscjMNCiAgICAgICAgYWRkaSAgICByMyxyMywweDIwDQog
ICAgICAgIGJkbnogICAgMWINCg0KICAgICAgICBzeW5jDQoNCiAgICAgICAgdGxiaWENCg0KPiBq
b2hhbm5lcw0KPiANCg==

^ permalink raw reply

* Re: [RFC PATCH -V2 05/21] powerpc: Reduce PTE table memory wastage
From: Aneesh Kumar K.V @ 2013-02-22 17:20 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <20130222052351.GE6139@drongo>

Paul Mackerras <paulus@samba.org> writes:

I will reply to the other parts in a seperate email, but the below

>> +static void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
>> +{
>> +	struct page *page;
>> +	struct mm_struct *mm;
>> +	unsigned int bit, mask;
>> +
>> +	mm = tlb->mm;
>> +	/* Free 2K page table fragment of a 64K page */
>> +	page = virt_to_page(table);
>> +	bit = 1 << ((__pa(table) & ~PAGE_MASK) / PTE_FRAG_SIZE);
>> +	spin_lock(&mm->page_table_lock);
>> +	/*
>> +	 * stash the actual mask in higher half, and clear the lower half
>> +	 * and selectively, add remove from pgtable list
>> +	 */
>> +	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << FRAG_MASK_BITS));
>> +	if (!(mask & FRAG_MASK))
>> +		list_del(&page->lru);
>> +	else {
>> +		/*
>> +		 * Add the page table page to pgtable_list so that
>> +		 * the free fragment can be used by the next alloc
>> +		 */
>> +		list_del_init(&page->lru);
>> +		list_add_tail(&page->lru, &mm->context.pgtable_list);
>> +	}
>> +	spin_unlock(&mm->page_table_lock);
>> +	tlb_remove_table(tlb, table);
>> +}
>
> This looks like you're allowing a fragment that is being freed to be
> reallocated and used again during the grace period when we are waiting
> for any references to the fragment to disappear.  Doesn't that allow a
> race where one CPU traversing the page table and using the fragment in
> its old location in the tree could see a PTE created after the
> fragment was reallocated?  In other words, why is it safe to allow the
> fragment to be used during the grace period?  If it is safe, it at
> least needs a comment explaining why.
>

We don't allow it to be reallocated during the grace period. The trick
is in the below lines of page_table_alloc()

		/*
		 * Update with the higher order mask bits accumulated,
		 * added as a part of rcu free.
		 */
		mask = mask | (mask >> FRAG_MASK_BITS);

When checking for mask, we also look at the higher order bits.

The reason we add the page back to &mm->context.pgtable_list in
page_table_free_rcu is because we need to have access to struct
mm_struct. We don't have that in the rcu call back. So we add early and
make sure we don't reallocate them, until the grace period is over.

I will definitely add more comments around the code to clarify these details.

-aneesh

^ permalink raw reply

* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Phileas Fogg @ 2013-02-22 20:49 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1361490374.4676.58.camel@pasglop>

Benjamin Herrenschmidt wrote:
> On Thu, 2013-02-21 at 22:44 +0100, Phileas Fogg wrote:
>> Stripped OpenWRT image:
>> ------------------------
>>
>> c00000000001a474:       48 00 00 05     bl      0xc00000000001a478
>> c00000000001a478:       7c a8 02 a6     mflr    r5
>> c00000000001a47c:       38 a5 00 1c     addi    r5,r5,28
>> c00000000001a480:       7c 21 0b 78     mr      r1,r1
>> c00000000001a484:       80 85 00 00     lwz     r4,0(r5)
>> c00000000001a488:       2c 04 00 00     cmpwi   r4,0
>> c00000000001a48c:       40 82 00 62     bnea-   0x60
>> c00000000001a490:       4b ff ff f0     b       0xc00000000001a480
>> c00000000001a494:       00 00 00 00     .long 0x0
>> c00000000001a498:       a0 6d 00 48     lhz     r3,72(r13)
>> c00000000001a49c:       48 00 00 11     bl      0xc00000000001a4ac
>
>
> Smell like a bad stack pointer to me...
>
> One thing I noticed is that kexec doesn't seem to hard disable
> interrupts, which is ... fishy at best. It should do that
> before it switches stacks around. Dunno if that's the cause
> of the problem but it might be worth adding a hard_irq_disable()
> after all the local_irq_disable(), making sure we are hard
> disabled before going into asm.
>
> Cheers,
> Ben.
>
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>

Hi,

i wanted to let you know that i tested your advice. And let me say, it's was a 
damn good advice :) I can boot FreeBSD loader on Linux 3.8 now, no SHA256 
checksum failures. And no panics with FreeBSD LiveCD anymore too.

I just inserted hard_irq_disable() after each local_irq_disable() in 
arch/powerpc/kernel/machine_kexec_64.c

Thanks

regards

^ permalink raw reply

* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Benjamin Herrenschmidt @ 2013-02-22 19:52 UTC (permalink / raw)
  To: Phileas Fogg; +Cc: linuxppc-dev
In-Reply-To: <5127D9CE.2090500@mail.ru>

On Fri, 2013-02-22 at 21:49 +0100, Phileas Fogg wrote:
> i wanted to let you know that i tested your advice. And let me say, it's was a 
> damn good advice :) I can boot FreeBSD loader on Linux 3.8 now, no SHA256 
> checksum failures. And no panics with FreeBSD LiveCD anymore too.
> 
> I just inserted hard_irq_disable() after each local_irq_disable() in 
> arch/powerpc/kernel/machine_kexec_64.c

Awesome ! :-)

Care to send a patch with a Signed-off-by: ?

Cheers,
Ben.

^ permalink raw reply

* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Phileas Fogg @ 2013-02-22 23:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1361562773.15017.36.camel@pasglop>

Benjamin Herrenschmidt wrote:
> On Fri, 2013-02-22 at 21:49 +0100, Phileas Fogg wrote:
>> i wanted to let you know that i tested your advice. And let me say, it's was a
>> damn good advice :) I can boot FreeBSD loader on Linux 3.8 now, no SHA256
>> checksum failures. And no panics with FreeBSD LiveCD anymore too.
>>
>> I just inserted hard_irq_disable() after each local_irq_disable() in
>> arch/powerpc/kernel/machine_kexec_64.c
>
> Awesome ! :-)
>
> Care to send a patch with a Signed-off-by: ?
>
> Cheers,
> Ben.
>
>

No problem, but as i said it was your idea how to fix the issue with kexec.
Anyways here is the patch which i tested on my PS3 console with Linux 3.8.
After applying this patch i can boot any Linux kernel starting with 2.6,
FreeBSD loader, FreeBSD LiveCD and my own tiny ELF kernels too.
Even OpenBSD bootloader starts now too :)
And i don't see any failed SHA256 checksums in the purgatory code.

regards

 From c17cdf38dfe180b4a571827bb547aaf9b678cf29 Mon Sep 17 00:00:00 2001
From: Phileas Fogg <phileas-fogg@mail.ru>
Date: Sat, 23 Feb 2013 00:32:19 +0100
Subject: [PATCH] kexec: disable hard IRQ before kexec

Disable hard IRQ before kexec a new kernel image.
Not doing it can result in corrupted data in the memory segments
reserved for the new kernel.
---
  arch/powerpc/kernel/machine_kexec_64.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/machine_kexec_64.c 
b/arch/powerpc/kernel/machine_kexec_64.c
index 7206701..e08b9d0 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -162,6 +162,7 @@ static int kexec_all_irq_disabled = 0;
  static void kexec_smp_down(void *arg)
  {
  	local_irq_disable();
+	hard_irq_disable();
  	mb(); /* make sure our irqs are disabled before we say they are */
  	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
  	while(kexec_all_irq_disabled == 0)
@@ -244,6 +245,7 @@ static void kexec_prepare_cpus(void)
  	wake_offline_cpus();
  	smp_call_function(kexec_smp_down, NULL, /* wait */0);
  	local_irq_disable();
+	hard_irq_disable();
  	mb(); /* make sure IRQs are disabled before we say they are */
  	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;

@@ -281,6 +283,7 @@ static void kexec_prepare_cpus(void)
  	if (ppc_md.kexec_cpu_down)
  		ppc_md.kexec_cpu_down(0, 0);
  	local_irq_disable();
+	hard_irq_disable();
  }

  #endif /* SMP */
-- 
1.8.1.4

^ permalink raw reply related

* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Benjamin Herrenschmidt @ 2013-02-22 22:45 UTC (permalink / raw)
  To: Phileas Fogg; +Cc: linuxppc-dev
In-Reply-To: <51280223.9070109@mail.ru>

On Sat, 2013-02-23 at 00:41 +0100, Phileas Fogg wrote:
> Benjamin Herrenschmidt wrote:
> > On Fri, 2013-02-22 at 21:49 +0100, Phileas Fogg wrote:
> >> i wanted to let you know that i tested your advice. And let me say, it's was a
> >> damn good advice :) I can boot FreeBSD loader on Linux 3.8 now, no SHA256
> >> checksum failures. And no panics with FreeBSD LiveCD anymore too.
> >>
> >> I just inserted hard_irq_disable() after each local_irq_disable() in
> >> arch/powerpc/kernel/machine_kexec_64.c
> >
> > Awesome ! :-)
> >
> > Care to send a patch with a Signed-off-by: ?
>
> No problem, but as i said it was your idea how to fix the issue with kexec.
> Anyways here is the patch which i tested on my PS3 console with Linux 3.8.
> After applying this patch i can boot any Linux kernel starting with 2.6,
> FreeBSD loader, FreeBSD LiveCD and my own tiny ELF kernels too.
> Even OpenBSD bootloader starts now too :)
> And i don't see any failed SHA256 checksums in the purgatory code.

Thanks, but I still need the Signed-off-by: line before i can apply
it :-) (legal...)

Cheers,
Ben.

> regards
> 
>  From c17cdf38dfe180b4a571827bb547aaf9b678cf29 Mon Sep 17 00:00:00 2001
> From: Phileas Fogg <phileas-fogg@mail.ru>
> Date: Sat, 23 Feb 2013 00:32:19 +0100
> Subject: [PATCH] kexec: disable hard IRQ before kexec
> 
> Disable hard IRQ before kexec a new kernel image.
> Not doing it can result in corrupted data in the memory segments
> reserved for the new kernel.
> ---
>   arch/powerpc/kernel/machine_kexec_64.c | 3 +++
>   1 file changed, 3 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/machine_kexec_64.c 
> b/arch/powerpc/kernel/machine_kexec_64.c
> index 7206701..e08b9d0 100644
> --- a/arch/powerpc/kernel/machine_kexec_64.c
> +++ b/arch/powerpc/kernel/machine_kexec_64.c
> @@ -162,6 +162,7 @@ static int kexec_all_irq_disabled = 0;
>   static void kexec_smp_down(void *arg)
>   {
>   	local_irq_disable();
> +	hard_irq_disable();
>   	mb(); /* make sure our irqs are disabled before we say they are */
>   	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
>   	while(kexec_all_irq_disabled == 0)
> @@ -244,6 +245,7 @@ static void kexec_prepare_cpus(void)
>   	wake_offline_cpus();
>   	smp_call_function(kexec_smp_down, NULL, /* wait */0);
>   	local_irq_disable();
> +	hard_irq_disable();
>   	mb(); /* make sure IRQs are disabled before we say they are */
>   	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
> 
> @@ -281,6 +283,7 @@ static void kexec_prepare_cpus(void)
>   	if (ppc_md.kexec_cpu_down)
>   		ppc_md.kexec_cpu_down(0, 0);
>   	local_irq_disable();
> +	hard_irq_disable();
>   }
> 
>   #endif /* SMP */

^ permalink raw reply

* Re: PS3: Strange issue with kexec and FreeBSD loader
From: Phileas Fogg @ 2013-02-22 23:53 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1361573108.15017.39.camel@pasglop>

Benjamin Herrenschmidt wrote:
> On Sat, 2013-02-23 at 00:41 +0100, Phileas Fogg wrote:
>> Benjamin Herrenschmidt wrote:
>>> On Fri, 2013-02-22 at 21:49 +0100, Phileas Fogg wrote:
>>>> i wanted to let you know that i tested your advice. And let me say, it's was a
>>>> damn good advice :) I can boot FreeBSD loader on Linux 3.8 now, no SHA256
>>>> checksum failures. And no panics with FreeBSD LiveCD anymore too.
>>>>
>>>> I just inserted hard_irq_disable() after each local_irq_disable() in
>>>> arch/powerpc/kernel/machine_kexec_64.c
>>>
>>> Awesome ! :-)
>>>
>>> Care to send a patch with a Signed-off-by: ?
>>
>> No problem, but as i said it was your idea how to fix the issue with kexec.
>> Anyways here is the patch which i tested on my PS3 console with Linux 3.8.
>> After applying this patch i can boot any Linux kernel starting with 2.6,
>> FreeBSD loader, FreeBSD LiveCD and my own tiny ELF kernels too.
>> Even OpenBSD bootloader starts now too :)
>> And i don't see any failed SHA256 checksums in the purgatory code.
>
> Thanks, but I still need the Signed-off-by: line before i can apply
> it :-) (legal...)
>
> Cheers,
> Ben.
>
>> regards
>>
>>   From c17cdf38dfe180b4a571827bb547aaf9b678cf29 Mon Sep 17 00:00:00 2001
>> From: Phileas Fogg <phileas-fogg@mail.ru>
>> Date: Sat, 23 Feb 2013 00:32:19 +0100
>> Subject: [PATCH] kexec: disable hard IRQ before kexec
>>
>> Disable hard IRQ before kexec a new kernel image.
>> Not doing it can result in corrupted data in the memory segments
>> reserved for the new kernel.
>> ---
>>    arch/powerpc/kernel/machine_kexec_64.c | 3 +++
>>    1 file changed, 3 insertions(+)
>>
>> diff --git a/arch/powerpc/kernel/machine_kexec_64.c
>> b/arch/powerpc/kernel/machine_kexec_64.c
>> index 7206701..e08b9d0 100644
>> --- a/arch/powerpc/kernel/machine_kexec_64.c
>> +++ b/arch/powerpc/kernel/machine_kexec_64.c
>> @@ -162,6 +162,7 @@ static int kexec_all_irq_disabled = 0;
>>    static void kexec_smp_down(void *arg)
>>    {
>>    	local_irq_disable();
>> +	hard_irq_disable();
>>    	mb(); /* make sure our irqs are disabled before we say they are */
>>    	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
>>    	while(kexec_all_irq_disabled == 0)
>> @@ -244,6 +245,7 @@ static void kexec_prepare_cpus(void)
>>    	wake_offline_cpus();
>>    	smp_call_function(kexec_smp_down, NULL, /* wait */0);
>>    	local_irq_disable();
>> +	hard_irq_disable();
>>    	mb(); /* make sure IRQs are disabled before we say they are */
>>    	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
>>
>> @@ -281,6 +283,7 @@ static void kexec_prepare_cpus(void)
>>    	if (ppc_md.kexec_cpu_down)
>>    		ppc_md.kexec_cpu_down(0, 0);
>>    	local_irq_disable();
>> +	hard_irq_disable();
>>    }
>>
>>    #endif /* SMP */
>
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>

Next attempt.


Signed-off-by: Phileas Fogg <phileas-fogg@mail.ru>
---

 From c17cdf38dfe180b4a571827bb547aaf9b678cf29 Mon Sep 17 00:00:00 2001
From: Phileas Fogg <phileas-fogg@mail.ru>
Date: Sat, 23 Feb 2013 00:32:19 +0100
Subject: [PATCH] kexec: disable hard IRQ before kexec

Disable hard IRQ before kexec a new kernel image.
Not doing it can result in corrupted data in the memory segments
reserved for the new kernel.
---
  arch/powerpc/kernel/machine_kexec_64.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/machine_kexec_64.c 
b/arch/powerpc/kernel/machine_kexec_64.c
index 7206701..e08b9d0 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -162,6 +162,7 @@ static int kexec_all_irq_disabled = 0;
  static void kexec_smp_down(void *arg)
  {
  	local_irq_disable();
+	hard_irq_disable();
  	mb(); /* make sure our irqs are disabled before we say they are */
  	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;
  	while(kexec_all_irq_disabled == 0)
@@ -244,6 +245,7 @@ static void kexec_prepare_cpus(void)
  	wake_offline_cpus();
  	smp_call_function(kexec_smp_down, NULL, /* wait */0);
  	local_irq_disable();
+	hard_irq_disable();
  	mb(); /* make sure IRQs are disabled before we say they are */
  	get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF;

@@ -281,6 +283,7 @@ static void kexec_prepare_cpus(void)
  	if (ppc_md.kexec_cpu_down)
  		ppc_md.kexec_cpu_down(0, 0);
  	local_irq_disable();
+	hard_irq_disable();
  }

  #endif /* SMP */
-- 
1.8.1.4

^ permalink raw reply related

* Re: [RFC PATCH -V2 03/21] powerpc: Don't hard code the size of pte page
From: Aneesh Kumar K.V @ 2013-02-23 16:17 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, linux-mm
In-Reply-To: <20130222050607.GC6139@drongo>

Paul Mackerras <paulus@samba.org> writes:

> On Thu, Feb 21, 2013 at 10:17:10PM +0530, Aneesh Kumar K.V wrote:
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 
>> USE PTRS_PER_PTE to indicate the size of pte page.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> powerpc: Don't hard code the size of pte page
>> 
>> USE PTRS_PER_PTE to indicate the size of pte page.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>
> Description and signoff are duplicated.  Description could be more
> informative, for example - why would we want to do this?
>
>> +/*
>> + * hidx is in the second half of the page table. We use the
>> + * 8 bytes per each pte entry.
>
> The casual reader probably wouldn't know what "hidx" is.  The comment
> needs at least to use a better name than "hidx".

how about

+/*
+ * We save the slot number & secondary bit in the second half of the
+ * PTE page. We use the 8 bytes per each pte entry.
+ */


-aneesh

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox