* [PATCH 2/6] 8xx: Update TLB asm so it behaves as linux mm expects.
From: Joakim Tjernlund @ 2009-10-08 13:24 UTC (permalink / raw)
To: Benjamin Herrenschmidt, linuxppc-dev@ozlabs.org, Rex Feany,
Scott Wood
In-Reply-To: <1255008298-19949-2-git-send-email-Joakim.Tjernlund@transmode.se>
Update the TLB asm to make proper use of _PAGE_DIRTY and _PAGE_ACCESSED.
Get rid of _PAGE_HWWRITE too.
Pros:
- I/D TLB Miss never needs to write to the linux pte.
- _PAGE_ACCESSED is only set on TLB Error, fixing accounting
- _PAGE_DIRTY is mapped to 0x100, the changed bit, and is set directly
when a page has been made dirty.
- Proper RO/RW mapping of user space.
- Free up 2 SW TLB bits in the linux pte(add back _PAGE_WRITETHRU ?)
- Less instructions in I/D TLB Miss.
- Prepared for HWEXEC support.
- Prepared kernel RO/user NA support.
Cons:
- None ?
---
arch/powerpc/include/asm/pte-8xx.h | 13 ++---
arch/powerpc/kernel/head_8xx.S | 93 ++++++++++++++++++------------------
2 files changed, 52 insertions(+), 54 deletions(-)
diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
index 8c6e312..f23cd15 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/pte-8xx.h
@@ -32,22 +32,21 @@
#define _PAGE_FILE 0x0002 /* when !present: nonlinear file mapping */
#define _PAGE_NO_CACHE 0x0002 /* I: cache inhibit */
#define _PAGE_SHARED 0x0004 /* No ASID (context) compare */
+#define _PAGE_DIRTY 0x0100 /* C: page changed */
-/* These five software bits must be masked out when the entry is loaded
- * into the TLB.
+/* These 3 software bits must be masked out when the entry is loaded
+ * into the TLB, 2 SW bits left.
*/
#define _PAGE_EXEC 0x0008 /* software: i-cache coherency required */
#define _PAGE_GUARDED 0x0010 /* software: guarded access */
-#define _PAGE_DIRTY 0x0020 /* software: page changed */
-#define _PAGE_RW 0x0040 /* software: user write access allowed */
-#define _PAGE_ACCESSED 0x0080 /* software: page referenced */
+#define _PAGE_ACCESSED 0x0020 /* software: page referenced */
/* Setting any bits in the nibble with the follow two controls will
* require a TLB exception handler change. It is assumed unused bits
* are always zero.
*/
-#define _PAGE_HWWRITE 0x0100 /* h/w write enable: never set in Linux PTE */
-#define _PAGE_USER 0x0800 /* One of the PP bits, the other is USER&~RW */
+#define _PAGE_RW 0x0400 /* lsb PP bits, inverted in HW */
+#define _PAGE_USER 0x0800 /* msb PP bits */
#define _PMD_PRESENT 0x0001
#define _PMD_BAD 0x0ff0
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 118bb05..1639d16 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -333,26 +333,21 @@ InstructionTLBMiss:
mfspr r11, SPRN_MD_TWC /* ....and get the pte address */
lwz r10, 0(r11) /* Get the pte */
-#ifdef CONFIG_SWAP
- /* do not set the _PAGE_ACCESSED bit of a non-present page */
- andi. r11, r10, _PAGE_PRESENT
- beq 4f
- ori r10, r10, _PAGE_ACCESSED
- mfspr r11, SPRN_MD_TWC /* get the pte address again */
- stw r10, 0(r11)
-4:
-#else
- ori r10, r10, _PAGE_ACCESSED
- stw r10, 0(r11)
-#endif
+ /* r10=(r10&~_PAGE_PRESENT)|((r10&_PAGE_ACCESSED)>>5) */
+ rlwimi. r10, r10, 27, 31, 31
+ beq- cr0, 2f /* Can be removed, costs a ITLB Err */
+#if 0 /* Dont' bother with PP lsb, bit 21 for now */
+ /* r10 = (r10 & ~0x0400) | ((r10 & _PAGE_EXEC) << 7) */
+ rlwimi r10, r10, 7, 21, 21 /* Set _PAGE_EXEC << 7 */
+#endif
/* The Linux PTE won't go exactly into the MMU TLB.
- * Software indicator bits 21, 22 and 28 must be clear.
+ * Software indicator bits 22 and 28 must be clear.
* Software indicator bits 24, 25, 26, and 27 must be
* set. All other Linux PTE bits control the behavior
* of the MMU.
*/
-2: li r11, 0x00f0
+ li r11, 0x00f0
rlwimi r10, r11, 0, 24, 28 /* Set 24-27, clear 28 */
DO_8xx_CPU6(0x2d80, r3)
mtspr SPRN_MI_RPN, r10 /* Update TLB entry */
@@ -365,6 +360,22 @@ InstructionTLBMiss:
lwz r3, 8(r0)
#endif
rfi
+2:
+ mfspr r11, SRR1
+ /* clear all error bits as TLB Miss
+ * sets a few unconditionally
+ */
+ rlwinm r11, r11, 0, 0xffff
+ mtspr SRR1, r11
+
+ mfspr r10, SPRN_M_TW /* Restore registers */
+ lwz r11, 0(r0)
+ mtcr r11
+ lwz r11, 4(r0)
+#ifdef CONFIG_8xx_CPU6
+ lwz r3, 8(r0)
+#endif
+ b InstructionAccess
. = 0x1200
DataStoreTLBMiss:
@@ -409,21 +420,22 @@ DataStoreTLBMiss:
DO_8xx_CPU6(0x3b80, r3)
mtspr SPRN_MD_TWC, r11
-#ifdef CONFIG_SWAP
- /* do not set the _PAGE_ACCESSED bit of a non-present page */
- andi. r11, r10, _PAGE_PRESENT
- beq 4f
- ori r10, r10, _PAGE_ACCESSED
-4:
- /* and update pte in table */
-#else
- ori r10, r10, _PAGE_ACCESSED
+ /* Need to know if load/store -> force a TLB Error
+ * by copying ACCESSED to PRESENT.
+ */
+ /* r10=(r10&~_PAGE_PRESENT)|((r10&_PAGE_ACCESSED)>>5) */
+ rlwimi r10, r10, 27, 31, 31
+
+#if 0 /* Not yet */
+ /* Honour kernel RO, User NA */
+ andi. r11, r10, _PAGE_USER | _PAGE_RW
+ bne- cr0, 5f
+ ori r10,r10, 0x200 /* Extended encoding, bit 22 */
#endif
- mfspr r11, SPRN_MD_TWC /* get the pte address again */
- stw r10, 0(r11)
+5: xori r10, r10, _PAGE_RW /* invert RW bit */
/* The Linux PTE won't go exactly into the MMU TLB.
- * Software indicator bits 21, 22 and 28 must be clear.
+ * Software indicator bits 22 and 28 must be clear.
* Software indicator bits 24, 25, 26, and 27 must be
* set. All other Linux PTE bits control the behavior
* of the MMU.
@@ -469,11 +481,12 @@ DataTLBError:
stw r10, 0(r0)
stw r11, 4(r0)
- /* First, make sure this was a store operation.
- */
- mfspr r10, SPRN_DSISR
- andis. r11, r10, 0x4800 /* no translation, no permission. */
+ mfspr r11, SPRN_DSISR
+ andis. r11, r11, 0x4800 /* !translation or protection */
bne 2f /* branch if either is set */
+ /* Only Change bit left now, do it here as it is faster
+ * than trapping to the C fault handler.
+ */
/* The EA of a data TLB miss is automatically stored in the MD_EPN
* register. The EA of a data TLB error is automatically stored in
@@ -522,26 +535,12 @@ DataTLBError:
mfspr r11, SPRN_MD_TWC /* ....and get the pte address */
lwz r10, 0(r11) /* Get the pte */
- andi. r11, r10, _PAGE_RW /* Is it writeable? */
- beq 2f /* Bail out if not */
-
- /* Update 'changed', among others.
- */
-#ifdef CONFIG_SWAP
- ori r10, r10, _PAGE_DIRTY|_PAGE_HWWRITE
- /* do not set the _PAGE_ACCESSED bit of a non-present page */
- andi. r11, r10, _PAGE_PRESENT
- beq 4f
- ori r10, r10, _PAGE_ACCESSED
-4:
-#else
- ori r10, r10, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
-#endif
- mfspr r11, SPRN_MD_TWC /* Get pte address again */
+ ori r10, r10, _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_HWWRITE
stw r10, 0(r11) /* and update pte in table */
+ xori r10, r10, _PAGE_RW /* RW bit is inverted */
/* The Linux PTE won't go exactly into the MMU TLB.
- * Software indicator bits 21, 22 and 28 must be clear.
+ * Software indicator bits 22 and 28 must be clear.
* Software indicator bits 24, 25, 26, and 27 must be
* set. All other Linux PTE bits control the behavior
* of the MMU.
--
1.6.4.4
^ permalink raw reply related
* [PATCH 5/6] 8xx: Fixup DAR from buggy dcbX instructions.
From: Joakim Tjernlund @ 2009-10-08 13:24 UTC (permalink / raw)
To: Benjamin Herrenschmidt, linuxppc-dev@ozlabs.org, Rex Feany,
Scott Wood
In-Reply-To: <1255008298-19949-5-git-send-email-Joakim.Tjernlund@transmode.se>
This is an assembler version to fix up DAR not being set
by dcbX, icbi instructions. There are two versions: one
uses self-modifying code, the other uses a
jump table but is much bigger (default).
---
arch/powerpc/kernel/head_8xx.S | 146 +++++++++++++++++++++++++++++++++++++++-
1 files changed, 145 insertions(+), 1 deletions(-)
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 9707dc4..6541855 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -490,7 +490,8 @@ DataTLBError:
mfspr r10, SPRN_DAR
cmpwi cr0, r10, 0x00f0
- beq- 2f /* must be a buggy dcbX, icbi insn. */
+ beq- FixDAR /* must be a buggy dcbX, icbi insn. */
+DARFix: /* Return from dcbx instruction bug workaround, r10 holds value of DAR */
mfspr r11, SPRN_DSISR
andis. r11, r11, 0x4800 /* !translation or protection */
@@ -600,6 +601,149 @@ DataTLBError:
. = 0x2000
+/* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions
+ * by decoding the registers used by the dcbx instruction and adding them.
+ * DAR is set to the calculated address and r10 also holds the EA on exit.
+ */
+#define NO_SELF_MODIFYING_CODE /* define if you don't want to use self modifying code */
+ nop /* A few nops to make the modified_instr: space below cache line aligned */
+ nop
+139: /* fetch instruction from userspace memory */
+ DO_8xx_CPU6(0x3780, r3)
+ mtspr SPRN_MD_EPN, r10
+ mfspr r11, SPRN_M_TWB /* Get level 1 table entry address */
+ lwz r11, 0(r11) /* Get the level 1 entry */
+ tophys (r11, r11)
+ DO_8xx_CPU6(0x3b80, r3)
+ mtspr SPRN_MD_TWC, r11 /* Load pte table base address */
+ mfspr r11, SPRN_MD_TWC /* ....and get the pte address */
+ lwz r11, 0(r11) /* Get the pte */
+ /* concat physical page address(r11) and page offset(r10) */
+ rlwimi r11, r10, 0, 20, 31
+ b 140f
+FixDAR: /* Entry point for dcbx workaround. */
+ /* fetch instruction from memory. */
+ mfspr r10, SPRN_SRR0
+ andis. r11, r10, 0x8000
+ tophys (r11, r10)
+ beq- 139b /* Branch if user space address */
+140: lwz r11,0(r11)
+#ifdef CONFIG_8xx_CPU6
+ lwz r3, 8(r0) /* restore r3 from memory */
+#endif
+#ifndef NO_SELF_MODIFYING_CODE
+ andis. r10,r11,0x1f /* test if reg RA is r0 */
+ li r10,modified_instr@l
+ dcbtst r0,r10 /* touch for store */
+ rlwinm r11,r11,0,0,20 /* Zero lower 10 bits */
+ oris r11,r11,640 /* Transform instr. to a "add r10,RA,RB" */
+ ori r11,r11,532
+ stw r11,0(r10) /* store add/and instruction */
+ dcbf 0,r10 /* flush new instr. to memory. */
+ icbi 0,r10 /* invalidate instr. cache line */
+ lwz r11, 4(r0) /* restore r11 from memory */
+ mfspr r10, SPRN_M_TW /* restore r10 from M_TW */
+ isync /* Wait until new instr is loaded from memory */
+modified_instr:
+ .space 4 /* this is where the add/and instr. is stored */
+ bne+ 143f
+ subf r10,r0,r10 /* r10=r10-r0, only if reg RA is r0 */
+143: mtdar r10 /* store faulting EA in DAR */
+ b DARFix /* Go back to normal TLB handling */
+#else
+ mfctr r10
+ mtdar r10 /* save ctr reg in DAR */
+ rlwinm r10, r11, 24, 24, 28 /* offset into jump table for reg RB */
+ addi r10, r10, 150f@l /* add start of table */
+ mtctr r10 /* load ctr with jump address */
+ xor r10, r10, r10 /* sum starts at zero */
+ bctr /* jump into table */
+150:
+ add r10, r10, r0
+ b 151f
+ add r10, r10, r1
+ b 151f
+ add r10, r10, r2
+ b 151f
+ add r10, r10, r3
+ b 151f
+ add r10, r10, r4
+ b 151f
+ add r10, r10, r5
+ b 151f
+ add r10, r10, r6
+ b 151f
+ add r10, r10, r7
+ b 151f
+ add r10, r10, r8
+ b 151f
+ add r10, r10, r9
+ b 151f
+ add r10, r10, r10
+ b 151f
+ add r10, r10, r11
+ b 151f
+ add r10, r10, r12
+ b 151f
+ add r10, r10, r13
+ b 151f
+ add r10, r10, r14
+ b 151f
+ add r10, r10, r15
+ b 151f
+ add r10, r10, r16
+ b 151f
+ add r10, r10, r17
+ b 151f
+ add r10, r10, r18
+ b 151f
+ add r10, r10, r19
+ b 151f
+ mtctr r11 /* r10 needs special handling */
+ b 154f
+ mtctr r11 /* r11 needs special handling */
+ b 153f
+ add r10, r10, r22
+ b 151f
+ add r10, r10, r23
+ b 151f
+ add r10, r10, r24
+ b 151f
+ add r10, r10, r25
+ b 151f
+ add r10, r10, r25
+ b 151f
+ add r10, r10, r27
+ b 151f
+ add r10, r10, r28
+ b 151f
+ add r10, r10, r29
+ b 151f
+ add r10, r10, r30
+ b 151f
+ add r10, r10, r31
+151:
+ rlwinm. r11,r11,19,24,28 /* offset into jump table for reg RA */
+ beq 152f /* if reg RA is zero, don't add it */
+ addi r11, r11, 150b@l /* add start of table */
+ mtctr r11 /* load ctr with jump address */
+ rlwinm r11,r11,0,16,10 /* make sure we don't execute this more than once */
+ bctr /* jump into table */
+152:
+ mfdar r11
+ mtctr r11 /* restore ctr reg from DAR */
+ mtdar r10 /* save fault EA to DAR */
+ b DARFix /* Go back to normal TLB handling */
+
+ /* special handling for r10,r11 since these are modified already */
+153: lwz r11, 4(r0) /* load r11 from memory */
+ b 155f
+154: mfspr r11, SPRN_M_TW /* load r10 from M_TW */
+155: add r10, r10, r11 /* add it */
+ mfctr r11 /* restore r11 */
+ b 151b
+#endif
+
.globl giveup_fpu
giveup_fpu:
blr
--
1.6.4.4
^ permalink raw reply related
* [PATCH 3/6] 8xx: invalidate non present TLBs
From: Joakim Tjernlund @ 2009-10-08 13:24 UTC (permalink / raw)
To: Benjamin Herrenschmidt, linuxppc-dev@ozlabs.org, Rex Feany,
Scott Wood
In-Reply-To: <1255008298-19949-3-git-send-email-Joakim.Tjernlund@transmode.se>
8xx sometimes needs to load invalid/non-present TLBs in
its DTLB asm handler.
These must be invalidated separately as the linux mm doesn't do it.
---
arch/powerpc/mm/fault.c | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7699394..72941c7 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -39,7 +39,7 @@
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/siginfo.h>
-
+#include <mm/mmu_decl.h>
#ifdef CONFIG_KPROBES
static inline int notify_page_fault(struct pt_regs *regs)
@@ -243,6 +243,12 @@ good_area:
goto bad_area;
#endif /* CONFIG_6xx */
#if defined(CONFIG_8xx)
+ /* 8xx sometimes need to load a invalid/non-present TLBs.
+ * These must be invalidated separately as linux mm don't.
+ */
+ if (error_code & 0x40000000) /* no translation? */
+ _tlbil_va(address);
+
/* The MPC8xx seems to always set 0x80000000, which is
* "undefined". Of those that can be set, this is the only
* one which seems bad.
--
1.6.4.4
^ permalink raw reply related
* [PATCH 1/6] 8xx: DTLB Error must check for more errors.
From: Joakim Tjernlund @ 2009-10-08 13:24 UTC (permalink / raw)
To: Benjamin Herrenschmidt, linuxppc-dev@ozlabs.org, Rex Feany,
Scott Wood
In-Reply-To: <1255008298-19949-1-git-send-email-Joakim.Tjernlund@transmode.se>
DataTLBError currently does:
if ((err & 0x02000000) == 0)
DSI();
This won't handle a store with no valid translation.
Change this to
if ((err & 0x48000000) != 0)
DSI();
that is, branch to DSI if either !permission or
!translation.
---
arch/powerpc/kernel/head_8xx.S | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 52ff8c5..118bb05 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -472,8 +472,8 @@ DataTLBError:
/* First, make sure this was a store operation.
*/
mfspr r10, SPRN_DSISR
- andis. r11, r10, 0x0200 /* If set, indicates store op */
- beq 2f
+ andis. r11, r10, 0x4800 /* no translation, no permission. */
+ bne 2f /* branch if either is set */
/* The EA of a data TLB miss is automatically stored in the MD_EPN
* register. The EA of a data TLB error is automatically stored in
--
1.6.4.4
^ permalink raw reply related
* [PATCH 0/6] 8xx MMU fixes
From: Joakim Tjernlund @ 2009-10-08 13:24 UTC (permalink / raw)
To: Benjamin Herrenschmidt, linuxppc-dev@ozlabs.org, Rex Feany,
Scott Wood
So here we go again. This time I am
fairly confident I got most things correct :)
Also managed to use even fewer instructions in the
TLB Miss handlers.
Scott and Rex, forget previous versions and
try this one out.
Once this works we can discuss further enhancements.
Joakim Tjernlund (6):
8xx: DTLB Error must check for more errors.
8xx: Update TLB asm so it behaves as linux mm expects.
8xx: invalidate non present TLBs
8xx: Tag DAR with 0x00f0 to catch buggy instructions.
8xx: Fixup DAR from buggy dcbX instructions.
8xx: start using dcbX instructions in various copy routines
arch/powerpc/include/asm/pte-8xx.h | 13 +-
arch/powerpc/kernel/head_8xx.S | 252 +++++++++++++++++++++++++++++-------
arch/powerpc/kernel/misc_32.S | 18 ---
arch/powerpc/lib/copy_32.S | 24 ----
arch/powerpc/mm/fault.c | 8 +-
5 files changed, 217 insertions(+), 98 deletions(-)
^ permalink raw reply
* Re: [RFC PATCH 00/12] Merge common OpenFirmware device tree code
From: Kjetil Oftedal @ 2009-10-08 13:24 UTC (permalink / raw)
To: David Miller
Cc: sammy, sfr, julian.calaby, devicetree-discuss, sparclinux, crn,
microblaze-uclinux, wmb, linuxppc-dev
In-Reply-To: <20091007.213915.37481688.davem@davemloft.net>
On Wed, 7 Oct 2009, David Miller wrote:
> From: Chris Newport <crn@netunix.com>
> Date: Thu, 8 Oct 2009 02:29:25 +0100 (BST)
>
>> Sun4d has never had SMP support and
>
> Wrong.
>
>> this is apparantly problematic due to Cray interlectual property
>> causing a lack of bus documentation.
>
> XBUS documentation is not available, but we fully know how to
> program the SBUS interrupt controller and whatnot. It's all
> there in the sun4d interrupt and SMP support and it did work
> just fine at one point.
>
> Amusingly the SBUS interrupt stuff on sun4d is a very close
> sibling to the IMAP/ICLR scheme used on sun4u.
>
The Sun4d SMP support exists, but is broken in 2.6.
And the UP support is buggy. At least on my test-setup, the
kernel is unable to load userland from scsi-drives.
Dropping the sun4 32-bit SPARC is a bit counterproductive when
considering the recent effort to unify the sparc64 and sparc
arch-branches?
^ permalink raw reply
* Re: [PATCH 2/2][v2] powerpc: Make the CMM memory hotplug aware
From: Robert Jennings @ 2009-10-08 13:13 UTC (permalink / raw)
To: gerald.schaefer
Cc: linux-mm, Mel Gorman, linux-kernel, linuxppc-dev,
Martin Schwidefsky, Badari Pulavarty, Brian King, Paul Mackerras,
Ingo Molnar
In-Reply-To: <4ACDD71D.30809@linux.vnet.ibm.com>
* Gerald Schaefer (geralds@linux.vnet.ibm.com) wrote:
> Hi,
>
> I am currently working on the s390 port for the cmm + hotplug
> patch, and I'm a little confused about the memory allocation
> policy, see below. Is it correct that the balloon cannot grow
> into ZONE_MOVABLE, while the pages for the balloon page list
> can?
>
> Robert Jennings wrote:
>> @@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
>> cmm_dbg("Begin request for %ld pages\n", nr);
>>
>> while (nr) {
>> + if (atomic_read(&hotplug_active))
>> + break;
>> +
>> addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
>> __GFP_NORETRY | __GFP_NOMEMALLOC);
>> if (!addr)
>> @@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
>> if (!pa || pa->index >= CMM_NR_PAGES) {
>> /* Need a new page for the page list. */
>> spin_unlock(&cmm_lock);
>> - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
>> - __GFP_NORETRY | __GFP_NOMEMALLOC);
>> + npa = (struct cmm_page_array *)__get_free_page(
>> + GFP_NOIO | __GFP_NOWARN |
>> + __GFP_NORETRY | __GFP_NOMEMALLOC |
>> + __GFP_MOVABLE);
>> if (!npa) {
>> pr_info("%s: Can not allocate new page list\n", __func__);
>> free_page(addr);
>
> Why is the __GFP_MOVABLE added here, for the page list alloc, and not
> above for the balloon page alloc?
The pages allocated as __GFP_MOVABLE are used to store the list of pages
allocated by the balloon. They reference virtual addresses and it would
be fine for the kernel to migrate the physical pages for those, the
balloon would not notice this.
The pages loaned by the balloon are not allocated with __GFP_MOVABLE
because we will inform the hypervisor which page has been loaned by
Linux according to the physical address. Migration of those physical
pages would invalidate the loan, so we do not mark them as movable.
Regards,
Robert Jennings
^ permalink raw reply
* Re: [v8 PATCH 2/8]: cpuidle: implement a list based approach to register a set of idle routines.
From: Vaidyanathan Srinivasan @ 2009-10-08 13:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-arch, linux-kernel, linux-acpi, arun, Ingo Molnar,
linuxppc-dev, Arjan van de Ven
In-Reply-To: <1255004737.26976.307.camel@twins>
* Peter Zijlstra <a.p.zijlstra@chello.nl> [2009-10-08 14:25:37]:
> On Thu, 2009-10-08 at 17:31 +0530, Arun R Bharadwaj wrote:
> >
> > > Uhm, no, it would mean ACPI putting its idle routines on the same level
> > > as all others.
> > >
> >
> > Putting them all on the same level would mean, we need an
> > enable/disable routine to enable only the currently active routines.
>
> What's this enable/disable stuff about?
Don't we need an explicit 'don't use this routine' apart from having
the weights based on power consumption and latency. In multiple
registration the assumption is that the top most 'set' has all
necessary routines and we do not need any other idle routines for
optimum operation.
Otherwise the governor has to select from larger 'set' which could
have redundant or conflicting idle routines.
For example we now have c1e_idle to start with and then a set of ACPI
C1, C2, C3 routines. The expectation now is that once we have the
ACPI's routines, we do not need the previous used c1e_idle because
this new set is self contained and picking one from this set based on
latency is good for power savings.
> > Also, the way governor works is that, it assumes that idle routines
> > are indexed in the increasing order of power benefit that can be got
> > out of the state. So this would get messed up.
>
> Right, which is why I initially had a power-savings field in my
> proposal, so it could weight the power savings vs the wakeup latency.
>
> http://lkml.org/lkml/2009/8/27/159
This proposal that you had suggested is being used for the 'set' of
idle routines. The patch changes the idle routines as 'sets' and does
not mix use of routines between two registrations.
> There it was said that was exactly what these governors were doing,
> seems its not.
Governors select from a set of idle routines based on latency. But
there is a possibility that any of the routines in the set can be
selected.
> > > Sounds like something is wrong alright. If you can register an idle
> > > routine you should be able to unregister it too.
> > >
> >
> > Yes, we can register and unregister in a clean way now.
> > Consider this. We have a set of routines A, B, C currently registered.
> > Now a module comes and registers D and E, and later on at some point
> > of time wants to unregister. So how do you keep track of what all idle
> > routines the module registered and unregister only those?
> > Best way to do that is a stack, which is how I have currently
> > implemented.
>
> Right, so destroy that inner set thing, that way we only have one
> left ;-)
If un-registration is not needed, then this framework can easily
override the current set with the new one and not worry about the set
of sets.
Ideally, during system boot, we could wait until we know enough
information about idle states and then have a single registration.
The boot process can be in poll-idle until this decision happens.
Like in x86, we can register either c1e_idle or ACPI's routines at
a stage where we know for sure if ACPI is enabled or not.
--Vaidy
^ permalink raw reply
* [PATCH] powerpc/4xx: Add 16K FIFO size DTS entries on supported platforms
From: Dave Mitchell @ 2009-10-08 12:50 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Dave Mitchell
In-Reply-To: <1253079117-833-1-git-send-email-dmitchell@appliedmicro.com>
Adding tx/rx-fifo-size-gige to EMAC fields for evaluation kit DTS
files where appropriate.
Signed-off-by: Dave Mitchell <dmitchell@appliedmicro.com>
Acked-by: Prodyut Hazarika <phazarika@appliedmicro.com>
Acked-by: Victor Gallardo <vgallardo@appliedmicro.com>
Acked-by: Loc Ho <lho@appliedmicro.com>
---
arch/powerpc/boot/dts/canyonlands.dts | 2 ++
arch/powerpc/boot/dts/eiger.dts | 6 ++++++
arch/powerpc/boot/dts/glacier.dts | 6 ++++++
arch/powerpc/boot/dts/haleakala.dts | 2 ++
arch/powerpc/boot/dts/kilauea.dts | 4 ++++
arch/powerpc/boot/dts/makalu.dts | 4 ++++
arch/powerpc/boot/dts/redwood.dts | 1 +
7 files changed, 25 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/boot/dts/canyonlands.dts b/arch/powerpc/boot/dts/canyonlands.dts
index c920170..cd56bb5 100644
--- a/arch/powerpc/boot/dts/canyonlands.dts
+++ b/arch/powerpc/boot/dts/canyonlands.dts
@@ -352,6 +352,7 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
@@ -381,6 +382,7 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
diff --git a/arch/powerpc/boot/dts/eiger.dts b/arch/powerpc/boot/dts/eiger.dts
index c4a934f..48bcf71 100644
--- a/arch/powerpc/boot/dts/eiger.dts
+++ b/arch/powerpc/boot/dts/eiger.dts
@@ -316,6 +316,7 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
@@ -345,6 +346,7 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
@@ -375,6 +377,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>; /* emac2&3 only */
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII1>;
@@ -403,6 +407,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>; /* emac2&3 only */
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII1>;
diff --git a/arch/powerpc/boot/dts/glacier.dts b/arch/powerpc/boot/dts/glacier.dts
index f3787a2..f6f6189 100644
--- a/arch/powerpc/boot/dts/glacier.dts
+++ b/arch/powerpc/boot/dts/glacier.dts
@@ -292,6 +292,7 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
@@ -321,6 +322,7 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
@@ -351,6 +353,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>; /* emac2&3 only */
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII1>;
@@ -379,6 +383,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>; /* emac2&3 only */
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII1>;
diff --git a/arch/powerpc/boot/dts/haleakala.dts b/arch/powerpc/boot/dts/haleakala.dts
index 5b2a494..2b25669 100644
--- a/arch/powerpc/boot/dts/haleakala.dts
+++ b/arch/powerpc/boot/dts/haleakala.dts
@@ -226,6 +226,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
diff --git a/arch/powerpc/boot/dts/kilauea.dts b/arch/powerpc/boot/dts/kilauea.dts
index c465614..083e68e 100644
--- a/arch/powerpc/boot/dts/kilauea.dts
+++ b/arch/powerpc/boot/dts/kilauea.dts
@@ -272,6 +272,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
@@ -300,6 +302,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
diff --git a/arch/powerpc/boot/dts/makalu.dts b/arch/powerpc/boot/dts/makalu.dts
index ffc246e..63d48b6 100644
--- a/arch/powerpc/boot/dts/makalu.dts
+++ b/arch/powerpc/boot/dts/makalu.dts
@@ -227,6 +227,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x0000003f>; /* Start at 6 */
rgmii-device = <&RGMII0>;
@@ -255,6 +257,8 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
+ tx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
diff --git a/arch/powerpc/boot/dts/redwood.dts b/arch/powerpc/boot/dts/redwood.dts
index ad402c4..d2af32e 100644
--- a/arch/powerpc/boot/dts/redwood.dts
+++ b/arch/powerpc/boot/dts/redwood.dts
@@ -226,6 +226,7 @@
max-frame-size = <9000>;
rx-fifo-size = <4096>;
tx-fifo-size = <2048>;
+ rx-fifo-size-gige = <16384>;
phy-mode = "rgmii";
phy-map = <0x00000000>;
rgmii-device = <&RGMII0>;
--
1.6.3.2
^ permalink raw reply related
* [PATCH] ibm_newemac: Added 16K Tx FIFO size support for EMAC4
From: Dave Mitchell @ 2009-10-08 12:50 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Dave Mitchell
Some of the EMAC V4 implementations support 16K Tx FIFOs. This
patch adds support for this functionality and fixes typos in the
Tx FIFO size error messages.
Signed-off-by: Dave Mitchell <dmitchell@appliedmicro.com>
Acked-by: Prodyut Hazarika <phazarika@appliedmicro.com>
Acked-by: Victor Gallardo <vgallardo@appliedmicro.com>
Acked-by: Loc Ho <lho@appliedmicro.com>
---
drivers/net/ibm_newemac/core.c | 7 +++++--
drivers/net/ibm_newemac/emac.h | 1 +
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index 89c82c5..c6591cb 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -443,7 +443,7 @@ static u32 __emac_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_s
ret |= EMAC_MR1_TFS_2K;
break;
default:
- printk(KERN_WARNING "%s: Unknown Rx FIFO size %d\n",
+ printk(KERN_WARNING "%s: Unknown Tx FIFO size %d\n",
dev->ndev->name, tx_size);
}
@@ -470,6 +470,9 @@ static u32 __emac4_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_
DBG2(dev, "__emac4_calc_base_mr1" NL);
switch(tx_size) {
+ case 16384:
+ ret |= EMAC4_MR1_TFS_16K;
+ break;
case 4096:
ret |= EMAC4_MR1_TFS_4K;
break;
@@ -477,7 +480,7 @@ static u32 __emac4_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_
ret |= EMAC4_MR1_TFS_2K;
break;
default:
- printk(KERN_WARNING "%s: Unknown Rx FIFO size %d\n",
+ printk(KERN_WARNING "%s: Unknown Tx FIFO size %d\n",
dev->ndev->name, tx_size);
}
diff --git a/drivers/net/ibm_newemac/emac.h b/drivers/net/ibm_newemac/emac.h
index 0afc2cf..d34adf9 100644
--- a/drivers/net/ibm_newemac/emac.h
+++ b/drivers/net/ibm_newemac/emac.h
@@ -153,6 +153,7 @@ struct emac_regs {
#define EMAC4_MR1_RFS_16K 0x00280000
#define EMAC4_MR1_TFS_2K 0x00020000
#define EMAC4_MR1_TFS_4K 0x00030000
+#define EMAC4_MR1_TFS_16K 0x00050000
#define EMAC4_MR1_TR 0x00008000
#define EMAC4_MR1_MWSW_001 0x00001000
#define EMAC4_MR1_JPSM 0x00000800
--
1.6.3.2
^ permalink raw reply related
* Re: [v8 PATCH 2/8]: cpuidle: implement a list based approach to register a set of idle routines.
From: Peter Zijlstra @ 2009-10-08 12:25 UTC (permalink / raw)
To: arun
Cc: linux-arch, linux-kernel, linux-acpi, Ingo Molnar, linuxppc-dev,
Arjan van de Ven
In-Reply-To: <20091008120120.GL20595@linux.vnet.ibm.com>
On Thu, 2009-10-08 at 17:31 +0530, Arun R Bharadwaj wrote:
>
> > Uhm, no, it would mean ACPI putting its idle routines on the same level
> > as all others.
> >
>
> Putting them all on the same level would mean, we need an
> enable/disable routine to enable only the currently active routines.
What's this enable/disable stuff about?
> Also, the way governor works is that, it assumes that idle routines
> are indexed in the increasing order of power benefit that can be got
> out of the state. So this would get messed up.
Right, which is why I initially had a power-savings field in my
proposal, so it could weight the power savings vs the wakeup latency.
http://lkml.org/lkml/2009/8/27/159
There it was said that was exactly what these governors were doing,
seems its not.
> > Sounds like something is wrong alright. If you can register an idle
> > routine you should be able to unregister it too.
> >
>
> Yes, we can register and unregister in a clean way now.
> Consider this. We have a set of routines A, B, C currently registered.
> Now a module comes and registers D and E, and later on at some point
> of time wants to unregister. So how do you keep track of what all idle
> routines the module registered and unregister only those?
> Best way to do that is a stack, which is how I have currently
> implemented.
Right, so destroy that inner set thing, that way we only have one
left ;-)
^ permalink raw reply
* [PATCH] net: Fix OF platform drivers coldplug/hotplug when compiled as modules
From: Anton Vorontsov @ 2009-10-08 12:15 UTC (permalink / raw)
To: David Miller; +Cc: linuxppc-dev, netdev
Some OF platform drivers are missing module device tables, so they won't
load automatically on boot. This patch fixes the issue by adding proper
MODULE_DEVICE_TABLE() macros to the drivers.
Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
---
drivers/net/can/sja1000/sja1000_of_platform.c | 1 +
drivers/net/fec_mpc52xx_phy.c | 1 +
drivers/net/fs_enet/fs_enet-main.c | 1 +
drivers/net/fs_enet/mii-bitbang.c | 1 +
drivers/net/fs_enet/mii-fec.c | 1 +
drivers/net/fsl_pq_mdio.c | 1 +
drivers/net/gianfar.c | 4 +---
drivers/net/ibm_newemac/core.c | 2 ++
drivers/net/phy/mdio-gpio.c | 1 +
9 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/net/can/sja1000/sja1000_of_platform.c b/drivers/net/can/sja1000/sja1000_of_platform.c
index 3373560..9dd076a 100644
--- a/drivers/net/can/sja1000/sja1000_of_platform.c
+++ b/drivers/net/can/sja1000/sja1000_of_platform.c
@@ -213,6 +213,7 @@ static struct of_device_id __devinitdata sja1000_ofp_table[] = {
{.compatible = "nxp,sja1000"},
{},
};
+MODULE_DEVICE_TABLE(of, sja1000_ofp_table);
static struct of_platform_driver sja1000_ofp_driver = {
.owner = THIS_MODULE,
diff --git a/drivers/net/fec_mpc52xx_phy.c b/drivers/net/fec_mpc52xx_phy.c
index 31e6d62..ee0f3c6 100644
--- a/drivers/net/fec_mpc52xx_phy.c
+++ b/drivers/net/fec_mpc52xx_phy.c
@@ -155,6 +155,7 @@ static struct of_device_id mpc52xx_fec_mdio_match[] = {
{ .compatible = "mpc5200b-fec-phy", },
{}
};
+MODULE_DEVICE_TABLE(of, mpc52xx_fec_mdio_match);
struct of_platform_driver mpc52xx_fec_mdio_driver = {
.name = "mpc5200b-fec-phy",
diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c
index 2bc2d2b..ec2f503 100644
--- a/drivers/net/fs_enet/fs_enet-main.c
+++ b/drivers/net/fs_enet/fs_enet-main.c
@@ -1110,6 +1110,7 @@ static struct of_device_id fs_enet_match[] = {
#endif
{}
};
+MODULE_DEVICE_TABLE(of, fs_enet_match);
static struct of_platform_driver fs_enet_driver = {
.name = "fs_enet",
diff --git a/drivers/net/fs_enet/mii-bitbang.c b/drivers/net/fs_enet/mii-bitbang.c
index 93b481b..24ff9f4 100644
--- a/drivers/net/fs_enet/mii-bitbang.c
+++ b/drivers/net/fs_enet/mii-bitbang.c
@@ -221,6 +221,7 @@ static struct of_device_id fs_enet_mdio_bb_match[] = {
},
{},
};
+MODULE_DEVICE_TABLE(of, fs_enet_mdio_bb_match);
static struct of_platform_driver fs_enet_bb_mdio_driver = {
.name = "fsl-bb-mdio",
diff --git a/drivers/net/fs_enet/mii-fec.c b/drivers/net/fs_enet/mii-fec.c
index a2d69c1..96eba42 100644
--- a/drivers/net/fs_enet/mii-fec.c
+++ b/drivers/net/fs_enet/mii-fec.c
@@ -219,6 +219,7 @@ static struct of_device_id fs_enet_mdio_fec_match[] = {
#endif
{},
};
+MODULE_DEVICE_TABLE(of, fs_enet_mdio_fec_match);
static struct of_platform_driver fs_enet_fec_mdio_driver = {
.name = "fsl-fec-mdio",
diff --git a/drivers/net/fsl_pq_mdio.c b/drivers/net/fsl_pq_mdio.c
index d167090..6ac4648 100644
--- a/drivers/net/fsl_pq_mdio.c
+++ b/drivers/net/fsl_pq_mdio.c
@@ -407,6 +407,7 @@ static struct of_device_id fsl_pq_mdio_match[] = {
},
{},
};
+MODULE_DEVICE_TABLE(of, fsl_pq_mdio_match);
static struct of_platform_driver fsl_pq_mdio_driver = {
.name = "fsl-pq_mdio",
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 1e5289f..5bf31f1 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -2325,9 +2325,6 @@ static irqreturn_t gfar_error(int irq, void *dev_id)
return IRQ_HANDLED;
}
-/* work with hotplug and coldplug */
-MODULE_ALIAS("platform:fsl-gianfar");
-
static struct of_device_id gfar_match[] =
{
{
@@ -2336,6 +2333,7 @@ static struct of_device_id gfar_match[] =
},
{},
};
+MODULE_DEVICE_TABLE(of, gfar_match);
/* Structure for a device driver */
static struct of_platform_driver gfar_driver = {
diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index 89c82c5..4baa37c 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -24,6 +24,7 @@
*
*/
+#include <linux/module.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
@@ -2985,6 +2986,7 @@ static struct of_device_id emac_match[] =
},
{},
};
+MODULE_DEVICE_TABLE(of, emac_match);
static struct of_platform_driver emac_driver = {
.name = "emac",
diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 250e10f..8659d34 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -238,6 +238,7 @@ static struct of_device_id mdio_ofgpio_match[] = {
},
{},
};
+MODULE_DEVICE_TABLE(of, mdio_ofgpio_match);
static struct of_platform_driver mdio_ofgpio_driver = {
.name = "mdio-gpio",
--
1.6.3.3
^ permalink raw reply related
* Re: [PATCH 2/2][v2] powerpc: Make the CMM memory hotplug aware
From: Gerald Schaefer @ 2009-10-08 12:12 UTC (permalink / raw)
To: Robert Jennings
Cc: linux-mm, Mel Gorman, linux-kernel, linuxppc-dev,
Martin Schwidefsky, Badari Pulavarty, Brian King, Paul Mackerras,
Ingo Molnar
In-Reply-To: <20091002185248.GD4908@austin.ibm.com>
Hi,
I am currently working on the s390 port for the cmm + hotplug
patch, and I'm a little confused about the memory allocation
policy, see below. Is it correct that the balloon cannot grow
into ZONE_MOVABLE, while the pages for the balloon page list
can?
Robert Jennings wrote:
> @@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
> cmm_dbg("Begin request for %ld pages\n", nr);
>
> while (nr) {
> + if (atomic_read(&hotplug_active))
> + break;
> +
> addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
> __GFP_NORETRY | __GFP_NOMEMALLOC);
> if (!addr)
> @@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
> if (!pa || pa->index >= CMM_NR_PAGES) {
> /* Need a new page for the page list. */
> spin_unlock(&cmm_lock);
> - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
> - __GFP_NORETRY | __GFP_NOMEMALLOC);
> + npa = (struct cmm_page_array *)__get_free_page(
> + GFP_NOIO | __GFP_NOWARN |
> + __GFP_NORETRY | __GFP_NOMEMALLOC |
> + __GFP_MOVABLE);
> if (!npa) {
> pr_info("%s: Can not allocate new page list\n", __func__);
> free_page(addr);
Why is the __GFP_MOVABLE added here, for the page list alloc, and not
above for the balloon page alloc?
--
Regards,
Gerald
^ permalink raw reply
* Re: [v8 PATCH 2/8]: cpuidle: implement a list based approach to register a set of idle routines.
From: Arun R Bharadwaj @ 2009-10-08 12:01 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-arch, linux-kernel, linux-acpi, Arun Bharadwaj, Ingo Molnar,
linuxppc-dev, Arjan van de Ven
In-Reply-To: <1255001110.26976.292.camel@twins>
* Peter Zijlstra <a.p.zijlstra@chello.nl> [2009-10-08 13:25:10]:
> On Thu, 2009-10-08 at 16:31 +0530, Arun R Bharadwaj wrote:
> > * Peter Zijlstra <a.p.zijlstra@chello.nl> [2009-10-08 12:50:33]:
> >
> > > On Thu, 2009-10-08 at 16:12 +0530, Arun R Bharadwaj wrote:
> > > >
> > > > > So cpuidle didn't already have a list of idle functions it takes an
> > > > > appropriate one from?
> > > > >
> > > >
> > > > No.. As of now, cpuidle supported only one _set_ of idle states that
> > > > can be registered. So in this one set, it would choose the appropriate
> > > > idle state. But this list mechanism(actually a stack) allows for
> > > > multiple sets.
> > > >
> > > > This is needed because we have a hierarchy of idle states discovery
> > > > in x86. First, select_idle_routine() would select poll/mwait/default/c1e.
> > > > It doesn't know of the existence of ACPI. Later when ACPI comes up,
> > > > it registers a set of routines on top of the earlier set.
> > > >
> > > > > Then what does this governor do?
> > > > >
> > > >
> > > > The governor would only select the best idle state available from the
> > > > set of states which is at the top of the stack. (In the above case, it
> > > > would only consider the states registered by ACPI).
> > > >
> > > > If the top-of-the-stack set of idle states is unregistered, the next
> > > > set of states on the stack are considered.
> > > >
> > > > > Also, does this imply the governor doesn't consider these idle routines?
> > > > >
> > > >
> > > > As i said above, governor would only consider the idle routines which
> > > > are at the top of the stack.
> > > >
> > > > Hope this gave a better idea..
> > >
> > > So does it make sense to have a set of sets?
> > >
> > > Why not integrate them all into one set to be ruled by this governor
> > > thing?
> > >
> >
> > Right now there is a clean hierarchy. So breaking that would mean
> > putting the registration of all idle routines under ACPI.
>
> Uhm, no, it would mean ACPI putting its idle routines on the same level
> as all others.
>
Putting them all on the same level would mean, we need an
enable/disable routine to enable only the currently active routines.
Also, the way governor works is that, it assumes that idle routines
are indexed in the increasing order of power benefit that can be got
out of the state. So this would get messed up.
> > So, if ACPI
> > fails to come up or if ACPI is not supported, that would lead to
> > problems.
>
> I think the problem is that ACPI is thinking its special, that should be
> rectified, its not.
>
> > Because if that happens now, we can fallback to the
> > initially registered set.
>
> I'm thinking its all daft and we should be having one set of idle
> routines, if ACPI fails (a tautology if ever there was one) we simply
> wouldn't have its idle routines to pick from.
>
> > Also, if a module wants to register a set of routines later on, that
> > cant be added to the initially registered set. So i think we need this
> > set of sets.
>
> Sounds like something is wrong alright. If you can register an idle
> routine you should be able to unregister it too.
>
Yes, we can register and unregister in a clean way now.
Consider this. We have a set of routines A, B, C currently registered.
Now a module comes and registers D and E, and later on at some point
of time wants to unregister. So how do you keep track of what all idle
routines the module registered and unregister only those?
Best way to do that is a stack, which is how I have currently
implemented.
> What about making ACPI register its idle routines too, 1 for each C
> state, and have the governor make a selection out of the full set?
>
> That also allows you to do away with this default_idle() nonsense and
> simply panic the box when there are no registered idle routines when the
> box wants to go idle.
>
^ permalink raw reply
* Re: [v8 PATCH 2/8]: cpuidle: implement a list based approach to register a set of idle routines.
From: Peter Zijlstra @ 2009-10-08 11:25 UTC (permalink / raw)
To: arun
Cc: linux-arch, linux-kernel, linux-acpi, Ingo Molnar, linuxppc-dev,
Arjan van de Ven
In-Reply-To: <20091008110106.GK20595@linux.vnet.ibm.com>
On Thu, 2009-10-08 at 16:31 +0530, Arun R Bharadwaj wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> [2009-10-08 12:50:33]:
>
> > On Thu, 2009-10-08 at 16:12 +0530, Arun R Bharadwaj wrote:
> > >
> > > > So cpuidle didn't already have a list of idle functions it takes an
> > > > appropriate one from?
> > > >
> > >
> > > No.. As of now, cpuidle supported only one _set_ of idle states that
> > > can be registered. So in this one set, it would choose the appropriate
> > > idle state. But this list mechanism(actually a stack) allows for
> > > multiple sets.
> > >
> > > This is needed because we have a hierarchy of idle states discovery
> > > in x86. First, select_idle_routine() would select poll/mwait/default/c1e.
> > > It doesn't know of the existence of ACPI. Later when ACPI comes up,
> > > it registers a set of routines on top of the earlier set.
> > >
> > > > Then what does this governor do?
> > > >
> > >
> > > The governor would only select the best idle state available from the
> > > set of states which is at the top of the stack. (In the above case, it
> > > would only consider the states registered by ACPI).
> > >
> > > If the top-of-the-stack set of idle states is unregistered, the next
> > > set of states on the stack are considered.
> > >
> > > > Also, does this imply the governor doesn't consider these idle routines?
> > > >
> > >
> > > As i said above, governor would only consider the idle routines which
> > > are at the top of the stack.
> > >
> > > Hope this gave a better idea..
> >
> > So does it make sense to have a set of sets?
> >
> > Why not integrate them all into one set to be ruled by this governor
> > thing?
> >
>
> Right now there is a clean hierarchy. So breaking that would mean
> putting the registration of all idle routines under ACPI.
Uhm, no, it would mean ACPI putting its idle routines on the same level
as all others.
> So, if ACPI
> fails to come up or if ACPI is not supported, that would lead to
> problems.
I think the problem is that ACPI is thinking it's special; that should be
rectified, it's not.
> Because if that happens now, we can fallback to the
> initially registered set.
I'm thinking it's all daft and we should be having one set of idle
routines; if ACPI fails (a tautology if ever there was one) we simply
wouldn't have its idle routines to pick from.
> Also, if a module wants to register a set of routines later on, that
> cant be added to the initially registered set. So i think we need this
> set of sets.
Sounds like something is wrong alright. If you can register an idle
routine you should be able to unregister it too.
What about making ACPI register its idle routines too, 1 for each C
state, and have the governor make a selection out of the full set?
That also allows you to do away with this default_idle() nonsense and
simply panic the box when there are no registered idle routines when the
box wants to go idle.
^ permalink raw reply
* Re: Nested function in drivers/of/of_mdio.c
From: Gabriel Paubert @ 2009-10-08 11:14 UTC (permalink / raw)
To: Jérôme Pouiller
Cc: netdev, linuxppc-dev, Andy Fleming, David S. Miller
In-Reply-To: <200910081045.12590.j.pouiller@sysmic.fr>
On Thu, Oct 08, 2009 at 10:45:12AM +0200, Jérôme Pouiller wrote:
> I did some grep on codebase. I have not found any other instances of
> nested functions, but my regexps are not enough to be 100% sure.
From Documentation/CodingStyle, written by the Head Penguin himself:
"Heretic people all over the world have claimed that this inconsistency
is ... well ... inconsistent, but all right-thinking people know that
(a) K&R are _right_ and (b) K&R are right. Besides, functions are
special anyway (you can't nest them in C)."
^^^^^^^^^^^^^^^^^^^^^^^^
I interpret it as a clear prohibition of using nested functions
in the kernel.
Regards,
Gabriel
^ permalink raw reply
* Re: [v8 PATCH 2/8]: cpuidle: implement a list based approach to register a set of idle routines.
From: Arun R Bharadwaj @ 2009-10-08 11:01 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-arch, linux-kernel, linux-acpi, Arun Bharadwaj, Ingo Molnar,
linuxppc-dev, Arjan van de Ven
In-Reply-To: <1254999033.26976.272.camel@twins>
* Peter Zijlstra <a.p.zijlstra@chello.nl> [2009-10-08 12:50:33]:
> On Thu, 2009-10-08 at 16:12 +0530, Arun R Bharadwaj wrote:
> >
> > > So cpuidle didn't already have a list of idle functions it takes an
> > > appropriate one from?
> > >
> >
> > No.. As of now, cpuidle supported only one _set_ of idle states that
> > can be registered. So in this one set, it would choose the appropriate
> > idle state. But this list mechanism(actually a stack) allows for
> > multiple sets.
> >
> > This is needed because we have a hierarchy of idle states discovery
> > in x86. First, select_idle_routine() would select poll/mwait/default/c1e.
> > It doesn't know of the existence of ACPI. Later when ACPI comes up,
> > it registers a set of routines on top of the earlier set.
> >
> > > Then what does this governor do?
> > >
> >
> > The governor would only select the best idle state available from the
> > set of states which is at the top of the stack. (In the above case, it
> > would only consider the states registered by ACPI).
> >
> > If the top-of-the-stack set of idle states is unregistered, the next
> > set of states on the stack are considered.
> >
> > > Also, does this imply the governor doesn't consider these idle routines?
> > >
> >
> > As i said above, governor would only consider the idle routines which
> > are at the top of the stack.
> >
> > Hope this gave a better idea..
>
> So does it make sense to have a set of sets?
>
> Why not integrate them all into one set to be ruled by this governor
> thing?
>
Right now there is a clean hierarchy. So breaking that would mean
putting the registration of all idle routines under ACPI. So, if ACPI
fails to come up or if ACPI is not supported, that would lead to
problems. Because if that happens now, we can fallback to the
initially registered set.
Also, if a module wants to register a set of routines later on, that
can't be added to the initially registered set. So I think we need this
set of sets.
^ permalink raw reply
* Re: [v8 PATCH 2/8]: cpuidle: implement a list based approach to register a set of idle routines.
From: Peter Zijlstra @ 2009-10-08 10:50 UTC (permalink / raw)
To: arun
Cc: linux-arch, linux-kernel, linux-acpi, Ingo Molnar, linuxppc-dev,
Arjan van de Ven
In-Reply-To: <20091008104249.GJ20595@linux.vnet.ibm.com>
On Thu, 2009-10-08 at 16:12 +0530, Arun R Bharadwaj wrote:
>
> > So cpuidle didn't already have a list of idle functions it takes an
> > appropriate one from?
> >
>
> No.. As of now, cpuidle supported only one _set_ of idle states that
> can be registered. So in this one set, it would choose the appropriate
> idle state. But this list mechanism(actually a stack) allows for
> multiple sets.
>
> This is needed because we have a hierarchy of idle states discovery
> in x86. First, select_idle_routine() would select poll/mwait/default/c1e.
> It doesn't know of the existence of ACPI. Later when ACPI comes up,
> it registers a set of routines on top of the earlier set.
>
> > Then what does this governor do?
> >
>
> The governor would only select the best idle state available from the
> set of states which is at the top of the stack. (In the above case, it
> would only consider the states registered by ACPI).
>
> If the top-of-the-stack set of idle states is unregistered, the next
> set of states on the stack are considered.
>
> > Also, does this imply the governor doesn't consider these idle routines?
> >
>
> As i said above, governor would only consider the idle routines which
> are at the top of the stack.
>
> Hope this gave a better idea..
So does it make sense to have a set of sets?
Why not integrate them all into one set to be ruled by this governor
thing?
^ permalink raw reply
* Re: [v8 PATCH 2/8]: cpuidle: implement a list based approach to register a set of idle routines.
From: Arun R Bharadwaj @ 2009-10-08 10:42 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-arch, linux-kernel, linux-acpi, Arun Bharadwaj, Ingo Molnar,
linuxppc-dev, Arjan van de Ven
In-Reply-To: <1254998162.26976.270.camel@twins>
* Peter Zijlstra <a.p.zijlstra@chello.nl> [2009-10-08 12:36:02]:
> On Thu, 2009-10-08 at 15:20 +0530, Arun R Bharadwaj wrote:
> > * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-10-08 15:18:28]:
> >
> > Implement a list based registering mechanism for architectures which
> > have multiple sets of idle routines which are to be registered.
> >
> > Currently, in x86 it is done by merely setting pm_idle = idle_routine
> > and managing this pm_idle pointer is messy.
> >
> > To give an example of how this mechanism works:
> > In x86, initially, idle routine is selected from the set of poll/mwait/
> > c1e/default idle loops. So the selected idle loop is registered in cpuidle
> > as one idle state cpuidle devices. Once ACPI comes up, it registers
> > another set of idle states on top of this state. Again, suppose a module
> > registers another set of idle loops, it is added to this list.
> >
> > This provides a clean way of registering and unregistering idle state
> > routines.
>
> So cpuidle didn't already have a list of idle functions it takes an
> appropriate one from?
>
No.. As of now, cpuidle supported only one _set_ of idle states that
can be registered. So in this one set, it would choose the appropriate
idle state. But this list mechanism(actually a stack) allows for
multiple sets.
This is needed because we have a hierarchy of idle states discovery
in x86. First, select_idle_routine() would select poll/mwait/default/c1e.
It doesn't know of the existence of ACPI. Later when ACPI comes up,
it registers a set of routines on top of the earlier set.
> Then what does this governor do?
>
The governor would only select the best idle state available from the
set of states which is at the top of the stack. (In the above case, it
would only consider the states registered by ACPI).
If the top-of-the-stack set of idle states is unregistered, the next
set of states on the stack are considered.
> Also, does this imply the governor doesn't consider these idle routines?
>
As I said above, the governor would only consider the idle routines which
are at the top of the stack.
Hope this gave a better idea..
^ permalink raw reply
* Re: [v8 PATCH 2/8]: cpuidle: implement a list based approach to register a set of idle routines.
From: Peter Zijlstra @ 2009-10-08 10:36 UTC (permalink / raw)
To: arun
Cc: linux-arch, linux-kernel, linux-acpi, Ingo Molnar, linuxppc-dev,
Arjan van de Ven
In-Reply-To: <20091008095027.GC20595@linux.vnet.ibm.com>
On Thu, 2009-10-08 at 15:20 +0530, Arun R Bharadwaj wrote:
> * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-10-08 15:18:28]:
>
> Implement a list based registering mechanism for architectures which
> have multiple sets of idle routines which are to be registered.
>
> Currently, in x86 it is done by merely setting pm_idle = idle_routine
> and managing this pm_idle pointer is messy.
>
> To give an example of how this mechanism works:
> In x86, initially, idle routine is selected from the set of poll/mwait/
> c1e/default idle loops. So the selected idle loop is registered in cpuidle
> as one idle state cpuidle devices. Once ACPI comes up, it registers
> another set of idle states on top of this state. Again, suppose a module
> registers another set of idle loops, it is added to this list.
>
> This provides a clean way of registering and unregistering idle state
> routines.
So cpuidle didn't already have a list of idle functions it takes an
appropriate one from?
Then what does this governor do?
Also, does this imply the governor doesn't consider these idle routines?
^ permalink raw reply
* [v8 PATCH 8/8]: POWER: Enable default_idle when power_save=off.
From: Arun R Bharadwaj @ 2009-10-08 9:56 UTC (permalink / raw)
To: Peter Zijlstra, Benjamin Herrenschmidt, Ingo Molnar,
Vaidyanathan Srinivasan, Dipankar Sarma, Balbir Singh,
Arjan van de Ven, Arun Bharadwaj
Cc: linux-arch, linux-acpi, linuxppc-dev, linux-kernel
In-Reply-To: <20091008094828.GA20595@linux.vnet.ibm.com>
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-10-08 15:18:28]:
This patch enables default_idle when power_save=off kernel boot
option is specified.
Earlier, this was done by setting ppc_md.power_save = NULL, so that
HMT_low() and HMT_very_low() were called. Now that behaviour lives in
default_idle(), so setting boot_option_idle_override = 1 skips the
cpuidle registration and default_idle is chosen in cpuidle_idle_call.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/processor.h | 2 ++
arch/powerpc/kernel/idle.c | 4 +++-
arch/powerpc/platforms/pseries/processor_idle.c | 5 +++++
3 files changed, 10 insertions(+), 1 deletion(-)
Index: linux.trees.git/arch/powerpc/include/asm/processor.h
===================================================================
--- linux.trees.git.orig/arch/powerpc/include/asm/processor.h
+++ linux.trees.git/arch/powerpc/include/asm/processor.h
@@ -332,6 +332,8 @@ static inline unsigned long get_clean_sp
}
#endif
+extern int boot_option_idle_override;
+
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_PROCESSOR_H */
Index: linux.trees.git/arch/powerpc/kernel/idle.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/kernel/idle.c
+++ linux.trees.git/arch/powerpc/kernel/idle.c
@@ -40,9 +40,11 @@
#define cpu_should_die() 0
#endif
+int boot_option_idle_override = 0;
+
static int __init powersave_off(char *arg)
{
- ppc_md.power_save = NULL;
+ boot_option_idle_override = 1;
return 0;
}
__setup("powersave=off", powersave_off);
Index: linux.trees.git/arch/powerpc/platforms/pseries/processor_idle.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/processor_idle.c
+++ linux.trees.git/arch/powerpc/platforms/pseries/processor_idle.c
@@ -185,6 +185,11 @@ static int __init pseries_processor_idle
int cpu;
int result;
+ if (boot_option_idle_override) {
+ printk(KERN_DEBUG "Using default idle\n");
+ return 0;
+ }
+
result = cpuidle_register_driver(&pseries_idle_driver);
if (result < 0)
^ permalink raw reply
* [v8 PATCH 7/8]: pSeries: implement pSeries processor idle module.
From: Arun R Bharadwaj @ 2009-10-08 9:54 UTC (permalink / raw)
To: Peter Zijlstra, Benjamin Herrenschmidt, Ingo Molnar,
Vaidyanathan Srinivasan, Dipankar Sarma, Balbir Singh,
Arjan van de Ven, Arun Bharadwaj
Cc: linux-arch, linux-acpi, linuxppc-dev, linux-kernel
In-Reply-To: <20091008094828.GA20595@linux.vnet.ibm.com>
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-10-08 15:18:28]:
This patch creates arch/powerpc/platforms/pseries/processor_idle.c,
which implements the cpuidle infrastructure for pseries.
It implements pseries_cpuidle_loop(), which would be the main idle loop
called from cpu_idle(). It enters either dedicated_snooze_loop or
dedicated_cede_loop on a dedicated LPAR, or shared_cede_loop on a
shared-processor LPAR, based on the state chosen by the cpuidle
governor.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/system.h | 1
arch/powerpc/kernel/sysfs.c | 2
arch/powerpc/platforms/pseries/Makefile | 1
arch/powerpc/platforms/pseries/processor_idle.c | 210 ++++++++++++++++++++++++
arch/powerpc/platforms/pseries/pseries.h | 8
5 files changed, 222 insertions(+)
Index: linux.trees.git/arch/powerpc/platforms/pseries/Makefile
===================================================================
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/Makefile
+++ linux.trees.git/arch/powerpc/platforms/pseries/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_HCALL_STATS) += hvCall_inst
obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_DTL) += dtl.o
+obj-$(CONFIG_PSERIES_PROCESSOR_IDLE) += processor_idle.o
Index: linux.trees.git/arch/powerpc/platforms/pseries/pseries.h
===================================================================
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/pseries.h
+++ linux.trees.git/arch/powerpc/platforms/pseries/pseries.h
@@ -10,6 +10,8 @@
#ifndef _PSERIES_PSERIES_H
#define _PSERIES_PSERIES_H
+#include <linux/cpuidle.h>
+
extern void __init fw_feature_init(const char *hypertas, unsigned long len);
struct pt_regs;
@@ -40,4 +42,10 @@ extern unsigned long rtas_poweron_auto;
extern void find_udbg_vterm(void);
+DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
+
+#ifdef CONFIG_PSERIES_PROCESSOR_IDLE
+extern struct cpuidle_driver pseries_idle_driver;
+#endif
+
#endif /* _PSERIES_PSERIES_H */
Index: linux.trees.git/arch/powerpc/platforms/pseries/processor_idle.c
===================================================================
--- /dev/null
+++ linux.trees.git/arch/powerpc/platforms/pseries/processor_idle.c
@@ -0,0 +1,210 @@
+/*
+ * processor_idle - idle state cpuidle driver.
+ * Adapted from drivers/acpi/processor_idle.c
+ *
+ * Arun R Bharadwaj <arun@linux.vnet.ibm.com>
+ *
+ * Copyright (C) 2009 IBM Corporation.
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+
+#include <asm/paca.h>
+#include <asm/reg.h>
+#include <asm/system.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+MODULE_AUTHOR("Arun R Bharadwaj");
+MODULE_DESCRIPTION("pSeries Idle State Driver");
+MODULE_LICENSE("GPL");
+
+struct cpuidle_driver pseries_idle_driver = {
+ .name = "pseries_idle",
+ .owner = THIS_MODULE,
+};
+
+DEFINE_PER_CPU(struct cpuidle_device, pseries_dev);
+
+#define IDLE_STATE_COUNT 2
+
+/* pSeries Idle state Flags */
+#define PSERIES_DEDICATED_SNOOZE (0x01)
+#define PSERIES_DEDICATED_CEDE (0x02)
+#define PSERIES_SHARED_CEDE (0x03)
+
+static int pseries_idle_init(struct cpuidle_device *dev)
+{
+ return cpuidle_register_device(dev);
+}
+
+static void shared_cede_loop(void)
+{
+ get_lppaca()->idle = 1;
+ cede_processor();
+ get_lppaca()->idle = 0;
+}
+
+static void dedicated_snooze_loop(void)
+{
+ local_irq_enable();
+ set_thread_flag(TIF_POLLING_NRFLAG);
+ while (!need_resched()) {
+ ppc64_runlatch_off();
+ HMT_low();
+ HMT_very_low();
+ }
+ HMT_medium();
+ clear_thread_flag(TIF_POLLING_NRFLAG);
+ smp_mb();
+ local_irq_disable();
+}
+
+static void dedicated_cede_loop(void)
+{
+ ppc64_runlatch_off();
+ HMT_medium();
+ cede_processor();
+}
+
+static int pseries_cpuidle_loop(struct cpuidle_device *dev,
+ struct cpuidle_state *st)
+{
+ ktime_t t1, t2;
+ s64 diff;
+ int ret;
+ unsigned long in_purr, out_purr;
+
+ get_lppaca()->idle = 1;
+ get_lppaca()->donate_dedicated_cpu = 1;
+ in_purr = mfspr(SPRN_PURR);
+
+ t1 = ktime_get();
+
+ if (st->flags & PSERIES_SHARED_CEDE)
+ shared_cede_loop();
+ else if (st->flags & PSERIES_DEDICATED_SNOOZE)
+ dedicated_snooze_loop();
+ else
+ dedicated_cede_loop();
+
+ t2 = ktime_get();
+ diff = ktime_to_us(ktime_sub(t2, t1));
+ if (diff > INT_MAX)
+ diff = INT_MAX;
+
+ ret = (int) diff;
+
+ out_purr = mfspr(SPRN_PURR);
+ get_lppaca()->wait_state_cycles += out_purr - in_purr;
+ get_lppaca()->donate_dedicated_cpu = 0;
+ get_lppaca()->idle = 0;
+
+ return ret;
+}
+
+static int pseries_setup_cpuidle(struct cpuidle_device *dev, int cpu)
+{
+ int i;
+ struct cpuidle_state *state;
+
+ dev->cpu = cpu;
+
+ if (get_lppaca()->shared_proc) {
+ state = &dev->states[0];
+ snprintf(state->name, CPUIDLE_NAME_LEN, "IDLE");
+ state->enter = pseries_cpuidle_loop;
+ strncpy(state->desc, "shared_cede", CPUIDLE_DESC_LEN);
+ state->flags = PSERIES_SHARED_CEDE;
+ state->exit_latency = 0;
+ state->target_residency = 0;
+ return 0;
+ }
+
+ for (i = 0; i < IDLE_STATE_COUNT; i++) {
+ state = &dev->states[i];
+
+ snprintf(state->name, CPUIDLE_NAME_LEN, "CEDE%d", i);
+ state->enter = pseries_cpuidle_loop;
+
+ switch (i) {
+ case 0:
+ strncpy(state->desc, "snooze", CPUIDLE_DESC_LEN);
+ state->flags = PSERIES_DEDICATED_SNOOZE;
+ state->exit_latency = 0;
+ state->target_residency = 0;
+ break;
+
+ case 1:
+ strncpy(state->desc, "cede", CPUIDLE_DESC_LEN);
+ state->flags = PSERIES_DEDICATED_CEDE;
+ state->exit_latency = 1;
+ state->target_residency =
+ __get_cpu_var(smt_snooze_delay);
+ break;
+ }
+ }
+ dev->state_count = IDLE_STATE_COUNT;
+
+ return 0;
+}
+
+void update_smt_snooze_delay(int snooze)
+{
+ int cpu;
+ for_each_online_cpu(cpu)
+ per_cpu(pseries_dev, cpu).states[0].target_residency = snooze;
+}
+
+static int __init pseries_processor_idle_init(void)
+{
+ int cpu;
+ int result;
+
+ result = cpuidle_register_driver(&pseries_idle_driver);
+
+ if (result < 0)
+ return result;
+
+ printk(KERN_DEBUG "pSeries idle driver registered\n");
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ printk(KERN_DEBUG "Using default idle\n");
+ return 0;
+ }
+
+ for_each_online_cpu(cpu) {
+ pseries_setup_cpuidle(&per_cpu(pseries_dev, cpu), cpu);
+ pseries_idle_init(&per_cpu(pseries_dev, cpu));
+ }
+
+ printk(KERN_DEBUG "Using cpuidle idle loop\n");
+
+ return 0;
+}
+
+device_initcall(pseries_processor_idle_init);
Index: linux.trees.git/arch/powerpc/include/asm/system.h
===================================================================
--- linux.trees.git.orig/arch/powerpc/include/asm/system.h
+++ linux.trees.git/arch/powerpc/include/asm/system.h
@@ -548,6 +548,7 @@ extern void account_system_vtime(struct
extern struct dentry *powerpc_debugfs_root;
void cpu_idle_wait(void);
+extern void update_smt_snooze_delay(int snooze);
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_SYSTEM_H */
Index: linux.trees.git/arch/powerpc/kernel/sysfs.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/kernel/sysfs.c
+++ linux.trees.git/arch/powerpc/kernel/sysfs.c
@@ -18,6 +18,7 @@
#include <asm/machdep.h>
#include <asm/smp.h>
#include <asm/pmc.h>
+#include <asm/system.h>
#include "cacheinfo.h"
@@ -51,6 +52,7 @@ static ssize_t store_smt_snooze_delay(st
return -EINVAL;
per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze;
+ update_smt_snooze_delay(snooze);
return count;
}
^ permalink raw reply
* [v8 PATCH 6/8]: POWER: add a default_idle idle loop for POWER.
From: Arun R Bharadwaj @ 2009-10-08 9:53 UTC (permalink / raw)
To: Peter Zijlstra, Benjamin Herrenschmidt, Ingo Molnar,
Vaidyanathan Srinivasan, Dipankar Sarma, Balbir Singh,
Arjan van de Ven, Arun Bharadwaj
Cc: linux-arch, linux-acpi, linuxppc-dev, linux-kernel
In-Reply-To: <20091008094828.GA20595@linux.vnet.ibm.com>
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-10-08 15:18:28]:
In arch/powerpc/kernel/idle.c create a default_idle() routine by moving
the fallback branch of the cpu_idle() idle loop into it. This is needed by
the cpuidle infrastructure to call default_idle when other idle routines
are not yet registered. Functionality remains the same; the code is
only moved around.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
arch/powerpc/Kconfig | 3 +++
arch/powerpc/include/asm/system.h | 1 +
arch/powerpc/kernel/idle.c | 6 ++++++
3 files changed, 10 insertions(+)
Index: linux.trees.git/arch/powerpc/Kconfig
===================================================================
--- linux.trees.git.orig/arch/powerpc/Kconfig
+++ linux.trees.git/arch/powerpc/Kconfig
@@ -94,6 +94,9 @@ config ARCH_HAS_ILOG2_U64
config ARCH_HAS_CPU_IDLE_WAIT
def_bool y
+config ARCH_HAS_DEFAULT_IDLE
+ def_bool y
+
config GENERIC_HWEIGHT
bool
default y
Index: linux.trees.git/arch/powerpc/include/asm/system.h
===================================================================
--- linux.trees.git.orig/arch/powerpc/include/asm/system.h
+++ linux.trees.git/arch/powerpc/include/asm/system.h
@@ -218,6 +218,7 @@ extern unsigned long klimit;
extern void *alloc_maybe_bootmem(size_t size, gfp_t mask);
extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
+extern void default_idle(void);
extern int powersave_nap; /* set if nap mode can be used in idle loop */
/*
Index: linux.trees.git/arch/powerpc/kernel/idle.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/kernel/idle.c
+++ linux.trees.git/arch/powerpc/kernel/idle.c
@@ -113,6 +113,12 @@ void cpu_idle_wait(void)
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
+void default_idle(void)	/* idle fallback for cpuidle when no other idle routine is registered */
+{
+ HMT_low();	/* with HMT_very_low(): go into low thread priority and possibly low power mode */
+ HMT_very_low();
+}
+
int powersave_nap;
#ifdef CONFIG_SYSCTL
^ permalink raw reply
* [v8 PATCH 5/8]: pSeries/cpuidle: remove dedicate/shared idle loops, which will be moved to arch/powerpc/platforms/pseries/processor_idle.c
From: Arun R Bharadwaj @ 2009-10-08 9:53 UTC (permalink / raw)
To: Peter Zijlstra, Benjamin Herrenschmidt, Ingo Molnar,
Vaidyanathan Srinivasan, Dipankar Sarma, Balbir Singh,
Arjan van de Ven, Arun Bharadwaj
Cc: linux-arch, linux-acpi, linuxppc-dev, linux-kernel
In-Reply-To: <20091008094828.GA20595@linux.vnet.ibm.com>
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-10-08 15:18:28]:
This patch removes the routines pseries_shared_idle_sleep and
pseries_dedicated_idle_sleep, since their functionality is now implemented
in arch/powerpc/platforms/pseries/processor_idle.c.
Also, similar to x86, call cpuidle_idle_call from cpu_idle() idle
loop instead of ppc_md.power_save.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
arch/powerpc/kernel/idle.c | 50 +++++++-----------
arch/powerpc/platforms/pseries/setup.c | 89 ---------------------------------
2 files changed, 22 insertions(+), 117 deletions(-)
Index: linux.trees.git/arch/powerpc/platforms/pseries/setup.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/setup.c
+++ linux.trees.git/arch/powerpc/platforms/pseries/setup.c
@@ -75,9 +75,6 @@ EXPORT_SYMBOL(CMO_PageSize);
int fwnmi_active; /* TRUE if an FWNMI handler is present */
-static void pseries_shared_idle_sleep(void);
-static void pseries_dedicated_idle_sleep(void);
-
static struct device_node *pSeries_mpic_node;
static void pSeries_show_cpuinfo(struct seq_file *m)
@@ -297,18 +294,8 @@ static void __init pSeries_setup_arch(vo
pSeries_nvram_init();
/* Choose an idle loop */
- if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ if (firmware_has_feature(FW_FEATURE_SPLPAR))
vpa_init(boot_cpuid);
- if (get_lppaca()->shared_proc) {
- printk(KERN_DEBUG "Using shared processor idle loop\n");
- ppc_md.power_save = pseries_shared_idle_sleep;
- } else {
- printk(KERN_DEBUG "Using dedicated idle loop\n");
- ppc_md.power_save = pseries_dedicated_idle_sleep;
- }
- } else {
- printk(KERN_DEBUG "Using default idle loop\n");
- }
if (firmware_has_feature(FW_FEATURE_LPAR))
ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
@@ -496,80 +483,6 @@ static int __init pSeries_probe(void)
return 1;
}
-
-DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
-
-static void pseries_dedicated_idle_sleep(void)
-{
- unsigned int cpu = smp_processor_id();
- unsigned long start_snooze;
- unsigned long in_purr, out_purr;
-
- /*
- * Indicate to the HV that we are idle. Now would be
- * a good time to find other work to dispatch.
- */
- get_lppaca()->idle = 1;
- get_lppaca()->donate_dedicated_cpu = 1;
- in_purr = mfspr(SPRN_PURR);
-
- /*
- * We come in with interrupts disabled, and need_resched()
- * has been checked recently. If we should poll for a little
- * while, do so.
- */
- if (__get_cpu_var(smt_snooze_delay)) {
- start_snooze = get_tb() +
- __get_cpu_var(smt_snooze_delay) * tb_ticks_per_usec;
- local_irq_enable();
- set_thread_flag(TIF_POLLING_NRFLAG);
-
- while (get_tb() < start_snooze) {
- if (need_resched() || cpu_is_offline(cpu))
- goto out;
- ppc64_runlatch_off();
- HMT_low();
- HMT_very_low();
- }
-
- HMT_medium();
- clear_thread_flag(TIF_POLLING_NRFLAG);
- smp_mb();
- local_irq_disable();
- if (need_resched() || cpu_is_offline(cpu))
- goto out;
- }
-
- cede_processor();
-
-out:
- HMT_medium();
- out_purr = mfspr(SPRN_PURR);
- get_lppaca()->wait_state_cycles += out_purr - in_purr;
- get_lppaca()->donate_dedicated_cpu = 0;
- get_lppaca()->idle = 0;
-}
-
-static void pseries_shared_idle_sleep(void)
-{
- /*
- * Indicate to the HV that we are idle. Now would be
- * a good time to find other work to dispatch.
- */
- get_lppaca()->idle = 1;
-
- /*
- * Yield the processor to the hypervisor. We return if
- * an external interrupt occurs (which are driven prior
- * to returning here) or if a prod occurs from another
- * processor. When returning here, external interrupts
- * are enabled.
- */
- cede_processor();
-
- get_lppaca()->idle = 0;
-}
-
static int pSeries_pci_probe_mode(struct pci_bus *bus)
{
if (firmware_has_feature(FW_FEATURE_LPAR))
Index: linux.trees.git/arch/powerpc/kernel/idle.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/kernel/idle.c
+++ linux.trees.git/arch/powerpc/kernel/idle.c
@@ -25,6 +25,7 @@
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/tick.h>
+#include <linux/cpuidle.h>
#include <asm/system.h>
#include <asm/processor.h>
@@ -60,35 +61,26 @@ void cpu_idle(void)
while (!need_resched() && !cpu_should_die()) {
ppc64_runlatch_off();
- if (ppc_md.power_save) {
- clear_thread_flag(TIF_POLLING_NRFLAG);
- /*
- * smp_mb is so clearing of TIF_POLLING_NRFLAG
- * is ordered w.r.t. need_resched() test.
- */
- smp_mb();
- local_irq_disable();
-
- /* Don't trace irqs off for idle */
- stop_critical_timings();
-
- /* check again after disabling irqs */
- if (!need_resched() && !cpu_should_die())
- ppc_md.power_save();
-
- start_critical_timings();
-
- local_irq_enable();
- set_thread_flag(TIF_POLLING_NRFLAG);
-
- } else {
- /*
- * Go into low thread priority and possibly
- * low power mode.
- */
- HMT_low();
- HMT_very_low();
- }
+ clear_thread_flag(TIF_POLLING_NRFLAG);
+ /*
+ * smp_mb is so clearing of TIF_POLLING_NRFLAG
+ * is ordered w.r.t. need_resched() test.
+ */
+ smp_mb();
+ local_irq_disable();
+
+ /* Don't trace irqs off for idle */
+ stop_critical_timings();
+
+ /* check again after disabling irqs */
+ if (!need_resched() && !cpu_should_die())
+ cpuidle_idle_call();
+
+ start_critical_timings();
+
+ local_irq_enable();
+ set_thread_flag(TIF_POLLING_NRFLAG);
+
}
HMT_medium();
^ permalink raw reply
* [v8 PATCH 4/8]: POWER: enable cpuidle for POWER.
From: Arun R Bharadwaj @ 2009-10-08 9:52 UTC (permalink / raw)
To: Peter Zijlstra, Benjamin Herrenschmidt, Ingo Molnar,
Vaidyanathan Srinivasan, Dipankar Sarma, Balbir Singh,
Arjan van de Ven, Arun Bharadwaj
Cc: linux-arch, linux-acpi, linuxppc-dev, linux-kernel
In-Reply-To: <20091008094828.GA20595@linux.vnet.ibm.com>
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-10-08 15:18:28]:
This patch enables the cpuidle option in Kconfig for pSeries.
Currently cpuidle infrastructure is enabled only for x86 and ARM.
This code is almost completely borrowed from x86 to enable
cpuidle for pSeries.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
arch/powerpc/Kconfig | 17 +++++++++++++++++
arch/powerpc/include/asm/system.h | 2 ++
arch/powerpc/kernel/idle.c | 19 +++++++++++++++++++
3 files changed, 38 insertions(+)
Index: linux.trees.git/arch/powerpc/Kconfig
===================================================================
--- linux.trees.git.orig/arch/powerpc/Kconfig
+++ linux.trees.git/arch/powerpc/Kconfig
@@ -91,6 +91,9 @@ config ARCH_HAS_ILOG2_U64
bool
default y if 64BIT
+config ARCH_HAS_CPU_IDLE_WAIT
+ def_bool y
+
config GENERIC_HWEIGHT
bool
default y
@@ -247,6 +250,20 @@ source "kernel/Kconfig.freezer"
source "arch/powerpc/sysdev/Kconfig"
source "arch/powerpc/platforms/Kconfig"
+menu "Power management options"
+
+source "drivers/cpuidle/Kconfig"
+
+config PSERIES_PROCESSOR_IDLE
+ bool "Idle Power Management Support for pSeries"
+ depends on PPC_PSERIES && CPU_IDLE
+ default y
+ help
+ Idle Power Management Support for pSeries. This hooks onto cpuidle
+ infrastructure to help in idle cpu power management.
+
+endmenu
+
menu "Kernel options"
config HIGHMEM
Index: linux.trees.git/arch/powerpc/include/asm/system.h
===================================================================
--- linux.trees.git.orig/arch/powerpc/include/asm/system.h
+++ linux.trees.git/arch/powerpc/include/asm/system.h
@@ -546,5 +546,7 @@ extern void account_system_vtime(struct
extern struct dentry *powerpc_debugfs_root;
+void cpu_idle_wait(void);
+
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_SYSTEM_H */
Index: linux.trees.git/arch/powerpc/kernel/idle.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/kernel/idle.c
+++ linux.trees.git/arch/powerpc/kernel/idle.c
@@ -102,6 +102,25 @@ void cpu_idle(void)
}
}
+static void do_nothing(void *unused)	/* empty callback handed to smp_call_function() by cpu_idle_wait() */
+{
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
+ * idle loop and start using the new idle loop.
+ * Required while changing idle handler on SMP systems.
+ * Caller must have changed idle handler to the new value before the call.
+ *
+ * Takes no arguments and returns nothing.  It issues a full memory barrier
+ * and then runs the empty do_nothing() callback through
+ * smp_call_function(); the callback does no work itself, the call exists
+ * only to kick the CPUs.
+ */
+void cpu_idle_wait(void)
+{
+ /* Ensure that new value of idle is set */
+ smp_mb();
+ /* kick all the CPUs so that they exit out of old idle routine */
+ smp_call_function(do_nothing, NULL, 1);	/* NOTE(review): final argument is presumably "wait for completion" - confirm */
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
int powersave_nap;
#ifdef CONFIG_SYSCTL
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox