LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH 2/2] ARM64: powernv: remove redundant cpuidle_idle_call()
From: Daniel Lezcano @ 2014-02-06 16:20 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: Lists linaro-kernel, linux-pm@vger.kernel.org, Peter Zijlstra,
	Rafael J. Wysocki, LKML, Ingo Molnar, Preeti U Murthy,
	Thomas Gleixner, linuxppc-dev, Linux ARM Kernel ML
In-Reply-To: <1391696188-14540-2-git-send-email-nicolas.pitre@linaro.org>

[-- Attachment #1: Type: text/plain, Size: 225 bytes --]

On 6 February 2014 14:16, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:

> The core idle loop now takes care of it.
>
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
>

Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>

[-- Attachment #2: Type: text/html, Size: 658 bytes --]

^ permalink raw reply

* Re: [PATCH 2/2] ARM64: powernv: remove redundant cpuidle_idle_call()
From: Nicolas Pitre @ 2014-02-06 16:09 UTC (permalink / raw)
  To: Preeti U Murthy
  Cc: linaro-kernel, linux-pm, Peter Zijlstra, Daniel Lezcano,
	Rafael J. Wysocki, linux-kernel, Ingo Molnar, Thomas Gleixner,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <52F3B17E.4040209@linux.vnet.ibm.com>

On Thu, 6 Feb 2014, Preeti U Murthy wrote:

> Hi Nicolas,
> 
> powernv in the subject of the patch?

Crap.  You're right.

That's what you get when posting patches while attending a meeting.



> 
> Regards
> Preeti U Murthy
> On 02/06/2014 07:46 PM, Nicolas Pitre wrote:
> > The core idle loop now takes care of it.
> > 
> > Signed-off-by: Nicolas Pitre <nico@linaro.org>
> > ---
> >  arch/arm64/kernel/process.c | 7 ++-----
> >  1 file changed, 2 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> > index 1c0a9be2ff..9cce0098f4 100644
> > --- a/arch/arm64/kernel/process.c
> > +++ b/arch/arm64/kernel/process.c
> > @@ -33,7 +33,6 @@
> >  #include <linux/kallsyms.h>
> >  #include <linux/init.h>
> >  #include <linux/cpu.h>
> > -#include <linux/cpuidle.h>
> >  #include <linux/elfcore.h>
> >  #include <linux/pm.h>
> >  #include <linux/tick.h>
> > @@ -94,10 +93,8 @@ void arch_cpu_idle(void)
> >  	 * This should do all the clock switching and wait for interrupt
> >  	 * tricks
> >  	 */
> > -	if (cpuidle_idle_call()) {
> > -		cpu_do_idle();
> > -		local_irq_enable();
> > -	}
> > +	cpu_do_idle();
> > +	local_irq_enable();
> >  }
> > 
> >  #ifdef CONFIG_HOTPLUG_CPU
> > 
> 

^ permalink raw reply

* Re: [PATCH V3 2/3] tick/cpuidle: Initialize hrtimer mode of broadcast
From: Thomas Gleixner @ 2014-02-06 16:03 UTC (permalink / raw)
  To: Preeti U Murthy
  Cc: deepthi, daniel.lezcano, linux-pm, peterz, fweisbec,
	rafael.j.wysocki, linux-kernel, paulus, srivatsa.bhat, paulmck,
	linuxppc-dev, mingo
In-Reply-To: <20140206055012.4595.55692.stgit@preeti>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1952 bytes --]

On Thu, 6 Feb 2014, Preeti U Murthy wrote:

Compiler warnings are not so important, right?

kernel/time/tick-broadcast.c: In function ‘tick_broadcast_oneshot_control’:
kernel/time/tick-broadcast.c:700:3: warning: ‘return’ with no value, in function returning non-void [-Wreturn-type]
kernel/time/tick-broadcast.c:711:3: warning: ‘return’ with no value, in function returning non-void [-Wreturn-type]

> +		/*
> +		 * If the current CPU owns the hrtimer broadcast
> +		 * mechanism, it cannot go deep idle.
> +		 */
> +		ret = broadcast_needs_cpu(bc, cpu);

So we leave the CPU in the broadcast mask, just to force another call
to the notify code right away to remove it again. Wouldn't it be more
clever to clear the flag right away? That would make the changes to
the cpuidle code simpler. Delta patch below.

Thanks,

	tglx
---

--- tip.orig/kernel/time/tick-broadcast.c
+++ tip/kernel/time/tick-broadcast.c
@@ -697,7 +697,7 @@ int tick_broadcast_oneshot_control(unsig
 	 * states
 	 */
 	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
-		return;
+		return 0;
 
 	/*
 	 * We are called with preemtion disabled from the depth of the
@@ -708,7 +708,7 @@ int tick_broadcast_oneshot_control(unsig
 	dev = td->evtdev;
 
 	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
-		return;
+		return 0;
 
 	bc = tick_broadcast_device.evtdev;
 
@@ -731,9 +731,14 @@ int tick_broadcast_oneshot_control(unsig
 		}
 		/*
 		 * If the current CPU owns the hrtimer broadcast
-		 * mechanism, it cannot go deep idle.
+		 * mechanism, it cannot go deep idle and we remove the
+		 * CPU from the broadcast mask. We don't have to go
+		 * through the EXIT path as the local timer is not
+		 * shutdown.
 		 */
 		ret = broadcast_needs_cpu(bc, cpu);
+		if (ret)
+			cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
 	} else {
 		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);

^ permalink raw reply

* Re: [PATCH 2/2] ARM64: powernv: remove redundant cpuidle_idle_call()
From: Preeti U Murthy @ 2014-02-06 15:59 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: linaro-kernel, linux-pm, Peter Zijlstra, Daniel Lezcano,
	Rafael J. Wysocki, linux-kernel, Ingo Molnar, Thomas Gleixner,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <1391696188-14540-2-git-send-email-nicolas.pitre@linaro.org>

Hi Nicolas,

powernv in the subject of the patch?

Regards
Preeti U Murthy
On 02/06/2014 07:46 PM, Nicolas Pitre wrote:
> The core idle loop now takes care of it.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> ---
>  arch/arm64/kernel/process.c | 7 ++-----
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> index 1c0a9be2ff..9cce0098f4 100644
> --- a/arch/arm64/kernel/process.c
> +++ b/arch/arm64/kernel/process.c
> @@ -33,7 +33,6 @@
>  #include <linux/kallsyms.h>
>  #include <linux/init.h>
>  #include <linux/cpu.h>
> -#include <linux/cpuidle.h>
>  #include <linux/elfcore.h>
>  #include <linux/pm.h>
>  #include <linux/tick.h>
> @@ -94,10 +93,8 @@ void arch_cpu_idle(void)
>  	 * This should do all the clock switching and wait for interrupt
>  	 * tricks
>  	 */
> -	if (cpuidle_idle_call()) {
> -		cpu_do_idle();
> -		local_irq_enable();
> -	}
> +	cpu_do_idle();
> +	local_irq_enable();
>  }
> 
>  #ifdef CONFIG_HOTPLUG_CPU
> 

^ permalink raw reply

* Re: [PATCH 1/2] PPC: powernv: remove redundant cpuidle_idle_call()
From: Preeti U Murthy @ 2014-02-06 15:56 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: linaro-kernel, linux-pm, Peter Zijlstra, Daniel Lezcano,
	Rafael J. Wysocki, linux-kernel, Ingo Molnar, Thomas Gleixner,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <1391696188-14540-1-git-send-email-nicolas.pitre@linaro.org>

On 02/06/2014 07:46 PM, Nicolas Pitre wrote:
> The core idle loop now takes care of it.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>
> ---
>  arch/powerpc/platforms/powernv/setup.c | 13 +------------
>  1 file changed, 1 insertion(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
> index 21166f65c9..a932feb290 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -26,7 +26,6 @@
>  #include <linux/of_fdt.h>
>  #include <linux/interrupt.h>
>  #include <linux/bug.h>
> -#include <linux/cpuidle.h>
> 
>  #include <asm/machdep.h>
>  #include <asm/firmware.h>
> @@ -217,16 +216,6 @@ static int __init pnv_probe(void)
>  	return 1;
>  }
> 
> -void powernv_idle(void)
> -{
> -	/* Hook to cpuidle framework if available, else
> -	 * call on default platform idle code
> -	 */
> -	if (cpuidle_idle_call()) {
> -		power7_idle();
> -	}
> -}
> -
>  define_machine(powernv) {
>  	.name			= "PowerNV",
>  	.probe			= pnv_probe,
> @@ -236,7 +225,7 @@ define_machine(powernv) {
>  	.show_cpuinfo		= pnv_show_cpuinfo,
>  	.progress		= pnv_progress,
>  	.machine_shutdown	= pnv_shutdown,
> -	.power_save             = powernv_idle,
> +	.power_save             = power7_idle,
>  	.calibrate_decr		= generic_calibrate_decr,
>  #ifdef CONFIG_KEXEC
>  	.kexec_cpu_down		= pnv_kexec_cpu_down,
> 

Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>

^ permalink raw reply

* Re: [PATCH] Convert powerpc simple spinlocks into ticket locks
From: Benjamin Herrenschmidt @ 2014-02-06 15:53 UTC (permalink / raw)
  To: Torsten Duwe
  Cc: Peter Zijlstra, linux-kernel, Paul Mackerras, Anton Blanchard,
	Paul E. McKenney, linuxppc-dev, Ingo Molnar
In-Reply-To: <20140206103736.GA18054@lst.de>

On Thu, 2014-02-06 at 11:37 +0100, Torsten Duwe wrote:
> x86 has them, MIPS has them, ARM has them, even ia64 has them:
> ticket locks. They reduce memory bus and cache pressure especially
> for contended spinlocks, increasing performance.
> 
> This patch is a port of the x86 spin locks, mostly written in C,
> to the powerpc, introducing inline asm where needed. The pSeries
> directed yield for vCPUs is taken care of by an additional "holder"
> field in the lock.

Thanks, I've been meaning to look into this for ages and never quite got
to it :-)

I'm travelling right now, I'll review this next week.

Cheers,
Ben.

> Signed-off-by: Torsten Duwe <duwe@suse.de>
> --
>  arch/powerpc/include/asm/spinlock_types.h |   27 ++++-
>  arch/powerpc/include/asm/spinlock.h       |  157 +++++++++++++++++++++++-------
>  arch/powerpc/lib/locks.c                  |    6 -
>  3 files changed, 151 insertions(+), 39 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
> index 2351adc..64a98be 100644
> --- a/arch/powerpc/include/asm/spinlock_types.h
> +++ b/arch/powerpc/include/asm/spinlock_types.h
> @@ -5,11 +5,30 @@
>  # error "please don't include this file directly"
>  #endif
>  
> -typedef struct {
> -	volatile unsigned int slock;
> -} arch_spinlock_t;
> +typedef u16 __ticket_t;
> +typedef u32 __ticketpair_t;
> +
> +#define TICKET_LOCK_INC	((__ticket_t)1)
> +
> +#define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
> +
> +typedef struct arch_spinlock {
> +	union {
> +		__ticketpair_t head_tail;
> +		struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__		/* The "tail" part should be in the MSBs */
> +			__ticket_t tail, head;
> +#else
> +			__ticket_t head, tail;
> +#endif
> +		} tickets;
> +	};
> +#if defined(CONFIG_PPC64)
> +	u32 holder;
> +#endif
> +} arch_spinlock_t __aligned(8);
>  
> -#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
> +#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 }, 0 }
>  
>  typedef struct {
>  	volatile signed int lock;
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 5f54a74..6e33691 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -9,8 +9,7 @@
>   * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
>   * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
>   *	Rework to support virtual processors
> - *
> - * Type of int is used as a full 64b word is not necessary.
> + * Copyright (C) 2014 Torsten Duwe <duwe@suse.de>, ticket lock port
>   *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
> @@ -28,7 +27,20 @@
>  #include <asm/synch.h>
>  #include <asm/ppc-opcode.h>
>  
> -#define arch_spin_is_locked(x)		((x)->slock != 0)
> +static inline int arch_spin_is_locked(arch_spinlock_t *lock)
> +{
> +	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> +
> +	return tmp.tail != tmp.head;
> +}
> +
> +static inline int arch_spin_is_contended(arch_spinlock_t *lock)
> +{
> +	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> +
> +	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
> +}
> +#define arch_spin_is_contended	arch_spin_is_contended
>  
>  #ifdef CONFIG_PPC64
>  /* use 0x800000yy when locked, where yy == CPU number */
> @@ -37,8 +49,12 @@
>  #else
>  #define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
>  #endif
> +#define SIZE_LETTER	"d"
> +#define L64		"1"
>  #else
>  #define LOCK_TOKEN	1
> +#define SIZE_LETTER	"w"
> +#define L64		"0"
>  #endif
>  
>  #if defined(CONFIG_PPC64) && defined(CONFIG_SMP)
> @@ -55,33 +71,59 @@
>  #endif
>  
>  /*
> - * This returns the old value in the lock, so we succeeded
> - * in getting the lock if the return value is 0.
> + * Our own cmpxchg, operating on spinlock_t's.  Returns 0 iff value
> + * read at lock was equal to "old" AND the cmpxchg succeeded
> + * uninterruptedly.
>   */
> -static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
> +static __always_inline int __arch_spin_cmpxchg_eq(arch_spinlock_t *lock,
> +					       arch_spinlock_t old,
> +					       arch_spinlock_t new)
>  {
> -	unsigned long tmp, token;
> +	register int retval = 1;
> +	register arch_spinlock_t tmp;
> +
> +	__asm__ __volatile__ (
> +"	li %0,1\n"		/* default to "fail" */
> +	PPC_RELEASE_BARRIER
> +"1:	l" SIZE_LETTER "arx   %2,0,%5         # __arch_spin_cmpxchg_eq\n"
> +"	cmp 0,"L64",%3,%2\n"
> +"	bne-    2f\n"
> +	PPC405_ERR77(0, "%5")
> +"	st" SIZE_LETTER "cx.  %4,0,%5\n"
> +"	bne-    1b\n"
> +"	isync\n"
> +"	li %0,0\n"
> +"2:"
> +	: "=&r" (retval), "+m" (*lock)
> +	: "r" (tmp), "r" (old), "r" (new), "r" (lock)
> +	: "cc", "memory");
> +
> +	return retval;
> +}
> +#undef SIZE_LETTER
> +#undef L64
>  
> -	token = LOCK_TOKEN;
> -	__asm__ __volatile__(
> -"1:	" PPC_LWARX(%0,0,%2,1) "\n\
> -	cmpwi		0,%0,0\n\
> -	bne-		2f\n\
> -	stwcx.		%1,0,%2\n\
> -	bne-		1b\n"
> -	PPC_ACQUIRE_BARRIER
> -"2:"
> -	: "=&r" (tmp)
> -	: "r" (token), "r" (&lock->slock)
> -	: "cr0", "memory");
> +static __always_inline int __arch_spin_trylock(arch_spinlock_t *lock)
> +{
> +	arch_spinlock_t old, new;
>  
> -	return tmp;
> +	old.tickets = ACCESS_ONCE(lock->tickets);
> +	if (old.tickets.head != old.tickets.tail)
> +		return 0;
> +
> +	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
> +#if defined(CONFIG_PPC64)
> +	old.holder = lock->holder;
> +	new.holder = LOCK_TOKEN;
> +#endif
> +
> +	return !__arch_spin_cmpxchg_eq(lock, old, new);
>  }
>  
>  static inline int arch_spin_trylock(arch_spinlock_t *lock)
>  {
>  	CLEAR_IO_SYNC;
> -	return __arch_spin_trylock(lock) == 0;
> +	return  __arch_spin_trylock(lock);
>  }
>  
>  /*
> @@ -93,9 +135,8 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
>   * rest of our timeslice to the lock holder.
>   *
>   * So that we can tell which virtual processor is holding a lock,
> - * we put 0x80000000 | smp_processor_id() in the lock when it is
> - * held.  Conveniently, we have a word in the paca that holds this
> - * value.
> + * we put 0x80000000 | smp_processor_id() into lock->holder.
> + * Conveniently, we have a word in the paca that holds this value.
>   */
>  
>  #if defined(CONFIG_PPC_SPLPAR)
> @@ -109,19 +150,59 @@ extern void __rw_yield(arch_rwlock_t *lock);
>  #define SHARED_PROCESSOR	0
>  #endif
>  
> -static inline void arch_spin_lock(arch_spinlock_t *lock)
> +/*
> + * Ticket locks are conceptually two parts, one indicating the current head of
> + * the queue, and the other indicating the current tail. The lock is acquired
> + * by atomically noting the tail and incrementing it by one (thus adding
> + * ourself to the queue and noting our position), then waiting until the head
> + * becomes equal to the the initial value of the tail.
> + *
> + * We use an asm covering *both* parts of the lock, to increment the tail and
> + * also load the position of the head, which takes care of memory ordering
> + * issues and should be optimal for the uncontended case. Note the tail must be
> + * in the high part, because a wide add increment of the low part would carry
> + * up and contaminate the high part.
> + */
> +static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
>  {
> +	register struct __raw_tickets old, tmp,
> +		inc = { .tail = TICKET_LOCK_INC };
> +
>  	CLEAR_IO_SYNC;
> -	while (1) {
> -		if (likely(__arch_spin_trylock(lock) == 0))
> -			break;
> +	__asm__ __volatile__(
> +"1:	lwarx   %0,0,%4         # arch_spin_lock\n"
> +"	add     %1,%3,%0\n"
> +	PPC405_ERR77(0, "%4")
> +"	stwcx.  %1,0,%4\n"
> +"	bne-    1b"
> +	: "=&r" (old), "=&r" (tmp), "+m" (lock->tickets)
> +	: "r" (inc), "r" (&lock->tickets)
> +	: "cc");
> +
> +	if (likely(old.head == old.tail)) {
> +#if defined(CONFIG_PPC64)
> +		lock->holder = LOCK_TOKEN;
> +#endif
> +		goto out;
> +	}
> +
> +	for (;;) {
> +		unsigned count = 100;
> +
>  		do {
> +			if (ACCESS_ONCE(lock->tickets.head) == old.tail) {
> +#if defined(CONFIG_PPC64)
> +				lock->holder = LOCK_TOKEN;
> +#endif
> +				goto out;
> +			}
>  			HMT_low();
>  			if (SHARED_PROCESSOR)
>  				__spin_yield(lock);
> -		} while (unlikely(lock->slock != 0));
> +		} while (--count);
>  		HMT_medium();
>  	}
> +out:	barrier();	/* make sure nothing creeps before the lock is taken */
>  }
>  
>  static inline
> @@ -131,7 +211,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
>  
>  	CLEAR_IO_SYNC;
>  	while (1) {
> -		if (likely(__arch_spin_trylock(lock) == 0))
> +		if (likely(__arch_spin_trylock(lock)))
>  			break;
>  		local_save_flags(flags_dis);
>  		local_irq_restore(flags);
> @@ -139,7 +219,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
>  			HMT_low();
>  			if (SHARED_PROCESSOR)
>  				__spin_yield(lock);
> -		} while (unlikely(lock->slock != 0));
> +		} while (arch_spin_is_locked(lock));
>  		HMT_medium();
>  		local_irq_restore(flags_dis);
>  	}
> @@ -147,10 +227,22 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
>  
>  static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  {
> +	arch_spinlock_t old, new;
> +
> +#if defined(CONFIG_PPC64)
> +	new.holder = 0;
> +#endif
> +	do {
> +		old.tickets = ACCESS_ONCE(lock->tickets);
> +#if defined(CONFIG_PPC64)
> +		old.holder = lock->holder;
> +#endif
> +		new.tickets.head = old.tickets.head + TICKET_LOCK_INC;
> +		new.tickets.tail = old.tickets.tail;
> +	} while (unlikely(__arch_spin_cmpxchg_eq(lock, old, new)));
>  	SYNC_IO;
>  	__asm__ __volatile__("# arch_spin_unlock\n\t"
>  				PPC_RELEASE_BARRIER: : :"memory");
> -	lock->slock = 0;
>  }
>  
>  #ifdef CONFIG_PPC64
> diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
> index 0c9c8d7..4a57e32 100644
> --- a/arch/powerpc/lib/locks.c
> +++ b/arch/powerpc/lib/locks.c
> @@ -27,7 +27,7 @@ void __spin_yield(arch_spinlock_t *lock)
>  {
>  	unsigned int lock_value, holder_cpu, yield_count;
>  
> -	lock_value = lock->slock;
> +	lock_value = lock->holder;
>  	if (lock_value == 0)
>  		return;
>  	holder_cpu = lock_value & 0xffff;
> @@ -36,7 +36,7 @@ void __spin_yield(arch_spinlock_t *lock)
>  	if ((yield_count & 1) == 0)
>  		return;		/* virtual cpu is currently running */
>  	rmb();
> -	if (lock->slock != lock_value)
> +	if (lock->holder != lock_value)
>  		return;		/* something has changed */
>  	plpar_hcall_norets(H_CONFER,
>  		get_hard_smp_processor_id(holder_cpu), yield_count);
> @@ -70,7 +70,7 @@ void __rw_yield(arch_rwlock_t *rw)
>  
>  void arch_spin_unlock_wait(arch_spinlock_t *lock)
>  {
> -	while (lock->slock) {
> +	while (arch_spin_is_locked(lock)) {
>  		HMT_low();
>  		if (SHARED_PROCESSOR)
>  			__spin_yield(lock);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* Re: [PATCH V3] POWERPC: BOOK3S: KVM: Use the saved dsisr and dar values on book3s 64
From: Alexander Graf @ 2014-02-06 15:52 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Paul Mackerras, linuxppc-dev, kvm-ppc,
	kvm@vger.kernel.org mailing list
In-Reply-To: <1390891190-20186-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>


On 28.01.2014, at 07:39, Aneesh Kumar K.V =
<aneesh.kumar@linux.vnet.ibm.com> wrote:

> Although it's optional IBM POWER cpus always had DAR value set on
> alignment interrupt. So don't try to compute these values.
>=20
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
> Changes from V2:
> * Depend on cpu feature flag to decide whether to use fault_dsir or =
not
>=20
> arch/powerpc/include/asm/cputable.h    |  1 +
> arch/powerpc/include/asm/disassemble.h | 34 +++++++++++++++++
> arch/powerpc/kernel/align.c            | 34 +----------------
> arch/powerpc/kernel/cputable.c         | 15 +++++++-
> arch/powerpc/kvm/book3s_emulate.c      | 69 =
++++++++++++++++------------------
> 5 files changed, 82 insertions(+), 71 deletions(-)
>=20
> diff --git a/arch/powerpc/include/asm/cputable.h =
b/arch/powerpc/include/asm/cputable.h
> index 0d4939ba48e7..1922dce6124d 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -555,6 +555,7 @@ static inline int cpu_has_feature(unsigned long =
feature)
> }
>=20
> #define HBP_NUM 1
> +extern struct cpu_spec *find_cpuspec(unsigned int pvr);
>=20
> #endif /* !__ASSEMBLY__ */
>=20
> diff --git a/arch/powerpc/include/asm/disassemble.h =
b/arch/powerpc/include/asm/disassemble.h
> index 856f8deb557a..6330a61b875a 100644
> --- a/arch/powerpc/include/asm/disassemble.h
> +++ b/arch/powerpc/include/asm/disassemble.h
> @@ -81,4 +81,38 @@ static inline unsigned int get_oc(u32 inst)
> {
> 	return (inst >> 11) & 0x7fff;
> }
> +
> +#define IS_XFORM(inst)	(get_op(inst)  =3D=3D 31)
> +#define IS_DSFORM(inst)	(get_op(inst) >=3D 56)
> +
> +/*
> + * Create a DSISR value from the instruction
> + */
> +static inline unsigned make_dsisr(unsigned instr)
> +{
> +	unsigned dsisr;
> +
> +
> +	/* bits  6:15 --> 22:31 */
> +	dsisr =3D (instr & 0x03ff0000) >> 16;
> +
> +	if (IS_XFORM(instr)) {
> +		/* bits 29:30 --> 15:16 */
> +		dsisr |=3D (instr & 0x00000006) << 14;
> +		/* bit     25 -->    17 */
> +		dsisr |=3D (instr & 0x00000040) << 8;
> +		/* bits 21:24 --> 18:21 */
> +		dsisr |=3D (instr & 0x00000780) << 3;
> +	} else {
> +		/* bit      5 -->    17 */
> +		dsisr |=3D (instr & 0x04000000) >> 12;
> +		/* bits  1: 4 --> 18:21 */
> +		dsisr |=3D (instr & 0x78000000) >> 17;
> +		/* bits 30:31 --> 12:13 */
> +		if (IS_DSFORM(instr))
> +			dsisr |=3D (instr & 0x00000003) << 18;
> +	}
> +
> +	return dsisr;
> +}
> #endif /* __ASM_PPC_DISASSEMBLE_H__ */
> diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
> index de91f3ae631e..111d93ec7f34 100644
> --- a/arch/powerpc/kernel/align.c
> +++ b/arch/powerpc/kernel/align.c
> @@ -25,14 +25,13 @@
> #include <asm/cputable.h>
> #include <asm/emulated_ops.h>
> #include <asm/switch_to.h>
> +#include <asm/disassemble.h>
>=20
> struct aligninfo {
> 	unsigned char len;
> 	unsigned char flags;
> };
>=20
> -#define IS_XFORM(inst)	(((inst) >> 26) =3D=3D 31)
> -#define IS_DSFORM(inst)	(((inst) >> 26) >=3D 56)
>=20
> #define INVALID	{ 0, 0 }
>=20
> @@ -192,37 +191,6 @@ static struct aligninfo aligninfo[128] =3D {
> };
>=20
> /*
> - * Create a DSISR value from the instruction
> - */
> -static inline unsigned make_dsisr(unsigned instr)
> -{
> -	unsigned dsisr;
> -
> -
> -	/* bits  6:15 --> 22:31 */
> -	dsisr =3D (instr & 0x03ff0000) >> 16;
> -
> -	if (IS_XFORM(instr)) {
> -		/* bits 29:30 --> 15:16 */
> -		dsisr |=3D (instr & 0x00000006) << 14;
> -		/* bit     25 -->    17 */
> -		dsisr |=3D (instr & 0x00000040) << 8;
> -		/* bits 21:24 --> 18:21 */
> -		dsisr |=3D (instr & 0x00000780) << 3;
> -	} else {
> -		/* bit      5 -->    17 */
> -		dsisr |=3D (instr & 0x04000000) >> 12;
> -		/* bits  1: 4 --> 18:21 */
> -		dsisr |=3D (instr & 0x78000000) >> 17;
> -		/* bits 30:31 --> 12:13 */
> -		if (IS_DSFORM(instr))
> -			dsisr |=3D (instr & 0x00000003) << 18;
> -	}
> -
> -	return dsisr;
> -}
> -
> -/*
>  * The dcbz (data cache block zero) instruction
>  * gives an alignment fault if used on non-cacheable
>  * memory.  We handle the fault mainly for the
> diff --git a/arch/powerpc/kernel/cputable.c =
b/arch/powerpc/kernel/cputable.c
> index 597d954e5860..b367f5b772f6 100644
> --- a/arch/powerpc/kernel/cputable.c
> +++ b/arch/powerpc/kernel/cputable.c
> @@ -116,7 +116,7 @@ extern void __restore_cpu_e6500(void);
> 				 PPC_FEATURE_BOOKE)
> #endif
>=20
> -static struct cpu_spec __initdata cpu_specs[] =3D {
> +static struct cpu_spec cpu_specs[] =3D {
> #ifdef CONFIG_PPC_BOOK3S_64
> 	{	/* Power3 */
> 		.pvr_mask		=3D 0xffff0000,
> @@ -2258,3 +2258,16 @@ struct cpu_spec * __init identify_cpu(unsigned =
long offset, unsigned int pvr)
>=20
> 	return NULL;
> }
> +
> +struct cpu_spec *find_cpuspec(unsigned int pvr)
> +{
> +	int i;
> +	struct cpu_spec *s =3D cpu_specs;
> +
> +	for (i =3D 0; i < ARRAY_SIZE(cpu_specs); i++, s++) {
> +		if ((pvr & s->pvr_mask) =3D=3D s->pvr_value)
> +			return s;
> +	}
> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(find_cpuspec);
> diff --git a/arch/powerpc/kvm/book3s_emulate.c =
b/arch/powerpc/kvm/book3s_emulate.c
> index 99d40f8977e8..a7d54aa203d0 100644
> --- a/arch/powerpc/kvm/book3s_emulate.c
> +++ b/arch/powerpc/kvm/book3s_emulate.c
> @@ -569,48 +569,42 @@ unprivileged:
>=20
> u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
> {
> -	u32 dsisr =3D 0;
> -
> -	/*
> -	 * This is what the spec says about DSISR bits (not mentioned =3D =
0):
> -	 *
> -	 * 12:13		[DS]	Set to bits 30:31
> -	 * 15:16		[X]	Set to bits 29:30
> -	 * 17			[X]	Set to bit 25
> -	 *			[D/DS]	Set to bit 5
> -	 * 18:21		[X]	Set to bits 21:24
> -	 *			[D/DS]	Set to bits 1:4
> -	 * 22:26			Set to bits 6:10 (RT/RS/FRT/FRS)
> -	 * 27:31			Set to bits 11:15 (RA)
> -	 */
> -
> -	switch (get_op(inst)) {
> -	/* D-form */
> -	case OP_LFS:
> -	case OP_LFD:
> -	case OP_STFD:
> -	case OP_STFS:
> -		dsisr |=3D (inst >> 12) & 0x4000;	/* bit 17 */
> -		dsisr |=3D (inst >> 17) & 0x3c00; /* bits 18:21 */
> -		break;
> -	/* X-form */
> -	case 31:
> -		dsisr |=3D (inst << 14) & 0x18000; /* bits 15:16 */
> -		dsisr |=3D (inst << 8)  & 0x04000; /* bit 17 */
> -		dsisr |=3D (inst << 3)  & 0x03c00; /* bits 18:21 */
> -		break;
> -	default:
> -		printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", =
inst);
> -		break;
> +	struct cpu_spec *s =3D find_cpuspec(vcpu->arch.pvr);

What if we don't find the pvr?

> +
> +	if (s->cpu_features & CPU_FTR_NODSISRALIGN) {
> +		/*
> +		 * We don't care much we pass the DSISR we found on =
fault

Please indicate that this if() checks the guest cpu, while the other =
if() checks the host cpu.

> +		 */
> +		return vcpu->arch.fault_dsisr;
> +	} else if (cpu_has_feature(CPU_FTR_NODSISRALIGN)) {
> +		/*
> +		 * Fault DSISR is not valid, and virualized cpu expect a

virtualized

> +		 * proper DSISR, so build one
> +		 *
> +		 * Mac OS X has some applications - namely the Finder - =
that
> +		 * require alignment interrupts to work properly. So we =
need
> +		 * to implement them.
> +		 *
> +		 * But the spec for 970 and 750 also looks different. =
While 750
> +		 * requires the DSISR and DAR fields to reflect some =
instruction
> +		 * bits (DSISR) and the fault address (DAR), the 970 =
declares
> +		 * this as an optional feature. So we need to =
reconstruct DSISR
> +		 * and DAR manually.
> +		 */
> +		return make_dsisr(inst);

I quite frankly don't see why we're jumping through so many hoops here. =
Why doesn't dsisr reconstruction work in LE mode? And why do we care if =
the guest isn't guaranteed a working dsisr value?


Alex

> +	} else {
> +		/*
> +		 * We have valid dsisr on fault
> +		 */
> +		return vcpu->arch.fault_dsisr;
> 	}
> -
> -	dsisr |=3D (inst >> 16) & 0x03ff; /* bits 22:31 */
> -
> -	return dsisr;
> }
>=20
> ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
> {
> +#ifdef CONFIG_PPC_BOOK3S_64
> +	return vcpu->arch.fault_dar;
> +#else
> 	ulong dar =3D 0;
> 	ulong ra =3D get_ra(inst);
> 	ulong rb =3D get_rb(inst);
> @@ -635,4 +629,5 @@ ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, =
unsigned int inst)
> 	}
>=20
> 	return dar;
> +#endif
> }
> --=20
> 1.8.5.3
>=20
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH V3] KVM: PPC: BOOK3S: PR: Enable Little Endian PR guest
From: Alexander Graf @ 2014-02-06 15:46 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Paul Mackerras, linuxppc-dev, kvm-ppc,
	kvm@vger.kernel.org mailing list
In-Reply-To: <1390891569-21161-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>


On 28.01.2014, at 07:46, Aneesh Kumar K.V =
<aneesh.kumar@linux.vnet.ibm.com> wrote:

> This patch make sure we inherit the LE bit correctly in different case
> so that we can run Little Endian distro in PR mode
>=20
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
> Changes from V2:
> * Move H_SET_MODE to qemu
>=20
>=20
> arch/powerpc/include/asm/kvm_host.h |  1 +
> arch/powerpc/kernel/asm-offsets.c   |  1 +
> arch/powerpc/kvm/book3s_64_mmu.c    |  2 +-
> arch/powerpc/kvm/book3s_pr.c        | 32 =
+++++++++++++++++++++++++++++++-
> 4 files changed, 34 insertions(+), 2 deletions(-)
>=20
> diff --git a/arch/powerpc/include/asm/kvm_host.h =
b/arch/powerpc/include/asm/kvm_host.h
> index 207b7826c9b1..f4be7be14330 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -550,6 +550,7 @@ struct kvm_vcpu_arch {
> #ifdef CONFIG_PPC_BOOK3S
> 	ulong fault_dar;
> 	u32 fault_dsisr;
> +	unsigned long intr_msr;

We already have a field with that name and semantics in the vcpu_arch =
struct. Just move that one to a more commonly accessible part of the =
struct.

> #endif
>=20
> #ifdef CONFIG_BOOKE
> diff --git a/arch/powerpc/kernel/asm-offsets.c =
b/arch/powerpc/kernel/asm-offsets.c
> index b754f629a177..7484676b8f25 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -518,6 +518,7 @@ int main(void)
> 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
> 	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, =
arch.fault_dsisr));
> 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, =
arch.fault_dar));
> +	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));

Same here:

arch/powerpc/kernel/asm-offsets.c:      DEFINE(VCPU_INTR_MSR, =
offsetof(struct kvm_vcpu, arch.intr_msr));

> 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, =
arch.last_inst));
> 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
> 	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
> diff --git a/arch/powerpc/kvm/book3s_64_mmu.c =
b/arch/powerpc/kvm/book3s_64_mmu.c
> index 83da1f868fd5..8231b83c493b 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu.c
> @@ -38,7 +38,7 @@
>=20
> static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
> {
> -	kvmppc_set_msr(vcpu, MSR_SF);
> +	kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
> }
>=20
> static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
> diff --git a/arch/powerpc/kvm/book3s_pr.c =
b/arch/powerpc/kvm/book3s_pr.c
> index eb070eb4da40..828056ec208f 100644
> --- a/arch/powerpc/kvm/book3s_pr.c
> +++ b/arch/powerpc/kvm/book3s_pr.c
> @@ -263,7 +263,7 @@ static void kvmppc_recalc_shadow_msr(struct =
kvm_vcpu *vcpu)
> 	ulong smsr =3D vcpu->arch.shared->msr;
>=20
> 	/* Guest MSR values */
> -	smsr &=3D MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
> +	smsr &=3D MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
> 	/* Process MSR values */
> 	smsr |=3D MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
> 	/* External providers the guest reserved */
> @@ -1178,6 +1178,15 @@ static int kvmppc_get_one_reg_pr(struct =
kvm_vcpu *vcpu, u64 id,
> 		break;
> 	}
> #endif /* CONFIG_VSX */
> +	case KVM_REG_PPC_LPCR:
> +		/*
> +		 * We are only interested in the LPCR_ILE bit
> +		 */
> +		if (vcpu->arch.intr_msr & MSR_LE)
> +			*val =3D get_reg_val(id, LPCR_ILE);
> +		else
> +			*val =3D get_reg_val(id, 0);
> +		break;
> 	default:
> 		r =3D -EINVAL;
> 		break;
> @@ -1186,6 +1195,23 @@ static int kvmppc_get_one_reg_pr(struct =
kvm_vcpu *vcpu, u64 id,
> 	return r;
> }
>=20
> +static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr)
> +{
> +	struct kvm *kvm =3D vcpu->kvm;
> +	/*
> +	 * If ILE (interrupt little-endian) has changed, update the
> +	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
> +	 */
> +	if ((new_lpcr & LPCR_ILE) !=3D (vcpu->arch.intr_msr & MSR_LE)) {
> +		mutex_lock(&kvm->lock);

Why the lock?


Alex

> +		if (new_lpcr & LPCR_ILE)
> +			vcpu->arch.intr_msr |=3D MSR_LE;
> +		else
> +			vcpu->arch.intr_msr &=3D ~MSR_LE;
> +		mutex_unlock(&kvm->lock);
> +	}
> +}
> +
> static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
> 				 union kvmppc_one_reg *val)
> {
> @@ -1209,6 +1235,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu =
*vcpu, u64 id,
> 		break;
> 	}
> #endif /* CONFIG_VSX */
> +	case KVM_REG_PPC_LPCR:
> +		kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
> +		break;
> 	default:
> 		r =3D -EINVAL;
> 		break;
> @@ -1261,6 +1290,7 @@ static struct kvm_vcpu =
*kvmppc_core_vcpu_create_pr(struct kvm *kvm,
> 	vcpu->arch.pvr =3D 0x3C0301;
> 	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
> 		vcpu->arch.pvr =3D mfspr(SPRN_PVR);
> +	vcpu->arch.intr_msr =3D MSR_SF;
> #else
> 	/* default to book3s_32 (750) */
> 	vcpu->arch.pvr =3D 0x84202;
> --=20
> 1.8.5.3
>=20

^ permalink raw reply

* Re: [PATCH 2/2] ARM64: powernv: remove redundant cpuidle_idle_call()
From: Thomas Gleixner @ 2014-02-06 14:20 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: linaro-kernel, linux-pm, Peter Zijlstra, Daniel Lezcano,
	Rafael J. Wysocki, linux-kernel, Ingo Molnar, Preeti U Murthy,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <1391696188-14540-2-git-send-email-nicolas.pitre@linaro.org>

On Thu, 6 Feb 2014, Nicolas Pitre wrote:

> The core idle loop now takes care of it.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>

Acked-by: Thomas Gleixner <tglx@linutronix.de>

^ permalink raw reply

* Re: [PATCH 1/2] PPC: powernv: remove redundant cpuidle_idle_call()
From: Thomas Gleixner @ 2014-02-06 14:20 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: linaro-kernel, linux-pm, Peter Zijlstra, Daniel Lezcano,
	Rafael J. Wysocki, linux-kernel, Ingo Molnar, Preeti U Murthy,
	linuxppc-dev, linux-arm-kernel
In-Reply-To: <1391696188-14540-1-git-send-email-nicolas.pitre@linaro.org>

An Thu, 6 Feb 2014, Nicolas Pitre wrote:

> The core idle loop now takes care of it.
> 
> Signed-off-by: Nicolas Pitre <nico@linaro.org>

Acked-by: Thomas Gleixner <tglx@linutronix.de>

^ permalink raw reply

* [PATCH 1/2] PPC: powernv: remove redundant cpuidle_idle_call()
From: Nicolas Pitre @ 2014-02-06 14:16 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Preeti U Murthy, Thomas Gleixner,
	Ingo Molnar, Peter Zijlstra, Rafael J. Wysocki, Daniel Lezcano
  Cc: linaro-kernel, linuxppc-dev, linux-kernel, linux-arm-kernel,
	linux-pm

The core idle loop now takes care of it.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/powerpc/platforms/powernv/setup.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 21166f65c9..a932feb290 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -26,7 +26,6 @@
 #include <linux/of_fdt.h>
 #include <linux/interrupt.h>
 #include <linux/bug.h>
-#include <linux/cpuidle.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
@@ -217,16 +216,6 @@ static int __init pnv_probe(void)
 	return 1;
 }
 
-void powernv_idle(void)
-{
-	/* Hook to cpuidle framework if available, else
-	 * call on default platform idle code
-	 */
-	if (cpuidle_idle_call()) {
-		power7_idle();
-	}
-}
-
 define_machine(powernv) {
 	.name			= "PowerNV",
 	.probe			= pnv_probe,
@@ -236,7 +225,7 @@ define_machine(powernv) {
 	.show_cpuinfo		= pnv_show_cpuinfo,
 	.progress		= pnv_progress,
 	.machine_shutdown	= pnv_shutdown,
-	.power_save             = powernv_idle,
+	.power_save             = power7_idle,
 	.calibrate_decr		= generic_calibrate_decr,
 #ifdef CONFIG_KEXEC
 	.kexec_cpu_down		= pnv_kexec_cpu_down,
-- 
1.8.4.108.g55ea5f6

^ permalink raw reply related

* [PATCH 2/2] ARM64: powernv: remove redundant cpuidle_idle_call()
From: Nicolas Pitre @ 2014-02-06 14:16 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Preeti U Murthy, Thomas Gleixner,
	Ingo Molnar, Peter Zijlstra, Rafael J. Wysocki, Daniel Lezcano
  Cc: linaro-kernel, linuxppc-dev, linux-kernel, linux-arm-kernel,
	linux-pm
In-Reply-To: <1391696188-14540-1-git-send-email-nicolas.pitre@linaro.org>

The core idle loop now takes care of it.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
---
 arch/arm64/kernel/process.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 1c0a9be2ff..9cce0098f4 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -33,7 +33,6 @@
 #include <linux/kallsyms.h>
 #include <linux/init.h>
 #include <linux/cpu.h>
-#include <linux/cpuidle.h>
 #include <linux/elfcore.h>
 #include <linux/pm.h>
 #include <linux/tick.h>
@@ -94,10 +93,8 @@ void arch_cpu_idle(void)
 	 * This should do all the clock switching and wait for interrupt
 	 * tricks
 	 */
-	if (cpuidle_idle_call()) {
-		cpu_do_idle();
-		local_irq_enable();
-	}
+	cpu_do_idle();
+	local_irq_enable();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
1.8.4.108.g55ea5f6

^ permalink raw reply related

* Re: [PATCH v2 6/6] cpu/idle.c: move to sched/idle.c
From: Nicolas Pitre @ 2014-02-06 14:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linaro-kernel, Russell King, linux-sh, linux-pm, Daniel Lezcano,
	Rafael J. Wysocki, linux-kernel, Paul Mundt, Preeti U Murthy,
	Thomas Gleixner, linuxppc-dev, Ingo Molnar, linux-arm-kernel
In-Reply-To: <20140130162753.GF5002@laptop.programming.kicks-ass.net>

On Thu, 30 Jan 2014, Peter Zijlstra wrote:

> On Thu, Jan 30, 2014 at 11:03:31AM -0500, Nicolas Pitre wrote:
> > > This is not a valid patch for PATCH(1). Please try again.
> > 
> > Don't you use git?  ;-)
> 
> Nah, git and me don't get along well.
> 
> > Here's a plain patch:
> 
> Thanks!

Hi Peter,

Did you merge those patches in your tree?  If so, is it published 
somewhere?  That would be a good idea if that could appear in linux-next 
so to prevent people from adding more calls to cpuidle_idle_call() from 
architecture code.  I'm sending you 2 additional patches right away to 
remove those that appeared in v3.14-rc1.


Nicolas

^ permalink raw reply

* Re: [PATCH V3 3/3] time/cpuidle:Handle failed call to BROADCAST_ENTER on archs with CPUIDLE_FLAG_TIMER_STOP set
From: Rafael J. Wysocki @ 2014-02-06 12:50 UTC (permalink / raw)
  To: Preeti U Murthy
  Cc: deepthi, daniel.lezcano, linux-pm, peterz, fweisbec,
	rafael.j.wysocki, linux-kernel, paulus, srivatsa.bhat, tglx,
	paulmck, linuxppc-dev, mingo
In-Reply-To: <20140206055036.4595.86947.stgit@preeti>

On Thursday, February 06, 2014 11:20:37 AM Preeti U Murthy wrote:
> Some archs set the CPUIDLE_FLAG_TIMER_STOP flag for idle states in which the
> local timers stop. The cpuidle_idle_call() currently handles such idle states
> by calling into the broadcast framework so as to wakeup CPUs at their next
> wakeup event. With the hrtimer mode of broadcast, the BROADCAST_ENTER call
> into the broadcast frameowork can fail for archs that do not have an external
> clock device to handle wakeups and the CPU in question has to thus be made
> the stand by CPU. This patch handles such cases by failing the call into
> cpuidle so that the arch can take some default action. The arch will certainly
> not enter a similar idle state because a failed cpuidle call will also implicitly
> indicate that the broadcast framework has not registered this CPU to be woken up.
> Hence we are safe if we fail the cpuidle call.
> 
> In the process move the functions that trace idle statistics just before and
> after the entry and exit into idle states respectively. In other
> scenarios where the call to cpuidle fails, we end up not tracing idle
> entry and exit since a decision on an idle state could not be taken. Similarly
> when the call to broadcast framework fails, we skip tracing idle statistics
> because we are in no further position to take a decision on an alternative
> idle state to enter into.
> 
> Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>

The cpuidle changes in this patch look reasonable to me, but I guess you'd
like it to go in via tip along with the rest of the series, so

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

> ---
> 
>  drivers/cpuidle/cpuidle.c |   38 +++++++++++++++++++++++---------------
>  1 file changed, 23 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index a55e68f..8f42033 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -117,15 +117,19 @@ int cpuidle_idle_call(void)
>  {
>  	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
>  	struct cpuidle_driver *drv;
> -	int next_state, entered_state;
> -	bool broadcast;
> +	int next_state, entered_state, ret = 0;
> +	bool broadcast = false;
>  
> -	if (off || !initialized)
> -		return -ENODEV;
> +	if (off || !initialized) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
>  
>  	/* check if the device is ready */
> -	if (!dev || !dev->enabled)
> -		return -EBUSY;
> +	if (!dev || !dev->enabled) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
>  
>  	drv = cpuidle_get_cpu_driver(dev);
>  
> @@ -137,15 +141,18 @@ int cpuidle_idle_call(void)
>  		if (cpuidle_curr_governor->reflect)
>  			cpuidle_curr_governor->reflect(dev, next_state);
>  		local_irq_enable();
> -		return 0;
> +		goto out;
>  	}
>  
> -	trace_cpu_idle_rcuidle(next_state, dev->cpu);
> -
>  	broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
>  
> -	if (broadcast)
> -		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
> +	if (broadcast) {
> +		ret = clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	trace_cpu_idle_rcuidle(next_state, dev->cpu);
>  
>  	if (cpuidle_state_is_coupled(dev, drv, next_state))
>  		entered_state = cpuidle_enter_state_coupled(dev, drv,
> @@ -153,16 +160,17 @@ int cpuidle_idle_call(void)
>  	else
>  		entered_state = cpuidle_enter_state(dev, drv, next_state);
>  
> -	if (broadcast)
> -		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
> -
>  	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
>  
>  	/* give the governor an opportunity to reflect on the outcome */
>  	if (cpuidle_curr_governor->reflect)
>  		cpuidle_curr_governor->reflect(dev, entered_state);
>  
> -	return 0;
> +out:	if (broadcast)
> +		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
> +
> +
> +	return ret;
>  }
>  
>  /**
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* [PATCH] Convert powerpc simple spinlocks into ticket locks
From: Torsten Duwe @ 2014-02-06 10:37 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Anton Blanchard,
	Paul E. McKenney, Peter Zijlstra, Ingo Molnar
  Cc: linuxppc-dev, linux-kernel

x86 has them, MIPS has them, ARM has them, even ia64 has them:
ticket locks. They reduce memory bus and cache pressure especially
for contended spinlocks, increasing performance.

This patch is a port of the x86 spin locks, mostly written in C,
to the powerpc, introducing inline asm where needed. The pSeries
directed yield for vCPUs is taken care of by an additional "holder"
field in the lock.

Signed-off-by: Torsten Duwe <duwe@suse.de>
--
 arch/powerpc/include/asm/spinlock_types.h |   27 ++++-
 arch/powerpc/include/asm/spinlock.h       |  157 +++++++++++++++++++++++-------
 arch/powerpc/lib/locks.c                  |    6 -
 3 files changed, 151 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 2351adc..64a98be 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -5,11 +5,30 @@
 # error "please don't include this file directly"
 #endif
 
-typedef struct {
-	volatile unsigned int slock;
-} arch_spinlock_t;
+typedef u16 __ticket_t;
+typedef u32 __ticketpair_t;
+
+#define TICKET_LOCK_INC	((__ticket_t)1)
+
+#define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
+
+typedef struct arch_spinlock {
+	union {
+		__ticketpair_t head_tail;
+		struct __raw_tickets {
+#ifdef __BIG_ENDIAN__		/* The "tail" part should be in the MSBs */
+			__ticket_t tail, head;
+#else
+			__ticket_t head, tail;
+#endif
+		} tickets;
+	};
+#if defined(CONFIG_PPC64)
+	u32 holder;
+#endif
+} arch_spinlock_t __aligned(8);
 
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 }, 0 }
 
 typedef struct {
 	volatile signed int lock;
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 5f54a74..6e33691 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -9,8 +9,7 @@
  * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
  * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
  *	Rework to support virtual processors
- *
- * Type of int is used as a full 64b word is not necessary.
+ * Copyright (C) 2014 Torsten Duwe <duwe@suse.de>, ticket lock port
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -28,7 +27,20 @@
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 
-#define arch_spin_is_locked(x)		((x)->slock != 0)
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+	return tmp.tail != tmp.head;
+}
+
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
+{
+	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+}
+#define arch_spin_is_contended	arch_spin_is_contended
 
 #ifdef CONFIG_PPC64
 /* use 0x800000yy when locked, where yy == CPU number */
@@ -37,8 +49,12 @@
 #else
 #define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
 #endif
+#define SIZE_LETTER	"d"
+#define L64		"1"
 #else
 #define LOCK_TOKEN	1
+#define SIZE_LETTER	"w"
+#define L64		"0"
 #endif
 
 #if defined(CONFIG_PPC64) && defined(CONFIG_SMP)
@@ -55,33 +71,59 @@
 #endif
 
 /*
- * This returns the old value in the lock, so we succeeded
- * in getting the lock if the return value is 0.
+ * Our own cmpxchg, operating on spinlock_t's.  Returns 0 iff value
+ * read at lock was equal to "old" AND the cmpxchg succeeded
+ * uninterruptedly.
  */
-static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
+static __always_inline int __arch_spin_cmpxchg_eq(arch_spinlock_t *lock,
+					       arch_spinlock_t old,
+					       arch_spinlock_t new)
 {
-	unsigned long tmp, token;
+	register int retval = 1;
+	register arch_spinlock_t tmp;
+
+	__asm__ __volatile__ (
+"	li %0,1\n"		/* default to "fail" */
+	PPC_RELEASE_BARRIER
+"1:	l" SIZE_LETTER "arx   %2,0,%5         # __arch_spin_cmpxchg_eq\n"
+"	cmp 0,"L64",%3,%2\n"
+"	bne-    2f\n"
+	PPC405_ERR77(0, "%5")
+"	st" SIZE_LETTER "cx.  %4,0,%5\n"
+"	bne-    1b\n"
+"	isync\n"
+"	li %0,0\n"
+"2:"
+	: "=&r" (retval), "+m" (*lock)
+	: "r" (tmp), "r" (old), "r" (new), "r" (lock)
+	: "cc", "memory");
+
+	return retval;
+}
+#undef SIZE_LETTER
+#undef L64
 
-	token = LOCK_TOKEN;
-	__asm__ __volatile__(
-"1:	" PPC_LWARX(%0,0,%2,1) "\n\
-	cmpwi		0,%0,0\n\
-	bne-		2f\n\
-	stwcx.		%1,0,%2\n\
-	bne-		1b\n"
-	PPC_ACQUIRE_BARRIER
-"2:"
-	: "=&r" (tmp)
-	: "r" (token), "r" (&lock->slock)
-	: "cr0", "memory");
+static __always_inline int __arch_spin_trylock(arch_spinlock_t *lock)
+{
+	arch_spinlock_t old, new;
 
-	return tmp;
+	old.tickets = ACCESS_ONCE(lock->tickets);
+	if (old.tickets.head != old.tickets.tail)
+		return 0;
+
+	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+#if defined(CONFIG_PPC64)
+	old.holder = lock->holder;
+	new.holder = LOCK_TOKEN;
+#endif
+
+	return !__arch_spin_cmpxchg_eq(lock, old, new);
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
 	CLEAR_IO_SYNC;
-	return __arch_spin_trylock(lock) == 0;
+	return  __arch_spin_trylock(lock);
 }
 
 /*
@@ -93,9 +135,8 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * rest of our timeslice to the lock holder.
  *
  * So that we can tell which virtual processor is holding a lock,
- * we put 0x80000000 | smp_processor_id() in the lock when it is
- * held.  Conveniently, we have a word in the paca that holds this
- * value.
+ * we put 0x80000000 | smp_processor_id() into lock->holder.
+ * Conveniently, we have a word in the paca that holds this value.
  */
 
 #if defined(CONFIG_PPC_SPLPAR)
@@ -109,19 +150,59 @@ extern void __rw_yield(arch_rwlock_t *lock);
 #define SHARED_PROCESSOR	0
 #endif
 
-static inline void arch_spin_lock(arch_spinlock_t *lock)
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourself to the queue and noting our position), then waiting until the head
+ * becomes equal to the the initial value of the tail.
+ *
+ * We use an asm covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide add increment of the low part would carry
+ * up and contaminate the high part.
+ */
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 {
+	register struct __raw_tickets old, tmp,
+		inc = { .tail = TICKET_LOCK_INC };
+
 	CLEAR_IO_SYNC;
-	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
-			break;
+	__asm__ __volatile__(
+"1:	lwarx   %0,0,%4         # arch_spin_lock\n"
+"	add     %1,%3,%0\n"
+	PPC405_ERR77(0, "%4")
+"	stwcx.  %1,0,%4\n"
+"	bne-    1b"
+	: "=&r" (old), "=&r" (tmp), "+m" (lock->tickets)
+	: "r" (inc), "r" (&lock->tickets)
+	: "cc");
+
+	if (likely(old.head == old.tail)) {
+#if defined(CONFIG_PPC64)
+		lock->holder = LOCK_TOKEN;
+#endif
+		goto out;
+	}
+
+	for (;;) {
+		unsigned count = 100;
+
 		do {
+			if (ACCESS_ONCE(lock->tickets.head) == old.tail) {
+#if defined(CONFIG_PPC64)
+				lock->holder = LOCK_TOKEN;
+#endif
+				goto out;
+			}
 			HMT_low();
 			if (SHARED_PROCESSOR)
 				__spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
+		} while (--count);
 		HMT_medium();
 	}
+out:	barrier();	/* make sure nothing creeps before the lock is taken */
 }
 
 static inline
@@ -131,7 +211,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 
 	CLEAR_IO_SYNC;
 	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
+		if (likely(__arch_spin_trylock(lock)))
 			break;
 		local_save_flags(flags_dis);
 		local_irq_restore(flags);
@@ -139,7 +219,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 			HMT_low();
 			if (SHARED_PROCESSOR)
 				__spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
+		} while (arch_spin_is_locked(lock));
 		HMT_medium();
 		local_irq_restore(flags_dis);
 	}
@@ -147,10 +227,22 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
+	arch_spinlock_t old, new;
+
+#if defined(CONFIG_PPC64)
+	new.holder = 0;
+#endif
+	do {
+		old.tickets = ACCESS_ONCE(lock->tickets);
+#if defined(CONFIG_PPC64)
+		old.holder = lock->holder;
+#endif
+		new.tickets.head = old.tickets.head + TICKET_LOCK_INC;
+		new.tickets.tail = old.tickets.tail;
+	} while (unlikely(__arch_spin_cmpxchg_eq(lock, old, new)));
 	SYNC_IO;
 	__asm__ __volatile__("# arch_spin_unlock\n\t"
 				PPC_RELEASE_BARRIER: : :"memory");
-	lock->slock = 0;
 }
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 0c9c8d7..4a57e32 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -27,7 +27,7 @@ void __spin_yield(arch_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
 
-	lock_value = lock->slock;
+	lock_value = lock->holder;
 	if (lock_value == 0)
 		return;
 	holder_cpu = lock_value & 0xffff;
@@ -36,7 +36,7 @@ void __spin_yield(arch_spinlock_t *lock)
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
-	if (lock->slock != lock_value)
+	if (lock->holder != lock_value)
 		return;		/* something has changed */
 	plpar_hcall_norets(H_CONFER,
 		get_hard_smp_processor_id(holder_cpu), yield_count);
@@ -70,7 +70,7 @@ void __rw_yield(arch_rwlock_t *rw)
 
 void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	while (lock->slock) {
+	while (arch_spin_is_locked(lock)) {
 		HMT_low();
 		if (SHARED_PROCESSOR)
 			__spin_yield(lock);

^ permalink raw reply related

* Re: [RFC PATCH 2/3] topology: support node_numa_mem() for determining the fallback node
From: Joonsoo Kim @ 2014-02-06 10:29 UTC (permalink / raw)
  To: David Rientjes
  Cc: Han Pingtian, Nishanth Aravamudan, Pekka Enberg,
	Linux Memory Management List, Paul Mackerras, Anton Blanchard,
	Matt Mackall, Joonsoo Kim, linuxppc-dev, Christoph Lameter,
	Wanpeng Li
In-Reply-To: <alpine.DEB.2.02.1402060041040.21148@chino.kir.corp.google.com>

2014-02-06 David Rientjes <rientjes@google.com>:
> On Thu, 6 Feb 2014, Joonsoo Kim wrote:
>
>> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>>
>
> I may be misunderstanding this patch and there's no help because there's
> no changelog.

Sorry about that.
I made this patch just for testing. :)
Thanks for looking this.

>> diff --git a/include/linux/topology.h b/include/linux/topology.h
>> index 12ae6ce..a6d5438 100644
>> --- a/include/linux/topology.h
>> +++ b/include/linux/topology.h
>> @@ -233,11 +233,20 @@ static inline int numa_node_id(void)
>>   * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
>>   */
>>  DECLARE_PER_CPU(int, _numa_mem_);
>> +int _node_numa_mem_[MAX_NUMNODES];
>>
>>  #ifndef set_numa_mem
>>  static inline void set_numa_mem(int node)
>>  {
>>       this_cpu_write(_numa_mem_, node);
>> +     _node_numa_mem_[numa_node_id()] = node;
>> +}
>> +#endif
>> +
>> +#ifndef get_numa_mem
>> +static inline int get_numa_mem(int node)
>> +{
>> +     return _node_numa_mem_[node];
>>  }
>>  #endif
>>
>> @@ -260,6 +269,7 @@ static inline int cpu_to_mem(int cpu)
>>  static inline void set_cpu_numa_mem(int cpu, int node)
>>  {
>>       per_cpu(_numa_mem_, cpu) = node;
>> +     _node_numa_mem_[numa_node_id()] = node;
>
> The intention seems to be that _node_numa_mem_[X] for a node X will return
> a node Y with memory that has the nearest distance?  In other words,
> caching the value returned by local_memory_node(X)?

Yes, you are right.

> That doesn't seem to be what it's doing since numa_node_id() is the node
> of the cpu that current is running on so this ends up getting initialized
> to whatever local_memory_node(cpu_to_node(cpu)) is for the last bit set in
> cpu_possible_mask.

Yes, I made a mistake.
Thanks for pointer.
I fix it and attach v2.
Now I'm out of office, so I'm not sure this second version is correct :(

Thanks.

----------8<--------------
>From bf691e7eb07f966e3aed251eaeb18f229ee32d1f Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 6 Feb 2014 17:07:05 +0900
Subject: [RFC PATCH 2/3 v2] topology: support node_numa_mem() for
determining the
 fallback node

We need to determine the fallback node in slub allocator if the allocation
target node is memoryless node. Without it, the SLUB wrongly select
the node which has no memory and can't use a partial slab, because of node
mismatch. Introduced function, node_numa_mem(X), will return
a node Y with memory that has the nearest distance. If X is memoryless
node, it will return nearest distance node, but, if
X is normal node, it will return itself.

We will use this function in following patch to determine the fallback
node.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 12ae6ce..66b19b8 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -233,11 +233,20 @@ static inline int numa_node_id(void)
  * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
  */
 DECLARE_PER_CPU(int, _numa_mem_);
+int _node_numa_mem_[MAX_NUMNODES];

 #ifndef set_numa_mem
 static inline void set_numa_mem(int node)
 {
  this_cpu_write(_numa_mem_, node);
+ _node_numa_mem_[numa_node_id()] = node;
+}
+#endif
+
+#ifndef get_numa_mem
+static inline int get_numa_mem(int node)
+{
+ return _node_numa_mem_[node];
 }
 #endif

@@ -260,6 +269,7 @@ static inline int cpu_to_mem(int cpu)
 static inline void set_cpu_numa_mem(int cpu, int node)
 {
  per_cpu(_numa_mem_, cpu) = node;
+ _node_numa_mem_[cpu_to_node(cpu)] = node;
 }
 #endif

@@ -273,6 +283,13 @@ static inline int numa_mem_id(void)
 }
 #endif

+#ifndef get_numa_mem
+static inline int get_numa_mem(int node)
+{
+ return node;
+}
+#endif
+
 #ifndef cpu_to_mem
 static inline int cpu_to_mem(int cpu)
 {
-- 
1.7.9.5

^ permalink raw reply related

* Re: arch/powerpc/math-emu/mtfsf.c - incorrect mask?
From: Gabriel Paubert @ 2014-02-06  8:26 UTC (permalink / raw)
  To: Stephen N Chivers; +Cc: Chris Proctor, linuxppc-dev
In-Reply-To: <OF784BA598.ACFE9DA4-ONCA257C76.00827472-CA257C77.000BCFAD@csc.com>

On Thu, Feb 06, 2014 at 12:09:00PM +1000, Stephen N Chivers wrote:
> I have a MPC8548e based board and an application that makes
> extensive use of floating point including numerous calls to cos.
> In the same program there is the use of an sqlite database.
> 
> The kernel is derived from 2.6.31 and is compiled with math emulation.
> 
> At some point after the reading of the SQLITE database, the
> return result from cos goes from an in range value to an out
> of range value.
> 
> This is as a result of the FP rounding mode mutating from "round to 
> nearest"
> to "round toward zero".
> 
> The cos in the glibc version being used is known to be sensitive to 
> rounding
> direction and Joseph Myers has previously fixed glibc.
> 
> The failure does not occur on a machine that has a hardware floating
> point unit (a MPC7410 processor).
> 
> I have traced the mutation to the following series of instructions:
> 
>         mffs            f0
>         mtfsb1          4*cr7+so
>         mtfsb0          4*cr7+eq
>         fadd            f13,f1,f2
>         mtfsf           1, f0
> 
> The instructions are part of the stream emitted by gcc for the conversion
> of a 128 bit floating point value into an integer in the sqlite database 
> read.
> 
> Immediately before the execution of the mffs instruction the "rounding
> mode" is "round to nearest".
> 
> On the MPC8548 board, the execution of the mtfsf instruction does not
> restore the rounding mode to "round to nearest".
> 
> I believe that the mask computation in mtfsf.c is incorrect and is 
> reversed.
> 
> In the latest version of the file (linux-3.14-rc1), the mask is computed 
> by:
> 
>                  mask = 0;
>                  if (FM & (1 << 0))
>                         mask |= 0x90000000;
>                  if (FM & (1 << 1))
>                         mask |= 0x0f000000;
>                  if (FM & (1 << 2))
>                         mask |= 0x00f00000;
>                  if (FM & (1 << 3))
>                         mask |= 0x000f0000;
>                  if (FM & (1 << 4))
>                         mask |= 0x0000f000;
>                  if (FM & (1 << 5))
>                         mask |= 0x00000f00;
>                  if (FM & (1 << 6))
>                         mask |= 0x000000f0;
>                  if (FM & (1 << 7))
>                         mask |= 0x0000000f;
> 
> I think it should be:
> 
>                 mask = 0;
>                 if (FM & (1 << 0))
>                         mask |= 0x0000000f;
>                 if (FM & (1 << 1))
>                         mask |= 0x000000f0;
>                 if (FM & (1 << 2))
>                         mask |= 0x00000f00;
>                 if (FM & (1 << 3))
>                         mask |= 0x0000f000;
>                 if (FM & (1 << 4))
>                         mask |= 0x000f0000;
>                 if (FM & (1 << 5))
>                         mask |= 0x00f00000;
>                 if (FM & (1 << 6))
>                         mask |= 0x0f000000;
>                 if (FM & (1 << 7))
>                         mask |= 0x90000000;
> 
> With the above mask computation I get consistent results for both the 
> MPC8548
> and MPC7410 boards.
> 
> Am I missing something subtle?

No I think you are correct. This said, this code may probably be optimized 
to eliminate a lot of the conditional branches. I think that:

mask = (FM & 1);
mask |= (FM << 3) & 0x10;
mask |= (FM << 6) & 0x100;
mask |= (FM << 9) & 0x1000;
mask |= (FM << 12) & 0x10000;
mask |= (FM << 15) & 0x100000;
mask |= (FM << 18) & 0x1000000;
mask |= (FM << 21) & 0x10000000;
mask *= 15;

should do the job, in less code space and without a single branch.

Each one of the "mask |=" lines should be translated into an
rlwinm instruction followed by an "or". Actually it should be possible
to transform each of these lines into a single "rlwimi" instruction
but I don't know how to coerce gcc to reach this level of optimization.

Another way of optomizing this could be:

mask = (FM & 0x0f) | ((FM << 12) & 0x000f0000);
mask = (mask & 0x00030003) | ((mask << 6) & 0x03030303);
mask = (mask & 0x01010101) | ((mask << 3) & 0x10101010);
mask *= 15;

It's not easy to see which of the solutions is faster, the second one
needs to create quite a few constants, but its dependency length is 
lower. It is very likely that the first solution is faster in cache-cold
case and the second in cache-hot. 

Regardless, the original code is rather naïve, larger and slower in all cases,
with timing variation depending on branch mispredictions.

	Regards,
	Gabriel

^ permalink raw reply

* Re: [RFC PATCH 2/3] topology: support node_numa_mem() for determining the fallback node
From: David Rientjes @ 2014-02-06  8:52 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Han Pingtian, Nishanth Aravamudan, penberg, linux-mm, paulus,
	Anton Blanchard, mpm, Christoph Lameter, linuxppc-dev, Wanpeng Li
In-Reply-To: <1391674026-20092-2-git-send-email-iamjoonsoo.kim@lge.com>

On Thu, 6 Feb 2014, Joonsoo Kim wrote:

> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> 

I may be misunderstanding this patch and there's no help because there's 
no changelog.

> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index 12ae6ce..a6d5438 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -233,11 +233,20 @@ static inline int numa_node_id(void)
>   * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
>   */
>  DECLARE_PER_CPU(int, _numa_mem_);
> +int _node_numa_mem_[MAX_NUMNODES];
>  
>  #ifndef set_numa_mem
>  static inline void set_numa_mem(int node)
>  {
>  	this_cpu_write(_numa_mem_, node);
> +	_node_numa_mem_[numa_node_id()] = node;
> +}
> +#endif
> +
> +#ifndef get_numa_mem
> +static inline int get_numa_mem(int node)
> +{
> +	return _node_numa_mem_[node];
>  }
>  #endif
>  
> @@ -260,6 +269,7 @@ static inline int cpu_to_mem(int cpu)
>  static inline void set_cpu_numa_mem(int cpu, int node)
>  {
>  	per_cpu(_numa_mem_, cpu) = node;
> +	_node_numa_mem_[numa_node_id()] = node;

The intention seems to be that _node_numa_mem_[X] for a node X will return 
a node Y with memory that has the nearest distance?  In other words, 
caching the value returned by local_memory_node(X)?

That doesn't seem to be what it's doing since numa_node_id() is the node 
of the cpu that current is running on so this ends up getting initialized 
to whatever local_memory_node(cpu_to_node(cpu)) is for the last bit set in 
cpu_possible_mask.

>  }
>  #endif
>  
> @@ -273,6 +283,13 @@ static inline int numa_mem_id(void)
>  }
>  #endif
>  
> +#ifndef get_numa_mem
> +static inline int get_numa_mem(int node)
> +{
> +	return node;
> +}
> +#endif
> +
>  #ifndef cpu_to_mem
>  static inline int cpu_to_mem(int cpu)
>  {

^ permalink raw reply

* Re: [RFC PATCH 1/3] slub: search partial list on numa_mem_id(), instead of numa_node_id()
From: David Rientjes @ 2014-02-06  8:37 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Han Pingtian, Nishanth Aravamudan, penberg, linux-mm, paulus,
	Anton Blanchard, mpm, Christoph Lameter, linuxppc-dev, Wanpeng Li
In-Reply-To: <1391674026-20092-1-git-send-email-iamjoonsoo.kim@lge.com>

On Thu, 6 Feb 2014, Joonsoo Kim wrote:

> Currently, if allocation constraint to node is NUMA_NO_NODE, we search
> a partial slab on numa_node_id() node. This doesn't work properly on the
> system having memoryless node, since it can have no memory on that node and
> there must be no partial slab on that node.
> 
> On that node, page allocation always fallback to numa_mem_id() first. So
> searching a partial slab on numa_node_id() in that case is proper solution
> for memoryless node case.
> 
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> 

Acked-by: David Rientjes <rientjes@google.com>

I think you'll need to send these to Andrew since he appears to be picking 
up slub patches these days.

^ permalink raw reply

* [RFC PATCH 1/3] slub: search partial list on numa_mem_id(), instead of numa_node_id()
From: Joonsoo Kim @ 2014-02-06  8:07 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Han Pingtian, mpm, penberg, linux-mm, paulus, Anton Blanchard,
	David Rientjes, Christoph Lameter, linuxppc-dev, Joonsoo Kim,
	Wanpeng Li
In-Reply-To: <20140206020757.GC5433@linux.vnet.ibm.com>

Currently, if allocation constraint to node is NUMA_NO_NODE, we search
a partial slab on numa_node_id() node. This doesn't work properly on the
system having memoryless node, since it can have no memory on that node and
there must be no partial slab on that node.

On that node, page allocation always fallback to numa_mem_id() first. So
searching a partial slab on numa_node_id() in that case is proper solution
for memoryless node case.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/mm/slub.c b/mm/slub.c
index 545a170..cc1f995 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1698,7 +1698,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
 		struct kmem_cache_cpu *c)
 {
 	void *object;
-	int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
+	int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node;
 
 	object = get_partial_node(s, get_node(s, searchnode), c, flags);
 	if (object || node != NUMA_NO_NODE)
-- 
1.7.9.5

^ permalink raw reply related

* [RFC PATCH 2/3] topology: support node_numa_mem() for determining the fallback node
From: Joonsoo Kim @ 2014-02-06  8:07 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Han Pingtian, mpm, penberg, linux-mm, paulus, Anton Blanchard,
	David Rientjes, Christoph Lameter, linuxppc-dev, Joonsoo Kim,
	Wanpeng Li
In-Reply-To: <1391674026-20092-1-git-send-email-iamjoonsoo.kim@lge.com>

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/include/linux/topology.h b/include/linux/topology.h
index 12ae6ce..a6d5438 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -233,11 +233,20 @@ static inline int numa_node_id(void)
  * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
  */
 DECLARE_PER_CPU(int, _numa_mem_);
+int _node_numa_mem_[MAX_NUMNODES];
 
 #ifndef set_numa_mem
 static inline void set_numa_mem(int node)
 {
 	this_cpu_write(_numa_mem_, node);
+	_node_numa_mem_[numa_node_id()] = node;
+}
+#endif
+
+#ifndef get_numa_mem
+static inline int get_numa_mem(int node)
+{
+	return _node_numa_mem_[node];
 }
 #endif
 
@@ -260,6 +269,7 @@ static inline int cpu_to_mem(int cpu)
 static inline void set_cpu_numa_mem(int cpu, int node)
 {
 	per_cpu(_numa_mem_, cpu) = node;
+	_node_numa_mem_[numa_node_id()] = node;
 }
 #endif
 
@@ -273,6 +283,13 @@ static inline int numa_mem_id(void)
 }
 #endif
 
+#ifndef get_numa_mem
+static inline int get_numa_mem(int node)
+{
+	return node;
+}
+#endif
+
 #ifndef cpu_to_mem
 static inline int cpu_to_mem(int cpu)
 {
-- 
1.7.9.5

^ permalink raw reply related

* [RFC PATCH 3/3] slub: fallback to get_numa_mem() node if we want to allocate on memoryless node
From: Joonsoo Kim @ 2014-02-06  8:07 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Han Pingtian, mpm, penberg, linux-mm, paulus, Anton Blanchard,
	David Rientjes, Christoph Lameter, linuxppc-dev, Joonsoo Kim,
	Wanpeng Li
In-Reply-To: <1391674026-20092-1-git-send-email-iamjoonsoo.kim@lge.com>

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>

diff --git a/mm/slub.c b/mm/slub.c
index cc1f995..c851f82 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1700,6 +1700,14 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
 	void *object;
 	int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node;
 
+	if (node == NUMA_NO_NODE)
+		searchnode = numa_mem_id();
+	else {
+		searchnode = node;
+		if (!node_present_pages(node))
+			searchnode = get_numa_mem(node);
+	}
+
 	object = get_partial_node(s, get_node(s, searchnode), c, flags);
 	if (object || node != NUMA_NO_NODE)
 		return object;
@@ -2277,11 +2285,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 redo:
 
 	if (unlikely(!node_match(page, node))) {
-		stat(s, ALLOC_NODE_MISMATCH);
-		deactivate_slab(s, page, c->freelist);
-		c->page = NULL;
-		c->freelist = NULL;
-		goto new_slab;
+		int searchnode = node;
+
+		if (node != NUMA_NO_NODE && !node_present_pages(node))
+			searchnode = get_numa_mem(node);
+
+		if (!node_match(page, searchnode)) {
+			stat(s, ALLOC_NODE_MISMATCH);
+			deactivate_slab(s, page, c->freelist);
+			c->page = NULL;
+			c->freelist = NULL;
+			goto new_slab;
+		}
 	}
 
 	/*
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH] slub: Don't throw away partial remote slabs if there is no local memory
From: Joonsoo Kim @ 2014-02-06  8:04 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Han Pingtian, mpm, penberg, linux-mm, paulus, Anton Blanchard,
	David Rientjes, Christoph Lameter, linuxppc-dev, Wanpeng Li
In-Reply-To: <20140206020757.GC5433@linux.vnet.ibm.com>

On Wed, Feb 05, 2014 at 06:07:57PM -0800, Nishanth Aravamudan wrote:
> On 24.01.2014 [16:25:58 -0800], David Rientjes wrote:
> > On Fri, 24 Jan 2014, Nishanth Aravamudan wrote:
> > 
> > > Thank you for clarifying and providing  a test patch. I ran with this on
> > > the system showing the original problem, configured to have 15GB of
> > > memory.
> > > 
> > > With your patch after boot:
> > > 
> > > MemTotal:       15604736 kB
> > > MemFree:         8768192 kB
> > > Slab:            3882560 kB
> > > SReclaimable:     105408 kB
> > > SUnreclaim:      3777152 kB
> > > 
> > > With Anton's patch after boot:
> > > 
> > > MemTotal:       15604736 kB
> > > MemFree:        11195008 kB
> > > Slab:            1427968 kB
> > > SReclaimable:     109184 kB
> > > SUnreclaim:      1318784 kB
> > > 
> > > 
> > > I know that's fairly unscientific, but the numbers are reproducible. 
> > > 
> > 
> > I don't think the goal of the discussion is to reduce the amount of slab 
> > allocated, but rather get the most local slab memory possible by use of 
> > kmalloc_node().  When a memoryless node is being passed to kmalloc_node(), 
> > which is probably cpu_to_node() for a cpu bound to a node without memory, 
> > my patch is allocating it on the most local node; Anton's patch is 
> > allocating it on whatever happened to be the cpu slab.
> > 
> > > > diff --git a/mm/slub.c b/mm/slub.c
> > > > --- a/mm/slub.c
> > > > +++ b/mm/slub.c
> > > > @@ -2278,10 +2278,14 @@ redo:
> > > > 
> > > >  	if (unlikely(!node_match(page, node))) {
> > > >  		stat(s, ALLOC_NODE_MISMATCH);
> > > > -		deactivate_slab(s, page, c->freelist);
> > > > -		c->page = NULL;
> > > > -		c->freelist = NULL;
> > > > -		goto new_slab;
> > > > +		if (unlikely(!node_present_pages(node)))
> > > > +			node = numa_mem_id();
> > > > +		if (!node_match(page, node)) {
> > > > +			deactivate_slab(s, page, c->freelist);
> > > > +			c->page = NULL;
> > > > +			c->freelist = NULL;
> > > > +			goto new_slab;
> > > > +		}
> > > 
> > > Semantically, and please correct me if I'm wrong, this patch is saying
> > > if we have a memoryless node, we expect the page's locality to be that
> > > of numa_mem_id(), and we still deactivate the slab if that isn't true.
> > > Just wanting to make sure I understand the intent.
> > > 
> > 
> > Yeah, the default policy should be to fallback to local memory if the node 
> > passed is memoryless.
> > 
> > > What I find odd is that there are only 2 nodes on this system, node 0
> > > (empty) and node 1. So won't numa_mem_id() always be 1? And every page
> > > should be coming from node 1 (thus node_match() should always be true?)
> > > 
> > 
> > The nice thing about slub is its debugging ability, what is 
> > /sys/kernel/slab/cache/objects showing in comparison between the two 
> > patches?
> 
> Ok, I finally got around to writing a script that compares the objects
> output from both kernels.
> 
> log1 is with CONFIG_HAVE_MEMORYLESS_NODES on, my kthread locality patch
> and Joonsoo's patch.
> 
> log2 is with CONFIG_HAVE_MEMORYLESS_NODES on, my kthread locality patch
> and Anton's patch.
> 
> slab                           objects    objects   percent
>                                log1       log2      change
> -----------------------------------------------------------
> :t-0000104                     71190      85680      20.353982 %
> UDP                            4352       3392       22.058824 %
> inode_cache                    54302      41923      22.796582 %
> fscache_cookie_jar             3276       2457       25.000000 %
> :t-0000896                     438        292        33.333333 %
> :t-0000080                     310401     195323     37.073978 %
> ext4_inode_cache               335        201        40.000000 %
> :t-0000192                     89408      128898     44.168307 %
> :t-0000184                     151300     81880      45.882353 %
> :t-0000512                     49698      73648      48.191074 %
> :at-0000192                    242867     120948     50.199904 %
> xfs_inode                      34350      15221      55.688501 %
> :t-0016384                     11005      17257      56.810541 %
> proc_inode_cache               103868     34717      66.575846 %
> tw_sock_TCP                    768        256        66.666667 %
> :t-0004096                     15240      25672      68.451444 %
> nfs_inode_cache                1008       315        68.750000 %
> :t-0001024                     14528      24720      70.154185 %
> :t-0032768                     655        1312       100.305344%
> :t-0002048                     14242      30720      115.700042%
> :t-0000640                     1020       2550       150.000000%
> :t-0008192                     10005      27905      178.910545%
> 
> FWIW, the configuration of this LPAR has slightly changed. It is now configured
> for maximally 400 CPUs, of which 200 are present. The result is that even with
> Joonsoo's patch (log1 above), we OOM pretty easily and Anton's slab usage
> script reports:
> 
> slab                                   mem     objs    slabs
>                                       used   active   active
> ------------------------------------------------------------
> kmalloc-512                        1182 MB    2.03%  100.00%
> kmalloc-192                        1182 MB    1.38%  100.00%
> kmalloc-16384                       966 MB   17.66%  100.00%
> kmalloc-4096                        353 MB   15.92%  100.00%
> kmalloc-8192                        259 MB   27.28%  100.00%
> kmalloc-32768                       207 MB    9.86%  100.00%
> 
> In comparison (log2 above):
> 
> slab                                   mem     objs    slabs
>                                       used   active   active
> ------------------------------------------------------------
> kmalloc-16384                       273 MB   98.76%  100.00%
> kmalloc-8192                        225 MB   98.67%  100.00%
> pgtable-2^11                        114 MB  100.00%  100.00%
> pgtable-2^12                        109 MB  100.00%  100.00%
> kmalloc-4096                        104 MB   98.59%  100.00%
> 
> I appreciate all the help so far, if anyone has any ideas how best to
> proceed further, or what they'd like debugged more, I'm happy to get
> this fixed. We're hitting this on a couple of different systems and I'd
> like to find a good resolution to the problem.

Hello,

I have no memoryless system, so, to debug it, I need your help. :)
First, please let me know node information on your system.

I'm preparing 3 another patches which are nearly same with previous patch,
but slightly different approach. Could you test them on your system?
I will send them soon.

And I think that same problem exists if CONFIG_SLAB is enabled. Could you
confirm that?

And, could you confirm that your system's numa_mem_id() is properly set?
And, could you confirm that node_present_pages() test works properly?
And, with my patches, could you give me more information on slub stat?
For this, you need to enable CONFIG_SLUB_STATS. Then please send me all the
slub stat on /proc/sys/kernel/debug/slab.

Sorry for too many request.
If it bothers you too much, please ignore it :)

Thanks.

^ permalink raw reply

* [PATCH V3 2/3] tick/cpuidle: Initialize hrtimer mode of broadcast
From: Preeti U Murthy @ 2014-02-06  5:50 UTC (permalink / raw)
  To: rafael.j.wysocki, linux-pm, peterz, fweisbec, daniel.lezcano,
	linux-kernel, paulus, benh, tglx, linuxppc-dev, mingo
  Cc: deepthi, paulmck, srivatsa.bhat
In-Reply-To: <20140206054158.4595.17827.stgit@preeti>

From: Thomas Gleixner <tglx@linutronix.de>

On some architectures, in certain CPU deep idle states the local timers stop.
An external clock device is used to wakeup these CPUs. The kernel support for the
wakeup of these CPUs is provided by the tick broadcast framework by using the
external clock device as the wakeup source.

However not all implementations of architectures provide such an external
clock device. This patch includes support in the broadcast framework to handle
the wakeup of the CPUs in deep idle states on such systems by queuing a hrtimer
on one of the CPUs, which is meant to handle the wakeup of CPUs in deep idle states.

This patchset introduces a pseudo clock device which can be registered by the
archs as tick_broadcast_device in the absence of a real external clock
device. Once registered, the broadcast framework will work as is for these
architectures as long as the archs take care of the BROADCAST_ENTER
notification failing for one of the CPUs. This CPU is made the stand by CPU to
handle wakeup of the CPUs in deep idle and it *must not enter deep idle states*.

The CPU with the earliest wakeup is chosen to be this CPU. Hence this way the
stand by CPU dynamically moves around and so does the hrtimer which is queued
to trigger at the next earliest wakeup time. This is consistent with the case where
an external clock device is present. The smp affinity of this clock device is
set to the CPU with the earliest wakeup. This patchset handles the hotplug of
the stand by CPU as well by moving the hrtimer on to the CPU handling the CPU_DEAD
notification.

Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
[Added Changelog and code to handle reprogramming of hrtimer]
---

 include/linux/clockchips.h           |    9 +++
 kernel/time/Makefile                 |    2 -
 kernel/time/tick-broadcast-hrtimer.c |  105 ++++++++++++++++++++++++++++++++++
 kernel/time/tick-broadcast.c         |   45 ++++++++++++++-
 4 files changed, 159 insertions(+), 2 deletions(-)
 create mode 100644 kernel/time/tick-broadcast-hrtimer.c

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index e0c5a6c..dbe9e14 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -62,6 +62,11 @@ enum clock_event_mode {
 #define CLOCK_EVT_FEAT_DYNIRQ		0x000020
 #define CLOCK_EVT_FEAT_PERCPU		0x000040
 
+/*
+ * Clockevent device is based on a hrtimer for broadcast
+ */
+#define CLOCK_EVT_FEAT_HRTIMER		0x000080
+
 /**
  * struct clock_event_device - clock event device descriptor
  * @event_handler:	Assigned by the framework to be called by the low
@@ -83,6 +88,7 @@ enum clock_event_mode {
  * @name:		ptr to clock event name
  * @rating:		variable to rate clock event devices
  * @irq:		IRQ number (only for non CPU local devices)
+ * @bound_on:		Bound on CPU
  * @cpumask:		cpumask to indicate for which CPUs this device works
  * @list:		list head for the management code
  * @owner:		module reference
@@ -113,6 +119,7 @@ struct clock_event_device {
 	const char		*name;
 	int			rating;
 	int			irq;
+	int			bound_on;
 	const struct cpumask	*cpumask;
 	struct list_head	list;
 	struct module		*owner;
@@ -180,9 +187,11 @@ extern int tick_receive_broadcast(void);
 #endif
 
 #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
+extern void tick_setup_hrtimer_broadcast(void);
 extern int tick_check_broadcast_expired(void);
 #else
 static inline int tick_check_broadcast_expired(void) { return 0; }
+static void tick_setup_hrtimer_broadcast(void) {};
 #endif
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 9250130..06151ef 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -3,7 +3,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= tick-broadcast.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= tick-broadcast.o tick-broadcast-hrtimer.o
 obj-$(CONFIG_GENERIC_SCHED_CLOCK)		+= sched_clock.o
 obj-$(CONFIG_TICK_ONESHOT)			+= tick-oneshot.o
 obj-$(CONFIG_TICK_ONESHOT)			+= tick-sched.o
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
new file mode 100644
index 0000000..af1e119
--- /dev/null
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -0,0 +1,105 @@
+/*
+ * linux/kernel/time/tick-broadcast-hrtimer.c
+ * This file emulates a local clock event device
+ * via a pseudo clock device.
+ */
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/clockchips.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#include "tick-internal.h"
+
+static struct hrtimer bctimer;
+
+static void bc_set_mode(enum clock_event_mode mode,
+			struct clock_event_device *bc)
+{
+	switch (mode) {
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		/*
+		 * Note, we cannot cancel the timer here as we might
+		 * run into the following live lock scenario:
+		 *
+		 * cpu 0		cpu1
+		 * lock(broadcast_lock);
+		 *			hrtimer_interrupt()
+		 *			bc_handler()
+		 *			   tick_handle_oneshot_broadcast();
+		 *			    lock(broadcast_lock);
+		 * hrtimer_cancel()
+		 *  wait_for_callback()
+		 */
+		hrtimer_try_to_cancel(&bctimer);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * This is called from the guts of the broadcast code when the cpu
+ * which is about to enter idle has the earliest broadcast timer event.
+ */
+static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
+{
+	/*
+	 * We try to cancel the timer first. If the callback is on
+	 * flight on some other cpu then we let it handle it. If we
+	 * were able to cancel the timer nothing can rearm it as we
+	 * own broadcast_lock.
+	 *
+	 * However we can also be called from the event handler of
+	 * ce_broadcast_hrtimer itself when it expires. We cannot therefore
+	 * restart the timer since it is on flight on the same CPU. But
+	 * due to the same reason we can reset it.
+	 */
+	if (hrtimer_try_to_cancel(&bctimer) >= 0) {
+		hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
+		/* Bind the "device" to the cpu */
+		bc->bound_on = smp_processor_id();
+	} else if (bc->bound_on == smp_processor_id()) {
+		hrtimer_set_expires(&bctimer, expires);
+	}
+	return 0;
+}
+
+static struct clock_event_device ce_broadcast_hrtimer = {
+	.set_mode		= bc_set_mode,
+	.set_next_ktime		= bc_set_next,
+	.features		= CLOCK_EVT_FEAT_ONESHOT |
+				  CLOCK_EVT_FEAT_KTIME |
+				  CLOCK_EVT_FEAT_HRTIMER,
+	.rating			= 0,
+	.bound_on		= -1,
+	.min_delta_ns		= 1,
+	.max_delta_ns		= KTIME_MAX,
+	.min_delta_ticks	= 1,
+	.max_delta_ticks	= KTIME_MAX,
+	.mult			= 1,
+	.shift			= 0,
+	.cpumask		= cpu_all_mask,
+};
+
+static enum hrtimer_restart bc_handler(struct hrtimer *t)
+{
+	ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
+
+	if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+		return HRTIMER_NORESTART;
+
+	return HRTIMER_RESTART;
+}
+
+void tick_setup_hrtimer_broadcast(void)
+{
+	hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	bctimer.function = bc_handler;
+	clockevents_register_device(&ce_broadcast_hrtimer);
+}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index ddf2ac2..1e46e62f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -630,6 +630,42 @@ again:
 	raw_spin_unlock(&tick_broadcast_lock);
 }
 
+static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
+{
+	if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
+		return 0;
+	if (bc->next_event.tv64 == KTIME_MAX)
+		return 0;
+	return bc->bound_on == cpu ? -EBUSY : 0;
+}
+
+static void broadcast_shutdown_local(struct clock_event_device *bc,
+				     struct clock_event_device *dev)
+{
+	/*
+	 * For hrtimer based broadcasting we cannot shutdown the cpu
+	 * local device if our own event is the first one to expire or
+	 * if we own the broadcast timer.
+	 */
+	if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
+		if (broadcast_needs_cpu(bc, smp_processor_id()))
+			return;
+		if (dev->next_event.tv64 < bc->next_event.tv64)
+			return;
+	}
+	clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+}
+
+static void broadcast_move_bc(int deadcpu)
+{
+	struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+	if (!bc || !broadcast_needs_cpu(bc, deadcpu))
+		return;
+	/* This moves the broadcast assignment to this cpu */
+	clockevents_program_event(bc, bc->next_event, 1);
+}
+
 /*
  * Powerstate information: The system enters/leaves a state, where
  * affected devices might stop
@@ -667,7 +703,7 @@ int tick_broadcast_oneshot_control(unsigned long reason)
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
 		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
 			WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
-			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+			broadcast_shutdown_local(bc, dev);
 			/*
 			 * We only reprogram the broadcast timer if we
 			 * did not mark ourself in the force mask and
@@ -680,6 +716,11 @@ int tick_broadcast_oneshot_control(unsigned long reason)
 			    dev->next_event.tv64 < bc->next_event.tv64)
 				tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
 		}
+		/*
+		 * If the current CPU owns the hrtimer broadcast
+		 * mechanism, it cannot go deep idle.
+		 */
+		ret = broadcast_needs_cpu(bc, cpu);
 	} else {
 		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
@@ -853,6 +894,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
 	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
 	cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
 
+	broadcast_move_bc(cpu);
+
 	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 

^ permalink raw reply related

* [PATCH V3 3/3] time/cpuidle:Handle failed call to BROADCAST_ENTER on archs with CPUIDLE_FLAG_TIMER_STOP set
From: Preeti U Murthy @ 2014-02-06  5:50 UTC (permalink / raw)
  To: rafael.j.wysocki, linux-pm, peterz, fweisbec, daniel.lezcano,
	linux-kernel, paulus, benh, tglx, linuxppc-dev, mingo
  Cc: deepthi, paulmck, srivatsa.bhat
In-Reply-To: <20140206054158.4595.17827.stgit@preeti>

Some archs set the CPUIDLE_FLAG_TIMER_STOP flag for idle states in which the
local timers stop. The cpuidle_idle_call() currently handles such idle states
by calling into the broadcast framework so as to wakeup CPUs at their next
wakeup event. With the hrtimer mode of broadcast, the BROADCAST_ENTER call
into the broadcast frameowork can fail for archs that do not have an external
clock device to handle wakeups and the CPU in question has to thus be made
the stand by CPU. This patch handles such cases by failing the call into
cpuidle so that the arch can take some default action. The arch will certainly
not enter a similar idle state because a failed cpuidle call will also implicitly
indicate that the broadcast framework has not registered this CPU to be woken up.
Hence we are safe if we fail the cpuidle call.

In the process move the functions that trace idle statistics just before and
after the entry and exit into idle states respectively. In other
scenarios where the call to cpuidle fails, we end up not tracing idle
entry and exit since a decision on an idle state could not be taken. Similarly
when the call to broadcast framework fails, we skip tracing idle statistics
because we are in no further position to take a decision on an alternative
idle state to enter into.

Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
---

 drivers/cpuidle/cpuidle.c |   38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a55e68f..8f42033 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -117,15 +117,19 @@ int cpuidle_idle_call(void)
 {
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
 	struct cpuidle_driver *drv;
-	int next_state, entered_state;
-	bool broadcast;
+	int next_state, entered_state, ret = 0;
+	bool broadcast = false;
 
-	if (off || !initialized)
-		return -ENODEV;
+	if (off || !initialized) {
+		ret = -ENODEV;
+		goto out;
+	}
 
 	/* check if the device is ready */
-	if (!dev || !dev->enabled)
-		return -EBUSY;
+	if (!dev || !dev->enabled) {
+		ret = -EBUSY;
+		goto out;
+	}
 
 	drv = cpuidle_get_cpu_driver(dev);
 
@@ -137,15 +141,18 @@ int cpuidle_idle_call(void)
 		if (cpuidle_curr_governor->reflect)
 			cpuidle_curr_governor->reflect(dev, next_state);
 		local_irq_enable();
-		return 0;
+		goto out;
 	}
 
-	trace_cpu_idle_rcuidle(next_state, dev->cpu);
-
 	broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
 
-	if (broadcast)
-		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+	if (broadcast) {
+		ret = clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+		if (ret)
+			goto out;
+	}
+
+	trace_cpu_idle_rcuidle(next_state, dev->cpu);
 
 	if (cpuidle_state_is_coupled(dev, drv, next_state))
 		entered_state = cpuidle_enter_state_coupled(dev, drv,
@@ -153,16 +160,17 @@ int cpuidle_idle_call(void)
 	else
 		entered_state = cpuidle_enter_state(dev, drv, next_state);
 
-	if (broadcast)
-		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
-
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
 	/* give the governor an opportunity to reflect on the outcome */
 	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev, entered_state);
 
-	return 0;
+out:	if (broadcast)
+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+
+
+	return ret;
 }
 
 /**

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox