Re: [RFC V2 PATCH 3/6] cpuidle/ppc: Add timer offload framework to support deep idle states

linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed

From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
To: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: fweisbec@gmail.com, paul.gortmaker@windriver.com,
	paulus@samba.org, shangw@linux.vnet.ibm.com, rjw@sisk.pl,
	paulmck@linux.vnet.ibm.com, arnd@arndb.de,
	linux-pm@vger.kernel.org, rostedt@goodmis.org,
	john.stultz@linaro.org, tglx@linutronix.de,
	chenhui.zhao@freescale.com, deepthi@linux.vnet.ibm.com,
	r58472@freescale.com, geoff@infradead.org,
	linux-kernel@vger.kernel.org, srivatsa.bhat@linux.vnet.ibm.com,
	schwidefsky@de.ibm.com, linuxppc-dev@lists.ozlabs.org
Subject: Re: [RFC V2 PATCH 3/6] cpuidle/ppc: Add timer offload framework to support deep idle states
Date: Thu, 22 Aug 2013 13:27:17 +1000	[thread overview]
Message-ID: <1377142037.25016.272.camel@pasglop> (raw)
In-Reply-To: <20130814115613.5193.43161.stgit@preeti.in.ibm.com>

On Wed, 2013-08-14 at 17:26 +0530, Preeti U Murthy wrote:

>  static irqreturn_t timer_action(int irq, void *data)
>  {
> -	timer_interrupt();
> +	decrementer_timer_interrupt();
>  	return IRQ_HANDLED;
>  }

I don't completely understand what you are doing here, but ...

> @@ -223,7 +223,7 @@ irqreturn_t smp_ipi_demux(void)
>  
>  #ifdef __BIG_ENDIAN
>  		if (all & (1 << (24 - 8 * PPC_MSG_TIMER)))
> -			timer_interrupt();
> +			decrementer_timer_interrupt();

Why call this decrementer_* since it's specifically *not* the
decrementer ?

Makes more sense to be called broadcast_timer_interrupt() no ?

>  		if (all & (1 << (24 - 8 * PPC_MSG_RESCHEDULE)))
>  			scheduler_ipi();
>  		if (all & (1 << (24 - 8 * PPC_MSG_CALL_FUNC_SINGLE)))
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 65ab9e9..7e858e1 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -42,6 +42,7 @@
>  #include <linux/timex.h>
>  #include <linux/kernel_stat.h>
>  #include <linux/time.h>
> +#include <linux/timer.h>
>  #include <linux/init.h>
>  #include <linux/profile.h>
>  #include <linux/cpu.h>
> @@ -97,8 +98,11 @@ static struct clocksource clocksource_timebase = {
>  
>  static int decrementer_set_next_event(unsigned long evt,
>  				      struct clock_event_device *dev);
> +static int broadcast_set_next_event(unsigned long evt,
> +				      struct clock_event_device *dev);
>  static void decrementer_set_mode(enum clock_event_mode mode,
>  				 struct clock_event_device *dev);
> +static void decrementer_timer_broadcast(const struct cpumask *mask);
>  
>  struct clock_event_device decrementer_clockevent = {
>  	.name           = "decrementer",
> @@ -106,13 +110,26 @@ struct clock_event_device decrementer_clockevent = {
>  	.irq            = 0,
>  	.set_next_event = decrementer_set_next_event,
>  	.set_mode       = decrementer_set_mode,
> -	.features       = CLOCK_EVT_FEAT_ONESHOT,
> +	.broadcast	= decrementer_timer_broadcast,
> +	.features       = CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_ONESHOT,
>  };
>  EXPORT_SYMBOL(decrementer_clockevent);
>  
> +struct clock_event_device broadcast_clockevent = {
> +	.name           = "broadcast",
> +	.rating         = 200,
> +	.irq            = 0,
> +	.set_next_event = broadcast_set_next_event,
> +	.set_mode       = decrementer_set_mode,

Same here, why "decrementer" ? This event device is *not* the
decrementer right ?

Also, pardon my ignorance, by why do we need a separate
clock_event_device ? Ie what does that do ? I am not familiar with the
broadcast scheme and what .broadcast do in the "decrementer" one, so
you need to provide me at least with better explanations.

> +	.features       = CLOCK_EVT_FEAT_ONESHOT,
> +};
> +EXPORT_SYMBOL(broadcast_clockevent);
> +
>  DEFINE_PER_CPU(u64, decrementers_next_tb);
>  static DEFINE_PER_CPU(struct clock_event_device, decrementers);
> +static DEFINE_PER_CPU(struct clock_event_device, bc_timer);

Do we really need an additional per CPU here ? What does the bc_timer
actually represent ? The sender (broadcaster) or receiver ? In the
latter case, why does it have to differ from the decrementer ?

> +int bc_cpu;

A global with that name ? Exported ? That's gross...

>  #define XSEC_PER_SEC (1024*1024)
>  
>  #ifdef CONFIG_PPC64
> @@ -487,6 +504,8 @@ void timer_interrupt(struct pt_regs * regs)
>  	struct pt_regs *old_regs;
>  	u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
>  	struct clock_event_device *evt = &__get_cpu_var(decrementers);
> +	struct clock_event_device *bc_evt = &__get_cpu_var(bc_timer);
> +	int cpu = smp_processor_id();
>  	u64 now;
>  
>  	/* Ensure a positive value is written to the decrementer, or else
> @@ -532,6 +551,10 @@ void timer_interrupt(struct pt_regs * regs)
>  		*next_tb = ~(u64)0;
>  		if (evt->event_handler)
>  			evt->event_handler(evt);
> +		if (cpu == bc_cpu && bc_evt->event_handler) {
> +			bc_evt->event_handler(bc_evt);
> +		}
> +

So here, on a CPU that happens to be "bc_cpu", we call an additional
"event_handler" on every timer interrupt ? What does that handler
actually do ? How does it relate to the "broadcast" field in the
original clock source ?

>  	} else {
>  		now = *next_tb - now;
>  		if (now <= DECREMENTER_MAX)
> @@ -806,6 +829,20 @@ static int decrementer_set_next_event(unsigned long evt,
>  	return 0;
>  }
>  
> +/*
> + * We cannot program the decrementer of a remote CPU. Hence CPUs going into
> + * deep idle states need to send IPIs to the broadcast CPU to program its
> + * decrementer for their next local event so as to receive a broadcast IPI
> + * for the same. In order to avoid the overhead of multiple CPUs from sending
> + * IPIs, this function is a nop. Instead the broadcast CPU will handle the
> + * wakeup of CPUs in deep idle states in each of its local timer interrupts.
> + */
> +static int broadcast_set_next_event(unsigned long evt,
> +					struct clock_event_device *dev)
> +{
> +	return 0;
> +}
> +
>  static void decrementer_set_mode(enum clock_event_mode mode,
>  				 struct clock_event_device *dev)
>  {
> @@ -813,6 +850,20 @@ static void decrementer_set_mode(enum clock_event_mode mode,
>  		decrementer_set_next_event(DECREMENTER_MAX, dev);
>  }
>  
> +void decrementer_timer_interrupt(void)
> +{
> +	struct clock_event_device *evt;
> +	evt = &per_cpu(decrementers, smp_processor_id());
> +
> +	if (evt->event_handler)
> +		evt->event_handler(evt);
> +}

So that's what happens when we receive the broadcast... it might need
some stats ... and it's using the normal "decrementer" clock source,
so I still have a problem understanding why you need another one.

> +static void decrementer_timer_broadcast(const struct cpumask *mask)
> +{
> +	arch_send_tick_broadcast(mask);
> +}

Ok, so far so good. But that's also hooked into the normal clock
source...

>  static void register_decrementer_clockevent(int cpu)
>  {
>  	struct clock_event_device *dec = &per_cpu(decrementers, cpu);
> @@ -826,6 +877,20 @@ static void register_decrementer_clockevent(int cpu)
>  	clockevents_register_device(dec);
>  }
>  
> +static void register_broadcast_clockevent(int cpu)
> +{
> +	struct clock_event_device *bc_evt = &per_cpu(bc_timer, cpu);
> +
> +	*bc_evt = broadcast_clockevent;
> +	bc_evt->cpumask = cpumask_of(cpu);
> +
> +	printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
> +		    bc_evt->name, bc_evt->mult, bc_evt->shift, cpu);
> +
> +	clockevents_register_device(bc_evt);
> +	bc_cpu = cpu;
> +}
> +
>  static void __init init_decrementer_clockevent(void)
>  {
>  	int cpu = smp_processor_id();
> @@ -840,6 +905,19 @@ static void __init init_decrementer_clockevent(void)
>  	register_decrementer_clockevent(cpu);
>  }
>  
> +static void __init init_broadcast_clockevent(void)
> +{
> +	int cpu = smp_processor_id();
> +
> +	clockevents_calc_mult_shift(&broadcast_clockevent, ppc_tb_freq, 4);
> +
> +	broadcast_clockevent.max_delta_ns =
> +		clockevent_delta2ns(DECREMENTER_MAX, &broadcast_clockevent);
> +	broadcast_clockevent.min_delta_ns =
> +		clockevent_delta2ns(2, &broadcast_clockevent);
> +	register_broadcast_clockevent(cpu);
> +}
> +
>  void secondary_cpu_time_init(void)
>  {
>  	/* Start the decrementer on CPUs that have manual control
> @@ -916,6 +994,7 @@ void __init time_init(void)
>  	clocksource_init();
>  
>  	init_decrementer_clockevent();
> +	init_broadcast_clockevent();
>  }
>  
> 
> diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
> index ace2d22..e1a96eb 100644
> --- a/arch/powerpc/platforms/powernv/Kconfig
> +++ b/arch/powerpc/platforms/powernv/Kconfig
> @@ -6,6 +6,7 @@ config PPC_POWERNV
>  	select PPC_ICP_NATIVE
>  	select PPC_P7_NAP
>  	select PPC_PCI_CHOICE if EMBEDDED
> +	select GENERIC_CLOCKEVENTS_BROADCAST
>  	select EPAPR_BOOT
>  	default y
>  
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

next prev parent reply	other threads:[~2013-08-22  3:28 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-08-14 11:55 [RFC V2 PATCH 0/6] cpuidle/ppc: Timer offload framework to support deep idle states Preeti U Murthy
2013-08-14 11:55 ` [RFC V2 PATCH 1/6] powerpc: Free up the IPI message slot of ipi call function (PPC_MSG_CALL_FUNC) Preeti U Murthy
2013-08-14 11:56 ` [RFC V2 PATCH 2/6] powerpc: Implement broadcast timer interrupt as an IPI message Preeti U Murthy
2013-08-22  3:10   ` Benjamin Herrenschmidt
2013-08-22  5:49     ` Preeti U Murthy
2013-08-14 11:56 ` [RFC V2 PATCH 3/6] cpuidle/ppc: Add timer offload framework to support deep idle states Preeti U Murthy
2013-08-22  3:27   ` Benjamin Herrenschmidt [this message]
2013-08-22  9:03     ` Preeti U Murthy
2013-08-14 11:56 ` [RFC V2 PATCH 4/6] cpuidle/ppc: Add longnap state to the idle states on powernv Preeti U Murthy
2013-08-22  3:28   ` Benjamin Herrenschmidt
2013-08-22  9:09     ` Preeti U Murthy
2013-08-14 11:56 ` [RFC V2 PATCH 5/6] cpuidle/ppc: Enable dynamic movement of the broadcast functionality across CPUs Preeti U Murthy
2013-08-14 11:56 ` [RFC V2 PATCH 6/6] cpuidle/ppc : Queue a hrtimer on bc_cpu, explicitly to do broadcast handling Preeti U Murthy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1377142037.25016.272.camel@pasglop \
    --to=benh@kernel.crashing.org \
    --cc=arnd@arndb.de \
    --cc=chenhui.zhao@freescale.com \
    --cc=deepthi@linux.vnet.ibm.com \
    --cc=fweisbec@gmail.com \
    --cc=geoff@infradead.org \
    --cc=john.stultz@linaro.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pm@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=paul.gortmaker@windriver.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=paulus@samba.org \
    --cc=preeti@linux.vnet.ibm.com \
    --cc=r58472@freescale.com \
    --cc=rjw@sisk.pl \
    --cc=rostedt@goodmis.org \
    --cc=schwidefsky@de.ibm.com \
    --cc=shangw@linux.vnet.ibm.com \
    --cc=srivatsa.bhat@linux.vnet.ibm.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).