linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 2/4] tickless idle cpu: Skip ticks when CPU is idle
@ 2006-04-07  6:31 Srivatsa Vaddagiri
  2006-04-07 14:16 ` Kumar Gala
  2006-04-07 23:40 ` Paul Mackerras
  0 siblings, 2 replies; 4+ messages in thread
From: Srivatsa Vaddagiri @ 2006-04-07  6:31 UTC (permalink / raw)
  To: anton, benh, paulus; +Cc: linuxppc-dev, sri_vatsa_v

This is the core patch which skips ticks when a CPU is idle.
Should work on pSeries, pmac and maple machines.

The patch is against 2.6.17-rc1-mm1 and has been tested on a 16-way (with SMT) 
Power5 box (p570).

Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>

---

 linux-2.6.17-rc1-root/arch/powerpc/Kconfig                   |    6 
 linux-2.6.17-rc1-root/arch/powerpc/kernel/idle_power4.S      |    3 
 linux-2.6.17-rc1-root/arch/powerpc/kernel/irq.c              |    3 
 linux-2.6.17-rc1-root/arch/powerpc/kernel/time.c             |  150 ++++++++---
 linux-2.6.17-rc1-root/arch/powerpc/kernel/traps.c            |    1 
 linux-2.6.17-rc1-root/arch/powerpc/platforms/pseries/setup.c |    6 
 linux-2.6.17-rc1-root/include/asm-powerpc/time.h             |    8 
 7 files changed, 147 insertions(+), 30 deletions(-)

diff -puN arch/powerpc/kernel/time.c~no_idle_hz arch/powerpc/kernel/time.c
--- linux-2.6.17-rc1/arch/powerpc/kernel/time.c~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/kernel/time.c	2006-04-07 11:29:13.000000000 +0530
@@ -633,40 +633,12 @@ static void iSeries_tb_recal(void)
 }
 #endif
 
-/*
- * For iSeries shared processors, we have to let the hypervisor
- * set the hardware decrementer.  We set a virtual decrementer
- * in the lppaca and call the hypervisor if the virtual
- * decrementer is less than the current value in the hardware
- * decrementer. (almost always the new decrementer value will
- * be greater than the current hardware decementer so the hypervisor
- * call will not be needed)
- */
-
-/*
- * timer_interrupt - gets called when the decrementer overflows,
- * with interrupts disabled.
- */
-void timer_interrupt(struct pt_regs * regs)
+static void account_ticks(struct pt_regs *regs)
 {
 	int next_dec;
 	int cpu = smp_processor_id();
 	unsigned long ticks;
 
-#ifdef CONFIG_PPC32
-	if (atomic_read(&ppc_n_lost_interrupts) != 0)
-		do_IRQ(regs);
-#endif
-
-	irq_enter();
-
-	profile_tick(CPU_PROFILING, regs);
-	calculate_steal_time();
-
-#ifdef CONFIG_PPC_ISERIES
-	get_lppaca()->int_dword.fields.decr_int = 0;
-#endif
-
 	while ((ticks = tb_ticks_since(per_cpu(last_jiffy, cpu)))
 	       >= tb_ticks_per_jiffy) {
 		/* Update last_jiffy */
@@ -701,6 +673,123 @@ void timer_interrupt(struct pt_regs * re
 	
 	next_dec = tb_ticks_per_jiffy - ticks;
 	set_dec(next_dec);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* Returns 1 if this CPU was set in the mask */
+static inline int clear_hzless_mask(void)
+{
+	unsigned long cpu = smp_processor_id();
+	int rc = 0;
+
+	if (unlikely(cpu_isset(cpu, nohz_cpu_mask))) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		rc = 1;
+	}
+
+	return rc;
+}
+
+#define MAX_DEC_COUNT	UINT_MAX	/* Decrementer is 32-bit */
+static int min_skip = 2;		/* Minimum number of ticks to skip */
+static int max_skip;			/* Maximum number of ticks to skip */
+
+
+int sysctl_hz_timer = 1;
+
+/* Defer timer interrupt for as long as possible. This is accomplished by
+ * programming the decrementer to a suitable value such that it raises the
+ * exception after desired interval. This feature allows CPUs to
+ * be used more efficiently in virtualized environments.
+ *
+ * Called with interrupts disabled on an idle CPU. Caller has to ensure that
+ * idle loop is not exited w/o start_hz_timer being called via an interrupt
+ * to restore timer interrupt frequency.
+ */
+
+void stop_hz_timer(void)
+{
+	unsigned long cpu = smp_processor_id(), seq, delta;
+	int next_dec;
+
+	if (sysctl_hz_timer != 0)
+		return;
+
+	cpu_set(cpu, nohz_cpu_mask);
+	mb();
+	if (rcu_pending(cpu) || local_softirq_pending()) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		return;
+	}
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		delta = next_timer_interrupt() - jiffies;
+
+		if (delta < min_skip) {
+			cpu_clear(cpu, nohz_cpu_mask);
+			return;
+		}
+
+		if (delta > max_skip)
+			delta = max_skip;
+
+		next_dec = tb_last_stamp + delta * tb_ticks_per_jiffy;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	next_dec -= get_tb();
+	set_dec(next_dec);
+
+	return;
+}
+
+/* Take into account skipped ticks and restore the HZ timer frequency */
+void start_hz_timer(struct pt_regs *regs)
+{
+	if (clear_hzless_mask())
+		account_ticks(regs);
+}
+
+#else
+static inline int clear_hzless_mask(void) { return 0;}
+#endif
+
+/*
+ * For iSeries shared processors, we have to let the hypervisor
+ * set the hardware decrementer.  We set a virtual decrementer
+ * in the lppaca and call the hypervisor if the virtual
+ * decrementer is less than the current value in the hardware
+ * decrementer. (almost always the new decrementer value will
+ * be greater than the current hardware decrementer so the hypervisor
+ * call will not be needed)
+ */
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+void timer_interrupt(struct pt_regs * regs)
+{
+#ifdef CONFIG_PPC32
+	if (atomic_read(&ppc_n_lost_interrupts) != 0)
+		do_IRQ(regs);
+#endif
+
+	irq_enter();
+
+	clear_hzless_mask();
+
+	profile_tick(CPU_PROFILING, regs);
+	calculate_steal_time();
+
+#ifdef CONFIG_PPC_ISERIES
+	get_lppaca()->int_dword.fields.decr_int = 0;
+#endif
+
+	account_ticks(regs);
 
 #ifdef CONFIG_PPC_ISERIES
 	if (hvlpevent_is_pending())
@@ -955,6 +1044,9 @@ void __init time_init(void)
 	tb_ticks_per_usec = ppc_tb_freq / 1000000;
 	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
 	calc_cputime_factors();
+#ifdef CONFIG_NO_IDLE_HZ
+	max_skip = __USE_RTC() ? HZ : MAX_DEC_COUNT / tb_ticks_per_jiffy;
+#endif
 
 	/*
 	 * Calculate the length of each tick in ns.  It will not be
diff -puN arch/powerpc/kernel/irq.c~no_idle_hz arch/powerpc/kernel/irq.c
--- linux-2.6.17-rc1/arch/powerpc/kernel/irq.c~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/kernel/irq.c	2006-04-07 04:14:57.000000000 +0530
@@ -60,6 +60,7 @@
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/paca.h>
 #endif
+#include <asm/time.h>
 
 int __irq_offset_value;
 #ifdef CONFIG_PPC32
@@ -189,6 +190,8 @@ void do_IRQ(struct pt_regs *regs)
 
         irq_enter();
 
+	start_hz_timer(regs);
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 2KB free? */
 	{
diff -puN include/asm-powerpc/time.h~no_idle_hz include/asm-powerpc/time.h
--- linux-2.6.17-rc1/include/asm-powerpc/time.h~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/include/asm-powerpc/time.h	2006-04-07 04:14:58.000000000 +0530
@@ -198,6 +198,14 @@ static inline unsigned long tb_ticks_sin
 	return get_tbl() - tstamp;
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+extern void stop_hz_timer(void);
+extern void start_hz_timer(struct pt_regs *);
+#else
+static inline void stop_hz_timer(void) { }
+static inline void start_hz_timer(struct pt_regs *regs) { }
+#endif
+
 #define mulhwu(x,y) \
 ({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;})
 
diff -puN arch/powerpc/Kconfig~no_idle_hz arch/powerpc/Kconfig
--- linux-2.6.17-rc1/arch/powerpc/Kconfig~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/Kconfig	2006-04-07 04:14:58.000000000 +0530
@@ -593,6 +593,12 @@ config HOTPLUG_CPU
 
 	  Say N if you are unsure.
 
+config NO_IDLE_HZ
+	depends on EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_MAPLE)
+	bool "Switch off timer ticks on idle CPUs"
+	help
+	  Switches the HZ timer interrupts off when a CPU is idle.
+
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
 	depends on PPC_MULTIPLATFORM && EXPERIMENTAL
diff -puN arch/powerpc/kernel/traps.c~no_idle_hz arch/powerpc/kernel/traps.c
--- linux-2.6.17-rc1/arch/powerpc/kernel/traps.c~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/kernel/traps.c	2006-04-07 04:14:58.000000000 +0530
@@ -875,6 +875,7 @@ void altivec_unavailable_exception(struc
 
 void performance_monitor_exception(struct pt_regs *regs)
 {
+	start_hz_timer(regs);
 	perf_irq(regs);
 }
 
diff -puN arch/powerpc/platforms/pseries/setup.c~no_idle_hz arch/powerpc/platforms/pseries/setup.c
--- linux-2.6.17-rc1/arch/powerpc/platforms/pseries/setup.c~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/platforms/pseries/setup.c	2006-04-07 04:15:50.000000000 +0530
@@ -463,8 +463,10 @@ static void pseries_dedicated_idle_sleep
 	 * very low priority.  The cede enables interrupts, which
 	 * doesn't matter here.
 	 */
-	if (!lppaca[cpu ^ 1].idle || poll_pending() == H_PENDING)
+	if (!lppaca[cpu ^ 1].idle || poll_pending() == H_PENDING) {
+		stop_hz_timer();
 		cede_processor();
+	}
 
 out:
 	HMT_medium();
@@ -479,6 +481,8 @@ static void pseries_shared_idle_sleep(vo
 	 */
 	get_lppaca()->idle = 1;
 
+	stop_hz_timer();
+
 	/*
 	 * Yield the processor to the hypervisor.  We return if
 	 * an external interrupt occurs (which are driven prior
diff -puN arch/powerpc/kernel/idle_power4.S~no_idle_hz arch/powerpc/kernel/idle_power4.S
--- linux-2.6.17-rc1/arch/powerpc/kernel/idle_power4.S~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/kernel/idle_power4.S	2006-04-07 04:14:58.000000000 +0530
@@ -30,6 +30,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
 	cmpwi	0,r4,0
 	beqlr
 
+	mflr	r4
+	bl	.stop_hz_timer
+	mtlr	r4
 	/* Go to NAP now */
 BEGIN_FTR_SECTION
 	DSSALL

_


-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/4] tickless idle cpu: Skip ticks when CPU is idle
  2006-04-07  6:31 [PATCH 2/4] tickless idle cpu: Skip ticks when CPU is idle Srivatsa Vaddagiri
@ 2006-04-07 14:16 ` Kumar Gala
  2006-04-10 12:23   ` Srivatsa Vaddagiri
  2006-04-07 23:40 ` Paul Mackerras
  1 sibling, 1 reply; 4+ messages in thread
From: Kumar Gala @ 2006-04-07 14:16 UTC (permalink / raw)
  To: vatsa; +Cc: sri_vatsa_v, paulus, linuxppc-dev


On Apr 7, 2006, at 1:31 AM, Srivatsa Vaddagiri wrote:

> This is the core patch which skips ticks when a CPU is idle.
> Should work on pSeries, pmac and maple machines.
>
> The patch is against 2.6.17-rc1-mm1 and has been tested on a 16-way  
> (with SMT)
> Power5 box (p570).
>
> Signed-off-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
>
> ---
>
>  linux-2.6.17-rc1-root/arch/powerpc/Kconfig                   |    6
>  linux-2.6.17-rc1-root/arch/powerpc/kernel/idle_power4.S      |    3
>  linux-2.6.17-rc1-root/arch/powerpc/kernel/irq.c              |    3
>  linux-2.6.17-rc1-root/arch/powerpc/kernel/time.c             |   
> 150 ++++++++---
>  linux-2.6.17-rc1-root/arch/powerpc/kernel/traps.c            |    1
>  linux-2.6.17-rc1-root/arch/powerpc/platforms/pseries/setup.c |    6
>  linux-2.6.17-rc1-root/include/asm-powerpc/time.h             |    8
>  7 files changed, 147 insertions(+), 30 deletions(-)
>

[snip]

> diff -puN arch/powerpc/Kconfig~no_idle_hz arch/powerpc/Kconfig
> --- linux-2.6.17-rc1/arch/powerpc/Kconfig~no_idle_hz	2006-04-07  
> 04:14:39.000000000 +0530
> +++ linux-2.6.17-rc1-root/arch/powerpc/Kconfig	2006-04-07  
> 04:14:58.000000000 +0530
> @@ -593,6 +593,12 @@ config HOTPLUG_CPU
>
>  	  Say N if you are unsure.
>
> +config NO_IDLE_HZ
> +	depends on EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_MAPLE)
> +	bool "Switch off timer ticks on idle CPUs"
> +	help
> +	  Switches the HZ timer interrupts off when a CPU is idle.
> +

any reason not to provide this for all 6xx class processors?

>  config KEXEC
>  	bool "kexec system call (EXPERIMENTAL)"
>  	depends on PPC_MULTIPLATFORM && EXPERIMENTAL

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/4] tickless idle cpu: Skip ticks when CPU is idle
  2006-04-07  6:31 [PATCH 2/4] tickless idle cpu: Skip ticks when CPU is idle Srivatsa Vaddagiri
  2006-04-07 14:16 ` Kumar Gala
@ 2006-04-07 23:40 ` Paul Mackerras
  1 sibling, 0 replies; 4+ messages in thread
From: Paul Mackerras @ 2006-04-07 23:40 UTC (permalink / raw)
  To: vatsa; +Cc: sri_vatsa_v, linuxppc-dev

Srivatsa Vaddagiri writes:

> diff -puN arch/powerpc/kernel/idle_power4.S~no_idle_hz arch/powerpc/kernel/idle_power4.S
> --- linux-2.6.17-rc1/arch/powerpc/kernel/idle_power4.S~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
> +++ linux-2.6.17-rc1-root/arch/powerpc/kernel/idle_power4.S	2006-04-07 04:14:58.000000000 +0530
> @@ -30,6 +30,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
>  	cmpwi	0,r4,0
>  	beqlr
>  
> +	mflr	r4
> +	bl	.stop_hz_timer
> +	mtlr	r4

This won't work - r4 is volatile across function calls, that is,
stop_hz_timer() could change r4 and is not required to save and
restore it.

Paul.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/4] tickless idle cpu: Skip ticks when CPU is idle
  2006-04-07 14:16 ` Kumar Gala
@ 2006-04-10 12:23   ` Srivatsa Vaddagiri
  0 siblings, 0 replies; 4+ messages in thread
From: Srivatsa Vaddagiri @ 2006-04-10 12:23 UTC (permalink / raw)
  To: Kumar Gala; +Cc: sri_vatsa_v, paulus, linuxppc-dev

On Fri, Apr 07, 2006 at 09:16:58AM -0500, Kumar Gala wrote:
> >+config NO_IDLE_HZ
> >+	depends on EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_MAPLE)
> >+	bool "Switch off timer ticks on idle CPUs"
> >+	help
> >+	  Switches the HZ timer interrupts off when a CPU is idle.
> >+
> 
> any reason not to provide this for all 6xx class processors?

I think mostly the same patch would work for 6xx CPUs as well. However, I
don't think I have any hardware to test it on. If I am not mistaken, to
support 6xx CPUs, only ppc6xx_idle needs to be modified to call stop_hz_timer
before going into power-save mode?


-- 
Regards,
vatsa

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2006-04-10 13:44 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-04-07  6:31 [PATCH 2/4] tickless idle cpu: Skip ticks when CPU is idle Srivatsa Vaddagiri
2006-04-07 14:16 ` Kumar Gala
2006-04-10 12:23   ` Srivatsa Vaddagiri
2006-04-07 23:40 ` Paul Mackerras

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).