All of lore.kernel.org
 help / color / mirror / Atom feed
From: Srivatsa Vaddagiri <vatsa@in.ibm.com>
To: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: schwidefsky@de.ibm.com, Ingo Molnar <mingo@elte.hu>,
	linux-kernel@vger.kernel.org
Subject: Re: [RFC] (How to) Let idle CPUs sleep
Date: Thu, 12 May 2005 14:08:19 +0530	[thread overview]
Message-ID: <20050512083819.GC17644@in.ibm.com> (raw)
In-Reply-To: <427F02CE.7080108@yahoo.com.au>

On Mon, May 09, 2005 at 04:27:26PM +1000, Nick Piggin wrote:
> I could probably find some time to do my implementation if you have
> a complete working patch for eg. UML.

Well, it turns out that if we restrict the amount of time idle cpus are
allowed to sleep, then there is very little change required in the scheduler.
Most of the calculation of exponential sleep times can be done outside
it (in the idle CPU's code).

First, the scheduler support to zero the cpu_load[] counters before an idle
cpu sleeps.

---

 linux-2.6.12-rc3-mm3-vatsa/include/linux/sched.h |    1 
 linux-2.6.12-rc3-mm3-vatsa/kernel/sched.c        |   33 +++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff -puN kernel/sched.c~sched-nohz kernel/sched.c
--- linux-2.6.12-rc3-mm3/kernel/sched.c~sched-nohz	2005-05-11 17:05:13.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/kernel/sched.c	2005-05-11 17:06:38.000000000 +0530
@@ -2323,6 +2323,39 @@ static void rebalance_tick(int this_cpu,
 		}
 	}
 }
+
+#ifdef CONFIG_NO_IDLE_HZ
+/*
+ * Try hard to pull tasks. Called by the idle task before it sleeps and cuts
+ * off its local timer ticks.  With local interrupts disabled, this zeroes
+ * cpu_load[0..2] of this runqueue and runs rebalance_tick() as SCHED_IDLE.
+ *
+ * Returns 1 and flags current for resched if nr_running is non-zero, else 0.
+ */
+int idle_balance_retry(void)
+{
+	int j, moved = 0, this_cpu = smp_processor_id();
+	runqueue_t *this_rq = this_rq();
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	for (j = 0; j < 3; j++)
+		this_rq->cpu_load[j] = 0;
+
+	rebalance_tick(this_cpu, this_rq, SCHED_IDLE);
+
+	if (this_rq->nr_running) {
+		moved = 1;
+		set_tsk_need_resched(current);
+	}
+
+	local_irq_restore(flags);
+
+	return moved;
+}
+#endif
+
 #else
 /*
  * on UP we do not need to balance between CPUs:
diff -puN include/linux/sched.h~sched-nohz include/linux/sched.h
--- linux-2.6.12-rc3-mm3/include/linux/sched.h~sched-nohz	2005-05-11 17:05:13.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/include/linux/sched.h	2005-05-11 17:13:19.000000000 +0530
@@ -897,6 +897,7 @@ extern int task_curr(const task_t *p);
 extern int idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
 extern task_t *idle_task(int cpu);
+extern int idle_balance_retry(void);
 
 void yield(void);
 

_


A sample patch that implements exponential sleep time is below. Note that this
patch only makes the idle cpu pretend to be asleep (instead of really cutting
off timer ticks). I used this merely to test the scheduler change.

Martin,
	You probably need something like this for S390 arch!



---

 linux-2.6.12-rc3-mm3-vatsa/arch/i386/Kconfig          |    4 +
 linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/apic.c    |   16 ++++--
 linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/irq.c     |    4 +
 linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/process.c |   47 ++++++++++++++++--
 linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/smp.c     |    6 ++
 5 files changed, 69 insertions(+), 8 deletions(-)

diff -puN arch/i386/Kconfig~vst-sim arch/i386/Kconfig
--- linux-2.6.12-rc3-mm3/arch/i386/Kconfig~vst-sim	2005-05-10 15:53:33.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/Kconfig	2005-05-10 15:54:22.000000000 +0530
@@ -443,6 +443,10 @@ config X86_OOSTORE
 	depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
 	default y
 
+config NO_IDLE_HZ
+        bool "Tickless Idle CPUs support"
+        default n
+
 config HPET_TIMER
 	bool "HPET Timer Support"
 	help
diff -puN arch/i386/kernel/process.c~vst-sim arch/i386/kernel/process.c
--- linux-2.6.12-rc3-mm3/arch/i386/kernel/process.c~vst-sim	2005-05-10 15:53:34.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/process.c	2005-05-12 14:06:16.000000000 +0530
@@ -94,6 +94,12 @@ void enable_hlt(void)
 
 EXPORT_SYMBOL(enable_hlt);
 
+DEFINE_PER_CPU(int, idle_asleep);
+DEFINE_PER_CPU(unsigned long, sleep_duration);
+
+#define MAX_SLEEP_DURATION 	128	/* in tick counts */
+#define MIN_SLEEP_DURATION	8	/* in tick counts */
+
 /*
  * We use this if we don't have any better
  * idle routine..
@@ -102,8 +108,36 @@ void default_idle(void)
 {
 	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
 		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
+		if (!need_resched()) {
+			unsigned long jif_next, jif_delta;
+
+			jif_next = next_timer_interrupt();
+			jif_delta = jif_next - jiffies;
+
+			if (jif_delta > MIN_SLEEP_DURATION) {
+				unsigned long slpint;
+
+				if (idle_balance_retry()) {
+					local_irq_enable();
+					return;
+				}
+
+				slpint = min(__get_cpu_var(sleep_duration),
+					     jif_delta);
+
+				jif_next = jiffies + slpint;
+				/* Hack to discard local timer ticks */
+				__get_cpu_var(idle_asleep) = 1;
+				cpu_set(smp_processor_id(), nohz_cpu_mask);
+				local_irq_enable();
+				while ((jiffies < jif_next-1) &&
+					 __get_cpu_var(idle_asleep))
+					cpu_relax();
+				__get_cpu_var(idle_asleep) = 0;
+				cpu_clear(smp_processor_id(), nohz_cpu_mask);
+			} else
+				safe_halt();
+		}
 		else
 			local_irq_enable();
 	} else {
@@ -178,6 +212,8 @@ void cpu_idle(void)
 {
 	int cpu = _smp_processor_id();
 
+	__get_cpu_var(sleep_duration) = MIN_SLEEP_DURATION;
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched()) {
@@ -189,7 +225,7 @@ void cpu_idle(void)
 			rmb();
 			idle = pm_idle;
 
-			if (!idle)
+			//if (!idle)
 				idle = default_idle;
 
 			if (cpu_is_offline(cpu))
@@ -197,7 +233,12 @@ void cpu_idle(void)
 
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
 			idle();
+
+			if (__get_cpu_var(sleep_duration) < MAX_SLEEP_DURATION)
+				__get_cpu_var(sleep_duration) *= 2;
+
 		}
+		__get_cpu_var(sleep_duration) = MIN_SLEEP_DURATION;
 		schedule();
 	}
 }
diff -puN arch/i386/kernel/irq.c~vst-sim arch/i386/kernel/irq.c
--- linux-2.6.12-rc3-mm3/arch/i386/kernel/irq.c~vst-sim	2005-05-10 15:53:34.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/irq.c	2005-05-10 15:53:47.000000000 +0530
@@ -46,6 +46,8 @@ static union irq_ctx *hardirq_ctx[NR_CPU
 static union irq_ctx *softirq_ctx[NR_CPUS];
 #endif
 
+DECLARE_PER_CPU(int, idle_asleep);
+
 /*
  * do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
@@ -60,6 +62,8 @@ fastcall unsigned int do_IRQ(struct pt_r
 	u32 *isp;
 #endif
 
+	__get_cpu_var(idle_asleep) = 0;
+
 	irq_enter();
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 1KB free? */
diff -puN arch/i386/kernel/smp.c~vst-sim arch/i386/kernel/smp.c
--- linux-2.6.12-rc3-mm3/arch/i386/kernel/smp.c~vst-sim	2005-05-11 16:59:38.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/smp.c	2005-05-11 16:59:58.000000000 +0530
@@ -309,6 +309,8 @@ static inline void leave_mm (unsigned lo
  * 2) Leave the mm if we are in the lazy tlb mode.
  */
 
+DECLARE_PER_CPU(int, idle_asleep);
+
 fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
 {
 	unsigned long cpu;
@@ -336,6 +338,7 @@ fastcall void smp_invalidate_interrupt(s
 			leave_mm(cpu);
 	}
 	ack_APIC_irq();
+	__get_cpu_var(idle_asleep) = 0;
 	smp_mb__before_clear_bit();
 	cpu_clear(cpu, flush_cpumask);
 	smp_mb__after_clear_bit();
@@ -598,6 +601,8 @@ void smp_send_stop(void)
 fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
+
+	__get_cpu_var(idle_asleep) = 0;
 }
 
 fastcall void smp_call_function_interrupt(struct pt_regs *regs)
@@ -607,6 +612,7 @@ fastcall void smp_call_function_interrup
 	int wait = call_data->wait;
 
 	ack_APIC_irq();
+	__get_cpu_var(idle_asleep) = 0;
 	/*
 	 * Notify initiating CPU that I've grabbed the data and am
 	 * about to execute the function
diff -puN arch/i386/kernel/apic.c~vst-sim arch/i386/kernel/apic.c
--- linux-2.6.12-rc3-mm3/arch/i386/kernel/apic.c~vst-sim	2005-05-10 15:53:36.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/apic.c	2005-05-10 15:53:47.000000000 +0530
@@ -1171,6 +1171,8 @@ inline void smp_local_timer_interrupt(st
 	 */
 }
 
+DECLARE_PER_CPU(int, idle_asleep);
+
 /*
  * Local APIC timer interrupt. This is the most natural way for doing
  * local interrupts, but local timer interrupts can be emulated by
@@ -1185,15 +1187,19 @@ fastcall void smp_apic_timer_interrupt(s
 	int cpu = smp_processor_id();
 
 	/*
-	 * the NMI deadlock-detector uses this.
-	 */
-	per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
-	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
 	 */
 	ack_APIC_irq();
+
+	if (__get_cpu_var(idle_asleep))
+		return;
+
+	/*
+	 * the NMI deadlock-detector uses this.
+	 */
+	per_cpu(irq_stat, cpu).apic_timer_irqs++;
+
 	/*
 	 * update_process_times() expects us to have done irq_enter().
 	 * Besides, if we don't timer interrupts ignore the global

_
-- 


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

  reply	other threads:[~2005-05-12  8:39 UTC|newest]

Thread overview: 48+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-05-07 18:27 [uml-devel] [RFC] (How to) Let idle CPUs sleep Srivatsa Vaddagiri
2005-05-07 18:27 ` Srivatsa Vaddagiri
2005-05-08  3:50 ` [uml-devel] " Rusty Russell
2005-05-08  3:50   ` Rusty Russell
2005-05-08  4:14   ` [uml-devel] " Nick Piggin
2005-05-08  4:14     ` Nick Piggin
2005-05-08 12:19     ` [uml-devel] " Srivatsa Vaddagiri
2005-05-08 12:19       ` Srivatsa Vaddagiri
2005-05-09  6:27       ` [uml-devel] " Nick Piggin
2005-05-09  6:27         ` Nick Piggin
2005-05-12  8:38         ` Srivatsa Vaddagiri [this message]
2005-05-11 18:03     ` [uml-devel] " Tony Lindgren
2005-05-11 18:03       ` Tony Lindgren
2005-05-12  8:46       ` Srivatsa Vaddagiri
2005-05-12 16:01         ` Lee Revell
2005-05-12 16:16           ` Tony Lindgren
2005-05-12 16:28             ` Jesse Barnes
2005-05-12 17:12               ` Srivatsa Vaddagiri
2005-05-12 17:59                 ` Jesse Barnes
2005-05-12 18:16                   ` Tony Lindgren
2005-05-13  6:27                   ` Srivatsa Vaddagiri
2005-05-12 18:08                 ` Martin Schwidefsky
2005-05-12 18:21                   ` Tony Lindgren
2005-05-13  6:23                   ` Srivatsa Vaddagiri
2005-05-13  7:16                     ` Nick Piggin
2005-05-13  8:04                       ` Ingo Molnar
2005-05-13  8:27                         ` Nick Piggin
2005-05-13  9:19                           ` Srivatsa Vaddagiri
2005-05-13  9:33                             ` Nick Piggin
2005-05-12 21:16                 ` George Anzinger
2005-05-12 21:35                   ` Jesse Barnes
2005-05-12 22:15                     ` George Anzinger
2005-05-13  0:43                       ` Jesse Barnes
2005-05-13  6:31                         ` Srivatsa Vaddagiri
2005-06-30 12:47     ` Srivatsa Vaddagiri
2005-07-06 17:31       ` Srivatsa Vaddagiri
2005-05-08 10:13   ` [uml-devel] " Arjan van de Ven
2005-05-08 10:13     ` Arjan van de Ven
2005-05-08 13:33     ` [uml-devel] " Andi Kleen
2005-05-08 13:33       ` Andi Kleen
2005-05-08 13:44       ` [uml-devel] " Arjan van de Ven
2005-05-08 13:44         ` Arjan van de Ven
2005-05-08 14:53         ` [uml-devel] " Andi Kleen
2005-05-08 14:53           ` Andi Kleen
2005-05-08 13:31 ` [uml-devel] " Andi Kleen
2005-05-08 13:31   ` Andi Kleen
2005-05-08 15:26   ` [uml-devel] " Srivatsa Vaddagiri
2005-05-08 15:26     ` Srivatsa Vaddagiri

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20050512083819.GC17644@in.ibm.com \
    --to=vatsa@in.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=nickpiggin@yahoo.com.au \
    --cc=schwidefsky@de.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.