[patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop
@ 2009-05-14 11:21 Thomas Gleixner
  2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 11:21 UTC (permalink / raw)
  To: LKML; +Cc: Andrew Morton, Dimitri Sivanich, Peter Zijlstra, Ingo Molnar

Dimitri Sivanich pointed out that calc_load iterates over all online
CPUs under xtime lock which can cause long latencies on large SMP
systems.

The following patch series removes the iteration loop and lets
scheduler_tick() on each CPU update the number of active tasks. The
calc_load() function just uses this variable and updates avenrun from
do_timer().

Dimitri, can you please test run this on one of your large machines ?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [patch 1/2] sched, timers: move calc_load() to scheduler
  2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
@ 2009-05-14 11:21 ` Thomas Gleixner
  2009-05-14 20:00   ` Peter Zijlstra
  2009-05-14 11:21 ` [patch 2/2] sched, timers: cleanup avenrun users Thomas Gleixner
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 11:21 UTC (permalink / raw)
  To: LKML; +Cc: Andrew Morton, Dimitri Sivanich, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: move-calc-load-to-scheduler-v1.patch --]
[-- Type: text/plain, Size: 8866 bytes --]

Dimitri Sivanich noticed that xtime_lock is held write locked across
calc_load() which iterates over all online CPUs. That can cause long
latencies for xtime_lock readers on large SMP systems. 

The load average calculation is a rough estimate anyway so there is
no real need to protect the readers vs. the update. It's not a problem
when the avenrun array is updated while a reader copies the values.

Instead of iterating over all online CPUs let the scheduler_tick code
update the number of active tasks shortly before the avenrun update
happens. The avenrun update itself is handled by the CPU which calls
do_timer().

[ Impact: reduce xtime_lock write locked section ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h     |    2 -
 kernel/sched.c            |   84 ++++++++++++++++++++++++++++++++++++++++------
 kernel/sched_idletask.c   |    3 +
 kernel/time/timekeeping.c |    2 -
 kernel/timer.c            |   54 +----------------------------
 5 files changed, 80 insertions(+), 65 deletions(-)

Index: linux-2.6-tip/include/linux/sched.h
===================================================================
--- linux-2.6-tip.orig/include/linux/sched.h
+++ linux-2.6-tip/include/linux/sched.h
@@ -137,9 +137,9 @@ DECLARE_PER_CPU(unsigned long, process_c
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
 extern u64 cpu_nr_migrations(int cpu);
+extern void calc_global_load(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
Index: linux-2.6-tip/kernel/sched.c
===================================================================
--- linux-2.6-tip.orig/kernel/sched.c
+++ linux-2.6-tip/kernel/sched.c
@@ -628,6 +628,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1726,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2934,19 +2940,57 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-	unsigned long i, running = 0, uninterruptible = 0;
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
 
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
 
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
 
-	return running + uninterruptible;
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2986,6 +3030,11 @@ static void update_cpu_load(struct rq *t
 			new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -7192,6 +7241,14 @@ static void migrate_dead_tasks(unsigned 
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7426,6 +7483,8 @@ migration_call(struct notifier_block *nf
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7465,7 +7524,7 @@ migration_call(struct notifier_block *nf
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -9162,6 +9221,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9269,6 +9330,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
Index: linux-2.6-tip/kernel/sched_idletask.c
===================================================================
--- linux-2.6-tip.orig/kernel/sched_idletask.c
+++ linux-2.6-tip/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(stru
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-
+	/* adjust the active tasks as we might go into a long sleep */
+	calc_load_account_active(rq);
 	return rq->idle;
 }
 
Index: linux-2.6-tip/kernel/time/timekeeping.c
===================================================================
--- linux-2.6-tip.orig/kernel/time/timekeeping.c
+++ linux-2.6-tip/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
Index: linux-2.6-tip/kernel/timer.c
===================================================================
--- linux-2.6-tip.orig/kernel/timer.c
+++ linux-2.6-tip/kernel/timer.c
@@ -1127,47 +1127,6 @@ void update_process_times(int user_tick)
 }
 
 /*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-	return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_tasks();
-		do {
-			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
-}
-
-/*
  * This function runs timers and the timer-tq in bottom half context.
  */
 static void run_timer_softirq(struct softirq_action *h)
@@ -1193,16 +1152,6 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-	update_wall_time();
-	calc_load(ticks);
-}
-
-/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -1211,7 +1160,8 @@ static inline void update_times(unsigned
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
+	calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM



^ permalink raw reply	[flat|nested] 9+ messages in thread

* [patch 2/2] sched, timers: cleanup avenrun users
  2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
  2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
@ 2009-05-14 11:21 ` Thomas Gleixner
  2009-05-14 17:23 ` [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Dimitri Sivanich
  2009-05-15 12:23 ` Dimitri Sivanich
  3 siblings, 0 replies; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 11:21 UTC (permalink / raw)
  To: LKML; +Cc: Andrew Morton, Dimitri Sivanich, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: calc-load-cleanup-users.patch --]
[-- Type: text/plain, Size: 4324 bytes --]

avenrun is a rough estimate so we don't have to worry about
consistency of the three avenrun values. Remove the xtime lock
dependency and provide a function to scale the values. Cleanup the
users.

[ Impact: cleanup ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/loadavg.c     |   20 +++++++-------------
 include/linux/sched.h |    1 +
 kernel/sched.c        |   15 +++++++++++++++
 kernel/timer.c        |   32 ++++++--------------------------
 4 files changed, 29 insertions(+), 39 deletions(-)

Index: linux-2.6-tip/fs/proc/loadavg.c
===================================================================
--- linux-2.6-tip.orig/fs/proc/loadavg.c
+++ linux-2.6-tip/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
 
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
-	int a, b, c;
-	unsigned long seq;
+	unsigned long avnrun[3];
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		a = avenrun[0] + (FIXED_1/200);
-		b = avenrun[1] + (FIXED_1/200);
-		c = avenrun[2] + (FIXED_1/200);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-		LOAD_INT(a), LOAD_FRAC(a),
-		LOAD_INT(b), LOAD_FRAC(b),
-		LOAD_INT(c), LOAD_FRAC(c),
+	get_avenrun(avnrun, FIXED_1/200, 0);
+
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
 		nr_running(), nr_threads,
 		task_active_pid_ns(current)->last_pid);
 	return 0;
Index: linux-2.6-tip/include/linux/sched.h
===================================================================
--- linux-2.6-tip.orig/include/linux/sched.h
+++ linux-2.6-tip/include/linux/sched.h
@@ -118,6 +118,7 @@ struct bts_context;
  *    11 bit fractions.
  */
 extern unsigned long avenrun[];		/* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
 
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
Index: linux-2.6-tip/kernel/sched.c
===================================================================
--- linux-2.6-tip.orig/kernel/sched.c
+++ linux-2.6-tip/kernel/sched.c
@@ -2946,6 +2946,21 @@ static unsigned long calc_load_update;
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
Index: linux-2.6-tip/kernel/timer.c
===================================================================
--- linux-2.6-tip.orig/kernel/timer.c
+++ linux-2.6-tip/kernel/timer.c
@@ -1362,37 +1362,17 @@ int do_sysinfo(struct sysinfo *info)
 {
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
-	unsigned long seq;
+	struct timespec tp;
 
 	memset(info, 0, sizeof(struct sysinfo));
 
-	do {
-		struct timespec tp;
-		seq = read_seqbegin(&xtime_lock);
-
-		/*
-		 * This is annoying.  The below is the same thing
-		 * posix_get_clock_monotonic() does, but it wants to
-		 * take the lock which we want to cover the loads stuff
-		 * too.
-		 */
-
-		getnstimeofday(&tp);
-		tp.tv_sec += wall_to_monotonic.tv_sec;
-		tp.tv_nsec += wall_to_monotonic.tv_nsec;
-		monotonic_to_bootbased(&tp);
-		if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-			tp.tv_sec++;
-		}
-		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+	ktime_get_ts(&tp);
+	monotonic_to_bootbased(&tp);
+	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-		info->procs = nr_threads;
-	} while (read_seqretry(&xtime_lock, seq));
+	info->procs = nr_threads;
 
 	si_meminfo(info);
 	si_swapinfo(info);



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop
  2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
  2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
  2009-05-14 11:21 ` [patch 2/2] sched, timers: cleanup avenrun users Thomas Gleixner
@ 2009-05-14 17:23 ` Dimitri Sivanich
  2009-05-14 19:26   ` Thomas Gleixner
  2009-05-15 12:23 ` Dimitri Sivanich
  3 siblings, 1 reply; 9+ messages in thread
From: Dimitri Sivanich @ 2009-05-14 17:23 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Peter Zijlstra, Ingo Molnar

I ran this patchset on a 64p ia64 machine and latency-wise it looks good.  I can't really speak for the loadavg calculation.

On Thu, May 14, 2009 at 11:21:14AM -0000, Thomas Gleixner wrote:
> Dimitri Sivanich pointed out that calc_load iterates over all online
> CPUs under xtime lock which can cause long latencies on large SMP
> systems.
> 
> The following patch series removes the iteration loop and lets
> scheduler_tick() on each CPU update the number of active tasks. The
> calc_load() function just uses this variable and updates avenrun from
> do_timer().
> 
> Dimitri, can you please test run this on one of your large machines ?
> 
> Thanks,
> 
> 	tglx
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop
  2009-05-14 17:23 ` [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Dimitri Sivanich
@ 2009-05-14 19:26   ` Thomas Gleixner
  0 siblings, 0 replies; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 19:26 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: LKML, Andrew Morton, Peter Zijlstra, Ingo Molnar

On Thu, 14 May 2009, Dimitri Sivanich wrote:

> I ran this patchset on a 64p ia64 machine and latency-wise it looks good.  I can't really speak for the loadavg calculation.

I did some comparisons vs. the old implementation and it looks pretty
much the same. Thanks for testing,

     tglx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch 1/2] sched, timers: move calc_load() to scheduler
  2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
@ 2009-05-14 20:00   ` Peter Zijlstra
  2009-05-14 20:40     ` Thomas Gleixner
  0 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-14 20:00 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Dimitri Sivanich, Ingo Molnar

On Thu, 2009-05-14 at 11:21 +0000, Thomas Gleixner wrote:
> plain text document attachment (move-calc-load-to-scheduler-v1.patch)

> +/*
> + * calc_load - update the avenrun load estimates 10 ticks after the
> + * CPUs have updated calc_load_tasks.
> + */
> +void calc_global_load(void)
> +{
> +	unsigned long upd = calc_load_update + 10;
> +	long active;
> +
> +	if (time_before(jiffies, upd))
> +		return;
>  
> +	active = atomic_long_read(&calc_load_tasks);
> +	active = active > 0 ? active * FIXED_1 : 0;
>  
> +	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
> +	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
> +	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
> +
> +	calc_load_update += LOAD_FREQ;
> +}

> @@ -1211,7 +1160,8 @@ static inline void update_times(unsigned
>  void do_timer(unsigned long ticks)
>  {
>  	jiffies_64 += ticks;
> -	update_times(ticks);
> +	update_wall_time();
> +	calc_global_load();
>  }

I can see multiple cpus fall into calc_global_load() concurrently, which
would 'age' the load faster than expected.

Should we plug that hole?


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch 1/2] sched, timers: move calc_load() to scheduler
  2009-05-14 20:00   ` Peter Zijlstra
@ 2009-05-14 20:40     ` Thomas Gleixner
  2009-05-15  5:13       ` Peter Zijlstra
  0 siblings, 1 reply; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 20:40 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Andrew Morton, Dimitri Sivanich, Ingo Molnar

On Thu, 14 May 2009, Peter Zijlstra wrote:
> On Thu, 2009-05-14 at 11:21 +0000, Thomas Gleixner wrote:
> > plain text document attachment (move-calc-load-to-scheduler-v1.patch)
> 
> > +/*
> > + * calc_load - update the avenrun load estimates 10 ticks after the
> > + * CPUs have updated calc_load_tasks.
> > + */
> > +void calc_global_load(void)
> > +{
> > +	unsigned long upd = calc_load_update + 10;
> > +	long active;
> > +
> > +	if (time_before(jiffies, upd))
> > +		return;
> >  
> > +	active = atomic_long_read(&calc_load_tasks);
> > +	active = active > 0 ? active * FIXED_1 : 0;
> >  
> > +	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
> > +	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
> > +	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
> > +
> > +	calc_load_update += LOAD_FREQ;
> > +}
> 
> > @@ -1211,7 +1160,8 @@ static inline void update_times(unsigned
> >  void do_timer(unsigned long ticks)
> >  {
> >  	jiffies_64 += ticks;
> > -	update_times(ticks);
> > +	update_wall_time();
> > +	calc_global_load();
> >  }
> 
> I can see multiple cpus fall into calc_global_load() concurrently, which
> would 'age' the load faster than expected.
> 
> Should we plug that hole?

They can't. do_timer() is called by exactly one CPU under xtime
lock. What we removed is the loop over all online CPUs to retrieve the
number of active tasks.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch 1/2] sched, timers: move calc_load() to scheduler
  2009-05-14 20:40     ` Thomas Gleixner
@ 2009-05-15  5:13       ` Peter Zijlstra
  0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-15  5:13 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Dimitri Sivanich, Ingo Molnar

On Thu, 2009-05-14 at 22:40 +0200, Thomas Gleixner wrote:
> On Thu, 14 May 2009, Peter Zijlstra wrote:
> > On Thu, 2009-05-14 at 11:21 +0000, Thomas Gleixner wrote:
> > > plain text document attachment (move-calc-load-to-scheduler-v1.patch)
> > 
> > > +/*
> > > + * calc_load - update the avenrun load estimates 10 ticks after the
> > > + * CPUs have updated calc_load_tasks.
> > > + */
> > > +void calc_global_load(void)
> > > +{
> > > +	unsigned long upd = calc_load_update + 10;
> > > +	long active;
> > > +
> > > +	if (time_before(jiffies, upd))
> > > +		return;
> > >  
> > > +	active = atomic_long_read(&calc_load_tasks);
> > > +	active = active > 0 ? active * FIXED_1 : 0;
> > >  
> > > +	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
> > > +	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
> > > +	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
> > > +
> > > +	calc_load_update += LOAD_FREQ;
> > > +}
> > 
> > > @@ -1211,7 +1160,8 @@ static inline void update_times(unsigned
> > >  void do_timer(unsigned long ticks)
> > >  {
> > >  	jiffies_64 += ticks;
> > > -	update_times(ticks);
> > > +	update_wall_time();
> > > +	calc_global_load();
> > >  }
> > 
> > I can see multiple cpus fall into calc_global_load() concurrently, which
> > would 'age' the load faster than expected.
> > 
> > Should we plug that hole?
> 
> They can't. do_timer() is called by exactly one CPU under xtime
> lock. What we removed is the loop over all online CPUs to retrieve the
> number of active tasks.

Ah, D'0h, yes.

Patches look good then.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop
  2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
                   ` (2 preceding siblings ...)
  2009-05-14 17:23 ` [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Dimitri Sivanich
@ 2009-05-15 12:23 ` Dimitri Sivanich
  3 siblings, 0 replies; 9+ messages in thread
From: Dimitri Sivanich @ 2009-05-15 12:23 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Peter Zijlstra, Ingo Molnar

Acked-by: Dimitri Sivanich <sivanich@sgi.com>

On Thu, May 14, 2009 at 11:21:14AM -0000, Thomas Gleixner wrote:
> Dimitri Sivanich pointed out that calc_load iterates over all online
> CPUs under xtime lock which can cause long latencies on large SMP
> systems.
> 
> The following patch series removes the iteration loop and lets
> scheduler_tick() on each CPU update the number of active tasks. The
> calc_load() function just uses this variable and updates avenrun from
> do_timer().
> 
> Dimitri, can you please test run this on one of your large machines ?
> 
> Thanks,
> 
> 	tglx
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2009-05-15 12:28 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
2009-05-14 20:00   ` Peter Zijlstra
2009-05-14 20:40     ` Thomas Gleixner
2009-05-15  5:13       ` Peter Zijlstra
2009-05-14 11:21 ` [patch 2/2] sched, timers: cleanup avenrun users Thomas Gleixner
2009-05-14 17:23 ` [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Dimitri Sivanich
2009-05-14 19:26   ` Thomas Gleixner
2009-05-15 12:23 ` Dimitri Sivanich

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox