* [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop
@ 2009-05-14 11:21 Thomas Gleixner
2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 11:21 UTC (permalink / raw)
To: LKML; +Cc: Andrew Morton, Dimitri Sivanich, Peter Zijlstra, Ingo Molnar
Dimitri Sivanich pointed out that calc_load iterates over all online
CPUs under xtime lock which can cause long latencies on large SMP
systems.
The following patch series removes the iteration loop and lets
scheduler_tick() on each CPU update the number of active tasks. The
calc_load() function just uses this variable and updates avenrun from
do_timer().
Dimitri, can you please test run this on one of your large machines?
Thanks,
tglx
^ permalink raw reply [flat|nested] 9+ messages in thread
* [patch 1/2] sched, timers: move calc_load() to scheduler
2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
@ 2009-05-14 11:21 ` Thomas Gleixner
2009-05-14 20:00 ` Peter Zijlstra
2009-05-14 11:21 ` [patch 2/2] sched, timers: cleanup avenrun users Thomas Gleixner
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 11:21 UTC (permalink / raw)
To: LKML; +Cc: Andrew Morton, Dimitri Sivanich, Peter Zijlstra, Ingo Molnar
[-- Attachment #1: move-calc-load-to-scheduler-v1.patch --]
[-- Type: text/plain, Size: 8866 bytes --]
Dimitri Sivanich noticed that xtime_lock is held write locked across
calc_load() which iterates over all online CPUs. That can cause long
latencies for xtime_lock readers on large SMP systems.
The load average calculation is a rough estimate anyway so there is
no real need to protect the readers vs. the update. It's not a problem
when the avenrun array is updated while a reader copies the values.
Instead of iterating over all online CPUs let the scheduler_tick code
update the number of active tasks shortly before the avenrun update
happens. The avenrun update itself is handled by the CPU which calls
do_timer().
[ Impact: reduce xtime_lock write locked section ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/sched.h | 2 -
kernel/sched.c | 84 ++++++++++++++++++++++++++++++++++++++++------
kernel/sched_idletask.c | 3 +
kernel/time/timekeeping.c | 2 -
kernel/timer.c | 54 +----------------------------
5 files changed, 80 insertions(+), 65 deletions(-)
Index: linux-2.6-tip/include/linux/sched.h
===================================================================
--- linux-2.6-tip.orig/include/linux/sched.h
+++ linux-2.6-tip/include/linux/sched.h
@@ -137,9 +137,9 @@ DECLARE_PER_CPU(unsigned long, process_c
extern int nr_processes(void);
extern unsigned long nr_running(void);
extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
extern unsigned long nr_iowait(void);
extern u64 cpu_nr_migrations(int cpu);
+extern void calc_global_load(void);
extern unsigned long get_parent_ip(unsigned long addr);
Index: linux-2.6-tip/kernel/sched.c
===================================================================
--- linux-2.6-tip.orig/kernel/sched.c
+++ linux-2.6-tip/kernel/sched.c
@@ -628,6 +628,10 @@ struct rq {
struct list_head migration_queue;
#endif
+ /* calc_load related fields */
+ unsigned long calc_load_update;
+ long calc_load_active;
+
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int hrtick_csd_pending;
@@ -1726,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs
}
#endif
+static void calc_load_account_active(struct rq *this_rq);
+
#include "sched_stats.h"
#include "sched_idletask.c"
#include "sched_fair.c"
@@ -2934,19 +2940,57 @@ unsigned long nr_iowait(void)
return sum;
}
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
- unsigned long i, running = 0, uninterruptible = 0;
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ return load >> FSHIFT;
+}
- for_each_online_cpu(i) {
- running += cpu_rq(i)->nr_running;
- uninterruptible += cpu_rq(i)->nr_uninterruptible;
- }
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+ unsigned long upd = calc_load_update + 10;
+ long active;
+
+ if (time_before(jiffies, upd))
+ return;
- if (unlikely((long)uninterruptible < 0))
- uninterruptible = 0;
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- return running + uninterruptible;
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+ long nr_active, delta;
+
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ atomic_long_add(delta, &calc_load_tasks);
+ }
}
/*
@@ -2986,6 +3030,11 @@ static void update_cpu_load(struct rq *t
new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
+
+ if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+ this_rq->calc_load_update += LOAD_FREQ;
+ calc_load_account_active(this_rq);
+ }
}
#ifdef CONFIG_SMP
@@ -7192,6 +7241,14 @@ static void migrate_dead_tasks(unsigned
}
}
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+ atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7426,6 +7483,8 @@ migration_call(struct notifier_block *nf
/* Update our root-domain */
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
+ rq->calc_load_update = calc_load_update;
+ rq->calc_load_active = 0;
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7465,7 +7524,7 @@ migration_call(struct notifier_block *nf
cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
-
+ calc_global_load_remove(rq);
/*
* No need to migrate the tasks: it was best-effort if
* they didn't take sched_hotcpu_mutex. Just wake up
@@ -9162,6 +9221,8 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
rq->nr_running = 0;
+ rq->calc_load_active = 0;
+ rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9269,6 +9330,9 @@ void __init sched_init(void)
* when this runqueue becomes "idle".
*/
init_idle(current, smp_processor_id());
+
+ calc_load_update = jiffies + LOAD_FREQ;
+
/*
* During early bootup we pretend to be a normal task:
*/
Index: linux-2.6-tip/kernel/sched_idletask.c
===================================================================
--- linux-2.6-tip.orig/kernel/sched_idletask.c
+++ linux-2.6-tip/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(stru
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
-
+ /* adjust the active tasks as we might go into a long sleep */
+ calc_load_account_active(rq);
return rq->idle;
}
Index: linux-2.6-tip/kernel/time/timekeeping.c
===================================================================
--- linux-2.6-tip.orig/kernel/time/timekeeping.c
+++ linux-2.6-tip/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
/*
* This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
Index: linux-2.6-tip/kernel/timer.c
===================================================================
--- linux-2.6-tip.orig/kernel/timer.c
+++ linux-2.6-tip/kernel/timer.c
@@ -1127,47 +1127,6 @@ void update_process_times(int user_tick)
}
/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
- return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
- unsigned long active_tasks; /* fixed-point */
- static int count = LOAD_FREQ;
-
- count -= ticks;
- if (unlikely(count < 0)) {
- active_tasks = count_active_tasks();
- do {
- CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(avenrun[2], EXP_15, active_tasks);
- count += LOAD_FREQ;
- } while (count < 0);
- }
-}
-
-/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
@@ -1193,16 +1152,6 @@ void run_local_timers(void)
}
/*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
- update_wall_time();
- calc_load(ticks);
-}
-
-/*
* The 64-bit jiffies value is not atomic - you MUST NOT read it
* without sampling the sequence number in xtime_lock.
* jiffies is defined in the linker script...
@@ -1211,7 +1160,8 @@ static inline void update_times(unsigned
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
- update_times(ticks);
+ update_wall_time();
+ calc_global_load();
}
#ifdef __ARCH_WANT_SYS_ALARM
^ permalink raw reply [flat|nested] 9+ messages in thread
* [patch 2/2] sched, timers: cleanup avenrun users
2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
@ 2009-05-14 11:21 ` Thomas Gleixner
2009-05-14 17:23 ` [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Dimitri Sivanich
2009-05-15 12:23 ` Dimitri Sivanich
3 siblings, 0 replies; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 11:21 UTC (permalink / raw)
To: LKML; +Cc: Andrew Morton, Dimitri Sivanich, Peter Zijlstra, Ingo Molnar
[-- Attachment #1: calc-load-cleanup-users.patch --]
[-- Type: text/plain, Size: 4324 bytes --]
avenrun is a rough estimate so we don't have to worry about
consistency of the three avenrun values. Remove the xtime lock
dependency and provide a function to scale the values. Cleanup the
users.
[ Impact: cleanup ]
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
fs/proc/loadavg.c | 20 +++++++-------------
include/linux/sched.h | 1 +
kernel/sched.c | 15 +++++++++++++++
kernel/timer.c | 32 ++++++--------------------------
4 files changed, 29 insertions(+), 39 deletions(-)
Index: linux-2.6-tip/fs/proc/loadavg.c
===================================================================
--- linux-2.6-tip.orig/fs/proc/loadavg.c
+++ linux-2.6-tip/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
static int loadavg_proc_show(struct seq_file *m, void *v)
{
- int a, b, c;
- unsigned long seq;
+ unsigned long avnrun[3];
- do {
- seq = read_seqbegin(&xtime_lock);
- a = avenrun[0] + (FIXED_1/200);
- b = avenrun[1] + (FIXED_1/200);
- c = avenrun[2] + (FIXED_1/200);
- } while (read_seqretry(&xtime_lock, seq));
-
- seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
- LOAD_INT(a), LOAD_FRAC(a),
- LOAD_INT(b), LOAD_FRAC(b),
- LOAD_INT(c), LOAD_FRAC(c),
+ get_avenrun(avnrun, FIXED_1/200, 0);
+
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+ LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+ LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+ LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
task_active_pid_ns(current)->last_pid);
return 0;
Index: linux-2.6-tip/include/linux/sched.h
===================================================================
--- linux-2.6-tip.orig/include/linux/sched.h
+++ linux-2.6-tip/include/linux/sched.h
@@ -118,6 +118,7 @@ struct bts_context;
* 11 bit fractions.
*/
extern unsigned long avenrun[]; /* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
Index: linux-2.6-tip/kernel/sched.c
===================================================================
--- linux-2.6-tip.orig/kernel/sched.c
+++ linux-2.6-tip/kernel/sched.c
@@ -2946,6 +2946,21 @@ static unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
Index: linux-2.6-tip/kernel/timer.c
===================================================================
--- linux-2.6-tip.orig/kernel/timer.c
+++ linux-2.6-tip/kernel/timer.c
@@ -1362,37 +1362,17 @@ int do_sysinfo(struct sysinfo *info)
{
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
- unsigned long seq;
+ struct timespec tp;
memset(info, 0, sizeof(struct sysinfo));
- do {
- struct timespec tp;
- seq = read_seqbegin(&xtime_lock);
-
- /*
- * This is annoying. The below is the same thing
- * posix_get_clock_monotonic() does, but it wants to
- * take the lock which we want to cover the loads stuff
- * too.
- */
-
- getnstimeofday(&tp);
- tp.tv_sec += wall_to_monotonic.tv_sec;
- tp.tv_nsec += wall_to_monotonic.tv_nsec;
- monotonic_to_bootbased(&tp);
- if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
- tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
- tp.tv_sec++;
- }
- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+ ktime_get_ts(&tp);
+ monotonic_to_bootbased(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
- info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
- info->procs = nr_threads;
- } while (read_seqretry(&xtime_lock, seq));
+ info->procs = nr_threads;
si_meminfo(info);
si_swapinfo(info);
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop
2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
2009-05-14 11:21 ` [patch 2/2] sched, timers: cleanup avenrun users Thomas Gleixner
@ 2009-05-14 17:23 ` Dimitri Sivanich
2009-05-14 19:26 ` Thomas Gleixner
2009-05-15 12:23 ` Dimitri Sivanich
3 siblings, 1 reply; 9+ messages in thread
From: Dimitri Sivanich @ 2009-05-14 17:23 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Peter Zijlstra, Ingo Molnar
I ran this patchset on a 64p ia64 machine and latency-wise it looks good. I can't really speak for the loadavg calculation.
On Thu, May 14, 2009 at 11:21:14AM -0000, Thomas Gleixner wrote:
> Dimitri Sivanich pointed out that calc_load iterates over all online
> CPUs under xtime lock which can cause long latencies on large SMP
> systems.
>
> The following patch series removes the iteration loop and lets
> scheduler_tick() on each CPU update the number of active tasks. The
> calc_load() function just uses this variable and updates avenrun from
> do_timer().
>
> Dimitri, can you please test run this on one of your large machines ?
>
> Thanks,
>
> tglx
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop
2009-05-14 17:23 ` [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Dimitri Sivanich
@ 2009-05-14 19:26 ` Thomas Gleixner
0 siblings, 0 replies; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 19:26 UTC (permalink / raw)
To: Dimitri Sivanich; +Cc: LKML, Andrew Morton, Peter Zijlstra, Ingo Molnar
On Thu, 14 May 2009, Dimitri Sivanich wrote:
> I ran this patchset on a 64p ia64 machine and latency-wise it looks good. I can't really speak for the loadavg calculation.
I did some comparisons vs. the old implementation and it looks pretty
much the same. Thanks for testing,
tglx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 1/2] sched, timers: move calc_load() to scheduler
2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
@ 2009-05-14 20:00 ` Peter Zijlstra
2009-05-14 20:40 ` Thomas Gleixner
0 siblings, 1 reply; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-14 20:00 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Dimitri Sivanich, Ingo Molnar
On Thu, 2009-05-14 at 11:21 +0000, Thomas Gleixner wrote:
> plain text document attachment (move-calc-load-to-scheduler-v1.patch)
> +/*
> + * calc_load - update the avenrun load estimates 10 ticks after the
> + * CPUs have updated calc_load_tasks.
> + */
> +void calc_global_load(void)
> +{
> + unsigned long upd = calc_load_update + 10;
> + long active;
> +
> + if (time_before(jiffies, upd))
> + return;
>
> + active = atomic_long_read(&calc_load_tasks);
> + active = active > 0 ? active * FIXED_1 : 0;
>
> + avenrun[0] = calc_load(avenrun[0], EXP_1, active);
> + avenrun[1] = calc_load(avenrun[1], EXP_5, active);
> + avenrun[2] = calc_load(avenrun[2], EXP_15, active);
> +
> + calc_load_update += LOAD_FREQ;
> +}
> @@ -1211,7 +1160,8 @@ static inline void update_times(unsigned
> void do_timer(unsigned long ticks)
> {
> jiffies_64 += ticks;
> - update_times(ticks);
> + update_wall_time();
> + calc_global_load();
> }
I can see multiple cpus fall into calc_global_load() concurrently, which
would 'age' the load faster than expected.
Should we plug that hole?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 1/2] sched, timers: move calc_load() to scheduler
2009-05-14 20:00 ` Peter Zijlstra
@ 2009-05-14 20:40 ` Thomas Gleixner
2009-05-15 5:13 ` Peter Zijlstra
0 siblings, 1 reply; 9+ messages in thread
From: Thomas Gleixner @ 2009-05-14 20:40 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: LKML, Andrew Morton, Dimitri Sivanich, Ingo Molnar
On Thu, 14 May 2009, Peter Zijlstra wrote:
> On Thu, 2009-05-14 at 11:21 +0000, Thomas Gleixner wrote:
> > plain text document attachment (move-calc-load-to-scheduler-v1.patch)
>
> > +/*
> > + * calc_load - update the avenrun load estimates 10 ticks after the
> > + * CPUs have updated calc_load_tasks.
> > + */
> > +void calc_global_load(void)
> > +{
> > + unsigned long upd = calc_load_update + 10;
> > + long active;
> > +
> > + if (time_before(jiffies, upd))
> > + return;
> >
> > + active = atomic_long_read(&calc_load_tasks);
> > + active = active > 0 ? active * FIXED_1 : 0;
> >
> > + avenrun[0] = calc_load(avenrun[0], EXP_1, active);
> > + avenrun[1] = calc_load(avenrun[1], EXP_5, active);
> > + avenrun[2] = calc_load(avenrun[2], EXP_15, active);
> > +
> > + calc_load_update += LOAD_FREQ;
> > +}
>
> > @@ -1211,7 +1160,8 @@ static inline void update_times(unsigned
> > void do_timer(unsigned long ticks)
> > {
> > jiffies_64 += ticks;
> > - update_times(ticks);
> > + update_wall_time();
> > + calc_global_load();
> > }
>
> I can see multiple cpus fall into calc_global_load() concurrently, which
> would 'age' the load faster than expected.
>
> Should we plug that hole?
They can't. do_timer() is called by exactly one CPU under xtime
lock. What we removed is the loop over all online CPUs to retrieve the
number of active tasks.
Thanks,
tglx
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 1/2] sched, timers: move calc_load() to scheduler
2009-05-14 20:40 ` Thomas Gleixner
@ 2009-05-15 5:13 ` Peter Zijlstra
0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2009-05-15 5:13 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Dimitri Sivanich, Ingo Molnar
On Thu, 2009-05-14 at 22:40 +0200, Thomas Gleixner wrote:
> On Thu, 14 May 2009, Peter Zijlstra wrote:
> > On Thu, 2009-05-14 at 11:21 +0000, Thomas Gleixner wrote:
> > > plain text document attachment (move-calc-load-to-scheduler-v1.patch)
> >
> > > +/*
> > > + * calc_load - update the avenrun load estimates 10 ticks after the
> > > + * CPUs have updated calc_load_tasks.
> > > + */
> > > +void calc_global_load(void)
> > > +{
> > > + unsigned long upd = calc_load_update + 10;
> > > + long active;
> > > +
> > > + if (time_before(jiffies, upd))
> > > + return;
> > >
> > > + active = atomic_long_read(&calc_load_tasks);
> > > + active = active > 0 ? active * FIXED_1 : 0;
> > >
> > > + avenrun[0] = calc_load(avenrun[0], EXP_1, active);
> > > + avenrun[1] = calc_load(avenrun[1], EXP_5, active);
> > > + avenrun[2] = calc_load(avenrun[2], EXP_15, active);
> > > +
> > > + calc_load_update += LOAD_FREQ;
> > > +}
> >
> > > @@ -1211,7 +1160,8 @@ static inline void update_times(unsigned
> > > void do_timer(unsigned long ticks)
> > > {
> > > jiffies_64 += ticks;
> > > - update_times(ticks);
> > > + update_wall_time();
> > > + calc_global_load();
> > > }
> >
> > I can see multiple cpus fall into calc_global_load() concurrently, which
> > would 'age' the load faster than expected.
> >
> > Should we plug that hole?
>
> They can't. do_timer() is called by exactly one CPU under xtime
> lock. What we removed is the loop over all online CPUs to retrieve the
> number of active tasks.
Ah, D'0h, yes.
Patches look good then.
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop
2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
` (2 preceding siblings ...)
2009-05-14 17:23 ` [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Dimitri Sivanich
@ 2009-05-15 12:23 ` Dimitri Sivanich
3 siblings, 0 replies; 9+ messages in thread
From: Dimitri Sivanich @ 2009-05-15 12:23 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Peter Zijlstra, Ingo Molnar
Acked-by: Dimitri Sivanich <sivanich@sgi.com>
On Thu, May 14, 2009 at 11:21:14AM -0000, Thomas Gleixner wrote:
> Dimitri Sivanich pointed out that calc_load iterates over all online
> CPUs under xtime lock which can cause long latencies on large SMP
> systems.
>
> The following patch series removes the iteration loop and lets
> scheduler_tick() on each CPU update the number of active tasks. The
> calc_load() function just uses this variable and updates avenrun from
> do_timer().
>
> Dimitri, can you please test run this on one of your large machines ?
>
> Thanks,
>
> tglx
>
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2009-05-15 12:28 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-05-14 11:21 [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Thomas Gleixner
2009-05-14 11:21 ` [patch 1/2] sched, timers: move calc_load() to scheduler Thomas Gleixner
2009-05-14 20:00 ` Peter Zijlstra
2009-05-14 20:40 ` Thomas Gleixner
2009-05-15 5:13 ` Peter Zijlstra
2009-05-14 11:21 ` [patch 2/2] sched, timers: cleanup avenrun users Thomas Gleixner
2009-05-14 17:23 ` [patch 0/2] sched, timers: simplify calc_load and avoid cpu iteration loop Dimitri Sivanich
2009-05-14 19:26 ` Thomas Gleixner
2009-05-15 12:23 ` Dimitri Sivanich
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox