* [patch 1/3] timers: use function instead of macro for calc_load
2009-05-02 19:06 [patch 0/3] move calc_load to softirq Thomas Gleixner
@ 2009-05-02 19:06 ` Thomas Gleixner
2009-05-07 18:16 ` Dimitri Sivanich
2009-05-02 19:06 ` [patch 2/3] timer: move calc_load to softirq Thomas Gleixner
2009-05-02 19:06 ` [patch 3/3] timers: cleanup avenrun users Thomas Gleixner
2 siblings, 1 reply; 8+ messages in thread
From: Thomas Gleixner @ 2009-05-02 19:06 UTC (permalink / raw)
To: LKML; +Cc: Dimitri Sivanich, Andrew Morton, Ingo Molnar, Peter Zijlstra
[-- Attachment #1: calc-load-use-function-instead-of-macro.patch --]
[-- Type: text/plain, Size: 2764 bytes --]
Replace the CALC_LOAD macro by a static function.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/sched.h | 5 -----
kernel/timer.c | 29 ++++++++++++++---------------
2 files changed, 14 insertions(+), 20 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -124,11 +124,6 @@ extern unsigned long avenrun[]; /* Load
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
-#define CALC_LOAD(load,exp,n) \
- load *= exp; \
- load += n*(FIXED_1-exp); \
- load >>= FSHIFT;
-
extern unsigned long total_forks;
extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1123,14 +1123,6 @@ void update_process_times(int user_tick)
}
/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
- return nr_active() * FIXED_1;
-}
-
-/*
* Hmm.. Changed this, as the GNU make sources (load.c) seems to
* imply that avenrun[] is the standard name for this kind of thing.
* Nothing else seems to be standardized: the fractional size etc
@@ -1139,25 +1131,32 @@ static unsigned long count_active_tasks(
* Requires xtime_lock to access.
*/
unsigned long avenrun[3];
-
EXPORT_SYMBOL(avenrun);
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ return load >> FSHIFT;
+}
+
/*
* calc_load - given tick count, update the avenrun load estimates.
* This is called while holding a write_lock on xtime_lock.
*/
-static inline void calc_load(unsigned long ticks)
+static void calc_global_load(unsigned long ticks)
{
unsigned long active_tasks; /* fixed-point */
static int count = LOAD_FREQ;
count -= ticks;
if (unlikely(count < 0)) {
- active_tasks = count_active_tasks();
+ active_tasks = nr_active() * FIXED_1;
do {
- CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
count += LOAD_FREQ;
} while (count < 0);
}
@@ -1193,7 +1192,7 @@ void run_local_timers(void)
static inline void update_times(unsigned long ticks)
{
update_wall_time();
- calc_load(ticks);
+ calc_global_load(ticks);
}
/*
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [patch 1/3] timers: use function instead of macro for calc_load
2009-05-02 19:06 ` [patch 1/3] timers: use function instead of macro for calc_load Thomas Gleixner
@ 2009-05-07 18:16 ` Dimitri Sivanich
0 siblings, 0 replies; 8+ messages in thread
From: Dimitri Sivanich @ 2009-05-07 18:16 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Ingo Molnar, Peter Zijlstra
On Sat, May 02, 2009 at 07:06:31PM -0000, Thomas Gleixner wrote:
> Replace the CALC_LOAD macro by a static function.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Dimitri Sivanich <sivanich@sgi.com>
> ---
> include/linux/sched.h | 5 -----
> kernel/timer.c | 29 ++++++++++++++---------------
> 2 files changed, 14 insertions(+), 20 deletions(-)
>
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -124,11 +124,6 @@ extern unsigned long avenrun[]; /* Load
> #define EXP_5 2014 /* 1/exp(5sec/5min) */
> #define EXP_15 2037 /* 1/exp(5sec/15min) */
>
> -#define CALC_LOAD(load,exp,n) \
> - load *= exp; \
> - load += n*(FIXED_1-exp); \
> - load >>= FSHIFT;
> -
> extern unsigned long total_forks;
> extern int nr_threads;
> DECLARE_PER_CPU(unsigned long, process_counts);
> Index: linux-2.6/kernel/timer.c
> ===================================================================
> --- linux-2.6.orig/kernel/timer.c
> +++ linux-2.6/kernel/timer.c
> @@ -1123,14 +1123,6 @@ void update_process_times(int user_tick)
> }
>
> /*
> - * Nr of active tasks - counted in fixed-point numbers
> - */
> -static unsigned long count_active_tasks(void)
> -{
> - return nr_active() * FIXED_1;
> -}
> -
> -/*
> * Hmm.. Changed this, as the GNU make sources (load.c) seems to
> * imply that avenrun[] is the standard name for this kind of thing.
> * Nothing else seems to be standardized: the fractional size etc
> @@ -1139,25 +1131,32 @@ static unsigned long count_active_tasks(
> * Requires xtime_lock to access.
> */
> unsigned long avenrun[3];
> -
> EXPORT_SYMBOL(avenrun);
>
> +static unsigned long
> +calc_load(unsigned long load, unsigned long exp, unsigned long active)
> +{
> + load *= exp;
> + load += active * (FIXED_1 - exp);
> + return load >> FSHIFT;
> +}
> +
> /*
> * calc_load - given tick count, update the avenrun load estimates.
> * This is called while holding a write_lock on xtime_lock.
> */
> -static inline void calc_load(unsigned long ticks)
> +static void calc_global_load(unsigned long ticks)
> {
> unsigned long active_tasks; /* fixed-point */
> static int count = LOAD_FREQ;
>
> count -= ticks;
> if (unlikely(count < 0)) {
> - active_tasks = count_active_tasks();
> + active_tasks = nr_active() * FIXED_1;
> do {
> - CALC_LOAD(avenrun[0], EXP_1, active_tasks);
> - CALC_LOAD(avenrun[1], EXP_5, active_tasks);
> - CALC_LOAD(avenrun[2], EXP_15, active_tasks);
> + avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> + avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> + avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> count += LOAD_FREQ;
> } while (count < 0);
> }
> @@ -1193,7 +1192,7 @@ void run_local_timers(void)
> static inline void update_times(unsigned long ticks)
> {
> update_wall_time();
> - calc_load(ticks);
> + calc_global_load(ticks);
> }
>
> /*
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* [patch 2/3] timer: move calc_load to softirq
2009-05-02 19:06 [patch 0/3] move calc_load to softirq Thomas Gleixner
2009-05-02 19:06 ` [patch 1/3] timers: use function instead of macro for calc_load Thomas Gleixner
@ 2009-05-02 19:06 ` Thomas Gleixner
2009-05-02 19:24 ` Andrew Morton
2009-05-02 19:06 ` [patch 3/3] timers: cleanup avenrun users Thomas Gleixner
2 siblings, 1 reply; 8+ messages in thread
From: Thomas Gleixner @ 2009-05-02 19:06 UTC (permalink / raw)
To: LKML; +Cc: Dimitri Sivanich, Andrew Morton, Ingo Molnar, Peter Zijlstra
[-- Attachment #1: calc-load-move-to-soft-irq.patch --]
[-- Type: text/plain, Size: 4106 bytes --]
xtime_lock is held write locked across calc_load() which iterates over
all online CPUs. That can cause long latencies for xtime_lock readers
on large SMP systems. The load average calculation is a rough
estimate anyway so there is no real need to protect the readers
vs. the update. It's not a problem when the avenrun array is updated
while a reader copies the values.
Move the calculation to the softirq and reduce the xtime_lock write
locked section. This also reduces the interrupts off section.
Inspired by an initial patch from Dimitri Sivanich.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/time/timekeeping.c | 2 -
kernel/timer.c | 59 +++++++++++++++++++++++++++++++++-------------
2 files changed, 44 insertions(+), 17 deletions(-)
Index: linux-2.6/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.orig/kernel/time/timekeeping.c
+++ linux-2.6/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
/*
* This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1127,12 +1127,14 @@ void update_process_times(int user_tick)
* imply that avenrun[] is the standard name for this kind of thing.
* Nothing else seems to be standardized: the fractional size etc
* all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
*/
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
+static atomic_t avenrun_ticks;
+static DEFINE_SPINLOCK(avenrun_lock);
+static DEFINE_PER_CPU(int, avenrun_calculate);
+
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -1143,23 +1145,47 @@ calc_load(unsigned long load, unsigned l
/*
* calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
*/
-static void calc_global_load(unsigned long ticks)
+static void calc_global_load(void)
{
- unsigned long active_tasks; /* fixed-point */
- static int count = LOAD_FREQ;
+ unsigned long active_tasks = nr_active() * FIXED_1;
- count -= ticks;
- if (unlikely(count < 0)) {
- active_tasks = nr_active() * FIXED_1;
- do {
- avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
- count += LOAD_FREQ;
- } while (count < 0);
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
+}
+
+/*
+ * Check whether do_timer has set avenrun_calculate. The variable is
+ * cpu local so we avoid cache line bouncing of avenrun_lock and
+ * avenrun_ticks. avenrun_lock protects the avenrun calculation.
+ */
+static void check_calc_load(void)
+{
+ int ticks, *calc = &__get_cpu_var(avenrun_calculate);
+
+ if (!*calc)
+ return;
+
+ spin_lock(&avenrun_lock);
+ ticks = atomic_read(&avenrun_ticks);
+ if (ticks >= LOAD_FREQ) {
+ atomic_sub(LOAD_FREQ, &avenrun_ticks);
+ calc_global_load();
}
+ spin_unlock(&avenrun_lock);
+ *calc = 0;
+}
+
+/*
+ * Update avenrun_ticks and trigger the load calculation when the
+ * result is >= LOAD_FREQ.
+ */
+static void calc_load_update(unsigned long ticks)
+{
+ ticks = atomic_add_return(ticks, &avenrun_ticks);
+ if (ticks >= LOAD_FREQ)
+ __get_cpu_var(avenrun_calculate) = 1;
}
/*
@@ -1169,6 +1195,7 @@ static void run_timer_softirq(struct sof
{
struct tvec_base *base = __get_cpu_var(tvec_bases);
+ check_calc_load();
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1192,7 +1219,7 @@ void run_local_timers(void)
static inline void update_times(unsigned long ticks)
{
update_wall_time();
- calc_global_load(ticks);
+ calc_load_update(ticks);
}
/*
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [patch 2/3] timer: move calc_load to softirq
2009-05-02 19:06 ` [patch 2/3] timer: move calc_load to softirq Thomas Gleixner
@ 2009-05-02 19:24 ` Andrew Morton
2009-05-02 19:54 ` Thomas Gleixner
0 siblings, 1 reply; 8+ messages in thread
From: Andrew Morton @ 2009-05-02 19:24 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: LKML, Dimitri Sivanich, Ingo Molnar, Peter Zijlstra
On Sat, 02 May 2009 19:06:35 -0000 Thomas Gleixner <tglx@linutronix.de> wrote:
> xtime_lock is held write locked across calc_load() which iterates over
> all online CPUs. That can cause long latencies for xtime_lock readers
> on large SMP systems. The load average calculation is an rough
> estimate anyway so there is no real need to protect the readers
> vs. the update. It's not a problem when the avenrun array is updated
> while a reader copies the values.
>
> Move the calculation to the softirq and reduce the xtime_lock write
> locked section. This also reduces the interrupts off section.
>
> ...
>
> +static atomic_t avenrun_ticks;
> +static DEFINE_SPINLOCK(avenrun_lock);
> +static DEFINE_PER_CPU(int, avenrun_calculate);
> +
> static unsigned long
> calc_load(unsigned long load, unsigned long exp, unsigned long active)
> {
> @@ -1143,23 +1145,47 @@ calc_load(unsigned long load, unsigned l
>
> /*
> * calc_load - given tick count, update the avenrun load estimates.
> - * This is called while holding a write_lock on xtime_lock.
> */
> -static void calc_global_load(unsigned long ticks)
> +static void calc_global_load(void)
> {
> - unsigned long active_tasks; /* fixed-point */
> - static int count = LOAD_FREQ;
> + unsigned long active_tasks = nr_active() * FIXED_1;
>
> - count -= ticks;
> - if (unlikely(count < 0)) {
> - active_tasks = nr_active() * FIXED_1;
> - do {
> - avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> - avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> - avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> - count += LOAD_FREQ;
> - } while (count < 0);
> + avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> + avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> + avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> +}
> +
> +/*
> + * Check whether do_timer has set avenrun_calculate. The variable is
> + * cpu local so we avoid cache line bouncing of avenrun_lock and
> + * avenrun_ticks. avenrun_lock protects the avenrun calculation.
> + */
> +static void check_calc_load(void)
> +{
> + int ticks, *calc = &__get_cpu_var(avenrun_calculate);
> +
> + if (!*calc)
> + return;
> +
> + spin_lock(&avenrun_lock);
> + ticks = atomic_read(&avenrun_ticks);
> + if (ticks >= LOAD_FREQ) {
> + atomic_sub(LOAD_FREQ, &avenrun_ticks);
> + calc_global_load();
> }
> + spin_unlock(&avenrun_lock);
> + *calc = 0;
> +}
I wonder if we really really need avenrun_lock. Various bits of code
(eg net/sched/em_meta.c) cheerily read avenrun[] without locking.
avenrun_lock could be made static within check_calc_load().
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [patch 2/3] timer: move calc_load to softirq
2009-05-02 19:24 ` Andrew Morton
@ 2009-05-02 19:54 ` Thomas Gleixner
2009-05-07 18:18 ` Dimitri Sivanich
0 siblings, 1 reply; 8+ messages in thread
From: Thomas Gleixner @ 2009-05-02 19:54 UTC (permalink / raw)
To: Andrew Morton; +Cc: LKML, Dimitri Sivanich, Ingo Molnar, Peter Zijlstra
On Sat, 2 May 2009, Andrew Morton wrote:
> > + spin_lock(&avenrun_lock);
> > + ticks = atomic_read(&avenrun_ticks);
> > + if (ticks >= LOAD_FREQ) {
> > + atomic_sub(LOAD_FREQ, &avenrun_ticks);
> > + calc_global_load();
> > }
> > + spin_unlock(&avenrun_lock);
> > + *calc = 0;
> > +}
>
> I wonder if we really really need avenrun_lock. Various bits of code
> (eg net/sched/em_meta.c) cheerily read avenrun[] without locking.
I don't care about the reader side anyway, the lock is just there to
protect the calc_load update from two cpus, but that's probably
paranoia.
Though, there is a theoretical race between 2 cpus which might want to
update avenrun_ticks in the NOHZ case, but thinking more about it we
can just prevent this by clever usage of the atomic ops on
avenrun_ticks.
Thanks,
tglx
----------->
Subject: timer: move calc_load to softirq
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 2 May 2009 19:43:41 +0200
xtime_lock is held write locked across calc_load() which iterates over
all online CPUs. That can cause long latencies for xtime_lock readers
on large SMP systems. The load average calculation is a rough
estimate anyway so there is no real need to protect the readers
vs. the update. It's not a problem when the avenrun array is updated
while a reader copies the values.
Move the calculation to the softirq and reduce the xtime_lock write
locked section. This also reduces the interrupts off section.
Inspired by an initial patch from Dimitri Sivanich.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/time/timekeeping.c | 2 -
kernel/timer.c | 57 ++++++++++++++++++++++++++++++++--------------
2 files changed, 41 insertions(+), 18 deletions(-)
Index: linux-2.6/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.orig/kernel/time/timekeeping.c
+++ linux-2.6/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
/*
* This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1127,12 +1127,13 @@ void update_process_times(int user_tick)
* imply that avenrun[] is the standard name for this kind of thing.
* Nothing else seems to be standardized: the fractional size etc
* all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
*/
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
+static atomic_t avenrun_ticks;
+static DEFINE_PER_CPU(int, avenrun_calculate);
+
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -1143,23 +1144,44 @@ calc_load(unsigned long load, unsigned l
/*
* calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
*/
-static void calc_global_load(unsigned long ticks)
+static void calc_global_load(void)
{
- unsigned long active_tasks; /* fixed-point */
- static int count = LOAD_FREQ;
+ unsigned long active_tasks = nr_active() * FIXED_1;
- count -= ticks;
- if (unlikely(count < 0)) {
- active_tasks = nr_active() * FIXED_1;
- do {
- avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
- count += LOAD_FREQ;
- } while (count < 0);
- }
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
+}
+
+/*
+ * Check whether do_timer has set avenrun_calculate. The variable is
+ * cpu local so we avoid cache line bouncing of avenrun_ticks.
+ */
+static void check_calc_load(void)
+{
+ int ticks, *calc = &__get_cpu_var(avenrun_calculate);
+
+ if (!*calc)
+ return;
+
+ ticks = atomic_sub_return(LOAD_FREQ, &avenrun_ticks);
+ if (ticks >= 0)
+ calc_global_load();
+ else
+ atomic_add(LOAD_FREQ, &avenrun_ticks);
+ *calc = 0;
+}
+
+/*
+ * Update avenrun_ticks and trigger the load calculation when the
+ * result is >= LOAD_FREQ.
+ */
+static void calc_load_update(unsigned long ticks)
+{
+ ticks = atomic_add_return(ticks, &avenrun_ticks);
+ if (ticks >= LOAD_FREQ)
+ __get_cpu_var(avenrun_calculate) = 1;
}
/*
@@ -1169,6 +1191,7 @@ static void run_timer_softirq(struct sof
{
struct tvec_base *base = __get_cpu_var(tvec_bases);
+ check_calc_load();
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1192,7 +1215,7 @@ void run_local_timers(void)
static inline void update_times(unsigned long ticks)
{
update_wall_time();
- calc_global_load(ticks);
+ calc_load_update(ticks);
}
/*
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [patch 2/3] timer: move calc_load to softirq
2009-05-02 19:54 ` Thomas Gleixner
@ 2009-05-07 18:18 ` Dimitri Sivanich
0 siblings, 0 replies; 8+ messages in thread
From: Dimitri Sivanich @ 2009-05-07 18:18 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: Andrew Morton, LKML, Ingo Molnar, Peter Zijlstra
On Sat, May 02, 2009 at 09:54:49PM +0200, Thomas Gleixner wrote:
> On Sat, 2 May 2009, Andrew Morton wrote:
> > > + spin_lock(&avenrun_lock);
> > > + ticks = atomic_read(&avenrun_ticks);
> > > + if (ticks >= LOAD_FREQ) {
> > > + atomic_sub(LOAD_FREQ, &avenrun_ticks);
> > > + calc_global_load();
> > > }
> > > + spin_unlock(&avenrun_lock);
> > > + *calc = 0;
> > > +}
> >
> > I wonder if we really really need avenrun_lock. Various bits of code
> > (eg net/sched/em_meta.c) cheerily read avenrun[] without locking.
>
> I don't care about the reader side anyway, the lock is just there to
> protect the calc_load update from two cpus, but that's probably
> paranoia.
>
> Though, there is a theoretical race between 2 cpus which might want to
> update avenrun_ticks in the NOHZ case, but thinking more about it we
> can just prevent this by clever usage of the atomic ops on
> avenrun_ticks.
>
> Thanks,
>
> tglx
>
> ----------->
> Subject: timer: move calc_load to softirq
> From: Thomas Gleixner <tglx@linutronix.de>
> Date: Sat, 2 May 2009 19:43:41 +0200
>
> xtime_lock is held write locked across calc_load() which iterates over
> all online CPUs. That can cause long latencies for xtime_lock readers
> on large SMP systems. The load average calculation is an rough
> estimate anyway so there is no real need to protect the readers
> vs. the update. It's not a problem when the avenrun array is updated
> while a reader copies the values.
>
> Move the calculation to the softirq and reduce the xtime_lock write
> locked section. This also reduces the interrupts off section.
>
> Inspired by an inital patch from Dimitri Sivanich.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Dimitri Sivanich <sivanich@sgi.com>
> ---
> kernel/time/timekeeping.c | 2 -
> kernel/timer.c | 57 ++++++++++++++++++++++++++++++++--------------
> 2 files changed, 41 insertions(+), 18 deletions(-)
>
> Index: linux-2.6/kernel/time/timekeeping.c
> ===================================================================
> --- linux-2.6.orig/kernel/time/timekeeping.c
> +++ linux-2.6/kernel/time/timekeeping.c
> @@ -22,7 +22,7 @@
>
> /*
> * This read-write spinlock protects us from races in SMP while
> - * playing with xtime and avenrun.
> + * playing with xtime.
> */
> __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
>
> Index: linux-2.6/kernel/timer.c
> ===================================================================
> --- linux-2.6.orig/kernel/timer.c
> +++ linux-2.6/kernel/timer.c
> @@ -1127,12 +1127,13 @@ void update_process_times(int user_tick)
> * imply that avenrun[] is the standard name for this kind of thing.
> * Nothing else seems to be standardized: the fractional size etc
> * all seem to differ on different machines.
> - *
> - * Requires xtime_lock to access.
> */
> unsigned long avenrun[3];
> EXPORT_SYMBOL(avenrun);
>
> +static atomic_t avenrun_ticks;
> +static DEFINE_PER_CPU(int, avenrun_calculate);
> +
> static unsigned long
> calc_load(unsigned long load, unsigned long exp, unsigned long active)
> {
> @@ -1143,23 +1144,44 @@ calc_load(unsigned long load, unsigned l
>
> /*
> * calc_load - given tick count, update the avenrun load estimates.
> - * This is called while holding a write_lock on xtime_lock.
> */
> -static void calc_global_load(unsigned long ticks)
> +static void calc_global_load(void)
> {
> - unsigned long active_tasks; /* fixed-point */
> - static int count = LOAD_FREQ;
> + unsigned long active_tasks = nr_active() * FIXED_1;
>
> - count -= ticks;
> - if (unlikely(count < 0)) {
> - active_tasks = nr_active() * FIXED_1;
> - do {
> - avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> - avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> - avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> - count += LOAD_FREQ;
> - } while (count < 0);
> - }
> + avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> + avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> + avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> +}
> +
> +/*
> + * Check whether do_timer has set avenrun_calculate. The variable is
> + * cpu local so we avoid cache line bouncing of avenrun_ticks.
> + */
> +static void check_calc_load(void)
> +{
> + int ticks, *calc = &__get_cpu_var(avenrun_calculate);
> +
> + if (!*calc)
> + return;
> +
> + ticks = atomic_sub_return(LOAD_FREQ, &avenrun_ticks);
> + if (ticks >= 0)
> + calc_global_load();
> + else
> + atomic_add(LOAD_FREQ, &avenrun_ticks);
> + *calc = 0;
> +}
> +
> +/*
> + * Update avenrun_ticks and trigger the load calculation when the
> + * result is >= LOAD_FREQ.
> + */
> +static void calc_load_update(unsigned long ticks)
> +{
> + ticks = atomic_add_return(ticks, &avenrun_ticks);
> + if (ticks >= LOAD_FREQ)
> + __get_cpu_var(avenrun_calculate) = 1;
> }
>
> /*
> @@ -1169,6 +1191,7 @@ static void run_timer_softirq(struct sof
> {
> struct tvec_base *base = __get_cpu_var(tvec_bases);
>
> + check_calc_load();
> hrtimer_run_pending();
>
> if (time_after_eq(jiffies, base->timer_jiffies))
> @@ -1192,7 +1215,7 @@ void run_local_timers(void)
> static inline void update_times(unsigned long ticks)
> {
> update_wall_time();
> - calc_global_load(ticks);
> + calc_load_update(ticks);
> }
>
> /*
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* [patch 3/3] timers: cleanup avenrun users
2009-05-02 19:06 [patch 0/3] move calc_load to softirq Thomas Gleixner
2009-05-02 19:06 ` [patch 1/3] timers: use function instead of macro for calc_load Thomas Gleixner
2009-05-02 19:06 ` [patch 2/3] timer: move calc_load to softirq Thomas Gleixner
@ 2009-05-02 19:06 ` Thomas Gleixner
2 siblings, 0 replies; 8+ messages in thread
From: Thomas Gleixner @ 2009-05-02 19:06 UTC (permalink / raw)
To: LKML; +Cc: Dimitri Sivanich, Andrew Morton, Ingo Molnar, Peter Zijlstra
[-- Attachment #1: calc-load-cleanup-users.patch --]
[-- Type: text/plain, Size: 4010 bytes --]
avenrun is a rough estimate so we don't have to worry about
consistency of the three avenrun values. Remove the xtime lock
dependency and provide a function to scale the values. Cleanup the
users.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
fs/proc/loadavg.c | 16 +++++-----------
include/linux/sched.h | 1 +
kernel/timer.c | 47 +++++++++++++++++++++--------------------------
3 files changed, 27 insertions(+), 37 deletions(-)
Index: linux-2.6/fs/proc/loadavg.c
===================================================================
--- linux-2.6.orig/fs/proc/loadavg.c
+++ linux-2.6/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
static int loadavg_proc_show(struct seq_file *m, void *v)
{
- int a, b, c;
- unsigned long seq;
+ int avnrun[3];
- do {
- seq = read_seqbegin(&xtime_lock);
- a = avenrun[0] + (FIXED_1/200);
- b = avenrun[1] + (FIXED_1/200);
- c = avenrun[2] + (FIXED_1/200);
- } while (read_seqretry(&xtime_lock, seq));
+ get_avenrun(avnrun, FIXED_1/200, 0);
seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
- LOAD_INT(a), LOAD_FRAC(a),
- LOAD_INT(b), LOAD_FRAC(b),
- LOAD_INT(c), LOAD_FRAC(c),
+ LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+ LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+ LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
task_active_pid_ns(current)->last_pid);
return 0;
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -116,6 +116,7 @@ struct fs_struct;
* 11 bit fractions.
*/
extern unsigned long avenrun[]; /* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1135,6 +1135,21 @@ static atomic_t avenrun_ticks;
static DEFINE_SPINLOCK(avenrun_lock);
static DEFINE_PER_CPU(int, avenrun_calculate);
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -1432,37 +1447,17 @@ int do_sysinfo(struct sysinfo *info)
{
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
- unsigned long seq;
+ struct timespec tp;
memset(info, 0, sizeof(struct sysinfo));
- do {
- struct timespec tp;
- seq = read_seqbegin(&xtime_lock);
-
- /*
- * This is annoying. The below is the same thing
- * posix_get_clock_monotonic() does, but it wants to
- * take the lock which we want to cover the loads stuff
- * too.
- */
-
- getnstimeofday(&tp);
- tp.tv_sec += wall_to_monotonic.tv_sec;
- tp.tv_nsec += wall_to_monotonic.tv_nsec;
- monotonic_to_bootbased(&tp);
- if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
- tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
- tp.tv_sec++;
- }
- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+ ktime_get_ts(&tp);
+ monotonic_to_bootbased(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
- info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
- info->procs = nr_threads;
- } while (read_seqretry(&xtime_lock, seq));
+ info->procs = nr_threads;
si_meminfo(info);
si_swapinfo(info);
^ permalink raw reply [flat|nested] 8+ messages in thread