* [patch 1/3] timers: use function instead of macro for calc_load
2009-05-02 19:06 [patch 0/3] move calc_load to softirq Thomas Gleixner
@ 2009-05-02 19:06 ` Thomas Gleixner
2009-05-07 18:16 ` Dimitri Sivanich
2009-05-02 19:06 ` [patch 2/3] timer: move calc_load to softirq Thomas Gleixner
2009-05-02 19:06 ` [patch 3/3] timers: cleanup avenrun users Thomas Gleixner
2 siblings, 1 reply; 8+ messages in thread
From: Thomas Gleixner @ 2009-05-02 19:06 UTC (permalink / raw)
To: LKML; +Cc: Dimitri Sivanich, Andrew Morton, Ingo Molnar, Peter Zijlstra
[-- Attachment #1: calc-load-use-function-instead-of-macro.patch --]
[-- Type: text/plain, Size: 2764 bytes --]
Replace the CALC_LOAD macro by a static function.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/sched.h | 5 -----
kernel/timer.c | 29 ++++++++++++++---------------
2 files changed, 14 insertions(+), 20 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -124,11 +124,6 @@ extern unsigned long avenrun[]; /* Load
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
-#define CALC_LOAD(load,exp,n) \
- load *= exp; \
- load += n*(FIXED_1-exp); \
- load >>= FSHIFT;
-
extern unsigned long total_forks;
extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1123,14 +1123,6 @@ void update_process_times(int user_tick)
}
/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
- return nr_active() * FIXED_1;
-}
-
-/*
* Hmm.. Changed this, as the GNU make sources (load.c) seems to
* imply that avenrun[] is the standard name for this kind of thing.
* Nothing else seems to be standardized: the fractional size etc
@@ -1139,25 +1131,32 @@ static unsigned long count_active_tasks(
* Requires xtime_lock to access.
*/
unsigned long avenrun[3];
-
EXPORT_SYMBOL(avenrun);
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ return load >> FSHIFT;
+}
+
/*
* calc_load - given tick count, update the avenrun load estimates.
* This is called while holding a write_lock on xtime_lock.
*/
-static inline void calc_load(unsigned long ticks)
+static void calc_global_load(unsigned long ticks)
{
unsigned long active_tasks; /* fixed-point */
static int count = LOAD_FREQ;
count -= ticks;
if (unlikely(count < 0)) {
- active_tasks = count_active_tasks();
+ active_tasks = nr_active() * FIXED_1;
do {
- CALC_LOAD(avenrun[0], EXP_1, active_tasks);
- CALC_LOAD(avenrun[1], EXP_5, active_tasks);
- CALC_LOAD(avenrun[2], EXP_15, active_tasks);
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
count += LOAD_FREQ;
} while (count < 0);
}
@@ -1193,7 +1192,7 @@ void run_local_timers(void)
static inline void update_times(unsigned long ticks)
{
update_wall_time();
- calc_load(ticks);
+ calc_global_load(ticks);
}
/*
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [patch 1/3] timers: use function instead of macro for calc_load
2009-05-02 19:06 ` [patch 1/3] timers: use function instead of macro for calc_load Thomas Gleixner
@ 2009-05-07 18:16 ` Dimitri Sivanich
0 siblings, 0 replies; 8+ messages in thread
From: Dimitri Sivanich @ 2009-05-07 18:16 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Ingo Molnar, Peter Zijlstra
On Sat, May 02, 2009 at 07:06:31PM -0000, Thomas Gleixner wrote:
> Replace the CALC_LOAD macro by a static function.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Dimitri Sivanich <sivanich@sgi.com>
> ---
> include/linux/sched.h | 5 -----
> kernel/timer.c | 29 ++++++++++++++---------------
> 2 files changed, 14 insertions(+), 20 deletions(-)
>
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h
> +++ linux-2.6/include/linux/sched.h
> @@ -124,11 +124,6 @@ extern unsigned long avenrun[]; /* Load
> #define EXP_5 2014 /* 1/exp(5sec/5min) */
> #define EXP_15 2037 /* 1/exp(5sec/15min) */
>
> -#define CALC_LOAD(load,exp,n) \
> - load *= exp; \
> - load += n*(FIXED_1-exp); \
> - load >>= FSHIFT;
> -
> extern unsigned long total_forks;
> extern int nr_threads;
> DECLARE_PER_CPU(unsigned long, process_counts);
> Index: linux-2.6/kernel/timer.c
> ===================================================================
> --- linux-2.6.orig/kernel/timer.c
> +++ linux-2.6/kernel/timer.c
> @@ -1123,14 +1123,6 @@ void update_process_times(int user_tick)
> }
>
> /*
> - * Nr of active tasks - counted in fixed-point numbers
> - */
> -static unsigned long count_active_tasks(void)
> -{
> - return nr_active() * FIXED_1;
> -}
> -
> -/*
> * Hmm.. Changed this, as the GNU make sources (load.c) seems to
> * imply that avenrun[] is the standard name for this kind of thing.
> * Nothing else seems to be standardized: the fractional size etc
> @@ -1139,25 +1131,32 @@ static unsigned long count_active_tasks(
> * Requires xtime_lock to access.
> */
> unsigned long avenrun[3];
> -
> EXPORT_SYMBOL(avenrun);
>
> +static unsigned long
> +calc_load(unsigned long load, unsigned long exp, unsigned long active)
> +{
> + load *= exp;
> + load += active * (FIXED_1 - exp);
> + return load >> FSHIFT;
> +}
> +
> /*
> * calc_load - given tick count, update the avenrun load estimates.
> * This is called while holding a write_lock on xtime_lock.
> */
> -static inline void calc_load(unsigned long ticks)
> +static void calc_global_load(unsigned long ticks)
> {
> unsigned long active_tasks; /* fixed-point */
> static int count = LOAD_FREQ;
>
> count -= ticks;
> if (unlikely(count < 0)) {
> - active_tasks = count_active_tasks();
> + active_tasks = nr_active() * FIXED_1;
> do {
> - CALC_LOAD(avenrun[0], EXP_1, active_tasks);
> - CALC_LOAD(avenrun[1], EXP_5, active_tasks);
> - CALC_LOAD(avenrun[2], EXP_15, active_tasks);
> + avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> + avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> + avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> count += LOAD_FREQ;
> } while (count < 0);
> }
> @@ -1193,7 +1192,7 @@ void run_local_timers(void)
> static inline void update_times(unsigned long ticks)
> {
> update_wall_time();
> - calc_load(ticks);
> + calc_global_load(ticks);
> }
>
> /*
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* [patch 2/3] timer: move calc_load to softirq
2009-05-02 19:06 [patch 0/3] move calc_load to softirq Thomas Gleixner
2009-05-02 19:06 ` [patch 1/3] timers: use function instead of macro for calc_load Thomas Gleixner
@ 2009-05-02 19:06 ` Thomas Gleixner
2009-05-02 19:24 ` Andrew Morton
2009-05-02 19:06 ` [patch 3/3] timers: cleanup avenrun users Thomas Gleixner
2 siblings, 1 reply; 8+ messages in thread
From: Thomas Gleixner @ 2009-05-02 19:06 UTC (permalink / raw)
To: LKML; +Cc: Dimitri Sivanich, Andrew Morton, Ingo Molnar, Peter Zijlstra
[-- Attachment #1: calc-load-move-to-soft-irq.patch --]
[-- Type: text/plain, Size: 4106 bytes --]
xtime_lock is held write locked across calc_load() which iterates over
all online CPUs. That can cause long latencies for xtime_lock readers
on large SMP systems. The load average calculation is a rough
estimate anyway so there is no real need to protect the readers
vs. the update. It's not a problem when the avenrun array is updated
while a reader copies the values.
Move the calculation to the softirq and reduce the xtime_lock write
locked section. This also reduces the interrupts off section.
Inspired by an initial patch from Dimitri Sivanich.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/time/timekeeping.c | 2 -
kernel/timer.c | 59 +++++++++++++++++++++++++++++++++-------------
2 files changed, 44 insertions(+), 17 deletions(-)
Index: linux-2.6/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.orig/kernel/time/timekeeping.c
+++ linux-2.6/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
/*
* This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1127,12 +1127,14 @@ void update_process_times(int user_tick)
* imply that avenrun[] is the standard name for this kind of thing.
* Nothing else seems to be standardized: the fractional size etc
* all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
*/
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
+static atomic_t avenrun_ticks;
+static DEFINE_SPINLOCK(avenrun_lock);
+static DEFINE_PER_CPU(int, avenrun_calculate);
+
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -1143,23 +1145,47 @@ calc_load(unsigned long load, unsigned l
/*
* calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
*/
-static void calc_global_load(unsigned long ticks)
+static void calc_global_load(void)
{
- unsigned long active_tasks; /* fixed-point */
- static int count = LOAD_FREQ;
+ unsigned long active_tasks = nr_active() * FIXED_1;
- count -= ticks;
- if (unlikely(count < 0)) {
- active_tasks = nr_active() * FIXED_1;
- do {
- avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
- count += LOAD_FREQ;
- } while (count < 0);
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
+}
+
+/*
+ * Check whether do_timer has set avenrun_calculate. The variable is
+ * cpu local so we avoid cache line bouncing of avenrun_lock and
+ * avenrun_ticks. avenrun_lock protects the avenrun calculation.
+ */
+static void check_calc_load(void)
+{
+ int ticks, *calc = &__get_cpu_var(avenrun_calculate);
+
+ if (!*calc)
+ return;
+
+ spin_lock(&avenrun_lock);
+ ticks = atomic_read(&avenrun_ticks);
+ if (ticks >= LOAD_FREQ) {
+ atomic_sub(LOAD_FREQ, &avenrun_ticks);
+ calc_global_load();
}
+ spin_unlock(&avenrun_lock);
+ *calc = 0;
+}
+
+/*
+ * Update avenrun_ticks and trigger the load calculation when the
+ * result is >= LOAD_FREQ.
+ */
+static void calc_load_update(unsigned long ticks)
+{
+ ticks = atomic_add_return(ticks, &avenrun_ticks);
+ if (ticks >= LOAD_FREQ)
+ __get_cpu_var(avenrun_calculate) = 1;
}
/*
@@ -1169,6 +1195,7 @@ static void run_timer_softirq(struct sof
{
struct tvec_base *base = __get_cpu_var(tvec_bases);
+ check_calc_load();
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1192,7 +1219,7 @@ void run_local_timers(void)
static inline void update_times(unsigned long ticks)
{
update_wall_time();
- calc_global_load(ticks);
+ calc_load_update(ticks);
}
/*
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [patch 2/3] timer: move calc_load to softirq
2009-05-02 19:06 ` [patch 2/3] timer: move calc_load to softirq Thomas Gleixner
@ 2009-05-02 19:24 ` Andrew Morton
2009-05-02 19:54 ` Thomas Gleixner
0 siblings, 1 reply; 8+ messages in thread
From: Andrew Morton @ 2009-05-02 19:24 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: LKML, Dimitri Sivanich, Ingo Molnar, Peter Zijlstra
On Sat, 02 May 2009 19:06:35 -0000 Thomas Gleixner <tglx@linutronix.de> wrote:
> xtime_lock is held write locked across calc_load() which iterates over
> all online CPUs. That can cause long latencies for xtime_lock readers
> on large SMP systems. The load average calculation is an rough
> estimate anyway so there is no real need to protect the readers
> vs. the update. It's not a problem when the avenrun array is updated
> while a reader copies the values.
>
> Move the calculation to the softirq and reduce the xtime_lock write
> locked section. This also reduces the interrupts off section.
>
> ...
>
> +static atomic_t avenrun_ticks;
> +static DEFINE_SPINLOCK(avenrun_lock);
> +static DEFINE_PER_CPU(int, avenrun_calculate);
> +
> static unsigned long
> calc_load(unsigned long load, unsigned long exp, unsigned long active)
> {
> @@ -1143,23 +1145,47 @@ calc_load(unsigned long load, unsigned l
>
> /*
> * calc_load - given tick count, update the avenrun load estimates.
> - * This is called while holding a write_lock on xtime_lock.
> */
> -static void calc_global_load(unsigned long ticks)
> +static void calc_global_load(void)
> {
> - unsigned long active_tasks; /* fixed-point */
> - static int count = LOAD_FREQ;
> + unsigned long active_tasks = nr_active() * FIXED_1;
>
> - count -= ticks;
> - if (unlikely(count < 0)) {
> - active_tasks = nr_active() * FIXED_1;
> - do {
> - avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> - avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> - avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> - count += LOAD_FREQ;
> - } while (count < 0);
> + avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> + avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> + avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> +}
> +
> +/*
> + * Check whether do_timer has set avenrun_calculate. The variable is
> + * cpu local so we avoid cache line bouncing of avenrun_lock and
> + * avenrun_ticks. avenrun_lock protects the avenrun calculation.
> + */
> +static void check_calc_load(void)
> +{
> + int ticks, *calc = &__get_cpu_var(avenrun_calculate);
> +
> + if (!*calc)
> + return;
> +
> + spin_lock(&avenrun_lock);
> + ticks = atomic_read(&avenrun_ticks);
> + if (ticks >= LOAD_FREQ) {
> + atomic_sub(LOAD_FREQ, &avenrun_ticks);
> + calc_global_load();
> }
> + spin_unlock(&avenrun_lock);
> + *calc = 0;
> +}
I wonder if we really really need avenrun_lock. Various bits of code
(eg net/sched/em_meta.c) cheerily read avenrun[] without locking.
avenrun_lock could be made static within check_calc_load().
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [patch 2/3] timer: move calc_load to softirq
2009-05-02 19:24 ` Andrew Morton
@ 2009-05-02 19:54 ` Thomas Gleixner
2009-05-07 18:18 ` Dimitri Sivanich
0 siblings, 1 reply; 8+ messages in thread
From: Thomas Gleixner @ 2009-05-02 19:54 UTC (permalink / raw)
To: Andrew Morton; +Cc: LKML, Dimitri Sivanich, Ingo Molnar, Peter Zijlstra
On Sat, 2 May 2009, Andrew Morton wrote:
> > + spin_lock(&avenrun_lock);
> > + ticks = atomic_read(&avenrun_ticks);
> > + if (ticks >= LOAD_FREQ) {
> > + atomic_sub(LOAD_FREQ, &avenrun_ticks);
> > + calc_global_load();
> > }
> > + spin_unlock(&avenrun_lock);
> > + *calc = 0;
> > +}
>
> I wonder if we really really need avenrun_lock. Various bits of code
> (eg net/sched/em_meta.c) cheerily read avenrun[] without locking.
I don't care about the reader side anyway, the lock is just there to
protect the calc_load update from two cpus, but that's probably
paranoia.
Though, there is a theoretical race between 2 cpus which might want to
update avenrun_ticks in the NOHZ case, but thinking more about it we
can just prevent this by clever usage of the atomic ops on
avenrun_ticks.
Thanks,
tglx
----------->
Subject: timer: move calc_load to softirq
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 2 May 2009 19:43:41 +0200
xtime_lock is held write locked across calc_load() which iterates over
all online CPUs. That can cause long latencies for xtime_lock readers
on large SMP systems. The load average calculation is a rough
estimate anyway so there is no real need to protect the readers
vs. the update. It's not a problem when the avenrun array is updated
while a reader copies the values.
Move the calculation to the softirq and reduce the xtime_lock write
locked section. This also reduces the interrupts off section.
Inspired by an initial patch from Dimitri Sivanich.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/time/timekeeping.c | 2 -
kernel/timer.c | 57 ++++++++++++++++++++++++++++++++--------------
2 files changed, 41 insertions(+), 18 deletions(-)
Index: linux-2.6/kernel/time/timekeeping.c
===================================================================
--- linux-2.6.orig/kernel/time/timekeeping.c
+++ linux-2.6/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
/*
* This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1127,12 +1127,13 @@ void update_process_times(int user_tick)
* imply that avenrun[] is the standard name for this kind of thing.
* Nothing else seems to be standardized: the fractional size etc
* all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
*/
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
+static atomic_t avenrun_ticks;
+static DEFINE_PER_CPU(int, avenrun_calculate);
+
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -1143,23 +1144,44 @@ calc_load(unsigned long load, unsigned l
/*
* calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
*/
-static void calc_global_load(unsigned long ticks)
+static void calc_global_load(void)
{
- unsigned long active_tasks; /* fixed-point */
- static int count = LOAD_FREQ;
+ unsigned long active_tasks = nr_active() * FIXED_1;
- count -= ticks;
- if (unlikely(count < 0)) {
- active_tasks = nr_active() * FIXED_1;
- do {
- avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
- count += LOAD_FREQ;
- } while (count < 0);
- }
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
+}
+
+/*
+ * Check whether do_timer has set avenrun_calculate. The variable is
+ * cpu local so we avoid cache line bouncing of avenrun_ticks.
+ */
+static void check_calc_load(void)
+{
+ int ticks, *calc = &__get_cpu_var(avenrun_calculate);
+
+ if (!*calc)
+ return;
+
+ ticks = atomic_sub_return(LOAD_FREQ, &avenrun_ticks);
+ if (ticks >= 0)
+ calc_global_load();
+ else
+ atomic_add(LOAD_FREQ, &avenrun_ticks);
+ *calc = 0;
+}
+
+/*
+ * Update avenrun_ticks and trigger the load calculation when the
+ * result is >= LOAD_FREQ.
+ */
+static void calc_load_update(unsigned long ticks)
+{
+ ticks = atomic_add_return(ticks, &avenrun_ticks);
+ if (ticks >= LOAD_FREQ)
+ __get_cpu_var(avenrun_calculate) = 1;
}
/*
@@ -1169,6 +1191,7 @@ static void run_timer_softirq(struct sof
{
struct tvec_base *base = __get_cpu_var(tvec_bases);
+ check_calc_load();
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1192,7 +1215,7 @@ void run_local_timers(void)
static inline void update_times(unsigned long ticks)
{
update_wall_time();
- calc_global_load(ticks);
+ calc_load_update(ticks);
}
/*
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [patch 2/3] timer: move calc_load to softirq
2009-05-02 19:54 ` Thomas Gleixner
@ 2009-05-07 18:18 ` Dimitri Sivanich
0 siblings, 0 replies; 8+ messages in thread
From: Dimitri Sivanich @ 2009-05-07 18:18 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: Andrew Morton, LKML, Ingo Molnar, Peter Zijlstra
On Sat, May 02, 2009 at 09:54:49PM +0200, Thomas Gleixner wrote:
> On Sat, 2 May 2009, Andrew Morton wrote:
> > > + spin_lock(&avenrun_lock);
> > > + ticks = atomic_read(&avenrun_ticks);
> > > + if (ticks >= LOAD_FREQ) {
> > > + atomic_sub(LOAD_FREQ, &avenrun_ticks);
> > > + calc_global_load();
> > > }
> > > + spin_unlock(&avenrun_lock);
> > > + *calc = 0;
> > > +}
> >
> > I wonder if we really really need avenrun_lock. Various bits of code
> > (eg net/sched/em_meta.c) cheerily read avenrun[] without locking.
>
> I don't care about the reader side anyway, the lock is just there to
> protect the calc_load update from two cpus, but that's probably
> paranoia.
>
> Though, there is a theoretical race between 2 cpus which might want to
> update avenrun_ticks in the NOHZ case, but thinking more about it we
> can just prevent this by clever usage of the atomic ops on
> avenrun_ticks.
>
> Thanks,
>
> tglx
>
> ----------->
> Subject: timer: move calc_load to softirq
> From: Thomas Gleixner <tglx@linutronix.de>
> Date: Sat, 2 May 2009 19:43:41 +0200
>
> xtime_lock is held write locked across calc_load() which iterates over
> all online CPUs. That can cause long latencies for xtime_lock readers
> on large SMP systems. The load average calculation is an rough
> estimate anyway so there is no real need to protect the readers
> vs. the update. It's not a problem when the avenrun array is updated
> while a reader copies the values.
>
> Move the calculation to the softirq and reduce the xtime_lock write
> locked section. This also reduces the interrupts off section.
>
> Inspired by an inital patch from Dimitri Sivanich.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Dimitri Sivanich <sivanich@sgi.com>
> ---
> kernel/time/timekeeping.c | 2 -
> kernel/timer.c | 57 ++++++++++++++++++++++++++++++++--------------
> 2 files changed, 41 insertions(+), 18 deletions(-)
>
> Index: linux-2.6/kernel/time/timekeeping.c
> ===================================================================
> --- linux-2.6.orig/kernel/time/timekeeping.c
> +++ linux-2.6/kernel/time/timekeeping.c
> @@ -22,7 +22,7 @@
>
> /*
> * This read-write spinlock protects us from races in SMP while
> - * playing with xtime and avenrun.
> + * playing with xtime.
> */
> __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
>
> Index: linux-2.6/kernel/timer.c
> ===================================================================
> --- linux-2.6.orig/kernel/timer.c
> +++ linux-2.6/kernel/timer.c
> @@ -1127,12 +1127,13 @@ void update_process_times(int user_tick)
> * imply that avenrun[] is the standard name for this kind of thing.
> * Nothing else seems to be standardized: the fractional size etc
> * all seem to differ on different machines.
> - *
> - * Requires xtime_lock to access.
> */
> unsigned long avenrun[3];
> EXPORT_SYMBOL(avenrun);
>
> +static atomic_t avenrun_ticks;
> +static DEFINE_PER_CPU(int, avenrun_calculate);
> +
> static unsigned long
> calc_load(unsigned long load, unsigned long exp, unsigned long active)
> {
> @@ -1143,23 +1144,44 @@ calc_load(unsigned long load, unsigned l
>
> /*
> * calc_load - given tick count, update the avenrun load estimates.
> - * This is called while holding a write_lock on xtime_lock.
> */
> -static void calc_global_load(unsigned long ticks)
> +static void calc_global_load(void)
> {
> - unsigned long active_tasks; /* fixed-point */
> - static int count = LOAD_FREQ;
> + unsigned long active_tasks = nr_active() * FIXED_1;
>
> - count -= ticks;
> - if (unlikely(count < 0)) {
> - active_tasks = nr_active() * FIXED_1;
> - do {
> - avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> - avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> - avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> - count += LOAD_FREQ;
> - } while (count < 0);
> - }
> + avenrun[0] = calc_load(avenrun[0], EXP_1, active_tasks);
> + avenrun[1] = calc_load(avenrun[1], EXP_5, active_tasks);
> + avenrun[2] = calc_load(avenrun[2], EXP_15, active_tasks);
> +}
> +
> +/*
> + * Check whether do_timer has set avenrun_calculate. The variable is
> + * cpu local so we avoid cache line bouncing of avenrun_ticks.
> + */
> +static void check_calc_load(void)
> +{
> + int ticks, *calc = &__get_cpu_var(avenrun_calculate);
> +
> + if (!*calc)
> + return;
> +
> + ticks = atomic_sub_return(LOAD_FREQ, &avenrun_ticks);
> + if (ticks >= 0)
> + calc_global_load();
> + else
> + atomic_add(LOAD_FREQ, &avenrun_ticks);
> + *calc = 0;
> +}
> +
> +/*
> + * Update avenrun_ticks and trigger the load calculation when the
> + * result is >= LOAD_FREQ.
> + */
> +static void calc_load_update(unsigned long ticks)
> +{
> + ticks = atomic_add_return(ticks, &avenrun_ticks);
> + if (ticks >= LOAD_FREQ)
> + __get_cpu_var(avenrun_calculate) = 1;
> }
>
> /*
> @@ -1169,6 +1191,7 @@ static void run_timer_softirq(struct sof
> {
> struct tvec_base *base = __get_cpu_var(tvec_bases);
>
> + check_calc_load();
> hrtimer_run_pending();
>
> if (time_after_eq(jiffies, base->timer_jiffies))
> @@ -1192,7 +1215,7 @@ void run_local_timers(void)
> static inline void update_times(unsigned long ticks)
> {
> update_wall_time();
> - calc_global_load(ticks);
> + calc_load_update(ticks);
> }
>
> /*
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* [patch 3/3] timers: cleanup avenrun users
2009-05-02 19:06 [patch 0/3] move calc_load to softirq Thomas Gleixner
2009-05-02 19:06 ` [patch 1/3] timers: use function instead of macro for calc_load Thomas Gleixner
2009-05-02 19:06 ` [patch 2/3] timer: move calc_load to softirq Thomas Gleixner
@ 2009-05-02 19:06 ` Thomas Gleixner
2 siblings, 0 replies; 8+ messages in thread
From: Thomas Gleixner @ 2009-05-02 19:06 UTC (permalink / raw)
To: LKML; +Cc: Dimitri Sivanich, Andrew Morton, Ingo Molnar, Peter Zijlstra
[-- Attachment #1: calc-load-cleanup-users.patch --]
[-- Type: text/plain, Size: 4010 bytes --]
avenrun is a rough estimate so we don't have to worry about
consistency of the three avenrun values. Remove the xtime lock
dependency and provide a function to scale the values. Cleanup the
users.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
fs/proc/loadavg.c | 16 +++++-----------
include/linux/sched.h | 1 +
kernel/timer.c | 47 +++++++++++++++++++++--------------------------
3 files changed, 27 insertions(+), 37 deletions(-)
Index: linux-2.6/fs/proc/loadavg.c
===================================================================
--- linux-2.6.orig/fs/proc/loadavg.c
+++ linux-2.6/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
static int loadavg_proc_show(struct seq_file *m, void *v)
{
- int a, b, c;
- unsigned long seq;
+ int avnrun[3];
- do {
- seq = read_seqbegin(&xtime_lock);
- a = avenrun[0] + (FIXED_1/200);
- b = avenrun[1] + (FIXED_1/200);
- c = avenrun[2] + (FIXED_1/200);
- } while (read_seqretry(&xtime_lock, seq));
+ get_avenrun(avnrun, FIXED_1/200, 0);
seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
- LOAD_INT(a), LOAD_FRAC(a),
- LOAD_INT(b), LOAD_FRAC(b),
- LOAD_INT(c), LOAD_FRAC(c),
+ LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+ LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+ LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
task_active_pid_ns(current)->last_pid);
return 0;
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -116,6 +116,7 @@ struct fs_struct;
* 11 bit fractions.
*/
extern unsigned long avenrun[]; /* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
Index: linux-2.6/kernel/timer.c
===================================================================
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1135,6 +1135,21 @@ static atomic_t avenrun_ticks;
static DEFINE_SPINLOCK(avenrun_lock);
static DEFINE_PER_CPU(int, avenrun_calculate);
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -1432,37 +1447,17 @@ int do_sysinfo(struct sysinfo *info)
{
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
- unsigned long seq;
+ struct timespec tp;
memset(info, 0, sizeof(struct sysinfo));
- do {
- struct timespec tp;
- seq = read_seqbegin(&xtime_lock);
-
- /*
- * This is annoying. The below is the same thing
- * posix_get_clock_monotonic() does, but it wants to
- * take the lock which we want to cover the loads stuff
- * too.
- */
-
- getnstimeofday(&tp);
- tp.tv_sec += wall_to_monotonic.tv_sec;
- tp.tv_nsec += wall_to_monotonic.tv_nsec;
- monotonic_to_bootbased(&tp);
- if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
- tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
- tp.tv_sec++;
- }
- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+ ktime_get_ts(&tp);
+ monotonic_to_bootbased(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
- info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
- info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
- info->procs = nr_threads;
- } while (read_seqretry(&xtime_lock, seq));
+ info->procs = nr_threads;
si_meminfo(info);
si_swapinfo(info);
^ permalink raw reply [flat|nested] 8+ messages in thread