* [v5 PATCH 2/4] timers: Identifying the existing pinned timers
2009-04-06 16:13 [v5 PATCH 1/4] timers: Framework for identifying pinned timers Arun R Bharadwaj
@ 2009-04-06 16:16 ` Arun R Bharadwaj
2009-04-06 16:16 ` Arun R Bharadwaj
` (4 subsequent siblings)
5 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-06 16:16 UTC (permalink / raw)
To: linux-kernel, linux-pm
Cc: a.p.zijlstra, ego, tglx, mingo, andi, venkatesh.pallipadi, vatsa,
arjan, svaidy, Arun Bharadwaj
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-06 21:43:57]:
The following pinned hrtimers have been identified and marked:
1)sched_rt_period_timer
2)tick_sched_timer
3)stack_trace_timer_fn
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
arch/x86/kernel/apic/x2apic_uv_x.c | 2 +-
kernel/sched.c | 5 +++--
kernel/time/tick-sched.c | 7 ++++---
kernel/trace/trace_sysprof.c | 3 ++-
4 files changed, 10 insertions(+), 7 deletions(-)
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -236,7 +236,7 @@ static void start_rt_bandwidth(struct rt
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start_expires(&rt_b->rt_period_timer,
+ hrtimer_start_expires_pinned(&rt_b->rt_period_timer,
HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
@@ -1156,7 +1156,8 @@ static __init void init_hrtick(void)
*/
static void hrtick_start(struct rq *rq, u64 delay)
{
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+ hrtimer_start_pinned(&rq->hrtick_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL);
}
static inline void init_hrtick(void)
Index: linux.trees.git/kernel/time/tick-sched.c
===================================================================
--- linux.trees.git.orig/kernel/time/tick-sched.c
+++ linux.trees.git/kernel/time/tick-sched.c
@@ -348,7 +348,7 @@ void tick_nohz_stop_sched_tick(int inidl
ts->idle_expires = expires;
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, expires,
+ hrtimer_start_pinned(&ts->sched_timer, expires,
HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
@@ -394,7 +394,7 @@ static void tick_nohz_restart(struct tic
hrtimer_forward(&ts->sched_timer, now, tick_period);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start_expires(&ts->sched_timer,
+ hrtimer_start_expires_pinned(&ts->sched_timer,
HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
@@ -698,7 +698,8 @@ void tick_setup_sched_timer(void)
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
+ hrtimer_start_expires_pinned(&ts->sched_timer,
+ HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
Index: linux.trees.git/kernel/trace/trace_sysprof.c
===================================================================
--- linux.trees.git.orig/kernel/trace/trace_sysprof.c
+++ linux.trees.git/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unus
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = stack_trace_timer_fn;
- hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+ hrtimer_start_pinned(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL);
}
static void start_stack_timers(void)
Index: linux.trees.git/arch/x86/kernel/apic/x2apic_uv_x.c
===================================================================
--- linux.trees.git.orig/arch/x86/kernel/apic/x2apic_uv_x.c
+++ linux.trees.git/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -455,7 +455,7 @@ static void uv_heartbeat(unsigned long i
uv_set_scir_bits(bits);
/* enable next timer period */
- mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+ mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
}
static void __cpuinit uv_heartbeat_enable(int cpu)
^ permalink raw reply [flat|nested] 13+ messages in thread* [v5 PATCH 2/4] timers: Identifying the existing pinned timers
2009-04-06 16:13 [v5 PATCH 1/4] timers: Framework for identifying pinned timers Arun R Bharadwaj
2009-04-06 16:16 ` [v5 PATCH 2/4] timers: Identifying the existing " Arun R Bharadwaj
@ 2009-04-06 16:16 ` Arun R Bharadwaj
2009-04-06 16:18 ` [v5 PATCH 3/4] timers: /proc/sys sysctl hook to enable timer migration Arun R Bharadwaj
` (3 subsequent siblings)
5 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-06 16:16 UTC (permalink / raw)
To: linux-kernel, linux-pm
Cc: a.p.zijlstra, vatsa, andi, Arun Bharadwaj, tglx, mingo, arjan
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-06 21:43:57]:
The following pinned hrtimers have been identified and marked:
1)sched_rt_period_timer
2)tick_sched_timer
3)stack_trace_timer_fn
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
arch/x86/kernel/apic/x2apic_uv_x.c | 2 +-
kernel/sched.c | 5 +++--
kernel/time/tick-sched.c | 7 ++++---
kernel/trace/trace_sysprof.c | 3 ++-
4 files changed, 10 insertions(+), 7 deletions(-)
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -236,7 +236,7 @@ static void start_rt_bandwidth(struct rt
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
- hrtimer_start_expires(&rt_b->rt_period_timer,
+ hrtimer_start_expires_pinned(&rt_b->rt_period_timer,
HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
@@ -1156,7 +1156,8 @@ static __init void init_hrtick(void)
*/
static void hrtick_start(struct rq *rq, u64 delay)
{
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+ hrtimer_start_pinned(&rq->hrtick_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL);
}
static inline void init_hrtick(void)
Index: linux.trees.git/kernel/time/tick-sched.c
===================================================================
--- linux.trees.git.orig/kernel/time/tick-sched.c
+++ linux.trees.git/kernel/time/tick-sched.c
@@ -348,7 +348,7 @@ void tick_nohz_stop_sched_tick(int inidl
ts->idle_expires = expires;
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, expires,
+ hrtimer_start_pinned(&ts->sched_timer, expires,
HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
@@ -394,7 +394,7 @@ static void tick_nohz_restart(struct tic
hrtimer_forward(&ts->sched_timer, now, tick_period);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start_expires(&ts->sched_timer,
+ hrtimer_start_expires_pinned(&ts->sched_timer,
HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
@@ -698,7 +698,8 @@ void tick_setup_sched_timer(void)
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
- hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
+ hrtimer_start_expires_pinned(&ts->sched_timer,
+ HRTIMER_MODE_ABS);
/* Check, if the timer was already in the past */
if (hrtimer_active(&ts->sched_timer))
break;
Index: linux.trees.git/kernel/trace/trace_sysprof.c
===================================================================
--- linux.trees.git.orig/kernel/trace/trace_sysprof.c
+++ linux.trees.git/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unus
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = stack_trace_timer_fn;
- hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+ hrtimer_start_pinned(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL);
}
static void start_stack_timers(void)
Index: linux.trees.git/arch/x86/kernel/apic/x2apic_uv_x.c
===================================================================
--- linux.trees.git.orig/arch/x86/kernel/apic/x2apic_uv_x.c
+++ linux.trees.git/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -455,7 +455,7 @@ static void uv_heartbeat(unsigned long i
uv_set_scir_bits(bits);
/* enable next timer period */
- mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+ mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
}
static void __cpuinit uv_heartbeat_enable(int cpu)
^ permalink raw reply [flat|nested] 13+ messages in thread* [v5 PATCH 3/4] timers: /proc/sys sysctl hook to enable timer migration
2009-04-06 16:13 [v5 PATCH 1/4] timers: Framework for identifying pinned timers Arun R Bharadwaj
2009-04-06 16:16 ` [v5 PATCH 2/4] timers: Identifying the existing " Arun R Bharadwaj
2009-04-06 16:16 ` Arun R Bharadwaj
@ 2009-04-06 16:18 ` Arun R Bharadwaj
2009-04-06 16:18 ` Arun R Bharadwaj
` (2 subsequent siblings)
5 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-06 16:18 UTC (permalink / raw)
To: linux-kernel, linux-pm
Cc: a.p.zijlstra, ego, tglx, mingo, andi, venkatesh.pallipadi, vatsa,
arjan, svaidy, Arun Bharadwaj
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-06 21:43:57]:
This patch creates the /proc/sys sysctl interface at
/proc/sys/kernel/timer_migration
Timer migration is enabled by default.
To disable timer migration, when CONFIG_SCHED_DEBUG = y,
echo 0 > /proc/sys/kernel/timer_migration
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
include/linux/sched.h | 1 +
kernel/sched.c | 1 +
kernel/sysctl.c | 8 ++++++++
3 files changed, 10 insertions(+)
Index: linux.trees.git/include/linux/sched.h
===================================================================
--- linux.trees.git.orig/include/linux/sched.h
+++ linux.trees.git/include/linux/sched.h
@@ -1763,6 +1763,7 @@ extern unsigned int sysctl_sched_child_r
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_timer_migration;
int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length,
Index: linux.trees.git/kernel/sysctl.c
===================================================================
--- linux.trees.git.orig/kernel/sysctl.c
+++ linux.trees.git/kernel/sysctl.c
@@ -328,6 +328,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#endif
{
.ctl_name = CTL_UNNUMBERED,
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -7445,6 +7445,7 @@ static void sched_domain_node_span(int n
#endif /* CONFIG_NUMA */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+const_debug unsigned int sysctl_timer_migration = 1;
/*
* The cpus mask in sched_group and sched_domain hangs off the end.
^ permalink raw reply [flat|nested] 13+ messages in thread* [v5 PATCH 3/4] timers: /proc/sys sysctl hook to enable timer migration
2009-04-06 16:13 [v5 PATCH 1/4] timers: Framework for identifying pinned timers Arun R Bharadwaj
` (2 preceding siblings ...)
2009-04-06 16:18 ` [v5 PATCH 3/4] timers: /proc/sys sysctl hook to enable timer migration Arun R Bharadwaj
@ 2009-04-06 16:18 ` Arun R Bharadwaj
2009-04-06 16:22 ` [v5 PATCH 4/4] timers: logic to move non pinned timers Arun R Bharadwaj
2009-04-06 16:22 ` Arun R Bharadwaj
5 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-06 16:18 UTC (permalink / raw)
To: linux-kernel, linux-pm
Cc: a.p.zijlstra, vatsa, andi, Arun Bharadwaj, tglx, mingo, arjan
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-06 21:43:57]:
This patch creates the /proc/sys sysctl interface at
/proc/sys/kernel/timer_migration
Timer migration is enabled by default.
To disable timer migration, when CONFIG_SCHED_DEBUG = y,
echo 0 > /proc/sys/kernel/timer_migration
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
include/linux/sched.h | 1 +
kernel/sched.c | 1 +
kernel/sysctl.c | 8 ++++++++
3 files changed, 10 insertions(+)
Index: linux.trees.git/include/linux/sched.h
===================================================================
--- linux.trees.git.orig/include/linux/sched.h
+++ linux.trees.git/include/linux/sched.h
@@ -1763,6 +1763,7 @@ extern unsigned int sysctl_sched_child_r
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_timer_migration;
int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length,
Index: linux.trees.git/kernel/sysctl.c
===================================================================
--- linux.trees.git.orig/kernel/sysctl.c
+++ linux.trees.git/kernel/sysctl.c
@@ -328,6 +328,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "timer_migration",
+ .data = &sysctl_timer_migration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
#endif
{
.ctl_name = CTL_UNNUMBERED,
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -7445,6 +7445,7 @@ static void sched_domain_node_span(int n
#endif /* CONFIG_NUMA */
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+const_debug unsigned int sysctl_timer_migration = 1;
/*
* The cpus mask in sched_group and sched_domain hangs off the end.
^ permalink raw reply [flat|nested] 13+ messages in thread* [v5 PATCH 4/4] timers: logic to move non pinned timers
2009-04-06 16:13 [v5 PATCH 1/4] timers: Framework for identifying pinned timers Arun R Bharadwaj
` (3 preceding siblings ...)
2009-04-06 16:18 ` Arun R Bharadwaj
@ 2009-04-06 16:22 ` Arun R Bharadwaj
2009-04-06 16:48 ` Thomas Gleixner
2009-04-06 16:48 ` Thomas Gleixner
2009-04-06 16:22 ` Arun R Bharadwaj
5 siblings, 2 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-06 16:22 UTC (permalink / raw)
To: linux-kernel, linux-pm
Cc: a.p.zijlstra, ego, tglx, mingo, andi, venkatesh.pallipadi, vatsa,
arjan, svaidy, Arun Bharadwaj
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-06 21:43:57]:
This patch migrates all non pinned timers and hrtimers to the current
idle load balancer, from all the idle CPUs. Timers firing on busy CPUs
are not migrated.
While migrating hrtimers, care must be taken to check whether migrating
an hrtimer would introduce latency. So we compare the expiry of the
hrtimer with the next timer interrupt on the target cpu and migrate the
hrtimer only if it expires *after* the next interrupt on the target cpu.
So, added a clockevents_get_next_event() helper function to return the
next_event on the target cpu's clock_event_device.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
include/linux/clockchips.h | 9 +++++++++
include/linux/sched.h | 12 ++++++++++++
kernel/hrtimer.c | 41 ++++++++++++++++++++++++++++++++++++++++-
kernel/sched.c | 5 +++++
kernel/time/clockevents.c | 14 ++++++++++++++
kernel/timer.c | 12 +++++++++++-
6 files changed, 91 insertions(+), 2 deletions(-)
Index: linux.trees.git/kernel/timer.c
===================================================================
--- linux.trees.git.orig/kernel/timer.c
+++ linux.trees.git/kernel/timer.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -606,7 +607,7 @@ __mod_timer(struct timer_list *timer, un
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret;
+ int ret, preferred_cpu, cpu;
ret = 0;
@@ -627,6 +628,15 @@ __mod_timer(struct timer_list *timer, un
new_base = __get_cpu_var(tvec_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && idle_cpu(cpu) && !pinned) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+ new_base = per_cpu(tvec_bases, cpu);
+
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
Index: linux.trees.git/kernel/hrtimer.c
===================================================================
--- linux.trees.git.orig/kernel/hrtimer.c
+++ linux.trees.git/kernel/hrtimer.c
@@ -43,6 +43,8 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -198,8 +200,17 @@ switch_hrtimer_base(struct hrtimer *time
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
+ int cpu, preferred_cpu = -1;
- new_cpu_base = &__get_cpu_var(hrtimer_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && !pinned && idle_cpu(cpu)) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
@@ -219,6 +230,34 @@ switch_hrtimer_base(struct hrtimer *time
timer->base = NULL;
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu == preferred_cpu) {
+ /* Calculate clock monotonic expiry time */
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
+ new_base->offset);
+
+ /*
+ * Get the next event on target cpu from the
+ * clock events layer.
+ * This covers the highres=off nohz=on case as well.
+ */
+ ktime_t next = clockevents_get_next_event(cpu);
+
+ ktime_t delta = ktime_sub(expires, next);
+
+ /*
+ * We do not migrate the timer when it is expiring
+ * before the next event on the target cpu because
+ * we cannot reprogram the target cpu hardware and
+ * we would cause it to fire late.
+ */
+ if (delta.tv64 < 0) {
+ cpu = smp_processor_id();
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ goto again;
+ }
+ }
timer->base = new_base;
}
return new_base;
Index: linux.trees.git/include/linux/sched.h
===================================================================
--- linux.trees.git.orig/include/linux/sched.h
+++ linux.trees.git/include/linux/sched.h
@@ -265,6 +265,7 @@ static inline int select_nohz_load_balan
}
#endif
+extern int get_nohz_load_balancer(void);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
@@ -1769,6 +1770,17 @@ int sched_nr_latency_handler(struct ctl_
struct file *file, void __user *buffer, size_t *length,
loff_t *ppos);
#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline int get_sysctl_timer_migration(void)
+{
+ return sysctl_timer_migration;
+}
+#else
+static inline int get_sysctl_timer_migration(void)
+{
+ return 1;
+}
+#endif
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -4009,6 +4009,11 @@ static struct {
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
Index: linux.trees.git/kernel/time/clockevents.c
===================================================================
--- linux.trees.git.orig/kernel/time/clockevents.c
+++ linux.trees.git/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
+#include <linux/tick.h>
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
@@ -252,3 +253,16 @@ void clockevents_notify(unsigned long re
}
EXPORT_SYMBOL_GPL(clockevents_notify);
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ktime_t clockevents_get_next_event(int cpu)
+{
+ struct tick_device *td;
+ struct clock_event_device *dev;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ return dev->next_event;
+}
+#endif
Index: linux.trees.git/include/linux/clockchips.h
===================================================================
--- linux.trees.git.orig/include/linux/clockchips.h
+++ linux.trees.git/include/linux/clockchips.h
@@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned
#endif
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+extern ktime_t clockevents_get_next_event(int cpu);
+#else
+static inline ktime_t clockevents_get_next_event(int cpu)
+{
+ return KTIME_MAX;
+}
+#endif
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [v5 PATCH 4/4] timers: logic to move non pinned timers
2009-04-06 16:22 ` [v5 PATCH 4/4] timers: logic to move non pinned timers Arun R Bharadwaj
@ 2009-04-06 16:48 ` Thomas Gleixner
2009-04-06 16:48 ` Thomas Gleixner
1 sibling, 0 replies; 13+ messages in thread
From: Thomas Gleixner @ 2009-04-06 16:48 UTC (permalink / raw)
To: Arun R Bharadwaj
Cc: a.p.zijlstra, linux-kernel, vatsa, andi, linux-pm, mingo, arjan
Arun,
On Mon, 6 Apr 2009, Arun R Bharadwaj wrote:
> +again:
> + new_cpu_base = &per_cpu(hrtimer_bases, cpu);
> new_base = &new_cpu_base->clock_base[base->index];
>
> if (base != new_base) {
> @@ -219,6 +230,34 @@ switch_hrtimer_base(struct hrtimer *time
> timer->base = NULL;
> spin_unlock(&base->cpu_base->lock);
> spin_lock(&new_base->cpu_base->lock);
> +
> + if (cpu == preferred_cpu) {
> + /* Calculate clock monotonic expiry time */
> + ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
> + new_base->offset);
> +
> + /*
> + * Get the next event on target cpu from the
> + * clock events layer.
> + * This covers the highres=off nohz=on case as well.
> + */
> + ktime_t next = clockevents_get_next_event(cpu);
> +
> + ktime_t delta = ktime_sub(expires, next);
> +
> + /*
> + * We do not migrate the timer when it is expiring
> + * before the next event on the target cpu because
> + * we cannot reprogram the target cpu hardware and
> + * we would cause it to fire late.
> + */
> + if (delta.tv64 < 0) {
> + cpu = smp_processor_id();
> + spin_unlock(&new_base->cpu_base->lock);
> + spin_lock(&base->cpu_base->lock);
Darn, I knew that I missed something when I looked at this before:
timer->base = base;
Thanks,
tglx
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [v5 PATCH 4/4] timers: logic to move non pinned timers
2009-04-06 16:22 ` [v5 PATCH 4/4] timers: logic to move non pinned timers Arun R Bharadwaj
2009-04-06 16:48 ` Thomas Gleixner
@ 2009-04-06 16:48 ` Thomas Gleixner
2009-04-07 8:11 ` Arun R Bharadwaj
` (3 more replies)
1 sibling, 4 replies; 13+ messages in thread
From: Thomas Gleixner @ 2009-04-06 16:48 UTC (permalink / raw)
To: Arun R Bharadwaj
Cc: linux-kernel, linux-pm, a.p.zijlstra, ego, mingo, andi,
venkatesh.pallipadi, vatsa, arjan, svaidy
Arun,
On Mon, 6 Apr 2009, Arun R Bharadwaj wrote:
> +again:
> + new_cpu_base = &per_cpu(hrtimer_bases, cpu);
> new_base = &new_cpu_base->clock_base[base->index];
>
> if (base != new_base) {
> @@ -219,6 +230,34 @@ switch_hrtimer_base(struct hrtimer *time
> timer->base = NULL;
> spin_unlock(&base->cpu_base->lock);
> spin_lock(&new_base->cpu_base->lock);
> +
> + if (cpu == preferred_cpu) {
> + /* Calculate clock monotonic expiry time */
> + ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
> + new_base->offset);
> +
> + /*
> + * Get the next event on target cpu from the
> + * clock events layer.
> + * This covers the highres=off nohz=on case as well.
> + */
> + ktime_t next = clockevents_get_next_event(cpu);
> +
> + ktime_t delta = ktime_sub(expires, next);
> +
> + /*
> + * We do not migrate the timer when it is expiring
> + * before the next event on the target cpu because
> + * we cannot reprogram the target cpu hardware and
> + * we would cause it to fire late.
> + */
> + if (delta.tv64 < 0) {
> + cpu = smp_processor_id();
> + spin_unlock(&new_base->cpu_base->lock);
> + spin_lock(&base->cpu_base->lock);
Darn, I knew that I missed something when I looked at this before:
timer->base = base;
Thanks,
tglx
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [v5 PATCH 4/4] timers: logic to move non pinned timers
2009-04-06 16:48 ` Thomas Gleixner
@ 2009-04-07 8:11 ` Arun R Bharadwaj
2009-04-07 8:11 ` Arun R Bharadwaj
` (2 subsequent siblings)
3 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-07 8:11 UTC (permalink / raw)
To: Thomas Gleixner, mingo
Cc: a.p.zijlstra, linux-kernel, vatsa, andi, linux-pm, arjan
* Thomas Gleixner <tglx@linutronix.de> [2009-04-06 18:48:48]:
Hi,
I tested my patchset for any possible regression, with kernbench.
Posting the results below. The results show that we are not facing
any regression due to these patches.
Ingo,
Can you please include the patch series in the -tip so that it gets
more testing? I worked with Thomas to implement all the necessary
fine-tunings that were required.
I'm reposting the [PATCH 4/4] in this mail. All the other patches in
the patchset are unchanged.
Kernbench results on an 8-cpu, 2 package machine.
-----------------------------------------------------------------------
| No. of Threads | Time(s) - Without | Time(s) - With the |
| | patches applied | patches applied |
-----------------------------------------------------------------------
| 1 | 298.9 | 298.8 |
| 2 | 148.2 | 148.6 |
| 4 | 76.9 | 76.5 |
| 8 | 43.8 | 43.5 |
| 16 | 39.5 | 39.3 |
| 32 | 39.1 | 39.1 |
-----------------------------------------------------------------------
--arun
---
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
include/linux/clockchips.h | 9 +++++++++
include/linux/sched.h | 12 ++++++++++++
kernel/hrtimer.c | 42 +++++++++++++++++++++++++++++++++++++++++-
kernel/sched.c | 5 +++++
kernel/time/clockevents.c | 14 ++++++++++++++
kernel/timer.c | 12 +++++++++++-
6 files changed, 92 insertions(+), 2 deletions(-)
Index: linux.trees.git/kernel/timer.c
===================================================================
--- linux.trees.git.orig/kernel/timer.c
+++ linux.trees.git/kernel/timer.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -606,7 +607,7 @@ __mod_timer(struct timer_list *timer, un
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret;
+ int ret, preferred_cpu, cpu;
ret = 0;
@@ -627,6 +628,15 @@ __mod_timer(struct timer_list *timer, un
new_base = __get_cpu_var(tvec_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && idle_cpu(cpu) && !pinned) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+ new_base = per_cpu(tvec_bases, cpu);
+
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
Index: linux.trees.git/kernel/hrtimer.c
===================================================================
--- linux.trees.git.orig/kernel/hrtimer.c
+++ linux.trees.git/kernel/hrtimer.c
@@ -43,6 +43,8 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -198,8 +200,17 @@ switch_hrtimer_base(struct hrtimer *time
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
+ int cpu, preferred_cpu = -1;
- new_cpu_base = &__get_cpu_var(hrtimer_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && !pinned && idle_cpu(cpu)) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
@@ -219,6 +230,35 @@ switch_hrtimer_base(struct hrtimer *time
timer->base = NULL;
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu == preferred_cpu) {
+ /* Calculate clock monotonic expiry time */
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
+ new_base->offset);
+
+ /*
+ * Get the next event on target cpu from the
+ * clock events layer.
+ * This covers the highres=off nohz=on case as well.
+ */
+ ktime_t next = clockevents_get_next_event(cpu);
+
+ ktime_t delta = ktime_sub(expires, next);
+
+ /*
+ * We do not migrate the timer when it is expiring
+ * before the next event on the target cpu because
+ * we cannot reprogram the target cpu hardware and
+ * we would cause it to fire late.
+ */
+ if (delta.tv64 < 0) {
+ cpu = smp_processor_id();
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ timer->base = base;
+ goto again;
+ }
+ }
timer->base = new_base;
}
return new_base;
Index: linux.trees.git/include/linux/sched.h
===================================================================
--- linux.trees.git.orig/include/linux/sched.h
+++ linux.trees.git/include/linux/sched.h
@@ -265,6 +265,7 @@ static inline int select_nohz_load_balan
}
#endif
+extern int get_nohz_load_balancer(void);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
@@ -1769,6 +1770,17 @@ int sched_nr_latency_handler(struct ctl_
struct file *file, void __user *buffer, size_t *length,
loff_t *ppos);
#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline int get_sysctl_timer_migration(void)
+{
+ return sysctl_timer_migration;
+}
+#else
+static inline int get_sysctl_timer_migration(void)
+{
+ return 1;
+}
+#endif
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -4009,6 +4009,11 @@ static struct {
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
Index: linux.trees.git/kernel/time/clockevents.c
===================================================================
--- linux.trees.git.orig/kernel/time/clockevents.c
+++ linux.trees.git/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
+#include <linux/tick.h>
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
@@ -252,3 +253,16 @@ void clockevents_notify(unsigned long re
}
EXPORT_SYMBOL_GPL(clockevents_notify);
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ktime_t clockevents_get_next_event(int cpu)
+{
+ struct tick_device *td;
+ struct clock_event_device *dev;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ return dev->next_event;
+}
+#endif
Index: linux.trees.git/include/linux/clockchips.h
===================================================================
--- linux.trees.git.orig/include/linux/clockchips.h
+++ linux.trees.git/include/linux/clockchips.h
@@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned
#endif
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+extern ktime_t clockevents_get_next_event(int cpu);
+#else
+static inline ktime_t clockevents_get_next_event(int cpu)
+{
+ return KTIME_MAX;
+}
+#endif
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [v5 PATCH 4/4] timers: logic to move non pinned timers
2009-04-06 16:48 ` Thomas Gleixner
2009-04-07 8:11 ` Arun R Bharadwaj
@ 2009-04-07 8:11 ` Arun R Bharadwaj
2009-04-07 8:17 ` Arun R Bharadwaj
2009-04-07 8:17 ` Arun R Bharadwaj
3 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-07 8:11 UTC (permalink / raw)
To: Thomas Gleixner, mingo
Cc: linux-kernel, linux-pm, a.p.zijlstra, ego, andi,
venkatesh.pallipadi, vatsa, arjan, svaidy
* Thomas Gleixner <tglx@linutronix.de> [2009-04-06 18:48:48]:
Hi,
I tested my patchset for any possible regression, with kernbench.
Posting the results below. The results show that we are not facing
any regression due to these patches.
Ingo,
Can you please include the patch series in the -tip so that it gets
more testing? I worked with Thomas to implement all the necessary
fine-tunings that were required.
I'm reposting the [PATCH 4/4] in this mail. All the other patches in
the patchset are unchanged.
Kernbench results on an 8-cpu, 2-package machine.
-----------------------------------------------------------------------
| No. of Threads | Time(s) - Without | Time(s) - With the |
| | patches applied | patches applied |
-----------------------------------------------------------------------
| 1 | 298.9 | 298.8 |
| 2 | 148.2 | 148.6 |
| 4 | 76.9 | 76.5 |
| 8 | 43.8 | 43.5 |
| 16 | 39.5 | 39.3 |
| 32 | 39.1 | 39.1 |
-----------------------------------------------------------------------
--arun
---
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
include/linux/clockchips.h | 9 +++++++++
include/linux/sched.h | 12 ++++++++++++
kernel/hrtimer.c | 42 +++++++++++++++++++++++++++++++++++++++++-
kernel/sched.c | 5 +++++
kernel/time/clockevents.c | 14 ++++++++++++++
kernel/timer.c | 12 +++++++++++-
6 files changed, 92 insertions(+), 2 deletions(-)
Index: linux.trees.git/kernel/timer.c
===================================================================
--- linux.trees.git.orig/kernel/timer.c
+++ linux.trees.git/kernel/timer.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -606,7 +607,7 @@ __mod_timer(struct timer_list *timer, un
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret;
+ int ret, preferred_cpu, cpu;
ret = 0;
@@ -627,6 +628,15 @@ __mod_timer(struct timer_list *timer, un
new_base = __get_cpu_var(tvec_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && idle_cpu(cpu) && !pinned) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+ new_base = per_cpu(tvec_bases, cpu);
+
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
Index: linux.trees.git/kernel/hrtimer.c
===================================================================
--- linux.trees.git.orig/kernel/hrtimer.c
+++ linux.trees.git/kernel/hrtimer.c
@@ -43,6 +43,8 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -198,8 +200,17 @@ switch_hrtimer_base(struct hrtimer *time
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
+ int cpu, preferred_cpu = -1;
- new_cpu_base = &__get_cpu_var(hrtimer_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && !pinned && idle_cpu(cpu)) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
@@ -219,6 +230,35 @@ switch_hrtimer_base(struct hrtimer *time
timer->base = NULL;
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu == preferred_cpu) {
+ /* Calculate clock monotonic expiry time */
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
+ new_base->offset);
+
+ /*
+ * Get the next event on target cpu from the
+ * clock events layer.
+ * This covers the highres=off nohz=on case as well.
+ */
+ ktime_t next = clockevents_get_next_event(cpu);
+
+ ktime_t delta = ktime_sub(expires, next);
+
+ /*
+ * We do not migrate the timer when it is expiring
+ * before the next event on the target cpu because
+ * we cannot reprogram the target cpu hardware and
+ * we would cause it to fire late.
+ */
+ if (delta.tv64 < 0) {
+ cpu = smp_processor_id();
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ timer->base = base;
+ goto again;
+ }
+ }
timer->base = new_base;
}
return new_base;
Index: linux.trees.git/include/linux/sched.h
===================================================================
--- linux.trees.git.orig/include/linux/sched.h
+++ linux.trees.git/include/linux/sched.h
@@ -265,6 +265,7 @@ static inline int select_nohz_load_balan
}
#endif
+extern int get_nohz_load_balancer(void);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
@@ -1769,6 +1770,17 @@ int sched_nr_latency_handler(struct ctl_
struct file *file, void __user *buffer, size_t *length,
loff_t *ppos);
#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline int get_sysctl_timer_migration(void)
+{
+ return sysctl_timer_migration;
+}
+#else
+static inline int get_sysctl_timer_migration(void)
+{
+ return 1;
+}
+#endif
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -4009,6 +4009,11 @@ static struct {
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
Index: linux.trees.git/kernel/time/clockevents.c
===================================================================
--- linux.trees.git.orig/kernel/time/clockevents.c
+++ linux.trees.git/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
+#include <linux/tick.h>
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
@@ -252,3 +253,16 @@ void clockevents_notify(unsigned long re
}
EXPORT_SYMBOL_GPL(clockevents_notify);
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ktime_t clockevents_get_next_event(int cpu)
+{
+ struct tick_device *td;
+ struct clock_event_device *dev;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ return dev->next_event;
+}
+#endif
Index: linux.trees.git/include/linux/clockchips.h
===================================================================
--- linux.trees.git.orig/include/linux/clockchips.h
+++ linux.trees.git/include/linux/clockchips.h
@@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned
#endif
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+extern ktime_t clockevents_get_next_event(int cpu);
+#else
+static inline ktime_t clockevents_get_next_event(int cpu)
+{
+ return KTIME_MAX;
+}
+#endif
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [v5 PATCH 4/4] timers: logic to move non pinned timers
2009-04-06 16:48 ` Thomas Gleixner
2009-04-07 8:11 ` Arun R Bharadwaj
2009-04-07 8:11 ` Arun R Bharadwaj
@ 2009-04-07 8:17 ` Arun R Bharadwaj
2009-04-07 8:17 ` Arun R Bharadwaj
3 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-07 8:17 UTC (permalink / raw)
To: Thomas Gleixner, mingo
Cc: linux-kernel, linux-pm, a.p.zijlstra, ego, andi,
venkatesh.pallipadi, vatsa, arjan, svaidy, Arun Bharadwaj
* Thomas Gleixner <tglx@linutronix.de> [2009-04-06 18:48:48]:
(The patch changelog got missed out somehow, so reposting)
Hi,
I tested my patchset for any possible regression, with kernbench.
Posting the results below. The results show that we are not facing
any regression due to these patches.
Ingo,
Can you please include the patch series in the -tip so that it gets
more testing? I worked with Thomas to implement all the necessary
fine-tunings that were required.
I'm reposting the [PATCH 4/4] in this mail. All the other patches in
the patchset are unchanged.
Kernbench results on an 8-cpu, 2-package machine.
-----------------------------------------------------------------------
| No. of Threads | Time(s) - Without | Time(s) - With the |
| | patches applied | patches applied |
-----------------------------------------------------------------------
| 1 | 298.9 | 298.8 |
| 2 | 148.2 | 148.6 |
| 4 | 76.9 | 76.5 |
| 8 | 43.8 | 43.5 |
| 16 | 39.5 | 39.3 |
| 32 | 39.1 | 39.1 |
-----------------------------------------------------------------------
--arun
---
This patch migrates all non pinned timers and hrtimers to the current
idle load balancer, from all the idle CPUs. Timers firing on busy CPUs
are not migrated.
While migrating hrtimers, care should be taken to check if migrating
a hrtimer would result in a latency or not. So we compare the expiry of the
hrtimer with the next timer interrupt on the target cpu and migrate the
hrtimer only if it expires *after* the next interrupt on the target cpu.
So, added a clockevents_get_next_event() helper function to return the
next_event on the target cpu's clock_event_device.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
include/linux/clockchips.h | 9 +++++++++
include/linux/sched.h | 12 ++++++++++++
kernel/hrtimer.c | 42 +++++++++++++++++++++++++++++++++++++++++-
kernel/sched.c | 5 +++++
kernel/time/clockevents.c | 14 ++++++++++++++
kernel/timer.c | 12 +++++++++++-
6 files changed, 92 insertions(+), 2 deletions(-)
Index: linux.trees.git/kernel/timer.c
===================================================================
--- linux.trees.git.orig/kernel/timer.c
+++ linux.trees.git/kernel/timer.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -606,7 +607,7 @@ __mod_timer(struct timer_list *timer, un
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret;
+ int ret, preferred_cpu, cpu;
ret = 0;
@@ -627,6 +628,15 @@ __mod_timer(struct timer_list *timer, un
new_base = __get_cpu_var(tvec_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && idle_cpu(cpu) && !pinned) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+ new_base = per_cpu(tvec_bases, cpu);
+
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
Index: linux.trees.git/kernel/hrtimer.c
===================================================================
--- linux.trees.git.orig/kernel/hrtimer.c
+++ linux.trees.git/kernel/hrtimer.c
@@ -43,6 +43,8 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -198,8 +200,17 @@ switch_hrtimer_base(struct hrtimer *time
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
+ int cpu, preferred_cpu = -1;
- new_cpu_base = &__get_cpu_var(hrtimer_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && !pinned && idle_cpu(cpu)) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
@@ -219,6 +230,35 @@ switch_hrtimer_base(struct hrtimer *time
timer->base = NULL;
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu == preferred_cpu) {
+ /* Calculate clock monotonic expiry time */
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
+ new_base->offset);
+
+ /*
+ * Get the next event on target cpu from the
+ * clock events layer.
+ * This covers the highres=off nohz=on case as well.
+ */
+ ktime_t next = clockevents_get_next_event(cpu);
+
+ ktime_t delta = ktime_sub(expires, next);
+
+ /*
+ * We do not migrate the timer when it is expiring
+ * before the next event on the target cpu because
+ * we cannot reprogram the target cpu hardware and
+ * we would cause it to fire late.
+ */
+ if (delta.tv64 < 0) {
+ cpu = smp_processor_id();
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ timer->base = base;
+ goto again;
+ }
+ }
timer->base = new_base;
}
return new_base;
Index: linux.trees.git/include/linux/sched.h
===================================================================
--- linux.trees.git.orig/include/linux/sched.h
+++ linux.trees.git/include/linux/sched.h
@@ -265,6 +265,7 @@ static inline int select_nohz_load_balan
}
#endif
+extern int get_nohz_load_balancer(void);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
@@ -1769,6 +1770,17 @@ int sched_nr_latency_handler(struct ctl_
struct file *file, void __user *buffer, size_t *length,
loff_t *ppos);
#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline int get_sysctl_timer_migration(void)
+{
+ return sysctl_timer_migration;
+}
+#else
+static inline int get_sysctl_timer_migration(void)
+{
+ return 1;
+}
+#endif
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -4009,6 +4009,11 @@ static struct {
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
Index: linux.trees.git/kernel/time/clockevents.c
===================================================================
--- linux.trees.git.orig/kernel/time/clockevents.c
+++ linux.trees.git/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
+#include <linux/tick.h>
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
@@ -252,3 +253,16 @@ void clockevents_notify(unsigned long re
}
EXPORT_SYMBOL_GPL(clockevents_notify);
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ktime_t clockevents_get_next_event(int cpu)
+{
+ struct tick_device *td;
+ struct clock_event_device *dev;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ return dev->next_event;
+}
+#endif
Index: linux.trees.git/include/linux/clockchips.h
===================================================================
--- linux.trees.git.orig/include/linux/clockchips.h
+++ linux.trees.git/include/linux/clockchips.h
@@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned
#endif
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+extern ktime_t clockevents_get_next_event(int cpu);
+#else
+static inline ktime_t clockevents_get_next_event(int cpu)
+{
+ return KTIME_MAX;
+}
+#endif
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [v5 PATCH 4/4] timers: logic to move non pinned timers
2009-04-06 16:48 ` Thomas Gleixner
` (2 preceding siblings ...)
2009-04-07 8:17 ` Arun R Bharadwaj
@ 2009-04-07 8:17 ` Arun R Bharadwaj
3 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-07 8:17 UTC (permalink / raw)
To: Thomas Gleixner, mingo
Cc: a.p.zijlstra, linux-kernel, vatsa, andi, Arun Bharadwaj, linux-pm,
arjan
* Thomas Gleixner <tglx@linutronix.de> [2009-04-06 18:48:48]:
(The patch changelog got missed out somehow, so reposting)
Hi,
I tested my patchset for any possible regression, with kernbench.
Posting the results below. The results show that we are not facing
any regression due to these patches.
Ingo,
Can you please include the patch series in the -tip so that it gets
more testing? I worked with Thomas to implement all the necessary
fine-tunings that were required.
I'm reposting the [PATCH 4/4] in this mail. All the other patches in
the patchset are unchanged.
Kernbench results on an 8-cpu, 2-package machine.
-----------------------------------------------------------------------
| No. of Threads | Time(s) - Without | Time(s) - With the |
| | patches applied | patches applied |
-----------------------------------------------------------------------
| 1 | 298.9 | 298.8 |
| 2 | 148.2 | 148.6 |
| 4 | 76.9 | 76.5 |
| 8 | 43.8 | 43.5 |
| 16 | 39.5 | 39.3 |
| 32 | 39.1 | 39.1 |
-----------------------------------------------------------------------
--arun
---
This patch migrates all non pinned timers and hrtimers to the current
idle load balancer, from all the idle CPUs. Timers firing on busy CPUs
are not migrated.
While migrating hrtimers, care should be taken to check if migrating
a hrtimer would result in a latency or not. So we compare the expiry of the
hrtimer with the next timer interrupt on the target cpu and migrate the
hrtimer only if it expires *after* the next interrupt on the target cpu.
So, added a clockevents_get_next_event() helper function to return the
next_event on the target cpu's clock_event_device.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
include/linux/clockchips.h | 9 +++++++++
include/linux/sched.h | 12 ++++++++++++
kernel/hrtimer.c | 42 +++++++++++++++++++++++++++++++++++++++++-
kernel/sched.c | 5 +++++
kernel/time/clockevents.c | 14 ++++++++++++++
kernel/timer.c | 12 +++++++++++-
6 files changed, 92 insertions(+), 2 deletions(-)
Index: linux.trees.git/kernel/timer.c
===================================================================
--- linux.trees.git.orig/kernel/timer.c
+++ linux.trees.git/kernel/timer.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -606,7 +607,7 @@ __mod_timer(struct timer_list *timer, un
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret;
+ int ret, preferred_cpu, cpu;
ret = 0;
@@ -627,6 +628,15 @@ __mod_timer(struct timer_list *timer, un
new_base = __get_cpu_var(tvec_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && idle_cpu(cpu) && !pinned) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+ new_base = per_cpu(tvec_bases, cpu);
+
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
Index: linux.trees.git/kernel/hrtimer.c
===================================================================
--- linux.trees.git.orig/kernel/hrtimer.c
+++ linux.trees.git/kernel/hrtimer.c
@@ -43,6 +43,8 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -198,8 +200,17 @@ switch_hrtimer_base(struct hrtimer *time
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
+ int cpu, preferred_cpu = -1;
- new_cpu_base = &__get_cpu_var(hrtimer_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && !pinned && idle_cpu(cpu)) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
@@ -219,6 +230,35 @@ switch_hrtimer_base(struct hrtimer *time
timer->base = NULL;
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu == preferred_cpu) {
+ /* Calculate clock monotonic expiry time */
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
+ new_base->offset);
+
+ /*
+ * Get the next event on target cpu from the
+ * clock events layer.
+ * This covers the highres=off nohz=on case as well.
+ */
+ ktime_t next = clockevents_get_next_event(cpu);
+
+ ktime_t delta = ktime_sub(expires, next);
+
+ /*
+ * We do not migrate the timer when it is expiring
+ * before the next event on the target cpu because
+ * we cannot reprogram the target cpu hardware and
+ * we would cause it to fire late.
+ */
+ if (delta.tv64 < 0) {
+ cpu = smp_processor_id();
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ timer->base = base;
+ goto again;
+ }
+ }
timer->base = new_base;
}
return new_base;
Index: linux.trees.git/include/linux/sched.h
===================================================================
--- linux.trees.git.orig/include/linux/sched.h
+++ linux.trees.git/include/linux/sched.h
@@ -265,6 +265,7 @@ static inline int select_nohz_load_balan
}
#endif
+extern int get_nohz_load_balancer(void);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
@@ -1769,6 +1770,17 @@ int sched_nr_latency_handler(struct ctl_
struct file *file, void __user *buffer, size_t *length,
loff_t *ppos);
#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline int get_sysctl_timer_migration(void)
+{
+ return sysctl_timer_migration;
+}
+#else
+static inline int get_sysctl_timer_migration(void)
+{
+ return 1;
+}
+#endif
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -4009,6 +4009,11 @@ static struct {
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
Index: linux.trees.git/kernel/time/clockevents.c
===================================================================
--- linux.trees.git.orig/kernel/time/clockevents.c
+++ linux.trees.git/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
+#include <linux/tick.h>
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
@@ -252,3 +253,16 @@ void clockevents_notify(unsigned long re
}
EXPORT_SYMBOL_GPL(clockevents_notify);
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ktime_t clockevents_get_next_event(int cpu)
+{
+ struct tick_device *td;
+ struct clock_event_device *dev;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ return dev->next_event;
+}
+#endif
Index: linux.trees.git/include/linux/clockchips.h
===================================================================
--- linux.trees.git.orig/include/linux/clockchips.h
+++ linux.trees.git/include/linux/clockchips.h
@@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned
#endif
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+extern ktime_t clockevents_get_next_event(int cpu);
+#else
+static inline ktime_t clockevents_get_next_event(int cpu)
+{
+ return KTIME_MAX;
+}
+#endif
^ permalink raw reply [flat|nested] 13+ messages in thread
* [v5 PATCH 4/4] timers: logic to move non pinned timers
2009-04-06 16:13 [v5 PATCH 1/4] timers: Framework for identifying pinned timers Arun R Bharadwaj
` (4 preceding siblings ...)
2009-04-06 16:22 ` [v5 PATCH 4/4] timers: logic to move non pinned timers Arun R Bharadwaj
@ 2009-04-06 16:22 ` Arun R Bharadwaj
5 siblings, 0 replies; 13+ messages in thread
From: Arun R Bharadwaj @ 2009-04-06 16:22 UTC (permalink / raw)
To: linux-kernel, linux-pm
Cc: a.p.zijlstra, vatsa, andi, Arun Bharadwaj, tglx, mingo, arjan
* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-04-06 21:43:57]:
This patch migrates all non pinned timers and hrtimers to the current
idle load balancer, from all the idle CPUs. Timers firing on busy CPUs
are not migrated.
While migrating hrtimers, care should be taken to check if migrating
a hrtimer would result in a latency or not. So we compare the expiry of the
hrtimer with the next timer interrupt on the target cpu and migrate the
hrtimer only if it expires *after* the next interrupt on the target cpu.
So, added a clockevents_get_next_event() helper function to return the
next_event on the target cpu's clock_event_device.
Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
include/linux/clockchips.h | 9 +++++++++
include/linux/sched.h | 12 ++++++++++++
kernel/hrtimer.c | 41 ++++++++++++++++++++++++++++++++++++++++-
kernel/sched.c | 5 +++++
kernel/time/clockevents.c | 14 ++++++++++++++
kernel/timer.c | 12 +++++++++++-
6 files changed, 91 insertions(+), 2 deletions(-)
Index: linux.trees.git/kernel/timer.c
===================================================================
--- linux.trees.git.orig/kernel/timer.c
+++ linux.trees.git/kernel/timer.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -606,7 +607,7 @@ __mod_timer(struct timer_list *timer, un
{
struct tvec_base *base, *new_base;
unsigned long flags;
- int ret;
+ int ret, preferred_cpu, cpu;
ret = 0;
@@ -627,6 +628,15 @@ __mod_timer(struct timer_list *timer, un
new_base = __get_cpu_var(tvec_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && idle_cpu(cpu) && !pinned) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+ new_base = per_cpu(tvec_bases, cpu);
+
if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
Index: linux.trees.git/kernel/hrtimer.c
===================================================================
--- linux.trees.git.orig/kernel/hrtimer.c
+++ linux.trees.git/kernel/hrtimer.c
@@ -43,6 +43,8 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -198,8 +200,17 @@ switch_hrtimer_base(struct hrtimer *time
{
struct hrtimer_clock_base *new_base;
struct hrtimer_cpu_base *new_cpu_base;
+ int cpu, preferred_cpu = -1;
- new_cpu_base = &__get_cpu_var(hrtimer_bases);
+ cpu = smp_processor_id();
+ if (get_sysctl_timer_migration() && !pinned && idle_cpu(cpu)) {
+ preferred_cpu = get_nohz_load_balancer();
+ if (preferred_cpu >= 0)
+ cpu = preferred_cpu;
+ }
+
+again:
+ new_cpu_base = &per_cpu(hrtimer_bases, cpu);
new_base = &new_cpu_base->clock_base[base->index];
if (base != new_base) {
@@ -219,6 +230,34 @@ switch_hrtimer_base(struct hrtimer *time
timer->base = NULL;
spin_unlock(&base->cpu_base->lock);
spin_lock(&new_base->cpu_base->lock);
+
+ if (cpu == preferred_cpu) {
+ /* Calculate clock monotonic expiry time */
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
+ new_base->offset);
+
+ /*
+ * Get the next event on target cpu from the
+ * clock events layer.
+ * This covers the highres=off nohz=on case as well.
+ */
+ ktime_t next = clockevents_get_next_event(cpu);
+
+ ktime_t delta = ktime_sub(expires, next);
+
+ /*
+ * We do not migrate the timer when it is expiring
+ * before the next event on the target cpu because
+ * we cannot reprogram the target cpu hardware and
+ * we would cause it to fire late.
+ */
+ if (delta.tv64 < 0) {
+ cpu = smp_processor_id();
+ spin_unlock(&new_base->cpu_base->lock);
+ spin_lock(&base->cpu_base->lock);
+ goto again;
+ }
+ }
timer->base = new_base;
}
return new_base;
Index: linux.trees.git/include/linux/sched.h
===================================================================
--- linux.trees.git.orig/include/linux/sched.h
+++ linux.trees.git/include/linux/sched.h
@@ -265,6 +265,7 @@ static inline int select_nohz_load_balan
}
#endif
+extern int get_nohz_load_balancer(void);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
@@ -1769,6 +1770,17 @@ int sched_nr_latency_handler(struct ctl_
struct file *file, void __user *buffer, size_t *length,
loff_t *ppos);
#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline int get_sysctl_timer_migration(void)
+{
+ return sysctl_timer_migration;
+}
+#else
+static inline int get_sysctl_timer_migration(void)
+{
+ return 1;
+}
+#endif
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
Index: linux.trees.git/kernel/sched.c
===================================================================
--- linux.trees.git.orig/kernel/sched.c
+++ linux.trees.git/kernel/sched.c
@@ -4009,6 +4009,11 @@ static struct {
.load_balancer = ATOMIC_INIT(-1),
};
+int get_nohz_load_balancer(void)
+{
+ return atomic_read(&nohz.load_balancer);
+}
+
/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
Index: linux.trees.git/kernel/time/clockevents.c
===================================================================
--- linux.trees.git.orig/kernel/time/clockevents.c
+++ linux.trees.git/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
+#include <linux/tick.h>
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
@@ -252,3 +253,16 @@ void clockevents_notify(unsigned long re
}
EXPORT_SYMBOL_GPL(clockevents_notify);
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ktime_t clockevents_get_next_event(int cpu)
+{
+ struct tick_device *td;
+ struct clock_event_device *dev;
+
+ td = &per_cpu(tick_cpu_device, cpu);
+ dev = td->evtdev;
+
+ return dev->next_event;
+}
+#endif
Index: linux.trees.git/include/linux/clockchips.h
===================================================================
--- linux.trees.git.orig/include/linux/clockchips.h
+++ linux.trees.git/include/linux/clockchips.h
@@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned
#endif
#endif
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+extern ktime_t clockevents_get_next_event(int cpu);
+#else
+static inline ktime_t clockevents_get_next_event(int cpu)
+{
+ return KTIME_MAX;
+}
+#endif
^ permalink raw reply [flat|nested] 13+ messages in thread