From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
To: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Venki Pallipadi <venki@google.com>,
Peter Zijlstra <peterz@infradead.org>,
Andi Kleen <andi@firstfloor.org>,
Tim Chen <tim.c.chen@linux.intel.com>,
Ingo Molnar <mingo@elte.hu>,
"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
Subject: Re: [Patch] Idle balancer: cache align nohz structure to improve idle load balancing scalability
Date: Wed, 2 Nov 2011 18:34:54 +0530 [thread overview]
Message-ID: <20111102130453.GB6820@linux.vnet.ibm.com> (raw)
In-Reply-To: <1320191558.28097.44.camel@sbsiddha-desk.sc.intel.com>
* Suresh Siddha <suresh.b.siddha@intel.com> [2011-11-01 16:52:38]:
> That is correct. So to minimize the global updates I did two things.
Another issue I am trying to solve with nohz balancer is stale-ness of
nohz.next_balance. As cpus enter/exit tickless state, nohz.next_balance
is not updated to reflect that. I have found that its a predominant source
of increased idle time with CFS bandwidth patches :
https://lkml.org/lkml/2011/9/26/162
While it may be costly to update nohz.next_balance during exit event,
I think we should update it upon entry. I am working on a patch to make
nohz.next_balance more precise. This requires tickless idle cpus to be tracked
in a global rb-tree (each tickless idle cpu is inserted in this tree,
indexed by its rq->next_balance). Having this rb-tree makes it easy to
precisely determine min(rq->next_balance) when a kick is deserved, which
I am hoping is good for both performance (on the dot wakeup) but also power
(avoid unnecessary/early wakeup).
Draft patch is below. I was intending to post it soon along with some
real-world benchmark numbers (based on matrix multiplication).
Any other suggestions to avoid this global rb-tree, but still keep
nohz.next_balance as precise as possible, would be welcome!
Not-yet-Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
---
Makefile | 2
kernel/sched.c | 9 ++
kernel/sched_fair.c | 185 +++++++++++++++++++++++++++++++++++++++++++-----
kernel/sched_idletask.c | 2
4 files changed, 179 insertions(+), 19 deletions(-)
Index: current/Makefile
===================================================================
--- current.orig/Makefile
+++ current/Makefile
@@ -1,6 +1,6 @@
VERSION = 3
PATCHLEVEL = 1
-SUBLEVEL = 0
+SUBLEVEL = 0-myp
EXTRAVERSION =
NAME = "Divemaster Edition"
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -602,6 +602,7 @@ struct rq {
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char nohz_balance_kick;
+ struct rb_node nohz_node;
#endif
int skip_clock_update;
@@ -1409,6 +1410,8 @@ static inline bool got_nohz_idle_kick(vo
return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
}
+static void reset_first_second_pick_cpu(int cpu);
+
#else /* CONFIG_NO_HZ */
static inline bool got_nohz_idle_kick(void)
@@ -1416,6 +1419,8 @@ static inline bool got_nohz_idle_kick(vo
return false;
}
+static inline void reset_first_second_pick_cpu(int cpu) { }
+
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
@@ -8299,6 +8304,7 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ
rq->nohz_balance_kick = 0;
+ rb_init_node(&rq->nohz_node);
#endif
#endif
init_rq_hrtick(rq);
@@ -8344,10 +8350,13 @@ void __init sched_init(void)
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&nohz.stale_idle_cpus_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
atomic_set(&nohz.load_balancer, nr_cpu_ids);
atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
+ nohz.rq_next_balance = RB_ROOT;
+ spin_lock_init(&nohz.next_balance_lock);
#endif
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c
+++ current/kernel/sched_fair.c
@@ -4285,7 +4285,11 @@ static struct {
atomic_t second_pick_cpu;
cpumask_var_t idle_cpus_mask;
cpumask_var_t grp_idle_mask;
+ cpumask_var_t stale_idle_cpus_mask;
unsigned long next_balance; /* in jiffy units */
+ struct rb_root rq_next_balance;
+ struct rb_node *rb_leftmost;
+ spinlock_t next_balance_lock;
} nohz ____cacheline_aligned;
int get_nohz_load_balancer(void)
@@ -4427,8 +4431,12 @@ static void nohz_balancer_kick(int cpu)
ilb_cpu = get_nohz_load_balancer();
- if (ilb_cpu >= nr_cpu_ids) {
- ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ /*
+ * ilb_cpu itself can be attempting to kick another idle cpu. Pick
+ * another idle cpu in that case.
+ */
+ if (ilb_cpu >= nr_cpu_ids || ilb_cpu == cpu) {
+ ilb_cpu = cpumask_any_but(nohz.idle_cpus_mask, cpu);
if (ilb_cpu >= nr_cpu_ids)
return;
}
@@ -4449,6 +4457,122 @@ static void nohz_balancer_kick(int cpu)
}
/*
+ * Lazy dequeue of no-longer-tickless-idle cpu from rb-tree.
+ *
+ * This is done lazily, as otherwise taking a spinlock (nohz.next_balance_lock)
+ * to do immediate update of rb-tree can slowdown transition out of tickless
+ * state (and thus may hurt scheduling latencies for a task waking on a tickless
+ * idle cpu).
+ */
+static void __dequeue_nohz_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu), *entry;
+
+ WARN_ON(RB_EMPTY_NODE(&rq->nohz_node));
+
+ if (nohz.rb_leftmost == &rq->nohz_node) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&rq->nohz_node);
+ nohz.rb_leftmost = next_node;
+ if (next_node) {
+ entry = rb_entry(next_node, struct rq, nohz_node);
+
+ nohz.next_balance = entry->next_balance;
+ }
+ }
+
+ rb_erase(&rq->nohz_node, &nohz.rq_next_balance);
+ RB_CLEAR_NODE(&rq->nohz_node);
+}
+
+static void __enqueue_nohz_idle_cpu(int cpu)
+{
+ struct rb_node **link = &nohz.rq_next_balance.rb_node;
+ struct rb_node *parent = NULL;
+ struct rq *entry;
+ struct rq *rq = cpu_rq(cpu);
+ int leftmost = 1;
+
+ /*
+ * Find the right place in the rbtree:
+ */
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct rq, nohz_node);
+ /*
+ * We dont care about collisions. Nodes with
+ * the same key stay together.
+ */
+ if (time_before(rq->next_balance, entry->next_balance)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ /*
+ * Maintain a cache of leftmost tree entries (it is frequently
+ * used):
+ */
+ if (leftmost) {
+ nohz.rb_leftmost = &rq->nohz_node;
+ nohz.next_balance = rq->next_balance;
+ }
+
+ rb_link_node(&rq->nohz_node, parent, link);
+ rb_insert_color(&rq->nohz_node, &nohz.rq_next_balance);
+}
+
+static void __clear_stale_nohz_idle_cpus(void)
+{
+ int stale_idle_cpu;
+
+ for_each_cpu(stale_idle_cpu, nohz.stale_idle_cpus_mask) {
+ __dequeue_nohz_idle_cpu(stale_idle_cpu);
+ cpumask_clear_cpu(stale_idle_cpu, nohz.stale_idle_cpus_mask);
+ }
+}
+
+/*
+ * Remove no-longer-tickless-idle cpus from rb-tree.
+ */
+static void clear_stale_nohz_idle_cpus(void)
+{
+ spin_lock(&nohz.next_balance_lock);
+ __clear_stale_nohz_idle_cpus();
+ spin_unlock(&nohz.next_balance_lock);
+}
+
+/*
+ * Enqueue a idle cpu going tickless in a rb-tree, indexed by its
+ * rq->next_balance value.
+ */
+static void enqueue_nohz_idle_cpu(int cpu)
+{
+ spin_lock(&nohz.next_balance_lock);
+ if (cpumask_test_cpu(cpu, nohz.stale_idle_cpus_mask)) {
+ __dequeue_nohz_idle_cpu(cpu);
+ cpumask_clear_cpu(cpu, nohz.stale_idle_cpus_mask);
+ }
+ __enqueue_nohz_idle_cpu(cpu);
+ spin_unlock(&nohz.next_balance_lock);
+}
+
+/*
+ * Dequeue and enqueue a tickless idle cpu indexed at its new rq->next_balance
+ * value.
+ */
+static void re_enqueue_nohz_idle_cpu(int cpu)
+{
+ spin_lock(&nohz.next_balance_lock);
+ __dequeue_nohz_idle_cpu(cpu);
+ __enqueue_nohz_idle_cpu(cpu);
+ spin_unlock(&nohz.next_balance_lock);
+}
+
+/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
* load balancing on behalf of all those cpus.
@@ -4481,12 +4605,9 @@ void select_nohz_load_balancer(int stop_
return;
}
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ enqueue_nohz_idle_cpu(cpu);
- if (atomic_read(&nohz.first_pick_cpu) == cpu)
- atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
- if (atomic_read(&nohz.second_pick_cpu) == cpu)
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
int new_ilb;
@@ -4512,6 +4633,7 @@ void select_nohz_load_balancer(int stop_
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return;
+ cpumask_set_cpu(cpu, nohz.stale_idle_cpus_mask);
cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
if (atomic_read(&nohz.load_balancer) == cpu)
@@ -4622,10 +4744,20 @@ static void nohz_idle_balance(int this_c
struct rq *this_rq = cpu_rq(this_cpu);
struct rq *rq;
int balance_cpu;
+ unsigned long old_next_balance;
- if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+ if (!this_rq->nohz_balance_kick)
return;
+ /* Wakeup another idle cpu to do idle load balance if we got busy */
+ if (!idle_cpu(this_cpu)) {
+ nohz_balancer_kick(this_cpu);
+ goto out;
+ }
+
+ /* Remove no-longer-tickless-idle cpus from rb-tree */
+ clear_stale_nohz_idle_cpus();
+
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
if (balance_cpu == this_cpu)
continue;
@@ -4636,8 +4768,8 @@ static void nohz_idle_balance(int this_c
* balancing owner will pick it up.
*/
if (need_resched()) {
- this_rq->nohz_balance_kick = 0;
- break;
+ nohz_balancer_kick(this_cpu);
+ goto out;
}
raw_spin_lock_irq(&this_rq->lock);
@@ -4645,13 +4777,15 @@ static void nohz_idle_balance(int this_c
update_cpu_load(this_rq);
raw_spin_unlock_irq(&this_rq->lock);
- rebalance_domains(balance_cpu, CPU_IDLE);
-
rq = cpu_rq(balance_cpu);
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
+ old_next_balance = rq->next_balance;
+ if (time_after_eq(jiffies, old_next_balance)) {
+ rebalance_domains(balance_cpu, CPU_IDLE);
+ if (rq->next_balance != old_next_balance)
+ re_enqueue_nohz_idle_cpu(balance_cpu);
+ }
}
- nohz.next_balance = this_rq->next_balance;
+out:
this_rq->nohz_balance_kick = 0;
}
@@ -4700,9 +4834,24 @@ static inline int nohz_kick_needed(struc
}
return 0;
}
-#else
+
+/*
+ * Reset first_pick_cpu or second_pick_cpu identifier in case
+ * corresponding cpu is going idle.
+ */
+static void reset_first_second_pick_cpu(int cpu)
+{
+ if (atomic_read(&nohz.first_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+ if (atomic_read(&nohz.second_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+}
+
+#else /* CONFIG_NO_HZ */
+
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
-#endif
+
+#endif /* CONFIG_NO_HZ */
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
@@ -4740,7 +4889,7 @@ static inline void trigger_load_balance(
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ
- else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
nohz_balancer_kick(cpu);
#endif
}
Index: current/kernel/sched_idletask.c
===================================================================
--- current.orig/kernel/sched_idletask.c
+++ current/kernel/sched_idletask.c
@@ -24,6 +24,8 @@ static struct task_struct *pick_next_tas
{
schedstat_inc(rq, sched_goidle);
calc_load_account_idle(rq);
+ reset_first_second_pick_cpu(cpu_of(rq));
+
return rq->idle;
}
next prev parent reply other threads:[~2011-11-02 13:07 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-10-19 21:45 [Patch] Idle balancer: cache align nohz structure to improve idle load balancing scalability Tim Chen
2011-10-20 4:18 ` Eric Dumazet
2011-10-20 5:57 ` Suresh Siddha
2011-10-20 6:43 ` Eric Dumazet
2011-10-20 17:19 ` Tim Chen
2011-10-20 4:24 ` Andi Kleen
2011-10-20 12:26 ` Venki Pallipadi
2011-10-20 17:31 ` Suresh Siddha
2011-10-20 17:38 ` Peter Zijlstra
[not found] ` <4FF5AC937153B0459463C1A88EB478F20135D6ECB5@orsmsx505.amr.corp.intel.com>
2011-11-01 23:52 ` Suresh Siddha
2011-11-02 13:04 ` Srivatsa Vaddagiri [this message]
2011-11-02 13:54 ` Srivatsa Vaddagiri
2011-11-02 15:13 ` Suresh Siddha
2011-11-14 9:32 ` Peter Zijlstra
2011-11-14 19:37 ` Suresh Siddha
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20111102130453.GB6820@linux.vnet.ibm.com \
--to=vatsa@linux.vnet.ibm.com \
--cc=andi@firstfloor.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=peterz@infradead.org \
--cc=suresh.b.siddha@intel.com \
--cc=tim.c.chen@linux.intel.com \
--cc=venki@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.