* [patch 0/3] sched: bandwidth-control tweaks for v3.2
@ 2011-11-08 4:26 Paul Turner
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw)
To: linux-kernel, Peter Zijlstra, Ingo Molnar
Hi,
Please find attached a few minor tweaks for bandwidth control that are worth
considering for 3.2
The first two are a performance improvement for the !enabled case and a buglet
fix, while the last makes idle_balance() work with throttling.
Thanks,
- Paul
^ permalink raw reply [flat|nested] 15+ messages in thread
* [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive
2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner
@ 2011-11-08 4:26 ` Paul Turner
2011-11-08 9:26 ` Peter Zijlstra
` (3 more replies)
2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner
2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner
2 siblings, 4 replies; 15+ messages in thread
From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw)
To: linux-kernel, Peter Zijlstra, Ingo Molnar
[-- Attachment #1: sched-bwc-add_jump_labels.patch --]
[-- Type: text/plain, Size: 4859 bytes --]
Now that the linkage of jump-labels has been fixed they show a measurable
improvement in overhead for the enabled-but-unused case.
Workload is:
'taskset -c 0 perf stat --repeat 50 -e instructions,cycles,branches
bash -c "for ((i=0;i<5;i++)); do $(dirname $0)/pipe-test 20000; done"'
instructions cycles branches
-------------------------------------------------------------------------
Intel Westmere
base 806611770 745895590 146765378
+jumplabel 803090165 (-0.44) 713381840 (-4.36) 144561130
AMD Barcelona
base 824657415 740055589 148855354
+jumplabel 821056910 (-0.44) 737558389 (-0.34) 146635229
Signed-off-by: Paul Turner <pjt@google.com>
---
kernel/sched.c | 33 +++++++++++++++++++++++++++++++--
kernel/sched_fair.c | 17 +++++++++++++----
2 files changed, 44 insertions(+), 6 deletions(-)
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -71,6 +71,7 @@
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
+#include <linux/jump_label.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -502,7 +503,32 @@ static void destroy_cfs_bandwidth(struct
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
}
-#else
+
+#ifdef HAVE_JUMP_LABEL
+static struct jump_label_key __cfs_bandwidth_used;
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return static_branch(&__cfs_bandwidth_used);
+}
+
+static void account_cfs_bandwidth_used(int enabled, int was_enabled)
+{
+ /* only need to count groups transitioning between enabled/!enabled */
+ if (enabled && !was_enabled)
+ jump_label_inc(&__cfs_bandwidth_used);
+ else if (!enabled && was_enabled)
+ jump_label_dec(&__cfs_bandwidth_used);
+}
+#else /* !HAVE_JUMP_LABEL */
+/* static_branch doesn't help unless supported */
+static int cfs_bandwidth_used(void)
+{
+ return 1;
+}
+static void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+#endif /* HAVE_JUMP_LABEL */
+#else /* !CONFIG_CFS_BANDWIDTH */
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
@@ -9188,7 +9214,7 @@ static int __cfs_schedulable(struct task
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
- int i, ret = 0, runtime_enabled;
+ int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
if (tg == &root_task_group)
@@ -9216,6 +9242,9 @@ static int tg_set_cfs_bandwidth(struct t
goto out_unlock;
runtime_enabled = quota != RUNTIME_INF;
+ runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+ account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1408,7 +1408,7 @@ static void __account_cfs_rq_runtime(str
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
- if (!cfs_rq->runtime_enabled)
+ if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,13 +1416,13 @@ static __always_inline void account_cfs_
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_rq->throttled;
+ return cfs_bandwidth_used() && cfs_rq->throttled;
}
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
- return cfs_rq->throttle_count;
+ return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
/*
@@ -1743,6 +1743,9 @@ static void __return_cfs_rq_runtime(stru
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
return;
@@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
return;
^ permalink raw reply [flat|nested] 15+ messages in thread
* [patch 2/3] sched: fix buglet in return_cfs_rq_runtime()
2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
@ 2011-11-08 4:26 ` Paul Turner
2011-11-18 23:41 ` [tip:sched/core] sched: Fix " tip-bot for Paul Turner
2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner
2 siblings, 1 reply; 15+ messages in thread
From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw)
To: linux-kernel, Peter Zijlstra, Ingo Molnar
[-- Attachment #1: sched-bwc-fix_return_cfs_rq_runtime.patch --]
[-- Type: text/plain, Size: 679 bytes --]
In return_cfs_rq_runtime() we want to return bandwidth when there are no
remaining tasks, not "return" when this is the case.
Signed-off-by: Paul Turner <pjt@google.com>
---
kernel/sched_fair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1746,7 +1746,7 @@ static __always_inline void return_cfs_r
if (!cfs_bandwidth_used())
return;
- if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
^ permalink raw reply [flat|nested] 15+ messages in thread
* [patch 3/3] From: Ben Segall <bsegall@google.com>
2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner
@ 2011-11-08 4:26 ` Paul Turner
2011-11-10 2:28 ` Paul Turner
2011-11-10 2:30 ` Paul Turner
2 siblings, 2 replies; 15+ messages in thread
From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw)
To: linux-kernel, Peter Zijlstra, Ingo Molnar; +Cc: Ben Segall
[-- Attachment #1: sched-bwc-fix_nr_running.patch --]
[-- Type: text/plain, Size: 8571 bytes --]
sched: update task accounting on throttle so that idle_balance() will trigger
Since throttling occurs in the put_prev_task() path we do not get to observe
this delta against nr_running when making the decision to idle_balance().
Fix this by first enumerating cfs_rq throttle states so that we can distinguish
throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task
from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime,
rather than delaying until put_prev_task.
This allows schedule() to call idle_balance when we go idle due to throttling.
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on
a 16 core system:
baseline: Average CPU Idle percentage 13.9667%
+patch: Average CPU Idle percentage 3.53333%
[1]: https://lkml.org/lkml/2011/9/15/261
Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Paul Turner <pjt@google.com>
---
kernel/sched.c | 24 ++++++++----
kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++----------------
2 files changed, 87 insertions(+), 38 deletions(-)
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -269,6 +269,13 @@ struct cfs_bandwidth {
#endif
};
+enum runtime_state {
+ RUNTIME_UNLIMITED,
+ RUNTIME_AVAILABLE,
+ RUNTIME_THROTTLING,
+ RUNTIME_THROTTLED
+};
+
/* task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -402,12 +409,12 @@ struct cfs_rq {
unsigned long load_contribution;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
- int runtime_enabled;
+ enum runtime_state runtime_state;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_timestamp;
- int throttled, throttle_count;
+ int throttle_count;
struct list_head throttled_list;
#endif
#endif
@@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- cfs_rq->runtime_enabled = 0;
+ cfs_rq->runtime_state = RUNTIME_UNLIMITED;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
@@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s
for_each_leaf_cfs_rq(rq, cfs_rq) {
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- if (!cfs_rq->runtime_enabled)
+ if (!cfs_rq_runtime_enabled(cfs_rq))
continue;
/*
@@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t
struct rq *rq = rq_of(cfs_rq);
raw_spin_lock_irq(&rq->lock);
- cfs_rq->runtime_enabled = runtime_enabled;
- cfs_rq->runtime_remaining = 0;
-
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
+
+ if (runtime_enabled)
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+ else
+ cfs_rq->runtime_state = RUNTIME_UNLIMITED;
+ cfs_rq->runtime_remaining = 0;
raw_spin_unlock_irq(&rq->lock);
}
out_unlock:
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct
}
}
+static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling)
+{
+ struct sched_entity *se;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ if (!se->on_rq)
+ break;
+
+ qcfs_rq->h_nr_running -= nr_throttling;
+
+ if (qcfs_rq->runtime_state == RUNTIME_THROTTLING)
+ break;
+ }
+
+ if (!se)
+ rq_of(cfs_rq)->nr_running -= nr_throttling;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
@@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
*/
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_task(rq_of(cfs_rq)->curr);
+ if (assign_cfs_rq_runtime(cfs_rq))
+ return;
+
+ if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) ||
+ cfs_rq->runtime_state == RUNTIME_THROTTLING)
+ return;
+
+ resched_task(rq_of(cfs_rq)->curr);
+
+ /*
+ * Remove us from nr_running/h_nr_running so
+ * that idle_balance gets called if necessary
+ */
+ account_nr_throttling(cfs_rq, cfs_rq->h_nr_running);
+ cfs_rq->runtime_state = RUNTIME_THROTTLING;
+}
+
+static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() &&
+ cfs_rq->runtime_state != RUNTIME_UNLIMITED;
}
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
- if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+ if (!cfs_rq_runtime_enabled(cfs_rq))
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_bandwidth_used() && cfs_rq->throttled;
+ return cfs_bandwidth_used() &&
+ (cfs_rq->runtime_state == RUNTIME_THROTTLED ||
+ cfs_rq->runtime_state == RUNTIME_THROTTLING);
}
/* check whether cfs_rq, or any parent, is throttled */
@@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
break;
- if (dequeue)
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- qcfs_rq->h_nr_running -= task_delta;
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
if (qcfs_rq->load.weight)
- dequeue = 0;
+ break;
}
- if (!se)
- rq->nr_running -= task_delta;
-
- cfs_rq->throttled = 1;
+ cfs_rq->runtime_state = RUNTIME_THROTTLED;
cfs_rq->throttled_timestamp = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
@@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs
int enqueue = 1;
long task_delta;
+ if (cfs_rq->runtime_state == RUNTIME_THROTTLING) {
+ /* do only the partial unthrottle */
+ account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+ return;
+ }
+
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- cfs_rq->throttled = 0;
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
raw_spin_lock(&cfs_b->lock);
cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
list_del_rcu(&cfs_rq->throttled_list);
@@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_bandwidth_used())
- return;
-
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
- if (!cfs_bandwidth_used())
- return;
-
/* an active group must be handled by the update_curr()->put() path */
- if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+ if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr)
return;
/* ensure the group is not already throttled */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq->runtime_state == RUNTIME_THROTTLED)
return;
/* update runtime allocation */
@@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct
if (!cfs_bandwidth_used())
return;
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+ if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING))
return;
/*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
+ * it is possible for additional bandwidth to arrive between
+ * when we call resched_task for being out of runtime and we
+ * call put_prev_task, in which case reaccount the now running
+ * tasks
*/
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- throttle_cfs_rq(cfs_rq);
+ if (unlikely(cfs_rq->runtime_remaining > 0)) {
+ account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+ } else {
+ throttle_cfs_rq(cfs_rq);
+ }
}
#else
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
@ 2011-11-08 9:26 ` Peter Zijlstra
2011-11-08 9:28 ` Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2011-11-08 9:26 UTC (permalink / raw)
To: Paul Turner; +Cc: linux-kernel, Ingo Molnar
On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote:
> +static void account_cfs_bandwidth_used(int enabled, int was_enabled)
> +{
> + /* only need to count groups transitioning between enabled/!enabled */
> + if (enabled && !was_enabled)
> + jump_label_inc(&__cfs_bandwidth_used);
> + else if (!enabled && was_enabled)
> + jump_label_dec(&__cfs_bandwidth_used);
> +}
Can't you use jump_label_enabled(&__cfs_bandwidth_used) to was_enabled ?
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
2011-11-08 9:26 ` Peter Zijlstra
@ 2011-11-08 9:28 ` Peter Zijlstra
2011-11-08 9:29 ` Peter Zijlstra
2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner
3 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2011-11-08 9:28 UTC (permalink / raw)
To: Paul Turner; +Cc: linux-kernel, Ingo Molnar
On Tue, 2011-11-08 at 10:26 +0100, Peter Zijlstra wrote:
> On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote:
> > +static void account_cfs_bandwidth_used(int enabled, int was_enabled)
> > +{
> > + /* only need to count groups transitioning between enabled/!enabled */
> > + if (enabled && !was_enabled)
> > + jump_label_inc(&__cfs_bandwidth_used);
> > + else if (!enabled && was_enabled)
> > + jump_label_dec(&__cfs_bandwidth_used);
> > +}
>
> Can't you use jump_label_enabled(&__cfs_bandwidth_used) to was_enabled ?
n/m I'm not awake yet..
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
2011-11-08 9:26 ` Peter Zijlstra
2011-11-08 9:28 ` Peter Zijlstra
@ 2011-11-08 9:29 ` Peter Zijlstra
2011-11-11 4:23 ` Paul Turner
2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner
3 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2011-11-08 9:29 UTC (permalink / raw)
To: Paul Turner; +Cc: linux-kernel, Ingo Molnar
On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote:
> @@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str
> */
> static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
> {
> + if (!cfs_bandwidth_used())
> + return;
> +
> /* an active group must be handled by the update_curr()->put() path */
> if (!cfs_rq->runtime_enabled || cfs_rq->curr)
> return;
> @@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc
> /* conditionally throttle active cfs_rq's from put_prev_entity() */
> static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> {
> + if (!cfs_bandwidth_used())
> + return;
> +
> if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
> return;
>
does it matter if you pull this out into an inline function like:
static __always_inline void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
if (cfs_bandwidth_used())
__check_enqueue_throttle(cfs_rq);
}
That would avoid the superfluous function call as well.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com>
2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner
@ 2011-11-10 2:28 ` Paul Turner
2011-11-10 2:30 ` Paul Turner
1 sibling, 0 replies; 15+ messages in thread
From: Paul Turner @ 2011-11-10 2:28 UTC (permalink / raw)
To: linux-kernel
I mangled the subject/from headers on this one, fixed should be below.
Thanks,
- Paul
---
sched: update task accounting on throttle so that idle_balance() will trigger
From: Ben Segall <bsegall@google.com>
Since throttling occurs in the put_prev_task() path we do not get to observe
this delta against nr_running when making the decision to idle_balance().
Fix this by first enumerating cfs_rq throttle states so that we can distinguish
throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task
from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime,
rather than delaying until put_prev_task.
This allows schedule() to call idle_balance when we go idle due to throttling.
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on
a 16 core system:
baseline: Average CPU Idle percentage 13.9667%
+patch: Average CPU Idle percentage 3.53333%
[1]: https://lkml.org/lkml/2011/9/15/261
Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Paul Turner <pjt@google.com>
---
kernel/sched.c | 24 ++++++++----
kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++----------------
2 files changed, 87 insertions(+), 38 deletions(-)
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -269,6 +269,13 @@ struct cfs_bandwidth {
#endif
};
+enum runtime_state {
+ RUNTIME_UNLIMITED,
+ RUNTIME_AVAILABLE,
+ RUNTIME_THROTTLING,
+ RUNTIME_THROTTLED
+};
+
/* task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -402,12 +409,12 @@ struct cfs_rq {
unsigned long load_contribution;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
- int runtime_enabled;
+ enum runtime_state runtime_state;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_timestamp;
- int throttled, throttle_count;
+ int throttle_count;
struct list_head throttled_list;
#endif
#endif
@@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- cfs_rq->runtime_enabled = 0;
+ cfs_rq->runtime_state = RUNTIME_UNLIMITED;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
@@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s
for_each_leaf_cfs_rq(rq, cfs_rq) {
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- if (!cfs_rq->runtime_enabled)
+ if (!cfs_rq_runtime_enabled(cfs_rq))
continue;
/*
@@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t
struct rq *rq = rq_of(cfs_rq);
raw_spin_lock_irq(&rq->lock);
- cfs_rq->runtime_enabled = runtime_enabled;
- cfs_rq->runtime_remaining = 0;
-
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
+
+ if (runtime_enabled)
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+ else
+ cfs_rq->runtime_state = RUNTIME_UNLIMITED;
+ cfs_rq->runtime_remaining = 0;
raw_spin_unlock_irq(&rq->lock);
}
out_unlock:
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct
}
}
+static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling)
+{
+ struct sched_entity *se;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ if (!se->on_rq)
+ break;
+
+ qcfs_rq->h_nr_running -= nr_throttling;
+
+ if (qcfs_rq->runtime_state == RUNTIME_THROTTLING)
+ break;
+ }
+
+ if (!se)
+ rq_of(cfs_rq)->nr_running -= nr_throttling;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
@@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
*/
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_task(rq_of(cfs_rq)->curr);
+ if (assign_cfs_rq_runtime(cfs_rq))
+ return;
+
+ if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) ||
+ cfs_rq->runtime_state == RUNTIME_THROTTLING)
+ return;
+
+ resched_task(rq_of(cfs_rq)->curr);
+
+ /*
+ * Remove us from nr_running/h_nr_running so
+ * that idle_balance gets called if necessary
+ */
+ account_nr_throttling(cfs_rq, cfs_rq->h_nr_running);
+ cfs_rq->runtime_state = RUNTIME_THROTTLING;
+}
+
+static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() &&
+ cfs_rq->runtime_state != RUNTIME_UNLIMITED;
}
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
- if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+ if (!cfs_rq_runtime_enabled(cfs_rq))
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_bandwidth_used() && cfs_rq->throttled;
+ return cfs_bandwidth_used() &&
+ (cfs_rq->runtime_state == RUNTIME_THROTTLED ||
+ cfs_rq->runtime_state == RUNTIME_THROTTLING);
}
/* check whether cfs_rq, or any parent, is throttled */
@@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
break;
- if (dequeue)
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- qcfs_rq->h_nr_running -= task_delta;
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
if (qcfs_rq->load.weight)
- dequeue = 0;
+ break;
}
- if (!se)
- rq->nr_running -= task_delta;
-
- cfs_rq->throttled = 1;
+ cfs_rq->runtime_state = RUNTIME_THROTTLED;
cfs_rq->throttled_timestamp = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
@@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs
int enqueue = 1;
long task_delta;
+ if (cfs_rq->runtime_state == RUNTIME_THROTTLING) {
+ /* do only the partial unthrottle */
+ account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+ return;
+ }
+
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- cfs_rq->throttled = 0;
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
raw_spin_lock(&cfs_b->lock);
cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
list_del_rcu(&cfs_rq->throttled_list);
@@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_bandwidth_used())
- return;
-
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
- if (!cfs_bandwidth_used())
- return;
-
/* an active group must be handled by the update_curr()->put() path */
- if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+ if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr)
return;
/* ensure the group is not already throttled */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq->runtime_state == RUNTIME_THROTTLED)
return;
/* update runtime allocation */
@@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct
if (!cfs_bandwidth_used())
return;
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+ if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING))
return;
/*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
+ * it is possible for additional bandwidth to arrive between
+ * when we call resched_task for being out of runtime and we
+ * call put_prev_task, in which case reaccount the now running
+ * tasks
*/
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- throttle_cfs_rq(cfs_rq);
+ if (unlikely(cfs_rq->runtime_remaining > 0)) {
+ account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+ } else {
+ throttle_cfs_rq(cfs_rq);
+ }
}
#else
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com>
2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner
2011-11-10 2:28 ` Paul Turner
@ 2011-11-10 2:30 ` Paul Turner
2011-11-14 10:03 ` Kamalesh Babulal
2011-11-14 12:01 ` Peter Zijlstra
1 sibling, 2 replies; 15+ messages in thread
From: Paul Turner @ 2011-11-10 2:30 UTC (permalink / raw)
Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Ben Segall
I mangled the subject/from headers on this one, fixed should be below.
Thanks,
- Paul
---
sched: update task accounting on throttle so that idle_balance() will trigger
From: Ben Segall <bsegall@google.com>
Since throttling occurs in the put_prev_task() path we do not get to observe
this delta against nr_running when making the decision to idle_balance().
Fix this by first enumerating cfs_rq throttle states so that we can distinguish
throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task
from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime,
rather than delaying until put_prev_task.
This allows schedule() to call idle_balance when we go idle due to throttling.
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on
a 16 core system:
baseline: Average CPU Idle percentage 13.9667%
+patch: Average CPU Idle percentage 3.53333%
[1]: https://lkml.org/lkml/2011/9/15/261
Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Paul Turner <pjt@google.com>
---
kernel/sched.c | 24 ++++++++----
kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++----------------
2 files changed, 87 insertions(+), 38 deletions(-)
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -269,6 +269,13 @@ struct cfs_bandwidth {
#endif
};
+enum runtime_state {
+ RUNTIME_UNLIMITED,
+ RUNTIME_AVAILABLE,
+ RUNTIME_THROTTLING,
+ RUNTIME_THROTTLED
+};
+
/* task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -402,12 +409,12 @@ struct cfs_rq {
unsigned long load_contribution;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
- int runtime_enabled;
+ enum runtime_state runtime_state;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_timestamp;
- int throttled, throttle_count;
+ int throttle_count;
struct list_head throttled_list;
#endif
#endif
@@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- cfs_rq->runtime_enabled = 0;
+ cfs_rq->runtime_state = RUNTIME_UNLIMITED;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
@@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s
for_each_leaf_cfs_rq(rq, cfs_rq) {
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- if (!cfs_rq->runtime_enabled)
+ if (!cfs_rq_runtime_enabled(cfs_rq))
continue;
/*
@@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t
struct rq *rq = rq_of(cfs_rq);
raw_spin_lock_irq(&rq->lock);
- cfs_rq->runtime_enabled = runtime_enabled;
- cfs_rq->runtime_remaining = 0;
-
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
+
+ if (runtime_enabled)
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+ else
+ cfs_rq->runtime_state = RUNTIME_UNLIMITED;
+ cfs_rq->runtime_remaining = 0;
raw_spin_unlock_irq(&rq->lock);
}
out_unlock:
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct
}
}
+static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling)
+{
+ struct sched_entity *se;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ if (!se->on_rq)
+ break;
+
+ qcfs_rq->h_nr_running -= nr_throttling;
+
+ if (qcfs_rq->runtime_state == RUNTIME_THROTTLING)
+ break;
+ }
+
+ if (!se)
+ rq_of(cfs_rq)->nr_running -= nr_throttling;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
@@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
*/
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_task(rq_of(cfs_rq)->curr);
+ if (assign_cfs_rq_runtime(cfs_rq))
+ return;
+
+ if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) ||
+ cfs_rq->runtime_state == RUNTIME_THROTTLING)
+ return;
+
+ resched_task(rq_of(cfs_rq)->curr);
+
+ /*
+ * Remove us from nr_running/h_nr_running so
+ * that idle_balance gets called if necessary
+ */
+ account_nr_throttling(cfs_rq, cfs_rq->h_nr_running);
+ cfs_rq->runtime_state = RUNTIME_THROTTLING;
+}
+
+static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() &&
+ cfs_rq->runtime_state != RUNTIME_UNLIMITED;
}
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
- if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+ if (!cfs_rq_runtime_enabled(cfs_rq))
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_bandwidth_used() && cfs_rq->throttled;
+ return cfs_bandwidth_used() &&
+ (cfs_rq->runtime_state == RUNTIME_THROTTLED ||
+ cfs_rq->runtime_state == RUNTIME_THROTTLING);
}
/* check whether cfs_rq, or any parent, is throttled */
@@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
break;
- if (dequeue)
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- qcfs_rq->h_nr_running -= task_delta;
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
if (qcfs_rq->load.weight)
- dequeue = 0;
+ break;
}
- if (!se)
- rq->nr_running -= task_delta;
-
- cfs_rq->throttled = 1;
+ cfs_rq->runtime_state = RUNTIME_THROTTLED;
cfs_rq->throttled_timestamp = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
@@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs
int enqueue = 1;
long task_delta;
+ if (cfs_rq->runtime_state == RUNTIME_THROTTLING) {
+ /* do only the partial unthrottle */
+ account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+ return;
+ }
+
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- cfs_rq->throttled = 0;
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
raw_spin_lock(&cfs_b->lock);
cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
list_del_rcu(&cfs_rq->throttled_list);
@@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_bandwidth_used())
- return;
-
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
- if (!cfs_bandwidth_used())
- return;
-
/* an active group must be handled by the update_curr()->put() path */
- if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+ if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr)
return;
/* ensure the group is not already throttled */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq->runtime_state == RUNTIME_THROTTLED)
return;
/* update runtime allocation */
@@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct
if (!cfs_bandwidth_used())
return;
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+ if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING))
return;
/*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
+ * it is possible for additional bandwidth to arrive between
+ * when we call resched_task for being out of runtime and we
+ * call put_prev_task, in which case reaccount the now running
+ * tasks
*/
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- throttle_cfs_rq(cfs_rq);
+ if (unlikely(cfs_rq->runtime_remaining > 0)) {
+ account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+ } else {
+ throttle_cfs_rq(cfs_rq);
+ }
}
#else
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive
2011-11-08 9:29 ` Peter Zijlstra
@ 2011-11-11 4:23 ` Paul Turner
0 siblings, 0 replies; 15+ messages in thread
From: Paul Turner @ 2011-11-11 4:23 UTC (permalink / raw)
To: linux-kernel
On 11/08/2011 01:29 AM, Peter Zijlstra wrote:
> On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote:
>> @@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str
>> */
>> static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
>> {
>> + if (!cfs_bandwidth_used())
>> + return;
>> +
>> /* an active group must be handled by the update_curr()->put() path */
>> if (!cfs_rq->runtime_enabled || cfs_rq->curr)
>> return;
>> @@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc
>> /* conditionally throttle active cfs_rq's from put_prev_entity() */
>> static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>> {
>> + if (!cfs_bandwidth_used())
>> + return;
>> +
>> if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining> 0))
>> return;
>>
>
> does it matter if you pull this out into an inline function like:
>
> static __always_inline void check_enqueue_throttle(struct cfs_rq *cfs_rq)
> {
> if (cfs_bandwidth_used())
> __check_enqueue_throttle(cfs_rq);
> }
>
> That would avoid the superfluous function call as well.
Since this is the only call-site, it's being inlined already so there's no
difference atm. That said, check_enqueue_throttle is sufficiently small we
could just add __always_inline to force this to be the default behavior and
avoid any surprises from this in the future.
Let me know if you'd like me to re-post with this added.
- Paul
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com>
2011-11-10 2:30 ` Paul Turner
@ 2011-11-14 10:03 ` Kamalesh Babulal
2011-11-14 12:01 ` Peter Zijlstra
1 sibling, 0 replies; 15+ messages in thread
From: Kamalesh Babulal @ 2011-11-14 10:03 UTC (permalink / raw)
To: Paul Turner
Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Ben Segall,
Srivatsa Vaddagiri
[snip]
> Since throttling occurs in the put_prev_task() path we do not get to observe
> this delta against nr_running when making the decision to idle_balance().
>
> Fix this by first enumerating cfs_rq throttle states so that we can distinguish
> throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task
> from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime,
> rather than delaying until put_prev_task.
>
> This allows schedule() to call idle_balance when we go idle due to throttling.
>
> Using Kamalesh's nested-cgroup test case[1] we see the following improvement on
> a 16 core system:
> baseline: Average CPU Idle percentage 13.9667%
> +patch: Average CPU Idle percentage 3.53333%
> [1]: https://lkml.org/lkml/2011/9/15/261
>
> Signed-off-by: Ben Segall <bsegall@google.com>
> Signed-off-by: Paul Turner <pjt@google.com>
Tested-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Thanks for the patch. I tested patches on the same test environment, over which
the cpu idle time was reported first at https://lkml.org/lkml/2011/6/7/352. In
brief, tests were run on 2 socket quad core machine with three level of nested
cgroups hierarchy and five cgroups created below the third level. Each of the
five cgroups, having 2,2,4,8,16 while1 or cpu-matrix (https://lkml.org/lkml/2011/11/4/107)
tasks attached to them respectively.
[1] CFS Bandwidth tweaks — the patches posted by Paul Turner (https://lkml.org/lkml/2011/11/7/603)
[2] nohz idle balance RFC patch by Srivatsa Vaddagiri (https://lkml.org/lkml/2011/11/2/117)
While running the cpu-matrix benchmark with the patches, there was an improvement
around ~50 to 55% and additional ~3% benefit in idle time with nohz idle balance
patch. With the while1 loop the improvement was around ~36 to 40% over tip and an
additional benefit of ~4 to 5% was seen with nohz idle balance patch.
(1) cpu-matrix benchmark with nohz=on
----------------------------------
Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 Average CPU Idle percentage 4.1% Average CPU Idle percentage 2.36667% Average CPU Idle percentage 2.23333%
Bandwidth shared with remaining non-Idle 95.9% Bandwidth shared with remaining non-Idle 97.63333% Bandwidth shared with remaining non-Idle 97.76667%
2 Average CPU Idle percentage 4.23% Average CPU Idle percentage 2.3% Average CPU Idle percentage 2.16667%
Bandwidth shared with remaining non-Idle 95.77% Bandwidth shared with remaining non-Idle 97.7% Bandwidth shared with remaining non-Idle 97.83333%
(2) cpu-matrix benchmark with nohz=off
-----------------------------------
Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 Average CPU Idle percentage 4.53333% Average CPU Idle percentage 2.43333% Average CPU Idle percentage 2.36667%
Bandwidth shared with remaining non-Idle 95.46667% Bandwidth shared with remaining non-Idle 97.56667% Bandwidth shared with remaining non-Idle 97.63333%
2 Average CPU Idle percentage 4.4% Average CPU Idle percentage 2.36667% Average CPU Idle percentage 2.4%
Bandwidth shared with remaining non-Idle 95.6% Bandwidth shared with remaining non-Idle 97.63333% Bandwidth shared with remaining non-Idle 97.6%
(3) while1 loop with nohz=on
-------------------------
Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 Average CPU Idle percentage 6.26667% Average CPU Idle percentage 2.5% Average CPU Idle percentage 2.23333%
Bandwidth shared with remaining non-Idle 93.73333% Bandwidth shared with remaining non-Idle 97.5% Bandwidth shared with remaining non-Idle 97.76667%
2 Average CPU Idle percentage 6.73333% Average CPU Idle percentage 2.46667% Average CPU Idle percentage 2.13333%
Bandwidth shared with remaining non-Idle 93.26667% Bandwidth shared with remaining non-Idle 97.53333% Bandwidth shared with remaining non-Idle 97.86667%
(4) while1 loop with nohz=off
--------------------------
Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1 Average CPU Idle percentage 3.6% Average CPU Idle percentage 2.4% Average CPU Idle percentage 2.43333%
Bandwidth shared with remaining non-Idle 96.4% Bandwidth shared with remaining non-Idle 97.6% Bandwidth shared with remaining non-Idle 97.56667%
2 Average CPU Idle percentage 3.46667% Average CPU Idle percentage 2.33333% Average CPU Idle percentage 2.4%
Bandwidth shared with remaining non-Idle 96.53333% Bandwidth shared with remaining non-Idle 97.66667% Bandwidth shared with remaining non-Idle 97.6%
each cpu-matrix benchmark task was run as # perf sched cpu-matrix -s1k -i 1000 -p100
Kamalesh.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com>
2011-11-10 2:30 ` Paul Turner
2011-11-14 10:03 ` Kamalesh Babulal
@ 2011-11-14 12:01 ` Peter Zijlstra
2011-11-15 21:14 ` Benjamin Segall
1 sibling, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2011-11-14 12:01 UTC (permalink / raw)
To: Paul Turner; +Cc: linux-kernel, Ingo Molnar, Ben Segall
On Wed, 2011-11-09 at 18:30 -0800, Paul Turner wrote:
>
> sched: update task accounting on throttle so that idle_balance() will trigger
> From: Ben Segall <bsegall@google.com>
>
> Since throttling occurs in the put_prev_task() path we do not get to observe
> this delta against nr_running when making the decision to idle_balance().
>
> Fix this by first enumerating cfs_rq throttle states so that we can distinguish
> throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task
> from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime,
> rather than delaying until put_prev_task.
>
> This allows schedule() to call idle_balance when we go idle due to throttling.
>
> Using Kamalesh's nested-cgroup test case[1] we see the following improvement on
> a 16 core system:
> baseline: Average CPU Idle percentage 13.9667%
> +patch: Average CPU Idle percentage 3.53333%
> [1]: https://lkml.org/lkml/2011/9/15/261
>
> Signed-off-by: Ben Segall <bsegall@google.com>
> Signed-off-by: Paul Turner <pjt@google.com>
I really don't like this patch... There's something wrong about
decoupling the dequeue from nr_running accounting.
That said, I haven't got a bright idea either.. anyway, I think the
patch is somewhat too big for 3.2 at this point.
> ---
> kernel/sched.c | 24 ++++++++----
> kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++----------------
> 2 files changed, 87 insertions(+), 38 deletions(-)
>
> Index: tip/kernel/sched.c
> ===================================================================
> --- tip.orig/kernel/sched.c
> +++ tip/kernel/sched.c
> @@ -269,6 +269,13 @@ struct cfs_bandwidth {
> #endif
> };
>
> +enum runtime_state {
> + RUNTIME_UNLIMITED,
> + RUNTIME_AVAILABLE,
> + RUNTIME_THROTTLING,
> + RUNTIME_THROTTLED
> +};
What's the difference between throttling and throttled? Throttling is
between actually getting throttled and put_prev_task() getting called?
This all wants a comment.
> +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling)
> +{
> + struct sched_entity *se;
> +
> + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> +
> + for_each_sched_entity(se) {
> + struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> + if (!se->on_rq)
> + break;
> +
> + qcfs_rq->h_nr_running -= nr_throttling;
> +
> + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING)
> + break;
> + }
> +
> + if (!se)
> + rq_of(cfs_rq)->nr_running -= nr_throttling;
> +}
Since you'll end up calling this stuff with a negative nr_throttling,
please use += to avoid the double negative brain twist.
> static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
> unsigned long delta_exec)
> {
> @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str
> * if we're unable to extend our runtime we resched so that the active
> * hierarchy can be throttled
> */
> - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
> - resched_task(rq_of(cfs_rq)->curr);
> + if (assign_cfs_rq_runtime(cfs_rq))
> + return;
> +
> + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) ||
> + cfs_rq->runtime_state == RUNTIME_THROTTLING)
> + return;
How exactly can we get here if we're throttling already?
> + resched_task(rq_of(cfs_rq)->curr);
> +
> + /*
> + * Remove us from nr_running/h_nr_running so
> + * that idle_balance gets called if necessary
> + */
> + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running);
> + cfs_rq->runtime_state = RUNTIME_THROTTLING;
> +}
> @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_
>
> static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> {
> - return cfs_bandwidth_used() && cfs_rq->throttled;
> + return cfs_bandwidth_used() &&
> + (cfs_rq->runtime_state == RUNTIME_THROTTLED ||
> + cfs_rq->runtime_state == RUNTIME_THROTTLING);
> }
>= THROTTLING saves a test.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com>
2011-11-14 12:01 ` Peter Zijlstra
@ 2011-11-15 21:14 ` Benjamin Segall
0 siblings, 0 replies; 15+ messages in thread
From: Benjamin Segall @ 2011-11-15 21:14 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Paul Turner, linux-kernel, Ingo Molnar
Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
> On Wed, 2011-11-09 at 18:30 -0800, Paul Turner wrote:
>>
>> sched: update task accounting on throttle so that idle_balance() will trigger
>> From: Ben Segall <bsegall@google.com>
>>
>> Since throttling occurs in the put_prev_task() path we do not get to observe
>> this delta against nr_running when making the decision to idle_balance().
>>
>> Fix this by first enumerating cfs_rq throttle states so that we can distinguish
>> throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task
>> from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime,
>> rather than delaying until put_prev_task.
>>
>> This allows schedule() to call idle_balance when we go idle due to throttling.
>>
>> Using Kamalesh's nested-cgroup test case[1] we see the following improvement on
>> a 16 core system:
>> baseline: Average CPU Idle percentage 13.9667%
>> +patch: Average CPU Idle percentage 3.53333%
>> [1]: https://lkml.org/lkml/2011/9/15/261
>>
>> Signed-off-by: Ben Segall <bsegall@google.com>
>> Signed-off-by: Paul Turner <pjt@google.com>
>
> I really don't like this patch... There's something wrong about
> decoupling the dequeue from nr_running accounting.
>
> That said, I haven't got a bright idea either.. anyway, I think the
> patch is somewhat too big for 3.2 at this point.
Since we can't really throttle until schedule or so, the simplest
solution would be to move it from put_prev_task to a prev_schedule hook
so that it could be before idle_balance, but that seems a bit heavy just
for cfs bandwidth.
>
>> ---
>> kernel/sched.c | 24 ++++++++----
>> kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++----------------
>> 2 files changed, 87 insertions(+), 38 deletions(-)
>>
>> Index: tip/kernel/sched.c
>> ===================================================================
>> --- tip.orig/kernel/sched.c
>> +++ tip/kernel/sched.c
>> @@ -269,6 +269,13 @@ struct cfs_bandwidth {
>> #endif
>> };
>>
>> +enum runtime_state {
>> + RUNTIME_UNLIMITED,
>> + RUNTIME_AVAILABLE,
>> + RUNTIME_THROTTLING,
>> + RUNTIME_THROTTLED
>> +};
>
> What's the difference between throttling and throttled? Throttling is
> between actually getting throttled and put_prev_task() getting called?
> This all wants a comment.
Yes, although THROTTLING->AVAILABLE (or even ->UNLIMITED) is also possible.
>> @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str
>> * if we're unable to extend our runtime we resched so that the active
>> * hierarchy can be throttled
>> */
>> - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
>> - resched_task(rq_of(cfs_rq)->curr);
>> + if (assign_cfs_rq_runtime(cfs_rq))
>> + return;
>> +
>> + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) ||
>> + cfs_rq->runtime_state == RUNTIME_THROTTLING)
>> + return;
>
> How exactly can we get here if we're throttling already?
A wakeup in the same to-be-throttled cfs_rq before need_resched is
handled. The simplest trigger would be three tasks on the same cfs_rq,
only one of which is running. If it tries to wake both of the other
tasks just as it runs out of runtime, we would hit this.
And the patch updated for these and your other comments:
-- 8< --
From: Benjamin Segall <bsegall@google.com>
Subject: sched: update task accounting on throttle so that idle_balance() will trigger
Since throttling occurs in the put_prev_task() path we do not get to observe
this delta against nr_running when making the decision to idle_balance().
Fix this by first enumerating cfs_rq throttle states so that we can distinguish
throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task
from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime,
rather than delaying until put_prev_task.
This allows schedule() to call idle_balance when we go idle due to throttling.
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on
a 16 core system:
baseline: Average CPU Idle percentage 13.9667%
+patch: Average CPU Idle percentage 3.53333%
[1]: https://lkml.org/lkml/2011/9/15/261
Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Paul Turner <pjt@google.com>
---
kernel/sched.c | 28 +++++++++++---
kernel/sched_fair.c | 100 +++++++++++++++++++++++++++++++++++----------------
2 files changed, 90 insertions(+), 38 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 7c1b2f5..0af1680 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -269,6 +269,17 @@ struct cfs_bandwidth {
#endif
};
+/*
+ * THROTTLING tasks have been removed from nr_running, but have not
+ * yet been dequeued. THROTTLED tasks have been entirely dequeued.
+ */
+enum runtime_state {
+ RUNTIME_UNLIMITED,
+ RUNTIME_AVAILABLE,
+ RUNTIME_THROTTLING,
+ RUNTIME_THROTTLED
+};
+
/* task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -402,12 +413,12 @@ struct cfs_rq {
unsigned long load_contribution;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
- int runtime_enabled;
+ enum runtime_state runtime_state;
u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_timestamp;
- int throttled, throttle_count;
+ int throttle_count;
struct list_head throttled_list;
#endif
#endif
@@ -470,7 +481,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- cfs_rq->runtime_enabled = 0;
+ cfs_rq->runtime_state = RUNTIME_UNLIMITED;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
@@ -6368,7 +6379,7 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
for_each_leaf_cfs_rq(rq, cfs_rq) {
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- if (!cfs_rq->runtime_enabled)
+ if (!cfs_rq_runtime_enabled(cfs_rq))
continue;
/*
@@ -9263,11 +9274,14 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
struct rq *rq = rq_of(cfs_rq);
raw_spin_lock_irq(&rq->lock);
- cfs_rq->runtime_enabled = runtime_enabled;
- cfs_rq->runtime_remaining = 0;
-
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
+
+ if (runtime_enabled)
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+ else
+ cfs_rq->runtime_state = RUNTIME_UNLIMITED;
+ cfs_rq->runtime_remaining = 0;
raw_spin_unlock_irq(&rq->lock);
}
out_unlock:
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3539569..7dc5c19 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
}
}
+static void account_nr_throttling(struct cfs_rq *cfs_rq, long delta)
+{
+ struct sched_entity *se;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ if (!se->on_rq)
+ break;
+
+ qcfs_rq->h_nr_running += delta;
+
+ if (qcfs_rq->runtime_state == RUNTIME_THROTTLING)
+ break;
+ }
+
+ if (!se)
+ rq_of(cfs_rq)->nr_running += delta;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
@@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
*/
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_task(rq_of(cfs_rq)->curr);
+ if (assign_cfs_rq_runtime(cfs_rq))
+ return;
+
+ if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) ||
+ cfs_rq->runtime_state == RUNTIME_THROTTLING)
+ return;
+
+ resched_task(rq_of(cfs_rq)->curr);
+
+ /*
+ * Remove us from nr_running/h_nr_running so
+ * that idle_balance gets called if necessary
+ */
+ account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running);
+ cfs_rq->runtime_state = RUNTIME_THROTTLING;
+}
+
+static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() &&
+ cfs_rq->runtime_state != RUNTIME_UNLIMITED;
}
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
- if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+ if (!cfs_rq_runtime_enabled(cfs_rq))
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,7 +1456,8 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_bandwidth_used() && cfs_rq->throttled;
+ return cfs_bandwidth_used() &&
+ cfs_rq->runtime_state >= RUNTIME_THROTTLING;
}
/* check whether cfs_rq, or any parent, is throttled */
@@ -1483,7 +1524,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -1492,25 +1532,19 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
break;
- if (dequeue)
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- qcfs_rq->h_nr_running -= task_delta;
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
if (qcfs_rq->load.weight)
- dequeue = 0;
+ break;
}
- if (!se)
- rq->nr_running -= task_delta;
-
- cfs_rq->throttled = 1;
+ cfs_rq->runtime_state = RUNTIME_THROTTLED;
cfs_rq->throttled_timestamp = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
@@ -1525,9 +1559,15 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
int enqueue = 1;
long task_delta;
+ if (cfs_rq->runtime_state == RUNTIME_THROTTLING) {
+ /* do only the partial unthrottle */
+ account_nr_throttling(cfs_rq, cfs_rq->h_nr_running);
+ return;
+ }
+
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
- cfs_rq->throttled = 0;
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
raw_spin_lock(&cfs_b->lock);
cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
list_del_rcu(&cfs_rq->throttled_list);
@@ -1743,10 +1783,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_bandwidth_used())
- return;
-
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -1791,15 +1828,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
- if (!cfs_bandwidth_used())
- return;
-
/* an active group must be handled by the update_curr()->put() path */
- if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+ if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr)
return;
/* ensure the group is not already throttled */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq->runtime_state == RUNTIME_THROTTLED)
return;
/* update runtime allocation */
@@ -1814,17 +1848,21 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+ if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING))
return;
/*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
+ * it is possible for additional bandwidth to arrive between
+ * when we call resched_task for being out of runtime and we
+ * call put_prev_task, in which case reaccount the now running
+ * tasks
*/
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- throttle_cfs_rq(cfs_rq);
+ if (unlikely(cfs_rq->runtime_remaining > 0)) {
+ account_nr_throttling(cfs_rq, cfs_rq->h_nr_running);
+ cfs_rq->runtime_state = RUNTIME_AVAILABLE;
+ } else {
+ throttle_cfs_rq(cfs_rq);
+ }
}
#else
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [tip:sched/core] sched: Fix buglet in return_cfs_rq_runtime()
2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner
@ 2011-11-18 23:41 ` tip-bot for Paul Turner
0 siblings, 0 replies; 15+ messages in thread
From: tip-bot for Paul Turner @ 2011-11-18 23:41 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, hpa, mingo, a.p.zijlstra, pjt, tglx, mingo
Commit-ID: fccfdc6f0d8c83c854eeb6d93aa158f0e551bd49
Gitweb: http://git.kernel.org/tip/fccfdc6f0d8c83c854eeb6d93aa158f0e551bd49
Author: Paul Turner <pjt@google.com>
AuthorDate: Mon, 7 Nov 2011 20:26:34 -0800
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Wed, 16 Nov 2011 08:43:45 +0100
sched: Fix buglet in return_cfs_rq_runtime()
In return_cfs_rq_runtime() we want to return bandwidth when there are no
remaining tasks, not "return" when this is the case.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20111108042736.623812423@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched_fair.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ba0e1f4..a78ed27 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1756,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [tip:sched/core] sched: Use jump labels to reduce overhead when bandwidth control is inactive
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
` (2 preceding siblings ...)
2011-11-08 9:29 ` Peter Zijlstra
@ 2011-11-18 23:42 ` tip-bot for Paul Turner
3 siblings, 0 replies; 15+ messages in thread
From: tip-bot for Paul Turner @ 2011-11-18 23:42 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, hpa, mingo, a.p.zijlstra, pjt, tglx, mingo
Commit-ID: 56f570e512eeb5b412cb3a62234adc446a8eb32b
Gitweb: http://git.kernel.org/tip/56f570e512eeb5b412cb3a62234adc446a8eb32b
Author: Paul Turner <pjt@google.com>
AuthorDate: Mon, 7 Nov 2011 20:26:33 -0800
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Wed, 16 Nov 2011 08:48:18 +0100
sched: Use jump labels to reduce overhead when bandwidth control is inactive
Now that the linkage of jump-labels has been fixed they show a measurable
improvement in overhead for the enabled-but-unused case.
Workload is:
'taskset -c 0 perf stat --repeat 50 -e instructions,cycles,branches
bash -c "for ((i=0;i<5;i++)); do $(dirname $0)/pipe-test 20000; done"'
There's a speedup for all situations:
instructions cycles branches
-------------------------------------------------------------------------
Intel Westmere
base 806611770 745895590 146765378
+jumplabel 803090165 (-0.44%) 713381840 (-4.36%) 144561130
AMD Barcelona
base 824657415 740055589 148855354
+jumplabel 821056910 (-0.44%) 737558389 (-0.34%) 146635229
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20111108042736.560831357@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 33 +++++++++++++++++++++++++++++++--
kernel/sched_fair.c | 15 ++++++++++++---
2 files changed, 43 insertions(+), 5 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d6b149c..d9d79a4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -72,6 +72,7 @@
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
+#include <linux/jump_label.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -503,7 +504,32 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
}
-#else
+
+#ifdef HAVE_JUMP_LABEL
+static struct jump_label_key __cfs_bandwidth_used;
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return static_branch(&__cfs_bandwidth_used);
+}
+
+static void account_cfs_bandwidth_used(int enabled, int was_enabled)
+{
+ /* only need to count groups transitioning between enabled/!enabled */
+ if (enabled && !was_enabled)
+ jump_label_inc(&__cfs_bandwidth_used);
+ else if (!enabled && was_enabled)
+ jump_label_dec(&__cfs_bandwidth_used);
+}
+#else /* !HAVE_JUMP_LABEL */
+/* static_branch doesn't help unless supported */
+static int cfs_bandwidth_used(void)
+{
+ return 1;
+}
+static void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+#endif /* HAVE_JUMP_LABEL */
+#else /* !CONFIG_CFS_BANDWIDTH */
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
@@ -9203,7 +9229,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
- int i, ret = 0, runtime_enabled;
+ int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
if (tg == &root_task_group)
@@ -9231,6 +9257,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
goto out_unlock;
runtime_enabled = quota != RUNTIME_INF;
+ runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+ account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a78ed27..a608593 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1421,7 +1421,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
- if (!cfs_rq->runtime_enabled)
+ if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1429,13 +1429,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_rq->throttled;
+ return cfs_bandwidth_used() && cfs_rq->throttled;
}
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
- return cfs_rq->throttle_count;
+ return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
/*
@@ -1756,6 +1756,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
@@ -1801,6 +1804,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -1818,6 +1824,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
return;
^ permalink raw reply related [flat|nested] 15+ messages in thread
end of thread, other threads:[~2011-11-18 23:42 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
2011-11-08 9:26 ` Peter Zijlstra
2011-11-08 9:28 ` Peter Zijlstra
2011-11-08 9:29 ` Peter Zijlstra
2011-11-11 4:23 ` Paul Turner
2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner
2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner
2011-11-18 23:41 ` [tip:sched/core] sched: Fix " tip-bot for Paul Turner
2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner
2011-11-10 2:28 ` Paul Turner
2011-11-10 2:30 ` Paul Turner
2011-11-14 10:03 ` Kamalesh Babulal
2011-11-14 12:01 ` Peter Zijlstra
2011-11-15 21:14 ` Benjamin Segall
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox