* [patch 0/3] sched: bandwidth-control tweaks for v3.2
@ 2011-11-08 4:26 Paul Turner
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw)
To: linux-kernel, Peter Zijlstra, Ingo Molnar
Hi,
Please find attached a few minor tweaks for bandwidth control that are worth
considering for 3.2.
The first two are a performance improvement for the !enabled case and a buglet
fix, while the last makes idle_balance() work with throttling.
Thanks,
- Paul
^ permalink raw reply [flat|nested] 15+ messages in thread* [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner @ 2011-11-08 4:26 ` Paul Turner 2011-11-08 9:26 ` Peter Zijlstra ` (3 more replies) 2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner 2 siblings, 4 replies; 15+ messages in thread From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw) To: linux-kernel, Peter Zijlstra, Ingo Molnar [-- Attachment #1: sched-bwc-add_jump_labels.patch --] [-- Type: text/plain, Size: 4859 bytes --] Now that the linkage of jump-labels has been fixed they show a measurable improvement in overhead for the enabled-but-unused case. Workload is: 'taskset -c 0 perf stat --repeat 50 -e instructions,cycles,branches bash -c "for ((i=0;i<5;i++)); do $(dirname $0)/pipe-test 20000; done"' instructions cycles branches ------------------------------------------------------------------------- Intel Westmere base 806611770 745895590 146765378 +jumplabel 803090165 (-0.44) 713381840 (-4.36) 144561130 AMD Barcelona base 824657415 740055589 148855354 +jumplabel 821056910 (-0.44) 737558389 (-0.34) 146635229 Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 17 +++++++++++++---- 2 files changed, 44 insertions(+), 6 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -71,6 +71,7 @@ #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> +#include <linux/jump_label.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -502,7 +503,32 @@ static void destroy_cfs_bandwidth(struct hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } -#else + +#ifdef 
HAVE_JUMP_LABEL +static struct jump_label_key __cfs_bandwidth_used; + +static inline bool cfs_bandwidth_used(void) +{ + return static_branch(&__cfs_bandwidth_used); +} + +static void account_cfs_bandwidth_used(int enabled, int was_enabled) +{ + /* only need to count groups transitioning between enabled/!enabled */ + if (enabled && !was_enabled) + jump_label_inc(&__cfs_bandwidth_used); + else if (!enabled && was_enabled) + jump_label_dec(&__cfs_bandwidth_used); +} +#else /* !HAVE_JUMP_LABEL */ +/* static_branch doesn't help unless supported */ +static int cfs_bandwidth_used(void) +{ + return 1; +} +static void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +#endif /* HAVE_JUMP_LABEL */ +#else /* !CONFIG_CFS_BANDWIDTH */ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} @@ -9188,7 +9214,7 @@ static int __cfs_schedulable(struct task static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i, ret = 0, runtime_enabled; + int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); if (tg == &root_task_group) @@ -9216,6 +9242,9 @@ static int tg_set_cfs_bandwidth(struct t goto out_unlock; runtime_enabled = quota != RUNTIME_INF; + runtime_was_enabled = cfs_b->quota != RUNTIME_INF; + account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1408,7 +1408,7 @@ static void __account_cfs_rq_runtime(str static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_rq->runtime_enabled) + if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; 
__account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1416,13 +1416,13 @@ static __always_inline void account_cfs_ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_rq->throttled; + return cfs_bandwidth_used() && cfs_rq->throttled; } /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { - return cfs_rq->throttle_count; + return cfs_bandwidth_used() && cfs_rq->throttle_count; } /* @@ -1743,6 +1743,9 @@ static void __return_cfs_rq_runtime(stru static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) return; @@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc /* conditionally throttle active cfs_rq's from put_prev_entity() */ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) return; ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner @ 2011-11-08 9:26 ` Peter Zijlstra 2011-11-08 9:28 ` Peter Zijlstra ` (2 subsequent siblings) 3 siblings, 0 replies; 15+ messages in thread From: Peter Zijlstra @ 2011-11-08 9:26 UTC (permalink / raw) To: Paul Turner; +Cc: linux-kernel, Ingo Molnar On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote: > +static void account_cfs_bandwidth_used(int enabled, int was_enabled) > +{ > + /* only need to count groups transitioning between enabled/!enabled */ > + if (enabled && !was_enabled) > + jump_label_inc(&__cfs_bandwidth_used); > + else if (!enabled && was_enabled) > + jump_label_dec(&__cfs_bandwidth_used); > +} Can't you use jump_label_enabled(&__cfs_bandwidth_used) to was_enabled ? ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner 2011-11-08 9:26 ` Peter Zijlstra @ 2011-11-08 9:28 ` Peter Zijlstra 2011-11-08 9:29 ` Peter Zijlstra 2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner 3 siblings, 0 replies; 15+ messages in thread From: Peter Zijlstra @ 2011-11-08 9:28 UTC (permalink / raw) To: Paul Turner; +Cc: linux-kernel, Ingo Molnar On Tue, 2011-11-08 at 10:26 +0100, Peter Zijlstra wrote: > On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote: > > +static void account_cfs_bandwidth_used(int enabled, int was_enabled) > > +{ > > + /* only need to count groups transitioning between enabled/!enabled */ > > + if (enabled && !was_enabled) > > + jump_label_inc(&__cfs_bandwidth_used); > > + else if (!enabled && was_enabled) > > + jump_label_dec(&__cfs_bandwidth_used); > > +} > > Can't you use jump_label_enabled(&__cfs_bandwidth_used) to was_enabled ? n/m I'm not awake yet.. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner 2011-11-08 9:26 ` Peter Zijlstra 2011-11-08 9:28 ` Peter Zijlstra @ 2011-11-08 9:29 ` Peter Zijlstra 2011-11-11 4:23 ` Paul Turner 2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner 3 siblings, 1 reply; 15+ messages in thread From: Peter Zijlstra @ 2011-11-08 9:29 UTC (permalink / raw) To: Paul Turner; +Cc: linux-kernel, Ingo Molnar On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote: > @@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str > */ > static void check_enqueue_throttle(struct cfs_rq *cfs_rq) > { > + if (!cfs_bandwidth_used()) > + return; > + > /* an active group must be handled by the update_curr()->put() path */ > if (!cfs_rq->runtime_enabled || cfs_rq->curr) > return; > @@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc > /* conditionally throttle active cfs_rq's from put_prev_entity() */ > static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) > { > + if (!cfs_bandwidth_used()) > + return; > + > if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) > return; > does it matter if you pull this out into an inline function like: static __always_inline void check_enqueue_throttle(struct cfs_rq *cfs_rq) { if (cfs_bandwidth_used()) __check_enqueue_throttle(cfs_rq); } That would avoid the superfluous function call as well. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 9:29 ` Peter Zijlstra @ 2011-11-11 4:23 ` Paul Turner 0 siblings, 0 replies; 15+ messages in thread From: Paul Turner @ 2011-11-11 4:23 UTC (permalink / raw) To: linux-kernel On 11/08/2011 01:29 AM, Peter Zijlstra wrote: > On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote: >> @@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str >> */ >> static void check_enqueue_throttle(struct cfs_rq *cfs_rq) >> { >> + if (!cfs_bandwidth_used()) >> + return; >> + >> /* an active group must be handled by the update_curr()->put() path */ >> if (!cfs_rq->runtime_enabled || cfs_rq->curr) >> return; >> @@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc >> /* conditionally throttle active cfs_rq's from put_prev_entity() */ >> static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) >> { >> + if (!cfs_bandwidth_used()) >> + return; >> + >> if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining> 0)) >> return; >> > > does it matter if you pull this out into an inline function like: > > static __always_inline void check_enqueue_throttle(struct cfs_rq *cfs_rq) > { > if (cfs_bandwidth_used()) > __check_enqueue_throttle(cfs_rq); > } > > That would avoid the superfluous function call as well. This this is the only call-site it's being inlined already so there's no difference atm. That said, check_enqueue_throttle is sufficiently small we could just add __always_inline to force this to be the default behavior and avoid any surprises from this in the future. Let me know if you'd like me to re-post with this added. - Paul ^ permalink raw reply [flat|nested] 15+ messages in thread
* [tip:sched/core] sched: Use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner ` (2 preceding siblings ...) 2011-11-08 9:29 ` Peter Zijlstra @ 2011-11-18 23:42 ` tip-bot for Paul Turner 3 siblings, 0 replies; 15+ messages in thread From: tip-bot for Paul Turner @ 2011-11-18 23:42 UTC (permalink / raw) To: linux-tip-commits Cc: linux-kernel, hpa, mingo, a.p.zijlstra, pjt, tglx, mingo Commit-ID: 56f570e512eeb5b412cb3a62234adc446a8eb32b Gitweb: http://git.kernel.org/tip/56f570e512eeb5b412cb3a62234adc446a8eb32b Author: Paul Turner <pjt@google.com> AuthorDate: Mon, 7 Nov 2011 20:26:33 -0800 Committer: Ingo Molnar <mingo@elte.hu> CommitDate: Wed, 16 Nov 2011 08:48:18 +0100 sched: Use jump labels to reduce overhead when bandwidth control is inactive Now that the linkage of jump-labels has been fixed they show a measurable improvement in overhead for the enabled-but-unused case. 
Workload is: 'taskset -c 0 perf stat --repeat 50 -e instructions,cycles,branches bash -c "for ((i=0;i<5;i++)); do $(dirname $0)/pipe-test 20000; done"' There's a speedup for all situations: instructions cycles branches ------------------------------------------------------------------------- Intel Westmere base 806611770 745895590 146765378 +jumplabel 803090165 (-0.44%) 713381840 (-4.36%) 144561130 AMD Barcelona base 824657415 740055589 148855354 +jumplabel 821056910 (-0.44%) 737558389 (-0.34%) 146635229 Signed-off-by: Paul Turner <pjt@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20111108042736.560831357@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- kernel/sched.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 15 ++++++++++++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d6b149c..d9d79a4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,6 +72,7 @@ #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> +#include <linux/jump_label.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -503,7 +504,32 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } -#else + +#ifdef HAVE_JUMP_LABEL +static struct jump_label_key __cfs_bandwidth_used; + +static inline bool cfs_bandwidth_used(void) +{ + return static_branch(&__cfs_bandwidth_used); +} + +static void account_cfs_bandwidth_used(int enabled, int was_enabled) +{ + /* only need to count groups transitioning between enabled/!enabled */ + if (enabled && !was_enabled) + jump_label_inc(&__cfs_bandwidth_used); + else if (!enabled && was_enabled) + jump_label_dec(&__cfs_bandwidth_used); +} +#else /* !HAVE_JUMP_LABEL */ +/* static_branch doesn't help unless supported */ +static int cfs_bandwidth_used(void) +{ + return 1; +} +static void account_cfs_bandwidth_used(int enabled, int 
was_enabled) {} +#endif /* HAVE_JUMP_LABEL */ +#else /* !CONFIG_CFS_BANDWIDTH */ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} @@ -9203,7 +9229,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i, ret = 0, runtime_enabled; + int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); if (tg == &root_task_group) @@ -9231,6 +9257,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) goto out_unlock; runtime_enabled = quota != RUNTIME_INF; + runtime_was_enabled = cfs_b->quota != RUNTIME_INF; + account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a78ed27..a608593 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1421,7 +1421,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_rq->runtime_enabled) + if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; __account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1429,13 +1429,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_rq->throttled; + return cfs_bandwidth_used() && cfs_rq->throttled; } /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { - return cfs_rq->throttle_count; + return cfs_bandwidth_used() && cfs_rq->throttle_count; } /* @@ -1756,6 +1756,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static 
__always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; @@ -1801,6 +1804,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -1818,6 +1824,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) /* conditionally throttle active cfs_rq's from put_prev_entity() */ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) return; ^ permalink raw reply related [flat|nested] 15+ messages in thread
* [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() 2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner @ 2011-11-08 4:26 ` Paul Turner 2011-11-18 23:41 ` [tip:sched/core] sched: Fix " tip-bot for Paul Turner 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner 2 siblings, 1 reply; 15+ messages in thread From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw) To: linux-kernel, Peter Zijlstra, Ingo Molnar [-- Attachment #1: sched-bwc-fix_return_cfs_rq_runtime.patch --] [-- Type: text/plain, Size: 679 bytes --] In return_cfs_rq_runtime() we want to return bandwidth when there are no remaining tasks, not "return" when this is the case. Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1746,7 +1746,7 @@ static __always_inline void return_cfs_r if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); ^ permalink raw reply [flat|nested] 15+ messages in thread
* [tip:sched/core] sched: Fix buglet in return_cfs_rq_runtime() 2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner @ 2011-11-18 23:41 ` tip-bot for Paul Turner 0 siblings, 0 replies; 15+ messages in thread From: tip-bot for Paul Turner @ 2011-11-18 23:41 UTC (permalink / raw) To: linux-tip-commits Cc: linux-kernel, hpa, mingo, a.p.zijlstra, pjt, tglx, mingo Commit-ID: fccfdc6f0d8c83c854eeb6d93aa158f0e551bd49 Gitweb: http://git.kernel.org/tip/fccfdc6f0d8c83c854eeb6d93aa158f0e551bd49 Author: Paul Turner <pjt@google.com> AuthorDate: Mon, 7 Nov 2011 20:26:34 -0800 Committer: Ingo Molnar <mingo@elte.hu> CommitDate: Wed, 16 Nov 2011 08:43:45 +0100 sched: Fix buglet in return_cfs_rq_runtime() In return_cfs_rq_runtime() we want to return bandwidth when there are no remaining tasks, not "return" when this is the case. Signed-off-by: Paul Turner <pjt@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20111108042736.623812423@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- kernel/sched_fair.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ba0e1f4..a78ed27 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1756,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); ^ permalink raw reply related [flat|nested] 15+ messages in thread
* [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner 2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner @ 2011-11-08 4:26 ` Paul Turner 2011-11-10 2:28 ` Paul Turner 2011-11-10 2:30 ` Paul Turner 2 siblings, 2 replies; 15+ messages in thread From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw) To: linux-kernel, Peter Zijlstra, Ingo Molnar; +Cc: Ben Segall [-- Attachment #1: sched-bwc-fix_nr_running.patch --] [-- Type: text/plain, Size: 8571 bytes --] sched: update task accounting on throttle so that idle_balance() will trigger Since throttling occurs in the put_prev_task() path we do not get to observe this delta against nr_running when making the decision to idle_balance(). Fix this by first enumerating cfs_rq throttle states so that we can distinguish throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, rather than delaying until put_prev_task. This allows schedule() to call idle_balance when we go idle due to throttling. 
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on a 16 core system: baseline: Average CPU Idle percentage 13.9667% +patch: Average CPU Idle percentage 3.53333% [1]: https://lkml.org/lkml/2011/9/15/261 Signed-off-by: Ben Segall <bsegall@google.com> Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 24 ++++++++---- kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 87 insertions(+), 38 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -269,6 +269,13 @@ struct cfs_bandwidth { #endif }; +enum runtime_state { + RUNTIME_UNLIMITED, + RUNTIME_AVAILABLE, + RUNTIME_THROTTLING, + RUNTIME_THROTTLED +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -402,12 +409,12 @@ struct cfs_rq { unsigned long load_contribution; #endif #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; + enum runtime_state runtime_state; u64 runtime_expires; s64 runtime_remaining; u64 throttled_timestamp; - int throttled, throttle_count; + int throttle_count; struct list_head throttled_list; #endif #endif @@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - cfs_rq->runtime_enabled = 0; + cfs_rq->runtime_state = RUNTIME_UNLIMITED; INIT_LIST_HEAD(&cfs_rq->throttled_list); } @@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s for_each_leaf_cfs_rq(rq, cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) continue; /* @@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); + + if (runtime_enabled) + 
cfs_rq->runtime_state = RUNTIME_AVAILABLE; + else + cfs_rq->runtime_state = RUNTIME_UNLIMITED; + cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } out_unlock: Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct } } +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling) +{ + struct sched_entity *se; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (!se->on_rq) + break; + + qcfs_rq->h_nr_running -= nr_throttling; + + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) + break; + } + + if (!se) + rq_of(cfs_rq)->nr_running -= nr_throttling; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + if (assign_cfs_rq_runtime(cfs_rq)) + return; + + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || + cfs_rq->runtime_state == RUNTIME_THROTTLING) + return; + + resched_task(rq_of(cfs_rq)->curr); + + /* + * Remove us from nr_running/h_nr_running so + * that idle_balance gets called if necessary + */ + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_THROTTLING; +} + +static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && + cfs_rq->runtime_state != RUNTIME_UNLIMITED; } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) return; __account_cfs_rq_runtime(cfs_rq, 
delta_exec); @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_bandwidth_used() && + (cfs_rq->runtime_state == RUNTIME_THROTTLED || + cfs_rq->runtime_state == RUNTIME_THROTTLING); } /* check whether cfs_rq, or any parent, is throttled */ @@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); if (qcfs_rq->load.weight) - dequeue = 0; + break; } - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; + cfs_rq->runtime_state = RUNTIME_THROTTLED; cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); @@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs int enqueue = 1; long task_delta; + if (cfs_rq->runtime_state == RUNTIME_THROTTLING) { + /* do only the partial unthrottle */ + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + return; + } + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - cfs_rq->throttled = 0; + cfs_rq->runtime_state = RUNTIME_AVAILABLE; raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); @@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru static 
__always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr) return; /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->runtime_state == RUNTIME_THROTTLED) return; /* update runtime allocation */ @@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct if (!cfs_bandwidth_used()) return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING)) return; /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. + * it is possible for additional bandwidth to arrive between + * when we call resched_task for being out of runtime and we + * call put_prev_task, in which case reaccount the now running + * tasks */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); + if (unlikely(cfs_rq->runtime_remaining > 0)) { + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + } else { + throttle_cfs_rq(cfs_rq); + } } #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner @ 2011-11-10 2:28 ` Paul Turner 2011-11-10 2:30 ` Paul Turner 1 sibling, 0 replies; 15+ messages in thread From: Paul Turner @ 2011-11-10 2:28 UTC (permalink / raw) To: linux-kernel I mangled the subject/from headers on this one, fixed should be below. Thanks, - Paul --- sched: update task accounting on throttle so that idle_balance() will trigger From: Ben Segall <bsegall@google.com> Since throttling occurs in the put_prev_task() path we do not get to observe this delta against nr_running when making the decision to idle_balance(). Fix this by first enumerating cfs_rq throttle states so that we can distinguish throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, rather than delaying until put_prev_task. This allows schedule() to call idle_balance when we go idle due to throttling. 
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on a 16 core system: baseline: Average CPU Idle percentage 13.9667% +patch: Average CPU Idle percentage 3.53333% [1]: https://lkml.org/lkml/2011/9/15/261 Signed-off-by: Ben Segall <bsegall@google.com> Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 24 ++++++++---- kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 87 insertions(+), 38 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -269,6 +269,13 @@ struct cfs_bandwidth { #endif }; +enum runtime_state { + RUNTIME_UNLIMITED, + RUNTIME_AVAILABLE, + RUNTIME_THROTTLING, + RUNTIME_THROTTLED +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -402,12 +409,12 @@ struct cfs_rq { unsigned long load_contribution; #endif #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; + enum runtime_state runtime_state; u64 runtime_expires; s64 runtime_remaining; u64 throttled_timestamp; - int throttled, throttle_count; + int throttle_count; struct list_head throttled_list; #endif #endif @@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - cfs_rq->runtime_enabled = 0; + cfs_rq->runtime_state = RUNTIME_UNLIMITED; INIT_LIST_HEAD(&cfs_rq->throttled_list); } @@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s for_each_leaf_cfs_rq(rq, cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) continue; /* @@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); + + if (runtime_enabled) + 
cfs_rq->runtime_state = RUNTIME_AVAILABLE; + else + cfs_rq->runtime_state = RUNTIME_UNLIMITED; + cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } out_unlock: Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct } } +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling) +{ + struct sched_entity *se; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (!se->on_rq) + break; + + qcfs_rq->h_nr_running -= nr_throttling; + + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) + break; + } + + if (!se) + rq_of(cfs_rq)->nr_running -= nr_throttling; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + if (assign_cfs_rq_runtime(cfs_rq)) + return; + + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || + cfs_rq->runtime_state == RUNTIME_THROTTLING) + return; + + resched_task(rq_of(cfs_rq)->curr); + + /* + * Remove us from nr_running/h_nr_running so + * that idle_balance gets called if necessary + */ + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_THROTTLING; +} + +static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && + cfs_rq->runtime_state != RUNTIME_UNLIMITED; } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) return; __account_cfs_rq_runtime(cfs_rq, 
delta_exec); @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_bandwidth_used() && + (cfs_rq->runtime_state == RUNTIME_THROTTLED || + cfs_rq->runtime_state == RUNTIME_THROTTLING); } /* check whether cfs_rq, or any parent, is throttled */ @@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); if (qcfs_rq->load.weight) - dequeue = 0; + break; } - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; + cfs_rq->runtime_state = RUNTIME_THROTTLED; cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); @@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs int enqueue = 1; long task_delta; + if (cfs_rq->runtime_state == RUNTIME_THROTTLING) { + /* do only the partial unthrottle */ + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + return; + } + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - cfs_rq->throttled = 0; + cfs_rq->runtime_state = RUNTIME_AVAILABLE; raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); @@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru static 
__always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr) return; /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->runtime_state == RUNTIME_THROTTLED) return; /* update runtime allocation */ @@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct if (!cfs_bandwidth_used()) return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING)) return; /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. + * it is possible for additional bandwidth to arrive between + * when we call resched_task for being out of runtime and we + * call put_prev_task, in which case reaccount the now running + * tasks */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); + if (unlikely(cfs_rq->runtime_remaining > 0)) { + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + } else { + throttle_cfs_rq(cfs_rq); + } } #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner 2011-11-10 2:28 ` Paul Turner @ 2011-11-10 2:30 ` Paul Turner 2011-11-14 10:03 ` Kamalesh Babulal 2011-11-14 12:01 ` Peter Zijlstra 1 sibling, 2 replies; 15+ messages in thread From: Paul Turner @ 2011-11-10 2:30 UTC (permalink / raw) Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Ben Segall I mangled the subject/from headers on this one, fixed should be below. Thanks, - Paul --- sched: update task accounting on throttle so that idle_balance() will trigger From: Ben Segall <bsegall@google.com> Since throttling occurs in the put_prev_task() path we do not get to observe this delta against nr_running when making the decision to idle_balance(). Fix this by first enumerating cfs_rq throttle states so that we can distinguish throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, rather than delaying until put_prev_task. This allows schedule() to call idle_balance when we go idle due to throttling. 
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on a 16 core system: baseline: Average CPU Idle percentage 13.9667% +patch: Average CPU Idle percentage 3.53333% [1]: https://lkml.org/lkml/2011/9/15/261 Signed-off-by: Ben Segall <bsegall@google.com> Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 24 ++++++++---- kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 87 insertions(+), 38 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -269,6 +269,13 @@ struct cfs_bandwidth { #endif }; +enum runtime_state { + RUNTIME_UNLIMITED, + RUNTIME_AVAILABLE, + RUNTIME_THROTTLING, + RUNTIME_THROTTLED +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -402,12 +409,12 @@ struct cfs_rq { unsigned long load_contribution; #endif #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; + enum runtime_state runtime_state; u64 runtime_expires; s64 runtime_remaining; u64 throttled_timestamp; - int throttled, throttle_count; + int throttle_count; struct list_head throttled_list; #endif #endif @@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - cfs_rq->runtime_enabled = 0; + cfs_rq->runtime_state = RUNTIME_UNLIMITED; INIT_LIST_HEAD(&cfs_rq->throttled_list); } @@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s for_each_leaf_cfs_rq(rq, cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) continue; /* @@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); + + if (runtime_enabled) + 
cfs_rq->runtime_state = RUNTIME_AVAILABLE; + else + cfs_rq->runtime_state = RUNTIME_UNLIMITED; + cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } out_unlock: Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct } } +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling) +{ + struct sched_entity *se; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (!se->on_rq) + break; + + qcfs_rq->h_nr_running -= nr_throttling; + + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) + break; + } + + if (!se) + rq_of(cfs_rq)->nr_running -= nr_throttling; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + if (assign_cfs_rq_runtime(cfs_rq)) + return; + + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || + cfs_rq->runtime_state == RUNTIME_THROTTLING) + return; + + resched_task(rq_of(cfs_rq)->curr); + + /* + * Remove us from nr_running/h_nr_running so + * that idle_balance gets called if necessary + */ + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_THROTTLING; +} + +static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && + cfs_rq->runtime_state != RUNTIME_UNLIMITED; } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) return; __account_cfs_rq_runtime(cfs_rq, 
delta_exec); @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_bandwidth_used() && + (cfs_rq->runtime_state == RUNTIME_THROTTLED || + cfs_rq->runtime_state == RUNTIME_THROTTLING); } /* check whether cfs_rq, or any parent, is throttled */ @@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); if (qcfs_rq->load.weight) - dequeue = 0; + break; } - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; + cfs_rq->runtime_state = RUNTIME_THROTTLED; cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); @@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs int enqueue = 1; long task_delta; + if (cfs_rq->runtime_state == RUNTIME_THROTTLING) { + /* do only the partial unthrottle */ + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + return; + } + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - cfs_rq->throttled = 0; + cfs_rq->runtime_state = RUNTIME_AVAILABLE; raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); @@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru static 
__always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr) return; /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->runtime_state == RUNTIME_THROTTLED) return; /* update runtime allocation */ @@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct if (!cfs_bandwidth_used()) return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING)) return; /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. + * it is possible for additional bandwidth to arrive between + * when we call resched_task for being out of runtime and we + * call put_prev_task, in which case reaccount the now running + * tasks */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); + if (unlikely(cfs_rq->runtime_remaining > 0)) { + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + } else { + throttle_cfs_rq(cfs_rq); + } } #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-10 2:30 ` Paul Turner @ 2011-11-14 10:03 ` Kamalesh Babulal 2011-11-14 12:01 ` Peter Zijlstra 1 sibling, 0 replies; 15+ messages in thread From: Kamalesh Babulal @ 2011-11-14 10:03 UTC (permalink / raw) To: Paul Turner Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Ben Segall, Srivatsa Vaddagiri [snip] > Since throttling occurs in the put_prev_task() path we do not get to observe > this delta against nr_running when making the decision to idle_balance(). > > Fix this by first enumerating cfs_rq throttle states so that we can distinguish > throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task > from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, > rather than delaying until put_prev_task. > > This allows schedule() to call idle_balance when we go idle due to throttling. > > Using Kamalesh's nested-cgroup test case[1] we see the following improvement on > a 16 core system: > baseline: Average CPU Idle percentage 13.9667% > +patch: Average CPU Idle percentage 3.53333% > [1]: https://lkml.org/lkml/2011/9/15/261 > > Signed-off-by: Ben Segall <bsegall@google.com> > Signed-off-by: Paul Turner <pjt@google.com> Tested-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com> Thanks for the patch. I tested patches on the same test environment, over which the cpu idle time was reported first at https://lkml.org/lkml/2011/6/7/352. In brief, tests were run on 2 socket quad core machine with three level of nested cgroups hierarchy and five cgroups created below the third level. Each of the five cgroups, having 2,2,4,8,16 while1 or cpu-matrix (https://lkml.org/lkml/2011/11/4/107) tasks attached to them respectively. 
[1] CFS Bandwidth tweaks are the patches posted by Paul Turner (https://lkml.org/lkml/2011/11/7/603) [2] nohz idle balance RFC patch by Srivatsa Vaddagiri (https://lkml.org/lkml/2011/11/2/117) While running the cpu-matrix benchmark with the patches, there was an improvement around ~50 to 55% and additional ~3% benefit in idle time with nohz idle balance patch. With while1 loop the improvement was around ~36 to 40% over tip and an additional benefit of ~4 to 5% was seen with nohz idle balance patch. (1) cpu-matrix benchmark with nohz=on ---------------------------------- Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 Average CPU Idle percentage 4.1% Average CPU Idle percentage 2.36667% Average CPU Idle percentage 2.23333% Bandwidth shared with remaining non-Idle 95.9% Bandwidth shared with remaining non-Idle 97.63333% Bandwidth shared with remaining non-Idle 97.76667% 2 Average CPU Idle percentage 4.23% Average CPU Idle percentage 2.3% Average CPU Idle percentage 2.16667% Bandwidth shared with remaining non-Idle 95.77% Bandwidth shared with remaining non-Idle 97.7% Bandwidth shared with remaining non-Idle 97.83333% (2) cpu-matrix benchmark with nohz=off ----------------------------------- Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 Average CPU Idle percentage 4.53333% Average CPU Idle percentage 2.43333% Average CPU Idle percentage 2.36667% Bandwidth shared with remaining non-Idle 95.46667% Bandwidth shared with remaining non-Idle 97.56667% Bandwidth shared with remaining non-Idle 97.63333% 2 Average CPU Idle percentage 4.4% Average CPU Idle 
percentage 2.36667% Average CPU Idle percentage 2.4% Bandwidth shared with remaining non-Idle 95.6% Bandwidth shared with remaining non-Idle 97.63333% Bandwidth shared with remaining non-Idle 97.6% (3) while1 loop with nohz=on ------------------------- Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 Average CPU Idle percentage 6.26667% Average CPU Idle percentage 2.5% Average CPU Idle percentage 2.23333% Bandwidth shared with remaining non-Idle 93.73333% Bandwidth shared with remaining non-Idle 97.5% Bandwidth shared with remaining non-Idle 97.76667% 2 Average CPU Idle percentage 6.73333% Average CPU Idle percentage 2.46667% Average CPU Idle percentage 2.13333% Bandwidth shared with remaining non-Idle 93.26667% Bandwidth shared with remaining non-Idle 97.53333% Bandwidth shared with remaining non-Idle 97.86667% (4) while1 loop with nohz=off -------------------------- Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 Average CPU Idle percentage 3.6% Average CPU Idle percentage 2.4% Average CPU Idle percentage 2.43333% Bandwidth shared with remaining non-Idle 96.4% Bandwidth shared with remaining non-Idle 97.6% Bandwidth shared with remaining non-Idle 97.56667% 2 Average CPU Idle percentage 3.46667% Average CPU Idle percentage 2.33333% Average CPU Idle percentage 2.4% Bandwidth shared with remaining non-Idle 96.53333% Bandwidth shared with remaining non-Idle 97.66667% Bandwidth shared with remaining non-Idle 97.6% each cpu-matrix benchmark task was run as # perf sched cpu-matrix -s1k -i 1000 -p100 Kamalesh. 
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-10 2:30 ` Paul Turner 2011-11-14 10:03 ` Kamalesh Babulal @ 2011-11-14 12:01 ` Peter Zijlstra 2011-11-15 21:14 ` Benjamin Segall 1 sibling, 1 reply; 15+ messages in thread From: Peter Zijlstra @ 2011-11-14 12:01 UTC (permalink / raw) To: Paul Turner; +Cc: linux-kernel, Ingo Molnar, Ben Segall On Wed, 2011-11-09 at 18:30 -0800, Paul Turner wrote: > > sched: update task accounting on throttle so that idle_balance() will trigger > From: Ben Segall <bsegall@google.com> > > Since throttling occurs in the put_prev_task() path we do not get to observe > this delta against nr_running when making the decision to idle_balance(). > > Fix this by first enumerating cfs_rq throttle states so that we can distinguish > throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task > from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, > rather than delaying until put_prev_task. > > This allows schedule() to call idle_balance when we go idle due to throttling. > > Using Kamalesh's nested-cgroup test case[1] we see the following improvement on > a 16 core system: > baseline: Average CPU Idle percentage 13.9667% > +patch: Average CPU Idle percentage 3.53333% > [1]: https://lkml.org/lkml/2011/9/15/261 > > Signed-off-by: Ben Segall <bsegall@google.com> > Signed-off-by: Paul Turner <pjt@google.com> I really don't like this patch... There's something wrong about decoupling the dequeue from nr_running accounting. That said, I haven't got a bright idea either.. anyway, I think the patch is somewhat too big for 3.2 at this point. 
> --- > kernel/sched.c | 24 ++++++++---- > kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- > 2 files changed, 87 insertions(+), 38 deletions(-) > > Index: tip/kernel/sched.c > =================================================================== > --- tip.orig/kernel/sched.c > +++ tip/kernel/sched.c > @@ -269,6 +269,13 @@ struct cfs_bandwidth { > #endif > }; > > +enum runtime_state { > + RUNTIME_UNLIMITED, > + RUNTIME_AVAILABLE, > + RUNTIME_THROTTLING, > + RUNTIME_THROTTLED > +}; What's the difference between throttling and throttled? Throttling is between actually getting throttled and put_prev_task() getting called? This all wants a comment. > +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling) > +{ > + struct sched_entity *se; > + > + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; > + > + for_each_sched_entity(se) { > + struct cfs_rq *qcfs_rq = cfs_rq_of(se); > + if (!se->on_rq) > + break; > + > + qcfs_rq->h_nr_running -= nr_throttling; > + > + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) > + break; > + } > + > + if (!se) > + rq_of(cfs_rq)->nr_running -= nr_throttling; > +} Since you'll end up calling this stuff with a negative nr_throttling, please use += to avoid the double negative brain twist. > static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, > unsigned long delta_exec) > { > @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str > * if we're unable to extend our runtime we resched so that the active > * hierarchy can be throttled > */ > - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) > - resched_task(rq_of(cfs_rq)->curr); > + if (assign_cfs_rq_runtime(cfs_rq)) > + return; > + > + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || > + cfs_rq->runtime_state == RUNTIME_THROTTLING) > + return; How exactly can we get here if we're throttling already? 
> + resched_task(rq_of(cfs_rq)->curr); > + > + /* > + * Remove us from nr_running/h_nr_running so > + * that idle_balance gets called if necessary > + */ > + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); > + cfs_rq->runtime_state = RUNTIME_THROTTLING; > +} > @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_ > > static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) > { > - return cfs_bandwidth_used() && cfs_rq->throttled; > + return cfs_bandwidth_used() && > + (cfs_rq->runtime_state == RUNTIME_THROTTLED || > + cfs_rq->runtime_state == RUNTIME_THROTTLING); > } >= THROTTLING saves a test. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-14 12:01 ` Peter Zijlstra @ 2011-11-15 21:14 ` Benjamin Segall 0 siblings, 0 replies; 15+ messages in thread From: Benjamin Segall @ 2011-11-15 21:14 UTC (permalink / raw) To: Peter Zijlstra; +Cc: Paul Turner, linux-kernel, Ingo Molnar Peter Zijlstra <a.p.zijlstra@chello.nl> writes: > On Wed, 2011-11-09 at 18:30 -0800, Paul Turner wrote: >> >> sched: update task accounting on throttle so that idle_balance() will trigger >> From: Ben Segall <bsegall@google.com> >> >> Since throttling occurs in the put_prev_task() path we do not get to observe >> this delta against nr_running when making the decision to idle_balance(). >> >> Fix this by first enumerating cfs_rq throttle states so that we can distinguish >> throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task >> from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, >> rather than delaying until put_prev_task. >> >> This allows schedule() to call idle_balance when we go idle due to throttling. >> >> Using Kamalesh's nested-cgroup test case[1] we see the following improvement on >> a 16 core system: >> baseline: Average CPU Idle percentage 13.9667% >> +patch: Average CPU Idle percentage 3.53333% >> [1]: https://lkml.org/lkml/2011/9/15/261 >> >> Signed-off-by: Ben Segall <bsegall@google.com> >> Signed-off-by: Paul Turner <pjt@google.com> > > I really don't like this patch... There's something wrong about > decoupling the dequeue from nr_running accounting. > > That said, I haven't got a bright idea either.. anyway, I think the > patch is somewhat too big for 3.2 at this point. Since we can't really throttle until schedule or so, the simplest solution would be to move it from put_prev_task to a prev_schedule hook so that it could be before idle_balance, but that seems a bit heavy just for cfs bandwidth. 
> >> --- >> kernel/sched.c | 24 ++++++++---- >> kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- >> 2 files changed, 87 insertions(+), 38 deletions(-) >> >> Index: tip/kernel/sched.c >> =================================================================== >> --- tip.orig/kernel/sched.c >> +++ tip/kernel/sched.c >> @@ -269,6 +269,13 @@ struct cfs_bandwidth { >> #endif >> }; >> >> +enum runtime_state { >> + RUNTIME_UNLIMITED, >> + RUNTIME_AVAILABLE, >> + RUNTIME_THROTTLING, >> + RUNTIME_THROTTLED >> +}; > > What's the difference between throttling and throttled? Throttling is > between actually getting throttled and put_prev_task() getting called? > This all wants a comment. Yes, although THROTTLING->AVAILABLE (or even ->UNLIMITED) is also possible. >> @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str >> * if we're unable to extend our runtime we resched so that the active >> * hierarchy can be throttled >> */ >> - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) >> - resched_task(rq_of(cfs_rq)->curr); >> + if (assign_cfs_rq_runtime(cfs_rq)) >> + return; >> + >> + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || >> + cfs_rq->runtime_state == RUNTIME_THROTTLING) >> + return; > > How exactly can we get here if we're throttling already? A wakeup in the same to-be-throttled cfs_rq before need_resched is handled. The simplest trigger would be three tasks on the same cfs_rq, only one of which is running. If it tries to wake both of the other tasks just as it runs out of runtime, we would hit this. And the patch updated for these and your other comments: -- 8< -- From: Benjamin Segall <bsegall@google.com> Subject: sched: update task accounting on throttle so that idle_balance() will trigger Since throttling occurs in the put_prev_task() path we do not get to observe this delta against nr_running when making the decision to idle_balance(). 
Fix this by first enumerating cfs_rq throttle states so that we can distinguish throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, rather than delaying until put_prev_task. This allows schedule() to call idle_balance when we go idle due to throttling. Using Kamalesh's nested-cgroup test case[1] we see the following improvement on a 16 core system: baseline: Average CPU Idle percentage 13.9667% +patch: Average CPU Idle percentage 3.53333% [1]: https://lkml.org/lkml/2011/9/15/261 Signed-off-by: Ben Segall <bsegall@google.com> Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 28 +++++++++++--- kernel/sched_fair.c | 100 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 90 insertions(+), 38 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7c1b2f5..0af1680 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -269,6 +269,17 @@ struct cfs_bandwidth { #endif }; +/* + * THROTTLING tasks have been removed from nr_running, but have not + * yet been dequeued. THROTTLED tasks have been entirely dequeued. 
+ */ +enum runtime_state { + RUNTIME_UNLIMITED, + RUNTIME_AVAILABLE, + RUNTIME_THROTTLING, + RUNTIME_THROTTLED +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -402,12 +413,12 @@ struct cfs_rq { unsigned long load_contribution; #endif #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; + enum runtime_state runtime_state; u64 runtime_expires; s64 runtime_remaining; u64 throttled_timestamp; - int throttled, throttle_count; + int throttle_count; struct list_head throttled_list; #endif #endif @@ -470,7 +481,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - cfs_rq->runtime_enabled = 0; + cfs_rq->runtime_state = RUNTIME_UNLIMITED; INIT_LIST_HEAD(&cfs_rq->throttled_list); } @@ -6368,7 +6379,7 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq) for_each_leaf_cfs_rq(rq, cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) continue; /* @@ -9263,11 +9274,14 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); + + if (runtime_enabled) + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + else + cfs_rq->runtime_state = RUNTIME_UNLIMITED; + cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } out_unlock: diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3539569..7dc5c19 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) } } +static void account_nr_throttling(struct cfs_rq *cfs_rq, long delta) +{ + struct sched_entity *se; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + 
if (!se->on_rq) + break; + + qcfs_rq->h_nr_running += delta; + + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) + break; + } + + if (!se) + rq_of(cfs_rq)->nr_running += delta; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + if (assign_cfs_rq_runtime(cfs_rq)) + return; + + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || + cfs_rq->runtime_state == RUNTIME_THROTTLING) + return; + + resched_task(rq_of(cfs_rq)->curr); + + /* + * Remove us from nr_running/h_nr_running so + * that idle_balance gets called if necessary + */ + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_THROTTLING; +} + +static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && + cfs_rq->runtime_state != RUNTIME_UNLIMITED; } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) return; __account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1416,7 +1456,8 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_bandwidth_used() && + cfs_rq->runtime_state >= RUNTIME_THROTTLING; } /* check whether cfs_rq, or any parent, is throttled */ @@ -1483,7 +1524,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ 
-1492,25 +1532,19 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); if (qcfs_rq->load.weight) - dequeue = 0; + break; } - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; + cfs_rq->runtime_state = RUNTIME_THROTTLED; cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); @@ -1525,9 +1559,15 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) int enqueue = 1; long task_delta; + if (cfs_rq->runtime_state == RUNTIME_THROTTLING) { + /* do only the partial unthrottle */ + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + return; + } + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - cfs_rq->throttled = 0; + cfs_rq->runtime_state = RUNTIME_AVAILABLE; raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); @@ -1743,10 +1783,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -1791,15 +1828,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if 
(!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr) return; /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->runtime_state == RUNTIME_THROTTLED) return; /* update runtime allocation */ @@ -1814,17 +1848,21 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING)) return; /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. + * it is possible for additional bandwidth to arrive between + * when we call resched_task for being out of runtime and we + * call put_prev_task, in which case reaccount the now running + * tasks */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); + if (unlikely(cfs_rq->runtime_remaining > 0)) { + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + } else { + throttle_cfs_rq(cfs_rq); + } } #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, ^ permalink raw reply related [flat|nested] 15+ messages in thread
end of thread, other threads:[~2011-11-18 23:42 UTC | newest] Thread overview: 15+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner 2011-11-08 9:26 ` Peter Zijlstra 2011-11-08 9:28 ` Peter Zijlstra 2011-11-08 9:29 ` Peter Zijlstra 2011-11-11 4:23 ` Paul Turner 2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner 2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner 2011-11-18 23:41 ` [tip:sched/core] sched: Fix " tip-bot for Paul Turner 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner 2011-11-10 2:28 ` Paul Turner 2011-11-10 2:30 ` Paul Turner 2011-11-14 10:03 ` Kamalesh Babulal 2011-11-14 12:01 ` Peter Zijlstra 2011-11-15 21:14 ` Benjamin Segall
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox