* [patch 0/3] sched: bandwidth-control tweaks for v3.2
@ 2011-11-08 4:26 Paul Turner
2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw)
To: linux-kernel, Peter Zijlstra, Ingo Molnar
Hi,
Please find attached a few minor tweaks for bandwidth control that are worth
considering for 3.2.
The first two are a performance improvement for the !enabled case and a buglet
fix, while the last makes idle_balance() work with throttling.
Thanks,
- Paul
^ permalink raw reply [flat|nested] 15+ messages in thread* [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner @ 2011-11-08 4:26 ` Paul Turner 2011-11-08 9:26 ` Peter Zijlstra ` (3 more replies) 2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner 2 siblings, 4 replies; 15+ messages in thread From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw) To: linux-kernel, Peter Zijlstra, Ingo Molnar [-- Attachment #1: sched-bwc-add_jump_labels.patch --] [-- Type: text/plain, Size: 4859 bytes --] Now that the linkage of jump-labels has been fixed they show a measurable improvement in overhead for the enabled-but-unused case. Workload is: 'taskset -c 0 perf stat --repeat 50 -e instructions,cycles,branches bash -c "for ((i=0;i<5;i++)); do $(dirname $0)/pipe-test 20000; done"' instructions cycles branches ------------------------------------------------------------------------- Intel Westmere base 806611770 745895590 146765378 +jumplabel 803090165 (-0.44) 713381840 (-4.36) 144561130 AMD Barcelona base 824657415 740055589 148855354 +jumplabel 821056910 (-0.44) 737558389 (-0.34) 146635229 Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 17 +++++++++++++---- 2 files changed, 44 insertions(+), 6 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -71,6 +71,7 @@ #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> +#include <linux/jump_label.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -502,7 +503,32 @@ static void destroy_cfs_bandwidth(struct hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } -#else + +#ifdef 
HAVE_JUMP_LABEL +static struct jump_label_key __cfs_bandwidth_used; + +static inline bool cfs_bandwidth_used(void) +{ + return static_branch(&__cfs_bandwidth_used); +} + +static void account_cfs_bandwidth_used(int enabled, int was_enabled) +{ + /* only need to count groups transitioning between enabled/!enabled */ + if (enabled && !was_enabled) + jump_label_inc(&__cfs_bandwidth_used); + else if (!enabled && was_enabled) + jump_label_dec(&__cfs_bandwidth_used); +} +#else /* !HAVE_JUMP_LABEL */ +/* static_branch doesn't help unless supported */ +static int cfs_bandwidth_used(void) +{ + return 1; +} +static void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +#endif /* HAVE_JUMP_LABEL */ +#else /* !CONFIG_CFS_BANDWIDTH */ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} @@ -9188,7 +9214,7 @@ static int __cfs_schedulable(struct task static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i, ret = 0, runtime_enabled; + int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); if (tg == &root_task_group) @@ -9216,6 +9242,9 @@ static int tg_set_cfs_bandwidth(struct t goto out_unlock; runtime_enabled = quota != RUNTIME_INF; + runtime_was_enabled = cfs_b->quota != RUNTIME_INF; + account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1408,7 +1408,7 @@ static void __account_cfs_rq_runtime(str static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_rq->runtime_enabled) + if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; 
__account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1416,13 +1416,13 @@ static __always_inline void account_cfs_ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_rq->throttled; + return cfs_bandwidth_used() && cfs_rq->throttled; } /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { - return cfs_rq->throttle_count; + return cfs_bandwidth_used() && cfs_rq->throttle_count; } /* @@ -1743,6 +1743,9 @@ static void __return_cfs_rq_runtime(stru static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) return; @@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc /* conditionally throttle active cfs_rq's from put_prev_entity() */ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) return; ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner @ 2011-11-08 9:26 ` Peter Zijlstra 2011-11-08 9:28 ` Peter Zijlstra ` (2 subsequent siblings) 3 siblings, 0 replies; 15+ messages in thread From: Peter Zijlstra @ 2011-11-08 9:26 UTC (permalink / raw) To: Paul Turner; +Cc: linux-kernel, Ingo Molnar On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote: > +static void account_cfs_bandwidth_used(int enabled, int was_enabled) > +{ > + /* only need to count groups transitioning between enabled/!enabled */ > + if (enabled && !was_enabled) > + jump_label_inc(&__cfs_bandwidth_used); > + else if (!enabled && was_enabled) > + jump_label_dec(&__cfs_bandwidth_used); > +} Can't you use jump_label_enabled(&__cfs_bandwidth_used) to was_enabled ? ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner 2011-11-08 9:26 ` Peter Zijlstra @ 2011-11-08 9:28 ` Peter Zijlstra 2011-11-08 9:29 ` Peter Zijlstra 2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner 3 siblings, 0 replies; 15+ messages in thread From: Peter Zijlstra @ 2011-11-08 9:28 UTC (permalink / raw) To: Paul Turner; +Cc: linux-kernel, Ingo Molnar On Tue, 2011-11-08 at 10:26 +0100, Peter Zijlstra wrote: > On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote: > > +static void account_cfs_bandwidth_used(int enabled, int was_enabled) > > +{ > > + /* only need to count groups transitioning between enabled/!enabled */ > > + if (enabled && !was_enabled) > > + jump_label_inc(&__cfs_bandwidth_used); > > + else if (!enabled && was_enabled) > > + jump_label_dec(&__cfs_bandwidth_used); > > +} > > Can't you use jump_label_enabled(&__cfs_bandwidth_used) to was_enabled ? n/m I'm not awake yet.. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner 2011-11-08 9:26 ` Peter Zijlstra 2011-11-08 9:28 ` Peter Zijlstra @ 2011-11-08 9:29 ` Peter Zijlstra 2011-11-11 4:23 ` Paul Turner 2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner 3 siblings, 1 reply; 15+ messages in thread From: Peter Zijlstra @ 2011-11-08 9:29 UTC (permalink / raw) To: Paul Turner; +Cc: linux-kernel, Ingo Molnar On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote: > @@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str > */ > static void check_enqueue_throttle(struct cfs_rq *cfs_rq) > { > + if (!cfs_bandwidth_used()) > + return; > + > /* an active group must be handled by the update_curr()->put() path */ > if (!cfs_rq->runtime_enabled || cfs_rq->curr) > return; > @@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc > /* conditionally throttle active cfs_rq's from put_prev_entity() */ > static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) > { > + if (!cfs_bandwidth_used()) > + return; > + > if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) > return; > does it matter if you pull this out into an inline function like: static __always_inline void check_enqueue_throttle(struct cfs_rq *cfs_rq) { if (cfs_bandwidth_used()) __check_enqueue_throttle(cfs_rq); } That would avoid the superfluous function call as well. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 9:29 ` Peter Zijlstra @ 2011-11-11 4:23 ` Paul Turner 0 siblings, 0 replies; 15+ messages in thread From: Paul Turner @ 2011-11-11 4:23 UTC (permalink / raw) To: linux-kernel On 11/08/2011 01:29 AM, Peter Zijlstra wrote: > On Mon, 2011-11-07 at 20:26 -0800, Paul Turner wrote: >> @@ -1788,6 +1791,9 @@ static void do_sched_cfs_slack_timer(str >> */ >> static void check_enqueue_throttle(struct cfs_rq *cfs_rq) >> { >> + if (!cfs_bandwidth_used()) >> + return; >> + >> /* an active group must be handled by the update_curr()->put() path */ >> if (!cfs_rq->runtime_enabled || cfs_rq->curr) >> return; >> @@ -1805,6 +1811,9 @@ static void check_enqueue_throttle(struc >> /* conditionally throttle active cfs_rq's from put_prev_entity() */ >> static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) >> { >> + if (!cfs_bandwidth_used()) >> + return; >> + >> if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining> 0)) >> return; >> > > does it matter if you pull this out into an inline function like: > > static __always_inline void check_enqueue_throttle(struct cfs_rq *cfs_rq) > { > if (cfs_bandwidth_used()) > __check_enqueue_throttle(cfs_rq); > } > > That would avoid the superfluous function call as well. This this is the only call-site it's being inlined already so there's no difference atm. That said, check_enqueue_throttle is sufficiently small we could just add __always_inline to force this to be the default behavior and avoid any surprises from this in the future. Let me know if you'd like me to re-post with this added. - Paul ^ permalink raw reply [flat|nested] 15+ messages in thread
* [tip:sched/core] sched: Use jump labels to reduce overhead when bandwidth control is inactive 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner ` (2 preceding siblings ...) 2011-11-08 9:29 ` Peter Zijlstra @ 2011-11-18 23:42 ` tip-bot for Paul Turner 3 siblings, 0 replies; 15+ messages in thread From: tip-bot for Paul Turner @ 2011-11-18 23:42 UTC (permalink / raw) To: linux-tip-commits Cc: linux-kernel, hpa, mingo, a.p.zijlstra, pjt, tglx, mingo Commit-ID: 56f570e512eeb5b412cb3a62234adc446a8eb32b Gitweb: http://git.kernel.org/tip/56f570e512eeb5b412cb3a62234adc446a8eb32b Author: Paul Turner <pjt@google.com> AuthorDate: Mon, 7 Nov 2011 20:26:33 -0800 Committer: Ingo Molnar <mingo@elte.hu> CommitDate: Wed, 16 Nov 2011 08:48:18 +0100 sched: Use jump labels to reduce overhead when bandwidth control is inactive Now that the linkage of jump-labels has been fixed they show a measurable improvement in overhead for the enabled-but-unused case. 
Workload is: 'taskset -c 0 perf stat --repeat 50 -e instructions,cycles,branches bash -c "for ((i=0;i<5;i++)); do $(dirname $0)/pipe-test 20000; done"' There's a speedup for all situations: instructions cycles branches ------------------------------------------------------------------------- Intel Westmere base 806611770 745895590 146765378 +jumplabel 803090165 (-0.44%) 713381840 (-4.36%) 144561130 AMD Barcelona base 824657415 740055589 148855354 +jumplabel 821056910 (-0.44%) 737558389 (-0.34%) 146635229 Signed-off-by: Paul Turner <pjt@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20111108042736.560831357@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- kernel/sched.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 15 ++++++++++++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d6b149c..d9d79a4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,6 +72,7 @@ #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> +#include <linux/jump_label.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -503,7 +504,32 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } -#else + +#ifdef HAVE_JUMP_LABEL +static struct jump_label_key __cfs_bandwidth_used; + +static inline bool cfs_bandwidth_used(void) +{ + return static_branch(&__cfs_bandwidth_used); +} + +static void account_cfs_bandwidth_used(int enabled, int was_enabled) +{ + /* only need to count groups transitioning between enabled/!enabled */ + if (enabled && !was_enabled) + jump_label_inc(&__cfs_bandwidth_used); + else if (!enabled && was_enabled) + jump_label_dec(&__cfs_bandwidth_used); +} +#else /* !HAVE_JUMP_LABEL */ +/* static_branch doesn't help unless supported */ +static int cfs_bandwidth_used(void) +{ + return 1; +} +static void account_cfs_bandwidth_used(int enabled, int 
was_enabled) {} +#endif /* HAVE_JUMP_LABEL */ +#else /* !CONFIG_CFS_BANDWIDTH */ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} @@ -9203,7 +9229,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i, ret = 0, runtime_enabled; + int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); if (tg == &root_task_group) @@ -9231,6 +9257,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) goto out_unlock; runtime_enabled = quota != RUNTIME_INF; + runtime_was_enabled = cfs_b->quota != RUNTIME_INF; + account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a78ed27..a608593 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1421,7 +1421,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_rq->runtime_enabled) + if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; __account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1429,13 +1429,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_rq->throttled; + return cfs_bandwidth_used() && cfs_rq->throttled; } /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { - return cfs_rq->throttle_count; + return cfs_bandwidth_used() && cfs_rq->throttle_count; } /* @@ -1756,6 +1756,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static 
__always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; @@ -1801,6 +1804,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -1818,6 +1824,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) /* conditionally throttle active cfs_rq's from put_prev_entity() */ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) return; ^ permalink raw reply related [flat|nested] 15+ messages in thread
* [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() 2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner @ 2011-11-08 4:26 ` Paul Turner 2011-11-18 23:41 ` [tip:sched/core] sched: Fix " tip-bot for Paul Turner 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner 2 siblings, 1 reply; 15+ messages in thread From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw) To: linux-kernel, Peter Zijlstra, Ingo Molnar [-- Attachment #1: sched-bwc-fix_return_cfs_rq_runtime.patch --] [-- Type: text/plain, Size: 679 bytes --] In return_cfs_rq_runtime() we want to return bandwidth when there are no remaining tasks, not "return" when this is the case. Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1746,7 +1746,7 @@ static __always_inline void return_cfs_r if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); ^ permalink raw reply [flat|nested] 15+ messages in thread
* [tip:sched/core] sched: Fix buglet in return_cfs_rq_runtime() 2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner @ 2011-11-18 23:41 ` tip-bot for Paul Turner 0 siblings, 0 replies; 15+ messages in thread From: tip-bot for Paul Turner @ 2011-11-18 23:41 UTC (permalink / raw) To: linux-tip-commits Cc: linux-kernel, hpa, mingo, a.p.zijlstra, pjt, tglx, mingo Commit-ID: fccfdc6f0d8c83c854eeb6d93aa158f0e551bd49 Gitweb: http://git.kernel.org/tip/fccfdc6f0d8c83c854eeb6d93aa158f0e551bd49 Author: Paul Turner <pjt@google.com> AuthorDate: Mon, 7 Nov 2011 20:26:34 -0800 Committer: Ingo Molnar <mingo@elte.hu> CommitDate: Wed, 16 Nov 2011 08:43:45 +0100 sched: Fix buglet in return_cfs_rq_runtime() In return_cfs_rq_runtime() we want to return bandwidth when there are no remaining tasks, not "return" when this is the case. Signed-off-by: Paul Turner <pjt@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20111108042736.623812423@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu> --- kernel/sched_fair.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ba0e1f4..a78ed27 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1756,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); ^ permalink raw reply related [flat|nested] 15+ messages in thread
* [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner 2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner @ 2011-11-08 4:26 ` Paul Turner 2011-11-10 2:28 ` Paul Turner 2011-11-10 2:30 ` Paul Turner 2 siblings, 2 replies; 15+ messages in thread From: Paul Turner @ 2011-11-08 4:26 UTC (permalink / raw) To: linux-kernel, Peter Zijlstra, Ingo Molnar; +Cc: Ben Segall [-- Attachment #1: sched-bwc-fix_nr_running.patch --] [-- Type: text/plain, Size: 8571 bytes --] sched: update task accounting on throttle so that idle_balance() will trigger Since throttling occurs in the put_prev_task() path we do not get to observe this delta against nr_running when making the decision to idle_balance(). Fix this by first enumerating cfs_rq throttle states so that we can distinguish throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, rather than delaying until put_prev_task. This allows schedule() to call idle_balance when we go idle due to throttling. 
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on a 16 core system: baseline: Average CPU Idle percentage 13.9667% +patch: Average CPU Idle percentage 3.53333% [1]: https://lkml.org/lkml/2011/9/15/261 Signed-off-by: Ben Segall <bsegall@google.com> Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 24 ++++++++---- kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 87 insertions(+), 38 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -269,6 +269,13 @@ struct cfs_bandwidth { #endif }; +enum runtime_state { + RUNTIME_UNLIMITED, + RUNTIME_AVAILABLE, + RUNTIME_THROTTLING, + RUNTIME_THROTTLED +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -402,12 +409,12 @@ struct cfs_rq { unsigned long load_contribution; #endif #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; + enum runtime_state runtime_state; u64 runtime_expires; s64 runtime_remaining; u64 throttled_timestamp; - int throttled, throttle_count; + int throttle_count; struct list_head throttled_list; #endif #endif @@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - cfs_rq->runtime_enabled = 0; + cfs_rq->runtime_state = RUNTIME_UNLIMITED; INIT_LIST_HEAD(&cfs_rq->throttled_list); } @@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s for_each_leaf_cfs_rq(rq, cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) continue; /* @@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); + + if (runtime_enabled) + 
cfs_rq->runtime_state = RUNTIME_AVAILABLE; + else + cfs_rq->runtime_state = RUNTIME_UNLIMITED; + cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } out_unlock: Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct } } +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling) +{ + struct sched_entity *se; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (!se->on_rq) + break; + + qcfs_rq->h_nr_running -= nr_throttling; + + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) + break; + } + + if (!se) + rq_of(cfs_rq)->nr_running -= nr_throttling; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + if (assign_cfs_rq_runtime(cfs_rq)) + return; + + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || + cfs_rq->runtime_state == RUNTIME_THROTTLING) + return; + + resched_task(rq_of(cfs_rq)->curr); + + /* + * Remove us from nr_running/h_nr_running so + * that idle_balance gets called if necessary + */ + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_THROTTLING; +} + +static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && + cfs_rq->runtime_state != RUNTIME_UNLIMITED; } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) return; __account_cfs_rq_runtime(cfs_rq, 
delta_exec); @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_bandwidth_used() && + (cfs_rq->runtime_state == RUNTIME_THROTTLED || + cfs_rq->runtime_state == RUNTIME_THROTTLING); } /* check whether cfs_rq, or any parent, is throttled */ @@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); if (qcfs_rq->load.weight) - dequeue = 0; + break; } - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; + cfs_rq->runtime_state = RUNTIME_THROTTLED; cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); @@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs int enqueue = 1; long task_delta; + if (cfs_rq->runtime_state == RUNTIME_THROTTLING) { + /* do only the partial unthrottle */ + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + return; + } + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - cfs_rq->throttled = 0; + cfs_rq->runtime_state = RUNTIME_AVAILABLE; raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); @@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru static 
__always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr) return; /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->runtime_state == RUNTIME_THROTTLED) return; /* update runtime allocation */ @@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct if (!cfs_bandwidth_used()) return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING)) return; /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. + * it is possible for additional bandwidth to arrive between + * when we call resched_task for being out of runtime and we + * call put_prev_task, in which case reaccount the now running + * tasks */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); + if (unlikely(cfs_rq->runtime_remaining > 0)) { + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + } else { + throttle_cfs_rq(cfs_rq); + } } #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner @ 2011-11-10 2:28 ` Paul Turner 2011-11-10 2:30 ` Paul Turner 1 sibling, 0 replies; 15+ messages in thread From: Paul Turner @ 2011-11-10 2:28 UTC (permalink / raw) To: linux-kernel I mangled the subject/from headers on this one, fixed should be below. Thanks, - Paul --- sched: update task accounting on throttle so that idle_balance() will trigger From: Ben Segall <bsegall@google.com> Since throttling occurs in the put_prev_task() path we do not get to observe this delta against nr_running when making the decision to idle_balance(). Fix this by first enumerating cfs_rq throttle states so that we can distinguish throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, rather than delaying until put_prev_task. This allows schedule() to call idle_balance when we go idle due to throttling. 
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on a 16 core system: baseline: Average CPU Idle percentage 13.9667% +patch: Average CPU Idle percentage 3.53333% [1]: https://lkml.org/lkml/2011/9/15/261 Signed-off-by: Ben Segall <bsegall@google.com> Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 24 ++++++++---- kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 87 insertions(+), 38 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -269,6 +269,13 @@ struct cfs_bandwidth { #endif }; +enum runtime_state { + RUNTIME_UNLIMITED, + RUNTIME_AVAILABLE, + RUNTIME_THROTTLING, + RUNTIME_THROTTLED +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -402,12 +409,12 @@ struct cfs_rq { unsigned long load_contribution; #endif #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; + enum runtime_state runtime_state; u64 runtime_expires; s64 runtime_remaining; u64 throttled_timestamp; - int throttled, throttle_count; + int throttle_count; struct list_head throttled_list; #endif #endif @@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - cfs_rq->runtime_enabled = 0; + cfs_rq->runtime_state = RUNTIME_UNLIMITED; INIT_LIST_HEAD(&cfs_rq->throttled_list); } @@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s for_each_leaf_cfs_rq(rq, cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) continue; /* @@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); + + if (runtime_enabled) + 
cfs_rq->runtime_state = RUNTIME_AVAILABLE; + else + cfs_rq->runtime_state = RUNTIME_UNLIMITED; + cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } out_unlock: Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct } } +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling) +{ + struct sched_entity *se; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (!se->on_rq) + break; + + qcfs_rq->h_nr_running -= nr_throttling; + + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) + break; + } + + if (!se) + rq_of(cfs_rq)->nr_running -= nr_throttling; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + if (assign_cfs_rq_runtime(cfs_rq)) + return; + + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || + cfs_rq->runtime_state == RUNTIME_THROTTLING) + return; + + resched_task(rq_of(cfs_rq)->curr); + + /* + * Remove us from nr_running/h_nr_running so + * that idle_balance gets called if necessary + */ + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_THROTTLING; +} + +static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && + cfs_rq->runtime_state != RUNTIME_UNLIMITED; } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) return; __account_cfs_rq_runtime(cfs_rq, 
delta_exec); @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_bandwidth_used() && + (cfs_rq->runtime_state == RUNTIME_THROTTLED || + cfs_rq->runtime_state == RUNTIME_THROTTLING); } /* check whether cfs_rq, or any parent, is throttled */ @@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); if (qcfs_rq->load.weight) - dequeue = 0; + break; } - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; + cfs_rq->runtime_state = RUNTIME_THROTTLED; cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); @@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs int enqueue = 1; long task_delta; + if (cfs_rq->runtime_state == RUNTIME_THROTTLING) { + /* do only the partial unthrottle */ + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + return; + } + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - cfs_rq->throttled = 0; + cfs_rq->runtime_state = RUNTIME_AVAILABLE; raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); @@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru static 
__always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr) return; /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->runtime_state == RUNTIME_THROTTLED) return; /* update runtime allocation */ @@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct if (!cfs_bandwidth_used()) return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING)) return; /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. + * it is possible for additional bandwidth to arrive between + * when we call resched_task for being out of runtime and we + * call put_prev_task, in which case reaccount the now running + * tasks */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); + if (unlikely(cfs_rq->runtime_remaining > 0)) { + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + } else { + throttle_cfs_rq(cfs_rq); + } } #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner 2011-11-10 2:28 ` Paul Turner @ 2011-11-10 2:30 ` Paul Turner 2011-11-14 10:03 ` Kamalesh Babulal 2011-11-14 12:01 ` Peter Zijlstra 1 sibling, 2 replies; 15+ messages in thread From: Paul Turner @ 2011-11-10 2:30 UTC (permalink / raw) Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Ben Segall I mangled the subject/from headers on this one, fixed should be below. Thanks, - Paul --- sched: update task accounting on throttle so that idle_balance() will trigger From: Ben Segall <bsegall@google.com> Since throttling occurs in the put_prev_task() path we do not get to observe this delta against nr_running when making the decision to idle_balance(). Fix this by first enumerating cfs_rq throttle states so that we can distinguish throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, rather than delaying until put_prev_task. This allows schedule() to call idle_balance when we go idle due to throttling. 
Using Kamalesh's nested-cgroup test case[1] we see the following improvement on a 16 core system: baseline: Average CPU Idle percentage 13.9667% +patch: Average CPU Idle percentage 3.53333% [1]: https://lkml.org/lkml/2011/9/15/261 Signed-off-by: Ben Segall <bsegall@google.com> Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 24 ++++++++---- kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 87 insertions(+), 38 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -269,6 +269,13 @@ struct cfs_bandwidth { #endif }; +enum runtime_state { + RUNTIME_UNLIMITED, + RUNTIME_AVAILABLE, + RUNTIME_THROTTLING, + RUNTIME_THROTTLED +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -402,12 +409,12 @@ struct cfs_rq { unsigned long load_contribution; #endif #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; + enum runtime_state runtime_state; u64 runtime_expires; s64 runtime_remaining; u64 throttled_timestamp; - int throttled, throttle_count; + int throttle_count; struct list_head throttled_list; #endif #endif @@ -470,7 +477,7 @@ static void init_cfs_bandwidth(struct cf static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - cfs_rq->runtime_enabled = 0; + cfs_rq->runtime_state = RUNTIME_UNLIMITED; INIT_LIST_HEAD(&cfs_rq->throttled_list); } @@ -6368,7 +6375,7 @@ static void unthrottle_offline_cfs_rqs(s for_each_leaf_cfs_rq(rq, cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) continue; /* @@ -9263,11 +9270,14 @@ static int tg_set_cfs_bandwidth(struct t struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); + + if (runtime_enabled) + 
cfs_rq->runtime_state = RUNTIME_AVAILABLE; + else + cfs_rq->runtime_state = RUNTIME_UNLIMITED; + cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } out_unlock: Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct } } +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling) +{ + struct sched_entity *se; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (!se->on_rq) + break; + + qcfs_rq->h_nr_running -= nr_throttling; + + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) + break; + } + + if (!se) + rq_of(cfs_rq)->nr_running -= nr_throttling; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + if (assign_cfs_rq_runtime(cfs_rq)) + return; + + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || + cfs_rq->runtime_state == RUNTIME_THROTTLING) + return; + + resched_task(rq_of(cfs_rq)->curr); + + /* + * Remove us from nr_running/h_nr_running so + * that idle_balance gets called if necessary + */ + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_THROTTLING; +} + +static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && + cfs_rq->runtime_state != RUNTIME_UNLIMITED; } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) return; __account_cfs_rq_runtime(cfs_rq, 
delta_exec); @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_bandwidth_used() && + (cfs_rq->runtime_state == RUNTIME_THROTTLED || + cfs_rq->runtime_state == RUNTIME_THROTTLING); } /* check whether cfs_rq, or any parent, is throttled */ @@ -1483,7 +1525,6 @@ static void throttle_cfs_rq(struct cfs_r struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -1492,25 +1533,19 @@ static void throttle_cfs_rq(struct cfs_r walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); if (qcfs_rq->load.weight) - dequeue = 0; + break; } - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; + cfs_rq->runtime_state = RUNTIME_THROTTLED; cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); @@ -1525,9 +1560,15 @@ static void unthrottle_cfs_rq(struct cfs int enqueue = 1; long task_delta; + if (cfs_rq->runtime_state == RUNTIME_THROTTLING) { + /* do only the partial unthrottle */ + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + return; + } + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - cfs_rq->throttled = 0; + cfs_rq->runtime_state = RUNTIME_AVAILABLE; raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); @@ -1743,10 +1784,7 @@ static void __return_cfs_rq_runtime(stru static 
__always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -1791,15 +1829,12 @@ static void do_sched_cfs_slack_timer(str */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr) return; /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->runtime_state == RUNTIME_THROTTLED) return; /* update runtime allocation */ @@ -1814,17 +1849,21 @@ static void check_cfs_rq_runtime(struct if (!cfs_bandwidth_used()) return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING)) return; /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. + * it is possible for additional bandwidth to arrive between + * when we call resched_task for being out of runtime and we + * call put_prev_task, in which case reaccount the now running + * tasks */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); + if (unlikely(cfs_rq->runtime_remaining > 0)) { + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + } else { + throttle_cfs_rq(cfs_rq); + } } #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-10 2:30 ` Paul Turner @ 2011-11-14 10:03 ` Kamalesh Babulal 2011-11-14 12:01 ` Peter Zijlstra 1 sibling, 0 replies; 15+ messages in thread From: Kamalesh Babulal @ 2011-11-14 10:03 UTC (permalink / raw) To: Paul Turner Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Ben Segall, Srivatsa Vaddagiri [snip] > Since throttling occurs in the put_prev_task() path we do not get to observe > this delta against nr_running when making the decision to idle_balance(). > > Fix this by first enumerating cfs_rq throttle states so that we can distinguish > throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task > from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, > rather than delaying until put_prev_task. > > This allows schedule() to call idle_balance when we go idle due to throttling. > > Using Kamalesh's nested-cgroup test case[1] we see the following improvement on > a 16 core system: > baseline: Average CPU Idle percentage 13.9667% > +patch: Average CPU Idle percentage 3.53333% > [1]: https://lkml.org/lkml/2011/9/15/261 > > Signed-off-by: Ben Segall <bsegall@google.com> > Signed-off-by: Paul Turner <pjt@google.com> Tested-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com> Thanks for the patch. I tested patches on the same test environment, over which the cpu idle time was reported first at https://lkml.org/lkml/2011/6/7/352. In brief, tests were run on 2 socket quad core machine with three level of nested cgroups hierarchy and five cgroups created below the third level. Each of the five cgroups, having 2,2,4,8,16 while1 or cpu-matrix (https://lkml.org/lkml/2011/11/4/107) tasks attached to them respectively. 
[1] CFS Bandwidth tweaks are the patches posted by Paul Turner (https://lkml.org/lkml/2011/11/7/603) [2] nohz idle balance RFC patch by Srivatsa Vaddagiri (https://lkml.org/lkml/2011/11/2/117) While running the cpu-matrix benchmark with the patches, there was an improvement around ~50 to 55% and additional ~3% benefit in idle time with nohz idle balance patch. With while1 loop the improvement was around ~36 to 40% over tip and an additional benefit of ~4 to 5% was seen with nohz idle balance patch. (1) cpu-matrix benchmark with nohz=on ---------------------------------- Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 Average CPU Idle percentage 4.1% Average CPU Idle percentage 2.36667% Average CPU Idle percentage 2.23333% Bandwidth shared with remaining non-Idle 95.9% Bandwidth shared with remaining non-Idle 97.63333% Bandwidth shared with remaining non-Idle 97.76667% 2 Average CPU Idle percentage 4.23% Average CPU Idle percentage 2.3% Average CPU Idle percentage 2.16667% Bandwidth shared with remaining non-Idle 95.77% Bandwidth shared with remaining non-Idle 97.7% Bandwidth shared with remaining non-Idle 97.83333% (2) cpu-matrix benchmark with nohz=off ----------------------------------- Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 Average CPU Idle percentage 4.53333% Average CPU Idle percentage 2.43333% Average CPU Idle percentage 2.36667% Bandwidth shared with remaining non-Idle 95.46667% Bandwidth shared with remaining non-Idle 97.56667% Bandwidth shared with remaining non-Idle 97.63333% 2 Average CPU Idle percentage 4.4% Average CPU Idle 
percentage 2.36667% Average CPU Idle percentage 2.4% Bandwidth shared with remaining non-Idle 95.6% Bandwidth shared with remaining non-Idle 97.63333% Bandwidth shared with remaining non-Idle 97.6% (3) while1 loop with nohz=on ------------------------- Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 Average CPU Idle percentage 6.26667% Average CPU Idle percentage 2.5% Average CPU Idle percentage 2.23333% Bandwidth shared with remaining non-Idle 93.73333% Bandwidth shared with remaining non-Idle 97.5% Bandwidth shared with remaining non-Idle 97.76667% 2 Average CPU Idle percentage 6.73333% Average CPU Idle percentage 2.46667% Average CPU Idle percentage 2.13333% Bandwidth shared with remaining non-Idle 93.26667% Bandwidth shared with remaining non-Idle 97.53333% Bandwidth shared with remaining non-Idle 97.86667% (4) while1 loop with nohz=off -------------------------- Run Base (tip) tip + CFS Bandwidth tweaks tip + CFS Bandwidth tweaks + nohz idle patch -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 1 Average CPU Idle percentage 3.6% Average CPU Idle percentage 2.4% Average CPU Idle percentage 2.43333% Bandwidth shared with remaining non-Idle 96.4% Bandwidth shared with remaining non-Idle 97.6% Bandwidth shared with remaining non-Idle 97.56667% 2 Average CPU Idle percentage 3.46667% Average CPU Idle percentage 2.33333% Average CPU Idle percentage 2.4% Bandwidth shared with remaining non-Idle 96.53333% Bandwidth shared with remaining non-Idle 97.66667% Bandwidth shared with remaining non-Idle 97.6% each cpu-matrix benchmark task was run as # perf sched cpu-matrix -s1k -i 1000 -p100 Kamalesh. 
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-10 2:30 ` Paul Turner 2011-11-14 10:03 ` Kamalesh Babulal @ 2011-11-14 12:01 ` Peter Zijlstra 2011-11-15 21:14 ` Benjamin Segall 1 sibling, 1 reply; 15+ messages in thread From: Peter Zijlstra @ 2011-11-14 12:01 UTC (permalink / raw) To: Paul Turner; +Cc: linux-kernel, Ingo Molnar, Ben Segall On Wed, 2011-11-09 at 18:30 -0800, Paul Turner wrote: > > sched: update task accounting on throttle so that idle_balance() will trigger > From: Ben Segall <bsegall@google.com> > > Since throttling occurs in the put_prev_task() path we do not get to observe > this delta against nr_running when making the decision to idle_balance(). > > Fix this by first enumerating cfs_rq throttle states so that we can distinguish > throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task > from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, > rather than delaying until put_prev_task. > > This allows schedule() to call idle_balance when we go idle due to throttling. > > Using Kamalesh's nested-cgroup test case[1] we see the following improvement on > a 16 core system: > baseline: Average CPU Idle percentage 13.9667% > +patch: Average CPU Idle percentage 3.53333% > [1]: https://lkml.org/lkml/2011/9/15/261 > > Signed-off-by: Ben Segall <bsegall@google.com> > Signed-off-by: Paul Turner <pjt@google.com> I really don't like this patch... There's something wrong about decoupling the dequeue from nr_running accounting. That said, I haven't got a bright idea either.. anyway, I think the patch is somewhat too big for 3.2 at this point. 
> --- > kernel/sched.c | 24 ++++++++---- > kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- > 2 files changed, 87 insertions(+), 38 deletions(-) > > Index: tip/kernel/sched.c > =================================================================== > --- tip.orig/kernel/sched.c > +++ tip/kernel/sched.c > @@ -269,6 +269,13 @@ struct cfs_bandwidth { > #endif > }; > > +enum runtime_state { > + RUNTIME_UNLIMITED, > + RUNTIME_AVAILABLE, > + RUNTIME_THROTTLING, > + RUNTIME_THROTTLED > +}; What's the difference between throttling and throttled? Throttling is between actually getting throttled and put_prev_task() getting called? This all wants a comment. > +static void account_nr_throttling(struct cfs_rq *cfs_rq, long nr_throttling) > +{ > + struct sched_entity *se; > + > + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; > + > + for_each_sched_entity(se) { > + struct cfs_rq *qcfs_rq = cfs_rq_of(se); > + if (!se->on_rq) > + break; > + > + qcfs_rq->h_nr_running -= nr_throttling; > + > + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) > + break; > + } > + > + if (!se) > + rq_of(cfs_rq)->nr_running -= nr_throttling; > +} Since you'll end up calling this stuff with a negative nr_throttling, please use += to avoid the double negative brain twist. > static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, > unsigned long delta_exec) > { > @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str > * if we're unable to extend our runtime we resched so that the active > * hierarchy can be throttled > */ > - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) > - resched_task(rq_of(cfs_rq)->curr); > + if (assign_cfs_rq_runtime(cfs_rq)) > + return; > + > + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || > + cfs_rq->runtime_state == RUNTIME_THROTTLING) > + return; How exactly can we get here if we're throttling already? 
> + resched_task(rq_of(cfs_rq)->curr); > + > + /* > + * Remove us from nr_running/h_nr_running so > + * that idle_balance gets called if necessary > + */ > + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); > + cfs_rq->runtime_state = RUNTIME_THROTTLING; > +} > @@ -1416,7 +1456,9 @@ static __always_inline void account_cfs_ > > static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) > { > - return cfs_bandwidth_used() && cfs_rq->throttled; > + return cfs_bandwidth_used() && > + (cfs_rq->runtime_state == RUNTIME_THROTTLED || > + cfs_rq->runtime_state == RUNTIME_THROTTLING); > } >= THROTTLING saves a test. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] From: Ben Segall <bsegall@google.com> 2011-11-14 12:01 ` Peter Zijlstra @ 2011-11-15 21:14 ` Benjamin Segall 0 siblings, 0 replies; 15+ messages in thread From: Benjamin Segall @ 2011-11-15 21:14 UTC (permalink / raw) To: Peter Zijlstra; +Cc: Paul Turner, linux-kernel, Ingo Molnar Peter Zijlstra <a.p.zijlstra@chello.nl> writes: > On Wed, 2011-11-09 at 18:30 -0800, Paul Turner wrote: >> >> sched: update task accounting on throttle so that idle_balance() will trigger >> From: Ben Segall <bsegall@google.com> >> >> Since throttling occurs in the put_prev_task() path we do not get to observe >> this delta against nr_running when making the decision to idle_balance(). >> >> Fix this by first enumerating cfs_rq throttle states so that we can distinguish >> throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task >> from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, >> rather than delaying until put_prev_task. >> >> This allows schedule() to call idle_balance when we go idle due to throttling. >> >> Using Kamalesh's nested-cgroup test case[1] we see the following improvement on >> a 16 core system: >> baseline: Average CPU Idle percentage 13.9667% >> +patch: Average CPU Idle percentage 3.53333% >> [1]: https://lkml.org/lkml/2011/9/15/261 >> >> Signed-off-by: Ben Segall <bsegall@google.com> >> Signed-off-by: Paul Turner <pjt@google.com> > > I really don't like this patch... There's something wrong about > decoupling the dequeue from nr_running accounting. > > That said, I haven't got a bright idea either.. anyway, I think the > patch is somewhat too big for 3.2 at this point. Since we can't really throttle until schedule or so, the simplest solution would be to move it from put_prev_task to a prev_schedule hook so that it could be before idle_balance, but that seems a bit heavy just for cfs bandwidth. 
> >> --- >> kernel/sched.c | 24 ++++++++---- >> kernel/sched_fair.c | 101 ++++++++++++++++++++++++++++++++++++---------------- >> 2 files changed, 87 insertions(+), 38 deletions(-) >> >> Index: tip/kernel/sched.c >> =================================================================== >> --- tip.orig/kernel/sched.c >> +++ tip/kernel/sched.c >> @@ -269,6 +269,13 @@ struct cfs_bandwidth { >> #endif >> }; >> >> +enum runtime_state { >> + RUNTIME_UNLIMITED, >> + RUNTIME_AVAILABLE, >> + RUNTIME_THROTTLING, >> + RUNTIME_THROTTLED >> +}; > > What's the difference between throttling and throttled? Throttling is > between actually getting throttled and put_prev_task() getting called? > This all wants a comment. Yes, although THROTTLING->AVAILABLE (or even ->UNLIMITED) is also possible. >> @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(str >> * if we're unable to extend our runtime we resched so that the active >> * hierarchy can be throttled >> */ >> - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) >> - resched_task(rq_of(cfs_rq)->curr); >> + if (assign_cfs_rq_runtime(cfs_rq)) >> + return; >> + >> + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || >> + cfs_rq->runtime_state == RUNTIME_THROTTLING) >> + return; > > How exactly can we get here if we're throttling already? A wakeup in the same to-be-throttled cfs_rq before need_resched is handled. The simplest trigger would be three tasks on the same cfs_rq, only one of which is running. If it tries to wake both of the other tasks just as it runs out of runtime, we would hit this. And the patch updated for these and your other comments: -- 8< -- From: Benjamin Segall <bsegall@google.com> Subject: sched: update task accounting on throttle so that idle_balance() will trigger Since throttling occurs in the put_prev_task() path we do not get to observe this delta against nr_running when making the decision to idle_balance(). 
Fix this by first enumerating cfs_rq throttle states so that we can distinguish throttling cfs_rqs. Then remove tasks that will be throttled in put_prev_task from rq->nr_running/cfs_rq->h_nr_running when in account_cfs_rq_runtime, rather than delaying until put_prev_task. This allows schedule() to call idle_balance when we go idle due to throttling. Using Kamalesh's nested-cgroup test case[1] we see the following improvement on a 16 core system: baseline: Average CPU Idle percentage 13.9667% +patch: Average CPU Idle percentage 3.53333% [1]: https://lkml.org/lkml/2011/9/15/261 Signed-off-by: Ben Segall <bsegall@google.com> Signed-off-by: Paul Turner <pjt@google.com> --- kernel/sched.c | 28 +++++++++++--- kernel/sched_fair.c | 100 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 90 insertions(+), 38 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7c1b2f5..0af1680 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -269,6 +269,17 @@ struct cfs_bandwidth { #endif }; +/* + * THROTTLING tasks have been removed from nr_running, but have not + * yet been dequeued. THROTTLED tasks have been entirely dequeued. 
+ */ +enum runtime_state { + RUNTIME_UNLIMITED, + RUNTIME_AVAILABLE, + RUNTIME_THROTTLING, + RUNTIME_THROTTLED +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -402,12 +413,12 @@ struct cfs_rq { unsigned long load_contribution; #endif #ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; + enum runtime_state runtime_state; u64 runtime_expires; s64 runtime_remaining; u64 throttled_timestamp; - int throttled, throttle_count; + int throttle_count; struct list_head throttled_list; #endif #endif @@ -470,7 +481,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - cfs_rq->runtime_enabled = 0; + cfs_rq->runtime_state = RUNTIME_UNLIMITED; INIT_LIST_HEAD(&cfs_rq->throttled_list); } @@ -6368,7 +6379,7 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq) for_each_leaf_cfs_rq(rq, cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - if (!cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) continue; /* @@ -9263,11 +9274,14 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) struct rq *rq = rq_of(cfs_rq); raw_spin_lock_irq(&rq->lock); - cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); + + if (runtime_enabled) + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + else + cfs_rq->runtime_state = RUNTIME_UNLIMITED; + cfs_rq->runtime_remaining = 0; raw_spin_unlock_irq(&rq->lock); } out_unlock: diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3539569..7dc5c19 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1387,6 +1387,27 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) } } +static void account_nr_throttling(struct cfs_rq *cfs_rq, long delta) +{ + struct sched_entity *se; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + 
if (!se->on_rq) + break; + + qcfs_rq->h_nr_running += delta; + + if (qcfs_rq->runtime_state == RUNTIME_THROTTLING) + break; + } + + if (!se) + rq_of(cfs_rq)->nr_running += delta; +} + static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { @@ -1401,14 +1422,33 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled */ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + if (assign_cfs_rq_runtime(cfs_rq)) + return; + + if (unlikely(!cfs_rq->curr) || throttled_hierarchy(cfs_rq) || + cfs_rq->runtime_state == RUNTIME_THROTTLING) + return; + + resched_task(rq_of(cfs_rq)->curr); + + /* + * Remove us from nr_running/h_nr_running so + * that idle_balance gets called if necessary + */ + account_nr_throttling(cfs_rq, -cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_THROTTLING; +} + +static inline int cfs_rq_runtime_enabled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && + cfs_rq->runtime_state != RUNTIME_UNLIMITED; } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + if (!cfs_rq_runtime_enabled(cfs_rq)) return; __account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1416,7 +1456,8 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_bandwidth_used() && cfs_rq->throttled; + return cfs_bandwidth_used() && + cfs_rq->runtime_state >= RUNTIME_THROTTLING; } /* check whether cfs_rq, or any parent, is throttled */ @@ -1483,7 +1524,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ 
-1492,25 +1532,19 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) break; - if (dequeue) - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - qcfs_rq->h_nr_running -= task_delta; + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); if (qcfs_rq->load.weight) - dequeue = 0; + break; } - if (!se) - rq->nr_running -= task_delta; - - cfs_rq->throttled = 1; + cfs_rq->runtime_state = RUNTIME_THROTTLED; cfs_rq->throttled_timestamp = rq->clock; raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); @@ -1525,9 +1559,15 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) int enqueue = 1; long task_delta; + if (cfs_rq->runtime_state == RUNTIME_THROTTLING) { + /* do only the partial unthrottle */ + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + return; + } + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; - cfs_rq->throttled = 0; + cfs_rq->runtime_state = RUNTIME_AVAILABLE; raw_spin_lock(&cfs_b->lock); cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; list_del_rcu(&cfs_rq->throttled_list); @@ -1743,10 +1783,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->nr_running) return; __return_cfs_rq_runtime(cfs_rq); @@ -1791,15 +1828,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { - if (!cfs_bandwidth_used()) - return; - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) + if 
(!cfs_rq_runtime_enabled(cfs_rq) || cfs_rq->curr) return; /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->runtime_state == RUNTIME_THROTTLED) return; /* update runtime allocation */ @@ -1814,17 +1848,21 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + if (likely(cfs_rq->runtime_state != RUNTIME_THROTTLING)) return; /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. + * it is possible for additional bandwidth to arrive between + * when we call resched_task for being out of runtime and we + * call put_prev_task, in which case reaccount the now running + * tasks */ - if (cfs_rq_throttled(cfs_rq)) - return; - - throttle_cfs_rq(cfs_rq); + if (unlikely(cfs_rq->runtime_remaining > 0)) { + account_nr_throttling(cfs_rq, cfs_rq->h_nr_running); + cfs_rq->runtime_state = RUNTIME_AVAILABLE; + } else { + throttle_cfs_rq(cfs_rq); + } } #else static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, ^ permalink raw reply related [flat|nested] 15+ messages in thread
end of thread, other threads:[~2011-11-18 23:42 UTC | newest] Thread overview: 15+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2011-11-08 4:26 [patch 0/3] sched: bandwidth-control tweaks for v3.2 Paul Turner 2011-11-08 4:26 ` [patch 1/3] sched: use jump labels to reduce overhead when bandwidth control is inactive Paul Turner 2011-11-08 9:26 ` Peter Zijlstra 2011-11-08 9:28 ` Peter Zijlstra 2011-11-08 9:29 ` Peter Zijlstra 2011-11-11 4:23 ` Paul Turner 2011-11-18 23:42 ` [tip:sched/core] sched: Use " tip-bot for Paul Turner 2011-11-08 4:26 ` [patch 2/3] sched: fix buglet in return_cfs_rq_runtime() Paul Turner 2011-11-18 23:41 ` [tip:sched/core] sched: Fix " tip-bot for Paul Turner 2011-11-08 4:26 ` [patch 3/3] From: Ben Segall <bsegall@google.com> Paul Turner 2011-11-10 2:28 ` Paul Turner 2011-11-10 2:30 ` Paul Turner 2011-11-14 10:03 ` Kamalesh Babulal 2011-11-14 12:01 ` Peter Zijlstra 2011-11-15 21:14 ` Benjamin Segall
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox