* [PATCH 1/2] sched: normalize sleeper's vruntime during group change
@ 2010-09-29 6:46 Dima Zavin
2010-09-29 6:46 ` [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue Dima Zavin
` (3 more replies)
0 siblings, 4 replies; 17+ messages in thread
From: Dima Zavin @ 2010-09-29 6:46 UTC (permalink / raw)
To: linux-kernel
Cc: Peter Zijlstra, Ingo Molnar, Mike Galbraith, Dima Zavin,
Arve Hjønnevåg
This adds a new callback, prep_move_group, to struct sched_class
to give sched_fair the opportunity to adjust the task's vruntime
just before setting its new group.
This allows us to properly normalize a sleeping task's vruntime
when moving it between different cgroups.
Cc: Arve Hjønnevåg <arve@android.com>
Signed-off-by: Dima Zavin <dima@android.com>
---
 include/linux/sched.h |    1 +
 kernel/sched.c        |    5 +++++
 kernel/sched_fair.c   |   16 +++++++++++++++-
 3 files changed, 21 insertions(+), 1 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db..ba3494e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1073,6 +1073,7 @@ struct sched_class {

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*moved_group) (struct task_struct *p, int on_rq);
+	void (*prep_move_group) (struct task_struct *p, int on_rq);
 #endif
 };
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb..fe4bb20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8297,6 +8297,11 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);

+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->prep_move_group)
+		tsk->sched_class->prep_move_group(tsk, on_rq);
+#endif
+
 	set_task_rq(tsk, task_cpu(tsk));

 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..008fe57 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3827,10 +3827,23 @@ static void set_curr_task_fair(struct rq *rq)
 static void moved_group_fair(struct task_struct *p, int on_rq)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;

 	update_curr(cfs_rq);
-	if (!on_rq)
+	if (!on_rq) {
+		se->vruntime += cfs_rq->min_vruntime;
 		place_entity(cfs_rq, &p->se, 1);
+	}
+}
+
+static void prep_move_group_fair(struct task_struct *p, int on_rq)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;
+
+	/* normalize the runtime of a sleeping task before moving it */
+	if (!on_rq)
+		se->vruntime -= cfs_rq->min_vruntime;
 }
 #endif
@@ -3883,6 +3896,7 @@ static const struct sched_class fair_sched_class = {

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.moved_group		= moved_group_fair,
+	.prep_move_group	= prep_move_group_fair,
 #endif
 };
--
1.6.6
^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue
2010-09-29 6:46 [PATCH 1/2] sched: normalize sleeper's vruntime during group change Dima Zavin
@ 2010-09-29 6:46 ` Dima Zavin
2010-10-07 21:00 ` Dima Zavin
2010-09-29 6:54 ` [PATCH 1/2] sched: normalize sleeper's vruntime during group change Pekka Enberg
` (2 subsequent siblings)
3 siblings, 1 reply; 17+ messages in thread
From: Dima Zavin @ 2010-09-29 6:46 UTC (permalink / raw)
To: linux-kernel
Cc: Peter Zijlstra, Ingo Molnar, Mike Galbraith, Dima Zavin,
Arve Hjønnevåg

After pulling the thread off the run-queue during a cgroup change,
the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime
then gets normalized to this new value. This can then lead to the thread
getting an unfair boost in the new group if the vruntime of the next
task in the old run-queue was way further ahead.

Cc: Arve Hjønnevåg <arve@android.com>
Signed-off-by: Dima Zavin <dima@android.com>
---
 kernel/sched_fair.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 008fe57..cb24ddb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -802,6 +802,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+	u64 min_vruntime;
+
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
@@ -826,6 +828,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
+
+	min_vruntime = cfs_rq->min_vruntime;
 	update_min_vruntime(cfs_rq);

 	/*
@@ -834,7 +838,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * movement in our normalized position.
 	 */
 	if (!(flags & DEQUEUE_SLEEP))
-		se->vruntime -= cfs_rq->min_vruntime;
+		se->vruntime -= min_vruntime;
 }

 /*
--
1.6.6

^ permalink raw reply related	[flat|nested] 17+ messages in thread
* Re: [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue
2010-09-29 6:46 ` [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue Dima Zavin
@ 2010-10-07 21:00 ` Dima Zavin
2010-10-08 6:57 ` Mike Galbraith
0 siblings, 1 reply; 17+ messages in thread
From: Dima Zavin @ 2010-10-07 21:00 UTC (permalink / raw)
To: linux-kernel
Cc: Peter Zijlstra, Ingo Molnar, Mike Galbraith, Dima Zavin,
Arve Hjønnevåg

Mike,

Thanks for the Ack for patch 1/2, could you take a look at this one too?

Should I re-upload the series as v2 or you can pick the latest from
patch 1 and take this one?

--Dima

On Tue, Sep 28, 2010 at 11:46 PM, Dima Zavin <dima@android.com> wrote:
> After pulling the thread off the run-queue during a cgroup change,
> the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime
> then gets normalized to this new value. This can then lead to the thread
> getting an unfair boost in the new group if the vruntime of the next
> task in the old run-queue was way further ahead.
>
> Cc: Arve Hjønnevåg <arve@android.com>
> Signed-off-by: Dima Zavin <dima@android.com>
> ---
>  kernel/sched_fair.c |    6 +++++-
>  1 files changed, 5 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index 008fe57..cb24ddb 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -802,6 +802,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  static void
>  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  {
> +	u64 min_vruntime;
> +
>  	/*
>  	 * Update run-time statistics of the 'current'.
>  	 */
> @@ -826,6 +828,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	if (se != cfs_rq->curr)
>  		__dequeue_entity(cfs_rq, se);
>  	account_entity_dequeue(cfs_rq, se);
> +
> +	min_vruntime = cfs_rq->min_vruntime;
>  	update_min_vruntime(cfs_rq);
>
>  	/*
> @@ -834,7 +838,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 * movement in our normalized position.
>  	 */
>  	if (!(flags & DEQUEUE_SLEEP))
> -		se->vruntime -= cfs_rq->min_vruntime;
> +		se->vruntime -= min_vruntime;
>  }
>
>  /*
> --
> 1.6.6
>

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue
2010-10-07 21:00 ` Dima Zavin
@ 2010-10-08 6:57 ` Mike Galbraith
0 siblings, 0 replies; 17+ messages in thread
From: Mike Galbraith @ 2010-10-08 6:57 UTC (permalink / raw)
To: Dima Zavin
Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Arve Hjønnevåg

On Thu, 2010-10-07 at 14:00 -0700, Dima Zavin wrote:
> Mike,
>
> Thanks for the Ack for patch 1/2, could you take a look at this one too?

Ok, did that. I'd do it like below instead.

> Should I re-upload the series as v2 or you can pick the latest from
> patch 1 and take this one?

Peter's the merge point, I just help break stuff ;-)

I tested the below with pinned/unpinned mixes of sleepers and hogs, and
saw no ill effects. My thought on the logic is embedded in the comment.

From: Dima Zavin <dima@android.com>
Subject: [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue
Date: Tue, 28 Sep 2010 23:46:14 -0700

After pulling the thread off the run-queue during a cgroup change,
the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime
then gets normalized to this new value. This can then lead to the thread
getting an unfair boost in the new group if the vruntime of the next
task in the old run-queue was way further ahead.

Cc: Arve Hjønnevåg <arve@android.com>
Signed-off-by: Dima Zavin <dima@android.com>
---
 kernel/sched_fair.c |   10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -826,15 +826,17 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
-	update_min_vruntime(cfs_rq);

 	/*
-	 * Normalize the entity after updating the min_vruntime because the
-	 * update can refer to the ->curr item and we need to reflect this
-	 * movement in our normalized position.
+	 * Normalize vruntime prior to updating min_vruntime. Any motion
+	 * referring to ->curr will have been captured by update_curr() above.
+	 * We don't want to preserve what lag might become as a result of
+	 * this dequeue, we want to preserve what lag is at dequeue time.
 	 */
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
+
+	update_min_vruntime(cfs_rq);
 }

 /*

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-09-29 6:46 [PATCH 1/2] sched: normalize sleeper's vruntime during group change Dima Zavin
2010-09-29 6:46 ` [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue Dima Zavin
@ 2010-09-29 6:54 ` Pekka Enberg
2010-09-29 7:17 ` Dima Zavin
2010-09-29 8:13 ` Mike Galbraith
2010-09-30 10:47 ` Peter Zijlstra
3 siblings, 1 reply; 17+ messages in thread
From: Pekka Enberg @ 2010-09-29 6:54 UTC (permalink / raw)
To: Dima Zavin
Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Mike Galbraith,
Arve Hjønnevåg

On Wed, Sep 29, 2010 at 9:46 AM, Dima Zavin <dima@android.com> wrote:
> This adds a new callback, prep_move_group, to struct sched_class
> to give sched_fair the opportunity to adjust the task's vruntime
> just before setting its new group.
>
> This allows us to properly normalize a sleeping task's vruntime
> when moving it between different cgroups.
>
> Cc: Arve Hjønnevåg <arve@android.com>
> Signed-off-by: Dima Zavin <dima@android.com>

Nitpick: this changelog probably needs a better description of the
problem it's trying to solve, or a link to the previous discussion
about it.

^ permalink raw reply	[flat|nested] 17+ messages in thread
* [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-09-29 6:54 ` [PATCH 1/2] sched: normalize sleeper's vruntime during group change Pekka Enberg
@ 2010-09-29 7:17 ` Dima Zavin
0 siblings, 0 replies; 17+ messages in thread
From: Dima Zavin @ 2010-09-29 7:17 UTC (permalink / raw)
To: linux-kernel
Cc: Peter Zijlstra, Ingo Molnar, Mike Galbraith, Dima Zavin,
Arve Hjønnevåg

This adds a new callback, prep_move_group, to struct sched_class
to give sched_fair the opportunity to adjust the task's vruntime
just before setting its new group.

This allows us to properly normalize a sleeping task's vruntime
when moving it between different cgroups.

More details about the problem: http://lkml.org/lkml/2010/9/28/24

Cc: Arve Hjønnevåg <arve@android.com>
Signed-off-by: Dima Zavin <dima@android.com>
---
 include/linux/sched.h |    1 +
 kernel/sched.c        |    5 +++++
 kernel/sched_fair.c   |   16 +++++++++++++++-
 3 files changed, 21 insertions(+), 1 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db..ba3494e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1073,6 +1073,7 @@ struct sched_class {

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*moved_group) (struct task_struct *p, int on_rq);
+	void (*prep_move_group) (struct task_struct *p, int on_rq);
 #endif
 };

diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb..fe4bb20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8297,6 +8297,11 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);

+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->prep_move_group)
+		tsk->sched_class->prep_move_group(tsk, on_rq);
+#endif
+
 	set_task_rq(tsk, task_cpu(tsk));

 #ifdef CONFIG_FAIR_GROUP_SCHED

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..008fe57 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3827,10 +3827,23 @@ static void set_curr_task_fair(struct rq *rq)
 static void moved_group_fair(struct task_struct *p, int on_rq)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;

 	update_curr(cfs_rq);
-	if (!on_rq)
+	if (!on_rq) {
+		se->vruntime += cfs_rq->min_vruntime;
 		place_entity(cfs_rq, &p->se, 1);
+	}
+}
+
+static void prep_move_group_fair(struct task_struct *p, int on_rq)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;
+
+	/* normalize the runtime of a sleeping task before moving it */
+	if (!on_rq)
+		se->vruntime -= cfs_rq->min_vruntime;
 }
 #endif

@@ -3883,6 +3896,7 @@ static const struct sched_class fair_sched_class = {

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.moved_group		= moved_group_fair,
+	.prep_move_group	= prep_move_group_fair,
 #endif
 };
--
1.6.6

^ permalink raw reply related	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-09-29 6:46 [PATCH 1/2] sched: normalize sleeper's vruntime during group change Dima Zavin
2010-09-29 6:46 ` [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue Dima Zavin
2010-09-29 6:54 ` [PATCH 1/2] sched: normalize sleeper's vruntime during group change Pekka Enberg
@ 2010-09-29 8:13 ` Mike Galbraith
2010-09-29 19:02 ` Dima Zavin
2010-09-29 21:44 ` Dima Zavin
2010-09-30 10:47 ` Peter Zijlstra
3 siblings, 2 replies; 17+ messages in thread
From: Mike Galbraith @ 2010-09-29 8:13 UTC (permalink / raw)
To: Dima Zavin
Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Arve Hjønnevåg

On Tue, 2010-09-28 at 23:46 -0700, Dima Zavin wrote:
> This adds a new callback, prep_move_group, to struct sched_class
> to give sched_fair the opportunity to adjust the task's vruntime
> just before setting its new group.
>
> This allows us to properly normalize a sleeping task's vruntime
> when moving it between different cgroups.

> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index db3f674..008fe57 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -3827,10 +3827,23 @@ static void set_curr_task_fair(struct rq *rq)
>  static void moved_group_fair(struct task_struct *p, int on_rq)
>  {
>  	struct cfs_rq *cfs_rq = task_cfs_rq(p);
> +	struct sched_entity *se = &p->se;
>
>  	update_curr(cfs_rq);
> -	if (!on_rq)
> +	if (!on_rq) {
> +		se->vruntime += cfs_rq->min_vruntime;
>  		place_entity(cfs_rq, &p->se, 1);
> +	}
> +}

I'm still missing the place_entity() rationale. Why is a sleeping task
eligible for pain therapy that a runnable task is spared? That state
can change a usec from now.

Nit: place_entity(cfs_rq, se, 1);

-Mike

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-09-29 8:13 ` Mike Galbraith
@ 2010-09-29 19:02 ` Dima Zavin
2010-09-29 21:44 ` Dima Zavin
1 sibling, 0 replies; 17+ messages in thread
From: Dima Zavin @ 2010-09-29 19:02 UTC (permalink / raw)
To: Mike Galbraith
Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Arve Hjønnevåg

On Wed, Sep 29, 2010 at 1:13 AM, Mike Galbraith <efault@gmx.de> wrote:
> On Tue, 2010-09-28 at 23:46 -0700, Dima Zavin wrote:
>> This adds a new callback, prep_move_group, to struct sched_class
>> to give sched_fair the opportunity to adjust the task's vruntime
>> just before setting its new group.
>>
>> This allows us to properly normalize a sleeping task's vruntime
>> when moving it between different cgroups.
>
>> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
>> index db3f674..008fe57 100644
>> --- a/kernel/sched_fair.c
>> +++ b/kernel/sched_fair.c
>> @@ -3827,10 +3827,23 @@ static void set_curr_task_fair(struct rq *rq)
>>  static void moved_group_fair(struct task_struct *p, int on_rq)
>>  {
>>  	struct cfs_rq *cfs_rq = task_cfs_rq(p);
>> +	struct sched_entity *se = &p->se;
>>
>>  	update_curr(cfs_rq);
>> -	if (!on_rq)
>> +	if (!on_rq) {
>> +		se->vruntime += cfs_rq->min_vruntime;
>>  		place_entity(cfs_rq, &p->se, 1);
>> +	}
>> +}
>
> I'm still missing the place_entity() rationale. Why is a sleeping task
> eligible for pain therapy that a runnable task is spared? That state
> can change a usec from now.

I see your point, and I can't think of a reason why we'd want to
penalize the sleeper with START_DEBIT, but now I am wondering why it
was there in the first place.

--Dima

^ permalink raw reply	[flat|nested] 17+ messages in thread
* [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-09-29 8:13 ` Mike Galbraith
2010-09-29 19:02 ` Dima Zavin
@ 2010-09-29 21:44 ` Dima Zavin
1 sibling, 0 replies; 17+ messages in thread
From: Dima Zavin @ 2010-09-29 21:44 UTC (permalink / raw)
To: linux-kernel
Cc: Peter Zijlstra, Ingo Molnar, Mike Galbraith, Dima Zavin,
Arve Hjønnevåg

If you switch the cgroup of a sleeping thread, its vruntime does
not get adjusted correctly for the difference between the
min_vruntime values of the two groups.

This patch adds a new callback, prep_move_group, to struct sched_class
to give sched_fair the opportunity to adjust the task's vruntime
just before setting its new group. This allows us to properly normalize
a sleeping task's vruntime when moving it between different cgroups.

More details about the problem: http://lkml.org/lkml/2010/9/28/24

Cc: Arve Hjønnevåg <arve@android.com>
Signed-off-by: Dima Zavin <dima@android.com>
---
 include/linux/sched.h |    1 +
 kernel/sched.c        |    5 +++++
 kernel/sched_fair.c   |   14 +++++++++++++-
 3 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db..ba3494e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1073,6 +1073,7 @@ struct sched_class {

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*moved_group) (struct task_struct *p, int on_rq);
+	void (*prep_move_group) (struct task_struct *p, int on_rq);
 #endif
 };

diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb..fe4bb20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8297,6 +8297,11 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);

+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->prep_move_group)
+		tsk->sched_class->prep_move_group(tsk, on_rq);
+#endif
+
 	set_task_rq(tsk, task_cpu(tsk));

 #ifdef CONFIG_FAIR_GROUP_SCHED

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..6ded59f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3827,10 +3827,21 @@ static void set_curr_task_fair(struct rq *rq)
 static void moved_group_fair(struct task_struct *p, int on_rq)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;

 	update_curr(cfs_rq);
 	if (!on_rq)
-		place_entity(cfs_rq, &p->se, 1);
+		se->vruntime += cfs_rq->min_vruntime;
+}
+
+static void prep_move_group_fair(struct task_struct *p, int on_rq)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;
+
+	/* normalize the runtime of a sleeping task before moving it */
+	if (!on_rq)
+		se->vruntime -= cfs_rq->min_vruntime;
 }
 #endif

@@ -3883,6 +3894,7 @@ static const struct sched_class fair_sched_class = {

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.moved_group		= moved_group_fair,
+	.prep_move_group	= prep_move_group_fair,
 #endif
 };
--
1.6.6

^ permalink raw reply related	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-09-29 6:46 [PATCH 1/2] sched: normalize sleeper's vruntime during group change Dima Zavin
2010-09-29 6:46 ` [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue Dima Zavin
2010-09-29 6:54 ` [PATCH 1/2] sched: normalize sleeper's vruntime during group change Pekka Enberg
2010-09-29 8:13 ` Mike Galbraith
@ 2010-09-30 10:47 ` Peter Zijlstra
2010-09-30 19:14 ` Dima Zavin
3 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2010-09-30 10:47 UTC (permalink / raw)
To: Dima Zavin
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Arve Hjønnevåg

On Tue, 2010-09-28 at 23:46 -0700, Dima Zavin wrote:
> This adds a new callback, prep_move_group, to struct sched_class
> to give sched_fair the opportunity to adjust the task's vruntime
> just before setting its new group.
>
> This allows us to properly normalize a sleeping task's vruntime
> when moving it between different cgroups.

Don't much like these patches, and changelogs need full descriptions not
fuzzy links out to the interwebs that might or might not stay valid.

If you change a task's cgroup, the task is new in that cgroup and should
be placed like a new task would, carrying over any history it has in the
old cgroup is dubious at best.

Please explain this stuff..

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-09-30 10:47 ` Peter Zijlstra
@ 2010-09-30 19:14 ` Dima Zavin
2010-10-01 11:59 ` Peter Zijlstra
0 siblings, 1 reply; 17+ messages in thread
From: Dima Zavin @ 2010-09-30 19:14 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Arve Hjønnevåg

Peter,

>> This adds a new callback, prep_move_group, to struct sched_class
>> to give sched_fair the opportunity to adjust the task's vruntime
>> just before setting its new group.
>>
>> This allows us to properly normalize a sleeping task's vruntime
>> when moving it between different cgroups.
>
> Don't much like these patches, and changelogs need full descriptions not
> fuzzy links out to the interwebs that might or might not stay valid.

Sorry, I'll resend with a better changelog and no tubes.

> If you change a task's cgroup, the task is new in that cgroup and should
> be placed like a new task would, carrying over any history it has in the
> old cgroup is dubious at best.

That is certainly not the behavior today for running tasks as I
understand it. They get their vruntime normalized before being taken
off the run_queue of the old group, and then get the new min_vruntime
added when they get re-enqueued. For sleeping tasks it's just plain
broken (see below).

Also, I am skeptical that the behavior you describe is desired. If you
were to just place the task in the new cgroup like it was a brand new
task, it could allow threads to game the system and potentially let
them reset their vruntime back. If I do a bunch of work and then move
myself out of the group and then back onto it, I may get lower
vruntime than by just staying on the group.

> Please explain this stuff..

The situation today is quite bad for sleeping tasks. Currently, when
you move a sleeping thread between cgroups, the thread can retain its
old vruntime value if the old group was far ahead of the new group,
since place_entity() essentially does a max(se->vruntime,
new_vruntime). This can prevent the task from running for a very long
time. That is what this patch was trying to address: it normalizes the
sleeper thread's vruntime before moving it to the new group.

Thanks in advance.

--Dima

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-09-30 19:14 ` Dima Zavin
@ 2010-10-01 11:59 ` Peter Zijlstra
2010-10-04 19:18 ` Dima Zavin
0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2010-10-01 11:59 UTC (permalink / raw)
To: Dima Zavin
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Arve Hjønnevåg

On Thu, 2010-09-30 at 12:14 -0700, Dima Zavin wrote:
>
> > Please explain this stuff..
>
> The situation today is quite bad for sleeping tasks. Currently, when
> you move a sleeping thread between cgroups, the thread can retain its
> old vruntime value if the old group was far ahead of the new group
> since it essentially does a max(se->vruntime, new_vruntime) in
> place_entity. This can prevent the task from running for a very long
> time. That is what this patch was trying to address. It normalizes the
> sleeper thread's vruntime before moving it to the new group.

Hrm,.. ok, I tend to not use this cgroup gunk more than I absolutely
have to, so I'll take your word for it.

But doesn't normal cross-cpu task migration already solve this problem?
Therefore wouldn't it be possible to adapt/extend that code to also deal
with this particular issue? It would avoid growing more cgroup fudge..

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-10-01 11:59 ` Peter Zijlstra
@ 2010-10-04 19:18 ` Dima Zavin
2010-10-06 22:56 ` Dima Zavin
2010-10-15 13:50 ` Peter Zijlstra
1 sibling, 2 replies; 17+ messages in thread
From: Dima Zavin @ 2010-10-04 19:18 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Arve Hjønnevåg

>> > Please explain this stuff..
>>
>> The situation today is quite bad for sleeping tasks. Currently, when
>> you move a sleeping thread between cgroups, the thread can retain its
>> old vruntime value if the old group was far ahead of the new group
>> since it essentially does a max(se->vruntime, new_vruntime) in
>> place_entity. This can prevent the task from running for a very long
>> time. That is what this patch was trying to address. It normalizes the
>> sleeper thread's vruntime before moving it to the new group.
>
> Hrm,.. ok, I tend to not use this cgroup gunk more than I absolutely
> have to, so I'll take your word for it.
>
> But doesn't normal cross-cpu task migration already solve this problem?
> Therefore wouldn't it be possible to adapt/extend that code to also deal
> with this particular issue?

It does, but from what I can tell it does so lazily for sleeping
tasks, i.e. the logic is in try_to_wake_up(). The cgroup attach moves
the task immediately, so when we attempt to wake it up it will already
be too late for the wake-up code to do the right thing, since the task
has the new cpu_rq assigned from sched_move_task(). The wakeup logic
will not have the old group info.

--Dima

^ permalink raw reply	[flat|nested] 17+ messages in thread
* [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-10-04 19:18 ` Dima Zavin
@ 2010-10-06 22:56 ` Dima Zavin
2010-10-07 2:24 ` Mike Galbraith
1 sibling, 1 reply; 17+ messages in thread
From: Dima Zavin @ 2010-10-06 22:56 UTC (permalink / raw)
To: linux-kernel
Cc: Peter Zijlstra, Ingo Molnar, Mike Galbraith, Dima Zavin,
Arve Hjønnevåg

If you switch the cgroup of a sleeping thread, its vruntime does
not get adjusted correctly for the difference between the
min_vruntime values of the two groups.

The problem becomes most apparent when one has cgroups whose
cpu shares differ greatly, say group A.shares=1024 and group B.shares=52.
After some time, the vruntime of the group with the larger share (A)
will be way ahead of the group with the small share (B). Currently,
when a sleeping task is moved from group A to group B, it will retain its
larger vruntime value and thus will be way ahead of all the other tasks
in its new group. This will prevent this task from executing for an
extended period of time.

This patch adds a new callback, prep_move_group, to struct sched_class
to give sched_fair the opportunity to adjust the task's vruntime
just before setting its new group. This allows us to properly normalize
a sleeping task's vruntime when moving it between different cgroups.

Cc: Arve Hjønnevåg <arve@android.com>
Signed-off-by: Dima Zavin <dima@android.com>
---
 include/linux/sched.h |    1 +
 kernel/sched.c        |    5 +++++
 kernel/sched_fair.c   |   14 +++++++++++++-
 3 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db..ba3494e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1073,6 +1073,7 @@ struct sched_class {

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*moved_group) (struct task_struct *p, int on_rq);
+	void (*prep_move_group) (struct task_struct *p, int on_rq);
 #endif
 };

diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb..fe4bb20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8297,6 +8297,11 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);

+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->prep_move_group)
+		tsk->sched_class->prep_move_group(tsk, on_rq);
+#endif
+
 	set_task_rq(tsk, task_cpu(tsk));

 #ifdef CONFIG_FAIR_GROUP_SCHED

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..6ded59f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3827,10 +3827,21 @@ static void set_curr_task_fair(struct rq *rq)
 static void moved_group_fair(struct task_struct *p, int on_rq)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;

 	update_curr(cfs_rq);
 	if (!on_rq)
-		place_entity(cfs_rq, &p->se, 1);
+		se->vruntime += cfs_rq->min_vruntime;
+}
+
+static void prep_move_group_fair(struct task_struct *p, int on_rq)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = &p->se;
+
+	/* normalize the runtime of a sleeping task before moving it */
+	if (!on_rq)
+		se->vruntime -= cfs_rq->min_vruntime;
 }
 #endif

@@ -3883,6 +3894,7 @@ static const struct sched_class fair_sched_class = {

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.moved_group		= moved_group_fair,
+	.prep_move_group	= prep_move_group_fair,
 #endif
 };
--
1.6.6

^ permalink raw reply related	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
2010-10-06 22:56 ` Dima Zavin
@ 2010-10-07 2:24 ` Mike Galbraith
0 siblings, 0 replies; 17+ messages in thread
From: Mike Galbraith @ 2010-10-07 2:24 UTC (permalink / raw)
To: Dima Zavin
Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, Arve Hjønnevåg

On Wed, 2010-10-06 at 15:56 -0700, Dima Zavin wrote:
> If you switch the cgroup of a sleeping thread, its vruntime does
> not get adjusted correctly for the difference between the
> min_vruntime values of the two groups.
>
> The problem becomes most apparent when one has cgroups whose
> cpu shares differ greatly, say group A.shares=1024 and group B.shares=52.
> After some time, the vruntime of the group with the larger share (A)
> will be way ahead of the group with the small share (B). Currently,
> when a sleeping task is moved from group A to group B, it will retain its
> larger vruntime value and thus will be way ahead of all the other tasks
> in its new group. This will prevent this task from executing for an
> extended period of time.

Yeah, seems clear that normalization is a must. Questionable is whether
we should apply START_DEBIT. That's part of a different problem though
(SCHED_PROCESS).

> This patch adds a new callback, prep_move_group, to struct sched_class
> to give sched_fair the opportunity to adjust the task's vruntime
> just before setting its new group. This allows us to properly normalize
> a sleeping task's vruntime when moving it between different cgroups.

Acked-by: Mike Galbraith <efault@gmx.de>

> Cc: Arve Hjønnevåg <arve@android.com>
> Signed-off-by: Dima Zavin <dima@android.com>
> ---
>  include/linux/sched.h |    1 +
>  kernel/sched.c        |    5 +++++
>  kernel/sched_fair.c   |   14 +++++++++++++-
>  3 files changed, 19 insertions(+), 1 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 1e2a6db..ba3494e 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1073,6 +1073,7 @@ struct sched_class {
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	void (*moved_group) (struct task_struct *p, int on_rq);
> +	void (*prep_move_group) (struct task_struct *p, int on_rq);
>  #endif
>  };
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index dc85ceb..fe4bb20 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -8297,6 +8297,11 @@ void sched_move_task(struct task_struct *tsk)
>  	if (unlikely(running))
>  		tsk->sched_class->put_prev_task(rq, tsk);
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +	if (tsk->sched_class->prep_move_group)
> +		tsk->sched_class->prep_move_group(tsk, on_rq);
> +#endif
> +
>  	set_task_rq(tsk, task_cpu(tsk));
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index db3f674..6ded59f 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -3827,10 +3827,21 @@ static void set_curr_task_fair(struct rq *rq)
>  static void moved_group_fair(struct task_struct *p, int on_rq)
>  {
>  	struct cfs_rq *cfs_rq = task_cfs_rq(p);
> +	struct sched_entity *se = &p->se;
>
>  	update_curr(cfs_rq);
>  	if (!on_rq)
> -		place_entity(cfs_rq, &p->se, 1);
> +		se->vruntime += cfs_rq->min_vruntime;
> +}
> +
> +static void prep_move_group_fair(struct task_struct *p, int on_rq)
> +{
> +	struct cfs_rq *cfs_rq = task_cfs_rq(p);
> +	struct sched_entity *se = &p->se;
> +
> +	/* normalize the runtime of a sleeping task before moving it */
> +	if (!on_rq)
> +		se->vruntime -= cfs_rq->min_vruntime;
>  }
>  #endif
>
> @@ -3883,6 +3894,7 @@ static const struct sched_class fair_sched_class = {
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	.moved_group		= moved_group_fair,
> +	.prep_move_group	= prep_move_group_fair,
>  #endif
>  };

^ permalink raw reply	[flat|nested] 17+ messages in thread
* Re: [PATCH 1/2] sched: normalize sleeper's vruntime during group change
  2010-10-04 19:18       ` Dima Zavin
  2010-10-06 22:56         ` Dima Zavin
@ 2010-10-15 13:50       ` Peter Zijlstra
  2010-10-22 13:02         ` [tip:sched/urgent] sched, cgroup: Fixup broken cgroup movement tip-bot for Peter Zijlstra
  1 sibling, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2010-10-15 13:50 UTC (permalink / raw)
  To: Dima Zavin
  Cc: linux-kernel, Ingo Molnar, Mike Galbraith, Arve Hjønnevåg

On Mon, 2010-10-04 at 12:18 -0700, Dima Zavin wrote:
> >> > Please explain this stuff..
> >>
> >> The situation today is quite bad for sleeping tasks. Currently, when
> >> you move a sleeping thread between cgroups, the thread can retain its
> >> old vruntime value if the old group was far ahead of the new group
> >> since it essentially does a max(se->vruntime, new_vruntime) in
> >> place_entity. This can prevent the task from running for a very long
> >> time. That is what this patch was trying to address. It normalizes the
> >> sleeper thread's vruntime before moving it to the new group.
> >>
> >
> > Hrm,.. ok, I tend to not use this cgroup gunk more than I absolutely
> > have to, so I'll take your word for it.
> >
> > But doesn't normal cross-cpu task migration already solve this problem?
> > Therefore wouldn't it be possible to adapt/extend that code to also deal
> > with this particular issue?
>
> It does, but from what I can tell it does so lazily for sleeping
> tasks, i.e. the logic is in try_to_wake_up(). The cgroup attach moves
> the task immediately, so when we attempt to wake it up it will already
> be too late for the wake_up code to do the right thing since the task
> has the new cpu_rq assigned from sched_move_task(). The wakeup logic
> will not have the old group info.

Wouldn't something like the below work as expected?
---
Subject: sched, cgroup: Fixup broken cgroup movement
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri Oct 15 15:24:15 CEST 2010

Reported-by: Dima Zavin <dima@android.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h |    2 +-
 kernel/sched.c        |    8 ++++----
 kernel/sched_fair.c   |   25 +++++++++++++++++++------
 3 files changed, 24 insertions(+), 11 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -8388,12 +8388,12 @@ void sched_move_task(struct task_struct
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);

-	set_task_rq(tsk, task_cpu(tsk));
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (tsk->sched_class->moved_group)
-		tsk->sched_class->moved_group(tsk, on_rq);
+	if (tsk->sched_class->task_move_group)
+		tsk->sched_class->task_move_group(tsk, on_rq);
+	else
 #endif
+		set_task_rq(tsk, task_cpu(tsk));

 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1073,7 +1073,7 @@ struct sched_class {
 			     struct task_struct *task);

 #ifdef CONFIG_FAIR_GROUP_SCHED
-	void (*moved_group) (struct task_struct *p, int on_rq);
+	void (*task_move_group) (struct task_struct *p, int on_rq);
 #endif
 };
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -3831,13 +3831,26 @@ static void set_curr_task_fair(struct rq
 }

 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
-	update_curr(cfs_rq);
+	/*
+	 * If the task was not on the rq at the time of this cgroup movement
+	 * it must have been asleep, sleeping tasks keep their ->vruntime
+	 * absolute on their old rq until wakeup (needed for the fair sleeper
+	 * bonus in place_entity()).
+	 *
+	 * If it was on the rq, we've just 'preempted' it, which does convert
+	 * ->vruntime to a relative base.
+	 *
+	 * Make sure both cases convert their relative position when migrating
+	 * to another cgroup's rq. This does somewhat interfere with the
+	 * fair sleeper stuff for the first placement, but who cares.
+	 */
+	if (!on_rq)
+		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+	set_task_rq(p, task_cpu(p));
 	if (!on_rq)
-		place_entity(cfs_rq, &p->se, 1);
+		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 }
 #endif

@@ -3889,7 +3902,7 @@ static const struct sched_class fair_sch
 	.get_rr_interval	= get_rr_interval_fair,

 #ifdef CONFIG_FAIR_GROUP_SCHED
-	.moved_group		= moved_group_fair,
+	.task_move_group	= task_move_group_fair,
 #endif
 };
* [tip:sched/urgent] sched, cgroup: Fixup broken cgroup movement
  2010-10-15 13:50       ` Peter Zijlstra
@ 2010-10-22 13:02         ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 17+ messages in thread
From: tip-bot for Peter Zijlstra @ 2010-10-22 13:02 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, a.p.zijlstra, efault, dima, tglx, mingo

Commit-ID:  b2b5ce022acf5e9f52f7b78c5579994fdde191d4
Gitweb:     http://git.kernel.org/tip/b2b5ce022acf5e9f52f7b78c5579994fdde191d4
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Fri, 15 Oct 2010 15:24:15 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 22 Oct 2010 14:16:45 +0200

sched, cgroup: Fixup broken cgroup movement

Dima noticed that we fail to correct the ->vruntime of sleeping tasks
when we move them between cgroups.

Reported-by: Dima Zavin <dima@android.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1287150604.29097.1513.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |    2 +-
 kernel/sched.c        |    8 ++++----
 kernel/sched_fair.c   |   25 +++++++++++++++++++------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2cca9a9..be312c1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1073,7 +1073,7 @@ struct sched_class {
 			     struct task_struct *task);

 #ifdef CONFIG_FAIR_GROUP_SCHED
-	void (*moved_group) (struct task_struct *p, int on_rq);
+	void (*task_move_group) (struct task_struct *p, int on_rq);
 #endif
 };

diff --git a/kernel/sched.c b/kernel/sched.c
index 5998222..3fe253e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8498,12 +8498,12 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);

-	set_task_rq(tsk, task_cpu(tsk));
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (tsk->sched_class->moved_group)
-		tsk->sched_class->moved_group(tsk, on_rq);
+	if (tsk->sched_class->task_move_group)
+		tsk->sched_class->task_move_group(tsk, on_rq);
+	else
 #endif
+		set_task_rq(tsk, task_cpu(tsk));

 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 74cccfa..3acc2a4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3866,13 +3866,26 @@ static void set_curr_task_fair(struct rq *rq)
 }

 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
-	update_curr(cfs_rq);
+	/*
+	 * If the task was not on the rq at the time of this cgroup movement
+	 * it must have been asleep, sleeping tasks keep their ->vruntime
+	 * absolute on their old rq until wakeup (needed for the fair sleeper
+	 * bonus in place_entity()).
+	 *
+	 * If it was on the rq, we've just 'preempted' it, which does convert
+	 * ->vruntime to a relative base.
+	 *
+	 * Make sure both cases convert their relative position when migrating
+	 * to another cgroup's rq. This does somewhat interfere with the
+	 * fair sleeper stuff for the first placement, but who cares.
+	 */
+	if (!on_rq)
+		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+	set_task_rq(p, task_cpu(p));
 	if (!on_rq)
-		place_entity(cfs_rq, &p->se, 1);
+		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 }
 #endif

@@ -3924,7 +3937,7 @@ static const struct sched_class fair_sched_class = {
 	.get_rr_interval	= get_rr_interval_fair,

 #ifdef CONFIG_FAIR_GROUP_SCHED
-	.moved_group		= moved_group_fair,
+	.task_move_group	= task_move_group_fair,
 #endif
 };
end of thread, other threads:[~2010-10-22 13:02 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
2010-09-29  6:46 [PATCH 1/2] sched: normalize sleeper's vruntime during group change Dima Zavin
2010-09-29  6:46 ` [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue Dima Zavin
2010-10-07 21:00   ` Dima Zavin
2010-10-08  6:57     ` Mike Galbraith
2010-09-29  6:54 ` [PATCH 1/2] sched: normalize sleeper's vruntime during group change Pekka Enberg
2010-09-29  7:17   ` Dima Zavin
2010-09-29  8:13     ` Mike Galbraith
2010-09-29 19:02       ` Dima Zavin
2010-09-29 21:44   ` Dima Zavin
2010-09-30 10:47     ` Peter Zijlstra
2010-09-30 19:14       ` Dima Zavin
2010-10-01 11:59         ` Peter Zijlstra
2010-10-04 19:18           ` Dima Zavin
2010-10-06 22:56             ` Dima Zavin
2010-10-07  2:24               ` Mike Galbraith
2010-10-15 13:50             ` Peter Zijlstra
2010-10-22 13:02               ` [tip:sched/urgent] sched, cgroup: Fixup broken cgroup movement tip-bot for Peter Zijlstra