Re: [RFC -v2 PATCH 2/3] sched: add yield_to function

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Rik van Riel <riel@redhat.com>
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	Avi Kiviti <avi@redhat.com>,
	Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>,
	Mike Galbraith <efault@gmx.de>,
	Chris Wright <chrisw@sous-sol.org>
Subject: Re: [RFC -v2 PATCH 2/3] sched: add yield_to function
Date: Tue, 14 Dec 2010 13:22:02 +0100	[thread overview]
Message-ID: <1292329322.6803.1609.camel@twins> (raw)
In-Reply-To: <20101213224657.7e141746@annuminas.surriel.com>

On Mon, 2010-12-13 at 22:46 -0500, Rik van Riel wrote:


> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2c79e92..408326f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1086,6 +1086,8 @@ struct sched_class {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>         void (*task_move_group) (struct task_struct *p, int on_rq);
>  #endif
> +
> +       void (*yield_to) (struct rq *rq, struct task_struct *p);
>  };
>  
>  struct load_weight {
> @@ -1947,6 +1949,7 @@ extern void set_user_nice(struct task_struct *p, long nice);
>  extern int task_prio(const struct task_struct *p);
>  extern int task_nice(const struct task_struct *p);
>  extern int can_nice(const struct task_struct *p, const int nice);
> +extern void requeue_task(struct rq *rq, struct task_struct *p);

That definitely doesn't want to be a globally visible symbol.

>  extern int task_curr(const struct task_struct *p);
>  extern int idle_cpu(int cpu);
>  extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
> @@ -2020,6 +2023,10 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state);
>  extern int wake_up_process(struct task_struct *tsk);
>  extern void wake_up_new_task(struct task_struct *tsk,
>                                 unsigned long clone_flags);
> +
> +extern u64 slice_remain(struct task_struct *);

idem.


> +void yield_to(struct task_struct *p)
> +{
> +       unsigned long flags;
> +       struct rq *rq, *p_rq;
> +
> +       local_irq_save(flags);
> +       rq = this_rq();
> +again:
> +       p_rq = task_rq(p);
> +       double_rq_lock(rq, p_rq);
> +       if (p_rq != task_rq(p)) {
> +               double_rq_unlock(rq, p_rq);
> +               goto again;
> +       }
> +
> +       /* We can't yield to a process that doesn't want to run. */
> +       if (!p->se.on_rq)
> +               goto out;
> +
> +       /*
> +        * We can only yield to a runnable task, in the same schedule class
> +        * as the current task, if the schedule class implements yield_to_task.
> +        */
> +       if (!task_running(rq, p) && current->sched_class == p->sched_class &&
> +                       current->sched_class->yield_to)
> +               current->sched_class->yield_to(rq, p);

rq and p don't match, see below.

> +
> +out:
> +       double_rq_unlock(rq, p_rq);
> +       local_irq_restore(flags);
> +       yield();

That wants to be plain: schedule(), possibly conditional on having
called sched_class::yield_to.

> +}
> +EXPORT_SYMBOL_GPL(yield_to);

> +u64 slice_remain(struct task_struct *p)
> +{
> +       unsigned long flags;
> +       struct sched_entity *se = &p->se;
> +       struct cfs_rq *cfs_rq;
> +       struct rq *rq;
> +       u64 slice, ran;
> +       s64 delta;
> +
> +       rq = task_rq_lock(p, &flags);

You're calling this from
yield_to()->sched_class::yield_to()->yield_to_fair()->slice_remain(),
yield_to() already holds p's rq lock.

> +       cfs_rq = cfs_rq_of(se);
> +       slice = sched_slice(cfs_rq, se);
> +       ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
> +       delta = slice - ran;
> +       task_rq_unlock(rq, &flags);
> +
> +       return max(delta, 0LL);
> +}

Like Mike said, the returned figure doesn't really mean anything; it's
definitely not the remaining time of a slice. It might qualify as a
weak random number generator though.. :-)

> +static void yield_to_fair(struct rq *rq, struct task_struct *p)
> +{
> +       struct sched_entity *se = &p->se;
> +       struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +       u64 remain = slice_remain(current);
> +
> +       dequeue_task(rq, p, 0);

Here you assume @p lives on @rq, but you passed:

+               current->sched_class->yield_to(rq, p);

and rq = this_rq(), so this will go splat.

> +       se->vruntime -= remain;

You cannot simply subtract wall-time from virtual time, see the usage of
calc_delta_fair() in the proposal below.

> +       if (se->vruntime < cfs_rq->min_vruntime)
> +               se->vruntime = cfs_rq->min_vruntime;

Then clipping it to min_vruntime doesn't make any sense at all.

> +       enqueue_task(rq, p, 0);
> +       check_preempt_curr(rq, p, 0);
> +} 

Also, modifying the vruntime of one task without also modifying the
vruntime of the other task breaks stuff. You're injecting time into p
without taking time out of current. 


Maybe something like:

/*
 * yield_to_fair - move wakeup_gran worth of time from current to @p.
 * @p_rq: runqueue passed in for @p; used for the clock update and the
 *        preemption check on @p's side.
 * @p:    task that receives the time.
 *
 * current's vruntime is pushed forward and @p's vruntime is pulled back,
 * each by the wakeup granularity scaled through calc_delta_fair() for the
 * respective entity, so time is taken out of current as it is given to @p.
 *
 * NOTE(review): presumably the caller holds the locks of both this_rq()
 * and @p_rq -- confirm against the yield_to() call site.
 */
static void yield_to_fair(struct rq *p_rq, struct task_struct *p)
{
	struct rq *rq = this_rq();
	struct sched_entity *se = &current->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_entity *pse = &p->se;
	struct cfs_rq *p_cfs_rq = cfs_rq_of(pse);

	/*
	 * Transfer wakeup_gran worth of time from current to @p,
	 * this should ensure current is no longer eligible to run.
	 */
	unsigned long wakeup_gran = ACCESS_ONCE(sysctl_sched_wakeup_granularity);

	/* Bring current's accounting up to date before touching its vruntime. */
	update_rq_clock(rq);
	update_curr(cfs_rq);

	/*
	 * A queued @p is taken out of its cfs_rq while its vruntime changes
	 * and put back afterwards; if @p is the entity currently running on
	 * its cfs_rq, only its accounting is refreshed instead.
	 */
	if (pse != p_cfs_rq->curr) {
		__dequeue_entity(p_cfs_rq, pse);
	} else {
		update_rq_clock(p_rq);
		update_curr(p_cfs_rq);
	}

	/* Same wall-time amount, converted to virtual time per entity. */
	se->vruntime += calc_delta_fair(wakeup_gran, se);
	pse->vruntime -= calc_delta_fair(wakeup_gran, pse);

	clear_buddies(cfs_rq, se);

	if (pse != p_cfs_rq->curr) {
		__enqueue_entity(p_cfs_rq, pse);
		/* @p lives on @p_rq, so the preemption check must use that rq. */
		check_preempt_curr(p_rq, p, 0);
	}
}

This isn't strictly correct for the group scheduling case though, that
wants a for_each_sched_entity() loop for both se and pse, but I'd have
to like actually think about that ;-)

A quick hack might be simply disallowing yield_to between different
groups; add something like the below to the above function:

#ifdef CONFIG_FAIR_GROUP_SCHED
	if (cfs_rq->tg != p_cfs_rq->tg)
		return;
#endif

next prev parent reply	other threads:[~2010-12-14 12:22 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-12-14  3:44 [RFC -v2 PATCH 0/3] directed yield for Pause Loop Exiting Rik van Riel
2010-12-14  3:45 ` [RFC -v2 PATCH 1/3] kvm: keep track of which task is running a KVM vcpu Rik van Riel
2010-12-14  3:46 ` [RFC -v2 PATCH 2/3] sched: add yield_to function Rik van Riel
2010-12-14  6:08   ` Mike Galbraith
2010-12-14 10:24     ` Srivatsa Vaddagiri
2010-12-14 11:03       ` Mike Galbraith
2010-12-14 11:26         ` Srivatsa Vaddagiri
2010-12-14 12:47           ` Mike Galbraith
2010-12-16 19:49     ` Rik van Riel
2010-12-17  6:56       ` Mike Galbraith
2010-12-17  7:15         ` Mike Galbraith
2010-12-18 17:08           ` Avi Kivity
2010-12-18 19:13             ` Mike Galbraith
2010-12-19  6:08               ` Avi Kivity
2010-12-20 15:40           ` Rik van Riel
2010-12-20 16:04             ` Mike Galbraith
2010-12-28  5:54               ` Mike Galbraith
2010-12-28  6:08                 ` Gene Heskett
2010-12-28  6:16                   ` Mike Galbraith
2010-12-28 16:18                     ` Gene Heskett
2010-12-28 22:34                 ` Rik van Riel
2010-12-17 15:09         ` Avi Kivity
2010-12-17 19:51           ` Mike Galbraith
2010-12-18 17:02             ` Avi Kivity
2010-12-18 19:06               ` Mike Galbraith
2010-12-19  6:21                 ` Avi Kivity
2010-12-19 10:05                   ` Mike Galbraith
2010-12-19  9:19                     ` Avi Kivity
2010-12-19 11:18                       ` Mike Galbraith
2010-12-20  8:39                       ` Mike Galbraith
2010-12-20  8:45                         ` Avi Kivity
2010-12-20  8:55                           ` Mike Galbraith
2010-12-20  9:03                             ` Avi Kivity
2010-12-20  9:30                               ` Mike Galbraith
2010-12-20  9:46                                 ` Avi Kivity
2010-12-20 10:33                                   ` Mike Galbraith
2010-12-20 10:39                                     ` Avi Kivity
2010-12-20 10:46                                       ` Mike Galbraith
2010-12-20 10:49                                         ` Avi Kivity
2010-12-20 10:49                                           ` Avi Kivity
2010-12-20 10:50                                           ` Mike Galbraith
2010-12-20 11:06                                             ` Avi Kivity
2010-12-20 11:06                                               ` Avi Kivity
2010-12-14 12:22   ` Peter Zijlstra [this message]
2010-12-18 14:50     ` Rik van Riel
2010-12-14  3:48 ` [RFC -v2 PATCH 3/3] kvm: use yield_to instead of sleep in kvm_vcpu_on_spin Rik van Riel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1292329322.6803.1609.camel@twins \
    --to=a.p.zijlstra@chello.nl \
    --cc=avi@redhat.com \
    --cc=chrisw@sous-sol.org \
    --cc=efault@gmx.de \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=riel@redhat.com \
    --cc=vatsa@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.