Subject: Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
From: Peter Zijlstra
To: Mike Galbraith
Cc: Ingo Molnar, LKML
Date: Fri, 27 Nov 2009 13:21:39 +0100
Message-ID: <1259324499.6483.257.camel@laptop>
In-Reply-To: <1259322921.3878.28.camel@marge.simson.net>
References: <1258891664.14325.30.camel@marge.simson.net>
	 <1259252785.31676.216.camel@laptop>
	 <1259322921.3878.28.camel@marge.simson.net>

I'm poking at it like so... the below hasn't seen a compiler yet, and I
know there are still a few races in the ttwu() path.

---
 include/linux/sched.h |    6 +-
 kernel/fork.c         |    2 +-
 kernel/sched.c        |  170 +++++++++++++++++++++++++++++++------------------
 kernel/sched_fair.c   |   26 +++----
 4 files changed, 124 insertions(+), 80 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c46023..ff5ec5a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1099,7 +1099,7 @@ struct sched_class {
 
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
-	void (*task_new) (struct rq *rq, struct task_struct *p);
+	void (*fork) (struct task_struct *p);
 
 	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
 			       int running);
@@ -2456,7 +2456,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
 	return task_thread_info(p)->cpu;
 }
 
-extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
+extern void set_task_cpu_locked(struct task_struct *p, int cpu);
 
 #else
 
@@ -2465,7 +2465,7 @@ static inline unsigned int task_cpu(const struct task_struct *p)
 	return 0;
 }
 
-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void set_task_cpu_locked(struct task_struct *p, int cpu)
 {
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 166b8c4..29c4aec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1242,7 +1242,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
-		set_task_cpu(p, smp_processor_id());
+		set_task_cpu_locked(p, smp_processor_id());
 
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 0cbf2ef..dced5af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1967,7 +1967,7 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+static inline void raw_set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	set_task_rq(p, cpu);
 #ifdef CONFIG_SMP
@@ -2008,6 +2008,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 void kthread_bind(struct task_struct *p, unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
+	struct rq *old_rq = task_rq(p);
 	unsigned long flags;
 
 	/* Must have done schedule() in kthread() before we set_task_cpu */
@@ -2016,13 +2017,14 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
 		return;
 	}
 
-	spin_lock_irqsave(&rq->lock, flags);
-	update_rq_clock(rq);
+	local_irq_save(flags);
+	double_rq_lock(old_rq, rq);
 	set_task_cpu(p, cpu);
 	p->cpus_allowed = cpumask_of_cpu(cpu);
 	p->rt.nr_cpus_allowed = 1;
 	p->flags |= PF_THREAD_BOUND;
-	spin_unlock_irqrestore(&rq->lock, flags);
+	double_rq_unlock(old_rq, rq);
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kthread_bind);
 
@@ -2056,40 +2058,101 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	return delta < (s64)sysctl_sched_migration_cost;
 }
 
+static void set_task_cpu_all(struct task_struct *p, int old_cpu, int new_cpu)
+{
+	trace_sched_migrate_task(p, new_cpu);
+
+	p->se.nr_migrations++;
+#ifdef CONFIG_SCHEDSTATS
+	if (task_hot(p, cpu_rq(old_cpu)->clock, NULL))
+		schedstat_inc(p, se.nr_forced2_migrations);
+#endif
+	perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+
+	raw_set_task_cpu(p, new_cpu);
+}
 
-void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+static void set_task_cpu(struct task_struct *p, int new_cpu)
 {
 	int old_cpu = task_cpu(p);
-	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
-	struct cfs_rq *old_cfsrq = task_cfs_rq(p),
-		      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
-	u64 clock_offset;
 
-	clock_offset = old_rq->clock - new_rq->clock;
+	if (old_cpu == new_cpu)
+		return;
 
-	trace_sched_migrate_task(p, new_cpu);
+	if (p->sched_class == &fair_sched_class) {
+		struct cfs_rq *old_cfsrq = task_cfs_rq(p),
+			      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
 
-#ifdef CONFIG_SCHEDSTATS
-	if (p->se.wait_start)
-		p->se.wait_start -= clock_offset;
-	if (p->se.sleep_start)
-		p->se.sleep_start -= clock_offset;
-	if (p->se.block_start)
-		p->se.block_start -= clock_offset;
-#endif
-	if (old_cpu != new_cpu) {
-		p->se.nr_migrations++;
-#ifdef CONFIG_SCHEDSTATS
-		if (task_hot(p, old_rq->clock, NULL))
-			schedstat_inc(p, se.nr_forced2_migrations);
-#endif
-		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
-				     1, 1, NULL, 0);
+		p->se.vruntime -= old_cfsrq->min_vruntime -
+				  new_cfsrq->min_vruntime;
+	}
+
+	set_task_cpu_all(p, old_cpu, new_cpu);
+}
+
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+}
+
+/*
+ * Self-balance a task without holding both rq locks.
+ *
+ * Used by fork and wakeup: since in neither case the task was present
+ * on a rq, the load-balancer wouldn't encounter it and move it away
+ * underneath us.
+ *
+ * And since we have IRQs disabled, we keep off hot-unplug.
+ */
+static struct rq *
+balance_task(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	struct rq *rq, *old_rq;
+	u64 vdelta;
+	int cpu;
+
+	rq = old_rq = task_rq(p);
+
+	if (p->sched_class == &fair_sched_class)
+		vdelta = task_cfs_rq(p)->min_vruntime;
+
+	__task_rq_unlock(old_rq);
+
+	cpu = select_task_rq(p, sd_flags, wake_flags);
+
+	rq = cpu_rq(cpu);
+	spin_lock(&rq->lock);
+	if (rq == old_rq)
+		return rq;
+
+	update_rq_clock(rq);
+
+	set_task_cpu_all(p, task_cpu(p), cpu);
+
+	if (p->sched_class == &fair_sched_class) {
+		vdelta -= task_cfs_rq(p)->min_vruntime;
+		p->se.vruntime -= vdelta;
 	}
-	p->se.vruntime -= old_cfsrq->min_vruntime -
-			  new_cfsrq->min_vruntime;
 
-	__set_task_cpu(p, new_cpu);
+	return rq;
+}
+
+void set_task_cpu_locked(struct task_struct *p, int new_cpu)
+{
+	struct rq *old_rq = task_rq(p);
+	struct rq *new_rq = cpu_rq(new_cpu);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	double_rq_lock(old_rq, new_rq);
+	/*
+	 * Called from fork() on the child task before it gets
+	 * put on the tasklist; no way we could've been migrated.
+	 */
+	WARN_ON_ONCE(old_rq != task_rq(p));
+	set_task_cpu(p, new_cpu);
+	double_rq_unlock(old_rq, new_rq);
+	local_irq_restore(flags);
 }
 
 struct migration_req {
@@ -2373,19 +2436,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	 */
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible--;
+	p->state = TASK_WAKING;
+	rq = balance_task(p, SD_BALANCE_WAKE, wake_flags);
 	task_rq_unlock(rq, &flags);
 
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu) {
-		local_irq_save(flags);
-		rq = cpu_rq(cpu);
-		update_rq_clock(rq);
-		set_task_cpu(p, cpu);
-		local_irq_restore(flags);
-	}
-	rq = task_rq_lock(p, &flags);
-
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
@@ -2557,8 +2612,9 @@ static void __sched_fork(struct task_struct *p)
  */
 void sched_fork(struct task_struct *p, int clone_flags)
 {
-	int cpu = get_cpu();
 	unsigned long flags;
+	struct rq *rq;
+	int cpu;
 
 	__sched_fork(p);
 
@@ -2592,13 +2648,15 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
-#ifdef CONFIG_SMP
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
-	local_irq_save(flags);
-	update_rq_clock(cpu_rq(cpu));
-	set_task_cpu(p, cpu);
-	local_irq_restore(flags);
+	rq = task_rq_lock(current, &flags);
+	update_rq_clock(rq);
+
+	if (p->sched_class->fork)
+		p->sched_class->fork(p);
+
+	rq = balance_task(p, SD_BALANCE_FORK, 0);
+	task_rq_unlock(rq, &flags);
+
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
@@ -2631,17 +2689,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	update_rq_clock(rq);
-
-	if (!p->sched_class->task_new || !current->se.on_rq) {
-		activate_task(rq, p, 0);
-	} else {
-		/*
-		 * Let the scheduling class do new task startup
-		 * management (if any):
-		 */
-		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
-	}
+	activate_task(rq, p, 0);
 	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -3156,7 +3204,7 @@ out:
 void sched_exec(void)
 {
 	int new_cpu, this_cpu = get_cpu();
-	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
+	new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
 	put_cpu();
 	if (new_cpu != this_cpu)
 		sched_migrate_task(current, new_cpu);
@@ -6980,7 +7028,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->prio = idle->normal_prio = MAX_PRIO;
 	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
-	__set_task_cpu(idle, cpu);
+	raw_set_task_cpu(idle, cpu);
 
 	rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0ff21af..7c27d2f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1922,28 +1922,26 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 
 /*
- * Share the fairness runtime between parent and child, thus the
- * total amount of pressure for CPU stays equal - new tasks
- * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * Called at fork, before the child is on the tasklist and before
+ * self-balance; the current (parent) task's rq is locked.
  */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void fork_fair(struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+	struct rq *rq = this_rq();
+	struct cfs_rq *cfs_rq = task_cfs_rq(current);
+	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 	int this_cpu = smp_processor_id();
 
+	if (unlikely(task_cpu(p) != this_cpu))
+		raw_set_task_cpu(p, this_cpu);
+
 	sched_info_queued(p);
-
 	update_curr(cfs_rq);
 	if (curr)
 		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
-	/* 'curr' will be NULL if the child belongs to a different group */
-	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-			curr && entity_before(curr, se)) {
+	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1951,8 +1949,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		swap(curr->vruntime, se->vruntime);
 		resched_task(rq->curr);
 	}
-
-	enqueue_task_fair(rq, p, 0);
 }
 
 /*
@@ -2056,7 +2052,7 @@ static const struct sched_class fair_sched_class = {
 	.set_curr_task		= set_curr_task_fair,
 	.task_tick		= task_tick_fair,
-	.task_new		= task_new_fair,
+	.fork			= fork_fair,
 
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,
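
To see why the min_vruntime re-basing that set_task_cpu() and
balance_task() apply on every cross-CPU move preserves a task's relative
position, here is a toy user-space sketch of just that arithmetic. It is
not kernel code: toy_cfs_rq and toy_se are hypothetical stand-ins for
cfs_rq and sched_entity.

	/*
	 * A task's vruntime is only meaningful relative to its
	 * runqueue's min_vruntime, so migration re-bases it:
	 *
	 *   se->vruntime -= old->min_vruntime - new->min_vruntime;
	 */
	#include <stdio.h>

	struct toy_cfs_rq {
		unsigned long long min_vruntime;	/* per-rq baseline */
	};

	struct toy_se {
		unsigned long long vruntime;		/* rq-relative value */
	};

	/* re-base se->vruntime from 'from' to 'to' */
	static void migrate_vruntime(struct toy_se *se,
				     struct toy_cfs_rq *from,
				     struct toy_cfs_rq *to)
	{
		se->vruntime -= from->min_vruntime - to->min_vruntime;
	}

	int main(void)
	{
		struct toy_cfs_rq rq0 = { .min_vruntime = 1000 };
		struct toy_cfs_rq rq1 = { .min_vruntime = 250 };
		struct toy_se se = { .vruntime = 1040 };	/* 40 ahead of rq0 */

		printf("lag on rq0: %llu\n", se.vruntime - rq0.min_vruntime);
		migrate_vruntime(&se, &rq0, &rq1);
		printf("lag on rq1: %llu\n", se.vruntime - rq1.min_vruntime);
		return 0;
	}

Both printed lags are 40: subtracting the difference of the two
baselines keeps the task the same distance from min_vruntime on the
destination runqueue, which is exactly what keeps a forked or woken task
from being wildly ahead of or behind its new peers.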