From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752840Ab0ILMpI (ORCPT ); Sun, 12 Sep 2010 08:45:08 -0400 Received: from casper.infradead.org ([85.118.1.10]:39521 "EHLO casper.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752567Ab0ILMpG convert rfc822-to-8bit (ORCPT ); Sun, 12 Sep 2010 08:45:06 -0400 Subject: Re: [RFC patch 0/2] sched: dynamically adapt granularity with nr_running From: Peter Zijlstra To: Mathieu Desnoyers Cc: LKML , Linus Torvalds , Andrew Morton , Ingo Molnar , Steven Rostedt , Thomas Gleixner , Tony Lindgren , Mike Galbraith In-Reply-To: <20100911173732.551632040@efficios.com> References: <20100911173732.551632040@efficios.com> Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: 8BIT Date: Sun, 12 Sep 2010 14:44:48 +0200 Message-ID: <1284295488.2275.15.camel@laptop> Mime-Version: 1.0 X-Mailer: Evolution 2.28.3 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org You found improved latencies with something like the below as well, right? Except your proglet needs timers to be special too iirc. Thomas objected to 'special' wakeups, and I can fully appreciate why, but maybe we could try it anyway, it's only a reasonably soft hint anyway. ( full series with changelogs at: programming.kicks-ass.net/sekrit/sched-patches.tar.bz2 ) I'm currently running it on my laptop, and while spread is reasonably controlled, interactivity isn't too sucky, but it's not too hot either. 
(I did lower my min_gran to like 1/5th of latency) --- drivers/input/evdev.c | 2 + include/linux/sched.h | 22 +++++-- kernel/sched.c | 8 +- kernel/sched_debug.c | 2 - kernel/sched_fair.c | 160 +++++++++++++++++++++++------------------------ kernel/sched_features.h | 13 ++--- 7 files changed, 107 insertions(+), 102 deletions(-) diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 5808731..1c5b626 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -84,6 +84,7 @@ static void evdev_event(struct input_handle *handle, event.code = code; event.value = value; + sched_wake_interactive_enable(); rcu_read_lock(); client = rcu_dereference(evdev->grab); @@ -96,6 +97,7 @@ static void evdev_event(struct input_handle *handle, rcu_read_unlock(); wake_up_interruptible(&evdev->wait); + sched_wake_interactive_disable(); } static int evdev_fasync(int fd, struct file *file, int on) diff --git a/include/linux/sched.h b/include/linux/sched.h index 53eb33c..dd40801 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1097,7 +1097,6 @@ struct sched_statistics { u64 block_start; u64 block_max; u64 exec_max; - u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; @@ -1121,7 +1120,8 @@ struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; struct list_head group_node; - unsigned int on_rq; + unsigned int on_rq : 1, + interactive : 1; u64 exec_start; u64 sum_exec_runtime; @@ -1239,11 +1239,11 @@ struct task_struct { unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ - unsigned in_iowait:1; - - /* Revert to default priority/policy when forking */ - unsigned sched_reset_on_fork:1; + unsigned sched_in_iowait :1; /* Called io_schedule() */ + unsigned sched_reset_on_fork :1; /* Revert to default priority/policy + * on fork */ + unsigned sched_wake_interactive:4; /* User driven wakeup */ pid_t pid; pid_t tgid; @@ -1506,6 +1506,16 @@ struct task_struct 
{ #endif }; +static inline void sched_wake_interactive_enable(void) +{ + current->sched_wake_interactive++; +} + +static inline void sched_wake_interactive_disable(void) +{ + current->sched_wake_interactive--; +} + /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) diff --git a/kernel/sched.c b/kernel/sched.c index 1ab8394..89ff2c3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5125,9 +5125,9 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); - current->in_iowait = 1; + current->sched_in_iowait = 1; schedule(); - current->in_iowait = 0; + current->sched_in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -5140,9 +5140,9 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); - current->in_iowait = 1; + current->sched_in_iowait = 1; ret = schedule_timeout(timeout); - current->in_iowait = 0; + current->sched_in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d1..c301164 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -76,7 +76,6 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, PN(se->statistics.sleep_max); PN(se->statistics.block_max); PN(se->statistics.exec_max); - PN(se->statistics.slice_max); PN(se->statistics.wait_max); PN(se->statistics.wait_sum); P(se->statistics.wait_count); @@ -408,7 +407,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.statistics.sleep_max); PN(se.statistics.block_max); PN(se.statistics.exec_max); - PN(se.statistics.slice_max); PN(se.statistics.wait_max); PN(se.statistics.wait_sum); P(se.statistics.wait_count); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9b5b4f8..a1ad97d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -301,27 +301,6 @@ static inline s64 entity_key(struct cfs_rq 
*cfs_rq, struct sched_entity *se) return se->vruntime - cfs_rq->min_vruntime; } -static void update_min_vruntime(struct cfs_rq *cfs_rq) -{ - u64 vruntime = cfs_rq->min_vruntime; - - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; - - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); - - if (!cfs_rq->curr) - vruntime = se->vruntime; - else - vruntime = min_vruntime(vruntime, se->vruntime); - } - - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); -} - /* * Enqueue an entity into the rb-tree: */ @@ -495,6 +474,30 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +static void update_min_vruntime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +{ + struct sched_entity *left = __pick_next_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 new_vruntime, vruntime; + + if (left && curr) + vruntime = min_vruntime(left->vruntime, curr->vruntime); + else if (left) + vruntime = left->vruntime; + else if (curr) + vruntime = curr->vruntime; + else + return; + + new_vruntime = cfs_rq->min_vruntime; + if (sched_feat(DYN_MIN_VRUNTIME) && delta_exec) { + new_vruntime += calc_delta_mine(delta_exec, NICE_0_LOAD, + &cfs_rq->load); + } + + cfs_rq->min_vruntime = max_vruntime(new_vruntime, vruntime); +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
@@ -513,7 +516,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); + update_min_vruntime(cfs_rq, delta_exec); } static void update_curr(struct cfs_rq *cfs_rq) @@ -688,7 +691,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->statistics.sum_sleep_runtime += delta; if (tsk) { - if (tsk->in_iowait) { + if (tsk->sched_in_iowait) { se->statistics.iowait_sum += delta; se->statistics.iowait_count++; trace_sched_stat_iowait(tsk, delta); @@ -708,6 +711,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } } #endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; } static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -718,7 +722,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) if (d < 0) d = -d; - if (d > 3*sysctl_sched_latency) + if (d > 3*cfs_rq->nr_running*sysctl_sched_latency) schedstat_inc(cfs_rq, nr_spread_over); #endif } @@ -738,7 +742,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime += sched_vslice(cfs_rq, se); /* sleeps up to a single latency don't count. */ - if (!initial) { + if (sched_feat(FAIR_SLEEPERS) && !initial) { unsigned long thresh = sysctl_sched_latency; /* @@ -752,9 +756,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } /* ensure we never gain time by being placed backwards. 
*/ - vruntime = max_vruntime(se->vruntime, vruntime); - - se->vruntime = vruntime; + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void @@ -826,7 +828,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); - update_min_vruntime(cfs_rq); + update_min_vruntime(cfs_rq, 0); /* * Normalize the entity after updating the min_vruntime because the @@ -837,44 +839,34 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime -= cfs_rq->min_vruntime; } +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - unsigned long ideal_runtime, delta_exec; + unsigned long slice = sched_slice(cfs_rq, curr); + + if (curr->sum_exec_runtime - curr->prev_sum_exec_runtime < slice) { + struct sched_entity *pse = __pick_next_entity(cfs_rq); + + if (pse && wakeup_preempt_entity(curr, pse) == 1) + goto preempt; - ideal_runtime = sched_slice(cfs_rq, curr); - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); return; } /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. 
*/ - if (!sched_feat(WAKEUP_PREEMPT)) - return; - - if (delta_exec < sysctl_sched_min_granularity) - return; + clear_buddies(cfs_rq, curr); - if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_next_entity(cfs_rq); - s64 delta = curr->vruntime - se->vruntime; - - if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); - } +preempt: + resched_task(rq_of(cfs_rq)->curr); } static void @@ -893,36 +885,21 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; -#ifdef CONFIG_SCHEDSTATS - /* - * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it - * when there are only lesser-weight tasks around): - */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { - se->statistics.slice_max = max(se->statistics.slice_max, - se->sum_exec_runtime - se->prev_sum_exec_runtime); - } -#endif - se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; /* - * Prefer last buddy, try to return the CPU to a preempted task. + * Prefer the next buddy, only set through the interactivity logic. 
*/ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; clear_buddies(cfs_rq, se); @@ -931,6 +908,13 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { + unsigned long slice = sched_slice(cfs_rq, prev); + + prev->interactive = 0; + + if (prev->sum_exec_runtime - prev->prev_sum_exec_runtime >= slice) + prev->prev_sum_exec_runtime += slice; + /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: @@ -1652,7 +1636,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int scale = cfs_rq->nr_running >= sched_nr_latency; + /* + * The buddy logic doesn't work well when there's not actually enough + * tasks for there to be buddies. 
+ */ + int buddies = (cfs_rq->nr_running >= 2); if (unlikely(rt_prio(p->prio))) goto preempt; @@ -1663,8 +1651,16 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) + if ((se->interactive || curr->sched_wake_interactive) && + !p->sched_in_iowait) + pse->interactive = 1; + + if (!(wake_flags & WF_FORK) && pse->interactive) { + clear_buddies(cfs_rq, NULL); set_next_buddy(pse); + update_curr(cfs_rq); + goto preempt; + } /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -1709,7 +1705,7 @@ preempt: if (unlikely(!se->on_rq || curr == rq->idle)) return; - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + if (sched_feat(LAST_BUDDY) && buddies && entity_is_task(se)) set_last_buddy(se); } @@ -3404,11 +3400,13 @@ static void nohz_balancer_kick(int cpu) } if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { - struct call_single_data *cp; - cpu_rq(ilb_cpu)->nohz_balance_kick = 1; - cp = &per_cpu(remote_sched_softirq_cb, cpu); - __smp_call_function_single(ilb_cpu, cp, 0); + + if (ilb_cpu != cpu) { + struct call_single_data *cp; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } } return; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 83c66e8..33b81f9 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -3,13 +3,14 @@ * them to run sooner, but does not allow tons of sleepers to * rip the spread apart. */ +SCHED_FEAT(FAIR_SLEEPERS, 0) SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) /* * Place new tasks ahead so that they do not starve already running * tasks */ -SCHED_FEAT(START_DEBIT, 1) +SCHED_FEAT(START_DEBIT, 0) /* * Should wakeups try to preempt running tasks. 
@@ -25,13 +26,6 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) /* - * Prefer to schedule the task we woke last (assuming it failed - * wakeup-preemption), since its likely going to consume data we - * touched, increases cache locality. - */ -SCHED_FEAT(NEXT_BUDDY, 0) - -/* * Prefer to schedule the task that ran last (when we did * wake-preempt) as that likely will touch the same data, increases * cache locality. @@ -55,6 +49,9 @@ SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_SHARES_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(DYN_MIN_VRUNTIME, 1) +SCHED_FEAT(INTERACTIVE, 1) + /* * Spin-wait on mutex acquisition when the mutex owner is running on * another cpu -- assumes that when the owner is running, it will soon