From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from mail.linuxfoundation.org ([140.211.169.12]:59960 "EHLO
        mail.linuxfoundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S932091AbdGJPZA (ORCPT );
        Mon, 10 Jul 2017 11:25:00 -0400
Subject: Patch "sched/numa: Implement NUMA node level wake_affine()" has been added to the 4.12-stable tree
To: riel@redhat.com, efault@gmx.de, gregkh@linuxfoundation.org,
        mgorman@suse.de, mgorman@techsingularity.net, mingo@kernel.org,
        peterz@infradead.org, tglx@linutronix.de, torvalds@linux-foundation.org
Cc: ,
From:
Date: Mon, 10 Jul 2017 17:24:52 +0200
Message-ID: <1499700292210226@kroah.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=ANSI_X3.4-1968
Content-Transfer-Encoding: 8bit
Sender: stable-owner@vger.kernel.org
List-ID:

This is a note to let you know that I've just added the patch titled

    sched/numa: Implement NUMA node level wake_affine()

to the 4.12-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
    sched-numa-implement-numa-node-level-wake_affine.patch
and it can be found in the queue-4.12 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let know about it.


>From 3fed382b46baac83703130fe4cd3d9147f427fb9 Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Fri, 23 Jun 2017 12:55:29 -0400
Subject: sched/numa: Implement NUMA node level wake_affine()

From: Rik van Riel

commit 3fed382b46baac83703130fe4cd3d9147f427fb9 upstream.

Since select_idle_sibling() can place a task anywhere on a socket,
comparing loads between individual CPU cores makes no real sense for
deciding whether to do an affine wakeup across sockets, either.

Instead, compare the load between the sockets in a similar way the load
balancer and the numa balancing code do.

Signed-off-by: Rik van Riel
Cc: Linus Torvalds
Cc: Mel Gorman
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: jhladky@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com
Signed-off-by: Ingo Molnar
Signed-off-by: Mel Gorman
Signed-off-by: Greg Kroah-Hartman

---
 kernel/sched/fair.c |  130 ++++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 59 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2585,6 +2585,60 @@ void task_tick_numa(struct rq *rq, struc
 		}
 	}
 }
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ */
+static inline bool numa_wake_affine(struct sched_domain *sd,
+				    struct task_struct *p, int this_cpu,
+				    int prev_cpu, int sync)
+{
+	struct numa_stats prev_load, this_load;
+	s64 this_eff_load, prev_eff_load;
+
+	update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
+	update_numa_stats(&this_load, cpu_to_node(this_cpu));
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+
+		if (this_load.load > current_load)
+			this_load.load -= current_load;
+		else
+			this_load.load = 0;
+	}
+
+	/*
+	 * In low-load situations, where this_cpu's node is idle due to the
+	 * sync cause above having dropped this_load.load to 0, move the task.
+	 * Moving to an idle socket will not create a bad imbalance.
+	 *
+	 * Otherwise check if the nodes are near enough in load to allow this
+	 * task to be woken on this_cpu's node.
+	 */
+	if (this_load.load > 0) {
+		unsigned long task_load = task_h_load(p);
+
+		this_eff_load = 100;
+		this_eff_load *= prev_load.compute_capacity;
+
+		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+		prev_eff_load *= this_load.compute_capacity;
+
+		this_eff_load *= this_load.load + task_load;
+		prev_eff_load *= prev_load.load - task_load;
+
+		return this_eff_load <= prev_eff_load;
+	}
+
+	return true;
+}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
@@ -2597,6 +2651,13 @@ static inline void account_numa_enqueue(
 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
+
+static inline bool numa_wake_affine(struct sched_domain *sd,
+				    struct task_struct *p, int this_cpu,
+				    int prev_cpu, int sync)
+{
+	return true;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -5386,74 +5447,25 @@ static int wake_wide(struct task_struct
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
-	s64 this_load, load;
-	s64 this_eff_load, prev_eff_load;
-	int idx, this_cpu;
-	struct task_group *tg;
-	unsigned long weight;
-	int balanced;
-
-	idx	  = sd->wake_idx;
-	this_cpu  = smp_processor_id();
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	int this_cpu = smp_processor_id();
+	bool affine = false;
 
 	/*
 	 * Common case: CPUs are in the same socket, and select_idle_sibling()
 	 * will do its thing regardless of what we return:
 	 */
 	if (cpus_share_cache(prev_cpu, this_cpu))
-		return true;
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current CPU:
-	 */
-	if (sync) {
-		tg = task_group(current);
-		weight = current->se.avg.load_avg;
-
-		this_load += effective_load(tg, this_cpu, -weight, -weight);
-		load += effective_load(tg, prev_cpu, 0, -weight);
-	}
-
-	tg = task_group(p);
-	weight = p->se.avg.load_avg;
-
-	/*
-	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
-	 * due to the sync cause above having dropped this_load to 0, we'll
-	 * always have an imbalance, but there's really nothing you can do
-	 * about that, so that's good too.
-	 *
-	 * Otherwise check if either cpus are near enough in load to allow this
-	 * task to be woken on this_cpu.
-	 */
-	this_eff_load = 100;
-	this_eff_load *= capacity_of(prev_cpu);
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= capacity_of(this_cpu);
-
-	if (this_load > 0) {
-		this_eff_load *= this_load +
-			effective_load(tg, this_cpu, weight, weight);
-
-		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
-	}
-
-	balanced = this_eff_load <= prev_eff_load;
+		affine = true;
+	else
+		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+	if (affine) {
+		schedstat_inc(sd->ttwu_move_affine);
+		schedstat_inc(p->se.statistics.nr_wakeups_affine);
+	}
 
-	if (!balanced)
-		return 0;
-
-	schedstat_inc(sd->ttwu_move_affine);
-	schedstat_inc(p->se.statistics.nr_wakeups_affine);
-
-	return 1;
+	return affine;
 }
 
 static inline int task_util(struct task_struct *p);


Patches currently in stable-queue which might be from riel@redhat.com are

queue-4.12/sched-numa-hide-numa_wake_affine-from-up-build.patch
queue-4.12/sched-numa-use-down_read_trylock-for-the-mmap_sem.patch
queue-4.12/sched-core-implement-new-approach-to-scale-select_idle_cpu.patch
queue-4.12/sched-fair-remove-effective_load.patch
queue-4.12/sched-numa-implement-numa-node-level-wake_affine.patch
queue-4.12/sched-fair-simplify-wake_affine-for-the-single-socket-case.patch
queue-4.12/sched-fair-cpumask-export-for_each_cpu_wrap.patch
queue-4.12/sched-numa-override-part-of-migrate_degrades_locality-when-idle-balancing.patch
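
For readers who want to see the arithmetic that numa_wake_affine() ends up doing, below is a minimal stand-alone C sketch of the this_eff_load / prev_eff_load comparison. It is not part of the patch: struct node_stats is a simplified stand-in for the kernel's struct numa_stats, the load and capacity numbers are made up, and imbalance_pct = 125 is only assumed here as a plausible sched_domain value.

/*
 * Illustrative sketch only, not kernel code: mimics the comparison made by
 * numa_wake_affine() between the waker's node and the task's previous node.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's struct numa_stats. */
struct node_stats {
	uint64_t load;			/* runnable load summed over the node */
	uint64_t compute_capacity;	/* CPU capacity summed over the node */
};

/*
 * Same arithmetic as the patch: scale each side by the other node's
 * capacity, and bias the previous node by half the imbalance percentage,
 * so the task only moves when the destination node is clearly not worse.
 */
static bool would_wake_affine(const struct node_stats *prev,
			      const struct node_stats *dest,
			      uint64_t task_load, int imbalance_pct)
{
	int64_t this_eff_load, prev_eff_load;

	if (dest->load == 0)
		return true;	/* destination node idle: always allow */

	this_eff_load = 100;
	this_eff_load *= prev->compute_capacity;
	this_eff_load *= dest->load + task_load;

	prev_eff_load = 100 + (imbalance_pct - 100) / 2;
	prev_eff_load *= dest->compute_capacity;
	prev_eff_load *= prev->load - task_load;

	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	/* Two equal-capacity nodes; the waking task contributes load 100. */
	struct node_stats prev = { .load = 1500, .compute_capacity = 4096 };
	struct node_stats dest = { .load = 1000, .compute_capacity = 4096 };

	printf("affine wakeup allowed: %d\n",
	       would_wake_affine(&prev, &dest, 100, 125));
	return 0;
}

With these example numbers the comparison is 100 * 4096 * 1100 <= 112 * 4096 * 1400, which holds, so the wakeup would be treated as affine and the task pulled to the waker's node.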