[PATCH 10/11] sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Mel Gorman <mgorman@techsingularity.net>
To: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>,
	Valentin Schneider <valentin.schneider@arm.com>,
	Phil Auld <pauld@redhat.com>, LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH 10/11] sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity
Date: Wed, 12 Feb 2020 15:46:06 +0000	[thread overview]
Message-ID: <20200212154606.GO3466@techsingularity.net> (raw)
In-Reply-To: <20200212093654.4816-1-mgorman@techsingularity.net>

The standard load balancer generally allows an imbalance to exist if
a domain has spare capacity. This patch uses similar logic within NUMA
balancing when moving a task to a preferred node. This is not a perfect
comparison with the load balancer but should be a close enough match
when the destination domain has spare capacity and the imbalance is not
too large.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 79 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b2476ef0b056..69e41204cfae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1473,21 +1473,19 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-
-static unsigned long cpu_runnable_load(struct rq *rq)
-{
-	return cfs_rq_runnable_load_avg(&rq->cfs);
-}
-
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
-	unsigned long load;
+	unsigned long group_load;
+	unsigned long group_util;
 
 	/* Total compute capacity of CPUs on a node */
-	unsigned long compute_capacity;
+	unsigned long group_capacity;
+
+	unsigned int sum_nr_running;
 
 	/* Details on idle CPUs */
+	unsigned int group_weight;
+	int nr_idle;
 	int idle_cpu;
 };
 
@@ -1511,6 +1509,22 @@ static inline bool is_core_idle(int cpu)
 /* Forward declarations of select_idle_sibling helpers */
 static inline bool test_idle_cores(int cpu, bool def);
 
+/* Forward declarations of lb helpers */
+static unsigned long cpu_load(struct rq *rq);
+static inline unsigned long cpu_util(int cpu);
+static inline bool __lb_has_capacity(unsigned int imbalance_pct,
+	unsigned int sum_nr_running, unsigned int group_weight,
+	unsigned long group_capacity, unsigned long group_util);
+static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+
+/* NUMA Balancing equivalents for LB helpers */
+static inline bool
+numa_has_capacity(unsigned int imbalance_pct, struct numa_stats *ns)
+{
+	return __lb_has_capacity(imbalance_pct, ns->sum_nr_running + 1,
+		ns->group_weight, ns->group_capacity, ns->group_util);
+}
+
 /*
  * Gather all necessary information to make NUMA balancing placement
  * decisions that are compatible with standard load balanced. This
@@ -1529,14 +1543,20 @@ update_numa_stats(struct numa_stats *ns, int nid,
 	ns->idle_cpu = -1;
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
 		struct rq *rq = cpu_rq(cpu);
+		unsigned int nr_running = rq->nr_running;
 
-		ns->load += cpu_runnable_load(rq);
-		ns->compute_capacity += capacity_of(cpu);
+		ns->group_load += cpu_load(rq);
+		ns->group_util += cpu_util(cpu);
+		ns->group_capacity += capacity_of(cpu);
+		ns->group_weight++;
+		ns->sum_nr_running += nr_running;
 
-		if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+		if (!nr_running && idle_cpu(cpu)) {
 			int this_llc_id;
 
-			if (READ_ONCE(rq->numa_migrate_on) ||
+			ns->nr_idle++;
+
+			if (!find_idle || READ_ONCE(rq->numa_migrate_on) ||
 			    !cpumask_test_cpu(cpu, p->cpus_ptr))
 				continue;
 
@@ -1646,13 +1666,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
 	 * ------------ vs ---------
 	 * src_capacity    dst_capacity
 	 */
-	src_capacity = env->src_stats.compute_capacity;
-	dst_capacity = env->dst_stats.compute_capacity;
+	src_capacity = env->src_stats.group_capacity;
+	dst_capacity = env->dst_stats.group_capacity;
 
 	imb = abs(dst_load * src_capacity - src_load * dst_capacity);
 
-	orig_src_load = env->src_stats.load;
-	orig_dst_load = env->dst_stats.load;
+	orig_src_load = env->src_stats.group_load;
+	orig_dst_load = env->dst_stats.group_load;
 
 	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
 
@@ -1799,8 +1819,8 @@ static void task_numa_compare(struct task_numa_env *env,
 	if (!load)
 		goto assign;
 
-	dst_load = env->dst_stats.load + load;
-	src_load = env->src_stats.load - load;
+	dst_load = env->dst_stats.group_load + load;
+	src_load = env->src_stats.group_load - load;
 
 	if (load_too_imbalanced(src_load, dst_load, env))
 		goto unlock;
@@ -1838,23 +1858,38 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 	bool maymove = false;
 	int cpu;
 
-	load = task_h_load(env->p);
-	dst_load = env->dst_stats.load + load;
-	src_load = env->src_stats.load - load;
-
 	/*
-	 * If the improvement from just moving env->p direction is better
-	 * than swapping tasks around, check if a move is possible.
+	 * If the load balancer is unlikely to interfere with the task after
+	 * a migration then use an idle CPU.
 	 */
-	maymove = !load_too_imbalanced(src_load, dst_load, env);
+	if (env->dst_stats.idle_cpu >= 0) {
+		unsigned int imbalance;
+		int src_running, dst_running;
 
-	/* Use an idle CPU if one has been found already */
-	if (maymove && env->dst_stats.idle_cpu >= 0) {
-		env->dst_cpu = env->dst_stats.idle_cpu;
-		task_numa_assign(env, NULL, 0);
-		return;
+		/* Would movement cause an imbalance? */
+		src_running = env->src_stats.sum_nr_running - 1;
+		dst_running = env->dst_stats.sum_nr_running + 1;
+		imbalance = max(0, dst_running - src_running);
+		imbalance = adjust_numa_imbalance(imbalance, src_running);
+
+		/* Use idle CPU if there is spare capacity and no imbalance */
+		if (numa_has_capacity(env->imbalance_pct, &env->dst_stats) &&
+		    !imbalance) {
+			env->dst_cpu = env->dst_stats.idle_cpu;
+			task_numa_assign(env, NULL, 0);
+			return;
+		}
 	}
 
+	/*
+	 * If using an idle CPU would cause an imbalance that would likely
+	 * be overridden by the load balancer, consider the load instead.
+	 */
+	load = task_h_load(env->p);
+	dst_load = env->dst_stats.group_load + load;
+	src_load = env->src_stats.group_load - load;
+	maymove = !load_too_imbalanced(src_load, dst_load, env);
+
 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
 		/* Skip this CPU if the source task cannot migrate */
 		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
@@ -8048,18 +8083,27 @@ static inline int sg_imbalanced(struct sched_group *group)
  * any benefit for the load balance.
  */
 static inline bool
-group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
+__lb_has_capacity(unsigned int imbalance_pct, unsigned int sum_nr_running,
+	unsigned int group_weight, unsigned long group_capacity,
+	unsigned long group_util)
 {
-	if (sgs->sum_nr_running < sgs->group_weight)
+	if (sum_nr_running < group_weight)
 		return true;
 
-	if ((sgs->group_capacity * 100) >
-			(sgs->group_util * imbalance_pct))
+	if ((group_capacity * 100) >
+			(group_util * imbalance_pct))
 		return true;
 
 	return false;
 }
 
+static inline bool
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
+{
+	return __lb_has_capacity(imbalance_pct, sgs->sum_nr_running,
+		sgs->group_weight, sgs->group_capacity, sgs->group_util);
+}
+
 /*
  *  group_is_overloaded returns true if the group has more tasks than it can
  *  handle.
-- 
2.16.4

next prev parent reply	other threads:[~2020-02-12 15:46 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-12  9:36 [RFC PATCH 00/11] Reconcile NUMA balancing decisions with the load balancer Mel Gorman
2020-02-12  9:36 ` [PATCH 01/11] sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains Mel Gorman
2020-02-12  9:36 ` [PATCH 02/11] sched/fair: Optimize select_idle_core() Mel Gorman
2020-02-12  9:36 ` [PATCH 03/11] sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression Mel Gorman
2020-02-12  9:36 ` [PATCH 04/11] sched/numa: Trace when no candidate CPU was found on the preferred node Mel Gorman
2020-02-12  9:36 ` [PATCH 05/11] sched/numa: Distinguish between the different task_numa_migrate failure cases Mel Gorman
2020-02-12 14:43   ` Steven Rostedt
2020-02-12 15:59     ` Mel Gorman
2020-02-12  9:36 ` [PATCH 06/11] sched/numa: Prefer using an idle cpu as a migration target instead of comparing tasks Mel Gorman
2020-02-12  9:36 ` [PATCH 07/11] sched/numa: Find an alternative idle CPU if the CPU is part of an active NUMA balance Mel Gorman
2020-02-12  9:36 ` [PATCH 08/11] sched/numa: Bias swapping tasks based on their preferred node Mel Gorman
2020-02-13 10:31   ` Peter Zijlstra
2020-02-13 11:18     ` Mel Gorman
2020-02-12 13:22 ` [RFC PATCH 00/11] Reconcile NUMA balancing decisions with the load balancer Vincent Guittot
2020-02-12 14:07   ` Valentin Schneider
2020-02-12 15:48   ` Mel Gorman
2020-02-12 16:13     ` Vincent Guittot
2020-02-12 15:45 ` [PATCH 09/11] sched/fair: Split out helper to adjust imbalances between domains Mel Gorman
2020-02-12 15:46 ` Mel Gorman [this message]
2020-02-12 15:46 ` [PATCH 11/11] sched/numa: Use similar logic to the load balancer for moving between overloaded domains Mel Gorman
     [not found] ` <20200214041232.18904-1-hdanton@sina.com>
2020-02-14  7:50   ` [PATCH 08/11] sched/numa: Bias swapping tasks based on their preferred node Mel Gorman

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:b2476ef0b05 dfblob:69e41204cfa )
 OR (
bs:"[PATCH 10/11] sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200212154606.GO3466@techsingularity.net \
    --to=mgorman@techsingularity.net \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=pauld@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=valentin.schneider@arm.com \
    --cc=vincent.guittot@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.