From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
To: Mel Gorman <mgorman@suse.de>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Ingo Molnar <mingo@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Linux-MM <linux-mm@kvack.org>,
	LKML <linux-kernel@vger.kernel.org>,
	Preeti U Murthy <preeti@linux.vnet.ibm.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Subject: [RFC PATCH 09/10] sched: Choose a runqueue that has fewer local affinity tasks
Date: Tue, 30 Jul 2013 13:18:24 +0530
Message-ID: <1375170505-5967-10-git-send-email-srikar@linux.vnet.ibm.com>
In-Reply-To: <1375170505-5967-1-git-send-email-srikar@linux.vnet.ibm.com>

While migrating tasks to a different node, the busiest runqueue is not
always the right choice. The busiest runqueue may hold tasks that are
already consolidated on their preferred node, and pulling from it can
actually hurt performance.

Instead, choose a runqueue that has fewer local numa-affine tasks, i.e.
more tasks that would benefit from running on a node other than their
current node. The regular load balancer then pitches in and moves load
from the busiest runqueue to the runqueue from which the cross-node
migration candidates were picked, so the load ends up better
consolidated.
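
As a rough illustration of the heuristic (a userspace sketch, not part
of this patch): a task is treated as "local" when its NUMA fault weight
on the current node, scaled by the number of nodes, exceeds its mm's
total weight across all nodes, i.e. it faults on this node more often
than an even spread would predict. The numa_weights[] array and
nr_node_ids below stand in for the per-mm counters introduced earlier
in the series:

#include <stdbool.h>

/*
 * Sketch of the locality test used by update_local_task_count() and
 * account_numa_enqueue() in this patch.  numa_weights[0..nr_node_ids-1]
 * hold per-node fault weights and numa_weights[nr_node_ids] holds the
 * total across all nodes.
 */
static bool task_is_local(const int *numa_weights, int nr_node_ids,
			  int curnode)
{
	int cur = numa_weights[curnode];	/* weight on current node */
	int total = numa_weights[nr_node_ids];	/* sum over all nodes */

	/* local if this node's share exceeds the per-node average */
	return cur * nr_node_ids > total;
}

For example, with 4 nodes and per-node weights {6, 1, 1, 2} (total 10),
a task running on node 0 is local (6 * 4 > 10) while a task running on
node 1 is not (1 * 4 < 10).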

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 include/linux/sched.h |    2 +
 kernel/sched/fair.c   |   82 +++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h  |    1 +
 3 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba188f1..c5d0a13 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1507,6 +1507,8 @@ struct task_struct {
 	u64 node_stamp;			/* migration stamp  */
 	struct callback_head numa_work;
 	int migrate_seq;
+	bool pinned_task;
+	bool local_task;
 #endif /* CONFIG_NUMA_BALANCING */
 
 	struct rcu_head rcu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a99aebc..e749650 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -805,6 +805,36 @@ static void task_numa_placement(struct task_struct *p)
 	/* FIXME: Scheduling placement policy hints go here */
 }
 
+static void update_local_task_count(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	int curnode = cpu_to_node(cpu_of(rq));
+	int cur_numa_weight = 0;
+	int total_numa_weight = 0;
+
+	if (!p->pinned_task) {
+		if (p->mm && p->mm->numa_weights) {
+			cur_numa_weight = atomic_read(&p->mm->numa_weights[curnode]);
+			total_numa_weight = atomic_read(&p->mm->numa_weights[nr_node_ids]);
+		}
+
+		/*
+		 * Tasks that are neither pinned nor numa-affine to the current
+		 * node are accounted as non-local tasks.
+		 */
+		if (p->local_task != (cur_numa_weight * nr_node_ids > total_numa_weight)) {
+			if (!p->local_task) {
+				rq->non_local_task_count--;
+				p->local_task = true;
+			} else {
+				rq->non_local_task_count++;
+				p->local_task = false;
+			}
+
+		}
+	}
+}
+
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
@@ -826,6 +856,9 @@ void task_numa_fault(int node, int pages, bool migrated)
 			p->numa_scan_period + jiffies_to_msecs(10));
 
 	task_numa_placement(p);
+
+	/* Should this be moved to update_curr()? */
+	update_local_task_count(p);
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -996,16 +1029,31 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	}
 }
 
+static void add_non_local_task_count(struct rq *rq, struct task_struct *p,
+		int value)
+{
+	if (p->pinned_task || p->local_task)
+		return;
+	else
+		rq->non_local_task_count += value;
+}
+
 static void account_numa_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p)
 {
 	struct mm_struct *mm = p->mm;
 	struct rq *rq = rq_of(cfs_rq);
 	int curnode = cpu_to_node(cpu_of(rq));
+	int cur_numa_weight = 0;
+	int total_numa_weight = 0;
 
 	if (mm && mm->numa_weights) {
-		atomic_read(&mm->numa_weights[curnode]);
-		atomic_read(&mm->numa_weights[nr_node_ids]);
+		cur_numa_weight = atomic_inc_return(&mm->numa_weights[curnode]);
+		total_numa_weight = atomic_inc_return(&mm->numa_weights[nr_node_ids]);
 	}
+
+	p->pinned_task = (p->nr_cpus_allowed == 1);
+	p->local_task = (cur_numa_weight * nr_node_ids > total_numa_weight);
+	add_non_local_task_count(rq, p, 1);
 }
 
 static void account_numa_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p)
@@ -1019,6 +1067,10 @@ static void account_numa_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p)
 		atomic_dec(&mm->numa_weights[curnode]);
 		atomic_dec(&mm->numa_weights[nr_node_ids]);
 	}
+
+	add_non_local_task_count(rq, p, -1);
+	p->pinned_task = false;
+	p->local_task = false;
 }
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
@@ -5046,6 +5098,27 @@ find_busiest_group(struct lb_env *env, int *balance)
 	return NULL;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static struct rq *find_numa_queue(struct lb_env *env,
+				struct sched_group *group, struct rq *busy_rq)
+{
+	struct rq *rq;
+	int i;
+
+	for_each_cpu(i, sched_group_cpus(group)) {
+		if (!cpumask_test_cpu(i, env->cpus))
+			continue;
+
+		rq = cpu_rq(i);
+		if (rq->nr_running > 1) {
+			if (rq->non_local_task_count > busy_rq->non_local_task_count)
+				busy_rq = rq;
+		}
+	}
+	return busy_rq;
+}
+#endif
+
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
@@ -5187,8 +5260,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	if (busiest->nr_running > 1) {
 #ifdef CONFIG_NUMA_BALANCING
 		if (sd->flags & SD_NUMA) {
-			if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+			if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu)) {
 				env.iterations = 0;
+				busiest = find_numa_queue(&env, group, busiest);
+			}
+
 		}
 #endif
 		/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9f60d74..5e620b7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -486,6 +486,7 @@ struct rq {
 	struct sched_avg avg;
 #ifdef CONFIG_NUMA_BALANCING
 	struct task_struct *push_task;
+	unsigned int non_local_task_count;
 #endif
 };
 
-- 
1.7.1

Thread overview: 24+ messages
2013-07-30  7:48 [RFC PATCH 00/10] Improve numa scheduling by consolidating tasks Srikar Dronamraju
2013-07-30  7:48 ` [RFC PATCH 01/10] sched: Introduce per node numa weights Srikar Dronamraju
2013-07-30  7:48 ` [RFC PATCH 02/10] sched: Use numa weights while migrating tasks Srikar Dronamraju
2013-07-30  7:48 ` [RFC PATCH 03/10] sched: Select a better task to pull across node using iterations Srikar Dronamraju
2013-07-30  7:48 ` [RFC PATCH 04/10] sched: Move active_load_balance_cpu_stop to a new helper function Srikar Dronamraju
2013-07-30  7:48 ` [RFC PATCH 05/10] sched: Extend idle balancing to look for consolidation of tasks Srikar Dronamraju
2013-07-30  7:48 ` [RFC PATCH 06/10] sched: Limit migrations from a node Srikar Dronamraju
2013-07-30  7:48 ` [RFC PATCH 07/10] sched: Pass hint to active balancer about the task to be chosen Srikar Dronamraju
2013-07-30  7:48 ` [RFC PATCH 08/10] sched: Prevent a task from migrating immediately after an active balance Srikar Dronamraju
2013-07-30  7:48 ` Srikar Dronamraju [this message]
2013-07-30  7:48 ` [RFC PATCH 10/10] x86, mm: Prevent gcc to re-read the pagetables Srikar Dronamraju
2013-07-30  8:17 ` [RFC PATCH 00/10] Improve numa scheduling by consolidating tasks Peter Zijlstra
2013-07-30  8:20   ` Peter Zijlstra
2013-07-30  9:03     ` Srikar Dronamraju
2013-07-30  9:10       ` Peter Zijlstra
2013-07-30  9:26         ` Peter Zijlstra
2013-07-30  9:46         ` Srikar Dronamraju
2013-07-31 15:09           ` Peter Zijlstra
2013-07-31 18:06             ` Srikar Dronamraju
2013-07-30  9:15     ` Srikar Dronamraju
2013-07-30  9:33       ` Peter Zijlstra
2013-07-31 17:35         ` Srikar Dronamraju
2013-07-31 13:33 ` Andrew Theurer
2013-07-31 15:43   ` Srikar Dronamraju
