All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@elte.hu>,
	Paul Turner <pjt@google.com>,
	Suresh Siddha <suresh.b.siddha@intel.com>,
	Mike Galbraith <efault@gmx.de>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Lai Jiangshan <laijs@cn.fujitsu.com>,
	Dan Smith <danms@us.ibm.com>,
	Bharata B Rao <bharata.rao@gmail.com>,
	Lee Schermerhorn <Lee.Schermerhorn@hp.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Rik van Riel <riel@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC][PATCH 13/26] sched: Implement home-node awareness
Date: Fri, 16 Mar 2012 15:40:41 +0100	[thread overview]
Message-ID: <20120316144240.952119284@chello.nl> (raw)
In-Reply-To: 20120316144028.036474157@chello.nl

[-- Attachment #1: numa-foo-5.patch --]
[-- Type: text/plain, Size: 21396 bytes --]

Implement home node preference in the load-balancer.

This is done in four pieces:

 - task_numa_hot(); make it harder to migrate tasks away from their
   home-node, controlled using the NUMA_HOT feature flag.

 - select_task_rq_fair(); prefer placing the task in its home-node,
   controlled using the NUMA_BIAS feature flag.

 - load_balance(); during the regular pull load-balance pass, try
   pulling tasks that are on the wrong node first with a preference
   of moving them nearer to their home-node through task_numa_hot(),
   controlled through the NUMA_PULL feature flag.

 - load_balance(); when the balancer finds no imbalance, introduce
   some imbalance so that it still prefers to move tasks towards their
   home-node, using active load-balance if needed, controlled through
   the NUMA_PULL_BIAS feature flag.

In order to easily find off-node tasks, split the per-cpu task list
into two parts: rq->cfs_tasks and the new rq->offnode_tasks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h   |    1 
 kernel/sched/core.c     |   22 +++
 kernel/sched/debug.c    |    3 
 kernel/sched/fair.c     |  299 +++++++++++++++++++++++++++++++++++++++++-------
 kernel/sched/features.h |    7 +
 kernel/sched/sched.h    |    9 +
 6 files changed, 299 insertions(+), 42 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -850,6 +850,7 @@ enum cpu_idle_type {
 #define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
+#define SD_NUMA			0x4000	/* cross-node balancing */
 
 enum powersavings_balance_level {
 	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5806,7 +5806,9 @@ static void destroy_sched_domains(struct
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_id);
 
-static void update_top_cache_domain(int cpu)
+DEFINE_PER_CPU(struct sched_domain *, sd_node);
+
+static void update_domain_cache(int cpu)
 {
 	struct sched_domain *sd;
 	int id = cpu;
@@ -5817,6 +5819,17 @@ static void update_top_cache_domain(int
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
+
+	for_each_domain(cpu, sd) {
+		if (cpumask_equal(sched_domain_span(sd),
+				  cpumask_of_node(cpu_to_node(cpu))))
+			goto got_node;
+	}
+	sd = NULL;
+got_node:
+	rcu_assign_pointer(per_cpu(sd_node, cpu), sd);
+	if (sd) for (sd = sd->parent; sd; sd = sd->parent)
+		sd->flags |= SD_NUMA;
 }
 
 /*
@@ -5859,7 +5872,7 @@ cpu_attach_domain(struct sched_domain *s
 	rcu_assign_pointer(rq->sd, sd);
 	destroy_sched_domains(tmp, cpu);
 
-	update_top_cache_domain(cpu);
+	update_domain_cache(cpu);
 }
 
 /* cpus with isolated domains */
@@ -7012,6 +7025,11 @@ void __init sched_init(void)
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
+#ifdef CONFIG_NUMA
+		INIT_LIST_HEAD(&rq->offnode_tasks);
+		rq->offnode_running = 0;
+		rq->offnode_weight = 0;
+#endif
 
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,6 +132,9 @@ print_task(struct seq_file *m, struct rq
 	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
 		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
+#ifdef CONFIG_NUMA
+	SEQ_printf(m, " %d/%d", p->node, cpu_to_node(task_cpu(p)));
+#endif
 #ifdef CONFIG_CGROUP_SCHED
 	SEQ_printf(m, " %s", task_group_path(task_group(p)));
 #endif
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/random.h>
 
 #include <trace/events/sched.h>
 
@@ -783,8 +784,10 @@ account_entity_enqueue(struct cfs_rq *cf
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-	if (entity_is_task(se))
-		list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+	if (entity_is_task(se)) {
+		if (!account_numa_enqueue(task_of(se)))
+			list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+	}
 #endif
 	cfs_rq->nr_running++;
 }
@@ -795,8 +798,10 @@ account_entity_dequeue(struct cfs_rq *cf
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		list_del_init(&se->group_node);
+		account_numa_dequeue(task_of(se));
+	}
 	cfs_rq->nr_running--;
 }
 
@@ -2702,6 +2707,7 @@ select_task_rq_fair(struct task_struct *
 	int want_affine = 0;
 	int want_sd = 1;
 	int sync = wake_flags & WF_SYNC;
+	int node = tsk_home_node(p);
 
 	if (p->rt.nr_cpus_allowed == 1)
 		return prev_cpu;
@@ -2713,6 +2719,29 @@ select_task_rq_fair(struct task_struct *
 	}
 
 	rcu_read_lock();
+	if (sched_feat(NUMA_BIAS) && node != -1) {
+		int node_cpu;
+
+		node_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpumask_of_node(node));
+		if (node_cpu >= nr_cpu_ids)
+			goto find_sd;
+
+		/*
+		 * For fork,exec find the idlest cpu in the home-node.
+		 */
+		if (sd_flag & (SD_BALANCE_FORK|SD_BALANCE_EXEC)) {
+			new_cpu = cpu = node_cpu;
+			sd = per_cpu(sd_node, cpu);
+			goto pick_idlest;
+		}
+
+		/*
+		 * For wake, pretend we were running in the home-node.
+		 */
+		prev_cpu = node_cpu;
+	}
+
+find_sd:
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
@@ -2769,6 +2798,7 @@ select_task_rq_fair(struct task_struct *
 		goto unlock;
 	}
 
+pick_idlest:
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
 		struct sched_group *group;
@@ -3085,6 +3115,8 @@ struct lb_env {
 	long			load_move;
 	unsigned int		flags;
 
+	struct list_head	*tasks;
+
 	unsigned int		loop;
 	unsigned int		loop_break;
 	unsigned int		loop_max;
@@ -3102,6 +3134,30 @@ static void move_task(struct task_struct
 	check_preempt_curr(env->dst_rq, p, 0);
 }
 
+#ifdef CONFIG_NUMA
+static int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
+{
+	int from_dist, to_dist;
+	int node = tsk_home_node(p);
+
+	if (!sched_feat(NUMA_HOT) || node == -1)
+		return 0; /* no node preference */
+
+	from_dist = node_distance(cpu_to_node(from_cpu), node);
+	to_dist = node_distance(cpu_to_node(to_cpu), node);
+
+	if (to_dist < from_dist)
+		return 0; /* getting closer is ok */
+
+	return 1; /* stick to where we are */
+}
+#else
+static inline int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
+{
+	return 0;
+}
+#endif /* CONFIG_NUMA */
+
 /*
  * Is this task likely cache-hot:
  */
@@ -3165,6 +3221,7 @@ int can_migrate_task(struct task_struct
 	 */
 
 	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+	tsk_cache_hot |= task_numa_hot(p, env->src_cpu, env->dst_cpu);
 	if (!tsk_cache_hot ||
 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -3190,11 +3247,11 @@ int can_migrate_task(struct task_struct
  *
  * Called with both runqueues locked.
  */
-static int move_one_task(struct lb_env *env)
+static int __move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
-	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+	list_for_each_entry_safe(p, n, env->tasks, se.group_node) {
 		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
 			continue;
 
@@ -3213,6 +3270,21 @@ static int move_one_task(struct lb_env *
 	return 0;
 }
 
+static int move_one_task(struct lb_env *env)
+{
+	if (sched_feat(NUMA_PULL)) {
+		env->tasks = &env->src_rq->offnode_tasks;
+		if (__move_one_task(env))
+			return 1;
+	}
+
+	env->tasks = &env->src_rq->cfs_tasks;
+	if (__move_one_task(env))
+		return 1;
+
+	return 0;
+}
+
 static unsigned long task_h_load(struct task_struct *p);
 
 /*
@@ -3224,7 +3296,6 @@ static unsigned long task_h_load(struct
  */
 static int move_tasks(struct lb_env *env)
 {
-	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
 	int pulled = 0;
@@ -3232,8 +3303,9 @@ static int move_tasks(struct lb_env *env
 	if (env->load_move <= 0)
 		return 0;
 
-	while (!list_empty(tasks)) {
-		p = list_first_entry(tasks, struct task_struct, se.group_node);
+again:
+	while (!list_empty(env->tasks)) {
+		p = list_first_entry(env->tasks, struct task_struct, se.group_node);
 
 		env->loop++;
 		/* We've more or less seen every task there is, call it quits */
@@ -3244,7 +3316,7 @@ static int move_tasks(struct lb_env *env
 		if (env->loop > env->loop_break) {
 			env->loop_break += sysctl_sched_nr_migrate;
 			env->flags |= LBF_NEED_BREAK;
-			break;
+			goto out;
 		}
 
 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
@@ -3272,7 +3344,7 @@ static int move_tasks(struct lb_env *env
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
-			break;
+			goto out;
 #endif
 
 		/*
@@ -3280,13 +3352,20 @@ static int move_tasks(struct lb_env *env
 		 * weighted load.
 		 */
 		if (env->load_move <= 0)
-			break;
+			goto out;
 
 		continue;
 next:
-		list_move_tail(&p->se.group_node, tasks);
+		list_move_tail(&p->se.group_node, env->tasks);
 	}
 
+	if (env->tasks == &env->src_rq->offnode_tasks) {
+		env->tasks = &env->src_rq->cfs_tasks;
+		env->loop = 0;
+		goto again;
+	}
+
+out:
 	/*
 	 * Right now, this is one of only two places move_task() is called,
 	 * so we can safely collect move_task() stats here rather than
@@ -3441,6 +3520,15 @@ struct sd_lb_stats {
 	unsigned long leader_nr_running; /* Nr running of group_leader */
 	unsigned long min_nr_running; /* Nr running of group_min */
 #endif
+#ifdef CONFIG_NUMA
+	struct sched_group *numa_group; /* group which has offnode_tasks */
+	unsigned long numa_group_weight;
+	unsigned long numa_group_running;
+#endif
+
+	struct rq *(*find_busiest_queue)(struct sched_domain *sd,
+			struct sched_group *group, enum cpu_idle_type idle,
+			unsigned long imbalance, const struct cpumask *cpus);
 };
 
 /*
@@ -3456,6 +3544,10 @@ struct sg_lb_stats {
 	unsigned long group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA
+	unsigned long numa_weight;
+	unsigned long numa_running;
+#endif
 };
 
 /**
@@ -3625,6 +3717,117 @@ static inline int check_power_save_busie
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+#ifdef CONFIG_NUMA
+static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
+{
+	sgs->numa_weight += rq->offnode_weight;
+	sgs->numa_running += rq->offnode_running;
+}
+
+/*
+ * Since the offnode lists are indiscriminate (they contain tasks for all other
+ * nodes) it is impossible to say if there's any task on there that wants to
+ * move towards the pulling cpu. Therefore select a random offnode list to pull
+ * from such that eventually we'll try them all.
+ */
+static inline bool pick_numa_rand(void)
+{
+	return get_random_int() & 1;
+}
+
+static inline void update_sd_numa_stats(struct sched_domain *sd,
+		struct sched_group *group, struct sd_lb_stats *sds,
+		int local_group, struct sg_lb_stats *sgs)
+{
+	if (!(sd->flags & SD_NUMA))
+		return;
+
+	if (local_group)
+		return;
+
+	if (!sgs->numa_running)
+		return;
+
+	if (!sds->numa_group_running || pick_numa_rand()) {
+		sds->numa_group = group;
+		sds->numa_group_weight = sgs->numa_weight;
+		sds->numa_group_running = sgs->numa_running;
+	}
+}
+
+static struct rq *
+find_busiest_numa_queue(struct sched_domain *sd, struct sched_group *group,
+		   enum cpu_idle_type idle, unsigned long imbalance,
+		   const struct cpumask *cpus)
+{
+	struct rq *busiest = NULL, *rq;
+	int cpu;
+
+	for_each_cpu_and(cpu, sched_group_cpus(group), cpus) {
+		rq = cpu_rq(cpu);
+		if (!rq->offnode_running)
+			continue;
+		if (!busiest || pick_numa_rand())
+			busiest = rq;
+	}
+
+	return busiest;
+}
+
+static inline int check_numa_busiest_group(struct sd_lb_stats *sds,
+		int this_cpu, unsigned long *imbalance)
+{
+	if (!sched_feat(NUMA_PULL_BIAS))
+		return 0;
+
+	if (!sds->numa_group)
+		return 0;
+
+	*imbalance = sds->numa_group_weight / sds->numa_group_running;
+	sds->busiest = sds->numa_group;
+	sds->find_busiest_queue = find_busiest_numa_queue;
+	return 1;
+}
+
+static inline
+bool need_active_numa_balance(struct sched_domain *sd, struct rq *busiest)
+{
+	/*
+	 * Not completely fail-safe, but it's a fair bet that if we're at a
+	 * rq that only has one task, and it's offnode, we're here through
+	 * find_busiest_numa_queue(). In any case, we want to kick such tasks.
+	 */
+	if ((sd->flags & SD_NUMA) && busiest->offnode_running == 1 &&
+			busiest->nr_running == 1)
+		return true;
+
+	return false;
+}
+
+#else /* CONFIG_NUMA */
+
+static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
+{
+}
+
+static inline void update_sd_numa_stats(struct sched_domain *sd,
+		struct sched_group *group, struct sd_lb_stats *sds,
+		int local_group, struct sg_lb_stats *sgs)
+{
+}
+
+static inline int check_numa_busiest_group(struct sd_lb_stats *sds,
+		int this_cpu, unsigned long *imbalance)
+{
+	return 0;
+}
+
+static inline
+bool need_active_numa_balance(struct sched_domain *sd, struct rq *busiest)
+{
+	return false;
+}
+#endif /* CONFIG_NUMA */
 
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
@@ -3816,6 +4019,8 @@ static inline void update_sg_lb_stats(st
 		sgs->sum_weighted_load += weighted_cpuload(i);
 		if (idle_cpu(i))
 			sgs->idle_cpus++;
+
+		update_sg_numa_stats(sgs, rq);
 	}
 
 	/*
@@ -3977,6 +4182,8 @@ static inline void update_sd_lb_stats(st
 		}
 
 		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		update_sd_numa_stats(sd, sg, sds, local_group, &sgs);
+
 		sg = sg->next;
 	} while (sg != sd->groups);
 }
@@ -4192,19 +4399,16 @@ static inline void calculate_imbalance(s
  *		   put to idle by rebalancing its tasks onto our group.
  */
 static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   const struct cpumask *cpus, int *balance)
+find_busiest_group(struct sched_domain *sd, struct sd_lb_stats *sds,
+		   int this_cpu, unsigned long *imbalance,
+		   enum cpu_idle_type idle, const struct cpumask *cpus,
+		   int *balance)
 {
-	struct sd_lb_stats sds;
-
-	memset(&sds, 0, sizeof(sds));
-
 	/*
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, sds);
 
 	/*
 	 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4214,40 +4418,40 @@ find_busiest_group(struct sched_domain *
 		goto ret;
 
 	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
-	    check_asym_packing(sd, &sds, this_cpu, imbalance))
-		return sds.busiest;
+	    check_asym_packing(sd, sds, this_cpu, imbalance))
+		return sds->busiest;
 
 	/* There is no busy sibling group to pull tasks from */
-	if (!sds.busiest || sds.busiest_nr_running == 0)
+	if (!sds->busiest || sds->busiest_nr_running == 0)
 		goto out_balanced;
 
-	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+	sds->avg_load = (SCHED_POWER_SCALE * sds->total_load) / sds->total_pwr;
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
 	 * work because they assumes all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
-	if (sds.group_imb)
+	if (sds->group_imb)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
-			!sds.busiest_has_capacity)
+	if (idle == CPU_NEWLY_IDLE && sds->this_has_capacity &&
+			!sds->busiest_has_capacity)
 		goto force_balance;
 
 	/*
 	 * If the local group is more busy than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
-	if (sds.this_load >= sds.max_load)
+	if (sds->this_load >= sds->max_load)
 		goto out_balanced;
 
 	/*
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	if (sds.this_load >= sds.avg_load)
+	if (sds->this_load >= sds->avg_load)
 		goto out_balanced;
 
 	if (idle == CPU_IDLE) {
@@ -4257,30 +4461,33 @@ find_busiest_group(struct sched_domain *
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
-		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
-		    sds.busiest_nr_running <= sds.busiest_group_weight)
+		if ((sds->this_idle_cpus <= sds->busiest_idle_cpus + 1) &&
+		    sds->busiest_nr_running <= sds->busiest_group_weight)
 			goto out_balanced;
 	} else {
 		/*
 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 		 * imbalance_pct to be conservative.
 		 */
-		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+		if (100 * sds->max_load <= sd->imbalance_pct * sds->this_load)
 			goto out_balanced;
 	}
 
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
-	calculate_imbalance(&sds, this_cpu, imbalance);
-	return sds.busiest;
+	calculate_imbalance(sds, this_cpu, imbalance);
+	return sds->busiest;
 
 out_balanced:
+	if (check_numa_busiest_group(sds, this_cpu, imbalance))
+		return sds->busiest;
+
 	/*
 	 * There is no obvious imbalance. But check if we can do some balancing
 	 * to save power.
 	 */
-	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
-		return sds.busiest;
+	if (check_power_save_busiest_group(sds, this_cpu, imbalance))
+		return sds->busiest;
 ret:
 	*imbalance = 0;
 	return NULL;
@@ -4347,9 +4554,11 @@ find_busiest_queue(struct sched_domain *
 DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
 static int need_active_balance(struct sched_domain *sd, int idle,
-			       int busiest_cpu, int this_cpu)
+			       struct rq *busiest, struct rq *this)
 {
 	if (idle == CPU_NEWLY_IDLE) {
+		int busiest_cpu = cpu_of(busiest);
+		int this_cpu = cpu_of(this);
 
 		/*
 		 * ASYM_PACKING needs to force migrate tasks from busy but
@@ -4382,6 +4591,9 @@ static int need_active_balance(struct sc
 			return 0;
 	}
 
+	if (need_active_numa_balance(sd, busiest))
+		return 1;
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -4401,6 +4613,7 @@ static int load_balance(int this_cpu, st
 	struct rq *busiest;
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+	struct sd_lb_stats sds;
 
 	struct lb_env env = {
 		.sd		= sd,
@@ -4412,10 +4625,12 @@ static int load_balance(int this_cpu, st
 
 	cpumask_copy(cpus, cpu_active_mask);
 
+	memset(&sds, 0, sizeof(sds));
+	sds.find_busiest_queue = find_busiest_queue;
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
+	group = find_busiest_group(sd, &sds, this_cpu, &imbalance, idle,
 				   cpus, balance);
 
 	if (*balance == 0)
@@ -4426,7 +4641,7 @@ static int load_balance(int this_cpu, st
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
+	busiest = sds.find_busiest_queue(sd, group, idle, imbalance, cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -4449,6 +4664,10 @@ static int load_balance(int this_cpu, st
 		env.src_cpu = busiest->cpu;
 		env.src_rq = busiest;
 		env.loop_max = busiest->nr_running;
+		if (sched_feat(NUMA_PULL))
+			env.tasks = &busiest->offnode_tasks;
+		else
+			env.tasks = &busiest->cfs_tasks;
 
 more_balance:
 		local_irq_save(flags);
@@ -4490,7 +4709,7 @@ static int load_balance(int this_cpu, st
 		if (idle != CPU_NEWLY_IDLE)
 			sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
+		if (need_active_balance(sd, idle, busiest, this_rq)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
+
+#ifdef CONFIG_NUMA
+SCHED_FEAT(NUMA_HOT,       true)
+SCHED_FEAT(NUMA_BIAS,      true)
+SCHED_FEAT(NUMA_PULL,      true)
+SCHED_FEAT(NUMA_PULL_BIAS, true)
+#endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -414,6 +414,12 @@ struct rq {
 
 	struct list_head cfs_tasks;
 
+#ifdef CONFIG_NUMA
+	unsigned long    offnode_running;
+	unsigned long	 offnode_weight;
+	struct list_head offnode_tasks;
+#endif
+
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
@@ -525,6 +531,7 @@ static inline struct sched_domain *highe
 
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_node);
 
 #endif /* CONFIG_SMP */
 
@@ -1158,3 +1165,5 @@ enum rq_nohz_flag_bits {
 #endif
 
 static inline void select_task_node(struct task_struct *p, struct mm_struct *mm, int sd_flags) { }
+static inline bool account_numa_enqueue(struct task_struct *p) { return false; }
+static inline void account_numa_dequeue(struct task_struct *p) { }


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@elte.hu>,
	Paul Turner <pjt@google.com>,
	Suresh Siddha <suresh.b.siddha@intel.com>,
	Mike Galbraith <efault@gmx.de>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Lai Jiangshan <laijs@cn.fujitsu.com>,
	Dan Smith <danms@us.ibm.com>,
	Bharata B Rao <bharata.rao@gmail.com>,
	Lee Schermerhorn <Lee.Schermerhorn@hp.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Rik van Riel <riel@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC][PATCH 13/26] sched: Implement home-node awareness
Date: Fri, 16 Mar 2012 15:40:41 +0100	[thread overview]
Message-ID: <20120316144240.952119284@chello.nl> (raw)
In-Reply-To: 20120316144028.036474157@chello.nl

[-- Attachment #1: numa-foo-5.patch --]
[-- Type: text/plain, Size: 21093 bytes --]

Implement home node preference in the load-balancer.

This is done in four pieces:

 - task_numa_hot(); make it harder to migrate tasks away from their
   home-node, controlled using the NUMA_HOT feature flag.

 - select_task_rq_fair(); prefer placing the task in its home-node,
   controlled using the NUMA_BIAS feature flag.

 - load_balance(); during the regular pull load-balance pass, try
   pulling tasks that are on the wrong node first with a preference
   of moving them nearer to their home-node through task_numa_hot(),
   controlled through the NUMA_PULL feature flag.

 - load_balance(); when the balancer finds no imbalance, introduce
   some imbalance so that it still prefers to move tasks towards their
   home-node, using active load-balance if needed, controlled through
   the NUMA_PULL_BIAS feature flag.

In order to easily find off-node tasks, split the per-cpu task list
into two parts: rq->cfs_tasks and the new rq->offnode_tasks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h   |    1 
 kernel/sched/core.c     |   22 +++
 kernel/sched/debug.c    |    3 
 kernel/sched/fair.c     |  299 +++++++++++++++++++++++++++++++++++++++++-------
 kernel/sched/features.h |    7 +
 kernel/sched/sched.h    |    9 +
 6 files changed, 299 insertions(+), 42 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -850,6 +850,7 @@ enum cpu_idle_type {
 #define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
+#define SD_NUMA			0x4000	/* cross-node balancing */
 
 enum powersavings_balance_level {
 	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5806,7 +5806,9 @@ static void destroy_sched_domains(struct
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_id);
 
-static void update_top_cache_domain(int cpu)
+DEFINE_PER_CPU(struct sched_domain *, sd_node);
+
+static void update_domain_cache(int cpu)
 {
 	struct sched_domain *sd;
 	int id = cpu;
@@ -5817,6 +5819,17 @@ static void update_top_cache_domain(int
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
+
+	for_each_domain(cpu, sd) {
+		if (cpumask_equal(sched_domain_span(sd),
+				  cpumask_of_node(cpu_to_node(cpu))))
+			goto got_node;
+	}
+	sd = NULL;
+got_node:
+	rcu_assign_pointer(per_cpu(sd_node, cpu), sd);
+	if (sd) for (sd = sd->parent; sd; sd = sd->parent)
+		sd->flags |= SD_NUMA;
 }
 
 /*
@@ -5859,7 +5872,7 @@ cpu_attach_domain(struct sched_domain *s
 	rcu_assign_pointer(rq->sd, sd);
 	destroy_sched_domains(tmp, cpu);
 
-	update_top_cache_domain(cpu);
+	update_domain_cache(cpu);
 }
 
 /* cpus with isolated domains */
@@ -7012,6 +7025,11 @@ void __init sched_init(void)
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
+#ifdef CONFIG_NUMA
+		INIT_LIST_HEAD(&rq->offnode_tasks);
+		rq->offnode_running = 0;
+		rq->offnode_weight = 0;
+#endif
 
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -132,6 +132,9 @@ print_task(struct seq_file *m, struct rq
 	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
 		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
+#ifdef CONFIG_NUMA
+	SEQ_printf(m, " %d/%d", p->node, cpu_to_node(task_cpu(p)));
+#endif
 #ifdef CONFIG_CGROUP_SCHED
 	SEQ_printf(m, " %s", task_group_path(task_group(p)));
 #endif
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/random.h>
 
 #include <trace/events/sched.h>
 
@@ -783,8 +784,10 @@ account_entity_enqueue(struct cfs_rq *cf
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-	if (entity_is_task(se))
-		list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+	if (entity_is_task(se)) {
+		if (!account_numa_enqueue(task_of(se)))
+			list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+	}
 #endif
 	cfs_rq->nr_running++;
 }
@@ -795,8 +798,10 @@ account_entity_dequeue(struct cfs_rq *cf
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		list_del_init(&se->group_node);
+		account_numa_dequeue(task_of(se));
+	}
 	cfs_rq->nr_running--;
 }
 
@@ -2702,6 +2707,7 @@ select_task_rq_fair(struct task_struct *
 	int want_affine = 0;
 	int want_sd = 1;
 	int sync = wake_flags & WF_SYNC;
+	int node = tsk_home_node(p);
 
 	if (p->rt.nr_cpus_allowed == 1)
 		return prev_cpu;
@@ -2713,6 +2719,29 @@ select_task_rq_fair(struct task_struct *
 	}
 
 	rcu_read_lock();
+	if (sched_feat(NUMA_BIAS) && node != -1) {
+		int node_cpu;
+
+		node_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpumask_of_node(node));
+		if (node_cpu >= nr_cpu_ids)
+			goto find_sd;
+
+		/*
+		 * For fork,exec find the idlest cpu in the home-node.
+		 */
+		if (sd_flag & (SD_BALANCE_FORK|SD_BALANCE_EXEC)) {
+			new_cpu = cpu = node_cpu;
+			sd = per_cpu(sd_node, cpu);
+			goto pick_idlest;
+		}
+
+		/*
+		 * For wake, pretend we were running in the home-node.
+		 */
+		prev_cpu = node_cpu;
+	}
+
+find_sd:
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
@@ -2769,6 +2798,7 @@ select_task_rq_fair(struct task_struct *
 		goto unlock;
 	}
 
+pick_idlest:
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
 		struct sched_group *group;
@@ -3085,6 +3115,8 @@ struct lb_env {
 	long			load_move;
 	unsigned int		flags;
 
+	struct list_head	*tasks;
+
 	unsigned int		loop;
 	unsigned int		loop_break;
 	unsigned int		loop_max;
@@ -3102,6 +3134,30 @@ static void move_task(struct task_struct
 	check_preempt_curr(env->dst_rq, p, 0);
 }
 
+#ifdef CONFIG_NUMA
+static int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
+{
+	int from_dist, to_dist;
+	int node = tsk_home_node(p);
+
+	if (!sched_feat(NUMA_HOT) || node == -1)
+		return 0; /* no node preference */
+
+	from_dist = node_distance(cpu_to_node(from_cpu), node);
+	to_dist = node_distance(cpu_to_node(to_cpu), node);
+
+	if (to_dist < from_dist)
+		return 0; /* getting closer is ok */
+
+	return 1; /* stick to where we are */
+}
+#else
+static inline int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
+{
+	return 0;
+}
+#endif /* CONFIG_NUMA */
+
 /*
  * Is this task likely cache-hot:
  */
@@ -3165,6 +3221,7 @@ int can_migrate_task(struct task_struct
 	 */
 
 	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+	tsk_cache_hot |= task_numa_hot(p, env->src_cpu, env->dst_cpu);
 	if (!tsk_cache_hot ||
 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -3190,11 +3247,11 @@ int can_migrate_task(struct task_struct
  *
  * Called with both runqueues locked.
  */
-static int move_one_task(struct lb_env *env)
+static int __move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
-	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+	list_for_each_entry_safe(p, n, env->tasks, se.group_node) {
 		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
 			continue;
 
@@ -3213,6 +3270,21 @@ static int move_one_task(struct lb_env *
 	return 0;
 }
 
+static int move_one_task(struct lb_env *env)
+{
+	if (sched_feat(NUMA_PULL)) {
+		env->tasks = &env->src_rq->offnode_tasks;
+		if (__move_one_task(env))
+			return 1;
+	}
+
+	env->tasks = &env->src_rq->cfs_tasks;
+	if (__move_one_task(env))
+		return 1;
+
+	return 0;
+}
+
 static unsigned long task_h_load(struct task_struct *p);
 
 /*
@@ -3224,7 +3296,6 @@ static unsigned long task_h_load(struct
  */
 static int move_tasks(struct lb_env *env)
 {
-	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
 	int pulled = 0;
@@ -3232,8 +3303,9 @@ static int move_tasks(struct lb_env *env
 	if (env->load_move <= 0)
 		return 0;
 
-	while (!list_empty(tasks)) {
-		p = list_first_entry(tasks, struct task_struct, se.group_node);
+again:
+	while (!list_empty(env->tasks)) {
+		p = list_first_entry(env->tasks, struct task_struct, se.group_node);
 
 		env->loop++;
 		/* We've more or less seen every task there is, call it quits */
@@ -3244,7 +3316,7 @@ static int move_tasks(struct lb_env *env
 		if (env->loop > env->loop_break) {
 			env->loop_break += sysctl_sched_nr_migrate;
 			env->flags |= LBF_NEED_BREAK;
-			break;
+			goto out;
 		}
 
 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
@@ -3272,7 +3344,7 @@ static int move_tasks(struct lb_env *env
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
-			break;
+			goto out;
 #endif
 
 		/*
@@ -3280,13 +3352,20 @@ static int move_tasks(struct lb_env *env
 		 * weighted load.
 		 */
 		if (env->load_move <= 0)
-			break;
+			goto out;
 
 		continue;
 next:
-		list_move_tail(&p->se.group_node, tasks);
+		list_move_tail(&p->se.group_node, env->tasks);
 	}
 
+	if (env->tasks == &env->src_rq->offnode_tasks) {
+		env->tasks = &env->src_rq->cfs_tasks;
+		env->loop = 0;
+		goto again;
+	}
+
+out:
 	/*
 	 * Right now, this is one of only two places move_task() is called,
 	 * so we can safely collect move_task() stats here rather than
@@ -3441,6 +3520,15 @@ struct sd_lb_stats {
 	unsigned long leader_nr_running; /* Nr running of group_leader */
 	unsigned long min_nr_running; /* Nr running of group_min */
 #endif
+#ifdef CONFIG_NUMA
+	struct sched_group *numa_group; /* group which has offnode_tasks */
+	unsigned long numa_group_weight;
+	unsigned long numa_group_running;
+#endif
+
+	struct rq *(*find_busiest_queue)(struct sched_domain *sd,
+			struct sched_group *group, enum cpu_idle_type idle,
+			unsigned long imbalance, const struct cpumask *cpus);
 };
 
 /*
@@ -3456,6 +3544,10 @@ struct sg_lb_stats {
 	unsigned long group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA
+	unsigned long numa_weight;
+	unsigned long numa_running;
+#endif
 };
 
 /**
@@ -3625,6 +3717,117 @@ static inline int check_power_save_busie
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+#ifdef CONFIG_NUMA
+static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
+{
+	sgs->numa_weight += rq->offnode_weight;
+	sgs->numa_running += rq->offnode_running;
+}
+
+/*
+ * Since the offnode lists are indiscriminate (they contain tasks for all other
+ * nodes) it is impossible to say if there's any task on there that wants to
+ * move towards the pulling cpu. Therefore select a random offnode list to pull
+ * from such that eventually we'll try them all.
+ */
+static inline bool pick_numa_rand(void)
+{
+	return get_random_int() & 1;
+}
+
+static inline void update_sd_numa_stats(struct sched_domain *sd,
+		struct sched_group *group, struct sd_lb_stats *sds,
+		int local_group, struct sg_lb_stats *sgs)
+{
+	if (!(sd->flags & SD_NUMA))
+		return;
+
+	if (local_group)
+		return;
+
+	if (!sgs->numa_running)
+		return;
+
+	if (!sds->numa_group_running || pick_numa_rand()) {
+		sds->numa_group = group;
+		sds->numa_group_weight = sgs->numa_weight;
+		sds->numa_group_running = sgs->numa_running;
+	}
+}
+
+static struct rq *
+find_busiest_numa_queue(struct sched_domain *sd, struct sched_group *group,
+		   enum cpu_idle_type idle, unsigned long imbalance,
+		   const struct cpumask *cpus)
+{
+	struct rq *busiest = NULL, *rq;
+	int cpu;
+
+	for_each_cpu_and(cpu, sched_group_cpus(group), cpus) {
+		rq = cpu_rq(cpu);
+		if (!rq->offnode_running)
+			continue;
+		if (!busiest || pick_numa_rand())
+			busiest = rq;
+	}
+
+	return busiest;
+}
+
+static inline int check_numa_busiest_group(struct sd_lb_stats *sds,
+		int this_cpu, unsigned long *imbalance)
+{
+	if (!sched_feat(NUMA_PULL_BIAS))
+		return 0;
+
+	if (!sds->numa_group)
+		return 0;
+
+	*imbalance = sds->numa_group_weight / sds->numa_group_running;
+	sds->busiest = sds->numa_group;
+	sds->find_busiest_queue = find_busiest_numa_queue;
+	return 1;
+}
+
+static inline
+bool need_active_numa_balance(struct sched_domain *sd, struct rq *busiest)
+{
+	/*
+	 * Not completely fail-safe, but it's a fair bet that if we're at a
+	 * rq that only has one task, and it's offnode, we're here through
+	 * find_busiest_numa_queue(). In any case, we want to kick such tasks.
+	 */
+	if ((sd->flags & SD_NUMA) && busiest->offnode_running == 1 &&
+			busiest->nr_running == 1)
+		return true;
+
+	return false;
+}
+
+#else /* CONFIG_NUMA */
+
+static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
+{
+}
+
+static inline void update_sd_numa_stats(struct sched_domain *sd,
+		struct sched_group *group, struct sd_lb_stats *sds,
+		int local_group, struct sg_lb_stats *sgs)
+{
+}
+
+static inline int check_numa_busiest_group(struct sd_lb_stats *sds,
+		int this_cpu, unsigned long *imbalance)
+{
+	return 0;
+}
+
+static inline
+bool need_active_numa_balance(struct sched_domain *sd, struct rq *busiest)
+{
+	return false;
+}
+#endif /* CONFIG_NUMA */
 
 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
@@ -3816,6 +4019,8 @@ static inline void update_sg_lb_stats(st
 		sgs->sum_weighted_load += weighted_cpuload(i);
 		if (idle_cpu(i))
 			sgs->idle_cpus++;
+
+		update_sg_numa_stats(sgs, rq);
 	}
 
 	/*
@@ -3977,6 +4182,8 @@ static inline void update_sd_lb_stats(st
 		}
 
 		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+		update_sd_numa_stats(sd, sg, sds, local_group, &sgs);
+
 		sg = sg->next;
 	} while (sg != sd->groups);
 }
@@ -4192,19 +4399,16 @@ static inline void calculate_imbalance(s
  *		   put to idle by rebalancing its tasks onto our group.
  */
 static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   const struct cpumask *cpus, int *balance)
+find_busiest_group(struct sched_domain *sd, struct sd_lb_stats *sds,
+		   int this_cpu, unsigned long *imbalance,
+		   enum cpu_idle_type idle, const struct cpumask *cpus,
+		   int *balance)
 {
-	struct sd_lb_stats sds;
-
-	memset(&sds, 0, sizeof(sds));
-
 	/*
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, sds);
 
 	/*
 	 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4214,40 +4418,40 @@ find_busiest_group(struct sched_domain *
 		goto ret;
 
 	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
-	    check_asym_packing(sd, &sds, this_cpu, imbalance))
-		return sds.busiest;
+	    check_asym_packing(sd, sds, this_cpu, imbalance))
+		return sds->busiest;
 
 	/* There is no busy sibling group to pull tasks from */
-	if (!sds.busiest || sds.busiest_nr_running == 0)
+	if (!sds->busiest || sds->busiest_nr_running == 0)
 		goto out_balanced;
 
-	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+	sds->avg_load = (SCHED_POWER_SCALE * sds->total_load) / sds->total_pwr;
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
 	 * work because they assumes all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
-	if (sds.group_imb)
+	if (sds->group_imb)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
-			!sds.busiest_has_capacity)
+	if (idle == CPU_NEWLY_IDLE && sds->this_has_capacity &&
+			!sds->busiest_has_capacity)
 		goto force_balance;
 
 	/*
 	 * If the local group is more busy than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
-	if (sds.this_load >= sds.max_load)
+	if (sds->this_load >= sds->max_load)
 		goto out_balanced;
 
 	/*
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	if (sds.this_load >= sds.avg_load)
+	if (sds->this_load >= sds->avg_load)
 		goto out_balanced;
 
 	if (idle == CPU_IDLE) {
@@ -4257,30 +4461,33 @@ find_busiest_group(struct sched_domain *
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
-		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
-		    sds.busiest_nr_running <= sds.busiest_group_weight)
+		if ((sds->this_idle_cpus <= sds->busiest_idle_cpus + 1) &&
+		    sds->busiest_nr_running <= sds->busiest_group_weight)
 			goto out_balanced;
 	} else {
 		/*
 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 		 * imbalance_pct to be conservative.
 		 */
-		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+		if (100 * sds->max_load <= sd->imbalance_pct * sds->this_load)
 			goto out_balanced;
 	}
 
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
-	calculate_imbalance(&sds, this_cpu, imbalance);
-	return sds.busiest;
+	calculate_imbalance(sds, this_cpu, imbalance);
+	return sds->busiest;
 
 out_balanced:
+	if (check_numa_busiest_group(sds, this_cpu, imbalance))
+		return sds->busiest;
+
 	/*
 	 * There is no obvious imbalance. But check if we can do some balancing
 	 * to save power.
 	 */
-	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
-		return sds.busiest;
+	if (check_power_save_busiest_group(sds, this_cpu, imbalance))
+		return sds->busiest;
 ret:
 	*imbalance = 0;
 	return NULL;
@@ -4347,9 +4554,11 @@ find_busiest_queue(struct sched_domain *
 DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
 static int need_active_balance(struct sched_domain *sd, int idle,
-			       int busiest_cpu, int this_cpu)
+			       struct rq *busiest, struct rq *this)
 {
 	if (idle == CPU_NEWLY_IDLE) {
+		int busiest_cpu = cpu_of(busiest);
+		int this_cpu = cpu_of(this);
 
 		/*
 		 * ASYM_PACKING needs to force migrate tasks from busy but
@@ -4382,6 +4591,9 @@ static int need_active_balance(struct sc
 			return 0;
 	}
 
+	if (need_active_numa_balance(sd, busiest))
+		return 1;
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -4401,6 +4613,7 @@ static int load_balance(int this_cpu, st
 	struct rq *busiest;
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+	struct sd_lb_stats sds;
 
 	struct lb_env env = {
 		.sd		= sd,
@@ -4412,10 +4625,12 @@ static int load_balance(int this_cpu, st
 
 	cpumask_copy(cpus, cpu_active_mask);
 
+	memset(&sds, 0, sizeof(sds));
+	sds.find_busiest_queue = find_busiest_queue;
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
+	group = find_busiest_group(sd, &sds, this_cpu, &imbalance, idle,
 				   cpus, balance);
 
 	if (*balance == 0)
@@ -4426,7 +4641,7 @@ static int load_balance(int this_cpu, st
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
+	busiest = sds.find_busiest_queue(sd, group, idle, imbalance, cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -4449,6 +4664,10 @@ static int load_balance(int this_cpu, st
 		env.src_cpu = busiest->cpu;
 		env.src_rq = busiest;
 		env.loop_max = busiest->nr_running;
+		if (sched_feat(NUMA_PULL))
+			env.tasks = &busiest->offnode_tasks;
+		else
+			env.tasks = &busiest->cfs_tasks;
 
 more_balance:
 		local_irq_save(flags);
@@ -4490,7 +4709,7 @@ static int load_balance(int this_cpu, st
 		if (idle != CPU_NEWLY_IDLE)
 			sd->nr_balance_failed++;
 
-		if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
+		if (need_active_balance(sd, idle, busiest, this_rq)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
 			/* don't kick the active_load_balance_cpu_stop,
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
+
+#ifdef CONFIG_NUMA
+SCHED_FEAT(NUMA_HOT,       true)
+SCHED_FEAT(NUMA_BIAS,      true)
+SCHED_FEAT(NUMA_PULL,      true)
+SCHED_FEAT(NUMA_PULL_BIAS, true)
+#endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -414,6 +414,12 @@ struct rq {
 
 	struct list_head cfs_tasks;
 
+#ifdef CONFIG_NUMA
+	unsigned long    offnode_running;
+	unsigned long	 offnode_weight;
+	struct list_head offnode_tasks;
+#endif
+
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
@@ -525,6 +531,7 @@ static inline struct sched_domain *highe
 
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_node);
 
 #endif /* CONFIG_SMP */
 
@@ -1158,3 +1165,5 @@ enum rq_nohz_flag_bits {
 #endif
 
 static inline void select_task_node(struct task_struct *p, struct mm_struct *mm, int sd_flags) { }
+static inline bool account_numa_enqueue(struct task_struct *p) { return false; }
+static inline void account_numa_dequeue(struct task_struct *p) { }



  parent reply	other threads:[~2012-03-16 14:53 UTC|newest]

Thread overview: 304+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-03-16 14:40 [RFC][PATCH 00/26] sched/numa Peter Zijlstra
2012-03-16 14:40 ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 01/26] mm, mpol: Re-implement check_*_range() using walk_page_range() Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 02/26] mm, mpol: Remove NUMA_INTERLEAVE_HIT Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-07-06 10:32   ` Johannes Weiner
2012-07-06 10:32     ` Johannes Weiner
2012-07-06 13:46     ` [tip:sched/core] mm: Fix vmstat names-values off-by-one tip-bot for Johannes Weiner
2012-07-06 14:48     ` [RFC][PATCH 02/26] mm, mpol: Remove NUMA_INTERLEAVE_HIT Minchan Kim
2012-07-06 14:48       ` Minchan Kim
2012-07-06 15:02       ` Peter Zijlstra
2012-07-06 15:02         ` Peter Zijlstra
2012-07-06 14:54   ` Kyungmin Park
2012-07-06 14:54     ` Kyungmin Park
2012-07-06 15:00     ` Peter Zijlstra
2012-07-06 15:00       ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 03/26] mm, mpol: add MPOL_MF_LAZY Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-23 11:50   ` Mel Gorman
2012-03-23 11:50     ` Mel Gorman
2012-07-06 16:38     ` Rik van Riel
2012-07-06 16:38       ` Rik van Riel
2012-07-06 20:04       ` Lee Schermerhorn
2012-07-06 20:04         ` Lee Schermerhorn
2012-07-06 20:27         ` Rik van Riel
2012-07-06 20:27           ` Rik van Riel
2012-07-09 11:48       ` Peter Zijlstra
2012-07-09 11:48         ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 04/26] mm, mpol: add MPOL_MF_NOOP Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-07-06 18:40   ` Rik van Riel
2012-07-06 18:40     ` Rik van Riel
2012-03-16 14:40 ` [RFC][PATCH 05/26] mm, mpol: Check for misplaced page Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 06/26] mm: Migrate " Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-04-03 17:32   ` Dan Smith
2012-04-03 17:32     ` Dan Smith
2012-03-16 14:40 ` [RFC][PATCH 07/26] mm: Handle misplaced anon pages Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 08/26] mm, mpol: Simplify do_mbind() Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 09/26] sched, mm: Introduce tsk_home_node() Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 10/26] mm, mpol: Make mempolicy home-node aware Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 18:34   ` Christoph Lameter
2012-03-16 18:34     ` Christoph Lameter
2012-03-16 21:12     ` Peter Zijlstra
2012-03-16 21:12       ` Peter Zijlstra
2012-03-19 13:53       ` Christoph Lameter
2012-03-19 13:53         ` Christoph Lameter
2012-03-19 14:05         ` Peter Zijlstra
2012-03-19 14:05           ` Peter Zijlstra
2012-03-19 15:16           ` Christoph Lameter
2012-03-19 15:16             ` Christoph Lameter
2012-03-19 15:23             ` Peter Zijlstra
2012-03-19 15:23               ` Peter Zijlstra
2012-03-19 15:31               ` Christoph Lameter
2012-03-19 15:31                 ` Christoph Lameter
2012-03-19 17:09                 ` Peter Zijlstra
2012-03-19 17:09                   ` Peter Zijlstra
2012-03-19 17:28                   ` Peter Zijlstra
2012-03-19 17:28                     ` Peter Zijlstra
2012-03-19 19:06                   ` Christoph Lameter
2012-03-19 19:06                     ` Christoph Lameter
2012-03-19 20:28                   ` Lee Schermerhorn
2012-03-19 20:28                     ` Lee Schermerhorn
2012-03-19 21:21                     ` Peter Zijlstra
2012-03-19 21:21                       ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 11/26] mm, mpol: Lazy migrate a process/vma Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 12/26] sched, mm: sched_{fork,exec} node assignment Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-06-15 18:16   ` Tony Luck
2012-06-15 18:16     ` Tony Luck
2012-06-20 19:12     ` [PATCH] sched: Fix build problems when CONFIG_NUMA=y and CONFIG_SMP=n Luck, Tony
2012-06-20 19:12       ` Luck, Tony
2012-03-16 14:40 ` Peter Zijlstra [this message]
2012-03-16 14:40   ` [RFC][PATCH 13/26] sched: Implement home-node awareness Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 14/26] sched, numa: Numa balancer Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-07-07 18:26   ` Rik van Riel
2012-07-07 18:26     ` Rik van Riel
2012-07-09 12:05     ` Peter Zijlstra
2012-07-09 12:05       ` Peter Zijlstra
2012-07-09 12:23     ` Peter Zijlstra
2012-07-09 12:23       ` Peter Zijlstra
2012-07-09 12:40       ` Peter Zijlstra
2012-07-09 12:40         ` Peter Zijlstra
2012-07-09 14:50         ` Rik van Riel
2012-07-09 14:50           ` Rik van Riel
2012-07-08 18:35   ` Rik van Riel
2012-07-08 18:35     ` Rik van Riel
2012-07-09 12:25     ` Peter Zijlstra
2012-07-09 12:25       ` Peter Zijlstra
2012-07-09 14:54       ` Rik van Riel
2012-07-09 14:54         ` Rik van Riel
2012-07-12 22:02   ` Rik van Riel
2012-07-12 22:02     ` Rik van Riel
2012-07-13 14:45     ` Don Morris
2012-07-13 14:45       ` Don Morris
2012-07-14 16:20       ` Rik van Riel
2012-07-14 16:20         ` Rik van Riel
2012-03-16 14:40 ` [RFC][PATCH 15/26] sched, numa: Implement hotplug hooks Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-19 12:16   ` Srivatsa S. Bhat
2012-03-19 12:16     ` Srivatsa S. Bhat
2012-03-19 12:19     ` Peter Zijlstra
2012-03-19 12:19       ` Peter Zijlstra
2012-03-19 12:27       ` Srivatsa S. Bhat
2012-03-19 12:27         ` Srivatsa S. Bhat
2012-03-16 14:40 ` [RFC][PATCH 16/26] sched, numa: Abstract the numa_entity Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 17/26] srcu: revert1 Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 18/26] srcu: revert2 Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 19/26] srcu: Implement call_srcu() Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 20/26] mm, mpol: Introduce vma_dup_policy() Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 21/26] mm, mpol: Introduce vma_put_policy() Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 22/26] mm, mpol: Split and explose some mempolicy functions Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 23/26] sched, numa: Introduce sys_numa_{t,m}bind() Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 24/26] mm, mpol: Implement numa_group RSS accounting Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 25/26] sched, numa: Only migrate long-running entities Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-07-08 18:34   ` Rik van Riel
2012-07-08 18:34     ` Rik van Riel
2012-07-09 12:26     ` Peter Zijlstra
2012-07-09 12:26       ` Peter Zijlstra
2012-07-09 14:53       ` Rik van Riel
2012-07-09 14:53         ` Rik van Riel
2012-07-09 14:55         ` Peter Zijlstra
2012-07-09 14:55           ` Peter Zijlstra
2012-03-16 14:40 ` [RFC][PATCH 26/26] sched, numa: A few debug bits Peter Zijlstra
2012-03-16 14:40   ` Peter Zijlstra
2012-03-16 18:25 ` [RFC] AutoNUMA alpha6 Andrea Arcangeli
2012-03-16 18:25   ` Andrea Arcangeli
2012-03-19 18:47   ` Peter Zijlstra
2012-03-19 18:47     ` Peter Zijlstra
2012-03-19 19:02     ` Andrea Arcangeli
2012-03-19 19:02       ` Andrea Arcangeli
2012-03-20 23:41   ` Dan Smith
2012-03-20 23:41     ` Dan Smith
2012-03-21  1:00     ` Andrea Arcangeli
2012-03-21  1:00       ` Andrea Arcangeli
2012-03-21  2:12     ` Andrea Arcangeli
2012-03-21  2:12       ` Andrea Arcangeli
2012-03-21  4:01       ` Dan Smith
2012-03-21  4:01         ` Dan Smith
2012-03-21 12:49         ` Andrea Arcangeli
2012-03-21 12:49           ` Andrea Arcangeli
2012-03-21 22:05           ` Dan Smith
2012-03-21 22:05             ` Dan Smith
2012-03-21 22:52             ` Andrea Arcangeli
2012-03-21 22:52               ` Andrea Arcangeli
2012-03-21 23:13               ` Dan Smith
2012-03-21 23:13                 ` Dan Smith
2012-03-21 23:41                 ` Andrea Arcangeli
2012-03-21 23:41                   ` Andrea Arcangeli
2012-03-22  0:17               ` Andrea Arcangeli
2012-03-22  0:17                 ` Andrea Arcangeli
2012-03-22 13:58                 ` Dan Smith
2012-03-22 13:58                   ` Dan Smith
2012-03-22 14:27                   ` Andrea Arcangeli
2012-03-22 18:49                     ` Andrea Arcangeli
2012-03-22 18:49                       ` Andrea Arcangeli
2012-03-22 18:56                       ` Dan Smith
2012-03-22 18:56                         ` Dan Smith
2012-03-22 19:11                         ` Andrea Arcangeli
2012-03-22 19:11                           ` Andrea Arcangeli
2012-03-23 14:15                         ` Andrew Theurer
2012-03-23 14:15                           ` Andrew Theurer
2012-03-23 16:01                           ` Andrea Arcangeli
2012-03-23 16:01                             ` Andrea Arcangeli
2012-03-25 13:30                         ` Andrea Arcangeli
2012-03-25 13:30                           ` Andrea Arcangeli
2012-03-21  7:12       ` Ingo Molnar
2012-03-21  7:12         ` Ingo Molnar
2012-03-21 12:08         ` Andrea Arcangeli
2012-03-21 12:08           ` Andrea Arcangeli
2012-03-21  7:53     ` Ingo Molnar
2012-03-21  7:53       ` Ingo Molnar
2012-03-21 12:17       ` Andrea Arcangeli
2012-03-21 12:17         ` Andrea Arcangeli
2012-03-19  9:57 ` [RFC][PATCH 00/26] sched/numa Avi Kivity
2012-03-19  9:57   ` Avi Kivity
2012-03-19 11:12   ` Peter Zijlstra
2012-03-19 11:12     ` Peter Zijlstra
2012-03-19 11:30     ` Peter Zijlstra
2012-03-19 11:30       ` Peter Zijlstra
2012-03-19 11:39     ` Peter Zijlstra
2012-03-19 11:39       ` Peter Zijlstra
2012-03-19 11:42     ` Avi Kivity
2012-03-19 11:42       ` Avi Kivity
2012-03-19 11:59       ` Peter Zijlstra
2012-03-19 11:59         ` Peter Zijlstra
2012-03-19 12:07         ` Avi Kivity
2012-03-19 12:07           ` Avi Kivity
2012-03-19 12:09       ` Peter Zijlstra
2012-03-19 12:09         ` Peter Zijlstra
2012-03-19 12:16         ` Avi Kivity
2012-03-19 12:16           ` Avi Kivity
2012-03-19 20:03           ` Peter Zijlstra
2012-03-19 20:03             ` Peter Zijlstra
2012-03-20 10:18             ` Avi Kivity
2012-03-20 10:18               ` Avi Kivity
2012-03-20 10:48               ` Peter Zijlstra
2012-03-20 10:48                 ` Peter Zijlstra
2012-03-20 10:52                 ` Avi Kivity
2012-03-20 10:52                   ` Avi Kivity
2012-03-20 11:07                   ` Peter Zijlstra
2012-03-20 11:07                     ` Peter Zijlstra
2012-03-20 11:48                     ` Avi Kivity
2012-03-20 11:48                       ` Avi Kivity
2012-03-19 12:20       ` Peter Zijlstra
2012-03-19 12:20         ` Peter Zijlstra
2012-03-19 12:24         ` Avi Kivity
2012-03-19 12:24           ` Avi Kivity
2012-03-19 15:44           ` Avi Kivity
2012-03-19 15:44             ` Avi Kivity
2012-03-19 13:40       ` Andrea Arcangeli
2012-03-19 13:40         ` Andrea Arcangeli
2012-03-19 20:06         ` Peter Zijlstra
2012-03-19 20:06           ` Peter Zijlstra
2012-03-19 13:04     ` Andrea Arcangeli
2012-03-19 13:04       ` Andrea Arcangeli
2012-03-19 13:26       ` Peter Zijlstra
2012-03-19 13:26         ` Peter Zijlstra
2012-03-19 13:57         ` Andrea Arcangeli
2012-03-19 13:57           ` Andrea Arcangeli
2012-03-19 14:06           ` Avi Kivity
2012-03-19 14:06             ` Avi Kivity
2012-03-19 14:30             ` Andrea Arcangeli
2012-03-19 14:30               ` Andrea Arcangeli
2012-03-19 18:42               ` Peter Zijlstra
2012-03-19 18:42                 ` Peter Zijlstra
2012-03-20 22:18                 ` Rik van Riel
2012-03-20 22:18                   ` Rik van Riel
2012-03-21 16:50                   ` Andrea Arcangeli
2012-03-21 16:50                     ` Andrea Arcangeli
2012-04-02 16:34                   ` Pekka Enberg
2012-04-02 16:34                     ` Pekka Enberg
2012-04-02 16:55                     ` Rik van Riel
2012-04-02 16:55                       ` Rik van Riel
2012-04-02 16:54                       ` Pekka Enberg
2012-04-02 16:54                         ` Pekka Enberg
2012-04-02 17:12                         ` Pekka Enberg
2012-04-02 17:12                           ` Pekka Enberg
2012-04-02 17:23                           ` Pekka Enberg
2012-04-02 17:23                             ` Pekka Enberg
2012-03-19 14:07           ` Peter Zijlstra
2012-03-19 14:07             ` Peter Zijlstra
2012-03-19 14:34             ` Andrea Arcangeli
2012-03-19 14:34               ` Andrea Arcangeli
2012-03-19 18:41               ` Peter Zijlstra
2012-03-19 18:41                 ` Peter Zijlstra
2012-03-19 19:13           ` Peter Zijlstra
2012-03-19 19:13             ` Peter Zijlstra
2012-03-19 14:07         ` Andrea Arcangeli
2012-03-19 14:07           ` Andrea Arcangeli
2012-03-19 19:05           ` Peter Zijlstra
2012-03-19 19:05             ` Peter Zijlstra
2012-03-19 13:26       ` Peter Zijlstra
2012-03-19 13:26         ` Peter Zijlstra
2012-03-19 14:16         ` Andrea Arcangeli
2012-03-19 14:16           ` Andrea Arcangeli
2012-03-19 13:29       ` Peter Zijlstra
2012-03-19 13:29         ` Peter Zijlstra
2012-03-19 14:19         ` Andrea Arcangeli
2012-03-19 14:19           ` Andrea Arcangeli
2012-03-19 13:39       ` Peter Zijlstra
2012-03-19 13:39         ` Peter Zijlstra
2012-03-19 14:20         ` Andrea Arcangeli
2012-03-19 14:20           ` Andrea Arcangeli
2012-03-19 20:17           ` Christoph Lameter
2012-03-19 20:17             ` Christoph Lameter
2012-03-19 20:28             ` Ingo Molnar
2012-03-19 20:28               ` Ingo Molnar
2012-03-19 20:43               ` Christoph Lameter
2012-03-19 20:43                 ` Christoph Lameter
2012-03-19 21:34                 ` Ingo Molnar
2012-03-19 21:34                   ` Ingo Molnar
2012-03-20  0:05               ` Linus Torvalds
2012-03-20  0:05                 ` Linus Torvalds
2012-03-20  7:31                 ` Ingo Molnar
2012-03-20  7:31                   ` Ingo Molnar
2012-03-21 22:53 ` Nish Aravamudan
2012-03-21 22:53   ` Nish Aravamudan
2012-03-22  9:45   ` Peter Zijlstra
2012-03-22  9:45     ` Peter Zijlstra
2012-03-22 10:34     ` Ingo Molnar
2012-03-22 10:34       ` Ingo Molnar
2012-03-24  1:41     ` Nish Aravamudan
2012-03-24  1:41       ` Nish Aravamudan
2012-03-26 11:42       ` Peter Zijlstra
2012-03-26 11:42         ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120316144240.952119284@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=Lee.Schermerhorn@hp.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=bharata.rao@gmail.com \
    --cc=danms@us.ibm.com \
    --cc=efault@gmx.de \
    --cc=hannes@cmpxchg.org \
    --cc=laijs@cn.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mingo@elte.hu \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=pjt@google.com \
    --cc=riel@redhat.com \
    --cc=suresh.b.siddha@intel.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.