public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: venkatesh.pallipadi@intel.com
To: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Gautham R Shenoy <ego@in.ibm.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>, Thomas Gleixner <tglx@linutronix.de>,
	Arjan van de Ven <arjan@infradead.org>,
	linux-kernel@vger.kernel.org,
	Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>,
	Suresh Siddha <suresh.b.siddha@intel.com>
Subject: [patch 2/2] RFC sched: Scale the nohz_tracker logic by making it per NUMA node
Date: Wed, 17 Jun 2009 11:26:51 -0700	[thread overview]
Message-ID: <20090617182741.107659000@intel.com> (raw)
In-Reply-To: 20090617182649.604970000@intel.com

[-- Attachment #1: 0002-sched-Scale-the-nohz_tracker-logic-by-making-it-per.patch --]
[-- Type: text/plain, Size: 11203 bytes --]

Having one idle CPU doing the rebalancing for all the idle CPUs in
nohz mode does not scale well with increasing number of cores and
sockets. Make the nohz_tracker per NUMA node. This results in multiple
instances of idle load balancing, one per NUMA node, with each idle
load balancer doing the rebalancing only among the other nohz CPUs in
its NUMA node.

This addresses the following problem with the current nohz ilb logic:
* The lone balancer may end up spending a lot of time doing the balancing on
  behalf of nohz CPUs, especially with increasing numbers of sockets and
  cores in the platform.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 kernel/sched.c |  162 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 124 insertions(+), 38 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 22fe762..49d3bb7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4408,16 +4408,74 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 	double_unlock_balance(busiest_rq, target_rq);
 }
 
-#ifdef CONFIG_NO_HZ
-static struct {
+struct nohz_tracker {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
-	cpumask_var_t ilb_grp_nohz_mask;
+	cpumask_var_t tmp_nohz_mask;
 	unsigned long next_balance; /* units in jiffies */
-} nohz ____cacheline_aligned = {
-	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#ifdef CONFIG_NO_HZ
+static DEFINE_PER_CPU(struct nohz_tracker *, cpu_node_nohz_ptr);
+static struct nohz_tracker **nohz_tracker_ptrs;
+
+int alloc_node_nohz_tracker(void)
+{
+	int i, j;
+
+	/* Do all the allocations only once per boot */
+	if (nohz_tracker_ptrs)
+		return 0;
+
+	nohz_tracker_ptrs = kcalloc(nr_node_ids, sizeof(struct nohz_tracker *),
+				    GFP_KERNEL);
+	if (!nohz_tracker_ptrs) {
+		printk(KERN_WARNING "Can not alloc nohz trackers\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_node_ids; i++) {
+		nohz_tracker_ptrs[i] = kmalloc_node(sizeof(struct nohz_tracker),
+						    GFP_KERNEL, i);
+		if (!nohz_tracker_ptrs[i]) {
+			printk(KERN_WARNING "Can not alloc domain group for "
+				"node %d\n", i);
+			goto free_ret;
+		}
+
+		if (!alloc_cpumask_var_node(&nohz_tracker_ptrs[i]->cpu_mask,
+					    GFP_KERNEL, i)) {
+			kfree(nohz_tracker_ptrs[i]);
+			goto free_ret;
+		}
+
+		if (!alloc_cpumask_var_node(&nohz_tracker_ptrs[i]->tmp_nohz_mask,
+					    GFP_KERNEL, i)) {
+			free_cpumask_var(nohz_tracker_ptrs[i]->cpu_mask);
+			kfree(nohz_tracker_ptrs[i]);
+			goto free_ret;
+		}
+		atomic_set(&nohz_tracker_ptrs[i]->load_balancer, -1);
+	}
+
+	return 0;
+
+free_ret:
+	for (j = 0; j < i; j++) {
+		free_cpumask_var(nohz_tracker_ptrs[j]->tmp_nohz_mask);
+		free_cpumask_var(nohz_tracker_ptrs[j]->cpu_mask);
+		kfree(nohz_tracker_ptrs[j]);
+	}
+
+	kfree(nohz_tracker_ptrs);
+
+	for_each_online_cpu(i)
+		per_cpu(cpu_node_nohz_ptr, i) = NULL;
+
+	nohz_tracker_ptrs = NULL;
+	return -ENOMEM;
+}
+
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 /**
  * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4456,6 +4514,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 /**
  * is_semi_idle_group - Checks if the given sched_group is semi-idle.
  * @ilb_group:	group to be checked for semi-idleness
+ * @node_nohz: nohz_tracker for the node
  *
  * Returns:	1 if the group is semi-idle. 0 otherwise.
  *
@@ -4463,19 +4522,20 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  * and atleast one non-idle CPU. This helper function checks if the given
  * sched_group is semi-idle or not.
  */
-static inline int is_semi_idle_group(struct sched_group *ilb_group)
+static inline int is_semi_idle_group(struct sched_group *ilb_group,
+				struct nohz_tracker *node_nohz)
 {
-	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+	cpumask_and(node_nohz->tmp_nohz_mask, node_nohz->cpu_mask,
 					sched_group_cpus(ilb_group));
 
 	/*
 	 * A sched_group is semi-idle when it has atleast one busy cpu
 	 * and atleast one idle cpu.
 	 */
-	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+	if (cpumask_empty(node_nohz->tmp_nohz_mask))
 		return 0;
 
-	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+	if (cpumask_equal(node_nohz->tmp_nohz_mask, sched_group_cpus(ilb_group)))
 		return 0;
 
 	return 1;
@@ -4483,6 +4543,7 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
 /**
  * find_new_power_opt_ilb - Finds the optimum idle load balancer for nomination.
  * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ * @node_nohz:	nohz_tracker for the node
  *
  * Returns:	Returns the id of the idle load balancer if it exists,
  *		Else, returns >= nr_cpu_ids.
@@ -4492,7 +4553,7 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
  * completely idle packages/cores just for the purpose of idle load balancing
  * when there are other idle cpu's which are better suited for that job.
  */
-static int find_new_power_opt_ilb(int cpu)
+static int find_new_power_opt_ilb(int cpu, struct nohz_tracker *node_nohz)
 {
 	struct sched_domain *sd;
 	struct sched_group *ilb_group;
@@ -4508,15 +4569,15 @@ static int find_new_power_opt_ilb(int cpu)
 	 * Optimize for the case when we have no idle CPUs or only one
 	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
 	 */
-	if (cpumask_weight(nohz.cpu_mask) < 2)
+	if (cpumask_weight(node_nohz->cpu_mask) < 2)
 		goto out_done;
 
 	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
 		ilb_group = sd->groups;
 
 		do {
-			if (is_semi_idle_group(ilb_group))
-				return cpumask_first(nohz.ilb_grp_nohz_mask);
+			if (is_semi_idle_group(ilb_group, node_nohz))
+				return cpumask_first(node_nohz->tmp_nohz_mask);
 
 			ilb_group = ilb_group->next;
 
@@ -4527,15 +4588,26 @@ out_done:
 	return nr_cpu_ids;
 }
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_power_opt_ilb(int call_cpu)
+static inline int find_new_power_opt_ilb(int call_cpu,
+						struct nohz_tracker *node_nohz)
 {
 	return nr_cpu_ids;
 }
 #endif
 
+static int get_nohz_load_balancer_node(struct nohz_tracker *node_nohz)
+{
+	if (!node_nohz)
+		return -1;
+
+	return atomic_read(&node_nohz->load_balancer);
+}
+
 int get_nohz_load_balancer(void)
 {
-	return atomic_read(&nohz.load_balancer);
+	int cpu = smp_processor_id();
+	struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, cpu);
+	return get_nohz_load_balancer_node(node_nohz);
 }
 
 /*
@@ -4547,13 +4619,17 @@ static void nohz_balancer_kick(int cpu)
 {
 	int ilb_cpu;
 	unsigned long now = jiffies;
+	struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, cpu);
+
+	if (!node_nohz)
+		return;
 
-	if (time_before(now, nohz.next_balance))
+	if (time_before(now, node_nohz->next_balance))
 		return;
 
-	ilb_cpu = get_nohz_load_balancer();
+	ilb_cpu = get_nohz_load_balancer_node(node_nohz);
 	if (ilb_cpu < 0) {
-		ilb_cpu = cpumask_first(nohz.cpu_mask);
+		ilb_cpu = cpumask_first(node_nohz->cpu_mask);
 		if (ilb_cpu >= nr_cpu_ids)
 			return;
 	}
@@ -4579,31 +4655,35 @@ static void nohz_balancer_kick(int cpu)
 int select_nohz_load_balancer(int stop_tick)
 {
 	int cpu = smp_processor_id();
+	struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, cpu);
+
+	if (!node_nohz)
+		return 0;
 
 	if (stop_tick) {
 		cpu_rq(cpu)->in_nohz_recently = 1;
 
 		if (!cpu_active(cpu)) {
-			if (atomic_read(&nohz.load_balancer) != cpu)
+			if (atomic_read(&node_nohz->load_balancer) != cpu)
 				return 0;
 
 			/*
 			 * If we are going offline and still the leader,
 			 * give up!
 			 */
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+			if (atomic_cmpxchg(&node_nohz->load_balancer, cpu, -1) != cpu)
 				BUG();
 
 			return 0;
 		}
 
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
+		cpumask_set_cpu(cpu, node_nohz->cpu_mask);
 
-		if (atomic_read(&nohz.load_balancer) == -1) {
+		if (atomic_read(&node_nohz->load_balancer) == -1) {
 			int new_ilb;
 
 			/* make me the ilb owner */
-			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) != -1) 
+			if (atomic_cmpxchg(&node_nohz->load_balancer, -1, cpu) != -1) 
 				return 0;
 
 			/*
@@ -4614,20 +4694,20 @@ int select_nohz_load_balancer(int stop_tick)
 			      sched_mc_power_savings))
 				return 0;
 
-			new_ilb = find_new_power_opt_ilb(cpu);
+			new_ilb = find_new_power_opt_ilb(cpu, node_nohz);
 			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-				atomic_set(&nohz.load_balancer, -1);
+				atomic_set(&node_nohz->load_balancer, -1);
 				resched_cpu(new_ilb);
 			}
 		}
 	} else {
-		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
+		if (!cpumask_test_cpu(cpu, node_nohz->cpu_mask))
 			return 0;
 
-		cpumask_clear_cpu(cpu, nohz.cpu_mask);
+		cpumask_clear_cpu(cpu, node_nohz->cpu_mask);
 
-		if (atomic_read(&nohz.load_balancer) == cpu)
-			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+		if (atomic_read(&node_nohz->load_balancer) == cpu)
+			if (atomic_cmpxchg(&node_nohz->load_balancer, cpu, -1) != cpu)
 				BUG();
 	}
 	return 0;
@@ -4732,6 +4812,8 @@ static void run_rebalance_domains(struct softirq_action *h)
  */
 static void nohz_idle_balance(int this_cpu, struct rq *this_rq)
 {
+	struct nohz_tracker *node_nohz = per_cpu(cpu_node_nohz_ptr, this_cpu);
+
 	rebalance_domains(this_cpu, CPU_IDLE);
 
 	/*
@@ -4739,11 +4821,11 @@ static void nohz_idle_balance(int this_cpu, struct rq *this_rq)
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
-	if (this_rq->nohz_balance_kick) {
+	if (this_rq->nohz_balance_kick && node_nohz) {
 		struct rq *rq;
 		int balance_cpu;
 
-		for_each_cpu(balance_cpu, nohz.cpu_mask) {
+		for_each_cpu(balance_cpu, node_nohz->cpu_mask) {
 			if (balance_cpu == this_cpu)
 				continue;
 
@@ -4761,7 +4843,7 @@ static void nohz_idle_balance(int this_cpu, struct rq *this_rq)
 			if (time_after(this_rq->next_balance, rq->next_balance))
 				this_rq->next_balance = rq->next_balance;
 		}
-		nohz.next_balance = this_rq->next_balance;
+		node_nohz->next_balance = this_rq->next_balance;
 		this_rq->nohz_balance_kick = 0;
 	}
 }
@@ -8615,6 +8697,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	}
 #endif
 
+	if (alloc_node_nohz_tracker())
+		goto error;
+
+	for_each_cpu(i, cpu_map) {
+		per_cpu(cpu_node_nohz_ptr, i) =
+					nohz_tracker_ptrs[cpu_to_node(i)];
+	}
+
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu(i, cpu_map) {
@@ -8692,12 +8782,12 @@ free_sched_groups:
 #endif
 	goto free_tmpmask;
 
-#ifdef CONFIG_NUMA
 error:
+#ifdef CONFIG_NUMA
 	free_sched_groups(cpu_map, tmpmask);
 	free_rootdomain(rd);
-	goto free_tmpmask;
 #endif
+	goto free_tmpmask;
 }
 
 static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9386,10 +9476,6 @@ void __init sched_init(void)
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
 	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
-#ifdef CONFIG_NO_HZ
-	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
-#endif
 	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
-- 
1.6.0.6

-- 


  parent reply	other threads:[~2009-06-17 18:36 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-06-17 18:26 [patch 0/2] RFC sched: Change nohz ilb logic from poll to push model venkatesh.pallipadi
2009-06-17 18:26 ` [patch 1/2] RFC sched: Change the nohz ilb logic from pull " venkatesh.pallipadi
2009-06-17 18:26 ` venkatesh.pallipadi [this message]
2009-06-17 19:21   ` [patch 2/2] RFC sched: Scale the nohz_tracker logic by making it per NUMA node Vaidyanathan Srinivasan
2009-06-17 19:16 ` [patch 0/2] RFC sched: Change nohz ilb logic from poll to push model Vaidyanathan Srinivasan
2009-06-18 23:41   ` Pallipadi, Venkatesh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090617182741.107659000@intel.com \
    --to=venkatesh.pallipadi@intel.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=arjan@infradead.org \
    --cc=ego@in.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=suresh.b.siddha@intel.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox