All of lore.kernel.org
 help / color / mirror / Atom feed
From: Li Zefan <lizf@cn.fujitsu.com>
To: Paul Jackson <pj@sgi.com>,
	Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Paul Menage <menage@google.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Lai Jiangshan <laijs@cn.fujitsu.com>
Subject: [RFC] [PATCH] cpuset: fix wrong calculation of relax domain level
Date: Thu, 17 Jul 2008 16:07:44 +0800	[thread overview]
Message-ID: <487EFDD0.2060101@cn.fujitsu.com> (raw)

When multiple cpusets are overlapping in their 'cpus' and hence they
form a single sched domain, the largest sched_relax_domain_level among
those should be used. But when top_cpuset's sched_load_balance is
set, its sched_relax_domain_level is used regardless other sub-cpusets'.

There are several proposals to solve this:

1) Travel the cpuset hierarchy to find the largest relax_domain_level
   in rebuild_sched_domains(). But cpuset avoids hierarchy travelling
   when top_cpuset.sched_load_balance is set.

2) Remember the largest relax_domain_level when we update a cpuset's
   sched_load_balance, sched_relax_domain_level and cpus. This should
   work, but seems a bit tricky and a bit ugly. (As this patch shows)

3) Don't treat this as a bug, but document this behavior.


Reported-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 cpuset.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

--- linux-mm.orig/kernel/cpuset.c	2008-07-17 15:02:12.000000000 +0800
+++ linux-mm/kernel/cpuset.c	2008-07-17 15:01:18.000000000 +0800
@@ -69,6 +69,14 @@ int number_of_cpusets __read_mostly;
 struct cgroup_subsys cpuset_subsys;
 struct cpuset;
 
+/*
+ * Tracks # of cpusets in each relax domain level. This is to avoid
+ * travelling the cpuset hierachy in rebuild_sched_domains()
+ * when top_cpuset.sched_load_balance == 1.
+ */
+static unsigned int __cpusets_rd_lv[SD_LV_MAX+1];
+static unsigned int *cpusets_rd_lv = __cpusets_rd_lv + 1;
+
 /* See "Frequency meter" comments, below. */
 
 struct fmeter {
@@ -594,6 +602,14 @@ static void rebuild_sched_domains(void)
 			update_domain_attr(dattr, &top_cpuset);
 		}
 		*doms = top_cpuset.cpus_allowed;
+
+		for (i = SD_LV_MAX - 1; i >= 0; i--) {
+			if (cpusets_rd_lv[i] && dattr) {
+				dattr->relax_domain_level = i;
+				break;
+			}
+		}
+
 		goto rebuild;
 	}
 
@@ -807,6 +823,7 @@ static int update_cpumask(struct cpuset 
 	struct cpuset trialcs;
 	int retval;
 	int is_load_balanced;
+	int cpus_empty_changed;
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
 	if (cs == &top_cpuset)
@@ -839,11 +856,20 @@ static int update_cpumask(struct cpuset 
 		return 0;
 
 	is_load_balanced = is_sched_load_balance(&trialcs);
+	cpus_empty_changed = (cpus_empty(cs->cpus_allowed) !=
+			      cpus_empty(trialcs.cpus_allowed));
 
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
+	if (is_load_balanced && cpus_empty_changed) {
+		if (cpus_empty(cs->cpus_allowed))
+			cpusets_rd_lv[cs->relax_domain_level]--;
+		else
+			cpusets_rd_lv[cs->relax_domain_level]++;
+	}
+
 	/*
 	 * Scan tasks in the cpuset, and update the cpumasks of any
 	 * that need an update.
@@ -1074,12 +1100,19 @@ int current_cpuset_is_being_rebound(void
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
+	int need_rebuild = (!cpus_empty(cs->cpus_allowed) &&
+			    is_sched_load_balance(cs));
+
 	if (val < -1 || val >= SD_LV_MAX)
 		return -EINVAL;
 
 	if (val != cs->relax_domain_level) {
+		if (need_rebuild) {
+			cpusets_rd_lv[cs->relax_domain_level]--;
+			cpusets_rd_lv[val]++;
+		}
 		cs->relax_domain_level = val;
-		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+		if (need_rebuild)
 			rebuild_sched_domains();
 	}
 
@@ -1120,8 +1153,13 @@ static int update_flag(cpuset_flagbits_t
 	cs->flags = trialcs.flags;
 	mutex_unlock(&callback_mutex);
 
-	if (cpus_nonempty && balance_flag_changed)
+	if (cpus_nonempty && balance_flag_changed) {
+		if (is_sched_load_balance(cs))
+			cpusets_rd_lv[cs->relax_domain_level]++;
+		else
+			cpusets_rd_lv[cs->relax_domain_level]--;
 		rebuild_sched_domains();
+	}
 
 	return 0;
 }
@@ -1856,6 +1894,7 @@ static void scan_for_empty_cpusets(const
 	struct list_head queue;
 	struct cgroup *cont;
 	nodemask_t oldmems;
+	cpumask_t oldcpus;
 
 	INIT_LIST_HEAD(&queue);
 
@@ -1876,6 +1915,7 @@ static void scan_for_empty_cpusets(const
 			continue;
 
 		oldmems = cp->mems_allowed;
+		oldcpus = cp->cpus_allowed;
 
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
@@ -1884,6 +1924,12 @@ static void scan_for_empty_cpusets(const
 						node_states[N_HIGH_MEMORY]);
 		mutex_unlock(&callback_mutex);
 
+		if (is_sched_load_balance(cp)) {
+			if (cpus_empty(cp->cpus_allowed) &&
+			    !cpus_empty(oldcpus))
+				cpusets_rd_lv[cp->relax_domain_level]--;
+		}
+
 		/* Move tasks from the empty cpuset to a parent */
 		if (cpus_empty(cp->cpus_allowed) ||
 		     nodes_empty(cp->mems_allowed))

             reply	other threads:[~2008-07-17  8:09 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-07-17  8:07 Li Zefan [this message]
2008-07-17  8:57 ` [RFC] [PATCH] cpuset: fix wrong calculation of relax domain level Hidetoshi Seto
2008-07-17 10:13   ` Li Zefan
2008-07-17 20:09     ` Paul Jackson
2008-07-17 20:28 ` Paul Jackson
2008-07-18  0:26   ` Hidetoshi Seto
2008-07-18  0:35     ` Paul Jackson
2008-07-18  2:36   ` Li Zefan
2008-07-18  2:43     ` Paul Jackson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=487EFDD0.2060101@cn.fujitsu.com \
    --to=lizf@cn.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=laijs@cn.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=menage@google.com \
    --cc=peterz@infradead.org \
    --cc=pj@sgi.com \
    --cc=seto.hidetoshi@jp.fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.