From: Li Zefan <lizf@cn.fujitsu.com>
To: Paul Jackson <pj@sgi.com>,
Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
Paul Menage <menage@google.com>,
Peter Zijlstra <peterz@infradead.org>,
Andrew Morton <akpm@linux-foundation.org>,
Lai Jiangshan <laijs@cn.fujitsu.com>
Subject: [RFC] [PATCH] cpuset: fix wrong calculation of relax domain level
Date: Thu, 17 Jul 2008 16:07:44 +0800 [thread overview]
Message-ID: <487EFDD0.2060101@cn.fujitsu.com> (raw)
When multiple cpusets are overlapping in their 'cpus' and hence they
form a single sched domain, the largest sched_relax_domain_level among
those should be used. But when top_cpuset's sched_load_balance is
set, its sched_relax_domain_level is used regardless other sub-cpusets'.
There are several proposals to solve this:
1) Travel the cpuset hierarchy to find the largest relax_domain_level
in rebuild_sched_domains(). But cpuset avoids hierarchy travelling
when top_cpuset.sched_load_balance is set.
2) Remember the largest relax_domain_level when we update a cpuset's
sched_load_balance, sched_relax_domain_level and cpus. This should
work, but seems a bit tricky and a bit ugly. (As this patch shows)
3) Don't treat this as a bug, but document this behavior.
Reported-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
cpuset.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 48 insertions(+), 2 deletions(-)
--- linux-mm.orig/kernel/cpuset.c 2008-07-17 15:02:12.000000000 +0800
+++ linux-mm/kernel/cpuset.c 2008-07-17 15:01:18.000000000 +0800
@@ -69,6 +69,14 @@ int number_of_cpusets __read_mostly;
struct cgroup_subsys cpuset_subsys;
struct cpuset;
+/*
+ * Tracks # of cpusets in each relax domain level. This is to avoid
+ * travelling the cpuset hierachy in rebuild_sched_domains()
+ * when top_cpuset.sched_load_balance == 1.
+ */
+static unsigned int __cpusets_rd_lv[SD_LV_MAX+1];
+static unsigned int *cpusets_rd_lv = __cpusets_rd_lv + 1;
+
/* See "Frequency meter" comments, below. */
struct fmeter {
@@ -594,6 +602,14 @@ static void rebuild_sched_domains(void)
update_domain_attr(dattr, &top_cpuset);
}
*doms = top_cpuset.cpus_allowed;
+
+ for (i = SD_LV_MAX - 1; i >= 0; i--) {
+ if (cpusets_rd_lv[i] && dattr) {
+ dattr->relax_domain_level = i;
+ break;
+ }
+ }
+
goto rebuild;
}
@@ -807,6 +823,7 @@ static int update_cpumask(struct cpuset
struct cpuset trialcs;
int retval;
int is_load_balanced;
+ int cpus_empty_changed;
/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
if (cs == &top_cpuset)
@@ -839,11 +856,20 @@ static int update_cpumask(struct cpuset
return 0;
is_load_balanced = is_sched_load_balance(&trialcs);
+ cpus_empty_changed = (cpus_empty(cs->cpus_allowed) !=
+ cpus_empty(trialcs.cpus_allowed));
mutex_lock(&callback_mutex);
cs->cpus_allowed = trialcs.cpus_allowed;
mutex_unlock(&callback_mutex);
+ if (is_load_balanced && cpus_empty_changed) {
+ if (cpus_empty(cs->cpus_allowed))
+ cpusets_rd_lv[cs->relax_domain_level]--;
+ else
+ cpusets_rd_lv[cs->relax_domain_level]++;
+ }
+
/*
* Scan tasks in the cpuset, and update the cpumasks of any
* that need an update.
@@ -1074,12 +1100,19 @@ int current_cpuset_is_being_rebound(void
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
+ int need_rebuild = (!cpus_empty(cs->cpus_allowed) &&
+ is_sched_load_balance(cs));
+
if (val < -1 || val >= SD_LV_MAX)
return -EINVAL;
if (val != cs->relax_domain_level) {
+ if (need_rebuild) {
+ cpusets_rd_lv[cs->relax_domain_level]--;
+ cpusets_rd_lv[val]++;
+ }
cs->relax_domain_level = val;
- if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+ if (need_rebuild)
rebuild_sched_domains();
}
@@ -1120,8 +1153,13 @@ static int update_flag(cpuset_flagbits_t
cs->flags = trialcs.flags;
mutex_unlock(&callback_mutex);
- if (cpus_nonempty && balance_flag_changed)
+ if (cpus_nonempty && balance_flag_changed) {
+ if (is_sched_load_balance(cs))
+ cpusets_rd_lv[cs->relax_domain_level]++;
+ else
+ cpusets_rd_lv[cs->relax_domain_level]--;
rebuild_sched_domains();
+ }
return 0;
}
@@ -1856,6 +1894,7 @@ static void scan_for_empty_cpusets(const
struct list_head queue;
struct cgroup *cont;
nodemask_t oldmems;
+ cpumask_t oldcpus;
INIT_LIST_HEAD(&queue);
@@ -1876,6 +1915,7 @@ static void scan_for_empty_cpusets(const
continue;
oldmems = cp->mems_allowed;
+ oldcpus = cp->cpus_allowed;
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
@@ -1884,6 +1924,12 @@ static void scan_for_empty_cpusets(const
node_states[N_HIGH_MEMORY]);
mutex_unlock(&callback_mutex);
+ if (is_sched_load_balance(cp)) {
+ if (cpus_empty(cp->cpus_allowed) &&
+ !cpus_empty(oldcpus))
+ cpusets_rd_lv[cp->relax_domain_level]--;
+ }
+
/* Move tasks from the empty cpuset to a parent */
if (cpus_empty(cp->cpus_allowed) ||
nodes_empty(cp->mems_allowed))
next reply other threads:[~2008-07-17 8:09 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-07-17 8:07 Li Zefan [this message]
2008-07-17 8:57 ` [RFC] [PATCH] cpuset: fix wrong calculation of relax domain level Hidetoshi Seto
2008-07-17 10:13 ` Li Zefan
2008-07-17 20:09 ` Paul Jackson
2008-07-17 20:28 ` Paul Jackson
2008-07-18 0:26 ` Hidetoshi Seto
2008-07-18 0:35 ` Paul Jackson
2008-07-18 2:36 ` Li Zefan
2008-07-18 2:43 ` Paul Jackson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=487EFDD0.2060101@cn.fujitsu.com \
--to=lizf@cn.fujitsu.com \
--cc=akpm@linux-foundation.org \
--cc=laijs@cn.fujitsu.com \
--cc=linux-kernel@vger.kernel.org \
--cc=menage@google.com \
--cc=peterz@infradead.org \
--cc=pj@sgi.com \
--cc=seto.hidetoshi@jp.fujitsu.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.