From: Liu Ping Fan <kernelfans@gmail.com>
To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, qemu-devel@nongnu.org
Cc: Peter Zijlstra <peterz@infradead.org>,
Ingo Molnar <mingo@redhat.com>, Avi Kivity <avi@redhat.com>,
Anthony Liguori <anthony@codemonkey.ws>
Subject: [Qemu-devel] [PATCH 1/2] sched: add virt sched domain for the guest
Date: Wed, 23 May 2012 14:32:28 +0800 [thread overview]
Message-ID: <1337754751-9018-2-git-send-email-kernelfans@gmail.com> (raw)
In-Reply-To: <1337754751-9018-1-git-send-email-kernelfans@gmail.com>
From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
The guest's scheduler cannot see the host's NUMA info, and
this leads to the following scenario:
Suppose vcpu-a runs on nodeA and vcpu-b runs on nodeB. During load
balancing, pulling and pushing tasks between these vcpus costs more.
Unfortunately, the guest is currently blind to this.
This patch exports the host's NUMA info to the guest and helps the
guest rebuild its sched domains based on the host's info.
--todo:
vcpu hotplug still needs to be considered.
Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
kernel/cpuset.c | 2 +-
kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 5 ++++
3 files changed, 71 insertions(+), 1 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070..1246091 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -778,7 +778,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
* to a separate workqueue thread, which ends up processing the
* above do_rebuild_sched_domains() function.
*/
-static void async_rebuild_sched_domains(void)
+void async_rebuild_sched_domains(void)
{
queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae..3f72c1a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6343,6 +6343,60 @@ static struct sched_domain_topology_level default_topology[] = {
{ NULL, },
};
+#ifdef CONFIG_VIRT_SCHED_DOMAIN
+/* fill in by host */
+DEFINE_PER_CPU(int, virt_numa_node);
+/* todo, exchange info about HOST_NUMNODES from host */
+#define HOST_NUMNODES 128
+/* keep map, node->cpumask; todo, make it dynamic allocated */
+static struct cpumask virt_node_to_cpumask_map[HOST_NUMNODES];
+
+static inline int virt_cpu_to_node(int cpu)
+{
+ return per_cpu(virt_numa_node, cpu);
+}
+
+const struct cpumask *virt_cpumask_of_node(int vnode)
+{
+ struct cpumask *msk = &virt_node_to_cpumask_map[vnode];
+ return msk;
+}
+
+static const struct cpumask *virt_cpu_cpu_mask(int cpu)
+{
+ return virt_cpumask_of_node(virt_cpu_to_node(cpu));
+}
+
+static struct sched_domain_topology_level virt_topology[] = {
+ { sd_init_CPU, virt_cpu_cpu_mask, },
+#ifdef CONFIG_NUMA
+ { sd_init_ALLNODES, cpu_allnodes_mask, },
+#endif
+ { NULL, },
+};
+
+static int update_virt_numa_node(void)
+{
+ int i, cpu, apicid, vnode;
+ for (i = 0; i < HOST_NUMNODES; i++)
+ cpumask_clear(&virt_node_to_cpumask_map[i]);
+ for_each_possible_cpu(cpu) {
+ apicid = cpu_physical_id(cpu);
+ vnode = __vapicid_to_vnode[apicid];
+ per_cpu(virt_numa_node, cpu) = vnode;
+ cpumask_set_cpu(cpu, &virt_node_to_cpumask_map[vnode]);
+ }
+ return 0;
+}
+
+int rebuild_virt_sd(void)
+{
+ update_virt_numa_node();
+ async_rebuild_sched_domains();
+ return 0;
+}
+#endif
+
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6689,9 +6743,11 @@ match1:
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < ndoms_cur && !new_topology; j++) {
+#ifndef CONFIG_VIRT_SCHED_DOMAIN
if (cpumask_equal(doms_new[i], doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
+#endif
}
/* no match - add a new doms_new */
build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
@@ -6837,6 +6893,15 @@ void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
+#ifdef CONFIG_VIRT_SCHED_DOMAIN
+ int i;
+ for (i = 0; i < MAX_LOCAL_APIC; i++) {
+ /* pretend all on the same node */
+ __vapicid_to_vnode[i] = 0;
+ }
+ update_virt_numa_node();
+ sched_domain_topology = virt_topology;
+#endif
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba..232482d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -8,6 +8,9 @@
extern __read_mostly int scheduler_running;
+#ifdef CONFIG_VIRT_SCHED_DOMAIN
+extern s16 __vapicid_to_vnode[];
+#endif
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -198,6 +201,8 @@ struct cfs_bandwidth { };
#endif /* CONFIG_CGROUP_SCHED */
+extern void async_rebuild_sched_domains(void);
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
--
1.7.4.4
next prev parent reply other threads:[~2012-05-23 6:33 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-05-23 6:32 [Qemu-devel] [RFC] kvm: export host NUMA info to guest's scheduler Liu Ping Fan
2012-05-23 6:32 ` Liu Ping Fan [this message]
2012-05-23 7:54 ` [Qemu-devel] [PATCH 1/2] sched: add virt sched domain for the guest Peter Zijlstra
2012-05-23 8:10 ` Liu ping fan
2012-05-23 8:23 ` Peter Zijlstra
2012-05-23 8:34 ` Liu ping fan
2012-05-23 8:48 ` Peter Zijlstra
2012-05-23 9:58 ` Liu ping fan
2012-05-23 10:14 ` Peter Zijlstra
2012-05-23 15:23 ` Dave Hansen
2012-05-23 15:52 ` Peter Zijlstra
2012-05-23 6:32 ` [Qemu-devel] [PATCH 2/2] sched: add virt domain device's driver Liu Ping Fan
2012-05-23 6:32 ` [Qemu-devel] [PATCH] kvm: collect vcpus' numa info for guest's scheduler Liu Ping Fan
2012-05-23 6:32 ` [Qemu-devel] [PATCH] Qemu: add virt sched domain device Liu Ping Fan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1337754751-9018-2-git-send-email-kernelfans@gmail.com \
--to=kernelfans@gmail.com \
--cc=anthony@codemonkey.ws \
--cc=avi@redhat.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).