[Qemu-devel] [PATCH 1/2] sched: add virt sched domain for the guest

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Liu Ping Fan <kernelfans@gmail.com>
To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, qemu-devel@nongnu.org
Cc: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>, Avi Kivity <avi@redhat.com>,
	Anthony Liguori <anthony@codemonkey.ws>
Subject: [Qemu-devel] [PATCH 1/2] sched: add virt sched domain for the guest
Date: Wed, 23 May 2012 14:32:28 +0800	[thread overview]
Message-ID: <1337754751-9018-2-git-send-email-kernelfans@gmail.com> (raw)
In-Reply-To: <1337754751-9018-1-git-send-email-kernelfans@gmail.com>

From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

The guest's scheduler cannot see the host's NUMA topology, which
leads to the following scenario:
  Suppose vcpu-a runs on nodeA and vcpu-b runs on nodeB. During load
balancing, pulling and pushing tasks between these vcpus costs more
than it would between vcpus on the same host node. Unfortunately,
the guest is currently blind to this.

This patch exports the host's NUMA info to the guest and helps the
guest rebuild its sched domains based on the host's info.

--todo:
  vcpu hotplug is not handled yet; it will be considered later.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 kernel/cpuset.c      |    2 +-
 kernel/sched/core.c  |   65 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |    5 ++++
 3 files changed, 71 insertions(+), 1 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070..1246091 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -778,7 +778,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
  * to a separate workqueue thread, which ends up processing the
  * above do_rebuild_sched_domains() function.
  */
-static void async_rebuild_sched_domains(void)
+void async_rebuild_sched_domains(void)
 {
 	queue_work(cpuset_wq, &rebuild_sched_domains_work);
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae..3f72c1a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6343,6 +6343,60 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ NULL, },
 };
 
+#ifdef CONFIG_VIRT_SCHED_DOMAIN
+/* fill in by host */
+DEFINE_PER_CPU(int, virt_numa_node);
+/* todo, exchange info about HOST_NUMNODES from host */
+#define  HOST_NUMNODES  128
+/* keep map, node->cpumask; todo, make it dynamic allocated */
+static struct cpumask virt_node_to_cpumask_map[HOST_NUMNODES];
+
+static inline int virt_cpu_to_node(int cpu)
+{
+	return per_cpu(virt_numa_node, cpu);
+}
+
+const struct cpumask *virt_cpumask_of_node(int vnode)
+{
+	struct cpumask *msk = &virt_node_to_cpumask_map[vnode];
+	return msk;
+}
+
+static const struct cpumask *virt_cpu_cpu_mask(int cpu)
+{
+	return virt_cpumask_of_node(virt_cpu_to_node(cpu));
+}
+
+static struct sched_domain_topology_level virt_topology[] = {
+	{ sd_init_CPU, virt_cpu_cpu_mask, },
+#ifdef CONFIG_NUMA
+	{ sd_init_ALLNODES, cpu_allnodes_mask, },
+#endif
+	{ NULL, },
+};
+
+static int update_virt_numa_node(void)
+{
+	int i, cpu, apicid, vnode;
+	for (i = 0; i < HOST_NUMNODES; i++)
+		cpumask_clear(&virt_node_to_cpumask_map[i]);
+	for_each_possible_cpu(cpu) {
+		apicid = cpu_physical_id(cpu);
+		vnode = __vapicid_to_vnode[apicid];
+		per_cpu(virt_numa_node, cpu) = vnode;
+		cpumask_set_cpu(cpu, &virt_node_to_cpumask_map[vnode]);
+	}
+	return 0;
+}
+
+int rebuild_virt_sd(void)
+{
+	update_virt_numa_node();
+	async_rebuild_sched_domains();
+	return 0;
+}
+#endif
+
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
 static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6689,9 +6743,11 @@ match1:
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur && !new_topology; j++) {
+#ifndef CONFIG_VIRT_SCHED_DOMAIN
 			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
+#endif
 		}
 		/* no match - add a new doms_new */
 		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
@@ -6837,6 +6893,15 @@ void __init sched_init_smp(void)
 {
 	cpumask_var_t non_isolated_cpus;
 
+#ifdef CONFIG_VIRT_SCHED_DOMAIN
+	int i;
+	for (i = 0; i < MAX_LOCAL_APIC; i++) {
+		/* pretend all on the same node */
+		__vapicid_to_vnode[i] = 0;
+	}
+	update_virt_numa_node();
+	sched_domain_topology = virt_topology;
+#endif
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba..232482d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -8,6 +8,9 @@
 
 extern __read_mostly int scheduler_running;
 
+#ifdef CONFIG_VIRT_SCHED_DOMAIN
+extern s16 __vapicid_to_vnode[];
+#endif
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -198,6 +201,8 @@ struct cfs_bandwidth { };
 
 #endif	/* CONFIG_CGROUP_SCHED */
 
+extern void async_rebuild_sched_domains(void);
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
-- 
1.7.4.4

next prev parent reply	other threads:[~2012-05-23  6:33 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-05-23  6:32 [Qemu-devel] [RFC] kvm: export host NUMA info to guest's scheduler Liu Ping Fan
2012-05-23  6:32 ` Liu Ping Fan [this message]
2012-05-23  7:54   ` [Qemu-devel] [PATCH 1/2] sched: add virt sched domain for the guest Peter Zijlstra
2012-05-23  8:10     ` Liu ping fan
2012-05-23  8:23       ` Peter Zijlstra
2012-05-23  8:34         ` Liu ping fan
2012-05-23  8:48           ` Peter Zijlstra
2012-05-23  9:58             ` Liu ping fan
2012-05-23 10:14               ` Peter Zijlstra
2012-05-23 15:23             ` Dave Hansen
2012-05-23 15:52               ` Peter Zijlstra
2012-05-23  6:32 ` [Qemu-devel] [PATCH 2/2] sched: add virt domain device's driver Liu Ping Fan
2012-05-23  6:32 ` [Qemu-devel] [PATCH] kvm: collect vcpus' numa info for guest's scheduler Liu Ping Fan
2012-05-23  6:32 ` [Qemu-devel] [PATCH] Qemu: add virt sched domain device Liu Ping Fan

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:14f7070 dfblob:1246091 dfblob:e5212ae dfblob:3f72c1a
dfblob:fb3acba dfblob:232482d )
 OR (
bs:"[Qemu-devel] [PATCH 1/2] sched: add virt sched domain for the guest" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1337754751-9018-2-git-send-email-kernelfans@gmail.com \
    --to=kernelfans@gmail.com \
    --cc=anthony@codemonkey.ws \
    --cc=avi@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).