* [PATCH] Add migration_cost option to scheduler
@ 2009-03-09 8:18 Yang, Xiaowei
2009-03-09 12:55 ` George Dunlap
0 siblings, 1 reply; 5+ messages in thread
From: Yang, Xiaowei @ 2009-03-09 8:18 UTC (permalink / raw)
To: xen-devel@lists.xensource.com
[-- Attachment #1: Type: text/plain, Size: 1298 bytes --]
The idea is borrowed from the Linux kernel: if a vCPU has just been
scheduled out and put on the run-queue, it's likely cache-hot on its
current pCPU, and it may be scheduled in again within a short period of
time; however, if the vCPU is migrated to another pCPU, it needs to
re-warm the cache - that's the meaning of migration cost.
The patch introduces an option, migration_cost, to suppress overly
aggressive vCPU migration (in practice we see that the migration frequency
is very high most of the time), while still letting load balancing work to
a certain degree.
Linux kernel uses 0.5ms by default. Considering the cost may be higher
(e.g. VMCS impact) than in native, migration_cost=1ms is chosen for our
tests, which are performed on a 4x 6-core Dunnington platform. In 24-VM
case, there is ~2% stable performance gain for enterprise workloads like
SPECjbb and sysbench. If HVM is with stubdom, the gain is more: 4% for
the same workloads.
The best value may vary across platforms, depending on the cache
hierarchy, and across workloads. Due to limited resources, we
haven't tested many combinations, and we plan to try more in the future.
You are welcome to evaluate the patch and give feedback on what is
suitable / not suitable for you.
Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>
Thanks,
Xiaowei
[-- Attachment #2: migration_cost.patch --]
[-- Type: text/x-patch, Size: 8322 bytes --]
Don't migrate cache-hot vCPU
diff -r 633debd7b831 tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c Tue Mar 03 03:17:57 2009 +0800
+++ b/tools/libxc/xc_misc.c Tue Mar 03 03:17:59 2009 +0800
@@ -92,6 +92,21 @@ int xc_sched_id(int xc_handle,
return ret;
*sched_id = sysctl.u.sched_id.sched_id;
+
+ return 0;
+}
+
+int xc_migration_cost(int xc_handle,
+ uint64_t cost)
+{
+ int ret;
+ DECLARE_SYSCTL;
+
+ sysctl.cmd = XEN_SYSCTL_migration_cost;
+ sysctl.u.migration_cost.cost = cost;
+
+ if ( (ret = do_sysctl(xc_handle, &sysctl)) != 0 )
+ return ret;
return 0;
}
diff -r 633debd7b831 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Tue Mar 03 03:17:57 2009 +0800
+++ b/tools/libxc/xenctrl.h Tue Mar 03 03:17:59 2009 +0800
@@ -611,6 +611,9 @@ int xc_sched_id(int xc_handle,
int xc_sched_id(int xc_handle,
int *sched_id);
+int xc_migration_cost(int xc_handle,
+ uint64_t cost);
+
typedef xen_sysctl_cpuinfo_t xc_cpuinfo_t;
int xc_getcpuinfo(int xc_handle, int max_cpus,
xc_cpuinfo_t *info, int *nr_cpus);
diff -r 633debd7b831 tools/xcutils/Makefile
--- a/tools/xcutils/Makefile Tue Mar 03 03:17:57 2009 +0800
+++ b/tools/xcutils/Makefile Tue Mar 03 03:17:59 2009 +0800
@@ -14,7 +14,7 @@ CFLAGS += -Werror
CFLAGS += -Werror
CFLAGS += $(CFLAGS_libxenctrl) $(CFLAGS_libxenguest) $(CFLAGS_libxenstore)
-PROGRAMS = xc_restore xc_save readnotes lsevtchn
+PROGRAMS = xc_restore xc_save readnotes lsevtchn migration_cost
LDLIBS = $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenguest) $(LDFLAGS_libxenstore)
diff -r 633debd7b831 tools/xcutils/migration_cost.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/migration_cost.c Tue Mar 03 03:17:59 2009 +0800
@@ -0,0 +1,25 @@
+#include <err.h>
+#include <stdlib.h>
+
+#include <xenctrl.h>
+
+int main(int argc, char **argv)
+{
+ int xc_fd, rc;
+ long long cost;
+
+ if (argc != 2)
+ errx(1, "usage: %s cost_in_ns", argv[0]);
+
+ cost = strtoll(argv[1], NULL, 0);
+
+ xc_fd = xc_interface_open();
+ if ( xc_fd < 0 )
+ errx(1, "failed to open control interface");
+
+ rc = xc_migration_cost(xc_fd, cost);
+
+ xc_interface_close(xc_fd);
+
+ return rc;
+}
diff -r 633debd7b831 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Tue Mar 03 03:17:57 2009 +0800
+++ b/xen/common/sched_credit.c Wed Mar 04 02:09:47 2009 +0800
@@ -123,7 +123,8 @@
_MACRO(dom_init) \
_MACRO(dom_destroy) \
_MACRO(vcpu_init) \
- _MACRO(vcpu_destroy)
+ _MACRO(vcpu_destroy) \
+ _MACRO(vcpu_hot)
#ifndef NDEBUG
#define CSCHED_STATS_EXPAND_CHECKS(_MACRO) \
@@ -404,14 +405,29 @@ __csched_vcpu_check(struct vcpu *vc)
#define CSCHED_VCPU_CHECK(_vc)
#endif
+uint64_t migration_cost;
+
+static inline int
+__csched_vcpu_is_cache_hot(struct vcpu *v)
+{
+ int hot = (NOW() - v->runstate.state_entry_time) < migration_cost;
+
+ if (hot)
+ CSCHED_STAT_CRANK(vcpu_hot);
+
+ return hot;
+}
+
static inline int
__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
{
/*
- * Don't pick up work that's in the peer's scheduling tail. Also only pick
- * up work that's allowed to run on our CPU.
- */
- return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+ * Don't pick up work that's in the peer's scheduling tail or hot on
+ * peer PCPU. Only pick up work that's allowed to run on our CPU.
+ */
+ return !vc->is_running &&
+ !__csched_vcpu_is_cache_hot(vc) &&
+ cpu_isset(dest_cpu, vc->cpu_affinity);
}
static int
@@ -1306,7 +1322,8 @@ csched_dump(void)
"\tmsecs per tick = %dms\n"
"\tcredits per tick = %d\n"
"\tticks per tslice = %d\n"
- "\tticks per acct = %d\n",
+ "\tticks per acct = %d\n"
+ "\tmigration cost = %"PRIu64"ns\n",
csched_priv.ncpus,
csched_priv.master,
csched_priv.credit,
@@ -1317,7 +1334,8 @@ csched_dump(void)
CSCHED_MSECS_PER_TICK,
CSCHED_CREDITS_PER_TICK,
CSCHED_TICKS_PER_TSLICE,
- CSCHED_TICKS_PER_ACCT);
+ CSCHED_TICKS_PER_ACCT,
+ migration_cost);
cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), csched_priv.idlers);
printk("idlers: %s\n", idlers_buf);
diff -r 633debd7b831 xen/common/sysctl.c
--- a/xen/common/sysctl.c Tue Mar 03 03:17:57 2009 +0800
+++ b/xen/common/sysctl.c Tue Mar 03 03:17:59 2009 +0800
@@ -88,6 +88,19 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
}
break;
+ case XEN_SYSCTL_migration_cost:
+ {
+ extern uint64_t migration_cost;
+ ret = xsm_migration_cost();
+ if ( ret )
+ break;
+
+ migration_cost = op->u.migration_cost.cost;
+
+ ret = 0;
+ }
+ break;
+
case XEN_SYSCTL_getdomaininfolist:
{
struct domain *d;
diff -r 633debd7b831 xen/include/public/sysctl.h
--- a/xen/include/public/sysctl.h Tue Mar 03 03:17:57 2009 +0800
+++ b/xen/include/public/sysctl.h Tue Mar 03 03:17:59 2009 +0800
@@ -359,6 +359,15 @@ struct xen_sysctl_pm_op {
};
};
+/* Adjust vCPU migration_cost. */
+#define XEN_SYSCTL_migration_cost 13
+struct xen_sysctl_migration_cost {
+ /* IN variables. */
+ uint64_t cost;
+};
+typedef struct xen_sysctl_migration_cost xen_sysctl_migration_cost_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_migration_cost_t);
+
struct xen_sysctl {
uint32_t cmd;
uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
@@ -375,6 +384,7 @@ struct xen_sysctl {
struct xen_sysctl_get_pmstat get_pmstat;
struct xen_sysctl_cpu_hotplug cpu_hotplug;
struct xen_sysctl_pm_op pm_op;
+ struct xen_sysctl_migration_cost migration_cost;
uint8_t pad[128];
} u;
};
diff -r 633debd7b831 xen/include/xsm/xsm.h
--- a/xen/include/xsm/xsm.h Tue Mar 03 03:17:57 2009 +0800
+++ b/xen/include/xsm/xsm.h Tue Mar 03 03:17:59 2009 +0800
@@ -68,6 +68,7 @@ struct xsm_operations {
int (*tbufcontrol) (void);
int (*readconsole) (uint32_t clear);
int (*sched_id) (void);
+ int (*migration_cost) (void);
int (*setdomainmaxmem) (struct domain *d);
int (*setdomainhandle) (struct domain *d);
int (*setdebugging) (struct domain *d);
@@ -247,6 +248,11 @@ static inline int xsm_sched_id (void)
return xsm_call(sched_id());
}
+static inline int xsm_migration_cost (void)
+{
+ return xsm_call(migration_cost());
+}
+
static inline int xsm_setdomainmaxmem (struct domain *d)
{
return xsm_call(setdomainmaxmem(d));
diff -r 633debd7b831 xen/xsm/dummy.c
--- a/xen/xsm/dummy.c Tue Mar 03 03:17:57 2009 +0800
+++ b/xen/xsm/dummy.c Tue Mar 03 03:17:59 2009 +0800
@@ -100,6 +100,11 @@ static int dummy_readconsole (uint32_t c
}
static int dummy_sched_id (void)
+{
+ return 0;
+}
+
+static int dummy_migration_cost (void)
{
return 0;
}
@@ -486,6 +491,7 @@ void xsm_fixup_ops (struct xsm_operation
set_to_dummy_if_null(ops, tbufcontrol);
set_to_dummy_if_null(ops, readconsole);
set_to_dummy_if_null(ops, sched_id);
+ set_to_dummy_if_null(ops, migration_cost);
set_to_dummy_if_null(ops, setdomainmaxmem);
set_to_dummy_if_null(ops, setdomainhandle);
set_to_dummy_if_null(ops, setdebugging);
diff -r 633debd7b831 xen/xsm/flask/hooks.c
--- a/xen/xsm/flask/hooks.c Tue Mar 03 03:17:57 2009 +0800
+++ b/xen/xsm/flask/hooks.c Tue Mar 03 03:17:59 2009 +0800
@@ -597,6 +597,11 @@ static int flask_readconsole(uint32_t cl
}
static int flask_sched_id(void)
+{
+ return domain_has_xen(current->domain, XEN__SCHEDULER);
+}
+
+static int flask_migration_cost(void)
{
return domain_has_xen(current->domain, XEN__SCHEDULER);
}
@@ -1235,6 +1240,7 @@ static struct xsm_operations flask_ops =
.tbufcontrol = flask_tbufcontrol,
.readconsole = flask_readconsole,
.sched_id = flask_sched_id,
+ .migration_cost = flask_migration_cost,
.setdomainmaxmem = flask_setdomainmaxmem,
.setdomainhandle = flask_setdomainhandle,
.setdebugging = flask_setdebugging,
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] Add migration_cost option to scheduler
2009-03-09 8:18 [PATCH] Add migration_cost option to scheduler Yang, Xiaowei
@ 2009-03-09 12:55 ` George Dunlap
2009-03-09 13:08 ` Tian, Kevin
0 siblings, 1 reply; 5+ messages in thread
From: George Dunlap @ 2009-03-09 12:55 UTC (permalink / raw)
To: Yang, Xiaowei; +Cc: xen-devel@lists.xensource.com
Hmm, I think this patch may not be exactly what we want. It looks
like it checks for how long a vcpu has been in its current state, not
how recently it has been running. So if a vcpu sleeps for a long time
on a cpu that's running other workloads, then wakes up
(blocked->runnable), the cache is by no means "hot". But since it has
only been in the "runnable" state for a few hundred cycles, it won't
be migrated, even though there's little cost.
However, if the pcpu was idle since the last time this vcpu ran (i.e.,
if we're just catching the vcpu in the process of waking up), the
cache *is* still hot. Hmm....
-George
2009/3/9 Yang, Xiaowei <xiaowei.yang@intel.com>:
> The idea is borrowed from Linux kernel: if the vCPU is just scheduled out
> and put to run-queue, it's likely cache-hot on its current pCPU, and it may
> be scheduled in in a short period of time; however, if vCPU is migrated to
> another pCPU, it need to re-warm the cache - that's the meaning of migration
> cost.
>
> The patch introduces an option migration_cost to depress too aggressive vCPU
> migration (actually we really see migration frequency is very high most of
> the time.), while in the meantime keeping load balance works in certain
> degree.
>
> Linux kernel uses 0.5ms by default. Considering the cost may be higher (e.g.
> VMCS impact) than in native, migration_cost=1ms is chosen for our tests,
> which are performed on a 4x 6-core Dunnington platform. In 24-VM case, there
> is ~2% stable performance gain for enterprise workloads like SPECjbb and
> sysbench. If HVM is with stubdom, the gain is more: 4% for the same
> workloads.
>
> The best value may vary on different platforms based on different cache
> hierarchy and with different workloads. Due to resource limit, we haven't
> test many combinations. And we plans to try more in future. Welcome to
> evaluate and give feedback on what's suitable / not suitable for you.
>
> Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>
>
>
> Thanks,
> Xiaowei
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: [PATCH] Add migration_cost option to scheduler
2009-03-09 12:55 ` George Dunlap
@ 2009-03-09 13:08 ` Tian, Kevin
2009-03-10 3:36 ` Yang, Xiaowei
0 siblings, 1 reply; 5+ messages in thread
From: Tian, Kevin @ 2009-03-09 13:08 UTC (permalink / raw)
To: 'George Dunlap', Yang, Xiaowei; +Cc: xen-devel@lists.xensource.com
>From: George Dunlap
>Sent: Monday, March 09, 2009 8:56 PM
>
>Hmm, I think this patch may not be exactly what we want. It looks
>like it checks for how long a vcpu has been in its current stat, not
>how recently it has been running. So if a vcpu sleeps for a long time
>on a cpu that's running other workloads, then wakes up
>(blocked->runnable), the cache is by no means "hot". But since it has
>only been in the "runnable" state for a few hundred cycles, it won't
>be migrated, even though there's little cost.
Then would adding a per-vcpu last_running_timestamp, recorded when the
vcpu is scheduled out, serve the purpose here?
>
>However, if the pcpu was idle since the last time this vcpu ran (i.e.,
>if we're just catching the vcpu in the process of waking up), the
>cache *is* still hot. Hmm....
>
If the peer pcpu is idle, shouldn't it be skipped by the load balancer
running on another pcpu, which is outside the cache-hot logic?
Thanks,
Kevin
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] Add migration_cost option to scheduler
2009-03-09 13:08 ` Tian, Kevin
@ 2009-03-10 3:36 ` Yang, Xiaowei
2009-03-10 10:56 ` Yang, Xiaowei
0 siblings, 1 reply; 5+ messages in thread
From: Yang, Xiaowei @ 2009-03-10 3:36 UTC (permalink / raw)
To: Tian, Kevin; +Cc: 'George Dunlap', xen-devel@lists.xensource.com
Tian, Kevin wrote:
>> From: George Dunlap
>> Sent: Monday, March 09, 2009 8:56 PM
>>
>> Hmm, I think this patch may not be exactly what we want. It looks
>> like it checks for how long a vcpu has been in its current stat, not
>> how recently it has been running. So if a vcpu sleeps for a long time
>> on a cpu that's running other workloads, then wakes up
>> (blocked->runnable), the cache is by no means "hot". But since it has
>> only been in the "runnable" state for a few hundred cycles, it won't
>> be migrated, even though there's little cost.
>
> Then to add a per-vcpu last_running_timestamp which is recorded when
> vcpu is scheduled out, could hit the purpose here?
Yes, it's more reasonable. I can make a patch.
Thanks,
xiaowei
>
>> However, if the pcpu was idle since the last time this vcpu ran (i.e.,
>> if we're just catching the vcpu in the process of waking up), the
>> cache *is* still hot. Hmm....
>>
>
> If peer pcpu is idle, shouldn't it be skipped by load balancer running on
> another pcpu, which is out of the cache-hot logic?
>
> Thanks,
> Kevin
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] Add migration_cost option to scheduler
2009-03-10 3:36 ` Yang, Xiaowei
@ 2009-03-10 10:56 ` Yang, Xiaowei
0 siblings, 0 replies; 5+ messages in thread
From: Yang, Xiaowei @ 2009-03-10 10:56 UTC (permalink / raw)
To: Tian, Kevin
Cc: 'George Dunlap', xen-devel@lists.xensource.com,
Keir Fraser
[-- Attachment #1: Type: text/plain, Size: 1148 bytes --]
Yang, Xiaowei wrote:
> Tian, Kevin wrote:
>>> From: George Dunlap
>>> Sent: Monday, March 09, 2009 8:56 PM
>>>
>>> Hmm, I think this patch may not be exactly what we want. It looks
>>> like it checks for how long a vcpu has been in its current stat, not
>>> how recently it has been running. So if a vcpu sleeps for a long time
>>> on a cpu that's running other workloads, then wakes up
>>> (blocked->runnable), the cache is by no means "hot". But since it has
>>> only been in the "runnable" state for a few hundred cycles, it won't
>>> be migrated, even though there's little cost.
>> Then to add a per-vcpu last_running_timestamp which is recorded when
>> vcpu is scheduled out, could hit the purpose here?
> Yes, it's more reasonable. I can make a patch.
>
The patch is attached. I ran a quick test using the previous config and
saw no obvious difference in the results with it. This may be explained by
the fact that the benchmarks do not perform many I/O operations - the db and
logfile are put on a ramdisk for the sysbench test. Anyway, it's closer to
what we really want.
Thanks both of you!
Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>
Thanks,
xiaowei
[-- Attachment #2: cache_hot.patch --]
[-- Type: text/x-patch, Size: 1440 bytes --]
diff -r 115c97f32dc6 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Tue Mar 10 00:18:39 2009 +0800
+++ b/xen/common/sched_credit.c Tue Mar 10 03:09:44 2009 +0800
@@ -328,7 +328,7 @@ static inline int
static inline int
__csched_vcpu_is_cache_hot(struct vcpu *v)
{
- int hot = ((NOW() - v->runstate.state_entry_time) <
+ int hot = ((NOW() - v->last_run_time) <
((uint64_t)vcpu_migration_delay * 1000u));
if ( hot )
diff -r 115c97f32dc6 xen/common/schedule.c
--- a/xen/common/schedule.c Tue Mar 10 00:18:39 2009 +0800
+++ b/xen/common/schedule.c Tue Mar 10 03:00:36 2009 +0800
@@ -836,6 +836,7 @@ static void schedule(void)
(test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
(vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
now);
+ prev->last_run_time = now;
ASSERT(next->runstate.state != RUNSTATE_running);
vcpu_runstate_change(next, RUNSTATE_running, now);
diff -r 115c97f32dc6 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Tue Mar 10 00:18:39 2009 +0800
+++ b/xen/include/xen/sched.h Tue Mar 10 02:59:15 2009 +0800
@@ -102,6 +102,9 @@ struct vcpu
} runstate_guest; /* guest address */
#endif
+ /* last time when vCPU is scheduled out */
+ uint64_t last_run_time;
+
/* Has the FPU been initialised? */
bool_t fpu_initialised;
/* Has the FPU been used since it was last saved? */
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2009-03-10 10:56 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-03-09 8:18 [PATCH] Add migration_cost option to scheduler Yang, Xiaowei
2009-03-09 12:55 ` George Dunlap
2009-03-09 13:08 ` Tian, Kevin
2009-03-10 3:36 ` Yang, Xiaowei
2009-03-10 10:56 ` Yang, Xiaowei
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.