* [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
@ 2009-09-27 19:22 Dan Magenheimer
2009-09-27 21:39 ` Keir Fraser
` (2 more replies)
0 siblings, 3 replies; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-27 19:22 UTC (permalink / raw)
To: Xen-Devel (E-mail)
[-- Attachment #1: Type: text/plain, Size: 14134 bytes --]
(I'm still struggling against the coils of python on
the tools part of this patch, so thought I'd post the
hypervisor side separately first.)
Switches rdtsc emulation from boot option to per-domain
and enables it by default. Also removes hvm tsc scaling
as it is no longer necessary.
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
diff -r 72d130772f36 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/domain.c Sun Sep 27 13:08:12 2009 -0600
@@ -494,6 +494,10 @@ int arch_domain_create(struct domain *d,
mce_init_msr(d);
}
+ if ( d->domain_id == 0 )
+ d->arch.tsc_native = 1;
+ spin_lock_init(&d->arch.vtsc_lock);
+
if ( is_hvm_domain(d) )
{
if ( (rc = hvm_domain_initialise(d)) != 0 )
@@ -503,11 +507,9 @@ int arch_domain_create(struct domain *d,
}
}
else
- {
/* 32-bit PV guest by default only if Xen is not 64-bit. */
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
(CONFIG_PAGING_LEVELS != 4);
- }
memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
for ( i = 0; i < MAX_CPUID_INPUT; i++ )
@@ -515,10 +517,6 @@ int arch_domain_create(struct domain *d,
d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
}
-
- /* For now, per-domain SoftTSC status is taken from global boot param. */
- d->arch.vtsc = opt_softtsc;
- spin_lock_init(&d->arch.vtsc_lock);
return 0;
diff -r 72d130772f36 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/domctl.c Sun Sep 27 13:08:12 2009 -0600
@@ -1060,6 +1060,20 @@ long arch_do_domctl(
}
break;
+ case XEN_DOMCTL_set_tsc_native:
+ {
+ struct domain *d;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+ d->arch.tsc_native = 1;
+ rcu_unlock_domain(d);
+ ret = 0;
+ }
+ break;
+
case XEN_DOMCTL_suppress_spurious_page_faults:
{
struct domain *d;
diff -r 72d130772f36 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/hvm.c Sun Sep 27 13:08:12 2009 -0600
@@ -146,63 +146,39 @@ void hvm_enable_rdtsc_exiting(struct dom
{
struct vcpu *v;
- if ( opt_softtsc || !hvm_funcs.enable_rdtsc_exiting )
+ if ( !hvm_funcs.enable_rdtsc_exiting )
return;
for_each_vcpu ( d, v )
hvm_funcs.enable_rdtsc_exiting(v);
}
-int hvm_gtsc_need_scale(struct domain *d)
-{
- uint32_t gtsc_mhz, htsc_mhz;
-
- gtsc_mhz = d->arch.hvm_domain.gtsc_khz / 1000;
- htsc_mhz = opt_softtsc ? 1000 : ((uint32_t)cpu_khz / 1000);
-
- d->arch.hvm_domain.tsc_scaled = (gtsc_mhz && (gtsc_mhz != htsc_mhz));
- return d->arch.hvm_domain.tsc_scaled;
-}
-
-static u64 hvm_h2g_scale_tsc(struct vcpu *v, u64 host_tsc)
-{
- uint32_t gtsc_khz, htsc_khz;
-
- if ( !v->domain->arch.hvm_domain.tsc_scaled )
- return host_tsc;
-
- htsc_khz = opt_softtsc ? 1000000 : cpu_khz;
- gtsc_khz = v->domain->arch.hvm_domain.gtsc_khz;
- return muldiv64(host_tsc, gtsc_khz, htsc_khz);
-}
-
void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
{
- uint64_t host_tsc, scaled_htsc;
+ uint64_t host_tsc;
- if ( opt_softtsc )
+ if ( v->domain->arch.tsc_native )
+ rdtscll(host_tsc);
+ else
host_tsc = hvm_get_guest_time(v);
- else
- rdtscll(host_tsc);
- scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
-
- v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - scaled_htsc;
+ v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
}
u64 hvm_get_guest_tsc(struct vcpu *v)
{
- uint64_t host_tsc, scaled_htsc;
+ uint64_t tsc;
- if ( opt_softtsc )
- host_tsc = hvm_get_guest_time(v);
+ if ( v->domain->arch.tsc_native )
+ rdtscll(tsc);
else
- rdtscll(host_tsc);
+ {
+ v->domain->arch.vtsc_kerncount++;
+ tsc = hvm_get_guest_time(v);
+ }
- scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
-
- return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset;
+ return tsc + v->arch.hvm_vcpu.cache_tsc_offset;
}
void hvm_migrate_timers(struct vcpu *v)
diff -r 72d130772f36 xen/arch/x86/hvm/save.c
--- a/xen/arch/x86/hvm/save.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/save.c Sun Sep 27 13:08:12 2009 -0600
@@ -31,9 +31,6 @@ void arch_hvm_save(struct domain *d, str
/* Save some CPUID bits */
cpuid(1, &eax, &ebx, &ecx, &edx);
hdr->cpuid = eax;
-
- /* Save guest's preferred TSC. */
- hdr->gtsc_khz = d->arch.hvm_domain.gtsc_khz;
}
int arch_hvm_load(struct domain *d, struct hvm_save_header *hdr)
@@ -60,17 +57,8 @@ int arch_hvm_load(struct domain *d, stru
gdprintk(XENLOG_WARNING, "HVM restore: saved CPUID (%#"PRIx32") "
"does not match host (%#"PRIx32").\n", hdr->cpuid, eax);
- /* Restore guest's preferred TSC frequency. */
- d->arch.hvm_domain.gtsc_khz = hdr->gtsc_khz;
-
- if ( hdr->gtsc_khz && hvm_gtsc_need_scale(d) )
- {
+ if ( !d->arch.tsc_native )
hvm_enable_rdtsc_exiting(d);
- gdprintk(XENLOG_WARNING, "Domain %d expects freq %uMHz "
- "but host's freq is %luMHz: trap and emulate rdtsc\n",
- d->domain_id, hdr->gtsc_khz / 1000, opt_softtsc ? 1000ul :
- cpu_khz / 1000);
- }
/* VGA state is not saved/restored, so we nobble the cache. */
d->arch.hvm_domain.stdvga.cache = 0;
diff -r 72d130772f36 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/svm/vmcb.c Sun Sep 27 13:08:12 2009 -0600
@@ -167,7 +167,7 @@ static int construct_vmcb(struct vcpu *v
/* TSC. */
vmcb->tsc_offset = 0;
- if ( opt_softtsc )
+ if ( !v->domain->arch.tsc_native )
vmcb->general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
/* Guest EFER. */
diff -r 72d130772f36 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Sun Sep 27 13:08:12 2009 -0600
@@ -133,7 +133,7 @@ static void vmx_init_vmcs_config(void)
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_ACTIVATE_IO_BITMAP |
CPU_BASED_USE_TSC_OFFSETING |
- (opt_softtsc ? CPU_BASED_RDTSC_EXITING : 0));
+ CPU_BASED_RDTSC_EXITING );
opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
CPU_BASED_TPR_SHADOW |
CPU_BASED_MONITOR_TRAP_FLAG |
diff -r 72d130772f36 xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/vpt.c Sun Sep 27 13:08:12 2009 -0600
@@ -32,9 +32,6 @@ void hvm_init_guest_time(struct domain *
spin_lock_init(&pl->pl_time_lock);
pl->stime_offset = -(u64)get_s_time();
pl->last_guest_time = 0;
-
- d->arch.hvm_domain.gtsc_khz = opt_softtsc ? 1000000 : cpu_khz;
- d->arch.hvm_domain.tsc_scaled = 0;
}
u64 hvm_get_guest_time(struct vcpu *v)
diff -r 72d130772f36 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/time.c Sun Sep 27 13:08:12 2009 -0600
@@ -35,9 +35,6 @@
/* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
static char __initdata opt_clocksource[10];
string_param("clocksource", opt_clocksource);
-
-int opt_softtsc;
-boolean_param("softtsc", opt_softtsc);
/*
* opt_consistent_tscs: All TSCs tick at the exact same rate, allowing
@@ -1438,20 +1435,18 @@ struct tm wallclock_time(void)
* PV SoftTSC Emulation.
*/
-static unsigned long rdtsc_kerncount, rdtsc_usercount;
-
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs)
{
s_time_t now;
if ( guest_kernel_mode(v, regs) )
{
- rdtsc_kerncount++;
+ v->domain->arch.vtsc_kerncount++;
rdtsc(regs->eax, regs->edx);
}
else
{
- rdtsc_usercount++;
+ v->domain->arch.vtsc_kerncount++;
spin_lock(&v->domain->arch.vtsc_lock);
now = get_s_time() + v->domain->arch.vtsc_stime_offset;
if ( (int64_t)(now - v->domain->arch.vtsc_last) > 0 )
@@ -1464,10 +1459,27 @@ void pv_soft_rdtsc(struct vcpu *v, struc
}
}
+/* vtsc may incur measurable performance degradation, diagnose with this */
static void dump_softtsc(unsigned char key)
{
- printk("softtsc count: %lu kernel, %lu user\n",
- rdtsc_kerncount, rdtsc_usercount);
+ struct domain *d;
+ int domcnt = 0;
+
+ for_each_domain ( d )
+ {
+ if ( !d->arch.tsc_native )
+ {
+ if ( is_hvm_domain(d) )
+ printk("dom%u (hvm) vtsc count: %"PRIu64" total\n",
+ d->domain_id, d->arch.vtsc_kerncount);
+ else
+ printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
+ d->domain_id, d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
+ domcnt++;
+ }
+ }
+ if ( !domcnt )
+ printk("all domains have tsc_native enabled\n");
}
static struct keyhandler dump_softtsc_keyhandler = {
@@ -1478,8 +1490,7 @@ static struct keyhandler dump_softtsc_ke
static int __init setup_dump_softtsc(void)
{
- if ( opt_softtsc )
- register_keyhandler('s', &dump_softtsc_keyhandler);
+ register_keyhandler('s', &dump_softtsc_keyhandler);
return 0;
}
__initcall(setup_dump_softtsc);
diff -r 72d130772f36 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/traps.c Sun Sep 27 13:08:12 2009 -0600
@@ -2010,7 +2010,7 @@ static int emulate_privileged_op(struct
*/
opcode = insn_fetch(u8, code_base, eip, code_limit);
if ( !guest_kernel_mode(v, regs) &&
- !((opcode == 0x31) && v->domain->arch.vtsc) )
+ !((opcode == 0x31) && !v->domain->arch.tsc_native) )
goto fail;
if ( lock && (opcode & ~3) != 0x20 )
diff -r 72d130772f36 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/asm-x86/domain.h Sun Sep 27 13:08:12 2009 -0600
@@ -301,9 +301,11 @@ struct arch_domain
struct domain_mca_msrs vmca_msrs;
/* SoftTSC emulation */
- bool_t vtsc;
+ bool_t tsc_native; /* true means vtsc is disabled */
s_time_t vtsc_last;
spinlock_t vtsc_lock;
+ uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
+ uint64_t vtsc_usercount; /* not used for hvm */
int64_t vtsc_stime_offset;
} __cacheline_aligned;
@@ -435,7 +437,7 @@ unsigned long pv_guest_cr4_fixup(unsigne
#define pv_guest_cr4_to_real_cr4(v) \
(((v)->arch.guest_context.ctrlreg[4] \
| (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE)) \
- | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \
+ | ((v)->domain->arch.tsc_native ? 0: X86_CR4_TSD)) \
& ~X86_CR4_DE)
#define real_cr4_to_pv_guest_cr4(c) \
((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD))
diff -r 72d130772f36 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/asm-x86/hvm/domain.h Sun Sep 27 13:08:12 2009 -0600
@@ -45,8 +45,6 @@ struct hvm_domain {
struct hvm_ioreq_page ioreq;
struct hvm_ioreq_page buf_ioreq;
- uint32_t gtsc_khz; /* kHz */
- bool_t tsc_scaled;
struct pl_time pl_time;
struct hvm_io_handler io_handler;
diff -r 72d130772f36 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h Sun Sep 27 13:08:12 2009 -0600
@@ -286,7 +286,6 @@ uint8_t hvm_combine_hw_exceptions(uint8_
uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2);
void hvm_enable_rdtsc_exiting(struct domain *d);
-int hvm_gtsc_need_scale(struct domain *d);
static inline int hvm_cpu_up(void)
{
diff -r 72d130772f36 xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/asm-x86/time.h Sun Sep 27 13:08:12 2009 -0600
@@ -41,7 +41,6 @@ uint64_t acpi_pm_tick_to_ns(uint64_t tic
uint64_t acpi_pm_tick_to_ns(uint64_t ticks);
uint64_t ns_to_acpi_pm_tick(uint64_t ns);
-extern int opt_softtsc;
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
#endif /* __X86_TIME_H__ */
diff -r 72d130772f36 xen/include/public/arch-x86/hvm/save.h
--- a/xen/include/public/arch-x86/hvm/save.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/public/arch-x86/hvm/save.h Sun Sep 27 13:08:12 2009 -0600
@@ -38,7 +38,6 @@ struct hvm_save_header {
uint32_t version; /* File format version */
uint64_t changeset; /* Version of Xen that saved this file */
uint32_t cpuid; /* CPUID[0x01][%eax] on the saving machine */
- uint32_t gtsc_khz; /* Guest's TSC frequency in kHz */
};
DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);
diff -r 72d130772f36 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/public/domctl.h Sun Sep 27 13:08:12 2009 -0600
@@ -400,7 +400,8 @@ struct xen_domctl_settimeoffset {
};
typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
-
+/* allow rdtsc to be natively executed (is trapped/emulated by default */
+#define XEN_DOMCTL_set_tsc_native 57
#define XEN_DOMCTL_gethvmcontext 33
#define XEN_DOMCTL_sethvmcontext 34
[-- Attachment #2: tsc-native-hyp.patch --]
[-- Type: application/octet-stream, Size: 13394 bytes --]
diff -r 72d130772f36 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/domain.c Sun Sep 27 13:08:12 2009 -0600
@@ -494,6 +494,10 @@ int arch_domain_create(struct domain *d,
mce_init_msr(d);
}
+ if ( d->domain_id == 0 )
+ d->arch.tsc_native = 1;
+ spin_lock_init(&d->arch.vtsc_lock);
+
if ( is_hvm_domain(d) )
{
if ( (rc = hvm_domain_initialise(d)) != 0 )
@@ -503,11 +507,9 @@ int arch_domain_create(struct domain *d,
}
}
else
- {
/* 32-bit PV guest by default only if Xen is not 64-bit. */
d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
(CONFIG_PAGING_LEVELS != 4);
- }
memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
for ( i = 0; i < MAX_CPUID_INPUT; i++ )
@@ -515,10 +517,6 @@ int arch_domain_create(struct domain *d,
d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
}
-
- /* For now, per-domain SoftTSC status is taken from global boot param. */
- d->arch.vtsc = opt_softtsc;
- spin_lock_init(&d->arch.vtsc_lock);
return 0;
diff -r 72d130772f36 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/domctl.c Sun Sep 27 13:08:12 2009 -0600
@@ -1060,6 +1060,20 @@ long arch_do_domctl(
}
break;
+ case XEN_DOMCTL_set_tsc_native:
+ {
+ struct domain *d;
+
+ ret = -ESRCH;
+ d = rcu_lock_domain_by_id(domctl->domain);
+ if ( d == NULL )
+ break;
+ d->arch.tsc_native = 1;
+ rcu_unlock_domain(d);
+ ret = 0;
+ }
+ break;
+
case XEN_DOMCTL_suppress_spurious_page_faults:
{
struct domain *d;
diff -r 72d130772f36 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/hvm.c Sun Sep 27 13:08:12 2009 -0600
@@ -146,63 +146,39 @@ void hvm_enable_rdtsc_exiting(struct dom
{
struct vcpu *v;
- if ( opt_softtsc || !hvm_funcs.enable_rdtsc_exiting )
+ if ( !hvm_funcs.enable_rdtsc_exiting )
return;
for_each_vcpu ( d, v )
hvm_funcs.enable_rdtsc_exiting(v);
}
-int hvm_gtsc_need_scale(struct domain *d)
-{
- uint32_t gtsc_mhz, htsc_mhz;
-
- gtsc_mhz = d->arch.hvm_domain.gtsc_khz / 1000;
- htsc_mhz = opt_softtsc ? 1000 : ((uint32_t)cpu_khz / 1000);
-
- d->arch.hvm_domain.tsc_scaled = (gtsc_mhz && (gtsc_mhz != htsc_mhz));
- return d->arch.hvm_domain.tsc_scaled;
-}
-
-static u64 hvm_h2g_scale_tsc(struct vcpu *v, u64 host_tsc)
-{
- uint32_t gtsc_khz, htsc_khz;
-
- if ( !v->domain->arch.hvm_domain.tsc_scaled )
- return host_tsc;
-
- htsc_khz = opt_softtsc ? 1000000 : cpu_khz;
- gtsc_khz = v->domain->arch.hvm_domain.gtsc_khz;
- return muldiv64(host_tsc, gtsc_khz, htsc_khz);
-}
-
void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
{
- uint64_t host_tsc, scaled_htsc;
+ uint64_t host_tsc;
- if ( opt_softtsc )
+ if ( v->domain->arch.tsc_native )
+ rdtscll(host_tsc);
+ else
host_tsc = hvm_get_guest_time(v);
- else
- rdtscll(host_tsc);
- scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
-
- v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - scaled_htsc;
+ v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
}
u64 hvm_get_guest_tsc(struct vcpu *v)
{
- uint64_t host_tsc, scaled_htsc;
+ uint64_t tsc;
- if ( opt_softtsc )
- host_tsc = hvm_get_guest_time(v);
+ if ( v->domain->arch.tsc_native )
+ rdtscll(tsc);
else
- rdtscll(host_tsc);
+ {
+ v->domain->arch.vtsc_kerncount++;
+ tsc = hvm_get_guest_time(v);
+ }
- scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
-
- return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset;
+ return tsc + v->arch.hvm_vcpu.cache_tsc_offset;
}
void hvm_migrate_timers(struct vcpu *v)
diff -r 72d130772f36 xen/arch/x86/hvm/save.c
--- a/xen/arch/x86/hvm/save.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/save.c Sun Sep 27 13:08:12 2009 -0600
@@ -31,9 +31,6 @@ void arch_hvm_save(struct domain *d, str
/* Save some CPUID bits */
cpuid(1, &eax, &ebx, &ecx, &edx);
hdr->cpuid = eax;
-
- /* Save guest's preferred TSC. */
- hdr->gtsc_khz = d->arch.hvm_domain.gtsc_khz;
}
int arch_hvm_load(struct domain *d, struct hvm_save_header *hdr)
@@ -60,17 +57,8 @@ int arch_hvm_load(struct domain *d, stru
gdprintk(XENLOG_WARNING, "HVM restore: saved CPUID (%#"PRIx32") "
"does not match host (%#"PRIx32").\n", hdr->cpuid, eax);
- /* Restore guest's preferred TSC frequency. */
- d->arch.hvm_domain.gtsc_khz = hdr->gtsc_khz;
-
- if ( hdr->gtsc_khz && hvm_gtsc_need_scale(d) )
- {
+ if ( !d->arch.tsc_native )
hvm_enable_rdtsc_exiting(d);
- gdprintk(XENLOG_WARNING, "Domain %d expects freq %uMHz "
- "but host's freq is %luMHz: trap and emulate rdtsc\n",
- d->domain_id, hdr->gtsc_khz / 1000, opt_softtsc ? 1000ul :
- cpu_khz / 1000);
- }
/* VGA state is not saved/restored, so we nobble the cache. */
d->arch.hvm_domain.stdvga.cache = 0;
diff -r 72d130772f36 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/svm/vmcb.c Sun Sep 27 13:08:12 2009 -0600
@@ -167,7 +167,7 @@ static int construct_vmcb(struct vcpu *v
/* TSC. */
vmcb->tsc_offset = 0;
- if ( opt_softtsc )
+ if ( !v->domain->arch.tsc_native )
vmcb->general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
/* Guest EFER. */
diff -r 72d130772f36 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Sun Sep 27 13:08:12 2009 -0600
@@ -133,7 +133,7 @@ static void vmx_init_vmcs_config(void)
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_ACTIVATE_IO_BITMAP |
CPU_BASED_USE_TSC_OFFSETING |
- (opt_softtsc ? CPU_BASED_RDTSC_EXITING : 0));
+ CPU_BASED_RDTSC_EXITING );
opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
CPU_BASED_TPR_SHADOW |
CPU_BASED_MONITOR_TRAP_FLAG |
diff -r 72d130772f36 xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/hvm/vpt.c Sun Sep 27 13:08:12 2009 -0600
@@ -32,9 +32,6 @@ void hvm_init_guest_time(struct domain *
spin_lock_init(&pl->pl_time_lock);
pl->stime_offset = -(u64)get_s_time();
pl->last_guest_time = 0;
-
- d->arch.hvm_domain.gtsc_khz = opt_softtsc ? 1000000 : cpu_khz;
- d->arch.hvm_domain.tsc_scaled = 0;
}
u64 hvm_get_guest_time(struct vcpu *v)
diff -r 72d130772f36 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/time.c Sun Sep 27 13:08:12 2009 -0600
@@ -35,9 +35,6 @@
/* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
static char __initdata opt_clocksource[10];
string_param("clocksource", opt_clocksource);
-
-int opt_softtsc;
-boolean_param("softtsc", opt_softtsc);
/*
* opt_consistent_tscs: All TSCs tick at the exact same rate, allowing
@@ -1438,20 +1435,18 @@ struct tm wallclock_time(void)
* PV SoftTSC Emulation.
*/
-static unsigned long rdtsc_kerncount, rdtsc_usercount;
-
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs)
{
s_time_t now;
if ( guest_kernel_mode(v, regs) )
{
- rdtsc_kerncount++;
+ v->domain->arch.vtsc_kerncount++;
rdtsc(regs->eax, regs->edx);
}
else
{
- rdtsc_usercount++;
+ v->domain->arch.vtsc_kerncount++;
spin_lock(&v->domain->arch.vtsc_lock);
now = get_s_time() + v->domain->arch.vtsc_stime_offset;
if ( (int64_t)(now - v->domain->arch.vtsc_last) > 0 )
@@ -1464,10 +1459,27 @@ void pv_soft_rdtsc(struct vcpu *v, struc
}
}
+/* vtsc may incur measurable performance degradation, diagnose with this */
static void dump_softtsc(unsigned char key)
{
- printk("softtsc count: %lu kernel, %lu user\n",
- rdtsc_kerncount, rdtsc_usercount);
+ struct domain *d;
+ int domcnt = 0;
+
+ for_each_domain ( d )
+ {
+ if ( !d->arch.tsc_native )
+ {
+ if ( is_hvm_domain(d) )
+ printk("dom%u (hvm) vtsc count: %"PRIu64" total\n",
+ d->domain_id, d->arch.vtsc_kerncount);
+ else
+ printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
+ d->domain_id, d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
+ domcnt++;
+ }
+ }
+ if ( !domcnt )
+ printk("all domains have tsc_native enabled\n");
}
static struct keyhandler dump_softtsc_keyhandler = {
@@ -1478,8 +1490,7 @@ static struct keyhandler dump_softtsc_ke
static int __init setup_dump_softtsc(void)
{
- if ( opt_softtsc )
- register_keyhandler('s', &dump_softtsc_keyhandler);
+ register_keyhandler('s', &dump_softtsc_keyhandler);
return 0;
}
__initcall(setup_dump_softtsc);
diff -r 72d130772f36 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/arch/x86/traps.c Sun Sep 27 13:08:12 2009 -0600
@@ -2010,7 +2010,7 @@ static int emulate_privileged_op(struct
*/
opcode = insn_fetch(u8, code_base, eip, code_limit);
if ( !guest_kernel_mode(v, regs) &&
- !((opcode == 0x31) && v->domain->arch.vtsc) )
+ !((opcode == 0x31) && !v->domain->arch.tsc_native) )
goto fail;
if ( lock && (opcode & ~3) != 0x20 )
diff -r 72d130772f36 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/asm-x86/domain.h Sun Sep 27 13:08:12 2009 -0600
@@ -301,9 +301,11 @@ struct arch_domain
struct domain_mca_msrs vmca_msrs;
/* SoftTSC emulation */
- bool_t vtsc;
+ bool_t tsc_native; /* true means vtsc is disabled */
s_time_t vtsc_last;
spinlock_t vtsc_lock;
+ uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
+ uint64_t vtsc_usercount; /* not used for hvm */
int64_t vtsc_stime_offset;
} __cacheline_aligned;
@@ -435,7 +437,7 @@ unsigned long pv_guest_cr4_fixup(unsigne
#define pv_guest_cr4_to_real_cr4(v) \
(((v)->arch.guest_context.ctrlreg[4] \
| (mmu_cr4_features & (X86_CR4_PGE | X86_CR4_PSE)) \
- | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \
+ | ((v)->domain->arch.tsc_native ? 0: X86_CR4_TSD)) \
& ~X86_CR4_DE)
#define real_cr4_to_pv_guest_cr4(c) \
((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD))
diff -r 72d130772f36 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/asm-x86/hvm/domain.h Sun Sep 27 13:08:12 2009 -0600
@@ -45,8 +45,6 @@ struct hvm_domain {
struct hvm_ioreq_page ioreq;
struct hvm_ioreq_page buf_ioreq;
- uint32_t gtsc_khz; /* kHz */
- bool_t tsc_scaled;
struct pl_time pl_time;
struct hvm_io_handler io_handler;
diff -r 72d130772f36 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h Sun Sep 27 13:08:12 2009 -0600
@@ -286,7 +286,6 @@ uint8_t hvm_combine_hw_exceptions(uint8_
uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2);
void hvm_enable_rdtsc_exiting(struct domain *d);
-int hvm_gtsc_need_scale(struct domain *d);
static inline int hvm_cpu_up(void)
{
diff -r 72d130772f36 xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/asm-x86/time.h Sun Sep 27 13:08:12 2009 -0600
@@ -41,7 +41,6 @@ uint64_t acpi_pm_tick_to_ns(uint64_t tic
uint64_t acpi_pm_tick_to_ns(uint64_t ticks);
uint64_t ns_to_acpi_pm_tick(uint64_t ns);
-extern int opt_softtsc;
void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
#endif /* __X86_TIME_H__ */
diff -r 72d130772f36 xen/include/public/arch-x86/hvm/save.h
--- a/xen/include/public/arch-x86/hvm/save.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/public/arch-x86/hvm/save.h Sun Sep 27 13:08:12 2009 -0600
@@ -38,7 +38,6 @@ struct hvm_save_header {
uint32_t version; /* File format version */
uint64_t changeset; /* Version of Xen that saved this file */
uint32_t cpuid; /* CPUID[0x01][%eax] on the saving machine */
- uint32_t gtsc_khz; /* Guest's TSC frequency in kHz */
};
DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);
diff -r 72d130772f36 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h Wed Sep 16 09:30:41 2009 +0100
+++ b/xen/include/public/domctl.h Sun Sep 27 13:08:12 2009 -0600
@@ -400,7 +400,8 @@ struct xen_domctl_settimeoffset {
};
typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
-
+/* allow rdtsc to be natively executed (is trapped/emulated by default */
+#define XEN_DOMCTL_set_tsc_native 57
#define XEN_DOMCTL_gethvmcontext 33
#define XEN_DOMCTL_sethvmcontext 34
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 37+ messages in thread* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-27 19:22 [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part) Dan Magenheimer
@ 2009-09-27 21:39 ` Keir Fraser
2009-09-27 23:19 ` Dan Magenheimer
2009-09-28 22:24 ` Jeremy Fitzhardinge
2009-10-05 23:15 ` Jeremy Fitzhardinge
2 siblings, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-09-27 21:39 UTC (permalink / raw)
To: Dan Magenheimer, Xen-Devel (E-mail)
On 27/09/2009 20:22, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:
> (I'm still struggling against the coils of python on
> the tools part of this patch, so thought I'd post the
> hypervisor side separately first.)
>
> Switches rdtsc emulation from boot option to per-domain
> and enables it by default. Also removes hvm tsc scaling
> as it is no longer necessary.
The hvm tsc scaling will still be useful in the case that trap-&-emulate was
originally disabled for an hvm domain. So I'd keep it, and also keep the
flag called d->arch.vtsc rather than flipping it and renaming it, and the
patch will end up about 20 lines long.
I'll rework this patch myself and check it in at the same time as the tools
patch, when you have that ready.
-- Keir
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-27 21:39 ` Keir Fraser
@ 2009-09-27 23:19 ` Dan Magenheimer
2009-09-28 7:25 ` Keir Fraser
2009-09-28 8:22 ` Zhang, Xiantao
0 siblings, 2 replies; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-27 23:19 UTC (permalink / raw)
To: Keir Fraser, Xen-Devel (E-mail)
Sorry I should have explained my logic for those changes
in the original posting.
The scaling is a strange corner case where a domain
can invisibly change from native-tsc to virtual-tsc.
I started to add a domctl so that the current state
could be probed, but decided that the scaling is
really a policy decision that is best made at
config time and best retained in the VM information
carried along with a save file (or live migration).
Allowing rdtsc to change dynamically according to
whether a save/restore/migration happened to occur
and whether the restored VM happened to land on
a machine with a different clock rate seems like
it could get very confusing. The domain adminstrator
either assumes correctness (the default) or turns
off emulation to get better performance and deals
with ALL the consequences. So IMHO removing the
scaling simplifies an already complex-to-explain
issue.
As for flipping/renaming the flag, I coded the patch
using the original vtsc name, but decided that if the
hypervisor default is truly going to be tsc_emulation,
then tsc_native is the unusual case and the code
should reflect that syntactically by requiring
tsc_native to be enabled. This is especially
true when the equivalent of the flag is managed in the
config file: Nearly all config parameters use
a default as 0 and setting the parameter causes
behavior to change, so setting tsc_native=1 to
change from the default tsc behavior seemed
more consistent.
Let me know your decisions on these as the tools code
depends on them.
Thanks,
Dan
> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Sunday, September 27, 2009 3:39 PM
> To: Dan Magenheimer; Xen-Devel (E-mail)
> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
> boot option with per-domain (hypervisor part)
>
>
> On 27/09/2009 20:22, "Dan Magenheimer"
> <dan.magenheimer@oracle.com> wrote:
>
> > (I'm still struggling against the coils of python on
> > the tools part of this patch, so thought I'd post the
> > hypervisor side separately first.)
> >
> > Switches rdtsc emulation from boot option to per-domain
> > and enables it by default. Also removes hvm tsc scaling
> > as it is no longer necessary.
>
> The hvm tsc scaling will still be useful in the case that
> trap-&-emulate was
> originally disabled for an hvm domain. So I'd keep it, and
> also keep the
> flag called d->arch.vtsc rather than flipping it and renaming
> it, and the
> patch will end up about 20 lines long.
>
> I'll rework this patch myself and check it in at the same
> time as the tools
> patch, when you have that ready.
>
> -- Keir
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-27 23:19 ` Dan Magenheimer
@ 2009-09-28 7:25 ` Keir Fraser
2009-09-28 9:04 ` Keir Fraser
2009-09-28 8:22 ` Zhang, Xiantao
1 sibling, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-09-28 7:25 UTC (permalink / raw)
To: Dan Magenheimer, Xen-Devel (E-mail)
On 28/09/2009 00:19, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:
> Let me know your decisions on these as the tools code
> depends on them.
I'll knock together a patch for both hypervisor and tools this morning.
-- Keir
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 7:25 ` Keir Fraser
@ 2009-09-28 9:04 ` Keir Fraser
2009-09-28 15:48 ` Dan Magenheimer
0 siblings, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-09-28 9:04 UTC (permalink / raw)
To: Dan Magenheimer, Xen-Devel (E-mail)
On 28/09/2009 08:25, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
> On 28/09/2009 00:19, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:
>
>> Let me know your decisions on these as the tools code
>> depends on them.
>
> I'll knock together a patch for both hypervisor and tools this morning.
Changeset 20257.
-- Keir
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 9:04 ` Keir Fraser
@ 2009-09-28 15:48 ` Dan Magenheimer
2009-09-28 16:07 ` Keir Fraser
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-28 15:48 UTC (permalink / raw)
To: Keir Fraser, Xen-Devel (E-mail)
(sorry for the delay... qemu git update takes forever!)
Hmmm.... I'm getting an Error 38, Function not implemented,
on XendDomainInfo.py line 2429 (xc.domain_set_tsc_native).
Got this for xm create for both PV and HVM.
Perhaps there's something strange in my environment or
maybe a python version diff (I'm using 2.4.3).
> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Monday, September 28, 2009 3:04 AM
> To: Dan Magenheimer; Xen-Devel (E-mail)
> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
> boot option with per-domain (hypervisor part)
>
>
> On 28/09/2009 08:25, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>
> > On 28/09/2009 00:19, "Dan Magenheimer"
> <dan.magenheimer@oracle.com> wrote:
> >
> >> Let me know your decisions on these as the tools code
> >> depends on them.
> >
> > I'll knock together a patch for both hypervisor and tools
> this morning.
>
> Changeset 20257.
>
> -- Keir
>
>
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 15:48 ` Dan Magenheimer
@ 2009-09-28 16:07 ` Keir Fraser
2009-09-28 16:24 ` Dan Magenheimer
0 siblings, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-09-28 16:07 UTC (permalink / raw)
To: Dan Magenheimer, Xen-Devel (E-mail)
Have you really definitely reinstalled the C extension package xc.so? I have
tested the changeset and it does work for me.
-- Keir
On 28/09/2009 16:48, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:
> (sorry for the delay... qemu git update takes forever!)
>
> Hmmm.... I'm getting an Error 38, Function not implemented,
> on XendDomainInfo.py line 2429 (xc.domain_set_tsc_native).
> Got this for xm create for both PV and HVM.
> Perhaps there's something strange in my environment or
> maybe a python version diff (I'm using 2.4.3).
>
>> -----Original Message-----
>> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>> Sent: Monday, September 28, 2009 3:04 AM
>> To: Dan Magenheimer; Xen-Devel (E-mail)
>> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
>> boot option with per-domain (hypervisor part)
>>
>>
>> On 28/09/2009 08:25, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>>
>>> On 28/09/2009 00:19, "Dan Magenheimer"
>> <dan.magenheimer@oracle.com> wrote:
>>>
>>>> Let me know your decisions on these as the tools code
>>>> depends on them.
>>>
>>> I'll knock together a patch for both hypervisor and tools
>> this morning.
>>
>> Changeset 20257.
>>
>> -- Keir
>>
>>
>>
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 16:07 ` Keir Fraser
@ 2009-09-28 16:24 ` Dan Magenheimer
2009-09-28 19:44 ` Dan Magenheimer
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-28 16:24 UTC (permalink / raw)
To: Keir Fraser, Xen-Devel (E-mail)
/usr/lib/python2.4/site-packages/lowlevel/xc.so is
definitely newly built and this was on a fresh clone.
However, I did only a "make install-tools", not a
complete "make" so maybe that resulted in something
screwy. I'll try again with another fresh clone
and full build.
> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Monday, September 28, 2009 10:08 AM
> To: Dan Magenheimer; Xen-Devel (E-mail)
> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
> boot option with per-domain (hypervisor part)
>
>
> Have you really definitely reinstalled the C extension
> package xc.so? I have
> tested the changeset and it does work for me.
>
> -- Keir
>
> On 28/09/2009 16:48, "Dan Magenheimer"
> <dan.magenheimer@oracle.com> wrote:
>
> > (sorry for the delay... qemu git update takes forever!)
> >
> > Hmmm.... I'm getting an Error 38, Function not implemented,
> > on XendDomainInfo.py line 2429 (xc.domain_set_tsc_native).
> > Got this for xm create for both PV and HVM.
> > Perhaps there's something strange in my environment or
> > maybe a python version diff (I'm using 2.4.3).
> >
> >> -----Original Message-----
> >> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> >> Sent: Monday, September 28, 2009 3:04 AM
> >> To: Dan Magenheimer; Xen-Devel (E-mail)
> >> Subject: Re: [Xen-devel] [PATCH] replace rdtsc
> emulation-vs-native xen
> >> boot option with per-domain (hypervisor part)
> >>
> >>
> >> On 28/09/2009 08:25, "Keir Fraser"
> <keir.fraser@eu.citrix.com> wrote:
> >>
> >>> On 28/09/2009 00:19, "Dan Magenheimer"
> >> <dan.magenheimer@oracle.com> wrote:
> >>>
> >>>> Let me know your decisions on these as the tools code
> >>>> depends on them.
> >>>
> >>> I'll knock together a patch for both hypervisor and tools
> >> this morning.
> >>
> >> Changeset 20257.
> >>
> >> -- Keir
> >>
> >>
> >>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 16:24 ` Dan Magenheimer
@ 2009-09-28 19:44 ` Dan Magenheimer
2009-09-28 20:55 ` Keir Fraser
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-28 19:44 UTC (permalink / raw)
To: dan.magenheimer, Keir Fraser, Xen-Devel (E-mail)
Rebuild from scratch didn't help. Oddly I
accidentally ran a 32-bit hypervisor and didn't
see the problem. I'm seeing the problem on a
32-bit dom0 with 64-bit hypervisor. Are you
testing on a 64-bit dom0? (e.g. maybe a conpat
mode problem?)
Continuing to look into it.
Dan
> -----Original Message-----
> From: Dan Magenheimer
> Sent: Monday, September 28, 2009 10:24 AM
> To: Keir Fraser; Xen-Devel (E-mail)
> Subject: RE: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
> boot option with per-domain (hypervisor part)
>
>
> /usr/lib/python2.4/site-packages/lowlevel/xc.so is
> definitely newly built and this was on a fresh clone.
> However, I did only a "make install-tools", not a
> complete "make" so maybe that resulted in something
> screwy. I'll try again with another fresh clone
> and full build.
>
> > -----Original Message-----
> > From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> > Sent: Monday, September 28, 2009 10:08 AM
> > To: Dan Magenheimer; Xen-Devel (E-mail)
> > Subject: Re: [Xen-devel] [PATCH] replace rdtsc
> emulation-vs-native xen
> > boot option with per-domain (hypervisor part)
> >
> >
> > Have you really definitely reinstalled the C extension
> > package xc.so? I have
> > tested the changeset and it does work for me.
> >
> > -- Keir
> >
> > On 28/09/2009 16:48, "Dan Magenheimer"
> > <dan.magenheimer@oracle.com> wrote:
> >
> > > (sorry for the delay... qemu git update takes forever!)
> > >
> > > Hmmm.... I'm getting an Error 38, Function not implemented,
> > > on XendDomainInfo.py line 2429 (xc.domain_set_tsc_native).
> > > Got this for xm create for both PV and HVM.
> > > Perhaps there's something strange in my environment or
> > > maybe a python version diff (I'm using 2.4.3).
> > >
> > >> -----Original Message-----
> > >> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> > >> Sent: Monday, September 28, 2009 3:04 AM
> > >> To: Dan Magenheimer; Xen-Devel (E-mail)
> > >> Subject: Re: [Xen-devel] [PATCH] replace rdtsc
> > emulation-vs-native xen
> > >> boot option with per-domain (hypervisor part)
> > >>
> > >>
> > >> On 28/09/2009 08:25, "Keir Fraser"
> > <keir.fraser@eu.citrix.com> wrote:
> > >>
> > >>> On 28/09/2009 00:19, "Dan Magenheimer"
> > >> <dan.magenheimer@oracle.com> wrote:
> > >>>
> > >>>> Let me know your decisions on these as the tools code
> > >>>> depends on them.
> > >>>
> > >>> I'll knock together a patch for both hypervisor and tools
> > >> this morning.
> > >>
> > >> Changeset 20257.
> > >>
> > >> -- Keir
> > >>
> > >>
> > >>
> >
> >
> >
> > _______________________________________________
> > Xen-devel mailing list
> > Xen-devel@lists.xensource.com
> > http://lists.xensource.com/xen-devel
> >
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 19:44 ` Dan Magenheimer
@ 2009-09-28 20:55 ` Keir Fraser
0 siblings, 0 replies; 37+ messages in thread
From: Keir Fraser @ 2009-09-28 20:55 UTC (permalink / raw)
To: Dan Magenheimer, Xen-Devel (E-mail)
On 28/09/2009 20:44, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:
> Rebuild from scratch didn't help. Oddly I
> accidentally ran a 32-bit hypervisor and didn't
> see the problem. I'm seeing the problem on a
> 32-bit dom0 with 64-bit hypervisor. Are you
> testing on a 64-bit dom0? (e.g. maybe a conpat
> mode problem?)
I tested 64-bit dom0 on 64-bit hypervisor. Compat mode problems shouldn't be
an issue since domctls do not have a compat shim.
Our automated tests also check 32-bit dom0 on 32-bit Xen and on 64-bit Xen,
and they all passed fine too.
-- Keir
> Continuing to look into it.
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-27 23:19 ` Dan Magenheimer
2009-09-28 7:25 ` Keir Fraser
@ 2009-09-28 8:22 ` Zhang, Xiantao
2009-09-28 15:13 ` Dan Magenheimer
1 sibling, 1 reply; 37+ messages in thread
From: Zhang, Xiantao @ 2009-09-28 8:22 UTC (permalink / raw)
To: Dan Magenheimer, Keir Fraser, Xen-Devel (E-mail)
Dan Magenheimer wrote:
> Sorry I should have explained my logic for those changes
> in the original posting.
>
> The scaling is a strange corner case where a domain
> can invisibly change from native-tsc to virtual-tsc.
> I started to add a domctl so that the current state
> could be probed, but decided that the scaling is
> really a policy decision that is best made at
> config time and best retained in the VM information
> carried along with a save file (or live migration).
> Allowing rdtsc to change dynamically according to
> whether a save/restore/migration happened to occur
> and whether the restored VM happened to land on
> a machine with a different clock rate seems like
> it could get very confusing. The domain adminstrator
> either assumes correctness (the default) or turns
> off emulation to get better performance and deals
> with ALL the consequences. So IMHO removing the
> scaling simplifies an already complex-to-explain
> issue.
Hi, Dan
I didn't see the necessity to remove the scaling logic. This logic just allows the domain can run in different TSC-freq platforms without strange timing behaviors. If the domain is migrated between tow machines with different TSC rate, system administrator can get the tsc emulation knowledge from Xen's system log, and if administrator doesn't like to suffer performance loss in emulation way, he can migrate the guest back. Besides, I don't think this logic conflicts with your boot-stage option.IMO, if the boot-stage option is set to use emulation, the scaling policy can be switched off, but if the option is not specified, I think the default scaling policy should be adopted in migration case.
Xiantao
>
>> -----Original Message-----
>> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>> Sent: Sunday, September 27, 2009 3:39 PM
>> To: Dan Magenheimer; Xen-Devel (E-mail)
>> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native
>> xen boot option with per-domain (hypervisor part)
>>
>>
>> On 27/09/2009 20:22, "Dan Magenheimer"
>> <dan.magenheimer@oracle.com> wrote:
>>
>>> (I'm still struggling against the coils of python on
>>> the tools part of this patch, so thought I'd post the
>>> hypervisor side separately first.)
>>>
>>> Switches rdtsc emulation from boot option to per-domain
>>> and enables it by default. Also removes hvm tsc scaling
>>> as it is no longer necessary.
>>
>> The hvm tsc scaling will still be useful in the case that
>> trap-&-emulate was originally disabled for an hvm domain. So I'd
>> keep it, and
>> also keep the
>> flag called d->arch.vtsc rather than flipping it and renaming
>> it, and the
>> patch will end up about 20 lines long.
>>
>> I'll rework this patch myself and check it in at the same
>> time as the tools
>> patch, when you have that ready.
>>
>> -- Keir
>>
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xensource.com
>> http://lists.xensource.com/xen-devel
>>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 8:22 ` Zhang, Xiantao
@ 2009-09-28 15:13 ` Dan Magenheimer
0 siblings, 0 replies; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-28 15:13 UTC (permalink / raw)
To: Zhang, Xiantao, Keir Fraser, Xen-Devel (E-mail)
> if administrator doesn't
> like to suffer performance loss in emulation way, he can
> migrate the guest back
OK, that makes sense.
> system administrator can get the tsc emulation
> knowledge from Xen's system log
Getting information from the log is probably not a good
longterm solution though. I think there needs
to be a management/tools mechanism to query if
a migrated/restored domain has transitioned
from tsc-native to tsc-emulated or vice versa.
Thanks,
Dan
> -----Original Message-----
> From: Zhang, Xiantao [mailto:xiantao.zhang@intel.com]
> Sent: Monday, September 28, 2009 2:23 AM
> To: Dan Magenheimer; Keir Fraser; Xen-Devel (E-mail)
> Subject: RE: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
> boot option with per-domain (hypervisor part)
>
>
> Dan Magenheimer wrote:
> > Sorry I should have explained my logic for those changes
> > in the original posting.
> >
> > The scaling is a strange corner case where a domain
> > can invisibly change from native-tsc to virtual-tsc.
> > I started to add a domctl so that the current state
> > could be probed, but decided that the scaling is
> > really a policy decision that is best made at
> > config time and best retained in the VM information
> > carried along with a save file (or live migration).
> > Allowing rdtsc to change dynamically according to
> > whether a save/restore/migration happened to occur
> > and whether the restored VM happened to land on
> > a machine with a different clock rate seems like
> > it could get very confusing. The domain adminstrator
> > either assumes correctness (the default) or turns
> > off emulation to get better performance and deals
> > with ALL the consequences. So IMHO removing the
> > scaling simplifies an already complex-to-explain
> > issue.
> Hi, Dan
> I didn't see the necessity to remove the scaling logic.
> This logic just allows the domain can run in different
> TSC-freq platforms without strange timing behaviors. If the
> domain is migrated between tow machines with different TSC
> rate, system administrator can get the tsc emulation
> knowledge from Xen's system log, and if administrator doesn't
> like to suffer performance loss in emulation way, he can
> migrate the guest back. Besides, I don't think this logic
> conflicts with your boot-stage option.IMO, if the boot-stage
> option is set to use emulation, the scaling policy can be
> switched off, but if the option is not specified, I think the
> default scaling policy should be adopted in migration case.
> Xiantao
>
> >
> >> -----Original Message-----
> >> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> >> Sent: Sunday, September 27, 2009 3:39 PM
> >> To: Dan Magenheimer; Xen-Devel (E-mail)
> >> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native
> >> xen boot option with per-domain (hypervisor part)
> >>
> >>
> >> On 27/09/2009 20:22, "Dan Magenheimer"
> >> <dan.magenheimer@oracle.com> wrote:
> >>
> >>> (I'm still struggling against the coils of python on
> >>> the tools part of this patch, so thought I'd post the
> >>> hypervisor side separately first.)
> >>>
> >>> Switches rdtsc emulation from boot option to per-domain
> >>> and enables it by default. Also removes hvm tsc scaling
> >>> as it is no longer necessary.
> >>
> >> The hvm tsc scaling will still be useful in the case that
> >> trap-&-emulate was originally disabled for an hvm domain. So I'd
> >> keep it, and
> >> also keep the
> >> flag called d->arch.vtsc rather than flipping it and renaming
> >> it, and the
> >> patch will end up about 20 lines long.
> >>
> >> I'll rework this patch myself and check it in at the same
> >> time as the tools
> >> patch, when you have that ready.
> >>
> >> -- Keir
> >>
> >>
> >>
> >> _______________________________________________
> >> Xen-devel mailing list
> >> Xen-devel@lists.xensource.com
> >> http://lists.xensource.com/xen-devel
> >>
> >
> > _______________________________________________
> > Xen-devel mailing list
> > Xen-devel@lists.xensource.com
> > http://lists.xensource.com/xen-devel
>
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-27 19:22 [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part) Dan Magenheimer
2009-09-27 21:39 ` Keir Fraser
@ 2009-09-28 22:24 ` Jeremy Fitzhardinge
2009-09-28 22:43 ` Dan Magenheimer
2009-10-05 23:15 ` Jeremy Fitzhardinge
2 siblings, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-28 22:24 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Xen-Devel (E-mail)
On 09/27/09 12:22, Dan Magenheimer wrote:
> Switches rdtsc emulation from boot option to per-domain
> and enables it by default.
Is this just on hvm domains, or PV ones too?
J
^ permalink raw reply [flat|nested] 37+ messages in thread* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 22:24 ` Jeremy Fitzhardinge
@ 2009-09-28 22:43 ` Dan Magenheimer
2009-09-28 23:10 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-28 22:43 UTC (permalink / raw)
To: Jeremy Fitzhardinge; +Cc: Xen-Devel (E-mail)
PV ones too. An app doesn't have any idea (and doesn't
and shouldn't care) if it is running on a PV or HVM domain.
> -----Original Message-----
> From: Jeremy Fitzhardinge [mailto:jeremy@goop.org]
> Sent: Monday, September 28, 2009 4:24 PM
> To: Dan Magenheimer
> Cc: Xen-Devel (E-mail)
> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
> boot option with per-domain (hypervisor part)
>
>
> On 09/27/09 12:22, Dan Magenheimer wrote:
> > Switches rdtsc emulation from boot option to per-domain
> > and enables it by default.
>
> Is this just on hvm domains, or PV ones too?
>
> J
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 22:43 ` Dan Magenheimer
@ 2009-09-28 23:10 ` Jeremy Fitzhardinge
2009-09-28 23:36 ` Dan Magenheimer
0 siblings, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-28 23:10 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Xen-Devel (E-mail), Keir Fraser
On 09/28/09 15:43, Dan Magenheimer wrote:
> PV ones too. An app doesn't have any idea (and doesn't
> and shouldn't care) if it is running on a PV or HVM domain.
>
Well, it shouldn't be enabled by default. That slows down all rdtsc
operations for the benefit of very niche applications. The Xen
clocksource assumes that rdtsc is fast, unemulated and in need of
correction.
If someone really needs an artificial tsc, then they can enable the
option for themselves.
J
^ permalink raw reply [flat|nested] 37+ messages in thread* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 23:10 ` Jeremy Fitzhardinge
@ 2009-09-28 23:36 ` Dan Magenheimer
2009-09-29 0:27 ` Ian Pratt
2009-09-29 16:44 ` Jeremy Fitzhardinge
0 siblings, 2 replies; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-28 23:36 UTC (permalink / raw)
To: Jeremy Fitzhardinge; +Cc: Xen-Devel (E-mail), Keir Fraser
> On 09/28/09 15:43, Dan Magenheimer wrote:
> > PV ones too. An app doesn't have any idea (and doesn't
> > and shouldn't care) if it is running on a PV or HVM domain.
>
> Well, it shouldn't be enabled by default. That slows down all rdtsc
> operations for the benefit of very niche applications. The Xen
> clocksource assumes that rdtsc is fast, unemulated and in need of
> correction.
>
> If someone really needs an artificial tsc, then they can enable the
> option for themselves.
While I understand and sympathize (and the idealistic side of me even
agrees with) your position, let me argue your point "they can enable
the option for themselves" by mangling a famous movie quote:
"In a cloud, no-one can hear rdtsc scream"
^ permalink raw reply [flat|nested] 37+ messages in thread* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 23:36 ` Dan Magenheimer
@ 2009-09-29 0:27 ` Ian Pratt
2009-09-29 1:42 ` Zhang, Xiantao
2009-09-29 16:44 ` Jeremy Fitzhardinge
1 sibling, 1 reply; 37+ messages in thread
From: Ian Pratt @ 2009-09-29 0:27 UTC (permalink / raw)
To: Dan Magenheimer, Jeremy Fitzhardinge
Cc: Ian Pratt, Xen-Devel (E-mail), Keir Fraser
> > Well, it shouldn't be enabled by default. That slows down all rdtsc
> > operations for the benefit of very niche applications. The Xen
> > clocksource assumes that rdtsc is fast, unemulated and in need of
> > correction.
> >
> > If someone really needs an artificial tsc, then they can enable the
> > option for themselves.
>
> While I understand and sympathize (and the idealistic side of me even
> agrees with) your position, let me argue your point "they can enable
> the option for themselves" by mangling a famous movie quote:
>
> "In a cloud, no-one can hear rdtsc scream"
If we had some actual bug reports detailing failures of real-world applications then the decision would be easy.
I realise it could be hard to pin strange application behaviour down to TSC issues, but I can't readily spot any bugs on the XenServer or xen.org bug trackers that might even be candidates.
Ian
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-29 0:27 ` Ian Pratt
@ 2009-09-29 1:42 ` Zhang, Xiantao
2009-09-29 3:13 ` Dan Magenheimer
0 siblings, 1 reply; 37+ messages in thread
From: Zhang, Xiantao @ 2009-09-29 1:42 UTC (permalink / raw)
To: Ian Pratt, Dan Magenheimer, Jeremy Fitzhardinge
Cc: Xen-Devel (E-mail), Keir Fraser
Ian Pratt wrote:
>>> Well, it shouldn't be enabled by default. That slows down all rdtsc
>>> operations for the benefit of very niche applications. The Xen
>>> clocksource assumes that rdtsc is fast, unemulated and in need of
>>> correction.
>>>
>>> If someone really needs an artificial tsc, then they can enable the
>>> option for themselves.
>>
>> While I understand and sympathize (and the idealistic side of me even
>> agrees with) your position, let me argue your point "they can enable
>> the option for themselves" by mangling a famous movie quote:
>>
>> "In a cloud, no-one can hear rdtsc scream"
>
> If we had some actual bug reports detailing failures of real-world
> applications then the decision would be easy.
> I realise it could be hard to pin strange application behaviour down
> to TSC issues, but I can't readily spot any bugs on the XenServer or
> xen.org bug trackers that might even be candidates.
Maybe Dan can give more reasons. I think the TSC skew issue should be thre key one. But IMO, the TSC skew issue maybe not that important. In real-world apps, rdtsc is mostly read by the kernel syscall gettimeofday, and kernel can eliminate the skew before returning to user app through returning the value "plus 1 on last_tsc". Even if apps use "raw rdtsc" instruction to get timestamp in rare cases, I think the issue also can be solved by apps' ownself through a simple check. Therefore, I still can't figure out the suasive reasons why adopts emulation way for the tsc read so far.
Dan,
Any other necessary reasons to introduce it except fixing skew issue between cpus?
Xiantao
^ permalink raw reply [flat|nested] 37+ messages in thread* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-29 1:42 ` Zhang, Xiantao
@ 2009-09-29 3:13 ` Dan Magenheimer
2009-09-29 17:24 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-29 3:13 UTC (permalink / raw)
To: Zhang, Xiantao, Ian Pratt, Jeremy Fitzhardinge
Cc: Xen-Devel (E-mail), Keir Fraser
> Any other necessary reasons to introduce it except fixing
> skew issue between cpus?
I think the reasons were discussed at length in a previous
thread. To summarize, some applications already do (or can
or will) use rdtsc as a legal non-privileged instruction
and expect to get the Intel SDM-defined behavior of the
instruction. There are some hardware+software environments
that do not correctly provide this behavior, including
some older SMP machines and early generation AMD desktop
multi-core machines. There are many more that DO correctly
provide this behavior, including all recent generation
Intel machines and nearly all recent generation AMD
machines... and, very importantly, any VM running on
VMware. Because rdtsc is "unsafe" on some machines
does not stop application programmers from using it
on machines where it is safe; it will be increasingly
likely that an application programmer may never
experience an "unsafe" hardware/software environment.
We cannot tell app programmers that they cannot use rdtsc,
only warn them that it is risky for some older machines.
Some will use it anyway and I have personally talked to
some that are. We cannot predict or legislate
how rdtsc will be used in the future. It is easy to
imagine a scenario where a transaction-oriented application
timestamps transactions with rdtsc and then, when an
infrequent error condition occurs, tries to replay the
transactions; this would work fine on any new hardware
and on VMware and even on Xen for awhile... but without
rdtsc-emulaion could mysteriously fail and cause data
corruption, perhaps only after a migration (or two or
three) and only when the infrequent error condition
occurs after a few migrations. Would you want to be
on the support team that tries to diagnose that?
VMware does not have this problem. The cost for Xen is
some performance. I do not take loss of performance
lightly and have spent weeks now looking for a better
solution. I have not found one and am open to any
creative alternative. But pretending that apps, today
and in the future will NOT use rdtsc, or that they
will use it only following Xen-prescribed constraints,
is just wishful thinking and not appropriate for
hardware/software vendors selling to enterprise
customers (or selling to cloud providers that expect to
provide services for enterprise customers).
The patch provided records and reports frequency of
rdtsc emulations. If an administrator cares to improve
performance and can verify that all apps that are (now
or ever will be) running on this VM are using
rdtsc safely, a per-domain option can be specified
to get the performance back. The opposite is nearly
impossible to ascertain.
So I believe, for rdtsc, correctness is more important
than a small amount of performance.
Dan
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-29 3:13 ` Dan Magenheimer
@ 2009-09-29 17:24 ` Jeremy Fitzhardinge
0 siblings, 0 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-29 17:24 UTC (permalink / raw)
To: Dan Magenheimer
Cc: Ian Pratt, Xen-Devel (E-mail), Keir Fraser, Zhang, Xiantao
On 09/28/09 20:13, Dan Magenheimer wrote:
> So I believe, for rdtsc, correctness is more important
> than a small amount of performance.
>
So run the apps in an HVM domain, where emulating everything is
appropriate, and leave PV domains alone.
J
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-28 23:36 ` Dan Magenheimer
2009-09-29 0:27 ` Ian Pratt
@ 2009-09-29 16:44 ` Jeremy Fitzhardinge
2009-09-29 17:34 ` Dan Magenheimer
1 sibling, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-29 16:44 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Xen-Devel (E-mail), Keir Fraser
On 09/28/09 16:36, Dan Magenheimer wrote:
>> Well, it shouldn't be enabled by default. That slows down all rdtsc
>> operations for the benefit of very niche applications. The Xen
>> clocksource assumes that rdtsc is fast, unemulated and in need of
>> correction.
>>
>> If someone really needs an artificial tsc, then they can enable the
>> option for themselves.
>>
> While I understand and sympathize (and the idealistic side of me even
> agrees with) your position,
>
Please, no. I think you've managed to completely misunderstand
everything I've said on this subject.
The TSC is not, and has never been reliable. At best it has periods of
reliability in which its possible to get coherent results, but never for
indefinite periods. That includes the recent architecture updates,
though they have considerably increased the periods of reliability.
This is not an idealistic statement; it is the conclusion from years of
attempts to use the tsc as a time source. It does not matter what the
documents say, there's always a creative new way for hardware to break
the tsc's behaviour.
There is no requirement for Xen to emulate a hardware feature better
than native. In particular, there's no need for Xen to synthesize the
platonic ideal of a TSC when existing hardware platform does not do so.
That said, I have no problem with making such emulation an option if it
really helps real programs in the field. I wouldn't even mind having it
on by default...
Except that it comes with a terrible cost. rdtsc is a very heavily used
instruction, because its part of the Xen clock ABI. It gets executed
many thousands of times a second, including at least once every context
switch. Adding the overhead of an extra synchronous emulated
instruction trap into Xen is going to have a very noticeable performance
hit.
This is a massive regression in everyday, every-system, every-user
performance to satisfy the hypothetical requirement of abstract apps
which may or may not exist.
The fact that you haven't named a single real app that breaks in the
current system and then works with a synthetic TSC further undermines
your argument. Are you really arguing on the basis that "some apps
might use tsc in a fragile way" or do you actually have a specific list
of apps that are really suffering? What are they? How do they break?
How many users do they have? What are they doing with the tsc? Why
can't they use some other mechanism?
I think a change of this magnitude needs a lot more justification than
some handwavy arguments from (dubious) first principles.
J
^ permalink raw reply [flat|nested] 37+ messages in thread* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-29 16:44 ` Jeremy Fitzhardinge
@ 2009-09-29 17:34 ` Dan Magenheimer
2009-09-29 19:01 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-29 17:34 UTC (permalink / raw)
To: Jeremy Fitzhardinge; +Cc: Xen-Devel (E-mail), Keir Fraser
Hi Jeremy --
I think it's fair for us both to completely air our opinions,
but since you've contradicted some of my "hand-wavy" points
and since much of your position is based on the contradictions,
I feel it's necessary to respond.
> The TSC is not, and has never been reliable.
Your data is stale. Please discuss this with processor
and system vendors (I have) and look at the latest upstream
Linux.
I agree that this presumed reliability should be carefully
tested. (See my proposed patch posted yesterday.) If
you are correct and I am not, your position (that "all
apps that use rdtsc are fundamentally broken") is much
stronger.
> Except that it comes with a terrible cost...
> This is a massive regression...
It is certainly significant but "terrible" and "massive"
are a bit strong. Based on my measurements, the examples
you cite will degrade performance by a fraction of a percent.
And this loss can be eliminated IF the administrator
understands the risks and chooses performance over
correctness by specifying an option at guest creation.
> The fact that you haven't named a single real app...
> Are you really arguing on the basis that "some apps
> might use tsc in a fragile way" or do you actually have a
> specific list
I have a (small) specific list. For various reasons,
I cannot go into further detail.
But if TSC is indeed reliable on newer processors/systems
(see above) AND on VMware, there will certainly be more.
Thanks,
Dan
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-29 17:34 ` Dan Magenheimer
@ 2009-09-29 19:01 ` Jeremy Fitzhardinge
2009-09-29 23:45 ` Dan Magenheimer
0 siblings, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-29 19:01 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Xen-Devel (E-mail), Keir Fraser
On 09/29/09 10:34, Dan Magenheimer wrote:
>> The TSC is not, and has never been reliable.
>>
> Your data is stale. Please discuss this with processor
> and system vendors (I have)
I'm sure they would say that, as they frequently have in the past. And
then it breaks again.
Even then their guarantee only applies while the processor is powered up
and hasn't been reset. But resets can occur while the system is
"running" in the form of S3 suspend events, or even completely powered
off when suspending to disk.
Besides, the SDM makes no claims about tsc synchronization between CPUs,
only that on a given CPU/core is at a constant rate (at least from now
on, promise!). At that point you're relying on motherboard/system
design, which has a lot more scope for brokenness than just core CPUs.
Large systems simply don't keep all their CPUs in the same clock domain,
and certainly won't guarantee that for all future system designs.
> and look at the latest upstream Linux.
>
The kernel does what it needs to do to make the tsc usable for itself.
It does not make (and has never made) any guarantees about how the tsc
appears in usermode (except for the purposes of implementing
vgettimeofday). You won't find many Linux kernel developers who are
sympathetic with the idea of making any hard guarantees for bare
usermode tsc use.
>> Except that it comes with a terrible cost...
>> This is a massive regression...
>>
> It is certainly significant but "terrible" and "massive"
> are a bit strong. Based on my measurements, the examples
> you cite will degrade performance by a fraction of a percent.
>
How have you measured this? On what systems? Your patch introduces
this regression on all systems for everyone; it isn't enough to measure
it on a new Nehalem machine.
>> The fact that you haven't named a single real app...
>> Are you really arguing on the basis that "some apps
>> might use tsc in a fragile way" or do you actually have a
>> specific list
>>
> I have a (small) specific list. For various reasons,
> I cannot go into further detail.
>
Well, that goes back to my point about spending a lot of effort on
something that can only possibility benefit a (small) set of niche
apps. Spending the effort on a vsyscall approach would be fast, correct
and widely beneficial.
You can default it on within Oracle, or even in Oracle's Xen distro.
It's unreasonable to make this a global default when you're trying to
solve a local problem. You haven't established this is something that
anyone else need be concerned about.
Besides, if they want a global sequence number, why not just keep a
global counter? That's going to be much cheaper and more reliable than
anything time-based.
J
^ permalink raw reply [flat|nested] 37+ messages in thread* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-29 19:01 ` Jeremy Fitzhardinge
@ 2009-09-29 23:45 ` Dan Magenheimer
2009-09-30 0:41 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-09-29 23:45 UTC (permalink / raw)
To: Jeremy Fitzhardinge; +Cc: Xen-Devel (E-mail), Keir Fraser
Let me attempt to summarize our disagreement and then
I'd like to stop arguing.
1) You think rdtsc is never safe for an app to use. I
think it is safe in many hardware/software enviroments
and that the number of safe environments will continue
to increase.
2) You think the performance hit from rdtsc-emulation
is horrible. I think it is significant but relatively
small and acceptable and, if there are cases where it
is not, administrators or virtual appliance providers
can make an informed choice to turn it off.
3) You think app developers can and should be told to
not use rdtsc because it is inherently unsafe and
so Xen doesn't need to ever be concerned with making it
safe. I think app developers will do what they
please, ignorant to the subtleties of rdtsc, and if
their app works on their hardware and on VMware but
not on Xen, they will blame Xen or Linux or their
OS provider or their cloud provider, and probably never
know that their app doesn't work because of rdtsc.
Do you agree that those are the key points of disagreement?
Thanks,
Dan
> -----Original Message-----
> From: Jeremy Fitzhardinge [mailto:jeremy@goop.org]
> Sent: Tuesday, September 29, 2009 1:02 PM
> To: Dan Magenheimer
> Cc: Xen-Devel (E-mail); Keir Fraser
> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
> boot option with per-domain (hypervisor part)
>
>
> On 09/29/09 10:34, Dan Magenheimer wrote:
> >> The TSC is not, and has never been reliable.
> >>
> > Your data is stale. Please discuss this with processor
> > and system vendors (I have)
>
> I'm sure they would say that, as they frequently have in the
> past. And
> then it breaks again.
>
> Even then their guarantee only applies while the processor is
> powered up
> and hasn't been reset. But resets can occur while the system is
> "running" in the form of S3 suspend events, or even completely powered
> off when suspending to disk.
>
> Besides, the SDM makes no claims about tsc synchronization
> between CPUs,
> only that on a given CPU/core is at a constant rate (at least from now
> on, promise!). At that point you're relying on motherboard/system
> design, which has a lot more scope for brokenness than just
> core CPUs.
> Large systems simply don't keep all their CPUs in the same
> clock domain,
> and certainly won't guarantee that for all future system designs.
>
> > and look at the latest upstream Linux.
> >
> The kernel does what it needs to do to make the tsc usable
> for itself.
> It does not make (and has never made) any guarantees about how the tsc
> appears in usermode (except for the purposes of implementing
> vgettimeofday). You won't find many Linux kernel developers who are
> sympathetic with the idea of making any hard guarantees for bare
> usermode tsc use.
>
> >> Except that it comes with a terrible cost...
> >> This is a massive regression...
> >>
> > It is certainly significant but "terrible" and "massive"
> > are a bit strong. Based on my measurements, the examples
> > you cite will degrade performance by a fraction of a percent.
> >
> How have you measured this? On what systems? Your patch introduces
> this regression on all systems for everyone; it isn't enough
> to measure
> it on a new Nehalem machine.
>
> >> The fact that you haven't named a single real app...
> >> Are you really arguing on the basis that "some apps
> >> might use tsc in a fragile way" or do you actually have a
> >> specific list
> >>
> > I have a (small) specific list. For various reasons,
> > I cannot go into further detail.
> >
>
> Well, that goes back to my point about spending a lot of effort on
> something that can only possibility benefit a (small) set of niche
> apps. Spending the effort on a vsyscall approach would be
> fast, correct
> and widely beneficial.
>
> You can default it on within Oracle, or even in Oracle's Xen distro.
> It's unreasonable to make this a global default when you're trying to
> solve a local problem. You haven't established this is something that
> anyone else need be concerned about.
>
> Besides, if they want a global sequence number, why not just keep a
> global counter? That's going to be much cheaper and more
> reliable than
> anything time-based.
>
> J
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-29 23:45 ` Dan Magenheimer
@ 2009-09-30 0:41 ` Jeremy Fitzhardinge
0 siblings, 0 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-30 0:41 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Xen-Devel (E-mail), Keir Fraser
On 09/29/09 16:45, Dan Magenheimer wrote:
> Let me attempt to summarize our disagreement and then
> I'd like to stop arguing.
>
> 1) You think rdtsc is never safe for an app to use. I
> think it is safe in many hardware/software enviroments
> and that the number of safe environments will continue
> to increase.
>
I think its highly unadvisable, and hard to do correctly. I think there
are better ways to achieve the same effect. I think you are flat-out
mistaken in your faith in "safe environments"; such safe environments
are certain machines qualified on a case-by-case basis, and can never be
assumed to be generally available baseline configuration. You have not
supported your faith in "safe environments" with any concrete documentation.
> 2) You think the performance hit from rdtsc-emulation
> is horrible. I think it is significant but relatively
> small and acceptable and, if there are cases where it
> is not, administrators or virtual appliance providers
> can make an informed choice to turn it off.
>
When evaluating any performance hit one has to take into account who
benefits versus who suffers. The pool of applications and users who
could possibly benefit from synthetic tsc is very small, whereas every
single user will suffer some performance degradation resulting from
synthesizing the tsc.
> 3) You think app developers can and should be told to
> not use rdtsc because it is inherently unsafe and
>
Application developers have been consistently told and advised not to
use rdtsc in usermode in all Linux environments; Linux under Xen is no
exception. Nobody has ever made any guarantees that rdtsc has any
particular behaviour in usermode, nor will they. The only guarantee
that Linux makes is that it will use the tsc in vgettimeofday where
possible.
> so Xen doesn't need to ever be concerned with making it
> safe.
I don't have any problem with allowing synthesis as an option, or even
making it the default for hvm guests. But I object to interfering with
an important PV ABI, for two reasons:
1. it has a negative impact on all the kernel uses of rdtsc, which
are safe
2. it prevents a high-performance vsyscall implementation to allow
safe, fast usermode access to the tsc
In other words, it makes things worse for everyone who is doing things
the right way, for the sake of mitigating the risks of unsafe code.
> I think app developers will do what they
> please, ignorant to the subtleties of rdtsc, and if
> their app works on their hardware and on VMware but
> not on Xen, they will blame Xen or Linux or their
> OS provider or their cloud provider, and probably never
> know that their app doesn't work because of rdtsc.
>
Shrug. They can blame who they like. It is no different from
developers who write buggy multithreaded code that appears to work fine
until they move to more cores, or a different system topology, or
different libraries or compiler or OS or...
It is not recommended for Linux. Microsoft does not recommended it for
Windows.
Developers are consistently advised not to use rdtsc in usermode. There
are no standard library functions to access it directly, so they must go
out of their way to do so. Pretending that the TSC is safe or advisable
to use in usermode is just going to lead developers astray and enable
bad habits. When a drug addict blames the rest of the world for their
affliction, the answer is not to give them more drugs.
That said, just as one can use Xen's various mechanisms to approximate a
machine layout in order to work around bugs in MT programs, I don't have
any particular problems with tsc-synthesis to work around buggy
programs. But it should not be on by default.
J
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-09-27 19:22 [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part) Dan Magenheimer
2009-09-27 21:39 ` Keir Fraser
2009-09-28 22:24 ` Jeremy Fitzhardinge
@ 2009-10-05 23:15 ` Jeremy Fitzhardinge
2009-10-05 23:42 ` Dan Magenheimer
2 siblings, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-05 23:15 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Xen-Devel (E-mail), Keir Fraser
On 09/27/09 12:22, Dan Magenheimer wrote:
> Switches rdtsc emulation from boot option to per-domain
> and enables it by default. Also removes hvm tsc scaling
> as it is no longer necessary.
>
>
I've run into a few problems from this patch:
1. I'm seeing occasional messages on the console "hrtimer: interrupt
too slow, forcing clock min delta to 9001953 ns" which indicates
that the kernel is noticing that timer operations are taking too long.
2. A domain can't turn on and off its own tsc emulation state. I'm
working on vsyscall support for pvclock (done, aside from this
issue), so I need native tsc in usermode (or at least, one with
the same parameters in kernel and userspace). I was getting very
confused because I didn't expect emulation to *only* apply to
usermode; I was expecting it to be done uniformly to both user and
kernel tscs, with appropriate adjustments to the vcpu_time_info
values.
3. The 's' debug key never seems to count any usermode rdtsc
instructions, even if I write a little program to explicitly
exercise them. (Fix below)
All in 64-bit PV domains.
Thanks,
J
diff -r 1dc86d83b352 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Fri Oct 02 17:01:26 2009 -0700
+++ b/xen/arch/x86/time.c Mon Oct 05 16:13:15 2009 -0700
@@ -1476,7 +1476,7 @@
}
else
{
- v->domain->arch.vtsc_kerncount++;
+ v->domain->arch.vtsc_usercount++;
spin_lock(&v->domain->arch.vtsc_lock);
now = get_s_time() + v->domain->arch.vtsc_stime_offset;
if ( (int64_t)(now - v->domain->arch.vtsc_last) > 0 )
^ permalink raw reply [flat|nested] 37+ messages in thread* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-05 23:15 ` Jeremy Fitzhardinge
@ 2009-10-05 23:42 ` Dan Magenheimer
2009-10-06 0:20 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-10-05 23:42 UTC (permalink / raw)
To: Jeremy Fitzhardinge; +Cc: Xen-Devel (E-mail), Keir Fraser
> I've run into a few problems from this patch:
>
> 1. I'm seeing occasional messages on the console "hrtimer:
> interrupt
> too slow, forcing clock min delta to 9001953 ns" which indicates
> that the kernel is noticing that timer operations are
> taking too long.
Hmmm... that seems unlikely since it is highly probable that on
ANY machine, emulated TSC is faster than other highres timers,
for example, HPET. Perhaps it is a side effect of your assumptions
described in (2) below?
> 2. A domain can't turn on and off its own tsc emulation state. I'm
> working on vsyscall support for pvclock (done, aside from this
> issue), so I need native tsc in usermode (or at least, one with
> the same parameters in kernel and userspace). I was
> getting very
> confused because I didn't expect emulation to *only* apply to
> usermode; I was expecting it to be done uniformly to
> both user and
> kernel tscs, with appropriate adjustments to the vcpu_time_info
> values.
Sorry, I should probably have said this explicitly in the patch
prologue, but there is no (easy/fast) way to turn on emulation
for userland and turn off emulation for the kernel, so rather
than requiring a change to the pvclock ABI, rdtsc emulation
returns a raw TSC value when the rdtsc was executed in kernel
mode and Xen system time (nsec) when the rdtsc was executed
in userland.
To ensure both correctness and maximum performance across
a wide range of conditions, WITHOUT destroying backward
compatiblity for the pvclock ABI, a decision tree similar
to the one I just posted for apps could be employed.
http://xen.markmail.org/message/uj4twbcsdw57z5zp
But this might rely on adding the same new Xen features
described in the parent to that post.
OTOH, if pvclock is executed sometimes in userland and
sometimes in kernel (e.g. depending on the setting of
a sysfs variable), it seems like some kind of decision
tree is required anyway.
> 3. The 's' debug key never seems to count any usermode rdtsc
> instructions, even if I write a little program to explicitly
> exercise them. (Fix below)
Good catch. Sorry, I tested that many times with an earlier version
of the patch, but didn't try debug-key with the final
version, after I went through and did some (apparently careless)
renaming.
Dan
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-05 23:42 ` Dan Magenheimer
@ 2009-10-06 0:20 ` Jeremy Fitzhardinge
2009-10-06 1:43 ` Dan Magenheimer
0 siblings, 1 reply; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-06 0:20 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Xen-Devel (E-mail), Keir Fraser
On 10/05/09 16:42, Dan Magenheimer wrote:
>> I've run into a few problems from this patch:
>>
>> 1. I'm seeing occasional messages on the console "hrtimer:
>> interrupt
>> too slow, forcing clock min delta to 9001953 ns" which indicates
>> that the kernel is noticing that timer operations are
>> taking too long.
>>
> Hmmm... that seems unlikely since it is highly probable that on
> ANY machine, emulated TSC is faster than other highres timers,
> for example, HPET. Perhaps it is a side effect of your assumptions
> described in (2) below?
>
The kernel is measuring all time using pvclock, so it is using rdtsc for
that too. Its simply noticing that rdtsc-to-rdtsc time is taking longer
than expected. (This is a domU, so it has no access to any other form
of time.)
>> 2. A domain can't turn on and off its own tsc emulation state. I'm
>> working on vsyscall support for pvclock (done, aside from this
>> issue), so I need native tsc in usermode (or at least, one with
>> the same parameters in kernel and userspace). I was
>> getting very
>> confused because I didn't expect emulation to *only* apply to
>> usermode; I was expecting it to be done uniformly to
>> both user and
>> kernel tscs, with appropriate adjustments to the vcpu_time_info
>> values.
>>
> Sorry, I should probably have said this explicitly in the patch
> prologue, but there is no (easy/fast) way to turn on emulation
> for userland and turn off emulation for the kernel, so rather
> than requiring a change to the pvclock ABI, rdtsc emulation
> returns a raw TSC value when the rdtsc was executed in kernel
> mode and Xen system time (nsec) when the rdtsc was executed
> in userland.
>
You misunderstand me. I was expecting that it would treat the tsc the
same in user and kernel mode, and return a set of appropriate pvclock
parameters. This would allow user and kernel tscs to be consistent.
That wouldn't requite any changes to the pvclock ABI and it would retain
the TSC's "global timestamp" property, even across kernel/usermode boundary.
Having them different is very awkard for tools which do things like
measure kernel->user latency by getting tsc timestamps, or as I'm trying
to do, use the pvclock parameters in userspace. You've effectively
added an entirely new "usermode tsc" vs "kernel mode tsc" concept to the
architecture.
But aside from that, all I'm asking for is a way for a domain to
explicitly request that its tsc not be synthesized (or failing that,
something that looks exactly like an unsynthesized tsc), so that
usermode pvclock can work without needing edits to the config file.
The current situation is very difficult because the kernel can't even
tell if usermode is getting the same tsc properties that it is.
> To ensure both correctness and maximum performance across
> a wide range of conditions, WITHOUT destroying backward
> compatiblity for the pvclock ABI, a decision tree similar
> to the one I just posted for apps could be employed.
>
Well, having a variance between usermode and kernel tscs does break
backwards compatibility and is very surprising.
> http://xen.markmail.org/message/uj4twbcsdw57z5zp
>
That looks very complicated.
> But this might rely on adding the same new Xen features
> described in the parent to that post.
>
> OTOH, if pvclock is executed sometimes in userland and
> sometimes in kernel (e.g. depending on the setting of
> a sysfs variable), it seems like some kind of decision
> tree is required anyway.
>
It is run in usermode as part of the normal vsyscall mechanism; the
kernel also uses the same information and tsc to compue time for
itself. At the moment the only test it makes is to check if Xen
supports the new hypercall I added to support this.
J
^ permalink raw reply [flat|nested] 37+ messages in thread* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-06 0:20 ` Jeremy Fitzhardinge
@ 2009-10-06 1:43 ` Dan Magenheimer
2009-10-06 4:56 ` Jeremy Fitzhardinge
2009-10-06 7:07 ` Keir Fraser
0 siblings, 2 replies; 37+ messages in thread
From: Dan Magenheimer @ 2009-10-06 1:43 UTC (permalink / raw)
To: Jeremy Fitzhardinge; +Cc: Xen-Devel (E-mail), Keir Fraser
> But aside from that, all I'm asking for is a way for a domain to
> explicitly request that its tsc not be synthesized (or failing that,
> something that looks exactly like an unsynthesized tsc), so that
> usermode pvclock can work without needing edits to the config file.
Maybe I'm misunderstanding but I think what you are
asking for is that the kernel override a setting
specified by the administrator. When that
setting (regardless of what you choose as a default)
is tsc_native==0, that means the administrator
has decided that correctness is more important
than performance, that now or sometime in the
future an app will use rdtsc and expect it to
behave like it does on either hardware with a
reliable TSC or like it does on VMware.
So in that circumstance, if pvclock+vsyscall is
dependent on an rdtsc, any rdtsc is slow and
the pvclock+vsyscall using rdtsc is slow.
I'm certainly open to a solution that solves both
problems, I just don't see one.
Perhaps a better choice would be for emulated tsc
to return Xen system time in both kernel and user
mode (which is what it does for HVM domains) and,
when a domain has tsc_native==0,
Xen sets its pvclock parameters so that no scaling
occurs? This results in the guest reporting that
it has a 1GHz clock, but may be more consistent.
> -----Original Message-----
> From: Jeremy Fitzhardinge [mailto:jeremy@goop.org]
> Sent: Monday, October 05, 2009 6:21 PM
> To: Dan Magenheimer
> Cc: Xen-Devel (E-mail); Keir Fraser
> Subject: Re: [Xen-devel] [PATCH] replace rdtsc emulation-vs-native xen
> boot option with per-domain (hypervisor part)
>
>
> On 10/05/09 16:42, Dan Magenheimer wrote:
> >> I've run into a few problems from this patch:
> >>
> >> 1. I'm seeing occasional messages on the console "hrtimer:
> >> interrupt
> >> too slow, forcing clock min delta to 9001953 ns"
> which indicates
> >> that the kernel is noticing that timer operations are
> >> taking too long.
> >>
> > Hmmm... that seems unlikely since it is highly probable that on
> > ANY machine, emulated TSC is faster than other highres timers,
> > for example, HPET. Perhaps it is a side effect of your assumptions
> > described in (2) below?
> >
>
> The kernel is measuring all time using pvclock, so it is
> using rdtsc for
> that too. Its simply noticing that rdtsc-to-rdtsc time is
> taking longer
> than expected. (This is a domU, so it has no access to any other form
> of time.)
>
> >> 2. A domain can't turn on and off its own tsc emulation
> state. I'm
> >> working on vsyscall support for pvclock (done, aside
> from this
> >> issue), so I need native tsc in usermode (or at
> least, one with
> >> the same parameters in kernel and userspace). I was
> >> getting very
> >> confused because I didn't expect emulation to *only* apply to
> >> usermode; I was expecting it to be done uniformly to
> >> both user and
> >> kernel tscs, with appropriate adjustments to the
> vcpu_time_info
> >> values.
> >>
> > Sorry, I should probably have said this explicitly in the patch
> > prologue, but there is no (easy/fast) way to turn on emulation
> > for userland and turn off emulation for the kernel, so rather
> > than requiring a change to the pvclock ABI, rdtsc emulation
> > returns a raw TSC value when the rdtsc was executed in kernel
> > mode and Xen system time (nsec) when the rdtsc was executed
> > in userland.
> >
>
> You misunderstand me. I was expecting that it would treat the tsc the
> same in user and kernel mode, and return a set of appropriate pvclock
> parameters. This would allow user and kernel tscs to be consistent.
> That wouldn't requite any changes to the pvclock ABI and it
> would retain
> the TSC's "global timestamp" property, even across
> kernel/usermode boundary.
>
> Having them different is very awkard for tools which do things like
> measure kernel->user latency by getting tsc timestamps, or as
> I'm trying
> to do, use the pvclock parameters in userspace. You've effectively
> added an entirely new "usermode tsc" vs "kernel mode tsc"
> concept to the
> architecture.
>
> But aside from that, all I'm asking for is a way for a domain to
> explicitly request that its tsc not be synthesized (or failing that,
> something that looks exactly like an unsynthesized tsc), so that
> usermode pvclock can work without needing edits to the config file.
>
> The current situation is very difficult because the kernel can't even
> tell if usermode is getting the same tsc properties that it is.
>
> > To ensure both correctness and maximum performance across
> > a wide range of conditions, WITHOUT destroying backward
> > compatiblity for the pvclock ABI, a decision tree similar
> > to the one I just posted for apps could be employed.
> >
> Well, having a variance between usermode and kernel tscs does break
> backwards compatibility and is very surprising.
>
> > http://xen.markmail.org/message/uj4twbcsdw57z5zp
> >
>
> That looks very complicated.
>
> > But this might rely on adding the same new Xen features
> > described in the parent to that post.
> >
> > OTOH, if pvclock is executed sometimes in userland and
> > sometimes in kernel (e.g. depending on the setting of
> > a sysfs variable), it seems like some kind of decision
> > tree is required anyway.
> >
>
> It is run in usermode as part of the normal vsyscall mechanism; the
> kernel also uses the same information and tsc to compue time for
> itself. At the moment the only test it makes is to check if Xen
> supports the new hypercall I added to support this.
>
> J
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-06 1:43 ` Dan Magenheimer
@ 2009-10-06 4:56 ` Jeremy Fitzhardinge
2009-10-06 7:07 ` Keir Fraser
1 sibling, 0 replies; 37+ messages in thread
From: Jeremy Fitzhardinge @ 2009-10-06 4:56 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Xen-Devel (E-mail), Keir Fraser
On 10/05/09 18:43, Dan Magenheimer wrote:
> Perhaps a better choice would be for emulated tsc
> to return Xen system time in both kernel and user
> mode (which is what it does for HVM domains) and,
> when a domain has tsc_native==0,
> Xen sets its pvclock parameters so that no scaling
> occurs? This results in the guest reporting that
> it has a 1GHz clock, but may be more consistent.
>
Yes, as I said:
> But aside from that, all I'm asking for is a way for a domain to
> explicitly request that its tsc not be synthesized (or failing that,
> something that looks exactly like an unsynthesized tsc), so that
> usermode pvclock can work without needing edits to the config file.
>
Having a consistent tsc between usermode and kernel will solve the
significant functional breakage that currently exists with tsc synthesis
enabled, making it just a question of performance (and resolution).
I still think tsc_native=1 is the correct setting for the vast majority
of PV guest domains, and enabling it by default is a bad idea.
J
^ permalink raw reply [flat|nested] 37+ messages in thread* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-06 1:43 ` Dan Magenheimer
2009-10-06 4:56 ` Jeremy Fitzhardinge
@ 2009-10-06 7:07 ` Keir Fraser
2009-10-06 9:09 ` Keir Fraser
1 sibling, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-10-06 7:07 UTC (permalink / raw)
To: Dan Magenheimer, Jeremy Fitzhardinge; +Cc: Xen-Devel (E-mail)
On 06/10/2009 02:43, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:
> Perhaps a better choice would be for emulated tsc
> to return Xen system time in both kernel and user
> mode (which is what it does for HVM domains) and,
> when a domain has tsc_native==0,
> Xen sets its pvclock parameters so that no scaling
> occurs? This results in the guest reporting that
> it has a 1GHz clock, but may be more consistent.
Yes, this has to be fixed. There's no good reason for different TSC
behaviours between kernel and userspace when both are emulated. If nothing
else, when Jeremy's patches go in (which I am sure they will, as the concept
is sane and the implementation appears so too) then that is going to be
incompatible with emulated tsc as it behaves currently.
-- Keir
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-06 7:07 ` Keir Fraser
@ 2009-10-06 9:09 ` Keir Fraser
2009-10-06 13:49 ` Dan Magenheimer
0 siblings, 1 reply; 37+ messages in thread
From: Keir Fraser @ 2009-10-06 9:09 UTC (permalink / raw)
To: Dan Magenheimer, Jeremy Fitzhardinge; +Cc: Xen-Devel (E-mail)
On 06/10/2009 08:07, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
>> Perhaps a better choice would be for emulated tsc
>> to return Xen system time in both kernel and user
>> mode (which is what it does for HVM domains) and,
>> when a domain has tsc_native==0,
>> Xen sets its pvclock parameters so that no scaling
>> occurs? This results in the guest reporting that
>> it has a 1GHz clock, but may be more consistent.
>
> Yes, this has to be fixed. There's no good reason for different TSC
> behaviours between kernel and userspace when both are emulated. If nothing
> else, when Jeremy's patches go in (which I am sure they will, as the concept
> is sane and the implementation appears so too) then that is going to be
> incompatible with emulated tsc as it behaves currently.
Should be fixed by c/s 20271.
-- Keir
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-06 9:09 ` Keir Fraser
@ 2009-10-06 13:49 ` Dan Magenheimer
2009-10-06 13:59 ` Keir Fraser
0 siblings, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-10-06 13:49 UTC (permalink / raw)
To: Keir Fraser; +Cc: Jeremy Fitzhardinge, Xen-Devel (E-mail)
> >> Perhaps a better choice would be for emulated tsc
> >> to return Xen system time in both kernel and user
> >> mode (which is what it does for HVM domains) and,
> >> when a domain has tsc_native==0,
> >> Xen sets its pvclock parameters so that no scaling
> >> occurs? This results in the guest reporting that
> >> it has a 1GHz clock, but may be more consistent.
> >
> > Yes, this has to be fixed. There's no good reason for different TSC
> > behaviours between kernel and userspace when both are
> emulated. If nothing
> > else, when Jeremy's patches go in (which I am sure they
> will, as the concept
> > is sane and the implementation appears so too) then that is
> going to be
> > incompatible with emulated tsc as it behaves currently.
>
> Should be fixed by c/s 20271.
Excellent. Thanks.
But in reviewing this patch, I noticed that you removed
the vtsc_stime_offset field in struct arch_domain.
This was intended to record the offset across migration.
But a per-domain stime offset must already be recorded
somewhere else, or hvm fully-emulated platform timers
would be broken across migration.
Do pv guests need something similar or is it magically
handled someplace that I couldn't quickly find?
Thanks,
Dan
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-06 13:49 ` Dan Magenheimer
@ 2009-10-06 13:59 ` Keir Fraser
2009-10-08 23:35 ` Dan Magenheimer
2009-11-05 17:08 ` Dan Magenheimer
0 siblings, 2 replies; 37+ messages in thread
From: Keir Fraser @ 2009-10-06 13:59 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Jeremy Fitzhardinge, Xen-Devel (E-mail)
On 06/10/2009 14:49, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:
> This was intended to record the offset across migration.
> But a per-domain stime offset must already be recorded
> somewhere else, or hvm fully-emulated platform timers
> would be broken across migration.
>
> Do pv guests need something similar or is it magically
> handled someplace that I couldn't quickly find?
PV guests understand that there will be a system-time discontinuity across
save/restore. All I did was remove an unused field. You can reintro it when
you use it.
-- Keir
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-06 13:59 ` Keir Fraser
@ 2009-10-08 23:35 ` Dan Magenheimer
2009-11-05 17:08 ` Dan Magenheimer
1 sibling, 0 replies; 37+ messages in thread
From: Dan Magenheimer @ 2009-10-08 23:35 UTC (permalink / raw)
To: Keir Fraser; +Cc: Xen-Devel (E-mail)
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>
> On 06/10/2009 14:49, "Dan Magenheimer"
> <dan.magenheimer@oracle.com> wrote:
>
> > This was intended to record the offset across migration.
> > But a per-domain stime offset must already be recorded
> > somewhere else, or hvm fully-emulated platform timers
> > would be broken across migration.
> >
> > Do pv guests need something similar or is it magically
> > handled someplace that I couldn't quickly find?
>
> PV guests understand that there will be a system-time
> discontinuity across
> save/restore. All I did was remove an unused field. You can
> reintro it when
> you use it.
Interestingly, I find that there is a discontinuity for
HVM guests as well (with either tsc_native=0 or
tsc_native=1). This can be demonstrated easily by
running a rdtsc loop in a user program that fails
if time goes backwards or jumps forward too far,
then doing a save/restore.
I had assumed that the VT-x tsc_offset mechanism would
handle the discontinuity but apparently it was never
implemented. (I didn't test live migration, so maybe
the mechanism IS used if time goes backwards, but
I suspect not.)
Since TSC is used for inter-jiffie interpolation on
many Linux systems (and the system I ran the test
on reports "Using 3.5xxx MHz WALL PM GTOD PIT/TSC timer"),
I'm surprised a huge discontinuity doesn't cause at
least some interesting problem for some HVM OS's.
Oops, it does. I see a soft lockup reported when I
save/restore an HVM domain (tested with tsc_native=1,
e.g. no tsc emulation).
Looks like another one for the "fix the tsc" list...
Dan
^ permalink raw reply [flat|nested] 37+ messages in thread
* RE: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-10-06 13:59 ` Keir Fraser
2009-10-08 23:35 ` Dan Magenheimer
@ 2009-11-05 17:08 ` Dan Magenheimer
2009-11-06 7:01 ` Keir Fraser
1 sibling, 1 reply; 37+ messages in thread
From: Dan Magenheimer @ 2009-11-05 17:08 UTC (permalink / raw)
To: Keir Fraser; +Cc: Jeremy Fitzhardinge, Xen-Devel (E-mail)
> On 06/10/2009 14:49, "Dan Magenheimer"
> <dan.magenheimer@oracle.com> wrote:
>
> > This was intended to record the offset across migration.
> > But a per-domain stime offset must already be recorded
> > somewhere else, or hvm fully-emulated platform timers
> > would be broken across migration.
> >
> > Do pv guests need something similar or is it magically
> > handled someplace that I couldn't quickly find?
>
> PV guests understand that there will be a system-time
> discontinuity across
> save/restore. All I did was remove an unused field. You can
> reintro it when
> you use it.
Finally got back to looking at this...
For an HVM, the relevant time values are saved/restored
as part of the cpu state.
For a PV, I'm thinking about adding them to the
domctl_getdomaininfo struct rather than add additional
domctls for each. Is this the proper place or will
changes to this struct result in backward compatibility
problems or ??
Thanks,
Dan
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part)
2009-11-05 17:08 ` Dan Magenheimer
@ 2009-11-06 7:01 ` Keir Fraser
0 siblings, 0 replies; 37+ messages in thread
From: Keir Fraser @ 2009-11-06 7:01 UTC (permalink / raw)
To: Dan Magenheimer; +Cc: Jeremy Fitzhardinge, Xen-Devel (E-mail)
On 05/11/2009 17:08, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:
> For a PV, I'm thinking about adding them to the
> domctl_getdomaininfo struct rather than add additional
> domctls for each. Is this the proper place or will
> changes to this struct result in backward compatibility
> problems or ??
It would be fine as we provide no compatibility guarantees for any domctl or
sysctl across consecutive major Xen releases.
-- Keir
^ permalink raw reply [flat|nested] 37+ messages in thread
end of thread, other threads:[~2009-11-06 7:01 UTC | newest]
Thread overview: 37+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-09-27 19:22 [PATCH] replace rdtsc emulation-vs-native xen boot option with per-domain (hypervisor part) Dan Magenheimer
2009-09-27 21:39 ` Keir Fraser
2009-09-27 23:19 ` Dan Magenheimer
2009-09-28 7:25 ` Keir Fraser
2009-09-28 9:04 ` Keir Fraser
2009-09-28 15:48 ` Dan Magenheimer
2009-09-28 16:07 ` Keir Fraser
2009-09-28 16:24 ` Dan Magenheimer
2009-09-28 19:44 ` Dan Magenheimer
2009-09-28 20:55 ` Keir Fraser
2009-09-28 8:22 ` Zhang, Xiantao
2009-09-28 15:13 ` Dan Magenheimer
2009-09-28 22:24 ` Jeremy Fitzhardinge
2009-09-28 22:43 ` Dan Magenheimer
2009-09-28 23:10 ` Jeremy Fitzhardinge
2009-09-28 23:36 ` Dan Magenheimer
2009-09-29 0:27 ` Ian Pratt
2009-09-29 1:42 ` Zhang, Xiantao
2009-09-29 3:13 ` Dan Magenheimer
2009-09-29 17:24 ` Jeremy Fitzhardinge
2009-09-29 16:44 ` Jeremy Fitzhardinge
2009-09-29 17:34 ` Dan Magenheimer
2009-09-29 19:01 ` Jeremy Fitzhardinge
2009-09-29 23:45 ` Dan Magenheimer
2009-09-30 0:41 ` Jeremy Fitzhardinge
2009-10-05 23:15 ` Jeremy Fitzhardinge
2009-10-05 23:42 ` Dan Magenheimer
2009-10-06 0:20 ` Jeremy Fitzhardinge
2009-10-06 1:43 ` Dan Magenheimer
2009-10-06 4:56 ` Jeremy Fitzhardinge
2009-10-06 7:07 ` Keir Fraser
2009-10-06 9:09 ` Keir Fraser
2009-10-06 13:49 ` Dan Magenheimer
2009-10-06 13:59 ` Keir Fraser
2009-10-08 23:35 ` Dan Magenheimer
2009-11-05 17:08 ` Dan Magenheimer
2009-11-06 7:01 ` Keir Fraser
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.