[PATCH] Fix hvm guest time to be more accurate

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] Fix hvm guest time to be more accurate
@ 2007-10-24 21:15 Ben Guthro
  2007-10-25  5:52 ` Dong, Eddie
  0 siblings, 1 reply; 14+ messages in thread
From: Ben Guthro @ 2007-10-24 21:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Dave Winchell

[-- Attachment #1: Type: text/plain, Size: 640 bytes --]

The vpt timer code in effect accumulates missed ticks
when a guest is running but has interrupts disabled
or when the platform timer is starved. For guests
like 64 bit Linux which calculates missed ticks on each
clock interrupt based on the current tsc and the tsc
of the last interrupt and then adds missed ticks to jiffies
there is redundant accounting.

This change subtracts off the hypervisor calculated missed
ticks while guest running for 64 bit guests using the pit.
Missed ticks when vcpu 0 is descheduled are unaffected.

Signed-off-by: Ben Guthro <bguthro@virtualiron.com>
Signed-off-by: Dave Winchell <dwinchell@virtualiron.com>

[-- Attachment #2: xen-platform-time.patch --]
[-- Type: text/x-patch, Size: 8750 bytes --]

diff -r c42fcc739fc4 tools/firmware/hvmloader/acpi/static_tables.c
--- a/tools/firmware/hvmloader/acpi/static_tables.c	Tue Oct 23 08:16:39 2007 -0400
+++ b/tools/firmware/hvmloader/acpi/static_tables.c	Tue Oct 23 08:16:39 2007 -0400
@@ -93,7 +93,7 @@ struct acpi_20_fadt Fadt = {
     },
 
     .x_pm_tmr_blk = {
-        .address_space_id    = ACPI_SYSTEM_IO,
+        .address_space_id    = 0xff,
         .register_bit_width  = ACPI_PM_TMR_BLK_BIT_WIDTH,
         .register_bit_offset = ACPI_PM_TMR_BLK_BIT_OFFSET,
         .address             = ACPI_PM_TMR_BLK_ADDRESS,
diff -r c42fcc739fc4 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c	Tue Oct 23 08:16:39 2007 -0400
+++ b/xen/arch/x86/hvm/i8254.c	Tue Oct 23 08:16:39 2007 -0400
@@ -405,6 +405,8 @@ static void pit_info(PITState *pit)
     struct hvm_hw_pit_channel *s;
     struct periodic_time *pt;
     int i;
+    struct periodic_time *pt;
+    unsigned long now;
 
     for ( i = 0; i < 3; i++ )
     {
@@ -447,11 +449,18 @@ static int pit_save(struct domain *d, hv
 {
     PITState *pit = domain_vpit(d);
     int rc;
+    unsigned long now;
+    struct periodic_time *pt;
 
     spin_lock(&pit->lock);
     
     pit_info(pit);
 
+    pt = &pit->pt0;
+    rdtscll(now);
+    pit->hw.pt_delivered = pt->delivered - now;
+    pit->hw.pt_frozen = pt->frozen - now;
+
     /* Save the PIT hardware state */
     rc = hvm_save_entry(PIT, 0, h, &pit->hw);
 
@@ -464,6 +473,8 @@ static int pit_load(struct domain *d, hv
 {
     PITState *pit = domain_vpit(d);
     int i;
+    struct periodic_time *pt;
+    unsigned long now;
 
     spin_lock(&pit->lock);
 
@@ -481,6 +492,11 @@ static int pit_load(struct domain *d, hv
     for ( i = 0; i < 3; i++ )
         pit_load_count(pit, i, pit->hw.channels[i].count);
 
+    pt = &pit->pt0;
+    rdtscll(now);
+    pt->delivered = now + pit->hw.pt_delivered;
+    pt->frozen = now + pit->hw.pt_frozen;
+
     pit_info(pit);
 
     spin_unlock(&pit->lock);
@@ -514,6 +530,18 @@ void pit_init(struct vcpu *v, unsigned l
     }
 
     spin_unlock(&pit->lock);
+}
+
+struct periodic_time *pit_get_timer(struct vcpu *v)
+{
+    PITState *pit = &v->domain->arch.hvm_domain.pl_time.vpit;
+    struct periodic_time *pt;
+
+    pt = &pit->pt0;
+    if ( pt->vcpu == v && pt->enabled )
+	return pt;
+    else
+	return NULL;
 }
 
 void pit_deinit(struct domain *d)
diff -r c42fcc739fc4 xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c	Tue Oct 23 08:16:39 2007 -0400
+++ b/xen/arch/x86/hvm/vpt.c	Tue Oct 23 08:18:49 2007 -0400
@@ -54,16 +54,7 @@ static void missed_ticks(struct periodic
         return;
 
     missed_ticks = missed_ticks / (s_time_t) pt->period + 1;
-    if ( missed_ticks > 1000 )
-    {
-        /* TODO: Adjust guest time together */
-        pt->pending_intr_nr++;
-    }
-    else
-    {
-        pt->pending_intr_nr += missed_ticks;
-    }
-
+    pt->pending_intr_nr += missed_ticks;
     pt->scheduled += missed_ticks * pt->period;
 }
 
@@ -71,6 +62,7 @@ void pt_freeze_time(struct vcpu *v)
 {
     struct list_head *head = &v->arch.hvm_vcpu.tm_list;
     struct periodic_time *pt;
+    unsigned long now;
 
     if ( test_bit(_VPF_blocked, &v->pause_flags) )
         return;
@@ -79,8 +71,12 @@ void pt_freeze_time(struct vcpu *v)
 
     v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v);
 
-    list_for_each_entry ( pt, head, list )
+    rdtscll(now);
+    list_for_each_entry ( pt, head, list )
+    {
         stop_timer(&pt->timer);
+	pt->frozen = now;
+    }
 
     spin_unlock(&v->arch.hvm_vcpu.tm_lock);
 }
@@ -89,6 +85,7 @@ void pt_thaw_time(struct vcpu *v)
 {
     struct list_head *head = &v->arch.hvm_vcpu.tm_list;
     struct periodic_time *pt;
+    unsigned long now, delta;
 
     spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
@@ -97,10 +94,14 @@ void pt_thaw_time(struct vcpu *v)
         hvm_set_guest_time(v, v->arch.hvm_vcpu.guest_time);
         v->arch.hvm_vcpu.guest_time = 0;
 
+	rdtscll(now);
         list_for_each_entry ( pt, head, list )
         {
             missed_ticks(pt);
             set_timer(&pt->timer, pt->scheduled);
+	    delta = now - pt->frozen;
+	    if(pt->delivered)
+		pt->delivered += delta;
         }
     }
 
@@ -158,6 +159,57 @@ void pt_update_irq(struct vcpu *v)
         hvm_isa_irq_assert(v->domain, irq);
     }
 }
+#include <asm/paging.h>
+struct periodic_time *pit_get_timer(struct vcpu *v);
+int pt_irq_subtract(struct vcpu *v, struct periodic_time *pt_handled)
+{
+    struct periodic_time *pt;
+    unsigned long delta_us;
+    unsigned long period_us;
+    int new_nr;
+    unsigned long now, delta;
+    unsigned long ticks, offset;
+    int ret = 0;
+
+    /* 64bit Linux guests calculate missed ticks in the clock interrupt handler
+     * and bump jiffies accordingly while 32bit Linux guests do not.
+     * If the (64bit) guest cpu0 has interrupts disabled for longer than two clock
+     * periods, and cpu0 is running, then since the tsc continues, the guest will
+     * find missed_ticks > 1 at the first clock interrupt. But the pt timer has continued
+     * to expire regularly and accumulated the missed interrupts in pending_intr_nr.
+     * If we deliver these accumulated interrupts the guest will run fast.
+     * Here we subtract off the missed interrupts for 64 bit guests using pit.
+     */
+
+    if(v->arch.paging.mode->guest_levels != 4)
+	return ret;
+    pt = pit_get_timer(v);
+    if(pt)
+	ret = 1;
+    if(pt == pt_handled) {
+	rdtscll(now);
+	if(!pt->delivered) {
+	    pt->delivered = now;
+	    return ret;
+	}
+	delta = now - pt->delivered;
+	pt->delivered = now;
+	delta_us = (delta * 1000UL)/(unsigned long)cpu_khz;
+	period_us = pt->period/1000UL; /* ns to usec*/
+	ticks = delta_us/period_us;
+	offset = delta_us % period_us;
+	if(ticks < 2)
+	    return ret;
+	ticks -= 1;
+	pt->delivered = now - (offset * (unsigned long)cpu_khz)/1000UL;
+	new_nr = pt->pending_intr_nr - ticks;
+	if(new_nr < 1)
+	    ticks = ticks + new_nr - 1;
+	pt->pending_intr_nr -= ticks;
+	pt->last_plt_gtime += ticks * pt->period_cycles;
+    }
+    return ret;
+}
 
 static struct periodic_time *is_pt_irq(
     struct vcpu *v, struct hvm_intack intack)
@@ -197,6 +249,7 @@ void pt_intr_post(struct vcpu *v, struct
     struct periodic_time *pt;
     time_cb *cb;
     void *cb_priv;
+    int pit_only;
 
     spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
@@ -207,6 +260,7 @@ void pt_intr_post(struct vcpu *v, struct
         return;
     }
 
+    pit_only = pt_irq_subtract(v, pt);
     if ( pt->one_shot )
     {
         pt->enabled = 0;
@@ -218,8 +272,12 @@ void pt_intr_post(struct vcpu *v, struct
         pt->last_plt_gtime += pt->period_cycles;
     }
 
-    if ( hvm_get_guest_time(v) < pt->last_plt_gtime )
-        hvm_set_guest_time(v, pt->last_plt_gtime);
+    if(pit_only) {
+	if((pt == pit_get_timer(v)) && (hvm_get_guest_time(pt->vcpu) < pt->last_plt_gtime))
+	    hvm_set_guest_time(pt->vcpu, pt->last_plt_gtime);
+    }
+    else if(hvm_get_guest_time(pt->vcpu) < pt->last_plt_gtime)
+	hvm_set_guest_time(pt->vcpu, pt->last_plt_gtime);
 
     cb = pt->cb;
     cb_priv = pt->priv;
diff -r c42fcc739fc4 xen/include/asm-x86/hvm/vpt.h
--- a/xen/include/asm-x86/hvm/vpt.h	Tue Oct 23 08:16:39 2007 -0400
+++ b/xen/include/asm-x86/hvm/vpt.h	Tue Oct 23 08:16:39 2007 -0400
@@ -76,7 +76,7 @@ struct periodic_time {
     char one_shot;              /* one shot time */
     u8 irq;
     struct vcpu *vcpu;          /* vcpu timer interrupt delivers to */
-    u32 pending_intr_nr;        /* the couner for pending timer interrupts */
+    unsigned int pending_intr_nr; /* the couner for pending timer interrupts */
     u64 period;                 /* frequency in ns */
     u64 period_cycles;          /* frequency in cpu cycles */
     s_time_t scheduled;         /* scheduled timer interrupt */
@@ -84,6 +84,8 @@ struct periodic_time {
     struct timer timer;         /* ac_timer */
     time_cb *cb;
     void *priv;                 /* point back to platform time source */
+    unsigned long delivered;
+    unsigned long frozen;
 };
 
 
diff -r c42fcc739fc4 xen/include/public/arch-x86/hvm/save.h
--- a/xen/include/public/arch-x86/hvm/save.h	Tue Oct 23 08:16:39 2007 -0400
+++ b/xen/include/public/arch-x86/hvm/save.h	Tue Oct 23 08:34:05 2007 -0400
@@ -156,6 +156,8 @@ struct hvm_hw_cpu {
     };
     /* error code for pending event */
     uint32_t error_code;
+    unsigned long pt_delivered;
+    unsigned long pt_frozen;    
 };
 
 DECLARE_HVM_SAVE_TYPE(CPU, 2, struct hvm_hw_cpu);
@@ -342,6 +344,8 @@ struct hvm_hw_pit {
     } channels[3];  /* 3 x 16 bytes */
     uint32_t speaker_data_on;
     uint32_t pad0;
+    unsigned long pt_delivered;
+    unsigned long pt_frozen;
 };
 
 DECLARE_HVM_SAVE_TYPE(PIT, 10, struct hvm_hw_pit);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] Fix hvm guest time to be more accurate
  2007-10-24 21:15 [PATCH] Fix hvm guest time to be more accurate Ben Guthro
@ 2007-10-25  5:52 ` Dong, Eddie
  2007-10-25 14:45   ` Dave Winchell
  0 siblings, 1 reply; 14+ messages in thread
From: Dong, Eddie @ 2007-10-25  5:52 UTC (permalink / raw)
  To: Ben Guthro, xen-devel; +Cc: Dave Winchell

>
>The vpt timer code in effect accumulates missed ticks
>when a guest is running but has interrupts disabled
>or when the platform timer is starved. For guests

This case, VMM will pick up the lost ticks into pending_intr_nr.
The only issue is that if a guest is suspended or save/restored
for long time such as several hours or days, we may see tons 
of lost ticks, which is difficult to be injected back (cost minutes
of times or even longer). So we give up those amount of 
pending_intr_nr.  In all above case, guest need to re-sync its
timer with others like network time for example. So it is 
harmless.

Similar situation happens when somebody is debugging a guest.

>like 64 bit Linux which calculates missed ticks on each
>clock interrupt based on the current tsc and the tsc
>of the last interrupt and then adds missed ticks to jiffies
>there is redundant accounting.
>
>This change subtracts off the hypervisor calculated missed
>ticks while guest running for 64 bit guests using the pit.
>Missed ticks when vcpu 0 is descheduled are unaffected.
>
I think this one is not the right direction.

The problem in time virtualization is that we don't know how the guest
will use it.
Latest 64 bit Linux can pick up the missed ticks from TSC like you
mentioned, but it is not true for other 64 bits guest even linux 
such as 2.6.16, nor for Windows.

Besides PV timer approach which is not always ready, basically
we have 3 HVM time virtualization approaches:

1: Current one:
	Freeze guest time when the guest is descheduled and
thus sync all guest time resource together. This one
precisely solve the guest time cross-reference issues, guest TSC
precisely represent guest time and thus can be cross-referenced
 in guest to pick up lossed ticks if have. but the logic 
is relatively complicated and is easy to see bugs :-(


2: Pin guest time to host time.
	This is simplest approach, guest TSC is always pinned to
host TSC with a fixed offset no matter the vCPU is descheduled or
not. In this case, other guest periodic IRQ driven time resource 
are not synced to guest TSC.
	Base on this, we have 2 deviations:
	A: Accumulate pending_intr_nr like current #1 approach.
	B: Give up accumulated pending_intr_nr. We only inject
one IRQ for a periodic IRQ driven guest time such as PIT.

	What you mentioned here is a special case of 2B.

	Since we don't know how the guest behaves, what we have been
proposing recently is to implement all of the above, and let administration
tools choose the one to use based on knowledge of the guest OS
type.

thanks, eddie

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Fix hvm guest time to be more accurate
  2007-10-25  5:52 ` Dong, Eddie
@ 2007-10-25 14:45   ` Dave Winchell
  2007-10-26  6:48     ` Dong, Eddie
  0 siblings, 1 reply; 14+ messages in thread
From: Dave Winchell @ 2007-10-25 14:45 UTC (permalink / raw)
  To: Dong, Eddie; +Cc: xen-devel, Ben Guthro

Hi Doug,

Thanks for these comments.

Dong, Eddie wrote:

>>The vpt timer code in effect accumulates missed ticks
>>when a guest is running but has interrupts disabled
>>or when the platform timer is starved. For guests
>>    
>>
>
>This case, VMM will pick up the lost ticks into pending_intr_nr.
>The only issue is that if a guest is suspended or save/restored
>for long time such as several hours or days, we may see tons 
>of lost ticks, which is difficult to be injected back (cost minutes
>of times or even longer). So we give up those amount of 
>pending_intr_nr.  In all above case, guest need to re-sync its
>timer with others like network time for example. So it is 
>harmless.
>
>Similar situation happens when somebody is debugging a guest.
>  
>
The solution we provided removes the one second limit on missed ticks.
Our testing showed that this is often exceeded under some loads,
such as many guests, each running loads. Setting missed ticks to 1 tick
when 1000 is exceeded is a source of timing error. In the code,
where it's set to one, there is a "TODO: Adjust guest time together"
comment, but no action.

In terms of re-syncing with network time, our goal was to have the
timekeeping accurate enough so that the guest could run ntpd.
To do that, the under lying timekeeping needs to be accurate to .05%,
or so. Our measurements show that with this patch the core timekeeping is
accurate to .02%, approximately, even under loads where many guests run 
loads.
Without this patch, timekeeping is off by more than 10% and ntpd cannot
sync it.

>  
>
>>like 64 bit Linux which calculates missed ticks on each
>>clock interrupt based on the current tsc and the tsc
>>of the last interrupt and then adds missed ticks to jiffies
>>there is redundant accounting.
>>
>>This change subtracts off the hypervisor calculated missed
>>ticks while guest running for 64 bit guests using the pit.
>>Missed ticks when vcpu 0 is descheduled are unaffected.
>>
>>    
>>
>I think this one is not the right direction.
>
>The problem in time virtualization is that we don't how guest will use
>it.
>Latest 64 bit Linux can pick up the missed ticks from TSC like you
>mentioned, but it is not true for other 64 bits guest even linux 
>such as 2.6.16, nor for Windows.
>  
>
Ours is a specific solution.
Let me explain our logic.

We configure all our Linux guests with clock=pit.

The 32bit Linux guests we run don't calculate missed ticks and so
don't need cancellation. All the 64bit Linux guests that we run
calculate missed ticks and need cancellation.
I just checked 2.6.16 and it does calculate missed ticks in
arch/x86_64/kernel/time.c, main_timer_handler(), when using pit for
timekeeping.

The missed ticks cancellation code is activated in this patch when the
guest has configured the pit for timekeeping and the guest has four
level page tables (ie 64 bit).

The windows guests we run use rtc for timekeeping and don't need
or get cancellation.

So the simplifying assumption here is that a 64bit guest using pit is 
calculating
missed ticks.

I would be in favor of a method where xen is told directly whether to do
missed ticks cancellation. Perhaps its part of the guest configuration
information.

>Besides PV timer approach which is not always ready, basically
>we have 3 HVM time virtualization approaches:
>
>1: Current one:
>	Freeze guest time when the guest is descheduled and
>thus sync all guest time resource together. This one
>precisely solve the guest time cross-reference issues, guest TSC
>precisely represent guest time and thus can be cross-referenced
> in guest to pick up lossed ticks if have. but the logic 
>is relatively complicated and is easy to see bugs :-(
>
>
>2: Pin guest time to host time.
>	This is simplest approach, guest TSC is always pinned to
>host TSC with a fixed offset no matter the vCPU is descheduled or
>not. In this case, other guest periodic IRQ driven time resource 
>are not synced to guest TSC.
>	Base on this, we have 2 deviations:
>	A: Accumulate pending_intr_nr like current #1 approach.
>	B: Give up accumulated pending_intr_nr. We only inject
>one IRQ for a periodic IRQ driven guest time such as PIT.
>
>	What you mentioned here is a special case of 2B.
>
>	Since we don't know how guest behaviors, what we are
>proposing recently is to implement all of above, and let administrate
>tools to choose the one to use base on knowledge of guest OS
>type. 
>
>thanks, eddie
>  
>
I agree with you on having various policies for timekeeping based on
the guest being run.

This patch addresses specifically the problem
of pit users who calculate missed ticks. Note that in the solution,
de-scheduled missed ticks are not canceled, they are still needed
as the tsc is continuous in the current methods. We are only canceling those
pending_intr_nr that accumulate while the guest is running. These are due
to inaccuracies in the xen time expirations due to interrupt loads or 
long dom0
interrupt disable periods. They are also due to extended periods where 
the guest
has interrupts disabled. In these cases, as the tsc has been running,
the guest will calculated missed ticks at the time of first clock interrupt
injection and then xen will deliver pending_intr_nr additional 
interrupts resulting
in jiffies moving by 2*pending_intr_nr instead of the desired 
pending_intr_nr.

regards,
Dave

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] Fix hvm guest time to be more accurate
  2007-10-25 14:45   ` Dave Winchell
@ 2007-10-26  6:48     ` Dong, Eddie
  2007-10-26 13:56       ` Dave Winchell
  0 siblings, 1 reply; 14+ messages in thread
From: Dong, Eddie @ 2007-10-26  6:48 UTC (permalink / raw)
  To: Dave Winchell; +Cc: xen-devel, Ben Guthro

Dave Winchell wrote:
> Hi Doug,
> 
> Thanks for these comments.
> 
> Dong, Eddie wrote:
> 
>>> The vpt timer code in effect accumulates missed ticks
>>> when a guest is running but has interrupts disabled
>>> or when the platform timer is starved. For guests
>>> 
>>> 
>> 
>> This case, VMM will pick up the lost ticks into pending_intr_nr.
>> The only issue is that if a guest is suspended or save/restored
>> for long time such as several hours or days, we may see tons
>> of lost ticks, which is difficult to be injected back (cost minutes
>> of times or even longer). So we give up those amount of
>> pending_intr_nr.  In all above case, guest need to re-sync its
>> timer with others like network time for example. So it is
>> harmless.
>> 
>> Similar situation happens when somebody is debugging a guest.
>> 
>> 
> The solution we provided removes the one second limit on missed ticks.
> Our testing showed that this is often exceeded under some loads,
> such as many guests, each running loads. Setting missed ticks to 1
> tick when 1000 is exceeded is a source of timing error. In the code,
> where its set to one there is a TBD sync with guest comment, but no
> action. 

That is possible, so we should increase 1000 to something bigger.
Would making it around 10s be OK?

> 
> In terms of re-syncing with network time, our goal was to have the
> timekeeping accurate enough so that the guest could run ntpd.
> To do that, the under lying timekeeping needs to be accurate to .05%,
> or so. Our measurements show that with this patch the core
> timekeeping is
> accurate to .02%, approximately, even under loads where many
> guests run
> loads.
> Without this patch, timekeeping is off by more than 10% and ntpd
> cannot sync it. 
> 
>> 
>> 
>>> like 64 bit Linux which calculates missed ticks on each
>>> clock interrupt based on the current tsc and the tsc
>>> of the last interrupt and then adds missed ticks to jiffies
>>> there is redundant accounting.
>>> 
>>> This change subtracts off the hypervisor calculated missed
>>> ticks while guest running for 64 bit guests using the pit.
>>> Missed ticks when vcpu 0 is descheduled are unaffected.
>>> 
>>> 
>>> 
>> I think this one is not the right direction.
>> 
>> The problem in time virtualization is that we don't how guest will
>> use it. Latest 64 bit Linux can pick up the missed ticks from TSC
>> like you mentioned, but it is not true for other 64 bits guest even
>> linux 
>> such as 2.6.16, nor for Windows.
>> 
>> 
> Ours is a specific solution.
> Let me explain our logic.

Yes, it can fit for some situation :-)
But I think we need a generic solution.

How to choose the time virtualization policy can be argued.
And we may use some experimental data. What you found
is definitely one of the good data :-)

> 
> We configure all our Linux guests with clock=pit.

Just curious: why you favor PIT instead of HPET?
Does HPET bring more deviation?

> 
> The 32bit Linux guests we run don't calculate missed ticks and so
> don't need cancellation. All the 64bit Linux guests that we run
> calculate missed ticks and need cancellation.
> I just checked 2.26.16 and it does calculate missed ticks in
> arch/x86_64/lermel/time.c, main_timer_handler(), when using pit for
> timekeeping. 

But this is reported as lost ticks which will printk something.
In theory with guest TSC synchronized with guest periodic
timer. This issue can be removed, but somehow (maybe bug
or virtualization overhead) we may still see them :-(

> 
> The missed ticks cancellation code is activated in this patch when the
> guest has configured the pit for timekeeping and the guest has four
> level page tables (ie 64 bit).
> 
> The windows guests we run use rtc for timekeeping and don't need
> or get cancellation.
> 
> So the simplifying assumption here is that a 64bit guest using pit is
> calculating missed ticks.
> 
> I would be in favor of a method where xen is told directly
> whether to do
> missed ticks cancellation. Perhaps its part of the guest
> configuration information. 
> 
>> Besides PV timer approach which is not always ready, basically
>> we have 3 HVM time virtualization approaches:
>> 
>> 1: Current one:
>> 	Freeze guest time when the guest is descheduled and
>> thus sync all guest time resource together. This one
>> precisely solve the guest time cross-reference issues, guest TSC
>> precisely represent guest time and thus can be cross-referenced
>> in guest to pick up lossed ticks if have. but the logic
>> is relatively complicated and is easy to see bugs :-(
>> 
>> 
>> 2: Pin guest time to host time.
>> 	This is simplest approach, guest TSC is always pinned to
>> host TSC with a fixed offset no matter the vCPU is descheduled or
>> not. In this case, other guest periodic IRQ driven time resource
>> are not synced to guest TSC.
>> 	Base on this, we have 2 deviations:
>> 	A: Accumulate pending_intr_nr like current #1 approach.
>> 	B: Give up accumulated pending_intr_nr. We only inject
>> one IRQ for a periodic IRQ driven guest time such as PIT.
>> 
>> 	What you mentioned here is a special case of 2B.
>> 
>> 	Since we don't know how guest behaviors, what we are
>> proposing recently is to implement all of above, and let administrate
>> tools to choose the one to use base on knowledge of guest OS
>> type.
>> 
>> thanks, eddie
>> 
>> 
> I agree with you on having various policies for timekeeping based on
> the guest being run. 
> 
> This patch addresses specifically the problem
> of pit users who calculate missed ticks. Note that in the solution,
> de-scheduled missed ticks are not canceled, they are still needed
> as the tsc is continuous in the current methods. We are only

If we rely on guest to pick up the lost ticks, why not just do it
thoroughly?
i.e. even descheduled missed ticks can rely on the guest to pick them up.

That is what 2.B proposed.
In some cases, we saw issues in Windows (XP32) with 2B, guest wall clock
becomes slow. Maybe XP64 behaviors different like you saw, but we need
windows expert to double check.

Some rough idea in my mind is:
	Policy #1 works best for 32 bits Linux (and old 64 bits Linux).
	Policy #2B works for latest 64 bits Linux.
	Policy #2A works for Windows (32 & 64 bits).

> canceling those
> pending_intr_nr that accumulate while the guest is running.
> These are due
> to inaccuracies in the xen time expirations due to interrupt loads or
> long dom0 interrupt disable periods. They are also due to extended
> periods where the guest has interrupts disabled. In these cases, as
> the tsc has been running, the guest will calculated missed ticks at
> the time of first 
> clock interrupt
> injection and then xen will deliver pending_intr_nr additional
> interrupts resulting in jiffies moving by 2*pending_intr_nr instead
> of the desired pending_intr_nr. 
> 
> regards,
> Dave

thx, eddie

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Fix hvm guest time to be more accurate
  2007-10-26  6:48     ` Dong, Eddie
@ 2007-10-26 13:56       ` Dave Winchell
  2007-10-26 18:18         ` Dave Winchell
  2007-10-29  9:57         ` Dong, Eddie
  0 siblings, 2 replies; 14+ messages in thread
From: Dave Winchell @ 2007-10-26 13:56 UTC (permalink / raw)
  To: Dong, Eddie; +Cc: Dave Winchell, xen-devel, Ben Guthro

Dong, Eddie wrote:

>Dave Winchell wrote:
>  
>
>>Hi Doug,
>>
>>Thanks for these comments.
>>
>>Dong, Eddie wrote:
>>
>>    
>>
>>>>The vpt timer code in effect accumulates missed ticks
>>>>when a guest is running but has interrupts disabled
>>>>or when the platform timer is starved. For guests
>>>>
>>>>
>>>>        
>>>>
>>>This case, VMM will pick up the lost ticks into pending_intr_nr.
>>>The only issue is that if a guest is suspended or save/restored
>>>for long time such as several hours or days, we may see tons
>>>of lost ticks, which is difficult to be injected back (cost minutes
>>>of times or even longer). So we give up those amount of
>>>pending_intr_nr.  In all above case, guest need to re-sync its
>>>timer with others like network time for example. So it is
>>>harmless.
>>>
>>>Similar situation happens when somebody is debugging a guest.
>>>
>>>
>>>      
>>>
>>The solution we provided removes the one second limit on missed ticks.
>>Our testing showed that this is often exceeded under some loads,
>>such as many guests, each running loads. Setting missed ticks to 1
>>tick when 1000 is exceeded is a source of timing error. In the code,
>>where its set to one there is a TBD sync with guest comment, but no
>>action. 
>>    
>>
>
>That is possible, So we should increase 1000 to be more bigger.
>Make it to be around 10s should be OK?
>
>  
>
Agreed.

>>In terms of re-syncing with network time, our goal was to have the
>>timekeeping accurate enough so that the guest could run ntpd.
>>To do that, the under lying timekeeping needs to be accurate to .05%,
>>or so. Our measurements show that with this patch the core
>>timekeeping is
>>accurate to .02%, approximately, even under loads where many
>>guests run
>>loads.
>>Without this patch, timekeeping is off by more than 10% and ntpd
>>cannot sync it. 
>>
>>    
>>
>>>      
>>>
>>>>like 64 bit Linux which calculates missed ticks on each
>>>>clock interrupt based on the current tsc and the tsc
>>>>of the last interrupt and then adds missed ticks to jiffies
>>>>there is redundant accounting.
>>>>
>>>>This change subtracts off the hypervisor calculated missed
>>>>ticks while guest running for 64 bit guests using the pit.
>>>>Missed ticks when vcpu 0 is descheduled are unaffected.
>>>>
>>>>
>>>>
>>>>        
>>>>
>>>I think this one is not the right direction.
>>>
>>>The problem in time virtualization is that we don't how guest will
>>>use it. Latest 64 bit Linux can pick up the missed ticks from TSC
>>>like you mentioned, but it is not true for other 64 bits guest even
>>>linux 
>>>such as 2.6.16, nor for Windows.
>>>
>>>
>>>      
>>>
>>Ours is a specific solution.
>>Let me explain our logic.
>>    
>>
>
>Yes, it can fit for some situation :-)
>But I think we need a generic solution.
>
>How to choose the time virtualization policy can be argued.
>And we may use some experiemental data. What you found
>is definitely one of the good data :-)
>
>  
>
>>We configure all our Linux guests with clock=pit.
>>    
>>
>
>Just curious: why you favor PIT instead of HPET?
>Does HPET bring more deviation?
>  
>
We started with pit because it kept such good time for
32 bit Linux. Based on this, we thought that
the problems with 64bit pit would be manageable.

One of these days we will characterize HPET.
Based on rtc performing well, I would think that HPET would do well too.
If not, then the reasons could be investigated.

>  
>
>>The 32bit Linux guests we run don't calculate missed ticks and so
>>don't need cancellation. All the 64bit Linux guests that we run
>>calculate missed ticks and need cancellation.
>>I just checked 2.26.16 and it does calculate missed ticks in
>>arch/x86_64/lermel/time.c, main_timer_handler(), when using pit for
>>timekeeping. 
>>    
>>
>
>But this is reported as lost ticks which will prink something.
>In theory with guest TSC synchronized with guest periodic
>timer. This issue can be removed, but somehow (maybe bug
>or virtualization overhead) we may still see them :-(
>
>  
>
>>The missed ticks cancellation code is activated in this patch when the
>>guest has configured the pit for timekeeping and the guest has four
>>level page tables (ie 64 bit).
>>
>>The windows guests we run use rtc for timekeeping and don't need
>>or get cancellation.
>>
>>So the simplifying assumption here is that a 64bit guest using pit is
>>calculating missed ticks.
>>
>>I would be in favor of a method where xen is told directly
>>whether to do
>>missed ticks cancellation. Perhaps its part of the guest
>>configuration information. 
>>
>>    
>>
>>>Besides PV timer approach which is not always ready, basically
>>>we have 3 HVM time virtualization approaches:
>>>
>>>1: Current one:
>>>	Freeze guest time when the guest is descheduled and
>>>thus sync all guest time resource together. This one
>>>precisely solve the guest time cross-reference issues, guest TSC
>>>precisely represent guest time and thus can be cross-referenced
>>>in guest to pick up lossed ticks if have. but the logic
>>>is relatively complicated and is easy to see bugs :-(
>>>
>>>
>>>2: Pin guest time to host time.
>>>	This is simplest approach, guest TSC is always pinned to
>>>host TSC with a fixed offset no matter the vCPU is descheduled or
>>>not. In this case, other guest periodic IRQ driven time resource
>>>are not synced to guest TSC.
>>>	Base on this, we have 2 deviations:
>>>	A: Accumulate pending_intr_nr like current #1 approach.
>>>	B: Give up accumulated pending_intr_nr. We only inject
>>>one IRQ for a periodic IRQ driven guest time such as PIT.
>>>
>>>	What you mentioned here is a special case of 2B.
>>>
>>>	Since we don't know how guest behaviors, what we are
>>>proposing recently is to implement all of above, and let administrate
>>>tools to choose the one to use base on knowledge of guest OS
>>>type.
>>>
>>>thanks, eddie
>>>
>>>
>>>      
>>>
>>I agree with you on having various policies for timekeeping based on
>>the guest being run. 
>>
>>This patch addresses specifically the problem
>>of pit users who calculate missed ticks. Note that in the solution,
>>de-scheduled missed ticks are not canceled, they are still needed
>>as the tsc is continuous in the current methods. We are only
>>    
>>
>
>If we rely on guest to pick up the lost ticks, why not just do it
>thoroughly?
>i..e even deschedule missed ticks can rely on guest to pick up.
>  
>
I have considered this. I was worried that if the descheduled period
was too large that the guest would do something funny, like declare lost
to be 1 ;-)
However, the descheduled periods are probably no longer than the
interrupts disabled periods, given some of the problems we have with
guests in spinlock_irq code. Also, since we have the Linux guest code,
and have been relying on being able to read it to make timekeeping policy,
we can see that they don't set lost to 1.

Actually, the more I think about this, the more I like the idea.
It would mean that we wouldn't have to deliver all those pent up
interrupts to the guest. It solves some other problems as well.
We could probably use this policy for most guests and timekeeping
sources. Linux 32bit with pit might be the exception.

>That is what 2.B proposed.
>In some cases, we saw issues in Windows (XP32) with 2B, guest wall clock
>becomes slow. Maybe XP64 behaviors different like you saw, but we need
>windows expert to double check.
>
>Some rough idea in my mind is:
>	Policy #1 works best for 32 bits Liunux (and old 64 bits Linux).
>	Policy #2B works for latest 64 bits Linux.
>	Policy #2A works for Windows (32 & 64 bits).
>  
>
I agree with this breakdown.
The next step is to do some experiments, I think.

>  
>
>>canceling those
>>pending_intr_nr that accumulate while the guest is running.
>>These are due
>>to inaccuracies in the xen time expirations due to interrupt loads or
>>long dom0 interrupt disable periods. They are also due to extended
>>periods where the guest has interrupts disabled. In these cases, as
>>the tsc has been running, the guest will calculated missed ticks at
>>the time of first 
>>clock interrupt
>>injection and then xen will deliver pending_intr_nr additional
>>interrupts resulting in jiffies moving by 2*pending_intr_nr instead
>>of the desired pending_intr_nr. 
>>
>>regards,
>>Dave
>>    
>>
>
>thx, eddie
>  
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Fix hvm guest time to be more accurate
  2007-10-26 13:56       ` Dave Winchell
@ 2007-10-26 18:18         ` Dave Winchell
  2007-10-29  9:58           ` Dong, Eddie
  2007-10-29  9:57         ` Dong, Eddie
  1 sibling, 1 reply; 14+ messages in thread
From: Dave Winchell @ 2007-10-26 18:18 UTC (permalink / raw)
  To: Dong, Eddie; +Cc: Dave Winchell, xen-devel, Ben Guthro

Eddie,

I implemented #2B and ran a three hour test
with sles9-64 and rh4u4-64 guests. Each guest had 8 vcpus
and the box was Intel with 2 physical processors.
The guests were running large loads.
Clock was pit. This is my usual test setup, except that I just
as often used AMD nodes with more processors.

The time error was .02%, good enough for ntpd.

The implementation keeps a constant guest tsc offset.
There is no pending_nr cancellation.
When the vpt.c timer expires, it only increments pending_nr
if its value is zero.
Missed_ticks() is still calculated, but only to update the new
timeout value.
There is no adjustment to the tsc offset (set_guest_time())
at clock interrupt delivery time nor at re-scheduling time.

So, I like this method better than the pending_nr subtract.
I'm going to work on this some more and, if all goes well,
propose a new code submission soon.
I'll put some kind of policy switch in too, which we can discuss
and modify, but it will be along the lines of what we discussed below.

Thanks for your input!

-Dave



Dave Winchell wrote:

> Dong, Eddie wrote:
>
>> Dave Winchell wrote:
>>  
>>
>>> Hi Doug,
>>>
>>> Thanks for these comments.
>>>
>>> Dong, Eddie wrote:
>>>
>>>   
>>>
>>>>> The vpt timer code in effect accumulates missed ticks
>>>>> when a guest is running but has interrupts disabled
>>>>> or when the platform timer is starved. For guests
>>>>>
>>>>>
>>>>>       
>>>>
>>>> This case, VMM will pick up the lost ticks into pending_intr_nr.
>>>> The only issue is that if a guest is suspended or save/restored
>>>> for long time such as several hours or days, we may see tons
>>>> of lost ticks, which is difficult to be injected back (cost minutes
>>>> of times or even longer). So we give up those amount of
>>>> pending_intr_nr.  In all above case, guest need to re-sync its
>>>> timer with others like network time for example. So it is
>>>> harmless.
>>>>
>>>> Similar situation happens when somebody is debugging a guest.
>>>>
>>>>
>>>>     
>>>
>>> The solution we provided removes the one second limit on missed ticks.
>>> Our testing showed that this is often exceeded under some loads,
>>> such as many guests, each running loads. Setting missed ticks to 1
>>> tick when 1000 is exceeded is a source of timing error. In the code,
>>> where its set to one there is a TBD sync with guest comment, but no
>>> action.   
>>
>>
>> That is possible, So we should increase 1000 to be more bigger.
>> Make it to be around 10s should be OK?
>>
>>  
>>
> Agreed.
>
>>> In terms of re-syncing with network time, our goal was to have the
>>> timekeeping accurate enough so that the guest could run ntpd.
>>> To do that, the under lying timekeeping needs to be accurate to .05%,
>>> or so. Our measurements show that with this patch the core
>>> timekeeping is
>>> accurate to .02%, approximately, even under loads where many
>>> guests run
>>> loads.
>>> Without this patch, timekeeping is off by more than 10% and ntpd
>>> cannot sync it.
>>>   
>>>
>>>>     
>>>>
>>>>> like 64 bit Linux which calculates missed ticks on each
>>>>> clock interrupt based on the current tsc and the tsc
>>>>> of the last interrupt and then adds missed ticks to jiffies
>>>>> there is redundant accounting.
>>>>>
>>>>> This change subtracts off the hypervisor calculated missed
>>>>> ticks while guest running for 64 bit guests using the pit.
>>>>> Missed ticks when vcpu 0 is descheduled are unaffected.
>>>>>
>>>>>
>>>>>
>>>>>       
>>>>
>>>> I think this one is not the right direction.
>>>>
>>>> The problem in time virtualization is that we don't know how the guest will
>>>> use it. Latest 64 bit Linux can pick up the missed ticks from TSC
>>>> like you mentioned, but it is not true for other 64 bits guest even
>>>> linux such as 2.6.16, nor for Windows.
>>>>
>>>>
>>>>     
>>>
>>> Ours is a specific solution.
>>> Let me explain our logic.
>>>   
>>
>>
>> Yes, it can fit for some situation :-)
>> But I think we need a generic solution.
>>
>> How to choose the time virtualization policy can be argued.
>> And we may use some experimental data. What you found
>> is definitely one of the good data :-)
>>
>>  
>>
>>> We configure all our Linux guests with clock=pit.
>>>   
>>
>>
>> Just curious: why you favor PIT instead of HPET?
>> Does HPET bring more deviation?
>>  
>>
> We started with pit because it kept such good time for
> 32 bit Linux. Based on this, we thought that
> the problems with 64bit pit would be manageable.
>
> One of these days we will characterize HPET.
> Based on rtc performing well, I would think that HPET would do well too.
> If not, then the reasons could be investigated.
>
>>  
>>
>>> The 32bit Linux guests we run don't calculate missed ticks and so
>>> don't need cancellation. All the 64bit Linux guests that we run
>>> calculate missed ticks and need cancellation.
>>> I just checked 2.6.16 and it does calculate missed ticks in
>>> arch/x86_64/kernel/time.c, main_timer_handler(), when using pit for
>>> timekeeping.   
>>
>>
>> But this is reported as lost ticks which will printk something.
>> In theory with guest TSC synchronized with guest periodic
>> timer. This issue can be removed, but somehow (maybe bug
>> or virtualization overhead) we may still see them :-(
>>
>>  
>>
>>> The missed ticks cancellation code is activated in this patch when the
>>> guest has configured the pit for timekeeping and the guest has four
>>> level page tables (ie 64 bit).
>>>
>>> The windows guests we run use rtc for timekeeping and don't need
>>> or get cancellation.
>>>
>>> So the simplifying assumption here is that a 64bit guest using pit is
>>> calculating missed ticks.
>>>
>>> I would be in favor of a method where xen is told directly
>>> whether to do
>>> missed ticks cancellation. Perhaps its part of the guest
>>> configuration information.
>>>   
>>>
>>>> Besides PV timer approach which is not always ready, basically
>>>> we have 3 HVM time virtualization approaches:
>>>>
>>>> 1: Current one:
>>>>     Freeze guest time when the guest is descheduled and
>>>> thus sync all guest time resource together. This one
>>>> precisely solve the guest time cross-reference issues, guest TSC
>>>> precisely represent guest time and thus can be cross-referenced
>>>> in guest to pick up lost ticks, if any. But the logic
>>>> is relatively complicated and is easy to see bugs :-(
>>>>
>>>>
>>>> 2: Pin guest time to host time.
>>>>     This is simplest approach, guest TSC is always pinned to
>>>> host TSC with a fixed offset no matter the vCPU is descheduled or
>>>> not. In this case, other guest periodic IRQ driven time resource
>>>> are not synced to guest TSC.
>>>>     Base on this, we have 2 deviations:
>>>>     A: Accumulate pending_intr_nr like current #1 approach.
>>>>     B: Give up accumulated pending_intr_nr. We only inject
>>>> one IRQ for a periodic IRQ driven guest time such as PIT.
>>>>
>>>>     What you mentioned here is a special case of 2B.
>>>>
>>>>     Since we don't know how guest behaviors, what we are
>>>> proposing recently is to implement all of above, and let administrate
>>>> tools to choose the one to use base on knowledge of guest OS
>>>> type.
>>>>
>>>> thanks, eddie
>>>>
>>>>
>>>>     
>>>
>>> I agree with you on having various policies for timekeeping based on
>>> the guest being run.
>>> This patch addresses specifically the problem
>>> of pit users who calculate missed ticks. Note that in the solution,
>>> de-scheduled missed ticks are not canceled, they are still needed
>>> as the tsc is continuous in the current methods. We are only
>>>   
>>
>>
>> If we rely on guest to pick up the lost ticks, why not just do it
>> thoroughly?
>> i..e even deschedule missed ticks can rely on guest to pick up.
>>  
>>
> I have considered this. I was worried that if the descheduled period
> was too large that the guest would do something funny, like declare lost
> to be 1 ;-)
> However, the descheduled periods are probably no longer than the
> interrupts disabled periods, given some of the problems we have with
> guests in spinlock_irq code. Also, since we have the Linux guest code,
> and have been relying on being able to read it to make timekeeping 
> policy,
> we can see that they don't set lost to 1.
>
> Actually, the more I think about this, the more I like the idea.
> It would mean that we wouldn't have to deliver all those pent up
> interrupts to the guest. It solves some other problems as well.
> We could probably use this policy for most guests and timekeeping
> sources. Linux 32bit with pit might be the exception.
>
>> That is what 2.B proposed.
>> In some cases, we saw issues in Windows (XP32) with 2B, guest wall clock
>> becomes slow. Maybe XP64 behaviors different like you saw, but we need
>> windows expert to double check.
>>
>> Some rough idea in my mind is:
>>     Policy #1 works best for 32 bits Linux (and old 64 bits Linux).
>>     Policy #2B works for latest 64 bits Linux.
>>     Policy #2A works for Windows (32 & 64 bits).
>>  
>>
> I agree with this breakdown.
> The next step is to do some experiments, I think.
>
>>  
>>
>>> canceling those
>>> pending_intr_nr that accumulate while the guest is running.
>>> These are due
>>> to inaccuracies in the xen time expirations due to interrupt loads or
>>> long dom0 interrupt disable periods. They are also due to extended
>>> periods where the guest has interrupts disabled. In these cases, as
>>> the tsc has been running, the guest will calculate missed ticks at
>>> the time of first clock interrupt
>>> injection and then xen will deliver pending_intr_nr additional
>>> interrupts resulting in jiffies moving by 2*pending_intr_nr instead
>>> of the desired pending_intr_nr.
>>> regards,
>>> Dave
>>>   
>>
>>
>> thx, eddie
>>  
>>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] Fix hvm guest time to be more accurate
  2007-10-26 13:56       ` Dave Winchell
  2007-10-26 18:18         ` Dave Winchell
@ 2007-10-29  9:57         ` Dong, Eddie
  1 sibling, 0 replies; 14+ messages in thread
From: Dong, Eddie @ 2007-10-29  9:57 UTC (permalink / raw)
  To: Dave Winchell; +Cc: xen-devel, Ben Guthro

Dave Winchell wrote:
> Dong, Eddie wrote:
> 

>> 
>> That is possible, So we should increase 1000 to be more bigger.
>> Make it to be around 10s should be OK?
>> 
>> 
>> 
> Agreed.

Thanks! And will wait for your patches :-)

>> 
>> Just curious: why you favor PIT instead of HPET?
>> Does HPET bring more deviation?
>> 
>> 
> We started with pit because it kept such good time for
> 32 bit Linux. Based on this, we thought that
> the problems with 64bit pit would be manageable.
> 
> One of these days we will characterize HPET.
> Based on rtc performing well, I would think that HPET would do
> well too.
> If not, then the reasons could be investigated.

Yes!

> 
>> 
>> If we rely on guest to pick up the lost ticks, why not just do it
>> thoroughly? i..e even deschedule missed ticks can rely on guest to
>> pick up. 
>> 
>> 
> I have considered this. I was worried that if the descheduled period
> was too large that the guest would do something funny, like
> declare lost
> to be 1 ;-)
> However, the descheduled periods are probably no longer than the
> interrupts disabled periods, given some of the problems we have with
> guests in spinlock_irq code. Also, since we have the Linux guest code,
> and have been relying on being able to read it to make
> timekeeping policy,
> we can see that they don't set lost to 1.
> 
> Actually, the more I think about this, the more I like the idea.
> It would mean that we wouldn't have to deliver all those pent up
> interrupts to the guest. It solves some other problems as well.
> We could probably use this policy for most guests and timekeeping
> sources. Linux 32bit with pit might be the exception.

Great!

Eddie

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] Fix hvm guest time to be more accurate
  2007-10-26 18:18         ` Dave Winchell
@ 2007-10-29  9:58           ` Dong, Eddie
  2007-10-29 15:00             ` Dave Winchell
  0 siblings, 1 reply; 14+ messages in thread
From: Dong, Eddie @ 2007-10-29  9:58 UTC (permalink / raw)
  To: Dave Winchell; +Cc: xen-devel, Ben Guthro

Dave Winchell wrote:
> Eddie,
> 
> I implemented #2B and ran a three hour test
> with sles9-64 and rh4u4-64 guests. Each guest had 8 vcpus
> and the box was Intel with 2 physical processors.
> The guests were running large loads.
> Clock was pit. This is my usual test setup, except that I just
> as often used AMD nodes with more processors.
> 
> The time error was .02%, good enough for ntpd.
> 
> The implementation keeps a constant guest tsc offset.
> There is no pending_nr cancellation.
> When the vpt.c timer expires, it only increments pending_nr
> if its value is zero.
> Missed_ticks() is still calculated, but only to update the new
> timeout value. There is no adjustment to the tsc offset
> (set_guest_time()) 
> at clock interrupt delivery time nor at re-scheduling time.
> 
> So, I like this method better than the pending_nr subtract.
> I'm going to work on this some more and, if all goes well,
> propose a new code submission soon.
> I'll put some kind of policy switch in too, which we can discuss
> and modify, but it will be along the lines of what we discussed below.
> 
> Thanks for your input!
> 
> -Dave
> 


Haitao Shan may have posted his patch; can you check whether anything
is missed?
thx,eddie

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Fix hvm guest time to be more accurate
  2007-10-29  9:58           ` Dong, Eddie
@ 2007-10-29 15:00             ` Dave Winchell
  2007-10-29 17:29               ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Dave Winchell @ 2007-10-29 15:00 UTC (permalink / raw)
  To: Dong, Eddie; +Cc: xen-devel, Ben Guthro

Eddie, Haitao:

The patch looks good with the following comments.

1. Since you are in missed_ticks(), why not increase the threshold
    to 10 sec?

2. In missed_ticks() you should only increment pending_intr_nr by 
missed_ticks
    calculated when  pt_support_time_frozen(domain).

3. You might as well fix this one too since it's what we discussed and is so
    related to constant tsc offset:
      In pt_timer_fn, if !pt_support_time_frozen(domain) then
      pending_intr_nr should end up with a maximum value of one.

regards,
Dave


Dong, Eddie wrote:

>Dave Winchell wrote:
>  
>
>>Eddie,
>>
>>I implemented #2B and ran a three hour test
>>with sles9-64 and rh4u4-64 guests. Each guest had 8 vcpus
>>and the box was Intel with 2 physical processors.
>>The guests were running large loads.
>>Clock was pit. This is my usual test setup, except that I just
>>as often used AMD nodes with more processors.
>>
>>The time error was .02%, good enough for ntpd.
>>
>>The implementation keeps a constant guest tsc offset.
>>There is no pending_nr cancellation.
>>When the vpt.c timer expires, it only increments pending_nr
>>if its value is zero.
>>Missed_ticks() is still calculated, but only to update the new
>>timeout value. There is no adjustment to the tsc offset
>>(set_guest_time()) 
>>at clock interrupt delivery time nor at re-scheduling time.
>>
>>So, I like this method better than the pending_nr subtract.
>>I'm going to work on this some more and, if all goes well,
>>propose a new code submission soon.
>>I'll put some kind of policy switch in too, which we can discuss
>>and modify, but it will be along the lines of what we discussed below.
>>
>>Thanks for your input!
>>
>>-Dave
>>
>>    
>>
>
>
>Haitao Shai may posted his patch, can u check if there are something
>missed?
>thx,eddie
>  
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Fix hvm guest time to be more accurate
  2007-10-29 15:00             ` Dave Winchell
@ 2007-10-29 17:29               ` Keir Fraser
  2007-10-29 19:55                 ` Dave Winchell
  2007-10-30 11:45                 ` Dong, Eddie
  0 siblings, 2 replies; 14+ messages in thread
From: Keir Fraser @ 2007-10-29 17:29 UTC (permalink / raw)
  To: Dave Winchell, Dong, Eddie; +Cc: xen-devel, haitao.shan, Ben Guthro

I thought the point of the mode in Haitao's patch was to still deliver the
'right' number of pending interrupts, but not stall the guest TSC while
delivering them? That's what I checked in as c/s 16237 (in staging tree). If
we want other modes too they can be added to the enumeration that c/s
defines.

 -- Keir

On 29/10/07 15:00, "Dave Winchell" <dwinchell@virtualiron.com> wrote:

> Eddie, Haitao:
> 
> The patch looks good with the following comments.
> 
> 1. Since you are in missed_ticks(), why not increase the threshold
>     to 10 sec?
> 
> 2. In missed_ticks() you should only increment pending_intr_nr by
> missed_ticks
>     calculated when  pt_support_time_frozen(domain).
> 
> 3. You might as well fix this one too since its what we discussed and is so
>     related to constant tsc offset:
>       In pt_timer_fn, if !pt_support_time_frozen(domain) then
>       pending_intr_nr should end up with a maximum value of one.
> 
> regards,
> Dave
> 
> 
> Dong, Eddie wrote:
> 
>> Dave Winchell wrote:
>>  
>> 
>>> Eddie,
>>> 
>>> I implemented #2B and ran a three hour test
>>> with sles9-64 and rh4u4-64 guests. Each guest had 8 vcpus
>>> and the box was Intel with 2 physical processors.
>>> The guests were running large loads.
>>> Clock was pit. This is my usual test setup, except that I just
>>> as often used AMD nodes with more processors.
>>> 
>>> The time error was .02%, good enough for ntpd.
>>> 
>>> The implementation keeps a constant guest tsc offset.
>>> There is no pending_nr cancellation.
>>> When the vpt.c timer expires, it only increments pending_nr
>>> if its value is zero.
>>> Missed_ticks() is still calculated, but only to update the new
>>> timeout value. There is no adjustment to the tsc offset
>>> (set_guest_time())
>>> at clock interrupt delivery time nor at re-scheduling time.
>>> 
>>> So, I like this method better than the pending_nr subtract.
>>> I'm going to work on this some more and, if all goes well,
>>> propose a new code submission soon.
>>> I'll put some kind of policy switch in too, which we can discuss
>>> and modify, but it will be along the lines of what we discussed below.
>>> 
>>> Thanks for your input!
>>> 
>>> -Dave
>>> 
>>>    
>>> 
>> 
>> 
>> Haitao Shai may posted his patch, can u check if there are something
>> missed?
>> thx,eddie
>>  
>> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Fix hvm guest time to be more accurate
  2007-10-29 17:29               ` Keir Fraser
@ 2007-10-29 19:55                 ` Dave Winchell
  2007-10-29 20:40                   ` Keir Fraser
  2007-10-30 11:45                 ` Dong, Eddie
  1 sibling, 1 reply; 14+ messages in thread
From: Dave Winchell @ 2007-10-29 19:55 UTC (permalink / raw)
  To: Keir Fraser
  Cc: haitao.shan, Dave Winchell, xen-devel, Dong, Eddie, Ben Guthro

Keir,

I think it's a good idea to have other modes.
However, I don't believe that the mode checked in to the staging
tree will keep good time for a 64 bit Linux guest, if that was what was 
intended.

Here's why:
The guest running under the new option gets a clock interrupt
after being de-scheduled for a while. It calculates missed_ticks
and bumps jiffies by missed_ticks. Jiffies is now correct.
Then, with the new mode as submitted, the guest will get missed_ticks
additional interrupts. For each, the guest will add 1 to jiffies.
The guest is now missed_ticks * clock_period ahead of where it should be.

Under the old/other option, the guest tsc is continuous after a de-scheduled
period, and thus the missed_ticks calculation in the guest results in zero.
Then missed_ticks interrupts are delivered and jiffies is correct.

I just ran a test with two 64bit Linux guests, one Red Hat and one Sles,
under load.  The hypervisor has constant tsc offset per the code 
submitted to
the staging tree.  In each 5 sec period the guest gained 6-10 seconds 
against
ntp time, an error of almost 200%.

[root@vs079 ~]# while :; do ntpdate -q 0.us.pool.ntp.org; sleep 5; done
server 8.15.10.42, stratum 2, offset -0.061007, delay 0.04959
29 Oct 15:21:21 ntpdate[3892]: adjust time server 8.15.10.42 offset 
-0.061007 sec
server 8.15.10.42, stratum 2, offset -0.077763, delay 0.07129
29 Oct 15:21:28 ntpdate[3894]: adjust time server 8.15.10.42 offset 
-0.077763 sec
server 8.15.10.42, stratum 2, offset -1.733141, delay 0.20813

(load started here.)

29 Oct 15:21:35 ntpdate[3968]: step time server 8.15.10.42 offset 
-1.733141 sec
server 8.15.10.42, stratum 2, offset -9.648700, delay 0.04861
29 Oct 15:21:54 ntpdate[4002]: step time server 8.15.10.42 offset 
-9.648700 sec
server 8.15.10.42, stratum 2, offset -22.872883, delay 0.05319
29 Oct 15:22:21 ntpdate[4027]: step time server 8.15.10.42 offset 
-22.872883 sec
server 8.15.10.42, stratum 2, offset -29.036008, delay 0.19337
29 Oct 15:22:38 ntpdate[4039]: step time server 8.15.10.42 offset 
-29.036008 sec
server 8.15.10.42, stratum 2, offset -34.880845, delay 0.04944
29 Oct 15:22:46 ntpdate[4058]: step time server 8.15.10.42 offset 
-34.880845 sec



With these three changes to the constant tsc offset policy in staging,
the error compared to ntp is about .02% under this load.

 > 1. Since you are in missed_ticks(), why not increase the threshold
 >     to 10 sec?
 >
 > 2. In missed_ticks() you should only increment pending_intr_nr by
 > missed_ticks
 >     calculated when  pt_support_time_frozen(domain).
 >
 > 3. You might as well fix this one too since its what we discussed and 
is so
 >     related to constant tsc offset:
 >       In pt_timer_fn, if !pt_support_time_frozen(domain) then
 >       pending_intr_nr should end up with a maximum value of one.
 >

So, I think these changes are necessary for a 64bit Linux policy. If you 
agree, should they go in
as fixes to the constant tsc offset policy in staging now or as a new 
policy?

thanks,
Dave



Keir Fraser wrote:

>I thought the point of the mode in Haitao's patch was to still deliver the
>'right' number of pending interrupts, but not stall the guest TSC while
>delivering them? That's what I checked in as c/s 16237 (in staging tree). If
>we want other modes too they can be added to the enumeration that c/s
>defines.
>
> -- Keir
>
>On 29/10/07 15:00, "Dave Winchell" <dwinchell@virtualiron.com> wrote:
>
>  
>
>>Eddie, Haitao:
>>
>>The patch looks good with the following comments.
>>
>>1. Since you are in missed_ticks(), why not increase the threshold
>>    to 10 sec?
>>
>>2. In missed_ticks() you should only increment pending_intr_nr by
>>missed_ticks
>>    calculated when  pt_support_time_frozen(domain).
>>
>>3. You might as well fix this one too since its what we discussed and is so
>>    related to constant tsc offset:
>>      In pt_timer_fn, if !pt_support_time_frozen(domain) then
>>      pending_intr_nr should end up with a maximum value of one.
>>
>>regards,
>>Dave
>>
>>
>>Dong, Eddie wrote:
>>
>>    
>>
>>>Dave Winchell wrote:
>>> 
>>>
>>>      
>>>
>>>>Eddie,
>>>>
>>>>I implemented #2B and ran a three hour test
>>>>with sles9-64 and rh4u4-64 guests. Each guest had 8 vcpus
>>>>and the box was Intel with 2 physical processors.
>>>>The guests were running large loads.
>>>>Clock was pit. This is my usual test setup, except that I just
>>>>as often used AMD nodes with more processors.
>>>>
>>>>The time error was .02%, good enough for ntpd.
>>>>
>>>>The implementation keeps a constant guest tsc offset.
>>>>There is no pending_nr cancellation.
>>>>When the vpt.c timer expires, it only increments pending_nr
>>>>if its value is zero.
>>>>Missed_ticks() is still calculated, but only to update the new
>>>>timeout value. There is no adjustment to the tsc offset
>>>>(set_guest_time())
>>>>at clock interrupt delivery time nor at re-scheduling time.
>>>>
>>>>So, I like this method better than the pending_nr subtract.
>>>>I'm going to work on this some more and, if all goes well,
>>>>propose a new code submission soon.
>>>>I'll put some kind of policy switch in too, which we can discuss
>>>>and modify, but it will be along the lines of what we discussed below.
>>>>
>>>>Thanks for your input!
>>>>
>>>>-Dave
>>>>
>>>>   
>>>>
>>>>        
>>>>
>>>Haitao Shai may posted his patch, can u check if there are something
>>>missed?
>>>thx,eddie
>>> 
>>>
>>>      
>>>
>>_______________________________________________
>>Xen-devel mailing list
>>Xen-devel@lists.xensource.com
>>http://lists.xensource.com/xen-devel
>>    
>>
>
>
>  
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Fix hvm guest time to be more accurate
  2007-10-29 19:55                 ` Dave Winchell
@ 2007-10-29 20:40                   ` Keir Fraser
  2007-10-29 20:44                     ` Dave Winchell
  0 siblings, 1 reply; 14+ messages in thread
From: Keir Fraser @ 2007-10-29 20:40 UTC (permalink / raw)
  To: Dave Winchell; +Cc: haitao.shan, xen-devel, Dong, Eddie, Ben Guthro

On 29/10/07 19:55, "Dave Winchell" <dwinchell@virtualiron.com> wrote:

> So, I think these changes are necessary for a 64bit Linux policy. If you
> agree, should they go in
> as fixes to the constant tsc offset policy in staging now or as a new
> policy?

It's easy to add another one with an appropriate name.

 -- Keir

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Fix hvm guest time to be more accurate
  2007-10-29 20:40                   ` Keir Fraser
@ 2007-10-29 20:44                     ` Dave Winchell
  0 siblings, 0 replies; 14+ messages in thread
From: Dave Winchell @ 2007-10-29 20:44 UTC (permalink / raw)
  To: Keir Fraser; +Cc: haitao.shan, xen-devel, Dong, Eddie, Ben Guthro

Keir Fraser wrote:

>On 29/10/07 19:55, "Dave Winchell" <dwinchell@virtualiron.com> wrote:
>
>  
>
>>So, I think these changes are necessary for a 64bit Linux policy. If you
>>agree, should they go in
>>as fixes to the constant tsc offset policy in staging now or as a new
>>policy?
>>    
>>
>
>It's easy to add another one with an appropriate name.
>
> -- Keir
>  
>

Ok, we'll submit a patch per the discussion.

-Dave

>
>
>  
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] Fix hvm guest time to be more accurate
  2007-10-29 17:29               ` Keir Fraser
  2007-10-29 19:55                 ` Dave Winchell
@ 2007-10-30 11:45                 ` Dong, Eddie
  1 sibling, 0 replies; 14+ messages in thread
From: Dong, Eddie @ 2007-10-30 11:45 UTC (permalink / raw)
  To: Keir Fraser, Dave Winchell; +Cc: xen-devel, Shan, Haitao, Ben Guthro

I guess another alternative was missed.
We need to add a 3rd choice to ignore pending_intr_nr for x64 Linux.

thx,eddie

>-----Original Message-----
>From: Keir Fraser [mailto:Keir.Fraser@cl.cam.ac.uk] 
>Sent: 2007年10月30日 1:30
>To: Dave Winchell; Dong, Eddie
>Cc: xen-devel; Ben Guthro; Shan, Haitao
>Subject: Re: [Xen-devel] [PATCH] Fix hvm guest time to be more accurate
>
>I thought the point of the mode in Haitao's patch was to still 
>deliver the
>'right' number of pending interrupts, but not stall the guest TSC while
>delivering them? That's what I checked in as c/s 16237 (in 
>staging tree). If
>we want other modes too they can be added to the enumeration that c/s
>defines.
>
> -- Keir
>
>On 29/10/07 15:00, "Dave Winchell" <dwinchell@virtualiron.com> wrote:
>
>> Eddie, Haitao:
>> 
>> The patch looks good with the following comments.
>> 
>> 1. Since you are in missed_ticks(), why not increase the threshold
>>     to 10 sec?
>> 
>> 2. In missed_ticks() you should only increment pending_intr_nr by
>> missed_ticks
>>     calculated when  pt_support_time_frozen(domain).
>> 
>> 3. You might as well fix this one too since its what we 
>discussed and is so
>>     related to constant tsc offset:
>>       In pt_timer_fn, if !pt_support_time_frozen(domain) then
>>       pending_intr_nr should end up with a maximum value of one.
>> 
>> regards,
>> Dave
>> 
>> 
>> Dong, Eddie wrote:
>> 
>>> Dave Winchell wrote:
>>>  
>>> 
>>>> Eddie,
>>>> 
>>>> I implemented #2B and ran a three hour test
>>>> with sles9-64 and rh4u4-64 guests. Each guest had 8 vcpus
>>>> and the box was Intel with 2 physical processors.
>>>> The guests were running large loads.
>>>> Clock was pit. This is my usual test setup, except that I just
>>>> as often used AMD nodes with more processors.
>>>> 
>>>> The time error was .02%, good enough for ntpd.
>>>> 
>>>> The implementation keeps a constant guest tsc offset.
>>>> There is no pending_nr cancellation.
>>>> When the vpt.c timer expires, it only increments pending_nr
>>>> if its value is zero.
>>>> Missed_ticks() is still calculated, but only to update the new
>>>> timeout value. There is no adjustment to the tsc offset
>>>> (set_guest_time())
>>>> at clock interrupt delivery time nor at re-scheduling time.
>>>> 
>>>> So, I like this method better than the pending_nr subtract.
>>>> I'm going to work on this some more and, if all goes well,
>>>> propose a new code submission soon.
>>>> I'll put some kind of policy switch in too, which we can discuss
>>>> and modify, but it will be along the lines of what we 
>discussed below.
>>>> 
>>>> Thanks for your input!
>>>> 
>>>> -Dave
>>>> 
>>>>    
>>>> 
>>> 
>>> 
>>> Haitao Shai may posted his patch, can u check if there are something
>>> missed?
>>> thx,eddie
>>>  
>>> 
>> 
>> 
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xensource.com
>> http://lists.xensource.com/xen-devel
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2007-10-30 11:45 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-10-24 21:15 [PATCH] Fix hvm guest time to be more accurate Ben Guthro
2007-10-25  5:52 ` Dong, Eddie
2007-10-25 14:45   ` Dave Winchell
2007-10-26  6:48     ` Dong, Eddie
2007-10-26 13:56       ` Dave Winchell
2007-10-26 18:18         ` Dave Winchell
2007-10-29  9:58           ` Dong, Eddie
2007-10-29 15:00             ` Dave Winchell
2007-10-29 17:29               ` Keir Fraser
2007-10-29 19:55                 ` Dave Winchell
2007-10-29 20:40                   ` Keir Fraser
2007-10-29 20:44                     ` Dave Winchell
2007-10-30 11:45                 ` Dong, Eddie
2007-10-29  9:57         ` Dong, Eddie

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.