* [PATCH] Fix performance issue brought by TSC-sync logic
@ 2009-02-23 8:21 Yang, Xiaowei
2009-02-23 12:51 ` Keir Fraser
0 siblings, 1 reply; 6+ messages in thread
From: Yang, Xiaowei @ 2009-02-23 8:21 UTC (permalink / raw)
To: xen-devel@lists.xensource.com
[-- Attachment #1: Type: text/plain, Size: 975 bytes --]
Recently we found one performance bug when doing network test with VTd
assigned devices - in some extreme case, the network performance in HVM
using new Linux kernel could be 1/20 of native. Root cause is one of our
sync-tsc-under-deep-C-state patches brings extra kilo-TSC drift between
pCPUs and let check-tsc-sync logic in HVM failed. The result is the
kernel fails to use platform timer (HPET, PMtimer) for gettimeofday
instead of TSC and brings very frequent costly IOport access VMExit -
triple per one call.
We provides below 2 patches to address the issue:
tsc1.patch: Minimize the TSC drift between pCPUs by letting BSP/AP set
TSC at the same time in time_calibration_rendezvous(). Looping a few
times before writing tsc sounds better, but it may be too costly.
Signed-off-by: Xiaowei Yang <xiaowei.yang@intel.com>
tsc2.patch: only do TSC-sync if really necessary, which narrows its
effect a lot.
Signed-off-by: Wei Gang <wei.gang@intel.com>
Thanks,
Xiaowei
[-- Attachment #2: tsc1.patch --]
[-- Type: text/x-patch, Size: 1295 bytes --]
diff -r 0b0e7c2b4eef xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Tue Jan 20 21:21:16 2009 +0800
+++ b/xen/arch/x86/time.c Mon Feb 09 02:21:50 2009 +0800
@@ -1095,22 +1095,21 @@ static void time_calibration_rendezvous(
while ( atomic_read(&r->nr_cpus) != (total_cpus - 1) )
cpu_relax();
r->master_stime = read_platform_stime();
- rdtscll(r->master_tsc_stamp);
+ if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+ rdtscll(r->master_tsc_stamp);
mb(); /* write r->master_* /then/ signal */
atomic_inc(&r->nr_cpus);
- c->local_tsc_stamp = r->master_tsc_stamp;
}
else
{
atomic_inc(&r->nr_cpus);
while ( atomic_read(&r->nr_cpus) != total_cpus )
- cpu_relax();
- mb(); /* receive signal /then/ read r->master_* */
- if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
- wrmsrl(MSR_IA32_TSC, r->master_tsc_stamp);
- rdtscll(c->local_tsc_stamp);
- }
-
+ mb(); /* receive signal /then/ read r->master_* */
+ }
+
+ if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+ wrmsrl(MSR_IA32_TSC, r->master_tsc_stamp);
+ rdtscll(c->local_tsc_stamp);
c->stime_local_stamp = get_s_time();
c->stime_master_stamp = r->master_stime;
[-- Attachment #3: tsc2.patch --]
[-- Type: text/x-patch, Size: 2734 bytes --]
diff -r 246ecf354c85 xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c Mon Feb 16 12:21:52 2009 +0800
+++ b/xen/arch/x86/acpi/cpu_idle.c Mon Feb 16 12:57:08 2009 +0800
@@ -737,6 +737,15 @@ long set_cx_pminfo(uint32_t cpu, struct
if ( cpu_id == 0 && pm_idle_save == NULL )
{
+ int deepest_cx = acpi_power->states[acpi_power->count - 1].type;
+ if ( max_cstate >= 3 && deepest_cx >= ACPI_STATE_C3 )
+ tsc_may_stop = 1;
+ else if ( max_cstate >= 2 && deepest_cx >= ACPI_STATE_C2
+ && !local_apic_timer_c2_ok )
+ tsc_may_stop = 1;
+ else
+ tsc_may_stop = 0;
+
pm_idle_save = pm_idle;
pm_idle = acpi_processor_idle;
}
diff -r 246ecf354c85 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Mon Feb 16 12:21:52 2009 +0800
+++ b/xen/arch/x86/time.c Mon Feb 16 13:10:24 2009 +0800
@@ -1091,6 +1091,8 @@ struct calibration_rendezvous {
u64 master_tsc_stamp;
};
+int tsc_may_stop __read_mostly = 0;
+
static void time_calibration_rendezvous(void *_r)
{
struct cpu_calibration *c = &this_cpu(cpu_calibration);
@@ -1102,7 +1104,9 @@ static void time_calibration_rendezvous(
while ( atomic_read(&r->nr_cpus) != (total_cpus - 1) )
cpu_relax();
r->master_stime = read_platform_stime();
- if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+ if ( !boot_cpu_has(X86_FEATURE_NOSTOP_TSC)
+ && boot_cpu_has(X86_FEATURE_CONSTANT_TSC)
+ && tsc_may_stop )
rdtscll(r->master_tsc_stamp);
mb(); /* write r->master_* /then/ signal */
atomic_inc(&r->nr_cpus);
@@ -1114,7 +1118,7 @@ static void time_calibration_rendezvous(
mb(); /* receive signal /then/ read r->master_* */
}
- if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+ if ( r->master_tsc_stamp )
wrmsrl(MSR_IA32_TSC, r->master_tsc_stamp);
rdtscll(c->local_tsc_stamp);
c->stime_local_stamp = get_s_time();
@@ -1127,7 +1131,8 @@ static void time_calibration(void *unuse
{
struct calibration_rendezvous r = {
.cpu_calibration_map = cpu_online_map,
- .nr_cpus = ATOMIC_INIT(0)
+ .nr_cpus = ATOMIC_INIT(0),
+ .master_tsc_stamp = 0
};
/* @wait=1 because we must wait for all cpus before freeing @r. */
diff -r 246ecf354c85 xen/include/asm-x86/time.h
--- a/xen/include/asm-x86/time.h Mon Feb 16 12:21:52 2009 +0800
+++ b/xen/include/asm-x86/time.h Mon Feb 16 12:57:08 2009 +0800
@@ -41,4 +41,6 @@ uint64_t acpi_pm_tick_to_ns(uint64_t tic
uint64_t acpi_pm_tick_to_ns(uint64_t ticks);
uint64_t ns_to_acpi_pm_tick(uint64_t ns);
+extern int tsc_may_stop;
+
#endif /* __X86_TIME_H__ */
[-- Attachment #4: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Fix performance issue brought by TSC-sync logic
2009-02-23 8:21 [PATCH] Fix performance issue brought by TSC-sync logic Yang, Xiaowei
@ 2009-02-23 12:51 ` Keir Fraser
2009-02-23 12:55 ` Tian, Kevin
2009-02-24 6:33 ` Yang, Xiaowei
0 siblings, 2 replies; 6+ messages in thread
From: Keir Fraser @ 2009-02-23 12:51 UTC (permalink / raw)
To: Yang, Xiaowei, xen-devel@lists.xensource.com
On 23/02/2009 00:21, "Yang, Xiaowei" <xiaowei.yang@intel.com> wrote:
> Recently we found one performance bug when doing network test with VTd
> assigned devices - in some extreme case, the network performance in HVM
> using new Linux kernel could be 1/20 of native. Root cause is one of our
> sync-tsc-under-deep-C-state patches brings extra kilo-TSC drift between
> pCPUs and let check-tsc-sync logic in HVM failed. The result is the
> kernel fails to use platform timer (HPET, PMtimer) for gettimeofday
> instead of TSC and brings very frequent costly IOport access VMExit -
> triple per one call.
>
> We provides below 2 patches to address the issue:
Patch 1 looks reasonable. Patch number 2 I'm less keen on, since patch 1
should suffice? Also I think regular re-sync across CPUs is a good idea
anyway. And that also reminds me -- isn't the CONSTANT_TSC logic in time.c
broken by host S3, and also by CPU hotplug? There's nothing to force sync of
AP TSC to BP TSC when an AP comes online after boot. Doesn't
init_percpu_time() need to handle that?
-- Keir
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH] Fix performance issue brought by TSC-sync logic
2009-02-23 12:51 ` Keir Fraser
@ 2009-02-23 12:55 ` Tian, Kevin
2009-02-24 6:33 ` Yang, Xiaowei
1 sibling, 0 replies; 6+ messages in thread
From: Tian, Kevin @ 2009-02-23 12:55 UTC (permalink / raw)
To: 'Keir Fraser', Yang, Xiaowei,
xen-devel@lists.xensource.com
>From: Keir Fraser
>Sent: Monday, February 23, 2009 8:52 PM
>On 23/02/2009 00:21, "Yang, Xiaowei" <xiaowei.yang@intel.com> wrote:
>
>> Recently we found one performance bug when doing network
>test with VTd
>> assigned devices - in some extreme case, the network
>performance in HVM
>> using new Linux kernel could be 1/20 of native. Root cause
>is one of our
>> sync-tsc-under-deep-C-state patches brings extra kilo-TSC
>drift between
>> pCPUs and let check-tsc-sync logic in HVM failed. The result is the
>> kernel fails to use platform timer (HPET, PMtimer) for gettimeofday
>> instead of TSC and brings very frequent costly IOport access VMExit -
>> triple per one call.
>>
>> We provides below 2 patches to address the issue:
>
>Patch 1 looks reasonable. Patch number 2 I'm less keen on,
>since patch 1
>should suffice? Also I think regular re-sync across CPUs is a good idea
>anyway. And that also reminds me -- isn't the CONSTANT_TSC
>logic in time.c
>broken by host S3, and also by CPU hotplug? There's nothing to
>force sync of
>AP TSC to BP TSC when an AP comes online after boot. Doesn't
>init_percpu_time() need to handle that?
>
Ah, yes, it's broken regarding to S3. We'll work out a patch to handle it.
Thanks,
Kevin
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Fix performance issue brought by TSC-sync logic
2009-02-23 12:51 ` Keir Fraser
2009-02-23 12:55 ` Tian, Kevin
@ 2009-02-24 6:33 ` Yang, Xiaowei
2009-02-24 12:10 ` Keir Fraser
1 sibling, 1 reply; 6+ messages in thread
From: Yang, Xiaowei @ 2009-02-24 6:33 UTC (permalink / raw)
To: Keir Fraser; +Cc: xen-devel@lists.xensource.com
Keir Fraser wrote:
> On 23/02/2009 00:21, "Yang, Xiaowei" <xiaowei.yang@intel.com> wrote:
>
>> Recently we found one performance bug when doing network test with VTd
>> assigned devices - in some extreme case, the network performance in HVM
>> using new Linux kernel could be 1/20 of native. Root cause is one of our
>> sync-tsc-under-deep-C-state patches brings extra kilo-TSC drift between
>> pCPUs and let check-tsc-sync logic in HVM failed. The result is the
>> kernel fails to use platform timer (HPET, PMtimer) for gettimeofday
>> instead of TSC and brings very frequent costly IOport access VMExit -
>> triple per one call.
>>
>> We provides below 2 patches to address the issue:
>
> Patch 1 looks reasonable. Patch number 2 I'm less keen on, since patch 1
> should suffice? Also I think regular re-sync across CPUs is a good idea
> anyway.
Here is average of 100 cycles skew results on one core 2 quad machine:
1) TSC-sync: 1300
2) TSC-sync+tsc1.patch: 400
3) without TSC-sync: 200 (a.k.a sync at boot time only)
We can see from 1) to 2), cycles skew improves a lot. However Linux
kernel's logic to check TSC sync (check_tsc_warp) is very strict, so
even with tsc1.patch, there are still chances to observe checking failed
inside VM.
For further improvement to reach the effect of 3), e.g. by taking care
of cache consistance amongs CPUs, there will be more overhead. And
considering the function is called per second, we are hesitating to do
this. What's your idea?:)
Thanks,
xiaowei
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Fix performance issue brought by TSC-sync logic
2009-02-24 6:33 ` Yang, Xiaowei
@ 2009-02-24 12:10 ` Keir Fraser
2009-02-25 10:26 ` Yang, Xiaowei
0 siblings, 1 reply; 6+ messages in thread
From: Keir Fraser @ 2009-02-24 12:10 UTC (permalink / raw)
To: Yang, Xiaowei; +Cc: xen-devel@lists.xensource.com
On 23/02/2009 22:33, "Yang, Xiaowei" <xiaowei.yang@intel.com> wrote:
> For further improvement to reach the effect of 3), e.g. by taking care
> of cache consistance amongs CPUs, there will be more overhead. And
> considering the function is called per second, we are hesitating to do
> this. What's your idea?:)
Maybe we should see what that overhead is... Also we may not really need to
run that function every second.
-- Keir
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Fix performance issue brought by TSC-sync logic
2009-02-24 12:10 ` Keir Fraser
@ 2009-02-25 10:26 ` Yang, Xiaowei
0 siblings, 0 replies; 6+ messages in thread
From: Yang, Xiaowei @ 2009-02-25 10:26 UTC (permalink / raw)
To: Keir Fraser; +Cc: xen-devel@lists.xensource.com
[-- Attachment #1: Type: text/plain, Size: 782 bytes --]
Keir Fraser wrote:
> On 23/02/2009 22:33, "Yang, Xiaowei" <xiaowei.yang@intel.com> wrote:
>
>> For further improvement to reach the effect of 3), e.g. by taking care
>> of cache consistance amongs CPUs, there will be more overhead. And
>> considering the function is called per second, we are hesitating to do
>> this. What's your idea?:)
>
> Maybe we should see what that overhead is... Also we may not really need to
> run that function every second.
>
> -- Keir
>
>
I measured time_calibration_rendezvous()'s overhead on my machine. It's
around 5-6k TSC. And I made anther patch to add loop. The cycle skew
introduced by TSC-sync is gone with it. And a bit surprisingly, the
expected extra overhead is not even noticeable:)
The new patch is attached.
Thanks,
Xiaowei
[-- Attachment #2: tsc1-new.patch --]
[-- Type: text/x-patch, Size: 3494 bytes --]
diff -r 0b0e7c2b4eef xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Tue Jan 20 21:21:16 2009 +0800
+++ b/xen/arch/x86/time.c Wed Feb 25 03:11:50 2009 +0800
@@ -1079,38 +1079,69 @@ static void local_time_calibration(void)
*/
struct calibration_rendezvous {
cpumask_t cpu_calibration_map;
- atomic_t nr_cpus;
+ atomic_t count_start;
+ atomic_t count_end;
s_time_t master_stime;
u64 master_tsc_stamp;
};
+#define NR_LOOPS 5
+
static void time_calibration_rendezvous(void *_r)
{
+ int i;
struct cpu_calibration *c = &this_cpu(cpu_calibration);
struct calibration_rendezvous *r = _r;
unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
- if ( smp_processor_id() == 0 )
- {
- while ( atomic_read(&r->nr_cpus) != (total_cpus - 1) )
- cpu_relax();
- r->master_stime = read_platform_stime();
- rdtscll(r->master_tsc_stamp);
- mb(); /* write r->master_* /then/ signal */
- atomic_inc(&r->nr_cpus);
- c->local_tsc_stamp = r->master_tsc_stamp;
- }
- else
- {
- atomic_inc(&r->nr_cpus);
- while ( atomic_read(&r->nr_cpus) != total_cpus )
- cpu_relax();
- mb(); /* receive signal /then/ read r->master_* */
- if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
- wrmsrl(MSR_IA32_TSC, r->master_tsc_stamp);
- rdtscll(c->local_tsc_stamp);
- }
-
+ /*
+ * Loop is used here to get rid of the cache's side effect to enlarge
+ * the TSC difference among CPUs.
+ */
+ for ( i = 0; i < NR_LOOPS; i++ )
+ {
+ if ( smp_processor_id() == 0 )
+ {
+ while ( atomic_read(&r->count_start) != (total_cpus - 1) )
+ mb();
+
+ if ( r->master_stime == 0 )
+ {
+ r->master_stime = read_platform_stime();
+ if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
+ rdtscll(r->master_tsc_stamp);
+ }
+ atomic_set(&r->count_end, 0);
+ wmb();
+ atomic_inc(&r->count_start);
+
+ if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ i == NR_LOOPS - 1 )
+ write_tsc((u32)r->master_tsc_stamp, (u32)(r->master_tsc_stamp >> 32));
+
+ while (atomic_read(&r->count_end) != total_cpus - 1)
+ mb();
+ atomic_set(&r->count_start, 0);
+ wmb();
+ atomic_inc(&r->count_end);
+ }
+ else
+ {
+ atomic_inc(&r->count_start);
+ while ( atomic_read(&r->count_start) != total_cpus )
+ mb();
+
+ if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ i == NR_LOOPS - 1 )
+ write_tsc((u32)r->master_tsc_stamp, (u32)(r->master_tsc_stamp >> 32));
+
+ atomic_inc(&r->count_end);
+ while (atomic_read(&r->count_end) != total_cpus)
+ mb();
+ }
+ }
+
+ rdtscll(c->local_tsc_stamp);
c->stime_local_stamp = get_s_time();
c->stime_master_stamp = r->master_stime;
@@ -1121,7 +1152,9 @@ static void time_calibration(void *unuse
{
struct calibration_rendezvous r = {
.cpu_calibration_map = cpu_online_map,
- .nr_cpus = ATOMIC_INIT(0)
+ .count_start = ATOMIC_INIT(0),
+ .count_end = ATOMIC_INIT(0),
+ .master_stime = 0
};
/* @wait=1 because we must wait for all cpus before freeing @r. */
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2009-02-25 10:26 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-02-23 8:21 [PATCH] Fix performance issue brought by TSC-sync logic Yang, Xiaowei
2009-02-23 12:51 ` Keir Fraser
2009-02-23 12:55 ` Tian, Kevin
2009-02-24 6:33 ` Yang, Xiaowei
2009-02-24 12:10 ` Keir Fraser
2009-02-25 10:26 ` Yang, Xiaowei
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.