From: "Dan Magenheimer" <dan.magenheimer@oracle.com>
To: Keir Fraser <keir.fraser@eu.citrix.com>,
"Xen-Devel (E-mail)" <xen-devel@lists.xensource.com>
Cc: Dave Winchell <dwinchell@virtualiron.com>
Subject: RE: [PATCH] clocksource=tsc
Date: Mon, 14 Jul 2008 11:59:41 -0600 [thread overview]
Message-ID: <20080714115941000.00000080236@djm-pc> (raw)
In-Reply-To: <C4A0D9C6.23F67%keir.fraser@eu.citrix.com>
[-- Attachment #1: Type: text/plain, Size: 2503 bytes --]
> I'm sympathetic to the patch, but it looks like an ugly hack
> right now.
> Since it should clearly only affect behaviour when the new
> clocksource is explicitly selected...
Actually the ugliness is because the read_counter function
and plt_mask are both u32 and I worried that changing them to
a u64 would be too invasive.
So here's two possible replacement patches, one with those
switched to u64 and the other with separate 32- and 64-bit
read_counter functions. Both are boot-tested with and
without clocksource=tsc.
Dan
> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Monday, July 14, 2008 3:24 AM
> To: dan.magenheimer@oracle.com; Xen-Devel (E-mail)
> Cc: Dave Winchell
> Subject: Re: [PATCH] clocksource=tsc
>
>
> I'm sympathetic to the patch, but it looks like an ugly hack
> right now. Most
> of the clocksource_is_tsc special cases will fall away if
> you, e.g., have
> platform_timer_stamp = stime_platform_stamp = local_tsc_stamp =
> stime_local_stamp = 0; and provide a plt->read_counter() function; and
> accept that for now you will unnecessarily take the
> platform-timer spinlock;
> etc. Also it is platform timer 5, not 4!
>
> Clean it up and it'll go in. Since it should clearly only
> affect behaviour
> when the new clocksource is explicitly selected, I'll take this patch
> further into 3.3 freeze than usual -- it's more important to
> me that it's
> clean.
>
> Thanks,
> Keir
>
> On 12/7/08 22:38, "Dan Magenheimer"
> <dan.magenheimer@oracle.com> wrote:
>
> > Attached patch adds clocksource=tsc boot option that
> > uses TSC as clocksource. This option should only be
> > used on machines where TSC is known to be synchronized
> > across all processors. A future TODO is to dynamically
> > determine if this is the case.
> >
> > TSC may "beat" with another clocksource, resulting in
> > cross-processor Xen system time skew. This skew can
> > be visible in PV guests and can appear as "time is stopped"
> > in hvm guests. On some systems, this patch can reduce skew
> > by 30x or more.
> >
> > Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
> >
> > ===================================
> > Thanks... for the memory
> > I really could use more / My throughput's on the floor
> > The balloon is flat / My swap disk's fat / I've OOM's in store
> > Overcommitted so much
> > (with apologies to the late great Bob Hope)
>
>
>
[-- Attachment #2: tscstable2.patch --]
[-- Type: application/octet-stream, Size: 6953 bytes --]
diff -r bd97e45e073a xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Tue Jul 08 09:28:50 2008 +0100
+++ b/xen/arch/x86/time.c Mon Jul 14 11:10:09 2008 -0600
@@ -54,14 +54,14 @@ struct cpu_time {
s_time_t stime_local_stamp;
s_time_t stime_master_stamp;
struct time_scale tsc_scale;
- u32 cstate_plt_count_stamp;
+ u64 cstate_plt_count_stamp;
struct timer calibration_timer;
};
struct platform_timesource {
char *name;
u64 frequency;
- u32 (*read_counter)(void);
+ u64 (*read_counter)(void);
int counter_bits;
};
@@ -311,7 +311,7 @@ static char *freq_string(u64 freq)
* PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
*/
-static u32 read_pit_count(void)
+static u64 read_pit_count(void)
{
u16 count16;
u32 count32;
@@ -327,7 +327,7 @@ static u32 read_pit_count(void)
spin_unlock_irqrestore(&pit_lock, flags);
- return count32;
+ return (u64)count32;
}
static void init_pit(struct platform_timesource *pts)
@@ -343,9 +343,9 @@ static void init_pit(struct platform_tim
* PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
*/
-static u32 read_hpet_count(void)
+static u64 read_hpet_count(void)
{
- return hpet_read32(HPET_COUNTER);
+ return (u64)hpet_read32(HPET_COUNTER);
}
static int init_hpet(struct platform_timesource *pts)
@@ -383,9 +383,9 @@ int use_cyclone;
/* Cyclone MPMC0 register. */
static volatile u32 *cyclone_timer;
-static u32 read_cyclone_count(void)
+static u64 read_cyclone_count(void)
{
- return *cyclone_timer;
+ return (u64)*cyclone_timer;
}
static volatile u32 *map_cyclone_reg(unsigned long regaddr)
@@ -433,9 +433,9 @@ u32 pmtmr_ioport;
/* ACPI PM timer ticks at 3.579545 MHz. */
#define ACPI_PM_FREQUENCY 3579545
-static u32 read_pmtimer_count(void)
+static u64 read_pmtimer_count(void)
{
- return inl(pmtmr_ioport);
+ return (u64)inl(pmtmr_ioport);
}
static int init_pmtimer(struct platform_timesource *pts)
@@ -452,15 +452,38 @@ static int init_pmtimer(struct platform_
}
/************************************************************
+ * PLATFORM TIMER 5: TSC
+ */
+
+static u64 tsc_freq;
+
+static u64 read_tsc_count(void)
+{
+ u64 tsc;
+ rdtscll(tsc);
+ return tsc;
+}
+
+static int init_tsctimer(struct platform_timesource *pts)
+{
+ /* TODO: evaluate stability of TSC here, return 0 if not stable */
+ pts->name = "TSC";
+ pts->frequency = tsc_freq;
+ pts->read_counter = read_tsc_count;
+ pts->counter_bits = 64;
+ return 1;
+}
+
+/************************************************************
* GENERIC PLATFORM TIMER INFRASTRUCTURE
*/
static struct platform_timesource plt_src; /* details of chosen timesource */
-static u32 plt_mask; /* hardware-width mask */
+static u64 plt_mask; /* hardware-width mask */
static u64 plt_overflow_period; /* ns between calls to plt_overflow() */
static struct time_scale plt_scale; /* scale: platform counter -> nanosecs */
-/* Protected by platform_timer_lock. */
+/* Protected by platform_timer_lock. Must be zero for 64-bit clocksources */
static DEFINE_SPINLOCK(platform_timer_lock);
static s_time_t stime_platform_stamp; /* System time at below platform time */
static u64 platform_timer_stamp; /* Platform time at above system time */
@@ -472,8 +495,9 @@ static void plt_overflow(void *unused)
{
u32 count;
+ ASSERT(plt_src.counter_bits <= 32);
spin_lock(&platform_timer_lock);
- count = plt_src.read_counter();
+ count = (u32)plt_src.read_counter();
plt_stamp64 += (count - plt_stamp) & plt_mask;
plt_stamp = count;
spin_unlock(&platform_timer_lock);
@@ -506,6 +530,8 @@ static void platform_time_calibration(vo
u64 count;
s_time_t stamp;
+ if ( plt_src.counter_bits == 64 )
+ return;
spin_lock(&platform_timer_lock);
count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
stamp = __read_platform_stime(count);
@@ -516,6 +542,8 @@ static void platform_time_calibration(vo
static void resume_platform_timer(void)
{
+ if ( plt_src.counter_bits == 64 )
+ return;
/* No change in platform_stime across suspend/resume. */
platform_timer_stamp = plt_stamp64;
plt_stamp = plt_src.read_counter();
@@ -536,6 +564,8 @@ static void init_platform_timer(void)
rc = init_cyclone(pts);
else if ( !strcmp(opt_clocksource, "acpi") )
rc = init_pmtimer(pts);
+ else if ( !strcmp(opt_clocksource, "tsc") )
+ rc = init_tsctimer(pts);
if ( rc <= 0 )
printk("WARNING: %s clocksource '%s'.\n",
@@ -549,16 +579,26 @@ static void init_platform_timer(void)
!init_pmtimer(pts) )
init_pit(pts);
- plt_mask = (u32)~0u >> (32 - pts->counter_bits);
-
set_time_scale(&plt_scale, pts->frequency);
- plt_overflow_period = scale_delta(
- 1ull << (pts->counter_bits-1), &plt_scale);
- init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
- plt_overflow(NULL);
+ if (pts->counter_bits != 64 )
+ {
+ plt_mask = (u32)~0u >> (32 - pts->counter_bits);
- platform_timer_stamp = plt_stamp64;
+ plt_overflow_period = scale_delta(
+ 1ull << (pts->counter_bits-1), &plt_scale);
+ init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
+ plt_overflow(NULL);
+
+ platform_timer_stamp = plt_stamp64;
+ }
+ else
+ {
+ plt_mask = -1LL;
+ platform_timer_stamp = stime_platform_stamp = 0;
+ plt_stamp = plt_stamp64 = 0;
+ }
+
printk("Platform timer is %s %s\n",
freq_string(pts->frequency), pts->name);
@@ -577,10 +617,11 @@ void cstate_restore_tsc(void)
void cstate_restore_tsc(void)
{
struct cpu_time *t;
- u32 plt_count_delta;
+ u64 plt_count_delta;
u64 tsc_delta;
- if (!tsc_invariant){
+ if ( !tsc_invariant )
+ {
t = &this_cpu(cpu_time);
/* if platform counter overflow happens, interrupt will bring CPU from
@@ -687,14 +728,19 @@ static unsigned long get_cmos_time(void)
s_time_t get_s_time(void)
{
- struct cpu_time *t = &this_cpu(cpu_time);
+ struct cpu_time *t;
u64 tsc, delta;
s_time_t now;
rdtscll(tsc);
- delta = tsc - t->local_tsc_stamp;
- now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
-
+ if ( plt_src.counter_bits == 64 )
+ now = scale_delta(tsc, &plt_scale);
+ else
+ {
+ t = &this_cpu(cpu_time);
+ delta = tsc - t->local_tsc_stamp;
+ now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
+ }
return now;
}
@@ -996,6 +1042,7 @@ void __init early_time_init(void)
{
u64 tmp = init_pit_and_calibrate_tsc();
+ tsc_freq = tmp;
set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);
do_div(tmp, 1000);
[-- Attachment #3: tscstable3.patch --]
[-- Type: application/octet-stream, Size: 8687 bytes --]
diff -r bd97e45e073a xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Tue Jul 08 09:28:50 2008 +0100
+++ b/xen/arch/x86/time.c Mon Jul 14 11:50:54 2008 -0600
@@ -54,15 +54,17 @@ struct cpu_time {
s_time_t stime_local_stamp;
s_time_t stime_master_stamp;
struct time_scale tsc_scale;
- u32 cstate_plt_count_stamp;
+ u64 cstate_plt_count_stamp;
struct timer calibration_timer;
};
struct platform_timesource {
char *name;
u64 frequency;
- u32 (*read_counter)(void);
+ u32 (*read_counter32)(void);
+ u64 (*read_counter64)(void);
int counter_bits;
+ u64 mask;
};
static DEFINE_PER_CPU(struct cpu_time, cpu_time);
@@ -334,8 +336,9 @@ static void init_pit(struct platform_tim
{
pts->name = "PIT";
pts->frequency = CLOCK_TICK_RATE;
- pts->read_counter = read_pit_count;
+ pts->read_counter32 = read_pit_count;
pts->counter_bits = 32;
+ pts->mask = (u32)0xffffffff;
using_pit = 1;
}
@@ -357,8 +360,9 @@ static int init_hpet(struct platform_tim
pts->name = "HPET";
pts->frequency = hpet_rate;
- pts->read_counter = read_hpet_count;
+ pts->read_counter32 = read_hpet_count;
pts->counter_bits = 32;
+ pts->mask = (u32)0xffffffff;
return 1;
}
@@ -418,8 +422,9 @@ static int init_cyclone(struct platform_
pts->name = "IBM Cyclone";
pts->frequency = CYCLONE_TIMER_FREQ;
- pts->read_counter = read_cyclone_count;
+ pts->read_counter32 = read_cyclone_count;
pts->counter_bits = 32;
+ pts->mask = (u32)0xffffffff;
return 1;
}
@@ -445,9 +450,34 @@ static int init_pmtimer(struct platform_
pts->name = "ACPI PM Timer";
pts->frequency = ACPI_PM_FREQUENCY;
- pts->read_counter = read_pmtimer_count;
+ pts->read_counter32 = read_pmtimer_count;
pts->counter_bits = 24;
+ pts->mask = (u32)0xffffff;
+ return 1;
+}
+
+/************************************************************
+ * PLATFORM TIMER 5: TSC
+ */
+
+static u64 tsc_freq;
+
+static u64 read_tsc_count(void)
+{
+ u64 tsc;
+ rdtscll(tsc);
+ return tsc;
+}
+
+static int init_tsctimer(struct platform_timesource *pts)
+{
+ /* TODO: evaluate stability of TSC here, return 0 if not stable */
+ pts->name = "TSC";
+ pts->frequency = tsc_freq;
+ pts->read_counter64 = read_tsc_count;
+ pts->counter_bits = 64;
+ pts->mask = -1LL;
return 1;
}
@@ -456,11 +486,10 @@ static int init_pmtimer(struct platform_
*/
static struct platform_timesource plt_src; /* details of chosen timesource */
-static u32 plt_mask; /* hardware-width mask */
static u64 plt_overflow_period; /* ns between calls to plt_overflow() */
static struct time_scale plt_scale; /* scale: platform counter -> nanosecs */
-/* Protected by platform_timer_lock. */
+/* Protected by platform_timer_lock. Must be zero for 64-bit clocksources */
static DEFINE_SPINLOCK(platform_timer_lock);
static s_time_t stime_platform_stamp; /* System time at below platform time */
static u64 platform_timer_stamp; /* Platform time at above system time */
@@ -468,13 +497,22 @@ static u32 plt_stamp; /* hard
static u32 plt_stamp; /* hardware-width platform counter stamp */
static struct timer plt_overflow_timer;
+static inline u64 plt_read_counter(void)
+{
+ if ( plt_src.counter_bits == 64 )
+ return plt_src.read_counter64();
+ else
+ return plt_src.read_counter32();
+}
+
static void plt_overflow(void *unused)
{
u32 count;
+ ASSERT(plt_src.counter_bits <= 32);
spin_lock(&platform_timer_lock);
- count = plt_src.read_counter();
- plt_stamp64 += (count - plt_stamp) & plt_mask;
+ count = (u32)plt_src.read_counter32();
+ plt_stamp64 += (count - plt_stamp) & plt_src.mask;
plt_stamp = count;
spin_unlock(&platform_timer_lock);
@@ -494,7 +532,7 @@ static s_time_t read_platform_stime(void
s_time_t stime;
spin_lock(&platform_timer_lock);
- count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
+ count = plt_stamp64 + ((plt_read_counter() - plt_stamp) & plt_src.mask);
stime = __read_platform_stime(count);
spin_unlock(&platform_timer_lock);
@@ -506,8 +544,10 @@ static void platform_time_calibration(vo
u64 count;
s_time_t stamp;
+ if ( plt_src.counter_bits == 64 )
+ return;
spin_lock(&platform_timer_lock);
- count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
+ count = plt_stamp64 + ((plt_read_counter() - plt_stamp) & plt_src.mask);
stamp = __read_platform_stime(count);
stime_platform_stamp = stamp;
platform_timer_stamp = count;
@@ -516,9 +556,11 @@ static void platform_time_calibration(vo
static void resume_platform_timer(void)
{
+ if ( plt_src.counter_bits == 64 )
+ return;
/* No change in platform_stime across suspend/resume. */
platform_timer_stamp = plt_stamp64;
- plt_stamp = plt_src.read_counter();
+ plt_stamp = plt_read_counter();
}
static void init_platform_timer(void)
@@ -536,6 +578,8 @@ static void init_platform_timer(void)
rc = init_cyclone(pts);
else if ( !strcmp(opt_clocksource, "acpi") )
rc = init_pmtimer(pts);
+ else if ( !strcmp(opt_clocksource, "tsc") )
+ rc = init_tsctimer(pts);
if ( rc <= 0 )
printk("WARNING: %s clocksource '%s'.\n",
@@ -549,16 +593,23 @@ static void init_platform_timer(void)
!init_pmtimer(pts) )
init_pit(pts);
- plt_mask = (u32)~0u >> (32 - pts->counter_bits);
-
set_time_scale(&plt_scale, pts->frequency);
- plt_overflow_period = scale_delta(
- 1ull << (pts->counter_bits-1), &plt_scale);
- init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
- plt_overflow(NULL);
+ if (pts->counter_bits != 64 )
+ {
+ plt_overflow_period = scale_delta(
+ 1ull << (pts->counter_bits-1), &plt_scale);
+ init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
+ plt_overflow(NULL);
- platform_timer_stamp = plt_stamp64;
+ platform_timer_stamp = plt_stamp64;
+ }
+ else
+ {
+ platform_timer_stamp = stime_platform_stamp = 0;
+ plt_stamp = plt_stamp64 = 0;
+ }
+
printk("Platform timer is %s %s\n",
freq_string(pts->frequency), pts->name);
@@ -569,7 +620,7 @@ void cstate_save_tsc(void)
struct cpu_time *t = &this_cpu(cpu_time);
if (!tsc_invariant){
- t->cstate_plt_count_stamp = plt_src.read_counter();
+ t->cstate_plt_count_stamp = plt_read_counter();
rdtscll(t->cstate_tsc_stamp);
}
}
@@ -577,10 +628,11 @@ void cstate_restore_tsc(void)
void cstate_restore_tsc(void)
{
struct cpu_time *t;
- u32 plt_count_delta;
+ u64 plt_count_delta;
u64 tsc_delta;
- if (!tsc_invariant){
+ if ( !tsc_invariant )
+ {
t = &this_cpu(cpu_time);
/* if platform counter overflow happens, interrupt will bring CPU from
@@ -589,7 +641,7 @@ void cstate_restore_tsc(void)
is enough for delta calculation
*/
plt_count_delta =
- (plt_src.read_counter() - t->cstate_plt_count_stamp) & plt_mask;
+ (plt_read_counter() - t->cstate_plt_count_stamp) & plt_src.mask;
tsc_delta = scale_delta(plt_count_delta, &plt_scale)*cpu_khz/1000000UL;
wrmsrl(MSR_IA32_TSC, t->cstate_tsc_stamp + tsc_delta);
}
@@ -687,14 +739,19 @@ static unsigned long get_cmos_time(void)
s_time_t get_s_time(void)
{
- struct cpu_time *t = &this_cpu(cpu_time);
+ struct cpu_time *t;
u64 tsc, delta;
s_time_t now;
rdtscll(tsc);
- delta = tsc - t->local_tsc_stamp;
- now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
-
+ if ( plt_src.counter_bits == 64 )
+ now = scale_delta(tsc, &plt_scale);
+ else
+ {
+ t = &this_cpu(cpu_time);
+ delta = tsc - t->local_tsc_stamp;
+ now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale);
+ }
return now;
}
@@ -957,7 +1014,8 @@ void init_percpu_time(void)
local_irq_save(flags);
rdtscll(t->local_tsc_stamp);
- now = !plt_src.read_counter ? 0 : read_platform_stime();
+ now = ( !plt_src.read_counter32 && !plt_src.read_counter64 ) ? 0 :
+ read_platform_stime();
local_irq_restore(flags);
t->stime_master_stamp = now;
@@ -996,6 +1054,7 @@ void __init early_time_init(void)
{
u64 tmp = init_pit_and_calibrate_tsc();
+ tsc_freq = tmp;
set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp);
do_div(tmp, 1000);
[-- Attachment #4: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
next prev parent reply other threads:[~2008-07-14 17:59 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-07-12 21:38 [PATCH] clocksource=tsc Dan Magenheimer
2008-07-13 3:59 ` Dan Magenheimer
2008-07-14 9:24 ` Keir Fraser
2008-07-14 17:59 ` Dan Magenheimer [this message]
2008-07-15 0:35 ` Tian, Kevin
2008-07-17 23:47 ` Dan Magenheimer
2008-07-15 13:05 ` Keir Fraser
2008-07-15 14:44 ` Dan Magenheimer
2008-07-15 15:08 ` Keir Fraser
2008-07-15 15:46 ` Dan Magenheimer
2008-07-15 16:04 ` Dan Magenheimer
2008-07-16 1:15 ` Dan Magenheimer
2008-07-16 4:11 ` Dan Magenheimer
2008-07-16 12:43 ` Dan Magenheimer
2008-07-16 12:49 ` Keir Fraser
2008-07-16 13:43 ` Dan Magenheimer
2008-07-16 15:42 ` Dan Magenheimer
2008-07-16 19:32 ` Keir Fraser
2008-07-17 23:05 ` Dan Magenheimer
2008-07-18 7:24 ` Keir Fraser
2008-07-18 11:01 ` Keir Fraser
2008-07-18 11:10 ` Keir Fraser
2008-07-18 14:19 ` Dan Magenheimer
2008-07-18 14:29 ` Keir Fraser
2008-07-18 14:56 ` Dan Magenheimer
2008-07-18 15:00 ` Keir Fraser
2008-07-18 16:51 ` Dan Magenheimer
2008-07-18 19:28 ` Keir Fraser
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080714115941000.00000080236@djm-pc \
--to=dan.magenheimer@oracle.com \
--cc=dwinchell@virtualiron.com \
--cc=keir.fraser@eu.citrix.com \
--cc=xen-devel@lists.xensource.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.