All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jeremy Fitzhardinge <jeremy@goop.org>
To: Keir Fraser <keir@xensource.com>
Cc: xen-devel@lists.xensource.com, Jan Beulich <jbeulich@novell.com>
Subject: Re: [PATCH 7/10] linux 2.6.18: time handling
Date: Tue, 06 Mar 2007 07:50:39 -0800	[thread overview]
Message-ID: <45ED8DCF.2040408@goop.org> (raw)
In-Reply-To: <C213031C.AAEE%keir@xensource.com>

[-- Attachment #1: Type: text/plain, Size: 741 bytes --]

Keir Fraser wrote:
> I think Jeremy Fitzhardinge has an alternative clocksource patch which iirc
> is more in line with how Xen time works (should advertise a GHz frequency
> clocksource, and do scaling of the TSC value according to time-record values
> read from shared_info). Having thought about this some more I think
> clocksource support is worth getting into our tree, but let's look at both
> available patches and decide which is the better basis for further work.
>
> Jeremy: If I'm not mistaken and you do have a patch floating around, could
> you post it?
>   

Yes, there's a Xen clocksource in the pv_ops tree.  There's no nicely
separable patch, but the mechanism is pretty simple.  I've attached
arch/i386/xen/time.c

    J


[-- Attachment #2: time.c --]
[-- Type: text/x-csrc, Size: 11761 bytes --]

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/clocksource.h>

#include <asm/xen/hypercall.h>
#include <asm/arch_hooks.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/*
 * Largest backwards step of the clock, in nanoseconds, that the timer
 * interrupt accepts silently; a larger step produces a rate-limited warning.
 * Settable with "permitted_clock_jitter=" on the kernel command line.
 */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */

/* Boot-parameter handler: parse the number (base 0) and store it. */
static int __init __permitted_clock_jitter(char *str)
{
	unsigned long nsecs = simple_strtoul(str, NULL, 0);

	permitted_clock_jitter = nsecs;
	return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);


/*
 * These are periodically updated in shared_info, and then copied here by
 * get_time_values_from_xen().
 */
struct shadow_time_info {
	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;   /* Copy of tsc_to_system_mul; scale_delta()'s mul_frac. */
	int tsc_shift;         /* Shift scale_delta() applies to a TSC delta first. */
	u32 version;           /* Source version counter seen when the copy was made. */
};

/* Per-CPU shadow copy of the time record, filled by get_time_values_from_xen(). */
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/*
 * Keep track of last time we did processing/updating of jiffies and xtime.
 * The plain variable is the system-wide value advanced by the jiffy loop in
 * the timer interrupt; the per-CPU variable of the same name is advanced by
 * that CPU's stolen/blocked accounting.
 */
static u64 processed_system_time;   /* System time (ns) at last processing. */
static DEFINE_PER_CPU(u64, processed_system_time);

/*
 * How much CPU time was spent blocked and how much was 'stolen'?
 * Both are in nanoseconds and only ever advance in whole-tick multiples.
 */
static DEFINE_PER_CPU(u64, processed_stolen_time);
static DEFINE_PER_CPU(u64, processed_blocked_time);

/*
 * Current runstate of each CPU (updated automatically by the hypervisor).
 * Registered with Xen in init_missing_ticks_accounting().
 */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/*
 * Nanoseconds per timer tick.
 * Must be signed, as it's compared with s64 quantities which can be -ve.
 */
#define NS_PER_TICK (1000000000LL/HZ)

/*
 * Compute a CPU frequency in kHz from the TSC scaling parameters published
 * for VCPU 0 in shared_info.
 *
 * Starts from 10^6 << 32, divides by tsc_to_system_mul, then undoes
 * tsc_shift (left shift for a negative value, right shift otherwise).
 * The 64-bit result is returned as an unsigned long.
 */
unsigned long xen_cpu_khz(void)
{
	const struct vcpu_time_info *time =
		&HYPERVISOR_shared_info->vcpu_info[0].time;
	u64 khz = 1000000ULL << 32;

	/* do_div() divides in place; the remainder is not needed here. */
	do_div(khz, time->tsc_to_system_mul);

	if (time->tsc_shift >= 0)
		khz >>= time->tsc_shift;
	else
		khz <<= -time->tsc_shift;

	return khz;
}

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 *
 * The source is the current VCPU's time record and the destination is this
 * CPU's shadow_time.  The copy is lockless: the version is sampled before
 * the fields and checked again afterwards, and the whole copy is repeated
 * until a stable snapshot is obtained.
 */
static void get_time_values_from_xen(void)
{
	struct vcpu_time_info   *src;
	struct shadow_time_info *dst;

	src = &read_pda(xen.vcpu)->time;
	dst = &get_cpu_var(shadow_time);

	do {
		/* Sample the version first; the rmb() keeps the field reads after it. */
		dst->version = src->version;
		rmb();
		dst->tsc_timestamp     = src->tsc_timestamp;
		dst->system_timestamp  = src->system_time;
		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
		dst->tsc_shift         = src->tsc_shift;
		/* Finish the field reads before re-checking the version below. */
		rmb();
		/*
		 * Retry if the source version is odd (presumably an update in
		 * progress - confirm against the Xen interface headers) or if
		 * it changed while the fields were being copied.
		 */
	} while ((src->version & 1) | (dst->version ^ src->version));

	put_cpu_var(shadow_time);
}

/*
 * Tell whether this CPU's shadow copy still matches the live time record:
 * returns non-zero when the shadow version equals the version currently
 * published for this VCPU.
 */
static inline int time_values_up_to_date(void)
{
	struct vcpu_time_info *live = &read_pda(xen.vcpu)->time;
	unsigned shadow_version;

	shadow_version = get_cpu_var(shadow_time).version;
	put_cpu_var(shadow_time);

	/* Read barrier between the shadow read and the live version read. */
	rmb();

	return shadow_version == live->version;
}

/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 *
 * @delta:    value to scale (a TSC delta in this file).
 * @mul_frac: 32-bit fixed-point multiplier; the product is shifted right
 *            by 32 bits, so mul_frac acts as a fraction of 2^32.
 * @shift:    pre-scaling applied to @delta: a left shift when >= 0, a right
 *            shift by -shift when negative.
 *
 * Returns ((delta shifted by @shift) * mul_frac) >> 32.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#elif defined(__x86_64__)
	u64 hi;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	/*
	 * Two 32x32->64 multiplies (low and high half of delta), summed so
	 * that the result is bits 32..95 of the 96-bit product.
	 */
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif defined(__x86_64__)
	/*
	 * mul leaves the 128-bit product in rdx:rax, so rdx is overwritten.
	 * It is therefore declared as an output (tied to the mul_frac input)
	 * rather than as an input-only operand, which the compiler would be
	 * entitled to assume is still intact after the asm.
	 */
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product), "=d" (hi)
		: "0" (delta), "1" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}

/*
 * Nanoseconds elapsed since the shadow snapshot was taken: reads the TSC,
 * subtracts the snapshot's TSC stamp and scales the difference with the
 * snapshot's multiplier and shift.
 */
static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 tsc;

	rdtscll(tsc);

	return scale_delta(tsc - shadow->tsc_timestamp,
			   shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}


/*
 * Body of the Xen timer interrupt; xen_timer_interrupt() calls it with
 * xtime_lock held for writing.
 *
 * Steps:
 *  1. Take a consistent snapshot of the current system time and of this
 *     CPU's stolen/blocked runstate counters.
 *  2. Warn (rate-limited) if time appears to have gone backwards by more
 *     than permitted_clock_jitter.
 *  3. Advance the system-wide processed_system_time in whole ticks and
 *     hand the tick count to do_timer().
 *  4. Account stolen and blocked time for this CPU in whole ticks.
 *  5. Run update_process_times() for the interrupted context.
 */
static void xen_timer_interrupt_hook(void)
{
	s64 delta, delta_cpu, stolen, blocked;
	u64 sched_time;
	int i, cpu = smp_processor_id();
	unsigned long ticks;
	struct shadow_time_info *shadow = &__get_cpu_var(shadow_time);
	struct vcpu_runstate_info *runstate = &__get_cpu_var(runstate);

	do {
		get_time_values_from_xen();

		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
		delta = delta_cpu =
			shadow->system_timestamp + get_nsec_offset(shadow);
		/* Compiled-out debugging aid. */
		if (0)
			printk("tsc_timestamp=%llu system_timestamp=%llu tsc_to_nsec=%u tsc_shift=%d, version=%u, delta=%lld processed_system_time=%lld\n",
			       shadow->tsc_timestamp, shadow->system_timestamp,
			       shadow->tsc_to_nsec_mul, shadow->tsc_shift,
			       shadow->version, delta, processed_system_time);

		/* Time not yet processed: system-wide and for this CPU. */
		delta     -= processed_system_time;
		delta_cpu -= __get_cpu_var(processed_system_time);

		/*
		 * Obtain a consistent snapshot of stolen/blocked cycles. We
		 * can use state_entry_time to detect if we get preempted here.
		 */
		do {
			sched_time = runstate->state_entry_time;
			barrier();
			stolen = runstate->time[RUNSTATE_runnable] +
				runstate->time[RUNSTATE_offline] -
				__get_cpu_var(processed_stolen_time);
			blocked = runstate->time[RUNSTATE_blocked] -
				__get_cpu_var(processed_blocked_time);
			barrier();
		} while (sched_time != runstate->state_entry_time);
	} while (!time_values_up_to_date());

	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
	    && printk_ratelimit()) {
		printk("Timer ISR/%d: Time went backwards: "
		       "delta=%lld delta_cpu=%lld shadow=%lld "
		       "off=%lld processed=%lld cpu_processed=%lld\n",
		       cpu, delta, delta_cpu, shadow->system_timestamp,
		       (s64)get_nsec_offset(shadow),
		       processed_system_time,
		       __get_cpu_var(processed_system_time));
		/*
		 * Walk the online mask rather than 0..num_online_cpus()-1:
		 * online CPU ids need not be contiguous, and per_cpu() must
		 * only be used with ids that are really there.
		 */
		for_each_online_cpu(i)
			printk(" %d: %lld\n", i,
			       per_cpu(processed_system_time, i));
	}

	/* System-wide jiffy work. */
	ticks = 0;
	while(delta > NS_PER_TICK) {
		delta -= NS_PER_TICK;
		processed_system_time += NS_PER_TICK;
		ticks++;
	}
	do_timer(ticks);

	/*
	 * Account stolen ticks.
	 * HACK: Passing NULL to account_steal_time()
	 * ensures that the ticks are accounted as stolen.
	 */
	if ((stolen > 0) && (delta_cpu > 0)) {
		delta_cpu -= stolen;
		if (unlikely(delta_cpu < 0))
			stolen += delta_cpu; /* clamp local-time progress */
		/* Convert nanoseconds to whole ticks (in place). */
		do_div(stolen, NS_PER_TICK);
		__get_cpu_var(processed_stolen_time) += stolen * NS_PER_TICK;
		__get_cpu_var(processed_system_time) += stolen * NS_PER_TICK;
		account_steal_time(NULL, (cputime_t)stolen);
	}

	/*
	 * Account blocked ticks.
	 * HACK: Passing idle_task to account_steal_time()
	 * ensures that the ticks are accounted as idle/wait.
	 */
	if ((blocked > 0) && (delta_cpu > 0)) {
		delta_cpu -= blocked;
		if (unlikely(delta_cpu < 0))
			blocked += delta_cpu; /* clamp local-time progress */
		/* Convert nanoseconds to whole ticks (in place). */
		do_div(blocked, NS_PER_TICK);
		__get_cpu_var(processed_blocked_time) += blocked * NS_PER_TICK;
		__get_cpu_var(processed_system_time)  += blocked * NS_PER_TICK;
		account_steal_time(idle_task(cpu), (cputime_t)blocked);
	}

	update_process_times(user_mode_vm(get_irq_regs()));
}

/*
 * Clocksource read callback: refreshes this CPU's shadow time record and
 * returns the snapshot's system time plus the nanoseconds elapsed since
 * that snapshot.
 */
static cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	cycle_t now;

	get_time_values_from_xen();

	now = shadow->system_timestamp;
	now += get_nsec_offset(shadow);

	put_cpu_var(shadow_time);

	return now;
}

static void xen_read_wallclock(struct timespec *ts)
{
	const struct shared_info *s = HYPERVISOR_shared_info;
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = s->wc_version;
		rmb();
		now.tv_sec  = s->wc_sec;
		now.tv_nsec = s->wc_nsec;
		rmb();
	} while ((s->wc_version & 1) | (version ^ s->wc_version));

	delta = xen_clocksource_read();	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);

	return ts.tv_sec;
}

/*
 * Setting the wallclock is not implemented (nothing to do for a domU):
 * the argument is ignored and -1 is always returned.
 */
int xen_set_wallclock(unsigned long now)
{
	(void)now;
	return -1;
}

/*
 * Clocksource backed by xen_clocksource_read(), which already returns
 * nanoseconds; mult and shift are therefore chosen to cancel out
 * (mult == 1 << shift).
 */
static struct clocksource xen_clocksource = {
	.name	= "xen",
	.rating	= 400,
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.read	= xen_clocksource_read,
	.mask	= ~0,
	.shift	= XEN_SHIFT,
	.mult	= 1<<XEN_SHIFT,		/* time directly in nanoseconds */
};

static void init_missing_ticks_accounting(int cpu)
{
	struct vcpu_register_runstate_memory_area area;
	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);

	memset(runstate, 0, sizeof(*runstate));

	area.addr.v = runstate;
	HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);

	per_cpu(processed_blocked_time, cpu) =
		runstate->time[RUNSTATE_blocked];
	per_cpu(processed_stolen_time, cpu) =
		runstate->time[RUNSTATE_runnable] +
		runstate->time[RUNSTATE_offline];
}

/*
 * IRQ handler for the Xen timer: runs xen_timer_interrupt_hook() under the
 * xtime_lock write side and always reports the interrupt as handled.
 * Neither @irq nor @dev_id is used.
 */
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	/*
	 * Interrupts are disabled locally in the timer irq handler, so the
	 * plain (non-irq) write_seqlock() is enough here; the lock itself
	 * guards against timer work running concurrently on another CPU.
	 */
	write_seqlock(&xtime_lock);
	xen_timer_interrupt_hook();
	write_sequnlock(&xtime_lock);

	return IRQ_HANDLED;
}

static void setup_cpu0_timer_irq(void)
{
	printk(KERN_DEBUG "installing Xen timer for CPU 0\n");

	bind_virq_to_irqhandler(
		VIRQ_TIMER,
		0,
		xen_timer_interrupt,
		SA_INTERRUPT,
		"timer0",
		NULL);
}

__init void xen_time_init(void)
{
	get_time_values_from_xen();

	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
	per_cpu(processed_system_time, 0) = processed_system_time;

	init_missing_ticks_accounting(0);

	clocksource_register(&xen_clocksource);

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	tsc_disable = 0;

	setup_cpu0_timer_irq();
}

/*
 * Convert an absolute jiffies value to system time (ns).
 *
 * The result is processed_system_time plus the distance from the current
 * jiffies, in ticks.  Two special cases:
 *  - a target that is not in the future is treated as one tick ahead;
 *  - a target so far ahead that one of the top three bits of the distance
 *    is set yields 0.
 * Everything is read under the xtime_lock seqlock and retried if a writer
 * intervened.
 */
static u64 jiffies_to_st(unsigned long j)
{
	unsigned long seq;
	u64 st;

	do {
		long ahead;

		seq = read_seqbegin(&xtime_lock);
		ahead = j - jiffies;

		/* Triggers in some wrap-around cases, but that's okay:
		 * we just end up with a shorter timeout. */
		if (ahead < 1)
			ahead = 1;

		if (((unsigned long)ahead >> (BITS_PER_LONG-3)) != 0) {
			/* Very long timeout means there is no pending timer.
			 * We indicate this to Xen by passing zero timeout. */
			st = 0;
		} else {
			st = processed_system_time + ahead * (u64)NS_PER_TICK;
		}
	} while (read_seqretry(&xtime_lock, seq));

	return st;
}

/*
 * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
 * These functions are based on implementations from arch/s390/kernel/time.c
 *
 * stop_hz_timer() marks this CPU in nohz_cpu_mask and programs the Xen
 * timer for the next pending timer.  If RCU, a softirq or an already-due
 * timer still needs the tick, the CPU is taken back out of the mask and
 * the timer is set one jiffy ahead.
 */
void stop_hz_timer(void)
{
	unsigned int cpu = smp_processor_id();
	unsigned long j;
	int need_tick;

	cpu_set(cpu, nohz_cpu_mask);

	/*
	 * Pairs with the smp_mb in rcu_start_batch in rcupdate.c.  Together
	 * they ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches
	 * a value of rcp->cur that matches rdp->quiescbatch and allows us to
	 * stop the hz timer, then the cpumasks created for subsequent values
	 * of cur in rcu_start_batch are guaranteed to pick up the updated
	 * nohz_cpu_mask and so will not depend on this cpu.
	 */
	smp_mb();

	/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
	need_tick = rcu_needs_cpu(cpu) || local_softirq_pending();
	if (!need_tick) {
		j = next_timer_interrupt();
		need_tick = time_before_eq(j, jiffies);
	}

	if (need_tick) {
		cpu_clear(cpu, nohz_cpu_mask);
		j = jiffies + 1;
	}

	if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
		BUG();
}

/* Leave tickless mode: remove the current CPU from nohz_cpu_mask. */
void start_hz_timer(void)
{
	unsigned int cpu = smp_processor_id();

	cpu_clear(cpu, nohz_cpu_mask);
}


[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

      reply	other threads:[~2007-03-06 15:50 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-03-05 11:17 [PATCH 7/10] linux 2.6.18: time handling Jan Beulich
2007-03-05 15:03 ` Keir Fraser
2007-03-05 15:09   ` Jan Beulich
2007-03-05 15:20     ` Keir Fraser
2007-03-06 11:38 ` Keir Fraser
2007-03-06 15:50   ` Jeremy Fitzhardinge [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=45ED8DCF.2040408@goop.org \
    --to=jeremy@goop.org \
    --cc=jbeulich@novell.com \
    --cc=keir@xensource.com \
    --cc=xen-devel@lists.xensource.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.