virtualization.lists.linux-foundation.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/7] header and stubs for paravirtualizing critical operations
  2006-10-29  2:45 [PATCH 0/7] x86 paravirtualization infrastructure Chris Wright
@ 2006-10-28  7:00 ` Chris Wright
  2006-10-29 16:40   ` Andi Kleen
  2006-10-28  7:00 ` [PATCH 2/7] Patch inline replacements for common paravirt operations Chris Wright
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 54+ messages in thread
From: Chris Wright @ 2006-10-28  7:00 UTC (permalink / raw)
  To: akpm, ak
  Cc: Rusty Russell, Jeremy Fitzhardinge, Zachary Amsden, linux-kernel,
	virtualization

[-- Attachment #1: paravirt.h-addition.patch --]
[-- Type: text/plain, Size: 37026 bytes --]

Create a paravirt.h header for all the critical operations which need
to be replaced with hypervisor calls, and include that instead of
defining native operations, when CONFIG_PARAVIRT.

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure.  Currently
these are function implementations of native hardware: hypervisors
will override the ops structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assembler easier.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Zachary Amsden <zach@vmware.com>

---
 arch/i386/Kconfig                          |   11 
 arch/i386/boot/compressed/misc.c           |    1 
 arch/i386/kernel/Makefile                  |    1 
 arch/i386/kernel/asm-offsets.c             |   10 
 arch/i386/kernel/entry.S                   |   34 +-
 arch/i386/kernel/i8259.c                   |    2 
 arch/i386/kernel/paravirt.c                |  410 +++++++++++++++++++++++++++++
 arch/i386/kernel/setup.c                   |    2 
 arch/i386/kernel/smpboot.c                 |    5 
 arch/i386/kernel/time.c                    |   15 -
 drivers/net/de600.c                        |    1 
 include/asm-i386/delay.h                   |    8 
 include/asm-i386/desc.h                    |    4 
 include/asm-i386/io.h                      |   10 
 include/asm-i386/irq.h                     |   10 
 include/asm-i386/irqflags.h                |   42 +-
 include/asm-i386/mach-default/setup_arch.h |    2 
 include/asm-i386/msr.h                     |    5 
 include/asm-i386/paravirt.h                |  291 ++++++++++++++++++++
 include/asm-i386/processor.h               |   15 -
 include/asm-i386/segment.h                 |    2 
 include/asm-i386/setup.h                   |    8 
 include/asm-i386/spinlock.h                |    4 
 include/asm-i386/system.h                  |   16 -
 include/asm-i386/time.h                    |   41 ++
 25 files changed, 902 insertions(+), 48 deletions(-)

--- linux-2.6-pv.orig/arch/i386/Kconfig
+++ linux-2.6-pv/arch/i386/Kconfig
@@ -197,6 +197,17 @@ config X86_ES7000
 
 endchoice
 
+config PARAVIRT
+	bool "Paravirtualization support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Paravirtualization is a way of running multiple instances of
+	  Linux on the same machine, under a hypervisor.  This option
+	  changes the kernel so it can modify itself when it is run
+	  under a hypervisor, improving performance significantly.
+	  However, when run without a hypervisor the kernel is
+	  theoretically slower.  If in doubt, say N.
+
 config ACPI_SRAT
 	bool
 	default y
--- linux-2.6-pv.orig/arch/i386/boot/compressed/misc.c
+++ linux-2.6-pv/arch/i386/boot/compressed/misc.c
@@ -9,6 +9,7 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+#undef CONFIG_PARAVIRT
 #include <linux/linkage.h>
 #include <linux/vmalloc.h>
 #include <linux/screen_info.h>
--- linux-2.6-pv.orig/arch/i386/kernel/Makefile
+++ linux-2.6-pv/arch/i386/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 
 EXTRA_AFLAGS   := -traditional
 
--- linux-2.6-pv.orig/arch/i386/kernel/asm-offsets.c
+++ linux-2.6-pv/arch/i386/kernel/asm-offsets.c
@@ -101,4 +101,14 @@ void foo(void)
 	BLANK();
  	OFFSET(PDA_cpu, i386_pda, cpu_number);
 	OFFSET(PDA_pcurrent, i386_pda, pcurrent);
+
+#ifdef CONFIG_PARAVIRT
+	BLANK();
+	OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
+	OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+	OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+	OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
 }
--- linux-2.6-pv.orig/arch/i386/kernel/entry.S
+++ linux-2.6-pv/arch/i386/kernel/entry.S
@@ -62,13 +62,6 @@ DF_MASK		= 0x00000400 
 NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS		cli
-#define ENABLE_INTERRUPTS		sti
-#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
-#define INTERRUPT_RETURN		iret
-#define GET_CR0_INTO_EAX		movl %cr0, %eax
-
 #ifdef CONFIG_PREEMPT
 #define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
 #else
@@ -416,6 +409,20 @@ ldt_ss:
 	jnz restore_nocheck
 	testl $0x00400000, %eax		# returning to 32bit stack?
 	jnz restore_nocheck		# allright, normal return
+
+#ifdef CONFIG_PARAVIRT
+	/* 
+	 * The kernel can't run on a non-flat stack if paravirt mode
+	 * is active.  Rather than try to fixup the high bits of
+	 * ESP, bypass this code entirely.  This may break DOSemu
+	 * and/or Wine support in a paravirt VM, although the option
+	 * is still available to implement the setting of the high
+	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
+	 */
+	cmpl $0, paravirt_ops+PARAVIRT_enabled
+	jne restore_nocheck
+#endif
+	
 	/* If returning to userspace with 16bit stack,
 	 * try to fix the higher word of ESP, as the CPU
 	 * won't restore it.
@@ -831,6 +838,19 @@ nmi_espfix_stack:
 .previous
 KPROBE_END(nmi)
 
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
+ENTRY(native_irq_enable_sysexit)
+	sti
+	sysexit
+#endif
+
 KPROBE_ENTRY(int3)
 	RING0_INT_FRAME
 	pushl $-1			# mark this as an int
--- linux-2.6-pv.orig/arch/i386/kernel/i8259.c
+++ linux-2.6-pv/arch/i386/kernel/i8259.c
@@ -392,7 +392,7 @@ void __init init_ISA_irqs (void)
 	}
 }
 
-void __init init_IRQ(void)
+void __init native_init_IRQ(void)
 {
 	int i;
 
--- linux-2.6-pv.orig/arch/i386/kernel/setup.c
+++ linux-2.6-pv/arch/i386/kernel/setup.c
@@ -1404,7 +1404,7 @@ void __init setup_arch(char **cmdline_p)
 		efi_init();
 	else {
 		printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-		print_memory_map(machine_specific_memory_setup());
+		print_memory_map(memory_setup());
 	}
 
 	copy_edd();
--- linux-2.6-pv.orig/arch/i386/kernel/smpboot.c
+++ linux-2.6-pv/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
  *		Dave Jones	:	Report invalid combinations of Athlon CPUs.
 *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process. */
 
+
+/* SMP boot always wants to use real time delay to allow sufficient time for
+ * the APs to come online */
+#define USE_REAL_TIME_DELAY
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
--- linux-2.6-pv.orig/arch/i386/kernel/time.c
+++ linux-2.6-pv/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
+#include <asm/time.h>
 
 #include "mach_time.h"
 
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long no
 	/* gets recalled with irq locally disabled */
 	/* XXX - does irqsave resolve this? -johnstul */
 	spin_lock_irqsave(&rtc_lock, flags);
-	if (efi_enabled)
-		retval = efi_set_rtc_mmss(nowtime);
-	else
-		retval = mach_set_rtc_mmss(nowtime);
+	retval = set_wallclock(nowtime);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	return retval;
@@ -211,10 +209,7 @@ unsigned long read_persistent_clock(void
 
 	spin_lock_irqsave(&rtc_lock, flags);
 
-	if (efi_enabled)
-		retval = efi_get_time();
-	else
-		retval = mach_get_cmos_time();
+	retval = get_wallclock();
 
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
@@ -280,7 +275,7 @@ static void __init hpet_time_init(void)
 		printk("Using HPET for base-timer\n");
 	}
 
-	time_init_hook();
+	do_time_init();
 }
 #endif
 
@@ -296,5 +291,5 @@ void __init time_init(void)
 		return;
 	}
 #endif
-	time_init_hook();
+	do_time_init();
 }
--- linux-2.6-pv.orig/drivers/net/de600.c
+++ linux-2.6-pv/drivers/net/de600.c
@@ -43,7 +43,6 @@ static const char version[] = "de600.c: 
  * modify the following "#define": (see <asm/io.h> for more info)
 #define REALLY_SLOW_IO
  */
-#define SLOW_IO_BY_JUMPING /* Looks "better" than dummy write to port 0x80 :-) */
 
 /* use 0 for production, 1 for verification, >2 for debug */
 #ifdef DE600_DEBUG
--- linux-2.6-pv.orig/include/asm-i386/delay.h
+++ linux-2.6-pv/include/asm-i386/delay.h
@@ -15,6 +15,13 @@ extern void __ndelay(unsigned long nsecs
 extern void __const_udelay(unsigned long usecs);
 extern void __delay(unsigned long loops);
 
+#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY)
+#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul)
+	
+#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul)
+
+#else /* !PARAVIRT || USE_REAL_TIME_DELAY */
+
 #define udelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
 	__udelay(n))
@@ -22,6 +29,7 @@ extern void __delay(unsigned long loops)
 #define ndelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
 	__ndelay(n))
+#endif
 
 void use_tsc_delay(void);
 
--- linux-2.6-pv.orig/include/asm-i386/desc.h
+++ linux-2.6-pv/include/asm-i386/desc.h
@@ -55,6 +55,9 @@ static inline void pack_gate(u32 *low, u
 #define DESCTYPE_DPL3	0x60	/* DPL-3 */
 #define DESCTYPE_S	0x10	/* !system */
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
 
 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
@@ -104,6 +107,7 @@ static inline void set_ldt(void *addr, u
 		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
 	}
 }
+#endif /* CONFIG_PARAVIRT */
 
 static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
 {
--- linux-2.6-pv.orig/include/asm-i386/io.h
+++ linux-2.6-pv/include/asm-i386/io.h
@@ -256,11 +256,11 @@ static inline void flush_write_buffers(v
 
 #endif /* __KERNEL__ */
 
-#ifdef SLOW_IO_BY_JUMPING
-#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
-#else
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else 
+
 #define __SLOW_DOWN_IO "outb %%al,$0x80;"
-#endif
 
 static inline void slow_down_io(void) {
 	__asm__ __volatile__(
@@ -271,6 +271,8 @@ static inline void slow_down_io(void) {
 		: : );
 }
 
+#endif
+
 #ifdef CONFIG_X86_NUMAQ
 extern void *xquad_portio;    /* Where the IO area was mapped */
 #define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
--- linux-2.6-pv.orig/include/asm-i386/irq.h
+++ linux-2.6-pv/include/asm-i386/irq.h
@@ -41,4 +41,14 @@ extern int irqbalance_disable(char *str)
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+void __init native_init_IRQ(void);
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+static inline void init_IRQ(void)
+{
+	native_init_IRQ();
+}
+#endif /* CONFIG_PARAVIRT */
+
 #endif /* _ASM_IRQ_H */
--- linux-2.6-pv.orig/include/asm-i386/irqflags.h
+++ linux-2.6-pv/include/asm-i386/irqflags.h
@@ -10,6 +10,9 @@
 #ifndef _ASM_IRQFLAGS_H
 #define _ASM_IRQFLAGS_H
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #ifndef __ASSEMBLY__
 
 static inline unsigned long __raw_local_save_flags(void)
@@ -25,9 +28,6 @@ static inline unsigned long __raw_local_
 	return flags;
 }
 
-#define raw_local_save_flags(flags) \
-		do { (flags) = __raw_local_save_flags(); } while (0)
-
 static inline void raw_local_irq_restore(unsigned long flags)
 {
 	__asm__ __volatile__(
@@ -66,18 +66,6 @@ static inline void halt(void)
 	__asm__ __volatile__("hlt": : :"memory");
 }
 
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-	return !(flags & (1 << 9));
-}
-
-static inline int raw_irqs_disabled(void)
-{
-	unsigned long flags = __raw_local_save_flags();
-
-	return raw_irqs_disabled_flags(flags);
-}
-
 /*
  * For spinlocks, etc:
  */
@@ -90,9 +78,33 @@ static inline unsigned long __raw_local_
 	return flags;
 }
 
+#else
+#define DISABLE_INTERRUPTS		cli
+#define ENABLE_INTERRUPTS		sti
+#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
+#define INTERRUPT_RETURN		iret
+#define GET_CR0_INTO_EAX		movl %cr0, %eax
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+		do { (flags) = __raw_local_save_flags(); } while (0)
+
 #define raw_local_irq_save(flags) \
 		do { (flags) = __raw_local_irq_save(); } while (0)
 
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+	return !(flags & (1 << 9));
+}
+
+static inline int raw_irqs_disabled(void)
+{
+	unsigned long flags = __raw_local_save_flags();
+
+	return raw_irqs_disabled_flags(flags);
+}
 #endif /* __ASSEMBLY__ */
 
 /*
--- linux-2.6-pv.orig/include/asm-i386/mach-default/setup_arch.h
+++ linux-2.6-pv/include/asm-i386/mach-default/setup_arch.h
@@ -2,4 +2,6 @@
 
 /* no action for generic */
 
+#ifndef ARCH_SETUP
 #define ARCH_SETUP
+#endif
--- linux-2.6-pv.orig/include/asm-i386/msr.h
+++ linux-2.6-pv/include/asm-i386/msr.h
@@ -1,6 +1,10 @@
 #ifndef __ASM_MSR_H
 #define __ASM_MSR_H
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+
 /*
  * Access to machine-specific registers (available on 586 and better only)
  * Note: the rd* operations modify the parameters directly (without using
@@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long
      __asm__ __volatile__("rdpmc" \
 			  : "=a" (low), "=d" (high) \
 			  : "c" (counter))
+#endif	/* !CONFIG_PARAVIRT */
 
 /* symbolic names for some interesting MSRs */
 /* Intel defined MSRs. */
--- linux-2.6-pv.orig/include/asm-i386/processor.h
+++ linux-2.6-pv/include/asm-i386/processor.h
@@ -146,8 +146,8 @@ static inline void detect_ht(struct cpui
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
 
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
-			   unsigned int *ecx, unsigned int *edx)
+static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx,
+					 unsigned int *ecx, unsigned int *edx)
 {
 	/* ecx is often an input as well as an output. */
 	__asm__("cpuid"
@@ -548,6 +548,12 @@ static inline void rep_nop(void)
 
 #define cpu_relax()	rep_nop()
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_enabled() 0
+#define __cpuid native_cpuid
+
 static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
 {
 	tss->esp0 = thread->esp0;
@@ -570,10 +576,13 @@ static inline void load_esp0(struct tss_
 			: /* no output */			\
 			:"r" (value))
 
+#define set_iopl_mask native_set_iopl_mask
+#endif /* CONFIG_PARAVIRT */
+
 /*
  * Set IOPL bits in EFLAGS from given mask
  */
-static inline void set_iopl_mask(unsigned mask)
+static fastcall inline void native_set_iopl_mask(unsigned mask)
 {
 	unsigned int reg;
 	__asm__ __volatile__ ("pushfl;"
--- linux-2.6-pv.orig/include/asm-i386/segment.h
+++ linux-2.6-pv/include/asm-i386/segment.h
@@ -131,5 +131,7 @@
 #define SEGMENT_LDT		0x4
 #define SEGMENT_GDT		0x0
 
+#ifndef CONFIG_PARAVIRT
 #define get_kernel_rpl()  0
 #endif
+#endif
--- linux-2.6-pv.orig/include/asm-i386/setup.h
+++ linux-2.6-pv/include/asm-i386/setup.h
@@ -70,6 +70,14 @@ extern unsigned char boot_params[PARAM_S
 struct e820entry;
 
 char * __init machine_specific_memory_setup(void);
+#ifndef CONFIG_PARAVIRT
+static inline char *memory_setup(void)
+{
+	return machine_specific_memory_setup();
+}
+#else
+#include <asm/paravirt.h>
+#endif
 
 int __init copy_e820_map(struct e820entry * biosmap, int nr_map);
 int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map);
--- linux-2.6-pv.orig/include/asm-i386/spinlock.h
+++ linux-2.6-pv/include/asm-i386/spinlock.h
@@ -7,8 +7,12 @@
 #include <asm/processor.h>
 #include <linux/compiler.h>
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define CLI_STRING	"cli"
 #define STI_STRING	"sti"
+#endif /* CONFIG_PARAVIRT */
 
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
--- linux-2.6-pv.orig/include/asm-i386/system.h
+++ linux-2.6-pv/include/asm-i386/system.h
@@ -88,6 +88,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define savesegment(seg, value) \
 	asm volatile("mov %%" #seg ",%0":"=rm" (value))
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define read_cr0() ({ \
 	unsigned int __dummy; \
 	__asm__ __volatile__( \
@@ -139,17 +142,18 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define write_cr4(x) \
 	__asm__ __volatile__("movl %0,%%cr4": :"r" (x))
 
-/*
- * Clear and set 'TS' bit respectively
- */
+#define wbinvd() \
+	__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
 #define clts() __asm__ __volatile__ ("clts")
+#endif/* CONFIG_PARAVIRT */
+
+/* Set the 'TS' bit */
 #define stts() write_cr0(8 | read_cr0())
 
 #endif	/* __KERNEL__ */
 
-#define wbinvd() \
-	__asm__ __volatile__ ("wbinvd": : :"memory")
-
 static inline unsigned long get_limit(unsigned long segment)
 {
 	unsigned long __limit;
--- /dev/null
+++ linux-2.6-pv/arch/i386/kernel/paravirt.c
@@ -0,0 +1,410 @@
+/*  Paravirtualization interfaces
+    Copyright (C) 2006 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+
+/* nop stub */
+static void native_nop(void)
+{
+}
+
+static void __init default_banner(void)
+{
+	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+	       paravirt_ops.name);
+}
+
+static fastcall unsigned long native_get_debugreg(int regno)
+{
+	unsigned long val = 0; 	/* Damn you, gcc! */
+
+	switch (regno) {
+	case 0:
+		asm("movl %%db0, %0" :"=r" (val)); break;
+	case 1:
+		asm("movl %%db1, %0" :"=r" (val)); break;
+	case 2:
+		asm("movl %%db2, %0" :"=r" (val)); break;
+	case 3:
+		asm("movl %%db3, %0" :"=r" (val)); break;
+	case 6:
+		asm("movl %%db6, %0" :"=r" (val)); break;
+	case 7:
+		asm("movl %%db7, %0" :"=r" (val)); break;
+	default:
+		BUG();
+	}
+	return val;
+}
+
+static fastcall void native_set_debugreg(int regno, unsigned long value)
+{
+	switch (regno) {
+	case 0:
+		asm("movl %0,%%db0"	: /* no output */ :"r" (value));
+		break;
+	case 1:
+		asm("movl %0,%%db1"	: /* no output */ :"r" (value));
+		break;
+	case 2:
+		asm("movl %0,%%db2"	: /* no output */ :"r" (value));
+		break;
+	case 3:
+		asm("movl %0,%%db3"	: /* no output */ :"r" (value));
+		break;
+	case 6:
+		asm("movl %0,%%db6"	: /* no output */ :"r" (value));
+		break;
+	case 7:
+		asm("movl %0,%%db7"	: /* no output */ :"r" (value));
+		break;
+	default:
+		BUG();
+	}
+}
+
+static fastcall void native_clts(void)
+{
+	asm volatile ("clts");
+}
+
+static fastcall unsigned long native_read_cr0(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr0(unsigned long val)
+{
+	asm volatile("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr2(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr2(unsigned long val)
+{
+	asm volatile("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr3(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr3(unsigned long val)
+{
+	asm volatile("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr4(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall unsigned long native_read_cr4_safe(void)
+{
+	unsigned long val;
+	/* This could fault if %cr4 does not exist */
+	asm("1: movl %%cr4, %0		\n"
+		"2:				\n"
+		".section __ex_table,\"a\"	\n"
+		".long 1b,2b			\n"
+		".previous			\n"
+		: "=r" (val): "0" (0));
+	return val;
+}
+
+static fastcall void native_write_cr4(unsigned long val)
+{
+	asm volatile("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long native_save_fl(void)
+{
+	unsigned long f;
+	asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
+	return f;
+}
+
+static fastcall void native_restore_fl(unsigned long f)
+{
+	asm volatile("pushl %0 ; popfl": /* no output */
+			     :"g" (f)
+			     :"memory", "cc");
+}
+
+static fastcall void native_irq_disable(void)
+{
+	asm volatile("cli": : :"memory");
+}
+
+static fastcall void native_irq_enable(void)
+{
+	asm volatile("sti": : :"memory");
+}
+
+static fastcall void native_safe_halt(void)
+{
+	asm volatile("sti; hlt": : :"memory");
+}
+
+static fastcall void native_halt(void)
+{
+	asm volatile("hlt": : :"memory");
+}
+
+static fastcall void native_wbinvd(void)
+{
+	asm volatile("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+{
+	unsigned long long val;
+
+	asm volatile("2: rdmsr ; xorl %0,%0\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  movl %3,%0 ; jmp 1b\n\t"
+		     ".previous\n\t"
+ 		     ".section __ex_table,\"a\"\n"
+		     "   .align 4\n\t"
+		     "   .long 	2b,3b\n\t"
+		     ".previous"
+		     : "=r" (*err), "=A" (val)
+		     : "c" (msr), "i" (-EFAULT));
+
+	return val;
+}
+
+static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+{
+	int err;
+	asm volatile("2: wrmsr ; xorl %0,%0\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  movl %4,%0 ; jmp 1b\n\t"
+		     ".previous\n\t"
+ 		     ".section __ex_table,\"a\"\n"
+		     "   .align 4\n\t"
+		     "   .long 	2b,3b\n\t"
+		     ".previous"
+		     : "=a" (err)
+		     : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+		       "i" (-EFAULT));
+	return err;
+}
+
+static fastcall unsigned long long native_read_tsc(void)
+{
+	unsigned long long val;
+	asm volatile("rdtsc" : "=A" (val));
+	return val;
+}
+
+static fastcall unsigned long long native_read_pmc(void)
+{
+	unsigned long long val;
+	asm volatile("rdpmc" : "=A" (val));
+	return val;
+}
+
+static fastcall void native_load_tr_desc(void)
+{
+	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void native_set_ldt(const void *addr, unsigned int entries)
+{
+	if (likely(entries == 0))
+		__asm__ __volatile__("lldt %w0"::"q" (0));
+	else {
+		unsigned cpu = smp_processor_id();
+		__u32 a, b;
+
+		pack_descriptor(&a, &b, (unsigned long)addr,
+				entries * sizeof(struct desc_struct) - 1,
+				DESCTYPE_LDT, 0);
+		write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
+		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+	} 
+}
+
+static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+	asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
+{
+	asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+{
+	asm ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+{
+	asm ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long native_store_tr(void)
+{
+	unsigned long tr;
+	asm ("str %0":"=r" (tr));
+	return tr;
+}
+
+static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+	C(0); C(1); C(2);
+#undef C
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+	u32 *lp = (u32 *)((char *)dt + entry*8);
+	lp[0] = entry_low;
+	lp[1] = entry_high;
+}
+
+static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_load_esp0(struct tss_struct *tss,
+				      struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+}
+
+static fastcall void native_io_delay(void)
+{
+	asm volatile("outb %al,$0x80");
+}
+
+/* These are in entry.S */
+extern fastcall void native_iret(void);
+extern fastcall void native_irq_enable_sysexit(void);
+
+static int __init print_banner(void)
+{
+	paravirt_ops.banner();
+	return 0;
+}
+core_initcall(print_banner);
+ 
+struct paravirt_ops paravirt_ops = {
+	.name = "bare hardware",
+	.paravirt_enabled = 0,
+	.kernel_rpl = 0,
+
+	.banner = default_banner,
+	.arch_setup = native_nop,
+	.memory_setup = machine_specific_memory_setup,
+	.get_wallclock = native_get_wallclock,
+	.set_wallclock = native_set_wallclock,
+	.time_init = time_init_hook,
+	.init_IRQ = native_init_IRQ,
+
+	.cpuid = native_cpuid,
+	.get_debugreg = native_get_debugreg,
+	.set_debugreg = native_set_debugreg,
+	.clts = native_clts,
+	.read_cr0 = native_read_cr0,
+	.write_cr0 = native_write_cr0,
+	.read_cr2 = native_read_cr2,
+	.write_cr2 = native_write_cr2,
+	.read_cr3 = native_read_cr3,
+	.write_cr3 = native_write_cr3,
+	.read_cr4 = native_read_cr4,
+	.read_cr4_safe = native_read_cr4_safe,
+	.write_cr4 = native_write_cr4,
+	.save_fl = native_save_fl,
+	.restore_fl = native_restore_fl,
+	.irq_disable = native_irq_disable,
+	.irq_enable = native_irq_enable,
+	.safe_halt = native_safe_halt,
+	.halt = native_halt,
+	.wbinvd = native_wbinvd,
+	.read_msr = native_read_msr,
+	.write_msr = native_write_msr,
+	.read_tsc = native_read_tsc,
+	.read_pmc = native_read_pmc,
+	.load_tr_desc = native_load_tr_desc,
+	.set_ldt = native_set_ldt,
+	.load_gdt = native_load_gdt,
+	.load_idt = native_load_idt,
+	.store_gdt = native_store_gdt,
+	.store_idt = native_store_idt,
+	.store_tr = native_store_tr,
+	.load_tls = native_load_tls,
+	.write_ldt_entry = native_write_ldt_entry,
+	.write_gdt_entry = native_write_gdt_entry,
+	.write_idt_entry = native_write_idt_entry,
+	.load_esp0 = native_load_esp0,
+
+	.set_iopl_mask = native_set_iopl_mask,
+	.io_delay = native_io_delay,
+	.const_udelay = __const_udelay,
+
+	.irq_enable_sysexit = native_irq_enable_sysexit,
+	.iret = native_iret,
+};
+EXPORT_SYMBOL(paravirt_ops);
--- /dev/null
+++ linux-2.6-pv/include/asm-i386/paravirt.h
@@ -0,0 +1,291 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <linux/linkage.h>
+
+#ifdef CONFIG_PARAVIRT
+#ifndef __ASSEMBLY__
+struct thread_struct;
+struct Xgt_desc_struct;
+struct tss_struct;
+struct paravirt_ops
+{
+	unsigned int kernel_rpl;
+ 	int paravirt_enabled;
+	const char *name;
+
+	void (*arch_setup)(void);
+	char *(*memory_setup)(void);
+	void (*init_IRQ)(void);
+
+	void (*banner)(void);
+
+	unsigned long (*get_wallclock)(void);
+	int (*set_wallclock)(unsigned long);
+	void (*time_init)(void);
+
+	/* All the function pointers here are declared as "fastcall"
+	   so that we get a specific register-based calling
+	   convention.  This makes it easier to implement inline
+	   assembler replacements. */
+
+	void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx);
+
+	unsigned long (fastcall *get_debugreg)(int regno);
+	void (fastcall *set_debugreg)(int regno, unsigned long value);
+
+	void (fastcall *clts)(void);
+
+	unsigned long (fastcall *read_cr0)(void);
+	void (fastcall *write_cr0)(unsigned long);
+
+	unsigned long (fastcall *read_cr2)(void);
+	void (fastcall *write_cr2)(unsigned long);
+
+	unsigned long (fastcall *read_cr3)(void);
+	void (fastcall *write_cr3)(unsigned long);
+
+	unsigned long (fastcall *read_cr4_safe)(void);
+	unsigned long (fastcall *read_cr4)(void);
+	void (fastcall *write_cr4)(unsigned long);
+
+	unsigned long (fastcall *save_fl)(void);
+	void (fastcall *restore_fl)(unsigned long);
+	void (fastcall *irq_disable)(void);
+	void (fastcall *irq_enable)(void);
+	void (fastcall *safe_halt)(void);
+	void (fastcall *halt)(void);
+	void (fastcall *wbinvd)(void);
+
+	/* err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
+	u64 (fastcall *read_msr)(unsigned int msr, int *err);
+	int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+	u64 (fastcall *read_tsc)(void);
+	u64 (fastcall *read_pmc)(void);
+
+	void (fastcall *load_tr_desc)(void);
+	void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
+	void (fastcall *load_idt)(const struct Xgt_desc_struct *);
+	void (fastcall *store_gdt)(struct Xgt_desc_struct *);
+	void (fastcall *store_idt)(struct Xgt_desc_struct *);
+	void (fastcall *set_ldt)(const void *desc, unsigned entries);
+	unsigned long (fastcall *store_tr)(void);
+	void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu);
+	void (fastcall *write_ldt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *write_gdt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *write_idt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *load_esp0)(struct tss_struct *tss,
+				   struct thread_struct *thread);
+
+	void (fastcall *set_iopl_mask)(unsigned mask);
+
+	void (fastcall *io_delay)(void);
+	void (*const_udelay)(unsigned long loops);
+
+	/* These two are jmp to, not actually called. */
+	void (fastcall *irq_enable_sysexit)(void);
+	void (fastcall *iret)(void);
+};
+
+extern struct paravirt_ops paravirt_ops;
+
+#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
+
+static inline void init_IRQ(void)
+{
+	paravirt_ops.init_IRQ();
+}
+
+static inline void load_esp0(struct tss_struct *tss,
+			     struct thread_struct *thread)
+{
+	paravirt_ops.load_esp0(tss, thread);
+}
+
+#define ARCH_SETUP			paravirt_ops.arch_setup();
+static inline char *memory_setup(void)
+{
+	return paravirt_ops.memory_setup();
+}
+
+static inline unsigned long get_wallclock(void)
+{
+	return paravirt_ops.get_wallclock();
+}
+
+static inline int set_wallclock(unsigned long nowtime)
+{
+	return paravirt_ops.set_wallclock(nowtime);
+}
+
+static inline void do_time_init(void)
+{
+	return paravirt_ops.time_init();
+}
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+			   unsigned int *ecx, unsigned int *edx)
+{
+	paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
+#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)
+
+#define clts() paravirt_ops.clts()
+
+#define read_cr0() paravirt_ops.read_cr0()
+#define write_cr0(x) paravirt_ops.write_cr0(x)
+
+#define read_cr2() paravirt_ops.read_cr2()
+#define write_cr2(x) paravirt_ops.write_cr2(x)
+
+#define read_cr3() paravirt_ops.read_cr3()
+#define write_cr3(x) paravirt_ops.write_cr3(x)
+
+#define read_cr4() paravirt_ops.read_cr4()
+#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
+#define write_cr4(x) paravirt_ops.write_cr4(x)
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+	return paravirt_ops.save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+	return paravirt_ops.restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+	paravirt_ops.irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+	paravirt_ops.irq_enable();
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+	unsigned long flags = paravirt_ops.save_fl();
+
+	paravirt_ops.irq_disable();
+
+	return flags;
+}
+
+static inline void raw_safe_halt(void)
+{
+	paravirt_ops.safe_halt();
+}
+
+static inline void halt(void)
+{
+	paravirt_ops.safe_halt();
+}
+#define wbinvd() paravirt_ops.wbinvd()
+
+#define get_kernel_rpl()  (paravirt_ops.kernel_rpl)
+
+#define rdmsr(msr,val1,val2) do {				\
+	int _err;						\
+	u64 _l = paravirt_ops.read_msr(msr,&_err);		\
+	val1 = (u32)_l;						\
+	val2 = _l >> 32;					\
+} while(0)
+
+#define wrmsr(msr,val1,val2) do {				\
+	u64 _l = ((u64)(val2) << 32) | (val1);			\
+	paravirt_ops.write_msr((msr), _l);			\
+} while(0)
+
+#define rdmsrl(msr,val) do {					\
+	int _err;						\
+	val = paravirt_ops.read_msr((msr),&_err);		\
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsr_safe(msr,a,b) ({					\
+	u64 _l = ((u64)(b) << 32) | (a);			\
+	paravirt_ops.write_msr((msr),_l);			\
+})
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({					\
+	int _err;						\
+	u64 _l = paravirt_ops.read_msr(msr,&_err);		\
+	(*a) = (u32)_l;						\
+	(*b) = _l >> 32;					\
+	_err; })
+
+#define rdtsc(low,high) do {					\
+	u64 _l = paravirt_ops.read_tsc();			\
+	low = (u32)_l;						\
+	high = _l >> 32;					\
+} while(0)
+
+#define rdtscl(low) do {					\
+	u64 _l = paravirt_ops.read_tsc();			\
+	low = (int)_l;						\
+} while(0)
+
+#define rdtscll(val) (val = paravirt_ops.read_tsc())
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) do {				\
+	u64 _l = paravirt_ops.read_pmc();			\
+	low = (u32)_l;						\
+	high = _l >> 32;					\
+} while(0)
+
+#define load_TR_desc() (paravirt_ops.load_tr_desc())
+#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr))
+#define load_idt(dtr) (paravirt_ops.load_idt(dtr))
+#define set_ldt(addr, entries) (paravirt_ops.set_ldt((addr), (entries)))
+#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr))
+#define store_idt(dtr) (paravirt_ops.store_idt(dtr))
+#define store_tr(tr) ((tr) = paravirt_ops.store_tr())
+#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu)))
+#define write_ldt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_ldt_entry((dt), (entry), (low), (high)))
+#define write_gdt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_gdt_entry((dt), (entry), (low), (high)))
+#define write_idt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_idt_entry((dt), (entry), (low), (high)))
+#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))
+  
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void) {
+	paravirt_ops.io_delay();
+#ifdef REALLY_SLOW_IO
+	paravirt_ops.io_delay();
+	paravirt_ops.io_delay();
+	paravirt_ops.io_delay();
+#endif
+}
+
+#define CLI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
+#define STI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+#else  /* __ASSEMBLY__ */
+
+#define INTERRUPT_RETURN	jmp *%cs:paravirt_ops+PARAVIRT_iret
+#define DISABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS_SYSEXIT	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
+#define GET_CR0_INTO_EAX	call *paravirt_ops+PARAVIRT_read_cr0
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+#endif	/* __ASM_PARAVIRT_H */
--- /dev/null
+++ linux-2.6-pv/include/asm-i386/time.h
@@ -0,0 +1,41 @@
+#ifndef _ASMi386_TIME_H
+#define _ASMi386_TIME_H
+
+#include <linux/efi.h>
+#include "mach_time.h"
+
+static inline unsigned long native_get_wallclock(void)
+{
+	unsigned long retval;
+
+	if (efi_enabled)
+		retval = efi_get_time();
+	else
+		retval = mach_get_cmos_time();
+
+	return retval;
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+	int retval;
+
+	if (efi_enabled)
+		retval = efi_set_rtc_mmss(nowtime);
+	else
+		retval = mach_set_rtc_mmss(nowtime);
+
+	return retval;
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define get_wallclock() native_get_wallclock()
+#define set_wallclock(x) native_set_wallclock(x)
+#define do_time_init() time_init_hook()
+
+#endif /* CONFIG_PARAVIRT */
+
+#endif

--

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 2/7] Patch inline replacements for common paravirt operations.
  2006-10-29  2:45 [PATCH 0/7] x86 paravirtualization infrastructure Chris Wright
  2006-10-28  7:00 ` [PATCH 1/7] header and stubs for paravirtualizing critical operations Chris Wright
@ 2006-10-28  7:00 ` Chris Wright
  2006-10-28  7:00 ` [PATCH 3/7] More generic paravirtualization entry point Chris Wright
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 54+ messages in thread
From: Chris Wright @ 2006-10-28  7:00 UTC (permalink / raw)
  To: akpm, ak
  Cc: Rusty Russell, Jeremy Fitzhardinge, Zachary Amsden, linux-kernel,
	virtualization, Jeremy Fitzhardinge

[-- Attachment #1: 009-binary-patch.patch --]
[-- Type: text/plain, Size: 19794 bytes --]

It turns out that the most called ops, by several orders of magnitude,
are the interrupt manipulation ops.  These are obvious candidates for
patching, so mark them up and create infrastructure for it.

The method used is that the ops structure has a patch function, which
is called for each place which needs to be patched: this returns the
number of bytes of patched code emitted (the rest are NOP-padded).

Usually we can spare a register (%eax) for the binary patched code to
use, but in a couple of critical places in entry.S we can't: we make
the clobbers explicit at the call site, and manually clobber the
allowed registers in debug mode as an extra check.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Zachary Amsden <zach@vmware.com>

---
 arch/i386/kernel/alternative.c |   66 ++++++++++++--
 arch/i386/kernel/entry.S       |   26 ++---
 arch/i386/kernel/module.c      |    9 +
 arch/i386/kernel/paravirt.c    |   44 +++++++++
 arch/i386/kernel/vmlinux.lds.S |    6 +
 include/asm-i386/alternative.h |    3 
 include/asm-i386/irqflags.h    |    4 
 include/asm-i386/paravirt.h    |  185 +++++++++++++++++++++++++++++++++--------
 include/asm-i386/spinlock.h    |    5 -
 9 files changed, 284 insertions(+), 64 deletions(-)

--- linux-2.6-pv.orig/arch/i386/kernel/alternative.c
+++ linux-2.6-pv/arch/i386/kernel/alternative.c
@@ -123,6 +123,20 @@ static unsigned char** find_nop_table(vo
 
 #endif /* CONFIG_X86_64 */
 
+static void nop_out(void *insns, unsigned int len)
+{
+	unsigned char **noptable = find_nop_table();
+
+	while (len > 0) {
+		unsigned int noplen = len;
+		if (noplen > ASM_NOP_MAX)
+			noplen = ASM_NOP_MAX;
+		memcpy(insns, noptable[noplen], noplen);
+		insns += noplen;
+		len -= noplen;
+	}
+}
+
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
 extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -137,10 +151,9 @@ extern u8 __smp_alt_begin[], __smp_alt_e
 
 void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
-	unsigned char **noptable = find_nop_table();
 	struct alt_instr *a;
 	u8 *instr;
-	int diff, i, k;
+	int diff;
 
 	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
 	for (a = start; a < end; a++) {
@@ -158,13 +171,7 @@ void apply_alternatives(struct alt_instr
 #endif
 		memcpy(instr, a->replacement, a->replacementlen);
 		diff = a->instrlen - a->replacementlen;
-		/* Pad the rest with nops */
-		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
-			k = diff;
-			if (k > ASM_NOP_MAX)
-				k = ASM_NOP_MAX;
-			memcpy(a->instr + i, noptable[k], k);
-		}
+		nop_out(instr + a->replacementlen, diff);
 	}
 }
 
@@ -208,7 +215,6 @@ static void alternatives_smp_lock(u8 **s
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
-	unsigned char **noptable = find_nop_table();
 	u8 **ptr;
 
 	for (ptr = start; ptr < end; ptr++) {
@@ -216,7 +222,7 @@ static void alternatives_smp_unlock(u8 *
 			continue;
 		if (*ptr > text_end)
 			continue;
-		**ptr = noptable[1][0];
+		nop_out(*ptr, 1);
 	};
 }
 
@@ -342,6 +348,43 @@ void alternatives_smp_switch(int smp)
 
 #endif
 
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+	struct paravirt_patch *p;
+	int i;
+
+	for (p = start; p < end; p++) {
+		unsigned int used;
+
+		used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
+					  p->len);
+#ifdef CONFIG_DEBUG_KERNEL
+		/* Deliberately clobber regs using "not %reg" to find bugs. */
+		for (i = 0; i < 3; i++) {
+			if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
+				memcpy(p->instr + used, "\xf7\xd0", 2);
+				p->instr[used+1] |= i;
+				used += 2;
+			}
+		}
+#endif
+		/* Pad the rest with nops */
+		nop_out(p->instr + used, p->len - used);
+	}
+
+	/* Sync to be conservative, in case we patched following instructions */
+	sync_core();
+}
+extern struct paravirt_patch __start_parainstructions[],
+	__stop_parainstructions[];
+#else
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+}
+extern struct paravirt_patch *__start_parainstructions, *__stop_parainstructions;
+#endif	/* CONFIG_PARAVIRT */
+
 void __init alternative_instructions(void)
 {
 	unsigned long flags;
@@ -389,5 +432,6 @@ void __init alternative_instructions(voi
 		alternatives_smp_switch(0);
 	}
 #endif
+ 	apply_paravirt(__start_parainstructions, __stop_parainstructions);
 	local_irq_restore(flags);
 }
--- linux-2.6-pv.orig/arch/i386/kernel/entry.S
+++ linux-2.6-pv/arch/i386/kernel/entry.S
@@ -63,9 +63,9 @@ NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
+#define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
-#define preempt_stop
+#define preempt_stop(clobbers)
 #define resume_kernel		restore_nocheck
 #endif
 
@@ -226,7 +226,7 @@ ENTRY(ret_from_fork)
 	ALIGN
 	RING0_PTREGS_FRAME
 ret_from_exception:
-	preempt_stop
+	preempt_stop(CLBR_ANY)
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 check_userspace:
@@ -237,7 +237,7 @@ check_userspace:
 	jb resume_kernel		# not returning to v8086 or userspace
 
 ENTRY(resume_userspace)
- 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+ 	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -248,7 +248,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -277,7 +277,7 @@ sysenter_past_esp:
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
 	 */
-	ENABLE_INTERRUPTS
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushl $(__USER_DS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ss, 0*/
@@ -322,7 +322,7 @@ sysenter_past_esp:
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
@@ -364,7 +364,7 @@ syscall_call:
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)		# store the return value
 syscall_exit:
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -393,7 +393,7 @@ restore_nocheck_notrace:
 .section .fixup,"ax"
 iret_exc:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushl $0			# no error code
 	pushl $do_iret_error
 	jmp error_code
@@ -436,7 +436,7 @@ ldt_ss:
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_EAX)
 	TRACE_IRQS_OFF
 	lss (%esp), %esp
 	CFI_ADJUST_CFA_OFFSET -8
@@ -451,7 +451,7 @@ work_pending:
 	jz work_notifysig
 work_resched:
 	call schedule
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -507,7 +507,7 @@ syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS		# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS(CLBR_ANY)	# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
 	movl $1, %edx
@@ -691,7 +691,7 @@ ENTRY(device_not_available)
 	GET_CR0_INTO_EAX
 	testl $0x4, %eax		# EM (math emulation bit)
 	jne device_not_available_emulate
-	preempt_stop
+	preempt_stop(CLBR_ANY)
 	call math_state_restore
 	jmp ret_from_exception
 device_not_available_emulate:
--- linux-2.6-pv.orig/arch/i386/kernel/module.c
+++ linux-2.6-pv/arch/i386/kernel/module.c
@@ -109,7 +109,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    const Elf_Shdr *sechdrs,
 		    struct module *me)
 {
-	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+		*para = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 
@@ -119,6 +120,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 			alt = s;
 		if (!strcmp(".smp_locks", secstrings + s->sh_name))
 			locks= s;
+		if (!strcmp(".parainstructions", secstrings + s->sh_name))
+			para = s;
 	}
 
 	if (alt) {
@@ -133,6 +136,10 @@ int module_finalize(const Elf_Ehdr *hdr,
 					    lseg, lseg + locks->sh_size,
 					    tseg, tseg + text->sh_size);
 	}
+	if (para) {
+		void *pseg = (void *)para->sh_addr;
+		apply_paravirt(pseg, pseg + para->sh_size);
+	}
 
 	return module_bug_finalize(hdr, sechdrs, me);
 }
--- linux-2.6-pv.orig/arch/i386/kernel/paravirt.c
+++ linux-2.6-pv/arch/i386/kernel/paravirt.c
@@ -40,6 +40,49 @@ static void __init default_banner(void)
 	       paravirt_ops.name);
 }
 
+/* Simple instruction patching code. */
+#define DEF_NATIVE(name, code)					\
+	extern const char start_##name[], end_##name[];		\
+	asm("start_" #name ": " code "; end_" #name ":")
+DEF_NATIVE(cli, "cli");
+DEF_NATIVE(sti, "sti");
+DEF_NATIVE(popf, "push %eax; popf");
+DEF_NATIVE(pushf, "pushf; pop %eax");
+DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
+DEF_NATIVE(iret, "iret");
+DEF_NATIVE(sti_sysexit, "sti; sysexit");
+
+static const struct native_insns
+{
+	const char *start, *end;
+} native_insns[] = {
+	[PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+	[PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+	[PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+	[PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+	[PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
+};
+
+static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+	unsigned int insn_len;
+
+	/* Don't touch it if we don't have a replacement */
+	if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
+		return len;
+
+	insn_len = native_insns[type].end - native_insns[type].start;
+
+	/* Similarly if we can't fit replacement. */
+	if (len < insn_len)
+		return len;
+
+	memcpy(insns, native_insns[type].start, insn_len);
+	return insn_len;
+}
+
 static fastcall unsigned long native_get_debugreg(int regno)
 {
 	unsigned long val = 0; 	/* Damn you, gcc! */
@@ -355,6 +398,7 @@ struct paravirt_ops paravirt_ops = {
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
 
+ 	.patch = native_patch,
 	.banner = default_banner,
 	.arch_setup = native_nop,
 	.memory_setup = machine_specific_memory_setup,
--- linux-2.6-pv.orig/arch/i386/kernel/vmlinux.lds.S
+++ linux-2.6-pv/arch/i386/kernel/vmlinux.lds.S
@@ -154,6 +154,12 @@ SECTIONS
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 	*(.altinstr_replacement)
   }
+  . = ALIGN(4);
+  __start_parainstructions = .;
+  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+	*(.parainstructions)
+  }
+  __stop_parainstructions = .;
   /* .exit.text is discard at runtime, not link time, to deal with references
      from .altinstructions and .eh_frame */
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
--- linux-2.6-pv.orig/include/asm-i386/alternative.h
+++ linux-2.6-pv/include/asm-i386/alternative.h
@@ -118,4 +118,7 @@ static inline void alternatives_smp_swit
 #define LOCK_PREFIX ""
 #endif
 
+struct paravirt_patch;
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
+
 #endif /* _I386_ALTERNATIVE_H */
--- linux-2.6-pv.orig/include/asm-i386/irqflags.h
+++ linux-2.6-pv/include/asm-i386/irqflags.h
@@ -79,8 +79,8 @@ static inline unsigned long __raw_local_
 }
 
 #else
-#define DISABLE_INTERRUPTS		cli
-#define ENABLE_INTERRUPTS		sti
+#define DISABLE_INTERRUPTS(clobbers)	cli
+#define ENABLE_INTERRUPTS(clobbers)	sti
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
 #define INTERRUPT_RETURN		iret
 #define GET_CR0_INTO_EAX		movl %cr0, %eax
--- linux-2.6-pv.orig/include/asm-i386/paravirt.h
+++ linux-2.6-pv/include/asm-i386/paravirt.h
@@ -3,8 +3,26 @@
 /* Various instructions on x86 need to be replaced for
  * para-virtualization: those hooks are defined here. */
 #include <linux/linkage.h>
+#include <linux/stringify.h>
 
 #ifdef CONFIG_PARAVIRT
+/* These are the most performance critical ops, so we want to be able to patch
+ * callers */
+#define PARAVIRT_IRQ_DISABLE 0
+#define PARAVIRT_IRQ_ENABLE 1
+#define PARAVIRT_RESTORE_FLAGS 2
+#define PARAVIRT_SAVE_FLAGS 3
+#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4
+#define PARAVIRT_INTERRUPT_RETURN 5
+#define PARAVIRT_STI_SYSEXIT 6
+
+/* Bitmask of what can be clobbered: usually at least eax. */
+#define CLBR_NONE 0x0
+#define CLBR_EAX 0x1
+#define CLBR_ECX 0x2
+#define CLBR_EDX 0x4
+#define CLBR_ANY 0x7
+
 #ifndef __ASSEMBLY__
 struct thread_struct;
 struct Xgt_desc_struct;
@@ -15,6 +33,15 @@ struct paravirt_ops
  	int paravirt_enabled;
 	const char *name;
 
+	/*
+	 * Patch may replace one of the defined code sequences with arbitrary
+	 * code, subject to the same register constraints.  This generally
+	 * means the code is not free to clobber any registers other than EAX.
+	 * The patch function should return the number of bytes of code
+	 * generated, as we nop pad the rest in generic code.
+	 */
+	unsigned (*patch)(u8 type, u16 clobber, void *firstinsn, unsigned len);
+
 	void (*arch_setup)(void);
 	char *(*memory_setup)(void);
 	void (*init_IRQ)(void);
@@ -157,35 +184,6 @@ static inline void __cpuid(unsigned int 
 #define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
 #define write_cr4(x) paravirt_ops.write_cr4(x)
 
-static inline unsigned long __raw_local_save_flags(void)
-{
-	return paravirt_ops.save_fl();
-}
-
-static inline void raw_local_irq_restore(unsigned long flags)
-{
-	return paravirt_ops.restore_fl(flags);
-}
-
-static inline void raw_local_irq_disable(void)
-{
-	paravirt_ops.irq_disable();
-}
-
-static inline void raw_local_irq_enable(void)
-{
-	paravirt_ops.irq_enable();
-}
-
-static inline unsigned long __raw_local_irq_save(void)
-{
-	unsigned long flags = paravirt_ops.save_fl();
-
-	paravirt_ops.irq_disable();
-
-	return flags;
-}
-
 static inline void raw_safe_halt(void)
 {
 	paravirt_ops.safe_halt();
@@ -277,15 +275,130 @@ static inline void slow_down_io(void) {
 #endif
 }
 
-#define CLI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
-#define STI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch {
+	u8 *instr; 		/* original instructions */
+	u8 instrtype;		/* type of this instruction */
+	u8 len;			/* length of original instruction */
+	u16 clobbers;		/* what registers you may clobber */
+};
+
+#define paravirt_alt(insn_string, typenum, clobber)	\
+	"771:\n\t" insn_string "\n" "772:\n"		\
+	".pushsection .parainstructions,\"a\"\n"	\
+	"  .long 771b\n"				\
+	"  .byte " __stringify(typenum) "\n"		\
+	"  .byte 772b-771b\n"				\
+	"  .short " __stringify(clobber) "\n"		\
+	".popsection"
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+	unsigned long f;
+
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_SAVE_FLAGS, CLBR_NONE)
+			     : "=a"(f): "m"(paravirt_ops.save_fl)
+			     : "memory", "cc");
+	return f;
+}
+
+static inline void raw_local_irq_restore(unsigned long f)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_RESTORE_FLAGS, CLBR_EAX)
+			     : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f)
+			     : "memory", "cc");
+}
+
+static inline void raw_local_irq_disable(void)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%0;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_IRQ_DISABLE, CLBR_EAX)
+			     : : "m" (paravirt_ops.irq_disable)
+			     : "memory", "eax", "cc");
+}
+
+static inline void raw_local_irq_enable(void)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%0;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_IRQ_ENABLE, CLBR_EAX)
+			     : : "m" (paravirt_ops.irq_enable)
+			     : "memory", "eax", "cc");
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+	unsigned long f;
+
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1; pushl %%eax;"
+					   "call *%2; popl %%eax;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_SAVE_FLAGS_IRQ_DISABLE,
+					  CLBR_NONE)
+			     : "=a"(f)
+			     : "m" (paravirt_ops.save_fl),
+			       "m" (paravirt_ops.irq_disable)
+			     : "memory", "cc");
+	return f;
+}
+
+#define CLI_STRING paravirt_alt("pushl %ecx; pushl %edx;"		\
+		     "call *paravirt_ops+PARAVIRT_irq_disable;"		\
+		     "popl %edx; popl %ecx",				\
+		     PARAVIRT_IRQ_DISABLE, CLBR_EAX)
+
+#define STI_STRING paravirt_alt("pushl %ecx; pushl %edx;"		\
+		     "call *paravirt_ops+PARAVIRT_irq_enable;"		\
+		     "popl %edx; popl %ecx",				\
+		     PARAVIRT_IRQ_ENABLE, CLBR_EAX)
+#define CLI_STI_CLOBBERS , "%eax"
+
 #else  /* __ASSEMBLY__ */
+  
+#define PARA_PATCH(ptype, clobbers, ops)	\
+771:;						\
+	ops;					\
+772:;						\
+	.pushsection .parainstructions,"a";	\
+	 .long 771b;				\
+	 .byte ptype;				\
+	 .byte 772b-771b;			\
+	 .short clobbers;			\
+	.popsection
+
+#define INTERRUPT_RETURN				\
+	PARA_PATCH(PARAVIRT_INTERRUPT_RETURN, CLBR_ANY,	\
+	jmp *%cs:paravirt_ops+PARAVIRT_iret)
+
+#define DISABLE_INTERRUPTS(clobbers)			\
+	PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers,	\
+	pushl %ecx; pushl %edx;				\
+	call *paravirt_ops+PARAVIRT_irq_disable;	\
+	popl %edx; popl %ecx)				\
+
+#define ENABLE_INTERRUPTS(clobbers)			\
+	PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers,	\
+	pushl %ecx; pushl %edx;				\
+	call *%cs:paravirt_ops+PARAVIRT_irq_enable;	\
+	popl %edx; popl %ecx)
+
+#define ENABLE_INTERRUPTS_SYSEXIT			\
+	PARA_PATCH(PARAVIRT_STI_SYSEXIT, CLBR_ANY,	\
+	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit)
+
+#define GET_CR0_INTO_EAX			\
+	call *paravirt_ops+PARAVIRT_read_cr0
 
-#define INTERRUPT_RETURN	jmp *%cs:paravirt_ops+PARAVIRT_iret
-#define DISABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
-#define ENABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
-#define ENABLE_INTERRUPTS_SYSEXIT	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
-#define GET_CR0_INTO_EAX	call *paravirt_ops+PARAVIRT_read_cr0
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_PARAVIRT */
 #endif	/* __ASM_PARAVIRT_H */
--- linux-2.6-pv.orig/include/asm-i386/spinlock.h
+++ linux-2.6-pv/include/asm-i386/spinlock.h
@@ -12,6 +12,7 @@
 #else
 #define CLI_STRING	"cli"
 #define STI_STRING	"sti"
+#define CLI_STI_CLOBBERS
 #endif /* CONFIG_PARAVIRT */
 
 /*
@@ -75,7 +76,9 @@ static inline void __raw_spin_lock_flags
 		"jg 1b\n\t"
 		"jmp 4b\n"
 		"5:\n\t"
-		: "+m" (lock->slock) : "r" (flags) : "memory");
+		: "+m" (lock->slock)
+		: "r" (flags)
+		: "memory" CLI_STI_CLOBBERS);
 }
 #endif
 

--

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 3/7] More generic paravirtualization entry point.
  2006-10-29  2:45 [PATCH 0/7] x86 paravirtualization infrastructure Chris Wright
  2006-10-28  7:00 ` [PATCH 1/7] header and stubs for paravirtualizing critical operations Chris Wright
  2006-10-28  7:00 ` [PATCH 2/7] Patch inline replacements for common paravirt operations Chris Wright
@ 2006-10-28  7:00 ` Chris Wright
  2006-10-29 16:41   ` Andi Kleen
  2006-10-28  7:00 ` [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels Chris Wright
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 54+ messages in thread
From: Chris Wright @ 2006-10-28  7:00 UTC (permalink / raw)
  To: akpm, ak
  Cc: Rusty Russell, Jeremy Fitzhardinge, Zachary Amsden, linux-kernel,
	virtualization

[-- Attachment #1: 011-paravirt-head.S.patch --]
[-- Type: text/plain, Size: 3420 bytes --]

1) Each hypervisor writes a probe function to detect whether we are
   running under that hypervisor.  paravirt_probe() registers this
   function.

2) If vmlinux is booted with ring != 0, we call all the probe
   functions (with all registers except %esp intact) in link order: the
   winner will not return.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Zachary Amsden <zach@vmware.com>

---
 arch/i386/kernel/Makefile      |    2 ++
 arch/i386/kernel/head.S        |   33 +++++++++++++++++++++++++++++++++
 arch/i386/kernel/paravirt.c    |    6 +++++-
 arch/i386/kernel/vmlinux.lds.S |    6 ++++++
 include/asm-i386/paravirt.h    |    5 +++++
 5 files changed, 51 insertions(+), 1 deletion(-)

--- linux-2.6-pv.orig/arch/i386/kernel/Makefile
+++ linux-2.6-pv/arch/i386/kernel/Makefile
@@ -39,6 +39,8 @@ obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+
+# Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 
 EXTRA_AFLAGS   := -traditional
--- linux-2.6-pv.orig/arch/i386/kernel/head.S
+++ linux-2.6-pv/arch/i386/kernel/head.S
@@ -55,6 +55,12 @@
  */
 ENTRY(startup_32)
 
+#ifdef CONFIG_PARAVIRT
+        movl %cs, %eax
+        testl $0x3, %eax
+        jnz startup_paravirt
+#endif
+
 /*
  * Set segments to known values.
  */
@@ -486,6 +492,33 @@ ignore_int:
 #endif
 	iret
 
+#ifdef CONFIG_PARAVIRT
+startup_paravirt:
+	cld
+ 	movl $(init_thread_union+THREAD_SIZE),%esp
+
+	/* We take pains to preserve all the regs. */
+	pushl	%edx
+	pushl	%ecx
+	pushl	%eax
+
+	/* paravirt.o is last in link, and that probe fn never returns */
+	pushl	$__start_paravirtprobe
+1:
+	movl	0(%esp), %eax
+	pushl	(%eax)
+	movl	8(%esp), %eax
+	call	*(%esp)
+	popl	%eax
+
+	movl	4(%esp), %eax
+	movl	8(%esp), %ecx
+	movl	12(%esp), %edx
+
+	addl	$4, (%esp)
+	jmp	1b
+#endif
+
 /*
  * Real beginning of normal "text" segment
  */
--- linux-2.6-pv.orig/arch/i386/kernel/paravirt.c
+++ linux-2.6-pv/arch/i386/kernel/paravirt.c
@@ -392,7 +392,11 @@ static int __init print_banner(void)
 	return 0;
 }
 core_initcall(print_banner);
- 
+
+/* We simply declare start_kernel to be the paravirt probe of last resort. */
+asmlinkage void __init start_kernel(void);
+paravirt_probe(start_kernel);
+  
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
--- linux-2.6-pv.orig/arch/i386/kernel/vmlinux.lds.S
+++ linux-2.6-pv/arch/i386/kernel/vmlinux.lds.S
@@ -60,6 +60,12 @@ SECTIONS
 	CONSTRUCTORS
 	} :data
 
+  __start_paravirtprobe = .;
+  .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
+	*(.paravirtprobe)
+  }
+  __stop_paravirtprobe = .;
+
   . = ALIGN(4096);
   __nosave_begin = .;
   .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
--- linux-2.6-pv.orig/include/asm-i386/paravirt.h
+++ linux-2.6-pv/include/asm-i386/paravirt.h
@@ -120,6 +120,11 @@ struct paravirt_ops
 	void (fastcall *iret)(void);
 };
 
+/* Mark a paravirt probe function. */
+#define paravirt_probe(fn)						\
+	static void (*__paravirtprobe_##fn)(void) __attribute_used__	\
+		__attribute__((__section__(".paravirtprobe"))) = fn
+
 extern struct paravirt_ops paravirt_ops;
 
 #define paravirt_enabled() (paravirt_ops.paravirt_enabled)

--

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels
  2006-10-29  2:45 [PATCH 0/7] x86 paravirtualization infrastructure Chris Wright
                   ` (2 preceding siblings ...)
  2006-10-28  7:00 ` [PATCH 3/7] More generic paravirtualization entry point Chris Wright
@ 2006-10-28  7:00 ` Chris Wright
  2006-11-01 12:17   ` Pavel Machek
  2006-10-28  7:00 ` [PATCH 5/7] Allow disabling legacy power management modes with " Chris Wright
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 54+ messages in thread
From: Chris Wright @ 2006-10-28  7:00 UTC (permalink / raw)
  To: akpm, ak
  Cc: Rusty Russell, Jeremy Fitzhardinge, Zachary Amsden, linux-kernel,
	virtualization

[-- Attachment #1: 015-disable-bug-checking.patch --]
[-- Type: text/plain, Size: 1440 bytes --]

Allow selected bug checks to be skipped by paravirt kernels.  The two most
important are the F00F workaround (which is either done by the hypervisor,
or not required), and the 'hlt' instruction check, which can break under
some hypervisors.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

---
 arch/i386/kernel/cpu/intel.c |    2 +-
 include/asm-i386/bugs.h      |    4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

--- linux-2.6-pv.orig/arch/i386/kernel/cpu/intel.c
+++ linux-2.6-pv/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct 
 	 * Note that the workaround only should be initialized once...
 	 */
 	c->f00f_bug = 0;
-	if ( c->x86 == 5 ) {
+	if (!paravirt_enabled() && c->x86 == 5) {
 		static int f00f_workaround_enabled = 0;
 
 		c->f00f_bug = 1;
--- linux-2.6-pv.orig/include/asm-i386/bugs.h
+++ linux-2.6-pv/include/asm-i386/bugs.h
@@ -21,6 +21,7 @@
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
+#include <asm/paravirt.h>
 
 static int __init no_halt(char *s)
 {
@@ -91,6 +92,9 @@ static void __init check_fpu(void)
 
 static void __init check_hlt(void)
 {
+	if (paravirt_enabled())
+		return;
+
 	printk(KERN_INFO "Checking 'hlt' instruction... ");
 	if (!boot_cpu_data.hlt_works_ok) {
 		printk("disabled\n");

--

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 5/7] Allow disabling legacy power management modes with paravirt kernels
  2006-10-29  2:45 [PATCH 0/7] x86 paravirtualization infrastructure Chris Wright
                   ` (3 preceding siblings ...)
  2006-10-28  7:00 ` [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels Chris Wright
@ 2006-10-28  7:00 ` Chris Wright
  2006-10-28  7:00 ` [PATCH 6/7] Add APIC accessors to paravirt-ops Chris Wright
  2006-10-28  7:00 ` [PATCH 7/7] Add mmu virtualization to paravirt-ops Chris Wright
  6 siblings, 0 replies; 54+ messages in thread
From: Chris Wright @ 2006-10-28  7:00 UTC (permalink / raw)
  To: akpm, ak
  Cc: Rusty Russell, Jeremy Fitzhardinge, Zachary Amsden, linux-kernel,
	virtualization

[-- Attachment #1: 016-power-management-bypass.patch --]
[-- Type: text/plain, Size: 1552 bytes --]

Two legacy power management modes are much easier to just explicitly disable
when running in paravirtualized mode - neither APM nor PnP is still relevant.
The status of ACPI is still debatable, and noacpi is still a common enough
boot parameter that it is not necessary to explicitly disable ACPI.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

---
 arch/i386/kernel/apm.c     |    3 ++-
 drivers/pnp/pnpbios/core.c |    3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

--- linux-2.6-pv.orig/arch/i386/kernel/apm.c
+++ linux-2.6-pv/arch/i386/kernel/apm.c
@@ -231,6 +231,7 @@
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/i8253.h>
+#include <asm/paravirt.h>
 
 #include "io_ports.h"
 
@@ -2191,7 +2192,7 @@ static int __init apm_init(void)
 
 	dmi_check_system(apm_dmi_table);
 
-	if (apm_info.bios.version == 0) {
+	if (apm_info.bios.version == 0 || paravirt_enabled()) {
 		printk(KERN_INFO "apm: BIOS not found.\n");
 		return -ENODEV;
 	}
--- linux-2.6-pv.orig/drivers/pnp/pnpbios/core.c
+++ linux-2.6-pv/drivers/pnp/pnpbios/core.c
@@ -530,7 +530,8 @@ static int __init pnpbios_init(void)
 	if (check_legacy_ioport(PNPBIOS_BASE))
 		return -ENODEV;
 #endif
-	if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table)) {
+	if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) ||
+	    paravirt_enabled()) {
 		printk(KERN_INFO "PnPBIOS: Disabled\n");
 		return -ENODEV;
 	}

--

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-29  2:45 [PATCH 0/7] x86 paravirtualization infrastructure Chris Wright
                   ` (4 preceding siblings ...)
  2006-10-28  7:00 ` [PATCH 5/7] Allow disabling legacy power management modes with " Chris Wright
@ 2006-10-28  7:00 ` Chris Wright
  2006-10-29 16:31   ` Andi Kleen
  2006-10-28  7:00 ` [PATCH 7/7] Add mmu virtualization to paravirt-ops Chris Wright
  6 siblings, 1 reply; 54+ messages in thread
From: Chris Wright @ 2006-10-28  7:00 UTC (permalink / raw)
  To: akpm, ak
  Cc: Rusty Russell, Jeremy Fitzhardinge, Zachary Amsden, linux-kernel,
	virtualization

[-- Attachment #1: 01A-apicops.patch --]
[-- Type: text/plain, Size: 3811 bytes --]

Add APIC accessors to paravirt-ops.  Unfortunately, we need two write
functions, as some older broken hardware requires workarounds for
Pentium APIC errata - this is the purpose of apic_write_atomic.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

---
 arch/i386/kernel/paravirt.c |   28 ++++++++++++++++++++++++++++
 include/asm-i386/apic.h     |    5 ++++-
 include/asm-i386/paravirt.h |   27 +++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)

--- linux-2.6-pv.orig/arch/i386/kernel/paravirt.c
+++ linux-2.6-pv/arch/i386/kernel/paravirt.c
@@ -28,6 +28,8 @@
 #include <asm/time.h>
 #include <asm/irq.h>
 #include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
 
 /* nop stub */
 static void native_nop(void)
@@ -382,6 +384,26 @@ static fastcall void native_io_delay(voi
 	asm volatile("outb %al,$0x80");
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Basic functions for reading and writing APIC registers
+ */
+static fastcall void native_apic_write(unsigned long reg, unsigned long v)
+{
+	*((volatile unsigned long *)(APIC_BASE+reg)) = v;
+}
+
+static fastcall void native_apic_write_atomic(unsigned long reg, unsigned long v)
+{
+	xchg((volatile unsigned long *)(APIC_BASE+reg), v);
+}
+
+static fastcall unsigned long native_apic_read(unsigned long reg)
+{
+	return *((volatile unsigned long *)(APIC_BASE+reg));
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
 /* These are in entry.S */
 extern fastcall void native_iret(void);
 extern fastcall void native_irq_enable_sysexit(void);
@@ -452,6 +474,12 @@ struct paravirt_ops paravirt_ops = {
 	.io_delay = native_io_delay,
 	.const_udelay = __const_udelay,
 
+#ifdef CONFIG_X86_LOCAL_APIC
+	.apic_write = native_apic_write,
+	.apic_write_atomic = native_apic_write_atomic,
+	.apic_read = native_apic_read,
+#endif
+
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 };
--- linux-2.6-pv.orig/include/asm-i386/apic.h
+++ linux-2.6-pv/include/asm-i386/apic.h
@@ -37,7 +37,9 @@ extern void generic_apic_probe(void);
 /*
  * Basic functions accessing APICs.
  */
-
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 static __inline void apic_write(unsigned long reg, unsigned long v)
 {
 	*((volatile unsigned long *)(APIC_BASE+reg)) = v;
@@ -52,6 +54,7 @@ static __inline unsigned long apic_read(
 {
 	return *((volatile unsigned long *)(APIC_BASE+reg));
 }
+#endif
 
 static __inline__ void apic_wait_icr_idle(void)
 {
--- linux-2.6-pv.orig/include/asm-i386/paravirt.h
+++ linux-2.6-pv/include/asm-i386/paravirt.h
@@ -115,6 +115,12 @@ struct paravirt_ops
 	void (fastcall *io_delay)(void);
 	void (*const_udelay)(unsigned long loops);
 
+#ifdef CONFIG_X86_LOCAL_APIC
+	void (fastcall *apic_write)(unsigned long reg, unsigned long v);
+	void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
+	unsigned long (fastcall *apic_read)(unsigned long reg);
+#endif
+
 	/* These two are jmp to, not actually called. */
 	void (fastcall *irq_enable_sysexit)(void);
 	void (fastcall *iret)(void);
@@ -280,6 +286,27 @@ static inline void slow_down_io(void) {
 #endif
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Basic functions accessing APICs.
+ */
+static __inline void apic_write(unsigned long reg, unsigned long v)
+{
+	paravirt_ops.apic_write(reg,v);
+}
+
+static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+{
+	paravirt_ops.apic_write_atomic(reg,v);
+}
+
+static __inline unsigned long apic_read(unsigned long reg)
+{
+	return paravirt_ops.apic_read(reg);
+}
+#endif
+
+
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch {
 	u8 *instr; 		/* original instructions */

--

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 7/7] Add mmu virtualization to paravirt-ops.
  2006-10-29  2:45 [PATCH 0/7] x86 paravirtualization infrastructure Chris Wright
                   ` (5 preceding siblings ...)
  2006-10-28  7:00 ` [PATCH 6/7] Add APIC accessors to paravirt-ops Chris Wright
@ 2006-10-28  7:00 ` Chris Wright
  6 siblings, 0 replies; 54+ messages in thread
From: Chris Wright @ 2006-10-28  7:00 UTC (permalink / raw)
  To: akpm, ak
  Cc: Rusty Russell, Jeremy Fitzhardinge, Zachary Amsden, linux-kernel,
	virtualization

[-- Attachment #1: paravirt-mmu.patch --]
[-- Type: text/plain, Size: 12675 bytes --]

Add the three bare TLB accessor functions to paravirt-ops.  Most amusingly,
flush_tlb is redefined on SMP, so I can't call the paravirt op flush_tlb.
Instead, I chose to indicate the actual flush type, kernel (global) vs. user
(non-global).  Global in this sense means using the global bit in the page
table entry, which makes TLB entries persistent across CR3 reloads, not
global as in the SMP sense of invoking remote shootdowns, so the term is
confusingly overloaded.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

---
 arch/i386/kernel/paravirt.c       |  109 ++++++++++++++++++++++++++++++++++++++
 arch/i386/mm/boot_ioremap.c       |    1 
 include/asm-i386/paravirt.h       |   75 ++++++++++++++++++++++++++
 include/asm-i386/pgtable-2level.h |    5 +
 include/asm-i386/pgtable-3level.h |   42 +++++++-------
 include/asm-i386/pgtable.h        |    4 +
 include/asm-i386/tlbflush.h       |   18 ++++--
 7 files changed, 227 insertions(+), 27 deletions(-)

--- linux-2.6-pv.orig/arch/i386/kernel/paravirt.c
+++ linux-2.6-pv/arch/i386/kernel/paravirt.c
@@ -30,6 +30,7 @@
 #include <asm/delay.h>
 #include <asm/fixmap.h>
 #include <asm/apic.h>
+#include <asm/tlbflush.h>
 
 /* nop stub */
 static void native_nop(void)
@@ -404,6 +405,97 @@ static fastcall unsigned long native_api
 }
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+static fastcall void native_flush_tlb(void)
+{
+	__native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+static fastcall void native_flush_tlb_global(void)
+{
+	__native_flush_tlb_global();
+}
+
+static fastcall void native_flush_tlb_single(u32 addr)
+{
+	__native_flush_tlb_single(addr);
+}
+
+#ifndef CONFIG_X86_PAE
+static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	*pmdp = pmdval;
+}
+
+#else /* CONFIG_X86_PAE */
+
+static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	set_64bit((unsigned long long *)ptep,pte_val(pteval));
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
+}
+
+static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
+{
+	*pudp = pudval;
+}
+
+static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = 0;
+}
+
+static fastcall void native_pmd_clear(pmd_t *pmd)
+{
+	u32 *tmp = (u32 *)pmd;
+	*tmp = 0;
+	smp_wmb();
+	*(tmp + 1) = 0;
+}
+#endif /* CONFIG_X86_PAE */
+
 /* These are in entry.S */
 extern fastcall void native_iret(void);
 extern fastcall void native_irq_enable_sysexit(void);
@@ -480,6 +572,23 @@ struct paravirt_ops paravirt_ops = {
 	.apic_read = native_apic_read,
 #endif
 
+	.flush_tlb_user = native_flush_tlb,
+	.flush_tlb_kernel = native_flush_tlb_global,
+	.flush_tlb_single = native_flush_tlb_single,
+
+	.set_pte = native_set_pte,
+	.set_pte_at = native_set_pte_at,
+	.set_pmd = native_set_pmd,
+	.pte_update = (void *)native_nop,
+	.pte_update_defer = (void *)native_nop,
+#ifdef CONFIG_X86_PAE
+	.set_pte_atomic = native_set_pte_atomic,
+	.set_pte_present = native_set_pte_present,
+	.set_pud = native_set_pud,
+	.pte_clear = native_pte_clear,
+	.pmd_clear = native_pmd_clear,
+#endif
+
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 };
--- linux-2.6-pv.orig/arch/i386/mm/boot_ioremap.c
+++ linux-2.6-pv/arch/i386/mm/boot_ioremap.c
@@ -16,6 +16,7 @@
  */
 
 #undef CONFIG_X86_PAE
+#undef CONFIG_PARAVIRT
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
--- linux-2.6-pv.orig/include/asm-i386/paravirt.h
+++ linux-2.6-pv/include/asm-i386/paravirt.h
@@ -4,6 +4,7 @@
  * para-virtualization: those hooks are defined here. */
 #include <linux/linkage.h>
 #include <linux/stringify.h>
+#include <asm/page.h>
 
 #ifdef CONFIG_PARAVIRT
 /* These are the most performance critical ops, so we want to be able to patch
@@ -27,6 +28,7 @@
 struct thread_struct;
 struct Xgt_desc_struct;
 struct tss_struct;
+struct mm_struct;
 struct paravirt_ops
 {
 	unsigned int kernel_rpl;
@@ -121,6 +123,23 @@ struct paravirt_ops
 	unsigned long (fastcall *apic_read)(unsigned long reg);
 #endif
 
+	void (fastcall *flush_tlb_user)(void);
+	void (fastcall *flush_tlb_kernel)(void);
+	void (fastcall *flush_tlb_single)(u32 addr);
+
+	void (fastcall *set_pte)(pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+	void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep);
+	void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep);
+#ifdef CONFIG_X86_PAE
+	void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte);
+	void (fastcall *set_pud)(pud_t *pudp, pud_t pudval);
+	void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+	void (fastcall *pmd_clear)(pmd_t *pmdp);
+#endif
+
 	/* These two are jmp to, not actually called. */
 	void (fastcall *irq_enable_sysexit)(void);
 	void (fastcall *iret)(void);
@@ -307,6 +326,62 @@ static __inline unsigned long apic_read(
 #endif
 
 
+#define __flush_tlb() paravirt_ops.flush_tlb_user()
+#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
+#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
+
+static inline void set_pte(pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte(ptep, pteval);
+}
+
+static inline void set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte_at(mm, addr, ptep, pteval);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	paravirt_ops.set_pmd(pmdp, pmdval);
+}
+
+static inline void pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	paravirt_ops.pte_update(mm, addr, ptep);
+}
+
+static inline void pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	paravirt_ops.pte_update_defer(mm, addr, ptep);
+}
+
+#ifdef CONFIG_X86_PAE
+static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte_atomic(ptep, pteval);
+}
+
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	paravirt_ops.set_pte_present(mm, addr, ptep, pte);
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pudval)
+{
+	paravirt_ops.set_pud(pudp, pudval);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	paravirt_ops.pte_clear(mm, addr, ptep);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	paravirt_ops.pmd_clear(pmdp);
+}
+#endif
+
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch {
 	u8 *instr; 		/* original instructions */
--- linux-2.6-pv.orig/include/asm-i386/pgtable-2level.h
+++ linux-2.6-pv/include/asm-i386/pgtable-2level.h
@@ -11,11 +11,14 @@
  * within a page table are directly modified.  Thus, the following
  * hook is made available.
  */
+#ifndef CONFIG_PARAVIRT
 #define set_pte(pteptr, pteval) (*(pteptr) = pteval)
 #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
+#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
+#endif
+
 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
 #define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval)
-#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
 
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
--- linux-2.6-pv.orig/include/asm-i386/pgtable-3level.h
+++ linux-2.6-pv/include/asm-i386/pgtable-3level.h
@@ -42,6 +42,7 @@ static inline int pte_exec_kernel(pte_t 
 	return pte_x(pte);
 }
 
+#ifndef CONFIG_PARAVIRT
 /* Rules for using set_pte: the pte being assigned *must* be
  * either not present or in a state where the hardware will
  * not attempt to update the pte.  In places where this is
@@ -71,12 +72,33 @@ static inline void set_pte_present(struc
 	ptep->pte_low = pte.pte_low;
 }
 
+/*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = 0;
+}
+
+static inline void pmd_clear(pmd_t *pmd)
+{
+	u32 *tmp = (u32 *)pmd;
+	*tmp = 0;
+	smp_wmb();
+	*(tmp + 1) = 0;
+}
+
 #define set_pte_atomic(pteptr,pteval) \
 		set_64bit((unsigned long long *)(pteptr),pte_val(pteval))
 #define set_pmd(pmdptr,pmdval) \
 		set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
 #define set_pud(pudptr,pudval) \
 		(*(pudptr) = (pudval))
+#endif
 
 /*
  * Pentium-II erratum A13: in PAE mode we explicitly have to flush
@@ -97,26 +119,6 @@ static inline void pud_clear (pud_t * pu
 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
 			pmd_index(address))
 
-/*
- * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
- * entry, so clear the bottom half first and enforce ordering with a compiler
- * barrier.
- */
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	ptep->pte_low = 0;
-	smp_wmb();
-	ptep->pte_high = 0;
-}
-
-static inline void pmd_clear(pmd_t *pmd)
-{
-	u32 *tmp = (u32 *)pmd;
-	*tmp = 0;
-	smp_wmb();
-	*(tmp + 1) = 0;
-}
-
 static inline pte_t raw_ptep_get_and_clear(pte_t *ptep)
 {
 	pte_t res;
--- linux-2.6-pv.orig/include/asm-i386/pgtable.h
+++ linux-2.6-pv/include/asm-i386/pgtable.h
@@ -15,6 +15,7 @@
 #include <asm/processor.h>
 #include <asm/fixmap.h>
 #include <linux/threads.h>
+#include <asm/paravirt.h>
 
 #ifndef _I386_BITOPS_H
 #include <asm/bitops.h>
@@ -246,6 +247,7 @@ static inline pte_t pte_mkhuge(pte_t pte
 # include <asm/pgtable-2level.h>
 #endif
 
+#ifndef CONFIG_PARAVIRT
 /*
  * Rules for using pte_update - it must be called after any PTE update which
  * has not been done using the set_pte / clear_pte interfaces.  It is used by
@@ -261,7 +263,7 @@ static inline pte_t pte_mkhuge(pte_t pte
  */
 #define pte_update(mm, addr, ptep)		do { } while (0)
 #define pte_update_defer(mm, addr, ptep)	do { } while (0)
-
+#endif
 
 /*
  * We only update the dirty/accessed state if we set
--- linux-2.6-pv.orig/include/asm-i386/tlbflush.h
+++ linux-2.6-pv/include/asm-i386/tlbflush.h
@@ -4,7 +4,15 @@
 #include <linux/mm.h>
 #include <asm/processor.h>
 
-#define __flush_tlb()							\
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#endif
+
+#define __native_flush_tlb()						\
 	do {								\
 		unsigned int tmpreg;					\
 									\
@@ -19,7 +27,7 @@
  * Global pages have to be flushed a bit differently. Not a real
  * performance problem because this does not happen often.
  */
-#define __flush_tlb_global()						\
+#define __native_flush_tlb_global()					\
 	do {								\
 		unsigned int tmpreg, cr4, cr4_orig;			\
 									\
@@ -36,6 +44,9 @@
 			: "memory");					\
 	} while (0)
 
+#define __native_flush_tlb_single(addr) 				\
+	__asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
+
 # define __flush_tlb_all()						\
 	do {								\
 		if (cpu_has_pge)					\
@@ -46,9 +57,6 @@
 
 #define cpu_has_invlpg	(boot_cpu_data.x86 > 3)
 
-#define __flush_tlb_single(addr) \
-	__asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
-
 #ifdef CONFIG_X86_INVLPG
 # define __flush_tlb_one(addr) __flush_tlb_single(addr)
 #else

--

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 0/7] x86 paravirtualization infrastructure
@ 2006-10-29  2:45 Chris Wright
  2006-10-28  7:00 ` [PATCH 1/7] header and stubs for paravirtualizing critical operations Chris Wright
                   ` (6 more replies)
  0 siblings, 7 replies; 54+ messages in thread
From: Chris Wright @ 2006-10-29  2:45 UTC (permalink / raw)
  To: akpm, ak; +Cc: virtualization, linux-kernel

The following patches introduce the core infrastructure needed to
paravirtualize the 32-bit x86 Linux kernel.  This is done by moving
virtualization sensitive insn's or code paths to a function table,
paravirt_ops.  This structure can be populated with hypervisor specific
calls or native stubs and currently support running on bare metal, VMI,
Xen, or Lhype.  These patches apply to 2.6.19-rc2-mm2 plus the last set
of paravirt prep patches that Rusty sent.

thanks,
-chris
--

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-28  7:00 ` [PATCH 6/7] Add APIC accessors to paravirt-ops Chris Wright
@ 2006-10-29 16:31   ` Andi Kleen
  2006-10-30  3:28     ` Rusty Russell
  0 siblings, 1 reply; 54+ messages in thread
From: Andi Kleen @ 2006-10-29 16:31 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, akpm, linux-kernel, ak


>
>  /* nop stub */
>  static void native_nop(void)
> @@ -382,6 +384,26 @@ static fastcall void native_io_delay(voi
>  	asm volatile("outb %al,$0x80");
>  }
>
> +#ifdef CONFIG_X86_LOCAL_APIC

It would be nicer if you renamed the functions in apic.h to native_apic_*
and then do

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
#define apic_read native_apic_read
...
#endif

This way we wouldn't get that much duplication.

This might apply to at least some of the other paravirt ops too.

-Andi

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] header and stubs for paravirtualizing critical operations
  2006-10-28  7:00 ` [PATCH 1/7] header and stubs for paravirtualizing critical operations Chris Wright
@ 2006-10-29 16:40   ` Andi Kleen
  0 siblings, 0 replies; 54+ messages in thread
From: Andi Kleen @ 2006-10-29 16:40 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, akpm, ak, linux-kernel


Can you add a high level comment to entry.S what paravirt mode
is all about and perhaps a quick cheat sheet on the macros?

> +/* SMP boot always wants to use real time delay to allow sufficient time
> for + * the APs to come online */
> +#define USE_REAL_TIME_DELAY

That's ugly. Can't you call different wait functions for that case instead?

> +#ifdef CONFIG_PARAVIRT
> +#include <asm/paravirt.h>
> +#else
> +static inline void init_IRQ(void)
> +{
> +	native_init_IRQ();
> +}
> +#endif /* CONFIG_PARAVIRT */

You could probably avoid a lot of ifdefs by strategic use of 
__attribute__((weak))

> +#ifdef CONFIG_PARAVIRT
> +#include <asm/paravirt.h>
> +#else

This is probably a good candidate for rename to native + wrapper
macros too. Otherwise we'll always have to hack two different
places later.

-Andi

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/7] More generic paravirtualization entry point.
  2006-10-28  7:00 ` [PATCH 3/7] More generic paravirtualization entry point Chris Wright
@ 2006-10-29 16:41   ` Andi Kleen
  0 siblings, 0 replies; 54+ messages in thread
From: Andi Kleen @ 2006-10-29 16:41 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, akpm, ak, linux-kernel

On Saturday 28 October 2006 00:00, Chris Wright wrote:

> +
> +/* We simply declare start_kernel to be the paravirt probe of last resort.
> */ +asmlinkage void __init start_kernel(void);

Didn't rusty put that into a header recently?

-Andi

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-29 16:31   ` Andi Kleen
@ 2006-10-30  3:28     ` Rusty Russell
  2006-10-30 23:11       ` Andi Kleen
  0 siblings, 1 reply; 54+ messages in thread
From: Rusty Russell @ 2006-10-30  3:28 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chris Wright, virtualization, linux-kernel, akpm, ak

On Sun, 2006-10-29 at 08:31 -0800, Andi Kleen wrote:
> It would be nicer if you renamed the functions in apic.h to native_apic_*
> and then do

...

> This might apply to at least some of the other paravirt ops too.

Yes.  I've done the obvious candidates below (as well as responding to
some of your other points).  Many ops are one-liners, and I don't want
to cause too much additional churn.

Cheers!
Rusty.

Subject: Paravirtualization Kleenups

1) Add "cheatsheet" comments to entry.S about macros.
2) Use weak alias for init_IRQ -> native_init_IRQ in !CONFIG_PARAVIRT case.
   This removes an #ifdef.
3) Use shiny new start_kernel.h rather than another declaration.
4) Avoid duplication in paravirt.c: rename set_ldt to native_set_ldt,
   and use macro in !PARAVIRT case.
5) Same trick for apic ops.

There are other cases where we could use a renaming+macro similar
trick to avoid duplication, but they're generally one-liners.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r ea3bae5ebb37 arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S	Mon Oct 30 11:37:19 2006 +1100
+++ b/arch/i386/kernel/entry.S	Mon Oct 30 11:48:34 2006 +1100
@@ -52,6 +52,19 @@
 #include <asm/percpu.h>
 #include <asm/dwarf2.h>
 #include "irq_vectors.h"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization.  The following will never clobber any registers:
+ *   INTERRUPT_RETURN (aka. "iret")
+ *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
 
 #define nr_syscalls ((syscall_table_size)/4)
 
diff -r ea3bae5ebb37 arch/i386/kernel/i8259.c
--- a/arch/i386/kernel/i8259.c	Mon Oct 30 11:37:19 2006 +1100
+++ b/arch/i386/kernel/i8259.c	Mon Oct 30 11:57:55 2006 +1100
@@ -392,6 +392,9 @@ void __init init_ISA_irqs (void)
 	}
 }
 
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
 void __init native_init_IRQ(void)
 {
 	int i;
diff -r ea3bae5ebb37 arch/i386/kernel/paravirt.c
--- a/arch/i386/kernel/paravirt.c	Mon Oct 30 11:37:19 2006 +1100
+++ b/arch/i386/kernel/paravirt.c	Mon Oct 30 12:31:48 2006 +1100
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/efi.h>
 #include <linux/bcd.h>
+#include <linux/start_kernel.h>
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
@@ -135,6 +136,11 @@ static fastcall void native_set_debugreg
 	}
 }
 
+void init_IRQ(void)
+{
+	paravirt_ops.init_IRQ();
+}
+
 static fastcall void native_clts(void)
 {
 	asm volatile ("clts");
@@ -296,22 +302,6 @@ static fastcall void native_load_tr_desc
 	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
 }
 
-static fastcall void native_set_ldt(const void *addr, unsigned int entries)
-{
-	if (likely(entries == 0))
-		__asm__ __volatile__("lldt %w0"::"q" (0));
-	else {
-		unsigned cpu = smp_processor_id();
-		__u32 a, b;
-
-		pack_descriptor(&a, &b, (unsigned long)addr,
-				entries * sizeof(struct desc_struct) - 1,
-				DESCTYPE_LDT, 0);
-		write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
-		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
-	} 
-}
-
 static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
 {
 	asm volatile("lgdt %0"::"m" (*dtr));
@@ -385,26 +375,6 @@ static fastcall void native_io_delay(voi
 	asm volatile("outb %al,$0x80");
 }
 
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Basic functions for reading and writing APIC registers
- */
-static fastcall void native_apic_write(unsigned long reg, unsigned long v)
-{
-	*((volatile unsigned long *)(APIC_BASE+reg)) = v;
-}
-
-static fastcall void native_apic_write_atomic(unsigned long reg, unsigned long v)
-{
-	xchg((volatile unsigned long *)(APIC_BASE+reg), v);
-}
-
-static fastcall unsigned long native_apic_read(unsigned long reg)
-{
-	return *((volatile unsigned long *)(APIC_BASE+reg));
-}
-#endif /* CONFIG_X86_LOCAL_APIC */
-
 static fastcall void native_flush_tlb(void)
 {
 	__native_flush_tlb();
@@ -508,7 +478,6 @@ core_initcall(print_banner);
 core_initcall(print_banner);
 
 /* We simply declare start_kernel to be the paravirt probe of last resort. */
-asmlinkage void __init start_kernel(void);
 paravirt_probe(start_kernel);
   
 struct paravirt_ops paravirt_ops = {
diff -r ea3bae5ebb37 include/asm-i386/apic.h
--- a/include/asm-i386/apic.h	Mon Oct 30 11:37:19 2006 +1100
+++ b/include/asm-i386/apic.h	Mon Oct 30 12:41:07 2006 +1100
@@ -40,21 +40,27 @@ extern void generic_apic_probe(void);
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
-static __inline void apic_write(unsigned long reg, unsigned long v)
+#define apic_write native_apic_write
+#define apic_write_atomic native_apic_write_atomic
+#define apic_read native_apic_read
+#endif
+
+static __inline fastcall void native_apic_write(unsigned long reg,
+						unsigned long v)
 {
 	*((volatile unsigned long *)(APIC_BASE+reg)) = v;
 }
 
-static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static __inline fastcall void native_apic_write_atomic(unsigned long reg,
+						       unsigned long v)
 {
 	xchg((volatile unsigned long *)(APIC_BASE+reg), v);
 }
 
-static __inline unsigned long apic_read(unsigned long reg)
+static __inline fastcall unsigned long native_apic_read(unsigned long reg)
 {
 	return *((volatile unsigned long *)(APIC_BASE+reg));
 }
-#endif
 
 static __inline__ void apic_wait_icr_idle(void)
 {
diff -r ea3bae5ebb37 include/asm-i386/desc.h
--- a/include/asm-i386/desc.h	Mon Oct 30 11:37:19 2006 +1100
+++ b/include/asm-i386/desc.h	Mon Oct 30 12:40:20 2006 +1100
@@ -92,7 +92,11 @@ static inline void write_dt_entry(void *
 	lp[1] = entry_high;
 }
 
-static inline void set_ldt(void *addr, unsigned int entries)
+#define set_ldt native_set_ldt
+#endif /* CONFIG_PARAVIRT */
+
+static inline fastcall void native_set_ldt(const void *addr,
+					   unsigned int entries)
 {
 	if (likely(entries == 0))
 		__asm__ __volatile__("lldt %w0"::"q" (0));
@@ -107,7 +111,6 @@ static inline void set_ldt(void *addr, u
 		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
 	}
 }
-#endif /* CONFIG_PARAVIRT */
 
 static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
 {
diff -r ea3bae5ebb37 include/asm-i386/irq.h
--- a/include/asm-i386/irq.h	Mon Oct 30 11:37:19 2006 +1100
+++ b/include/asm-i386/irq.h	Mon Oct 30 12:01:31 2006 +1100
@@ -41,14 +41,7 @@ extern void fixup_irqs(cpumask_t map);
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+void init_IRQ(void);
 void __init native_init_IRQ(void);
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-static inline void init_IRQ(void)
-{
-	native_init_IRQ();
-}
-#endif /* CONFIG_PARAVIRT */
 
 #endif /* _ASM_IRQ_H */
diff -r ea3bae5ebb37 include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h	Mon Oct 30 11:37:19 2006 +1100
+++ b/include/asm-i386/paravirt.h	Mon Oct 30 12:18:58 2006 +1100
@@ -153,11 +153,6 @@ extern struct paravirt_ops paravirt_ops;
 extern struct paravirt_ops paravirt_ops;
 
 #define paravirt_enabled() (paravirt_ops.paravirt_enabled)
-
-static inline void init_IRQ(void)
-{
-	paravirt_ops.init_IRQ();
-}
 
 static inline void load_esp0(struct tss_struct *tss,
 			     struct thread_struct *thread)

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-30  3:28     ` Rusty Russell
@ 2006-10-30 23:11       ` Andi Kleen
  2006-10-30 23:42         ` Chris Wright
                           ` (2 more replies)
  0 siblings, 3 replies; 54+ messages in thread
From: Andi Kleen @ 2006-10-30 23:11 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, virtualization, Chris Wright, akpm, linux-kernel

> Subject: Paravirtualization Kleenups

Thanks. 

Chris, can you please merge those into the original patchkit? 

I could do it myself, but then retransmits from Chris would be difficult
if anything else would need to be changed.

Also fixing that !-Os compile error in the original patches would be good.

-Andi

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-30 23:11       ` Andi Kleen
@ 2006-10-30 23:42         ` Chris Wright
  2006-10-30 23:46           ` Andi Kleen
  2006-11-01 10:25         ` Rusty Russell
  2006-11-01 10:27         ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Rusty Russell
  2 siblings, 1 reply; 54+ messages in thread
From: Chris Wright @ 2006-10-30 23:42 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Rusty Russell, Andi Kleen, virtualization, Chris Wright, akpm,
	linux-kernel

* Andi Kleen (ak@muc.de) wrote:
> Chris, can you please merge those into the original patchkit? 

Sure, I'll fold those in.

> I could do it myself, but then retransmits from Chris would be difficult
> if anything else would need to be changed.
> 
> Also fixing that !-Os compile error in the original patches would be good.

Hmm, builds fine here.  If you have a .config and/or error message I'll
fix it up.

thanks,
-chris

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-30 23:42         ` Chris Wright
@ 2006-10-30 23:46           ` Andi Kleen
  2006-10-30 23:55             ` Chris Wright
  2006-10-31  1:45             ` Rusty Russell
  0 siblings, 2 replies; 54+ messages in thread
From: Andi Kleen @ 2006-10-30 23:46 UTC (permalink / raw)
  To: Chris Wright; +Cc: Rusty Russell, virtualization, akpm, linux-kernel

On Tuesday 31 October 2006 00:42, Chris Wright wrote:

> > I could do it myself, but then retransmits from Chris would be difficult
> > if anything else would need to be changed.
> > 
> > Also fixing that !-Os compile error in the original patches would be good.
> 
> Hmm, builds fine here.  If you have a .config and/or error message I'll
> fix it up.

I haven't tried it myself (my laptop was on battery all the time
and I didn't want to drain it with a full rebuild ;-), there was just a report
that it didn't work. Or maybe that was with an old patch. If it works it's fine.

-Andi

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-30 23:46           ` Andi Kleen
@ 2006-10-30 23:55             ` Chris Wright
  2006-10-31  1:45             ` Rusty Russell
  1 sibling, 0 replies; 54+ messages in thread
From: Chris Wright @ 2006-10-30 23:55 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Chris Wright, Rusty Russell, virtualization, akpm, linux-kernel

* Andi Kleen (ak@suse.de) wrote:
> I haven't tried it myself (my laptop was on battery all the time
> and I didn't want to drain it with a full rebuild ;-), there was just a report
> that it didn't work. Or maybe that was with an old patch. If it works it's fine.

Ah yes, I see the report, (it's against a patch that has been redone),
but I'll double check.

thanks,
-chris

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-30 23:46           ` Andi Kleen
  2006-10-30 23:55             ` Chris Wright
@ 2006-10-31  1:45             ` Rusty Russell
  1 sibling, 0 replies; 54+ messages in thread
From: Rusty Russell @ 2006-10-31  1:45 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chris Wright, virtualization, akpm, linux-kernel

On Tue, 2006-10-31 at 00:46 +0100, Andi Kleen wrote:
> On Tuesday 31 October 2006 00:42, Chris Wright wrote:
> 
> > > I could do it myself, but then retransmits from Chris would be difficult
> > > if anything else would need to be changed.
> > > 
> > > Also fixing that !-Os compile error in the original patches would be good.
> > 
> > Hmm, builds fine here.  If you have a .config and/or error message I'll
> > fix it up.
> 
> I haven't tried it myself (my laptop was on battery all the time
> and I didn't want to drain it with a full rebuild ;-), there was just a report
> that it didn't work. Or maybe that was with an old patch. If it works it's fine.

The -Os thing was a red herring.  It was a broken patch in the original 4
for which I immediately sent a fixup to akpm.  Here it is again
below:

==
Move write_dt_entry back: moving it up breaks compile.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -78,6 +78,17 @@ static inline void load_TLS(struct threa
 #undef C
 }
 
+#define write_ldt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
+#define write_gdt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
+#define write_idt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
+
+static inline void write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+	u32 *lp = (u32 *)((char *)dt + entry*8);
+	lp[0] = entry_low;
+	lp[1] = entry_high;
+}
+
 static inline void set_ldt(void *addr, unsigned int entries)
 {
 	if (likely(entries == 0))
@@ -92,17 +103,6 @@ static inline void set_ldt(void *addr, u
 		write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
 		__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
 	}
-}
-
-#define write_ldt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
-#define write_gdt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
-#define write_idt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
-
-static inline void write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
-{
-	u32 *lp = (u32 *)((char *)dt + entry*8);
-	lp[0] = entry_low;
-	lp[1] = entry_high;
 }
 
 static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.
  2006-10-30 23:11       ` Andi Kleen
  2006-10-30 23:42         ` Chris Wright
@ 2006-11-01 10:25         ` Rusty Russell
  2006-11-01 10:27         ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Rusty Russell
  2 siblings, 0 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 10:25 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, virtualization, Chris Wright, akpm, linux-kernel

On Tue, 2006-10-31 at 00:11 +0100, Andi Kleen wrote:
> > Subject: Paravirtualization Kleenups
> 
> Thanks. 
> 
> Chris, can you please merge those into the original patchkit? 
> 
> I could do it myself, but then retransmits from Chris would be difficult
> if anything else would need to be changed.
> 
> Also fixing that !-Os compile error in the original patches would be good.

That is "prep-for-paravirt-desch-clearer-parameter-names-fix.patch" in
rc4-mm1.

I'll follow with the updated series, although the cleanup patch was
pretty clear by itself...

Rusty.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-10-30 23:11       ` Andi Kleen
  2006-10-30 23:42         ` Chris Wright
  2006-11-01 10:25         ` Rusty Russell
@ 2006-11-01 10:27         ` Rusty Russell
  2006-11-01 10:28           ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Rusty Russell
                             ` (4 more replies)
  2 siblings, 5 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 10:27 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, virtualization, Chris Wright, akpm, linux-kernel

Create a paravirt.h header for all the critical operations which need
to be replaced with hypervisor calls, and include that instead of
defining native operations, when CONFIG_PARAVIRT.

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure.  Currently
these are function implementations of native hardware: hypervisors
will override the ops structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assembler easier.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Zachary Amsden <zach@vmware.com>

===================================================================
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -196,6 +196,17 @@ config X86_ES7000
 	  should say N here.
 
 endchoice
+
+config PARAVIRT
+	bool "Paravirtualization support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Paravirtualization is a way of running multiple instances of
+	  Linux on the same machine, under a hypervisor.  This option
+	  changes the kernel so it can modify itself when it is run
+	  under a hypervisor, improving performance significantly.
+	  However, when run without a hypervisor the kernel is
+	  theoretically slower.  If in doubt, say N.
 
 config ACPI_SRAT
 	bool
===================================================================
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -9,6 +9,7 @@
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
 
+#undef CONFIG_PARAVIRT
 #include <linux/linkage.h>
 #include <linux/vmalloc.h>
 #include <linux/screen_info.h>
===================================================================
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_prin
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 
 EXTRA_AFLAGS   := -traditional
 
===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -101,4 +101,14 @@ void foo(void)
 	BLANK();
  	OFFSET(PDA_cpu, i386_pda, cpu_number);
 	OFFSET(PDA_pcurrent, i386_pda, pcurrent);
+
+#ifdef CONFIG_PARAVIRT
+	BLANK();
+	OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
+	OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+	OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+	OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
 }
===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -61,13 +61,6 @@ DF_MASK		= 0x00000400
 DF_MASK		= 0x00000400 
 NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
-
-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS		cli
-#define ENABLE_INTERRUPTS		sti
-#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
-#define INTERRUPT_RETURN		iret
-#define GET_CR0_INTO_EAX		movl %cr0, %eax
 
 #ifdef CONFIG_PREEMPT
 #define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
@@ -416,6 +409,20 @@ ldt_ss:
 	jnz restore_nocheck
 	testl $0x00400000, %eax		# returning to 32bit stack?
 	jnz restore_nocheck		# allright, normal return
+
+#ifdef CONFIG_PARAVIRT
+	/* 
+	 * The kernel can't run on a non-flat stack if paravirt mode
+	 * is active.  Rather than try to fixup the high bits of
+	 * ESP, bypass this code entirely.  This may break DOSemu
+	 * and/or Wine support in a paravirt VM, although the option
+	 * is still available to implement the setting of the high
+	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
+	 */
+	cmpl $0, paravirt_ops+PARAVIRT_enabled
+	jne restore_nocheck
+#endif
+	
 	/* If returning to userspace with 16bit stack,
 	 * try to fix the higher word of ESP, as the CPU
 	 * won't restore it.
@@ -830,6 +837,19 @@ 1:	INTERRUPT_RETURN
 	.long 1b,iret_exc
 .previous
 KPROBE_END(nmi)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+1:	iret
+.section __ex_table,"a"
+	.align 4
+	.long 1b,iret_exc
+.previous
+
+ENTRY(native_irq_enable_sysexit)
+	sti
+	sysexit
+#endif
 
 KPROBE_ENTRY(int3)
 	RING0_INT_FRAME
===================================================================
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -392,7 +392,10 @@ void __init init_ISA_irqs (void)
 	}
 }
 
-void __init init_IRQ(void)
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
 {
 	int i;
 
===================================================================
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1405,7 +1405,7 @@ void __init setup_arch(char **cmdline_p)
 		efi_init();
 	else {
 		printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-		print_memory_map(machine_specific_memory_setup());
+		print_memory_map(memory_setup());
 	}
 
 	copy_edd();
===================================================================
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
  *		Dave Jones	:	Report invalid combinations of Athlon CPUs.
 *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process. */
 
+
+/* SMP boot always wants to use real time delay to allow sufficient time for
+ * the APs to come online */
+#define USE_REAL_TIME_DELAY
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
===================================================================
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
+#include <asm/time.h>
 
 #include "mach_time.h"
 
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long no
 	/* gets recalled with irq locally disabled */
 	/* XXX - does irqsave resolve this? -johnstul */
 	spin_lock_irqsave(&rtc_lock, flags);
-	if (efi_enabled)
-		retval = efi_set_rtc_mmss(nowtime);
-	else
-		retval = mach_set_rtc_mmss(nowtime);
+	retval = set_wallclock(nowtime);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	return retval;
@@ -211,10 +209,7 @@ unsigned long read_persistent_clock(void
 
 	spin_lock_irqsave(&rtc_lock, flags);
 
-	if (efi_enabled)
-		retval = efi_get_time();
-	else
-		retval = mach_get_cmos_time();
+	retval = get_wallclock();
 
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
@@ -280,7 +275,7 @@ static void __init hpet_time_init(void)
 		printk("Using HPET for base-timer\n");
 	}
 
-	time_init_hook();
+	do_time_init();
 }
 #endif
 
@@ -296,5 +291,5 @@ void __init time_init(void)
 		return;
 	}
 #endif
-	time_init_hook();
-}
+	do_time_init();
+}
===================================================================
--- a/drivers/net/de600.c
+++ b/drivers/net/de600.c
@@ -43,7 +43,6 @@ static const char version[] = "de600.c: 
  * modify the following "#define": (see <asm/io.h> for more info)
 #define REALLY_SLOW_IO
  */
-#define SLOW_IO_BY_JUMPING /* Looks "better" than dummy write to port 0x80 :-) */
 
 /* use 0 for production, 1 for verification, >2 for debug */
 #ifdef DE600_DEBUG
===================================================================
--- a/include/asm-i386/delay.h
+++ b/include/asm-i386/delay.h
@@ -15,6 +15,13 @@ extern void __const_udelay(unsigned long
 extern void __const_udelay(unsigned long usecs);
 extern void __delay(unsigned long loops);
 
+#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY)
+#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul)
+	
+#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul)
+
+#else /* !PARAVIRT || USE_REAL_TIME_DELAY */
+
 #define udelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
 	__udelay(n))
@@ -22,6 +29,7 @@ extern void __delay(unsigned long loops)
 #define ndelay(n) (__builtin_constant_p(n) ? \
 	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
 	__ndelay(n))
+#endif
 
 void use_tsc_delay(void);
 
===================================================================
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -55,6 +55,9 @@ static inline void pack_gate(u32 *low, u
 #define DESCTYPE_DPL3	0x60	/* DPL-3 */
 #define DESCTYPE_S	0x10	/* !system */
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
 
 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
@@ -89,7 +92,11 @@ static inline void write_dt_entry(void *
 	lp[1] = entry_high;
 }
 
-static inline void set_ldt(void *addr, unsigned int entries)
+#define set_ldt native_set_ldt
+#endif /* CONFIG_PARAVIRT */
+
+static inline fastcall void native_set_ldt(const void *addr,
+					   unsigned int entries)
 {
 	if (likely(entries == 0))
 		__asm__ __volatile__("lldt %w0"::"q" (0));
===================================================================
--- a/include/asm-i386/io.h
+++ b/include/asm-i386/io.h
@@ -256,11 +256,11 @@ static inline void flush_write_buffers(v
 
 #endif /* __KERNEL__ */
 
-#ifdef SLOW_IO_BY_JUMPING
-#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
-#else
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else 
+
 #define __SLOW_DOWN_IO "outb %%al,$0x80;"
-#endif
 
 static inline void slow_down_io(void) {
 	__asm__ __volatile__(
@@ -270,6 +270,8 @@ static inline void slow_down_io(void) {
 #endif
 		: : );
 }
+
+#endif
 
 #ifdef CONFIG_X86_NUMAQ
 extern void *xquad_portio;    /* Where the IO area was mapped */
===================================================================
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -41,4 +41,7 @@ extern void fixup_irqs(cpumask_t map);
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+void init_IRQ(void);
+void __init native_init_IRQ(void);
+
 #endif /* _ASM_IRQ_H */
===================================================================
--- a/include/asm-i386/irqflags.h
+++ b/include/asm-i386/irqflags.h
@@ -10,6 +10,9 @@
 #ifndef _ASM_IRQFLAGS_H
 #define _ASM_IRQFLAGS_H
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #ifndef __ASSEMBLY__
 
 static inline unsigned long __raw_local_save_flags(void)
@@ -24,9 +27,6 @@ static inline unsigned long __raw_local_
 
 	return flags;
 }
-
-#define raw_local_save_flags(flags) \
-		do { (flags) = __raw_local_save_flags(); } while (0)
 
 static inline void raw_local_irq_restore(unsigned long flags)
 {
@@ -66,18 +66,6 @@ static inline void halt(void)
 	__asm__ __volatile__("hlt": : :"memory");
 }
 
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-	return !(flags & (1 << 9));
-}
-
-static inline int raw_irqs_disabled(void)
-{
-	unsigned long flags = __raw_local_save_flags();
-
-	return raw_irqs_disabled_flags(flags);
-}
-
 /*
  * For spinlocks, etc:
  */
@@ -90,9 +78,33 @@ static inline unsigned long __raw_local_
 	return flags;
 }
 
+#else
+#define DISABLE_INTERRUPTS		cli
+#define ENABLE_INTERRUPTS		sti
+#define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
+#define INTERRUPT_RETURN		iret
+#define GET_CR0_INTO_EAX		movl %cr0, %eax
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+		do { (flags) = __raw_local_save_flags(); } while (0)
+
 #define raw_local_irq_save(flags) \
 		do { (flags) = __raw_local_irq_save(); } while (0)
 
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+	return !(flags & (1 << 9));
+}
+
+static inline int raw_irqs_disabled(void)
+{
+	unsigned long flags = __raw_local_save_flags();
+
+	return raw_irqs_disabled_flags(flags);
+}
 #endif /* __ASSEMBLY__ */
 
 /*
===================================================================
--- a/include/asm-i386/mach-default/setup_arch.h
+++ b/include/asm-i386/mach-default/setup_arch.h
@@ -2,4 +2,6 @@
 
 /* no action for generic */
 
+#ifndef ARCH_SETUP
 #define ARCH_SETUP
+#endif
===================================================================
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -1,5 +1,9 @@
 #ifndef __ASM_MSR_H
 #define __ASM_MSR_H
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 
 /*
  * Access to machine-specific registers (available on 586 and better only)
@@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long
      __asm__ __volatile__("rdpmc" \
 			  : "=a" (low), "=d" (high) \
 			  : "c" (counter))
+#endif	/* !CONFIG_PARAVIRT */
 
 /* symbolic names for some interesting MSRs */
 /* Intel defined MSRs. */
===================================================================
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -146,8 +146,8 @@ static inline void detect_ht(struct cpui
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
 
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
-			   unsigned int *ecx, unsigned int *edx)
+static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx,
+					 unsigned int *ecx, unsigned int *edx)
 {
 	/* ecx is often an input as well as an output. */
 	__asm__("cpuid"
@@ -548,6 +548,12 @@ static inline void rep_nop(void)
 
 #define cpu_relax()	rep_nop()
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_enabled() 0
+#define __cpuid native_cpuid
+
 static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
 {
 	tss->esp0 = thread->esp0;
@@ -570,10 +576,13 @@ static inline void load_esp0(struct tss_
 			: /* no output */			\
 			:"r" (value))
 
+#define set_iopl_mask native_set_iopl_mask
+#endif /* CONFIG_PARAVIRT */
+
 /*
  * Set IOPL bits in EFLAGS from given mask
  */
-static inline void set_iopl_mask(unsigned mask)
+static fastcall inline void native_set_iopl_mask(unsigned mask)
 {
 	unsigned int reg;
 	__asm__ __volatile__ ("pushfl;"
===================================================================
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -131,5 +131,7 @@
 #define SEGMENT_LDT		0x4
 #define SEGMENT_GDT		0x0
 
+#ifndef CONFIG_PARAVIRT
 #define get_kernel_rpl()  0
 #endif
+#endif
===================================================================
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -70,6 +70,14 @@ struct e820entry;
 struct e820entry;
 
 char * __init machine_specific_memory_setup(void);
+#ifndef CONFIG_PARAVIRT
+static inline char *memory_setup(void)
+{
+	return machine_specific_memory_setup();
+}
+#else
+#include <asm/paravirt.h>
+#endif
 
 int __init copy_e820_map(struct e820entry * biosmap, int nr_map);
 int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map);
===================================================================
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -7,8 +7,12 @@
 #include <asm/processor.h>
 #include <linux/compiler.h>
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define CLI_STRING	"cli"
 #define STI_STRING	"sti"
+#endif /* CONFIG_PARAVIRT */
 
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
===================================================================
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -88,6 +88,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define savesegment(seg, value) \
 	asm volatile("mov %%" #seg ",%0":"=rm" (value))
 
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
 #define read_cr0() ({ \
 	unsigned int __dummy; \
 	__asm__ __volatile__( \
@@ -139,16 +142,17 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define write_cr4(x) \
 	__asm__ __volatile__("movl %0,%%cr4": :"r" (x))
 
-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
-#define stts() write_cr0(8 | read_cr0())
-
-#endif	/* __KERNEL__ */
-
 #define wbinvd() \
 	__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
+#define clts() __asm__ __volatile__ ("clts")
+#endif/* CONFIG_PARAVIRT */
+
+/* Set the 'TS' bit */
+#define stts() write_cr0(8 | read_cr0())
+
+#endif	/* __KERNEL__ */
 
 static inline unsigned long get_limit(unsigned long segment)
 {
===================================================================
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,399 @@
+/*  Paravirtualization interfaces
+    Copyright (C) 2006 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+
+/* nop stub */
+static void native_nop(void)
+{
+}
+
+static void __init default_banner(void)
+{
+	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+	       paravirt_ops.name);
+}
+
+static fastcall unsigned long native_get_debugreg(int regno)
+{
+	unsigned long val = 0; 	/* init silences gcc's spurious "uninitialized" warning */
+
+	switch (regno) {
+	case 0:
+		asm("movl %%db0, %0" :"=r" (val)); break;
+	case 1:
+		asm("movl %%db1, %0" :"=r" (val)); break;
+	case 2:
+		asm("movl %%db2, %0" :"=r" (val)); break;
+	case 3:
+		asm("movl %%db3, %0" :"=r" (val)); break;
+	case 6:
+		asm("movl %%db6, %0" :"=r" (val)); break;
+	case 7:
+		asm("movl %%db7, %0" :"=r" (val)); break;
+	default:
+		BUG();
+	}
+	return val;
+}
+
+static fastcall void native_set_debugreg(int regno, unsigned long value)
+{
+	switch (regno) {
+	case 0:
+		asm("movl %0,%%db0"	: /* no output */ :"r" (value));
+		break;
+	case 1:
+		asm("movl %0,%%db1"	: /* no output */ :"r" (value));
+		break;
+	case 2:
+		asm("movl %0,%%db2"	: /* no output */ :"r" (value));
+		break;
+	case 3:
+		asm("movl %0,%%db3"	: /* no output */ :"r" (value));
+		break;
+	case 6:
+		asm("movl %0,%%db6"	: /* no output */ :"r" (value));
+		break;
+	case 7:
+		asm("movl %0,%%db7"	: /* no output */ :"r" (value));
+		break;
+	default:
+		BUG();
+	}
+}
+
+void init_IRQ(void)
+{
+	paravirt_ops.init_IRQ();
+}
+
+static fastcall void native_clts(void)
+{
+	asm volatile ("clts");
+}
+
+static fastcall unsigned long native_read_cr0(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr0(unsigned long val)
+{
+	asm volatile("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr2(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr2(unsigned long val)
+{
+	asm volatile("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr3(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall void native_write_cr3(unsigned long val)
+{
+	asm volatile("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr4(void)
+{
+	unsigned long val;
+	asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
+	return val;
+}
+
+static fastcall unsigned long native_read_cr4_safe(void)
+{
+	unsigned long val;
+	/* This could fault if %cr4 does not exist */
+	asm("1: movl %%cr4, %0		\n"
+		"2:				\n"
+		".section __ex_table,\"a\"	\n"
+		".long 1b,2b			\n"
+		".previous			\n"
+		: "=r" (val): "0" (0));
+	return val;
+}
+
+static fastcall void native_write_cr4(unsigned long val)
+{
+	asm volatile("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long native_save_fl(void)
+{
+	unsigned long f;
+	asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
+	return f;
+}
+
+static fastcall void native_restore_fl(unsigned long f)
+{
+	asm volatile("pushl %0 ; popfl": /* no output */
+			     :"g" (f)
+			     :"memory", "cc");
+}
+
+static fastcall void native_irq_disable(void)
+{
+	asm volatile("cli": : :"memory");
+}
+
+static fastcall void native_irq_enable(void)
+{
+	asm volatile("sti": : :"memory");
+}
+
+static fastcall void native_safe_halt(void)
+{
+	asm volatile("sti; hlt": : :"memory");
+}
+
+static fastcall void native_halt(void)
+{
+	asm volatile("hlt": : :"memory");
+}
+
+static fastcall void native_wbinvd(void)
+{
+	asm volatile("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+{
+	unsigned long long val;
+
+	asm volatile("2: rdmsr ; xorl %0,%0\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  movl %3,%0 ; jmp 1b\n\t"
+		     ".previous\n\t"
+ 		     ".section __ex_table,\"a\"\n"
+		     "   .align 4\n\t"
+		     "   .long 	2b,3b\n\t"
+		     ".previous"
+		     : "=r" (*err), "=A" (val)
+		     : "c" (msr), "i" (-EFAULT));
+
+	return val;
+}
+
+static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+{
+	int err;
+	asm volatile("2: wrmsr ; xorl %0,%0\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  movl %4,%0 ; jmp 1b\n\t"
+		     ".previous\n\t"
+ 		     ".section __ex_table,\"a\"\n"
+		     "   .align 4\n\t"
+		     "   .long 	2b,3b\n\t"
+		     ".previous"
+		     : "=a" (err)
+		     : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+		       "i" (-EFAULT));
+	return err;
+}
+
+static fastcall unsigned long long native_read_tsc(void)
+{
+	unsigned long long val;
+	asm volatile("rdtsc" : "=A" (val));
+	return val;
+}
+
+static fastcall unsigned long long native_read_pmc(void)
+{
+	unsigned long long val;
+	asm volatile("rdpmc" : "=A" (val));
+	return val;
+}
+
+static fastcall void native_load_tr_desc(void)
+{
+	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+	asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
+{
+	asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+{
+	asm ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+{
+	asm ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long native_store_tr(void)
+{
+	unsigned long tr;
+	asm ("str %0":"=r" (tr));
+	return tr;
+}
+
+static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+	C(0); C(1); C(2);
+#undef C
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+	u32 *lp = (u32 *)((char *)dt + entry*8);
+	lp[0] = entry_low;
+	lp[1] = entry_high;
+}
+
+static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_load_esp0(struct tss_struct *tss,
+				      struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+}
+
+static fastcall void native_io_delay(void)
+{
+	asm volatile("outb %al,$0x80");
+}
+
+/* These are in entry.S */
+extern fastcall void native_iret(void);
+extern fastcall void native_irq_enable_sysexit(void);
+
+static int __init print_banner(void)
+{
+	paravirt_ops.banner();
+	return 0;
+}
+core_initcall(print_banner);
+ 
+struct paravirt_ops paravirt_ops = {
+	.name = "bare hardware",
+	.paravirt_enabled = 0,
+	.kernel_rpl = 0,
+
+	.banner = default_banner,
+	.arch_setup = native_nop,
+	.memory_setup = machine_specific_memory_setup,
+	.get_wallclock = native_get_wallclock,
+	.set_wallclock = native_set_wallclock,
+	.time_init = time_init_hook,
+	.init_IRQ = native_init_IRQ,
+
+	.cpuid = native_cpuid,
+	.get_debugreg = native_get_debugreg,
+	.set_debugreg = native_set_debugreg,
+	.clts = native_clts,
+	.read_cr0 = native_read_cr0,
+	.write_cr0 = native_write_cr0,
+	.read_cr2 = native_read_cr2,
+	.write_cr2 = native_write_cr2,
+	.read_cr3 = native_read_cr3,
+	.write_cr3 = native_write_cr3,
+	.read_cr4 = native_read_cr4,
+	.read_cr4_safe = native_read_cr4_safe,
+	.write_cr4 = native_write_cr4,
+	.save_fl = native_save_fl,
+	.restore_fl = native_restore_fl,
+	.irq_disable = native_irq_disable,
+	.irq_enable = native_irq_enable,
+	.safe_halt = native_safe_halt,
+	.halt = native_halt,
+	.wbinvd = native_wbinvd,
+	.read_msr = native_read_msr,
+	.write_msr = native_write_msr,
+	.read_tsc = native_read_tsc,
+	.read_pmc = native_read_pmc,
+	.load_tr_desc = native_load_tr_desc,
+	.set_ldt = native_set_ldt,
+	.load_gdt = native_load_gdt,
+	.load_idt = native_load_idt,
+	.store_gdt = native_store_gdt,
+	.store_idt = native_store_idt,
+	.store_tr = native_store_tr,
+	.load_tls = native_load_tls,
+	.write_ldt_entry = native_write_ldt_entry,
+	.write_gdt_entry = native_write_gdt_entry,
+	.write_idt_entry = native_write_idt_entry,
+	.load_esp0 = native_load_esp0,
+
+	.set_iopl_mask = native_set_iopl_mask,
+	.io_delay = native_io_delay,
+	.const_udelay = __const_udelay,
+
+	.irq_enable_sysexit = native_irq_enable_sysexit,
+	.iret = native_iret,
+};
+EXPORT_SYMBOL(paravirt_ops);
===================================================================
--- /dev/null
+++ b/include/asm-i386/paravirt.h
@@ -0,0 +1,286 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <linux/linkage.h>
+
+#ifdef CONFIG_PARAVIRT
+#ifndef __ASSEMBLY__
+struct thread_struct;
+struct Xgt_desc_struct;
+struct tss_struct;
+struct paravirt_ops
+{
+	unsigned int kernel_rpl;
+ 	int paravirt_enabled;
+	const char *name;
+
+	void (*arch_setup)(void);
+	char *(*memory_setup)(void);
+	void (*init_IRQ)(void);
+
+	void (*banner)(void);
+
+	unsigned long (*get_wallclock)(void);
+	int (*set_wallclock)(unsigned long);
+	void (*time_init)(void);
+
+	/* All the function pointers here are declared as "fastcall"
+	   so that we get a specific register-based calling
+	   convention.  This makes it easier to implement inline
+	   assembler replacements. */
+
+	void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx);
+
+	unsigned long (fastcall *get_debugreg)(int regno);
+	void (fastcall *set_debugreg)(int regno, unsigned long value);
+
+	void (fastcall *clts)(void);
+
+	unsigned long (fastcall *read_cr0)(void);
+	void (fastcall *write_cr0)(unsigned long);
+
+	unsigned long (fastcall *read_cr2)(void);
+	void (fastcall *write_cr2)(unsigned long);
+
+	unsigned long (fastcall *read_cr3)(void);
+	void (fastcall *write_cr3)(unsigned long);
+
+	unsigned long (fastcall *read_cr4_safe)(void);
+	unsigned long (fastcall *read_cr4)(void);
+	void (fastcall *write_cr4)(unsigned long);
+
+	unsigned long (fastcall *save_fl)(void);
+	void (fastcall *restore_fl)(unsigned long);
+	void (fastcall *irq_disable)(void);
+	void (fastcall *irq_enable)(void);
+	void (fastcall *safe_halt)(void);
+	void (fastcall *halt)(void);
+	void (fastcall *wbinvd)(void);
+
+	/* read_msr sets *err to 0 or -EFAULT; write_msr returns 0 or -EFAULT. */
+	u64 (fastcall *read_msr)(unsigned int msr, int *err);
+	int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+	u64 (fastcall *read_tsc)(void);
+	u64 (fastcall *read_pmc)(void);
+
+	void (fastcall *load_tr_desc)(void);
+	void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
+	void (fastcall *load_idt)(const struct Xgt_desc_struct *);
+	void (fastcall *store_gdt)(struct Xgt_desc_struct *);
+	void (fastcall *store_idt)(struct Xgt_desc_struct *);
+	void (fastcall *set_ldt)(const void *desc, unsigned entries);
+	unsigned long (fastcall *store_tr)(void);
+	void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu);
+	void (fastcall *write_ldt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *write_gdt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *write_idt_entry)(void *dt, int entrynum,
+					 u32 low, u32 high);
+	void (fastcall *load_esp0)(struct tss_struct *tss,
+				   struct thread_struct *thread);
+
+	void (fastcall *set_iopl_mask)(unsigned mask);
+
+	void (fastcall *io_delay)(void);
+	void (*const_udelay)(unsigned long loops);
+
+	/* These two are jumped to (via jmp), never actually called. */
+	void (fastcall *irq_enable_sysexit)(void);
+	void (fastcall *iret)(void);
+};
+
+extern struct paravirt_ops paravirt_ops;
+
+#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
+
+static inline void load_esp0(struct tss_struct *tss,
+			     struct thread_struct *thread)
+{
+	paravirt_ops.load_esp0(tss, thread);
+}
+
+#define ARCH_SETUP			paravirt_ops.arch_setup();
+static inline char *memory_setup(void)
+{
+	return paravirt_ops.memory_setup();
+}
+
+static inline unsigned long get_wallclock(void)
+{
+	return paravirt_ops.get_wallclock();
+}
+
+static inline int set_wallclock(unsigned long nowtime)
+{
+	return paravirt_ops.set_wallclock(nowtime);
+}
+
+static inline void do_time_init(void)
+{
+	return paravirt_ops.time_init();
+}
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+			   unsigned int *ecx, unsigned int *edx)
+{
+	paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
+#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)
+
+#define clts() paravirt_ops.clts()
+
+#define read_cr0() paravirt_ops.read_cr0()
+#define write_cr0(x) paravirt_ops.write_cr0(x)
+
+#define read_cr2() paravirt_ops.read_cr2()
+#define write_cr2(x) paravirt_ops.write_cr2(x)
+
+#define read_cr3() paravirt_ops.read_cr3()
+#define write_cr3(x) paravirt_ops.write_cr3(x)
+
+#define read_cr4() paravirt_ops.read_cr4()
+#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
+#define write_cr4(x) paravirt_ops.write_cr4(x)
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+	return paravirt_ops.save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+	return paravirt_ops.restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+	paravirt_ops.irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+	paravirt_ops.irq_enable();
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+	unsigned long flags = paravirt_ops.save_fl();
+
+	paravirt_ops.irq_disable();
+
+	return flags;
+}
+
+static inline void raw_safe_halt(void)
+{
+	paravirt_ops.safe_halt();
+}
+
+static inline void halt(void)
+{
+	paravirt_ops.safe_halt();
+}
+#define wbinvd() paravirt_ops.wbinvd()
+
+#define get_kernel_rpl()  (paravirt_ops.kernel_rpl)
+
+#define rdmsr(msr,val1,val2) do {				\
+	int _err;						\
+	u64 _l = paravirt_ops.read_msr(msr,&_err);		\
+	val1 = (u32)_l;						\
+	val2 = _l >> 32;					\
+} while(0)
+
+#define wrmsr(msr,val1,val2) do {				\
+	u64 _l = ((u64)(val2) << 32) | (val1);			\
+	paravirt_ops.write_msr((msr), _l);			\
+} while(0)
+
+#define rdmsrl(msr,val) do {					\
+	int _err;						\
+	val = paravirt_ops.read_msr((msr),&_err);		\
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsr_safe(msr,a,b) ({					\
+	u64 _l = ((u64)(b) << 32) | (a);			\
+	paravirt_ops.write_msr((msr),_l);			\
+})
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({					\
+	int _err;						\
+	u64 _l = paravirt_ops.read_msr(msr,&_err);		\
+	(*a) = (u32)_l;						\
+	(*b) = _l >> 32;					\
+	_err; })
+
+#define rdtsc(low,high) do {					\
+	u64 _l = paravirt_ops.read_tsc();			\
+	low = (u32)_l;						\
+	high = _l >> 32;					\
+} while(0)
+
+#define rdtscl(low) do {					\
+	u64 _l = paravirt_ops.read_tsc();			\
+	low = (int)_l;						\
+} while(0)
+
+#define rdtscll(val) (val = paravirt_ops.read_tsc())
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) do {				\
+	u64 _l = paravirt_ops.read_pmc();			\
+	low = (u32)_l;						\
+	high = _l >> 32;					\
+} while(0)
+
+#define load_TR_desc() (paravirt_ops.load_tr_desc())
+#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr))
+#define load_idt(dtr) (paravirt_ops.load_idt(dtr))
+#define set_ldt(addr, entries) (paravirt_ops.set_ldt((addr), (entries)))
+#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr))
+#define store_idt(dtr) (paravirt_ops.store_idt(dtr))
+#define store_tr(tr) ((tr) = paravirt_ops.store_tr())
+#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu)))
+#define write_ldt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_ldt_entry((dt), (entry), (low), (high)))
+#define write_gdt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_gdt_entry((dt), (entry), (low), (high)))
+#define write_idt_entry(dt, entry, low, high)				\
+	(paravirt_ops.write_idt_entry((dt), (entry), (low), (high)))
+#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))
+  
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void) {
+	paravirt_ops.io_delay();
+#ifdef REALLY_SLOW_IO
+	paravirt_ops.io_delay();
+	paravirt_ops.io_delay();
+	paravirt_ops.io_delay();
+#endif
+}
+
+#define CLI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
+#define STI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+#else  /* __ASSEMBLY__ */
+
+#define INTERRUPT_RETURN	jmp *%cs:paravirt_ops+PARAVIRT_iret
+#define DISABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS_SYSEXIT	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
+#define GET_CR0_INTO_EAX	call *paravirt_ops+PARAVIRT_read_cr0
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+#endif	/* __ASM_PARAVIRT_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/time.h
@@ -0,0 +1,41 @@
+#ifndef _ASMi386_TIME_H
+#define _ASMi386_TIME_H
+
+#include <linux/efi.h>
+#include "mach_time.h"
+
+static inline unsigned long native_get_wallclock(void)
+{
+	unsigned long retval;
+
+	if (efi_enabled)
+		retval = efi_get_time();
+	else
+		retval = mach_get_cmos_time();
+
+	return retval;
+}
+
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+	int retval;
+
+	if (efi_enabled)
+		retval = efi_set_rtc_mmss(nowtime);
+	else
+		retval = mach_set_rtc_mmss(nowtime);
+
+	return retval;
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define get_wallclock() native_get_wallclock()
+#define set_wallclock(x) native_set_wallclock(x)
+#define do_time_init() time_init_hook()
+
+#endif /* CONFIG_PARAVIRT */
+
+#endif

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations.
  2006-11-01 10:27         ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Rusty Russell
@ 2006-11-01 10:28           ` Rusty Russell
  2006-11-01 10:29             ` [PATCH 3/7] paravirtualization: More generic paravirtualization entry point Rusty Russell
  2006-11-01 23:27             ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Andrew Morton
  2006-11-01 10:45           ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Arjan van de Ven
                             ` (3 subsequent siblings)
  4 siblings, 2 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 10:28 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, virtualization, Chris Wright, akpm, linux-kernel

It turns out that the most called ops, by several orders of magnitude,
are the interrupt manipulation ops.  These are obvious candidates for
patching, so mark them up and create infrastructure for it.

The method used is that the ops structure has a patch function, which
is called for each place which needs to be patched: this returns a
number of instructions (the rest are NOP-padded).

Usually we can spare a register (%eax) for the binary patched code to
use, but in a couple of critical places in entry.S we can't: we make
the clobbers explicit at the call site, and manually clobber the
allowed registers in debug mode as an extra check.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Zachary Amsden <zach@vmware.com>

===================================================================
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -123,6 +123,20 @@ static unsigned char** find_nop_table(vo
 
 #endif /* CONFIG_X86_64 */
 
+static void nop_out(void *insns, unsigned int len)
+{
+	unsigned char **noptable = find_nop_table();
+
+	while (len > 0) {
+		unsigned int noplen = len;
+		if (noplen > ASM_NOP_MAX)
+			noplen = ASM_NOP_MAX;
+		memcpy(insns, noptable[noplen], noplen);
+		insns += noplen;
+		len -= noplen;
+	}
+}
+
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
 extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -137,10 +151,9 @@ extern u8 __smp_alt_begin[], __smp_alt_e
 
 void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
-	unsigned char **noptable = find_nop_table();
 	struct alt_instr *a;
 	u8 *instr;
-	int diff, i, k;
+	int diff;
 
 	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
 	for (a = start; a < end; a++) {
@@ -158,13 +171,7 @@ void apply_alternatives(struct alt_instr
 #endif
 		memcpy(instr, a->replacement, a->replacementlen);
 		diff = a->instrlen - a->replacementlen;
-		/* Pad the rest with nops */
-		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
-			k = diff;
-			if (k > ASM_NOP_MAX)
-				k = ASM_NOP_MAX;
-			memcpy(a->instr + i, noptable[k], k);
-		}
+		nop_out(instr + a->replacementlen, diff);
 	}
 }
 
@@ -208,7 +215,6 @@ static void alternatives_smp_lock(u8 **s
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
-	unsigned char **noptable = find_nop_table();
 	u8 **ptr;
 
 	for (ptr = start; ptr < end; ptr++) {
@@ -216,7 +222,7 @@ static void alternatives_smp_unlock(u8 *
 			continue;
 		if (*ptr > text_end)
 			continue;
-		**ptr = noptable[1][0];
+		nop_out(*ptr, 1);
 	};
 }
 
@@ -341,6 +347,43 @@ void alternatives_smp_switch(int smp)
 }
 
 #endif
+
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+	struct paravirt_patch *p;
+	int i;
+
+	for (p = start; p < end; p++) {
+		unsigned int used;
+
+		used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
+					  p->len);
+#ifdef CONFIG_DEBUG_KERNEL
+		/* Deliberately clobber regs using "not %reg" to find bugs. */
+		for (i = 0; i < 3; i++) {
+			if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
+				memcpy(p->instr + used, "\xf7\xd0", 2);
+				p->instr[used+1] |= i;
+				used += 2;
+			}
+		}
+#endif
+		/* Pad the rest with nops */
+		nop_out(p->instr + used, p->len - used);
+	}
+
+	/* Sync to be conservative, in case we patched following instructions */
+	sync_core();
+}
+extern struct paravirt_patch __start_parainstructions[],
+	__stop_parainstructions[];
+#else
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+}
+extern struct paravirt_patch *__start_parainstructions, *__stop_parainstructions;
+#endif	/* CONFIG_PARAVIRT */
 
 void __init alternative_instructions(void)
 {
@@ -389,5 +432,6 @@ void __init alternative_instructions(voi
 		alternatives_smp_switch(0);
 	}
 #endif
+ 	apply_paravirt(__start_parainstructions, __stop_parainstructions);
 	local_irq_restore(flags);
 }
===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -53,6 +53,19 @@
 #include <asm/dwarf2.h>
 #include "irq_vectors.h"
 
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization.  The following will never clobber any registers:
+ *   INTERRUPT_RETURN (aka. "iret")
+ *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
 #define nr_syscalls ((syscall_table_size)/4)
 
 CF_MASK		= 0x00000001
@@ -63,9 +76,9 @@ VM_MASK		= 0x00020000
 VM_MASK		= 0x00020000
 
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		DISABLE_INTERRUPTS; TRACE_IRQS_OFF
+#define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
-#define preempt_stop
+#define preempt_stop(clobbers)
 #define resume_kernel		restore_nocheck
 #endif
 
@@ -226,7 +239,7 @@ ENTRY(ret_from_fork)
 	ALIGN
 	RING0_PTREGS_FRAME
 ret_from_exception:
-	preempt_stop
+	preempt_stop(CLBR_ANY)
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 check_userspace:
@@ -237,7 +250,7 @@ check_userspace:
 	jb resume_kernel		# not returning to v8086 or userspace
 
 ENTRY(resume_userspace)
- 	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+ 	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -248,7 +261,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -277,7 +290,7 @@ sysenter_past_esp:
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs and here we enable it straight after entry:
 	 */
-	ENABLE_INTERRUPTS
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushl $(__USER_DS)
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET ss, 0*/
@@ -322,7 +335,7 @@ 1:	movl (%ebp),%ebp
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
@@ -364,7 +377,7 @@ syscall_call:
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)		# store the return value
 syscall_exit:
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -393,7 +406,7 @@ 1:	INTERRUPT_RETURN
 .section .fixup,"ax"
 iret_exc:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushl $0			# no error code
 	pushl $do_iret_error
 	jmp error_code
@@ -436,7 +449,7 @@ ldt_ss:
 	CFI_ADJUST_CFA_OFFSET 4
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
-	DISABLE_INTERRUPTS
+	DISABLE_INTERRUPTS(CLBR_EAX)
 	TRACE_IRQS_OFF
 	lss (%esp), %esp
 	CFI_ADJUST_CFA_OFFSET -8
@@ -451,7 +464,7 @@ work_pending:
 	jz work_notifysig
 work_resched:
 	call schedule
-	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
@@ -507,7 +520,7 @@ syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS		# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS(CLBR_ANY)	# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
 	movl $1, %edx
@@ -691,7 +704,7 @@ ENTRY(device_not_available)
 	GET_CR0_INTO_EAX
 	testl $0x4, %eax		# EM (math emulation bit)
 	jne device_not_available_emulate
-	preempt_stop
+	preempt_stop(CLBR_ANY)
 	call math_state_restore
 	jmp ret_from_exception
 device_not_available_emulate:
===================================================================
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -109,7 +109,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    const Elf_Shdr *sechdrs,
 		    struct module *me)
 {
-	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+		*para = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 
@@ -119,6 +120,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 			alt = s;
 		if (!strcmp(".smp_locks", secstrings + s->sh_name))
 			locks= s;
+		if (!strcmp(".parainstructions", secstrings + s->sh_name))
+			para = s;
 	}
 
 	if (alt) {
@@ -133,6 +136,10 @@ int module_finalize(const Elf_Ehdr *hdr,
 					    lseg, lseg + locks->sh_size,
 					    tseg, tseg + text->sh_size);
 	}
+	if (para) {
+		void *pseg = (void *)para->sh_addr;
+		apply_paravirt(pseg, pseg + para->sh_size);
+	}
 
 	return module_bug_finalize(hdr, sechdrs, me);
 }
===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -38,6 +38,49 @@ static void __init default_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
 	       paravirt_ops.name);
+}
+
+/* Simple instruction patching code. */
+#define DEF_NATIVE(name, code)					\
+	extern const char start_##name[], end_##name[];		\
+	asm("start_" #name ": " code "; end_" #name ":")
+DEF_NATIVE(cli, "cli");
+DEF_NATIVE(sti, "sti");
+DEF_NATIVE(popf, "push %eax; popf");
+DEF_NATIVE(pushf, "pushf; pop %eax");
+DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
+DEF_NATIVE(iret, "iret");
+DEF_NATIVE(sti_sysexit, "sti; sysexit");
+
+static const struct native_insns
+{
+	const char *start, *end;
+} native_insns[] = {
+	[PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+	[PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+	[PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+	[PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+	[PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
+};
+
+static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+	unsigned int insn_len;
+
+	/* Don't touch it if we don't have a replacement */
+	if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
+		return len;
+
+	insn_len = native_insns[type].end - native_insns[type].start;
+
+	/* Likewise, leave it untouched if the replacement won't fit. */
+	if (len < insn_len)
+		return len;
+
+	memcpy(insns, native_insns[type].start, insn_len);
+	return insn_len;
 }
 
 static fastcall unsigned long native_get_debugreg(int regno)
@@ -344,6 +387,7 @@ struct paravirt_ops paravirt_ops = {
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
 
+ 	.patch = native_patch,
 	.banner = default_banner,
 	.arch_setup = native_nop,
 	.memory_setup = machine_specific_memory_setup,
===================================================================
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -154,6 +154,12 @@ SECTIONS
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 	*(.altinstr_replacement)
   }
+  . = ALIGN(4);
+  __start_parainstructions = .;
+  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+	*(.parainstructions)
+  }
+  __stop_parainstructions = .;
   /* .exit.text is discard at runtime, not link time, to deal with references
      from .altinstructions and .eh_frame */
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
===================================================================
--- a/include/asm-i386/alternative.h
+++ b/include/asm-i386/alternative.h
@@ -118,4 +118,7 @@ static inline void alternatives_smp_swit
 #define LOCK_PREFIX ""
 #endif
 
+struct paravirt_patch;
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
+
 #endif /* _I386_ALTERNATIVE_H */
===================================================================
--- a/include/asm-i386/irqflags.h
+++ b/include/asm-i386/irqflags.h
@@ -79,8 +79,8 @@ static inline unsigned long __raw_local_
 }
 
 #else
-#define DISABLE_INTERRUPTS		cli
-#define ENABLE_INTERRUPTS		sti
+#define DISABLE_INTERRUPTS(clobbers)	cli
+#define ENABLE_INTERRUPTS(clobbers)	sti
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
 #define INTERRUPT_RETURN		iret
 #define GET_CR0_INTO_EAX		movl %cr0, %eax
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -3,8 +3,26 @@
 /* Various instructions on x86 need to be replaced for
  * para-virtualization: those hooks are defined here. */
 #include <linux/linkage.h>
+#include <linux/stringify.h>
 
 #ifdef CONFIG_PARAVIRT
+/* These are the most performance critical ops, so we want to be able to patch
+ * callers */
+#define PARAVIRT_IRQ_DISABLE 0
+#define PARAVIRT_IRQ_ENABLE 1
+#define PARAVIRT_RESTORE_FLAGS 2
+#define PARAVIRT_SAVE_FLAGS 3
+#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4
+#define PARAVIRT_INTERRUPT_RETURN 5
+#define PARAVIRT_STI_SYSEXIT 6
+
+/* Bitmask of what can be clobbered: usually at least eax. */
+#define CLBR_NONE 0x0
+#define CLBR_EAX 0x1
+#define CLBR_ECX 0x2
+#define CLBR_EDX 0x4
+#define CLBR_ANY 0x7
+
 #ifndef __ASSEMBLY__
 struct thread_struct;
 struct Xgt_desc_struct;
@@ -14,6 +32,15 @@ struct paravirt_ops
 	unsigned int kernel_rpl;
  	int paravirt_enabled;
 	const char *name;
+
+	/*
+	 * Patch may replace one of the defined code sequences with arbitrary
+	 * code, subject to the same register constraints.  This generally
+	 * means the code is not free to clobber any registers other than EAX.
+	 * The patch function should return the number of bytes of code
+	 * generated, as we nop pad the rest in generic code.
+	 */
+	unsigned (*patch)(u8 type, u16 clobber, void *firstinsn, unsigned len);
 
 	void (*arch_setup)(void);
 	char *(*memory_setup)(void);
@@ -151,35 +178,6 @@ static inline void __cpuid(unsigned int 
 #define read_cr4() paravirt_ops.read_cr4()
 #define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
 #define write_cr4(x) paravirt_ops.write_cr4(x)
-
-static inline unsigned long __raw_local_save_flags(void)
-{
-	return paravirt_ops.save_fl();
-}
-
-static inline void raw_local_irq_restore(unsigned long flags)
-{
-	return paravirt_ops.restore_fl(flags);
-}
-
-static inline void raw_local_irq_disable(void)
-{
-	paravirt_ops.irq_disable();
-}
-
-static inline void raw_local_irq_enable(void)
-{
-	paravirt_ops.irq_enable();
-}
-
-static inline unsigned long __raw_local_irq_save(void)
-{
-	unsigned long flags = paravirt_ops.save_fl();
-
-	paravirt_ops.irq_disable();
-
-	return flags;
-}
 
 static inline void raw_safe_halt(void)
 {
@@ -272,15 +270,130 @@ static inline void slow_down_io(void) {
 #endif
 }
 
-#define CLI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
-#define STI_STRING	"pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch {
+	u8 *instr; 		/* original instructions */
+	u8 instrtype;		/* type of this instruction */
+	u8 len;			/* length of original instruction */
+	u16 clobbers;		/* what registers you may clobber */
+};
+
+#define paravirt_alt(insn_string, typenum, clobber)	\
+	"771:\n\t" insn_string "\n" "772:\n"		\
+	".pushsection .parainstructions,\"a\"\n"	\
+	"  .long 771b\n"				\
+	"  .byte " __stringify(typenum) "\n"		\
+	"  .byte 772b-771b\n"				\
+	"  .short " __stringify(clobber) "\n"		\
+	".popsection"
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+	unsigned long f;
+
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_SAVE_FLAGS, CLBR_NONE)
+			     : "=a"(f): "m"(paravirt_ops.save_fl)
+			     : "memory", "cc");
+	return f;
+}
+
+static inline void raw_local_irq_restore(unsigned long f)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_RESTORE_FLAGS, CLBR_EAX)
+			     : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f)
+			     : "memory", "cc");
+}
+
+static inline void raw_local_irq_disable(void)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%0;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_IRQ_DISABLE, CLBR_EAX)
+			     : : "m" (paravirt_ops.irq_disable)
+			     : "memory", "eax", "cc");
+}
+
+static inline void raw_local_irq_enable(void)
+{
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%0;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_IRQ_ENABLE, CLBR_EAX)
+			     : : "m" (paravirt_ops.irq_enable)
+			     : "memory", "eax", "cc");
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+	unsigned long f;
+
+	__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+					   "call *%1; pushl %%eax;"
+					   "call *%2; popl %%eax;"
+					   "popl %%edx; popl %%ecx",
+					  PARAVIRT_SAVE_FLAGS_IRQ_DISABLE,
+					  CLBR_NONE)
+			     : "=a"(f)
+			     : "m" (paravirt_ops.save_fl),
+			       "m" (paravirt_ops.irq_disable)
+			     : "memory", "cc");
+	return f;
+}
+
+#define CLI_STRING paravirt_alt("pushl %ecx; pushl %edx;"		\
+		     "call *paravirt_ops+PARAVIRT_irq_disable;"		\
+		     "popl %edx; popl %ecx",				\
+		     PARAVIRT_IRQ_DISABLE, CLBR_EAX)
+
+#define STI_STRING paravirt_alt("pushl %ecx; pushl %edx;"		\
+		     "call *paravirt_ops+PARAVIRT_irq_enable;"		\
+		     "popl %edx; popl %ecx",				\
+		     PARAVIRT_IRQ_ENABLE, CLBR_EAX)
+#define CLI_STI_CLOBBERS , "%eax"
+
 #else  /* __ASSEMBLY__ */
-
-#define INTERRUPT_RETURN	jmp *%cs:paravirt_ops+PARAVIRT_iret
-#define DISABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
-#define ENABLE_INTERRUPTS	pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
-#define ENABLE_INTERRUPTS_SYSEXIT	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
-#define GET_CR0_INTO_EAX	call *paravirt_ops+PARAVIRT_read_cr0
+  
+#define PARA_PATCH(ptype, clobbers, ops)	\
+771:;						\
+	ops;					\
+772:;						\
+	.pushsection .parainstructions,"a";	\
+	 .long 771b;				\
+	 .byte ptype;				\
+	 .byte 772b-771b;			\
+	 .short clobbers;			\
+	.popsection
+
+#define INTERRUPT_RETURN				\
+	PARA_PATCH(PARAVIRT_INTERRUPT_RETURN, CLBR_ANY,	\
+	jmp *%cs:paravirt_ops+PARAVIRT_iret)
+
+#define DISABLE_INTERRUPTS(clobbers)			\
+	PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers,	\
+	pushl %ecx; pushl %edx;				\
+	call *paravirt_ops+PARAVIRT_irq_disable;	\
+	popl %edx; popl %ecx)				\
+
+#define ENABLE_INTERRUPTS(clobbers)			\
+	PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers,	\
+	pushl %ecx; pushl %edx;				\
+	call *%cs:paravirt_ops+PARAVIRT_irq_enable;	\
+	popl %edx; popl %ecx)
+
+#define ENABLE_INTERRUPTS_SYSEXIT			\
+	PARA_PATCH(PARAVIRT_STI_SYSEXIT, CLBR_ANY,	\
+	jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit)
+
+#define GET_CR0_INTO_EAX			\
+	call *paravirt_ops+PARAVIRT_read_cr0
+
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_PARAVIRT */
 #endif	/* __ASM_PARAVIRT_H */
===================================================================
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -12,6 +12,7 @@
 #else
 #define CLI_STRING	"cli"
 #define STI_STRING	"sti"
+#define CLI_STI_CLOBBERS
 #endif /* CONFIG_PARAVIRT */
 
 /*
@@ -75,7 +76,9 @@ static inline void __raw_spin_lock_flags
 		"jg 1b\n\t"
 		"jmp 4b\n"
 		"5:\n\t"
-		: "+m" (lock->slock) : "r" (flags) : "memory");
+		: "+m" (lock->slock)
+		: "r" (flags)
+		: "memory" CLI_STI_CLOBBERS);
 }
 #endif
 

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 3/7] paravirtualization: More generic paravirtualization entry point.
  2006-11-01 10:28           ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Rusty Russell
@ 2006-11-01 10:29             ` Rusty Russell
  2006-11-01 10:30               ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Rusty Russell
  2006-11-01 23:27             ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Andrew Morton
  1 sibling, 1 reply; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 10:29 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, virtualization, Chris Wright, akpm, linux-kernel

1) Each hypervisor writes a probe function to detect whether we are
   running under that hypervisor.  paravirt_probe() registers this
   function.

2) If vmlinux is booted with ring != 0, we call all the probe
   functions (with registers except %esp intact) in link order: the
   winner will not return.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Zachary Amsden <zach@vmware.com>

===================================================================
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,6 +39,8 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_prin
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+
+# Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 
 EXTRA_AFLAGS   := -traditional
===================================================================
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -54,6 +54,12 @@
  * can.
  */
 ENTRY(startup_32)
+
+#ifdef CONFIG_PARAVIRT
+        movl %cs, %eax
+        testl $0x3, %eax
+        jnz startup_paravirt
+#endif
 
 /*
  * Set segments to known values.
@@ -486,6 +492,33 @@ ignore_int:
 #endif
 	iret
 
+#ifdef CONFIG_PARAVIRT
+startup_paravirt:
+	cld
+ 	movl $(init_thread_union+THREAD_SIZE),%esp
+
+	/* We take pains to preserve all the regs. */
+	pushl	%edx
+	pushl	%ecx
+	pushl	%eax
+
+	/* paravirt.o is last in link, and that probe fn never returns */
+	pushl	$__start_paravirtprobe
+1:
+	movl	0(%esp), %eax
+	pushl	(%eax)
+	movl	8(%esp), %eax
+	call	*(%esp)
+	popl	%eax
+
+	movl	4(%esp), %eax
+	movl	8(%esp), %ecx
+	movl	12(%esp), %edx
+
+	addl	$4, (%esp)
+	jmp	1b
+#endif
+
 /*
  * Real beginning of normal "text" segment
  */
===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/efi.h>
 #include <linux/bcd.h>
+#include <linux/start_kernel.h>
 
 #include <asm/bug.h>
 #include <asm/paravirt.h>
@@ -381,7 +382,10 @@ static int __init print_banner(void)
 	return 0;
 }
 core_initcall(print_banner);
- 
+
+/* We simply declare start_kernel to be the paravirt probe of last resort. */
+paravirt_probe(start_kernel);
+  
 struct paravirt_ops paravirt_ops = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
===================================================================
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -60,6 +60,12 @@ SECTIONS
 	CONSTRUCTORS
 	} :data
 
+  __start_paravirtprobe = .;
+  .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
+	*(.paravirtprobe)
+  }
+  __stop_paravirtprobe = .;
+
   . = ALIGN(4096);
   __nosave_begin = .;
   .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -119,6 +119,11 @@ struct paravirt_ops
 	void (fastcall *irq_enable_sysexit)(void);
 	void (fastcall *iret)(void);
 };
+
+/* Mark a paravirt probe function. */
+#define paravirt_probe(fn)						\
+	static void (*__paravirtprobe_##fn)(void) __attribute_used__	\
+		__attribute__((__section__(".paravirtprobe"))) = fn
 
 extern struct paravirt_ops paravirt_ops;
 

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels
  2006-11-01 10:29             ` [PATCH 3/7] paravirtualization: More generic paravirtualization entry point Rusty Russell
@ 2006-11-01 10:30               ` Rusty Russell
  2006-11-01 10:31                 ` [PATCH 5/7] paravirtualization: Allow disabling legacy power management modes with " Rusty Russell
  2006-11-01 23:29                 ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Andrew Morton
  0 siblings, 2 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 10:30 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chris Wright, virtualization, akpm, linux-kernel

Allow selected bug checks to be skipped by paravirt kernels.  The two most
important are the F00F workaround (which is either done by the hypervisor
or not required) and the 'hlt' instruction check, which can break under
some hypervisors.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

===================================================================
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct 
 	 * Note that the workaround only should be initialized once...
 	 */
 	c->f00f_bug = 0;
-	if ( c->x86 == 5 ) {
+	if (!paravirt_enabled() && c->x86 == 5) {
 		static int f00f_workaround_enabled = 0;
 
 		c->f00f_bug = 1;
===================================================================
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -21,6 +21,7 @@
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
+#include <asm/paravirt.h>
 
 static int __init no_halt(char *s)
 {
@@ -91,6 +92,9 @@ static void __init check_fpu(void)
 
 static void __init check_hlt(void)
 {
+	if (paravirt_enabled())
+		return;
+
 	printk(KERN_INFO "Checking 'hlt' instruction... ");
 	if (!boot_cpu_data.hlt_works_ok) {
 		printk("disabled\n");

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 5/7] paravirtualization: Allow disabling legacy power management modes with paravirt kernels
  2006-11-01 10:30               ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Rusty Russell
@ 2006-11-01 10:31                 ` Rusty Russell
  2006-11-01 10:32                   ` [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops Rusty Russell
  2006-11-01 23:29                 ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Andrew Morton
  1 sibling, 1 reply; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 10:31 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, virtualization, Chris Wright, akpm, linux-kernel

Two legacy power management modes are much easier to just explicitly disable
when running in paravirtualized mode - neither APM nor PnP remains relevant.
The status of ACPI is still debatable, and noacpi is still a common enough
boot parameter that it is not necessary to explicitly disable ACPI.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

===================================================================
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -231,6 +231,7 @@
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/i8253.h>
+#include <asm/paravirt.h>
 
 #include "io_ports.h"
 
@@ -2191,7 +2192,7 @@ static int __init apm_init(void)
 
 	dmi_check_system(apm_dmi_table);
 
-	if (apm_info.bios.version == 0) {
+	if (apm_info.bios.version == 0 || paravirt_enabled()) {
 		printk(KERN_INFO "apm: BIOS not found.\n");
 		return -ENODEV;
 	}
===================================================================
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -530,7 +530,8 @@ static int __init pnpbios_init(void)
 	if (check_legacy_ioport(PNPBIOS_BASE))
 		return -ENODEV;
 #endif
-	if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table)) {
+	if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) ||
+	    paravirt_enabled()) {
 		printk(KERN_INFO "PnPBIOS: Disabled\n");
 		return -ENODEV;
 	}

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops.
  2006-11-01 10:31                 ` [PATCH 5/7] paravirtualization: Allow disabling legacy power management modes with " Rusty Russell
@ 2006-11-01 10:32                   ` Rusty Russell
  2006-11-01 10:34                     ` [PATCH 7/7] paravirtualization: Add mmu virtualization " Rusty Russell
  2006-11-01 23:31                     ` [PATCH 6/7] paravirtualization: Add APIC accessors " Andrew Morton
  0 siblings, 2 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 10:32 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, virtualization, Chris Wright, akpm, linux-kernel

Add APIC accessors to paravirt-ops.  Unfortunately, we need two write
functions, as some older broken hardware requires workarounds for
Pentium APIC errata - this is the purpose of apic_write_atomic.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -29,6 +29,8 @@
 #include <asm/time.h>
 #include <asm/irq.h>
 #include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
 
 /* nop stub */
 static void native_nop(void)
@@ -441,6 +443,12 @@ struct paravirt_ops paravirt_ops = {
 	.io_delay = native_io_delay,
 	.const_udelay = __const_udelay,
 
+#ifdef CONFIG_X86_LOCAL_APIC
+	.apic_write = native_apic_write,
+	.apic_write_atomic = native_apic_write_atomic,
+	.apic_read = native_apic_read,
+#endif
+
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 };
===================================================================
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -37,18 +37,27 @@ extern void generic_apic_probe(void);
 /*
  * Basic functions accessing APICs.
  */
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define apic_write native_apic_write
+#define apic_write_atomic native_apic_write_atomic
+#define apic_read native_apic_read
+#endif
 
-static __inline void apic_write(unsigned long reg, unsigned long v)
+static __inline fastcall void native_apic_write(unsigned long reg,
+						unsigned long v)
 {
 	*((volatile unsigned long *)(APIC_BASE+reg)) = v;
 }
 
-static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static __inline fastcall void native_apic_write_atomic(unsigned long reg,
+						       unsigned long v)
 {
 	xchg((volatile unsigned long *)(APIC_BASE+reg), v);
 }
 
-static __inline unsigned long apic_read(unsigned long reg)
+static __inline fastcall unsigned long native_apic_read(unsigned long reg)
 {
 	return *((volatile unsigned long *)(APIC_BASE+reg));
 }
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -114,6 +114,12 @@ struct paravirt_ops
 
 	void (fastcall *io_delay)(void);
 	void (*const_udelay)(unsigned long loops);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	void (fastcall *apic_write)(unsigned long reg, unsigned long v);
+	void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
+	unsigned long (fastcall *apic_read)(unsigned long reg);
+#endif
 
 	/* These two are jmp to, not actually called. */
 	void (fastcall *irq_enable_sysexit)(void);
@@ -275,6 +281,27 @@ static inline void slow_down_io(void) {
 #endif
 }
 
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Basic functions accessing APICs.
+ */
+static __inline void apic_write(unsigned long reg, unsigned long v)
+{
+	paravirt_ops.apic_write(reg,v);
+}
+
+static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+{
+	paravirt_ops.apic_write_atomic(reg,v);
+}
+
+static __inline unsigned long apic_read(unsigned long reg)
+{
+	return paravirt_ops.apic_read(reg);
+}
+#endif
+
+
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch {
 	u8 *instr; 		/* original instructions */

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 7/7] paravirtualization: Add mmu virtualization to paravirt-ops.
  2006-11-01 10:32                   ` [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops Rusty Russell
@ 2006-11-01 10:34                     ` Rusty Russell
  2006-11-01 23:31                     ` [PATCH 6/7] paravirtualization: Add APIC accessors " Andrew Morton
  1 sibling, 0 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 10:34 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, virtualization, Chris Wright, akpm, linux-kernel

Add the three bare TLB accessor functions to paravirt-ops.  Most amusingly,
flush_tlb is redefined on SMP, so I can't call the paravirt op flush_tlb.
Instead, I chose to indicate the actual flush type, kernel (global) vs. user
(non-global).  Global in this sense means using the global bit in the page
table entry, which makes TLB entries persistent across CR3 reloads, not
global as in the SMP sense of invoking remote shootdowns, so the term is
confusingly overloaded.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>

===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -31,6 +31,7 @@
 #include <asm/delay.h>
 #include <asm/fixmap.h>
 #include <asm/apic.h>
+#include <asm/tlbflush.h>
 
 /* nop stub */
 static void native_nop(void)
@@ -373,6 +374,97 @@ static fastcall void native_io_delay(voi
 {
 	asm volatile("outb %al,$0x80");
 }
+
+static fastcall void native_flush_tlb(void)
+{
+	__native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+static fastcall void native_flush_tlb_global(void)
+{
+	__native_flush_tlb_global();
+}
+
+static fastcall void native_flush_tlb_single(u32 addr)
+{
+	__native_flush_tlb_single(addr);
+}
+
+#ifndef CONFIG_X86_PAE
+static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	*pmdp = pmdval;
+}
+
+#else /* CONFIG_X86_PAE */
+
+static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	set_64bit((unsigned long long *)ptep,pte_val(pteval));
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
+}
+
+static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
+{
+	*pudp = pudval;
+}
+
+static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = 0;
+}
+
+static fastcall void native_pmd_clear(pmd_t *pmd)
+{
+	u32 *tmp = (u32 *)pmd;
+	*tmp = 0;
+	smp_wmb();
+	*(tmp + 1) = 0;
+}
+#endif /* CONFIG_X86_PAE */
 
 /* These are in entry.S */
 extern fastcall void native_iret(void);
@@ -449,6 +541,23 @@ struct paravirt_ops paravirt_ops = {
 	.apic_read = native_apic_read,
 #endif
 
+	.flush_tlb_user = native_flush_tlb,
+	.flush_tlb_kernel = native_flush_tlb_global,
+	.flush_tlb_single = native_flush_tlb_single,
+
+	.set_pte = native_set_pte,
+	.set_pte_at = native_set_pte_at,
+	.set_pmd = native_set_pmd,
+	.pte_update = (void *)native_nop,
+	.pte_update_defer = (void *)native_nop,
+#ifdef CONFIG_X86_PAE
+	.set_pte_atomic = native_set_pte_atomic,
+	.set_pte_present = native_set_pte_present,
+	.set_pud = native_set_pud,
+	.pte_clear = native_pte_clear,
+	.pmd_clear = native_pmd_clear,
+#endif
+
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
 };
===================================================================
--- a/arch/i386/mm/boot_ioremap.c
+++ b/arch/i386/mm/boot_ioremap.c
@@ -16,6 +16,7 @@
  */
 
 #undef CONFIG_X86_PAE
+#undef CONFIG_PARAVIRT
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -4,6 +4,7 @@
  * para-virtualization: those hooks are defined here. */
 #include <linux/linkage.h>
 #include <linux/stringify.h>
+#include <asm/page.h>
 
 #ifdef CONFIG_PARAVIRT
 /* These are the most performance critical ops, so we want to be able to patch
@@ -27,6 +28,7 @@ struct thread_struct;
 struct thread_struct;
 struct Xgt_desc_struct;
 struct tss_struct;
+struct mm_struct;
 struct paravirt_ops
 {
 	unsigned int kernel_rpl;
@@ -119,6 +121,23 @@ struct paravirt_ops
 	void (fastcall *apic_write)(unsigned long reg, unsigned long v);
 	void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
 	unsigned long (fastcall *apic_read)(unsigned long reg);
+#endif
+
+	void (fastcall *flush_tlb_user)(void);
+	void (fastcall *flush_tlb_kernel)(void);
+	void (fastcall *flush_tlb_single)(u32 addr);
+
+	void (fastcall *set_pte)(pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+	void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep);
+	void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep);
+#ifdef CONFIG_X86_PAE
+	void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval);
+	void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte);
+	void (fastcall *set_pud)(pud_t *pudp, pud_t pudval);
+	void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+	void (fastcall *pmd_clear)(pmd_t *pmdp);
 #endif
 
 	/* These two are jmp to, not actually called. */
@@ -302,6 +321,62 @@ static __inline unsigned long apic_read(
 #endif
 
 
+#define __flush_tlb() paravirt_ops.flush_tlb_user()
+#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
+#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
+
+static inline void set_pte(pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte(ptep, pteval);
+}
+
+static inline void set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte_at(mm, addr, ptep, pteval);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	paravirt_ops.set_pmd(pmdp, pmdval);
+}
+
+static inline void pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	paravirt_ops.pte_update(mm, addr, ptep);
+}
+
+static inline void pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	paravirt_ops.pte_update_defer(mm, addr, ptep);
+}
+
+#ifdef CONFIG_X86_PAE
+static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	paravirt_ops.set_pte_atomic(ptep, pteval);
+}
+
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	paravirt_ops.set_pte_present(mm, addr, ptep, pte);
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pudval)
+{
+	paravirt_ops.set_pud(pudp, pudval);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	paravirt_ops.pte_clear(mm, addr, ptep);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+	paravirt_ops.pmd_clear(pmdp);
+}
+#endif
+
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch {
 	u8 *instr; 		/* original instructions */
===================================================================
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -11,11 +11,14 @@
  * within a page table are directly modified.  Thus, the following
  * hook is made available.
  */
+#ifndef CONFIG_PARAVIRT
 #define set_pte(pteptr, pteval) (*(pteptr) = pteval)
 #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
+#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
+#endif
+
 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
 #define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval)
-#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
 
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
===================================================================
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -42,6 +42,7 @@ static inline int pte_exec_kernel(pte_t 
 	return pte_x(pte);
 }
 
+#ifndef CONFIG_PARAVIRT
 /* Rules for using set_pte: the pte being assigned *must* be
  * either not present or in a state where the hardware will
  * not attempt to update the pte.  In places where this is
@@ -71,32 +72,6 @@ static inline void set_pte_present(struc
 	ptep->pte_low = pte.pte_low;
 }
 
-#define set_pte_atomic(pteptr,pteval) \
-		set_64bit((unsigned long long *)(pteptr),pte_val(pteval))
-#define set_pmd(pmdptr,pmdval) \
-		set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
-#define set_pud(pudptr,pudval) \
-		(*(pudptr) = (pudval))
-
-/*
- * Pentium-II erratum A13: in PAE mode we explicitly have to flush
- * the TLB via cr3 if the top-level pgd is changed...
- * We do not let the generic code free and clear pgd entries due to
- * this erratum.
- */
-static inline void pud_clear (pud_t * pud) { }
-
-#define pud_page(pud) \
-((struct page *) __va(pud_val(pud) & PAGE_MASK))
-
-#define pud_page_vaddr(pud) \
-((unsigned long) __va(pud_val(pud) & PAGE_MASK))
-
-
-/* Find an entry in the second-level page table.. */
-#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
-			pmd_index(address))
-
 /*
  * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
  * entry, so clear the bottom half first and enforce ordering with a compiler
@@ -116,6 +91,33 @@ static inline void pmd_clear(pmd_t *pmd)
 	smp_wmb();
 	*(tmp + 1) = 0;
 }
+
+#define set_pte_atomic(pteptr,pteval) \
+		set_64bit((unsigned long long *)(pteptr),pte_val(pteval))
+#define set_pmd(pmdptr,pmdval) \
+		set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
+#define set_pud(pudptr,pudval) \
+		(*(pudptr) = (pudval))
+#endif
+
+/*
+ * Pentium-II erratum A13: in PAE mode we explicitly have to flush
+ * the TLB via cr3 if the top-level pgd is changed...
+ * We do not let the generic code free and clear pgd entries due to
+ * this erratum.
+ */
+static inline void pud_clear (pud_t * pud) { }
+
+#define pud_page(pud) \
+((struct page *) __va(pud_val(pud) & PAGE_MASK))
+
+#define pud_page_vaddr(pud) \
+((unsigned long) __va(pud_val(pud) & PAGE_MASK))
+
+
+/* Find an entry in the second-level page table.. */
+#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
+			pmd_index(address))
 
 static inline pte_t raw_ptep_get_and_clear(pte_t *ptep)
 {
===================================================================
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -15,6 +15,7 @@
 #include <asm/processor.h>
 #include <asm/fixmap.h>
 #include <linux/threads.h>
+#include <asm/paravirt.h>
 
 #ifndef _I386_BITOPS_H
 #include <asm/bitops.h>
@@ -246,6 +247,7 @@ static inline pte_t pte_mkhuge(pte_t pte
 # include <asm/pgtable-2level.h>
 #endif
 
+#ifndef CONFIG_PARAVIRT
 /*
  * Rules for using pte_update - it must be called after any PTE update which
  * has not been done using the set_pte / clear_pte interfaces.  It is used by
@@ -261,7 +263,7 @@ static inline pte_t pte_mkhuge(pte_t pte
  */
 #define pte_update(mm, addr, ptep)		do { } while (0)
 #define pte_update_defer(mm, addr, ptep)	do { } while (0)
-
+#endif
 
 /*
  * We only update the dirty/accessed state if we set
===================================================================
--- a/include/asm-i386/tlbflush.h
+++ b/include/asm-i386/tlbflush.h
@@ -4,7 +4,15 @@
 #include <linux/mm.h>
 #include <asm/processor.h>
 
-#define __flush_tlb()							\
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#endif
+
+#define __native_flush_tlb()						\
 	do {								\
 		unsigned int tmpreg;					\
 									\
@@ -19,7 +27,7 @@
  * Global pages have to be flushed a bit differently. Not a real
  * performance problem because this does not happen often.
  */
-#define __flush_tlb_global()						\
+#define __native_flush_tlb_global()					\
 	do {								\
 		unsigned int tmpreg, cr4, cr4_orig;			\
 									\
@@ -36,6 +44,9 @@
 			: "memory");					\
 	} while (0)
 
+#define __native_flush_tlb_single(addr) 				\
+	__asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
+
 # define __flush_tlb_all()						\
 	do {								\
 		if (cpu_has_pge)					\
@@ -45,9 +56,6 @@
 	} while (0)
 
 #define cpu_has_invlpg	(boot_cpu_data.x86 > 3)
-
-#define __flush_tlb_single(addr) \
-	__asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
 
 #ifdef CONFIG_X86_INVLPG
 # define __flush_tlb_one(addr) __flush_tlb_single(addr)

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-01 10:27         ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Rusty Russell
  2006-11-01 10:28           ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Rusty Russell
@ 2006-11-01 10:45           ` Arjan van de Ven
  2006-11-01 17:27             ` Andi Kleen
  2006-11-01 23:32             ` Rusty Russell
  2006-11-02  7:13           ` Andrew Morton
                             ` (2 subsequent siblings)
  4 siblings, 2 replies; 54+ messages in thread
From: Arjan van de Ven @ 2006-11-01 10:45 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Chris Wright, Andi Kleen, akpm, virtualization, linux-kernel

On Wed, 2006-11-01 at 21:27 +1100, Rusty Russell wrote:
> Create a paravirt.h header for all the critical operations which need
> to be replaced with hypervisor calls, and include that instead of
> defining native operations, when CONFIG_PARAVIRT.
> 
> This patch does the dumbest possible replacement of paravirtualized
> instructions: calls through a "paravirt_ops" structure.  Currently
> these are function implementations of native hardware: hypervisors
> will override the ops structure with their own variants.
> 
> All the pv-ops functions are declared "fastcall" so that a specific
> register-based ABI is used, to make inlining assembler easier.


this is a lot of infrastructure... do we have more than 1 user of this
yet that wants to get merged in mainline?


-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels
  2006-10-28  7:00 ` [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels Chris Wright
@ 2006-11-01 12:17   ` Pavel Machek
  2006-11-01 22:40     ` Dave Jones
  2006-11-01 23:24     ` Zachary Amsden
  0 siblings, 2 replies; 54+ messages in thread
From: Pavel Machek @ 2006-11-01 12:17 UTC (permalink / raw)
  To: Chris Wright
  Cc: akpm, ak, Rusty Russell, Jeremy Fitzhardinge, Zachary Amsden,
	linux-kernel, virtualization

On Sat 2006-10-28 00:00:04, Chris Wright wrote:
> Allow selected bug checks to be skipped by paravirt kernels.  The two most
> important are the F00F workaround (which is either done by the hypervisor,
> or not required), and the 'hlt' instruction check, which can break under
> some hypervisors.

How can hlt check break? It is hlt;hlt;hlt, IIRC, that looks fairly
innocent to me.

> --- linux-2.6-pv.orig/arch/i386/kernel/cpu/intel.c
> +++ linux-2.6-pv/arch/i386/kernel/cpu/intel.c
> @@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct 
>  	 * Note that the workaround only should be initialized once...
>  	 */
>  	c->f00f_bug = 0;
> -	if ( c->x86 == 5 ) {
> +	if (!paravirt_enabled() && c->x86 == 5) {

I'd do x86==5 check first... pentiums are not common any more.

								Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-01 10:45           ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Arjan van de Ven
@ 2006-11-01 17:27             ` Andi Kleen
  2006-11-01 23:32             ` Rusty Russell
  1 sibling, 0 replies; 54+ messages in thread
From: Andi Kleen @ 2006-11-01 17:27 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Rusty Russell, virtualization, Chris Wright, akpm, linux-kernel

On Wednesday 01 November 2006 11:45, Arjan van de Ven wrote:

> this is a lot of infrastructure... do we have more than 1 user of this
> yet that wants to get merged in mainline?

AFAIK xen, vmi, lhype (and native ops).

-Andi

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels
  2006-11-01 12:17   ` Pavel Machek
@ 2006-11-01 22:40     ` Dave Jones
  2006-11-01 23:24     ` Zachary Amsden
  1 sibling, 0 replies; 54+ messages in thread
From: Dave Jones @ 2006-11-01 22:40 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Chris Wright, akpm, ak, Rusty Russell, Jeremy Fitzhardinge,
	Zachary Amsden, linux-kernel, virtualization

On Wed, Nov 01, 2006 at 01:17:53PM +0100, Pavel Machek wrote:

 > > +++ linux-2.6-pv/arch/i386/kernel/cpu/intel.c
 > > @@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct 
 > >  	 * Note that the workaround only should be initialized once...
 > >  	 */
 > >  	c->f00f_bug = 0;
 > > -	if ( c->x86 == 5 ) {
 > > +	if (!paravirt_enabled() && c->x86 == 5) {
 > 
 > I'd do x86==5 check first... pentiums are not common any more.

It's not like paravirt_enabled will be common-case either,
and as this isn't exactly a performance critical piece of code,
it doesn't really matter which way around the checks are done.

	Dave

-- 
http://www.codemonkey.org.uk

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels
  2006-11-01 12:17   ` Pavel Machek
  2006-11-01 22:40     ` Dave Jones
@ 2006-11-01 23:24     ` Zachary Amsden
  2006-11-02 10:20       ` Pavel Machek
  1 sibling, 1 reply; 54+ messages in thread
From: Zachary Amsden @ 2006-11-01 23:24 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Chris Wright, akpm, ak, Rusty Russell, Jeremy Fitzhardinge,
	linux-kernel, virtualization

Pavel Machek wrote:
> On Sat 2006-10-28 00:00:04, Chris Wright wrote:
>   
>> Allow selected bug checks to be skipped by paravirt kernels.  The two most
>> important are the F00F workaround (which is either done by the hypervisor,
>> or not required), and the 'hlt' instruction check, which can break under
>> some hypervisors.
>>     
>
> How can hlt check break? It is hlt;hlt;hlt, IIRC, that looks fairly
> innocent to me.
>   

Not if you use tickless timers that don't generate interrupts to unhalt 
you, or if you delay ticks until the next scheduled timeout and you 
haven't yet scheduled any timeout.  Both are likely in a hypervisor.

Zach

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations.
  2006-11-01 10:28           ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Rusty Russell
  2006-11-01 10:29             ` [PATCH 3/7] paravirtualization: More generic paravirtualization entry point Rusty Russell
@ 2006-11-01 23:27             ` Andrew Morton
  2006-11-02  0:47               ` Rusty Russell
  1 sibling, 1 reply; 54+ messages in thread
From: Andrew Morton @ 2006-11-01 23:27 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Chris Wright, Andi Kleen, linux-kernel, virtualization

On Wed, 01 Nov 2006 21:28:13 +1100
Rusty Russell <rusty@rustcorp.com.au> wrote:

> +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
> +{
> +	struct paravirt_patch *p;
> +	int i;
> +
> +	for (p = start; p < end; p++) {
> +		unsigned int used;
> +
> +		used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
> +					  p->len);
> +#ifdef CONFIG_DEBUG_KERNEL
> +		/* Deliberately clobber regs using "not %reg" to find bugs. */

That would be considered to be abusive of CONFIG_DEBUG_KERNEL.  A
CONFIG_DEBUG_PARAVIRT which depends on CONFIG_DEBUG_KERNEL would be more
harmonious.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels
  2006-11-01 10:30               ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Rusty Russell
  2006-11-01 10:31                 ` [PATCH 5/7] paravirtualization: Allow disabling legacy power management modes with " Rusty Russell
@ 2006-11-01 23:29                 ` Andrew Morton
  2006-11-01 23:58                   ` Jeremy Fitzhardinge
  2006-11-02  0:01                   ` Rusty Russell
  1 sibling, 2 replies; 54+ messages in thread
From: Andrew Morton @ 2006-11-01 23:29 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Chris Wright, Andi Kleen, linux-kernel, virtualization

On Wed, 01 Nov 2006 21:30:43 +1100
Rusty Russell <rusty@rustcorp.com.au> wrote:

> --- a/include/asm-i386/bugs.h
> +++ b/include/asm-i386/bugs.h
> @@ -21,6 +21,7 @@
>  #include <asm/processor.h>
>  #include <asm/i387.h>
>  #include <asm/msr.h>
> +#include <asm/paravirt.h>

In many other places you have

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
...

But not here.

Making <asm/paravirt.h> invulnerable would be the more typical approach.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops.
  2006-11-01 10:32                   ` [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops Rusty Russell
  2006-11-01 10:34                     ` [PATCH 7/7] paravirtualization: Add mmu virtualization " Rusty Russell
@ 2006-11-01 23:31                     ` Andrew Morton
  2006-11-02  0:46                       ` Rusty Russell
  1 sibling, 1 reply; 54+ messages in thread
From: Andrew Morton @ 2006-11-01 23:31 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Chris Wright, Andi Kleen, linux-kernel, virtualization

On Wed, 01 Nov 2006 21:32:30 +1100
Rusty Russell <rusty@rustcorp.com.au> wrote:

> +static __inline void apic_write(unsigned long reg, unsigned long v)
> +static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
> +static __inline unsigned long apic_read(unsigned long reg)

Just `inline', please.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-01 10:45           ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Arjan van de Ven
  2006-11-01 17:27             ` Andi Kleen
@ 2006-11-01 23:32             ` Rusty Russell
  1 sibling, 0 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-01 23:32 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, Andi Kleen, virtualization, Chris Wright, akpm,
	linux-kernel

On Wed, 2006-11-01 at 11:45 +0100, Arjan van de Ven wrote:
> On Wed, 2006-11-01 at 21:27 +1100, Rusty Russell wrote:
> > Create a paravirt.h header for all the critical operations which need
> > to be replaced with hypervisor calls, and include that instead of
> > defining native operations, when CONFIG_PARAVIRT.
> this is a lot of infrastructure... do we have more than 1 user of this
> yet that wants to get merged in mainline?

Yep.  Xen and VMI both have patches on top of this pending merge.  I
also have a toy hypervisor "lhype" based on this, but it's not ready for
mainline.  (It seems people expect consoles to do *input* as well as
output).

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels
  2006-11-01 23:29                 ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Andrew Morton
@ 2006-11-01 23:58                   ` Jeremy Fitzhardinge
  2006-11-02  0:01                   ` Rusty Russell
  1 sibling, 0 replies; 54+ messages in thread
From: Jeremy Fitzhardinge @ 2006-11-01 23:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chris Wright, Andi Kleen, linux-kernel, virtualization

Andrew Morton wrote:
> On Wed, 01 Nov 2006 21:30:43 +1100
> Rusty Russell <rusty@rustcorp.com.au> wrote:
>
>   
>> --- a/include/asm-i386/bugs.h
>> +++ b/include/asm-i386/bugs.h
>> @@ -21,6 +21,7 @@
>>  #include <asm/processor.h>
>>  #include <asm/i387.h>
>>  #include <asm/msr.h>
>> +#include <asm/paravirt.h>
>>     
>
> In many other places you have
>
> #ifdef CONFIG_PARAVIRT
> #include <asm/paravirt.h>
> ...
>
> But not here.
>
> Making <asm/paravirt.h> invulnerable would be the more typical approach.
CONFIG_PARAVIRT is not being used to guard asm/paravirt.h from multiple 
inclusion.  In places where it is being used to guard #include 
<asm/paravirt.h>, the idea is that asm/paravirt.h defines various 
inlines/macros which would otherwise be defined in the header.  So, for 
example, asm/desc.h would normally define load_gdt() in the 
!CONFIG_PARAVIRT case, but asm/paravirt.h defines it when 
CONFIG_PARAVIRT is enabled. 

In this case, asm/paravirt.h is included because we need the definition for 
paravirt_enabled(), not because it is replacing any of bugs.h's definitions.

    J

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels
  2006-11-01 23:29                 ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Andrew Morton
  2006-11-01 23:58                   ` Jeremy Fitzhardinge
@ 2006-11-02  0:01                   ` Rusty Russell
  1 sibling, 0 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-02  0:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chris Wright, Andi Kleen, linux-kernel, virtualization

On Wed, 2006-11-01 at 15:29 -0800, Andrew Morton wrote:
> On Wed, 01 Nov 2006 21:30:43 +1100
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> 
> > --- a/include/asm-i386/bugs.h
> > +++ b/include/asm-i386/bugs.h
> > @@ -21,6 +21,7 @@
> >  #include <asm/processor.h>
> >  #include <asm/i387.h>
> >  #include <asm/msr.h>
> > +#include <asm/paravirt.h>
> 
> In many other places you have
> 
> #ifdef CONFIG_PARAVIRT
> #include <asm/paravirt.h>
> ...
> 
> But not here.
> 
> Making <asm/paravirt.h> invulnerable would be the more typical approach.

It *is* actually safe.  The "#ifdef CONFIG_PARAVIRT / #include
<asm/paravirt.h> / #else / <... native versions...>" is to give a big
hint to the reader to look in paravirt.h for the real definitions.

Originally I had a noparavirt.h where all these lived, and people hated
it.  So we did it this way, which minimizes churn.

Rusty.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops.
  2006-11-01 23:31                     ` [PATCH 6/7] paravirtualization: Add APIC accessors " Andrew Morton
@ 2006-11-02  0:46                       ` Rusty Russell
  0 siblings, 0 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-02  0:46 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chris Wright, Andi Kleen, linux-kernel, virtualization

On Wed, 2006-11-01 at 15:31 -0800, Andrew Morton wrote:
> On Wed, 01 Nov 2006 21:32:30 +1100
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> 
> > +static __inline void apic_write(unsigned long reg, unsigned long v)
> > +static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
> > +static __inline unsigned long apic_read(unsigned long reg)
> 
> Just `inline', please.

akpm says: "Just `inline', please."

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 3a3bc9aed04c include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h	Thu Nov 02 11:42:22 2006 +1100
+++ b/include/asm-i386/paravirt.h	Thu Nov 02 11:44:15 2006 +1100
@@ -304,17 +304,17 @@ static inline void slow_down_io(void) {
 /*
  * Basic functions accessing APICs.
  */
-static __inline void apic_write(unsigned long reg, unsigned long v)
+static inline void apic_write(unsigned long reg, unsigned long v)
 {
 	paravirt_ops.apic_write(reg,v);
 }
 
-static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static inline void apic_write_atomic(unsigned long reg, unsigned long v)
 {
 	paravirt_ops.apic_write_atomic(reg,v);
 }
 
-static __inline unsigned long apic_read(unsigned long reg)
+static inline unsigned long apic_read(unsigned long reg)
 {
 	return paravirt_ops.apic_read(reg);
 }

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations.
  2006-11-01 23:27             ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Andrew Morton
@ 2006-11-02  0:47               ` Rusty Russell
  2006-11-02  0:54                 ` Zachary Amsden
  0 siblings, 1 reply; 54+ messages in thread
From: Rusty Russell @ 2006-11-02  0:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chris Wright, Andi Kleen, linux-kernel, virtualization

On Wed, 2006-11-01 at 15:27 -0800, Andrew Morton wrote:
> On Wed, 01 Nov 2006 21:28:13 +1100
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> > +#ifdef CONFIG_DEBUG_KERNEL
> > +		/* Deliberately clobber regs using "not %reg" to find bugs. */
> 
> That would be considered to be abusive of CONFIG_DEBUG_KERNEL.  A
> CONFIG_DEBUG_PARAVIRT which depends on CONFIG_DEBUG_KERNEL would be more
> harmonious.

I wasn't sure.  Making a config option for what is a one-liner seemed
overkill.

==

Don't abuse CONFIG_DEBUG_KERNEL, add CONFIG_DEBUG_PARAVIRT.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 2707c89d72f0 arch/i386/Kconfig.debug
--- a/arch/i386/Kconfig.debug	Thu Nov 02 10:14:50 2006 +1100
+++ b/arch/i386/Kconfig.debug	Thu Nov 02 11:41:20 2006 +1100
@@ -87,4 +87,14 @@ config DOUBLEFAULT
           option saves about 4k and might cause you much additional grey
           hair.
 
+config DEBUG_PARAVIRT
+	bool "Enable some paravirtualization debugging"
+	default y
+	depends on PARAVIRT && DEBUG_KERNEL
+	help
+	  Currently deliberately clobbers regs which are allowed to be
+	  clobbered in inlined paravirt hooks, even in native mode.
+	  If turning this off solves a problem, then DISABLE_INTERRUPTS() or
+	  ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
+
 endmenu
diff -r 2707c89d72f0 arch/i386/kernel/alternative.c
--- a/arch/i386/kernel/alternative.c	Thu Nov 02 10:14:50 2006 +1100
+++ b/arch/i386/kernel/alternative.c	Thu Nov 02 11:36:54 2006 +1100
@@ -359,7 +359,7 @@ void apply_paravirt(struct paravirt_patc
 
 		used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
 					  p->len);
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_PARAVIRT
 		/* Deliberately clobber regs using "not %reg" to find bugs. */
 		for (i = 0; i < 3; i++) {
 			if (p->len - used >= 2 && (p->clobbers & (1 << i))) {

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations.
  2006-11-02  0:47               ` Rusty Russell
@ 2006-11-02  0:54                 ` Zachary Amsden
  0 siblings, 0 replies; 54+ messages in thread
From: Zachary Amsden @ 2006-11-02  0:54 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Andi Kleen, Andi Kleen, virtualization,
	Chris Wright, linux-kernel

Rusty Russell wrote:
> On Wed, 2006-11-01 at 15:27 -0800, Andrew Morton wrote:
>   
>> On Wed, 01 Nov 2006 21:28:13 +1100
>> Rusty Russell <rusty@rustcorp.com.au> wrote:
>>     
>>> +#ifdef CONFIG_DEBUG_KERNEL
>>> +		/* Deliberately clobber regs using "not %reg" to find bugs. */
>>>       
>> That would be considered to be abusive of CONFIG_DEBUG_KERNEL.  A
>> CONFIG_DEBUG_PARAVIRT which depends on CONFIG_DEBUG_KERNEL would be more
>> harmonious.
>>     
>
> I wasn't sure.  Making a config option for what is a one-liner seemed
> overkill.
>   

I have further stuff in my vmi-debug patch that can use 
CONFIG_DEBUG_PARAVIRT as well :)

Zach

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-01 10:27         ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Rusty Russell
  2006-11-01 10:28           ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Rusty Russell
  2006-11-01 10:45           ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Arjan van de Ven
@ 2006-11-02  7:13           ` Andrew Morton
  2006-11-02  7:44             ` Oleg Verych
  2006-11-03  2:56           ` Andi Kleen
  2006-11-18  2:08           ` john stultz
  4 siblings, 1 reply; 54+ messages in thread
From: Andrew Morton @ 2006-11-02  7:13 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Chris Wright, Andi Kleen, linux-kernel, virtualization

This patch breaks `make headers_check' in mysterious ways:

  CHECK   include/linux/netfilter_ipv4/ip_conntrack_tcp.h
  CHECK   include/linux/netfilter_ipv4/ip_conntrack_sctp.h
  CHECK   include/linux/netfilter_ipv4/ip_conntrack_protocol.h
  CHECK   include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
  CHECK   include/linux/netfilter_ipv4/ip_conntrack_helper_h323_asn1.h
  CHECK   include/linux/netfilter_ipv4/ip_conntrack_helper.h
make[2]: *** [/usr/src/devel/usr/include/asm/.check.setup.h] Error 1
make[2]: *** Waiting for unfinished jobs....
make[1]: *** [asm-i386] Error 2
make[1]: *** Waiting for unfinished jobs....
make: *** [headers_check] Error 2

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-02  7:13           ` Andrew Morton
@ 2006-11-02  7:44             ` Oleg Verych
  0 siblings, 0 replies; 54+ messages in thread
From: Oleg Verych @ 2006-11-02  7:44 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization

On 2006-11-02, Andrew Morton wrote:
> This patch breaks `make headers_check' in mysterious ways:
>
>   CHECK   include/linux/netfilter_ipv4/ip_conntrack_tcp.h
>   CHECK   include/linux/netfilter_ipv4/ip_conntrack_sctp.h
>   CHECK   include/linux/netfilter_ipv4/ip_conntrack_protocol.h
>   CHECK   include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
>   CHECK   include/linux/netfilter_ipv4/ip_conntrack_helper_h323_asn1.h
>   CHECK   include/linux/netfilter_ipv4/ip_conntrack_helper.h
> make[2]: *** [/usr/src/devel/usr/include/asm/.check.setup.h] Error 1
> make[2]: *** Waiting for unfinished jobs....
> make[1]: *** [asm-i386] Error 2
> make[1]: *** Waiting for unfinished jobs....
> make: *** [headers_check] Error 2

It seems like missing
"header-y += paravirt.h" in the "include/asm-i386/Kbuild".
____

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels
  2006-11-01 23:24     ` Zachary Amsden
@ 2006-11-02 10:20       ` Pavel Machek
  2006-11-02 11:04         ` Zachary Amsden
  0 siblings, 1 reply; 54+ messages in thread
From: Pavel Machek @ 2006-11-02 10:20 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: Chris Wright, akpm, ak, virtualization, linux-kernel

Hi!

> >>Allow selected bug checks to be skipped by paravirt kernels.  The two most
> >>important are the F00F workaround (which is either done by the hypervisor,
> >>or not required), and the 'hlt' instruction check, which can break under
> >>some hypervisors.
> >>    
> >
> >How can hlt check break? It is hlt;hlt;hlt, IIRC, that looks fairly
> >innocent to me.
> >  
> 
> Not if you use tickless timers that don't generate interrupts to unhalt 
> you, or if you delay ticks until the next scheduled timeout and you 
> haven't yet scheduled any timeout.  Both are likely in a hypervisor.

Well.. but you are working around problem, instead of fixing it.

Tickless kernels are possible on normal machines, too.

Please fix it properly... probably by requesting timer 10msec in
advance or something.
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels
  2006-11-02 10:20       ` Pavel Machek
@ 2006-11-02 11:04         ` Zachary Amsden
  0 siblings, 0 replies; 54+ messages in thread
From: Zachary Amsden @ 2006-11-02 11:04 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Chris Wright, akpm, ak, Rusty Russell, Jeremy Fitzhardinge,
	linux-kernel, virtualization

Pavel Machek wrote:
>>> How can hlt check break? It is hlt;hlt;hlt, IIRC, that looks fairly
>>> innocent to me.
>>>  
>>>       
>> Not if you use tickless timers that don't generate interrupts to unhalt 
>> you, or if you delay ticks until the next scheduled timeout and you 
>> haven't yet scheduled any timeout.  Both are likely in a hypervisor.
>>     
>
> Well.. but you are working around problem, instead of fixing it.
>
> Tickless kernels are possible on normal machines, too.
>
> Please fix it properly... probably by requesting timer 10msec in
> advance or something.
> 									Pavel
>   

Well, I agree in spirit, but there is something to be said for keeping 
the code less complicated by removing these workarounds for broken 
processors.  Preferably, we could remove the hlt check entirely, but 
then those people with these broken processors would not get the 
expected behavior of stalling during boot - that is the expected 
behavior of failure, correct?  In any case, I added this workaround for 
the case when running under Xen.  I would rather not add a dependence on 
timer scheduling to legacy bug checking code when the number of timer 
sources and tickless variations available is proliferating and the 
number of legacy processors that would even need this check is rapidly 
approaching zero.

Zach

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-01 10:27         ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Rusty Russell
                             ` (2 preceding siblings ...)
  2006-11-02  7:13           ` Andrew Morton
@ 2006-11-03  2:56           ` Andi Kleen
  2006-11-03 20:35             ` Zachary Amsden
  2006-11-18  2:08           ` john stultz
  4 siblings, 1 reply; 54+ messages in thread
From: Andi Kleen @ 2006-11-03  2:56 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Chris Wright, virtualization, linux-kernel, akpm

On Wednesday 01 November 2006 11:27, Rusty Russell wrote:
> Create a paravirt.h header for all the critical operations which need
> to be replaced with hypervisor calls, and include that instead of
> defining native operations, when CONFIG_PARAVIRT.

Hmm, did this all ever compile in mainline? I had to do a few merges
and in the end i get

/home/lsrc/quilt/linux/kernel/spinlock.c: In function ‘_spin_lock_irqsave’:
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
{standard input}: Assembler messages:
{standard input}:593: Error: undefined symbol `paravirt_ops' in operation
{standard input}:593: Error: undefined symbol `PARAVIRT_irq_enable' in operation
{standard input}:605: Error: undefined symbol `paravirt_ops' in operation
{standard input}:605: Error: undefined symbol `PARAVIRT_irq_disable' in operatio
n

and lots of new warnings like

/home/lsrc/quilt/linux/arch/i386/kernel/traps.c: In function ‘set_intr_gate’:
/home/lsrc/quilt/linux/arch/i386/kernel/traps.c:1165: warning: implicit declarat
ion of function ‘_set_gate’
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/common.c: In function ‘_cpu_init’:
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/common.c:754: warning: implicit decl
aration of function ‘__set_tss_desc’
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c: In function ‘intel_mach
ine_check’:
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eax’ 
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ebx’ 
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ecx’ 
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.edx’ 
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.esi’ 
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.edi’ 
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ebp’ 
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.esp’ 
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eflag
s’ may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eip’ 
may be used uninitialized in this function


This is with i386 defconfig + CONFIG_PARAVIRT

-Andi

_______________________________________________
Virtualization mailing list
Virtualization@lists.osdl.org
https://lists.osdl.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-03  2:56           ` Andi Kleen
@ 2006-11-03 20:35             ` Zachary Amsden
  2006-11-03 21:09               ` Andi Kleen
  0 siblings, 1 reply; 54+ messages in thread
From: Zachary Amsden @ 2006-11-03 20:35 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Rusty Russell, Chris Wright, virtualization, linux-kernel, akpm

Andi Kleen wrote:
> On Wednesday 01 November 2006 11:27, Rusty Russell wrote:
>   
>> Create a paravirt.h header for all the critical operations which need
>> to be replaced with hypervisor calls, and include that instead of
>> defining native operations, when CONFIG_PARAVIRT.
>>     
>
> Hmm, did this all ever compile in mainline? I had to do a few merges
> and in the end i get
>
> /home/lsrc/quilt/linux/kernel/spinlock.c: In function ‘_spin_lock_irqsave’:
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> {standard input}: Assembler messages:
> {standard input}:593: Error: undefined symbol `paravirt_ops' in operation
> {standard input}:593: Error: undefined symbol `PARAVIRT_irq_enable' in operation
> {standard input}:605: Error: undefined symbol `paravirt_ops' in operation
> {standard input}:605: Error: undefined symbol `PARAVIRT_irq_disable' in operatio
> n
>   

Not seeing that here (on 2.6.19-rc2-mm2 with gcc 4.0.2).

> and lots of new warnings like
>
> /home/lsrc/quilt/linux/arch/i386/kernel/traps.c: In function ‘set_intr_gate’:
> /home/lsrc/quilt/linux/arch/i386/kernel/traps.c:1165: warning: implicit declarat
> ion of function ‘_set_gate’
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/common.c: In function ‘_cpu_init’:
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/common.c:754: warning: implicit decl
> aration of function ‘__set_tss_desc’
>   

Sounds like desc.h got reordered.  Somewhere, there was a broken patch 
once that did this, I thought we fixed that.

> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c: In function ‘intel_mach
> ine_check’:
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eax’ 
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ebx’ 
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ecx’ 
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.edx’ 
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.esi’ 
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.edi’ 
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ebp’ 
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.esp’ 
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eflag
> s’ may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eip’ 
> may be used uninitialized in this function
>   

Those appear to be valid warnings, with or without paravirt, due to the 
tacky glued inline oddity of intel_get_extended_msrs.

Zach

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-03 20:35             ` Zachary Amsden
@ 2006-11-03 21:09               ` Andi Kleen
  2006-11-05  4:43                 ` Rusty Russell
  0 siblings, 1 reply; 54+ messages in thread
From: Andi Kleen @ 2006-11-03 21:09 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: Rusty Russell, Chris Wright, virtualization, linux-kernel, akpm


> 
> Sounds like desc.h got reordered.  Somewhere, there was a broken patch 
> once that did this, I thought we fixed that.

I think I got Rusty's latest patches that I found in my mailbox.

I haven't looked at desc.h, but at least processor.h ordering was totally
b0rken (e.g. #define __cpuid native_cpuid was after several uses). I fixed
that to make at least the CONFIG_PARAVIRT not set case compile.

I can't see how this ever worked either.

Haven't attempted the CONFIG_PARAVIRT case which apparently needs more work
(it is currently marked CONFIG_BROKEN) 

Can someone double check this is the correct patchkit?

ftp://ftp.firstfloor.org/pub/ak/x86_64/quilt/patches/paravirt*

-Andi

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-03 21:09               ` Andi Kleen
@ 2006-11-05  4:43                 ` Rusty Russell
  2006-11-05  4:59                   ` Zachary Amsden
  2006-11-05  5:46                   ` Andi Kleen
  0 siblings, 2 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-05  4:43 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Zachary Amsden, Chris Wright, virtualization, linux-kernel, akpm

On Fri, 2006-11-03 at 22:09 +0100, Andi Kleen wrote:
> > 
> > Sounds like desc.h got reordered.  Somewhere, there was a broken patch 
> > once that did this, I thought we fixed that.
> 
> I think I got Rusty's latest patches that I found in my mailbox.
> 
> I haven't looked at desc.h, but at least processor.h ordering was totally
> b0rken (e.g. #define __cpuid native_cpuid was after several uses). I fixed
> that to make at least the CONFIG_PARAVIRT not set case compile.
> 
> I can't see how this ever worked either.
> 
> Haven't attempted the CONFIG_PARAVIRT case which apparently needs more work
> (it is currently marked CONFIG_BROKEN) 
> 
> Can someone double check this is the correct patchkit?
> 
> ftp://ftp.firstfloor.org/pub/ak/x86_64/quilt/patches/paravirt*

Andi, the patches work against Andrew's tree, and he's merged them in
rc4-mm2.  There are a few warnings to clean up, but it seems basically
sound.

At this point I think our time is better spent on beating those patches
up, rather than going back and figuring out why they don't work in your
tree.

Sorry,
Rusty.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-05  4:43                 ` Rusty Russell
@ 2006-11-05  4:59                   ` Zachary Amsden
  2006-11-05  5:08                     ` Rusty Russell
  2006-11-05  5:46                   ` Andi Kleen
  1 sibling, 1 reply; 54+ messages in thread
From: Zachary Amsden @ 2006-11-05  4:59 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Chris Wright, virtualization, akpm, linux-kernel

Rusty Russell wrote:
> Andi, the patches work against Andrew's tree, and he's merged them in
> rc4-mm2.  There are a few warnings to clean up, but it seems basically
> sound.
>
> At this point I think our time is better spent on beating those patches
> up, rather than going back and figuring out why they don't work in your
> tree.
>   

This begs the question - should we rebase the paravirt-ops patchset 
against -rc4-mm2?  I almost did it today, but didn't want to stomp on 
anybody else's toes.

Zach

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-05  4:59                   ` Zachary Amsden
@ 2006-11-05  5:08                     ` Rusty Russell
  0 siblings, 0 replies; 54+ messages in thread
From: Rusty Russell @ 2006-11-05  5:08 UTC (permalink / raw)
  To: Zachary Amsden; +Cc: Chris Wright, virtualization, akpm, linux-kernel

On Sat, 2006-11-04 at 20:59 -0800, Zachary Amsden wrote:
> Rusty Russell wrote:
> > Andi, the patches work against Andrew's tree, and he's merged them in
> > rc4-mm2.  There are a few warnings to clean up, but it seems basically
> > sound.
> >
> > At this point I think our time is better spent on beating those patches
> > up, rather than going back and figuring out why they don't work in your
> > tree.
> >   
> 
> This begs the question - should we rebase the paravirt-ops patchset 
> against -rc4-mm2?  I almost did it today, but didn't want to stomp on 
> anybody else's toes.

Yes.  Andrew has shot me a couple of warnings which people have found,
and I'm preparing patches for them.  Rebasing will make it easier.

If you're not awake now, I'll do it.  If you are, see me on IRC.

Rusty.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-05  4:43                 ` Rusty Russell
  2006-11-05  4:59                   ` Zachary Amsden
@ 2006-11-05  5:46                   ` Andi Kleen
  2006-11-05  6:18                     ` Andrew Morton
  2006-11-05  6:21                     ` Rusty Russell
  1 sibling, 2 replies; 54+ messages in thread
From: Andi Kleen @ 2006-11-05  5:46 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Zachary Amsden, Chris Wright, virtualization, linux-kernel, akpm


> Andi, the patches work against Andrew's tree, and he's merged them in
> rc4-mm2.  There are a few warnings to clean up, but it seems basically
> sound.
> 
> At this point I think our time is better spent on beating those patches
> up, rather than going back and figuring out why they don't work in your
> tree.

My tree is basically mainline as base. Sure if you don't care about mainline
merges we can ignore it there and keep it forever in -mm* until Andrew
gets tired of it?

That's a possible strategy, but only if you want to keep it as a mm-only
toy forever.

-Andi

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-05  5:46                   ` Andi Kleen
@ 2006-11-05  6:18                     ` Andrew Morton
  2006-11-05  6:21                     ` Rusty Russell
  1 sibling, 0 replies; 54+ messages in thread
From: Andrew Morton @ 2006-11-05  6:18 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chris Wright, linux-kernel, virtualization

On Sun, 5 Nov 2006 06:46:15 +0100
Andi Kleen <ak@suse.de> wrote:

> 
> > Andi, the patches work against Andrew's tree, and he's merged them in
> > rc4-mm2.  There are a few warnings to clean up, but it seems basically
> > sound.
> > 
> > At this point I think our time is better spent on beating those patches
> > up, rather than going back and figuring out why they don't work in your
> > tree.
> 
> My tree is basically mainline as base. Sure if you don't care about mainline
> merges we can ignore it there and keep it forever in -mm* until Andrew
> gets tired of it?
> 
> That's a possible strategy, but only if you want to keep it as a mm-only
> toy forever.
> 

They're in my regular list-of-thing-to-spam-maintainers-with, so we can
transfer them as-is next week sometime.

It would be better to sort out the various warnings and any other nasties first
though.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-05  5:46                   ` Andi Kleen
  2006-11-05  6:18                     ` Andrew Morton
@ 2006-11-05  6:21                     ` Rusty Russell
  2006-11-05  6:57                       ` Andi Kleen
  1 sibling, 1 reply; 54+ messages in thread
From: Rusty Russell @ 2006-11-05  6:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Chris Wright, linux-kernel, akpm, virtualization

On Sun, 2006-11-05 at 06:46 +0100, Andi Kleen wrote:
> > Andi, the patches work against Andrew's tree, and he's merged them in
> > rc4-mm2.  There are a few warnings to clean up, but it seems basically
> > sound.
> > 
> > At this point I think our time is better spent on beating those patches
> > up, rather than going back and figuring out why they don't work in your
> > tree.
> 
> My tree is basically mainline as base. Sure if you don't care about mainline
> merges we can ignore it there and keep it forever in -mm* until Andrew
> gets tired of it?
> 
> That's a possible strategy, but only if you want to keep it as a mm-only
> toy forever.

Andi, it's been simpler for us to get the code into Andrew's tree, in
nice bit-size pieces.  We've had trouble every time we've tried to get
stuff into your tree.  In addition, Andrew's tree gives the code
exposure and testing.

If Andrew says we have to get those patches into mainline through you,
then I'll spend all that time re-spinning the patches for you from the
-mm tree until they go in.  It doesn't seem like a good use of anyone's
time though.

Rusty.

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-05  6:21                     ` Rusty Russell
@ 2006-11-05  6:57                       ` Andi Kleen
  0 siblings, 0 replies; 54+ messages in thread
From: Andi Kleen @ 2006-11-05  6:57 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Zachary Amsden, Chris Wright, virtualization, linux-kernel, akpm


> 
> If Andrew says we have to get those patches into mainline through you,

Well I'm mainline in this case.


> then I'll spend all that time re-spinning the patches for you from the
> -mm tree until they go in.  

I got it to compile now with this patch (+ one additional patch
that is folded in). It then goes through kernel initialization
and then init gets killed with "Inconsistency detected by rtld.c:1250:
Assertion ph_vaddr == _rtld_local.dl_sysinfo_vdso failed"

It looks like some of the ifdefs were placed completely wrong
and in addition you were missing a patch to include asm/offset.h
everywhere as assembly (I patched around that). And two macros
were apparently never compiled in their current form.

But it seems it is dependent on even more -mm* magic than just
that. If you can identify the missing patches that make init's
rtld work again that would be useful.

-Andi

Get paravirt ops to compile

TBD should be folded into the original patches

Unfortunately still doesn't boot.

Signed-off-by: Andi Kleen <ak@suse.de>

Index: linux/include/asm-i386/desc.h
===================================================================
--- linux.orig/include/asm-i386/desc.h
+++ linux/include/asm-i386/desc.h
@@ -92,6 +92,9 @@ static inline void write_dt_entry(void *
 #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
 #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
 
+#define set_ldt native_set_ldt
+#endif /* CONFIG_PARAVIRT */
+
 static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
 {
 	__u32 a, b;
@@ -108,9 +111,6 @@ static inline void __set_tss_desc(unsign
 	write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
 }
 
-#define set_ldt native_set_ldt
-#endif /* CONFIG_PARAVIRT */
-
 static inline fastcall void native_set_ldt(const void *addr,
 					   unsigned int entries)
 {
Index: linux/include/asm-i386/paravirt.h
===================================================================
--- linux.orig/include/asm-i386/paravirt.h
+++ linux/include/asm-i386/paravirt.h
@@ -454,16 +454,20 @@ static inline unsigned long __raw_local_
 	return f;
 }
 
-#define CLI_STRING paravirt_alt("pushl %ecx; pushl %edx;"		\
-		     "call *paravirt_ops+PARAVIRT_irq_disable;"		\
-		     "popl %edx; popl %ecx",				\
+#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;"		\
+		     "call *paravirt_ops+%c[irq_disable];"		\
+		     "popl %%edx; popl %%ecx",				\
 		     PARAVIRT_IRQ_DISABLE, CLBR_EAX)
 
-#define STI_STRING paravirt_alt("pushl %ecx; pushl %edx;"		\
-		     "call *paravirt_ops+PARAVIRT_irq_enable;"		\
-		     "popl %edx; popl %ecx",				\
+#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;"		\
+		     "call *paravirt_ops+%c[irq_enable];"		\
+		     "popl %%edx; popl %%ecx",				\
 		     PARAVIRT_IRQ_ENABLE, CLBR_EAX)
 #define CLI_STI_CLOBBERS , "%eax"
+#define CLI_STI_INPUT_ARGS \
+	,								\
+	[irq_disable] "i" (offsetof(struct paravirt_ops, irq_disable)),	\
+	[irq_enable] "i" (offsetof(struct paravirt_ops, irq_enable))
 
 #else  /* __ASSEMBLY__ */
 
Index: linux/include/asm-i386/spinlock.h
===================================================================
--- linux.orig/include/asm-i386/spinlock.h
+++ linux/include/asm-i386/spinlock.h
@@ -13,6 +13,7 @@
 #define CLI_STRING	"cli"
 #define STI_STRING	"sti"
 #define CLI_STI_CLOBBERS
+#define CLI_STI_INPUT_ARGS
 #endif /* CONFIG_PARAVIRT */
 
 /*
@@ -58,26 +59,27 @@ static inline void __raw_spin_lock_flags
 {
 	asm volatile(
 		"\n1:\t"
-		LOCK_PREFIX " ; decb %0\n\t"
+		LOCK_PREFIX " ; decb %[slock]\n\t"
 		"jns 5f\n"
 		"2:\t"
-		"testl $0x200, %1\n\t"
+		"testl $0x200, %[flags]\n\t"
 		"jz 4f\n\t"
 		STI_STRING "\n"
 		"3:\t"
 		"rep;nop\n\t"
-		"cmpb $0, %0\n\t"
+		"cmpb $0, %[slock]\n\t"
 		"jle 3b\n\t"
 		CLI_STRING "\n\t"
 		"jmp 1b\n"
 		"4:\t"
 		"rep;nop\n\t"
-		"cmpb $0, %0\n\t"
+		"cmpb $0, %[slock]\n\t"
 		"jg 1b\n\t"
 		"jmp 4b\n"
 		"5:\n\t"
-		: "+m" (lock->slock)
-		: "r" (flags)
+		: [slock] "+m" (lock->slock)
+		: [flags] "r" (flags) 
+	 	  CLI_STI_INPUT_ARGS
 		: "memory" CLI_STI_CLOBBERS);
 }
 #endif
Index: linux/include/asm-i386/processor.h
===================================================================
--- linux.orig/include/asm-i386/processor.h
+++ linux/include/asm-i386/processor.h
@@ -511,6 +511,7 @@ static inline void load_esp0(struct tss_
 		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 	}
 }
+#endif
 
 #define start_thread(regs, new_eip, new_esp) do {		\
 	__asm__("movl %0,%%fs": :"r" (0));			\
@@ -524,6 +525,7 @@ static inline void load_esp0(struct tss_
 	regs->esp = new_esp;					\
 } while (0)
 
+#ifndef CONFIG_PARAVIRT
 /*
  * These special macros can be used to get or set a debugging register
  */

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations
  2006-11-01 10:27         ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Rusty Russell
                             ` (3 preceding siblings ...)
  2006-11-03  2:56           ` Andi Kleen
@ 2006-11-18  2:08           ` john stultz
  4 siblings, 0 replies; 54+ messages in thread
From: john stultz @ 2006-11-18  2:08 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, Andi Kleen, virtualization, Chris Wright, akpm,
	linux-kernel

On Wed, 2006-11-01 at 21:27 +1100, Rusty Russell wrote:
> Create a paravirt.h header for all the critical operations which need
> to be replaced with hypervisor calls, and include that instead of
> defining native operations, when CONFIG_PARAVIRT.
> 
> This patch does the dumbest possible replacement of paravirtualized
> instructions: calls through a "paravirt_ops" structure.  Currently
> these are function implementations of native hardware: hypervisors
> will override the ops structure with their own variants.
> 
[snip]

> +struct paravirt_ops paravirt_ops = {
> +	.name = "bare hardware",
[snip]
> +	.get_wallclock = native_get_wallclock,
> +	.set_wallclock = native_set_wallclock,

[snip]

> --- /dev/null
> +++ b/include/asm-i386/time.h
> @@ -0,0 +1,41 @@
> +#ifndef _ASMi386_TIME_H
> +#define _ASMi386_TIME_H
> +
> +#include <linux/efi.h>
> +#include "mach_time.h"
> +
> +static inline unsigned long native_get_wallclock(void)
> +{
> +	unsigned long retval;
> +
> +	if (efi_enabled)
> +		retval = efi_get_time();
> +	else
> +		retval = mach_get_cmos_time();
> +
> +	return retval;
> +}
> +
> +static inline int native_set_wallclock(unsigned long nowtime)
> +{
> +	int retval;
> +
> +	if (efi_enabled)
> +		retval = efi_set_rtc_mmss(nowtime);
> +	else
> +		retval = mach_set_rtc_mmss(nowtime);
> +
> +	return retval;
> +}
> +
> +#ifdef CONFIG_PARAVIRT
> +#include <asm/paravirt.h>
> +#else /* !CONFIG_PARAVIRT */
> +
> +#define get_wallclock() native_get_wallclock()
> +#define set_wallclock(x) native_set_wallclock(x)


Could a better name then "get/set_wallclock" be used here? Its too vague
and would be easily confused with do_set/gettimeofday() functions.

My suggestion would be to use "persistent_clock" to describe the
battery-backed CMOS/hardware clock. (I assume that is what you intend
this paravirt_op to be, rather then get the high-resolution system
timeofday)

thanks
-john

^ permalink raw reply	[flat|nested] 54+ messages in thread

end of thread, other threads:[~2006-11-18  2:08 UTC | newest]

Thread overview: 54+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-10-29  2:45 [PATCH 0/7] x86 paravirtualization infrastructure Chris Wright
2006-10-28  7:00 ` [PATCH 1/7] header and stubs for paravirtualizing critical operations Chris Wright
2006-10-29 16:40   ` Andi Kleen
2006-10-28  7:00 ` [PATCH 2/7] Patch inline replacements for common paravirt operations Chris Wright
2006-10-28  7:00 ` [PATCH 3/7] More generic paravirtualization entry point Chris Wright
2006-10-29 16:41   ` Andi Kleen
2006-10-28  7:00 ` [PATCH 4/7] Allow selected bug checks to be skipped by paravirt kernels Chris Wright
2006-11-01 12:17   ` Pavel Machek
2006-11-01 22:40     ` Dave Jones
2006-11-01 23:24     ` Zachary Amsden
2006-11-02 10:20       ` Pavel Machek
2006-11-02 11:04         ` Zachary Amsden
2006-10-28  7:00 ` [PATCH 5/7] Allow disabling legacy power management modes with " Chris Wright
2006-10-28  7:00 ` [PATCH 6/7] Add APIC accessors to paravirt-ops Chris Wright
2006-10-29 16:31   ` Andi Kleen
2006-10-30  3:28     ` Rusty Russell
2006-10-30 23:11       ` Andi Kleen
2006-10-30 23:42         ` Chris Wright
2006-10-30 23:46           ` Andi Kleen
2006-10-30 23:55             ` Chris Wright
2006-10-31  1:45             ` Rusty Russell
2006-11-01 10:25         ` Rusty Russell
2006-11-01 10:27         ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Rusty Russell
2006-11-01 10:28           ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Rusty Russell
2006-11-01 10:29             ` [PATCH 3/7] paravirtualization: More generic paravirtualization entry point Rusty Russell
2006-11-01 10:30               ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Rusty Russell
2006-11-01 10:31                 ` [PATCH 5/7] paravirtualization: Allow disabling legacy power management modes with " Rusty Russell
2006-11-01 10:32                   ` [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops Rusty Russell
2006-11-01 10:34                     ` [PATCH 7/7] paravirtualization: Add mmu virtualization " Rusty Russell
2006-11-01 23:31                     ` [PATCH 6/7] paravirtualization: Add APIC accessors " Andrew Morton
2006-11-02  0:46                       ` Rusty Russell
2006-11-01 23:29                 ` [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels Andrew Morton
2006-11-01 23:58                   ` Jeremy Fitzhardinge
2006-11-02  0:01                   ` Rusty Russell
2006-11-01 23:27             ` [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations Andrew Morton
2006-11-02  0:47               ` Rusty Russell
2006-11-02  0:54                 ` Zachary Amsden
2006-11-01 10:45           ` [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations Arjan van de Ven
2006-11-01 17:27             ` Andi Kleen
2006-11-01 23:32             ` Rusty Russell
2006-11-02  7:13           ` Andrew Morton
2006-11-02  7:44             ` Oleg Verych
2006-11-03  2:56           ` Andi Kleen
2006-11-03 20:35             ` Zachary Amsden
2006-11-03 21:09               ` Andi Kleen
2006-11-05  4:43                 ` Rusty Russell
2006-11-05  4:59                   ` Zachary Amsden
2006-11-05  5:08                     ` Rusty Russell
2006-11-05  5:46                   ` Andi Kleen
2006-11-05  6:18                     ` Andrew Morton
2006-11-05  6:21                     ` Rusty Russell
2006-11-05  6:57                       ` Andi Kleen
2006-11-18  2:08           ` john stultz
2006-10-28  7:00 ` [PATCH 7/7] Add mmu virtualization to paravirt-ops Chris Wright

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).