* [Qemu-devel] [PATCH 1/3] Add additional CPU flag definitions @ 2008-10-28 20:13 Anthony Liguori 2008-10-28 20:13 ` [Qemu-devel] [PATCH 2/3] Split CPUID from op_helper Anthony Liguori 0 siblings, 1 reply; 34+ messages in thread From: Anthony Liguori @ 2008-10-28 20:13 UTC (permalink / raw) To: qemu-devel; +Cc: Glauber Costa, Avi Kivity, kvm-devel, Anthony Liguori Some x86 CPU definitions that KVM needs Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 3c11e0f..b1678ef 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -159,9 +159,11 @@ #define HF_MP_MASK (1 << HF_MP_SHIFT) #define HF_EM_MASK (1 << HF_EM_SHIFT) #define HF_TS_MASK (1 << HF_TS_SHIFT) +#define HF_IOPL_MASK (3 << HF_IOPL_SHIFT) #define HF_LMA_MASK (1 << HF_LMA_SHIFT) #define HF_CS64_MASK (1 << HF_CS64_SHIFT) #define HF_OSFXSR_MASK (1 << HF_OSFXSR_SHIFT) +#define HF_VM_MASK (1 << HF_VM_SHIFT) #define HF_SMM_MASK (1 << HF_SMM_SHIFT) #define HF_SVME_MASK (1 << HF_SVME_SHIFT) #define HF_SVMI_MASK (1 << HF_SVMI_SHIFT) @@ -178,6 +180,9 @@ #define HF2_NMI_MASK (1 << HF2_NMI_SHIFT) #define HF2_VINTR_MASK (1 << HF2_VINTR_SHIFT) +#define CR0_PE_SHIFT 0 +#define CR0_MP_SHIFT 1 + #define CR0_PE_MASK (1 << 0) #define CR0_MP_MASK (1 << 1) #define CR0_EM_MASK (1 << 2) @@ -196,7 +201,8 @@ #define CR4_PAE_MASK (1 << 5) #define CR4_PGE_MASK (1 << 7) #define CR4_PCE_MASK (1 << 8) -#define CR4_OSFXSR_MASK (1 << 9) +#define CR4_OSFXSR_SHIFT 9 +#define CR4_OSFXSR_MASK (1 << CR4_OSFXSR_SHIFT) #define CR4_OSXMMEXCPT_MASK (1 << 10) #define PG_PRESENT_BIT 0 @@ -229,6 +235,7 @@ #define PG_ERROR_RSVD_MASK 0x08 #define PG_ERROR_I_D_MASK 0x10 +#define MSR_IA32_TSC 0x10 #define MSR_IA32_APICBASE 0x1b #define MSR_IA32_APICBASE_BSP (1<<8) #define MSR_IA32_APICBASE_ENABLE (1<<11) ^ permalink raw reply related [flat|nested] 34+ messages in thread
* [Qemu-devel] [PATCH 2/3] Split CPUID from op_helper 2008-10-28 20:13 [Qemu-devel] [PATCH 1/3] Add additional CPU flag definitions Anthony Liguori @ 2008-10-28 20:13 ` Anthony Liguori 2008-10-28 20:13 ` [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU Anthony Liguori 0 siblings, 1 reply; 34+ messages in thread From: Anthony Liguori @ 2008-10-28 20:13 UTC (permalink / raw) To: qemu-devel; +Cc: Glauber Costa, Avi Kivity, kvm-devel, Anthony Liguori KVM needs to call CPUID from outside of the TCG code. This patch splits out the CPUID logic into a separate helper that both the op helper and KVM can call. Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> diff --git a/target-i386/cpu.h b/target-i386/cpu.h index b1678ef..263a477 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -730,6 +730,10 @@ void cpu_smm_update(CPUX86State *env); /* will be suppressed */ void cpu_x86_update_cr0(CPUX86State *env, uint32_t new_cr0); +void cpu_x86_cpuid(CPUX86State *env, uint32_t index, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx); + /* used to debug */ #define X86_DUMP_FPU 0x0001 /* dump FPU state too */ #define X86_DUMP_CCOP 0x0002 /* dump qemu flag cache */ diff --git a/target-i386/helper.c b/target-i386/helper.c index c2e1a88..905ae9b 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -1287,3 +1287,169 @@ target_phys_addr_t cpu_get_phys_page_debug(CPUState *env, target_ulong addr) return paddr; } #endif /* !CONFIG_USER_ONLY */ + +void cpu_x86_cpuid(CPUX86State *env, uint32_t index, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + /* test if maximum index reached */ + if (index & 0x80000000) { + if (index > env->cpuid_xlevel) + index = env->cpuid_level; + } else { + if (index > env->cpuid_level) + index = env->cpuid_level; + } + + switch(index) { + case 0: + *eax = env->cpuid_level; + *ebx = env->cpuid_vendor1; + *edx = env->cpuid_vendor2; + *ecx = env->cpuid_vendor3; + break; + case 1: + *eax = env->cpuid_version; + *ebx 
= (env->cpuid_apic_id << 24) | 8 << 8; /* CLFLUSH size in quad words, Linux wants it. */ + *ecx = env->cpuid_ext_features; + *edx = env->cpuid_features; + break; + case 2: + /* cache info: needed for Pentium Pro compatibility */ + *eax = 1; + *ebx = 0; + *ecx = 0; + *edx = 0x2c307d; + break; + case 4: + /* cache info: needed for Core compatibility */ + switch (*ecx) { + case 0: /* L1 dcache info */ + *eax = 0x0000121; + *ebx = 0x1c0003f; + *ecx = 0x000003f; + *edx = 0x0000001; + break; + case 1: /* L1 icache info */ + *eax = 0x0000122; + *ebx = 0x1c0003f; + *ecx = 0x000003f; + *edx = 0x0000001; + break; + case 2: /* L2 cache info */ + *eax = 0x0000143; + *ebx = 0x3c0003f; + *ecx = 0x0000fff; + *edx = 0x0000001; + break; + default: /* end of info */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + } + + break; + case 5: + /* mwait info: needed for Core compatibility */ + *eax = 0; /* Smallest monitor-line size in bytes */ + *ebx = 0; /* Largest monitor-line size in bytes */ + *ecx = CPUID_MWAIT_EMX | CPUID_MWAIT_IBE; + *edx = 0; + break; + case 6: + /* Thermal and Power Leaf */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 9: + /* Direct Cache Access Information Leaf */ + *eax = 0; /* Bits 0-31 in DCA_CAP MSR */ + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0xA: + /* Architectural Performance Monitoring Leaf */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x80000000: + *eax = env->cpuid_xlevel; + *ebx = env->cpuid_vendor1; + *edx = env->cpuid_vendor2; + *ecx = env->cpuid_vendor3; + break; + case 0x80000001: + *eax = env->cpuid_features; + *ebx = 0; + *ecx = env->cpuid_ext3_features; + *edx = env->cpuid_ext2_features; + break; + case 0x80000002: + case 0x80000003: + case 0x80000004: + *eax = env->cpuid_model[(index - 0x80000002) * 4 + 0]; + *ebx = env->cpuid_model[(index - 0x80000002) * 4 + 1]; + *ecx = env->cpuid_model[(index - 0x80000002) * 4 + 2]; + *edx = env->cpuid_model[(index - 0x80000002) * 4 + 3]; + break; 
+ case 0x80000005: + /* cache info (L1 cache) */ + *eax = 0x01ff01ff; + *ebx = 0x01ff01ff; + *ecx = 0x40020140; + *edx = 0x40020140; + break; + case 0x80000006: + /* cache info (L2 cache) */ + *eax = 0; + *ebx = 0x42004200; + *ecx = 0x02008140; + *edx = 0; + break; + case 0x80000008: + /* virtual & phys address size in low 2 bytes. */ +/* XXX: This value must match the one used in the MMU code. */ + if (env->cpuid_ext2_features & CPUID_EXT2_LM) { + /* 64 bit processor */ +#if defined(USE_KQEMU) + *eax = 0x00003020; /* 48 bits virtual, 32 bits physical */ +#else +/* XXX: The physical address space is limited to 42 bits in exec.c. */ + *eax = 0x00003028; /* 48 bits virtual, 40 bits physical */ +#endif + } else { +#if defined(USE_KQEMU) + *eax = 0x00000020; /* 32 bits physical */ +#else + if (env->cpuid_features & CPUID_PSE36) + *eax = 0x00000024; /* 36 bits physical */ + else + *eax = 0x00000020; /* 32 bits physical */ +#endif + } + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + case 0x8000000A: + *eax = 0x00000001; /* SVM Revision */ + *ebx = 0x00000010; /* nr of ASIDs */ + *ecx = 0; + *edx = 0; /* optional features */ + break; + default: + /* reserved values: zero */ + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = 0; + break; + } +} diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c index ebb5824..e424c61 100644 --- a/target-i386/op_helper.c +++ b/target-i386/op_helper.c @@ -1885,171 +1885,15 @@ void helper_single_step(void) void helper_cpuid(void) { - uint32_t index; + uint32_t eax, ebx, ecx, edx; helper_svm_check_intercept_param(SVM_EXIT_CPUID, 0); - - index = (uint32_t)EAX; - /* test if maximum index reached */ - if (index & 0x80000000) { - if (index > env->cpuid_xlevel) - index = env->cpuid_level; - } else { - if (index > env->cpuid_level) - index = env->cpuid_level; - } - switch(index) { - case 0: - EAX = env->cpuid_level; - EBX = env->cpuid_vendor1; - EDX = env->cpuid_vendor2; - ECX = env->cpuid_vendor3; - break; - case 1: - EAX = env->cpuid_version; - 
EBX = (env->cpuid_apic_id << 24) | 8 << 8; /* CLFLUSH size in quad words, Linux wants it. */ - ECX = env->cpuid_ext_features; - EDX = env->cpuid_features; - break; - case 2: - /* cache info: needed for Pentium Pro compatibility */ - EAX = 1; - EBX = 0; - ECX = 0; - EDX = 0x2c307d; - break; - case 4: - /* cache info: needed for Core compatibility */ - switch (ECX) { - case 0: /* L1 dcache info */ - EAX = 0x0000121; - EBX = 0x1c0003f; - ECX = 0x000003f; - EDX = 0x0000001; - break; - case 1: /* L1 icache info */ - EAX = 0x0000122; - EBX = 0x1c0003f; - ECX = 0x000003f; - EDX = 0x0000001; - break; - case 2: /* L2 cache info */ - EAX = 0x0000143; - EBX = 0x3c0003f; - ECX = 0x0000fff; - EDX = 0x0000001; - break; - default: /* end of info */ - EAX = 0; - EBX = 0; - ECX = 0; - EDX = 0; - break; - } - - break; - case 5: - /* mwait info: needed for Core compatibility */ - EAX = 0; /* Smallest monitor-line size in bytes */ - EBX = 0; /* Largest monitor-line size in bytes */ - ECX = CPUID_MWAIT_EMX | CPUID_MWAIT_IBE; - EDX = 0; - break; - case 6: - /* Thermal and Power Leaf */ - EAX = 0; - EBX = 0; - ECX = 0; - EDX = 0; - break; - case 9: - /* Direct Cache Access Information Leaf */ - EAX = 0; /* Bits 0-31 in DCA_CAP MSR */ - EBX = 0; - ECX = 0; - EDX = 0; - break; - case 0xA: - /* Architectural Performance Monitoring Leaf */ - EAX = 0; - EBX = 0; - ECX = 0; - EDX = 0; - break; - case 0x80000000: - EAX = env->cpuid_xlevel; - EBX = env->cpuid_vendor1; - EDX = env->cpuid_vendor2; - ECX = env->cpuid_vendor3; - break; - case 0x80000001: - EAX = env->cpuid_features; - EBX = 0; - ECX = env->cpuid_ext3_features; - EDX = env->cpuid_ext2_features; - break; - case 0x80000002: - case 0x80000003: - case 0x80000004: - EAX = env->cpuid_model[(index - 0x80000002) * 4 + 0]; - EBX = env->cpuid_model[(index - 0x80000002) * 4 + 1]; - ECX = env->cpuid_model[(index - 0x80000002) * 4 + 2]; - EDX = env->cpuid_model[(index - 0x80000002) * 4 + 3]; - break; - case 0x80000005: - /* cache info (L1 cache) 
*/ - EAX = 0x01ff01ff; - EBX = 0x01ff01ff; - ECX = 0x40020140; - EDX = 0x40020140; - break; - case 0x80000006: - /* cache info (L2 cache) */ - EAX = 0; - EBX = 0x42004200; - ECX = 0x02008140; - EDX = 0; - break; - case 0x80000008: - /* virtual & phys address size in low 2 bytes. */ -/* XXX: This value must match the one used in the MMU code. */ - if (env->cpuid_ext2_features & CPUID_EXT2_LM) { - /* 64 bit processor */ -#if defined(USE_KQEMU) - EAX = 0x00003020; /* 48 bits virtual, 32 bits physical */ -#else -/* XXX: The physical address space is limited to 42 bits in exec.c. */ - EAX = 0x00003028; /* 48 bits virtual, 40 bits physical */ -#endif - } else { -#if defined(USE_KQEMU) - EAX = 0x00000020; /* 32 bits physical */ -#else - if (env->cpuid_features & CPUID_PSE36) - EAX = 0x00000024; /* 36 bits physical */ - else - EAX = 0x00000020; /* 32 bits physical */ -#endif - } - EBX = 0; - ECX = 0; - EDX = 0; - break; - case 0x8000000A: - EAX = 0x00000001; /* SVM Revision */ - EBX = 0x00000010; /* nr of ASIDs */ - ECX = 0; - EDX = 0; /* optional features */ - break; - default: - /* reserved values: zero */ - EAX = 0; - EBX = 0; - ECX = 0; - EDX = 0; - break; - } + cpu_x86_cpuid(env, (uint32_t)EAX, &eax, &ebx, &ecx, &edx); + EAX = eax; + EBX = ebx; + ECX = ecx; + EDX = edx; } void helper_enter_level(int level, int data32, target_ulong t1) ^ permalink raw reply related [flat|nested] 34+ messages in thread
* [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:13 ` [Qemu-devel] [PATCH 2/3] Split CPUID from op_helper Anthony Liguori @ 2008-10-28 20:13 ` Anthony Liguori 2008-10-28 20:49 ` Hollis Blanchard ` (5 more replies) 0 siblings, 6 replies; 34+ messages in thread From: Anthony Liguori @ 2008-10-28 20:13 UTC (permalink / raw) To: qemu-devel; +Cc: Glauber Costa, Avi Kivity, kvm-devel, Anthony Liguori This patch adds very basic KVM support. KVM is a kernel module for Linux that allows userspace programs to make use of hardware virtualization support. It currently supports x86 hardware virtualization using Intel VT-x or AMD-V. It also supports IA64 VT-i, PPC 440, and S390. This patch only implements the bare minimum support to get a guest booting. It has very little impact on the rest of QEMU and attempts to integrate nicely with the rest of QEMU. Even though this implementation is basic, it is significantly faster than TCG. Booting and shutting down a Linux guest: w/TCG: 1:32.36 elapsed 84% CPU w/KVM: 0:31.14 elapsed 59% CPU Right now, KVM is disabled by default and must be explicitly enabled with -enable-kvm. We can enable it by default later when we have had better testing. 
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> diff --git a/KVM_TODO b/KVM_TODO new file mode 100644 index 0000000..9529049 --- /dev/null +++ b/KVM_TODO @@ -0,0 +1,9 @@ +1) Add hooks for load/save of register state + o Fixes gdbstub, save/restore, and vmport +2) Add VGA optimization +3) Add IO thread +4) Add guest SMP support +5) Add TPR optimization +6) Add support for in-kernel APIC +7) Add support for in-kernel PIT +8) Merge in additional changes in kvm-userspace tree diff --git a/Makefile.target b/Makefile.target index e2edf9d..903d66d 100644 --- a/Makefile.target +++ b/Makefile.target @@ -183,6 +183,9 @@ CFLAGS+=-I/opt/SUNWspro/prod/include/cc endif endif +kvm.o: CFLAGS+=$(KVM_CFLAGS) +kvm-all.o: CFLAGS+=$(KVM_CFLAGS) + all: $(PROGS) ######################################################### @@ -475,6 +478,9 @@ ifndef CONFIG_USER_ONLY OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o OBJS+=fw_cfg.o aio.o buffered_file.o migration.o migration-tcp.o +ifdef CONFIG_KVM +OBJS+=kvm.o kvm-all.o +endif ifdef CONFIG_WIN32 OBJS+=block-raw-win32.o else diff --git a/configure b/configure index aefa69b..7aed99d 100755 --- a/configure +++ b/configure @@ -113,6 +113,7 @@ aio="yes" nptl="yes" mixemu="no" bluez="yes" +kvm="yes" # OS specific targetos=`uname -s` @@ -300,6 +301,8 @@ for opt do ;; --disable-bluez) bluez="no" ;; + --disable-kvm) kvm="no" + ;; --enable-profiler) profiler="yes" ;; --enable-cocoa) @@ -439,6 +442,7 @@ echo " --disable-brlapi disable BrlAPI" echo " --disable-vnc-tls disable TLS encryption for VNC server" echo " --disable-curses disable curses output" echo " --disable-bluez disable bluez stack connectivity" +echo " --disable-kvm disable KVM acceleration support" echo " --disable-nptl disable usermode NPTL support" echo " --enable-system enable all system emulation targets" echo " --disable-system disable all system emulation targets" @@ -933,6 +937,30 @@ EOF fi ########################################## +# kvm probe +if 
test "$kvm" = "yes" ; then + cat > $TMPC <<EOF +#include <linux/kvm.h> +#if !defined(KVM_API_VERSION) || \ + KVM_API_VERSION < 12 || \ + KVM_API_VERSION > 12 || \ + !defined(KVM_CAP_USER_MEMORY) || \ + !defined(KVM_CAP_SET_TSS_ADDR) +#error Invalid KVM version +#endif +int main(void) { return 0; } +EOF + # FIXME make this configurable + kvm_cflags=-I/lib/modules/`uname -r`/build/include + if $cc $ARCH_CFLAGS -o $TMPE ${OS_CFLAGS} $kvm_cflags $TMPC \ + 2>/dev/null ; then + : + else + kvm="no" + fi +fi + +########################################## # AIO probe if test "$aio" = "yes" ; then aio=no @@ -1018,6 +1046,7 @@ echo "uname -r $uname_release" echo "NPTL support $nptl" echo "vde support $vde" echo "AIO support $aio" +echo "KVM support $kvm" if test $sdl_too_old = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -1388,6 +1417,15 @@ interp_prefix1=`echo "$interp_prefix" | sed "s/%M/$target_cpu/g"` echo "#define CONFIG_QEMU_PREFIX \"$interp_prefix1\"" >> $config_h gdb_xml_files="" +# FIXME allow i386 to build on x86_64 and vice versa +if test "$kvm" = "yes" -a "$target_cpu" != "$cpu" ; then + kvm="no" +fi +# Disable KVM for linux-user +if test "$kvm" = "yes" -a "$target_softmmu" = "no" ; then + kvm="no" +fi + case "$target_cpu" in i386) echo "TARGET_ARCH=i386" >> $config_mak @@ -1397,6 +1435,11 @@ case "$target_cpu" in then echo "#define USE_KQEMU 1" >> $config_h fi + if test "$kvm" = "yes" ; then + echo "CONFIG_KVM=yes" >> $config_mak + echo "KVM_CFLAGS=$kvm_cflags" >> $config_mak + echo "#define CONFIG_KVM" >> $config_h + fi gcc3minver=`$cc --version 2> /dev/null| fgrep "(GCC) 3." 
| awk '{ print $3 }' | cut -f2 -d.` if test -n "$gcc3minver" && test $gcc3minver -gt 3 then @@ -1414,6 +1457,11 @@ case "$target_cpu" in then echo "#define USE_KQEMU 1" >> $config_h fi + if test "$kvm" = "yes" ; then + echo "CONFIG_KVM=yes" >> $config_mak + echo "KVM_CFLAGS=$kvm_cflags" >> $config_mak + echo "#define CONFIG_KVM 1" >> $config_h + fi ;; alpha) echo "TARGET_ARCH=alpha" >> $config_mak diff --git a/cpu-defs.h b/cpu-defs.h index 5dcac74..46d4487 100644 --- a/cpu-defs.h +++ b/cpu-defs.h @@ -142,6 +142,9 @@ typedef struct icount_decr_u16 { } icount_decr_u16; #endif +struct kvm_run; +struct KVMState; + #define CPU_TEMP_BUF_NLONGS 128 #define CPU_COMMON \ struct TranslationBlock *current_tb; /* currently executing TB */ \ @@ -199,6 +202,9 @@ typedef struct icount_decr_u16 { /* user data */ \ void *opaque; \ \ - const char *cpu_model_str; + const char *cpu_model_str; \ + struct KVMState *kvm_state; \ + struct kvm_run *kvm_run; \ + int kvm_fd; #endif diff --git a/cpu-exec.c b/cpu-exec.c index 6d4dcdd..04b3021 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -22,6 +22,7 @@ #include "exec.h" #include "disas.h" #include "tcg.h" +#include "kvm.h" #if !defined(CONFIG_SOFTMMU) #undef EAX @@ -361,6 +362,19 @@ int cpu_exec(CPUState *env1) } #endif + if (kvm_enabled()) { + int ret; + ret = kvm_cpu_exec(env); + if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) { + env->interrupt_request &= ~CPU_INTERRUPT_EXIT; + env->exception_index = EXCP_INTERRUPT; + cpu_loop_exit(); + } else if (env->halted) { + cpu_loop_exit(); + } else + longjmp(env->jmp_env, 1); + } + next_tb = 0; /* force lookup of first TB */ for(;;) { interrupt_request = env->interrupt_request; diff --git a/exec.c b/exec.c index f1fcec8..2623ac6 100644 --- a/exec.c +++ b/exec.c @@ -39,6 +39,7 @@ #include "tcg.h" #include "hw/hw.h" #include "osdep.h" +#include "kvm.h" #if defined(CONFIG_USER_ONLY) #include <qemu.h> #endif @@ -2211,6 +2212,9 @@ void cpu_register_physical_memory(target_phys_addr_t start_addr, 
kqemu_set_phys_mem(start_addr, size, phys_offset); } #endif + if (kvm_enabled()) + kvm_set_phys_mem(start_addr, size, phys_offset); + size = (size + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK; end_addr = start_addr + (target_phys_addr_t)size; for(addr = start_addr; addr != end_addr; addr += TARGET_PAGE_SIZE) { diff --git a/hw/acpi.c b/hw/acpi.c index 45963d3..66a5faa 100644 --- a/hw/acpi.c +++ b/hw/acpi.c @@ -23,6 +23,7 @@ #include "sysemu.h" #include "i2c.h" #include "smbus.h" +#include "kvm.h" //#define DEBUG @@ -501,6 +502,12 @@ i2c_bus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base, register_ioport_write(ACPI_DBG_IO_ADDR, 4, 4, acpi_dbg_writel, s); + if (kvm_enabled()) { + /* Mark SMM as already inited to prevent SMM from running. KVM does not + * support SMM mode. */ + pci_conf[0x5B] = 0x02; + } + /* XXX: which specification is used ? The i82731AB has different mappings */ pci_conf[0x5f] = (parallel_hds[0] != NULL ? 0x80 : 0) | 0x10; diff --git a/kvm-all.c b/kvm-all.c new file mode 100644 index 0000000..4379071 --- /dev/null +++ b/kvm-all.c @@ -0,0 +1,377 @@ +/* + * QEMU KVM support + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/kvm.h> + +#include "qemu-common.h" +#include "sysemu.h" +#include "kvm.h" + +//#define DEBUG_KVM + +#ifdef DEBUG_KVM +#define dprintf(fmt, ...) \ + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) +#else +#define dprintf(fmt, ...) 
\ + do { } while (0) +#endif + +typedef struct kvm_userspace_memory_region KVMSlot; + +int kvm_allowed = 0; + +struct KVMState +{ + KVMSlot slots[32]; + int fd; + int vmfd; +}; + +static KVMState *kvm_state; + +static KVMSlot *kvm_alloc_slot(KVMState *s) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(s->slots); i++) { + if (s->slots[i].memory_size == 0) + return &s->slots[i]; + } + + return NULL; +} + +static KVMSlot *kvm_lookup_slot(KVMState *s, target_phys_addr_t start_addr) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(s->slots); i++) { + KVMSlot *mem = &s->slots[i]; + + if (start_addr >= mem->guest_phys_addr && + start_addr < (mem->guest_phys_addr + mem->memory_size)) + return mem; + } + + return NULL; +} + +int kvm_init_vcpu(CPUState *env) +{ + KVMState *s = kvm_state; + long mmap_size; + int ret; + + dprintf("kvm_init_vcpu\n"); + + ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, + (void *)(unsigned long)env->cpu_index); + if (ret < 0) { + dprintf("kvm_create_vcpu failed\n"); + goto err; + } + + env->kvm_fd = ret; + env->kvm_state = s; + + mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); + if (mmap_size < 0) { + dprintf("KVM_GET_VCPU_MMAP_SIZE failed\n"); + goto err; + } + + env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, + env->kvm_fd, 0); + if (env->kvm_run == MAP_FAILED) { + ret = -errno; + dprintf("mmap'ing vcpu state failed\n"); + goto err; + } + + ret = kvm_arch_init_vcpu(env); + +err: + return ret; +} + +int kvm_init(int smp_cpus) +{ + KVMState *s; + int ret; + int i; + + if (smp_cpus > 1) + return -EINVAL; + + s = qemu_mallocz(sizeof(KVMState)); + if (s == NULL) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(s->slots); i++) + s->slots[i].slot = i; + + s->vmfd = -1; + s->fd = open("/dev/kvm", O_RDWR); + if (s->fd == -1) { + fprintf(stderr, "Could not access KVM kernel module: %m\n"); + ret = -errno; + goto err; + } + + ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); + if (ret < KVM_API_VERSION) { + if (ret > 0) + ret = -EINVAL; + 
fprintf(stderr, "kvm version too old\n"); + goto err; + } + + if (ret > KVM_API_VERSION) { + ret = -EINVAL; + fprintf(stderr, "kvm version not supported\n"); + goto err; + } + + s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0); + if (s->vmfd < 0) + goto err; + + /* initially, KVM allocated its own memory and we had to jump through + * hooks to make phys_ram_base point to this. Modern versions of KVM + * just use a user allocated buffer so we can use phys_ram_base + * unmodified. Make sure we have a sufficiently modern version of KVM. + */ + ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY); + if (ret <= 0) { + if (ret == 0) + ret = -EINVAL; + fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n"); + goto err; + } + + ret = kvm_arch_init(s, smp_cpus); + if (ret < 0) + goto err; + + kvm_state = s; + + return 0; + +err: + if (s) { + if (s->vmfd != -1) + close(s->vmfd); + if (s->fd != -1) + close(s->fd); + } + qemu_free(s); + + return ret; +} + +static int kvm_handle_io(CPUState *env, uint16_t port, void *data, + int direction, int size, uint32_t count) +{ + int i; + uint8_t *ptr = data; + + for (i = 0; i < count; i++) { + if (direction == KVM_EXIT_IO_IN) { + switch (size) { + case 1: + stb_p(ptr, cpu_inb(env, port)); + break; + case 2: + stw_p(ptr, cpu_inw(env, port)); + break; + case 4: + stl_p(ptr, cpu_inl(env, port)); + break; + } + } else { + switch (size) { + case 1: + cpu_outb(env, port, ldub_p(ptr)); + break; + case 2: + cpu_outw(env, port, lduw_p(ptr)); + break; + case 4: + cpu_outl(env, port, ldl_p(ptr)); + break; + } + } + + ptr += size; + } + + return 1; +} + +int kvm_cpu_exec(CPUState *env) +{ + struct kvm_run *run = env->kvm_run; + int ret; + + dprintf("kvm_cpu_exec()\n"); + + do { + kvm_arch_pre_run(env, run); + + if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) { + dprintf("interrupt exit requested\n"); + ret = 0; + break; + } + + dprintf("setting tpr\n"); + run->cr8 = cpu_get_apic_tpr(env); + + ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); 
+ kvm_arch_post_run(env, run); + + if (ret == -EINTR || ret == -EAGAIN) { + dprintf("io window exit\n"); + ret = 0; + break; + } + + if (ret < 0) { + dprintf("kvm run failed %s\n", strerror(-ret)); + abort(); + } + + ret = 0; /* exit loop */ + switch (run->exit_reason) { + case KVM_EXIT_IO: + dprintf("handle_io\n"); + ret = kvm_handle_io(env, run->io.port, + (uint8_t *)run + run->io.data_offset, + run->io.direction, + run->io.size, + run->io.count); + break; + case KVM_EXIT_MMIO: + dprintf("handle_mmio\n"); + cpu_physical_memory_rw(run->mmio.phys_addr, + run->mmio.data, + run->mmio.len, + run->mmio.is_write); + ret = 1; + break; + case KVM_EXIT_IRQ_WINDOW_OPEN: + dprintf("irq_window_open\n"); + break; + case KVM_EXIT_SHUTDOWN: + dprintf("shutdown\n"); + qemu_system_reset_request(); + ret = 1; + break; + case KVM_EXIT_UNKNOWN: + dprintf("kvm_exit_unknown\n"); + break; + case KVM_EXIT_FAIL_ENTRY: + dprintf("kvm_exit_fail_entry\n"); + break; + case KVM_EXIT_EXCEPTION: + dprintf("kvm_exit_exception\n"); + break; + case KVM_EXIT_DEBUG: + dprintf("kvm_exit_debug\n"); + break; + default: + dprintf("kvm_arch_handle_exit\n"); + ret = kvm_arch_handle_exit(env, run); + break; + } + } while (ret > 0); + + return ret; +} + +void kvm_set_phys_mem(target_phys_addr_t start_addr, + ram_addr_t size, + ram_addr_t phys_offset) +{ + KVMState *s = kvm_state; + ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK; + KVMSlot *mem; + + /* KVM does not support read-only slots */ + phys_offset &= ~IO_MEM_ROM; + + mem = kvm_lookup_slot(s, start_addr); + if (mem) { + if (flags == IO_MEM_UNASSIGNED) { + mem->memory_size = 0; + mem->guest_phys_addr = start_addr; + mem->userspace_addr = 0; + mem->flags = 0; + + kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem); + } else if (start_addr >= mem->guest_phys_addr && + (start_addr + size) <= (mem->guest_phys_addr + mem->memory_size)) + return; + } + + /* KVM does not need to know about this memory */ + if (flags >= IO_MEM_UNASSIGNED) + return; + + mem = 
kvm_alloc_slot(s); + mem->memory_size = size; + mem->guest_phys_addr = start_addr; + mem->userspace_addr = (unsigned long)(phys_ram_base + phys_offset); + mem->flags = 0; + + kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem); + /* FIXME deal with errors */ +} + +int kvm_ioctl(KVMState *s, int type, void *data) +{ + int ret; + + ret = ioctl(s->fd, type, data); + if (ret == -1) + ret = -errno; + + return ret; +} + +int kvm_vm_ioctl(KVMState *s, int type, void *data) +{ + int ret; + + ret = ioctl(s->vmfd, type, data); + if (ret == -1) + ret = -errno; + + return ret; +} + +int kvm_vcpu_ioctl(CPUState *env, int type, void *data) +{ + int ret; + + ret = ioctl(env->kvm_fd, type, data); + if (ret == -1) + ret = -errno; + + return ret; +} diff --git a/kvm.h b/kvm.h new file mode 100644 index 0000000..37102b4 --- /dev/null +++ b/kvm.h @@ -0,0 +1,68 @@ +/* + * QEMU KVM support + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#ifndef QEMU_KVM_H +#define QEMU_KVM_H + +#include "config.h" + +#ifdef CONFIG_KVM +extern int kvm_allowed; + +#define kvm_enabled() (kvm_allowed) +#else +#define kvm_enabled() (0) +#endif + +struct kvm_run; + +/* external API */ + +int kvm_init(int smp_cpus); + +int kvm_init_vcpu(CPUState *env); + +int kvm_cpu_exec(CPUState *env); + +void kvm_set_phys_mem(target_phys_addr_t start_addr, + ram_addr_t size, + ram_addr_t phys_offset); + +/* internal API */ + +struct KVMState; +typedef struct KVMState KVMState; + +int kvm_ioctl(KVMState *s, int type, void *data); + +int kvm_vm_ioctl(KVMState *s, int type, void *data); + +int kvm_vcpu_ioctl(CPUState *env, int type, void *data); + +/* Arch specific hooks */ + +int kvm_arch_post_run(CPUState *env, struct kvm_run *run); + +int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run); + +int kvm_arch_pre_run(CPUState *env, struct kvm_run *run); + +int kvm_arch_get_registers(CPUState *env); + +int kvm_arch_put_registers(CPUState *env); + +int kvm_arch_init(KVMState *s, int smp_cpus); + +int kvm_arch_init_vcpu(CPUState *env); + +#endif diff --git a/monitor.c b/monitor.c index f0a0bc3..dc90a2b 100644 --- a/monitor.c +++ b/monitor.c @@ -37,6 +37,7 @@ #include <dirent.h> #include "qemu-timer.h" #include "migration.h" +#include "kvm.h" //#define DEBUG //#define DEBUG_COMPLETION @@ -1263,6 +1264,19 @@ static void do_info_kqemu(void) #endif } +static void do_info_kvm(void) +{ +#ifdef CONFIG_KVM + term_printf("kvm support: "); + if (kvm_enabled()) + term_printf("enabled\n"); + else + term_printf("disabled\n"); +#else + term_printf("kvm support: not compiled\n"); +#endif +} + #ifdef CONFIG_PROFILER int64_t kqemu_time; @@ -1495,6 +1509,8 @@ static const term_cmd_t info_cmds[] = { "", "show dynamic compiler info", }, { "kqemu", "", do_info_kqemu, "", "show kqemu information", }, + { "kvm", "", do_info_kvm, + "", "show kvm information", }, { "usb", "", usb_info, "", "show guest USB devices", }, { "usbhost", "", 
usb_host_info, diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 263a477..167bae2 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -587,6 +587,8 @@ typedef struct CPUX86State { target_ulong kernelgsbase; #endif + uint64_t tsc; + uint64_t pat; /* exception/interrupt handling */ @@ -617,6 +619,10 @@ typedef struct CPUX86State { int kqemu_enabled; int last_io_time; #endif + + /* For KVM */ + uint64_t interrupt_bitmap[256 / 64]; + /* in order to simplify APIC support, we leave this pointer to the user */ struct APICState *apic_state; diff --git a/target-i386/helper.c b/target-i386/helper.c index 905ae9b..e550f74 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -29,6 +29,7 @@ #include "exec-all.h" #include "svm.h" #include "qemu-common.h" +#include "kvm.h" //#define DEBUG_MMU @@ -115,6 +116,8 @@ CPUX86State *cpu_x86_init(const char *cpu_model) #ifdef USE_KQEMU kqemu_init(env); #endif + if (kvm_enabled()) + kvm_init_vcpu(env); return env; } @@ -1288,6 +1291,40 @@ target_phys_addr_t cpu_get_phys_page_debug(CPUState *env, target_ulong addr) } #endif /* !CONFIG_USER_ONLY */ +#if defined(CONFIG_KVM) +static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + uint32_t vec[4]; + +#ifdef __x86_64__ + asm volatile("cpuid" + : "=a"(vec[0]), "=b"(vec[1]), + "=c"(vec[2]), "=d"(vec[3]) + : "0"(function) : "cc"); +#else + asm volatile("pusha \n\t" + "cpuid \n\t" + "mov %%eax, 0(%1) \n\t" + "mov %%ebx, 4(%1) \n\t" + "mov %%ecx, 8(%1) \n\t" + "mov %%edx, 12(%1) \n\t" + "popa" + : : "a"(function), "S"(vec) + : "memory", "cc"); +#endif + + if (eax) + *eax = vec[0]; + if (ebx) + *ebx = vec[1]; + if (ecx) + *ecx = vec[2]; + if (edx) + *edx = vec[3]; +} +#endif + void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) @@ -1307,12 +1344,23 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, *ebx = env->cpuid_vendor1; *edx = env->cpuid_vendor2; *ecx = 
env->cpuid_vendor3; + + /* sysenter isn't supported on compatibility mode on AMD. and syscall + * isn't supported in compatibility mode on Intel. so advertise the + * actuall cpu, and say goodbye to migration between different vendors + * is you use compatibility mode. */ + if (kvm_enabled()) + host_cpuid(0, NULL, ebx, ecx, edx); break; case 1: *eax = env->cpuid_version; *ebx = (env->cpuid_apic_id << 24) | 8 << 8; /* CLFLUSH size in quad words, Linux wants it. */ *ecx = env->cpuid_ext_features; *edx = env->cpuid_features; + + /* "Hypervisor present" bit required for Microsoft SVVP */ + if (kvm_enabled()) + *ecx |= (1 << 31); break; case 2: /* cache info: needed for Pentium Pro compatibility */ @@ -1390,6 +1438,31 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, *ebx = 0; *ecx = env->cpuid_ext3_features; *edx = env->cpuid_ext2_features; + + if (kvm_enabled()) { + uint32_t h_eax, h_edx; + + host_cpuid(0x80000001, &h_eax, NULL, NULL, &h_edx); + + /* disable CPU features that the host does not support */ + + /* long mode */ + if ((h_edx & 0x20000000) == 0 /* || !lm_capable_kernel */) + *edx &= ~0x20000000; + /* syscall */ + if ((h_edx & 0x00000800) == 0) + *edx &= ~0x00000800; + /* nx */ + if ((h_edx & 0x00100000) == 0) + *edx &= ~0x00100000; + + /* disable CPU features that KVM cannot support */ + + /* svm */ + *ecx &= ~4UL; + /* 3dnow */ + *edx = ~0xc0000000; + } break; case 0x80000002: case 0x80000003: diff --git a/target-i386/kvm.c b/target-i386/kvm.c new file mode 100644 index 0000000..ff372af --- /dev/null +++ b/target-i386/kvm.c @@ -0,0 +1,635 @@ +/* + * QEMU KVM support + * + * Copyright (C) 2006-2008 Qumranet Technologies + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/kvm.h> + +#include "qemu-common.h" +#include "sysemu.h" +#include "kvm.h" +#include "cpu.h" + +//#define DEBUG_KVM + +#ifdef DEBUG_KVM +#define dprintf(fmt, ...) \ + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) +#else +#define dprintf(fmt, ...) \ + do { } while (0) +#endif + +int kvm_arch_init_vcpu(CPUState *env) +{ + struct { + struct kvm_cpuid cpuid; + struct kvm_cpuid_entry entries[100]; + } __attribute__((packed)) cpuid_data; + int limit, i, cpuid_i; + uint32_t eax, ebx, ecx, edx; + + cpuid_i = 0; + + cpu_x86_cpuid(env, 0, &eax, &ebx, &ecx, &edx); + limit = eax; + + for (i = 0; i < limit; i++) { + struct kvm_cpuid_entry *c = &cpuid_data.entries[cpuid_i++]; + + cpu_x86_cpuid(env, i, &eax, &ebx, &ecx, &edx); + c->function = i; + c->eax = eax; + c->ebx = ebx; + c->ecx = ecx; + c->edx = edx; + } + + cpu_x86_cpuid(env, 0x80000000, &eax, &ebx, &ecx, &edx); + limit = eax; + + for (i = 0x80000000; i < limit; i++) { + struct kvm_cpuid_entry *c = &cpuid_data.entries[cpuid_i++]; + + cpu_x86_cpuid(env, i, &eax, &ebx, &ecx, &edx); + c->function = i; + c->eax = eax; + c->ebx = ebx; + c->ecx = ecx; + c->edx = edx; + } + + cpuid_data.cpuid.nent = cpuid_i; + + return kvm_vcpu_ioctl(env, KVM_SET_CPUID, &cpuid_data); +} + +static int kvm_has_msr_star(CPUState *env) +{ + static int has_msr_star; + int ret; + + /* first time */ + if (has_msr_star == 0) { + struct kvm_msr_list msr_list, *kvm_msr_list; + + has_msr_star = -1; + + /* Obtain MSR list from KVM. 
These are the MSRs that we must + * save/restore */ + ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, &msr_list); + if (ret < 0) + return 0; + + msr_list.nmsrs = 0; + kvm_msr_list = qemu_mallocz(sizeof(msr_list) + + msr_list.nmsrs * sizeof(msr_list.indices[0])); + if (kvm_msr_list == NULL) + return 0; + + ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, kvm_msr_list); + if (ret >= 0) { + int i; + + for (i = 0; i < kvm_msr_list->nmsrs; i++) { + if (kvm_msr_list->indices[i] == MSR_STAR) { + has_msr_star = 1; + break; + } + } + } + + free(kvm_msr_list); + } + + if (has_msr_star == 1) + return 1; + return 0; +} + +int kvm_arch_init(KVMState *s, int smp_cpus) +{ + int ret; + + /* create vm86 tss. KVM uses vm86 mode to emulate 16-bit code + * directly. In order to use vm86 mode, a TSS is needed. Since this + * must be part of guest physical memory, we need to allocate it. Older + * versions of KVM just assumed that it would be at the end of physical + * memory but that doesn't work with more than 4GB of memory. We simply + * refuse to work with those older versions of KVM. */ + ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, (void *)KVM_CAP_SET_TSS_ADDR); + if (ret <= 0) { + fprintf(stderr, "kvm does not support KVM_CAP_SET_TSS_ADDR\n"); + return ret; + } + + /* this address is 3 pages before the bios, and the bios should present + * as unavaible memory. FIXME, need to ensure the e820 map deals with + * this? 
+ */ + return kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, (void *)0xfffbd000); +} + +static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs) +{ + lhs->selector = rhs->selector; + lhs->base = rhs->base; + lhs->limit = rhs->limit; + lhs->type = 3; + lhs->present = 1; + lhs->dpl = 3; + lhs->db = 0; + lhs->s = 1; + lhs->l = 0; + lhs->g = 0; + lhs->avl = 0; + lhs->unusable = 0; +} + +static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs) +{ + unsigned flags = rhs->flags; + lhs->selector = rhs->selector; + lhs->base = rhs->base; + lhs->limit = rhs->limit; + lhs->type = (flags >> DESC_TYPE_SHIFT) & 15; + lhs->present = (flags & DESC_P_MASK) != 0; + lhs->dpl = rhs->selector & 3; + lhs->db = (flags >> DESC_B_SHIFT) & 1; + lhs->s = (flags & DESC_S_MASK) != 0; + lhs->l = (flags >> DESC_L_SHIFT) & 1; + lhs->g = (flags & DESC_G_MASK) != 0; + lhs->avl = (flags & DESC_AVL_MASK) != 0; + lhs->unusable = 0; +} + +static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) +{ + lhs->selector = rhs->selector; + lhs->base = rhs->base; + lhs->limit = rhs->limit; + lhs->flags = + (rhs->type << DESC_TYPE_SHIFT) + | (rhs->present * DESC_P_MASK) + | (rhs->dpl << DESC_DPL_SHIFT) + | (rhs->db << DESC_B_SHIFT) + | (rhs->s * DESC_S_MASK) + | (rhs->l << DESC_L_SHIFT) + | (rhs->g * DESC_G_MASK) + | (rhs->avl * DESC_AVL_MASK); +} + +static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set) +{ + if (set) + *kvm_reg = *qemu_reg; + else + *qemu_reg = *kvm_reg; +} + +static int kvm_getput_regs(CPUState *env, int set) +{ + struct kvm_regs regs; + int ret = 0; + + if (!set) { + ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, &regs); + if (ret < 0) + return ret; + } + + kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set); + kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set); + kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set); + kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set); + kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set); + kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set); 
+ kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set); + kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set); +#ifdef TARGET_X86_64 + kvm_getput_reg(&regs.r8, &env->regs[8], set); + kvm_getput_reg(&regs.r9, &env->regs[9], set); + kvm_getput_reg(&regs.r10, &env->regs[10], set); + kvm_getput_reg(&regs.r11, &env->regs[11], set); + kvm_getput_reg(&regs.r12, &env->regs[12], set); + kvm_getput_reg(&regs.r13, &env->regs[13], set); + kvm_getput_reg(&regs.r14, &env->regs[14], set); + kvm_getput_reg(&regs.r15, &env->regs[15], set); +#endif + + kvm_getput_reg(&regs.rflags, &env->eflags, set); + kvm_getput_reg(&regs.rip, &env->eip, set); + + if (set) + ret = kvm_vcpu_ioctl(env, KVM_SET_REGS, &regs); + + return ret; +} + +static int kvm_put_fpu(CPUState *env) +{ + struct kvm_fpu fpu; + int i; + + memset(&fpu, 0, sizeof fpu); + fpu.fsw = env->fpus & ~(7 << 11); + fpu.fsw |= (env->fpstt & 7) << 11; + fpu.fcw = env->fpuc; + for (i = 0; i < 8; ++i) + fpu.ftwx |= (!env->fptags[i]) << i; + memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs); + memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs); + fpu.mxcsr = env->mxcsr; + + return kvm_vcpu_ioctl(env, KVM_SET_FPU, &fpu); +} + +static int kvm_put_sregs(CPUState *env) +{ + struct kvm_sregs sregs; + + memcpy(sregs.interrupt_bitmap, + env->interrupt_bitmap, + sizeof(sregs.interrupt_bitmap)); + + if ((env->eflags & VM_MASK)) { + set_v8086_seg(&sregs.cs, &env->segs[R_CS]); + set_v8086_seg(&sregs.ds, &env->segs[R_DS]); + set_v8086_seg(&sregs.es, &env->segs[R_ES]); + set_v8086_seg(&sregs.fs, &env->segs[R_FS]); + set_v8086_seg(&sregs.gs, &env->segs[R_GS]); + set_v8086_seg(&sregs.ss, &env->segs[R_SS]); + } else { + set_seg(&sregs.cs, &env->segs[R_CS]); + set_seg(&sregs.ds, &env->segs[R_DS]); + set_seg(&sregs.es, &env->segs[R_ES]); + set_seg(&sregs.fs, &env->segs[R_FS]); + set_seg(&sregs.gs, &env->segs[R_GS]); + set_seg(&sregs.ss, &env->segs[R_SS]); + + if (env->cr[0] & CR0_PE_MASK) { + /* force ss cpl to cs cpl */ + sregs.ss.selector = (sregs.ss.selector & ~3) | + (sregs.cs.selector & 
3); + sregs.ss.dpl = sregs.ss.selector & 3; + } + } + + set_seg(&sregs.tr, &env->tr); + set_seg(&sregs.ldt, &env->ldt); + + sregs.idt.limit = env->idt.limit; + sregs.idt.base = env->idt.base; + sregs.gdt.limit = env->gdt.limit; + sregs.gdt.base = env->gdt.base; + + sregs.cr0 = env->cr[0]; + sregs.cr2 = env->cr[2]; + sregs.cr3 = env->cr[3]; + sregs.cr4 = env->cr[4]; + + sregs.cr8 = cpu_get_apic_tpr(env); + sregs.apic_base = cpu_get_apic_base(env); + + sregs.efer = env->efer; + + return kvm_vcpu_ioctl(env, KVM_SET_SREGS, &sregs); +} + +static void kvm_msr_entry_set(struct kvm_msr_entry *entry, + uint32_t index, uint64_t value) +{ + entry->index = index; + entry->data = value; +} + +static int kvm_put_msrs(CPUState *env) +{ + struct { + struct kvm_msrs info; + struct kvm_msr_entry entries[100]; + } msr_data; + struct kvm_msr_entry *msrs = msr_data.entries; + int n = 0; + + kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs); + kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp); + kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip); + if (kvm_has_msr_star(env)) + kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star); + kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc); +#ifdef TARGET_X86_64 + /* FIXME if lm capable */ + kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar); + kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase); + kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask); + kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar); +#endif + msr_data.info.nmsrs = n; + + return kvm_vcpu_ioctl(env, KVM_SET_MSRS, &msr_data); + +} + + +static int kvm_get_fpu(CPUState *env) +{ + struct kvm_fpu fpu; + int i, ret; + + ret = kvm_vcpu_ioctl(env, KVM_GET_FPU, &fpu); + if (ret < 0) + return ret; + + env->fpstt = (fpu.fsw >> 11) & 7; + env->fpus = fpu.fsw; + env->fpuc = fpu.fcw; + for (i = 0; i < 8; ++i) + env->fptags[i] = !((fpu.ftwx >> i) & 1); + memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs); + 
memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs); + env->mxcsr = fpu.mxcsr; + + return 0; +} + +static int kvm_get_sregs(CPUState *env) +{ + struct kvm_sregs sregs; + uint32_t hflags; + int ret; + + ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs); + if (ret < 0) + return ret; + + memcpy(env->interrupt_bitmap, + sregs.interrupt_bitmap, + sizeof(sregs.interrupt_bitmap)); + + get_seg(&env->segs[R_CS], &sregs.cs); + get_seg(&env->segs[R_DS], &sregs.ds); + get_seg(&env->segs[R_ES], &sregs.es); + get_seg(&env->segs[R_FS], &sregs.fs); + get_seg(&env->segs[R_GS], &sregs.gs); + get_seg(&env->segs[R_SS], &sregs.ss); + + get_seg(&env->tr, &sregs.tr); + get_seg(&env->ldt, &sregs.ldt); + + env->idt.limit = sregs.idt.limit; + env->idt.base = sregs.idt.base; + env->gdt.limit = sregs.gdt.limit; + env->gdt.base = sregs.gdt.base; + + env->cr[0] = sregs.cr0; + env->cr[2] = sregs.cr2; + env->cr[3] = sregs.cr3; + env->cr[4] = sregs.cr4; + + cpu_set_apic_base(env, sregs.apic_base); + + env->efer = sregs.efer; + //cpu_set_apic_tpr(env, sregs.cr8); + +#define HFLAG_COPY_MASK ~( \ + HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \ + HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \ + HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \ + HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK) + + + + hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK; + hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT); + hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) & + (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK); + hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK)); + hflags |= (env->cr[4] & CR4_OSFXSR_MASK) << + (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT); + + if (env->efer & MSR_EFER_LMA) { + hflags |= HF_LMA_MASK; + } + + if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) { + hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK; + } else { + hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >> + (DESC_B_SHIFT - HF_CS32_SHIFT); + hflags |= 
(env->segs[R_SS].flags & DESC_B_MASK) >> + (DESC_B_SHIFT - HF_SS32_SHIFT); + if (!(env->cr[0] & CR0_PE_MASK) || + (env->eflags & VM_MASK) || + !(hflags & HF_CS32_MASK)) { + hflags |= HF_ADDSEG_MASK; + } else { + hflags |= ((env->segs[R_DS].base | + env->segs[R_ES].base | + env->segs[R_SS].base) != 0) << + HF_ADDSEG_SHIFT; + } + } + env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags; + env->cc_src = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + env->df = 1 - (2 * ((env->eflags >> 10) & 1)); + env->cc_op = CC_OP_EFLAGS; + env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + + return 0; +} + +static int kvm_get_msrs(CPUState *env) +{ + struct { + struct kvm_msrs info; + struct kvm_msr_entry entries[100]; + } msr_data; + struct kvm_msr_entry *msrs = msr_data.entries; + int ret, i, n; + + n = 0; + msrs[n++].index = MSR_IA32_SYSENTER_CS; + msrs[n++].index = MSR_IA32_SYSENTER_ESP; + msrs[n++].index = MSR_IA32_SYSENTER_EIP; + if (kvm_has_msr_star(env)) + msrs[n++].index = MSR_STAR; + msrs[n++].index = MSR_IA32_TSC; +#ifdef TARGET_X86_64 + /* FIXME lm_capable_kernel */ + msrs[n++].index = MSR_CSTAR; + msrs[n++].index = MSR_KERNELGSBASE; + msrs[n++].index = MSR_FMASK; + msrs[n++].index = MSR_LSTAR; +#endif + msr_data.info.nmsrs = n; + ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data); + if (ret < 0) + return ret; + + for (i = 0; i < ret; i++) { + switch (msrs[i].index) { + case MSR_IA32_SYSENTER_CS: + env->sysenter_cs = msrs[i].data; + break; + case MSR_IA32_SYSENTER_ESP: + env->sysenter_esp = msrs[i].data; + break; + case MSR_IA32_SYSENTER_EIP: + env->sysenter_eip = msrs[i].data; + break; + case MSR_STAR: + env->star = msrs[i].data; + break; +#ifdef TARGET_X86_64 + case MSR_CSTAR: + env->cstar = msrs[i].data; + break; + case MSR_KERNELGSBASE: + env->kernelgsbase = msrs[i].data; + break; + case MSR_FMASK: + env->fmask = msrs[i].data; + break; + case MSR_LSTAR: + env->lstar = msrs[i].data; + break; +#endif + case MSR_IA32_TSC: + env->tsc = 
msrs[i].data; + break; + } + } + + return 0; +} + +int kvm_arch_put_registers(CPUState *env) +{ + int ret; + + ret = kvm_getput_regs(env, 1); + if (ret < 0) + return ret; + + ret = kvm_put_fpu(env); + if (ret < 0) + return ret; + + ret = kvm_put_sregs(env); + if (ret < 0) + return ret; + + ret = kvm_put_msrs(env); + if (ret < 0) + return ret; + + return 0; +} + +int kvm_arch_get_registers(CPUState *env) +{ + int ret; + + ret = kvm_getput_regs(env, 0); + if (ret < 0) + return ret; + + ret = kvm_get_fpu(env); + if (ret < 0) + return ret; + + ret = kvm_get_sregs(env); + if (ret < 0) + return ret; + + ret = kvm_get_msrs(env); + if (ret < 0) + return ret; + + return 0; +} + +int kvm_arch_pre_run(CPUState *env, struct kvm_run *run) +{ + /* Try to inject an interrupt if the guest can accept it */ + if (run->ready_for_interrupt_injection && + (env->interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK)) { + int irq; + + env->interrupt_request &= ~CPU_INTERRUPT_HARD; + irq = cpu_get_pic_interrupt(env); + if (irq >= 0) { + struct kvm_interrupt intr; + intr.irq = irq; + /* FIXME: errors */ + dprintf("injected interrupt %d\n", irq); + kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr); + } + } + + /* If we have an interrupt but the guest is not ready to receive an + * interrupt, request an interrupt window exit. This will + * cause a return to userspace as soon as the guest is ready to + * receive interrupts. 
*/ + if ((env->interrupt_request & CPU_INTERRUPT_HARD)) + run->request_interrupt_window = 1; + else + run->request_interrupt_window = 0; + + return 0; +} + +int kvm_arch_post_run(CPUState *env, struct kvm_run *run) +{ + if (run->if_flag) + env->eflags |= IF_MASK; + else + env->eflags &= ~IF_MASK; + + cpu_set_apic_tpr(env, run->cr8); + cpu_set_apic_base(env, run->apic_base); + + return 0; +} + +static int kvm_handle_halt(CPUState *env) +{ + if (!((env->interrupt_request & CPU_INTERRUPT_HARD) && + (env->eflags & IF_MASK)) && + !(env->interrupt_request & CPU_INTERRUPT_NMI)) { + env->halted = 1; + env->exception_index = EXCP_HLT; + return 0; + } + + return 1; +} + +int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) +{ + int ret = 0; + + switch (run->exit_reason) { + case KVM_EXIT_HLT: + dprintf("handle_hlt\n"); + ret = kvm_handle_halt(env); + break; + } + + return ret; +} diff --git a/vl.c b/vl.c index 74ae652..ecda8d5 100644 --- a/vl.c +++ b/vl.c @@ -39,6 +39,7 @@ #include "block.h" #include "audio/audio.h" #include "migration.h" +#include "kvm.h" #include <unistd.h> #include <fcntl.h> @@ -8258,6 +8259,9 @@ static void help(int exitcode) "-kernel-kqemu enable KQEMU full virtualization (default is user mode only)\n" "-no-kqemu disable KQEMU kernel module usage\n" #endif +#ifdef CONFIG_KVM + "-enable-kvm enable KVM full virtualization support\n" +#endif #ifdef TARGET_I386 "-no-acpi disable ACPI\n" #endif @@ -8363,6 +8367,7 @@ enum { QEMU_OPTION_pidfile, QEMU_OPTION_no_kqemu, QEMU_OPTION_kernel_kqemu, + QEMU_OPTION_enable_kvm, QEMU_OPTION_win2k_hack, QEMU_OPTION_usb, QEMU_OPTION_usbdevice, @@ -8449,6 +8454,9 @@ static const QEMUOption qemu_options[] = { { "no-kqemu", 0, QEMU_OPTION_no_kqemu }, { "kernel-kqemu", 0, QEMU_OPTION_kernel_kqemu }, #endif +#ifdef CONFIG_KVM + { "enable-kvm", 0, QEMU_OPTION_enable_kvm }, +#endif #if defined(TARGET_PPC) || defined(TARGET_SPARC) { "g", 1, QEMU_OPTION_g }, #endif @@ -9271,6 +9279,14 @@ int main(int argc, char **argv) 
kqemu_allowed = 2; break; #endif +#ifdef CONFIG_KVM + case QEMU_OPTION_enable_kvm: + kvm_allowed = 1; +#ifdef USE_KQEMU + kqemu_allowed = 0; +#endif + break; +#endif case QEMU_OPTION_usb: usb_enabled = 1; break; @@ -9405,6 +9421,14 @@ int main(int argc, char **argv) } } +#if defined(CONFIG_KVM) && defined(USE_KQEMU) + if (kvm_allowed && kqemu_allowed) { + fprintf(stderr, + "You can not enable both KVM and kqemu at the same time\n"); + exit(1); + } +#endif + if (smp_cpus > machine->max_cpus) { fprintf(stderr, "Number of SMP cpus requested (%d), exceeds max cpus " "supported by machine `%s' (%d)\n", smp_cpus, machine->name, @@ -9710,6 +9734,16 @@ int main(int argc, char **argv) } } + if (kvm_enabled()) { + int ret; + + ret = kvm_init(smp_cpus); + if (ret < 0) { + fprintf(stderr, "failed to initialize KVM\n"); + exit(1); + } + } + machine->init(ram_size, vga_ram_size, boot_devices, ds, kernel_filename, kernel_cmdline, initrd_filename, cpu_model); ^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:13 ` [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU Anthony Liguori @ 2008-10-28 20:49 ` Hollis Blanchard 2008-10-28 21:10 ` Anthony Liguori 2008-10-28 20:57 ` Andreas Färber ` (4 subsequent siblings) 5 siblings, 1 reply; 34+ messages in thread From: Hollis Blanchard @ 2008-10-28 20:49 UTC (permalink / raw) To: qemu-devel; +Cc: kvm-ppc, Glauber Costa, Avi Kivity, kvm-devel, Anthony Liguori Just a quick skim... On Tue, Oct 28, 2008 at 3:13 PM, Anthony Liguori <aliguori@us.ibm.com> wrote: > +int kvm_cpu_exec(CPUState *env) > +{ > + struct kvm_run *run = env->kvm_run; > + int ret; > + > + dprintf("kvm_cpu_exec()\n"); > + > + do { > + kvm_arch_pre_run(env, run); > + > + if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) { > + dprintf("interrupt exit requested\n"); > + ret = 0; > + break; > + } > + > + dprintf("setting tpr\n"); > + run->cr8 = cpu_get_apic_tpr(env); This belongs in the arch_pre_run hook above. > + ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > + kvm_arch_post_run(env, run); > + > + if (ret == -EINTR || ret == -EAGAIN) { > + dprintf("io window exit\n"); > + ret = 0; > + break; > + } > + > + if (ret < 0) { > + dprintf("kvm run failed %s\n", strerror(-ret)); > + abort(); > + } > + > + ret = 0; /* exit loop */ > + switch (run->exit_reason) { > + case KVM_EXIT_IO: > + dprintf("handle_io\n"); > + ret = kvm_handle_io(env, run->io.port, > + (uint8_t *)run + run->io.data_offset, > + run->io.direction, > + run->io.size, > + run->io.count); > + break; > + case KVM_EXIT_MMIO: > + dprintf("handle_mmio\n"); > + cpu_physical_memory_rw(run->mmio.phys_addr, > + run->mmio.data, > + run->mmio.len, > + run->mmio.is_write); > + ret = 1; > + break; > + case KVM_EXIT_IRQ_WINDOW_OPEN: > + dprintf("irq_window_open\n"); > + break; > + case KVM_EXIT_SHUTDOWN: > + dprintf("shutdown\n"); > + qemu_system_reset_request(); > + ret = 1; > + break; > + case KVM_EXIT_UNKNOWN: > + dprintf("kvm_exit_unknown\n"); > + break; > + 
case KVM_EXIT_FAIL_ENTRY: > + dprintf("kvm_exit_fail_entry\n"); > + break; > + case KVM_EXIT_EXCEPTION: > + dprintf("kvm_exit_exception\n"); > + break; > + case KVM_EXIT_DEBUG: > + dprintf("kvm_exit_debug\n"); > + break; > + default: > + dprintf("kvm_arch_handle_exit\n"); > + ret = kvm_arch_handle_exit(env, run); > + break; > + } > + } while (ret > 0); > + > + return ret; > +} How did you decide which exit handlers should go into architecture-specific code? Looking at just the KVM architecture set: IO: x86 and ia64, not PowerPC or s390 MMIO: everybody except s390 DCRs: PowerPC only IRQ window: not sure -Hollis ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:49 ` Hollis Blanchard @ 2008-10-28 21:10 ` Anthony Liguori 0 siblings, 0 replies; 34+ messages in thread From: Anthony Liguori @ 2008-10-28 21:10 UTC (permalink / raw) To: Hollis Blanchard Cc: Anthony Liguori, kvm-devel, Glauber Costa, qemu-devel, kvm-ppc, Avi Kivity Hollis Blanchard wrote: > Just a quick skim... > > On Tue, Oct 28, 2008 at 3:13 PM, Anthony Liguori <aliguori@us.ibm.com> wrote: > >> +int kvm_cpu_exec(CPUState *env) >> +{ >> + struct kvm_run *run = env->kvm_run; >> + int ret; >> + >> + dprintf("kvm_cpu_exec()\n"); >> + >> + do { >> + kvm_arch_pre_run(env, run); >> + >> + if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) { >> + dprintf("interrupt exit requested\n"); >> + ret = 0; >> + break; >> + } >> + >> + dprintf("setting tpr\n"); >> + run->cr8 = cpu_get_apic_tpr(env); >> > > This belongs in the arch_pre_run hook above. > Good catch, I've updated the patch. > How did you decide which exit handlers should go into > architecture-specific code? Looking at just the KVM architecture set: > Based on whether the implementation required target-specific code. > IO: x86 and ia64, not PowerPC or s390 > cpu_{in,out}[bwl] are defined in vl.c and are available for all architectures. They are no-ops on most architectures because they are never used. > MMIO: everybody except s390 > cpu_physical_memory_rw() is defined by everyone. > DCRs: PowerPC only > This will have to be an architecture specific handler. > IRQ window: not sure > It's a no-op implementation. I would think that this would be needed on PPC. If you want to inject an interrupt, but the guest is unable to handle an interrupt, you need to exit to userspace when the guest re-enables interrupts. Otherwise, you may never return to userspace for the interrupt to be injected. How do you handle that now? Does PPC have something that makes this unnecessary? 
Regards, Anthony Liguori > -Hollis > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:13 ` [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU Anthony Liguori 2008-10-28 20:49 ` Hollis Blanchard @ 2008-10-28 20:57 ` Andreas Färber 2008-10-28 21:04 ` Glauber Costa 2008-10-28 21:05 ` Anthony Liguori 2008-10-28 21:41 ` [Qemu-devel] " Gerd Hoffmann ` (3 subsequent siblings) 5 siblings, 2 replies; 34+ messages in thread From: Andreas Färber @ 2008-10-28 20:57 UTC (permalink / raw) To: qemu-devel; +Cc: Anthony Liguori Anthony, Am 28.10.2008 um 21:13 schrieb Anthony Liguori: > diff --git a/kvm-all.c b/kvm-all.c > new file mode 100644 > index 0000000..4379071 > --- /dev/null > +++ b/kvm-all.c > @@ -0,0 +1,377 @@ > +/* > + * QEMU KVM support > + * > + * Copyright IBM, Corp. 2008 > + * > + * Authors: > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version > 2. See > + * the COPYING file in the top-level directory. > + * > + */ Just wondering - since this is a new file of yours, would it make sense to use "version 2 or, at your option, any later version" to avoid the recent binutils GPL compatibility issues in the future? Andreas ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:57 ` Andreas Färber @ 2008-10-28 21:04 ` Glauber Costa 2008-10-28 21:16 ` Anthony Liguori 2008-10-28 21:05 ` Anthony Liguori 1 sibling, 1 reply; 34+ messages in thread From: Glauber Costa @ 2008-10-28 21:04 UTC (permalink / raw) To: qemu-devel; +Cc: Anthony Liguori On Tue, Oct 28, 2008 at 6:57 PM, Andreas Färber <andreas.faerber@web.de> wrote: > Anthony, > > Am 28.10.2008 um 21:13 schrieb Anthony Liguori: > >> diff --git a/kvm-all.c b/kvm-all.c >> new file mode 100644 >> index 0000000..4379071 >> --- /dev/null >> +++ b/kvm-all.c >> @@ -0,0 +1,377 @@ >> +/* >> + * QEMU KVM support >> + * >> + * Copyright IBM, Corp. 2008 >> + * >> + * Authors: >> + * Anthony Liguori <aliguori@us.ibm.com> >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2. See >> + * the COPYING file in the top-level directory. >> + * >> + */ > > Just wondering - since this is a new file of yours, would it make sense to > use "version 2 or, at your option, any later version" to avoid the recent > binutils GPL compatibility issues in the future? It's not that easy, because it is not exactly "a new file of yours". It's obviously derived from current kvm code. That said, libkvm.c is version 2, and qemu-kvm.c is v2+. I'm not sure if there's any legal restriction on the final product Anthony should follow. > Andreas > > > > -- Glauber Costa. "Free as in Freedom" http://glommer.net "The less confident you are, the more serious you have to act." ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU 2008-10-28 21:04 ` Glauber Costa @ 2008-10-28 21:16 ` Anthony Liguori 0 siblings, 0 replies; 34+ messages in thread From: Anthony Liguori @ 2008-10-28 21:16 UTC (permalink / raw) To: qemu-devel; +Cc: Anthony Liguori Glauber Costa wrote: > On Tue, Oct 28, 2008 at 6:57 PM, Andreas Färber <andreas.faerber@web.de> wrote: > >> Anthony, >> >> Am 28.10.2008 um 21:13 schrieb Anthony Liguori: >> >> >>> diff --git a/kvm-all.c b/kvm-all.c >>> new file mode 100644 >>> index 0000000..4379071 >>> --- /dev/null >>> +++ b/kvm-all.c >>> @@ -0,0 +1,377 @@ >>> +/* >>> + * QEMU KVM support >>> + * >>> + * Copyright IBM, Corp. 2008 >>> + * >>> + * Authors: >>> + * Anthony Liguori <aliguori@us.ibm.com> >>> + * >>> + * This work is licensed under the terms of the GNU GPL, version 2. See >>> + * the COPYING file in the top-level directory. >>> + * >>> + */ >>> >> Just wondering - since this is a new file of yours, would it make sense to >> use "version 2 or, at your option, any later version" to avoid the recent >> binutils GPL compatibility issues in the future? >> > > It's not that easy, because it is not exatly "a new file of yours". > It's obviously derived > Well, kvm-all.c really isn't. There are maybe a few snippets (mainly if clauses) that came from qemu-kvm but not enough to be considered derived IMHO. kvm.c on the other hand contains enough that I would think it is derived (and I preserved the original copyright from qemu-kvm-x86.c). At any rate, as I said previously, Avi should agree before changing the license of any of it. > from current kvm code. That said, libkvm.c is version 2, and qemu-kvm.c is v2+. > I'm not sure if there's any legal restriction on the final product > anthony should follow. > Ah, I missed the or-higher part in qemu-kvm.c. I'll update the next round of patches to reflect that. I don't care either way honestly, I just copied the copyright from somewhere else. 
Regards, Anthony Liguori >> Andreas >> >> >> >> >> > > > > ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:57 ` Andreas Färber 2008-10-28 21:04 ` Glauber Costa @ 2008-10-28 21:05 ` Anthony Liguori 2008-11-04 13:25 ` Avi Kivity 1 sibling, 1 reply; 34+ messages in thread From: Anthony Liguori @ 2008-10-28 21:05 UTC (permalink / raw) To: qemu-devel; +Cc: Anthony Liguori, Avi Kivity Andreas Färber wrote: > Anthony, > > Am 28.10.2008 um 21:13 schrieb Anthony Liguori: > >> diff --git a/kvm-all.c b/kvm-all.c >> new file mode 100644 >> index 0000000..4379071 >> --- /dev/null >> +++ b/kvm-all.c >> @@ -0,0 +1,377 @@ >> +/* >> + * QEMU KVM support >> + * >> + * Copyright IBM, Corp. 2008 >> + * >> + * Authors: >> + * Anthony Liguori <aliguori@us.ibm.com> >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2. >> See >> + * the COPYING file in the top-level directory. >> + * >> + */ > > Just wondering - since this is a new file of yours, would it make > sense to use "version 2 or, at your option, any later version" to > avoid the recent binutils GPL compatibility issues in the future? If Avi agrees, I can make both files GPL v2+. However, there is an awful lot of GPLv2-only code in QEMU. I don't expect that situation to change anytime soon. Regards, Anthony Liguori > Andreas > > > ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU 2008-10-28 21:05 ` Anthony Liguori @ 2008-11-04 13:25 ` Avi Kivity 0 siblings, 0 replies; 34+ messages in thread From: Avi Kivity @ 2008-11-04 13:25 UTC (permalink / raw) To: Anthony Liguori; +Cc: Anthony Liguori, qemu-devel, Avi Kivity Anthony Liguori wrote: > > If Avi agrees, I can make both files GPL v2+ However, there is an > awful lot of GPLv2 only code in QEMU. I don't expect that situation > to change anytime soon. I have no problems with v2+. -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:13 ` [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU Anthony Liguori 2008-10-28 20:49 ` Hollis Blanchard 2008-10-28 20:57 ` Andreas Färber @ 2008-10-28 21:41 ` Gerd Hoffmann 2008-10-28 21:51 ` Anthony Liguori 2008-10-29 14:58 ` Glauber Costa ` (2 subsequent siblings) 5 siblings, 1 reply; 34+ messages in thread From: Gerd Hoffmann @ 2008-10-28 21:41 UTC (permalink / raw) To: Anthony Liguori; +Cc: Glauber Costa, qemu-devel, kvm-devel, Avi Kivity Anthony Liguori wrote: > This patch only implements the bare minimum support to get a guest booting. It > has very little impact the rest of QEMU and attempts to integrate nicely with > the rest of QEMU. Huh? That isn't based on the qemu-accel patches ... surprised, Gerd ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 21:41 ` [Qemu-devel] " Gerd Hoffmann @ 2008-10-28 21:51 ` Anthony Liguori 2008-10-28 23:04 ` Glauber Costa 0 siblings, 1 reply; 34+ messages in thread From: Anthony Liguori @ 2008-10-28 21:51 UTC (permalink / raw) To: Gerd Hoffmann; +Cc: Glauber Costa, qemu-devel, kvm-devel, Avi Kivity Gerd Hoffmann wrote: > Anthony Liguori wrote: > >> This patch only implements the bare minimum support to get a guest booting. It >> has very little impact the rest of QEMU and attempts to integrate nicely with >> the rest of QEMU. >> > > Huh? That isn't based on the qemu-accel patches ... > This is part of the reason for this exercise. I'd rather introduce KVM support first and then look at abstracting things, than vice versa. A number of the hooks in the current QEMUAccel tree are there for the wrong reason (to support the out-of-tree IO thread, for instance). If you just introduce something with various hooks and say, these are hooks we'll need, it's not possible to really evaluate whether the hooks are needed because nothing in the tree makes use of them. Regards, Anthony Liguori > surprised, > Gerd > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 21:51 ` Anthony Liguori @ 2008-10-28 23:04 ` Glauber Costa 2008-10-28 23:36 ` Anthony Liguori 0 siblings, 1 reply; 34+ messages in thread From: Glauber Costa @ 2008-10-28 23:04 UTC (permalink / raw) To: qemu-devel; +Cc: Glauber Costa, Gerd Hoffmann, kvm-devel, Avi Kivity On Tue, Oct 28, 2008 at 7:51 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: > Gerd Hoffmann wrote: >> >> Anthony Liguori wrote: >> >>> >>> This patch only implements the bare minimum support to get a guest >>> booting. It >>> has very little impact the rest of QEMU and attempts to integrate nicely >>> with >>> the rest of QEMU. >>> >> >> Huh? That isn't based on the qemu-accel patches ... >> > > This is part of the reason for this exercise. I'd rather introduce KVM > support first and then look at abstracting things, than vice versa. A > number of the hooks in the current QEMUAccel tree are there for the wrong > reason (to support the out-of-tree IO thread, for instance). > > If you just introduce something with various hooks and say, these are hooks > we'll need, it's not possible to really evaluate whether the hooks are > needed because nothing in the tree makes use of them. We talked extensively on monday about it, and I'm in agreement with it. > > Regards, > > Anthony Liguori > >> surprised, >> Gerd >> -- >> To unsubscribe from this list: send the line "unsubscribe kvm" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >> > > > > -- Glauber Costa. "Free as in Freedom" http://glommer.net "The less confident you are, the more serious you have to act." ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 23:04 ` Glauber Costa @ 2008-10-28 23:36 ` Anthony Liguori 2008-10-29 9:54 ` Avi Kivity 2008-10-29 13:51 ` Hollis Blanchard 0 siblings, 2 replies; 34+ messages in thread From: Anthony Liguori @ 2008-10-28 23:36 UTC (permalink / raw) To: Glauber Costa Cc: Glauber Costa, Avi Kivity, qemu-devel, kvm-devel, Gerd Hoffmann Glauber Costa wrote: > On Tue, Oct 28, 2008 at 7:51 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: > >> >> This is part of the reason for this exercise. I'd rather introduce KVM >> support first and then look at abstracting things, than vice versa. A >> number of the hooks in the current QEMUAccel tree are there for the wrong >> reason (to support the out-of-tree IO thread, for instance). >> >> If you just introduce something with various hooks and say, these are hooks >> we'll need, it's not possible to really evaluate whether the hooks are >> needed because nothing in the tree makes use of them. >> > > We talked extensively on monday about it, and I'm in agreement with it. > Something I was thinking about this morning, and I think the first place where we'll definitely need a hook, is how to deal with kvm_load_registers(). I think there's overlap between KVM and the IO thread here. There are two reasons (I can think of) that most of the device model code can't run in conjunction with TCG. The first is that TCG may modify CPUState in a non-atomic way. The device model may need to access CPUState although there are very few places that it does. The other reason is accessing guest memory. TCG does not preserve atomicity when a guest accesses device memory. There are probably only a few places in the device model (like virtio) that depend on atomicity. If we implemented an API that implemented a lock/unlock for CPUState and for portions of memory, then I think this could be used both as a hook for kvm_load_registers and as a way to introduce an IO thread with TCG. 
The CPUState lock/unlock is pretty straightforward. For the memory implementation to be efficient, I think you would have to acquire the lock when TCG brings a physical address into the TLB (preferably, at a page granularity), or whenever someone tries to access memory (via cpu_physical_memory_rw). I think in the vast majority of the cases, there wouldn't be any contention and TCG could run alongside the IO thread. Another place for a "hook" is updating a slot's dirty bitmap. Right now, with my patchset we don't have live migration or the VGA RAM optimization. There's nothing about the VGA RAM optimization that wouldn't work for QEMU. I'm not sure that it really is an optimization in the context of TCG, but I certainly don't think it's any worse. The only thing you really need is to query the KVM dirty bitmap when it comes time to start over querying the VGA dirty bits. The same is needed for live migration, so I think what we really need is to change the memory dirty bit tracking API to have a concept of refresh that we can use to hook for KVM. FWIW, I included a TODO in my patch if people are interested in tackling any of these things. Regards, Anthony Liguori Regards, Anthony Liguori >> Regards, >> >> Anthony Liguori >> >> >>> surprised, >>> Gerd >>> -- >>> To unsubscribe from this list: send the line "unsubscribe kvm" in >>> the body of a message to majordomo@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>> >>> >> >> >> > > > > ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 23:36 ` Anthony Liguori @ 2008-10-29 9:54 ` Avi Kivity 2008-10-29 12:35 ` Glauber Costa 2008-10-29 13:07 ` Anthony Liguori 2008-10-29 13:51 ` Hollis Blanchard 1 sibling, 2 replies; 34+ messages in thread From: Avi Kivity @ 2008-10-29 9:54 UTC (permalink / raw) To: Anthony Liguori Cc: Glauber Costa, Glauber Costa, qemu-devel, kvm-devel, Gerd Hoffmann Anthony Liguori wrote: > Another place "hook" is updating a slot's dirty bitmap. Right now, > with my patchset we don't have live migration or the VGA RAM > optimization. There's nothing about the VGA RAM optimization that > wouldn't work for QEMU. I'm not sure that it really is an > optimization in the context of TCG, but I certainly don't think it's > any worse. The only thing you really need is to query the KVM dirty > bitmap when it comes time to enable start over querying the VGA dirty > bits. I don't understand this. The VGA optimization really is qemu's, the kvm modifications only cater to the different way of getting the dirty bits. > The same is needed for live migration, so I think what we really need > is to change the memory dirty bit tracking API to have a concept of > refresh that we can use to hook for KVM. > Can you elaborate on this refresh? -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 9:54 ` Avi Kivity @ 2008-10-29 12:35 ` Glauber Costa 2008-10-29 12:39 ` Avi Kivity 2008-10-29 13:07 ` Anthony Liguori 1 sibling, 1 reply; 34+ messages in thread From: Glauber Costa @ 2008-10-29 12:35 UTC (permalink / raw) To: Avi Kivity; +Cc: Glauber Costa, kvm-devel, qemu-devel, Gerd Hoffmann On Wed, Oct 29, 2008 at 11:54:11AM +0200, Avi Kivity wrote: > Anthony Liguori wrote: >> Another place "hook" is updating a slot's dirty bitmap. Right now, >> with my patchset we don't have live migration or the VGA RAM >> optimization. There's nothing about the VGA RAM optimization that >> wouldn't work for QEMU. I'm not sure that it really is an >> optimization in the context of TCG, but I certainly don't think it's >> any worse. The only thing you really need is to query the KVM dirty >> bitmap when it comes time to enable start over querying the VGA dirty >> bits. > > I don't understand this. The VGA optimization really is qemu's, the kvm > modifications only cater to the different way of getting the dirty bits. As it seems to me, the real difference is that qemu has to explicitly set certain regions as dirty, while kvm gets its dirty bits "automatically" from the kernel. So I believe we can have markers in the code to refresh the dirty bitmap for certain area ranges (for kvm use), and also enable a manual override (for qemu). After that, cpu_physical_memory_get_dirty() will simply return whether or not the page is dirty. Also, kvm only tracks "dirty" bits, whereas qemu has at least three kinds of them. But I think for now we can assume that kvm's dirty bit means "all dirty". ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 12:35 ` Glauber Costa @ 2008-10-29 12:39 ` Avi Kivity 2008-10-29 12:56 ` Glauber Costa 0 siblings, 1 reply; 34+ messages in thread From: Avi Kivity @ 2008-10-29 12:39 UTC (permalink / raw) To: Glauber Costa; +Cc: Glauber Costa, kvm-devel, qemu-devel, Gerd Hoffmann Glauber Costa wrote: >>> Another place "hook" is updating a slot's dirty bitmap. Right now, >>> with my patchset we don't have live migration or the VGA RAM >>> optimization. There's nothing about the VGA RAM optimization that >>> wouldn't work for QEMU. I'm not sure that it really is an >>> optimization in the context of TCG, but I certainly don't think it's >>> any worse. The only thing you really need is to query the KVM dirty >>> bitmap when it comes time to enable start over querying the VGA dirty >>> bits. >>> >> I don't understand this. The VGA optimization really is qemu's, the kvm >> modifications only cater to the different way of getting the dirty bits. >> > > As it seems to me, the real difference is that qemu has to explicitly set > certain regions as dirty, while kvm get dirty bit "automatically" from the kernel. > > I'm completely lost. I don't see how one or the other is more or less automatic, or how qemu has to explicitly set regions as dirty (except when emulating bitblt). > So I believe we can have markers on the code to refresh dirty bitmap for certain > area ranges (for kvm use), and also enable a manual override (for qemu). After that, > the cpu_physical_memory_get_dirty() will simply return whether or not the page is > dirty. > Does not cpu_p_m_g_dirty() simply return whether or not the page is dirty now? > Also, kvm only tracks "dirty" bits, whereas qemu has at least three kinds of them. > But I think for now we can assume that kvm's dirty mean "all dirty kvm's dirty bits mean that kvm has seen the page written to since the last query. 
A zero doesn't mean the page is clean though -- it could have been written to by qemu. -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 12:39 ` Avi Kivity @ 2008-10-29 12:56 ` Glauber Costa 0 siblings, 0 replies; 34+ messages in thread From: Glauber Costa @ 2008-10-29 12:56 UTC (permalink / raw) To: Avi Kivity; +Cc: Glauber Costa, kvm-devel, qemu-devel, Gerd Hoffmann On Wed, Oct 29, 2008 at 02:39:57PM +0200, Avi Kivity wrote: > Glauber Costa wrote: >>>> Another place "hook" is updating a slot's dirty bitmap. Right now, >>>> with my patchset we don't have live migration or the VGA RAM >>>> optimization. There's nothing about the VGA RAM optimization that >>>> wouldn't work for QEMU. I'm not sure that it really is an >>>> optimization in the context of TCG, but I certainly don't think >>>> it's any worse. The only thing you really need is to query the >>>> KVM dirty bitmap when it comes time to enable start over querying >>>> the VGA dirty bits. >>>> >>> I don't understand this. The VGA optimization really is qemu's, the >>> kvm modifications only cater to the different way of getting the >>> dirty bits. >>> >> >> As it seems to me, the real difference is that qemu has to explicitly set >> certain regions as dirty, while kvm get dirty bit "automatically" from the kernel. >> >> > > I'm completely lost. I don't see how one or the other is more or less > automatic, or how qemu has to explicitly set regions as dirty (except > when emulating bitblt). Or maybe I am. But I don't see any way in which qemu sets dirty bits but explicitly with cpu_physical_memory_set_dirty(). This is pretty explicit. > >> So I believe we can have markers on the code to refresh dirty bitmap for certain >> area ranges (for kvm use), and also enable a manual override (for qemu). After that, >> the cpu_physical_memory_get_dirty() will simply return whether or not the page is >> dirty. >> > > Does not cpu_p_m_g_dirty() simply return whether or not the page is > dirty now? 
If you look at the vga code, you see something like: cpu_physical_memory_get_dirty(page0, VGA_DIRTY_FLAG) | cpu_physical_memory_get_dirty(page1, VGA_DIRTY_FLAG); if (kvm_enabled()) { update |= bitmap_get_dirty(bitmap, (page0 - s->vram_offset) >> TARGET_PAGE_BITS); update |= bitmap_get_dirty(bitmap, (page1 - s->vram_offset) >> TARGET_PAGE_BITS); } so if the page is not dirty to cpu_p_m_g_dirty() (I liked that abb), it can still be dirty for kvm. Ideally, it would not be necessary. > >> Also, kvm only tracks "dirty" bits, whereas qemu has at least three kinds of them. >> But I think for now we can assume that kvm's dirty mean "all dirty > > kvm's dirty bits mean that kvm has seen the page written to since the > last query. A zero doesn't mean the page is clean though -- it could > have been written to by qemu. Right. The point here is more like kvm has 1 type of dirty whereas qemu has many. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 9:54 ` Avi Kivity 2008-10-29 12:35 ` Glauber Costa @ 2008-10-29 13:07 ` Anthony Liguori 2008-10-29 13:23 ` Avi Kivity 1 sibling, 1 reply; 34+ messages in thread From: Anthony Liguori @ 2008-10-29 13:07 UTC (permalink / raw) To: Avi Kivity Cc: Glauber Costa, Glauber Costa, qemu-devel, kvm-devel, Gerd Hoffmann Avi Kivity wrote: > Anthony Liguori wrote: >> Another place "hook" is updating a slot's dirty bitmap. Right now, >> with my patchset we don't have live migration or the VGA RAM >> optimization. There's nothing about the VGA RAM optimization that >> wouldn't work for QEMU. I'm not sure that it really is an >> optimization in the context of TCG, but I certainly don't think it's >> any worse. The only thing you really need is to query the KVM dirty >> bitmap when it comes time to enable start over querying the VGA dirty >> bits. > > I don't understand this. The VGA optimization really is qemu's, the > kvm modifications only cater to the different way of getting the dirty > bits. Right. I'm just not sure that it's going to be as much of an optimization for TCG as it is for KVM. >> The same is needed for live migration, so I think what we really need >> is to change the memory dirty bit tracking API to have a concept of >> refresh that we can use to hook for KVM. >> > > Can you elaborate on this refresh? Right now, in QEMU, code looks like this: for (i = 0; i < addr; i += TARGET_PAGE_SIZE) { if (cpu_p_m_g_dirty(i, DIRTY_FLAG)) { cpu_p_m_r_dirty(i, i + TARGET_PAGE_SIZE, DIRTY_FLAG); // do something with dirty memory } All we need to do is add another cpu_physical_memory_sync_dirty(i, i + REGION_SIZE, DIRTY_FLAG); that would go at the start of this. For QEMU, this is a nop since dirty bits are updated as soon as they are reset. For KVM, this would update the entire set of dirty bits for the given memory region. We also need something to enable dirty tracking for a particular region. 
We already have something for migration, we could perhaps extend that API (cpu_p_m_s_dirty_tracking). Regards, Anthony Liguori ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 13:07 ` Anthony Liguori @ 2008-10-29 13:23 ` Avi Kivity 2008-10-29 13:32 ` Anthony Liguori 0 siblings, 1 reply; 34+ messages in thread From: Avi Kivity @ 2008-10-29 13:23 UTC (permalink / raw) To: Anthony Liguori Cc: Glauber Costa, Glauber Costa, qemu-devel, kvm-devel, Gerd Hoffmann Anthony Liguori wrote: > Avi Kivity wrote: >> Anthony Liguori wrote: >>> Another place "hook" is updating a slot's dirty bitmap. Right now, >>> with my patchset we don't have live migration or the VGA RAM >>> optimization. There's nothing about the VGA RAM optimization that >>> wouldn't work for QEMU. I'm not sure that it really is an >>> optimization in the context of TCG, but I certainly don't think it's >>> any worse. The only thing you really need is to query the KVM dirty >>> bitmap when it comes time to enable start over querying the VGA >>> dirty bits. >> >> I don't understand this. The VGA optimization really is qemu's, the >> kvm modifications only cater to the different way of getting the >> dirty bits. > > Right. I'm just not sure that it's going to be as much of an > optimization for TCG as it is for KVM. When qemu is loaded certainly this is lost in the noise. But when idling (and as a bonus, the screen doesn't change much), tcg and kvm will benefit equally. > > Right now, in QEMU, code looks like this: > > for (i = 0; i < addr; i += TARGET_PAGE_SIZE) { > if (cpu_p_m_g_dirty(i, DIRTY_FLAG)) { > cpu_p_m_r_dirty(i, i + TARGET_PAGE_SIZE, DIRTY_FLAG); > > // do something with dirty memory > } > > All we need to do is add another cpu_physical_memory_sync_dirty(i, i + > REGION_SIZE, DIRTY_FLAG); that would go at the start of this. For > QEMU, this is a nop since dirty bits are updated as soon as they are > reset. For KVM, this would update the entire set of dirty bits for > the given memory region. You mean merge the kvm bitmap into the qemu bitmap? That's what it does now, no? 
> We also need something to enable dirty tracking for a particular > region. We already have something for migration, we could perhaps > extend that API (cpu_p_m_s_dirty_tracking). I don't think qemu would benefit much from per-region tracking, but maybe I'm wrong. -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 13:23 ` Avi Kivity @ 2008-10-29 13:32 ` Anthony Liguori 0 siblings, 0 replies; 34+ messages in thread From: Anthony Liguori @ 2008-10-29 13:32 UTC (permalink / raw) To: Avi Kivity Cc: Glauber Costa, Glauber Costa, qemu-devel, kvm-devel, Gerd Hoffmann Avi Kivity wrote: > Anthony Liguori wrote: >> Avi Kivity wrote: >>> Anthony Liguori wrote: >>>> Another place "hook" is updating a slot's dirty bitmap. Right now, >>>> with my patchset we don't have live migration or the VGA RAM >>>> optimization. There's nothing about the VGA RAM optimization that >>>> wouldn't work for QEMU. I'm not sure that it really is an >>>> optimization in the context of TCG, but I certainly don't think >>>> it's any worse. The only thing you really need is to query the KVM >>>> dirty bitmap when it comes time to enable start over querying the >>>> VGA dirty bits. >>> >>> I don't understand this. The VGA optimization really is qemu's, the >>> kvm modifications only cater to the different way of getting the >>> dirty bits. >> >> Right. I'm just not sure that it's going to be as much of an >> optimization for TCG as it is for KVM. > > When qemu is loaded certainly this is lost in the noise. But when > idling (and as a bonus, the screen doesn't change much), tcg and kvm > will benefit equally. > >> >> Right now, in QEMU, code looks like this: >> >> for (i = 0; i < addr; i += TARGET_PAGE_SIZE) { >> if (cpu_p_m_g_dirty(i, DIRTY_FLAG)) { >> cpu_p_m_r_dirty(i, i + TARGET_PAGE_SIZE, DIRTY_FLAG); >> >> // do something with dirty memory >> } >> >> All we need to do is add another cpu_physical_memory_sync_dirty(i, i >> + REGION_SIZE, DIRTY_FLAG); that would go at the start of this. For >> QEMU, this is a nop since dirty bits are updated as soon as they are >> reset. For KVM, this would update the entire set of dirty bits for >> the given memory region. > > You mean merge the kvm bitmap into the qemu bitmap? 
That's what it > does now, no? > >> We also need something to enable dirty tracking for a particular >> region. We already have something for migration, we could perhaps >> extend that API (cpu_p_m_s_dirty_tracking). > > I don't think qemu would benefit much from per-region tracking, but > maybe I'm wrong. QEMU doesn't do per region tracking but instead, uses flags to simulate regions. At any rate, the suggestion for an API to enable dirty tracking was really for KVMs benefit. It's a nop for QEMU. Regards, Anthony Liguori ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 23:36 ` Anthony Liguori 2008-10-29 9:54 ` Avi Kivity @ 2008-10-29 13:51 ` Hollis Blanchard 2008-10-29 14:09 ` Avi Kivity 1 sibling, 1 reply; 34+ messages in thread From: Hollis Blanchard @ 2008-10-29 13:51 UTC (permalink / raw) To: qemu-devel Cc: Glauber Costa, Glauber Costa, Avi Kivity, kvm-devel, Gerd Hoffmann On Tue, Oct 28, 2008 at 6:36 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: > > Something I was thinking about this morning, and I think the first place > where we'll definitely need a hook, is how to deal with > kvm_load_registers(). I think there's overlap between KVM and the IO thread > here. > > There are two reasons (I can think of) that most of the device model code > can't run in conjunction with TCG. The first is that TCG may modify > CPUState in a non-atomic way. The device model may need to access CPUState > although there are very few places that it does. Out of curiosity, where are those places? -Hollis ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 13:51 ` Hollis Blanchard @ 2008-10-29 14:09 ` Avi Kivity 2008-10-29 14:16 ` Fabrice Bellard 2008-10-29 19:13 ` Blue Swirl 0 siblings, 2 replies; 34+ messages in thread From: Avi Kivity @ 2008-10-29 14:09 UTC (permalink / raw) To: hollis; +Cc: Glauber Costa, Glauber Costa, qemu-devel, kvm-devel, Gerd Hoffmann Hollis Blanchard wrote: > On Tue, Oct 28, 2008 at 6:36 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: > >> Something I was thinking about this morning, and I think the first place >> where we'll definitely need a hook, is how to deal with >> kvm_load_registers(). I think there's overlap between KVM and the IO thread >> here. >> >> There are two reasons (I can think of) that most of the device model code >> can't run in conjunction with TCG. The first is that TCG may modify >> CPUState in a non-atomic way. The device model may need to access CPUState >> although there are very few places that it does. >> > > Out of curiosity, where are those places? > local apic -- needs to access interrupt disable flag acpi sleep -- halts the current processor, so tied to cpustate vmport -- bad ABI requires access to registers -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 14:09 ` Avi Kivity @ 2008-10-29 14:16 ` Fabrice Bellard 2008-10-29 14:23 ` Anthony Liguori 2008-10-29 19:13 ` Blue Swirl 1 sibling, 1 reply; 34+ messages in thread From: Fabrice Bellard @ 2008-10-29 14:16 UTC (permalink / raw) To: qemu-devel; +Cc: Glauber Costa, hollis, Gerd Hoffmann, kvm-devel Avi Kivity wrote: > Hollis Blanchard wrote: >> On Tue, Oct 28, 2008 at 6:36 PM, Anthony Liguori >> <anthony@codemonkey.ws> wrote: >> >>> Something I was thinking about this morning, and I think the first place >>> where we'll definitely need a hook, is how to deal with >>> kvm_load_registers(). I think there's overlap between KVM and the IO >>> thread >>> here. >>> >>> There are two reasons (I can think of) that most of the device model >>> code >>> can't run in conjunction with TCG. The first is that TCG may modify >>> CPUState in a non-atomic way. The device model may need to access >>> CPUState >>> although there are very few places that it does. >>> >> >> Out of curiosity, where are those places? >> > > local apic -- needs to access interrupt disable flag > acpi sleep -- halts the current processor, so tied to cpustate > vmport -- bad ABI requires access to registers These accesses are the exception and should be done with specific CPU methods. IMHO, direct access to the CPU state should otherwise never be done from devices. Regards, Fabrice. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 14:16 ` Fabrice Bellard @ 2008-10-29 14:23 ` Anthony Liguori 0 siblings, 0 replies; 34+ messages in thread From: Anthony Liguori @ 2008-10-29 14:23 UTC (permalink / raw) To: qemu-devel; +Cc: Glauber Costa, hollis, Gerd Hoffmann, kvm-devel Fabrice Bellard wrote: > Avi Kivity wrote: > >> Hollis Blanchard wrote: >> >>> >>> Out of curiosity, where are those places? >>> >>> >> local apic -- needs to access interrupt disable flag >> acpi sleep -- halts the current processor, so tied to cpustate >> vmport -- bad ABI requires access to registers >> > > These accesses are the exception and should be done with specific CPU > methods. IMHO, direct access to the CPU state should otherwise never be > done from devices. > Yes, this is what I'm working on now. There are also non-devices that access CPUState but they are also special cases (gdbstub, monitor, save/restore). Regards, Anthony Liguori > Regards, > > Fabrice. > > > > ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 14:09 ` Avi Kivity 2008-10-29 14:16 ` Fabrice Bellard @ 2008-10-29 19:13 ` Blue Swirl 2008-11-01 16:25 ` Blue Swirl 1 sibling, 1 reply; 34+ messages in thread From: Blue Swirl @ 2008-10-29 19:13 UTC (permalink / raw) To: qemu-devel; +Cc: Glauber Costa, Glauber Costa, hollis, Gerd Hoffmann, kvm-devel On 10/29/08, Avi Kivity <avi@redhat.com> wrote: > Hollis Blanchard wrote: > > > On Tue, Oct 28, 2008 at 6:36 PM, Anthony Liguori <anthony@codemonkey.ws> > wrote: > > > > > > > Something I was thinking about this morning, and I think the first place > > > where we'll definitely need a hook, is how to deal with > > > kvm_load_registers(). I think there's overlap between KVM and the IO > thread > > > here. > > > > > > There are two reasons (I can think of) that most of the device model > code > > > can't run in conjunction with TCG. The first is that TCG may modify > > > CPUState in a non-atomic way. The device model may need to access > CPUState > > > although there are very few places that it does. > > > > > > > > > > Out of curiosity, where are those places? > > > > > > local apic -- needs to access interrupt disable flag > acpi sleep -- halts the current processor, so tied to cpustate It should be possible to avoid these, just use a qemu_irq for per-CPU interrupt lines and halt signals. > vmport -- bad ABI requires access to registers Ugly. Maybe there could be two parts, one in pc.c which registers the ioport and checks EAX/ECX, maybe using a CPU specific helper, and second part, generic port in vmport.c, which does not know about CPU state. I don't know if this would solve the original atomicity problem, though. ^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 19:13 ` Blue Swirl @ 2008-11-01 16:25 ` Blue Swirl 0 siblings, 0 replies; 34+ messages in thread From: Blue Swirl @ 2008-11-01 16:25 UTC (permalink / raw) To: qemu-devel; +Cc: Glauber Costa, hollis, Gerd Hoffmann, kvm-devel [-- Attachment #1: Type: text/plain, Size: 1608 bytes --] On 10/29/08, Blue Swirl <blauwirbel@gmail.com> wrote: > On 10/29/08, Avi Kivity <avi@redhat.com> wrote: > > Hollis Blanchard wrote: > > > > > On Tue, Oct 28, 2008 at 6:36 PM, Anthony Liguori <anthony@codemonkey.ws> > > wrote: > > > > > > > > > > Something I was thinking about this morning, and I think the first place > > > > where we'll definitely need a hook, is how to deal with > > > > kvm_load_registers(). I think there's overlap between KVM and the IO > > thread > > > > here. > > > > > > > > There are two reasons (I can think of) that most of the device model > > code > > > > can't run in conjunction with TCG. The first is that TCG may modify > > > > CPUState in a non-atomic way. The device model may need to access > > CPUState > > > > although there are very few places that it does. > > > > > > > > > > > > > > Out of curiosity, where are those places? > > > > > > > > > > local apic -- needs to access interrupt disable flag > > acpi sleep -- halts the current processor, so tied to cpustate > > > It should be possible to avoid these, just use a qemu_irq for per-CPU > interrupt lines and halt signals. Just for fun, I made a set of small patches that convert apic to avoid env access completely. Only lightly tested. The fourth patch (suppress_apic_env_use_accessors.diff) is probably not OK, because I changed the accessors to use the apic state as registered for the MMIO, previous implementation used instead the apic state associated with the currently running CPU. I couldn't find any CPU state handling in acpi.c, is this a KVM specific addition? 
[-- Attachment #2: suppress_apic_env_use_SIPI.diff --] [-- Type: plain/text, Size: 3840 bytes --] [-- Attachment #3: suppress_apic_env_use_reset_NMI_SMI.diff --] [-- Type: plain/text, Size: 5756 bytes --] [-- Attachment #4: suppress_apic_env_use_cpu.diff --] [-- Type: plain/text, Size: 7037 bytes --] [-- Attachment #5: suppress_apic_env_use_accessors.diff --] [-- Type: plain/text, Size: 4528 bytes --] ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:13 ` [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU Anthony Liguori ` (2 preceding siblings ...) 2008-10-28 21:41 ` [Qemu-devel] " Gerd Hoffmann @ 2008-10-29 14:58 ` Glauber Costa 2008-10-29 17:41 ` Glauber Costa 2008-11-04 13:24 ` Avi Kivity 5 siblings, 0 replies; 34+ messages in thread From: Glauber Costa @ 2008-10-29 14:58 UTC (permalink / raw) To: Anthony Liguori; +Cc: qemu-devel, kvm-devel, Avi Kivity On Tue, Oct 28, 2008 at 03:13:34PM -0500, Anthony Liguori wrote: > This patch adds very basic KVM support. KVM is a kernel module for Linux that > allows userspace programs to make use of hardware virtualization support. It > currently supports x86 hardware virtualization using Intel VT-x or AMD-V. It > also supports IA64 VT-i, PPC 440, and S390. > > This patch only implements the bare minimum support to get a guest booting. It > has very little impact on the rest of QEMU and attempts to integrate nicely with > the rest of QEMU. > > Even though this implementation is basic, it is significantly faster than TCG. > Booting and shutting down a Linux guest: > > w/TCG: 1:32.36 elapsed 84% CPU > > w/KVM: 0:31.14 elapsed 59% CPU > > Right now, KVM is disabled by default and must be explicitly enabled with > -enable-kvm. We can enable it by default later when we have had better > testing. 
> > Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> > > diff --git a/KVM_TODO b/KVM_TODO > new file mode 100644 > index 0000000..9529049 > --- /dev/null > +++ b/KVM_TODO > @@ -0,0 +1,9 @@ > +1) Add hooks for load/save of register state > + o Fixes gdbstub, save/restore, and vmport > +2) Add VGA optimization > +3) Add IO thread > +4) Add guest SMP support > +5) Add TPR optimization > +6) Add support for in-kernel APIC > +7) Add support for in-kernel PIT > +8) Merge in additional changes in kvm-userspace tree > diff --git a/Makefile.target b/Makefile.target > index e2edf9d..903d66d 100644 > --- a/Makefile.target > +++ b/Makefile.target > @@ -183,6 +183,9 @@ CFLAGS+=-I/opt/SUNWspro/prod/include/cc > endif > endif > > +kvm.o: CFLAGS+=$(KVM_CFLAGS) > +kvm-all.o: CFLAGS+=$(KVM_CFLAGS) > + > all: $(PROGS) > > ######################################################### > @@ -475,6 +478,9 @@ ifndef CONFIG_USER_ONLY > > OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o > OBJS+=fw_cfg.o aio.o buffered_file.o migration.o migration-tcp.o > +ifdef CONFIG_KVM > +OBJS+=kvm.o kvm-all.o > +endif > ifdef CONFIG_WIN32 > OBJS+=block-raw-win32.o > else > diff --git a/configure b/configure > index aefa69b..7aed99d 100755 > --- a/configure > +++ b/configure > @@ -113,6 +113,7 @@ aio="yes" > nptl="yes" > mixemu="no" > bluez="yes" > +kvm="yes" > > # OS specific > targetos=`uname -s` > @@ -300,6 +301,8 @@ for opt do > ;; > --disable-bluez) bluez="no" > ;; > + --disable-kvm) kvm="no" > + ;; > --enable-profiler) profiler="yes" > ;; > --enable-cocoa) > @@ -439,6 +442,7 @@ echo " --disable-brlapi disable BrlAPI" > echo " --disable-vnc-tls disable TLS encryption for VNC server" > echo " --disable-curses disable curses output" > echo " --disable-bluez disable bluez stack connectivity" > +echo " --disable-kvm disable KVM acceleration support" > echo " --disable-nptl disable usermode NPTL support" > echo " --enable-system enable all system emulation targets" > echo " 
--disable-system disable all system emulation targets" > @@ -933,6 +937,30 @@ EOF > fi > > ########################################## > +# kvm probe > +if test "$kvm" = "yes" ; then > + cat > $TMPC <<EOF > +#include <linux/kvm.h> > +#if !defined(KVM_API_VERSION) || \ > + KVM_API_VERSION < 12 || \ > + KVM_API_VERSION > 12 || \ > + !defined(KVM_CAP_USER_MEMORY) || \ > + !defined(KVM_CAP_SET_TSS_ADDR) > +#error Invalid KVM version > +#endif > +int main(void) { return 0; } > +EOF > + # FIXME make this configurable > + kvm_cflags=-I/lib/modules/`uname -r`/build/include > + if $cc $ARCH_CFLAGS -o $TMPE ${OS_CFLAGS} $kvm_cflags $TMPC \ > + 2>/dev/null ; then > + : > + else > + kvm="no" > + fi > +fi > + > +########################################## > # AIO probe > if test "$aio" = "yes" ; then > aio=no > @@ -1018,6 +1046,7 @@ echo "uname -r $uname_release" > echo "NPTL support $nptl" > echo "vde support $vde" > echo "AIO support $aio" > +echo "KVM support $kvm" > > if test $sdl_too_old = "yes"; then > echo "-> Your SDL version is too old - please upgrade to have SDL support" > @@ -1388,6 +1417,15 @@ interp_prefix1=`echo "$interp_prefix" | sed "s/%M/$target_cpu/g"` > echo "#define CONFIG_QEMU_PREFIX \"$interp_prefix1\"" >> $config_h > gdb_xml_files="" > > +# FIXME allow i386 to build on x86_64 and vice versa > +if test "$kvm" = "yes" -a "$target_cpu" != "$cpu" ; then > + kvm="no" > +fi > +# Disable KVM for linux-user > +if test "$kvm" = "yes" -a "$target_softmmu" = "no" ; then > + kvm="no" > +fi > + > case "$target_cpu" in > i386) > echo "TARGET_ARCH=i386" >> $config_mak > @@ -1397,6 +1435,11 @@ case "$target_cpu" in > then > echo "#define USE_KQEMU 1" >> $config_h > fi > + if test "$kvm" = "yes" ; then > + echo "CONFIG_KVM=yes" >> $config_mak > + echo "KVM_CFLAGS=$kvm_cflags" >> $config_mak > + echo "#define CONFIG_KVM" >> $config_h > + fi > gcc3minver=`$cc --version 2> /dev/null| fgrep "(GCC) 3." 
| awk '{ print $3 }' | cut -f2 -d.` > if test -n "$gcc3minver" && test $gcc3minver -gt 3 > then > @@ -1414,6 +1457,11 @@ case "$target_cpu" in > then > echo "#define USE_KQEMU 1" >> $config_h > fi > + if test "$kvm" = "yes" ; then > + echo "CONFIG_KVM=yes" >> $config_mak > + echo "KVM_CFLAGS=$kvm_cflags" >> $config_mak > + echo "#define CONFIG_KVM 1" >> $config_h > + fi > ;; > alpha) > echo "TARGET_ARCH=alpha" >> $config_mak > diff --git a/cpu-defs.h b/cpu-defs.h > index 5dcac74..46d4487 100644 > --- a/cpu-defs.h > +++ b/cpu-defs.h > @@ -142,6 +142,9 @@ typedef struct icount_decr_u16 { > } icount_decr_u16; > #endif > > +struct kvm_run; > +struct KVMState; > + > #define CPU_TEMP_BUF_NLONGS 128 > #define CPU_COMMON \ > struct TranslationBlock *current_tb; /* currently executing TB */ \ > @@ -199,6 +202,9 @@ typedef struct icount_decr_u16 { > /* user data */ \ > void *opaque; \ > \ > - const char *cpu_model_str; > + const char *cpu_model_str; \ > + struct KVMState *kvm_state; \ > + struct kvm_run *kvm_run; \ > + int kvm_fd; > > #endif > diff --git a/cpu-exec.c b/cpu-exec.c > index 6d4dcdd..04b3021 100644 > --- a/cpu-exec.c > +++ b/cpu-exec.c > @@ -22,6 +22,7 @@ > #include "exec.h" > #include "disas.h" > #include "tcg.h" > +#include "kvm.h" > > #if !defined(CONFIG_SOFTMMU) > #undef EAX > @@ -361,6 +362,19 @@ int cpu_exec(CPUState *env1) > } > #endif > > + if (kvm_enabled()) { > + int ret; > + ret = kvm_cpu_exec(env); > + if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) { > + env->interrupt_request &= ~CPU_INTERRUPT_EXIT; > + env->exception_index = EXCP_INTERRUPT; > + cpu_loop_exit(); > + } else if (env->halted) { > + cpu_loop_exit(); > + } else > + longjmp(env->jmp_env, 1); > + } > + > next_tb = 0; /* force lookup of first TB */ > for(;;) { > interrupt_request = env->interrupt_request; > diff --git a/exec.c b/exec.c > index f1fcec8..2623ac6 100644 > --- a/exec.c > +++ b/exec.c > @@ -39,6 +39,7 @@ > #include "tcg.h" > #include "hw/hw.h" > #include "osdep.h" > 
+#include "kvm.h" > #if defined(CONFIG_USER_ONLY) > #include <qemu.h> > #endif > @@ -2211,6 +2212,9 @@ void cpu_register_physical_memory(target_phys_addr_t start_addr, > kqemu_set_phys_mem(start_addr, size, phys_offset); > } > #endif > + if (kvm_enabled()) > + kvm_set_phys_mem(start_addr, size, phys_offset); > + > size = (size + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK; > end_addr = start_addr + (target_phys_addr_t)size; > for(addr = start_addr; addr != end_addr; addr += TARGET_PAGE_SIZE) { > diff --git a/hw/acpi.c b/hw/acpi.c > index 45963d3..66a5faa 100644 > --- a/hw/acpi.c > +++ b/hw/acpi.c > @@ -23,6 +23,7 @@ > #include "sysemu.h" > #include "i2c.h" > #include "smbus.h" > +#include "kvm.h" > > //#define DEBUG > > @@ -501,6 +502,12 @@ i2c_bus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base, > > register_ioport_write(ACPI_DBG_IO_ADDR, 4, 4, acpi_dbg_writel, s); > > + if (kvm_enabled()) { > + /* Mark SMM as already inited to prevent SMM from running. KVM does not > + * support SMM mode. */ > + pci_conf[0x5B] = 0x02; > + } > + > /* XXX: which specification is used ? The i82731AB has different > mappings */ > pci_conf[0x5f] = (parallel_hds[0] != NULL ? 0x80 : 0) | 0x10; > diff --git a/kvm-all.c b/kvm-all.c > new file mode 100644 > index 0000000..4379071 > --- /dev/null > +++ b/kvm-all.c > @@ -0,0 +1,377 @@ > +/* > + * QEMU KVM support > + * > + * Copyright IBM, Corp. 2008 > + * > + * Authors: > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + */ > + > +#include <sys/types.h> > +#include <sys/ioctl.h> > +#include <sys/mman.h> > + > +#include <linux/kvm.h> > + > +#include "qemu-common.h" > +#include "sysemu.h" > +#include "kvm.h" > + > +//#define DEBUG_KVM > + > +#ifdef DEBUG_KVM > +#define dprintf(fmt, ...) \ > + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) > +#else > +#define dprintf(fmt, ...) 
\ > + do { } while (0) > +#endif > + > +typedef struct kvm_userspace_memory_region KVMSlot; Actually, I don't think this typedef is a good idea. We may want to keep internal-only data tied to the slot, such as the slot's dirty bitmap if we do per-slot dirty tracking. Of course there may be other ways to do it, but this is the cleaner and more appropriate approach, since we're making a fresh start. > + > +int kvm_allowed = 0; > + > +struct KVMState > +{ > + KVMSlot slots[32]; > + int fd; > + int vmfd; > +}; > + > +static KVMState *kvm_state; > + > +static KVMSlot *kvm_alloc_slot(KVMState *s) > +{ > + int i; > + > + for (i = 0; i < ARRAY_SIZE(s->slots); i++) { > + if (s->slots[i].memory_size == 0) > + return &s->slots[i]; > + } > + > + return NULL; > +} > + > +static KVMSlot *kvm_lookup_slot(KVMState *s, target_phys_addr_t start_addr) > +{ > + int i; > + > + for (i = 0; i < ARRAY_SIZE(s->slots); i++) { > + KVMSlot *mem = &s->slots[i]; > + > + if (start_addr >= mem->guest_phys_addr && > + start_addr < (mem->guest_phys_addr + mem->memory_size)) > + return mem; > + } > + > + return NULL; > +} > + > +int kvm_init_vcpu(CPUState *env) > +{ > + KVMState *s = kvm_state; > + long mmap_size; > + int ret; > + > + dprintf("kvm_init_vcpu\n"); > + > + ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, > + (void *)(unsigned long)env->cpu_index); > + if (ret < 0) { > + dprintf("kvm_create_vcpu failed\n"); > + goto err; > + } > + > + env->kvm_fd = ret; > + env->kvm_state = s; > + > + mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); > + if (mmap_size < 0) { > + dprintf("KVM_GET_VCPU_MMAP_SIZE failed\n"); > + goto err; > + } > + > + env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, > + env->kvm_fd, 0); > + if (env->kvm_run == MAP_FAILED) { > + ret = -errno; > + dprintf("mmap'ing vcpu state failed\n"); > + goto err; > + } > + > + ret = kvm_arch_init_vcpu(env); > + > +err: > + return ret; > +} > + > +int kvm_init(int smp_cpus) > +{ > + KVMState *s; > + int ret; > + int i; > + > + if 
(smp_cpus > 1) > + return -EINVAL; > + > + s = qemu_mallocz(sizeof(KVMState)); > + if (s == NULL) > + return -ENOMEM; > + > + for (i = 0; i < ARRAY_SIZE(s->slots); i++) > + s->slots[i].slot = i; > + > + s->vmfd = -1; > + s->fd = open("/dev/kvm", O_RDWR); > + if (s->fd == -1) { > + fprintf(stderr, "Could not access KVM kernel module: %m\n"); > + ret = -errno; > + goto err; > + } > + > + ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); > + if (ret < KVM_API_VERSION) { > + if (ret > 0) > + ret = -EINVAL; > + fprintf(stderr, "kvm version too old\n"); > + goto err; > + } > + > + if (ret > KVM_API_VERSION) { > + ret = -EINVAL; > + fprintf(stderr, "kvm version not supported\n"); > + goto err; > + } > + > + s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0); > + if (s->vmfd < 0) > + goto err; > + > + /* initially, KVM allocated its own memory and we had to jump through > + * hooks to make phys_ram_base point to this. Modern versions of KVM > + * just use a user allocated buffer so we can use phys_ram_base > + * unmodified. Make sure we have a sufficiently modern version of KVM. 
> + */ > + ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY); > + if (ret <= 0) { > + if (ret == 0) > + ret = -EINVAL; > + fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n"); > + goto err; > + } > + > + ret = kvm_arch_init(s, smp_cpus); > + if (ret < 0) > + goto err; > + > + kvm_state = s; > + > + return 0; > + > +err: > + if (s) { > + if (s->vmfd != -1) > + close(s->vmfd); > + if (s->fd != -1) > + close(s->fd); > + } > + qemu_free(s); > + > + return ret; > +} > + > +static int kvm_handle_io(CPUState *env, uint16_t port, void *data, > + int direction, int size, uint32_t count) > +{ > + int i; > + uint8_t *ptr = data; > + > + for (i = 0; i < count; i++) { > + if (direction == KVM_EXIT_IO_IN) { > + switch (size) { > + case 1: > + stb_p(ptr, cpu_inb(env, port)); > + break; > + case 2: > + stw_p(ptr, cpu_inw(env, port)); > + break; > + case 4: > + stl_p(ptr, cpu_inl(env, port)); > + break; > + } > + } else { > + switch (size) { > + case 1: > + cpu_outb(env, port, ldub_p(ptr)); > + break; > + case 2: > + cpu_outw(env, port, lduw_p(ptr)); > + break; > + case 4: > + cpu_outl(env, port, ldl_p(ptr)); > + break; > + } > + } > + > + ptr += size; > + } > + > + return 1; > +} > + > +int kvm_cpu_exec(CPUState *env) > +{ > + struct kvm_run *run = env->kvm_run; > + int ret; > + > + dprintf("kvm_cpu_exec()\n"); > + > + do { > + kvm_arch_pre_run(env, run); > + > + if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) { > + dprintf("interrupt exit requested\n"); > + ret = 0; > + break; > + } > + > + dprintf("setting tpr\n"); > + run->cr8 = cpu_get_apic_tpr(env); > + > + ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > + kvm_arch_post_run(env, run); > + > + if (ret == -EINTR || ret == -EAGAIN) { > + dprintf("io window exit\n"); > + ret = 0; > + break; > + } > + > + if (ret < 0) { > + dprintf("kvm run failed %s\n", strerror(-ret)); > + abort(); > + } > + > + ret = 0; /* exit loop */ > + switch (run->exit_reason) { > + case KVM_EXIT_IO: > + dprintf("handle_io\n"); > + 
ret = kvm_handle_io(env, run->io.port, > + (uint8_t *)run + run->io.data_offset, > + run->io.direction, > + run->io.size, > + run->io.count); > + break; > + case KVM_EXIT_MMIO: > + dprintf("handle_mmio\n"); > + cpu_physical_memory_rw(run->mmio.phys_addr, > + run->mmio.data, > + run->mmio.len, > + run->mmio.is_write); > + ret = 1; > + break; > + case KVM_EXIT_IRQ_WINDOW_OPEN: > + dprintf("irq_window_open\n"); > + break; > + case KVM_EXIT_SHUTDOWN: > + dprintf("shutdown\n"); > + qemu_system_reset_request(); > + ret = 1; > + break; > + case KVM_EXIT_UNKNOWN: > + dprintf("kvm_exit_unknown\n"); > + break; > + case KVM_EXIT_FAIL_ENTRY: > + dprintf("kvm_exit_fail_entry\n"); > + break; > + case KVM_EXIT_EXCEPTION: > + dprintf("kvm_exit_exception\n"); > + break; > + case KVM_EXIT_DEBUG: > + dprintf("kvm_exit_debug\n"); > + break; > + default: > + dprintf("kvm_arch_handle_exit\n"); > + ret = kvm_arch_handle_exit(env, run); > + break; > + } > + } while (ret > 0); > + > + return ret; > +} > + > +void kvm_set_phys_mem(target_phys_addr_t start_addr, > + ram_addr_t size, > + ram_addr_t phys_offset) > +{ > + KVMState *s = kvm_state; > + ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK; > + KVMSlot *mem; > + > + /* KVM does not support read-only slots */ > + phys_offset &= ~IO_MEM_ROM; > + > + mem = kvm_lookup_slot(s, start_addr); > + if (mem) { > + if (flags == IO_MEM_UNASSIGNED) { > + mem->memory_size = 0; > + mem->guest_phys_addr = start_addr; > + mem->userspace_addr = 0; > + mem->flags = 0; > + > + kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem); > + } else if (start_addr >= mem->guest_phys_addr && > + (start_addr + size) <= (mem->guest_phys_addr + mem->memory_size)) > + return; > + } > + > + /* KVM does not need to know about this memory */ > + if (flags >= IO_MEM_UNASSIGNED) > + return; > + > + mem = kvm_alloc_slot(s); > + mem->memory_size = size; > + mem->guest_phys_addr = start_addr; > + mem->userspace_addr = (unsigned long)(phys_ram_base + phys_offset); > + mem->flags = 
0; > + > + kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem); > + /* FIXME deal with errors */ > +} > + > +int kvm_ioctl(KVMState *s, int type, void *data) > +{ > + int ret; > + > + ret = ioctl(s->fd, type, data); > + if (ret == -1) > + ret = -errno; > + > + return ret; > +} > + > +int kvm_vm_ioctl(KVMState *s, int type, void *data) > +{ > + int ret; > + > + ret = ioctl(s->vmfd, type, data); > + if (ret == -1) > + ret = -errno; > + > + return ret; > +} > + > +int kvm_vcpu_ioctl(CPUState *env, int type, void *data) > +{ > + int ret; > + > + ret = ioctl(env->kvm_fd, type, data); > + if (ret == -1) > + ret = -errno; > + > + return ret; > +} > diff --git a/kvm.h b/kvm.h > new file mode 100644 > index 0000000..37102b4 > --- /dev/null > +++ b/kvm.h > @@ -0,0 +1,68 @@ > +/* > + * QEMU KVM support > + * > + * Copyright IBM, Corp. 2008 > + * > + * Authors: > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. 
> + * > + */ > + > +#ifndef QEMU_KVM_H > +#define QEMU_KVM_H > + > +#include "config.h" > + > +#ifdef CONFIG_KVM > +extern int kvm_allowed; > + > +#define kvm_enabled() (kvm_allowed) > +#else > +#define kvm_enabled() (0) > +#endif > + > +struct kvm_run; > + > +/* external API */ > + > +int kvm_init(int smp_cpus); > + > +int kvm_init_vcpu(CPUState *env); > + > +int kvm_cpu_exec(CPUState *env); > + > +void kvm_set_phys_mem(target_phys_addr_t start_addr, > + ram_addr_t size, > + ram_addr_t phys_offset); > + > +/* internal API */ > + > +struct KVMState; > +typedef struct KVMState KVMState; > + > +int kvm_ioctl(KVMState *s, int type, void *data); > + > +int kvm_vm_ioctl(KVMState *s, int type, void *data); > + > +int kvm_vcpu_ioctl(CPUState *env, int type, void *data); > + > +/* Arch specific hooks */ > + > +int kvm_arch_post_run(CPUState *env, struct kvm_run *run); > + > +int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run); > + > +int kvm_arch_pre_run(CPUState *env, struct kvm_run *run); > + > +int kvm_arch_get_registers(CPUState *env); > + > +int kvm_arch_put_registers(CPUState *env); > + > +int kvm_arch_init(KVMState *s, int smp_cpus); > + > +int kvm_arch_init_vcpu(CPUState *env); > + > +#endif > diff --git a/monitor.c b/monitor.c > index f0a0bc3..dc90a2b 100644 > --- a/monitor.c > +++ b/monitor.c > @@ -37,6 +37,7 @@ > #include <dirent.h> > #include "qemu-timer.h" > #include "migration.h" > +#include "kvm.h" > > //#define DEBUG > //#define DEBUG_COMPLETION > @@ -1263,6 +1264,19 @@ static void do_info_kqemu(void) > #endif > } > > +static void do_info_kvm(void) > +{ > +#ifdef CONFIG_KVM > + term_printf("kvm support: "); > + if (kvm_enabled()) > + term_printf("enabled\n"); > + else > + term_printf("disabled\n"); > +#else > + term_printf("kvm support: not compiled\n"); > +#endif > +} > + > #ifdef CONFIG_PROFILER > > int64_t kqemu_time; > @@ -1495,6 +1509,8 @@ static const term_cmd_t info_cmds[] = { > "", "show dynamic compiler info", }, > { "kqemu", "", 
do_info_kqemu, > "", "show kqemu information", }, > + { "kvm", "", do_info_kvm, > + "", "show kvm information", }, > { "usb", "", usb_info, > "", "show guest USB devices", }, > { "usbhost", "", usb_host_info, > diff --git a/target-i386/cpu.h b/target-i386/cpu.h > index 263a477..167bae2 100644 > --- a/target-i386/cpu.h > +++ b/target-i386/cpu.h > @@ -587,6 +587,8 @@ typedef struct CPUX86State { > target_ulong kernelgsbase; > #endif > > + uint64_t tsc; > + > uint64_t pat; > > /* exception/interrupt handling */ > @@ -617,6 +619,10 @@ typedef struct CPUX86State { > int kqemu_enabled; > int last_io_time; > #endif > + > + /* For KVM */ > + uint64_t interrupt_bitmap[256 / 64]; > + > /* in order to simplify APIC support, we leave this pointer to the > user */ > struct APICState *apic_state; > diff --git a/target-i386/helper.c b/target-i386/helper.c > index 905ae9b..e550f74 100644 > --- a/target-i386/helper.c > +++ b/target-i386/helper.c > @@ -29,6 +29,7 @@ > #include "exec-all.h" > #include "svm.h" > #include "qemu-common.h" > +#include "kvm.h" > > //#define DEBUG_MMU > > @@ -115,6 +116,8 @@ CPUX86State *cpu_x86_init(const char *cpu_model) > #ifdef USE_KQEMU > kqemu_init(env); > #endif > + if (kvm_enabled()) > + kvm_init_vcpu(env); > return env; > } > > @@ -1288,6 +1291,40 @@ target_phys_addr_t cpu_get_phys_page_debug(CPUState *env, target_ulong addr) > } > #endif /* !CONFIG_USER_ONLY */ > > +#if defined(CONFIG_KVM) > +static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx, > + uint32_t *ecx, uint32_t *edx) > +{ > + uint32_t vec[4]; > + > +#ifdef __x86_64__ > + asm volatile("cpuid" > + : "=a"(vec[0]), "=b"(vec[1]), > + "=c"(vec[2]), "=d"(vec[3]) > + : "0"(function) : "cc"); > +#else > + asm volatile("pusha \n\t" > + "cpuid \n\t" > + "mov %%eax, 0(%1) \n\t" > + "mov %%ebx, 4(%1) \n\t" > + "mov %%ecx, 8(%1) \n\t" > + "mov %%edx, 12(%1) \n\t" > + "popa" > + : : "a"(function), "S"(vec) > + : "memory", "cc"); > +#endif > + > + if (eax) > + *eax = vec[0]; > + if 
(ebx) > + *ebx = vec[1]; > + if (ecx) > + *ecx = vec[2]; > + if (edx) > + *edx = vec[3]; > +} > +#endif > + > void cpu_x86_cpuid(CPUX86State *env, uint32_t index, > uint32_t *eax, uint32_t *ebx, > uint32_t *ecx, uint32_t *edx) > @@ -1307,12 +1344,23 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, > *ebx = env->cpuid_vendor1; > *edx = env->cpuid_vendor2; > *ecx = env->cpuid_vendor3; > + > + /* sysenter isn't supported on compatibility mode on AMD. and syscall > + * isn't supported in compatibility mode on Intel. so advertise the > + * actuall cpu, and say goodbye to migration between different vendors > + * is you use compatibility mode. */ > + if (kvm_enabled()) > + host_cpuid(0, NULL, ebx, ecx, edx); > break; > case 1: > *eax = env->cpuid_version; > *ebx = (env->cpuid_apic_id << 24) | 8 << 8; /* CLFLUSH size in quad words, Linux wants it. */ > *ecx = env->cpuid_ext_features; > *edx = env->cpuid_features; > + > + /* "Hypervisor present" bit required for Microsoft SVVP */ > + if (kvm_enabled()) > + *ecx |= (1 << 31); > break; > case 2: > /* cache info: needed for Pentium Pro compatibility */ > @@ -1390,6 +1438,31 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, > *ebx = 0; > *ecx = env->cpuid_ext3_features; > *edx = env->cpuid_ext2_features; > + > + if (kvm_enabled()) { > + uint32_t h_eax, h_edx; > + > + host_cpuid(0x80000001, &h_eax, NULL, NULL, &h_edx); > + > + /* disable CPU features that the host does not support */ > + > + /* long mode */ > + if ((h_edx & 0x20000000) == 0 /* || !lm_capable_kernel */) > + *edx &= ~0x20000000; > + /* syscall */ > + if ((h_edx & 0x00000800) == 0) > + *edx &= ~0x00000800; > + /* nx */ > + if ((h_edx & 0x00100000) == 0) > + *edx &= ~0x00100000; > + > + /* disable CPU features that KVM cannot support */ > + > + /* svm */ > + *ecx &= ~4UL; > + /* 3dnow */ > + *edx = ~0xc0000000; > + } > break; > case 0x80000002: > case 0x80000003: > diff --git a/target-i386/kvm.c b/target-i386/kvm.c > new file mode 100644 > index 
0000000..ff372af > --- /dev/null > +++ b/target-i386/kvm.c > @@ -0,0 +1,635 @@ > +/* > + * QEMU KVM support > + * > + * Copyright (C) 2006-2008 Qumranet Technologies > + * Copyright IBM, Corp. 2008 > + * > + * Authors: > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + */ > + > +#include <sys/types.h> > +#include <sys/ioctl.h> > +#include <sys/mman.h> > + > +#include <linux/kvm.h> > + > +#include "qemu-common.h" > +#include "sysemu.h" > +#include "kvm.h" > +#include "cpu.h" > + > +//#define DEBUG_KVM > + > +#ifdef DEBUG_KVM > +#define dprintf(fmt, ...) \ > + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) > +#else > +#define dprintf(fmt, ...) \ > + do { } while (0) > +#endif > + > +int kvm_arch_init_vcpu(CPUState *env) > +{ > + struct { > + struct kvm_cpuid cpuid; > + struct kvm_cpuid_entry entries[100]; > + } __attribute__((packed)) cpuid_data; > + int limit, i, cpuid_i; > + uint32_t eax, ebx, ecx, edx; > + > + cpuid_i = 0; > + > + cpu_x86_cpuid(env, 0, &eax, &ebx, &ecx, &edx); > + limit = eax; > + > + for (i = 0; i < limit; i++) { > + struct kvm_cpuid_entry *c = &cpuid_data.entries[cpuid_i++]; > + > + cpu_x86_cpuid(env, i, &eax, &ebx, &ecx, &edx); > + c->function = i; > + c->eax = eax; > + c->ebx = ebx; > + c->ecx = ecx; > + c->edx = edx; > + } > + > + cpu_x86_cpuid(env, 0x80000000, &eax, &ebx, &ecx, &edx); > + limit = eax; > + > + for (i = 0x80000000; i < limit; i++) { > + struct kvm_cpuid_entry *c = &cpuid_data.entries[cpuid_i++]; > + > + cpu_x86_cpuid(env, i, &eax, &ebx, &ecx, &edx); > + c->function = i; > + c->eax = eax; > + c->ebx = ebx; > + c->ecx = ecx; > + c->edx = edx; > + } > + > + cpuid_data.cpuid.nent = cpuid_i; > + > + return kvm_vcpu_ioctl(env, KVM_SET_CPUID, &cpuid_data); > +} > + > +static int kvm_has_msr_star(CPUState *env) > +{ > + static int has_msr_star; > + int ret; > + > + /* first time */ > + if 
(has_msr_star == 0) { > + struct kvm_msr_list msr_list, *kvm_msr_list; > + > + has_msr_star = -1; > + > + /* Obtain MSR list from KVM. These are the MSRs that we must > + * save/restore */ > + ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, &msr_list); > + if (ret < 0) > + return 0; > + > + msr_list.nmsrs = 0; > + kvm_msr_list = qemu_mallocz(sizeof(msr_list) + > + msr_list.nmsrs * sizeof(msr_list.indices[0])); > + if (kvm_msr_list == NULL) > + return 0; > + > + ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, kvm_msr_list); > + if (ret >= 0) { > + int i; > + > + for (i = 0; i < kvm_msr_list->nmsrs; i++) { > + if (kvm_msr_list->indices[i] == MSR_STAR) { > + has_msr_star = 1; > + break; > + } > + } > + } > + > + free(kvm_msr_list); > + } > + > + if (has_msr_star == 1) > + return 1; > + return 0; > +} > + > +int kvm_arch_init(KVMState *s, int smp_cpus) > +{ > + int ret; > + > + /* create vm86 tss. KVM uses vm86 mode to emulate 16-bit code > + * directly. In order to use vm86 mode, a TSS is needed. Since this > + * must be part of guest physical memory, we need to allocate it. Older > + * versions of KVM just assumed that it would be at the end of physical > + * memory but that doesn't work with more than 4GB of memory. We simply > + * refuse to work with those older versions of KVM. */ > + ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, (void *)KVM_CAP_SET_TSS_ADDR); > + if (ret <= 0) { > + fprintf(stderr, "kvm does not support KVM_CAP_SET_TSS_ADDR\n"); > + return ret; > + } > + > + /* this address is 3 pages before the bios, and the bios should present > + * as unavaible memory. FIXME, need to ensure the e820 map deals with > + * this? 
> + */ > + return kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, (void *)0xfffbd000); > +} > + > +static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs) > +{ > + lhs->selector = rhs->selector; > + lhs->base = rhs->base; > + lhs->limit = rhs->limit; > + lhs->type = 3; > + lhs->present = 1; > + lhs->dpl = 3; > + lhs->db = 0; > + lhs->s = 1; > + lhs->l = 0; > + lhs->g = 0; > + lhs->avl = 0; > + lhs->unusable = 0; > +} > + > +static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs) > +{ > + unsigned flags = rhs->flags; > + lhs->selector = rhs->selector; > + lhs->base = rhs->base; > + lhs->limit = rhs->limit; > + lhs->type = (flags >> DESC_TYPE_SHIFT) & 15; > + lhs->present = (flags & DESC_P_MASK) != 0; > + lhs->dpl = rhs->selector & 3; > + lhs->db = (flags >> DESC_B_SHIFT) & 1; > + lhs->s = (flags & DESC_S_MASK) != 0; > + lhs->l = (flags >> DESC_L_SHIFT) & 1; > + lhs->g = (flags & DESC_G_MASK) != 0; > + lhs->avl = (flags & DESC_AVL_MASK) != 0; > + lhs->unusable = 0; > +} > + > +static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) > +{ > + lhs->selector = rhs->selector; > + lhs->base = rhs->base; > + lhs->limit = rhs->limit; > + lhs->flags = > + (rhs->type << DESC_TYPE_SHIFT) > + | (rhs->present * DESC_P_MASK) > + | (rhs->dpl << DESC_DPL_SHIFT) > + | (rhs->db << DESC_B_SHIFT) > + | (rhs->s * DESC_S_MASK) > + | (rhs->l << DESC_L_SHIFT) > + | (rhs->g * DESC_G_MASK) > + | (rhs->avl * DESC_AVL_MASK); > +} > + > +static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set) > +{ > + if (set) > + *kvm_reg = *qemu_reg; > + else > + *qemu_reg = *kvm_reg; > +} > + > +static int kvm_getput_regs(CPUState *env, int set) > +{ > + struct kvm_regs regs; > + int ret = 0; > + > + if (!set) { > + ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, ®s); > + if (ret < 0) > + return ret; > + } > + > + kvm_getput_reg(®s.rax, &env->regs[R_EAX], set); > + kvm_getput_reg(®s.rbx, &env->regs[R_EBX], set); > + kvm_getput_reg(®s.rcx, &env->regs[R_ECX], set); 
> + kvm_getput_reg(®s.rdx, &env->regs[R_EDX], set); > + kvm_getput_reg(®s.rsi, &env->regs[R_ESI], set); > + kvm_getput_reg(®s.rdi, &env->regs[R_EDI], set); > + kvm_getput_reg(®s.rsp, &env->regs[R_ESP], set); > + kvm_getput_reg(®s.rbp, &env->regs[R_EBP], set); > +#ifdef TARGET_X86_64 > + kvm_getput_reg(®s.r8, &env->regs[8], set); > + kvm_getput_reg(®s.r9, &env->regs[9], set); > + kvm_getput_reg(®s.r10, &env->regs[10], set); > + kvm_getput_reg(®s.r11, &env->regs[11], set); > + kvm_getput_reg(®s.r12, &env->regs[12], set); > + kvm_getput_reg(®s.r13, &env->regs[13], set); > + kvm_getput_reg(®s.r14, &env->regs[14], set); > + kvm_getput_reg(®s.r15, &env->regs[15], set); > +#endif > + > + kvm_getput_reg(®s.rflags, &env->eflags, set); > + kvm_getput_reg(®s.rip, &env->eip, set); > + > + if (set) > + ret = kvm_vcpu_ioctl(env, KVM_SET_REGS, ®s); > + > + return ret; > +} > + > +static int kvm_put_fpu(CPUState *env) > +{ > + struct kvm_fpu fpu; > + int i; > + > + memset(&fpu, 0, sizeof fpu); > + fpu.fsw = env->fpus & ~(7 << 11); > + fpu.fsw |= (env->fpstt & 7) << 11; > + fpu.fcw = env->fpuc; > + for (i = 0; i < 8; ++i) > + fpu.ftwx |= (!env->fptags[i]) << i; > + memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs); > + memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs); > + fpu.mxcsr = env->mxcsr; > + > + return kvm_vcpu_ioctl(env, KVM_SET_FPU, &fpu); > +} > + > +static int kvm_put_sregs(CPUState *env) > +{ > + struct kvm_sregs sregs; > + > + memcpy(sregs.interrupt_bitmap, > + env->interrupt_bitmap, > + sizeof(sregs.interrupt_bitmap)); > + > + if ((env->eflags & VM_MASK)) { > + set_v8086_seg(&sregs.cs, &env->segs[R_CS]); > + set_v8086_seg(&sregs.ds, &env->segs[R_DS]); > + set_v8086_seg(&sregs.es, &env->segs[R_ES]); > + set_v8086_seg(&sregs.fs, &env->segs[R_FS]); > + set_v8086_seg(&sregs.gs, &env->segs[R_GS]); > + set_v8086_seg(&sregs.ss, &env->segs[R_SS]); > + } else { > + set_seg(&sregs.cs, &env->segs[R_CS]); > + set_seg(&sregs.ds, &env->segs[R_DS]); > + set_seg(&sregs.es, 
&env->segs[R_ES]); > + set_seg(&sregs.fs, &env->segs[R_FS]); > + set_seg(&sregs.gs, &env->segs[R_GS]); > + set_seg(&sregs.ss, &env->segs[R_SS]); > + > + if (env->cr[0] & CR0_PE_MASK) { > + /* force ss cpl to cs cpl */ > + sregs.ss.selector = (sregs.ss.selector & ~3) | > + (sregs.cs.selector & 3); > + sregs.ss.dpl = sregs.ss.selector & 3; > + } > + } > + > + set_seg(&sregs.tr, &env->tr); > + set_seg(&sregs.ldt, &env->ldt); > + > + sregs.idt.limit = env->idt.limit; > + sregs.idt.base = env->idt.base; > + sregs.gdt.limit = env->gdt.limit; > + sregs.gdt.base = env->gdt.base; > + > + sregs.cr0 = env->cr[0]; > + sregs.cr2 = env->cr[2]; > + sregs.cr3 = env->cr[3]; > + sregs.cr4 = env->cr[4]; > + > + sregs.cr8 = cpu_get_apic_tpr(env); > + sregs.apic_base = cpu_get_apic_base(env); > + > + sregs.efer = env->efer; > + > + return kvm_vcpu_ioctl(env, KVM_SET_SREGS, &sregs); > +} > + > +static void kvm_msr_entry_set(struct kvm_msr_entry *entry, > + uint32_t index, uint64_t value) > +{ > + entry->index = index; > + entry->data = value; > +} > + > +static int kvm_put_msrs(CPUState *env) > +{ > + struct { > + struct kvm_msrs info; > + struct kvm_msr_entry entries[100]; > + } msr_data; > + struct kvm_msr_entry *msrs = msr_data.entries; > + int n = 0; > + > + kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs); > + kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp); > + kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip); > + if (kvm_has_msr_star(env)) > + kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star); > + kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc); > +#ifdef TARGET_X86_64 > + /* FIXME if lm capable */ > + kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar); > + kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase); > + kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask); > + kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar); > +#endif > + msr_data.info.nmsrs = n; > + > + return 
kvm_vcpu_ioctl(env, KVM_SET_MSRS, &msr_data); > + > +} > + > + > +static int kvm_get_fpu(CPUState *env) > +{ > + struct kvm_fpu fpu; > + int i, ret; > + > + ret = kvm_vcpu_ioctl(env, KVM_GET_FPU, &fpu); > + if (ret < 0) > + return ret; > + > + env->fpstt = (fpu.fsw >> 11) & 7; > + env->fpus = fpu.fsw; > + env->fpuc = fpu.fcw; > + for (i = 0; i < 8; ++i) > + env->fptags[i] = !((fpu.ftwx >> i) & 1); > + memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs); > + memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs); > + env->mxcsr = fpu.mxcsr; > + > + return 0; > +} > + > +static int kvm_get_sregs(CPUState *env) > +{ > + struct kvm_sregs sregs; > + uint32_t hflags; > + int ret; > + > + ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs); > + if (ret < 0) > + return ret; > + > + memcpy(env->interrupt_bitmap, > + sregs.interrupt_bitmap, > + sizeof(sregs.interrupt_bitmap)); > + > + get_seg(&env->segs[R_CS], &sregs.cs); > + get_seg(&env->segs[R_DS], &sregs.ds); > + get_seg(&env->segs[R_ES], &sregs.es); > + get_seg(&env->segs[R_FS], &sregs.fs); > + get_seg(&env->segs[R_GS], &sregs.gs); > + get_seg(&env->segs[R_SS], &sregs.ss); > + > + get_seg(&env->tr, &sregs.tr); > + get_seg(&env->ldt, &sregs.ldt); > + > + env->idt.limit = sregs.idt.limit; > + env->idt.base = sregs.idt.base; > + env->gdt.limit = sregs.gdt.limit; > + env->gdt.base = sregs.gdt.base; > + > + env->cr[0] = sregs.cr0; > + env->cr[2] = sregs.cr2; > + env->cr[3] = sregs.cr3; > + env->cr[4] = sregs.cr4; > + > + cpu_set_apic_base(env, sregs.apic_base); > + > + env->efer = sregs.efer; > + //cpu_set_apic_tpr(env, sregs.cr8); > + > +#define HFLAG_COPY_MASK ~( \ > + HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \ > + HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \ > + HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \ > + HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK) > + > + > + > + hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK; > + hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - 
CR0_PE_SHIFT); > + hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) & > + (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK); > + hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK)); > + hflags |= (env->cr[4] & CR4_OSFXSR_MASK) << > + (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT); > + > + if (env->efer & MSR_EFER_LMA) { > + hflags |= HF_LMA_MASK; > + } > + > + if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) { > + hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK; > + } else { > + hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >> > + (DESC_B_SHIFT - HF_CS32_SHIFT); > + hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >> > + (DESC_B_SHIFT - HF_SS32_SHIFT); > + if (!(env->cr[0] & CR0_PE_MASK) || > + (env->eflags & VM_MASK) || > + !(hflags & HF_CS32_MASK)) { > + hflags |= HF_ADDSEG_MASK; > + } else { > + hflags |= ((env->segs[R_DS].base | > + env->segs[R_ES].base | > + env->segs[R_SS].base) != 0) << > + HF_ADDSEG_SHIFT; > + } > + } > + env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags; > + env->cc_src = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); > + env->df = 1 - (2 * ((env->eflags >> 10) & 1)); > + env->cc_op = CC_OP_EFLAGS; > + env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); > + > + return 0; > +} > + > +static int kvm_get_msrs(CPUState *env) > +{ > + struct { > + struct kvm_msrs info; > + struct kvm_msr_entry entries[100]; > + } msr_data; > + struct kvm_msr_entry *msrs = msr_data.entries; > + int ret, i, n; > + > + n = 0; > + msrs[n++].index = MSR_IA32_SYSENTER_CS; > + msrs[n++].index = MSR_IA32_SYSENTER_ESP; > + msrs[n++].index = MSR_IA32_SYSENTER_EIP; > + if (kvm_has_msr_star(env)) > + msrs[n++].index = MSR_STAR; > + msrs[n++].index = MSR_IA32_TSC; > +#ifdef TARGET_X86_64 > + /* FIXME lm_capable_kernel */ > + msrs[n++].index = MSR_CSTAR; > + msrs[n++].index = MSR_KERNELGSBASE; > + msrs[n++].index = MSR_FMASK; > + msrs[n++].index = MSR_LSTAR; > +#endif > + msr_data.info.nmsrs = n; > + ret = 
kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data); > + if (ret < 0) > + return ret; > + > + for (i = 0; i < ret; i++) { > + switch (msrs[i].index) { > + case MSR_IA32_SYSENTER_CS: > + env->sysenter_cs = msrs[i].data; > + break; > + case MSR_IA32_SYSENTER_ESP: > + env->sysenter_esp = msrs[i].data; > + break; > + case MSR_IA32_SYSENTER_EIP: > + env->sysenter_eip = msrs[i].data; > + break; > + case MSR_STAR: > + env->star = msrs[i].data; > + break; > +#ifdef TARGET_X86_64 > + case MSR_CSTAR: > + env->cstar = msrs[i].data; > + break; > + case MSR_KERNELGSBASE: > + env->kernelgsbase = msrs[i].data; > + break; > + case MSR_FMASK: > + env->fmask = msrs[i].data; > + break; > + case MSR_LSTAR: > + env->lstar = msrs[i].data; > + break; > +#endif > + case MSR_IA32_TSC: > + env->tsc = msrs[i].data; > + break; > + } > + } > + > + return 0; > +} > + > +int kvm_arch_put_registers(CPUState *env) > +{ > + int ret; > + > + ret = kvm_getput_regs(env, 1); > + if (ret < 0) > + return ret; > + > + ret = kvm_put_fpu(env); > + if (ret < 0) > + return ret; > + > + ret = kvm_put_sregs(env); > + if (ret < 0) > + return ret; > + > + ret = kvm_put_msrs(env); > + if (ret < 0) > + return ret; > + > + return 0; > +} > + > +int kvm_arch_get_registers(CPUState *env) > +{ > + int ret; > + > + ret = kvm_getput_regs(env, 0); > + if (ret < 0) > + return ret; > + > + ret = kvm_get_fpu(env); > + if (ret < 0) > + return ret; > + > + ret = kvm_get_sregs(env); > + if (ret < 0) > + return ret; > + > + ret = kvm_get_msrs(env); > + if (ret < 0) > + return ret; > + > + return 0; > +} > + > +int kvm_arch_pre_run(CPUState *env, struct kvm_run *run) > +{ > + /* Try to inject an interrupt if the guest can accept it */ > + if (run->ready_for_interrupt_injection && > + (env->interrupt_request & CPU_INTERRUPT_HARD) && > + (env->eflags & IF_MASK)) { > + int irq; > + > + env->interrupt_request &= ~CPU_INTERRUPT_HARD; > + irq = cpu_get_pic_interrupt(env); > + if (irq >= 0) { > + struct kvm_interrupt intr; > + intr.irq = irq; 
> + /* FIXME: errors */ > + dprintf("injected interrupt %d\n", irq); > + kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr); > + } > + } > + > + /* If we have an interrupt but the guest is not ready to receive an > + * interrupt, request an interrupt window exit. This will > + * cause a return to userspace as soon as the guest is ready to > + * receive interrupts. */ > + if ((env->interrupt_request & CPU_INTERRUPT_HARD)) > + run->request_interrupt_window = 1; > + else > + run->request_interrupt_window = 0; > + > + return 0; > +} > + > +int kvm_arch_post_run(CPUState *env, struct kvm_run *run) > +{ > + if (run->if_flag) > + env->eflags |= IF_MASK; > + else > + env->eflags &= ~IF_MASK; > + > + cpu_set_apic_tpr(env, run->cr8); > + cpu_set_apic_base(env, run->apic_base); > + > + return 0; > +} > + > +static int kvm_handle_halt(CPUState *env) > +{ > + if (!((env->interrupt_request & CPU_INTERRUPT_HARD) && > + (env->eflags & IF_MASK)) && > + !(env->interrupt_request & CPU_INTERRUPT_NMI)) { > + env->halted = 1; > + env->exception_index = EXCP_HLT; > + return 0; > + } > + > + return 1; > +} > + > +int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run) > +{ > + int ret = 0; > + > + switch (run->exit_reason) { > + case KVM_EXIT_HLT: > + dprintf("handle_hlt\n"); > + ret = kvm_handle_halt(env); > + break; > + } > + > + return ret; > +} > diff --git a/vl.c b/vl.c > index 74ae652..ecda8d5 100644 > --- a/vl.c > +++ b/vl.c > @@ -39,6 +39,7 @@ > #include "block.h" > #include "audio/audio.h" > #include "migration.h" > +#include "kvm.h" > > #include <unistd.h> > #include <fcntl.h> > @@ -8258,6 +8259,9 @@ static void help(int exitcode) > "-kernel-kqemu enable KQEMU full virtualization (default is user mode only)\n" > "-no-kqemu disable KQEMU kernel module usage\n" > #endif > +#ifdef CONFIG_KVM > + "-enable-kvm enable KVM full virtualization support\n" > +#endif > #ifdef TARGET_I386 > "-no-acpi disable ACPI\n" > #endif > @@ -8363,6 +8367,7 @@ enum { > QEMU_OPTION_pidfile, > 
QEMU_OPTION_no_kqemu, > QEMU_OPTION_kernel_kqemu, > + QEMU_OPTION_enable_kvm, > QEMU_OPTION_win2k_hack, > QEMU_OPTION_usb, > QEMU_OPTION_usbdevice, > @@ -8449,6 +8454,9 @@ static const QEMUOption qemu_options[] = { > { "no-kqemu", 0, QEMU_OPTION_no_kqemu }, > { "kernel-kqemu", 0, QEMU_OPTION_kernel_kqemu }, > #endif > +#ifdef CONFIG_KVM > + { "enable-kvm", 0, QEMU_OPTION_enable_kvm }, > +#endif > #if defined(TARGET_PPC) || defined(TARGET_SPARC) > { "g", 1, QEMU_OPTION_g }, > #endif > @@ -9271,6 +9279,14 @@ int main(int argc, char **argv) > kqemu_allowed = 2; > break; > #endif > +#ifdef CONFIG_KVM > + case QEMU_OPTION_enable_kvm: > + kvm_allowed = 1; > +#ifdef USE_KQEMU > + kqemu_allowed = 0; > +#endif > + break; > +#endif > case QEMU_OPTION_usb: > usb_enabled = 1; > break; > @@ -9405,6 +9421,14 @@ int main(int argc, char **argv) > } > } > > +#if defined(CONFIG_KVM) && defined(USE_KQEMU) > + if (kvm_allowed && kqemu_allowed) { > + fprintf(stderr, > + "You can not enable both KVM and kqemu at the same time\n"); > + exit(1); > + } > +#endif > + > if (smp_cpus > machine->max_cpus) { > fprintf(stderr, "Number of SMP cpus requested (%d), exceeds max cpus " > "supported by machine `%s' (%d)\n", smp_cpus, machine->name, > @@ -9710,6 +9734,16 @@ int main(int argc, char **argv) > } > } > > + if (kvm_enabled()) { > + int ret; > + > + ret = kvm_init(smp_cpus); > + if (ret < 0) { > + fprintf(stderr, "failed to initialize KVM\n"); > + exit(1); > + } > + } > + > machine->init(ram_size, vga_ram_size, boot_devices, ds, > kernel_filename, kernel_cmdline, initrd_filename, cpu_model); > ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:13 ` [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU Anthony Liguori ` (3 preceding siblings ...) 2008-10-29 14:58 ` Glauber Costa @ 2008-10-29 17:41 ` Glauber Costa 2008-10-29 19:01 ` Anthony Liguori 2008-11-04 13:24 ` Avi Kivity 5 siblings, 1 reply; 34+ messages in thread From: Glauber Costa @ 2008-10-29 17:41 UTC (permalink / raw) To: Anthony Liguori; +Cc: qemu-devel, kvm-devel, Avi Kivity resending, now with less quoted text: > diff --git a/kvm-all.c b/kvm-all.c > new file mode 100644 > index 0000000..4379071 > --- /dev/null > +++ b/kvm-all.c > @@ -0,0 +1,377 @@ > +/* > + * QEMU KVM support > + * > + * Copyright IBM, Corp. 2008 > + * > + * Authors: > + * Anthony Liguori <aliguori@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + */ > + > +#include <sys/types.h> > +#include <sys/ioctl.h> > +#include <sys/mman.h> > + > +#include <linux/kvm.h> > + > +#include "qemu-common.h" > +#include "sysemu.h" > +#include "kvm.h" > + > +//#define DEBUG_KVM > + > +#ifdef DEBUG_KVM > +#define dprintf(fmt, ...) \ > + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) > +#else > +#define dprintf(fmt, ...) \ > + do { } while (0) > +#endif > + > +typedef struct kvm_userspace_memory_region KVMSlot; Actually, I don't think it is a good idea. We may want to keep internal-only data tied to the slot, such as the slot's dirty bitmap if we do per-slot dirty tracking. Of course there may be other ways to do it, but this is the cleaner and more adequate approach, since we're going through a fresh start. ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-29 17:41 ` Glauber Costa @ 2008-10-29 19:01 ` Anthony Liguori 0 siblings, 0 replies; 34+ messages in thread From: Anthony Liguori @ 2008-10-29 19:01 UTC (permalink / raw) To: Glauber Costa; +Cc: qemu-devel, kvm-devel, Avi Kivity Glauber Costa wrote: > resending, now with less quoted text: > Yeah, I was intending to scold you about that ;-) >> diff --git a/kvm-all.c b/kvm-all.c >> new file mode 100644 >> index 0000000..4379071 >> --- /dev/null >> +++ b/kvm-all.c >> @@ -0,0 +1,377 @@ >> +/* >> + * QEMU KVM support >> + * >> + * Copyright IBM, Corp. 2008 >> + * >> + * Authors: >> + * Anthony Liguori <aliguori@us.ibm.com> >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2. See >> + * the COPYING file in the top-level directory. >> + * >> + */ >> + >> +#include <sys/types.h> >> +#include <sys/ioctl.h> >> +#include <sys/mman.h> >> + >> +#include <linux/kvm.h> >> + >> +#include "qemu-common.h" >> +#include "sysemu.h" >> +#include "kvm.h" >> + >> +//#define DEBUG_KVM >> + >> +#ifdef DEBUG_KVM >> +#define dprintf(fmt, ...) \ >> + do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) >> +#else >> +#define dprintf(fmt, ...) \ >> + do { } while (0) >> +#endif >> + >> +typedef struct kvm_userspace_memory_region KVMSlot; >> > > Actually, I don't think it is a good idea. > We may want to keep internal-only data tied to the slot, such the slot's > dirty bitmap if we do per-slot dirty tracking. > > Of course there may be other ways to do it, but this is the cleaner > and more adequate since we're going through a fresh start. > For now, I think it's fine. Switching to making it a proper structure is certainly a good idea when we get more stuff per slot. Regards, Anthony Liguori ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-10-28 20:13 ` [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU Anthony Liguori ` (4 preceding siblings ...) 2008-10-29 17:41 ` Glauber Costa @ 2008-11-04 13:24 ` Avi Kivity 2008-11-04 14:02 ` Anthony Liguori 5 siblings, 1 reply; 34+ messages in thread From: Avi Kivity @ 2008-11-04 13:24 UTC (permalink / raw) To: Anthony Liguori; +Cc: Glauber Costa, qemu-devel, kvm-devel Anthony Liguori wrote: > This patch adds very basic KVM support. KVM is a kernel module for Linux that > allows userspace programs to make use of hardware virtualization support. It > currently supports x86 hardware virtualization using Intel VT-x or AMD-V. It > also supports IA64 VT-i, PPC 440, and S390. > > This patch only implements the bare minimum support to get a guest booting. It > has very little impact on the rest of QEMU and attempts to integrate nicely with > the rest of QEMU. > > Even though this implementation is basic, it is significantly faster than TCG. > Booting and shutting down a Linux guest: > > w/TCG: 1:32.36 elapsed 84% CPU > > w/KVM: 0:31.14 elapsed 59% CPU > > Right now, KVM is disabled by default and must be explicitly enabled with > -enable-kvm. We can enable it by default later when we have had better > testing. > > Signed-off-by: Anthony Liguori <aliguori@us.ibm.com> > > diff --git a/KVM_TODO b/KVM_TODO > new file mode 100644 > index 0000000..9529049 > --- /dev/null > +++ b/KVM_TODO > @@ -0,0 +1,9 @@ > +1) Add hooks for load/save of register state > + o Fixes gdbstub, save/restore, and vmport > +2) Add VGA optimization > +3) Add IO thread > +4) Add guest SMP support > +5) Add TPR optimization > +6) Add support for in-kernel APIC > +7) Add support for in-kernel PIT > +8) Merge in additional changes in kvm-userspace tree > One of the important changes is running with signal delivery disabled, since that's particularly slow (requires save/restore of the floating point state, for example). 
> + > +typedef struct kvm_userspace_memory_region KVMSlot; > KVMMemorySlot? > + > +static KVMState *kvm_state; > Why a pointer? > + if (ret < 0) { > + dprintf("kvm_create_vcpu failed\n"); > showing errno would be nice. > + > +static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set) > +{ > + if (set) > + *kvm_reg = *qemu_reg; > + else > + *qemu_reg = *kvm_reg; > +} > Ugh. I think live migration is now broken, since kvm accesses will not update the qemu dirty bitmap. -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-11-04 13:24 ` Avi Kivity @ 2008-11-04 14:02 ` Anthony Liguori 2008-11-04 14:46 ` Avi Kivity 0 siblings, 1 reply; 34+ messages in thread From: Anthony Liguori @ 2008-11-04 14:02 UTC (permalink / raw) To: Avi Kivity; +Cc: Glauber Costa, qemu-devel, kvm-devel Avi Kivity wrote: >> >> +8) Merge in additional changes in kvm-userspace tree >> > > One of the important changes is running with signal delivery disabled, > since that's particularly slow (requires save/restore of the floating > point state, for example). I think we need the IO thread first. We're slowly getting there. >> + >> +typedef struct kvm_userspace_memory_region KVMSlot; >> > > KVMMemorySlot? Sure. >> + >> +static KVMState *kvm_state; >> > > Why a pointer? I would like to avoid having the global state to begin with. >> + if (ret < 0) { >> + dprintf("kvm_create_vcpu failed\n"); >> > > showing errno would be nice. Indeed. >> + >> +static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, >> int set) >> +{ >> + if (set) >> + *kvm_reg = *qemu_reg; >> + else >> + *qemu_reg = *kvm_reg; >> +} >> > > Ugh. > > I think live migration is now broken, since kvm accesses will not > update the qemu dirty bitmap. Eh? I don't follow you here. Regards, Anthony Liguori ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-11-04 14:02 ` Anthony Liguori @ 2008-11-04 14:46 ` Avi Kivity 2008-11-04 14:50 ` Anthony Liguori 0 siblings, 1 reply; 34+ messages in thread From: Avi Kivity @ 2008-11-04 14:46 UTC (permalink / raw) To: Anthony Liguori; +Cc: Glauber Costa, qemu-devel, kvm-devel Anthony Liguori wrote: >> I think live migration is now broken, since kvm accesses will not >> update the qemu dirty bitmap. > > Eh? I don't follow you here. > guest writes in qemu set the qemu dirty bitmap (for cpu_physical_memory_get_dirty()), but guest accesses in kvm won't, unless you enable dirty tracking and synchronize kvm's bitmap to qemu's. See the changes to migration.c in kvm's qemu. -- error compiling committee.c: too many arguments to function ^ permalink raw reply [flat|nested] 34+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Add KVM support to QEMU 2008-11-04 14:46 ` Avi Kivity @ 2008-11-04 14:50 ` Anthony Liguori 0 siblings, 0 replies; 34+ messages in thread From: Anthony Liguori @ 2008-11-04 14:50 UTC (permalink / raw) To: Avi Kivity; +Cc: Glauber Costa, qemu-devel, kvm-devel Avi Kivity wrote: > Anthony Liguori wrote: > >>> I think live migration is now broken, since kvm accesses will not >>> update the qemu dirty bitmap. >> >> Eh? I don't follow you here. >> > > guest writes in qemu set the qemu dirty bitmap (for > cpu_physical_memory_get_dirty()), but guest accesses in kvm won't, > unless you enable dirty tracking and synchronize kvm's bitmap to > qemu's. See the changes to migration.c in kvm's qemu. Oh yes, that falls under the save/restore TODO item. Regards, Anthony Liguori ^ permalink raw reply [flat|nested] 34+ messages in thread
end of thread, other threads:[~2008-11-04 14:51 UTC | newest] Thread overview: 34+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2008-10-28 20:13 [Qemu-devel] [PATCH 1/3] Add additional CPU flag definitions Anthony Liguori 2008-10-28 20:13 ` [Qemu-devel] [PATCH 2/3] Split CPUID from op_helper Anthony Liguori 2008-10-28 20:13 ` [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU Anthony Liguori 2008-10-28 20:49 ` Hollis Blanchard 2008-10-28 21:10 ` Anthony Liguori 2008-10-28 20:57 ` Andreas Färber 2008-10-28 21:04 ` Glauber Costa 2008-10-28 21:16 ` Anthony Liguori 2008-10-28 21:05 ` Anthony Liguori 2008-11-04 13:25 ` Avi Kivity 2008-10-28 21:41 ` [Qemu-devel] " Gerd Hoffmann 2008-10-28 21:51 ` Anthony Liguori 2008-10-28 23:04 ` Glauber Costa 2008-10-28 23:36 ` Anthony Liguori 2008-10-29 9:54 ` Avi Kivity 2008-10-29 12:35 ` Glauber Costa 2008-10-29 12:39 ` Avi Kivity 2008-10-29 12:56 ` Glauber Costa 2008-10-29 13:07 ` Anthony Liguori 2008-10-29 13:23 ` Avi Kivity 2008-10-29 13:32 ` Anthony Liguori 2008-10-29 13:51 ` Hollis Blanchard 2008-10-29 14:09 ` Avi Kivity 2008-10-29 14:16 ` Fabrice Bellard 2008-10-29 14:23 ` Anthony Liguori 2008-10-29 19:13 ` Blue Swirl 2008-11-01 16:25 ` Blue Swirl 2008-10-29 14:58 ` Glauber Costa 2008-10-29 17:41 ` Glauber Costa 2008-10-29 19:01 ` Anthony Liguori 2008-11-04 13:24 ` Avi Kivity 2008-11-04 14:02 ` Anthony Liguori 2008-11-04 14:46 ` Avi Kivity 2008-11-04 14:50 ` Anthony Liguori
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).