From mboxrd@z Thu Jan 1 00:00:00 1970 From: Marcelo Tosatti Subject: [patch 08/12] QEMU/KVM: non-virtualized ACPI PMTimer support Date: Thu, 29 May 2008 19:22:57 -0300 Message-ID: <20080529222829.012521195@localhost.localdomain> References: <20080529222249.563011248@localhost.localdomain> Cc: Chris Wright , Glauber Costa , Anthony Liguori , kvm@vger.kernel.org, Marcelo Tosatti To: Avi Kivity Return-path: Received: from mx1.redhat.com ([66.187.233.31]:52673 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754033AbYE2W3t (ORCPT ); Thu, 29 May 2008 18:29:49 -0400 Content-Disposition: inline; filename=acpi-pmtimer Sender: kvm-owner@vger.kernel.org List-ID: QEMU support for direct pmtimer reads. Hopefully it's safe, since it's a read-only register? With self-disable C2 plus this patch, I'm seeing less CPU usage when idle with CONFIG_CPU_IDLE enabled. It is quite noticeable on SMP guests. Windows XP is comparable to standard (I have never seen it consume less than 10% either way; usually 20-30%). On migration the destination host can either lack ACPI or have the timer at a different IO port, so emulation is necessary. Or, with luck, the pmtimer is at the same address. Since the 24-bit counter overflow period is only ~= 4.6 seconds, it's probably worthwhile to wait for synchronization before restarting the guest. That is not implemented, though. 
Signed-off-by: Marcelo Tosatti Index: kvm-userspace.realtip/bios/rombios32.c =================================================================== --- kvm-userspace.realtip.orig/bios/rombios32.c +++ kvm-userspace.realtip/bios/rombios32.c @@ -391,7 +391,7 @@ uint8_t bios_uuid[16]; unsigned long ebda_cur_addr; #endif int acpi_enabled; -uint32_t pm_io_base, smb_io_base; +uint32_t pm_io_base, pmtmr_base, smb_io_base; int pm_sci_int; unsigned long bios_table_cur_addr; unsigned long bios_table_end_addr; @@ -819,6 +819,12 @@ static void pci_bios_init_device(PCIDevi pci_config_writeb(d, PCI_INTERRUPT_LINE, 9); pm_io_base = PM_IO_BASE; + pmtmr_base = cmos_readb(0x60); + pmtmr_base |= cmos_readb(0x61) << 8; + pmtmr_base |= cmos_readb(0x62) << 16; + pmtmr_base |= cmos_readb(0x63) << 24; + if (!pmtmr_base) + pmtmr_base = pm_io_base + 0x08; pci_config_writel(d, 0x40, pm_io_base | 1); pci_config_writeb(d, 0x80, 0x01); /* enable PM io space */ smb_io_base = SMB_IO_BASE; @@ -1376,7 +1382,7 @@ void acpi_bios_init(void) fadt->acpi_disable = 0xf0; fadt->pm1a_evt_blk = cpu_to_le32(pm_io_base); fadt->pm1a_cnt_blk = cpu_to_le32(pm_io_base + 0x04); - fadt->pm_tmr_blk = cpu_to_le32(pm_io_base + 0x08); + fadt->pm_tmr_blk = cpu_to_le32(pmtmr_base); fadt->pm1_evt_len = 4; fadt->pm1_cnt_len = 2; fadt->pm_tmr_len = 4; Index: kvm-userspace.realtip/qemu/hw/acpi.c =================================================================== --- kvm-userspace.realtip.orig/qemu/hw/acpi.c +++ kvm-userspace.realtip/qemu/hw/acpi.c @@ -40,6 +40,10 @@ typedef struct PIIX4PMState { uint16_t pmsts; uint16_t pmen; uint16_t pmcntrl; + uint32_t pmtimer_base; + uint8_t direct_access; + int32_t pmtimer_offset; + uint32_t pmtimer_io_offset; uint8_t apmc; uint8_t apms; QEMUTimer *tmr_timer; @@ -81,7 +85,12 @@ PIIX4PMState *pm_state; static uint32_t get_pmtmr(PIIX4PMState *s) { uint32_t d; - d = muldiv64(qemu_get_clock(vm_clock), PM_FREQ, ticks_per_sec); + if (!s->direct_access) { + d = muldiv64(qemu_get_clock(vm_clock), 
PM_FREQ, ticks_per_sec); + d += s->pmtimer_offset; + } else + qemu_kvm_get_pmtimer(&d); + return d & 0xffffff; } @@ -235,14 +244,10 @@ static uint32_t pm_ioport_readl(void *op uint32_t val; addr &= 0x3f; - switch(addr) { - case 0x08: + if (addr == s->pmtimer_io_offset) val = get_pmtmr(s); - break; - default: + else val = 0; - break; - } #ifdef DEBUG printf("PM readl port=0x%04x val=0x%08x\n", addr, val); #endif @@ -433,9 +438,9 @@ static uint32_t smb_ioport_readb(void *o return val; } -static void pm_io_space_update(PIIX4PMState *s) +static void pm_io_space_update(PIIX4PMState *s, int migration) { - uint32_t pm_io_base; + uint32_t pm_io_base, pmtmr_len; if (s->dev.config[0x80] & 1) { pm_io_base = le32_to_cpu(*(uint32_t *)(s->dev.config + 0x40)); @@ -443,14 +448,29 @@ static void pm_io_space_update(PIIX4PMSt /* XXX: need to improve memory and ioport allocation */ #if defined(DEBUG) - printf("PM: mapping to 0x%x\n", pm_io_base); + printf("PM: mapping to 0x%x mig=%d\n", pm_io_base, migration); #endif register_ioport_write(pm_io_base, 64, 1, pm_ioport_writeb, s); register_ioport_read(pm_io_base, 64, 1, pm_ioport_readb, s); register_ioport_write(pm_io_base, 64, 2, pm_ioport_writew, s); register_ioport_read(pm_io_base, 64, 2, pm_ioport_readw, s); - register_ioport_write(pm_io_base, 64, 4, pm_ioport_writel, s); - register_ioport_read(pm_io_base, 64, 4, pm_ioport_readl, s); + + if (migration) { + s->pmtimer_io_offset = 0x08; + pmtmr_len = 64; + } else if (host_pmtimer_base) { + s->pmtimer_base = host_pmtimer_base; + s->pmtimer_io_offset = 0x0; + pmtmr_len = 4; + s->direct_access = 1; + } else { + s->pmtimer_base = pm_io_base; + s->pmtimer_io_offset = 0x08; + pmtmr_len = 64; + } + + register_ioport_write(s->pmtimer_base, pmtmr_len, 4, pm_ioport_writel, s); + register_ioport_read(s->pmtimer_base, pmtmr_len, 4, pm_ioport_readl, s); } } @@ -459,12 +479,13 @@ static void pm_write_config(PCIDevice *d { pci_default_write_config(d, address, val, len); if (address == 0x80) - 
pm_io_space_update((PIIX4PMState *)d); + pm_io_space_update((PIIX4PMState *)d, 0); } static void pm_save(QEMUFile* f,void *opaque) { PIIX4PMState *s = opaque; + uint32_t pmtmr_val; pci_device_save(&s->dev, f); @@ -475,6 +496,14 @@ static void pm_save(QEMUFile* f,void *op qemu_put_8s(f, &s->apms); qemu_put_timer(f, s->tmr_timer); qemu_put_be64(f, s->tmr_overflow_time); + qemu_put_be32(f, s->pmtimer_base); + if (s->direct_access) { + if (qemu_kvm_get_pmtimer(&pmtmr_val) < 0) + pmtmr_val = 1 << 30; + } else + pmtmr_val = get_pmtmr(s); + + qemu_put_be32(f, pmtmr_val); } static int pm_load(QEMUFile* f,void* opaque,int version_id) @@ -482,7 +511,7 @@ static int pm_load(QEMUFile* f,void* opa PIIX4PMState *s = opaque; int ret; - if (version_id > 1) + if (version_id > 2) return -EINVAL; ret = pci_device_load(&s->dev, f); @@ -496,10 +525,31 @@ static int pm_load(QEMUFile* f,void* opa qemu_get_8s(f, &s->apms); qemu_get_timer(f, s->tmr_timer); s->tmr_overflow_time=qemu_get_be64(f); + if (version_id >= 2) { + uint32_t pmtmr_val; - pm_io_space_update(s); + s->pmtimer_base = qemu_get_be32(f); + pmtmr_val = qemu_get_be32(f); + if (pmtmr_val & (1 << 30)) + return -EINVAL; +#ifdef KVM_CAP_OPEN_IOPORT + /* + * Could wait for synchronicity instead of closing + * direct access. 
+ */ + if (host_pmtimer_base) { + ret = kvm_close_direct_pmtimer(); + if (ret) + return ret; + host_pmtimer_base = 0; + } +#endif + s->pmtimer_offset = pmtmr_val - get_pmtmr(s); + } - return 0; + pm_io_space_update(s, 1); + + return 0; } i2c_bus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base, @@ -548,7 +598,7 @@ i2c_bus *piix4_pm_init(PCIBus *bus, int s->tmr_timer = qemu_new_timer(vm_clock, pm_tmr_timer, s); - register_savevm("piix4_pm", 0, 1, pm_save, pm_load, s); + register_savevm("piix4_pm", 0, 2, pm_save, pm_load, s); s->smbus = i2c_init_bus(); s->irq = sci_irq; Index: kvm-userspace.realtip/qemu/hw/pc.c =================================================================== --- kvm-userspace.realtip.orig/qemu/hw/pc.c +++ kvm-userspace.realtip/qemu/hw/pc.c @@ -253,6 +253,11 @@ static void cmos_init(ram_addr_t ram_siz } rtc_set_memory(s, 0x5f, smp_cpus - 1); + rtc_set_memory(s, 0x60, host_pmtimer_base); + rtc_set_memory(s, 0x61, host_pmtimer_base >> 8); + rtc_set_memory(s, 0x62, host_pmtimer_base >> 16); + rtc_set_memory(s, 0x63, host_pmtimer_base >> 24); + if (ram_size > (16 * 1024 * 1024)) val = (ram_size / 65536) - ((16 * 1024 * 1024) / 65536); else Index: kvm-userspace.realtip/qemu/qemu-kvm-x86.c =================================================================== --- kvm-userspace.realtip.orig/qemu/qemu-kvm-x86.c +++ kvm-userspace.realtip/qemu/qemu-kvm-x86.c @@ -11,12 +11,17 @@ #include #include "hw/hw.h" +#include "sysemu.h" #include "qemu-kvm.h" #include #include #include #include +#include +#include +#include + #define MSR_IA32_TSC 0x10 @@ -545,6 +550,123 @@ static int get_para_features(kvm_context return features; } +#ifdef KVM_CAP_OPEN_IOPORT +int kvm_arch_open_pmtimer(void) +{ + int fd, ret = 0; + char buf[16384]; + char *line, *saveptr; + uint32_t pmtmr; + struct kvm_ioport_list *ioport_list; + + if (no_direct_pmtimer) + return ret; + + fd = open("/proc/ioports", O_RDONLY); + if (fd == -1) { + perror("open /proc/ioports"); + exit(0); + } + ret 
= read(fd, buf, 16384); + if (ret == -1) { + perror("read /proc/ioports"); + exit(0); + } + + line = strtok_r(buf, "\n", &saveptr); + do { + char *pmstr; + line = pmstr = strtok_r(NULL, "\n", &saveptr); + if (pmstr && strstr(pmstr, "ACPI PM_TMR")) { + pmstr = strtok(line, "-"); + while (*pmstr == ' ') + pmstr++; + host_pmtimer_base = strtoul(pmstr, NULL, 16); + /* + * Fail now instead of during migration + */ + if (qemu_kvm_get_pmtimer(&pmtmr) < 0) + host_pmtimer_base = 0; + break; + } + } while (line); + + if (!host_pmtimer_base) + return 0; + + ioport_list = qemu_malloc(sizeof(struct kvm_ioport_list) + + sizeof(struct kvm_ioport) * 2); + if (!ioport_list) + goto out_no_pmtimer; + ioport_list->nranges = 2; + ioport_list->ioports[0].addr = 0x80; + ioport_list->ioports[0].len = 1; + ioport_list->ioports[1].addr = host_pmtimer_base; + ioport_list->ioports[1].len = 4; + + ret = kvm_set_open_ioports(kvm_context, ioport_list); + if (ret) { + perror("kvm_set_open_ioports"); + goto out_no_pmtimer_free; + } + + qemu_free(ioport_list); + return 0; + +out_no_pmtimer_free: + qemu_free(ioport_list); +out_no_pmtimer: + host_pmtimer_base = 0; + return 0; +} + +int kvm_close_direct_pmtimer(void) +{ + struct kvm_ioport_list *ioport_list; + int ret; + + ioport_list = qemu_malloc(sizeof(struct kvm_ioport_list) + + sizeof(struct kvm_ioport)); + if (!ioport_list) + return -EINVAL; + ioport_list->nranges = 1; + ioport_list->ioports[0].addr = 0x80; + ioport_list->ioports[0].len = 1; + + ret = kvm_set_open_ioports(kvm_context, ioport_list); + + qemu_free(ioport_list); + return ret; +} +#else +int kvm_arch_open_pmtimer(void) +{ + return 0; +} +#endif + +int kvm_arch_qemu_init(void) +{ + kvm_arch_open_pmtimer(); + return 0; +} + +int qemu_kvm_get_pmtimer(uint32_t *value) +{ + int fd, ret; + + fd = open("/dev/pmtimer", O_RDONLY); + if (fd == -1) + return -1; + + ret = read(fd, value, sizeof(value)); + close(fd); + + *value &= 0xffffff; + + return ret; +} + int 
kvm_arch_qemu_init_env(CPUState *cenv) { struct kvm_cpuid_entry cpuid_ent[100]; Index: kvm-userspace.realtip/qemu/qemu-kvm.c =================================================================== --- kvm-userspace.realtip.orig/qemu/qemu-kvm.c +++ kvm-userspace.realtip/qemu/qemu-kvm.c @@ -677,6 +677,7 @@ int kvm_qemu_create_context(void) r = kvm_arch_qemu_create_context(); if(r <0) kvm_qemu_destroy(); + kvm_arch_qemu_init(); return 0; } Index: kvm-userspace.realtip/qemu/qemu-kvm.h =================================================================== --- kvm-userspace.realtip.orig/qemu/qemu-kvm.h +++ kvm-userspace.realtip/qemu/qemu-kvm.h @@ -49,6 +49,7 @@ void kvm_cpu_destroy_phys_mem(target_phy unsigned long size); int kvm_arch_qemu_create_context(void); +int kvm_arch_qemu_init(void); void kvm_arch_save_regs(CPUState *env); void kvm_arch_load_regs(CPUState *env); @@ -60,6 +61,8 @@ int kvm_arch_has_work(CPUState *env); int kvm_arch_try_push_interrupts(void *opaque); void kvm_arch_update_regs_for_sipi(CPUState *env); void kvm_arch_cpu_reset(CPUState *env); +int qemu_kvm_get_pmtimer(uint32_t *value); +int kvm_close_direct_pmtimer(void); CPUState *qemu_kvm_cpu_env(int index); Index: kvm-userspace.realtip/qemu/sysemu.h =================================================================== --- kvm-userspace.realtip.orig/qemu/sysemu.h +++ kvm-userspace.realtip/qemu/sysemu.h @@ -94,6 +94,7 @@ extern int win2k_install_hack; extern int alt_grab; extern int usb_enabled; extern int smp_cpus; +extern unsigned int host_pmtimer_base; extern int cursor_hide; extern int graphic_rotate; extern int no_quit; @@ -101,6 +102,7 @@ extern int semihosting_enabled; extern int autostart; extern int old_param; extern int hpagesize; +extern int no_direct_pmtimer; extern const char *bootp_filename; Index: kvm-userspace.realtip/qemu/vl.c =================================================================== --- kvm-userspace.realtip.orig/qemu/vl.c +++ kvm-userspace.realtip/qemu/vl.c @@ -209,6 +209,7 @@ int 
win2k_install_hack = 0; int usb_enabled = 0; static VLANState *first_vlan; int smp_cpus = 1; +unsigned int host_pmtimer_base; const char *vnc_display; #if defined(TARGET_SPARC) #define MAX_CPUS 16 @@ -235,6 +236,7 @@ int time_drift_fix = 0; unsigned int kvm_shadow_memory = 0; const char *mem_path = NULL; int hpagesize = 0; +int no_direct_pmtimer = 0; const char *cpu_vendor_string; #ifdef TARGET_ARM int old_param = 0; @@ -7931,6 +7933,7 @@ enum { QEMU_OPTION_tdf, QEMU_OPTION_kvm_shadow_memory, QEMU_OPTION_mempath, + QEMU_OPTION_no_direct_pmtimer, }; typedef struct QEMUOption { @@ -8058,6 +8061,7 @@ const QEMUOption qemu_options[] = { { "clock", HAS_ARG, QEMU_OPTION_clock }, { "startdate", HAS_ARG, QEMU_OPTION_startdate }, { "mem-path", HAS_ARG, QEMU_OPTION_mempath }, + { "no-direct-pmtimer", 0, QEMU_OPTION_no_direct_pmtimer }, { NULL }, }; @@ -8962,6 +8966,9 @@ int main(int argc, char **argv) case QEMU_OPTION_mempath: mem_path = optarg; break; + case QEMU_OPTION_no_direct_pmtimer: + no_direct_pmtimer = 1; + break; case QEMU_OPTION_name: qemu_name = optarg; break; --