From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrea Arcangeli Subject: [PATCH] reserved-ram kvm-userland patch Date: Wed, 25 Jun 2008 03:28:07 +0200 Message-ID: <20080625012806.GO6938@duo.random> References: <1214232737-21267-1-git-send-email-benami@il.ibm.com> <1214232737-21267-2-git-send-email-benami@il.ibm.com> <20080625005739.GM6938@duo.random> <20080625011808.GN6938@duo.random> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Cc: amit.shah@qumranet.com, kvm@vger.kernel.org, aliguori@us.ibm.com, allen.m.kay@intel.com, muli@il.ibm.com To: benami@il.ibm.com, Avi Kivity Return-path: Received: from host36-195-149-62.serverdedicati.aruba.it ([62.149.195.36]:48634 "EHLO mx.cpushare.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752291AbYFYB2J (ORCPT ); Tue, 24 Jun 2008 21:28:09 -0400 Content-Disposition: inline In-Reply-To: <20080625011808.GN6938@duo.random> Sender: kvm-owner@vger.kernel.org List-ID: This is the kvm-userland patch to use after applying the reserved-ram patch to the host kernel. Bios must be rebuilt after applying the patch, to do that just 'make bios'. Then it's enough to pass '-reserved-ram' on the command line. 4997 ? Sl 2:56 3515 1544 4677235 1697028 47.9 /home/andrea/bin/x86_64/kvm/bin/qemu-system-x86_64 -hda tmp/virt 5002 ? Sl 3:23 4728 1544 4677235 1600980 45.2 /home/andrea/bin/x86_64/kvm/bin/qemu-system-x86_64 -hda tmp/virt 5008 ? 
Sl 2:39 239 1544 892127 15496 0.4 /home/andrea/bin/x86_64/kvm/bin/qemu-system-x86_64 -hda tmp/virtual total used free shared buffers cached Mem: 3540492 3525108 15384 0 1892 51896 -/+ buffers/cache: 3471320 69172 Swap: 5863684 3014072 2849612 eth0: no IPv6 routers present loaded kvm module (kvm-70-399-g275f337) apic write: bad size=1 fee00030 Ignoring de-assert INIT to vcpu 0 apic write: bad size=1 fee00030 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 0 kvm: emulating exchange as write apic write: bad size=1 fee00030 Ignoring de-assert INIT to vcpu 0 Ignoring de-assert INIT to vcpu 0 You can see above 3 KVM guests, the last one with -reserved-ram -m 512, the first two with -m 3000. Host kernel has both mmu-notifier v18 and -reserved-ram patch applied. KVM kernel has the pfn-mmio patch applied plus my fix to export the reserved RAM through vma->fault, and the kvm mmu notifier support for reliable and efficient swapping. All 3 guests seem to work great together while the system is 3G into swap. The reserved-ram guest is, of course, almost as responsive as if there were no swap (only the userland bits need to be paged in but all the virtual ram remains in ram). You can also see the RSS of the -reserved-ram task is only 15M which is about the footprint of kvm userland (part of which are shared libs, so it's actually much less). Signed-off-by: Andrea Arcangeli diff --git a/bios/rombios.c b/bios/rombios.c index 318de57..f93a6c6 100644 --- a/bios/rombios.c +++ b/bios/rombios.c @@ -4251,6 +4251,7 @@ int15_function32(regs, ES, DS, FLAGS) Bit32u extra_lowbits_memory_size=0; Bit16u CX,DX; Bit8u extra_highbits_memory_size=0; + Bit32u below_640_end; BX_DEBUG_INT15("int15 AX=%04x\n",regs.u.r16.ax); @@ -4305,6 +4306,11 @@ ASM_END case 0x20: // coded by osmaker aka K.J. 
if(regs.u.r32.edx == 0x534D4150) { + below_640_end = inb_cmos(0x16); + below_640_end <<= 8; + below_640_end |= inb_cmos(0x15); + below_640_end *= 1024; + extended_memory_size = inb_cmos(0x35); extended_memory_size <<= 8; extended_memory_size |= inb_cmos(0x34); @@ -4334,7 +4340,7 @@ ASM_END { case 0: set_e820_range(ES, regs.u.r16.di, - 0x0000000L, 0x0009fc00L, 0, 0, 1); + 0x0000000L, below_640_end, 0, 0, 1); regs.u.r32.ebx = 1; regs.u.r32.eax = 0x534D4150; regs.u.r32.ecx = 0x14; @@ -4343,7 +4349,7 @@ ASM_END break; case 1: set_e820_range(ES, regs.u.r16.di, - 0x0009fc00L, 0x000a0000L, 0, 0, 2); + below_640_end, 0x000a0000L, 0, 0, 2); regs.u.r32.ebx = 2; regs.u.r32.eax = 0x534D4150; regs.u.r32.ecx = 0x14; diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c index 42c2687..c6a21d5 100644 --- a/qemu/hw/pc.c +++ b/qemu/hw/pc.c @@ -235,6 +235,8 @@ static void cmos_init(ram_addr_t ram_size, ram_addr_t above_4g_mem_size, /* memory size */ val = 640; /* base memory in K */ + if (reserved_ram) + val = reserved[1] / 1024; rtc_set_memory(s, 0x15, val); rtc_set_memory(s, 0x16, val >> 8); diff --git a/qemu/pc-bios/bios.bin b/qemu/pc-bios/bios.bin index 3e5d96a..c9c94e6 100644 Binary files a/qemu/pc-bios/bios.bin and b/qemu/pc-bios/bios.bin differ diff --git a/qemu/sysemu.h b/qemu/sysemu.h index 97d73e9..964fee4 100644 --- a/qemu/sysemu.h +++ b/qemu/sysemu.h @@ -102,6 +102,8 @@ extern int autostart; extern int old_param; extern int hpagesize; extern const char *bootp_filename; +extern int reserved_ram; +extern int64_t reserved[4]; #ifdef USE_KQEMU diff --git a/qemu/vl.c b/qemu/vl.c index f573dce..3ce2f2a 100644 --- a/qemu/vl.c +++ b/qemu/vl.c @@ -235,6 +235,8 @@ int time_drift_fix = 0; unsigned int kvm_shadow_memory = 0; const char *mem_path = NULL; int hpagesize = 0; +int reserved_ram = 0; +int64_t reserved[4]; const char *cpu_vendor_string; #ifdef TARGET_ARM int old_param = 0; @@ -7817,6 +7819,7 @@ static void help(int exitcode) "-clock force the use of the given methods for timer alarm.\n" 
" To see what timers are available use -clock ?\n" "-startdate select initial date of the clock\n" + "-reserved-ram use reserved RAM in /proc/iomem with spte identity mapping\n" "\n" "During emulation, the following keys are useful:\n" "ctrl-alt-f toggle full screen\n" @@ -7932,6 +7935,7 @@ enum { QEMU_OPTION_tdf, QEMU_OPTION_kvm_shadow_memory, QEMU_OPTION_mempath, + QEMU_OPTION_reserved_ram, }; typedef struct QEMUOption { @@ -8059,6 +8063,7 @@ const QEMUOption qemu_options[] = { { "startdate", HAS_ARG, QEMU_OPTION_startdate }, { "tb-size", HAS_ARG, QEMU_OPTION_tb_size }, { "mem-path", HAS_ARG, QEMU_OPTION_mempath }, + { "reserved-ram", 0, QEMU_OPTION_reserved_ram }, { NULL }, }; @@ -8276,6 +8281,77 @@ static int gethugepagesize(void) return hugepagesize; } +static int find_reserved_ram(int64_t *_start, int64_t *_end, + unsigned long below, unsigned long above, + unsigned long min_size) +{ + int ret, fd; + char buf[4096]; + char *needle = "reserved RAM\n"; + char *size, *curr; + int64_t start, end; + + fd = open("/proc/iomem", O_RDONLY); + if (fd < 0) { + perror("open"); + exit(0); + } + + ret = read(fd, buf, sizeof(buf)-1); + if (ret < 0) { + perror("read"); + exit(0); + } + buf[ret] = 0; + + size = buf; + while (1) { + size = strstr(size, needle); + if (!size) + return 0; + size += strlen(needle); + curr = size - strlen(needle) - 20; + start = strtoll(curr, &curr, 16); + end = strtoll(curr+1, NULL, 16); + if ((!above || start >= above) && (!below || end <= below) && + (!min_size || end-start >= min_size)) { + *_start = start; + *_end = end+1; + return 1; + } + } +} + +static void init_reserved_ram(void) +{ + if (find_reserved_ram(&reserved[0], &reserved[1], + 640*1024, 0, 500*1024) && + find_reserved_ram(&reserved[2], &reserved[3], + 0, 1024*1024, 1024*1024)) { + reserved_ram = 1; + if (reserved[0] != 4096) { + fprintf(stderr, + "strange host ram layout\n"); + exit(1); + } + if (reserved[2] != 1024*1024) { + fprintf(stderr, + "strange host ram layout\n"); + 
exit(1); + } + if (reserved[3] < ram_size) { + fprintf(stderr, + "not enough host reserved ram, decrease -m\n"); + exit(1); + } + reserved[1] &= TARGET_PAGE_MASK; + } else { + fprintf(stderr, + "host reserved ram not found\n"); + exit(1); + } +} + void *alloc_mem_area(unsigned long memory, const char *path) { char *filename; @@ -8322,10 +8398,43 @@ void *qemu_alloc_physram(unsigned long memory) { void *area = NULL; - if (mem_path) + if (!area && mem_path) area = alloc_mem_area(memory, mem_path); - if (!area) + if (!area) { area = qemu_vmalloc(memory); + if (reserved_ram) { + int fd; + if (memory < reserved[2]) { + printf("memory < reserved[2]\n"); + return NULL; + } + fd = open("/dev/mem", O_RDWR); + if (fd < 0) { + perror("reserved_ram requires access to /dev/mem"); + return NULL; + } + if (mmap((char *)area+reserved[0], + reserved[1]-reserved[0], + PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, + fd, 0) == MAP_FAILED) { + perror("reserved_ram mmap failed on /dev/mem"); + return NULL; + } + bzero((char *)area+reserved[0], reserved[1]-reserved[0]); + if (mmap((char *)area+reserved[2], + ram_size-reserved[2], + PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED, + fd, reserved[2]) == MAP_FAILED) { + perror("reserved_ram mmap failed on /dev/mem"); + return NULL; + } + bzero((char *)area+reserved[2], ram_size-reserved[2]); + if (close(fd) < 0) { + perror("/dev/mem"); + return NULL; + } + } + } return area; } @@ -8962,6 +9071,9 @@ int main(int argc, char **argv) case QEMU_OPTION_mempath: mem_path = optarg; break; + case QEMU_OPTION_reserved_ram: + init_reserved_ram(); + break; case QEMU_OPTION_name: qemu_name = optarg; break;