From mboxrd@z Thu Jan 1 00:00:00 1970 From: Anthony Liguori Subject: Re: patch: qemu + hugetlbfs.. Date: Tue, 08 Jul 2008 18:09:09 -0500 Message-ID: <4873F395.6030209@codemonkey.ws> References: <4873E400.4000409@third-harmonic.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: kvm@vger.kernel.org, mtosatti@redhat.com, john.cooper@redhat.com To: john cooper Return-path: Received: from rn-out-0910.google.com ([64.233.170.187]:12002 "EHLO rn-out-0910.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754702AbYGHXJf (ORCPT ); Tue, 8 Jul 2008 19:09:35 -0400 Received: by rn-out-0910.google.com with SMTP id k40so752232rnd.17 for ; Tue, 08 Jul 2008 16:09:34 -0700 (PDT) In-Reply-To: <4873E400.4000409@third-harmonic.com> Sender: kvm-owner@vger.kernel.org List-ID: john cooper wrote: > Attached is a patch to qemu which allows preallocation of huge > pages at startup time. Also in cases where sufficient huge > pages are not available, the user may request to selectively fall > back to 4K pages for those portions of phys_mem which couldn't be > backed by huge pages. This patch is relative to kvm-70. > > The motivation for this arose from odd behavior seen in qemu > when access to huge page backed phys_mem failed during startup > (eg: loading the bios), and during runtime where a guest will > terminate via signal if a free hpage isn't available to satisfy > a guest page fault. > > Two flags are introduced: > > -mem-prealloc [0|1] > > Attempt to fault-in all hpages and if all can be allocated > proceed as normal. If all hpages can't be resolved and > -mem-fallback behavior wasn't is requested, qemu will error > exit. This behavior is enabled by default when -mem-path > is specified. > > -mem-fallback [0|1] > > In the case some hpages can't be allocated, mapping a > series of 4K pages will be attempted in place of them. > In the case zero hpages were allocated a diagnostic to > this effect will be printed (flagging scenarios such > as /proc/sys/vm/nr_hugepages failing to be setup) and > the normal non-hpage 4K allocation will be attempted. > This behavior by default is disabled. Recognition of > -mem-fallback will only occur when -mem-path is > specified and -mem-prealloc behavior is enabled. > > This is believed to be an interim bandaid until such support > is available from within the kernel+kvm, ideally allowing > fallback to 4K pages on a dynamic basis and ultimately allowing > huge pages faults to reclaim huge pages from other users in the > system. > > -john > > > Signed-off-by: john cooper redhat.com> > > > --- a/qemu/vl.c > +++ b/qemu/vl.c > @@ -45,6 +45,7 @@ > #include > #include > #include > +#include > #include > #include > > @@ -160,6 +161,15 @@ int inet_aton(const char *cp, struct in_ > /* XXX: use a two level table to limit memory usage */ > #define MAX_IOPORTS 65536 > > +/* assertion checking > + */ > +#define ASSERT(c) if (!(c)) _assert(#c, __FILE__, __LINE__) > + > +static inline void _assert(char *text, char *file, int line) { > + fprintf(stderr, "ASSERTION FAILED: [%s] %s:%d\n", text, file, line); > + kill(getpid(), 12); /* dump core */ > +} > + > Is it really necessary to add this as part of this patch? > const char *bios_dir = CONFIG_QEMU_SHAREDIR; > const char *bios_name = NULL; > void *ioport_opaque[MAX_IOPORTS]; > @@ -234,6 +244,8 @@ int autostart = 1; > int time_drift_fix = 0; > unsigned int kvm_shadow_memory = 0; > const char *mem_path = NULL; > +int mem_prealloc = {1}; /* force preallocation of physical target memory */ > +int mem_fallback = {0}; /* allow alloc from alternate page size */ > This is a bizarre way to initialize an integer. I had no idea you could even do this for a scalar. Just initialize to 1 and 0. > int hpagesize = 0; > const char *cpu_vendor_string; > #ifdef TARGET_ARM > @@ -7809,7 +7821,12 @@ static void help(int exitcode) > #endif > "-tdf inject timer interrupts that got lost\n" > "-kvm-shadow-memory megs set the amount of shadow pages to be allocated\n" > - "-mem-path set the path to hugetlbfs/tmpfs mounted directory, also enables allocation of guest memory with huge pages\n" > + "-mem-path set the path to hugetlbfs/tmpfs mounted directory, also\n" > + " enables allocation of guest memory with huge pages\n" > + "-mem-prealloc forces preallocation of huge page physical memory\n" > + " at startup when -mem-path is specified. Default 1 (on)\n" > + "-mem-fallback enables fallback to conventional pages as needed\n" > + " when -mem-prealloc is enabled. Default: 0 (off)\n" > "-option-rom rom load a file, rom, into the option ROM space\n" > #ifdef TARGET_SPARC > "-prom-env variable=value set OpenBIOS nvram variables\n" > @@ -7932,6 +7949,8 @@ enum { > QEMU_OPTION_tdf, > QEMU_OPTION_kvm_shadow_memory, > QEMU_OPTION_mempath, > + QEMU_OPTION_mem_prealloc, > + QEMU_OPTION_mem_fallback, > }; > > typedef struct QEMUOption { > @@ -8059,6 +8078,8 @@ const QEMUOption qemu_options[] = { > { "startdate", HAS_ARG, QEMU_OPTION_startdate }, > { "tb-size", HAS_ARG, QEMU_OPTION_tb_size }, > { "mem-path", HAS_ARG, QEMU_OPTION_mempath }, > + { "mem-prealloc", HAS_ARG, QEMU_OPTION_mem_prealloc }, > + { "mem-fallback", HAS_ARG, QEMU_OPTION_mem_fallback }, > { NULL }, > }; > > @@ -8276,11 +8297,36 @@ static int gethugepagesize(void) > return hugepagesize; > } > > -void *alloc_mem_area(unsigned long memory, const char *path) > +/* fault in associated page, fail syscall when free page is unavailable > + */ > +static inline int fault_in(void *a) > +{ > + return (gettimeofday(a, NULL) < 0 ? errno != EFAULT : 1); > +} > I don't like this very much. Why not just MAP_POPULATE? > +/* we failed to fault in hpage *a, fall back to conventional page mapping > + */ > +int remap_hpage(void *a, int sz) > +{ > + ASSERT(!(sz & (EXEC_PAGESIZE - 1))); > + if (munmap(a, sz) < 0) > + perror("remap_hpage: munmap"); > + else if (mmap(a, sz, PROT_READ|PROT_WRITE, > + MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED) > + perror("remap_hpage: mmap"); > + else > + return (1); > + return (0); > +} > I think this would be simplified with MAP_POPULATE since you can fail in large chunks of memory instead of potentially having a highly fragmented set of VMAs. > +/* attempt to allocate memory mmap'ed to mem_path > + */ > +void *alloc_hpage_mem(unsigned long memory, const char *path) > { > char *filename; > - void *area; > - int fd; > + void *area, *a; > + int fd, fail; > + long i; > > if (asprintf(&filename, "%s/kvm.XXXXXX", path) == -1) > return NULL; > @@ -8308,27 +8354,80 @@ void *alloc_mem_area(unsigned long memor > */ > ftruncate(fd, memory); > > + fail = 0; > area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); > - if (area == MAP_FAILED) { > - perror("mmap"); > - close(fd); > - return NULL; > - } > + if (area == MAP_FAILED) > + ++fail, perror("alloc_hpage_mem: hugetlbfs mmap"); > > - return area; > + /* success of mmap() doesn't guarantee phys pages are present in the > + * map. if requested we attempt to fault in all associated huge pages * and where unable if requested, selectively revert back to a mapping > + * of conventional sized pages > + */ > + ASSERT(!((unsigned long)area & (hpagesize - 1))); > + if (mem_prealloc) > + for (a = area, i = memory / hpagesize; !fail && 0 < i--; a += hpagesize) > + if (fault_in(a)) > + ; /* hpage is mapped */ > + else if (!mem_fallback) > + ++fail; > + else if (!remap_hpage(a, hpagesize)) > + ++fail; > + if (!fail) > + return (area); > + if (area != MAP_FAILED) > + munmap(area, memory); > + close(fd); > + return (NULL); > } > > +/* allocate from hpage mem if requested and to the extent possible, else > + * revert to conventional page allocation. > + */ > void *qemu_alloc_physram(unsigned long memory) > { > void *area = NULL; > > - if (mem_path) > - area = alloc_mem_area(memory, mem_path); > - if (!area) > - area = qemu_vmalloc(memory); > + if (mem_path && !(area = alloc_hpage_mem(memory, mem_path))) > + fprintf(stderr, "warning: couldn't alloc memory as requested from %s\n", > + mem_path); > + return (!area && mem_fallback ? qemu_vmalloc(memory) : area); > +} > > - return area; > +/* match string *s in pattern *p. *p is of the form: [ | [ '|']*] > + * and is a string to be matched which may be empty. > + * Return pointer to tail of matched *p for success, NULL otherwise. > + */ > +char const *altmatch(const char *p, const char *s) > +{ > + const char *bp, *q; > + > + for (bp = p, q = s; ; ) > + if ((!*p || *p == '|') && !*q) > + return (q == s && bp != p ? NULL : p); > + else if (!*p) > + return (0); > + else if (*p == *q) > + ++p, ++q; > + else { /* failed, try next field */ > + for (q = s; *p && *p != '|'; ++p) > + ; > + if (*p == '|') > + bp = ++p; > + } > +} > + > +int bool_arg(const char *optarg, const char *flag_name) > +{ > + if (!optarg) > + ; > + else if (altmatch("y|yes|1", optarg)) > + return (1); > + else if (altmatch("n|no|0", optarg)) > + return (0); > + fprintf(stderr, "usage: %s [0|1]\n", flag_name); > + exit(1); > } > + > This is introducing too much infrastructure. Just follow the conventions in the rest of the code. No need to make the options take a boolean argument. The options themselves are boolean arguments. Regards, Anthony Liguori > int main(int argc, char **argv) > { > @@ -8962,6 +9061,12 @@ int main(int argc, char **argv) > case QEMU_OPTION_mempath: > mem_path = optarg; > break; > + case QEMU_OPTION_mem_prealloc: > + mem_prealloc = bool_arg(optarg, "mem_prealloc"); > + break; > + case QEMU_OPTION_mem_fallback: > + mem_fallback = bool_arg(optarg, "mem_fallback"); > + break; > case QEMU_OPTION_name: > qemu_name = optarg; > break; >