public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
From: john cooper <john.cooper@third-harmonic.com>
To: kvm@vger.kernel.org
Cc: mtosatti@redhat.com, john.cooper@redhat.com
Subject: patch: qemu + hugetlbfs..
Date: Tue, 08 Jul 2008 18:02:40 -0400	[thread overview]
Message-ID: <4873E400.4000409@third-harmonic.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 1771 bytes --]

Attached is a patch to qemu which allows preallocation of huge
pages at startup time.  Also in cases where sufficient huge
pages are not available, the user may request to selectively fall
back to 4K pages for those portions of phys_mem which couldn't be
backed by huge pages.  This patch is relative to kvm-70.

The motivation for this arose from odd behavior seen in qemu
when access to huge page backed phys_mem failed during startup
(eg: loading the bios), and during runtime where a guest will
terminate via signal if a free hpage isn't available to satisfy
a guest page fault.

Two flags are introduced:

-mem-prealloc [0|1]

     Attempt to fault-in all hpages and if all can be allocated
     proceed as normal.  If all hpages can't be resolved and
     -mem-fallback behavior wasn't is requested, qemu will error
     exit.  This behavior is enabled by default when -mem-path
     is specified.

-mem-fallback [0|1]

     In the case some hpages can't be allocated, mapping a
     series of 4K pages will be attempted in place of them.
     In the case zero hpages were allocated a diagnostic to
     this effect will be printed (flagging scenarios such
     as /proc/sys/vm/nr_hugepages failing to be setup) and
     the normal non-hpage 4K allocation will be attempted.
     This behavior by default is disabled.  Recognition of
     -mem-fallback will only occur when -mem-path is
     specified and -mem-prealloc behavior is enabled.

This is believed to be an interim bandaid until such support
is available from within the kernel+kvm, ideally allowing
fallback to 4K pages on a dynamic basis and ultimately allowing
huge pages faults to reclaim huge pages from other users in the
system.

-john


Signed-off-by: john cooper <john.cooper <at> redhat.com>



[-- Attachment #2: prealloc.diff --]
[-- Type: text/x-patch, Size: 6852 bytes --]

--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -45,6 +45,7 @@
 #include <signal.h>
 #include <time.h>
 #include <errno.h>
+#include <asm/param.h>
 #include <sys/time.h>
 #include <zlib.h>
 
@@ -160,6 +161,15 @@ int inet_aton(const char *cp, struct in_
 /* XXX: use a two level table to limit memory usage */
 #define MAX_IOPORTS 65536
 
+/* assertion checking
+ */
+#define ASSERT(c)       if (!(c)) _assert(#c, __FILE__, __LINE__)
+
+static inline void _assert(char *text, char *file, int line) {
+    fprintf(stderr, "ASSERTION FAILED: [%s] %s:%d\n", text, file, line);
+    kill(getpid(), 12);     /* dump core */
+}
+
 const char *bios_dir = CONFIG_QEMU_SHAREDIR;
 const char *bios_name = NULL;
 void *ioport_opaque[MAX_IOPORTS];
@@ -234,6 +244,8 @@ int autostart = 1;
 int time_drift_fix = 0;
 unsigned int kvm_shadow_memory = 0;
 const char *mem_path = NULL;
+int mem_prealloc = {1};	/* force preallocation of physical target memory */
+int mem_fallback = {0};	/* allow alloc from alternate page size */
 int hpagesize = 0;
 const char *cpu_vendor_string;
 #ifdef TARGET_ARM
@@ -7809,7 +7821,12 @@ static void help(int exitcode)
 #endif
            "-tdf            inject timer interrupts that got lost\n"
            "-kvm-shadow-memory megs set the amount of shadow pages to be allocated\n"
-           "-mem-path       set the path to hugetlbfs/tmpfs mounted directory, also enables allocation of guest memory with huge pages\n"
+           "-mem-path       set the path to hugetlbfs/tmpfs mounted directory, also\n"
+           "                enables allocation of guest memory with huge pages\n"
+           "-mem-prealloc   forces preallocation of huge page physical memory\n"
+           "                at startup when -mem-path is specified.  Default 1 (on)\n"
+           "-mem-fallback   enables fallback to conventional pages as needed\n"
+           "                when -mem-prealloc is enabled.  Default: 0 (off)\n" 
 	   "-option-rom rom load a file, rom, into the option ROM space\n"
 #ifdef TARGET_SPARC
            "-prom-env variable=value  set OpenBIOS nvram variables\n"
@@ -7932,6 +7949,8 @@ enum {
     QEMU_OPTION_tdf,
     QEMU_OPTION_kvm_shadow_memory,
     QEMU_OPTION_mempath,
+    QEMU_OPTION_mem_prealloc,
+    QEMU_OPTION_mem_fallback,
 };
 
 typedef struct QEMUOption {
@@ -8059,6 +8078,8 @@ const QEMUOption qemu_options[] = {
     { "startdate", HAS_ARG, QEMU_OPTION_startdate },
     { "tb-size", HAS_ARG, QEMU_OPTION_tb_size },
     { "mem-path", HAS_ARG, QEMU_OPTION_mempath },
+    { "mem-prealloc", HAS_ARG, QEMU_OPTION_mem_prealloc },
+    { "mem-fallback", HAS_ARG, QEMU_OPTION_mem_fallback },
     { NULL },
 };
 
@@ -8276,11 +8297,36 @@ static int gethugepagesize(void)
     return hugepagesize;
 }
 
-void *alloc_mem_area(unsigned long memory, const char *path)
+/* fault in associated page, fail syscall when free page is unavailable
+ */ 
+static inline int fault_in(void *a)
+{
+    return (gettimeofday(a, NULL) < 0 ? errno != EFAULT : 1);
+}
+
+/* we failed to fault in hpage *a, fall back to conventional page mapping
+ */
+int remap_hpage(void *a, int sz)
+{
+    ASSERT(!(sz & (EXEC_PAGESIZE - 1)));
+    if (munmap(a, sz) < 0)
+	perror("remap_hpage: munmap");
+    else if (mmap(a, sz, PROT_READ|PROT_WRITE,
+	MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED)
+	    perror("remap_hpage: mmap");
+    else
+	return (1);
+    return (0);
+}
+
+/* attempt to allocate memory mmap'ed to mem_path
+ */
+void *alloc_hpage_mem(unsigned long memory, const char *path)
 {
     char *filename;
-    void *area;
-    int fd;
+    void *area, *a;
+    int fd, fail;
+    long i;
 
     if (asprintf(&filename, "%s/kvm.XXXXXX", path) == -1)
 	return NULL;
@@ -8308,27 +8354,80 @@ void *alloc_mem_area(unsigned long memor
      */
     ftruncate(fd, memory);
 
+    fail = 0;
     area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
-    if (area == MAP_FAILED) {
-	perror("mmap");
-	close(fd);
-	return NULL;
-    }
+    if (area == MAP_FAILED)
+	++fail, perror("alloc_hpage_mem: hugetlbfs mmap");
 
-    return area;
+    /* success of mmap() doesn't guarantee phys pages are present in the
+     * map.  if requested we attempt to fault in all associated huge pages           * and where unable if requested, selectively revert back to a mapping
+     * of conventional sized pages
+     */
+    ASSERT(!((unsigned long)area & (hpagesize - 1)));
+    if (mem_prealloc)
+	for (a = area, i = memory / hpagesize; !fail && 0 < i--; a += hpagesize)
+	    if (fault_in(a))
+		;	/* hpage is mapped */
+	    else if (!mem_fallback)
+		++fail;
+	    else if (!remap_hpage(a, hpagesize))
+		++fail;
+    if (!fail)
+        return (area);
+    if (area != MAP_FAILED)
+	munmap(area, memory);
+    close(fd);
+    return (NULL);
 }
 
+/* allocate from hpage mem if requested and to the extent possible, else
+ * revert to conventional page allocation.
+ */
 void *qemu_alloc_physram(unsigned long memory)
 {
     void *area = NULL;
 
-    if (mem_path)
-	area = alloc_mem_area(memory, mem_path);
-    if (!area)
-	area = qemu_vmalloc(memory);
+    if (mem_path && !(area = alloc_hpage_mem(memory, mem_path)))
+	fprintf(stderr, "warning: couldn't alloc memory as requested from %s\n",
+	    mem_path);
+    return (!area && mem_fallback ? qemu_vmalloc(memory) : area);
+}
 
-    return area;
+/* match string *s in pattern *p.  *p is of the form: [<str> | [<str> '|']*]
+ * and <str> is a string to be matched which may be empty.
+ * Return pointer to tail of matched *p for success, NULL otherwise.
+ */
+char const *altmatch(const char *p, const char *s)
+{
+    const char *bp, *q; 
+
+    for (bp = p, q = s; ; ) 
+        if ((!*p || *p == '|') && !*q) 
+            return (q == s && bp != p ? NULL : p);
+        else if (!*p)
+            return (0); 
+        else if (*p == *q)
+            ++p, ++q;   
+        else {   /* failed, try next field */
+            for (q = s; *p && *p != '|'; ++p)
+                ;               
+            if (*p == '|') 
+                bp = ++p;       
+        }           
+}   
+
+int bool_arg(const char *optarg, const char *flag_name)
+{
+    if (!optarg)
+        ;
+    else if (altmatch("y|yes|1", optarg))
+        return (1);
+    else if (altmatch("n|no|0", optarg))
+        return (0);
+    fprintf(stderr, "usage: %s [0|1]\n", flag_name);
+    exit(1);
 }
+		
 
 int main(int argc, char **argv)
 {
@@ -8962,6 +9061,12 @@ int main(int argc, char **argv)
             case QEMU_OPTION_mempath:
 		mem_path = optarg;
 		break;
+            case QEMU_OPTION_mem_prealloc:
+		mem_prealloc = bool_arg(optarg, "mem_prealloc");
+		break;
+            case QEMU_OPTION_mem_fallback:
+		mem_fallback = bool_arg(optarg, "mem_fallback");
+		break;
             case QEMU_OPTION_name:
                 qemu_name = optarg;
                 break;

             reply	other threads:[~2008-07-08 22:52 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-07-08 22:02 john cooper [this message]
2008-07-08 23:09 ` patch: qemu + hugetlbfs Anthony Liguori
2008-07-09  0:23   ` john cooper
2008-07-09  1:08     ` Anthony Liguori
2008-07-09 17:03       ` Marcelo Tosatti
2008-07-09 17:11         ` Anthony Liguori
2008-07-10 16:40           ` john cooper
2008-07-10 17:58             ` Anthony Liguori
2008-07-10 20:16               ` john cooper
2008-07-10 20:47                 ` Anthony Liguori
2008-07-10 21:12                   ` john cooper
2008-07-10 21:38                     ` Anthony Liguori
2008-08-25 23:05                       ` Resend: " john cooper
2008-08-26  8:11                         ` Avi Kivity
2008-08-27  4:13                           ` john cooper
2009-01-16  2:19                         ` john cooper
2009-01-20 10:29                           ` Avi Kivity
2009-01-23 21:21                             ` john cooper
2009-02-05 15:42                               ` Avi Kivity
2009-02-05 16:12                                 ` Marcelo Tosatti
2009-02-05 16:15                                   ` Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4873E400.4000409@third-harmonic.com \
    --to=john.cooper@third-harmonic.com \
    --cc=john.cooper@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=mtosatti@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox