All of lore.kernel.org
 help / color / mirror / Atom feed
From: Marcelo Tosatti <mtosatti@redhat.com>
To: Paul Brook <paul@codesourcery.com>
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org,
	john cooper <john.cooper@redhat.com>,
	avi@redhat.com
Subject: Re: [Qemu-devel] [patch uq/master 2/2] Add option to use file backed guest memory
Date: Mon, 1 Mar 2010 20:25:08 -0300	[thread overview]
Message-ID: <20100301232508.GA13703@amt.cnet> (raw)
In-Reply-To: <201002280128.16649.paul@codesourcery.com>

Hi Paul,

Thank you for reviewing.

On Sun, Feb 28, 2010 at 01:28:16AM +0000, Paul Brook wrote:
> IMHO it would be better to check the mem_path != NULL here, rather than 
> burying the check in file_ram_alloc.
> 
> >+    if (memory < hpagesize) {
> >+        return NULL;
> >+    }
> 
> Ah, so it's actually "allocate memory in $path, if you feel like it". Good job 
> we aren't relying on this for correctness.  At minimum I recommend documenting 
> this heuristic.

More like "allocate memory in $path, if it is larger than a hugepage."

Huge pages are an optimization.

> 
> >+    if (!new_block->host) {
> > #if defined(TARGET_S390X) && defined(CONFIG_KVM)
> >-    /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */
> 
> By my reading this implies -mempath is probably broken on s390 KVM?
> 
> >+DEF("mem-path", HAS_ARG, QEMU_OPTION_mempath,
> >+    "-mem-path FILE  provide backing storage for guest RAM\n")
> >+STEXI
> >+@item -mem-path @var{path}
> >+Allocate guest RAM from a temporarily created file in @var{path}.
> >+ETEXI
> 
> You should mention that this is only useful when PATH happens to be a linux 
> hugetlbfs mount.

It can be used with a file, since it's mapped as MAP_PRIVATE.

Can you check whether the patch below properly addresses your concerns?


Add option to use file backed guest memory

Port qemu-kvm's -mem-path and -mem-prealloc options. These are useful 
for backing guest memory with huge pages via hugetlbfs.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
CC: john cooper <john.cooper@redhat.com>

Index: qemu/cpu-all.h
===================================================================
--- qemu.orig/cpu-all.h
+++ qemu/cpu-all.h
@@ -847,6 +847,9 @@ extern uint8_t *phys_ram_dirty;
 extern ram_addr_t ram_size;
 extern ram_addr_t last_ram_offset;
 
+extern const char *mem_path;
+extern int mem_prealloc;
+
 /* physical memory access */
 
 /* MMIO pages are identified by a combination of an IO device index and
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2529,6 +2529,99 @@ void qemu_flush_coalesced_mmio_buffer(vo
         kvm_flush_coalesced_mmio_buffer();
 }
 
+#if defined(__linux__) && !defined(TARGET_S390X)
+
+#include <sys/vfs.h>
+
+#define HUGETLBFS_MAGIC       0x958458f6
+
+static long gethugepagesize(const char *path)
+{
+    struct statfs fs;
+    int ret;
+
+    do {
+	    ret = statfs(path, &fs);
+    } while (ret != 0 && errno == EINTR);
+
+    if (ret != 0) {
+	    perror("statfs");
+	    return 0;
+    }
+
+    if (fs.f_type != HUGETLBFS_MAGIC)
+	    fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
+
+    return fs.f_bsize;
+}
+
+static void *file_ram_alloc(ram_addr_t memory, const char *path)
+{
+    char *filename;
+    void *area;
+    int fd;
+#ifdef MAP_POPULATE
+    int flags;
+#endif
+    unsigned long hpagesize;
+
+    hpagesize = gethugepagesize(path);
+    if (!hpagesize) {
+	return NULL;
+    }
+
+    if (memory < hpagesize) {
+        return NULL;
+    }
+
+    if (kvm_enabled() && !kvm_has_sync_mmu()) {
+        fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
+        return NULL;
+    }
+
+    if (asprintf(&filename, "%s/qemu_back_mem.XXXXXX", path) == -1) {
+	return NULL;
+    }
+
+    fd = mkstemp(filename);
+    if (fd < 0) {
+	perror("mkstemp");
+	free(filename);
+	return NULL;
+    }
+    unlink(filename);
+    free(filename);
+
+    memory = (memory+hpagesize-1) & ~(hpagesize-1);
+
+    /*
+     * ftruncate is not supported by hugetlbfs in older
+     * hosts, so don't bother bailing out on errors.
+     * If anything goes wrong with it under other filesystems,
+     * mmap will fail.
+     */
+    if (ftruncate(fd, memory))
+	perror("ftruncate");
+
+#ifdef MAP_POPULATE
+    /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case
+     * MAP_PRIVATE is requested.  For mem_prealloc we mmap as MAP_SHARED
+     * to sidestep this quirk.
+     */
+    flags = mem_prealloc ? MAP_POPULATE | MAP_SHARED : MAP_PRIVATE;
+    area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0);
+#else
+    area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+#endif
+    if (area == MAP_FAILED) {
+	perror("file_ram_alloc: can't mmap RAM pages");
+	close(fd);
+	return (NULL);
+    }
+    return area;
+}
+#endif
+
 ram_addr_t qemu_ram_alloc(ram_addr_t size)
 {
     RAMBlock *new_block;
@@ -2536,16 +2629,28 @@ ram_addr_t qemu_ram_alloc(ram_addr_t siz
     size = TARGET_PAGE_ALIGN(size);
     new_block = qemu_malloc(sizeof(*new_block));
 
+    if (mem_path) {
+#if defined (__linux__) && !defined(TARGET_S390X)
+        new_block->host = file_ram_alloc(size, mem_path);
+        if (!new_block->host)
+            exit(1);
+#else
+        fprintf(stderr, "-mem-path option unsupported\n");
+        exit(1);
+#endif
+    } else {
 #if defined(TARGET_S390X) && defined(CONFIG_KVM)
-    /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */
-    new_block->host = mmap((void*)0x1000000, size, PROT_EXEC|PROT_READ|PROT_WRITE,
-                           MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+        /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */
+        new_block->host = mmap((void*)0x1000000, size,
+                                PROT_EXEC|PROT_READ|PROT_WRITE,
+                                MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 #else
-    new_block->host = qemu_vmalloc(size);
+        new_block->host = qemu_vmalloc(size);
 #endif
 #ifdef MADV_MERGEABLE
-    madvise(new_block->host, size, MADV_MERGEABLE);
+        madvise(new_block->host, size, MADV_MERGEABLE);
 #endif
+    }
     new_block->offset = last_ram_offset;
     new_block->length = size;
 
Index: qemu/qemu-options.hx
===================================================================
--- qemu.orig/qemu-options.hx
+++ qemu/qemu-options.hx
@@ -314,6 +314,22 @@ a suffix of ``M'' or ``G'' can be used t
 gigabytes respectively.
 ETEXI
 
+DEF("mem-path", HAS_ARG, QEMU_OPTION_mempath,
+    "-mem-path FILE  provide backing storage for guest RAM\n")
+STEXI
+@item -mem-path @var{path}
+Allocate guest RAM from a temporarily created file in @var{path}.
+ETEXI
+
+#ifdef MAP_POPULATE
+DEF("mem-prealloc", 0, QEMU_OPTION_mem_prealloc,
+    "-mem-prealloc   preallocate guest memory (use with -mem-path)\n")
+STEXI
+@item -mem-prealloc
+Preallocate memory when using -mem-path.
+ETEXI
+#endif
+
 DEF("k", HAS_ARG, QEMU_OPTION_k,
     "-k language     use keyboard layout (for example 'fr' for French)\n")
 STEXI
Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c
+++ qemu/vl.c
@@ -185,6 +185,10 @@ enum vga_retrace_method vga_retrace_meth
 DisplayType display_type = DT_DEFAULT;
 const char* keyboard_layout = NULL;
 ram_addr_t ram_size;
+const char *mem_path = NULL;
+#ifdef MAP_POPULATE
+int mem_prealloc = 0; /* force preallocation of physical target memory */
+#endif
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int vm_running;
@@ -5216,6 +5220,14 @@ int main(int argc, char **argv, char **e
                 ram_size = value;
                 break;
             }
+            case QEMU_OPTION_mempath:
+                mem_path = optarg;
+                break;
+#ifdef MAP_POPULATE
+            case QEMU_OPTION_mem_prealloc:
+                mem_prealloc = 1;
+                break;
+#endif
             case QEMU_OPTION_d:
                 {
                     int mask;

WARNING: multiple messages have this Message-ID (diff)
From: Marcelo Tosatti <mtosatti@redhat.com>
To: Paul Brook <paul@codesourcery.com>
Cc: john cooper <john.cooper@redhat.com>,
	qemu-devel@nongnu.org, kvm@vger.kernel.org, avi@redhat.com
Subject: Re: [Qemu-devel] [patch uq/master 2/2] Add option to use file backed guest memory
Date: Mon, 1 Mar 2010 20:25:08 -0300	[thread overview]
Message-ID: <20100301232508.GA13703@amt.cnet> (raw)
In-Reply-To: <201002280128.16649.paul@codesourcery.com>

Hi Paul,

Thank you for reviewing.

On Sun, Feb 28, 2010 at 01:28:16AM +0000, Paul Brook wrote:
> IMHO it would be better to check the mem_path != NULL here, rather than 
> burying the check in file_ram_alloc.
> 
> >+    if (memory < hpagesize) {
> >+        return NULL;
> >+    }
> 
> Ah, so it's actually "allocate memory in $path, if you feel like it". Good job 
> we aren't relying on this for correctness.  At minimum I recommend documenting 
> this heuristic.

More like "allocate memory in $path, if it is larger than a hugepage."

Huge pages are an optimization.

> 
> >+    if (!new_block->host) {
> > #if defined(TARGET_S390X) && defined(CONFIG_KVM)
> >-    /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */
> 
> By my reading this implies -mempath is probably broken on s390 KVM?
> 
> >+DEF("mem-path", HAS_ARG, QEMU_OPTION_mempath,
> >+    "-mem-path FILE  provide backing storage for guest RAM\n")
> >+STEXI
> >+@item -mem-path @var{path}
> >+Allocate guest RAM from a temporarily created file in @var{path}.
> >+ETEXI
> 
> You should mention that this is only useful when PATH happens to be a linux 
> hugetlbfs mount.

It can be used with a file, since it's mapped as MAP_PRIVATE.

Can you check whether the patch below properly addresses your concerns?


Add option to use file backed guest memory

Port qemu-kvm's -mem-path and -mem-prealloc options. These are useful 
for backing guest memory with huge pages via hugetlbfs.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
CC: john cooper <john.cooper@redhat.com>

Index: qemu/cpu-all.h
===================================================================
--- qemu.orig/cpu-all.h
+++ qemu/cpu-all.h
@@ -847,6 +847,9 @@ extern uint8_t *phys_ram_dirty;
 extern ram_addr_t ram_size;
 extern ram_addr_t last_ram_offset;
 
+extern const char *mem_path;
+extern int mem_prealloc;
+
 /* physical memory access */
 
 /* MMIO pages are identified by a combination of an IO device index and
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c
+++ qemu/exec.c
@@ -2529,6 +2529,99 @@ void qemu_flush_coalesced_mmio_buffer(vo
         kvm_flush_coalesced_mmio_buffer();
 }
 
+#if defined(__linux__) && !defined(TARGET_S390X)
+
+#include <sys/vfs.h>
+
+#define HUGETLBFS_MAGIC       0x958458f6
+
+static long gethugepagesize(const char *path)
+{
+    struct statfs fs;
+    int ret;
+
+    do {
+	    ret = statfs(path, &fs);
+    } while (ret != 0 && errno == EINTR);
+
+    if (ret != 0) {
+	    perror("statfs");
+	    return 0;
+    }
+
+    if (fs.f_type != HUGETLBFS_MAGIC)
+	    fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
+
+    return fs.f_bsize;
+}
+
+static void *file_ram_alloc(ram_addr_t memory, const char *path)
+{
+    char *filename;
+    void *area;
+    int fd;
+#ifdef MAP_POPULATE
+    int flags;
+#endif
+    unsigned long hpagesize;
+
+    hpagesize = gethugepagesize(path);
+    if (!hpagesize) {
+	return NULL;
+    }
+
+    if (memory < hpagesize) {
+        return NULL;
+    }
+
+    if (kvm_enabled() && !kvm_has_sync_mmu()) {
+        fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n");
+        return NULL;
+    }
+
+    if (asprintf(&filename, "%s/qemu_back_mem.XXXXXX", path) == -1) {
+	return NULL;
+    }
+
+    fd = mkstemp(filename);
+    if (fd < 0) {
+	perror("mkstemp");
+	free(filename);
+	return NULL;
+    }
+    unlink(filename);
+    free(filename);
+
+    memory = (memory+hpagesize-1) & ~(hpagesize-1);
+
+    /*
+     * ftruncate is not supported by hugetlbfs in older
+     * hosts, so don't bother bailing out on errors.
+     * If anything goes wrong with it under other filesystems,
+     * mmap will fail.
+     */
+    if (ftruncate(fd, memory))
+	perror("ftruncate");
+
+#ifdef MAP_POPULATE
+    /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case
+     * MAP_PRIVATE is requested.  For mem_prealloc we mmap as MAP_SHARED
+     * to sidestep this quirk.
+     */
+    flags = mem_prealloc ? MAP_POPULATE | MAP_SHARED : MAP_PRIVATE;
+    area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0);
+#else
+    area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+#endif
+    if (area == MAP_FAILED) {
+	perror("file_ram_alloc: can't mmap RAM pages");
+	close(fd);
+	return (NULL);
+    }
+    return area;
+}
+#endif
+
 ram_addr_t qemu_ram_alloc(ram_addr_t size)
 {
     RAMBlock *new_block;
@@ -2536,16 +2629,28 @@ ram_addr_t qemu_ram_alloc(ram_addr_t siz
     size = TARGET_PAGE_ALIGN(size);
     new_block = qemu_malloc(sizeof(*new_block));
 
+    if (mem_path) {
+#if defined (__linux__) && !defined(TARGET_S390X)
+        new_block->host = file_ram_alloc(size, mem_path);
+        if (!new_block->host)
+            exit(1);
+#else
+        fprintf(stderr, "-mem-path option unsupported\n");
+        exit(1);
+#endif
+    } else {
 #if defined(TARGET_S390X) && defined(CONFIG_KVM)
-    /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */
-    new_block->host = mmap((void*)0x1000000, size, PROT_EXEC|PROT_READ|PROT_WRITE,
-                           MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+        /* XXX S390 KVM requires the topmost vma of the RAM to be < 256GB */
+        new_block->host = mmap((void*)0x1000000, size,
+                                PROT_EXEC|PROT_READ|PROT_WRITE,
+                                MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 #else
-    new_block->host = qemu_vmalloc(size);
+        new_block->host = qemu_vmalloc(size);
 #endif
 #ifdef MADV_MERGEABLE
-    madvise(new_block->host, size, MADV_MERGEABLE);
+        madvise(new_block->host, size, MADV_MERGEABLE);
 #endif
+    }
     new_block->offset = last_ram_offset;
     new_block->length = size;
 
Index: qemu/qemu-options.hx
===================================================================
--- qemu.orig/qemu-options.hx
+++ qemu/qemu-options.hx
@@ -314,6 +314,22 @@ a suffix of ``M'' or ``G'' can be used t
 gigabytes respectively.
 ETEXI
 
+DEF("mem-path", HAS_ARG, QEMU_OPTION_mempath,
+    "-mem-path FILE  provide backing storage for guest RAM\n")
+STEXI
+@item -mem-path @var{path}
+Allocate guest RAM from a temporarily created file in @var{path}.
+ETEXI
+
+#ifdef MAP_POPULATE
+DEF("mem-prealloc", 0, QEMU_OPTION_mem_prealloc,
+    "-mem-prealloc   preallocate guest memory (use with -mem-path)\n")
+STEXI
+@item -mem-prealloc
+Preallocate memory when using -mem-path.
+ETEXI
+#endif
+
 DEF("k", HAS_ARG, QEMU_OPTION_k,
     "-k language     use keyboard layout (for example 'fr' for French)\n")
 STEXI
Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c
+++ qemu/vl.c
@@ -185,6 +185,10 @@ enum vga_retrace_method vga_retrace_meth
 DisplayType display_type = DT_DEFAULT;
 const char* keyboard_layout = NULL;
 ram_addr_t ram_size;
+const char *mem_path = NULL;
+#ifdef MAP_POPULATE
+int mem_prealloc = 0; /* force preallocation of physical target memory */
+#endif
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int vm_running;
@@ -5216,6 +5220,14 @@ int main(int argc, char **argv, char **e
                 ram_size = value;
                 break;
             }
+            case QEMU_OPTION_mempath:
+                mem_path = optarg;
+                break;
+#ifdef MAP_POPULATE
+            case QEMU_OPTION_mem_prealloc:
+                mem_prealloc = 1;
+                break;
+#endif
             case QEMU_OPTION_d:
                 {
                     int mask;

  reply	other threads:[~2010-03-01 23:31 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-02-24 21:11 [patch uq/master 0/2] port qemu-kvm's -mem-path and -mem-prealloc to qemu Marcelo Tosatti
2010-02-24 21:11 ` [Qemu-devel] " Marcelo Tosatti
2010-02-24 21:11 ` [patch uq/master 1/2] Allocate memory below 4GB as one chunk Marcelo Tosatti
2010-02-24 21:11   ` [Qemu-devel] " Marcelo Tosatti
2010-02-25 13:33   ` Avi Kivity
2010-02-25 13:33     ` [Qemu-devel] " Avi Kivity
2010-02-24 21:11 ` [patch uq/master 2/2] Add option to use file backed guest memory Marcelo Tosatti
2010-02-24 21:11   ` [Qemu-devel] " Marcelo Tosatti
2010-02-28  1:28   ` Paul Brook
2010-02-28  1:28     ` Paul Brook
2010-03-01 23:25     ` Marcelo Tosatti [this message]
2010-03-01 23:25       ` Marcelo Tosatti
2010-03-01 23:32       ` Marcelo Tosatti
2010-03-01 23:32         ` Marcelo Tosatti
2010-02-25 13:32 ` [patch uq/master 0/2] port qemu-kvm's -mem-path and -mem-prealloc to qemu Avi Kivity
2010-02-25 13:32   ` [Qemu-devel] " Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100301232508.GA13703@amt.cnet \
    --to=mtosatti@redhat.com \
    --cc=avi@redhat.com \
    --cc=john.cooper@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=paul@codesourcery.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.