* [patch 0/3] QEMU balloon support
@ 2008-03-12 19:17 Marcelo Tosatti
2008-03-12 19:17 ` [patch 1/3] QEMU support for virtio balloon driver Marcelo Tosatti
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Marcelo Tosatti @ 2008-03-12 19:17 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
This patchset resends Anthony's QEMU balloon support plus:
- Truncates the target size to ram size
- Enables madvise() conditioned on KVM_ZAP_GFN ioctl
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 9+ messages in thread
* [patch 1/3] QEMU support for virtio balloon driver
2008-03-12 19:17 [patch 0/3] QEMU balloon support Marcelo Tosatti
@ 2008-03-12 19:17 ` Marcelo Tosatti
2008-03-12 19:17 ` [patch 2/3] QEMU: balloon: dont allow target larger than ram size Marcelo Tosatti
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Marcelo Tosatti @ 2008-03-12 19:17 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
[-- Attachment #1: qemu-balloon --]
[-- Type: text/plain, Size: 12508 bytes --]
From: Anthony Liguori <aliguori@us.ibm.com>
This patch adds support to QEMU for Rusty's recently introduce virtio balloon
driver. The user-facing portions of this are the introduction of a "balloon"
and "info balloon" command in the monitor.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Index: kvm-userspace.balloon/qemu/Makefile.target
===================================================================
--- kvm-userspace.balloon.orig/qemu/Makefile.target
+++ kvm-userspace.balloon/qemu/Makefile.target
@@ -577,7 +577,7 @@ OBJS += e1000.o
OBJS+= hypercall.o
# virtio devices
-OBJS += virtio.o virtio-net.o virtio-blk.o
+OBJS += virtio.o virtio-net.o virtio-blk.o virtio-balloon.o
ifeq ($(TARGET_BASE_ARCH), i386)
# Hardware support
Index: kvm-userspace.balloon/qemu/balloon.h
===================================================================
--- /dev/null
+++ kvm-userspace.balloon/qemu/balloon.h
@@ -0,0 +1,14 @@
+#ifndef _QEMU_BALLOON_H
+#define _QEMU_BALLOON_H
+
+#include "cpu-defs.h"
+
+typedef ram_addr_t (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque);
+
+void qemu_balloon(ram_addr_t target);
+
+ram_addr_t qemu_balloon_status(void);
+
+#endif
Index: kvm-userspace.balloon/qemu/hw/pc.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/hw/pc.c
+++ kvm-userspace.balloon/qemu/hw/pc.c
@@ -1120,6 +1120,9 @@ static void pc_init1(ram_addr_t ram_size
}
}
+ if (pci_enabled)
+ virtio_balloon_init(pci_bus, 0x1AF4, 0x1002);
+
if (extboot_drive != -1) {
DriveInfo *info = &drives_table[extboot_drive];
int cyls, heads, secs;
Index: kvm-userspace.balloon/qemu/hw/pc.h
===================================================================
--- kvm-userspace.balloon.orig/qemu/hw/pc.h
+++ kvm-userspace.balloon/qemu/hw/pc.h
@@ -153,6 +153,9 @@ void virtio_net_poll(void);
void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
BlockDriverState *bs);
+/* virtio-balloon.h */
+void *virtio_balloon_init(PCIBus *bus, uint16_t vendor, uint16_t device);
+
/* extboot.c */
void extboot_init(BlockDriverState *bs, int cmd);
Index: kvm-userspace.balloon/qemu/hw/virtio-balloon.c
===================================================================
--- /dev/null
+++ kvm-userspace.balloon/qemu/hw/virtio-balloon.c
@@ -0,0 +1,145 @@
+/*
+ * Virtio Block Device
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "virtio.h"
+#include "block.h"
+#include "pc.h"
+#include "balloon.h"
+#include "sysemu.h"
+
+#include <sys/mman.h>
+
+/* from Linux's linux/virtio_blk.h */
+
+/* The ID for virtio_balloon */
+#define VIRTIO_ID_BALLOON 5
+
+/* The feature bitmap for virtio balloon */
+#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */
+
+struct virtio_balloon_config
+{
+ /* Number of pages host wants Guest to give up. */
+ uint32_t num_pages;
+ /* Number of pages we've actually got in balloon. */
+ uint32_t actual;
+};
+
+typedef struct VirtIOBalloon
+{
+ VirtIODevice vdev;
+ VirtQueue *ivq, *dvq;
+ uint32_t num_pages;
+ uint32_t actual;
+} VirtIOBalloon;
+
+static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev)
+{
+ return (VirtIOBalloon *)vdev;
+}
+
+static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOBalloon *s = to_virtio_balloon(vdev);
+ VirtQueueElement elem;
+ unsigned int count;
+
+ while ((count = virtqueue_pop(vq, &elem)) != 0) {
+ int i;
+ unsigned int wlen = 0;
+
+ for (i = 0; i < elem.out_num; i++) {
+ int flags;
+ uint32_t *pfns = elem.out_sg[i].iov_base;
+ unsigned int n_pfns = elem.out_sg[i].iov_len / 4;
+ int j;
+
+ for (j = 0; j < n_pfns; j++) {
+ if (vq == s->dvq) /* deflate */
+ flags = MADV_WILLNEED;
+ else /* inflate */
+ flags = MADV_DONTNEED;
+
+#if 0
+ /* can't use this until we have mmu notifier support */
+ madvise(phys_ram_base + (pfns[j] << TARGET_PAGE_BITS),
+ TARGET_PAGE_SIZE, flags);
+#endif
+ }
+
+ wlen += elem.out_sg[i].iov_len;
+ }
+
+ virtqueue_push(vq, &elem, wlen);
+ virtio_notify(vdev, vq);
+ }
+}
+
+static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
+{
+ VirtIOBalloon *dev = to_virtio_balloon(vdev);
+ struct virtio_balloon_config config;
+
+ config.num_pages = dev->num_pages;
+ config.actual = dev->actual;
+
+ memcpy(config_data, &config, 8);
+}
+
+static void virtio_balloon_set_config(VirtIODevice *vdev,
+ const uint8_t *config_data)
+{
+ VirtIOBalloon *dev = to_virtio_balloon(vdev);
+ struct virtio_balloon_config config;
+ memcpy(&config, config_data, 8);
+ dev->actual = config.actual;
+}
+
+static uint32_t virtio_balloon_get_features(VirtIODevice *vdev)
+{
+ return 0;
+}
+
+static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t target)
+{
+ VirtIOBalloon *dev = opaque;
+
+ if (target) {
+ dev->num_pages = (ram_size - target) >> TARGET_PAGE_BITS;
+ virtio_notify_config(&dev->vdev);
+ }
+
+ return ram_size - (dev->actual << TARGET_PAGE_BITS);
+}
+
+void *virtio_balloon_init(PCIBus *bus, uint16_t vendor, uint16_t device)
+{
+ VirtIOBalloon *s;
+
+ s = (VirtIOBalloon *)virtio_init_pci(bus, "virtio-balloon",
+ vendor, device,
+ 0, VIRTIO_ID_BALLOON,
+ 0x05, 0x00, 0x00,
+ 8, sizeof(VirtIOBalloon));
+
+ s->vdev.get_config = virtio_balloon_get_config;
+ s->vdev.set_config = virtio_balloon_set_config;
+ s->vdev.get_features = virtio_balloon_get_features;
+
+ s->ivq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
+ s->dvq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
+
+ qemu_add_balloon_handler(virtio_balloon_to_target, s);
+
+ return &s->vdev;
+}
Index: kvm-userspace.balloon/qemu/hw/virtio-blk.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/hw/virtio-blk.c
+++ kvm-userspace.balloon/qemu/hw/virtio-blk.c
@@ -153,7 +153,7 @@ void *virtio_blk_init(PCIBus *bus, uint1
0x01, 0x80, 0x00,
16, sizeof(VirtIOBlock));
- s->vdev.update_config = virtio_blk_update_config;
+ s->vdev.get_config = virtio_blk_update_config;
s->vdev.get_features = virtio_blk_get_features;
s->bs = bs;
Index: kvm-userspace.balloon/qemu/hw/virtio-net.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/hw/virtio-net.c
+++ kvm-userspace.balloon/qemu/hw/virtio-net.c
@@ -288,7 +288,7 @@ void *virtio_net_init(PCIBus *bus, NICIn
0x02, 0x00, 0x00,
6, sizeof(VirtIONet));
- n->vdev.update_config = virtio_net_update_config;
+ n->vdev.get_config = virtio_net_update_config;
n->vdev.get_features = virtio_net_get_features;
n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
Index: kvm-userspace.balloon/qemu/hw/virtio.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/hw/virtio.c
+++ kvm-userspace.balloon/qemu/hw/virtio.c
@@ -307,6 +307,9 @@ static void virtio_config_writeb(void *o
return;
memcpy(vdev->config + addr, &val, sizeof(val));
+
+ if (vdev->set_config)
+ vdev->set_config(vdev, vdev->config);
}
static void virtio_config_writew(void *opaque, uint32_t addr, uint32_t data)
@@ -319,6 +322,9 @@ static void virtio_config_writew(void *o
return;
memcpy(vdev->config + addr, &val, sizeof(val));
+
+ if (vdev->set_config)
+ vdev->set_config(vdev, vdev->config);
}
static void virtio_config_writel(void *opaque, uint32_t addr, uint32_t data)
@@ -331,6 +337,9 @@ static void virtio_config_writel(void *o
return;
memcpy(vdev->config + addr, &val, sizeof(val));
+
+ if (vdev->set_config)
+ vdev->set_config(vdev, vdev->config);
}
static void virtio_map(PCIDevice *pci_dev, int region_num,
@@ -359,7 +368,7 @@ static void virtio_map(PCIDevice *pci_de
register_ioport_read(addr + 20, vdev->config_len, 4,
virtio_config_readl, vdev);
- vdev->update_config(vdev, vdev->config);
+ vdev->get_config(vdev, vdev->config);
}
}
@@ -383,6 +392,14 @@ VirtQueue *virtio_add_queue(VirtIODevice
return &vdev->vq[i];
}
+void virtio_notify_config(VirtIODevice *vdev)
+{
+ /* make sure we have the latest config */
+ vdev->get_config(vdev, vdev->config);
+ vdev->isr = 3;
+ virtio_update_irq(vdev);
+}
+
void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
{
/* Always notify when queue is empty */
Index: kvm-userspace.balloon/qemu/hw/virtio.h
===================================================================
--- kvm-userspace.balloon.orig/qemu/hw/virtio.h
+++ kvm-userspace.balloon/qemu/hw/virtio.h
@@ -118,7 +118,8 @@ struct VirtIODevice
void *config;
uint32_t (*get_features)(VirtIODevice *vdev);
void (*set_features)(VirtIODevice *vdev, uint32_t val);
- void (*update_config)(VirtIODevice *vdev, uint8_t *config);
+ void (*get_config)(VirtIODevice *vdev, uint8_t *config);
+ void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
VirtQueue vq[VIRTIO_PCI_QUEUE_MAX];
};
@@ -140,4 +141,6 @@ int virtqueue_pop(VirtQueue *vq, VirtQue
void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
+void virtio_notify_config(VirtIODevice *vdev);
+
#endif
Index: kvm-userspace.balloon/qemu/monitor.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/monitor.c
+++ kvm-userspace.balloon/qemu/monitor.c
@@ -35,6 +35,7 @@
#include "audio/audio.h"
#include "disas.h"
#include "migration.h"
+#include "balloon.h"
#include <dirent.h>
#include "qemu-kvm.h"
@@ -1281,6 +1282,23 @@ static void do_wav_capture (const char *
}
#endif
+static void do_balloon(int value)
+{
+ ram_addr_t target = value;
+ qemu_balloon(target << 20);
+}
+
+static void do_info_balloon(void)
+{
+ ram_addr_t actual;
+
+ actual = qemu_balloon_status();
+ if (actual == 0)
+ term_printf("Ballooning not activated in VM\n");
+ else
+ term_printf("balloon: actual=%d\n", (int)(actual >> 20));
+}
+
static term_cmd_t term_cmds[] = {
{ "help|?", "s?", do_help,
"[cmd]", "show the help" },
@@ -1359,6 +1377,8 @@ static term_cmd_t term_cmds[] = {
{ "migrate_set_speed", "s", do_migrate_set_speed,
"value", "set maximum speed (in bytes) for migrations" },
{ "cpu_set", "is", do_cpu_set_nr, "cpu [online|offline]", "change cpu state" },
+ { "balloon", "i", do_balloon,
+ "target", "request VM to change it's memory allocation (in MB)" },
{ NULL, NULL, },
};
@@ -1421,6 +1441,8 @@ static term_cmd_t info_cmds[] = {
#endif
{ "migration", "", do_info_migration,
"", "show migration information" },
+ { "balloon", "", do_info_balloon,
+ "", "show balloon information" },
{ NULL, NULL, },
};
Index: kvm-userspace.balloon/qemu/vl.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/vl.c
+++ kvm-userspace.balloon/qemu/vl.c
@@ -38,6 +38,7 @@
#include "block.h"
#include "audio/audio.h"
#include "migration.h"
+#include "balloon.h"
#include "qemu-kvm.h"
#include <unistd.h>
@@ -513,6 +514,31 @@ void hw_error(const char *fmt, ...)
abort();
}
+/***************/
+/* ballooning */
+
+static QEMUBalloonEvent *qemu_balloon_event;
+void *qemu_balloon_event_opaque;
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque)
+{
+ qemu_balloon_event = func;
+ qemu_balloon_event_opaque = opaque;
+}
+
+void qemu_balloon(ram_addr_t target)
+{
+ if (qemu_balloon_event)
+ qemu_balloon_event(qemu_balloon_event_opaque, target);
+}
+
+ram_addr_t qemu_balloon_status(void)
+{
+ if (qemu_balloon_event)
+ return qemu_balloon_event(qemu_balloon_event_opaque, 0);
+ return 0;
+}
+
/***********************************************************/
/* keyboard/mouse */
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 9+ messages in thread
* [patch 2/3] QEMU: balloon: dont allow target larger than ram size
2008-03-12 19:17 [patch 0/3] QEMU balloon support Marcelo Tosatti
2008-03-12 19:17 ` [patch 1/3] QEMU support for virtio balloon driver Marcelo Tosatti
@ 2008-03-12 19:17 ` Marcelo Tosatti
2008-03-12 19:17 ` [patch 3/3] QEMU: balloon: zap any shadow mappings to a page before madvise(MADV_DONTNEED) Marcelo Tosatti
2008-03-16 14:00 ` [patch 0/3] QEMU balloon support Avi Kivity
3 siblings, 0 replies; 9+ messages in thread
From: Marcelo Tosatti @ 2008-03-12 19:17 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti
[-- Attachment #1: larger-ram-size --]
[-- Type: text/plain, Size: 1064 bytes --]
Truncate the target size to be at most ram size.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm-userspace.balloon/qemu/hw/virtio-balloon.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/hw/virtio-balloon.c
+++ kvm-userspace.balloon/qemu/hw/virtio-balloon.c
@@ -16,6 +16,7 @@
#include "pc.h"
#include "balloon.h"
#include "sysemu.h"
+#include "console.h"
#include <sys/mman.h>
@@ -115,6 +116,10 @@ static ram_addr_t virtio_balloon_to_targ
VirtIOBalloon *dev = opaque;
if (target) {
+ if (target > ram_size) {
+ term_printf("target larger than ram size, truncating\n");
+ target = ram_size;
+ }
dev->num_pages = (ram_size - target) >> TARGET_PAGE_BITS;
virtio_notify_config(&dev->vdev);
}
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 9+ messages in thread
* [patch 3/3] QEMU: balloon: zap any shadow mappings to a page before madvise(MADV_DONTNEED)
2008-03-12 19:17 [patch 0/3] QEMU balloon support Marcelo Tosatti
2008-03-12 19:17 ` [patch 1/3] QEMU support for virtio balloon driver Marcelo Tosatti
2008-03-12 19:17 ` [patch 2/3] QEMU: balloon: dont allow target larger than ram size Marcelo Tosatti
@ 2008-03-12 19:17 ` Marcelo Tosatti
2008-03-16 14:00 ` [patch 0/3] QEMU balloon support Avi Kivity
3 siblings, 0 replies; 9+ messages in thread
From: Marcelo Tosatti @ 2008-03-12 19:17 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti
[-- Attachment #1: virtio-ball1 --]
[-- Type: text/plain, Size: 3799 bytes --]
Call into kvm to know whether we can madvise(MADV_DONTNEED) pages.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: kvm-userspace.balloon/libkvm/libkvm.c
===================================================================
--- kvm-userspace.balloon.orig/libkvm/libkvm.c
+++ kvm-userspace.balloon/libkvm/libkvm.c
@@ -840,6 +840,20 @@ int kvm_is_ready_for_interrupt_injection
return run->ready_for_interrupt_injection;
}
+int kvm_zap_single_gfn(kvm_context_t kvm, unsigned long gfn)
+{
+ int r = -1;
+
+#ifdef KVM_CAP_ZAP_GFN
+ r = ioctl(kvm->fd, KVM_CHECK_EXTENSION, KVM_CAP_ZAP_GFN);
+ if (r > 0)
+ r = ioctl(kvm->vm_fd, KVM_ZAP_GFN, &gfn);
+ else
+ r = -1;
+#endif
+ return r;
+}
+
int kvm_run(kvm_context_t kvm, int vcpu)
{
int r;
Index: kvm-userspace.balloon/libkvm/libkvm.h
===================================================================
--- kvm-userspace.balloon.orig/libkvm/libkvm.h
+++ kvm-userspace.balloon/libkvm/libkvm.h
@@ -422,6 +422,7 @@ int kvm_get_dirty_pages_range(kvm_contex
unsigned long end_addr, void *buf, void*opaque,
int (*cb)(unsigned long start, unsigned long len,
void*bitmap, void *opaque));
+int kvm_zap_single_gfn(kvm_context_t kvm, unsigned long gfn);
/*!
* \brief Create a memory alias
Index: kvm-userspace.balloon/qemu/hw/virtio-balloon.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/hw/virtio-balloon.c
+++ kvm-userspace.balloon/qemu/hw/virtio-balloon.c
@@ -17,6 +17,7 @@
#include "balloon.h"
#include "sysemu.h"
#include "console.h"
+#include "qemu-kvm.h"
#include <sys/mman.h>
@@ -60,22 +61,21 @@ static void virtio_balloon_handle_output
unsigned int wlen = 0;
for (i = 0; i < elem.out_num; i++) {
- int flags;
uint32_t *pfns = elem.out_sg[i].iov_base;
unsigned int n_pfns = elem.out_sg[i].iov_len / 4;
int j;
for (j = 0; j < n_pfns; j++) {
- if (vq == s->dvq) /* deflate */
- flags = MADV_WILLNEED;
- else /* inflate */
- flags = MADV_DONTNEED;
-
-#if 0
- /* can't use this until we have mmu notifier support */
- madvise(phys_ram_base + (pfns[j] << TARGET_PAGE_BITS),
- TARGET_PAGE_SIZE, flags);
-#endif
+ if (vq == s->dvq)
+ /* deflate */
+ madvise(phys_ram_base + (pfns[j] << TARGET_PAGE_BITS),
+ TARGET_PAGE_SIZE, MADV_WILLNEED);
+ else { /* inflate */
+ if (kvm_zap_gfn(pfns[j]) == 0)
+ madvise(phys_ram_base +
+ (pfns[j] << TARGET_PAGE_BITS),
+ TARGET_PAGE_SIZE, MADV_DONTNEED);
+ }
}
wlen += elem.out_sg[i].iov_len;
Index: kvm-userspace.balloon/qemu/qemu-kvm.c
===================================================================
--- kvm-userspace.balloon.orig/qemu/qemu-kvm.c
+++ kvm-userspace.balloon/qemu/qemu-kvm.c
@@ -65,6 +65,11 @@ static void sig_ipi_handler(int n)
{
}
+int kvm_zap_gfn(unsigned long gfn)
+{
+ return kvm_zap_single_gfn(kvm_context, gfn);
+}
+
void kvm_update_interrupt_request(CPUState *env)
{
if (env && vcpu && env != vcpu->env) {
Index: kvm-userspace.balloon/qemu/qemu-kvm.h
===================================================================
--- kvm-userspace.balloon.orig/qemu/qemu-kvm.h
+++ kvm-userspace.balloon/qemu/qemu-kvm.h
@@ -24,6 +24,7 @@ int kvm_qemu_init_env(CPUState *env);
int kvm_qemu_check_extension(int ext);
void kvm_apic_init(CPUState *env);
int kvm_set_irq(int irq, int level);
+int kvm_zap_gfn(unsigned long gfn);
int kvm_physical_memory_set_dirty_tracking(int enable);
int kvm_update_dirty_pages_log(void);
--
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 0/3] QEMU balloon support
2008-03-12 19:17 [patch 0/3] QEMU balloon support Marcelo Tosatti
` (2 preceding siblings ...)
2008-03-12 19:17 ` [patch 3/3] QEMU: balloon: zap any shadow mappings to a page before madvise(MADV_DONTNEED) Marcelo Tosatti
@ 2008-03-16 14:00 ` Avi Kivity
2008-03-16 18:59 ` Marcelo Tosatti
3 siblings, 1 reply; 9+ messages in thread
From: Avi Kivity @ 2008-03-16 14:00 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Marcelo Tosatti wrote:
> This patchset resends Anthony's QEMU balloon support plus:
>
> - Truncates the target size to ram size
> - Enables madvise() conditioned on KVM_ZAP_GFN ioctl
>
>
Once mmu notifiers are in, KVM_ZAP_GFN isn't needed. So we have three
possible situations:
- zap needed, but not available: don't madvise()
- zap needed and available: zap and madvise()
- zap unneeded: madvise()
Did you find out what's causing the errors in the first place (if zap is
not used)? It worries me greatly.
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 0/3] QEMU balloon support
2008-03-16 14:00 ` [patch 0/3] QEMU balloon support Avi Kivity
@ 2008-03-16 18:59 ` Marcelo Tosatti
2008-03-17 11:11 ` Avi Kivity
0 siblings, 1 reply; 9+ messages in thread
From: Marcelo Tosatti @ 2008-03-16 18:59 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
Hi Avi,
Good that you're back.
On Sun, Mar 16, 2008 at 04:00:06PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >This patchset resends Anthony's QEMU balloon support plus:
> >
> >- Truncates the target size to ram size
> >- Enables madvise() conditioned on KVM_ZAP_GFN ioctl
> >
> >
>
> Once mmu notifiers are in, KVM_ZAP_GFN isn't needed. So we have three
> possible situations:
>
> - zap needed, but not available: don't madvise()
> - zap needed and available: zap and madvise()
> - zap unneeded: madvise()
Right. That is what the patch does. You just have to fill in
"have_mmu_notifiers" here:
+int kvm_zap_single_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned long addr;
+ int have_mmu_notifiers = 0;
+
+ down_read(&kvm->slots_lock);
+ addr = gfn_to_hva(kvm, gfn);
+
+ if (kvm_is_error_hva(addr)) {
+ up_read(&kvm->slots_lock);
+ return -EINVAL;
+ }
+
+ if (!have_mmu_notifiers) {
+ spin_lock(&kvm->mmu_lock);
+ rmap_nuke(kvm, gfn);
+ spin_unlock(&kvm->mmu_lock);
+ }
+ up_read(&kvm->slots_lock);
So as to avoid rmap_nuke, since that will be done through the madvise()
path.
> Did you find out what's causing the errors in the first place (if zap is
> not used)? It worries me greatly.
Yes, the problem is that the rmap code does not handle the qemu process
mappings from vanishing while there is a present rmap. If that happens,
and there is a fault for a gfn whose qemu mapping has been removed, a
different physical zero page will be allocated:
rmap a -> gfn 0 -> physical host page 0
mapping for gfn 0 gets removed
guest faults in gfn 0 through the same pte "chain"
rmap a -> gfn 0 -> physical host page 1
When instantiating the shadow mapping for the second time, the
"is_rmap_pte" check succeeds, so we release the reference grabbed by
gfn_to_page() at mmu_set_spte(). We now have a shadow mapping pointing
to a physical page without having an additional reference on that page.
The following makes the host not crash under such a condition, but the
condition itself is invalid leading to inconsistent state on the guest.
So IMHO it shouldnt be allowed to happen in the first place.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f0cdfba..4c93b79 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1009,6 +1009,21 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t
+gva)
return page;
}
+static int was_spte_rmapped(struct kvm *kvm, u64 *spte, struct page *page)
+{
+ int ret = 0;
+ unsigned long host_pfn = (*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ if (is_rmap_pte(*spte)) {
+ if (host_pfn != page_to_pfn(page))
+ rmap_remove(kvm, spte);
+ else
+ ret = 1;
+ }
+
+ return ret;
+}
+
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
@@ -1016,7 +1031,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64
+*shadow_pte,
struct page *page)
{
u64 spte;
- int was_rmapped = is_rmap_pte(*shadow_pte);
+ int was_rmapped = was_spte_rmapped(vcpu->kvm, shadow_pte, page);
int was_writeble = is_writeble_pte(*shadow_pte);
/*
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [patch 0/3] QEMU balloon support
2008-03-16 18:59 ` Marcelo Tosatti
@ 2008-03-17 11:11 ` Avi Kivity
2008-03-17 13:08 ` Marcelo Tosatti
0 siblings, 1 reply; 9+ messages in thread
From: Avi Kivity @ 2008-03-17 11:11 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Marcelo Tosatti wrote:
> Hi Avi,
>
> Good that you're back.
>
> On Sun, Mar 16, 2008 at 04:00:06PM +0200, Avi Kivity wrote:
>
>> Marcelo Tosatti wrote:
>>
>>> This patchset resends Anthony's QEMU balloon support plus:
>>>
>>> - Truncates the target size to ram size
>>> - Enables madvise() conditioned on KVM_ZAP_GFN ioctl
>>>
>>>
>>>
>> Once mmu notifiers are in, KVM_ZAP_GFN isn't needed. So we have three
>> possible situations:
>>
>> - zap needed, but not available: don't madvise()
>> - zap needed and available: zap and madvise()
>> - zap unneeded: madvise()
>>
>
> Right. That is what the patch does. You just have to fill in
> "have_mmu_notifiers" here:
>
> +int kvm_zap_single_gfn(struct kvm *kvm, gfn_t gfn)
> +{
> + unsigned long addr;
> + int have_mmu_notifiers = 0;
> +
> + down_read(&kvm->slots_lock);
> + addr = gfn_to_hva(kvm, gfn);
> +
> + if (kvm_is_error_hva(addr)) {
> + up_read(&kvm->slots_lock);
> + return -EINVAL;
> + }
> +
> + if (!have_mmu_notifiers) {
> + spin_lock(&kvm->mmu_lock);
> + rmap_nuke(kvm, gfn);
> + spin_unlock(&kvm->mmu_lock);
> + }
> + up_read(&kvm->slots_lock);
>
> So as to avoid rmap_nuke, since that will be done through the madvise()
> path.
>
>
Why not do it in userspace?
I'll likely merge zap directly into the backwards compatibility support
(it won't ever appear in mainline since mmu notifiers will be merged
sooner).
>> Did you find out what's causing the errors in the first place (if zap is
>> not used)? It worries me greatly.
>>
>
> Yes, the problem is that the rmap code does not handle the qemu process
> mappings from vanishing while there is a present rmap. If that happens,
> and there is a fault for a gfn whose qemu mapping has been removed, a
> different physical zero page will be allocated:
>
> rmap a -> gfn 0 -> physical host page 0
> mapping for gfn 0 gets removed
> guest faults in gfn 0 through the same pte "chain"
> rmap a -> gfn 0 -> physical host page 1
>
> When instantiating the shadow mapping for the second time, the
> "is_rmap_pte" check succeeds, so we release the reference grabbed by
> gfn_to_page() at mmu_set_spte(). We now have a shadow mapping pointing
> to a physical page without having an additional reference on that page.
>
> The following makes the host not crash under such a condition, but the
> condition itself is invalid leading to inconsistent state on the guest.
> So IMHO it shouldnt be allowed to happen in the first place.
>
>
The only way to prevent it is with mmu notifiers, which we may not have
for 2.6.25. So I'd like to send this for 2.6.25-rc.
Please add a signoff.
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index f0cdfba..4c93b79 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1009,6 +1009,21 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t
> +gva)
> return page;
> }
>
> +static int was_spte_rmapped(struct kvm *kvm, u64 *spte, struct page *page)
> +{
> + int ret = 0;
> + unsigned long host_pfn = (*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
>
hfn_t hfn
> +
> + if (is_rmap_pte(*spte)) {
> + if (host_pfn != page_to_pfn(page))
> + rmap_remove(kvm, spte);
> + else
> + ret = 1;
> + }
> +
> + return ret;
> +}
> +
> static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
> unsigned pt_access, unsigned pte_access,
> int user_fault, int write_fault, int dirty,
> @@ -1016,7 +1031,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64
> +*shadow_pte,
> struct page *page)
> {
> u64 spte;
> - int was_rmapped = is_rmap_pte(*shadow_pte);
> + int was_rmapped = was_spte_rmapped(vcpu->kvm, shadow_pte, page);
> int was_writeble = is_writeble_pte(*shadow_pte);
>
Calling code with side effects in an initializer is a bit confusing. I
suggest simply inlining the code into mmu_set_spte().
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch 0/3] QEMU balloon support
2008-03-17 11:11 ` Avi Kivity
@ 2008-03-17 13:08 ` Marcelo Tosatti
2008-03-17 14:56 ` Avi Kivity
0 siblings, 1 reply; 9+ messages in thread
From: Marcelo Tosatti @ 2008-03-17 13:08 UTC (permalink / raw)
To: Avi Kivity; +Cc: Marcelo Tosatti, kvm-devel
On Mon, Mar 17, 2008 at 01:11:11PM +0200, Avi Kivity wrote:
> >+ up_read(&kvm->slots_lock);
> >
> >So as to avoid rmap_nuke, since that will be done through the madvise()
> >path.
> >
> >
>
> Why not do it in userspace?
I don't see any way of knowing whether you have or not mmu notifiers from
userspace except adding an ioctl.
> I'll likely merge zap directly into the backwards compatibility support
> (it won't ever appear in mainline since mmu notifiers will be merged
> sooner).
Even with mmu notifiers QEMU needs to ask the host kernel "do you have
mmu notifiers?" before zapping any page, otherwise the guest will crash.
So some method of finding if the host kernel has mmu notifiers needs to
be in place even in mainline.
And what about allocation of ioctl numbers? Will you reserve the number
used for ZAP_GFN on mainline?
> >>Did you find out what's causing the errors in the first place (if zap is
> >>not used)? It worries me greatly.
> >>
> >
> >Yes, the problem is that the rmap code does not handle the qemu process
> >mappings from vanishing while there is a present rmap. If that happens,
> >and there is a fault for a gfn whose qemu mapping has been removed, a
> >different physical zero page will be allocated:
> >
> > rmap a -> gfn 0 -> physical host page 0
> > mapping for gfn 0 gets removed
> > guest faults in gfn 0 through the same pte "chain"
> > rmap a -> gfn 0 -> physical host page 1
> >
> >When instantiating the shadow mapping for the second time, the
> >"is_rmap_pte" check succeeds, so we release the reference grabbed by
> >gfn_to_page() at mmu_set_spte(). We now have a shadow mapping pointing
> >to a physical page without having an additional reference on that page.
> >
> >The following makes the host not crash under such a condition, but the
> >condition itself is invalid leading to inconsistent state on the guest.
> >So IMHO it shouldnt be allowed to happen in the first place.
> >
> >
>
> The only way to prevent it is with mmu notifiers, which we may not have
> for 2.6.25. So I'd like to send this for 2.6.25-rc.
Well, you can prevent it by never allowing a page from being removed while
there are shadow mappings for it. So while this fixes the host crash, it
won't fix the guest crash.
KVM: MMU: handle page removal with shadow mapping
Do not assume that a shadow mapping will always point to the same host
frame number. Fixes crash with madvise(MADV_DONTNEED).
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f0cdfba..4caa1a0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1016,8 +1016,16 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
struct page *page)
{
u64 spte;
- int was_rmapped = is_rmap_pte(*shadow_pte);
+ int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
+ hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ if (is_rmap_pte(*shadow_pte)) {
+ if (host_pfn != page_to_pfn(page))
+ rmap_remove(vcpu->kvm, shadow_pte);
+ else
+ was_rmapped = 1;
+ }
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [patch 0/3] QEMU balloon support
2008-03-17 13:08 ` Marcelo Tosatti
@ 2008-03-17 14:56 ` Avi Kivity
0 siblings, 0 replies; 9+ messages in thread
From: Avi Kivity @ 2008-03-17 14:56 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel
Marcelo Tosatti wrote:
> On Mon, Mar 17, 2008 at 01:11:11PM +0200, Avi Kivity wrote:
>
>
>>> + up_read(&kvm->slots_lock);
>>>
>>> So as to avoid rmap_nuke, since that will be done through the madvise()
>>> path.
>>>
>>>
>>>
>> Why not do it in userspace?
>>
>
> I don't see any way of knowing whether you have or not mmu notifiers from
> userspace except adding an ioctl.
>
>
KVM_CHECK_CAPABILITY (KVM_CAP_MMU_NOTIFIERS)
>> I'll likely merge zap directly into the backwards compatibility support
>> (it won't ever appear in mainline since mmu notifiers will be merged
>> sooner).
>>
>
> Even with mmu notifiers QEMU needs to ask the host kernel "do you have
> mmu notifiers?" before zapping any page, otherwise the guest will crash.
>
>
If we drop the shadow ptes explicitly, it shouldn't crash?
> So some method of finding if the host kernel has mmu notifiers needs to
> be in place even in mainline.
>
>
Yes, the capability test.
> And what about allocation of ioctl numbers? Will you reserve the number
> used for ZAP_GFN on mainline?
>
>
We can allocate those from the end and hope they never meet.
> KVM: MMU: handle page removal with shadow mapping
>
> Do not assume that a shadow mapping will always point to the same host
> frame number. Fixes crash with madvise(MADV_DONTNEED).
>
>
Applied, thanks.
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2008-03-17 14:56 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-12 19:17 [patch 0/3] QEMU balloon support Marcelo Tosatti
2008-03-12 19:17 ` [patch 1/3] QEMU support for virtio balloon driver Marcelo Tosatti
2008-03-12 19:17 ` [patch 2/3] QEMU: balloon: dont allow target larger than ram size Marcelo Tosatti
2008-03-12 19:17 ` [patch 3/3] QEMU: balloon: zap any shadow mappings to a page before madvise(MADV_DONTNEED) Marcelo Tosatti
2008-03-16 14:00 ` [patch 0/3] QEMU balloon support Avi Kivity
2008-03-16 18:59 ` Marcelo Tosatti
2008-03-17 11:11 ` Avi Kivity
2008-03-17 13:08 ` Marcelo Tosatti
2008-03-17 14:56 ` Avi Kivity
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox