* [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
@ 2008-04-15 22:11 Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 2/5] virtio for QEMU (v3) Anthony Liguori
` (4 more replies)
0 siblings, 5 replies; 13+ messages in thread
From: Anthony Liguori @ 2008-04-15 22:11 UTC (permalink / raw)
To: qemu-devel
Cc: Anthony Liguori, kvm-devel, Marcelo Tosatti, Paul Brook,
Aurelien Jarno
This patch introduces a DMA API and plumbs support through the DMA layer. We
use a mostly opaque structure, IOVector to represent a scatter/gather list of
physical memory. Associated with each IOVector is a read/write function and
an opaque pointer. This allows arbitrary transformation/mapping of the
data while providing an easy mechanism to short-cut the zero-copy case
in the block/net backends.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/Makefile b/Makefile
index adb50a8..a8df278 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ recurse-all: $(patsubst %,subdir-%, $(TARGET_DIRS))
#######################################################################
# BLOCK_OBJS is code used by both qemu system emulation and qemu-img
-BLOCK_OBJS=cutils.o
+BLOCK_OBJS=cutils.o iovector.o
BLOCK_OBJS+=block-cow.o block-qcow.o aes.o block-vmdk.o block-cloop.o
BLOCK_OBJS+=block-dmg.o block-bochs.o block-vpc.o block-vvfat.o
BLOCK_OBJS+=block-qcow2.o block-parallels.o
diff --git a/block.c b/block.c
index 0730954..eb610e0 100644
--- a/block.c
+++ b/block.c
@@ -570,6 +570,55 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
}
}
+/* Scatter/gather read: read the whole extent described by 'iovec' from
+ * 'bs' starting at 'sector_num', via a temporary linear bounce buffer.
+ * Returns bdrv_read()'s result, or -ENOMEM if the bounce buffer cannot
+ * be allocated. */
+int bdrv_readv(BlockDriverState *bs, int64_t sector_num,
+ IOVector *iovec)
+{
+ char *buffer;
+ size_t size;
+ int ret;
+
+ /* it's possible that we'll see an OOM condition here if the transfer size
+ * is sufficiently large.
+ */
+ size = iovector_size(iovec);
+ buffer = qemu_malloc(size);
+ if (buffer == NULL)
+ return -ENOMEM;
+
+ /* NOTE(review): size / 512 truncates; if the vector's total length is
+ * not a multiple of the 512-byte sector size the tail bytes are never
+ * read -- confirm callers always pass sector-aligned vectors. */
+ ret = bdrv_read(bs, sector_num, buffer, size / 512);
+
+ if (ret >= 0)
+ memcpy_to_iovector(iovec, 0, size, buffer);
+
+ qemu_free(buffer);
+
+ return ret;
+}
+
+/* Scatter/gather write: gather the contents of 'iovec' into a temporary
+ * linear bounce buffer and write it to 'bs' starting at 'sector_num'.
+ * Returns bdrv_write()'s result, or -ENOMEM if the bounce buffer cannot
+ * be allocated. */
+int bdrv_writev(BlockDriverState *bs, int64_t sector_num,
+ const IOVector *iovec)
+{
+ char *buffer;
+ size_t size;
+ int ret;
+
+ /* it's possible that we'll see an OOM condition here if the transfer size
+ * is sufficiently large.
+ */
+ size = iovector_size(iovec);
+ buffer = qemu_malloc(size);
+ if (buffer == NULL)
+ return -ENOMEM;
+
+ memcpy_from_iovector(buffer, 0, size, iovec);
+
+ /* NOTE(review): size / 512 truncates; a vector whose total length is not
+ * a multiple of 512 silently loses its tail -- confirm callers. */
+ ret = bdrv_write(bs, sector_num, buffer, size / 512);
+
+ qemu_free(buffer);
+
+ return ret;
+}
+
static int bdrv_pread_em(BlockDriverState *bs, int64_t offset,
uint8_t *buf, int count1)
{
diff --git a/block.h b/block.h
index b730505..9d30db2 100644
--- a/block.h
+++ b/block.h
@@ -1,6 +1,8 @@
#ifndef BLOCK_H
#define BLOCK_H
+#include "iovector.h"
+
/* block.c */
typedef struct BlockDriver BlockDriver;
@@ -67,6 +69,9 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors);
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors);
+int bdrv_readv(BlockDriverState *bs, int64_t sector_num, IOVector *iovec);
+int bdrv_writev(BlockDriverState *bs, int64_t sector_num,
+ const IOVector *iovec);
int bdrv_pread(BlockDriverState *bs, int64_t offset,
void *buf, int count);
int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
diff --git a/hw/pci.c b/hw/pci.c
index bc55989..3282478 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -145,6 +145,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
return 0;
}
+/* DMA write on behalf of PCI device 's': copy 'len' bytes from 'buffer'
+ * into guest-physical memory at 'addr'. Currently a straight pass-through
+ * to cpu_physical_memory_write(); 's' is unused for now (hook point for
+ * future per-device address translation). */
+void pci_device_dma_write(PCIDevice *s, target_phys_addr_t addr,
+ const void *buffer, size_t len)
+{
+ cpu_physical_memory_write(addr, buffer, len);
+}
+
+/* DMA read on behalf of PCI device 's': copy 'len' bytes from
+ * guest-physical memory at 'addr' into 'buffer'. Pass-through to
+ * cpu_physical_memory_read(); 's' is unused for now. */
+void pci_device_dma_read(PCIDevice *s, target_phys_addr_t addr,
+ void *buffer, size_t len)
+{
+ cpu_physical_memory_read(addr, buffer, len);
+}
+
/* -1 for devfn means auto assign */
PCIDevice *pci_register_device(PCIBus *bus, const char *name,
int instance_size, int devfn,
diff --git a/hw/pci.h b/hw/pci.h
index e870987..c885cc5 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -81,6 +81,12 @@ void pci_default_write_config(PCIDevice *d,
void pci_device_save(PCIDevice *s, QEMUFile *f);
int pci_device_load(PCIDevice *s, QEMUFile *f);
+void pci_device_dma_write(PCIDevice *s, target_phys_addr_t addr,
+ const void *buffer, size_t len);
+
+void pci_device_dma_read(PCIDevice *s, target_phys_addr_t addr,
+ void *buffer, size_t len);
+
typedef void (*pci_set_irq_fn)(qemu_irq *pic, int irq_num, int level);
typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
PCIBus *pci_register_bus(pci_set_irq_fn set_irq, pci_map_irq_fn map_irq,
diff --git a/iovector.c b/iovector.c
new file mode 100644
index 0000000..056a86e
--- /dev/null
+++ b/iovector.c
@@ -0,0 +1,144 @@
+/*
+ * IO Vectors
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "iovector.h"
+
+/* Walk every element of 'iov' and transfer at most 'size' bytes between
+ * the flat 'buffer' and the vector's backing store via the vector's
+ * read/write handlers. 'read' nonzero means iov->read() fills the buffer
+ * from the vector; zero means iov->write() stores the buffer into it.
+ * Returns the number of bytes transferred. Once 'size' is exhausted,
+ * remaining elements are visited with len == 0 (harmless no-ops). */
+static size_t iovector_rw(void *buffer, size_t size, IOVector *iov, int read)
+{
+ uint8_t *ptr = buffer;
+ size_t offset = 0;
+ int i;
+
+ for (i = 0; i < iov->num; i++) {
+ size_t len;
+
+ /* clamp to whatever budget is left of 'size' */
+ len = MIN(iov->sg[i].len, size - offset);
+
+ if (read)
+ iov->read(iov->opaque, iov->sg[i].base, ptr + offset, len);
+ else
+ iov->write(iov->opaque, iov->sg[i].base, ptr + offset, len);
+
+ offset += len;
+ }
+
+ return offset;
+}
+
+/* Copy up to 'size' bytes, starting at byte 'offset' into the vector,
+ * from 'iov' into the flat buffer 'buffer'. Returns the number of bytes
+ * actually copied (0 on allocation failure of the trimmed sub-vector). */
+size_t memcpy_from_iovector(void *buffer, size_t offset, size_t size,
+ const IOVector *iov)
+{
+ IOVector *sg;
+ size_t len;
+
+ if (offset) {
+ sg = iovector_trim(iov, offset, size);
+ /* iovector_trim() allocates and returns NULL on OOM; it must not
+ * be handed to iovector_rw(). */
+ if (sg == NULL)
+ return 0;
+ } else
+ sg = (IOVector *)iov;
+
+ len = iovector_rw(buffer, size, sg, 1);
+
+ if (offset)
+ iovector_free(sg);
+
+ return len;
+}
+
+/* Copy up to 'size' bytes from the flat buffer 'buffer' into 'iovec',
+ * starting at byte 'offset' into the vector. Returns the number of bytes
+ * actually copied (0 on allocation failure of the trimmed sub-vector). */
+size_t memcpy_to_iovector(IOVector *iovec, size_t offset, size_t size,
+ const void *buffer)
+{
+ IOVector *sg;
+ size_t len;
+
+ if (offset) {
+ sg = iovector_trim(iovec, offset, size);
+ /* iovector_trim() allocates and returns NULL on OOM; it must not
+ * be handed to iovector_rw(). */
+ if (sg == NULL)
+ return 0;
+ } else
+ sg = iovec;
+
+ len = iovector_rw((void *)buffer, size, sg, 0);
+
+ if (offset)
+ iovector_free(sg);
+
+ return len;
+}
+
+/* Allocate an IOVector with room for 'num' scatter/gather elements and
+ * the given DMA read/write handlers plus their opaque cookie. The caller
+ * owns the result and must release it with iovector_free(). Returns NULL
+ * on allocation failure. Note iov->num is set to 'num'; callers that
+ * fill the vector incrementally reset it to 0 first. */
+IOVector *iovector_new(int num, DMAReadHandler *read, DMAWriteHandler *write,
+ void *opaque)
+{
+ IOVector *ret;
+
+ ret = qemu_malloc(sizeof(IOVector) + sizeof(IOVectorElement) * num);
+ if (ret == NULL)
+ return NULL;
+
+ ret->num = num;
+ ret->read = read;
+ ret->write = write;
+ ret->opaque = opaque;
+
+ return ret;
+}
+
+/* Release an IOVector previously returned by iovector_new() or
+ * iovector_trim(). Safe with NULL if qemu_free() accepts NULL. */
+void iovector_free(IOVector *iov)
+{
+ qemu_free(iov);
+}
+
+/* Build a new IOVector covering the byte range [offset, offset + size)
+ * of 'iov'. The first overlapping element is entered with its base
+ * advanced by 'fudge' bytes; subsequent elements are copied whole until
+ * 'size' bytes are covered. Returns NULL on allocation failure; the
+ * caller frees the result with iovector_free(). */
+IOVector *iovector_trim(const IOVector *iov, size_t offset, size_t size)
+{
+ IOVector *ret;
+ size_t off, total_size;
+ int i;
+
+ /* allocate for the worst case (every source element retained) */
+ ret = iovector_new(iov->num, iov->read, iov->write, iov->opaque);
+ if (ret == NULL)
+ return NULL;
+
+ total_size = 0;
+ ret->num = 0;
+
+ off = 0;
+ for (i = 0; i < iov->num; i++) {
+ /* include this element if we are already past 'offset', or if
+ * 'offset' falls inside it (off < offset < off + len) */
+ if (off >= offset || offset < (off + iov->sg[i].len)) {
+ size_t fudge = 0;
+ if (off < offset)
+ fudge = offset - off;
+
+ ret->sg[ret->num].base = iov->sg[i].base + fudge;
+ ret->sg[ret->num].len = MIN(iov->sg[i].len - fudge,
+ size - total_size);
+ total_size += ret->sg[ret->num].len;
+ ret->num++;
+
+ if (total_size == size)
+ break;
+ }
+
+ off += iov->sg[i].len;
+ }
+
+ return ret;
+}
+
+/* Return the total length of 'iov' in bytes (sum of all element lengths). */
+size_t iovector_size(const IOVector *iov)
+{
+ size_t size = 0;
+ int i;
+
+ for (i = 0; i < iov->num; i++)
+ size += iov->sg[i].len;
+
+ return size;
+}
diff --git a/iovector.h b/iovector.h
new file mode 100644
index 0000000..f40f0a0
--- /dev/null
+++ b/iovector.h
@@ -0,0 +1,63 @@
+/*
+ * IO Vectors
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_IOVECTOR_H
+#define _QEMU_IOVECTOR_H
+
+typedef struct IOVectorElement IOVectorElement;
+
+typedef void (DMAReadHandler)(void *opaque, uint64_t addr,
+ void *data, size_t len);
+
+typedef void (DMAWriteHandler)(void *opaque, uint64_t addr,
+ const void *data, size_t len);
+
+typedef struct IOVector
+{
+ DMAWriteHandler *write;
+ DMAReadHandler *read;
+ void *opaque;
+
+ int num;
+ struct IOVectorElement {
+ uint64_t base;
+ size_t len;
+ } sg[0];
+} IOVector;
+
+/* Copy from an IOVector to a flat buffer. Be careful to pass in a fully
+ * translated IOVector here. */
+size_t memcpy_from_iovector(void *buffer, size_t offset, size_t size,
+ const IOVector *iov);
+
+/* Copy to an IOVector from a flat buffer. Be careful to pass in a fully
+ * translated IOVector here. */
+size_t memcpy_to_iovector(IOVector *iovec, size_t offset, size_t size,
+ const void *buffer);
+
+/* Return a new IOVector that's a subset of the passed in IOVector. It should
+ * be freed with iovector_free when you are done with it. */
+IOVector *iovector_trim(const IOVector *iov, size_t offset, size_t size);
+
+/* Returns the size of an IOVector in bytes */
+size_t iovector_size(const IOVector *iov);
+
+/* Returns a new IOVector with num elements. iov->num will be set to num on
+ * return */
+IOVector *iovector_new(int num, DMAReadHandler *read, DMAWriteHandler *write,
+ void *opaque);
+
+/* Frees an IOVector */
+void iovector_free(IOVector *iov);
+
+#endif
diff --git a/net.h b/net.h
index 2dfff8d..0b3a155 100644
--- a/net.h
+++ b/net.h
@@ -1,6 +1,8 @@
#ifndef QEMU_NET_H
#define QEMU_NET_H
+#include "iovector.h"
+
/* VLANs support */
typedef struct VLANClientState VLANClientState;
@@ -30,6 +32,7 @@ VLANClientState *qemu_new_vlan_client(VLANState *vlan,
void *opaque);
int qemu_can_send_packet(VLANClientState *vc);
void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size);
+void qemu_sendv_packet(VLANClientState *vc, const IOVector *iovec);
void qemu_handler_true(void *opaque);
void do_info_network(void);
diff --git a/vl.c b/vl.c
index 318eb35..821c05d 100644
--- a/vl.c
+++ b/vl.c
@@ -3731,6 +3731,22 @@ void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size)
}
}
+/* Scatter/gather transmit: flatten 'iovec' into a temporary buffer and
+ * hand it to qemu_send_packet(). NOTE(review): on allocation failure the
+ * packet is silently dropped -- acceptable for lossy network semantics,
+ * but worth confirming callers do not rely on delivery. */
+void qemu_sendv_packet(VLANClientState *vc, const IOVector *iovec)
+{
+ size_t size;
+ uint8_t *data;
+
+ size = iovector_size(iovec);
+ data = qemu_malloc(size);
+ if (data == NULL)
+ return;
+
+ memcpy_from_iovector(data, 0, size, iovec);
+ qemu_send_packet(vc, data, size);
+
+ qemu_free(data);
+}
+
#if defined(CONFIG_SLIRP)
/* slirp network adapter */
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 2/5] virtio for QEMU (v3)
2008-04-15 22:11 [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Anthony Liguori
@ 2008-04-15 22:11 ` Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 3/5] virtio network driver (v3) Anthony Liguori
` (3 subsequent siblings)
4 siblings, 0 replies; 13+ messages in thread
From: Anthony Liguori @ 2008-04-15 22:11 UTC (permalink / raw)
To: qemu-devel
Cc: Anthony Liguori, kvm-devel, Marcelo Tosatti, Paul Brook,
Aurelien Jarno
This patch introduces virtio support over PCI. virtio is a generic virtual IO
framework for Linux first introduced in 2.6.23. Since 2.6.25, virtio has
supported a PCI transport which this patch implements.
Since the last time these patches were posted to qemu-devel, I've reworked it
to use the proper access functions to manipulate guest memory.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/Makefile.target b/Makefile.target
index 8470164..3e9f7b1 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -534,6 +534,9 @@ OBJS += pcnet.o
OBJS += rtl8139.o
OBJS += e1000.o
+# virtio devices
+OBJS += virtio.o
+
ifeq ($(TARGET_BASE_ARCH), i386)
# Hardware support
OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
diff --git a/cutils.c b/cutils.c
index 9ef2fa6..814b3c4 100644
--- a/cutils.c
+++ b/cutils.c
@@ -95,3 +95,14 @@ time_t mktimegm(struct tm *tm)
t += 3600 * tm->tm_hour + 60 * tm->tm_min + tm->tm_sec;
return t;
}
+
+/* Find last (most significant) set bit: returns bit position + 1,
+ * or 0 if no bits are set (traditional BSD/Linux fls() semantics). */
+int fls(int i)
+{
+ int bit;
+
+ /* Use an unsigned constant: "1 << 31" overflows a signed 32-bit int,
+ * which is undefined behaviour in C. */
+ for (bit = 31; bit >= 0; bit--)
+ if (i & (1U << bit))
+ return bit + 1;
+
+ return 0;
+}
diff --git a/hw/virtio-pci.h b/hw/virtio-pci.h
new file mode 100644
index 0000000..9262e49
--- /dev/null
+++ b/hw/virtio-pci.h
@@ -0,0 +1,65 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _VIRTIO_PCI_H
+#define _VIRTIO_PCI_H
+
+/* from Linux's linux/virtio_ring.h */
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT 1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE 2
+
+/* This means don't notify other side when buffer added. */
+#define VRING_USED_F_NO_NOTIFY 1
+/* This means don't interrupt guest when buffer consumed. */
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+#define VIRTIO_PCI_QUEUE_MAX 16
+
+/* from Linux's linux/virtio_pci.h */
+
+/* A 32-bit r/o bitmask of the features supported by the host */
+#define VIRTIO_PCI_HOST_FEATURES 0
+
+/* A 32-bit r/w bitmask of features activated by the guest */
+#define VIRTIO_PCI_GUEST_FEATURES 4
+
+/* A 32-bit r/w PFN for the currently selected queue */
+#define VIRTIO_PCI_QUEUE_PFN 8
+
+/* A 16-bit r/o queue size for the currently selected queue */
+#define VIRTIO_PCI_QUEUE_NUM 12
+
+/* A 16-bit r/w queue selector */
+#define VIRTIO_PCI_QUEUE_SEL 14
+
+/* A 16-bit r/w queue notifier */
+#define VIRTIO_PCI_QUEUE_NOTIFY 16
+
+/* An 8-bit device status register. */
+#define VIRTIO_PCI_STATUS 18
+
+/* An 8-bit r/o interrupt status register. Reading the value will return the
+ * current contents of the ISR and will also clear it. This is effectively
+ * a read-and-acknowledge. */
+#define VIRTIO_PCI_ISR 19
+
+#define VIRTIO_PCI_CONFIG 20
+
+/* Virtio ABI version, if we increment this, we break the guest driver. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+#endif
diff --git a/hw/virtio.c b/hw/virtio.c
new file mode 100644
index 0000000..f364ef3
--- /dev/null
+++ b/hw/virtio.c
@@ -0,0 +1,592 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <inttypes.h>
+#include <err.h>
+
+#include "virtio.h"
+#include "sysemu.h"
+#include "virtio-pci.h"
+
+typedef struct VRingDesc
+{
+ /* Address (guest-physical). */
+ uint64_t addr;
+ /* Length. */
+ uint32_t len;
+ /* The flags as indicated above. */
+ uint16_t flags;
+ /* We chain unused descriptors via this, too */
+ uint16_t next;
+} VRingDesc;
+
+typedef struct VRingAvail
+{
+ uint16_t flags;
+ uint16_t idx;
+ uint16_t ring[];
+} VRingAvail;
+
+typedef struct VRingUsedElem
+{
+ /* Index of start of used descriptor chain. */
+ uint32_t id;
+ /* Total length of the descriptor chain which was used (written to) */
+ uint32_t len;
+} VRingUsedElem;
+
+typedef struct VRingUsed
+{
+ uint16_t flags;
+ uint16_t idx;
+ VRingUsedElem ring[];
+} VRingUsed;
+
+typedef struct VRing
+{
+ unsigned int num;
+ target_phys_addr_t desc;
+ target_phys_addr_t avail;
+ target_phys_addr_t used;
+} VRing;
+
+struct VirtQueue
+{
+ VRing vring;
+ uint32_t pfn;
+ uint16_t last_avail_idx;
+ void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
+ int index;
+ VirtIODevice *vdev;
+};
+
+/* QEMU doesn't strictly need write barriers since everything runs in
+ * lock-step. We'll leave the calls to wmb() in though to make it obvious for
+ * KVM or if kqemu gets SMP support.
+ */
+
+#define wmb() do { } while (0)
+
+/* FIXME put this somewhere generic */
+/* <stddef.h> already provides offsetof on most toolchains; define only a
+ * fallback so we don't trigger a macro-redefinition clash. */
+#ifndef offsetof
+#define offsetof(type, member) ((unsigned long)(&((type *)0)->member))
+#endif
+
+/* virt queue functions */
+
+/* Lay out the three vring regions starting at guest-physical address 'p':
+ * the descriptor table (16 bytes per VRingDesc), immediately followed by
+ * the avail ring (uint16_t flags + idx + one uint16_t per entry, i.e.
+ * 2 * (2 + num) bytes), with the used ring rounded up to the next guest
+ * page boundary per the virtio ring ABI. vq->vring.num must already be
+ * set by virtio_add_queue(). */
+static void virtqueue_init(VirtQueue *vq, target_phys_addr_t p)
+{
+ vq->vring.desc = p;
+ vq->vring.avail = p + vq->vring.num * 16;
+ vq->vring.used = vq->vring.avail + 2 * (2 + vq->vring.num);
+ vq->vring.used = TARGET_PAGE_ALIGN(vq->vring.used);
+}
+
+static uint64_t vring_desc_addr(VirtQueue *vq, unsigned int i)
+{
+ return ldq_phys(vq->vring.desc + i * sizeof(VRingDesc) +
+ offsetof(VRingDesc, addr));
+}
+
+static uint32_t vring_desc_len(VirtQueue *vq, unsigned int i)
+{
+ return ldl_phys(vq->vring.desc + i * sizeof(VRingDesc) +
+ offsetof(VRingDesc, len));
+}
+
+static uint16_t vring_desc_flags(VirtQueue *vq, unsigned int i)
+{
+ return lduw_phys(vq->vring.desc + i * sizeof(VRingDesc) +
+ offsetof(VRingDesc, flags));
+}
+
+static uint16_t vring_desc_next(VirtQueue *vq, unsigned int i)
+{
+ return lduw_phys(vq->vring.desc + i * sizeof(VRingDesc) +
+ offsetof(VRingDesc, next));
+}
+
+static uint16_t vring_avail_flags(VirtQueue *vq)
+{
+ return lduw_phys(vq->vring.avail + offsetof(VRingAvail, flags));
+}
+
+static uint16_t vring_avail_idx(VirtQueue *vq)
+{
+ return lduw_phys(vq->vring.avail + offsetof(VRingAvail, idx));
+}
+
+static uint16_t vring_avail_ring(VirtQueue *vq, unsigned int i)
+{
+ return lduw_phys(vq->vring.avail + offsetof(VRingAvail, ring[i]));
+}
+
+static void vring_used_set_flag(VirtQueue *vq, uint16_t flag)
+{
+ stw_phys(vq->vring.used + offsetof(VRingUsed, flags),
+ lduw_phys(vq->vring.used + offsetof(VRingUsed, flags)) | flag);
+}
+
+static void vring_used_unset_flag(VirtQueue *vq, uint16_t flag)
+{
+ stw_phys(vq->vring.used + offsetof(VRingUsed, flags),
+ lduw_phys(vq->vring.used + offsetof(VRingUsed, flags)) & ~flag);
+}
+
+static uint16_t vring_used_get_idx(VirtQueue *vq)
+{
+ return lduw_phys(vq->vring.used + offsetof(VRingUsed, idx));
+}
+
+static void vring_used_set_idx(VirtQueue *vq, uint16_t value)
+{
+ stw_phys(vq->vring.used + offsetof(VRingUsed, idx), value);
+}
+
+static void vring_used_set_ring(VirtQueue *vq, unsigned int i,
+ uint32_t id, uint32_t len)
+{
+ stl_phys(vq->vring.used + offsetof(VRingUsed, ring[i].id), id);
+ stl_phys(vq->vring.used + offsetof(VRingUsed, ring[i].len), len);
+}
+
+static unsigned virtqueue_next_desc(VirtQueue *vq, unsigned int i)
+{
+ unsigned int next;
+
+ /* If this descriptor says it doesn't chain, we're done. */
+ if (!(vring_desc_flags(vq, i) & VRING_DESC_F_NEXT))
+ return vq->vring.num;
+
+ /* Check they're not leading us off end of descriptors. */
+ next = vring_desc_next(vq, i);
+ /* Make sure compiler knows to grab that: we don't want it changing! */
+ wmb();
+
+ if (next >= vq->vring.num)
+ errx(1, "Desc next is %u", next);
+
+ return next;
+}
+
+/* Return a completed element to the guest: record (descriptor head, bytes
+ * written) in the used ring, publish it by bumping the used index after a
+ * write barrier, then free the element and its IOVectors. 'elem' and its
+ * vectors are owned by this function after the call. */
+void virtqueue_push(VirtQueue *vq, VirtQueueElement *elem, unsigned int len)
+{
+ uint16_t idx;
+
+ idx = vring_used_get_idx(vq);
+ vring_used_set_ring(vq, idx % vq->vring.num, elem->index, len);
+ /* ensure the ring entry is visible before the index update */
+ wmb();
+ vring_used_set_idx(vq, idx + 1);
+
+ iovector_free(elem->in);
+ iovector_free(elem->out);
+ qemu_free(elem);
+}
+
+/* Pop the next available descriptor chain from the guest's avail ring.
+ * Returns NULL if nothing is pending; exits the process via errx() on a
+ * malformed ring (sanity-check style inherited from lguest). The caller
+ * owns the returned element and completes it with virtqueue_push().
+ *
+ * NOTE(review): qemu_mallocz()/iovector_new() results are used without
+ * NULL checks -- an OOM here dereferences NULL. Also, casting
+ * pci_device_dma_read/write (which take target_phys_addr_t) to
+ * DMAReadHandler/DMAWriteHandler (which take uint64_t) and calling
+ * through the converted pointer is undefined behaviour whenever
+ * target_phys_addr_t is not uint64_t -- confirm on 32-bit targets.
+ * 'position' is set but never used. */
+VirtQueueElement *virtqueue_pop(VirtQueue *vq)
+{
+ unsigned int i, head;
+ unsigned int position;
+ VirtQueueElement *elem;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ if ((uint16_t)(vring_avail_idx(vq) - vq->last_avail_idx) > vq->vring.num)
+ errx(1, "Guest moved used index from %u to %u",
+ vq->last_avail_idx, vring_avail_idx(vq));
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vring_avail_idx(vq) == vq->last_avail_idx)
+ return NULL;
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ head = vring_avail_ring(vq, vq->last_avail_idx++ % vq->vring.num);
+
+ /* If their number is silly, that's a fatal mistake. */
+ if (head >= vq->vring.num)
+ errx(1, "Guest says index %u is available", head);
+
+ /* When we start there are none of either input nor output. */
+ position = 0;
+
+ elem = qemu_mallocz(sizeof(VirtQueueElement));
+
+ /* worst case: every descriptor lands in one of the two vectors */
+ elem->in = iovector_new(vq->vring.num,
+ (DMAReadHandler *)pci_device_dma_read,
+ (DMAWriteHandler *)pci_device_dma_write,
+ &vq->vdev->pci_dev);
+ elem->out = iovector_new(vq->vring.num,
+ (DMAReadHandler *)pci_device_dma_read,
+ (DMAWriteHandler *)pci_device_dma_write,
+ &vq->vdev->pci_dev);
+
+ elem->in->num = elem->out->num = 0;
+
+ i = head;
+ do {
+ IOVectorElement *sge;
+
+ /* device-writable descriptors go to 'in', readable ones to 'out' */
+ if (vring_desc_flags(vq, i) & VRING_DESC_F_WRITE)
+ sge = &elem->in->sg[elem->in->num++];
+ else
+ sge = &elem->out->sg[elem->out->num++];
+
+ /* Grab the first descriptor, and check it's OK. */
+ sge->len = vring_desc_len(vq, i);
+ sge->base = vring_desc_addr(vq, i);
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if ((elem->in->num + elem->out->num) > vq->vring.num)
+ errx(1, "Looped descriptor");
+ } while ((i = virtqueue_next_desc(vq, i)) != vq->vring.num);
+
+ elem->index = head;
+
+ return elem;
+}
+
+/* virtio device */
+
+static VirtIODevice *to_virtio_device(PCIDevice *pci_dev)
+{
+ return (VirtIODevice *)pci_dev;
+}
+
+static void virtio_update_irq(VirtIODevice *vdev)
+{
+ qemu_set_irq(vdev->pci_dev.irq[0], vdev->isr & 1);
+}
+
+/* Reset the device to its power-on state: clear negotiated features,
+ * queue selector, status and ISR, and tear down every virtqueue's ring
+ * addresses so the guest must re-initialise them. Registered as a system
+ * reset handler and also invoked when the guest writes 0 to the queue PFN
+ * or status register. */
+void virtio_reset(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+ int i;
+
+ vdev->features = 0;
+ vdev->queue_sel = 0;
+ vdev->status = 0;
+ vdev->isr = 0;
+
+ for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+ vdev->vq[i].vring.desc = 0;
+ vdev->vq[i].vring.avail = 0;
+ vdev->vq[i].vring.used = 0;
+ vdev->vq[i].last_avail_idx = 0;
+ vdev->vq[i].pfn = 0;
+ }
+}
+
+static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
+{
+ VirtIODevice *vdev = to_virtio_device(opaque);
+ ram_addr_t pa;
+
+ addr -= vdev->addr;
+
+ switch (addr) {
+ case VIRTIO_PCI_GUEST_FEATURES:
+ if (vdev->set_features)
+ vdev->set_features(vdev, val);
+ vdev->features = val;
+ break;
+ case VIRTIO_PCI_QUEUE_PFN:
+ pa = (ram_addr_t)val << TARGET_PAGE_BITS;
+ vdev->vq[vdev->queue_sel].pfn = val;
+ if (pa == 0)
+ virtio_reset(vdev);
+ else
+ virtqueue_init(&vdev->vq[vdev->queue_sel], pa);
+ break;
+ case VIRTIO_PCI_QUEUE_SEL:
+ if (val < VIRTIO_PCI_QUEUE_MAX)
+ vdev->queue_sel = val;
+ break;
+ case VIRTIO_PCI_QUEUE_NOTIFY:
+ if (val < VIRTIO_PCI_QUEUE_MAX && vdev->vq[val].vring.desc)
+ vdev->vq[val].handle_output(vdev, &vdev->vq[val]);
+ break;
+ case VIRTIO_PCI_STATUS:
+ vdev->status = val & 0xFF;
+ if (vdev->status == 0)
+ virtio_reset(vdev);
+ break;
+ }
+}
+
+static uint32_t virtio_ioport_read(void *opaque, uint32_t addr)
+{
+ VirtIODevice *vdev = to_virtio_device(opaque);
+ uint32_t ret = 0xFFFFFFFF;
+
+ addr -= vdev->addr;
+
+ switch (addr) {
+ case VIRTIO_PCI_HOST_FEATURES:
+ ret = vdev->get_features(vdev);
+ break;
+ case VIRTIO_PCI_GUEST_FEATURES:
+ ret = vdev->features;
+ break;
+ case VIRTIO_PCI_QUEUE_PFN:
+ ret = vdev->vq[vdev->queue_sel].pfn;
+ break;
+ case VIRTIO_PCI_QUEUE_NUM:
+ ret = vdev->vq[vdev->queue_sel].vring.num;
+ break;
+ case VIRTIO_PCI_QUEUE_SEL:
+ ret = vdev->queue_sel;
+ break;
+ case VIRTIO_PCI_STATUS:
+ ret = vdev->status;
+ break;
+ case VIRTIO_PCI_ISR:
+ /* reading from the ISR also clears it. */
+ ret = vdev->isr;
+ vdev->isr = 0;
+ virtio_update_irq(vdev);
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static uint32_t virtio_config_readb(void *opaque, uint32_t addr)
+{
+ VirtIODevice *vdev = opaque;
+ uint8_t val;
+
+ addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+ if (addr > (vdev->config_len - sizeof(val)))
+ return (uint32_t)-1;
+
+ memcpy(&val, vdev->config + addr, sizeof(val));
+ return val;
+}
+
+/* 16-bit read from the device-specific config space. Returns all-ones if
+ * the access does not fit entirely inside config_len. */
+static uint32_t virtio_config_readw(void *opaque, uint32_t addr)
+{
+ VirtIODevice *vdev = opaque;
+ uint16_t val;
+
+ addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+ /* Check config_len first: (config_len - sizeof(val)) underflows to a
+ * huge size_t when config_len < sizeof(val), letting a straddling
+ * access read past the end of vdev->config. */
+ if (vdev->config_len < sizeof(val) ||
+ addr > vdev->config_len - sizeof(val))
+ return (uint32_t)-1;
+
+ memcpy(&val, vdev->config + addr, sizeof(val));
+ return val;
+}
+
+/* 32-bit read from the device-specific config space. Returns all-ones if
+ * the access does not fit entirely inside config_len. */
+static uint32_t virtio_config_readl(void *opaque, uint32_t addr)
+{
+ VirtIODevice *vdev = opaque;
+ uint32_t val;
+
+ addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+ /* Check config_len first: (config_len - sizeof(val)) underflows to a
+ * huge size_t when config_len < sizeof(val), letting a straddling
+ * access read past the end of vdev->config. */
+ if (vdev->config_len < sizeof(val) ||
+ addr > vdev->config_len - sizeof(val))
+ return (uint32_t)-1;
+
+ memcpy(&val, vdev->config + addr, sizeof(val));
+ return val;
+}
+
+static void virtio_config_writeb(void *opaque, uint32_t addr, uint32_t data)
+{
+ VirtIODevice *vdev = opaque;
+ uint8_t val = data;
+
+ addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+ if (addr > (vdev->config_len - sizeof(val)))
+ return;
+
+ memcpy(vdev->config + addr, &val, sizeof(val));
+
+ if (vdev->set_config)
+ vdev->set_config(vdev, vdev->config);
+}
+
+/* 16-bit write to the device-specific config space; notifies the device
+ * via set_config. Out-of-range accesses are silently dropped. */
+static void virtio_config_writew(void *opaque, uint32_t addr, uint32_t data)
+{
+ VirtIODevice *vdev = opaque;
+ uint16_t val = data;
+
+ addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+ /* Check config_len first: (config_len - sizeof(val)) underflows to a
+ * huge size_t when config_len < sizeof(val), letting a straddling
+ * access write past the end of vdev->config. */
+ if (vdev->config_len < sizeof(val) ||
+ addr > vdev->config_len - sizeof(val))
+ return;
+
+ memcpy(vdev->config + addr, &val, sizeof(val));
+
+ if (vdev->set_config)
+ vdev->set_config(vdev, vdev->config);
+}
+
+/* 32-bit write to the device-specific config space; notifies the device
+ * via set_config. Out-of-range accesses are silently dropped. */
+static void virtio_config_writel(void *opaque, uint32_t addr, uint32_t data)
+{
+ VirtIODevice *vdev = opaque;
+ uint32_t val = data;
+
+ addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+ /* Check config_len first: (config_len - sizeof(val)) underflows to a
+ * huge size_t when config_len < sizeof(val), letting a straddling
+ * access write past the end of vdev->config. */
+ if (vdev->config_len < sizeof(val) ||
+ addr > vdev->config_len - sizeof(val))
+ return;
+
+ memcpy(vdev->config + addr, &val, sizeof(val));
+
+ if (vdev->set_config)
+ vdev->set_config(vdev, vdev->config);
+}
+
+static void virtio_map(PCIDevice *pci_dev, int region_num,
+ uint32_t addr, uint32_t size, int type)
+{
+ VirtIODevice *vdev = to_virtio_device(pci_dev);
+ int i;
+
+ vdev->addr = addr;
+ for (i = 0; i < 3; i++) {
+ register_ioport_write(addr, 20, 1 << i, virtio_ioport_write, vdev);
+ register_ioport_read(addr, 20, 1 << i, virtio_ioport_read, vdev);
+ }
+
+ if (vdev->config_len) {
+ register_ioport_write(addr + 20, vdev->config_len, 1,
+ virtio_config_writeb, vdev);
+ register_ioport_write(addr + 20, vdev->config_len, 2,
+ virtio_config_writew, vdev);
+ register_ioport_write(addr + 20, vdev->config_len, 4,
+ virtio_config_writel, vdev);
+ register_ioport_read(addr + 20, vdev->config_len, 1,
+ virtio_config_readb, vdev);
+ register_ioport_read(addr + 20, vdev->config_len, 2,
+ virtio_config_readw, vdev);
+ register_ioport_read(addr + 20, vdev->config_len, 4,
+ virtio_config_readl, vdev);
+
+ vdev->get_config(vdev, vdev->config);
+ }
+}
+
+VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
+ void (*handle_output)(VirtIODevice *, VirtQueue *))
+{
+ int i;
+
+ for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+ if (vdev->vq[i].vring.num == 0)
+ break;
+ }
+
+ if (i == VIRTIO_PCI_QUEUE_MAX)
+ abort();
+
+ vdev->vq[i].vring.num = queue_size;
+ vdev->vq[i].handle_output = handle_output;
+ vdev->vq[i].index = i;
+ vdev->vq[i].vdev = vdev;
+
+ return &vdev->vq[i];
+}
+
+void virtio_notify_config(VirtIODevice *vdev)
+{
+ /* make sure we have the latest config */
+ vdev->get_config(vdev, vdev->config);
+ vdev->isr = 3;
+ virtio_update_irq(vdev);
+}
+
+/* Raise the queue interrupt unless the guest asked to suppress it.
+ * Suppression (VRING_AVAIL_F_NO_INTERRUPT) is honoured only while the
+ * avail ring is non-empty; an empty ring always notifies, matching the
+ * "always notify when queue is empty" convention of the virtio spec. */
+void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
+{
+ /* Always notify when queue is empty */
+ if (vring_avail_idx(vq) != vq->last_avail_idx &&
+ (vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT))
+ return;
+
+ vdev->isr = 1;
+ virtio_update_irq(vdev);
+}
+
+void virtio_ring_set_used_no_notify(VirtQueue *vq, int enable)
+{
+ if (enable)
+ vring_used_set_flag(vq, VRING_USED_F_NO_NOTIFY);
+ else
+ vring_used_unset_flag(vq, VRING_USED_F_NO_NOTIFY);
+}
+
+size_t virtio_ring_avail_size(VirtQueue *vq)
+{
+ return vring_avail_idx(vq) - vq->last_avail_idx;
+}
+
+int virtio_ring_inited(VirtQueue *vq)
+{
+ return (vq->vring.avail != 0);
+}
+
+/* Register a virtio device on 'bus': allocate the PCI device (the caller's
+ * device struct must embed VirtIODevice, itself embedding PCIDevice as its
+ * first member), fill in the PCI config header, allocate the virtqueue
+ * array and device-specific config space, and claim one I/O BAR sized to
+ * the next power of two covering the register block plus config space.
+ * NOTE(review): pci_register_device()/qemu_mallocz() results are used
+ * without NULL checks -- confirm OOM policy. */
+VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
+ uint16_t vendor, uint16_t device,
+ uint16_t subvendor, uint16_t subdevice,
+ uint8_t class_code, uint8_t subclass_code,
+ uint8_t pif, size_t config_size,
+ size_t struct_size)
+{
+ VirtIODevice *vdev;
+ PCIDevice *pci_dev;
+ uint8_t *config;
+ uint32_t size;
+
+ pci_dev = pci_register_device(bus, name, struct_size,
+ -1, NULL, NULL);
+ vdev = to_virtio_device(pci_dev);
+
+ vdev->status = 0;
+ vdev->isr = 0;
+ vdev->queue_sel = 0;
+ vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
+
+ /* PCI configuration header, little-endian byte by byte */
+ config = pci_dev->config;
+ config[0x00] = vendor & 0xFF; /* vendor ID */
+ config[0x01] = (vendor >> 8) & 0xFF;
+ config[0x02] = device & 0xFF; /* device ID */
+ config[0x03] = (device >> 8) & 0xFF;
+
+ config[0x08] = VIRTIO_PCI_ABI_VERSION; /* revision ID = virtio ABI */
+
+ config[0x09] = pif; /* programming interface */
+ config[0x0a] = subclass_code;
+ config[0x0b] = class_code;
+ config[0x0e] = 0x00; /* header type: normal device */
+
+ config[0x2c] = subvendor & 0xFF; /* subsystem vendor/device IDs */
+ config[0x2d] = (subvendor >> 8) & 0xFF;
+ config[0x2e] = subdevice & 0xFF;
+ config[0x2f] = (subdevice >> 8) & 0xFF;
+
+ config[0x3d] = 1; /* interrupt pin: INTA */
+
+ vdev->name = name;
+ vdev->config_len = config_size;
+ if (vdev->config_len)
+ vdev->config = qemu_mallocz(config_size);
+ else
+ vdev->config = NULL;
+
+ /* PCI I/O regions must be a power of two: 20 bytes of common registers
+ * plus the device config space, rounded up. */
+ size = 20 + config_size;
+ if (size & (size-1))
+ size = 1 << fls(size);
+
+ pci_register_io_region(pci_dev, 0, size, PCI_ADDRESS_SPACE_IO,
+ virtio_map);
+ qemu_register_reset(virtio_reset, vdev);
+
+ return vdev;
+}
diff --git a/hw/virtio.h b/hw/virtio.h
new file mode 100644
index 0000000..4b991d0
--- /dev/null
+++ b/hw/virtio.h
@@ -0,0 +1,88 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_VIRTIO_H
+#define _QEMU_VIRTIO_H
+
+#include "hw.h"
+#include "pci.h"
+#include "iovector.h"
+
+/* from Linux's linux/virtio_config.h */
+
+/* Status byte for guest to report progress, and synchronize features. */
+/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
+#define VIRTIO_CONFIG_S_ACKNOWLEDGE 1
+/* We have found a driver for the device. */
+#define VIRTIO_CONFIG_S_DRIVER 2
+/* Driver has used its parts of the config, and is happy */
+#define VIRTIO_CONFIG_S_DRIVER_OK 4
+/* We've given up on this device. */
+#define VIRTIO_CONFIG_S_FAILED 0x80
+
+typedef struct VirtQueue VirtQueue;
+typedef struct VirtIODevice VirtIODevice;
+
+typedef struct VirtQueueElement
+{
+ unsigned int index;
+ IOVector *in, *out;
+} VirtQueueElement;
+
+struct VirtIODevice
+{
+ PCIDevice pci_dev;
+ const char *name;
+ uint32_t addr;
+ uint16_t vendor;
+ uint16_t device;
+ uint8_t status;
+ uint8_t isr;
+ uint16_t queue_sel;
+ uint32_t features;
+ size_t config_len;
+ void *config;
+ uint32_t (*get_features)(VirtIODevice *vdev);
+ void (*set_features)(VirtIODevice *vdev, uint32_t val);
+ void (*get_config)(VirtIODevice *vdev, uint8_t *config);
+ void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
+ VirtQueue *vq;
+};
+
+VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
+ uint16_t vendor, uint16_t device,
+ uint16_t subvendor, uint16_t subdevice,
+ uint8_t class_code, uint8_t subclass_code,
+ uint8_t pif, size_t config_size,
+ size_t struct_size);
+
+VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
+ void (*handle_output)(VirtIODevice *,
+ VirtQueue *));
+
+void virtqueue_push(VirtQueue *vq, VirtQueueElement *elem, unsigned int len);
+
+VirtQueueElement *virtqueue_pop(VirtQueue *vq);
+
+void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
+
+void virtio_ring_set_used_no_notify(VirtQueue *vq, int enable);
+
+size_t virtio_ring_avail_size(VirtQueue *vq);
+
+int virtio_ring_inited(VirtQueue *vq);
+
+void virtio_notify_config(VirtIODevice *vdev);
+
+#endif
diff --git a/qemu-common.h b/qemu-common.h
index 746dcc5..cd387b1 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -85,6 +85,7 @@ char *pstrcat(char *buf, int buf_size, const char *s);
int strstart(const char *str, const char *val, const char **ptr);
int stristart(const char *str, const char *val, const char **ptr);
time_t mktimegm(struct tm *tm);
+int fls(int i);
/* Error handling. */
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 3/5] virtio network driver (v3)
2008-04-15 22:11 [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 2/5] virtio for QEMU (v3) Anthony Liguori
@ 2008-04-15 22:11 ` Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 4/5] virtio block " Anthony Liguori
` (2 subsequent siblings)
4 siblings, 0 replies; 13+ messages in thread
From: Anthony Liguori @ 2008-04-15 22:11 UTC (permalink / raw)
To: qemu-devel
Cc: Anthony Liguori, kvm-devel, Marcelo Tosatti, Paul Brook,
Aurelien Jarno
This patch implements the virtio network driver backend. In KVM, this driver
can achieve 1gbit tx/rx performance. More patches are required to improve the
network IO infrastructure to achieve better performance in QEMU.
Since v1, I've updated the patch based on the IOVector refactoring.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/Makefile.target b/Makefile.target
index 3e9f7b1..ea632fa 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -535,7 +535,7 @@ OBJS += rtl8139.o
OBJS += e1000.o
# virtio devices
-OBJS += virtio.o
+OBJS += virtio.o virtio-net.o
ifeq ($(TARGET_BASE_ARCH), i386)
# Hardware support
diff --git a/hw/pci.c b/hw/pci.c
index 3282478..94452d3 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -652,9 +652,11 @@ void pci_nic_init(PCIBus *bus, NICInfo *nd, int devfn)
pci_e1000_init(bus, nd, devfn);
} else if (strcmp(nd->model, "pcnet") == 0) {
pci_pcnet_init(bus, nd, devfn);
+ } else if (strcmp(nd->model, "virtio") == 0) {
+ virtio_net_init(bus, nd, devfn);
} else if (strcmp(nd->model, "?") == 0) {
fprintf(stderr, "qemu: Supported PCI NICs: i82551 i82557b i82559er"
- " ne2k_pci pcnet rtl8139 e1000\n");
+ " ne2k_pci pcnet rtl8139 e1000 virtio\n");
exit (1);
} else {
fprintf(stderr, "qemu: Unsupported NIC: %s\n", nd->model);
diff --git a/hw/pci.h b/hw/pci.h
index c885cc5..e9e5ed3 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -145,4 +145,7 @@ PCIBus *pci_prep_init(qemu_irq *pic);
PCIBus *pci_apb_init(target_phys_addr_t special_base, target_phys_addr_t mem_base,
qemu_irq *pic);
+/* virtio.c */
+PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn);
+
#endif
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
new file mode 100644
index 0000000..e21aa1e
--- /dev/null
+++ b/hw/virtio-net.c
@@ -0,0 +1,162 @@
+/*
+ * Virtio Network Device
+ *
+ * Copyright IBM, Corp. 2007
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "virtio.h"
+#include "net.h"
+#include "pc.h"
+#include "qemu-timer.h"
+#include "virtio-net.h"
+
+#define TX_TIMER_INTERVAL (1000 / 500)
+
+/* Per-device state of the virtio network backend.  vdev must remain
+ * the first member: to_virtio_net() casts VirtIODevice* directly. */
+typedef struct VirtIONet
+{
+ VirtIODevice vdev;
+ uint8_t mac[6]; /* MAC exposed to the guest via config space */
+ VirtQueue *rx_vq;
+ VirtQueue *tx_vq;
+ VLANClientState *vc;
+ QEMUTimer *tx_timer; /* coalesces tx notifications (see handle_tx) */
+ int tx_timer_active;
+} VirtIONet;
+
+/* Downcast; valid because vdev is the first member of VirtIONet. */
+static VirtIONet *to_virtio_net(VirtIODevice *vdev)
+{
+ return (VirtIONet *)vdev;
+}
+
+/* Fill the guest-visible config space with our MAC address
+ * (struct virtio_net_config holds only the 6-byte MAC). */
+static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+ VirtIONet *n = to_virtio_net(vdev);
+ struct virtio_net_config netcfg;
+
+ memcpy(netcfg.mac, n->mac, 6);
+ memcpy(config, &netcfg, sizeof(netcfg));
+}
+
+/* We only advertise that the MAC in config space is valid; no
+ * checksum/GSO offload support yet. */
+static uint32_t virtio_net_get_features(VirtIODevice *vdev)
+{
+ return (1 << VIRTIO_NET_F_MAC);
+}
+
+/* RX */
+
+/* Nothing to do when the guest posts rx buffers; we consume them
+ * lazily from virtio_net_receive() as packets arrive. */
+static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
+{
+}
+
+/* VLAN receive callback: copy one incoming packet, preceded by an
+ * all-zero virtio_net_hdr, into the next guest-posted rx buffer.
+ * Packets are silently dropped if the ring is not set up or empty. */
+static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
+{
+ VirtIONet *n = opaque;
+ VirtQueueElement *elem;
+ struct virtio_net_hdr hdr;
+
+ /* FIXME: the drivers really need to set their status better */
+ if (!virtio_ring_inited(n->rx_vq))
+ return;
+
+ if ((elem = virtqueue_pop(n->rx_vq)) == NULL)
+ /* wait until the guest adds some rx bufs */
+ return;
+
+ /* No offloads advertised, so the header is all zeroes. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.flags = 0;
+ hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+ /* NOTE(review): no check that elem->in can hold sizeof(hdr) + size;
+ * confirm memcpy_to_iovector() clamps, or oversized frames could
+ * overrun the guest buffer. */
+ memcpy_to_iovector(elem->in, 0, sizeof(hdr), &hdr);
+ memcpy_to_iovector(elem->in, sizeof(hdr), size, buf);
+
+ /* signal other side */
+ virtqueue_push(n->rx_vq, elem, sizeof(hdr) + size);
+ virtio_notify(&n->vdev, n->rx_vq);
+}
+
+/* TX */
+/* Drain every pending tx buffer from 'vq': strip the leading
+ * virtio_net_hdr (no offloads are negotiated, so it carries no
+ * information) and hand the payload to the VLAN as a s/g list. */
+static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
+{
+ VirtQueueElement *elem;
+
+ /* Don't touch the ring until the guest driver says it's ready. */
+ if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
+ return;
+
+ while ((elem = virtqueue_pop(vq))) {
+ IOVector *sg;
+ size_t out_size;
+
+ /* ignore the header for now */
+ out_size = iovector_size(elem->out);
+
+ sg = iovector_trim(elem->out, sizeof(struct virtio_net_hdr),
+ out_size - sizeof(struct virtio_net_hdr));
+
+ qemu_sendv_packet(n->vc, sg);
+
+ iovector_free(sg);
+
+ virtqueue_push(vq, elem, out_size);
+ virtio_notify(&n->vdev, vq);
+ }
+}
+
+/* Guest kicked the tx queue.  To mitigate exits, transmission is
+ * deferred behind a short timer; only when the ring fills up (the
+ * magic 64 — presumably half of the 128-entry tx ring, TODO name the
+ * constant) do we cancel the timer and flush immediately. */
+static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIONet *n = to_virtio_net(vdev);
+
+ if (n->tx_timer_active &&
+ virtio_ring_avail_size(vq) == 64) {
+ virtio_ring_set_used_no_notify(vq, 0);
+ qemu_del_timer(n->tx_timer);
+ n->tx_timer_active = 0;
+ virtio_net_flush_tx(n, vq);
+ } else {
+ qemu_mod_timer(n->tx_timer,
+ qemu_get_clock(vm_clock) + TX_TIMER_INTERVAL);
+ n->tx_timer_active = 1;
+ virtio_ring_set_used_no_notify(vq, 1);
+ }
+}
+
+/* Deferred-tx timer expired: re-enable used-ring notifications and
+ * flush whatever the guest queued in the meantime. */
+static void virtio_net_tx_timer(void *opaque)
+{
+ VirtIONet *n = opaque;
+
+ n->tx_timer_active = 0;
+
+ /* Just in case the driver is not ready any more */
+ if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
+ return;
+
+ virtio_ring_set_used_no_notify(n->tx_vq, 0);
+ virtio_net_flush_tx(n, n->tx_vq);
+}
+
+/* Create and wire up a virtio-net PCI device on 'bus'.
+ * 6900 is 0x1af4 in decimal — presumably the virtio PCI vendor ID;
+ * 0x1000 is the device ID, VIRTIO_ID_NET the subsystem device ID,
+ * 0x02/0x00 the network-controller PCI class, 6 the config space
+ * length (just the MAC). */
+PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
+{
+ VirtIONet *n;
+
+ n = (VirtIONet *)virtio_init_pci(bus, "virtio-net", 6900, 0x1000,
+ 0, VIRTIO_ID_NET,
+ 0x02, 0x00, 0x00,
+ 6, sizeof(VirtIONet));
+
+ n->vdev.get_config = virtio_net_get_config;
+ n->vdev.get_features = virtio_net_get_features;
+ n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
+ n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
+ memcpy(n->mac, nd->macaddr, 6);
+ n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive, NULL, n);
+ n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n);
+ n->tx_timer_active = 0;
+
+ return (PCIDevice *)n;
+}
diff --git a/hw/virtio-net.h b/hw/virtio-net.h
new file mode 100644
index 0000000..2959198
--- /dev/null
+++ b/hw/virtio-net.h
@@ -0,0 +1,54 @@
+/*
+ * Virtio-net Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_VIRTIO_NET_H
+#define _QEMU_VIRTIO_NET_H
+
+/* from Linux's virtio_net.h */
+
+/* The ID for virtio_net */
+#define VIRTIO_ID_NET 1
+
+/* The feature bitmap for virtio net */
+#define VIRTIO_NET_F_NO_CSUM 0
+#define VIRTIO_NET_F_MAC 5
+#define VIRTIO_NET_F_GS0 6
+
+/* The config defining mac address (6 bytes) */
+struct virtio_net_config
+{
+ uint8_t mac[6];
+} __attribute__((packed));
+
+/* This is the first element of the scatter-gather list. If you don't
+ * specify GSO or CSUM features, you can simply ignore the header. */
+struct virtio_net_hdr
+{
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset
+ uint8_t flags;
+#define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame
+#define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO)
+/* FIXME: Do we need this? If they said they can handle ECN, do they care? */
+#define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN
+#define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO)
+#define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP
+#define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set
+ uint8_t gso_type;
+ uint16_t hdr_len;
+ uint16_t gso_size;
+ uint16_t csum_start;
+ uint16_t csum_offset;
+};
+
+#endif
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 4/5] virtio block driver (v3)
2008-04-15 22:11 [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 2/5] virtio for QEMU (v3) Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 3/5] virtio network driver (v3) Anthony Liguori
@ 2008-04-15 22:11 ` Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 5/5] virtio balloon " Anthony Liguori
2008-04-16 19:51 ` [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Blue Swirl
4 siblings, 0 replies; 13+ messages in thread
From: Anthony Liguori @ 2008-04-15 22:11 UTC (permalink / raw)
To: qemu-devel
Cc: Anthony Liguori, kvm-devel, Marcelo Tosatti, Paul Brook,
Aurelien Jarno
This patch implements the virtio block driver backend.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/Makefile.target b/Makefile.target
index ea632fa..4d695c7 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -535,7 +535,7 @@ OBJS += rtl8139.o
OBJS += e1000.o
# virtio devices
-OBJS += virtio.o virtio-net.o
+OBJS += virtio.o virtio-net.o virtio-blk.o
ifeq ($(TARGET_BASE_ARCH), i386)
# Hardware support
diff --git a/hw/pc.c b/hw/pc.c
index 4fec2d4..2da9413 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -1011,6 +1011,18 @@ static void pc_init1(int ram_size, int vga_ram_size,
}
}
}
+
+ /* Add virtio block devices */
+ if (pci_enabled) {
+ int index;
+ int unit_id = 0;
+
+ while ((index = drive_get_index(IF_VIRTIO, 0, unit_id)) != -1) {
+ virtio_blk_init(pci_bus, drives_table[index].bdrv);
+ unit_id++;
+ }
+ }
+
}
static void pc_init_pci(int ram_size, int vga_ram_size,
diff --git a/hw/pc.h b/hw/pc.h
index 9f83050..c828cda 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -143,4 +143,7 @@ void pci_piix4_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd);
+/* virtio-blk.c */
+void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs);
+
#endif
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
new file mode 100644
index 0000000..534e819
--- /dev/null
+++ b/hw/virtio-blk.c
@@ -0,0 +1,112 @@
+/*
+ * Virtio Block Device
+ *
+ * Copyright IBM, Corp. 2007
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "virtio.h"
+#include "block.h"
+#include "block_int.h"
+#include "pc.h"
+#include "virtio-blk.h"
+
+/* Per-device state of the virtio block backend; vdev must stay the
+ * first member so to_virtio_blk() can cast directly. */
+typedef struct VirtIOBlock
+{
+ VirtIODevice vdev;
+ BlockDriverState *bs; /* the host image backing this disk */
+} VirtIOBlock;
+
+/* Downcast; valid because vdev is the first member of VirtIOBlock. */
+static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
+{
+ return (VirtIOBlock *)vdev;
+}
+
+/* Process every pending request on the queue.  Each element carries a
+ * virtio_blk_outhdr at the start of the 'out' list and a one-byte
+ * virtio_blk_inhdr status at the end of the 'in' list; the remainder
+ * of the respective list is the data payload. */
+static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOBlock *s = to_virtio_blk(vdev);
+ VirtQueueElement *elem;
+
+ while ((elem = virtqueue_pop(vq)) != 0) {
+ struct virtio_blk_outhdr out;
+ struct virtio_blk_inhdr in;
+ unsigned int wlen; /* bytes written to 'in', reported via push */
+ size_t in_size, out_size;
+
+ out_size = iovector_size(elem->out);
+ in_size = iovector_size(elem->in);
+
+ memcpy_from_iovector(&out, 0, sizeof(out), elem->out);
+
+ /* NOTE(review): bdrv_writev()/bdrv_readv() return values are
+ * ignored below and in.status is always VIRTIO_BLK_S_OK on the
+ * I/O paths — host I/O errors are never reported as
+ * VIRTIO_BLK_S_IOERR.  Confirm and fix. */
+ if (out.type & VIRTIO_BLK_T_SCSI_CMD) {
+ /* SCSI passthrough is not implemented. */
+ wlen = sizeof(in);
+ in.status = VIRTIO_BLK_S_UNSUPP;
+ } else if (out.type & VIRTIO_BLK_T_OUT) {
+ IOVector *sg;
+
+ /* Write: payload is 'out' minus the request header. */
+ sg = iovector_trim(elem->out, sizeof(out),
+ out_size - sizeof(out));
+ bdrv_writev(s->bs, out.sector, sg);
+ iovector_free(sg);
+
+ wlen = sizeof(in);
+ in.status = VIRTIO_BLK_S_OK;
+ } else {
+ IOVector *sg;
+
+ /* Read: payload is 'in' minus the trailing status byte. */
+ sg = iovector_trim(elem->in, 0, in_size - sizeof(in));
+ bdrv_readv(s->bs, out.sector, sg);
+ iovector_free(sg);
+
+ wlen = in_size;
+ in.status = VIRTIO_BLK_S_OK;
+ }
+
+ /* Status byte always occupies the tail of the 'in' list. */
+ memcpy_to_iovector(elem->in, in_size - sizeof(in),
+ sizeof(in), &in);
+
+ virtqueue_push(vq, elem, wlen);
+ virtio_notify(vdev, vq);
+ }
+}
+
+/* Expose capacity (in 512-byte sectors) and the maximum segment count
+ * to the guest.
+ * NOTE(review): blkcfg.size_max is never assigned, so the memcpy of
+ * sizeof(blkcfg) copies uninitialized stack bytes into guest-visible
+ * config space — initialize the whole struct. */
+static void virtio_blk_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+ VirtIOBlock *s = to_virtio_blk(vdev);
+ struct virtio_blk_config blkcfg;
+ int64_t capacity;
+
+ bdrv_get_geometry(s->bs, &capacity);
+ blkcfg.capacity = cpu_to_le64(capacity);
+ /* 128-entry ring minus the two header/status descriptors. */
+ blkcfg.seg_max = cpu_to_le32(128 - 2);
+ memcpy(config, &blkcfg, sizeof(blkcfg));
+}
+
+/* We advertise only a segment-count limit; no barriers or size_max. */
+static uint32_t virtio_blk_get_features(VirtIODevice *vdev)
+{
+ return (1 << VIRTIO_BLK_F_SEG_MAX);
+}
+
+/* Create a virtio-blk PCI device backed by 'bs'.  6900 (0x1af4) is
+ * the virtio vendor ID, 0x1001 the device ID, VIRTIO_ID_BLOCK the
+ * subsystem ID; 0x01/0x80 is the storage PCI class and 16 the config
+ * space length (capacity + size_max + seg_max). */
+void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs)
+{
+ VirtIOBlock *s;
+
+ s = (VirtIOBlock *)virtio_init_pci(bus, "virtio-blk", 6900, 0x1001,
+ 0, VIRTIO_ID_BLOCK,
+ 0x01, 0x80, 0x00,
+ 16, sizeof(VirtIOBlock));
+
+ s->vdev.get_config = virtio_blk_get_config;
+ s->vdev.get_features = virtio_blk_get_features;
+ s->bs = bs;
+
+ virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
+
+ return s;
+}
diff --git a/hw/virtio-blk.h b/hw/virtio-blk.h
new file mode 100644
index 0000000..290ff5b
--- /dev/null
+++ b/hw/virtio-blk.h
@@ -0,0 +1,66 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_VIRTIO_BLK_H
+#define _QEMU_VIRTIO_BLK_H
+
+/* from Linux's linux/virtio_blk.h */
+
+/* The ID for virtio_block */
+#define VIRTIO_ID_BLOCK 2
+
+/* Feature bits */
+#define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */
+#define VIRTIO_BLK_F_SIZE_MAX 1 /* Indicates maximum segment size */
+#define VIRTIO_BLK_F_SEG_MAX 2 /* Indicates maximum # of segments */
+
+struct virtio_blk_config
+{
+ uint64_t capacity;
+ uint32_t size_max;
+ uint32_t seg_max;
+};
+
+/* These two define direction. */
+#define VIRTIO_BLK_T_IN 0
+#define VIRTIO_BLK_T_OUT 1
+
+/* This bit says it's a scsi command, not an actual read or write. */
+#define VIRTIO_BLK_T_SCSI_CMD 2
+
+/* Barrier before this op. */
+#define VIRTIO_BLK_T_BARRIER 0x80000000
+
+/* This is the first element of the read scatter-gather list. */
+struct virtio_blk_outhdr
+{
+ /* VIRTIO_BLK_T* */
+ uint32_t type;
+ /* io priority. */
+ uint32_t ioprio;
+ /* Sector (ie. 512 byte offset) */
+ uint64_t sector;
+};
+
+#define VIRTIO_BLK_S_OK 0
+#define VIRTIO_BLK_S_IOERR 1
+#define VIRTIO_BLK_S_UNSUPP 2
+
+/* This is the first element of the write scatter-gather list */
+struct virtio_blk_inhdr
+{
+ unsigned char status;
+};
+
+#endif
diff --git a/sysemu.h b/sysemu.h
index 0f18e04..0078190 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -119,7 +119,7 @@ extern unsigned int nb_prom_envs;
#endif
typedef enum {
- IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD
+ IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD, IF_VIRTIO
} BlockInterfaceType;
typedef struct DriveInfo {
diff --git a/vl.c b/vl.c
index 821c05d..4c11be6 100644
--- a/vl.c
+++ b/vl.c
@@ -5050,6 +5050,9 @@ static int drive_init(struct drive_opt *arg, int snapshot,
} else if (!strcmp(buf, "sd")) {
type = IF_SD;
max_devs = 0;
+ } else if (!strcmp(buf, "virtio")) {
+ type = IF_VIRTIO;
+ max_devs = 0;
} else {
fprintf(stderr, "qemu: '%s' unsupported bus type '%s'\n", str, buf);
return -1;
@@ -5241,6 +5244,7 @@ static int drive_init(struct drive_opt *arg, int snapshot,
break;
case IF_PFLASH:
case IF_MTD:
+ case IF_VIRTIO:
break;
}
if (!file[0])
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 5/5] virtio balloon driver (v3)
2008-04-15 22:11 [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Anthony Liguori
` (2 preceding siblings ...)
2008-04-15 22:11 ` [Qemu-devel] [PATCH 4/5] virtio block " Anthony Liguori
@ 2008-04-15 22:11 ` Anthony Liguori
2008-04-16 19:51 ` [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Blue Swirl
4 siblings, 0 replies; 13+ messages in thread
From: Anthony Liguori @ 2008-04-15 22:11 UTC (permalink / raw)
To: qemu-devel
Cc: Anthony Liguori, kvm-devel, Marcelo Tosatti, Paul Brook,
Aurelien Jarno
This patch implements the virtio balloon driver backend. A user can interact
with the balloon driver using a newly introduce monitor command 'balloon'.
Ballooning is used to request the guest to stop using a certain portion of its
memory. The guest notifies the host of this memory so the host can immediately
reallocate it.
Ballooning is implemented within QEMU via the madvise() system call. This is
for Linux hosts only ATM but it should be easy enough to add the right code for
other hosts.
If you balloon down sufficiently, you can see the resident memory of the QEMU
instance decrease when using this driver.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/Makefile.target b/Makefile.target
index 4d695c7..dead372 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -535,7 +535,7 @@ OBJS += rtl8139.o
OBJS += e1000.o
# virtio devices
-OBJS += virtio.o virtio-net.o virtio-blk.o
+OBJS += virtio.o virtio-net.o virtio-blk.o virtio-balloon.o
ifeq ($(TARGET_BASE_ARCH), i386)
# Hardware support
diff --git a/balloon.h b/balloon.h
new file mode 100644
index 0000000..60b4a5d
--- /dev/null
+++ b/balloon.h
@@ -0,0 +1,27 @@
+/*
+ * Balloon
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_BALLOON_H
+#define _QEMU_BALLOON_H
+
+#include "cpu-defs.h"
+
+typedef ram_addr_t (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque);
+
+void qemu_balloon(ram_addr_t target);
+
+ram_addr_t qemu_balloon_status(void);
+
+#endif
diff --git a/hw/pc.c b/hw/pc.c
index 2da9413..8d3401a 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -1023,6 +1023,8 @@ static void pc_init1(int ram_size, int vga_ram_size,
}
}
+ if (pci_enabled)
+ virtio_balloon_init(pci_bus);
}
static void pc_init_pci(int ram_size, int vga_ram_size,
diff --git a/hw/pc.h b/hw/pc.h
index c828cda..67583f7 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -146,4 +146,7 @@ void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd);
/* virtio-blk.c */
void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs);
+/* virtio-balloon.h */
+void *virtio_balloon_init(PCIBus *bus);
+
#endif
diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c
new file mode 100644
index 0000000..d97f4b2
--- /dev/null
+++ b/hw/virtio-balloon.c
@@ -0,0 +1,134 @@
+/*
+ * Virtio Balloon Device
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "virtio.h"
+#include "pc.h"
+#include "sysemu.h"
+#include "cpu.h"
+#include "balloon.h"
+#include "virtio-balloon.h"
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#endif
+
+/* Per-device balloon state; vdev must stay first for the downcast. */
+typedef struct VirtIOBalloon
+{
+ VirtIODevice vdev;
+ VirtQueue *ivq, *dvq; /* inflate and deflate queues */
+ uint32_t num_pages; /* pages we are asking the guest to give up */
+ uint32_t actual; /* pages the guest reports it has ballooned */
+} VirtIOBalloon;
+
+/* Downcast; valid because vdev is the first member. */
+static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev)
+{
+ return (VirtIOBalloon *)vdev;
+}
+
+/* Release (inflate) or re-request (deflate) one guest page's backing
+ * host memory via madvise(); a no-op on non-Linux hosts. */
+static void balloon_page(void *addr, int deflate)
+{
+#if defined(__linux__)
+ madvise(addr, TARGET_PAGE_SIZE, deflate ? MADV_WILLNEED : MADV_DONTNEED);
+#endif
+}
+
+/* Shared handler for both queues: the guest sends an array of 32-bit
+ * little-endian page frame numbers; each is translated to a host RAM
+ * address and madvise()d.  vq == s->dvq means deflate (page coming
+ * back), otherwise inflate. */
+static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOBalloon *s = to_virtio_balloon(vdev);
+ VirtQueueElement *elem;
+
+ while ((elem = virtqueue_pop(vq)) != NULL) {
+ size_t offset = 0;
+ uint32_t pfn;
+
+ /* Walk the out list one 4-byte PFN at a time until it runs out. */
+ while (memcpy_from_iovector(&pfn, offset, 4, elem->out) == 4) {
+ ram_addr_t pa;
+ ram_addr_t addr;
+
+ /* PFNs are guest little-endian; ldl_p converts to host order. */
+ pa = (ram_addr_t)ldl_p(&pfn) << TARGET_PAGE_BITS;
+ offset += 4;
+
+ /* Only plain RAM pages may be ballooned; skip MMIO/ROM. */
+ addr = cpu_get_physical_page_desc(pa);
+ if ((addr & ~TARGET_PAGE_MASK) != IO_MEM_RAM)
+ continue;
+
+ balloon_page(phys_ram_base + addr, !!(vq == s->dvq));
+ }
+
+ virtqueue_push(vq, elem, offset);
+ virtio_notify(vdev, vq);
+ }
+}
+
+/* Publish the balloon target and current size to the guest; the
+ * literal 8 is sizeof(struct virtio_balloon_config) — two uint32_t
+ * fields (consider using sizeof for clarity). */
+static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
+{
+ VirtIOBalloon *dev = to_virtio_balloon(vdev);
+ struct virtio_balloon_config config;
+
+ config.num_pages = cpu_to_le32(dev->num_pages);
+ config.actual = cpu_to_le32(dev->actual);
+
+ memcpy(config_data, &config, 8);
+}
+
+/* Guest wrote config space: record how many pages it has ballooned.
+ * NOTE(review): get_config stores 'actual' with cpu_to_le32 but this
+ * path reads it back without le32_to_cpu — looks wrong on big-endian
+ * hosts; confirm. */
+static void virtio_balloon_set_config(VirtIODevice *vdev,
+ const uint8_t *config_data)
+{
+ VirtIOBalloon *dev = to_virtio_balloon(vdev);
+ struct virtio_balloon_config config;
+ memcpy(&config, config_data, 8);
+ dev->actual = config.actual;
+}
+
+/* No optional balloon features are offered. */
+static uint32_t virtio_balloon_get_features(VirtIODevice *vdev)
+{
+ return 0;
+}
+
+/* QEMUBalloonEvent callback: 'target' is the desired guest memory in
+ * bytes (0 means "just query").  Returns the guest's current memory
+ * footprint in bytes.
+ * NOTE(review): dev->actual is uint32_t, so 'actual << TARGET_PAGE_BITS'
+ * is a 32-bit shift and may overflow with large balloons — confirm. */
+static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t target)
+{
+ VirtIOBalloon *dev = opaque;
+
+ if (target > ram_size)
+ target = ram_size;
+
+ if (target) {
+ /* Ask the guest to give up the difference, in pages. */
+ dev->num_pages = (ram_size - target) >> TARGET_PAGE_BITS;
+ virtio_notify_config(&dev->vdev);
+ }
+
+ return ram_size - (dev->actual << TARGET_PAGE_BITS);
+}
+
+/* Create the virtio-balloon PCI device and register it as the
+ * monitor's balloon handler.  6900 (0x1af4) is the virtio vendor ID,
+ * 0x1002 the device ID, VIRTIO_ID_BALLOON the subsystem ID, 0x05 the
+ * memory-controller PCI class, 8 the config space length. */
+void *virtio_balloon_init(PCIBus *bus)
+{
+ VirtIOBalloon *s;
+
+ s = (VirtIOBalloon *)virtio_init_pci(bus, "virtio-balloon",
+ 6900, 0x1002,
+ 0, VIRTIO_ID_BALLOON,
+ 0x05, 0x00, 0x00,
+ 8, sizeof(VirtIOBalloon));
+
+ s->vdev.get_config = virtio_balloon_get_config;
+ s->vdev.set_config = virtio_balloon_set_config;
+ s->vdev.get_features = virtio_balloon_get_features;
+
+ /* Both queues share one handler; it tells them apart by pointer. */
+ s->ivq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
+ s->dvq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
+
+ qemu_add_balloon_handler(virtio_balloon_to_target, s);
+
+ return &s->vdev;
+}
diff --git a/hw/virtio-balloon.h b/hw/virtio-balloon.h
new file mode 100644
index 0000000..27d6985
--- /dev/null
+++ b/hw/virtio-balloon.h
@@ -0,0 +1,34 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Rusty Russell <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_VIRTIO_BALLOON_H
+#define _QEMU_VIRTIO_BALLOON_H
+
+/* from Linux's linux/virtio_balloon.h */
+
+/* The ID for virtio_balloon */
+#define VIRTIO_ID_BALLOON 5
+
+/* The feature bitmap for virtio balloon */
+#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */
+
+struct virtio_balloon_config
+{
+ /* Number of pages host wants Guest to give up. */
+ uint32_t num_pages;
+ /* Number of pages we've actually got in balloon. */
+ uint32_t actual;
+};
+
+#endif
diff --git a/monitor.c b/monitor.c
index 025025b..7f4c096 100644
--- a/monitor.c
+++ b/monitor.c
@@ -34,6 +34,7 @@
#include "block.h"
#include "audio/audio.h"
#include "disas.h"
+#include "balloon.h"
#include <dirent.h>
#ifdef CONFIG_PROFILER
@@ -1257,6 +1258,23 @@ static void do_wav_capture (const char *path,
}
#endif
+/* Monitor 'balloon' command: 'value' is the desired guest memory in
+ * megabytes, converted to bytes for qemu_balloon(). */
+static void do_balloon(int value)
+{
+ ram_addr_t target = value;
+ qemu_balloon(target << 20);
+}
+
+/* Monitor 'info balloon': report current guest memory in MB, or note
+ * that no balloon handler has registered (status returns 0). */
+static void do_info_balloon(void)
+{
+ ram_addr_t actual;
+
+ actual = qemu_balloon_status();
+ if (actual == 0)
+ term_printf("Ballooning not activated in VM\n");
+ else
+ term_printf("balloon: actual=%d\n", (int)(actual >> 20));
+}
+
static term_cmd_t term_cmds[] = {
{ "help|?", "s?", do_help,
"[cmd]", "show the help" },
@@ -1328,6 +1346,8 @@ static term_cmd_t term_cmds[] = {
"capture index", "stop capture" },
{ "memsave", "lis", do_memory_save,
"addr size file", "save to disk virtual memory dump starting at 'addr' of size 'size'", },
+ { "balloon", "i", do_balloon,
+ "target", "request VM to change it's memory allocation (in MB)" },
{ NULL, NULL, },
};
@@ -1388,6 +1408,8 @@ static term_cmd_t info_cmds[] = {
{ "slirp", "", do_info_slirp,
"", "show SLIRP statistics", },
#endif
+ { "balloon", "", do_info_balloon,
+ "", "show balloon information" },
{ NULL, NULL, },
};
diff --git a/vl.c b/vl.c
index 4c11be6..eca3377 100644
--- a/vl.c
+++ b/vl.c
@@ -37,6 +37,7 @@
#include "qemu-char.h"
#include "block.h"
#include "audio/audio.h"
+#include "balloon.h"
#include <unistd.h>
#include <fcntl.h>
@@ -482,6 +483,31 @@ void hw_error(const char *fmt, ...)
va_end(ap);
abort();
}
+
+/***************/
+/* ballooning */
+
+/* Single registered balloon backend (e.g. virtio-balloon) and the
+ * opaque pointer passed back to it on every call. */
+static QEMUBalloonEvent *qemu_balloon_event;
+void *qemu_balloon_event_opaque;
+
+/* Register the balloon backend; only one handler is kept — a second
+ * registration silently replaces the first. */
+void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque)
+{
+ qemu_balloon_event = func;
+ qemu_balloon_event_opaque = opaque;
+}
+
+/* Ask the backend to resize guest memory to 'target' bytes; no-op if
+ * no balloon device registered. */
+void qemu_balloon(ram_addr_t target)
+{
+ if (qemu_balloon_event)
+ qemu_balloon_event(qemu_balloon_event_opaque, target);
+}
+
+/* Query current guest memory in bytes; target 0 means "query only".
+ * Returns 0 when no balloon device is registered. */
+ram_addr_t qemu_balloon_status(void)
+{
+ if (qemu_balloon_event)
+ return qemu_balloon_event(qemu_balloon_event_opaque, 0);
+ return 0;
+}
/***********************************************************/
/* keyboard/mouse */
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
2008-04-15 22:11 [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Anthony Liguori
` (3 preceding siblings ...)
2008-04-15 22:11 ` [Qemu-devel] [PATCH 5/5] virtio balloon " Anthony Liguori
@ 2008-04-16 19:51 ` Blue Swirl
2008-04-16 19:54 ` Anthony Liguori
4 siblings, 1 reply; 13+ messages in thread
From: Blue Swirl @ 2008-04-16 19:51 UTC (permalink / raw)
To: qemu-devel
Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Paul Brook,
Aurelien Jarno
[-- Attachment #1: Type: text/plain, Size: 784 bytes --]
On 4/16/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> This patch introduces a DMA API and plumbs support through the DMA layer. We
> use a mostly opaque structure, IOVector to represent a scatter/gather list of
> physical memory. Associated with each IOVector is a read/write function and
> an opaque pointer. This allows arbitrary transformation/mapping of the
> data while providing an easy mechanism to short-cut the zero-copy case
> in the block/net backends.
This looks much better also for Sparc uses. I converted pcnet to use
the IOVectors (see patch), it does not work yet but looks doable.
IMHO the read/write functions should be a property of the bus so that
they are hidden from the device, for pcnet it does not matter as we
have to do the swapping anyway.
[-- Attachment #2: pcnet_dma_api.diff --]
[-- Type: plain/text, Size: 14685 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
2008-04-16 19:51 ` [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Blue Swirl
@ 2008-04-16 19:54 ` Anthony Liguori
2008-04-17 19:27 ` Blue Swirl
0 siblings, 1 reply; 13+ messages in thread
From: Anthony Liguori @ 2008-04-16 19:54 UTC (permalink / raw)
To: Blue Swirl
Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno,
Paul Brook
Blue Swirl wrote:
> On 4/16/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>
>> This patch introduces a DMA API and plumbs support through the DMA layer. We
>> use a mostly opaque structure, IOVector to represent a scatter/gather list of
>> physical memory. Associated with each IOVector is a read/write function and
>> an opaque pointer. This allows arbitrary transformation/mapping of the
>> data while providing an easy mechanism to short-cut the zero-copy case
>> in the block/net backends.
>>
>
> This looks much better also for Sparc uses. I converted pcnet to use
> the IOVectors (see patch), it does not work yet but looks doable.
>
Excellent!
> IMHO the read/write functions should be a property of the bus so that
> they are hidden from the device, for pcnet it does not matter as we
> have to do the swapping anyway.
>
For an IOMMU that has a per-device mapping, the read/write functions
have to operate on a per-device basis.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
2008-04-16 19:54 ` Anthony Liguori
@ 2008-04-17 19:27 ` Blue Swirl
2008-04-17 20:05 ` Anthony Liguori
0 siblings, 1 reply; 13+ messages in thread
From: Blue Swirl @ 2008-04-17 19:27 UTC (permalink / raw)
To: Anthony Liguori
Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno,
Paul Brook
[-- Attachment #1: Type: text/plain, Size: 1868 bytes --]
On 4/16/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> Blue Swirl wrote:
>
> > On 4/16/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> >
> >
> > > This patch introduces a DMA API and plumbs support through the DMA
> layer. We
> > > use a mostly opaque structure, IOVector to represent a scatter/gather
> list of
> > > physical memory. Associated with each IOVector is a read/write
> function and
> > > an opaque pointer. This allows arbitrary transformation/mapping of the
> > > data while providing an easy mechanism to short-cut the zero-copy case
> > > in the block/net backends.
> > >
> > >
> >
> > This looks much better also for Sparc uses. I converted pcnet to use
> > the IOVectors (see patch), it does not work yet but looks doable.
> >
> >
>
> Excellent!
I fixed the bug, now pcnet works. Performance is improved by a few
percent. The problem was that the vector was not freed. Maybe dynamic
allocation is a bit fragile. In this case, the length of the vector is
known, so it could be allocated once at init time. But would this
work?
The next step would be to add a vector version for packet receive. For
ESP/SCSI, in addition to bdrv_readv/writev, AIO versions would need to
be added. Last year I made a patch (attached) that made SLIRP use my
version of IOVector, I could update it to this model.
> > IMHO the read/write functions should be a property of the bus so that
> > they are hidden from the device, for pcnet it does not matter as we
> > have to do the swapping anyway.
> >
> >
>
> For an IOMMU that has a per-device mapping, the read/write functions have
> to operate on a per-device basis.
No, I meant that there could be a bus layer that did the memory access
and provided a specialized version of iovector_new without the
handlers. But I think we can live with this, if things get too ugly we
can add the layering later.
[-- Attachment #2: pcnet_prepare.diff --]
[-- Type: plain/text, Size: 14321 bytes --]
[-- Attachment #3: pcnet_dma_api.diff --]
[-- Type: plain/text, Size: 2891 bytes --]
[-- Attachment #4: slirp_iov.diff --]
[-- Type: plain/text, Size: 9315 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
2008-04-17 19:27 ` Blue Swirl
@ 2008-04-17 20:05 ` Anthony Liguori
2008-04-19 19:40 ` Blue Swirl
0 siblings, 1 reply; 13+ messages in thread
From: Anthony Liguori @ 2008-04-17 20:05 UTC (permalink / raw)
To: Blue Swirl
Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno,
Paul Brook
Blue Swirl wrote:
>
> I fixed the bug, now pcnet works. Performance is improved by a few
> percent. The problem was that the vector was not freed. Maybe dynamic
> allocation is a bit fragile. In this case, the length of the vector is
> known, so it could be allocated once at init time. But would this
> work?
>
For you, yes, but not for me. virtio scatter/gather lists can be very
long. The API tries not to make assumptions about who's allocating what
so you should be able to get away without a dynamic allocation if you
were sufficiently motivated.
> The next step would be to add a vector version for packet receive. For
> ESP/SCSI, in addition to bdrv_readv/writev, AIO versions would need to
> be added. Last year I made a patch (attached) that made SLIRP use my
> version of IOVector, I could update it to this model.
>
Yes, the vector version of packet receive is tough. I'll take a look at
your patch. Basically, you need to associate a set of RX vectors with
each VLANClientState and then when it comes time to deliver a packet to
the VLAN, before calling fd_read, see if there is an RX vector available
for the client.
In the case of tap, I want to optimize further and do the initial
readv() to one of the clients RX buffers and then copy that RX buffer to
the rest of the clients if necessary.
Regards,
Anthony Liguori
>>> IMHO the read/write functions should be a property of the bus so that
>>> they are hidden from the device, for pcnet it does not matter as we
>>> have to do the swapping anyway.
>>>
>>>
>>>
>> For an IOMMU that has a per-device mapping, the read/write functions have
>> to operate on a per-device basis.
>>
>
> No, I meant that there could be a bus layer that did the memory access
> and provided a specialized version of iovector_new without the
> handlers. But I think we can live with this, if things get too ugly we
> can add the layering later.
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
2008-04-17 20:05 ` Anthony Liguori
@ 2008-04-19 19:40 ` Blue Swirl
2008-04-19 20:02 ` [kvm-devel] " Anthony Liguori
0 siblings, 1 reply; 13+ messages in thread
From: Blue Swirl @ 2008-04-19 19:40 UTC (permalink / raw)
To: Anthony Liguori
Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno,
Paul Brook
[-- Attachment #1: Type: text/plain, Size: 910 bytes --]
On 4/17/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> Yes, the vector version of packet receive is tough. I'll take a look at
> your patch. Basically, you need to associate a set of RX vectors with each
> VLANClientState and then when it comes time to deliver a packet to the VLAN,
> before calling fd_read, see if there is an RX vector available for the
> client.
>
> In the case of tap, I want to optimize further and do the initial readv()
> to one of the client's RX buffers and then copy that RX buffer to the rest of
> the clients if necessary.
The vector versions should also help SLIRP to add IP and Ethernet
headers to the incoming packets.
I made an initial version of the vectored AIO SCSI with ESP. It does
not work, but I can see that just using the vectors won't give too
much extra performance, because at least initially the vector length
is 1. Collecting the statuses may be tricky.
[-- Attachment #2: block_aio_rw_v.diff --]
[-- Type: text/plain, Size: 10010 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [kvm-devel] [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
2008-04-19 19:40 ` Blue Swirl
@ 2008-04-19 20:02 ` Anthony Liguori
2008-04-20 6:42 ` Blue Swirl
0 siblings, 1 reply; 13+ messages in thread
From: Anthony Liguori @ 2008-04-19 20:02 UTC (permalink / raw)
To: Blue Swirl
Cc: Anthony Liguori, kvm-devel, qemu-devel, Marcelo Tosatti,
Paul Brook, Aurelien Jarno
Blue Swirl wrote:
> On 4/17/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>
>> Yes, the vector version of packet receive is tough. I'll take a look at
>> your patch. Basically, you need to associate a set of RX vectors with each
>> VLANClientState and then when it comes time to deliver a packet to the VLAN,
>> before calling fd_read, see if there is an RX vector available for the
>> client.
>>
>> In the case of tap, I want to optimize further and do the initial readv()
>> to one of the clients RX buffers and then copy that RX buffer to the rest of
>> the clients if necessary.
>>
>
> The vector versions should also help SLIRP to add IP and Ethernet
> headers to the incoming packets.
>
Yeah, I'm hoping that with my posted linux-aio interface, I can add
vector support since linux-aio has a proper asynchronous vector function.
Are we happy with the DMA API? If so, we should commit it now so we can
start adding proper vector interfaces for net/block.
Regards,
Anthony Liguori
> I made an initial version of the vectored AIO SCSI with ESP. It does
> not work, but I can see that just using the vectors won't give too
> much extra performance, because at least initially the vector length
> is 1. Collecting the statuses may be tricky.
>
> ------------------------------------------------------------------------
>
> -------------------------------------------------------------------------
> This SF.net email is sponsored by the 2008 JavaOne(SM) Conference
> Don't miss this year's exciting event. There's still time to save $100.
> Use priority code J8TL2D2.
> http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone
> ------------------------------------------------------------------------
>
> _______________________________________________
> kvm-devel mailing list
> kvm-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [kvm-devel] [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
2008-04-19 20:02 ` [kvm-devel] " Anthony Liguori
@ 2008-04-20 6:42 ` Blue Swirl
2008-04-20 19:29 ` Anthony Liguori
0 siblings, 1 reply; 13+ messages in thread
From: Blue Swirl @ 2008-04-20 6:42 UTC (permalink / raw)
To: Anthony Liguori
Cc: Anthony Liguori, kvm-devel, qemu-devel, Marcelo Tosatti,
Paul Brook, Aurelien Jarno
On 4/19/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Blue Swirl wrote:
>
> > On 4/17/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> >
> >
> > > Yes, the vector version of packet receive is tough. I'll take a look
> at
> > > your patch. Basically, you need to associate a set of RX vectors with
> each
> > > VLANClientState and then when it comes time to deliver a packet to the
> VLAN,
> > > before calling fd_read, see if there is an RX vector available for the
> > > client.
> > >
> > > In the case of tap, I want to optimize further and do the initial
> readv()
> > > to one of the clients RX buffers and then copy that RX buffer to the
> rest of
> > > the clients if necessary.
> > >
> > >
> >
> > The vector versions should also help SLIRP to add IP and Ethernet
> > headers to the incoming packets.
> >
> >
>
> Yeah, I'm hoping that with my posted linux-aio interface, I can add vector
> support since linux-aio has a proper asynchronous vector function.
>
> Are we happy with the DMA API? If so, we should commit it now so we can
> start adding proper vector interfaces for net/block.
Well, the IOVector part and bdrv_readv look OK, except for the heavy
mallocing involved.
I'm not so sure about the DMA side and how everything fits together
for zero-copy IO. For example, do we still need explicit translation
at some point?
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [kvm-devel] [Qemu-devel] [PATCH 1/5] PCI DMA API (v3)
2008-04-20 6:42 ` Blue Swirl
@ 2008-04-20 19:29 ` Anthony Liguori
0 siblings, 0 replies; 13+ messages in thread
From: Anthony Liguori @ 2008-04-20 19:29 UTC (permalink / raw)
To: Blue Swirl
Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno,
Paul Brook
Blue Swirl wrote:
> On 4/19/08, Anthony Liguori <anthony@codemonkey.ws> wrote:
>
> Well, the IOVector part and bdrv_readv look OK, except for the heavy
> mallocing involved.
>
I don't think that in practice, malloc is going to have any sort of
performance impact. If it does, it's easy enough to implement a small
object allocator for common, small vector sizes.
> I'm not so sure about the DMA side and how everything fits together
> for zero-copy IO. For example, do we still need explicit translation
> at some point?
I'm thinking that zero copy will be implemented by setting the map and
unmap functions to NULL by default (instead of to the PCI read/write
functions). Then the bus can decide whether copy functions are needed.
I'll send an updated patch series tomorrow that includes this functionality.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2008-04-20 19:30 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-15 22:11 [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 2/5] virtio for QEMU (v3) Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 3/5] virtio network driver (v3) Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 4/5] virtio block " Anthony Liguori
2008-04-15 22:11 ` [Qemu-devel] [PATCH 5/5] virtio balloon " Anthony Liguori
2008-04-16 19:51 ` [Qemu-devel] [PATCH 1/5] PCI DMA API (v3) Blue Swirl
2008-04-16 19:54 ` Anthony Liguori
2008-04-17 19:27 ` Blue Swirl
2008-04-17 20:05 ` Anthony Liguori
2008-04-19 19:40 ` Blue Swirl
2008-04-19 20:02 ` [kvm-devel] " Anthony Liguori
2008-04-20 6:42 ` Blue Swirl
2008-04-20 19:29 ` Anthony Liguori
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).