qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 1/6] Use ram_addr_t for cpu_get_physical_page_desc
@ 2008-03-29 21:55 Anthony Liguori
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 2/6] PCI DMA API Anthony Liguori
                   ` (4 more replies)
  0 siblings, 5 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-29 21:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

uint32_t is the wrong type to use to represent physical addresses.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/cpu-all.h b/cpu-all.h
index 2a2b197..9e5d33b 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -834,7 +834,7 @@ typedef uint32_t CPUReadMemoryFunc(void *opaque, target_phys_addr_t addr);
 void cpu_register_physical_memory(target_phys_addr_t start_addr,
                                   unsigned long size,
                                   unsigned long phys_offset);
-uint32_t cpu_get_physical_page_desc(target_phys_addr_t addr);
+ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr);
 ram_addr_t qemu_ram_alloc(unsigned int size);
 void qemu_ram_free(ram_addr_t addr);
 int cpu_register_io_memory(int io_index,
diff --git a/exec.c b/exec.c
index 48dabd6..c25872d 100644
--- a/exec.c
+++ b/exec.c
@@ -2075,7 +2075,7 @@ void cpu_register_physical_memory(target_phys_addr_t start_addr,
 }
 
 /* XXX: temporary until new memory mapping API */
-uint32_t cpu_get_physical_page_desc(target_phys_addr_t addr)
+ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr)
 {
     PhysPageDesc *p;
 

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-29 21:55 [Qemu-devel] [PATCH 1/6] Use ram_addr_t for cpu_get_physical_page_desc Anthony Liguori
@ 2008-03-29 21:55 ` Anthony Liguori
  2008-03-30  7:06   ` Blue Swirl
                     ` (2 more replies)
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 3/6] virtio for QEMU Anthony Liguori
                   ` (3 subsequent siblings)
  4 siblings, 3 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-29 21:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

This patch introduces a PCI DMA API and some generic code to support other DMA
APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
maps a PhysIOVector, which is composed of target_phys_addr_t, into an IOVector,
which is composed of void *.

This enables zero-copy IO to be performed without introducing assumptions of
phys_ram_base.  This API is at the PCI device level to enable support of
per-device IOMMU remapping.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile.target b/Makefile.target
index 5ac29a7..94f3e58 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -173,7 +173,7 @@ all: $(PROGS)
 #########################################################
 # cpu emulator library
 LIBOBJS=exec.o kqemu.o translate-all.o cpu-exec.o\
-        translate.o host-utils.o
+        translate.o host-utils.o iovector.o
 ifndef CONFIG_NO_DYNGEN_OP
 LIBOBJS+=op.o
 endif
diff --git a/cpu-all.h b/cpu-all.h
index 9e5d33b..23b0a11 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -24,6 +24,8 @@
 #define WORDS_ALIGNED
 #endif
 
+#include "iovector.h"
+
 /* some important defines:
  *
  * WORDS_ALIGNED : if defined, the host cpu can only make word aligned
@@ -835,6 +837,8 @@ void cpu_register_physical_memory(target_phys_addr_t start_addr,
                                   unsigned long size,
                                   unsigned long phys_offset);
 ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr);
+IOVector *cpu_translate_physical_page_vector(PhysIOVector *phys);
+void cpu_physical_page_vector_set_dirty(PhysIOVector *phys);
 ram_addr_t qemu_ram_alloc(unsigned int size);
 void qemu_ram_free(ram_addr_t addr);
 int cpu_register_io_memory(int io_index,
diff --git a/exec.c b/exec.c
index c25872d..4b4b1a9 100644
--- a/exec.c
+++ b/exec.c
@@ -2085,6 +2085,65 @@ ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr)
     return p->phys_offset;
 }
 
+IOVector *cpu_translate_physical_page_vector(PhysIOVector *phys)
+{
+    unsigned int i;
+    IOVector *virt;
+
+    /* QEMU represents guest physical memory as virtually contiguous so we
+       never should need more IOVectors than PhysIOVectors */
+
+    virt = qemu_malloc(sizeof(IOVector) + phys->num * sizeof(IOVectorElement));
+
+    virt->num = phys->num;
+    for (i = 0; i < phys->num; i++) {
+	ram_addr_t base_offset = 0;
+	ram_addr_t offset;
+
+	/* we need to check that the guest is trying to DMA to somewhere they
+	   shouldn't */
+	for (offset = 0; offset < phys->sg[i].len; offset += TARGET_PAGE_SIZE){
+	    ram_addr_t phys_offset;
+
+	    /* DMA'ing to MMIO, just skip */
+	    phys_offset = cpu_get_physical_page_desc(phys->sg[i].base);
+	    if ((phys_offset & ~TARGET_PAGE_MASK) != IO_MEM_RAM) {
+		fprintf(stderr, "dma'ing to non-RAM region\n");
+		qemu_free(virt);
+		return NULL;
+	    }
+
+	    phys_offset &= TARGET_PAGE_MASK;
+	    phys_offset += phys->sg[i].base & ~TARGET_PAGE_MASK;
+
+	    if (offset == 0)
+		base_offset = phys_offset;
+	    else if ((phys_offset - base_offset) != offset) {
+		fprintf(stderr, "bug: discontiguous guest memory?\n");
+		qemu_free(virt);
+		return NULL;
+	    }
+	}
+
+	virt->sg[i].base = phys_ram_base + base_offset;
+	virt->sg[i].len = phys->sg[i].len;
+    }
+
+    return virt;
+}
+
+void cpu_physical_page_vector_set_dirty(PhysIOVector *phys)
+{
+    int i;
+
+    for (i = 0; i < phys->num; i++) {
+	ram_addr_t offset;
+	for (offset = 0; offset < phys->sg[i].len;
+	     offset += TARGET_PAGE_SIZE)
+	    cpu_physical_memory_set_dirty(phys->sg[i].base + offset);
+    }
+}
+
 /* XXX: better than nothing */
 ram_addr_t qemu_ram_alloc(unsigned int size)
 {
diff --git a/hw/pci.c b/hw/pci.c
index bc55989..99c206f 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -145,6 +145,20 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
     return 0;
 }
 
+IOVector *pci_device_dma_map(PCIDevice *s, PhysIOVector *phys)
+{
+    return cpu_translate_physical_page_vector(phys);
+}
+
+void pci_device_dma_unmap(PCIDevice *s, PhysIOVector *phys, IOVector *virt,
+			  int write)
+{
+    /* mark memory as dirty if necessary */
+    if (write)
+	cpu_physical_page_vector_set_dirty(phys);
+    qemu_free(virt);
+}
+
 /* -1 for devfn means auto assign */
 PCIDevice *pci_register_device(PCIBus *bus, const char *name,
                                int instance_size, int devfn,
diff --git a/hw/pci.h b/hw/pci.h
index e870987..b965919 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -81,6 +81,10 @@ void pci_default_write_config(PCIDevice *d,
 void pci_device_save(PCIDevice *s, QEMUFile *f);
 int pci_device_load(PCIDevice *s, QEMUFile *f);
 
+IOVector *pci_device_dma_map(PCIDevice *s, PhysIOVector *phys);
+void pci_device_dma_unmap(PCIDevice *s, PhysIOVector *phys, IOVector *virt,
+			  int write);
+
 typedef void (*pci_set_irq_fn)(qemu_irq *pic, int irq_num, int level);
 typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num);
 PCIBus *pci_register_bus(pci_set_irq_fn set_irq, pci_map_irq_fn map_irq,
diff --git a/iovector.c b/iovector.c
new file mode 100644
index 0000000..432b483
--- /dev/null
+++ b/iovector.c
@@ -0,0 +1,121 @@
+/*
+ * IO Vectors
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "iovector.h"
+
+static size_t iovector_rw(void *buffer, size_t size, IOVector *iov, int read)
+{
+    uint8_t *ptr = buffer;
+    size_t offset = 0;
+    int i;
+
+    for (i = 0; i < iov->num; i++) {
+	size_t len;
+
+	len = MIN(iov->sg[i].len, size - offset);
+
+	if (read)
+	    memcpy(ptr + offset, iov->sg[i].base, len);
+	else
+	    memcpy(iov->sg[i].base, ptr + offset, len);
+
+	offset += len;
+    }
+
+    return offset;
+}
+
+size_t memcpy_from_iovector(void *buffer, size_t offset, size_t size,
+			    const IOVector *iov)
+{
+    IOVector *sg;
+    size_t len;
+
+    if (offset)
+	sg = iovector_trim(iov, offset, size);
+    else
+	sg = (IOVector *)iov;
+
+    len = iovector_rw(buffer, size, sg, 1);
+
+    if (offset)
+	qemu_free(sg);
+
+    return len;
+}
+
+size_t memcpy_to_iovector(const void *buffer, size_t offset, size_t size,
+			  IOVector *iov)
+{
+    IOVector *sg;
+    size_t len;
+
+    if (offset)
+	sg = iovector_trim(iov, offset, size);
+    else
+	sg = iov;
+
+    len = iovector_rw((void *)buffer, size, sg, 0);
+
+    if (offset)
+	qemu_free(sg);
+
+    return len;
+}
+
+IOVector *iovector_trim(const IOVector *iov, size_t offset, size_t size)
+{
+    IOVector *ret;
+    size_t off, total_size;
+    int i;
+
+    ret = qemu_malloc(sizeof(IOVector) + sizeof(IOVectorElement) * iov->num);
+    if (ret == NULL)
+	return NULL;
+
+    total_size = 0;
+    ret->num = 0;
+    off = 0;
+    for (i = 0; i < iov->num; i++) {
+	if (off >= offset || offset < (off + iov->sg[i].len)) {
+	    size_t fudge = 0;
+	    if (off < offset)
+		fudge = offset - off;
+
+	    ret->sg[ret->num].base = iov->sg[i].base + fudge;
+	    ret->sg[ret->num].len = MIN(iov->sg[i].len - fudge,
+					size - total_size);
+	    total_size += ret->sg[ret->num].len;
+	    ret->num++;
+
+	    if (total_size == size)
+		break;
+	}
+
+	off += iov->sg[i].len;
+    }
+
+    return ret;
+}
+
+size_t iovector_size(const IOVector *iov)
+{
+    size_t size = 0;
+    int i;
+
+    for (i = 0; i < iov->num; i++)
+	size += iov->sg[i].len;
+    
+    return size;
+}
diff --git a/iovector.h b/iovector.h
new file mode 100644
index 0000000..042ea3a
--- /dev/null
+++ b/iovector.h
@@ -0,0 +1,49 @@
+/*
+ * IO Vectors
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_IOVECTOR_H
+#define _QEMU_IOVECTOR_H
+
+typedef struct IOVectorElement IOVectorElement;
+
+typedef struct PhysIOVectorElement PhysIOVectorElement;
+
+typedef struct IOVector
+{
+    int num;
+    struct IOVectorElement {
+	void *base;
+	size_t len;
+    } sg[0];
+} IOVector;
+
+typedef struct PhysIOVector
+{
+    int num;
+    struct PhysIOVectorElement {
+	target_phys_addr_t base;
+	size_t len;
+    } sg[0];
+} PhysIOVector;
+
+size_t memcpy_from_iovector(void *buffer, size_t offset, size_t size,
+			    const IOVector *iov);
+
+size_t memcpy_to_iovector(const void *buffer, size_t offset, size_t size,
+			  IOVector *iov);
+
+IOVector *iovector_trim(const IOVector *iov, size_t offset, size_t size);
+
+size_t iovector_size(const IOVector *iov);
+
+#endif

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 3/6] virtio for QEMU
  2008-03-29 21:55 [Qemu-devel] [PATCH 1/6] Use ram_addr_t for cpu_get_physical_page_desc Anthony Liguori
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 2/6] PCI DMA API Anthony Liguori
@ 2008-03-29 21:55 ` Anthony Liguori
  2008-03-30 17:25   ` Dor Laor
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 4/6] virtio network driver Anthony Liguori
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 23+ messages in thread
From: Anthony Liguori @ 2008-03-29 21:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

This patch introduces virtio support over PCI.  virtio is a generic virtual IO
framework for Linux first introduced in 2.6.23.  Since 2.6.25, virtio has
supported a PCI transport which this patch implements.

Since the last time these patches were posted to qemu-devel, I've reworked it
to use the proper access functions to manipulate guest memory.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile.target b/Makefile.target
index 94f3e58..6815ba8 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -534,6 +534,9 @@ OBJS += pcnet.o
 OBJS += rtl8139.o
 OBJS += e1000.o
 
+# virtio devices
+OBJS += virtio.o
+
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
 OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o
diff --git a/cutils.c b/cutils.c
index 9ef2fa6..814b3c4 100644
--- a/cutils.c
+++ b/cutils.c
@@ -95,3 +95,14 @@ time_t mktimegm(struct tm *tm)
     t += 3600 * tm->tm_hour + 60 * tm->tm_min + tm->tm_sec;
     return t;
 }
+
+int fls(int i)
+{
+    int bit;
+
+    for (bit=31; bit >= 0; bit--)
+        if (i & (1 << bit))
+            return bit+1;
+
+    return 0;
+}
diff --git a/hw/virtio-pci.h b/hw/virtio-pci.h
new file mode 100644
index 0000000..9262e49
--- /dev/null
+++ b/hw/virtio-pci.h
@@ -0,0 +1,65 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *  Rusty Russell     <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _VIRTIO_PCI_H
+#define _VIRTIO_PCI_H
+
+/* from Linux's linux/virtio_ring.h */
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT	1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE	2
+
+/* This means don't notify other side when buffer added. */
+#define VRING_USED_F_NO_NOTIFY	1
+/* This means don't interrupt guest when buffer consumed. */
+#define VRING_AVAIL_F_NO_INTERRUPT	1
+
+#define VIRTIO_PCI_QUEUE_MAX	16
+
+/* from Linux's linux/virtio_pci.h */
+
+/* A 32-bit r/o bitmask of the features supported by the host */
+#define VIRTIO_PCI_HOST_FEATURES	0
+
+/* A 32-bit r/w bitmask of features activated by the guest */
+#define VIRTIO_PCI_GUEST_FEATURES	4
+
+/* A 32-bit r/w PFN for the currently selected queue */
+#define VIRTIO_PCI_QUEUE_PFN		8
+
+/* A 16-bit r/o queue size for the currently selected queue */
+#define VIRTIO_PCI_QUEUE_NUM		12
+
+/* A 16-bit r/w queue selector */
+#define VIRTIO_PCI_QUEUE_SEL		14
+
+/* A 16-bit r/w queue notifier */
+#define VIRTIO_PCI_QUEUE_NOTIFY		16
+
+/* An 8-bit device status register.  */
+#define VIRTIO_PCI_STATUS		18
+
+/* An 8-bit r/o interrupt status register.  Reading the value will return the
+ * current contents of the ISR and will also clear it.  This is effectively
+ * a read-and-acknowledge. */
+#define VIRTIO_PCI_ISR			19
+
+#define VIRTIO_PCI_CONFIG		20
+
+/* Virtio ABI version, if we increment this, we break the guest driver. */
+#define VIRTIO_PCI_ABI_VERSION		0
+
+#endif
diff --git a/hw/virtio.c b/hw/virtio.c
new file mode 100644
index 0000000..9e75b2f
--- /dev/null
+++ b/hw/virtio.c
@@ -0,0 +1,594 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */ 
+
+#include <inttypes.h>
+#include <err.h>
+
+#include "virtio.h"
+#include "sysemu.h"
+#include "virtio-pci.h"
+
+typedef struct VRingDesc
+{
+    /* Address (guest-physical). */
+    uint64_t addr;
+    /* Length. */
+    uint32_t len;
+    /* The flags as indicated above. */
+    uint16_t flags;
+    /* We chain unused descriptors via this, too */
+    uint16_t next;
+} VRingDesc;
+
+typedef struct VRingAvail
+{
+    uint16_t flags;
+    uint16_t idx;
+    uint16_t ring[];
+} VRingAvail;
+
+typedef struct VRingUsedElem
+{
+    /* Index of start of used descriptor chain. */
+    uint32_t id;
+    /* Total length of the descriptor chain which was used (written to) */
+    uint32_t len;
+} VRingUsedElem;
+
+typedef struct VRingUsed
+{
+    uint16_t flags;
+    uint16_t idx;
+    VRingUsedElem ring[];
+} VRingUsed;
+
+typedef struct VRing
+{
+    unsigned int num;
+    target_phys_addr_t desc;
+    target_phys_addr_t avail;
+    target_phys_addr_t used;
+} VRing;
+
+struct VirtQueue
+{
+    VRing vring;
+    uint32_t pfn;
+    uint16_t last_avail_idx;
+    void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
+    int index;
+    VirtIODevice *vdev;
+};
+
+/* QEMU doesn't strictly need write barriers since everything runs in
+ * lock-step.  We'll leave the calls to wmb() in though to make it obvious for
+ * KVM or if kqemu gets SMP support.
+ */
+
+#define wmb() do { } while (0)
+
+/* FIXME put this somewhere generic */
+#define offsetof(type, member) ((unsigned long)(&((type *)0)->member))
+
+/* virt queue functions */
+
+static void virtqueue_init(VirtQueue *vq, target_phys_addr_t p)
+{
+    vq->vring.desc = p;
+    vq->vring.avail = p + vq->vring.num * 16;
+    vq->vring.used = vq->vring.avail + 2 * (2 + vq->vring.num);
+    vq->vring.used = TARGET_PAGE_ALIGN(vq->vring.used);
+}
+
+static uint64_t vring_desc_addr(VirtQueue *vq, unsigned int i)
+{
+    return ldq_phys(vq->vring.desc + i * sizeof(VRingDesc) +
+		    offsetof(VRingDesc, addr));
+}
+
+static uint32_t vring_desc_len(VirtQueue *vq, unsigned int i)
+{
+    return ldl_phys(vq->vring.desc + i * sizeof(VRingDesc) + 
+		    offsetof(VRingDesc, len));
+}
+
+static uint16_t vring_desc_flags(VirtQueue *vq, unsigned int i)
+{
+    return lduw_phys(vq->vring.desc + i * sizeof(VRingDesc) + 
+		     offsetof(VRingDesc, flags));
+}
+
+static uint16_t vring_desc_next(VirtQueue *vq, unsigned int i)
+{
+    return lduw_phys(vq->vring.desc + i * sizeof(VRingDesc) + 
+		     offsetof(VRingDesc, next));
+}
+
+static uint16_t vring_avail_flags(VirtQueue *vq)
+{
+    return lduw_phys(vq->vring.avail + offsetof(VRingAvail, flags));
+}
+
+static uint16_t vring_avail_idx(VirtQueue *vq)
+{
+    return lduw_phys(vq->vring.avail + offsetof(VRingAvail, idx));
+}
+
+static uint16_t vring_avail_ring(VirtQueue *vq, unsigned int i)
+{
+    return lduw_phys(vq->vring.avail + offsetof(VRingAvail, ring[i]));
+}
+
+static void vring_used_set_flag(VirtQueue *vq, uint16_t flag)
+{
+    stw_phys(vq->vring.used + offsetof(VRingUsed, flags),
+	     lduw_phys(vq->vring.used + offsetof(VRingUsed, flags)) | flag);
+}
+
+static void vring_used_unset_flag(VirtQueue *vq, uint16_t flag)
+{
+    stw_phys(vq->vring.used + offsetof(VRingUsed, flags),
+	     lduw_phys(vq->vring.used + offsetof(VRingUsed, flags)) & ~flag);
+}
+
+static uint16_t vring_used_get_idx(VirtQueue *vq)
+{
+    return lduw_phys(vq->vring.used + offsetof(VRingUsed, idx));
+}
+
+static void vring_used_set_idx(VirtQueue *vq, uint16_t value)
+{
+    stw_phys(vq->vring.used + offsetof(VRingUsed, idx), value);
+}
+
+static void vring_used_set_ring(VirtQueue *vq, unsigned int i,
+				uint32_t id, uint32_t len)
+{
+    stl_phys(vq->vring.used + offsetof(VRingUsed, ring[i].id), id);
+    stl_phys(vq->vring.used + offsetof(VRingUsed, ring[i].len), len);
+}
+
+static unsigned virtqueue_next_desc(VirtQueue *vq, unsigned int i)
+{
+    unsigned int next;
+
+    /* If this descriptor says it doesn't chain, we're done. */
+    if (!(vring_desc_flags(vq, i) & VRING_DESC_F_NEXT))
+	return vq->vring.num;
+
+    /* Check they're not leading us off end of descriptors. */
+    next = vring_desc_next(vq, i);
+    /* Make sure compiler knows to grab that: we don't want it changing! */
+    wmb();
+
+    if (next >= vq->vring.num)
+	errx(1, "Desc next is %u", next);
+
+    return next;
+}
+
+void virtqueue_push(VirtQueue *vq, VirtQueueElement *elem, unsigned int len)
+{
+    uint16_t idx;
+
+    pci_device_dma_unmap(&vq->vdev->pci_dev, elem->phys_in, elem->virt_in, 1);
+    pci_device_dma_unmap(&vq->vdev->pci_dev, elem->phys_out, elem->virt_out, 0);
+
+    idx = vring_used_get_idx(vq);
+    vring_used_set_ring(vq, idx % vq->vring.num, elem->index, len);
+    wmb();
+    vring_used_set_idx(vq, idx + 1);
+
+    qemu_free(elem->phys_in);
+    qemu_free(elem->phys_out);
+    qemu_free(elem);
+}
+
+VirtQueueElement *virtqueue_pop(VirtQueue *vq)
+{
+    unsigned int i, head;
+    unsigned int position;
+    VirtQueueElement *elem;
+
+    /* Check it isn't doing very strange things with descriptor numbers. */
+    if ((uint16_t)(vring_avail_idx(vq) - vq->last_avail_idx) > vq->vring.num)
+	errx(1, "Guest moved used index from %u to %u",
+	     vq->last_avail_idx, vring_avail_idx(vq));
+
+    /* If there's nothing new since last we looked, return invalid. */
+    if (vring_avail_idx(vq) == vq->last_avail_idx)
+	return NULL;
+
+    /* Grab the next descriptor number they're advertising, and increment
+     * the index we've seen. */
+    head = vring_avail_ring(vq, vq->last_avail_idx++ % vq->vring.num);
+
+    /* If their number is silly, that's a fatal mistake. */
+    if (head >= vq->vring.num)
+	errx(1, "Guest says index %u is available", head);
+
+    /* When we start there are none of either input nor output. */
+    position = 0;
+
+    elem = qemu_mallocz(sizeof(VirtQueueElement));
+
+    elem->phys_in = qemu_mallocz(sizeof(PhysIOVector) +
+				 vq->vring.num * sizeof(PhysIOVectorElement));
+    elem->phys_out = qemu_mallocz(sizeof(PhysIOVector) +
+				  vq->vring.num * sizeof(PhysIOVectorElement));
+
+    i = head;
+    do {
+	PhysIOVectorElement *sge;
+
+	if (vring_desc_flags(vq, i) & VRING_DESC_F_WRITE)
+	    sge = &elem->phys_in->sg[elem->phys_in->num++];
+	else
+	    sge = &elem->phys_out->sg[elem->phys_out->num++];
+
+	/* Grab the first descriptor, and check it's OK. */
+	sge->len = vring_desc_len(vq, i);
+	sge->base = vring_desc_addr(vq, i);
+
+	/* If we've got too many, that implies a descriptor loop. */
+	if ((elem->phys_in->num + elem->phys_out->num) > vq->vring.num)
+	    errx(1, "Looped descriptor");
+    } while ((i = virtqueue_next_desc(vq, i)) != vq->vring.num);
+
+    elem->virt_in = pci_device_dma_map(&vq->vdev->pci_dev, elem->phys_in);
+    elem->virt_out = pci_device_dma_map(&vq->vdev->pci_dev, elem->phys_out);
+    elem->index = head;
+
+    if (elem->virt_in == NULL || elem->virt_out == NULL)
+	errx(1, "Bad DMA");
+
+    return elem;
+}
+
+/* virtio device */
+
+static VirtIODevice *to_virtio_device(PCIDevice *pci_dev)
+{
+    return (VirtIODevice *)pci_dev;
+}
+
+static void virtio_update_irq(VirtIODevice *vdev)
+{
+    qemu_set_irq(vdev->pci_dev.irq[0], vdev->isr & 1);
+}
+
+void virtio_reset(void *opaque)
+{
+    VirtIODevice *vdev = opaque;
+    int i;
+
+    vdev->features = 0;
+    vdev->queue_sel = 0;
+    vdev->status = 0;
+    vdev->isr = 0;
+
+    for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+        vdev->vq[i].vring.desc = 0;
+        vdev->vq[i].vring.avail = 0;
+        vdev->vq[i].vring.used = 0;
+        vdev->vq[i].last_avail_idx = 0;
+        vdev->vq[i].pfn = 0;
+    }
+}
+
+static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
+{
+    VirtIODevice *vdev = to_virtio_device(opaque);
+    ram_addr_t pa;
+
+    addr -= vdev->addr;
+
+    switch (addr) {
+    case VIRTIO_PCI_GUEST_FEATURES:
+	if (vdev->set_features)
+	    vdev->set_features(vdev, val);
+	vdev->features = val;
+	break;
+    case VIRTIO_PCI_QUEUE_PFN:
+	pa = (ram_addr_t)val << TARGET_PAGE_BITS;
+	vdev->vq[vdev->queue_sel].pfn = val;
+	if (pa == 0)
+            virtio_reset(vdev);
+	else
+	    virtqueue_init(&vdev->vq[vdev->queue_sel], pa);
+	break;
+    case VIRTIO_PCI_QUEUE_SEL:
+	if (val < VIRTIO_PCI_QUEUE_MAX)
+	    vdev->queue_sel = val;
+	break;
+    case VIRTIO_PCI_QUEUE_NOTIFY:
+	if (val < VIRTIO_PCI_QUEUE_MAX && vdev->vq[val].vring.desc)
+	    vdev->vq[val].handle_output(vdev, &vdev->vq[val]);
+	break;
+    case VIRTIO_PCI_STATUS:
+	vdev->status = val & 0xFF;
+	if (vdev->status == 0)
+	    virtio_reset(vdev);
+	break;
+    }
+}
+
+static uint32_t virtio_ioport_read(void *opaque, uint32_t addr)
+{
+    VirtIODevice *vdev = to_virtio_device(opaque);
+    uint32_t ret = 0xFFFFFFFF;
+
+    addr -= vdev->addr;
+
+    switch (addr) {
+    case VIRTIO_PCI_HOST_FEATURES:
+	ret = vdev->get_features(vdev);
+	break;
+    case VIRTIO_PCI_GUEST_FEATURES:
+	ret = vdev->features;
+	break;
+    case VIRTIO_PCI_QUEUE_PFN:
+	ret = vdev->vq[vdev->queue_sel].pfn;
+	break;
+    case VIRTIO_PCI_QUEUE_NUM:
+	ret = vdev->vq[vdev->queue_sel].vring.num;
+	break;
+    case VIRTIO_PCI_QUEUE_SEL:
+	ret = vdev->queue_sel;
+	break;
+    case VIRTIO_PCI_STATUS:
+	ret = vdev->status;
+	break;
+    case VIRTIO_PCI_ISR:
+	/* reading from the ISR also clears it. */
+	ret = vdev->isr;
+	vdev->isr = 0;
+	virtio_update_irq(vdev);
+	break;
+    default:
+	break;
+    }
+
+    return ret;
+}
+
+static uint32_t virtio_config_readb(void *opaque, uint32_t addr)
+{
+    VirtIODevice *vdev = opaque;
+    uint8_t val;
+
+    addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+    if (addr > (vdev->config_len - sizeof(val)))
+	return (uint32_t)-1;
+
+    memcpy(&val, vdev->config + addr, sizeof(val));
+    return val;
+}
+
+static uint32_t virtio_config_readw(void *opaque, uint32_t addr)
+{
+    VirtIODevice *vdev = opaque;
+    uint16_t val;
+
+    addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+    if (addr > (vdev->config_len - sizeof(val)))
+	return (uint32_t)-1;
+
+    memcpy(&val, vdev->config + addr, sizeof(val));
+    return val;
+}
+
+static uint32_t virtio_config_readl(void *opaque, uint32_t addr)
+{
+    VirtIODevice *vdev = opaque;
+    uint32_t val;
+
+    addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+    if (addr > (vdev->config_len - sizeof(val)))
+	return (uint32_t)-1;
+
+    memcpy(&val, vdev->config + addr, sizeof(val));
+    return val;
+}
+
+static void virtio_config_writeb(void *opaque, uint32_t addr, uint32_t data)
+{
+    VirtIODevice *vdev = opaque;
+    uint8_t val = data;
+
+    addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+    if (addr > (vdev->config_len - sizeof(val)))
+	return;
+
+    memcpy(vdev->config + addr, &val, sizeof(val));
+
+    if (vdev->set_config)
+	vdev->set_config(vdev, vdev->config);
+}
+
+static void virtio_config_writew(void *opaque, uint32_t addr, uint32_t data)
+{
+    VirtIODevice *vdev = opaque;
+    uint16_t val = data;
+
+    addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+    if (addr > (vdev->config_len - sizeof(val)))
+	return;
+
+    memcpy(vdev->config + addr, &val, sizeof(val));
+
+    if (vdev->set_config)
+	vdev->set_config(vdev, vdev->config);
+}
+
+static void virtio_config_writel(void *opaque, uint32_t addr, uint32_t data)
+{
+    VirtIODevice *vdev = opaque;
+    uint32_t val = data;
+
+    addr -= vdev->addr + VIRTIO_PCI_CONFIG;
+    if (addr > (vdev->config_len - sizeof(val)))
+	return;
+
+    memcpy(vdev->config + addr, &val, sizeof(val));
+
+    if (vdev->set_config)
+	vdev->set_config(vdev, vdev->config);
+}
+
+static void virtio_map(PCIDevice *pci_dev, int region_num,
+		       uint32_t addr, uint32_t size, int type)
+{
+    VirtIODevice *vdev = to_virtio_device(pci_dev);
+    int i;
+
+    vdev->addr = addr;
+    for (i = 0; i < 3; i++) {
+	register_ioport_write(addr, 20, 1 << i, virtio_ioport_write, vdev);
+	register_ioport_read(addr, 20, 1 << i, virtio_ioport_read, vdev);
+    }
+
+    if (vdev->config_len) {
+	register_ioport_write(addr + 20, vdev->config_len, 1,
+			      virtio_config_writeb, vdev);
+	register_ioport_write(addr + 20, vdev->config_len, 2,
+			      virtio_config_writew, vdev);
+	register_ioport_write(addr + 20, vdev->config_len, 4,
+			      virtio_config_writel, vdev);
+	register_ioport_read(addr + 20, vdev->config_len, 1,
+			     virtio_config_readb, vdev);
+	register_ioport_read(addr + 20, vdev->config_len, 2,
+			     virtio_config_readw, vdev);
+	register_ioport_read(addr + 20, vdev->config_len, 4,
+			     virtio_config_readl, vdev);
+
+	vdev->get_config(vdev, vdev->config);
+    }
+}
+
+VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
+			    void (*handle_output)(VirtIODevice *, VirtQueue *))
+{
+    int i;
+
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+	if (vdev->vq[i].vring.num == 0)
+	    break;
+    }
+
+    if (i == VIRTIO_PCI_QUEUE_MAX)
+	abort();
+
+    vdev->vq[i].vring.num = queue_size;
+    vdev->vq[i].handle_output = handle_output;
+    vdev->vq[i].index = i;
+    vdev->vq[i].vdev = vdev;
+
+    return &vdev->vq[i];
+}
+
+void virtio_notify_config(VirtIODevice *vdev)
+{
+    /* make sure we have the latest config */
+    vdev->get_config(vdev, vdev->config);
+    vdev->isr = 3;
+    virtio_update_irq(vdev);
+}
+
+void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
+{
+    /* Always notify when queue is empty */
+    if (vring_avail_idx(vq) != vq->last_avail_idx &&
+	(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT))
+	return;
+
+    vdev->isr = 1;
+    virtio_update_irq(vdev);
+}
+
+void virtio_ring_set_used_notify(VirtQueue *vq, int enable)
+{
+    if (enable)
+	vring_used_set_flag(vq, VRING_USED_F_NO_NOTIFY);
+    else
+	vring_used_unset_flag(vq, VRING_USED_F_NO_NOTIFY);
+}
+
+size_t virtio_ring_avail_size(VirtQueue *vq)
+{
+    return vring_avail_idx(vq) - vq->last_avail_idx;
+}
+
+int virtio_ring_inited(VirtQueue *vq)
+{
+    return (vq->vring.avail != 0);
+}
+
+VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
+			      uint16_t vendor, uint16_t device,
+			      uint16_t subvendor, uint16_t subdevice,
+			      uint8_t class_code, uint8_t subclass_code,
+			      uint8_t pif, size_t config_size,
+			      size_t struct_size)
+{
+    VirtIODevice *vdev;
+    PCIDevice *pci_dev;
+    uint8_t *config;
+    uint32_t size;
+
+    pci_dev = pci_register_device(bus, name, struct_size,
+				  -1, NULL, NULL);
+    vdev = to_virtio_device(pci_dev);
+
+    vdev->status = 0;
+    vdev->isr = 0;
+    vdev->queue_sel = 0;
+    vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
+
+    config = pci_dev->config;
+    config[0x00] = vendor & 0xFF;
+    config[0x01] = (vendor >> 8) & 0xFF;
+    config[0x02] = device & 0xFF;
+    config[0x03] = (device >> 8) & 0xFF;
+
+    config[0x08] = VIRTIO_PCI_ABI_VERSION;
+
+    config[0x09] = pif;
+    config[0x0a] = subclass_code;
+    config[0x0b] = class_code;
+    config[0x0e] = 0x00;
+
+    config[0x2c] = subvendor & 0xFF;
+    config[0x2d] = (subvendor >> 8) & 0xFF;
+    config[0x2e] = subdevice & 0xFF;
+    config[0x2f] = (subdevice >> 8) & 0xFF;
+
+    config[0x3d] = 1;
+
+    vdev->name = name;
+    vdev->config_len = config_size;
+    if (vdev->config_len)
+	vdev->config = qemu_mallocz(config_size);
+    else
+	vdev->config = NULL;
+
+    size = 20 + config_size;
+    if (size & (size-1))
+        size = 1 << fls(size);
+
+    pci_register_io_region(pci_dev, 0, size, PCI_ADDRESS_SPACE_IO,
+			   virtio_map);
+    qemu_register_reset(virtio_reset, vdev);
+
+    return vdev;
+}
diff --git a/hw/virtio.h b/hw/virtio.h
new file mode 100644
index 0000000..301aed7
--- /dev/null
+++ b/hw/virtio.h
@@ -0,0 +1,89 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *  Rusty Russell     <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_VIRTIO_H
+#define _QEMU_VIRTIO_H
+
+#include "hw.h"
+#include "pci.h"
+#include "iovector.h"
+
+/* from Linux's linux/virtio_config.h */
+
+/* Status byte for guest to report progress, and synchronize features. */
+/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
+#define VIRTIO_CONFIG_S_ACKNOWLEDGE	1
+/* We have found a driver for the device. */
+#define VIRTIO_CONFIG_S_DRIVER		2
+/* Driver has used its parts of the config, and is happy */
+#define VIRTIO_CONFIG_S_DRIVER_OK	4
+/* We've given up on this device. */
+#define VIRTIO_CONFIG_S_FAILED		0x80
+
+typedef struct VirtQueue VirtQueue;
+typedef struct VirtIODevice VirtIODevice;
+
+typedef struct VirtQueueElement
+{
+    unsigned int index;
+    IOVector *virt_in, *virt_out;
+    PhysIOVector *phys_in, *phys_out;
+} VirtQueueElement;
+
+struct VirtIODevice
+{
+    PCIDevice pci_dev;
+    const char *name;
+    uint32_t addr;
+    uint16_t vendor;
+    uint16_t device;
+    uint8_t status;
+    uint8_t isr;
+    uint16_t queue_sel;
+    uint32_t features;
+    size_t config_len;
+    void *config;
+    uint32_t (*get_features)(VirtIODevice *vdev);
+    void (*set_features)(VirtIODevice *vdev, uint32_t val);
+    void (*get_config)(VirtIODevice *vdev, uint8_t *config);
+    void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
+    VirtQueue *vq;
+};
+
+VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
+			      uint16_t vendor, uint16_t device,
+			      uint16_t subvendor, uint16_t subdevice,
+			      uint8_t class_code, uint8_t subclass_code,
+			      uint8_t pif, size_t config_size,
+			      size_t struct_size);
+
+VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
+			    void (*handle_output)(VirtIODevice *,
+						  VirtQueue *));
+
+void virtqueue_push(VirtQueue *vq, VirtQueueElement *elem, unsigned int len);
+
+VirtQueueElement *virtqueue_pop(VirtQueue *vq);
+
+void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
+
+void virtio_ring_set_used_notify(VirtQueue *vq, int enable);
+
+size_t virtio_ring_avail_size(VirtQueue *vq);
+
+int virtio_ring_inited(VirtQueue *vq);
+
+void virtio_notify_config(VirtIODevice *vdev);
+
+#endif
diff --git a/qemu-common.h b/qemu-common.h
index 746dcc5..cd387b1 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -85,6 +85,7 @@ char *pstrcat(char *buf, int buf_size, const char *s);
 int strstart(const char *str, const char *val, const char **ptr);
 int stristart(const char *str, const char *val, const char **ptr);
 time_t mktimegm(struct tm *tm);
+int fls(int i);
 
 /* Error handling.  */
 

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 4/6] virtio network driver
  2008-03-29 21:55 [Qemu-devel] [PATCH 1/6] Use ram_addr_t for cpu_get_physical_page_desc Anthony Liguori
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 2/6] PCI DMA API Anthony Liguori
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 3/6] virtio for QEMU Anthony Liguori
@ 2008-03-29 21:55 ` Anthony Liguori
  2008-03-30 10:27   ` Paul Brook
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 5/6] virtio block driver Anthony Liguori
  2008-03-29 21:56 ` [Qemu-devel] [PATCH 6/6] virtio balloon driver Anthony Liguori
  4 siblings, 1 reply; 23+ messages in thread
From: Anthony Liguori @ 2008-03-29 21:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

This patch implements the virtio network driver backend.  In KVM, this driver
can achieve 1gbit tx/rx performance.  More patches are required to improve the
network IO infrastructure to achieve better performance in QEMU.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile.target b/Makefile.target
index 6815ba8..3ea40d1 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -535,7 +535,7 @@ OBJS += rtl8139.o
 OBJS += e1000.o
 
 # virtio devices
-OBJS += virtio.o
+OBJS += virtio.o virtio-net.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
diff --git a/hw/pci.c b/hw/pci.c
index 99c206f..8dca481 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -654,9 +654,11 @@ void pci_nic_init(PCIBus *bus, NICInfo *nd, int devfn)
         pci_e1000_init(bus, nd, devfn);
     } else if (strcmp(nd->model, "pcnet") == 0) {
         pci_pcnet_init(bus, nd, devfn);
+    } else if (strcmp(nd->model, "virtio") == 0) {
+	virtio_net_init(bus, nd, devfn);
     } else if (strcmp(nd->model, "?") == 0) {
         fprintf(stderr, "qemu: Supported PCI NICs: i82551 i82557b i82559er"
-                        " ne2k_pci pcnet rtl8139 e1000\n");
+                        " ne2k_pci pcnet rtl8139 e1000 virtio\n");
         exit (1);
     } else {
         fprintf(stderr, "qemu: Unsupported NIC: %s\n", nd->model);
diff --git a/hw/pci.h b/hw/pci.h
index b965919..f4db366 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -143,4 +143,7 @@ PCIBus *pci_prep_init(qemu_irq *pic);
 PCIBus *pci_apb_init(target_phys_addr_t special_base, target_phys_addr_t mem_base,
                      qemu_irq *pic);
 
+/* virtio.c */
+PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn);
+
 #endif
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
new file mode 100644
index 0000000..de4723a
--- /dev/null
+++ b/hw/virtio-net.c
@@ -0,0 +1,180 @@
+/*
+ * Virtio Network Device
+ *
+ * Copyright IBM, Corp. 2007
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "virtio.h"
+#include "net.h"
+#include "pc.h"
+#include "qemu-timer.h"
+#include "virtio-net.h"
+
+#define TX_TIMER_INTERVAL (1000 / 500)
+
+typedef struct VirtIONet
+{
+    VirtIODevice vdev;
+    uint8_t mac[6];
+    VirtQueue *rx_vq;
+    VirtQueue *tx_vq;
+    VLANClientState *vc;
+    int can_receive;
+    QEMUTimer *tx_timer;
+    int tx_timer_active;
+} VirtIONet;
+
+static VirtIONet *to_virtio_net(VirtIODevice *vdev)
+{
+    return (VirtIONet *)vdev;
+}
+
+static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+    struct virtio_net_config netcfg;
+
+    memcpy(netcfg.mac, n->mac, 6);
+    memcpy(config, &netcfg, sizeof(netcfg));
+}
+
+static uint32_t virtio_net_get_features(VirtIODevice *vdev)
+{
+    return (1 << VIRTIO_NET_F_MAC);
+}
+
+/* RX */
+
+static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+    n->can_receive = 1;
+}
+
+static int virtio_net_can_receive(void *opaque)
+{
+    VirtIONet *n = opaque;
+
+    return (n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) && n->can_receive;
+}
+
+static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
+{
+    VirtIONet *n = opaque;
+    VirtQueueElement *elem;
+    struct virtio_net_hdr hdr;
+
+    /* FIXME: the drivers really need to set their status better */
+    if (!virtio_ring_inited(n->rx_vq)) {
+	n->can_receive = 0;
+	return;
+    }
+
+    if ((elem = virtqueue_pop(n->rx_vq)) == NULL) {
+	/* wait until the guest adds some rx bufs */
+	n->can_receive = 0;
+	return;
+    }
+
+    memset(&hdr, 0, sizeof(hdr));
+    hdr.flags = 0;
+    hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+    memcpy_to_iovector(&hdr, 0, sizeof(hdr), elem->virt_in);
+    memcpy_to_iovector(buf, sizeof(hdr), size, elem->virt_in);
+
+    /* signal other side */
+    virtqueue_push(n->rx_vq, elem, sizeof(hdr) + size);
+    virtio_notify(&n->vdev, n->rx_vq);
+}
+
+/* TX */
+static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
+{
+    VirtQueueElement *elem;
+
+    if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
+        return;
+
+    while ((elem = virtqueue_pop(vq))) {
+	IOVector *sg;
+	size_t out_size;
+	int i;
+
+	/* ignore the header for now */
+	out_size = iovector_size(elem->virt_out);
+
+	sg = iovector_trim(elem->virt_out, sizeof(struct virtio_net_hdr),
+			   out_size - sizeof(struct virtio_net_hdr));
+
+	for (i = 0; i < sg->num; i++)
+	    qemu_send_packet(n->vc, sg->sg[i].base, sg->sg[i].len);
+
+	qemu_free(sg);
+
+	virtqueue_push(vq, elem, out_size);
+	virtio_notify(&n->vdev, vq);
+    }
+}
+
+static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+
+    if (n->tx_timer_active &&
+	virtio_ring_avail_size(vq) == 64) {
+	virtio_ring_set_used_notify(vq, 0);
+	qemu_del_timer(n->tx_timer);
+	n->tx_timer_active = 0;
+	virtio_net_flush_tx(n, vq);
+    } else {
+	qemu_mod_timer(n->tx_timer,
+		       qemu_get_clock(vm_clock) + TX_TIMER_INTERVAL);
+	n->tx_timer_active = 1;
+	virtio_ring_set_used_notify(vq, 1);
+    }
+}
+
+static void virtio_net_tx_timer(void *opaque)
+{
+    VirtIONet *n = opaque;
+
+    n->tx_timer_active = 0;
+
+    /* Just in case the driver is not ready any more */
+    if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
+        return;
+
+    virtio_ring_set_used_notify(n->tx_vq, 0);
+    virtio_net_flush_tx(n, n->tx_vq);
+}
+
+PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
+{
+    VirtIONet *n;
+
+    n = (VirtIONet *)virtio_init_pci(bus, "virtio-net", 6900, 0x1000,
+				     0, VIRTIO_ID_NET,
+				     0x02, 0x00, 0x00,
+				     6, sizeof(VirtIONet));
+
+    n->vdev.get_config = virtio_net_get_config;
+    n->vdev.get_features = virtio_net_get_features;
+    n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
+    n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
+    n->can_receive = 0;
+    memcpy(n->mac, nd->macaddr, 6);
+    n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive,
+                                 virtio_net_can_receive, n);
+    n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n);
+    n->tx_timer_active = 0;
+
+    return (PCIDevice *)n;
+}
diff --git a/hw/virtio-net.h b/hw/virtio-net.h
new file mode 100644
index 0000000..2959198
--- /dev/null
+++ b/hw/virtio-net.h
@@ -0,0 +1,54 @@
+/*
+ * Virtio-net Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *  Rusty Russell     <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_VIRTIO_NET_H
+#define _QEMU_VIRTIO_NET_H
+
+/* from Linux's virtio_net.h */
+
+/* The ID for virtio_net */
+#define VIRTIO_ID_NET	1
+
+/* The feature bitmap for virtio net */
+#define VIRTIO_NET_F_NO_CSUM	0
+#define VIRTIO_NET_F_MAC	5
+#define VIRTIO_NET_F_GS0	6
+
+/* The config defining mac address (6 bytes) */
+struct virtio_net_config
+{
+    uint8_t mac[6];
+} __attribute__((packed));
+
+/* This is the first element of the scatter-gather list.  If you don't
+ * specify GSO or CSUM features, you can simply ignore the header. */
+struct virtio_net_hdr
+{
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	// Use csum_start, csum_offset
+    uint8_t flags;
+#define VIRTIO_NET_HDR_GSO_NONE		0	// Not a GSO frame
+#define VIRTIO_NET_HDR_GSO_TCPV4	1	// GSO frame, IPv4 TCP (TSO)
+/* FIXME: Do we need this?  If they said they can handle ECN, do they care? */
+#define VIRTIO_NET_HDR_GSO_TCPV4_ECN	2	// GSO frame, IPv4 TCP w/ ECN
+#define VIRTIO_NET_HDR_GSO_UDP		3	// GSO frame, IPv4 UDP (UFO)
+#define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
+#define VIRTIO_NET_HDR_GSO_ECN		0x80	// TCP has ECN set
+    uint8_t gso_type;
+    uint16_t hdr_len;
+    uint16_t gso_size;
+    uint16_t csum_start;
+    uint16_t csum_offset;
+};
+
+#endif

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 5/6] virtio block driver
  2008-03-29 21:55 [Qemu-devel] [PATCH 1/6] Use ram_addr_t for cpu_get_physical_page_desc Anthony Liguori
                   ` (2 preceding siblings ...)
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 4/6] virtio network driver Anthony Liguori
@ 2008-03-29 21:55 ` Anthony Liguori
  2008-03-29 21:56 ` [Qemu-devel] [PATCH 6/6] virtio balloon driver Anthony Liguori
  4 siblings, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-29 21:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

This patch implements the virtio block driver backend.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile.target b/Makefile.target
index 3ea40d1..f9fe660 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -535,7 +535,7 @@ OBJS += rtl8139.o
 OBJS += e1000.o
 
 # virtio devices
-OBJS += virtio.o virtio-net.o
+OBJS += virtio.o virtio-net.o virtio-blk.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
diff --git a/hw/pc.c b/hw/pc.c
index 4fec2d4..2da9413 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -1011,6 +1011,18 @@ static void pc_init1(int ram_size, int vga_ram_size,
 	    }
         }
     }
+
+    /* Add virtio block devices */
+    if (pci_enabled) {
+	int index;
+	int unit_id = 0;
+
+	while ((index = drive_get_index(IF_VIRTIO, 0, unit_id)) != -1) {
+	    virtio_blk_init(pci_bus, drives_table[index].bdrv);
+	    unit_id++;
+	}
+    }
+
 }
 
 static void pc_init_pci(int ram_size, int vga_ram_size,
diff --git a/hw/pc.h b/hw/pc.h
index 9f83050..c828cda 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -143,4 +143,7 @@ void pci_piix4_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
 
 void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd);
 
+/* virtio-blk.c */
+void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs);
+
 #endif
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
new file mode 100644
index 0000000..a2662c6
--- /dev/null
+++ b/hw/virtio-blk.c
@@ -0,0 +1,127 @@
+/*
+ * Virtio Block Device
+ *
+ * Copyright IBM, Corp. 2007
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "virtio.h"
+#include "block.h"
+#include "block_int.h"
+#include "pc.h"
+#include "virtio-blk.h"
+
+typedef struct VirtIOBlock
+{
+    VirtIODevice vdev;
+    BlockDriverState *bs;
+} VirtIOBlock;
+
+static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
+{
+    return (VirtIOBlock *)vdev;
+}
+
+static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIOBlock *s = to_virtio_blk(vdev);
+    VirtQueueElement *elem;
+
+    while ((elem = virtqueue_pop(vq)) != 0) {
+	struct virtio_blk_outhdr out;
+	struct virtio_blk_inhdr in;
+	unsigned int wlen;
+	size_t in_size, out_size;
+	off_t off;
+	int i;
+
+	out_size = iovector_size(elem->virt_out);
+	in_size = iovector_size(elem->virt_in);
+
+	memcpy_from_iovector(&out, 0, sizeof(out), elem->virt_out);
+
+	off = out.sector;
+
+	if (out.type & VIRTIO_BLK_T_SCSI_CMD) {
+	    wlen = sizeof(in);
+	    in.status = VIRTIO_BLK_S_UNSUPP;
+	} else if (out.type & VIRTIO_BLK_T_OUT) {
+	    IOVector *sg;
+
+	    sg = iovector_trim(elem->virt_out, sizeof(out),
+			       out_size - sizeof(out));
+
+	    wlen = sizeof(in);
+
+	    for (i = 0; i < sg->num; i++) {
+		bdrv_write(s->bs, off, sg->sg[i].base, sg->sg[i].len / 512);
+		off += sg->sg[i].len / 512;
+	    }
+
+	    qemu_free(sg);
+	    in.status = VIRTIO_BLK_S_OK;
+	} else {
+	    IOVector *sg;
+
+	    sg = iovector_trim(elem->virt_in, 0, in_size - sizeof(in));
+
+	    wlen = sizeof(in);
+
+	    for (i = 0; i < sg->num; i++) {
+		bdrv_read(s->bs, off, sg->sg[i].base, sg->sg[i].len / 512);
+		off += sg->sg[i].len / 512;
+		wlen += sg->sg[i].len;
+	    }
+
+	    qemu_free(sg);
+	    in.status = VIRTIO_BLK_S_OK;
+	}
+
+	memcpy_to_iovector(&in, in_size - sizeof(in),
+			   sizeof(in), elem->virt_in);
+
+	virtqueue_push(vq, elem, wlen);
+	virtio_notify(vdev, vq);
+    }
+}
+
+static void virtio_blk_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VirtIOBlock *s = to_virtio_blk(vdev);
+    struct virtio_blk_config blkcfg;
+    int64_t capacity;
+
+    bdrv_get_geometry(s->bs, &capacity);
+    blkcfg.capacity = capacity;
+    blkcfg.seg_max = 128 - 2;
+    memcpy(config, &blkcfg, sizeof(blkcfg));
+}
+
+static uint32_t virtio_blk_get_features(VirtIODevice *vdev)
+{
+    return (1 << VIRTIO_BLK_F_SEG_MAX);
+}
+
+void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs)
+{
+    VirtIOBlock *s;
+
+    s = (VirtIOBlock *)virtio_init_pci(bus, "virtio-blk", 6900, 0x1001,
+				       0, VIRTIO_ID_BLOCK,
+				       0x01, 0x80, 0x00,
+				       16, sizeof(VirtIOBlock));
+
+    s->vdev.get_config = virtio_blk_get_config;
+    s->vdev.get_features = virtio_blk_get_features;
+    s->bs = bs;
+
+    virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
+
+    return s;
+}
diff --git a/hw/virtio-blk.h b/hw/virtio-blk.h
new file mode 100644
index 0000000..290ff5b
--- /dev/null
+++ b/hw/virtio-blk.h
@@ -0,0 +1,66 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *  Rusty Russell     <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_VIRTIO_BLK_H
+#define _QEMU_VIRTIO_BLK_H
+
+/* from Linux's linux/virtio_blk.h */
+
+/* The ID for virtio_block */
+#define VIRTIO_ID_BLOCK	2
+
+/* Feature bits */
+#define VIRTIO_BLK_F_BARRIER	0	/* Does host support barriers? */
+#define VIRTIO_BLK_F_SIZE_MAX	1	/* Indicates maximum segment size */
+#define VIRTIO_BLK_F_SEG_MAX	2	/* Indicates maximum # of segments */
+
+struct virtio_blk_config
+{
+    uint64_t capacity;
+    uint32_t size_max;
+    uint32_t seg_max;
+};
+
+/* These two define direction. */
+#define VIRTIO_BLK_T_IN		0
+#define VIRTIO_BLK_T_OUT	1
+
+/* This bit says it's a scsi command, not an actual read or write. */
+#define VIRTIO_BLK_T_SCSI_CMD	2
+
+/* Barrier before this op. */
+#define VIRTIO_BLK_T_BARRIER	0x80000000
+
+/* This is the first element of the read scatter-gather list. */
+struct virtio_blk_outhdr
+{
+    /* VIRTIO_BLK_T* */
+    uint32_t type;
+    /* io priority. */
+    uint32_t ioprio;
+    /* Sector (ie. 512 byte offset) */
+    uint64_t sector;
+};
+
+#define VIRTIO_BLK_S_OK		0
+#define VIRTIO_BLK_S_IOERR	1
+#define VIRTIO_BLK_S_UNSUPP	2
+
+/* This is the first element of the write scatter-gather list */
+struct virtio_blk_inhdr
+{
+    unsigned char status;
+};
+
+#endif
diff --git a/sysemu.h b/sysemu.h
index 0f18e04..0078190 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -119,7 +119,7 @@ extern unsigned int nb_prom_envs;
 #endif
 
 typedef enum {
-    IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD
+    IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD, IF_VIRTIO
 } BlockInterfaceType;
 
 typedef struct DriveInfo {
diff --git a/vl.c b/vl.c
index 61eb191..9b614e9 100644
--- a/vl.c
+++ b/vl.c
@@ -5034,6 +5034,9 @@ static int drive_init(struct drive_opt *arg, int snapshot,
 	} else if (!strcmp(buf, "sd")) {
 	    type = IF_SD;
             max_devs = 0;
+	} else if (!strcmp(buf, "virtio")) {
+	    type = IF_VIRTIO;
+	    max_devs = 0;
 	} else {
             fprintf(stderr, "qemu: '%s' unsupported bus type '%s'\n", str, buf);
             return -1;
@@ -5225,6 +5228,7 @@ static int drive_init(struct drive_opt *arg, int snapshot,
         break;
     case IF_PFLASH:
     case IF_MTD:
+    case IF_VIRTIO:
         break;
     }
     if (!file[0])

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 6/6] virtio balloon driver
  2008-03-29 21:55 [Qemu-devel] [PATCH 1/6] Use ram_addr_t for cpu_get_physical_page_desc Anthony Liguori
                   ` (3 preceding siblings ...)
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 5/6] virtio block driver Anthony Liguori
@ 2008-03-29 21:56 ` Anthony Liguori
  4 siblings, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-29 21:56 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

This patch implements the virtio balloon driver backend.  A user can interact
with the balloon driver using a newly introduced monitor command 'balloon'.

Ballooning is used to request the guest to stop using a certain portion of its
memory.  The guest notifies the host of this memory so the host can immediately
reallocate it.

Ballooning is implemented within QEMU via the madvise() system call.  This is
for Linux hosts only ATM but it should be easy enough to add the right code for
other hosts.

If you balloon down sufficiently, you can see the resident memory of the QEMU
instance decrease when using this driver.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

diff --git a/Makefile.target b/Makefile.target
index f9fe660..86a0bf5 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -535,7 +535,7 @@ OBJS += rtl8139.o
 OBJS += e1000.o
 
 # virtio devices
-OBJS += virtio.o virtio-net.o virtio-blk.o
+OBJS += virtio.o virtio-net.o virtio-blk.o virtio-balloon.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
diff --git a/balloon.h b/balloon.h
new file mode 100644
index 0000000..60b4a5d
--- /dev/null
+++ b/balloon.h
@@ -0,0 +1,27 @@
+/*
+ * Balloon
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_BALLOON_H
+#define _QEMU_BALLOON_H
+
+#include "cpu-defs.h"
+
+typedef ram_addr_t (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque);
+
+void qemu_balloon(ram_addr_t target);
+
+ram_addr_t qemu_balloon_status(void);
+
+#endif
diff --git a/hw/pc.c b/hw/pc.c
index 2da9413..8d3401a 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -1023,6 +1023,8 @@ static void pc_init1(int ram_size, int vga_ram_size,
 	}
     }
 
+    if (pci_enabled)
+	virtio_balloon_init(pci_bus);
 }
 
 static void pc_init_pci(int ram_size, int vga_ram_size,
diff --git a/hw/pc.h b/hw/pc.h
index c828cda..67583f7 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -146,4 +146,7 @@ void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd);
 /* virtio-blk.c */
 void *virtio_blk_init(PCIBus *bus, BlockDriverState *bs);
 
+/* virtio-balloon.h */
+void *virtio_balloon_init(PCIBus *bus);
+
 #endif
diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c
new file mode 100644
index 0000000..b37eb04
--- /dev/null
+++ b/hw/virtio-balloon.c
@@ -0,0 +1,137 @@
+/*
+ * Virtio Block Device
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "virtio.h"
+#include "pc.h"
+#include "sysemu.h"
+#include "cpu.h"
+#include "balloon.h"
+#include "virtio-balloon.h"
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#endif
+
+typedef struct VirtIOBalloon
+{
+    VirtIODevice vdev;
+    VirtQueue *ivq, *dvq;
+    uint32_t num_pages;
+    uint32_t actual;
+} VirtIOBalloon;
+
+static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev)
+{
+    return (VirtIOBalloon *)vdev;
+}
+
+static void balloon_page(void *addr, int deflate)
+{
+#if defined(__linux__)
+    madvise(addr, TARGET_PAGE_SIZE, deflate ? MADV_WILLNEED : MADV_DONTNEED);
+#endif
+}
+
+static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIOBalloon *s = to_virtio_balloon(vdev);
+    VirtQueueElement *elem;
+
+    while ((elem = virtqueue_pop(vq)) != NULL) {
+	size_t offset = 0;
+	uint32_t pfn;
+
+	while (memcpy_from_iovector(&pfn, offset, 4, elem->virt_out) == 4) {
+	    ram_addr_t phys, pa;
+
+	    pa = (ram_addr_t)ldl_p(&pfn) << TARGET_PAGE_BITS;
+
+	    phys = cpu_get_physical_page_desc(pa);
+
+	    /* can't balloon non-RAM pages */
+	    if ((phys & ~TARGET_PAGE_MASK) != IO_MEM_RAM)
+		continue;
+
+	    balloon_page(phys_ram_base + (phys & TARGET_PAGE_MASK), 
+			 !!(vq == s->dvq));
+
+	    offset += 4;
+	}
+
+	virtqueue_push(vq, elem, offset);
+	virtio_notify(vdev, vq);
+    }
+}
+
+static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
+{
+    VirtIOBalloon *dev = to_virtio_balloon(vdev);
+    struct virtio_balloon_config config;
+
+    config.num_pages = dev->num_pages;
+    config.actual = dev->actual;
+
+    memcpy(config_data, &config, 8);
+}
+
+static void virtio_balloon_set_config(VirtIODevice *vdev,
+				      const uint8_t *config_data)
+{
+    VirtIOBalloon *dev = to_virtio_balloon(vdev);
+    struct virtio_balloon_config config;
+    memcpy(&config, config_data, 8);
+    dev->actual = config.actual;
+}
+
+static uint32_t virtio_balloon_get_features(VirtIODevice *vdev)
+{
+    return 0;
+}
+
+static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t target)
+{
+    VirtIOBalloon *dev = opaque;
+
+    if (target > ram_size)
+	target = ram_size;
+
+    if (target) {
+	dev->num_pages = (ram_size - target) >> TARGET_PAGE_BITS;
+	virtio_notify_config(&dev->vdev);
+    }
+
+    return ram_size - (dev->actual << TARGET_PAGE_BITS);
+}
+
+void *virtio_balloon_init(PCIBus *bus)
+{
+    VirtIOBalloon *s;
+
+    s = (VirtIOBalloon *)virtio_init_pci(bus, "virtio-balloon",
+					 6900, 0x1002,
+					 0, VIRTIO_ID_BALLOON,
+					 0x05, 0x00, 0x00,
+					 8, sizeof(VirtIOBalloon));
+
+    s->vdev.get_config = virtio_balloon_get_config;
+    s->vdev.set_config = virtio_balloon_set_config;
+    s->vdev.get_features = virtio_balloon_get_features;
+
+    s->ivq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
+    s->dvq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output);
+
+    qemu_add_balloon_handler(virtio_balloon_to_target, s);
+
+    return &s->vdev;
+}
diff --git a/hw/virtio-balloon.h b/hw/virtio-balloon.h
new file mode 100644
index 0000000..27d6985
--- /dev/null
+++ b/hw/virtio-balloon.h
@@ -0,0 +1,34 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007-2008
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *  Rusty Russell     <rusty@rustcorp.com.au>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_VIRTIO_BALLOON_H
+#define _QEMU_VIRTIO_BALLOON_H
+
+/* from Linux's linux/virtio_balloon.h */
+
+/* The ID for virtio_balloon */
+#define VIRTIO_ID_BALLOON	5
+
+/* The feature bitmap for virtio balloon */
+#define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
+
+struct virtio_balloon_config
+{
+    /* Number of pages host wants Guest to give up. */
+    uint32_t num_pages;
+    /* Number of pages we've actually got in balloon. */
+    uint32_t actual;
+};
+
+#endif
diff --git a/monitor.c b/monitor.c
index 025025b..7f4c096 100644
--- a/monitor.c
+++ b/monitor.c
@@ -34,6 +34,7 @@
 #include "block.h"
 #include "audio/audio.h"
 #include "disas.h"
+#include "balloon.h"
 #include <dirent.h>
 
 #ifdef CONFIG_PROFILER
@@ -1257,6 +1258,23 @@ static void do_wav_capture (const char *path,
 }
 #endif
 
+static void do_balloon(int value)
+{
+    ram_addr_t target = value;
+    qemu_balloon(target << 20);
+}
+
+static void do_info_balloon(void)
+{
+    ram_addr_t actual;
+
+    actual = qemu_balloon_status();
+    if (actual == 0)
+	term_printf("Ballooning not activated in VM\n");
+    else
+	term_printf("balloon: actual=%d\n", (int)(actual >> 20));
+}
+
 static term_cmd_t term_cmds[] = {
     { "help|?", "s?", do_help,
       "[cmd]", "show the help" },
@@ -1328,6 +1346,8 @@ static term_cmd_t term_cmds[] = {
        "capture index", "stop capture" },
     { "memsave", "lis", do_memory_save,
       "addr size file", "save to disk virtual memory dump starting at 'addr' of size 'size'", },
+    { "balloon", "i", do_balloon,
+      "target", "request VM to change it's memory allocation (in MB)" },
     { NULL, NULL, },
 };
 
@@ -1388,6 +1408,8 @@ static term_cmd_t info_cmds[] = {
     { "slirp", "", do_info_slirp,
       "", "show SLIRP statistics", },
 #endif
+    { "balloon", "", do_info_balloon,
+      "", "show balloon information" },
     { NULL, NULL, },
 };
 
diff --git a/vl.c b/vl.c
index 9b614e9..22ec24f 100644
--- a/vl.c
+++ b/vl.c
@@ -37,6 +37,7 @@
 #include "qemu-char.h"
 #include "block.h"
 #include "audio/audio.h"
+#include "balloon.h"
 
 #include <unistd.h>
 #include <fcntl.h>
@@ -482,6 +483,31 @@ void hw_error(const char *fmt, ...)
     va_end(ap);
     abort();
 }
+ 
+/***************/
+/* ballooning */
+
+static QEMUBalloonEvent *qemu_balloon_event;
+void *qemu_balloon_event_opaque;
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque)
+{
+    qemu_balloon_event = func;
+    qemu_balloon_event_opaque = opaque;
+}
+
+void qemu_balloon(ram_addr_t target)
+{
+    if (qemu_balloon_event)
+	qemu_balloon_event(qemu_balloon_event_opaque, target);
+}
+
+ram_addr_t qemu_balloon_status(void)
+{
+    if (qemu_balloon_event)
+	return qemu_balloon_event(qemu_balloon_event_opaque, 0);
+    return 0;
+}
 
 /***********************************************************/
 /* keyboard/mouse */

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 2/6] PCI DMA API Anthony Liguori
@ 2008-03-30  7:06   ` Blue Swirl
  2008-03-30 14:44     ` Anthony Liguori
  2008-03-30 10:18   ` Paul Brook
  2008-03-30 10:25   ` [Qemu-devel] Re: [kvm-devel] " Avi Kivity
  2 siblings, 1 reply; 23+ messages in thread
From: Blue Swirl @ 2008-03-30  7:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

On 3/30/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> This patch introduces a PCI DMA API and some generic code to support other DMA
>  APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
>  maps a PhysIOVector, which is composed of target_phys_addr_t, into an IOVector,
>  which is composed of void *.

This looks like it wouldn't scale to handle the Sparc systems. There
we want to make more translation steps from DVMA addresses to physical
in DMA controller and IOMMU and only in the final stage to void *. To
handle this, probably there should be an opaque parameter and some way
to register the translation function. Otherwise the API looks OK.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 2/6] PCI DMA API Anthony Liguori
  2008-03-30  7:06   ` Blue Swirl
@ 2008-03-30 10:18   ` Paul Brook
  2008-03-30 14:42     ` Anthony Liguori
  2008-03-30 10:25   ` [Qemu-devel] Re: [kvm-devel] " Avi Kivity
  2 siblings, 1 reply; 23+ messages in thread
From: Paul Brook @ 2008-03-30 10:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

On Saturday 29 March 2008, Anthony Liguori wrote:
> This patch introduces a PCI DMA API and some generic code to support other
> DMA APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
> maps a PhysIOVector, which is composed of target_phys_addr_t, into an
> IOVector, which is composed of void *.

Devices should not be using IOVector. They should either use the DMA copy 
routines to copy from a PhysIOVector into a local buffer, or they should pass 
a PhysIOVector to a block/network read/write routine. The DMA API should 
allow devices to be agnostic about how DMA is implemented. They should not be 
trying to manually implement zero copy.

> This enables zero-copy IO to be preformed without introducing assumptions
> of phys_ram_base.  This API is at the PCI device level to enable support of
> per-device IOMMU remapping.

By my reading it *requires* bridges be zero-copy.  For big-endian targets we 
need the ability to byteswap accesses.

Some description (in the form of source comments) of how it's meant to be used 
would also be helpful.

Paul

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] Re: [kvm-devel] [PATCH 2/6] PCI DMA API
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 2/6] PCI DMA API Anthony Liguori
  2008-03-30  7:06   ` Blue Swirl
  2008-03-30 10:18   ` Paul Brook
@ 2008-03-30 10:25   ` Avi Kivity
  2008-03-30 14:49     ` Anthony Liguori
  2 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2008-03-30 10:25 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Anthony Liguori wrote:
> This patch introduces a PCI DMA API and some generic code to support other DMA
> APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
> maps a PhysIOVector, which is composed of target_phys_addr_t, into an IOVector,
> which is composed of void *.
>
> This enables zero-copy IO to be preformed without introducing assumptions of
> phys_ram_base.  This API is at the PCI device level to enable support of
> per-device IOMMU remapping.
>
>
> +
> +typedef struct IOVector
> +{
> +    int num;
> +    struct IOVectorElement {
> +	void *base;
> +	size_t len;
> +    } sg[0];
> +} IOVector;
> +
>   

Can we use 'struct iovec' for the element type (with accessors for 
setting base+len, and reading base or len, so we can substitute the 
Windows version for that platform)? That will allow using the vector 
without additional translation or casts.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 4/6] virtio network driver
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 4/6] virtio network driver Anthony Liguori
@ 2008-03-30 10:27   ` Paul Brook
  2008-03-30 14:47     ` Anthony Liguori
  0 siblings, 1 reply; 23+ messages in thread
From: Paul Brook @ 2008-03-30 10:27 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno

On Saturday 29 March 2008, Anthony Liguori wrote:
> +    if ((elem = virtqueue_pop(n->rx_vq)) == NULL) {
> +       /* wait until the guest adds some rx bufs */
> +       n->can_receive = 0;
> +       return;
> +    }

Setting can_receive to zero *after* dropping a packet is a bit late.
Not a fatal flaw, but it does make can_receive fairly useless. The whole point 
of can_receive is to workaround lack of proper TCP rate control in the slirp 
code.

Paul

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-30 10:18   ` Paul Brook
@ 2008-03-30 14:42     ` Anthony Liguori
  2008-03-30 18:19       ` Paul Brook
  0 siblings, 1 reply; 23+ messages in thread
From: Anthony Liguori @ 2008-03-30 14:42 UTC (permalink / raw)
  To: Paul Brook; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Paul Brook wrote:
> On Saturday 29 March 2008, Anthony Liguori wrote:
>   
>> This patch introduces a PCI DMA API and some generic code to support other
>> DMA APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
>> maps a PhysIOVector, which is composed of target_phys_addr_t, into an
>> IOVector, which is composed of void *.
>>     
>
> Devices should not be using IOVector. They should either use the DMA copy 
> routines to copy from a PhysIOVector into a local buffer, or they should pass 
> a PhysIOVector to a block/network read/write routine. The DMA API should 
> allow devices to be agnostic about how DMA is implemented. They should not be 
> trying to manually implement zero copy.
>   

Someone has to do the translation of PhysIOVector => IOVector.  It 
doesn't seem logical to me to do it in the IO backend level because the 
block subsystem doesn't know how to do that translation.  You would have 
to pass the PhysIOVector along with a translation function and an 
opaque pointer.

What could work is if the DMA API functions mapped PhysIOVector => 
PhysIOVector and then the network and block subsystems could operate on 
a PhysIOVector.  I have patches that implement vector IO for net and 
block but didn't want to include them in this series to keep things simple.

>> This enables zero-copy IO to be preformed without introducing assumptions
>> of phys_ram_base.  This API is at the PCI device level to enable support of
>> per-device IOMMU remapping.
>>     
>
> By my reading it *requires* bridges be zero-copy.  For big-endian targets we 
> need the ability to byteswap accesses.
>   

You mean via ld/st_phys?  I can add a set of ld/st_vec functions (and 
even use them in hw/virtio.c).  I think operating on a translated vec is 
the right thing to do as it allows the translation to be cached.  To 
make ld/st_phys just work, we would have to have some sort of global DMA 
context.  That gets tricky for drivers that use timer callbacks.

> Some description (in the form of source comments) of how it's meant to be used 
> would also be helpful.
>   

Will do for the next round.

Thanks,

Anthony Liguori

> Paul
>   

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-30  7:06   ` Blue Swirl
@ 2008-03-30 14:44     ` Anthony Liguori
  2008-03-30 14:49       ` Avi Kivity
  2008-03-30 14:58       ` Blue Swirl
  0 siblings, 2 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-30 14:44 UTC (permalink / raw)
  To: Blue Swirl; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Blue Swirl wrote:
> On 3/30/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>   
>> This patch introduces a PCI DMA API and some generic code to support other DMA
>>  APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
>>  maps a PhysIOVector, which is composed of target_phys_addr_t, into an IOVector,
>>  which is composed of void *.
>>     
>
> This looks like it wouldn't scale to handle the Sparc systems. There
> we want to make more translation steps from DVMA addresses to physical
> in DMA controller and IOMMU and only in the final stage to void *. To
> handle this, probably there should be an opaque parameter and some way
> to register the translation function. Otherwise the API looks OK.
>   

I think having the PCI DMA API translate PhysIOVector => PhysIOVector 
would help.  Then it becomes pretty easy to just call the DMA controller 
for additional translation from the IOMMU.

Does that sound right?  I don't quite understand what role the opaque 
parameter would serve.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 4/6] virtio network driver
  2008-03-30 10:27   ` Paul Brook
@ 2008-03-30 14:47     ` Anthony Liguori
  0 siblings, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-30 14:47 UTC (permalink / raw)
  To: Paul Brook; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Paul Brook wrote:
> On Saturday 29 March 2008, Anthony Liguori wrote:
>   
>> +    if ((elem = virtqueue_pop(n->rx_vq)) == NULL) {
>> +       /* wait until the guest adds some rx bufs */
>> +       n->can_receive = 0;
>> +       return;
>> +    }
>>     
>
> Setting can_receive to zero *after* dropping a packet is a bit late.
> Not a fatal flaw, but it does make can_receive fairly useless. The whole point 
> of can_receive is to workaround lack of proper TCP rate control in the slirp 
> code.
>   

Yeah, I should just drop the can_receive handler.  I assumed when I 
wrote the driver originally that can_receive queued packets.  Since we 
have to drop packets anyway in the code now, there's no point in having 
a can_receive handler.

Regards,

Anthony Liguori

> Paul
>   

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] Re: [kvm-devel] [PATCH 2/6] PCI DMA API
  2008-03-30 10:25   ` [Qemu-devel] Re: [kvm-devel] " Avi Kivity
@ 2008-03-30 14:49     ` Anthony Liguori
  0 siblings, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-30 14:49 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Avi Kivity wrote:
> Anthony Liguori wrote:
>> This patch introduces a PCI DMA API and some generic code to support 
>> other DMA
>> APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
>> maps a PhysIOVector, which is composed of target_phys_addr_t, into an 
>> IOVector,
>> which is composed of void *.
>>
>> This enables zero-copy IO to be preformed without introducing 
>> assumptions of
>> phys_ram_base.  This API is at the PCI device level to enable support of
>> per-device IOMMU remapping.
>>
>>
>> +
>> +typedef struct IOVector
>> +{
>> +    int num;
>> +    struct IOVectorElement {
>> +    void *base;
>> +    size_t len;
>> +    } sg[0];
>> +} IOVector;
>> +
>>   
>
> Can we use 'struct iovec' for the element type (with accessors for 
> setting base+len, and reading base or len, so we can substitute the 
> Windows version for that platform)? That will allow using the vector 
> without additional translation or casts.

If we switch to PhysIOVector => PhysIOVector, then the IO infrastructure 
can convert it to whatever it wants (including a struct iovec).

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-30 14:44     ` Anthony Liguori
@ 2008-03-30 14:49       ` Avi Kivity
  2008-03-30 14:56         ` [kvm-devel] " Anthony Liguori
  2008-03-30 14:58       ` Blue Swirl
  1 sibling, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2008-03-30 14:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, kvm-devel, Marcelo Tosatti, Aurelien Jarno

Anthony Liguori wrote:
>>
>> This looks like it wouldn't scale to handle the Sparc systems. There
>> we want to make more translation steps from DVMA addresses to physical
>> in DMA controller and IOMMU and only in the final stage to void *. To
>> handle this, probably there should be an opaque parameter and some way
>> to register the translation function. Otherwise the API looks OK.
>>   
>
> I think having the PCI DMA API translate PhysIOVector => PhysIOVector 
> would help.  Then it becomes pretty easy to just call the DMA 
> controller for additional translation from the IOMMU.
>
> Does that sound right?  I don't quite understand what role the opaque 
> parameter would serve.
>

State for the dma controller.

I think Blue is calling for chaining of dma mappings, no?  Something 
similar is being proposed for the Linux dma api.


-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-30 14:49       ` Avi Kivity
@ 2008-03-30 14:56         ` Anthony Liguori
  0 siblings, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-30 14:56 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Blue Swirl, kvm-devel, Marcelo Tosatti, qemu-devel,
	Aurelien Jarno

Avi Kivity wrote:
> Anthony Liguori wrote:
>   
>>> This looks like it wouldn't scale to handle the Sparc systems. There
>>> we want to make more translation steps from DVMA addresses to physical
>>> in DMA controller and IOMMU and only in the final stage to void *. To
>>> handle this, probably there should be an opaque parameter and some way
>>> to register the translation function. Otherwise the API looks OK.
>>>   
>>>       
>> I think having the PCI DMA API translate PhysIOVector => PhysIOVector 
>> would help.  Then it becomes pretty easy to just call the DMA 
>> controller for additional translation from the IOMMU.
>>
>> Does that sound right?  I don't quite understand what role the opaque 
>> parameter would serve.
>>
>>     
>
> State for the dma controller.
>
> I think Blue is calling for chaining of dma mappings, no?  Something 
> similar is being proposed for the Linux dma api.
>
>   

The way I envision chaining is:

virtio-blk calls pci_device_dma_map with a PhysIOVector A
pci_device_dma_map calls into PCI IOMMU (if necessary) to translate 
PhysIOVector A to PhysIOVector B
pci_device_dma_map then calls into platform DMA engine to translate 
PhysIOVector B to PhysIOVector C
pci_device_dma_map frees PhysIOVector B and returns PhysIOVector C

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-30 14:44     ` Anthony Liguori
  2008-03-30 14:49       ` Avi Kivity
@ 2008-03-30 14:58       ` Blue Swirl
  2008-03-30 15:11         ` Anthony Liguori
  1 sibling, 1 reply; 23+ messages in thread
From: Blue Swirl @ 2008-03-30 14:58 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

On 3/30/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
> Blue Swirl wrote:
>  > On 3/30/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>  >
>  >> This patch introduces a PCI DMA API and some generic code to support other DMA
>  >>  APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
>  >>  maps a PhysIOVector, which is composed of target_phys_addr_t, into an IOVector,
>  >>  which is composed of void *.
>  >>
>  >
>  > This looks like it wouldn't scale to handle the Sparc systems. There
>  > we want to make more translation steps from DVMA addresses to physical
>  > in DMA controller and IOMMU and only in the final stage to void *. To
>  > handle this, probably there should be an opaque parameter and some way
>  > to register the translation function. Otherwise the API looks OK.
>  >
>
>
> I think having the PCI DMA API translate PhysIOVector => PhysIOVector
>  would help.  Then it becomes pretty easy to just call the DMA controller
>  for additional translation from the IOMMU.
>
>  Does that sound right?  I don't quite understand what role the opaque
>  parameter would serve.

Devices should not need to know about the underlying buses, so they
can be used in different systems. So the translators just call
recursively next ones until we get physical memory. I would use the
opaque parameter as a pointer to each translator's own state
structures. But if you can implement this without the parameter,
great!

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-30 14:58       ` Blue Swirl
@ 2008-03-30 15:11         ` Anthony Liguori
  0 siblings, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-30 15:11 UTC (permalink / raw)
  To: Blue Swirl; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Blue Swirl wrote:
> On 3/30/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>   
>> Blue Swirl wrote:
>>  > On 3/30/08, Anthony Liguori <aliguori@us.ibm.com> wrote:
>>  >
>>  >> This patch introduces a PCI DMA API and some generic code to support other DMA
>>  >>  APIs.  Two types are introduced: PhysIOVector and IOVector.  A DMA API
>>  >>  maps a PhysIOVector, which is composed of target_phys_addr_t, into an IOVector,
>>  >>  which is composed of void *.
>>  >>
>>  >
>>  > This looks like it wouldn't scale to handle the Sparc systems. There
>>  > we want to make more translation steps from DVMA addresses to physical
>>  > in DMA controller and IOMMU and only in the final stage to void *. To
>>  > handle this, probably there should be an opaque parameter and some way
>>  > to register the translation function. Otherwise the API looks OK.
>>  >
>>
>>
>> I think having the PCI DMA API translate PhysIOVector => PhysIOVector
>>  would help.  Then it becomes pretty easy to just call the DMA controller
>>  for additional translation from the IOMMU.
>>
>>  Does that sound right?  I don't quite understand what role the opaque
>>  parameter would serve.
>>     
>
> Devices should not need to know about the underlying buses, so they
> can be used in different systems.

I don't think it will be too hard for a device to support multiple buses 
if we have the DMA API at the bus level.  In the future, the per-bus DMA 
API may have slight, but important differences.  For instance, at some 
point, PCI devices will be capable of recovering from an IO fault and 
you'd eventually want the DMA API to reflect this for PCI.

Regards,

Anthony Liguori

>  So the translators just call
> recursively next ones until we get physical memory. I would use the
> opaque parameter as a pointer to each translator's own state
> structures. But if you can implement this without the parameter,
> great!
>   

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/6] virtio for QEMU
  2008-03-29 21:55 ` [Qemu-devel] [PATCH 3/6] virtio for QEMU Anthony Liguori
@ 2008-03-30 17:25   ` Dor Laor
  2008-03-30 22:59     ` [kvm-devel] " Anthony Liguori
  2008-04-05  3:09     ` Anthony Liguori
  0 siblings, 2 replies; 23+ messages in thread
From: Dor Laor @ 2008-03-30 17:25 UTC (permalink / raw)
  To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti, Anthony Liguori, Aurelien Jarno


On Sat, 2008-03-29 at 16:55 -0500, Anthony Liguori wrote:
> This patch introduces virtio support over PCI.  virtio is a generic virtual IO
> framework for Linux first introduced in 2.6.23.  Since 2.6.25, virtio has
> supported a PCI transport which this patch implements.
> 
> Since the last time these patches were posted to qemu-devel, I've reworked it
> to use the proper access functions to manipulate guest memory.
> 
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>

It will be great to drop the nasty hacks :)
Do you still get 1G net performance using the extra copy from tap
(memcpy_to_iovector)?

[snip]

> +static uint32_t vring_desc_len(VirtQueue *vq, unsigned int i)
> +{

Below there were places where you used offsetof(vq->vring.desc[i], len), so
we'd better be consistent + it's nicer

> +    return ldl_phys(vq->vring.desc + i * sizeof(VRingDesc) + 
> +		    offsetof(VRingDesc, len));
> +}
> +

[snip]

> +VirtQueueElement *virtqueue_pop(VirtQueue *vq)
> +{
> +    unsigned int i, head;
> +    unsigned int position;
> +    VirtQueueElement *elem;
> +
> +    /* Check it isn't doing very strange things with descriptor numbers. */
> +    if ((uint16_t)(vring_avail_idx(vq) - vq->last_avail_idx) > vq->vring.num)
> +	errx(1, "Guest moved used index from %u to %u",
> +	     vq->last_avail_idx, vring_avail_idx(vq));
> +
> +    /* If there's nothing new since last we looked, return invalid. */
> +    if (vring_avail_idx(vq) == vq->last_avail_idx)
> +	return NULL;
> +
> +    /* Grab the next descriptor number they're advertising, and increment
> +     * the index we've seen. */
> +    head = vring_avail_ring(vq, vq->last_avail_idx++ % vq->vring.num);
> +
> +    /* If their number is silly, that's a fatal mistake. */
> +    if (head >= vq->vring.num)
> +	errx(1, "Guest says index %u is available", head);
> +
> +    /* When we start there are none of either input nor output. */
> +    position = 0;
> +
> +    elem = qemu_mallocz(sizeof(VirtQueueElement));
> +
> +    elem->phys_in = qemu_mallocz(sizeof(PhysIOVector) +
> +				 vq->vring.num * sizeof(PhysIOVectorElement));
> +    elem->phys_out = qemu_mallocz(sizeof(PhysIOVector) +
> +				  vq->vring.num * sizeof(PhysIOVectorElement));

I was wondering whether it can be optimized since vring.num is sometimes
512, and we can either use a pool of these or calculate the vring.num
from the descriptors, but it seems like your way is the best.

> +
> +    i = head;
> +    do {
> +	PhysIOVectorElement *sge;
> +
> +	if (vring_desc_flags(vq, i) & VRING_DESC_F_WRITE)
> +	    sge = &elem->phys_in->sg[elem->phys_in->num++];
> +	else
> +	    sge = &elem->phys_out->sg[elem->phys_out->num++];
> +
> +	/* Grab the first descriptor, and check it's OK. */
> +	sge->len = vring_desc_len(vq, i);
> +	sge->base = vring_desc_addr(vq, i);
> +
> +	/* If we've got too many, that implies a descriptor loop. */
> +	if ((elem->phys_in->num + elem->phys_out->num) > vq->vring.num)
> +	    errx(1, "Looped descriptor");
> +    } while ((i = virtqueue_next_desc(vq, i)) != vq->vring.num);
> +
> +    elem->virt_in = pci_device_dma_map(&vq->vdev->pci_dev, elem->phys_in);
> +    elem->virt_out = pci_device_dma_map(&vq->vdev->pci_dev, elem->phys_out);
> +    elem->index = head;
> +
> +    if (elem->virt_in == NULL || elem->virt_out == NULL)
> +	errx(1, "Bad DMA");
> +
> +    return elem;
> +}
> +
> +

The name below is a bit misleading since when enable is true you
actually set no_notify.
So I'd name it something like virtio_vring_set_no_notify(...) or similar.


> +void virtio_ring_set_used_notify(VirtQueue *vq, int enable)
> +{
> +    if (enable)
> +	vring_used_set_flag(vq, VRING_USED_F_NO_NOTIFY);
> +    else
> +	vring_used_unset_flag(vq, VRING_USED_F_NO_NOTIFY);
> +}
> +

Cheers,
Dor

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-30 14:42     ` Anthony Liguori
@ 2008-03-30 18:19       ` Paul Brook
  2008-03-30 19:02         ` Anthony Liguori
  0 siblings, 1 reply; 23+ messages in thread
From: Paul Brook @ 2008-03-30 18:19 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

On Sunday 30 March 2008, Anthony Liguori wrote:
> Paul Brook wrote:
> > On Saturday 29 March 2008, Anthony Liguori wrote:
> >> This patch introduces a PCI DMA API and some generic code to support
> >> other DMA APIs.  Two types are introduced: PhysIOVector and IOVector.  A
> >> DMA API maps a PhysIOVector, which is composed of target_phys_addr_t,
> >> into an IOVector, which is composed of void *.
> >
> > Devices should not be using IOVector. They should either use the DMA copy
> > routines to copy from a PhysIOVector into a local buffer, or they should
> > pass a PhysIOVector to a block/network read/write routine. The DMA API
> > should allow devices to be agnostic about how DMA is implemented. They
> > should not be trying to manually implement zero copy.
>
> Someone has to do the translation of PhysIOVector => IOVector.  It
> doesn't seem logical to me to do it in the IO backend level because the
> block subsystem doesn't know how to do that translation.  You would have
> to pass the PhysIOVector although with a translation function and an
> opaque pointer.

The entity processing the data shouldn't need to know or care how the 
translation is done. PhysIOVector should describe everything it needs to know.

> What could work is if the DMA API functions mapped PhysIOVector =>
> PhysIOVector and then the network and block subsystems could operate on
> a PhysIOVector.  I have patches that implement vector IO for net and
> block but didn't want to include them in this series to keep things simple.

IMHO this is the only sane way to implement zero-copy.

> >> This enables zero-copy IO to be preformed without introducing
> >> assumptions of phys_ram_base.  This API is at the PCI device level to
> >> enable support of per-device IOMMU remapping.
> >
> > By my reading it *requires* bridges be zero-copy.  For big-endian targets
> > we need to ability to byteswap accesses.
>
> You mean via ld/st_phys?  

By whatever means the bridge deems necessary. The whole point of the DMA API 
is that you're transferring a block of data. The API allows intermediate 
busses to transform that data (and address) without the block handler needing 
to know or care.

With your current scheme a byteswapping bus has to allocate a single large 
buffer for the whole vector, even if the device then ends up copying into a 
local buffer in small chunks.

Paul

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] PCI DMA API
  2008-03-30 18:19       ` Paul Brook
@ 2008-03-30 19:02         ` Anthony Liguori
  0 siblings, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-30 19:02 UTC (permalink / raw)
  To: Paul Brook; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Paul Brook wrote:
> On Sunday 30 March 2008, Anthony Liguori wrote:
>   
> The entity processing the data shouldn't need to know or care how the 
> translation is done. PhysIOVector should describe everything it need to know.
>   

Okay, I'll update.

>> What could work is if the DMA API functions mapped PhysIOVector =>
>> PhysIOVector and then the network and block subsystems could operate on
>> a PhysIOVector.  I have patches that implement vector IO for net and
>> block but didn't want to include them in this series to keep things simple.
>>     
>
> IMHO this is the only sane way to implement zero-copy.
>
>   
>>>> This enables zero-copy IO to be preformed without introducing
>>>> assumptions of phys_ram_base.  This API is at the PCI device level to
>>>> enable support of per-device IOMMU remapping.
>>>>         
>>> By my reading it *requires* bridges be zero-copy.  For big-endian targets
>>> we need to ability to byteswap accesses.
>>>       
>> You mean via ld/st_phys?  
>>     
>
> By whatever means the bridge deems necessary. The whole point of the DMA API 
> is that you're transferring a block of data. The API allows intermediate 
> busses to transform that data (and address) without the block handler needing 
> to know or care.
>
> With your current scheme a byteswapping bus has to allocate a single large 
> buffer for the whole vector, even if the device then ends up copying unto a 
> local buffer in small chunks.
>   

Oh, I see now.  The DMA API should have not just a mechanism to do bulk 
transfers but also provide an interface to do load/store's that could 
potentially be byte-swapped.  I didn't realize buses did that.

Regards,

Anthony Liguori

> Paul
>   

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] [Qemu-devel] [PATCH 3/6] virtio for QEMU
  2008-03-30 17:25   ` Dor Laor
@ 2008-03-30 22:59     ` Anthony Liguori
  2008-04-05  3:09     ` Anthony Liguori
  1 sibling, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-03-30 22:59 UTC (permalink / raw)
  To: dor.laor; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Dor Laor wrote:
> On Sat, 2008-03-29 at 16:55 -0500, Anthony Liguori wrote:
>   
>> This patch introduces virtio support over PCI.  virtio is a generic virtual IO
>> framework for Linux first introduced in 2.6.23.  Since 2.6.25, virtio has
>> supported a PCI transport which this patch implements.
>>
>> Since the last time these patches were posted to qemu-devel, I've reworked it
>> to use the proper access functions to manipulate guest memory.
>>
>> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>>     
>
> It's will be great to drop the nasty hacks :)
> Do you still get 1G net performance using the extra copy from tap
> (memcpy_to_iovector)?
>   

We take a bit of a hit (probably 10-20%) doing copy.  The "tap hacks" 
require a more invasive set of patches to refactor the VLAN support in 
QEMU.  The fundamental problem with tap is that it only supports one tap 
device per VLAN.  What we really want is a VLAN where each VLAN client 
has its own tap device.  This is also necessary to properly support the 
upcoming ring queue support for TAP along with GSO.

That patch set is why I revisited this one in fact :-)  Once we get 
virtio merged, I'll then send out the VLAN refactoring.  The nice thing 
though is that once we have the VLAN refactoring, we can optimize the 
e1000 device to make use of it.

> [snip]
>
>   
>> +static uint32_t vring_desc_len(VirtQueue *vq, unsigned int i)
>> +{
>>     
>
> Below there were place you did use offsetof(vq->vring.desc[i], len) so
> we better be consistent + its nicer
>
>   
>> +    return ldl_phys(vq->vring.desc + i * sizeof(VRingDesc) + 
>> +		    offsetof(VRingDesc, len));
>> +}
>>     

Yup, I just missed this one.  Thanks for the catch!

>> +VirtQueueElement *virtqueue_pop(VirtQueue *vq)
>> +{
>> +    unsigned int i, head;
>> +    unsigned int position;
>> +    VirtQueueElement *elem;
>> +
>> +    /* Check it isn't doing very strange things with descriptor numbers. */
>> +    if ((uint16_t)(vring_avail_idx(vq) - vq->last_avail_idx) > vq->vring.num)
>> +	errx(1, "Guest moved used index from %u to %u",
>> +	     vq->last_avail_idx, vring_avail_idx(vq));
>> +
>> +    /* If there's nothing new since last we looked, return invalid. */
>> +    if (vring_avail_idx(vq) == vq->last_avail_idx)
>> +	return NULL;
>> +
>> +    /* Grab the next descriptor number they're advertising, and increment
>> +     * the index we've seen. */
>> +    head = vring_avail_ring(vq, vq->last_avail_idx++ % vq->vring.num);
>> +
>> +    /* If their number is silly, that's a fatal mistake. */
>> +    if (head >= vq->vring.num)
>> +	errx(1, "Guest says index %u is available", head);
>> +
>> +    /* When we start there are none of either input nor output. */
>> +    position = 0;
>> +
>> +    elem = qemu_mallocz(sizeof(VirtQueueElement));
>> +
>> +    elem->phys_in = qemu_mallocz(sizeof(PhysIOVector) +
>> +				 vq->vring.num * sizeof(PhysIOVectorElement));
>> +    elem->phys_out = qemu_mallocz(sizeof(PhysIOVector) +
>> +				  vq->vring.num * sizeof(PhysIOVectorElement));
>>     
>
> I was wondering whether it can be optimized since vring.num is sometimes
> 512 so and we can either use a pool of these or calculate the vring.num
> from the descriptors but it seems like your way is the best.
>   

My thinking right now is to use qemu_mallocz() for everything and then 
we can go back and optimize with pooling if necessary.

>> +
>> +    i = head;
>> +    do {
>> +	PhysIOVectorElement *sge;
>> +
>> +	if (vring_desc_flags(vq, i) & VRING_DESC_F_WRITE)
>> +	    sge = &elem->phys_in->sg[elem->phys_in->num++];
>> +	else
>> +	    sge = &elem->phys_out->sg[elem->phys_out->num++];
>> +
>> +	/* Grab the first descriptor, and check it's OK. */
>> +	sge->len = vring_desc_len(vq, i);
>> +	sge->base = vring_desc_addr(vq, i);
>> +
>> +	/* If we've got too many, that implies a descriptor loop. */
>> +	if ((elem->phys_in->num + elem->phys_out->num) > vq->vring.num)
>> +	    errx(1, "Looped descriptor");
>> +    } while ((i = virtqueue_next_desc(vq, i)) != vq->vring.num);
>> +
>> +    elem->virt_in = pci_device_dma_map(&vq->vdev->pci_dev, elem->phys_in);
>> +    elem->virt_out = pci_device_dma_map(&vq->vdev->pci_dev, elem->phys_out);
>> +    elem->index = head;
>> +
>> +    if (elem->virt_in == NULL || elem->virt_out == NULL)
>> +	errx(1, "Bad DMA");
>> +
>> +    return elem;
>> +}
>> +
>> +
>>     
>
> The name below is a bit misleading since when enable is true you
> actually set no_notify.
> So I name it something like virtio_vring_set_no_notify(...) or similar.
>   

Yeah, that's not a bad suggestion.

Thanks,

Anthony Liguori

>> +void virtio_ring_set_used_notify(VirtQueue *vq, int enable)
>> +{
>> +    if (enable)
>> +	vring_used_set_flag(vq, VRING_USED_F_NO_NOTIFY);
>> +    else
>> +	vring_used_unset_flag(vq, VRING_USED_F_NO_NOTIFY);
>> +}
>> +
>>     
>
> Cheers,
> Dor
>
>
> -------------------------------------------------------------------------
> Check out the new SourceForge.net Marketplace.
> It's the best place to buy or sell services for
> just about anything Open Source.
> http://ad.doubleclick.net/clk;164216239;13503038;w?http://sf.net/marketplace
> _______________________________________________
> kvm-devel mailing list
> kvm-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>   

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/6] virtio for QEMU
  2008-03-30 17:25   ` Dor Laor
  2008-03-30 22:59     ` [kvm-devel] " Anthony Liguori
@ 2008-04-05  3:09     ` Anthony Liguori
  1 sibling, 0 replies; 23+ messages in thread
From: Anthony Liguori @ 2008-04-05  3:09 UTC (permalink / raw)
  To: dor.laor; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel, Aurelien Jarno

Dor Laor wrote:
> 
>> +static uint32_t vring_desc_len(VirtQueue *vq, unsigned int i)
>> +{
> 
> Below there were place you did use offsetof(vq->vring.desc[i], len) so
> we better be consistent + its nicer
> 
>> +    return ldl_phys(vq->vring.desc + i * sizeof(VRingDesc) + 
>> +		    offsetof(VRingDesc, len));
>> +}
>> +

Oh, this bit is different because you need to do offsetof(Type, member) 
and vq->vring.desc[i] is not a type.  It only works when you're doing an 
array with member[X].

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2008-04-05  3:09 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-29 21:55 [Qemu-devel] [PATCH 1/6] Use ram_addr_t for cpu_get_physical_page_desc Anthony Liguori
2008-03-29 21:55 ` [Qemu-devel] [PATCH 2/6] PCI DMA API Anthony Liguori
2008-03-30  7:06   ` Blue Swirl
2008-03-30 14:44     ` Anthony Liguori
2008-03-30 14:49       ` Avi Kivity
2008-03-30 14:56         ` [kvm-devel] " Anthony Liguori
2008-03-30 14:58       ` Blue Swirl
2008-03-30 15:11         ` Anthony Liguori
2008-03-30 10:18   ` Paul Brook
2008-03-30 14:42     ` Anthony Liguori
2008-03-30 18:19       ` Paul Brook
2008-03-30 19:02         ` Anthony Liguori
2008-03-30 10:25   ` [Qemu-devel] Re: [kvm-devel] " Avi Kivity
2008-03-30 14:49     ` Anthony Liguori
2008-03-29 21:55 ` [Qemu-devel] [PATCH 3/6] virtio for QEMU Anthony Liguori
2008-03-30 17:25   ` Dor Laor
2008-03-30 22:59     ` [kvm-devel] " Anthony Liguori
2008-04-05  3:09     ` Anthony Liguori
2008-03-29 21:55 ` [Qemu-devel] [PATCH 4/6] virtio network driver Anthony Liguori
2008-03-30 10:27   ` Paul Brook
2008-03-30 14:47     ` Anthony Liguori
2008-03-29 21:55 ` [Qemu-devel] [PATCH 5/6] virtio block driver Anthony Liguori
2008-03-29 21:56 ` [Qemu-devel] [PATCH 6/6] virtio balloon driver Anthony Liguori

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).