* [PATCH][RFC] Prepare virtio for upstream QEMU merging
@ 2008-10-13 18:28 Anthony Liguori
2008-10-14 16:12 ` Avi Kivity
0 siblings, 1 reply; 8+ messages in thread
From: Anthony Liguori @ 2008-10-13 18:28 UTC (permalink / raw)
To: kvm-devel; +Cc: Avi Kivity
[-- Attachment #1: Type: text/plain, Size: 1 bytes --]
[-- Attachment #2: virtio-merge.patch --]
[-- Type: text/x-patch, Size: 23658 bytes --]
Subject: [PATCH][RFC] Prepare virtio for upstream QEMU merging
Cc: Avi Kivity <avi@redhat.com>
For merging virtio, I thought I'd try a different approach from the
"fix out of tree and merge all at once" approach I took with live migration.
What I would like to do is make some minimal changes to virtio in kvm-userspace
so that some form of virtio could be merged in upstream QEMU. We'll have to
maintain a small diff in the kvm-userspace tree until we work out some of the
issues, but I'd rather work out those issues in the QEMU tree instead of fixing
them all outside of the tree first.
The attached patch uses USE_KVM guards to separate KVM specific code. This is
not KVM code, per se, but rather routines that aren't mergeable right now. I
think using the guards makes it clearer and easier to deal with merges.
When we submit virtio to qemu-devel and eventually commit, we'll strip out any
ifdef USE_KVM block.
The only real significant change in this patch is the use of accessors for
the virtio queues. We've discussed patches like this before.
The point of this patch is to make no functional change in the KVM tree. I've
confirmed that performance does not regress in virtio-net and that both
virtio-blk and virtio-net seem to be functional.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c
index 727119b..fb703eb 100644
--- a/qemu/hw/virtio-blk.c
+++ b/qemu/hw/virtio-blk.c
@@ -276,7 +276,9 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
BlockDriverState *bs)
{
VirtIOBlock *s;
+#ifdef USE_KVM
int cylinders, heads, secs;
+#endif
static int virtio_blk_id;
s = (VirtIOBlock *)virtio_init_pci(bus, "virtio-blk", vendor, device,
@@ -290,9 +292,12 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
s->vdev.get_features = virtio_blk_get_features;
s->vdev.reset = virtio_blk_reset;
s->bs = bs;
+#ifdef USE_KVM
bs->devfn = s->vdev.pci_dev.devfn;
+ /* FIXME this can definitely go in upstream QEMU */
bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);
bdrv_set_geometry_hint(s->bs, cylinders, heads, secs);
+#endif
s->vq = virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
index bc2ede6..4169489 100644
--- a/qemu/hw/virtio-net.c
+++ b/qemu/hw/virtio-net.c
@@ -14,7 +14,9 @@
#include "virtio.h"
#include "net.h"
#include "qemu-timer.h"
+#ifdef USE_KVM
#include "qemu-kvm.h"
+#endif
/* from Linux's virtio_net.h */
@@ -92,9 +94,10 @@ static void virtio_net_update_config(VirtIODevice *vdev, uint8_t *config)
static uint32_t virtio_net_get_features(VirtIODevice *vdev)
{
+ uint32_t features = (1 << VIRTIO_NET_F_MAC);
+#ifdef USE_KVM
VirtIONet *n = to_virtio_net(vdev);
VLANClientState *host = n->vc->vlan->first_client;
- uint32_t features = (1 << VIRTIO_NET_F_MAC);
if (tap_has_vnet_hdr(host)) {
tap_using_vnet_hdr(host, 1);
@@ -108,12 +111,14 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev)
features |= (1 << VIRTIO_NET_F_HOST_ECN);
/* Kernel can't actually handle UFO in software currently. */
}
+#endif
return features;
}
static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features)
{
+#ifdef USE_KVM
VirtIONet *n = to_virtio_net(vdev);
VLANClientState *host = n->vc->vlan->first_client;
@@ -125,35 +130,39 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features)
(features >> VIRTIO_NET_F_GUEST_TSO4) & 1,
(features >> VIRTIO_NET_F_GUEST_TSO6) & 1,
(features >> VIRTIO_NET_F_GUEST_ECN) & 1);
+#endif
}
/* RX */
static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
{
+#ifdef USE_KVM
/* We now have RX buffers, signal to the IO thread to break out of the
select to re-poll the tap file descriptor */
if (kvm_enabled())
qemu_kvm_notify_work();
+#endif
}
static int virtio_net_can_receive(void *opaque)
{
VirtIONet *n = opaque;
- if (n->rx_vq->vring.avail == NULL ||
+ if (!virtio_queue_ready(n->rx_vq) ||
!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
return 0;
- if (n->rx_vq->vring.avail->idx == n->rx_vq->last_avail_idx) {
- n->rx_vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ if (virtio_queue_empty(n->rx_vq)) {
+ virtio_queue_set_notification(n->rx_vq, 1);
return 0;
}
- n->rx_vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ virtio_queue_set_notification(n->rx_vq, 0);
return 1;
}
+#ifdef USE_KVM
/* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
* it never finds out that the packets don't have valid checksums. This
* causes dhclient to get upset. Fedora's carried a patch for ages to
@@ -181,6 +190,7 @@ static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
}
}
+#endif
static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
{
@@ -190,6 +200,11 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
int offset, i;
int total;
+#ifndef USE_KVM
+ if (!virtio_queue_ready(n->rx_vq))
+ return;
+#endif
+
if (virtqueue_pop(n->rx_vq, &elem) == 0)
return;
@@ -205,11 +220,13 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
offset = 0;
total = sizeof(*hdr);
+#ifdef USE_KVM
if (tap_has_vnet_hdr(n->vc->vlan->first_client)) {
memcpy(hdr, buf, sizeof(*hdr));
offset += total;
work_around_broken_dhclient(hdr, buf + offset, size - offset);
}
+#endif
/* copy in packet. ugh */
i = 1;
@@ -230,7 +247,11 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
{
VirtQueueElement elem;
+#ifdef USE_KVM
int has_vnet_hdr = tap_has_vnet_hdr(n->vc->vlan->first_client);
+#else
+ int has_vnet_hdr = 0;
+#endif
if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
return;
@@ -252,7 +273,32 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
len += sizeof(struct virtio_net_hdr);
}
+#ifdef USE_KVM
len += qemu_sendv_packet(n->vc, out_sg, out_num);
+#else
+ {
+ size_t size = 0;
+ size_t offset = 0;
+ int i;
+ void *packet;
+
+ for (i = 0; i < out_num; i++)
+ size += out_sg[i].iov_len;
+
+ packet = qemu_malloc(size);
+ if (packet) {
+ for (i = 0; i < out_num; i++) {
+ memcpy(packet + offset,
+ out_sg[i].iov_base,
+ out_sg[i].iov_len);
+ offset += out_sg[i].iov_len;
+ }
+ qemu_send_packet(n->vc, packet, size);
+ len += size;
+ }
+ qemu_free(packet);
+ }
+#endif
virtqueue_push(vq, &elem, len);
virtio_notify(&n->vdev, vq);
@@ -264,7 +310,7 @@ static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
VirtIONet *n = to_virtio_net(vdev);
if (n->tx_timer_active) {
- vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ virtio_queue_set_notification(vq, 1);
qemu_del_timer(n->tx_timer);
n->tx_timer_active = 0;
virtio_net_flush_tx(n, vq);
@@ -272,7 +318,7 @@ static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
qemu_mod_timer(n->tx_timer,
qemu_get_clock(vm_clock) + TX_TIMER_INTERVAL);
n->tx_timer_active = 1;
- vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ virtio_queue_set_notification(vq, 0);
}
}
@@ -286,7 +332,7 @@ static void virtio_net_tx_timer(void *opaque)
if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
return;
- n->tx_vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ virtio_queue_set_notification(n->tx_vq, 1);
virtio_net_flush_tx(n, n->tx_vq);
}
diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c
index e675f43..16211bd 100644
--- a/qemu/hw/virtio.c
+++ b/qemu/hw/virtio.c
@@ -16,6 +16,7 @@
#include "virtio.h"
#include "sysemu.h"
+#include "osdep.h"
/* from Linux's linux/virtio_pci.h */
@@ -56,8 +57,62 @@
*/
#define wmb() do { } while (0)
+typedef struct VRingDesc
+{
+ uint64_t addr;
+ uint32_t len;
+ uint16_t flags;
+ uint16_t next;
+} VRingDesc;
+
+typedef struct VRingAvail
+{
+ uint16_t flags;
+ uint16_t idx;
+ uint16_t ring[0];
+} VRingAvail;
+
+typedef struct VRingUsedElem
+{
+ uint32_t id;
+ uint32_t len;
+} VRingUsedElem;
+
+typedef struct VRingUsed
+{
+ uint16_t flags;
+ uint16_t idx;
+ VRingUsedElem ring[0];
+} VRingUsed;
+
+typedef struct VRing
+{
+ unsigned int num;
+#ifdef USE_KVM
+ VRingDesc *desc;
+ VRingAvail *avail;
+ VRingUsed *used;
+#else
+ target_phys_addr_t desc;
+ target_phys_addr_t avail;
+ target_phys_addr_t used;
+#endif
+} VRing;
+
+struct VirtQueue
+{
+ VRing vring;
+ uint32_t pfn;
+ uint16_t last_avail_idx;
+ int inuse;
+ void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
+};
+
+#define VIRTIO_PCI_QUEUE_MAX 16
+
/* virt queue functions */
+#ifdef USE_KVM
static void *virtio_map_gpa(target_phys_addr_t addr, size_t size)
{
ram_addr_t off;
@@ -100,23 +155,206 @@ static size_t virtqueue_size(int num)
(sizeof(VRingUsed) + sizeof(VRingUsedElem) * num);
}
-static void virtqueue_init(VirtQueue *vq, void *p)
+static void virtqueue_init(VirtQueue *vq, target_phys_addr_t pa)
{
+ void *p = virtio_map_gpa(pa, virtqueue_size(vq->vring.num));
vq->vring.desc = p;
vq->vring.avail = p + vq->vring.num * sizeof(VRingDesc);
vq->vring.used = (void *)TARGET_PAGE_ALIGN((unsigned long)&vq->vring.avail->ring[vq->vring.num]);
}
+static inline uint64_t vring_desc_addr(VirtQueue *vq, int i)
+{
+ return vq->vring.desc[i].addr;
+}
+
+static inline uint32_t vring_desc_len(VirtQueue *vq, int i)
+{
+ return vq->vring.desc[i].len;
+}
+
+static inline uint16_t vring_desc_flags(VirtQueue *vq, int i)
+{
+ return vq->vring.desc[i].flags;
+}
+
+static inline uint16_t vring_desc_next(VirtQueue *vq, int i)
+{
+ return vq->vring.desc[i].next;
+}
+
+static inline uint16_t vring_avail_flags(VirtQueue *vq)
+{
+ return vq->vring.avail->flags;
+}
+
+static inline uint16_t vring_avail_idx(VirtQueue *vq)
+{
+ return vq->vring.avail->idx;
+}
+
+static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
+{
+ return vq->vring.avail->ring[i];
+}
+
+static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
+{
+ vq->vring.used->ring[i].id = val;
+}
+
+static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
+{
+ vq->vring.used->ring[i].len = val;
+}
+
+static uint16_t vring_used_idx(VirtQueue *vq)
+{
+ return vq->vring.used->idx;
+}
+
+static inline void vring_used_idx_increment(VirtQueue *vq)
+{
+ vq->vring.used->idx++;
+}
+
+static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
+{
+ vq->vring.used->flags |= mask;
+}
+
+static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
+{
+ vq->vring.used->flags &= ~mask;
+}
+#else
+static void virtqueue_init(VirtQueue *vq, target_phys_addr_t pa)
+{
+ vq->vring.desc = pa;
+ vq->vring.avail = pa + vq->vring.num * sizeof(VRingDesc);
+ vq->vring.used = TARGET_PAGE_ALIGN(vq->vring.avail + offsetof(VRingAvail, ring[vq->vring.num]));
+}
+
+static inline uint64_t vring_desc_addr(VirtQueue *vq, int i)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.desc + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
+ return ldq_phys(pa);
+}
+
+static inline uint32_t vring_desc_len(VirtQueue *vq, int i)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.desc + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
+ return ldl_phys(pa);
+}
+
+static inline uint16_t vring_desc_flags(VirtQueue *vq, int i)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.desc + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
+ return lduw_phys(pa);
+}
+
+static inline uint16_t vring_desc_next(VirtQueue *vq, int i)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.desc + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
+ return lduw_phys(pa);
+}
+
+static inline uint16_t vring_avail_flags(VirtQueue *vq)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.avail + offsetof(VRingAvail, flags);
+ return lduw_phys(pa);
+}
+
+static inline uint16_t vring_avail_idx(VirtQueue *vq)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.avail + offsetof(VRingAvail, idx);
+ return lduw_phys(pa);
+}
+
+static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.avail + offsetof(VRingAvail, ring[i]);
+ return lduw_phys(pa);
+}
+
+static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);
+ stl_phys(pa, val);
+}
+
+static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);
+ stl_phys(pa, val);
+}
+
+static uint16_t vring_used_idx(VirtQueue *vq)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.used + offsetof(VRingUsed, idx);
+ return lduw_phys(pa);
+}
+
+static inline void vring_used_idx_increment(VirtQueue *vq)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.used + offsetof(VRingUsed, idx);
+ stw_phys(pa, vring_used_idx(vq) + 1);
+}
+
+static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.used + offsetof(VRingUsed, flags);
+ stw_phys(pa, lduw_phys(pa) | mask);
+}
+
+static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
+{
+ target_phys_addr_t pa;
+ pa = vq->vring.used + offsetof(VRingUsed, flags);
+ stw_phys(pa, lduw_phys(pa) & ~mask);
+}
+#endif
+
+void virtio_queue_set_notification(VirtQueue *vq, int enable)
+{
+ if (enable)
+ vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
+ else
+ vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
+}
+
+int virtio_queue_ready(VirtQueue *vq)
+{
+ return vq->vring.avail != 0;
+}
+
+int virtio_queue_empty(VirtQueue *vq)
+{
+ return vring_avail_idx(vq) == vq->last_avail_idx;
+}
+
static unsigned virtqueue_next_desc(VirtQueue *vq, unsigned int i)
{
unsigned int next;
/* If this descriptor says it doesn't chain, we're done. */
- if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
+ if (!(vring_desc_flags(vq, i) & VRING_DESC_F_NEXT))
return vq->vring.num;
/* Check they're not leading us off end of descriptors. */
- next = vq->vring.desc[i].next;
+ next = vring_desc_next(vq, i);
/* Make sure compiler knows to grab that: we don't want it changing! */
wmb();
@@ -129,15 +367,28 @@ static unsigned virtqueue_next_desc(VirtQueue *vq, unsigned int i)
void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
unsigned int len)
{
- VRingUsedElem *used;
+ unsigned int idx;
+
+#ifndef USE_KVM
+ int i;
+
+ for (i = 0; i < elem->out_num; i++)
+ qemu_free(elem->out_sg[i].iov_base);
+
+ for (i = 0; i < elem->in_num; i++) {
+ cpu_physical_memory_write(elem->in_addr[i], elem->in_sg[i].iov_base,
+ elem->in_sg[i].iov_len);
+ qemu_free(elem->in_sg[i].iov_base);
+ }
+#endif
/* Get a pointer to the next entry in the used ring. */
- used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
- used->id = elem->index;
- used->len = len;
+ idx = vring_used_idx(vq) % vq->vring.num;
+ vring_used_ring_id(vq, idx, elem->index);
+ vring_used_ring_len(vq, idx, len);
/* Make sure buffer is written before we update index. */
wmb();
- vq->vring.used->idx++;
+ vring_used_idx_increment(vq);
vq->inuse--;
}
@@ -147,17 +398,17 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
unsigned int position;
/* Check it isn't doing very strange things with descriptor numbers. */
- if ((uint16_t)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
+ if ((uint16_t)(vring_avail_idx(vq) - vq->last_avail_idx) > vq->vring.num)
errx(1, "Guest moved used index from %u to %u",
- vq->last_avail_idx, vq->vring.avail->idx);
+ vq->last_avail_idx, vring_avail_idx(vq));
/* If there's nothing new since last we looked, return invalid. */
- if (vq->vring.avail->idx == vq->last_avail_idx)
+ if (vring_avail_idx(vq) == vq->last_avail_idx)
return 0;
/* Grab the next descriptor number they're advertising, and increment
* the index we've seen. */
- head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
+ head = vring_avail_ring(vq, vq->last_avail_idx++ % vq->vring.num);
/* If their number is silly, that's a fatal mistake. */
if (head >= vq->vring.num)
@@ -170,14 +421,36 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
do {
struct iovec *sg;
- if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+ if (vring_desc_flags(vq, i) & VRING_DESC_F_WRITE) {
+ elem->in_addr[elem->in_num] = vring_desc_addr(vq, i);
sg = &elem->in_sg[elem->in_num++];
- else
+ } else {
+ elem->out_addr[elem->out_num] = vring_desc_addr(vq, i);
sg = &elem->out_sg[elem->out_num++];
+ }
/* Grab the first descriptor, and check it's OK. */
- sg->iov_len = vq->vring.desc[i].len;
- sg->iov_base = virtio_map_gpa(vq->vring.desc[i].addr, sg->iov_len);
+ sg->iov_len = vring_desc_len(vq, i);
+#ifdef USE_KVM
+ sg->iov_base = virtio_map_gpa(vring_desc_addr(vq, i), sg->iov_len);
+#else
+ /* cap individual scatter element size to prevent unbounded allocations
+ of memory from the guest. Practically speaking, no virtio driver
+ will ever pass more than a page in each element. We set the cap to
+ be 2MB in case for some reason a large page makes it way into the
+ sg list. When we implement a zero copy API, this limitation will
+ disappear */
+ if (sg->iov_len > (2 << 20))
+ sg->iov_len = 2 << 20;
+
+ sg->iov_base = qemu_malloc(sg->iov_len);
+ if (sg->iov_base &&
+ !(vring_desc_flags(vq, i) & VRING_DESC_F_WRITE)) {
+ cpu_physical_memory_read(vring_desc_addr(vq, i),
+ sg->iov_base,
+ sg->iov_len);
+ }
+#endif
if (sg->iov_base == NULL)
errx(1, "Invalid mapping\n");
@@ -220,9 +493,9 @@ void virtio_reset(void *opaque)
virtio_update_irq(vdev);
for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
- vdev->vq[i].vring.desc = NULL;
- vdev->vq[i].vring.avail = NULL;
- vdev->vq[i].vring.used = NULL;
+ vdev->vq[i].vring.desc = 0;
+ vdev->vq[i].vring.avail = 0;
+ vdev->vq[i].vring.used = 0;
vdev->vq[i].last_avail_idx = 0;
vdev->vq[i].pfn = 0;
}
@@ -244,13 +517,10 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
case VIRTIO_PCI_QUEUE_PFN:
pa = (ram_addr_t)val << TARGET_PAGE_BITS;
vdev->vq[vdev->queue_sel].pfn = val;
- if (pa == 0) {
+ if (pa == 0)
virtio_reset(vdev);
- } else {
- size_t size = virtqueue_size(vdev->vq[vdev->queue_sel].vring.num);
- virtqueue_init(&vdev->vq[vdev->queue_sel],
- virtio_map_gpa(pa, size));
- }
+ else
+ virtqueue_init(&vdev->vq[vdev->queue_sel], pa);
break;
case VIRTIO_PCI_QUEUE_SEL:
if (val < VIRTIO_PCI_QUEUE_MAX)
@@ -450,8 +720,8 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
{
/* Always notify when queue is empty */
- if ((vq->inuse || vq->vring.avail->idx != vq->last_avail_idx) &&
- (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
+ if ((vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx) &&
+ (vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT))
return;
vdev->isr |= 0x01;
@@ -517,18 +787,29 @@ void virtio_load(VirtIODevice *vdev, QEMUFile *f)
qemu_get_be16s(f, &vdev->vq[i].last_avail_idx);
if (vdev->vq[i].pfn) {
- size_t size;
target_phys_addr_t pa;
pa = (ram_addr_t)vdev->vq[i].pfn << TARGET_PAGE_BITS;
- size = virtqueue_size(vdev->vq[i].vring.num);
- virtqueue_init(&vdev->vq[i], virtio_map_gpa(pa, size));
+ virtqueue_init(&vdev->vq[i], pa);
}
}
virtio_update_irq(vdev);
}
+#ifndef USE_KVM
+static int fls(int i)
+{
+ int bit;
+
+ for (bit=31; bit >= 0; bit--)
+ if (i & (1 << bit))
+ return bit+1;
+
+ return 0;
+}
+#endif
+
VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
uint16_t vendor, uint16_t device,
uint16_t subvendor, uint16_t subdevice,
@@ -551,7 +832,7 @@ VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
vdev->status = 0;
vdev->isr = 0;
vdev->queue_sel = 0;
- memset(vdev->vq, 0, sizeof(vdev->vq));
+ vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
config = pci_dev->config;
config[0x00] = vendor & 0xFF;
diff --git a/qemu/hw/virtio.h b/qemu/hw/virtio.h
index 0dcedbf..5dde828 100644
--- a/qemu/hw/virtio.h
+++ b/qemu/hw/virtio.h
@@ -17,6 +17,7 @@
#include <sys/uio.h>
#include "hw.h"
#include "pci.h"
+#include "config.h"
/* from Linux's linux/virtio_config.h */
@@ -49,51 +50,6 @@
typedef struct VirtQueue VirtQueue;
typedef struct VirtIODevice VirtIODevice;
-typedef struct VRingDesc
-{
- uint64_t addr;
- uint32_t len;
- uint16_t flags;
- uint16_t next;
-} VRingDesc;
-
-typedef struct VRingAvail
-{
- uint16_t flags;
- uint16_t idx;
- uint16_t ring[0];
-} VRingAvail;
-
-typedef struct VRingUsedElem
-{
- uint32_t id;
- uint32_t len;
-} VRingUsedElem;
-
-typedef struct VRingUsed
-{
- uint16_t flags;
- uint16_t idx;
- VRingUsedElem ring[0];
-} VRingUsed;
-
-typedef struct VRing
-{
- unsigned int num;
- VRingDesc *desc;
- VRingAvail *avail;
- VRingUsed *used;
-} VRing;
-
-struct VirtQueue
-{
- VRing vring;
- uint32_t pfn;
- uint16_t last_avail_idx;
- int inuse;
- void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
-};
-
#define VIRTQUEUE_MAX_SIZE 1024
typedef struct VirtQueueElement
@@ -101,12 +57,12 @@ typedef struct VirtQueueElement
unsigned int index;
unsigned int out_num;
unsigned int in_num;
+ target_phys_addr_t in_addr[VIRTQUEUE_MAX_SIZE];
+ target_phys_addr_t out_addr[VIRTQUEUE_MAX_SIZE];
struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
} VirtQueueElement;
-#define VIRTIO_PCI_QUEUE_MAX 16
-
struct VirtIODevice
{
PCIDevice pci_dev;
@@ -123,7 +79,7 @@ struct VirtIODevice
void (*get_config)(VirtIODevice *vdev, uint8_t *config);
void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
void (*reset)(VirtIODevice *vdev);
- VirtQueue vq[VIRTIO_PCI_QUEUE_MAX];
+ VirtQueue *vq;
};
VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
@@ -150,4 +106,10 @@ void virtio_load(VirtIODevice *vdev, QEMUFile *f);
void virtio_notify_config(VirtIODevice *vdev);
+void virtio_queue_set_notification(VirtQueue *vq, int enable);
+
+int virtio_queue_ready(VirtQueue *vq);
+
+int virtio_queue_empty(VirtQueue *vq);
+
#endif
^ permalink raw reply related [flat|nested] 8+ messages in thread* Re: [PATCH][RFC] Prepare virtio for upstream QEMU merging
2008-10-13 18:28 [PATCH][RFC] Prepare virtio for upstream QEMU merging Anthony Liguori
@ 2008-10-14 16:12 ` Avi Kivity
2008-10-14 16:21 ` Anthony Liguori
0 siblings, 1 reply; 8+ messages in thread
From: Avi Kivity @ 2008-10-14 16:12 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel
Anthony Liguori wrote:
> For merging virtio, I thought I'd try a different approach from the
> fix out of tree and merge all at once approach I took with live migration.
>
> What I would like to do is make some minimal changes to virtio in kvm-userspace
> so that some form of virtio could be merged in upstream QEMU. We'll have to
> maintain a small diff in the kvm-userspace tree until we work out some of the
> issues, but I'd rather work out those issues in the QEMU tree instead of fixing
> them all outside of the tree first.
>
> The attached patch uses USE_KVM guards to separate KVM specific code. This is
> not KVM code, per se, but rather routines that aren't mergeable right now. I
> think using the guards makes it clearer and easier to deal with merges.
>
> When we submit virtio to qemu-devel and eventually commit, we'll strip out any
> ifdef USE_KVM block.
>
> The only real significant change in this patch is the use of accessors for
> the virtio queues. We've discussed patches like this before.
>
> The point of this patch is to make no functional change in the KVM tree. I've
> confirmed that performance does not regress in virtio-net and that both
> virtio-blk and virtio-net seem to be functional.
>
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>
> diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c
> index 727119b..fb703eb 100644
> --- a/qemu/hw/virtio-blk.c
> +++ b/qemu/hw/virtio-blk.c
> @@ -276,7 +276,9 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
> BlockDriverState *bs)
> {
> VirtIOBlock *s;
> +#ifdef USE_KVM
> int cylinders, heads, secs;
> +#endif
> static int virtio_blk_id;
>
> s = (VirtIOBlock *)virtio_init_pci(bus, "virtio-blk", vendor, device,
> @@ -290,9 +292,12 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
> s->vdev.get_features = virtio_blk_get_features;
> s->vdev.reset = virtio_blk_reset;
> s->bs = bs;
> +#ifdef USE_KVM
> bs->devfn = s->vdev.pci_dev.devfn;
> + /* FIXME this can definitely go in upstream QEMU */
> bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);
> bdrv_set_geometry_hint(s->bs, cylinders, heads, secs);
> +#endif
>
Why not merge these bits prior to merging virtio? They aren't kvm
specific and would be good in mainline qemu.
>
> static int virtio_net_can_receive(void *opaque)
> {
> VirtIONet *n = opaque;
>
> - if (n->rx_vq->vring.avail == NULL ||
> + if (!virtio_queue_ready(n->rx_vq) ||
> !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
> return 0;
>
> - if (n->rx_vq->vring.avail->idx == n->rx_vq->last_avail_idx) {
> - n->rx_vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
> + if (virtio_queue_empty(n->rx_vq)) {
> + virtio_queue_set_notification(n->rx_vq, 1);
> return 0;
> }
>
> - n->rx_vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
> + virtio_queue_set_notification(n->rx_vq, 0);
> return 1;
> }
>
This should be in a separate patch.
>
> +#ifdef USE_KVM
> /* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
> * it never finds out that the packets don't have valid checksums. This
> * causes dhclient to get upset. Fedora's carried a patch for ages to
> @@ -181,6 +190,7 @@ static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
> hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
> }
> }
> +#endif
>
Is this kvm specific?
>
> static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
> {
> @@ -190,6 +200,11 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
> int offset, i;
> int total;
>
> +#ifndef USE_KVM
> + if (!virtio_queue_ready(n->rx_vq))
> + return;
> +#endif
> +
>
Or this?
>
> +#ifdef USE_KVM
> if (tap_has_vnet_hdr(n->vc->vlan->first_client)) {
> memcpy(hdr, buf, sizeof(*hdr));
> offset += total;
> work_around_broken_dhclient(hdr, buf + offset, size - offset);
> }
> +#endif
>
Or this?
>
> /* copy in packet. ugh */
> i = 1;
> @@ -230,7 +247,11 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
> static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
> {
> VirtQueueElement elem;
> +#ifdef USE_KVM
> int has_vnet_hdr = tap_has_vnet_hdr(n->vc->vlan->first_client);
> +#else
> + int has_vnet_hdr = 0;
> +#endif
>
Or this?
>
> if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
> return;
> @@ -252,7 +273,32 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
> len += sizeof(struct virtio_net_hdr);
> }
>
> +#ifdef USE_KVM
> len += qemu_sendv_packet(n->vc, out_sg, out_num);
> +#else
> + {
> + size_t size = 0;
> + size_t offset = 0;
> + int i;
> + void *packet;
> +
> + for (i = 0; i < out_num; i++)
> + size += out_sg[i].iov_len;
> +
> + packet = qemu_malloc(size);
> + if (packet) {
> + for (i = 0; i < out_num; i++) {
> + memcpy(packet + offset,
> + out_sg[i].iov_base,
> + out_sg[i].iov_len);
> + offset += out_sg[i].iov_len;
> + }
> + qemu_send_packet(n->vc, packet, size);
> + len += size;
> + }
> + qemu_free(packet);
> + }
> +#endif
>
Why not merge qemu_sendv_packet? Perhaps with the alternate code as the
body if some infrastructure in qemu is missing?
>
> virtqueue_push(vq, &elem, len);
> virtio_notify(&n->vdev, vq);
> @@ -264,7 +310,7 @@ static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
> VirtIONet *n = to_virtio_net(vdev);
>
> if (n->tx_timer_active) {
> - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
> + virtio_queue_set_notification(vq, 1);
>
Separate patch.
> +#ifdef USE_KVM
> + VRingDesc *desc;
> + VRingAvail *avail;
> + VRingUsed *used;
> +#else
> + target_phys_addr_t desc;
> + target_phys_addr_t avail;
> + target_phys_addr_t used;
> +#endif
> +} VRing;
>
Stumped. Why?
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH][RFC] Prepare virtio for upstream QEMU merging
2008-10-14 16:12 ` Avi Kivity
@ 2008-10-14 16:21 ` Anthony Liguori
2008-10-14 17:30 ` Avi Kivity
0 siblings, 1 reply; 8+ messages in thread
From: Anthony Liguori @ 2008-10-14 16:21 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
Avi Kivity wrote:
> Anthony Liguori wrote:
>
>> For merging virtio, I thought I'd try a different approach from the
>> fix out of tree and merge all at once approach I took with live migration.
>>
>> What I would like to do is make some minimal changes to virtio in kvm-userspace
>> so that some form of virtio could be merged in upstream QEMU. We'll have to
>> maintain a small diff in the kvm-userspace tree until we work out some of the
>> issues, but I'd rather work out those issues in the QEMU tree instead of fixing
>> them all outside of the tree first.
>>
>> The attached patch uses USE_KVM guards to separate KVM specific code. This is
>> not KVM code, per se, but rather routines that aren't mergeable right now. I
>> think using the guards makes it clearer and easier to deal with merges.
>>
>> When we submit virtio to qemu-devel and eventually commit, we'll strip out any
>> ifdef USE_KVM block.
>>
>> The only real significant change in this patch is the use of accessors for
>> the virtio queues. We've discussed patches like this before.
>>
>> The point of this patch is to make no functional change in the KVM tree. I've
>> confirmed that performance does not regress in virtio-net and that both
>> virtio-blk and virtio-net seem to be functional.
>>
>> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
>>
>> diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c
>> index 727119b..fb703eb 100644
>> --- a/qemu/hw/virtio-blk.c
>> +++ b/qemu/hw/virtio-blk.c
>> @@ -276,7 +276,9 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
>> BlockDriverState *bs)
>> {
>> VirtIOBlock *s;
>> +#ifdef USE_KVM
>> int cylinders, heads, secs;
>> +#endif
>> static int virtio_blk_id;
>>
>> s = (VirtIOBlock *)virtio_init_pci(bus, "virtio-blk", vendor, device,
>> @@ -290,9 +292,12 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
>> s->vdev.get_features = virtio_blk_get_features;
>> s->vdev.reset = virtio_blk_reset;
>> s->bs = bs;
>> +#ifdef USE_KVM
>> bs->devfn = s->vdev.pci_dev.devfn;
>> + /* FIXME this can definitely go in upstream QEMU */
>> bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);
>> bdrv_set_geometry_hint(s->bs, cylinders, heads, secs);
>> +#endif
>>
>>
>
> Why not merge these bits prior to merging virtio? They aren't kvm
> specific and would be good in mainline qemu.
>
I'd rather have a consumer of an interface before merging the actual
infrastructure.
>>
>> static int virtio_net_can_receive(void *opaque)
>> {
>> VirtIONet *n = opaque;
>>
>> - if (n->rx_vq->vring.avail == NULL ||
>> + if (!virtio_queue_ready(n->rx_vq) ||
>> !(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
>> return 0;
>>
>> - if (n->rx_vq->vring.avail->idx == n->rx_vq->last_avail_idx) {
>> - n->rx_vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
>> + if (virtio_queue_empty(n->rx_vq)) {
>> + virtio_queue_set_notification(n->rx_vq, 1);
>> return 0;
>> }
>>
>> - n->rx_vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
>> + virtio_queue_set_notification(n->rx_vq, 0);
>> return 1;
>> }
>>
>>
>
> This should be in a separate patch.
>
Sure.
>>
>> +#ifdef USE_KVM
>> /* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
>> * it never finds out that the packets don't have valid checksums. This
>> * causes dhclient to get upset. Fedora's carried a patch for ages to
>> @@ -181,6 +190,7 @@ static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
>> hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
>> }
>> }
>> +#endif
>>
>>
>
> Is this kvm specific?
>
It is because the GSO stuff has not gone into upstream QEMU yet. I'd
rather merge a crippled virtio implementation, and then merge each of
the optimizations than vice-versa. Primarily because I'm concerned some
of these optimizations will require refactoring.
>>
>> static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
>> {
>> @@ -190,6 +200,11 @@ static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
>> int offset, i;
>> int total;
>>
>> +#ifndef USE_KVM
>> + if (!virtio_queue_ready(n->rx_vq))
>> + return;
>> +#endif
>> +
>>
>>
>
> Or this?
>
This is necessary because of the net optimizations, which yes, should go
upstream into QEMU.
> Why not merge qemu_sendv_packet? Perhaps with the alternate code as the
> body if some infrastructure in qemu is missing?
>
I will, but I'd rather have a consumer first.
>> +#ifdef USE_KVM
>> + VRingDesc *desc;
>> + VRingAvail *avail;
>> + VRingUsed *used;
>> +#else
>> + target_phys_addr_t desc;
>> + target_phys_addr_t avail;
>> + target_phys_addr_t used;
>> +#endif
>> +} VRing;
>>
>>
>
> Stumped. Why?
>
In KVM, desc points directly to guest memory. The non-KVM accessors use
stl/ldl et al so they only need the physical address base.
If you agree with this approach, I'll clean up the patch and send again.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH][RFC] Prepare virtio for upstream QEMU merging
2008-10-14 16:21 ` Anthony Liguori
@ 2008-10-14 17:30 ` Avi Kivity
2008-10-14 18:08 ` Anthony Liguori
0 siblings, 1 reply; 8+ messages in thread
From: Avi Kivity @ 2008-10-14 17:30 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel
Anthony Liguori wrote:
>>
>> Why not merge these bits prior to merging virtio? They aren't kvm
>> specific and would be good in mainline qemu.
>>
>
> I'd rather have a consumer of an interface before merging the actual
> infrastructure.
>
So merge them all into qemu at the same time (as separate patches, if
you like).
>>>
>>> +#ifdef USE_KVM
>>> /* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
>>> * it never finds out that the packets don't have valid checksums.
>>> This
>>> * causes dhclient to get upset. Fedora's carried a patch for ages to
>>> @@ -181,6 +190,7 @@ static void work_around_broken_dhclient(struct
>>> virtio_net_hdr *hdr,
>>> hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
>>> }
>>> }
>>> +#endif
>>>
>>
>> Is this kvm specific?
>>
>
> It is because the GSO stuff has not gone into upstream QEMU yet. I'd
> rather merge a crippled virtio implementation, and then merge each of
> the optimizations than vice-versa. Primarily because I'm concerned
> some of these optimizations will require refactoring.
>
Ok.
>
>>> +#ifdef USE_KVM
>>> + VRingDesc *desc;
>>> + VRingAvail *avail;
>>> + VRingUsed *used;
>>> +#else
>>> + target_phys_addr_t desc;
>>> + target_phys_addr_t avail;
>>> + target_phys_addr_t used;
>>> +#endif
>>> +} VRing;
>>>
>>
>> Stumped. Why?
>>
>
> In KVM, desc points directly to guest memory. The non-KVM accessors
> use stl/ldl et al so they only need the physical address base.
>
> If you agree with this approach, I'll clean up the patch and send again.
The amount of code duplication is frightening.
I suggest a target_phys_ptr_t which is a struct containing either a
pointer or a target_phys_addr_t, depending on guest/host endianness.
Accessors through this type will either deref the pointer or call the
qemu variant.
Oh, and we need to set the dirty bit so live migration works. Or do we
have a hack in place to force copying of the ring at the last stage?
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][RFC] Prepare virtio for upstream QEMU merging
2008-10-14 17:30 ` Avi Kivity
@ 2008-10-14 18:08 ` Anthony Liguori
2008-10-15 13:57 ` Avi Kivity
0 siblings, 1 reply; 8+ messages in thread
From: Anthony Liguori @ 2008-10-14 18:08 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
Avi Kivity wrote:
> Anthony Liguori wrote:
>
>>> Why not merge these bits prior to merging virtio? They aren't kvm
>>> specific and would be good in mainline qemu.
>>>
>>>
>> I'd rather have a consumer of an interface before merging the actual
>> infrastructure.
>>
>>
>
> So merge them all into qemu at the same time (as separate patches, if
> you like).
>
But that will require refactoring a lot of these optimizations. In
order to do that right, they need to be presented on qemu-devel. It's a
whole lot easier to do that incrementally so that people can digest it
all instead of blasting a big series.
I've posted virtio before on the qemu-devel and what has always been
problematic are the various optimizations we do. They tend to cloud the
actual implementation of virtio. I think taking this approach will make
it all a lot easier.
> The amount of code duplication is frightening.
>
I've already got that worked out. I need to prepare and commit a patch
to fix stw/lduw in upstream QEMU and then we can switch to always using
those functions in KVM. I've done some performance testing and that
seems to be enough. With that, there is no longer any scary code
duplication.
> Oh, and we need to set the dirty bit so live migration works. Or do we
> have a hack in place to force copying of the ring at the last stage?
>
With the above fix, the ring accesses are no longer an issue. We just
need to dirty memory in the scatter/gather lists on write. That's a
very simple change and I'll include it in the next patch series.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][RFC] Prepare virtio for upstream QEMU merging
2008-10-14 18:08 ` Anthony Liguori
@ 2008-10-15 13:57 ` Avi Kivity
2008-10-15 14:11 ` Anthony Liguori
0 siblings, 1 reply; 8+ messages in thread
From: Avi Kivity @ 2008-10-15 13:57 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel
Anthony Liguori wrote:
>>>> Why not merge these bits prior to merging virtio? They aren't kvm
>>>> specific and would be good in mainline qemu.
>>>>
>>> I'd rather have a consumer of an interface before merging the actual
>>> infrastructure.
>>>
>>>
>>
>> So merge them all into qemu at the same time (as separate patches, if
>> you like).
>>
>
> But that will require refactoring a lot of these optimizations. In
> order to do that right, they need to be presented on qemu-devel. It's
> a whole lot easier to do that incrementally so that people can digest
> it all instead of blasting a big series.
Can you elaborate? Which interfaces will need rework, and why?
>
>> The amount of code duplication is frightening.
>>
>
> I've already got that worked out. I need to prepare and commit a
> patch to fix stw/lduw in upstream QEMU and then we can switch to
> always using those functions in KVM. I've done some performance
> testing and that seems to be enough. With that, there is no longer
> any scary code duplication.
Yes, your patch on qemu-devel looks good. Even if we do have a
performance problem (which may well turn out after we optimize things
some more), it's easy to have a cached pointer along with the phys address.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][RFC] Prepare virtio for upstream QEMU merging
2008-10-15 13:57 ` Avi Kivity
@ 2008-10-15 14:11 ` Anthony Liguori
2008-10-16 9:55 ` Avi Kivity
0 siblings, 1 reply; 8+ messages in thread
From: Anthony Liguori @ 2008-10-15 14:11 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel
Avi Kivity wrote:
> Anthony Liguori wrote:
>>>>> Why not merge these bits prior to merging virtio? They aren't kvm
>>>>> specific and would be good in mainline qemu.
>>>>>
>>>> I'd rather have a consumer of an interface before merging the actual
>>>> infrastructure.
>>>>
>>>>
>>>
>>> So merge them all into qemu at the same time (as separate patches, if
>>> you like).
>>>
>>
>> But that will require refactoring a lot of these optimizations. In
>> order to do that right, they need to be presented on qemu-devel.
>> It's a whole lot easier to do that incrementally so that people can
>> digest it all instead of blasting a big series.
>
> Can you elaborate? Which interfaces will need rework, and why?
Last time I tried, virtio-net doesn't work with slirp. I believe it's
either because of the GSO changes (unlikely) or because of the
can_receive changes (more likely). The can_receive changes probably
need some refactoring to be more slirp friendly. The GSO changes are a
bit vlan unfriendly.
Right now, you could construct something like -net tap -net
nic,model=virtio -net model=e1000. e1000 doesn't support GSO and bad
things will happen from this. It's very centric to the single-nic,
single-host driver model. Also, exposing something like
tap_has_vnet_hdr() to the actual network cards violates the layering.
The network cards shouldn't have any knowledge of what types of host
drivers there are, just what features a particular VLAN supports.
It's also unclear how you handle things like NIC hot-plug. What if you
add a nic that doesn't support GSO to a VLAN that is using GSO? What
about migration? What if you migrate from a host that has GSO support
to a host that doesn't support GSO? This latter problem is hard and
would require either a feature renegotiation mechanism in virtio or
software implementation of GSO within QEMU.
>>
>>> The amount of code duplication is frightening.
>>>
>>
>> I've already got that worked out. I need to prepare and commit a
>> patch to fix stw/lduw in upstream QEMU and then we can switch to
>> always using those functions in KVM. I've done some performance
>> testing and that seems to be enough. With that, there is no longer
>> any scary code duplication.
>
> Yes, your patch on qemu-devel looks good. Even if we do have a
> performance problem (which may well turn out after we optimize things
> some more), it's easy to have a cached pointer along with the phys
> address.
And I don't think that patch matters anymore :-/ I was doing
measurements with iperf doing TX in the guest. The numbers are very
erratic and I mistakenly attributed a boost to that patch.
Measuring based on RX seems to be more reliable since we're pegging the
CPU. We idle too much in TX due to the tx mitigation timeouts.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][RFC] Prepare virtio for upstream QEMU merging
2008-10-15 14:11 ` Anthony Liguori
@ 2008-10-16 9:55 ` Avi Kivity
0 siblings, 0 replies; 8+ messages in thread
From: Avi Kivity @ 2008-10-16 9:55 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel
Anthony Liguori wrote:
>>>
>>> But that will require refactoring a lot of these optimizations. In
>>> order to do that right, they need to be presented on qemu-devel.
>>> It's a whole lot easier to do that incrementally so that people can
>>> digest it all instead of blasting a big series.
>>
>> Can you elaborate? Which interfaces will need rework, and why?
>
> Last time I tried, virtio-net doesn't work with slirp. I believe it's
> either because of the GSO changes (unlikely) or because of the
> can_receive changes (more likely). The can_receive changes probably
> need some refactoring to be more slirp friendly. The GSO changes are
> a bit vlan unfriendly.
>
> Right now, you could construct something like -net tap -net
> nic,model=virtio -net model=e1000. e1000 doesn't support GSO and bad
> things will happen from this. It's very centric to the single-nic,
> single-host driver model. Also, exposing something like
> tap_has_vnet_hdr() to the actual network cards violates the layering.
> The network cards shouldn't have any knowledge of what types of host
> drivers there are, just what features a particular VLAN supports.
>
> It's also unclear how you handle things like NIC hot-plug. What if
> you add a nic that doesn't support GSO to a VLAN that is using GSO?
> What about migration? What if you migrate from a host that has GSO
> support to a host that doesn't support GSO? This latter problem is
> hard and would require either a feature renegotiation mechanism in
> virtio or software implementation of GSO within QEMU.
Okay, we can go along with mangling the current virtio implementation
like you proposed.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2008-10-16 9:55 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-13 18:28 [PATCH][RFC] Prepare virtio for upstream QEMU merging Anthony Liguori
2008-10-14 16:12 ` Avi Kivity
2008-10-14 16:21 ` Anthony Liguori
2008-10-14 17:30 ` Avi Kivity
2008-10-14 18:08 ` Anthony Liguori
2008-10-15 13:57 ` Avi Kivity
2008-10-15 14:11 ` Anthony Liguori
2008-10-16 9:55 ` Avi Kivity
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox